nodejieba-plus 3.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. package/.github/FUNDING.yml +12 -0
  2. package/.github/workflows/github_release.yml +61 -0
  3. package/.github/workflows/npm_publish.yml +24 -0
  4. package/.github/workflows/stale-issues.yml +24 -0
  5. package/.github/workflows/test.yml +42 -0
  6. package/.gitmodules +3 -0
  7. package/.npmignore +15 -0
  8. package/CHANGELOG.md +360 -0
  9. package/CONTRIBUTING.md +78 -0
  10. package/LICENSE +21 -0
  11. package/README.md +349 -0
  12. package/binding.gyp +63 -0
  13. package/index.js +77 -0
  14. package/lib/index.cpp +3 -0
  15. package/lib/nodejieba.cpp +218 -0
  16. package/lib/nodejieba.h +28 -0
  17. package/lib/utils.h +47 -0
  18. package/package.json +48 -0
  19. package/submodules/cppjieba/.github/workflows/cmake.yml +51 -0
  20. package/submodules/cppjieba/.github/workflows/stale-issues.yml +24 -0
  21. package/submodules/cppjieba/.gitmodules +3 -0
  22. package/submodules/cppjieba/CHANGELOG.md +305 -0
  23. package/submodules/cppjieba/CMakeLists.txt +42 -0
  24. package/submodules/cppjieba/LICENSE +20 -0
  25. package/submodules/cppjieba/README.md +280 -0
  26. package/submodules/cppjieba/deps/limonp/.github/workflows/cmake.yml +43 -0
  27. package/submodules/cppjieba/deps/limonp/.gitmodules +0 -0
  28. package/submodules/cppjieba/deps/limonp/CHANGELOG.md +160 -0
  29. package/submodules/cppjieba/deps/limonp/CMakeLists.txt +61 -0
  30. package/submodules/cppjieba/deps/limonp/LICENSE +20 -0
  31. package/submodules/cppjieba/deps/limonp/README.md +38 -0
  32. package/submodules/cppjieba/deps/limonp/include/limonp/ArgvContext.hpp +70 -0
  33. package/submodules/cppjieba/deps/limonp/include/limonp/Closure.hpp +206 -0
  34. package/submodules/cppjieba/deps/limonp/include/limonp/Colors.hpp +31 -0
  35. package/submodules/cppjieba/deps/limonp/include/limonp/Condition.hpp +38 -0
  36. package/submodules/cppjieba/deps/limonp/include/limonp/Config.hpp +103 -0
  37. package/submodules/cppjieba/deps/limonp/include/limonp/ForcePublic.hpp +7 -0
  38. package/submodules/cppjieba/deps/limonp/include/limonp/LocalVector.hpp +139 -0
  39. package/submodules/cppjieba/deps/limonp/include/limonp/Logging.hpp +90 -0
  40. package/submodules/cppjieba/deps/limonp/include/limonp/NonCopyable.hpp +21 -0
  41. package/submodules/cppjieba/deps/limonp/include/limonp/StdExtension.hpp +157 -0
  42. package/submodules/cppjieba/deps/limonp/include/limonp/StringUtil.hpp +386 -0
  43. package/submodules/cppjieba/deps/limonp/test/CMakeLists.txt +8 -0
  44. package/submodules/cppjieba/deps/limonp/test/demo.cpp +40 -0
  45. package/submodules/cppjieba/deps/limonp/test/testdata/1.conf +5 -0
  46. package/submodules/cppjieba/deps/limonp/test/testdata/StdExtension.data +3 -0
  47. package/submodules/cppjieba/deps/limonp/test/testdata/dict.gbk +50 -0
  48. package/submodules/cppjieba/deps/limonp/test/testdata/dict.utf8 +50 -0
  49. package/submodules/cppjieba/deps/limonp/test/testdata/io_testfile +2 -0
  50. package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.0.1.utf8 +93 -0
  51. package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.0.utf8 +93 -0
  52. package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.1.utf8 +67 -0
  53. package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.2.utf8 +64 -0
  54. package/submodules/cppjieba/deps/limonp/test/unittest/CMakeLists.txt +30 -0
  55. package/submodules/cppjieba/deps/limonp/test/unittest/TArgvContext.cpp +16 -0
  56. package/submodules/cppjieba/deps/limonp/test/unittest/TCastFloat.cpp +19 -0
  57. package/submodules/cppjieba/deps/limonp/test/unittest/TClosure.cpp +85 -0
  58. package/submodules/cppjieba/deps/limonp/test/unittest/TColorPrint.cpp +20 -0
  59. package/submodules/cppjieba/deps/limonp/test/unittest/TConfig.cpp +17 -0
  60. package/submodules/cppjieba/deps/limonp/test/unittest/TLocalVector.cpp +41 -0
  61. package/submodules/cppjieba/deps/limonp/test/unittest/TLogging.cpp +12 -0
  62. package/submodules/cppjieba/deps/limonp/test/unittest/TStdExtension.cpp +95 -0
  63. package/submodules/cppjieba/deps/limonp/test/unittest/TStringUtil.cpp +183 -0
  64. package/submodules/cppjieba/deps/limonp/test/unittest/gtest_main.cpp +39 -0
  65. package/submodules/cppjieba/dict/README.md +31 -0
  66. package/submodules/cppjieba/dict/hmm_model.utf8 +34 -0
  67. package/submodules/cppjieba/dict/idf.utf8 +258826 -0
  68. package/submodules/cppjieba/dict/jieba.dict.utf8 +348982 -0
  69. package/submodules/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  70. package/submodules/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  71. package/submodules/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  72. package/submodules/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  73. package/submodules/cppjieba/dict/stop_words.utf8 +1534 -0
  74. package/submodules/cppjieba/dict/user.dict.utf8 +4 -0
  75. package/submodules/cppjieba/include/cppjieba/DictTrie.hpp +381 -0
  76. package/submodules/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  77. package/submodules/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  78. package/submodules/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  79. package/submodules/cppjieba/include/cppjieba/Jieba.hpp +169 -0
  80. package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  81. package/submodules/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  82. package/submodules/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  83. package/submodules/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  84. package/submodules/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  85. package/submodules/cppjieba/include/cppjieba/QuerySegment.hpp +89 -0
  86. package/submodules/cppjieba/include/cppjieba/SegmentBase.hpp +48 -0
  87. package/submodules/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  88. package/submodules/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  89. package/submodules/cppjieba/include/cppjieba/Trie.hpp +200 -0
  90. package/submodules/cppjieba/include/cppjieba/Unicode.hpp +231 -0
  91. package/submodules/cppjieba/test/CMakeLists.txt +4 -0
  92. package/submodules/cppjieba/test/load_test.cpp +54 -0
  93. package/submodules/cppjieba/test/testdata/curl.res +1 -0
  94. package/submodules/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  95. package/submodules/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  96. package/submodules/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  97. package/submodules/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  98. package/submodules/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  99. package/submodules/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  100. package/submodules/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  101. package/submodules/cppjieba/test/testdata/load_test.urls +2 -0
  102. package/submodules/cppjieba/test/testdata/review.100 +100 -0
  103. package/submodules/cppjieba/test/testdata/review.100.res +200 -0
  104. package/submodules/cppjieba/test/testdata/server.conf +19 -0
  105. package/submodules/cppjieba/test/testdata/testlines.gbk +9 -0
  106. package/submodules/cppjieba/test/testdata/testlines.utf8 +8 -0
  107. package/submodules/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  108. package/submodules/cppjieba/test/testdata/userdict.english +2 -0
  109. package/submodules/cppjieba/test/testdata/userdict.utf8 +8 -0
  110. package/submodules/cppjieba/test/testdata/weicheng.utf8 +247 -0
  111. package/submodules/cppjieba/test/unittest/CMakeLists.txt +33 -0
  112. package/submodules/cppjieba/test/unittest/gtest_main.cpp +39 -0
  113. package/submodules/cppjieba/test/unittest/jieba_test.cpp +166 -0
  114. package/submodules/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  115. package/submodules/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  116. package/submodules/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  117. package/submodules/cppjieba/test/unittest/segments_test.cpp +256 -0
  118. package/submodules/cppjieba/test/unittest/textrank_test.cpp +86 -0
  119. package/submodules/cppjieba/test/unittest/trie_test.cpp +177 -0
  120. package/submodules/cppjieba/test/unittest/unicode_test.cpp +43 -0
  121. package/test/debug_split +0 -0
  122. package/test/debug_split2 +0 -0
  123. package/test/debug_split3 +0 -0
  124. package/test/load_dict_test.js +14 -0
  125. package/test/missing_binding_test.js +42 -0
  126. package/test/test.js +366 -0
  127. package/test/testdata/userdict.utf8 +1 -0
  128. package/tsconfig.json +59 -0
  129. package/types/index.d.ts +30 -0
  130. package/typescript_demo.ts +38 -0
@@ -0,0 +1,183 @@
1
+ #include "limonp/StringUtil.hpp"
2
+ #include "gtest/gtest.h"
3
+ using namespace limonp;
4
+
5
+ TEST(StringUtilTest, Test1) { // Smoke test: Split, Trim, StringFormat, Join and the string << container serializers.
6
+ vector<string> vec;
7
+ string s;
8
+ Split("\t1\t3\t4\t", vec, "\t"); // leading delimiter is expected to yield an empty first field; the trailing one adds no field
9
+ ASSERT_EQ(s << vec, "[\"\", \"1\", \"3\", \"4\"]");
10
+ s = " \t\n ni hao ad \r\n";
11
+ ASSERT_EQ("ni hao ad", Trim(s)); // surrounding spaces, tabs, CR and LF are expected to be stripped
12
+ ASSERT_EQ("select * from table1 limit 1;" ,StringFormat("select %s from %s %s;", "*","table1","limit 1")); // printf-style %s substitution
13
+ s = StringFormat("select %s from %s %s;", "*","table1","limit 1"); // same call, this time stored in a string first
14
+ ASSERT_EQ("select * from table1 limit 1;" ,s);
15
+ vec.clear();
16
+ vec.push_back("1");
17
+ vec.push_back("2");
18
+ vec.push_back("3");
19
+ s.clear();
20
+ Join(vec.begin(), vec.end(), s,","); // overload that writes the joined text into s
21
+ ASSERT_EQ("1,2,3",s);
22
+ s = Join(vec.begin(), vec.end(), ".."); // overload that returns the joined text; multi-character connector
23
+ ASSERT_EQ("1..2..3", s);
24
+ const char* arr[] = {"2","3","5"};
25
+ ASSERT_EQ("2,3,5", Join(arr, arr + sizeof(arr)/sizeof(arr[0]), ",")); // Join over a raw array through a begin/end pointer pair
26
+ map<string , int> mp;
27
+ mp["key1"] =2;
28
+ ASSERT_EQ("{key1:2}", s << mp); // map is expected to serialize as {key:value}
29
+ std::unordered_map<int,int> hmp;
30
+ hmp[1]=2;
31
+ ASSERT_EQ("{1:2}", s << hmp); // same format expected for unordered_map
32
+ }
33
+
34
+ TEST(StringUtilTest, Test2) { // Round-trips every line of the GBK test dictionary through both GBKTrans overloads.
35
+ string s, gbks;
36
+ ifstream ifs("../test/testdata/dict.gbk"); // relative path: the result depends on the directory the test binary is run from
37
+ ASSERT_TRUE(!!ifs); // abort the test right away if the data file could not be opened
38
+
39
+ vector<uint16_t> uni;
40
+ while(getline(ifs, s)) {
41
+ GBKTrans(s, uni); // string -> 16-bit units (presumably GBK bytes decoded to code units; confirm in StringUtil.hpp)
42
+ GBKTrans(uni.begin(), uni.end(), gbks); // 16-bit units -> string
43
+ ASSERT_EQ(s, gbks); // the round trip must reproduce the input line exactly
44
+ }
45
+ }
46
+
47
+ TEST(StringUtilTest, Test3) { // Round-trips every line of the UTF-8 test dictionary through Utf8ToUnicode / UnicodeToUtf8.
48
+ string s, utf8;
49
+ ifstream ifs("../test/testdata/dict.utf8"); // relative path: the result depends on the directory the test binary is run from
50
+ ASSERT_TRUE(!!ifs); // abort the test right away if the data file could not be opened
51
+
52
+ vector<uint16_t> uni;
53
+ while(getline(ifs, s)) {
54
+ ASSERT_TRUE(Utf8ToUnicode(s, uni)); // every line of the fixture is expected to convert successfully into 16-bit units
55
+ UnicodeToUtf8(uni.begin(), uni.end(), utf8);
56
+ ASSERT_EQ(s, utf8); // the round trip must reproduce the input line exactly
57
+ }
58
+ }
59
+
60
+ TEST(StringUtilTest, Test4) { // Expectations for StartsWith / EndsWith, including empty-string edge cases.
61
+ //ASSERT_TRUE(StartsWith("--help",NULL));
62
+ ASSERT_TRUE(StartsWith("--help","--"));
63
+ ASSERT_TRUE(StartsWith("--help","-"));
64
+ ASSERT_FALSE(StartsWith("--help","he")); // a match in the middle of the string is not a prefix
65
+ ASSERT_TRUE(StartsWith("help","help")); // a string is a prefix of itself
66
+ ASSERT_FALSE(StartsWith("","help")); // prefix longer than the string
67
+ ASSERT_TRUE(StartsWith("hel","")); // the empty prefix is expected to match
68
+ ASSERT_TRUE(EndsWith("hel","")); // the empty suffix is expected to match
69
+ ASSERT_TRUE(EndsWith("hel","el"));
70
+ }
71
+
72
+ TEST(StringUtilTest, Test5) { // Split edge cases. vec is reused without clearing, so each expectation also relies on Split replacing its contents.
73
+ const char* str = "1,2,3,4";
74
+ vector<string> vec;
75
+ string res;
76
+ Split(str, vec, ",");
77
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
78
+ Split("1,2,3,4,", vec, ","); // a trailing delimiter is expected to add no empty field
79
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
80
+ Split(str, vec, ",", 3); // fourth argument appears to cap the number of splits; 3 splits still give all four fields
81
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
82
+
83
+ Split("1", vec, ","); // no delimiter present: the whole input is the only field
84
+ ASSERT_EQ("[\"1\"]", res << vec);
85
+
86
+ Split(str, vec, ",", 1); // with a cap of 1 the remainder stays unsplit in the second field
87
+ ASSERT_EQ("[\"1\", \"2,3,4\"]", res << vec);
88
+
89
+ Split("", vec, ","); // empty input is expected to produce no fields
90
+ ASSERT_EQ("[]", res << vec);
91
+
92
+ Split("1, 2", vec, ","); // whitespace around fields is preserved, not trimmed
93
+ ASSERT_EQ("[\"1\", \" 2\"]", res << vec);
94
+
95
+ Split("1==2", vec, "=="); // expected empty middle field shows each '=' splits on its own: the pattern is a set of characters
96
+ ASSERT_EQ("[\"1\", \"\", \"2\"]", res << vec);
97
+
98
+ Split("1,", vec, ",");
99
+ ASSERT_EQ("[\"1\"]", res << vec);
100
+
101
+ Split(",1,", vec, ","); // a leading delimiter yields an empty first field, a trailing one yields nothing
102
+ ASSERT_EQ("[\"\", \"1\"]", res << vec);
103
+
104
+ Split("1, ", vec, ","); // a lone space after the last delimiter counts as a field
105
+ ASSERT_EQ("[\"1\", \" \"]", res << vec);
106
+
107
+ res << Split("1|2,3", "|,"); // two-argument overload returning the fields; either character of the pattern splits
108
+ ASSERT_EQ("[\"1\", \"2\", \"3\"]", res);
109
+ }
110
+
111
+ TEST(StringUtilTest, Trim) { // RTrim / LTrim / Trim with an explicit character and with the default, plus IsSpace on wide characters.
112
+ string s;
113
+ s = "xxxyyyxx";
114
+ ASSERT_EQ(RTrim(s, 'x'), "xxxyyy");
115
+ ASSERT_EQ(LTrim(s, 'x'), "yyy"); // expecting yyy rather than yyyxx implies the previous RTrim call modified s in place
116
+ s = "xxxyyyxx";
117
+ ASSERT_EQ(Trim(s, 'x'), "yyy"); // both ends stripped in a single call
118
+
119
+ s = " x y ";
120
+ ASSERT_EQ(Trim(s), "x y"); // without a character argument, surrounding spaces are removed and the inner one is kept
121
+
122
+ // Make sure IsSpace does not crash (core dump) when given a wide character far outside the ASCII range.
123
+ wchar_t w = 1000024;
124
+ ASSERT_FALSE(IsSpace(w));
125
+ w = 0x20; // 0x20 is the ASCII space character
126
+ ASSERT_TRUE(IsSpace(w));
127
+ }
128
+
129
+ TEST(StringUtilTest, GetTime) { // Only checks that GetTime can be called with a date/time format; the produced text is not asserted.
130
+ string s;
131
+ GetTime("%Y-%m-%d %H:%M:%S", s); // result is written into s and then ignored
132
+ //print(s);
133
+ }
134
+
135
+ TEST(StringUtilTest, PathJoin) { // PathJoin must give the same path whether or not the directory ends with a slash.
136
+ const char * path1 = "/home/foo/dir"; // directory without a trailing slash
137
+ const char * path2 = "file";
138
+ const char * path3 = "/home/foo/dir/"; // same directory with a trailing slash
139
+ const char * path4 = "file";
140
+ const char * answer = "/home/foo/dir/file";
141
+
142
+ ASSERT_EQ(answer, PathJoin(path1, path2)); // a separator is expected to be inserted
143
+ ASSERT_EQ(answer, PathJoin(path3, path4)); // no doubled separator is expected
144
+ }
145
+
146
+ TEST(StringUtilTest, JapaneseUnicode) { // A three-character Japanese string must convert into exactly three 16-bit units.
147
+ // Japanese
148
+ const char* s = "がんば";
149
+ vector<uint16_t> unicode;
150
+ ASSERT_TRUE(Utf8ToUnicode(s, unicode));
151
+ ASSERT_EQ(3u, unicode.size()); // one unit per character
152
+ }
153
+
154
+ TEST(StringUtilTest, RareChinese) { // A character from the U+10000..U+10FFFF range must be rejected by the 16-bit conversion.
155
+ //U+10000 – U+10FFFF
156
+ const char* s = "𪚥";
157
+ vector<uint16_t> unicode;
158
+ ASSERT_FALSE(Utf8ToUnicode(s, unicode)); // conversion is expected to report failure
159
+ ASSERT_EQ(0u, unicode.size()); // and to leave the output vector empty
160
+ }
161
+
162
+ TEST(StringUtilTest, RareChineseUnicode32) { // The same U+10000..U+10FFFF character must succeed and round-trip with the 32-bit conversion.
163
+ //U+10000 – U+10FFFF
164
+ const char* s = "𪚥";
165
+ vector<uint32_t> unicode;
166
+ ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
167
+ ASSERT_EQ(1u, unicode.size()); // a single 32-bit element for the single character
168
+
169
+ string s2;
170
+ Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
171
+ ASSERT_EQ(s2, s); // encoding back must reproduce the original text
172
+ }
173
+
174
+ TEST(StringUtilTest, Unicode32) { // Mixed ASCII, Chinese and full-width punctuation through the 32-bit conversion and back.
175
+ const char* s = "1+1=2你好世界,。";
176
+ vector<uint32_t> unicode;
177
+ ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
178
+ ASSERT_EQ(unicode.size(), 11u); // 5 ASCII characters + 4 Chinese characters + 2 punctuation marks
179
+
180
+ string s2;
181
+ Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
182
+ ASSERT_EQ(s2, s); // encoding back must reproduce the original text
183
+ }
@@ -0,0 +1,39 @@
1
+ // Copyright 2006, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include <iostream>
31
+
32
+ #include "gtest/gtest.h"
33
+
34
+ GTEST_API_ int main(int argc, char **argv) { // Test-runner entry point: prints a banner, initializes googletest, runs every registered test.
35
+ std::cout << "Running main() from gtest_main.cc\n";
36
+
37
+ testing::InitGoogleTest(&argc, argv); // hand the command line to googletest before any test runs
38
+ return RUN_ALL_TESTS(); // the value returned by RUN_ALL_TESTS() becomes the process exit code
39
+ }
@@ -0,0 +1,31 @@
1
+ # CppJieba字典
2
+
3
+ 文件后缀名代表的是词典的编码方式。
4
+ 比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。
5
+
6
+
7
+ ## 分词
8
+
9
+ ### jieba.dict.utf8/gbk
10
+
11
+ 作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
12
+
13
+ ### hmm_model.utf8/gbk
14
+
15
+ 作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
16
+
17
+ __对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
18
+
19
+
20
+ ## 关键词抽取
21
+
22
+ ### idf.utf8
23
+
24
+ IDF(Inverse Document Frequency)
25
+ 在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。
26
+
27
+ ### stop_words.utf8
28
+
29
+ 停用词词典
30
+
31
+