nodejieba-plus 3.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/FUNDING.yml +12 -0
- package/.github/workflows/github_release.yml +61 -0
- package/.github/workflows/npm_publish.yml +24 -0
- package/.github/workflows/stale-issues.yml +24 -0
- package/.github/workflows/test.yml +42 -0
- package/.gitmodules +3 -0
- package/.npmignore +15 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +78 -0
- package/LICENSE +21 -0
- package/README.md +349 -0
- package/binding.gyp +63 -0
- package/index.js +77 -0
- package/lib/index.cpp +3 -0
- package/lib/nodejieba.cpp +218 -0
- package/lib/nodejieba.h +28 -0
- package/lib/utils.h +47 -0
- package/package.json +48 -0
- package/submodules/cppjieba/.github/workflows/cmake.yml +51 -0
- package/submodules/cppjieba/.github/workflows/stale-issues.yml +24 -0
- package/submodules/cppjieba/.gitmodules +3 -0
- package/submodules/cppjieba/CHANGELOG.md +305 -0
- package/submodules/cppjieba/CMakeLists.txt +42 -0
- package/submodules/cppjieba/LICENSE +20 -0
- package/submodules/cppjieba/README.md +280 -0
- package/submodules/cppjieba/deps/limonp/.github/workflows/cmake.yml +43 -0
- package/submodules/cppjieba/deps/limonp/.gitmodules +0 -0
- package/submodules/cppjieba/deps/limonp/CHANGELOG.md +160 -0
- package/submodules/cppjieba/deps/limonp/CMakeLists.txt +61 -0
- package/submodules/cppjieba/deps/limonp/LICENSE +20 -0
- package/submodules/cppjieba/deps/limonp/README.md +38 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/ArgvContext.hpp +70 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Closure.hpp +206 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Colors.hpp +31 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Condition.hpp +38 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Config.hpp +103 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/ForcePublic.hpp +7 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/LocalVector.hpp +139 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Logging.hpp +90 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/NonCopyable.hpp +21 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/StdExtension.hpp +157 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/StringUtil.hpp +386 -0
- package/submodules/cppjieba/deps/limonp/test/CMakeLists.txt +8 -0
- package/submodules/cppjieba/deps/limonp/test/demo.cpp +40 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/1.conf +5 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/StdExtension.data +3 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/dict.gbk +50 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/dict.utf8 +50 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/io_testfile +2 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.0.1.utf8 +93 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.0.utf8 +93 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.1.utf8 +67 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.2.utf8 +64 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/CMakeLists.txt +30 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TArgvContext.cpp +16 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TCastFloat.cpp +19 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TClosure.cpp +85 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TColorPrint.cpp +20 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TConfig.cpp +17 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TLocalVector.cpp +41 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TLogging.cpp +12 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TStdExtension.cpp +95 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TStringUtil.cpp +183 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/gtest_main.cpp +39 -0
- package/submodules/cppjieba/dict/README.md +31 -0
- package/submodules/cppjieba/dict/hmm_model.utf8 +34 -0
- package/submodules/cppjieba/dict/idf.utf8 +258826 -0
- package/submodules/cppjieba/dict/jieba.dict.utf8 +348982 -0
- package/submodules/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- package/submodules/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- package/submodules/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- package/submodules/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- package/submodules/cppjieba/dict/stop_words.utf8 +1534 -0
- package/submodules/cppjieba/dict/user.dict.utf8 +4 -0
- package/submodules/cppjieba/include/cppjieba/DictTrie.hpp +381 -0
- package/submodules/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- package/submodules/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- package/submodules/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- package/submodules/cppjieba/include/cppjieba/Jieba.hpp +169 -0
- package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- package/submodules/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- package/submodules/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- package/submodules/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- package/submodules/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- package/submodules/cppjieba/include/cppjieba/QuerySegment.hpp +89 -0
- package/submodules/cppjieba/include/cppjieba/SegmentBase.hpp +48 -0
- package/submodules/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- package/submodules/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- package/submodules/cppjieba/include/cppjieba/Trie.hpp +200 -0
- package/submodules/cppjieba/include/cppjieba/Unicode.hpp +231 -0
- package/submodules/cppjieba/test/CMakeLists.txt +4 -0
- package/submodules/cppjieba/test/load_test.cpp +54 -0
- package/submodules/cppjieba/test/testdata/curl.res +1 -0
- package/submodules/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- package/submodules/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- package/submodules/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- package/submodules/cppjieba/test/testdata/load_test.urls +2 -0
- package/submodules/cppjieba/test/testdata/review.100 +100 -0
- package/submodules/cppjieba/test/testdata/review.100.res +200 -0
- package/submodules/cppjieba/test/testdata/server.conf +19 -0
- package/submodules/cppjieba/test/testdata/testlines.gbk +9 -0
- package/submodules/cppjieba/test/testdata/testlines.utf8 +8 -0
- package/submodules/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- package/submodules/cppjieba/test/testdata/userdict.english +2 -0
- package/submodules/cppjieba/test/testdata/userdict.utf8 +8 -0
- package/submodules/cppjieba/test/testdata/weicheng.utf8 +247 -0
- package/submodules/cppjieba/test/unittest/CMakeLists.txt +33 -0
- package/submodules/cppjieba/test/unittest/gtest_main.cpp +39 -0
- package/submodules/cppjieba/test/unittest/jieba_test.cpp +166 -0
- package/submodules/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- package/submodules/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- package/submodules/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- package/submodules/cppjieba/test/unittest/segments_test.cpp +256 -0
- package/submodules/cppjieba/test/unittest/textrank_test.cpp +86 -0
- package/submodules/cppjieba/test/unittest/trie_test.cpp +177 -0
- package/submodules/cppjieba/test/unittest/unicode_test.cpp +43 -0
- package/test/debug_split +0 -0
- package/test/debug_split2 +0 -0
- package/test/debug_split3 +0 -0
- package/test/load_dict_test.js +14 -0
- package/test/missing_binding_test.js +42 -0
- package/test/test.js +366 -0
- package/test/testdata/userdict.utf8 +1 -0
- package/tsconfig.json +59 -0
- package/types/index.d.ts +30 -0
- package/typescript_demo.ts +38 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
#include "limonp/StringUtil.hpp"
|
|
2
|
+
#include "gtest/gtest.h"
|
|
3
|
+
using namespace limonp;
|
|
4
|
+
|
|
5
|
+
// Smoke test for Split, Trim, StringFormat, Join and the container-to-string
// operator<< helpers.
TEST(StringUtilTest, Test1) {
  vector<string> pieces;
  string text;

  // Split on tabs and render the result through operator<<.
  Split("\t1\t3\t4\t", pieces, "\t");
  ASSERT_EQ(text << pieces, "[\"\", \"1\", \"3\", \"4\"]");

  // Trim with no explicit character is expected to drop the surrounding
  // blanks, tabs and line breaks.
  text = " \t\n ni hao ad \r\n";
  ASSERT_EQ("ni hao ad", Trim(text));

  // StringFormat, checked both as a temporary and after assignment.
  ASSERT_EQ("select * from table1 limit 1;",
            StringFormat("select %s from %s %s;", "*","table1","limit 1"));
  text = StringFormat("select %s from %s %s;", "*","table1","limit 1");
  ASSERT_EQ("select * from table1 limit 1;", text);

  // Join: out-parameter form, returning form, and plain C-array iterators.
  pieces = {"1", "2", "3"};
  text.clear();
  Join(pieces.begin(), pieces.end(), text, ",");
  ASSERT_EQ("1,2,3", text);

  text = Join(pieces.begin(), pieces.end(), "..");
  ASSERT_EQ("1..2..3", text);

  const char* arr[] = {"2","3","5"};
  const size_t arrLen = sizeof(arr) / sizeof(arr[0]);
  ASSERT_EQ("2,3,5", Join(arr, arr + arrLen, ","));

  // Associative containers rendered through operator<<.
  map<string, int> ordered;
  ordered["key1"] = 2;
  ASSERT_EQ("{key1:2}", text << ordered);

  std::unordered_map<int, int> hashed;
  hashed[1] = 2;
  ASSERT_EQ("{1:2}", text << hashed);
}
|
|
33
|
+
|
|
34
|
+
// Round-trips every line of the GBK fixture through GBKTrans in both
// directions and expects the original bytes back.
TEST(StringUtilTest, Test2) {
  ifstream fixture("../test/testdata/dict.gbk");
  ASSERT_TRUE(!!fixture);

  string line;
  string roundTripped;
  vector<uint16_t> codeUnits;
  while (getline(fixture, line)) {
    GBKTrans(line, codeUnits);                                   // GBK -> code units
    GBKTrans(codeUnits.begin(), codeUnits.end(), roundTripped);  // code units -> GBK
    ASSERT_EQ(line, roundTripped);
  }
}
|
|
46
|
+
|
|
47
|
+
// Round-trips every line of the UTF-8 fixture through Utf8ToUnicode and
// UnicodeToUtf8 and expects the original bytes back.
TEST(StringUtilTest, Test3) {
  ifstream fixture("../test/testdata/dict.utf8");
  ASSERT_TRUE(!!fixture);

  string line;
  string roundTripped;
  vector<uint16_t> codeUnits;
  while (getline(fixture, line)) {
    // Decoding must succeed for every fixture line.
    ASSERT_TRUE(Utf8ToUnicode(line, codeUnits));
    UnicodeToUtf8(codeUnits.begin(), codeUnits.end(), roundTripped);
    ASSERT_EQ(line, roundTripped);
  }
}
|
|
59
|
+
|
|
60
|
+
// Prefix and suffix checks for StartsWith / EndsWith.
TEST(StringUtilTest, Test4) {
  // Disabled check, kept for reference:
  //ASSERT_TRUE(StartsWith("--help",NULL));

  // StartsWith: matching prefixes, including the whole string.
  ASSERT_TRUE(StartsWith("--help", "--"));
  ASSERT_TRUE(StartsWith("--help", "-"));
  // A substring that is not at the front is not a prefix.
  ASSERT_FALSE(StartsWith("--help", "he"));
  ASSERT_TRUE(StartsWith("help", "help"));
  // An empty subject has no non-empty prefix; an empty prefix always matches.
  ASSERT_FALSE(StartsWith("", "help"));
  ASSERT_TRUE(StartsWith("hel", ""));

  // EndsWith: empty suffix and a real suffix.
  ASSERT_TRUE(EndsWith("hel", ""));
  ASSERT_TRUE(EndsWith("hel", "el"));
}
|
|
71
|
+
|
|
72
|
+
// Edge cases for Split: trailing and leading delimiters, split limits, empty
// input, multi-character delimiter sets, and the vector-returning overload.
TEST(StringUtilTest, Test5) {
  const char* str = "1,2,3,4";
  vector<string> parts;
  string rendered;

  // Plain comma-separated input.
  Split(str, parts, ",");
  ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", rendered << parts);

  // A trailing delimiter yields no extra empty element.
  Split("1,2,3,4,", parts, ",");
  ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", rendered << parts);

  // Fourth argument 3 still yields all four pieces.
  Split(str, parts, ",", 3);
  ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", rendered << parts);

  // No delimiter present.
  Split("1", parts, ",");
  ASSERT_EQ("[\"1\"]", rendered << parts);

  // Fourth argument 1 leaves the remainder unsplit.
  Split(str, parts, ",", 1);
  ASSERT_EQ("[\"1\", \"2,3,4\"]", rendered << parts);

  // Empty input yields an empty vector.
  Split("", parts, ",");
  ASSERT_EQ("[]", rendered << parts);

  // Whitespace is kept as part of the piece.
  Split("1, 2", parts, ",");
  ASSERT_EQ("[\"1\", \" 2\"]", rendered << parts);

  // "==" produces an empty piece between the two '=' characters.
  Split("1==2", parts, "==");
  ASSERT_EQ("[\"1\", \"\", \"2\"]", rendered << parts);

  Split("1,", parts, ",");
  ASSERT_EQ("[\"1\"]", rendered << parts);

  // A leading delimiter yields an empty first piece; the trailing one is dropped.
  Split(",1,", parts, ",");
  ASSERT_EQ("[\"\", \"1\"]", rendered << parts);

  Split("1, ", parts, ",");
  ASSERT_EQ("[\"1\", \" \"]", rendered << parts);

  // Vector-returning overload with a set of two delimiter characters.
  rendered << Split("1|2,3", "|,");
  ASSERT_EQ("[\"1\", \"2\", \"3\"]", rendered);
}
|
|
110
|
+
|
|
111
|
+
// Checks RTrim / LTrim / Trim with an explicit character and with the default,
// plus IsSpace on wide characters.
TEST(StringUtilTest, Trim) {
  string text = "xxxyyyxx";
  // The second expectation only holds if RTrim modified `text` in place.
  ASSERT_EQ(RTrim(text, 'x'), "xxxyyy");
  ASSERT_EQ(LTrim(text, 'x'), "yyy");

  text = "xxxyyyxx";
  ASSERT_EQ(Trim(text, 'x'), "yyy");

  text = " x y ";
  ASSERT_EQ(Trim(text), "x y");

  // Make sure classifying a wide character far outside the ASCII range does
  // not crash (the original note mentions isalpha).
  wchar_t wide = 1000024;
  ASSERT_FALSE(IsSpace(wide));
  wide = 0x20;
  ASSERT_TRUE(IsSpace(wide));
}
|
|
128
|
+
|
|
129
|
+
// Calls GetTime with a date-time format and checks that it wrote something
// into the output string. Previously the test asserted nothing, so it could
// only catch a crash.
TEST(StringUtilTest, GetTime) {
  string s;
  GetTime("%Y-%m-%d %H:%M:%S", s);
  // The exact value depends on the current clock, so only non-emptiness is
  // checked here.
  ASSERT_FALSE(s.empty());
}
|
|
134
|
+
|
|
135
|
+
// PathJoin must give the same path whether or not the directory part already
// ends with a slash.
TEST(StringUtilTest, PathJoin) {
  const char* expected = "/home/foo/dir/file";
  const char* dirNoSlash = "/home/foo/dir";
  const char* dirWithSlash = "/home/foo/dir/";
  const char* leaf = "file";

  ASSERT_EQ(expected, PathJoin(dirNoSlash, leaf));
  ASSERT_EQ(expected, PathJoin(dirWithSlash, leaf));
}
|
|
145
|
+
|
|
146
|
+
// Three hiragana characters must decode into three 16-bit code units.
TEST(StringUtilTest, JapaneseUnicode) {
  const char* japanese = "がんば";
  vector<uint16_t> codeUnits;

  ASSERT_TRUE(Utf8ToUnicode(japanese, codeUnits));
  ASSERT_EQ(3u, codeUnits.size());
}
|
|
153
|
+
|
|
154
|
+
// A character in the U+10000 – U+10FFFF range is expected to be rejected by
// the 16-bit conversion, leaving the output vector empty.
TEST(StringUtilTest, RareChinese) {
  const char* rare = "𪚥";
  vector<uint16_t> codeUnits;

  ASSERT_FALSE(Utf8ToUnicode(rare, codeUnits));
  ASSERT_EQ(0u, codeUnits.size());
}
|
|
161
|
+
|
|
162
|
+
// The same U+10000 – U+10FFFF character decodes to exactly one 32-bit code
// point and encodes back to the original UTF-8 bytes.
TEST(StringUtilTest, RareChineseUnicode32) {
  const char* rare = "𪚥";
  vector<uint32_t> codePoints;

  ASSERT_TRUE(Utf8ToUnicode32(rare, codePoints));
  ASSERT_EQ(1u, codePoints.size());

  string encoded;
  Unicode32ToUtf8(codePoints.begin(), codePoints.end(), encoded);
  ASSERT_EQ(encoded, rare);
}
|
|
173
|
+
|
|
174
|
+
// Mixed ASCII, Chinese and full-width punctuation: 11 code points after
// decoding, and an exact round trip back to UTF-8.
TEST(StringUtilTest, Unicode32) {
  const char* mixed = "1+1=2你好世界,。";
  vector<uint32_t> codePoints;

  ASSERT_TRUE(Utf8ToUnicode32(mixed, codePoints));
  ASSERT_EQ(codePoints.size(), 11u);

  string encoded;
  Unicode32ToUtf8(codePoints.begin(), codePoints.end(), encoded);
  ASSERT_EQ(encoded, mixed);
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
// Copyright 2006, Google Inc.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// Redistribution and use in source and binary forms, with or without
|
|
5
|
+
// modification, are permitted provided that the following conditions are
|
|
6
|
+
// met:
|
|
7
|
+
//
|
|
8
|
+
// * Redistributions of source code must retain the above copyright
|
|
9
|
+
// notice, this list of conditions and the following disclaimer.
|
|
10
|
+
// * Redistributions in binary form must reproduce the above
|
|
11
|
+
// copyright notice, this list of conditions and the following disclaimer
|
|
12
|
+
// in the documentation and/or other materials provided with the
|
|
13
|
+
// distribution.
|
|
14
|
+
// * Neither the name of Google Inc. nor the names of its
|
|
15
|
+
// contributors may be used to endorse or promote products derived from
|
|
16
|
+
// this software without specific prior written permission.
|
|
17
|
+
//
|
|
18
|
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
19
|
+
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
20
|
+
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
21
|
+
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
22
|
+
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
23
|
+
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
24
|
+
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
25
|
+
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
26
|
+
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
27
|
+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28
|
+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
29
|
+
|
|
30
|
+
#include <iostream>
|
|
31
|
+
|
|
32
|
+
#include "gtest/gtest.h"
|
|
33
|
+
|
|
34
|
+
// Test-runner entry point: prints a banner, passes the command line to
// GoogleTest, and returns the result of RUN_ALL_TESTS() as the exit status.
GTEST_API_ int main(int argc, char **argv) {
  std::cout << "Running main() from gtest_main.cc\n";

  testing::InitGoogleTest(&argc, argv);
  const int exitStatus = RUN_ALL_TESTS();
  return exitStatus;
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# CppJieba字典
|
|
2
|
+
|
|
3
|
+
文件后缀名代表的是词典的编码方式。
|
|
4
|
+
比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
## 分词
|
|
8
|
+
|
|
9
|
+
### jieba.dict.utf8/gbk
|
|
10
|
+
|
|
11
|
+
作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
|
|
12
|
+
|
|
13
|
+
### hmm_model.utf8/gbk
|
|
14
|
+
|
|
15
|
+
作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
|
|
16
|
+
|
|
17
|
+
__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
## 关键词抽取
|
|
21
|
+
|
|
22
|
+
### idf.utf8
|
|
23
|
+
|
|
24
|
+
IDF(Inverse Document Frequency)
|
|
25
|
+
在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。
|
|
26
|
+
|
|
27
|
+
### stop_words.utf8
|
|
28
|
+
|
|
29
|
+
停用词词典
|
|
30
|
+
|
|
31
|
+
|