nodejieba-plus 3.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/FUNDING.yml +12 -0
- package/.github/workflows/github_release.yml +61 -0
- package/.github/workflows/npm_publish.yml +24 -0
- package/.github/workflows/stale-issues.yml +24 -0
- package/.github/workflows/test.yml +42 -0
- package/.gitmodules +3 -0
- package/.npmignore +15 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +78 -0
- package/LICENSE +21 -0
- package/README.md +349 -0
- package/binding.gyp +63 -0
- package/index.js +77 -0
- package/lib/index.cpp +3 -0
- package/lib/nodejieba.cpp +218 -0
- package/lib/nodejieba.h +28 -0
- package/lib/utils.h +47 -0
- package/package.json +48 -0
- package/submodules/cppjieba/.github/workflows/cmake.yml +51 -0
- package/submodules/cppjieba/.github/workflows/stale-issues.yml +24 -0
- package/submodules/cppjieba/.gitmodules +3 -0
- package/submodules/cppjieba/CHANGELOG.md +305 -0
- package/submodules/cppjieba/CMakeLists.txt +42 -0
- package/submodules/cppjieba/LICENSE +20 -0
- package/submodules/cppjieba/README.md +280 -0
- package/submodules/cppjieba/deps/limonp/.github/workflows/cmake.yml +43 -0
- package/submodules/cppjieba/deps/limonp/.gitmodules +0 -0
- package/submodules/cppjieba/deps/limonp/CHANGELOG.md +160 -0
- package/submodules/cppjieba/deps/limonp/CMakeLists.txt +61 -0
- package/submodules/cppjieba/deps/limonp/LICENSE +20 -0
- package/submodules/cppjieba/deps/limonp/README.md +38 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/ArgvContext.hpp +70 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Closure.hpp +206 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Colors.hpp +31 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Condition.hpp +38 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Config.hpp +103 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/ForcePublic.hpp +7 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/LocalVector.hpp +139 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Logging.hpp +90 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/NonCopyable.hpp +21 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/StdExtension.hpp +157 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/StringUtil.hpp +386 -0
- package/submodules/cppjieba/deps/limonp/test/CMakeLists.txt +8 -0
- package/submodules/cppjieba/deps/limonp/test/demo.cpp +40 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/1.conf +5 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/StdExtension.data +3 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/dict.gbk +50 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/dict.utf8 +50 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/io_testfile +2 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.0.1.utf8 +93 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.0.utf8 +93 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.1.utf8 +67 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.2.utf8 +64 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/CMakeLists.txt +30 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TArgvContext.cpp +16 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TCastFloat.cpp +19 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TClosure.cpp +85 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TColorPrint.cpp +20 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TConfig.cpp +17 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TLocalVector.cpp +41 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TLogging.cpp +12 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TStdExtension.cpp +95 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TStringUtil.cpp +183 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/gtest_main.cpp +39 -0
- package/submodules/cppjieba/dict/README.md +31 -0
- package/submodules/cppjieba/dict/hmm_model.utf8 +34 -0
- package/submodules/cppjieba/dict/idf.utf8 +258826 -0
- package/submodules/cppjieba/dict/jieba.dict.utf8 +348982 -0
- package/submodules/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- package/submodules/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- package/submodules/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- package/submodules/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- package/submodules/cppjieba/dict/stop_words.utf8 +1534 -0
- package/submodules/cppjieba/dict/user.dict.utf8 +4 -0
- package/submodules/cppjieba/include/cppjieba/DictTrie.hpp +381 -0
- package/submodules/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- package/submodules/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- package/submodules/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- package/submodules/cppjieba/include/cppjieba/Jieba.hpp +169 -0
- package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- package/submodules/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- package/submodules/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- package/submodules/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- package/submodules/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- package/submodules/cppjieba/include/cppjieba/QuerySegment.hpp +89 -0
- package/submodules/cppjieba/include/cppjieba/SegmentBase.hpp +48 -0
- package/submodules/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- package/submodules/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- package/submodules/cppjieba/include/cppjieba/Trie.hpp +200 -0
- package/submodules/cppjieba/include/cppjieba/Unicode.hpp +231 -0
- package/submodules/cppjieba/test/CMakeLists.txt +4 -0
- package/submodules/cppjieba/test/load_test.cpp +54 -0
- package/submodules/cppjieba/test/testdata/curl.res +1 -0
- package/submodules/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- package/submodules/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- package/submodules/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- package/submodules/cppjieba/test/testdata/load_test.urls +2 -0
- package/submodules/cppjieba/test/testdata/review.100 +100 -0
- package/submodules/cppjieba/test/testdata/review.100.res +200 -0
- package/submodules/cppjieba/test/testdata/server.conf +19 -0
- package/submodules/cppjieba/test/testdata/testlines.gbk +9 -0
- package/submodules/cppjieba/test/testdata/testlines.utf8 +8 -0
- package/submodules/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- package/submodules/cppjieba/test/testdata/userdict.english +2 -0
- package/submodules/cppjieba/test/testdata/userdict.utf8 +8 -0
- package/submodules/cppjieba/test/testdata/weicheng.utf8 +247 -0
- package/submodules/cppjieba/test/unittest/CMakeLists.txt +33 -0
- package/submodules/cppjieba/test/unittest/gtest_main.cpp +39 -0
- package/submodules/cppjieba/test/unittest/jieba_test.cpp +166 -0
- package/submodules/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- package/submodules/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- package/submodules/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- package/submodules/cppjieba/test/unittest/segments_test.cpp +256 -0
- package/submodules/cppjieba/test/unittest/textrank_test.cpp +86 -0
- package/submodules/cppjieba/test/unittest/trie_test.cpp +177 -0
- package/submodules/cppjieba/test/unittest/unicode_test.cpp +43 -0
- package/test/debug_split +0 -0
- package/test/debug_split2 +0 -0
- package/test/debug_split3 +0 -0
- package/test/load_dict_test.js +14 -0
- package/test/missing_binding_test.js +42 -0
- package/test/test.js +366 -0
- package/test/testdata/userdict.utf8 +1 -0
- package/tsconfig.json +59 -0
- package/types/index.d.ts +30 -0
- package/typescript_demo.ts +38 -0
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
#ifndef CPPJIEBA_DICT_TRIE_HPP
#define CPPJIEBA_DICT_TRIE_HPP

#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <cstring>
#include <cstdlib>
#include <cstdint>
#include <cctype>
#include <cassert>
#include <cmath>
#include <limits>
#include <algorithm>
#include "limonp/StringUtil.hpp"
#include "limonp/Logging.hpp"
#include "Unicode.hpp"
#include "Trie.hpp"

namespace cppjieba {

using namespace limonp;

const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
const size_t DICT_COLUMN_NUM = 3;
const char* const UNKNOWN_TAG = "";

// DictTrie loads the system dictionary and optional user dictionaries
// ("|" or ";" separated paths) into a trie and provides the lookups used
// by the segmenters. Dictionary words may contain embedded spaces; for
// such words a space-stripped variant is also indexed so both forms match.
class DictTrie {
 public:
  // How to weight user-dict words that carry no explicit frequency.
  enum UserWordWeightOption {
    WordWeightMin,
    WordWeightMedian,
    WordWeightMax,
  }; // enum UserWordWeightOption

  DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
    Init(dict_path, user_dict_paths, user_word_weight_opt);
  }

  ~DictTrie() {
    delete trie_;
  }

  // trie_ is owned through a raw pointer; copying would double-delete it,
  // so copy construction/assignment are disabled.
  DictTrie(const DictTrie&) = delete;
  DictTrie& operator=(const DictTrie&) = delete;

  // Inserts a user word at runtime with the default user-word weight.
  // Returns false when the word is not valid UTF-8.
  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
    DictUnit node_info;
    if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
      return false;
    }
    // active_node_infos_ is a deque: pointers into it stay valid on push_back.
    active_node_infos_.push_back(node_info);
    trie_->InsertNode(node_info.word, &active_node_infos_.back());
    return true;
  }

  // Inserts a user word with an explicit frequency; freq == 0 falls back
  // to the default user-word weight.
  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
    DictUnit node_info;
    double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_;
    if (!MakeNodeInfo(node_info, word, weight, tag)) {
      return false;
    }
    active_node_infos_.push_back(node_info);
    trie_->InsertNode(node_info.word, &active_node_infos_.back());
    return true;
  }

  // Removes a word from the trie. Returns false when the word is not
  // valid UTF-8.
  bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
    DictUnit node_info;
    if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
      return false;
    }
    trie_->DeleteNode(node_info.word, &node_info);
    return true;
  }

  // Exact-match lookup over a rune range; NULL when absent.
  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
    return trie_->Find(begin, end);
  }

  // Fills res with the DAG of all dictionary matches starting at each rune.
  void Find(RuneStrArray::const_iterator begin,
        RuneStrArray::const_iterator end,
        vector<struct Dag>&res,
        size_t max_word_len = MAX_WORD_LENGTH) const {
    trie_->Find(begin, end, res, max_word_len);
  }

  // Returns true when word (as a whole) is present in the dictionary.
  bool Find(const string& word)
  {
    RuneStrArray runes;
    if (!DecodeUTF8RunesInString(word, runes))
    {
      XLOG(ERROR) << "Decode failed.";
      // Bug fix: previously fell through and searched with a partially
      // decoded rune array; an undecodable word cannot be in the dict.
      return false;
    }
    return Find(runes.begin(), runes.end()) != NULL;
  }

  bool IsUserDictSingleChineseWord(const Rune& word) const {
    return IsIn(user_dict_single_chinese_word_, word);
  }

  double GetMinWeight() const {
    return min_weight_;
  }

  // Parses one user-dict line and adds it to the static dictionary.
  // Accepted layouts (the word itself may contain spaces):
  //   "word"                  -> default weight, empty tag
  //   "word freq"             -> explicit frequency, empty tag
  //   "word tag"              -> default weight, explicit tag
  //   "word... freq tag"      -> penultimate field numeric => frequency
  //   "word... tag"           -> otherwise last field is the tag
  void InserUserDictNode(const string& line) {
    vector<string> buf;
    Split(line, buf, " ");

    string word;
    string tag = UNKNOWN_TAG;
    double weight = user_word_default_weight_;

    if (buf.size() == 1) {
      // Word only: no frequency, no tag.
      word = buf[0];
    } else if (buf.size() == 2) {
      if (IsAllDigits(buf[1])) {
        // "word freq": numeric second field is a frequency.
        int freq = atoi(buf[1].c_str());
        assert(freq_sum_ > 0.0);
        weight = log(1.0 * freq / freq_sum_);
        word = buf[0];
      } else {
        // "word tag".
        word = buf[0];
        tag = buf[1];
      }
    } else if (buf.size() > 2) {
      // Robustness: the explicit > 2 guard avoids buf[buf.size() - 2]
      // underflowing on an empty/blank line.
      if (IsAllDigits(buf[buf.size() - 2])) {
        // "word... freq tag": all fields before the last two join into
        // the (space-containing) word.
        int freq = atoi(buf[buf.size() - 2].c_str());
        assert(freq_sum_ > 0.0);
        weight = log(1.0 * freq / freq_sum_);
        word = JoinWords(buf, buf.size() - 2);
        tag = buf[buf.size() - 1];
      } else {
        // "word... tag": everything before the last field is the word.
        word = JoinWords(buf, buf.size() - 1);
        tag = buf[buf.size() - 1];
      }
    }

    // Index the word as written, then (if it contains spaces) a
    // space-stripped variant so both spellings are matched.
    AddStaticNode(word, weight, tag, true);
    AddNoSpaceVariant(word, weight, tag, true);
  }

  void LoadUserDict(const vector<string>& buf) {
    for (size_t i = 0; i < buf.size(); i++) {
      InserUserDictNode(buf[i]);
    }
  }

  void LoadUserDict(const set<string>& buf) {
    std::set<string>::const_iterator iter;
    for (iter = buf.begin(); iter != buf.end(); iter++){
      InserUserDictNode(*iter);
    }
  }

  // Loads one or more user dictionary files ("|" or ";" separated paths),
  // skipping empty lines.
  void LoadUserDict(const string& filePaths) {
    vector<string> files = limonp::Split(filePaths, "|;");
    for (size_t i = 0; i < files.size(); i++) {
      ifstream ifs(files[i].c_str());
      XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
      string line;

      while(getline(ifs, line)) {
        if (line.size() == 0) {
          continue;
        }
        InserUserDictNode(line);
      }
    }
  }

 private:
  // Loads the main dict, converts raw frequencies to log-probabilities,
  // derives the default user-word weight, then (optionally) merges the
  // user dicts and builds the trie.
  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
    LoadDict(dict_path);
    freq_sum_ = CalcFreqSum(static_node_infos_);
    CalculateWeight(static_node_infos_, freq_sum_);
    SetStaticWordWeights(user_word_weight_opt);

    if (user_dict_paths.size()) {
      LoadUserDict(user_dict_paths);
    }
    Shrink(static_node_infos_);
    CreateTrie(static_node_infos_);
  }

  void CreateTrie(const vector<DictUnit>& dictUnits) {
    assert(dictUnits.size());
    vector<Unicode> words;
    vector<const DictUnit*> valuePointers;
    for (size_t i = 0 ; i < dictUnits.size(); i ++) {
      words.push_back(dictUnits[i].word);
      valuePointers.push_back(&dictUnits[i]);
    }

    trie_ = new Trie(words, valuePointers);
  }

  // Decodes word into node_info.word; false (with a log line identifying
  // the offending word) on invalid UTF-8.
  bool MakeNodeInfo(DictUnit& node_info,
        const string& word,
        double weight,
        const string& tag) {
    if (!DecodeUTF8RunesInString(word, node_info.word)) {
      XLOG(ERROR) << "UTF-8 decode failed for dict word: " << word;
      return false;
    }
    node_info.weight = weight;
    node_info.tag = tag;
    return true;
  }

  // True when s consists solely of ASCII digits (an empty string also
  // passes, matching the original loop's behavior).
  static bool IsAllDigits(const string& s) {
    for (size_t i = 0; i < s.size(); ++i) {
      // Cast: passing a negative char (any UTF-8 continuation byte) to
      // isdigit is undefined behavior.
      if (!isdigit(static_cast<unsigned char>(s[i]))) {
        return false;
      }
    }
    return true;
  }

  // Joins buf[0, end) with single spaces (dictionary words may contain
  // embedded spaces across several split fields).
  static string JoinWords(const vector<string>& buf, size_t end) {
    string word;
    for (size_t i = 0; i < end; ++i) {
      if (i > 0) word += " ";
      word += buf[i];
    }
    return word;
  }

  // Appends (word, weight, tag) to static_node_infos_; when
  // record_single_rune is set, single-rune words are also recorded in
  // user_dict_single_chinese_word_ (user-dict entries only).
  void AddStaticNode(const string& word, double weight, const string& tag, bool record_single_rune) {
    DictUnit node_info;
    MakeNodeInfo(node_info, word, weight, tag);
    static_node_infos_.push_back(node_info);
    if (record_single_rune && node_info.word.size() == 1) {
      user_dict_single_chinese_word_.insert(node_info.word[0]);
    }
  }

  // If word contains spaces, additionally indexes the variant with all
  // spaces removed so the joined spelling is matched too.
  void AddNoSpaceVariant(const string& word, double weight, const string& tag, bool record_single_rune) {
    if (word.find(' ') == string::npos) {
      return;
    }
    string no_space = word;
    no_space.erase(remove(no_space.begin(), no_space.end(), ' '), no_space.end());
    if (!no_space.empty() && no_space != word) {
      AddStaticNode(no_space, weight, tag, record_single_rune);
    }
  }

  // Parses the main dictionary. Each line's last two fields are weight
  // and tag; everything before them joins into the (possibly
  // space-containing) word.
  void LoadDict(const string& filePath) {
    ifstream ifs(filePath.c_str());
    XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
    string line;
    vector<string> buf;

    while (getline(ifs, line)) {
      Split(line, buf, " ");
      if (buf.size() >= DICT_COLUMN_NUM) {
        const string word = JoinWords(buf, buf.size() - 2);
        const double weight = atof(buf[buf.size() - 2].c_str());
        const string tag = buf[buf.size() - 1];

        // Main-dict words never populate user_dict_single_chinese_word_.
        AddStaticNode(word, weight, tag, false);
        AddNoSpaceVariant(word, weight, tag, false);
      } else {
        XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
      }
    }
  }

  static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
    return lhs.weight < rhs.weight;
  }

  // Records min/max/median weights over the loaded dictionary and picks
  // the default weight for user words according to option.
  void SetStaticWordWeights(UserWordWeightOption option) {
    XCHECK(!static_node_infos_.empty());
    vector<DictUnit> x = static_node_infos_;
    sort(x.begin(), x.end(), WeightCompare);
    min_weight_ = x[0].weight;
    max_weight_ = x[x.size() - 1].weight;
    median_weight_ = x[x.size() / 2].weight;
    switch (option) {
    case WordWeightMin:
      user_word_default_weight_ = min_weight_;
      break;
    case WordWeightMedian:
      user_word_default_weight_ = median_weight_;
      break;
    default:
      user_word_default_weight_ = max_weight_;
      break;
    }
  }

  // Sum of raw frequencies (weights are still raw counts at this point).
  double CalcFreqSum(const vector<DictUnit>& node_infos) const {
    double sum = 0.0;
    for (size_t i = 0; i < node_infos.size(); i++) {
      sum += node_infos[i].weight;
    }
    return sum;
  }

  // Converts raw frequencies into log-probabilities: log(freq / sum).
  void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
    assert(sum > 0.0);
    for (size_t i = 0; i < node_infos.size(); i++) {
      DictUnit& node_info = node_infos[i];
      assert(node_info.weight > 0.0);
      node_info.weight = log(double(node_info.weight)/sum);
    }
  }

  // Shrink-to-fit: drops excess capacity before pointers into the vector
  // are handed to the trie (the vector must not reallocate afterwards).
  void Shrink(vector<DictUnit>& units) const {
    vector<DictUnit>(units.begin(), units.end()).swap(units);
  }

  vector<DictUnit> static_node_infos_;  // dict-file entries; trie_ points into this
  deque<DictUnit> active_node_infos_;   // runtime inserts; must not be vector (pointer stability)
  Trie* trie_ = nullptr;

  double freq_sum_;
  double min_weight_;
  double max_weight_;
  double median_weight_;
  double user_word_default_weight_;
  unordered_set<Rune> user_dict_single_chinese_word_;
};
}

#endif
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
#ifndef CPPJIEBA_FULLSEGMENT_H
#define CPPJIEBA_FULLSEGMENT_H

#include <algorithm>
#include <set>
#include <cassert>
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "Unicode.hpp"

namespace cppjieba {
// Full-mode segmenter: emits every dictionary word found in the input,
// including overlapping matches (jieba's "cut all" behavior).
class FullSegment: public SegmentBase {
 public:
  // Builds and OWNS a DictTrie loaded from dictPath (freed in the dtor).
  FullSegment(const string& dictPath) {
    dictTrie_ = new DictTrie(dictPath);
    isNeedDestroy_ = true;
  }
  // Borrows an externally owned DictTrie; it is NOT freed on destruction.
  FullSegment(const DictTrie* dictTrie)
    : dictTrie_(dictTrie), isNeedDestroy_(false) {
    assert(dictTrie_);
  }
  // NOTE(review): the class owns dictTrie_ via a raw pointer but copy
  // operations are not disabled — copying a FullSegment built from a path
  // would double-delete the trie. Confirm copies never happen in practice.
  ~FullSegment() {
    if (isNeedDestroy_) {
      delete dictTrie_;
    }
  }
  // Convenience overload: segments sentence and returns plain strings.
  void Cut(const string& sentence,
        vector<string>& words) const {
    vector<Word> tmp;
    Cut(sentence, tmp);
    GetStringsFromWords(tmp, words);
  }
  // Splits the sentence on separator symbols via PreFilter, runs full-mode
  // segmentation on each run of text, and converts the collected rune
  // ranges back into Words.
  void Cut(const string& sentence,
        vector<Word>& words) const {
    PreFilter pre_filter(symbols_, sentence);
    PreFilter::Range range;
    vector<WordRange> wrs;
    wrs.reserve(sentence.size()/2);
    while (pre_filter.HasNext()) {
      range = pre_filter.Next();
      Cut(range.begin, range.end, wrs);
    }
    words.clear();
    words.reserve(wrs.size());
    GetWordsFromWordRanges(sentence, wrs, words);
  }
  // Core full-mode cut over a rune range: builds the match DAG from the
  // trie, then emits every multi-rune dictionary word, plus single runes
  // only where nothing already emitted covers the position.
  void Cut(RuneStrArray::const_iterator begin,
        RuneStrArray::const_iterator end,
        vector<WordRange>& res) const {
    // result of searching in trie tree
    // NOTE(review): tRes appears unused in this function.
    LocalVector<pair<size_t, const DictUnit*> > tRes;

    // max index of res's words
    size_t maxIdx = 0;

    // always equals to (uItr - begin)
    size_t uIdx = 0;

    // tmp variables
    size_t wordLen = 0;
    assert(dictTrie_);
    vector<struct Dag> dags;
    dictTrie_->Find(begin, end, dags);
    for (size_t i = 0; i < dags.size(); i++) {
      for (size_t j = 0; j < dags[i].nexts.size(); j++) {
        size_t nextoffset = dags[i].nexts[j].first;
        assert(nextoffset < dags.size());
        const DictUnit* du = dags[i].nexts[j].second;
        if (du == NULL) {
          // No dictionary entry for this edge: keep the lone rune only if
          // it is the sole candidate here and not already covered by a
          // previously emitted word.
          if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
            WordRange wr(begin + i, begin + nextoffset);
            res.push_back(wr);
          }
          // NOTE(review): wordLen is NOT reset in this branch, so the
          // maxIdx update below reuses the value from a previous
          // iteration. This matches upstream cppjieba — confirm intended.
        } else {
          wordLen = du->word.size();
          // Emit every word of length >= 2; single-rune dictionary words
          // only when uncovered and the sole candidate (same rule as above).
          if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
            WordRange wr(begin + i, begin + nextoffset);
            res.push_back(wr);
          }
        }
        // Advance coverage watermark past the end of the emitted word.
        maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
      }
      uIdx++;
    }
  }
 private:
  const DictTrie* dictTrie_;  // dictionary trie; owned only when isNeedDestroy_ is true
  bool isNeedDestroy_;        // true when this object allocated dictTrie_
};
}

#endif
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#ifndef CPPJIEBA_HMMMODEL_H
#define CPPJIEBA_HMMMODEL_H

#include "limonp/StringUtil.hpp"
#include "Trie.hpp"

namespace cppjieba {

using namespace limonp;
typedef unordered_map<Rune, double> EmitProbMap;

// Hidden-Markov-Model parameters (start, transition and per-state emission
// probabilities) used for character-based segmentation of OOV text.
struct HMMModel {
  /*
   * STATUS:
   * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
   * */
  enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};

  HMMModel(const string& modelPath) {
    memset(startProb, 0, sizeof(startProb));
    memset(transProb, 0, sizeof(transProb));
    statMap[0] = 'B';
    statMap[1] = 'E';
    statMap[2] = 'M';
    statMap[3] = 'S';
    // Emission tables are registered in state order (B, E, M, S) before
    // LoadModel fills them from the file.
    emitProbVec.push_back(&emitProbB);
    emitProbVec.push_back(&emitProbE);
    emitProbVec.push_back(&emitProbM);
    emitProbVec.push_back(&emitProbS);
    LoadModel(modelPath);
  }
  ~HMMModel() {
  }
  // Reads the model file: one start-probability row, STATUS_SUM transition
  // rows, then one emission table per state, skipping blanks and '#' lines.
  void LoadModel(const string& filePath) {
    ifstream ifile(filePath.c_str());
    XCHECK(ifile.is_open()) << "open " << filePath << " failed";
    string line;
    vector<string> fields;

    // Start probabilities: a single space-separated row.
    XCHECK(GetLine(ifile, line));
    Split(line, fields, " ");
    XCHECK(fields.size() == STATUS_SUM);
    for (size_t col = 0; col < fields.size(); col++) {
      startProb[col] = atof(fields[col].c_str());
    }

    // Transition matrix: one row per source state.
    for (size_t row = 0; row < STATUS_SUM; row++) {
      XCHECK(GetLine(ifile, line));
      Split(line, fields, " ");
      XCHECK(fields.size() == STATUS_SUM);
      for (size_t col = 0; col < STATUS_SUM; col++) {
        transProb[row][col] = atof(fields[col].c_str());
      }
    }

    // Emission tables, in the fixed order registered by the constructor
    // (B, E, M, S).
    for (size_t s = 0; s < emitProbVec.size(); s++) {
      XCHECK(GetLine(ifile, line));
      XCHECK(LoadEmitProb(line, *emitProbVec[s]));
    }
  }
  // Looks up key's emission probability in ptMp; defVal when absent.
  double GetEmitProb(const EmitProbMap* ptMp, Rune key,
        double defVal)const {
    EmitProbMap::const_iterator pos = ptMp->find(key);
    return pos == ptMp->end() ? defVal : pos->second;
  }
  // Fetches the next meaningful line: trimmed, non-empty, not a '#'
  // comment. Returns false at end of file.
  bool GetLine(ifstream& ifile, string& line) {
    while (getline(ifile, line)) {
      Trim(line);
      if (!line.empty() && !StartsWith(line, "#")) {
        return true;
      }
    }
    return false;
  }
  // Parses one emission row of the form "rune:prob,rune:prob,..." into mp.
  // Returns false on an empty line or any malformed entry.
  bool LoadEmitProb(const string& line, EmitProbMap& mp) {
    if (line.empty()) {
      return false;
    }
    vector<string> entries;
    vector<string> kv;
    Unicode rune_buf;
    Split(line, entries, ",");
    for (size_t i = 0; i < entries.size(); i++) {
      Split(entries[i], kv, ":");
      if (2 != kv.size()) {
        XLOG(ERROR) << "emitProb illegal.";
        return false;
      }
      // The key must decode to exactly one rune.
      if (!DecodeUTF8RunesInString(kv[0], rune_buf) || rune_buf.size() != 1) {
        XLOG(ERROR) << "TransCode failed.";
        return false;
      }
      mp[rune_buf[0]] = atof(kv[1].c_str());
    }
    return true;
  }

  char statMap[STATUS_SUM];                  // state index -> display letter
  double startProb[STATUS_SUM];              // P(state) at sequence start
  double transProb[STATUS_SUM][STATUS_SUM];  // P(next state | state)
  EmitProbMap emitProbB;
  EmitProbMap emitProbE;
  EmitProbMap emitProbM;
  EmitProbMap emitProbS;
  vector<EmitProbMap* > emitProbVec;         // [B, E, M, S], set in ctor
}; // struct HMMModel

} // namespace cppjieba

#endif
|