nodejieba-plus 3.5.13 → 3.5.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -93
- package/analyze_weight.js +57 -0
- package/build/Release/nodejieba.node +0 -0
- package/diagnose_priority.js +71 -0
- package/index.js +9 -1
- package/lib/nodejieba.cpp +145 -13
- package/lib/nodejieba.h +1 -0
- package/package.json +1 -1
- package/submodules/cppjieba/include/cppjieba/DictTrie.hpp +169 -30
- package/submodules/cppjieba/include/cppjieba/Jieba.hpp +8 -0
- package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp +29 -8
- package/submodules/cppjieba/include/cppjieba/SegmentBase.hpp +1 -1
- package/submodules/cppjieba/include/cppjieba/Trie.hpp +10 -13
- package/submodules/cppjieba/include/cppjieba/Unicode.hpp +52 -0
- package/test/load_user_dict_test.js +48 -4
- package/test_1_3x_weight.js +86 -0
- package/test_assertion_fix.js +60 -0
- package/test_idf_feature.js +43 -0
- package/test_open_claw.js +65 -0
- package/test_simple.js +17 -0
- package/test_space_keyword.js +66 -0
- package/types/index.d.ts +1 -0
package/lib/nodejieba.cpp
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
#include "cppjieba/TextRankExtractor.hpp"
|
|
8
8
|
|
|
9
9
|
#include <sstream>
|
|
10
|
+
#include <cctype>
|
|
10
11
|
|
|
11
12
|
NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
|
|
12
13
|
DefineAddon(exports, {
|
|
@@ -20,7 +21,8 @@ NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
|
|
|
20
21
|
InstanceMethod("extract", &NodeJieba::extract),
|
|
21
22
|
InstanceMethod("textRankExtract", &NodeJieba::textRankExtract),
|
|
22
23
|
InstanceMethod("insertWord", &NodeJieba::insertWord),
|
|
23
|
-
InstanceMethod("loadUserDict", &NodeJieba::loadUserDict)
|
|
24
|
+
InstanceMethod("loadUserDict", &NodeJieba::loadUserDict),
|
|
25
|
+
InstanceMethod("setIdf", &NodeJieba::setIdf)
|
|
24
26
|
});
|
|
25
27
|
}
|
|
26
28
|
|
|
@@ -52,6 +54,44 @@ Napi::Value NodeJieba::load(const Napi::CallbackInfo& info) {
|
|
|
52
54
|
return Napi::Boolean::New(info.Env(), true);
|
|
53
55
|
}
|
|
54
56
|
|
|
57
|
+
// JS binding: setIdf(word[, idf[, multiplier]]).
//
// - Returns false when no argument is given or the first argument is not a
//   string.
// - Throws a JS Error (and returns false) when load() has not been called yet.
// - When the second argument is a number, that value is stored as the word's
//   IDF and the third argument is ignored.
// - Otherwise the word's IDF is scaled by the third argument when it is a
//   number, or by 2.0 when it is absent or not a number.
// Returns true once one of the two setters has been invoked.
Napi::Value NodeJieba::setIdf(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();

  if (info.Length() == 0) {
    return Napi::Boolean::New(env, false);
  }

  if (_jieba_handle == nullptr) {
    Napi::Error::New(env, "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
    return Napi::Boolean::New(env, false);
  }

  if (!info[0].IsString()) {
    return Napi::Boolean::New(env, false);
  }
  const std::string word = info[0].As<Napi::String>().Utf8Value();

  // An explicit numeric IDF takes precedence over any multiplier.
  const bool hasExplicitIdf = info.Length() >= 2 && info[1].IsNumber();
  if (hasExplicitIdf) {
    const double idf = info[1].As<Napi::Number>().DoubleValue();
    _jieba_handle->SetIdfForWord(word, idf);
    return Napi::Boolean::New(env, true);
  }

  const bool hasMultiplier = info.Length() >= 3 && info[2].IsNumber();
  const double multiplier =
      hasMultiplier ? info[2].As<Napi::Number>().DoubleValue() : 2.0;

  _jieba_handle->SetIdfForWordWithMultiplier(word, multiplier);
  return Napi::Boolean::New(env, true);
}
|
|
94
|
+
|
|
55
95
|
Napi::Value NodeJieba::insertWord(const Napi::CallbackInfo& info) {
|
|
56
96
|
if(info.Length() < 1) {
|
|
57
97
|
return Napi::Boolean::New(info.Env(), false);
|
|
@@ -229,43 +269,135 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
|
|
|
229
269
|
Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
|
|
230
270
|
}
|
|
231
271
|
|
|
232
|
-
|
|
272
|
+
// True when `str` is empty or consists solely of characters for which
// std::isspace() is true.
auto isBlankString = [](const std::string& str) -> bool {
  std::string::size_type idx = 0;
  // Cast to unsigned char before calling isspace, as the <cctype> functions
  // require a value representable as unsigned char.
  while (idx < str.size() && std::isspace(static_cast<unsigned char>(str[idx]))) {
    ++idx;
  }
  return idx == str.size();
};
|
|
280
|
+
|
|
281
|
+
// Strips leading and trailing whitespace (per std::isspace) from `str` in
// place. A string made only of whitespace becomes empty.
auto trimString = [](std::string& str) -> void {
  std::string::size_type first = 0;
  std::string::size_type last = str.size();

  for (; first < last; ++first) {
    if (!std::isspace(static_cast<unsigned char>(str[first]))) {
      break;
    }
  }
  for (; last > first; --last) {
    if (!std::isspace(static_cast<unsigned char>(str[last - 1]))) {
      break;
    }
  }

  // Drop the tail first so that `first` still indexes the same characters.
  str.erase(last);
  str.erase(0, first);
};
|
|
295
|
+
|
|
296
|
+
// Returns the keyword that a user-dictionary line registers, so that the IDF
// boost is applied to the same word the dictionary loader inserts.
//
// `line` is expected to be already trimmed by the caller. The rules mirror
// the per-line parsing in cppjieba's DictTrie:
//   - no tokens                         -> ""
//   - "word"                            -> "word"
//   - "word 100"  (positive frequency)  -> "word"
//   - "open claw" (2nd token not > 0)   -> the whole line ("open claw")
//   - "open claw 100 n" (second-to-last token all digits)
//                                       -> every token except the last two,
//                                          joined with single spaces
//   - any other line with >= 3 tokens   -> the whole line
//
// NOTE(review): tokens are split on any whitespace here, whereas DictTrie
// splits on ' ' only; lines with tabs or repeated spaces may still be
// interpreted differently by the two parsers -- confirm if that matters.
auto extractKeyword = [](const std::string& line) -> std::string {
  std::istringstream iss(line);
  std::vector<std::string> parts;
  for (std::string part; iss >> part;) {
    parts.push_back(part);
  }

  if (parts.empty()) {
    return std::string();
  }
  if (parts.size() == 1) {
    return parts[0];
  }

  if (parts.size() == 2) {
    // The second token is a frequency only when it parses to a positive
    // integer; otherwise the line is a keyword that contains a space.
    int freq = 0;
    try {
      freq = std::stoi(parts[1]);
    } catch (...) {
      // Deliberate: a non-numeric or out-of-range token means "not a frequency".
      freq = 0;
    }
    return freq > 0 ? parts[0] : line;
  }

  // Three or more tokens: "<word...> <freq> <tag>" only when the
  // second-to-last token consists solely of digits.
  const std::string& freqField = parts[parts.size() - 2];
  for (char c : freqField) {
    if (!std::isdigit(static_cast<unsigned char>(c))) {
      return line;
    }
  }

  std::string keyword;
  for (size_t i = 0; i + 2 < parts.size(); ++i) {
    if (i > 0) {
      keyword += " ";
    }
    keyword += parts[i];
  }
  return keyword;
};
|
|
350
|
+
|
|
351
|
+
auto setDefaultIdf = [&](const std::vector<std::string>& dictLines) {
|
|
352
|
+
for (const auto& line : dictLines) {
|
|
353
|
+
std::string keyword = extractKeyword(line);
|
|
354
|
+
if (!keyword.empty()) {
|
|
355
|
+
_jieba_handle->SetIdfForWordWithMultiplier(keyword, 1.3);
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
};
|
|
359
|
+
|
|
233
360
|
if (info[0].IsArray()) {
|
|
234
361
|
Napi::Array arr = info[0].As<Napi::Array>();
|
|
235
362
|
std::vector<std::string> buf;
|
|
236
363
|
for (size_t i = 0; i < arr.Length(); i++) {
|
|
237
364
|
Napi::Value val = arr[i];
|
|
238
|
-
if (val.IsString()) {
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
365
|
+
if (!val.IsString()) {
|
|
366
|
+
Napi::TypeError::New(info.Env(), "Array elements must be strings")
|
|
367
|
+
.ThrowAsJavaScriptException();
|
|
368
|
+
return Napi::Boolean::New(info.Env(), false);
|
|
369
|
+
}
|
|
370
|
+
std::string line = val.As<Napi::String>().Utf8Value();
|
|
371
|
+
trimString(line);
|
|
372
|
+
if (!line.empty() && !isBlankString(line)) {
|
|
373
|
+
buf.push_back(line);
|
|
244
374
|
}
|
|
245
375
|
}
|
|
246
376
|
_jieba_handle->LoadUserDict(buf);
|
|
377
|
+
setDefaultIdf(buf);
|
|
247
378
|
} else if (info[0].IsString()) {
|
|
248
|
-
// 支持传入单个词典条目字符串
|
|
249
379
|
std::string line = info[0].As<Napi::String>().Utf8Value();
|
|
380
|
+
trimString(line);
|
|
250
381
|
std::vector<std::string> buf;
|
|
251
|
-
|
|
252
|
-
if (!line.empty()) {
|
|
382
|
+
if (!line.empty() && !isBlankString(line)) {
|
|
253
383
|
buf.push_back(line);
|
|
254
384
|
_jieba_handle->LoadUserDict(buf);
|
|
385
|
+
setDefaultIdf(buf);
|
|
255
386
|
}
|
|
256
387
|
} else if (info[0].IsBuffer()) {
|
|
257
|
-
// 支持传入 Buffer,将其转换为字符串并按行分割
|
|
258
388
|
Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
|
|
259
389
|
std::string content(buffer.Data(), buffer.Length());
|
|
260
390
|
std::vector<std::string> buf;
|
|
261
391
|
std::istringstream iss(content);
|
|
262
392
|
std::string line;
|
|
263
393
|
while (std::getline(iss, line)) {
|
|
264
|
-
|
|
394
|
+
trimString(line);
|
|
395
|
+
if (!line.empty() && !isBlankString(line)) {
|
|
265
396
|
buf.push_back(line);
|
|
266
397
|
}
|
|
267
398
|
}
|
|
268
399
|
_jieba_handle->LoadUserDict(buf);
|
|
400
|
+
setDefaultIdf(buf);
|
|
269
401
|
} else {
|
|
270
402
|
return Napi::Boolean::New(info.Env(), false);
|
|
271
403
|
}
|
package/lib/nodejieba.h
CHANGED
|
@@ -21,6 +21,7 @@ private:
|
|
|
21
21
|
Napi::Value textRankExtract(const Napi::CallbackInfo& info);
|
|
22
22
|
Napi::Value insertWord(const Napi::CallbackInfo& info);
|
|
23
23
|
Napi::Value loadUserDict(const Napi::CallbackInfo& info);
|
|
24
|
+
Napi::Value setIdf(const Napi::CallbackInfo& info);
|
|
24
25
|
|
|
25
26
|
cppjieba::Jieba* _jieba_handle{nullptr};
|
|
26
27
|
cppjieba::TextRankExtractor* _text_rank_extractor_handle{nullptr};
|
package/package.json
CHANGED
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
#include <stdint.h>
|
|
11
11
|
#include <cmath>
|
|
12
12
|
#include <limits>
|
|
13
|
+
#include <algorithm>
|
|
13
14
|
#include "limonp/StringUtil.hpp"
|
|
14
15
|
#include "limonp/Logging.hpp"
|
|
15
16
|
#include "Unicode.hpp"
|
|
@@ -32,7 +33,7 @@ class DictTrie {
|
|
|
32
33
|
WordWeightMax,
|
|
33
34
|
}; // enum UserWordWeightOption
|
|
34
35
|
|
|
35
|
-
DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
|
36
|
+
DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) : trie_(NULL) {
|
|
36
37
|
Init(dict_path, user_dict_paths, user_word_weight_opt);
|
|
37
38
|
}
|
|
38
39
|
|
|
@@ -41,23 +42,84 @@ class DictTrie {
|
|
|
41
42
|
}
|
|
42
43
|
|
|
43
44
|
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
45
|
+
std::set<string> insertedWords;
|
|
46
|
+
insertedWords.insert(word);
|
|
47
|
+
|
|
48
|
+
bool hasSpace = (word.find(' ') != string::npos);
|
|
49
|
+
if (hasSpace) {
|
|
50
|
+
string wordNoSpace = word;
|
|
51
|
+
wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
|
|
52
|
+
if (!wordNoSpace.empty() && wordNoSpace != word) {
|
|
53
|
+
insertedWords.insert(wordNoSpace);
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
string wordLower = ToLowerString(word);
|
|
58
|
+
if (wordLower != word) {
|
|
59
|
+
insertedWords.insert(wordLower);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
if (hasSpace) {
|
|
63
|
+
string wordNoSpace = word;
|
|
64
|
+
wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
|
|
65
|
+
if (!wordNoSpace.empty()) {
|
|
66
|
+
string wordNoSpaceLower = ToLowerString(wordNoSpace);
|
|
67
|
+
if (wordNoSpaceLower != wordNoSpace) {
|
|
68
|
+
insertedWords.insert(wordNoSpaceLower);
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
|
|
74
|
+
DictUnit node_info;
|
|
75
|
+
if (!MakeNodeInfo(node_info, *it, user_word_default_weight_, tag)) {
|
|
76
|
+
continue;
|
|
77
|
+
}
|
|
78
|
+
active_node_infos_.push_back(node_info);
|
|
79
|
+
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
|
47
80
|
}
|
|
48
|
-
active_node_infos_.push_back(node_info);
|
|
49
|
-
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
|
50
81
|
return true;
|
|
51
82
|
}
|
|
52
83
|
|
|
53
84
|
bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
85
|
+
double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_;
|
|
86
|
+
|
|
87
|
+
std::set<string> insertedWords;
|
|
88
|
+
insertedWords.insert(word);
|
|
89
|
+
|
|
90
|
+
bool hasSpace = (word.find(' ') != string::npos);
|
|
91
|
+
if (hasSpace) {
|
|
92
|
+
string wordNoSpace = word;
|
|
93
|
+
wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
|
|
94
|
+
if (!wordNoSpace.empty() && wordNoSpace != word) {
|
|
95
|
+
insertedWords.insert(wordNoSpace);
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
string wordLower = ToLowerString(word);
|
|
100
|
+
if (wordLower != word) {
|
|
101
|
+
insertedWords.insert(wordLower);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
if (hasSpace) {
|
|
105
|
+
string wordNoSpace = word;
|
|
106
|
+
wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
|
|
107
|
+
if (!wordNoSpace.empty()) {
|
|
108
|
+
string wordNoSpaceLower = ToLowerString(wordNoSpace);
|
|
109
|
+
if (wordNoSpaceLower != wordNoSpace) {
|
|
110
|
+
insertedWords.insert(wordNoSpaceLower);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
|
|
116
|
+
DictUnit node_info;
|
|
117
|
+
if (!MakeNodeInfo(node_info, *it, weight, tag)) {
|
|
118
|
+
continue;
|
|
119
|
+
}
|
|
120
|
+
active_node_infos_.push_back(node_info);
|
|
121
|
+
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
|
58
122
|
}
|
|
59
|
-
active_node_infos_.push_back(node_info);
|
|
60
|
-
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
|
61
123
|
return true;
|
|
62
124
|
}
|
|
63
125
|
|
|
@@ -112,26 +174,93 @@ class DictTrie {
|
|
|
112
174
|
vector<string> buf;
|
|
113
175
|
DictUnit node_info;
|
|
114
176
|
Split(line, buf, " ");
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
177
|
+
|
|
178
|
+
string word;
|
|
179
|
+
string tag = UNKNOWN_TAG;
|
|
180
|
+
double weight = user_word_default_weight_;
|
|
181
|
+
bool hasSpace = false;
|
|
182
|
+
|
|
183
|
+
if (buf.size() == 1) {
|
|
184
|
+
word = buf[0];
|
|
185
|
+
} else if (buf.size() == 2) {
|
|
186
|
+
int freq = atoi(buf[1].c_str());
|
|
187
|
+
if (freq > 0) {
|
|
188
|
+
assert(freq_sum_ > 0.0);
|
|
189
|
+
weight = log(1.0 * freq / freq_sum_);
|
|
190
|
+
word = buf[0];
|
|
191
|
+
} else {
|
|
192
|
+
word = line;
|
|
193
|
+
}
|
|
194
|
+
} else if (buf.size() >= 3) {
|
|
195
|
+
bool isFreq = true;
|
|
196
|
+
for (char c : buf[buf.size() - 2]) {
|
|
197
|
+
if (!isdigit(c)) {
|
|
198
|
+
isFreq = false;
|
|
199
|
+
break;
|
|
130
200
|
}
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
if (isFreq) {
|
|
204
|
+
int freq = atoi(buf[buf.size() - 2].c_str());
|
|
205
|
+
assert(freq_sum_ > 0.0);
|
|
206
|
+
weight = log(1.0 * freq / freq_sum_);
|
|
207
|
+
for (size_t i = 0; i < buf.size() - 2; ++i) {
|
|
208
|
+
if (i > 0) word += " ";
|
|
209
|
+
word += buf[i];
|
|
210
|
+
}
|
|
211
|
+
tag = buf[buf.size() - 1];
|
|
212
|
+
} else {
|
|
213
|
+
word = line;
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
hasSpace = (word.find(' ') != string::npos);
|
|
218
|
+
|
|
219
|
+
std::set<string> insertedWords;
|
|
220
|
+
|
|
221
|
+
insertedWords.insert(word);
|
|
222
|
+
|
|
223
|
+
if (hasSpace) {
|
|
224
|
+
string wordNoSpace = word;
|
|
225
|
+
wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
|
|
226
|
+
if (!wordNoSpace.empty() && wordNoSpace != word) {
|
|
227
|
+
insertedWords.insert(wordNoSpace);
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
string wordLower = ToLowerString(word);
|
|
232
|
+
if (wordLower != word) {
|
|
233
|
+
insertedWords.insert(wordLower);
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
if (hasSpace) {
|
|
237
|
+
string wordNoSpace = word;
|
|
238
|
+
wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
|
|
239
|
+
if (!wordNoSpace.empty()) {
|
|
240
|
+
string wordNoSpaceLower = ToLowerString(wordNoSpace);
|
|
241
|
+
if (wordNoSpaceLower != wordNoSpace) {
|
|
242
|
+
insertedWords.insert(wordNoSpaceLower);
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
|
|
248
|
+
DictUnit temp_node_info;
|
|
249
|
+
if (MakeNodeInfo(temp_node_info, *it, weight, tag)) {
|
|
250
|
+
if (trie_) {
|
|
251
|
+
active_node_infos_.push_back(temp_node_info);
|
|
252
|
+
trie_->InsertNode(active_node_infos_.back().word, &active_node_infos_.back());
|
|
253
|
+
if (active_node_infos_.back().word.size() == 1) {
|
|
254
|
+
user_dict_single_chinese_word_.insert(active_node_infos_.back().word[0]);
|
|
255
|
+
}
|
|
256
|
+
} else {
|
|
257
|
+
static_node_infos_.push_back(temp_node_info);
|
|
258
|
+
if (temp_node_info.word.size() == 1) {
|
|
259
|
+
user_dict_single_chinese_word_.insert(temp_node_info.word[0]);
|
|
260
|
+
}
|
|
134
261
|
}
|
|
262
|
+
}
|
|
263
|
+
}
|
|
135
264
|
}
|
|
136
265
|
|
|
137
266
|
void LoadUserDict(const vector<string>& buf) {
|
|
@@ -206,6 +335,16 @@ class DictTrie {
|
|
|
206
335
|
return true;
|
|
207
336
|
}
|
|
208
337
|
|
|
338
|
+
// Overload for a word that is already decoded into runes: copies the word,
// weight and tag into `node_info`. No decoding takes place here, so this
// overload always returns true.
bool MakeNodeInfo(DictUnit& node_info,
      const Unicode& word,
      double weight,
      const string& tag) {
  node_info.tag = tag;
  node_info.weight = weight;
  node_info.word = word;
  return true;
}
|
|
347
|
+
|
|
209
348
|
void LoadDict(const string& filePath) {
|
|
210
349
|
ifstream ifs(filePath.c_str());
|
|
211
350
|
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
|
@@ -116,6 +116,14 @@ class Jieba {
|
|
|
116
116
|
dict_trie_.LoadUserDict(path);
|
|
117
117
|
}
|
|
118
118
|
|
|
119
|
+
// Sets the IDF of `word` to `idf` by forwarding to the keyword extractor.
void SetIdfForWord(const string& word, double idf) {
  extractor.SetIdf(word, idf);
}
|
|
122
|
+
|
|
123
|
+
// Scales the IDF of `word` by `multiplier` (2.0 when omitted) by forwarding
// to the keyword extractor.
void SetIdfForWordWithMultiplier(const string& word, double multiplier = 2.0) {
  extractor.SetIdfWithMultiplier(word, multiplier);
}
|
|
126
|
+
|
|
119
127
|
private:
|
|
120
128
|
static string pathJoin(const string& dir, const string& filename) {
|
|
121
129
|
if (dir.empty()) {
|
|
@@ -39,6 +39,19 @@ class KeywordExtractor {
|
|
|
39
39
|
~KeywordExtractor() {
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
+
// Stores `idf` as the IDF of `word`, inserting the entry or overwriting an
// existing one.
void SetIdf(const string& word, double idf) {
  idfMap_[word] = idf;
}
|
|
45
|
+
|
|
46
|
+
void SetIdfWithMultiplier(const string& word, double multiplier = 2.0) {
|
|
47
|
+
unordered_map<string, double>::const_iterator cit = idfMap_.find(word);
|
|
48
|
+
if (cit != idfMap_.end()) {
|
|
49
|
+
idfMap_[word] = cit->second * multiplier;
|
|
50
|
+
} else {
|
|
51
|
+
idfMap_[word] = idfAverage_ * multiplier;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
42
55
|
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
|
43
56
|
vector<Word> topWords;
|
|
44
57
|
Extract(sentence, topWords, topN);
|
|
@@ -96,25 +109,33 @@ class KeywordExtractor {
|
|
|
96
109
|
ifstream ifs(idfPath.c_str());
|
|
97
110
|
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
|
|
98
111
|
string line ;
|
|
99
|
-
vector<string> buf;
|
|
100
112
|
double idf = 0.0;
|
|
101
113
|
double idfSum = 0.0;
|
|
102
114
|
size_t lineno = 0;
|
|
103
115
|
for (; getline(ifs, line); lineno++) {
|
|
104
|
-
buf.clear();
|
|
105
116
|
if (line.empty()) {
|
|
106
117
|
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
|
107
118
|
continue;
|
|
108
119
|
}
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
120
|
+
|
|
121
|
+
size_t lastSpace = line.find_last_of(" \t");
|
|
122
|
+
if (lastSpace == string::npos) {
|
|
123
|
+
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " format error. skipped.";
|
|
124
|
+
continue;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
string word = line.substr(0, lastSpace);
|
|
128
|
+
string idfStr = line.substr(lastSpace + 1);
|
|
129
|
+
|
|
130
|
+
char* endptr;
|
|
131
|
+
idf = strtod(idfStr.c_str(), &endptr);
|
|
132
|
+
if (endptr == idfStr.c_str()) {
|
|
133
|
+
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " idf format error. skipped.";
|
|
112
134
|
continue;
|
|
113
135
|
}
|
|
114
|
-
|
|
115
|
-
idfMap_[
|
|
136
|
+
|
|
137
|
+
idfMap_[word] = idf;
|
|
116
138
|
idfSum += idf;
|
|
117
|
-
|
|
118
139
|
}
|
|
119
140
|
|
|
120
141
|
assert(lineno);
|
|
@@ -69,7 +69,8 @@ class Trie {
|
|
|
69
69
|
if (NULL == ptNode->next) {
|
|
70
70
|
return NULL;
|
|
71
71
|
}
|
|
72
|
-
|
|
72
|
+
Rune searchRune = ToLowerRune(it->rune);
|
|
73
|
+
citer = ptNode->next->find(searchRune);
|
|
73
74
|
if (ptNode->next->end() == citer) {
|
|
74
75
|
return NULL;
|
|
75
76
|
}
|
|
@@ -90,7 +91,7 @@ class Trie {
|
|
|
90
91
|
for (size_t i = 0; i < size_t(end - begin); i++) {
|
|
91
92
|
res[i].runestr = *(begin + i);
|
|
92
93
|
|
|
93
|
-
if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
|
|
94
|
+
if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(ToLowerRune(res[i].runestr.rune)))) {
|
|
94
95
|
ptNode = citer->second;
|
|
95
96
|
} else {
|
|
96
97
|
ptNode = NULL;
|
|
@@ -105,7 +106,7 @@ class Trie {
|
|
|
105
106
|
if (ptNode == NULL || ptNode->next == NULL) {
|
|
106
107
|
break;
|
|
107
108
|
}
|
|
108
|
-
citer = ptNode->next->find((begin + j)->rune);
|
|
109
|
+
citer = ptNode->next->find(ToLowerRune((begin + j)->rune));
|
|
109
110
|
if (ptNode->next->end() == citer) {
|
|
110
111
|
break;
|
|
111
112
|
}
|
|
@@ -128,11 +129,12 @@ class Trie {
|
|
|
128
129
|
if (NULL == ptNode->next) {
|
|
129
130
|
ptNode->next = new TrieNode::NextMap;
|
|
130
131
|
}
|
|
131
|
-
|
|
132
|
+
Rune insertRune = ToLowerRune(*citer);
|
|
133
|
+
kmIter = ptNode->next->find(insertRune);
|
|
132
134
|
if (ptNode->next->end() == kmIter) {
|
|
133
135
|
TrieNode *nextNode = new TrieNode;
|
|
134
136
|
|
|
135
|
-
ptNode->next->insert(make_pair(
|
|
137
|
+
ptNode->next->insert(make_pair(insertRune, nextNode));
|
|
136
138
|
ptNode = nextNode;
|
|
137
139
|
} else {
|
|
138
140
|
ptNode = kmIter->second;
|
|
@@ -145,23 +147,18 @@ class Trie {
|
|
|
145
147
|
if (key.begin() == key.end()) {
|
|
146
148
|
return;
|
|
147
149
|
}
|
|
148
|
-
//定义一个NextMap迭代器
|
|
149
150
|
TrieNode::NextMap::const_iterator kmIter;
|
|
150
|
-
//定义一个指向root的TrieNode指针
|
|
151
151
|
TrieNode *ptNode = root_;
|
|
152
152
|
for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
|
|
153
|
-
//链表不存在元素
|
|
154
153
|
if (NULL == ptNode->next) {
|
|
155
154
|
return;
|
|
156
155
|
}
|
|
157
|
-
|
|
158
|
-
|
|
156
|
+
Rune deleteRune = ToLowerRune(*citer);
|
|
157
|
+
kmIter = ptNode->next->find(deleteRune);
|
|
159
158
|
if (ptNode->next->end() == kmIter) {
|
|
160
159
|
break;
|
|
161
160
|
}
|
|
162
|
-
|
|
163
|
-
ptNode->next->erase(*citer);
|
|
164
|
-
//删除该node
|
|
161
|
+
ptNode->next->erase(deleteRune);
|
|
165
162
|
ptNode = kmIter->second;
|
|
166
163
|
delete ptNode;
|
|
167
164
|
break;
|
|
@@ -222,6 +222,58 @@ inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs)
|
|
|
222
222
|
}
|
|
223
223
|
}
|
|
224
224
|
|
|
225
|
+
// Maps the ASCII letters 'A'..'Z' to 'a'..'z'; every other rune is returned
// unchanged.
inline Rune ToLowerRune(Rune r) {
  const bool isAsciiUpper = (r >= 'A') && (r <= 'Z');
  if (!isAsciiUpper) {
    return r;
  }
  return r + ('a' - 'A');
}
|
|
231
|
+
|
|
232
|
+
// Maps the ASCII letters 'a'..'z' to 'A'..'Z'; every other rune is returned
// unchanged.
inline Rune ToUpperRune(Rune r) {
  const bool isAsciiLower = (r >= 'a') && (r <= 'z');
  if (!isAsciiLower) {
    return r;
  }
  return r - ('a' - 'A');
}
|
|
238
|
+
|
|
239
|
+
// Returns a copy of `unicode` with ToLowerRune() applied to every rune.
inline Unicode ToLowerUnicode(const Unicode& unicode) {
  Unicode lowered;
  lowered.reserve(unicode.size());
  for (Unicode::const_iterator it = unicode.begin(); it != unicode.end(); ++it) {
    lowered.push_back(ToLowerRune(*it));
  }
  return lowered;
}
|
|
247
|
+
|
|
248
|
+
// Returns a copy of `unicode` with ToUpperRune() applied to every rune.
inline Unicode ToUpperUnicode(const Unicode& unicode) {
  Unicode uppered;
  uppered.reserve(unicode.size());
  for (Unicode::const_iterator it = unicode.begin(); it != unicode.end(); ++it) {
    uppered.push_back(ToUpperRune(*it));
  }
  return uppered;
}
|
|
256
|
+
|
|
257
|
+
// Returns a copy of `s` with the ASCII bytes 'A'..'Z' replaced by 'a'..'z'.
// All other bytes, including the bytes of multi-byte UTF-8 sequences, are
// left untouched.
inline std::string ToLowerString(const std::string& s) {
  std::string lowered(s);
  for (char& ch : lowered) {
    if ('A' <= ch && ch <= 'Z') {
      ch = ch + ('a' - 'A');
    }
  }
  return lowered;
}
|
|
266
|
+
|
|
267
|
+
// Returns a copy of `s` with the ASCII bytes 'a'..'z' replaced by 'A'..'Z'.
// All other bytes, including the bytes of multi-byte UTF-8 sequences, are
// left untouched.
inline std::string ToUpperString(const std::string& s) {
  std::string uppered(s);
  for (char& ch : uppered) {
    if ('a' <= ch && ch <= 'z') {
      ch = ch - ('a' - 'A');
    }
  }
  return uppered;
}
|
|
276
|
+
|
|
225
277
|
} // namespace cppjieba
|
|
226
278
|
|
|
227
279
|
#endif // CPPJIEBA_UNICODE_H
|