nodejieba-plus 3.5.11 → 3.5.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -3
- package/build/Release/nodejieba.node +0 -0
- package/index.js +17 -0
- package/lib/nodejieba.cpp +11 -4
- package/package.json +1 -1
- package/submodules/cppjieba/include/cppjieba/DictTrie.hpp +24 -120
- package/submodules/cppjieba/include/cppjieba/SegmentBase.hpp +1 -3
- package/submodules/cppjieba/include/cppjieba/Unicode.hpp +0 -4
- package/test/load_user_dict_test.js +32 -0
- package/types/index.d.ts +1 -1
- package/test_open_claw.js +0 -65
package/README.md
CHANGED
|
@@ -191,7 +191,7 @@ console.log(nodejieba.cut("男默女泪"));
|
|
|
191
191
|
|
|
192
192
|
### 批量加载用户词典(新功能)
|
|
193
193
|
|
|
194
|
-
|
|
194
|
+
支持通过字符串数组、Set、单个字符串或 Buffer 批量加载用户词典:
|
|
195
195
|
|
|
196
196
|
```js
|
|
197
197
|
var nodejieba = require("nodejieba");
|
|
@@ -200,10 +200,18 @@ nodejieba.load();
|
|
|
200
200
|
// 方式1:使用字符串数组
|
|
201
201
|
nodejieba.loadUserDict(["云计算", "人工智能 1000 nz", "大数据"]);
|
|
202
202
|
|
|
203
|
-
// 方式2
|
|
203
|
+
// 方式2:使用 Set 集合(自动去重)
|
|
204
|
+
const dictSet = new Set();
|
|
205
|
+
dictSet.add("云计算");
|
|
206
|
+
dictSet.add("人工智能 1000 nz");
|
|
207
|
+
dictSet.add("大数据");
|
|
208
|
+
dictSet.add("云计算"); // 重复添加会被自动去重
|
|
209
|
+
nodejieba.loadUserDict(dictSet);
|
|
210
|
+
|
|
211
|
+
// 方式3:使用单个字符串
|
|
204
212
|
nodejieba.loadUserDict("区块链");
|
|
205
213
|
|
|
206
|
-
// 方式
|
|
214
|
+
// 方式4:使用 Buffer
|
|
207
215
|
const dictBuffer = Buffer.from("新词1\n新词2 100 n\n新词3 nz");
|
|
208
216
|
nodejieba.loadUserDict(dictBuffer);
|
|
209
217
|
|
|
package/build/Release/nodejieba.node
CHANGED
|
Binary file
|
package/index.js
CHANGED
|
@@ -75,4 +75,21 @@ wrapWithDictLoad("textRankExtract");
|
|
|
75
75
|
wrapWithDictLoad("insertWord");
|
|
76
76
|
wrapWithDictLoad("loadUserDict");
|
|
77
77
|
|
|
78
|
+
// 保存原始的 loadUserDict 函数
|
|
79
|
+
var _loadUserDict = exports.loadUserDict;
|
|
80
|
+
|
|
81
|
+
// 重写 loadUserDict 以支持 Set 格式
|
|
82
|
+
exports.loadUserDict = function (dict) {
|
|
83
|
+
if (!isDictLoaded) {
|
|
84
|
+
exports.load();
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
// 如果是 Set 对象,转换为数组
|
|
88
|
+
if (dict instanceof Set) {
|
|
89
|
+
dict = Array.from(dict);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
return _loadUserDict.call(this, dict);
|
|
93
|
+
};
|
|
94
|
+
|
|
78
95
|
module.exports = exports;
|
package/lib/nodejieba.cpp
CHANGED
|
@@ -229,14 +229,18 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
|
|
|
229
229
|
Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
|
|
230
230
|
}
|
|
231
231
|
|
|
232
|
-
//
|
|
232
|
+
// 支持传入字符串数组、单个字符串或 Buffer
|
|
233
233
|
if (info[0].IsArray()) {
|
|
234
234
|
Napi::Array arr = info[0].As<Napi::Array>();
|
|
235
235
|
std::vector<std::string> buf;
|
|
236
236
|
for (size_t i = 0; i < arr.Length(); i++) {
|
|
237
237
|
Napi::Value val = arr[i];
|
|
238
238
|
if (val.IsString()) {
|
|
239
|
-
|
|
239
|
+
std::string line = val.As<Napi::String>().Utf8Value();
|
|
240
|
+
// 过滤空字符串,避免断言失败
|
|
241
|
+
if (!line.empty()) {
|
|
242
|
+
buf.push_back(line);
|
|
243
|
+
}
|
|
240
244
|
}
|
|
241
245
|
}
|
|
242
246
|
_jieba_handle->LoadUserDict(buf);
|
|
@@ -244,8 +248,11 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
|
|
|
244
248
|
// 支持传入单个词典条目字符串
|
|
245
249
|
std::string line = info[0].As<Napi::String>().Utf8Value();
|
|
246
250
|
std::vector<std::string> buf;
|
|
247
|
-
|
|
248
|
-
|
|
251
|
+
// 过滤空字符串
|
|
252
|
+
if (!line.empty()) {
|
|
253
|
+
buf.push_back(line);
|
|
254
|
+
_jieba_handle->LoadUserDict(buf);
|
|
255
|
+
}
|
|
249
256
|
} else if (info[0].IsBuffer()) {
|
|
250
257
|
// 支持传入 Buffer,将其转换为字符串并按行分割
|
|
251
258
|
Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
|
package/package.json
CHANGED
|
package/submodules/cppjieba/include/cppjieba/DictTrie.hpp
CHANGED
|
@@ -10,7 +10,6 @@
|
|
|
10
10
|
#include <stdint.h>
|
|
11
11
|
#include <cmath>
|
|
12
12
|
#include <limits>
|
|
13
|
-
#include <algorithm>
|
|
14
13
|
#include "limonp/StringUtil.hpp"
|
|
15
14
|
#include "limonp/Logging.hpp"
|
|
16
15
|
#include "Unicode.hpp"
|
|
@@ -113,97 +112,26 @@ class DictTrie {
|
|
|
113
112
|
vector<string> buf;
|
|
114
113
|
DictUnit node_info;
|
|
115
114
|
Split(line, buf, " ");
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
bool isNumber = true;
|
|
132
|
-
for (char c : buf[1]) {
|
|
133
|
-
if (!isdigit(c)) {
|
|
134
|
-
isNumber = false;
|
|
135
|
-
break;
|
|
115
|
+
if(buf.size() == 1){
|
|
116
|
+
MakeNodeInfo(node_info,
|
|
117
|
+
buf[0],
|
|
118
|
+
user_word_default_weight_,
|
|
119
|
+
UNKNOWN_TAG);
|
|
120
|
+
} else if (buf.size() == 2) {
|
|
121
|
+
MakeNodeInfo(node_info,
|
|
122
|
+
buf[0],
|
|
123
|
+
user_word_default_weight_,
|
|
124
|
+
buf[1]);
|
|
125
|
+
} else if (buf.size() == 3) {
|
|
126
|
+
int freq = atoi(buf[1].c_str());
|
|
127
|
+
assert(freq_sum_ > 0.0);
|
|
128
|
+
double weight = log(1.0 * freq / freq_sum_);
|
|
129
|
+
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
|
|
136
130
|
}
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
int freq = atoi(buf[1].c_str());
|
|
141
|
-
assert(freq_sum_ > 0.0);
|
|
142
|
-
weight = log(1.0 * freq / freq_sum_);
|
|
143
|
-
word = buf[0];
|
|
144
|
-
} else {
|
|
145
|
-
// "word tag" 格式
|
|
146
|
-
word = buf[0];
|
|
147
|
-
tag = buf[1];
|
|
148
|
-
}
|
|
149
|
-
} else {
|
|
150
|
-
// 检查最后两个字段:可能是 "... word freq tag" 或 "... word1 word2 tag" 等
|
|
151
|
-
// 倒数第二个如果是数字,则认为是词频,最后一个是标签
|
|
152
|
-
// 否则认为只有最后一个是标签,前面都是关键词
|
|
153
|
-
bool isFreq = true;
|
|
154
|
-
for (char c : buf[buf.size() - 2]) {
|
|
155
|
-
if (!isdigit(c)) {
|
|
156
|
-
isFreq = false;
|
|
157
|
-
break;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
if (isFreq) {
|
|
162
|
-
// 格式: "word... freq tag"
|
|
163
|
-
int freq = atoi(buf[buf.size() - 2].c_str());
|
|
164
|
-
assert(freq_sum_ > 0.0);
|
|
165
|
-
weight = log(1.0 * freq / freq_sum_);
|
|
166
|
-
// 前面的所有部分(除了最后两个)组成关键词
|
|
167
|
-
for (size_t i = 0; i < buf.size() - 2; ++i) {
|
|
168
|
-
if (i > 0) word += " ";
|
|
169
|
-
word += buf[i];
|
|
170
|
-
}
|
|
171
|
-
tag = buf[buf.size() - 1];
|
|
172
|
-
} else {
|
|
173
|
-
// 格式: "word... tag" (无词频)
|
|
174
|
-
// 前面的所有部分(除了最后一个)组成关键词
|
|
175
|
-
for (size_t i = 0; i < buf.size() - 1; ++i) {
|
|
176
|
-
if (i > 0) word += " ";
|
|
177
|
-
word += buf[i];
|
|
178
|
-
}
|
|
179
|
-
tag = buf[buf.size() - 1];
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
// 检查词中是否包含空格
|
|
184
|
-
hasSpace = (word.find(' ') != string::npos);
|
|
185
|
-
|
|
186
|
-
// 添加原始词(包含空格版本)
|
|
187
|
-
MakeNodeInfo(node_info, word, weight, tag);
|
|
188
|
-
static_node_infos_.push_back(node_info);
|
|
189
|
-
if (node_info.word.size() == 1) {
|
|
190
|
-
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
// 如果词包含空格,同时添加无空格版本
|
|
194
|
-
if (hasSpace) {
|
|
195
|
-
string wordNoSpace = word;
|
|
196
|
-
// 移除所有空格
|
|
197
|
-
wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
|
|
198
|
-
if (!wordNoSpace.empty() && wordNoSpace != word) {
|
|
199
|
-
DictUnit node_info_no_space;
|
|
200
|
-
MakeNodeInfo(node_info_no_space, wordNoSpace, weight, tag);
|
|
201
|
-
static_node_infos_.push_back(node_info_no_space);
|
|
202
|
-
if (node_info_no_space.word.size() == 1) {
|
|
203
|
-
user_dict_single_chinese_word_.insert(node_info_no_space.word[0]);
|
|
131
|
+
static_node_infos_.push_back(node_info);
|
|
132
|
+
if (node_info.word.size() == 1) {
|
|
133
|
+
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
|
204
134
|
}
|
|
205
|
-
}
|
|
206
|
-
}
|
|
207
135
|
}
|
|
208
136
|
|
|
209
137
|
void LoadUserDict(const vector<string>& buf) {
|
|
@@ -287,36 +215,12 @@ class DictTrie {
|
|
|
287
215
|
DictUnit node_info;
|
|
288
216
|
while (getline(ifs, line)) {
|
|
289
217
|
Split(line, buf, " ");
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
for (size_t i = 0; i < buf.size() - 2; ++i) {
|
|
297
|
-
if (i > 0) word += " ";
|
|
298
|
-
word += buf[i];
|
|
299
|
-
}
|
|
300
|
-
double weight = atof(buf[buf.size() - 2].c_str());
|
|
301
|
-
string tag = buf[buf.size() - 1];
|
|
302
|
-
|
|
303
|
-
// 添加原始词(包含空格版本)
|
|
304
|
-
MakeNodeInfo(node_info, word, weight, tag);
|
|
305
|
-
static_node_infos_.push_back(node_info);
|
|
306
|
-
|
|
307
|
-
// 如果词包含空格,同时添加无空格版本
|
|
308
|
-
if (word.find(' ') != string::npos) {
|
|
309
|
-
string wordNoSpace = word;
|
|
310
|
-
wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
|
|
311
|
-
if (!wordNoSpace.empty() && wordNoSpace != word) {
|
|
312
|
-
DictUnit node_info_no_space;
|
|
313
|
-
MakeNodeInfo(node_info_no_space, wordNoSpace, weight, tag);
|
|
314
|
-
static_node_infos_.push_back(node_info_no_space);
|
|
315
|
-
}
|
|
316
|
-
}
|
|
317
|
-
} else {
|
|
318
|
-
XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
|
|
319
|
-
}
|
|
218
|
+
XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
|
|
219
|
+
MakeNodeInfo(node_info,
|
|
220
|
+
buf[0],
|
|
221
|
+
atof(buf[1].c_str()),
|
|
222
|
+
buf[2]);
|
|
223
|
+
static_node_infos_.push_back(node_info);
|
|
320
224
|
}
|
|
321
225
|
}
|
|
322
226
|
|
|
package/submodules/cppjieba/include/cppjieba/SegmentBase.hpp
CHANGED
|
@@ -8,9 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
namespace cppjieba {
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
// 这样英文单词之间的空格不会被当作分隔符
|
|
13
|
-
const char* const SPECIAL_SEPARATORS = "\t\n\xEF\xBC\x8C\xE3\x80\x82";
|
|
11
|
+
const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
|
|
14
12
|
|
|
15
13
|
using namespace limonp;
|
|
16
14
|
|
|
package/submodules/cppjieba/include/cppjieba/Unicode.hpp
CHANGED
|
@@ -92,10 +92,6 @@ inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
|
|
|
92
92
|
if (!(str[0] & 0x80)) { // 0xxxxxxx
|
|
93
93
|
// 7bit, total 7bit
|
|
94
94
|
rp.rune = (uint8_t)(str[0]) & 0x7f;
|
|
95
|
-
// 将大写英文字母转换为小写,实现大小写不敏感匹配
|
|
96
|
-
if (rp.rune >= 'A' && rp.rune <= 'Z') {
|
|
97
|
-
rp.rune = rp.rune - 'A' + 'a';
|
|
98
|
-
}
|
|
99
95
|
rp.len = 1;
|
|
100
96
|
} else if ((uint8_t)str[0] <= 0xdf && 1 < len) {
|
|
101
97
|
// 110xxxxxx
|
|
package/test/load_user_dict_test.js
CHANGED
|
@@ -55,4 +55,36 @@ describe("nodejieba.loadUserDict", function() {
|
|
|
55
55
|
result.should.containEql('云计算');
|
|
56
56
|
result.should.containEql('人工智能');
|
|
57
57
|
});
|
|
58
|
+
|
|
59
|
+
it("nodejieba.loadUserDict with Set should return true", function() {
|
|
60
|
+
const dictSet = new Set();
|
|
61
|
+
dictSet.add("非常独特的测试词123");
|
|
62
|
+
dictSet.add("另一个独特测试词 100 n");
|
|
63
|
+
|
|
64
|
+
var loadResult = nodejieba.loadUserDict(dictSet);
|
|
65
|
+
loadResult.should.eql(true);
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
it("nodejieba.loadUserDict with Set should automatically deduplicate", function() {
|
|
69
|
+
const dictSet = new Set();
|
|
70
|
+
dictSet.add("去重专用测试词");
|
|
71
|
+
dictSet.add("去重专用测试词"); // 重复添加
|
|
72
|
+
dictSet.add("去重专用测试词"); // 再次重复添加
|
|
73
|
+
|
|
74
|
+
var loadResult = nodejieba.loadUserDict(dictSet);
|
|
75
|
+
loadResult.should.eql(true);
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
it("nodejieba.loadUserDict should filter empty strings", function() {
|
|
79
|
+
// 测试空字符串被过滤,不会导致断言失败
|
|
80
|
+
var dictLines = [
|
|
81
|
+
"有效词1",
|
|
82
|
+
"", // 空字符串
|
|
83
|
+
"有效词2",
|
|
84
|
+
"", // 空字符串
|
|
85
|
+
" " // 只有空格的字符串(也会被保留,因为不是完全空)
|
|
86
|
+
];
|
|
87
|
+
var loadResult = nodejieba.loadUserDict(dictLines);
|
|
88
|
+
loadResult.should.eql(true);
|
|
89
|
+
});
|
|
58
90
|
});
|
package/types/index.d.ts
CHANGED
|
@@ -27,5 +27,5 @@ declare module "nodejieba" {
|
|
|
27
27
|
export function textRankExtract(sentence: string, threshold: number): ExtractResult[];
|
|
28
28
|
export function insertWord(word: string, tag?: string): boolean;
|
|
29
29
|
export function cutSmall(sentence: string, small: number): string[];
|
|
30
|
-
export function loadUserDict(dict: string | string[] | Buffer): boolean;
|
|
30
|
+
export function loadUserDict(dict: string | string[] | Set<string> | Buffer): boolean;
|
|
31
31
|
}
|
package/test_open_claw.js
DELETED
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
// 测试 "open claw" 关键词提取问题
|
|
2
|
-
|
|
3
|
-
var nodejieba = require("./index.js");
|
|
4
|
-
|
|
5
|
-
// 测试句子
|
|
6
|
-
var sentence = "Node.js在Web开发中的应用与实践Open Claw,这句测试的话,关键词是\"open claw\"";
|
|
7
|
-
|
|
8
|
-
console.log("=".repeat(60));
|
|
9
|
-
console.log("测试句子:", sentence);
|
|
10
|
-
console.log("=".repeat(60));
|
|
11
|
-
|
|
12
|
-
// 1. 先进行分词测试
|
|
13
|
-
console.log("\n【1. 分词结果】");
|
|
14
|
-
var cutResult = nodejieba.cut(sentence);
|
|
15
|
-
console.log("cut:", cutResult);
|
|
16
|
-
|
|
17
|
-
// 2. 关键词提取测试
|
|
18
|
-
console.log("\n【2. 关键词提取 (extract)】");
|
|
19
|
-
var keywords = nodejieba.extract(sentence, 10);
|
|
20
|
-
console.log("提取的关键词:");
|
|
21
|
-
keywords.forEach(function(kw) {
|
|
22
|
-
console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
// 3. TextRank 关键词提取测试
|
|
26
|
-
console.log("\n【3. TextRank 关键词提取】");
|
|
27
|
-
var textRankKeywords = nodejieba.textRankExtract(sentence, 10);
|
|
28
|
-
console.log("提取的关键词:");
|
|
29
|
-
textRankKeywords.forEach(function(kw) {
|
|
30
|
-
console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
|
|
31
|
-
});
|
|
32
|
-
|
|
33
|
-
// 4. 检查是否包含 "open claw"
|
|
34
|
-
console.log("\n【4. 检查结果】");
|
|
35
|
-
var hasOpenClaw = keywords.some(function(kw) {
|
|
36
|
-
return kw.word.toLowerCase() === "open claw";
|
|
37
|
-
});
|
|
38
|
-
console.log("是否提取到 'open claw':", hasOpenClaw);
|
|
39
|
-
|
|
40
|
-
// 5. 添加自定义词后再次测试
|
|
41
|
-
console.log("\n【5. 添加自定义词后测试】");
|
|
42
|
-
nodejieba.insertWord("open claw");
|
|
43
|
-
console.log("已添加自定义词: open claw");
|
|
44
|
-
|
|
45
|
-
var cutResult2 = nodejieba.cut(sentence);
|
|
46
|
-
console.log("\n再次分词结果:");
|
|
47
|
-
console.log("cut:", cutResult2);
|
|
48
|
-
|
|
49
|
-
var keywords2 = nodejieba.extract(sentence, 10);
|
|
50
|
-
console.log("\n再次提取关键词:");
|
|
51
|
-
keywords2.forEach(function(kw) {
|
|
52
|
-
console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
|
|
53
|
-
});
|
|
54
|
-
|
|
55
|
-
var hasOpenClaw2 = keywords2.some(function(kw) {
|
|
56
|
-
return kw.word.toLowerCase() === "open claw";
|
|
57
|
-
});
|
|
58
|
-
console.log("\n是否提取到 'open claw':", hasOpenClaw2);
|
|
59
|
-
|
|
60
|
-
console.log("\n" + "=".repeat(60));
|
|
61
|
-
console.log("问题分析:");
|
|
62
|
-
console.log("1. jieba 分词器默认基于中文语料训练,对英文词汇识别有限");
|
|
63
|
-
console.log("2. 'Open Claw' 作为英文词组,默认词典中不存在");
|
|
64
|
-
console.log("3. 解决方案: 使用 insertWord() 方法添加自定义词");
|
|
65
|
-
console.log("=".repeat(60));
|