nodejieba-plus 3.5.16 → 3.5.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -0
- package/analyze_weight.js +57 -0
- package/build/Release/nodejieba.node +0 -0
- package/diagnose_priority.js +71 -0
- package/index.js +1 -0
- package/lib/nodejieba.cpp +107 -1
- package/lib/nodejieba.h +1 -0
- package/package.json +1 -1
- package/submodules/cppjieba/include/cppjieba/Jieba.hpp +8 -0
- package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp +29 -8
- package/test_1_3x_weight.js +86 -0
- package/test_idf_feature.js +43 -0
- package/test_open_claw.js +65 -0
- package/types/index.d.ts +1 -0
package/README.md
CHANGED
|
@@ -220,6 +220,68 @@ nodejieba.loadUserDict(dictBuffer);
|
|
|
220
220
|
// 分词时会识别用户词典中的词
|
|
221
221
|
var result = nodejieba.cut("云计算和大数据是人工智能的基础");
|
|
222
222
|
console.log(result); // ['云计算', '和', '大数据', '是', '人工智能', '的', '基础']
|
|
223
|
+
|
|
224
|
+
// 关键词提取时,用户词典中的词会自动获得更高的权重(默认 1.3 倍)
|
|
225
|
+
var keywords = nodejieba.extract("云计算和大数据是人工智能的基础", 5);
|
|
226
|
+
console.log(keywords); // 用户词典中的词排名会显著提升
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
#### 用户词典权重提升机制(新功能)
|
|
230
|
+
|
|
231
|
+
从 v3.5.17 开始,加载用户词典时会自动为词典中的词设置更高的 IDF 权重,确保在关键词提取时获得更高的排名:
|
|
232
|
+
|
|
233
|
+
**自动权重提升**:
|
|
234
|
+
- 加载用户词典后,词典中的词会自动获得 **1.3 倍 IDF 权重**
|
|
235
|
+
- 这意味着用户词典中的词在关键词提取时会优先显示
|
|
236
|
+
|
|
237
|
+
**手动设置权重**:
|
|
238
|
+
```js
|
|
239
|
+
// 方式1:设置具体的 IDF 值
|
|
240
|
+
nodejieba.setIdf("Open Claw", 30.0);
|
|
241
|
+
|
|
242
|
+
// 方式2:使用倍数提升权重(默认2倍)
|
|
243
|
+
nodejieba.setIdf("Open Claw"); // 2倍权重(未指定倍数时的默认值)
|
|
244
|
+
|
|
245
|
+
// 方式3:自定义倍数
|
|
246
|
+
nodejieba.setIdf("Open Claw", null, 2.0); // 2倍权重
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
#### IDF 词典支持空格关键词(新功能)
|
|
250
|
+
|
|
251
|
+
从 v3.5.17 开始,IDF 词典支持包含空格的关键词:
|
|
252
|
+
|
|
253
|
+
**IDF 词典格式**:
|
|
254
|
+
```
|
|
255
|
+
# 普通关键词
|
|
256
|
+
互动 12.0
|
|
257
|
+
|
|
258
|
+
# 包含空格的关键词
|
|
259
|
+
Open Claw 30.0
|
|
260
|
+
Machine Learning 25.0
|
|
261
|
+
Deep Learning 28.0
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
**使用示例**:
|
|
265
|
+
```js
|
|
266
|
+
var nodejieba = require("nodejieba");
|
|
267
|
+
|
|
268
|
+
// 加载包含空格关键词的 IDF 词典
|
|
269
|
+
nodejieba.load({
|
|
270
|
+
idfDict: "./custom_idf.txt"
|
|
271
|
+
});
|
|
272
|
+
|
|
273
|
+
// 加载用户词典(同时需要分词词典支持)
|
|
274
|
+
nodejieba.loadUserDict(["Open Claw", "Machine Learning", "Deep Learning"]);
|
|
275
|
+
|
|
276
|
+
// 关键词提取时会正确识别包含空格的词
|
|
277
|
+
var keywords = nodejieba.extract("Open Claw和Machine Learning都是Deep Learning的基础", 5);
|
|
278
|
+
console.log(keywords);
|
|
279
|
+
// 输出: [
|
|
280
|
+
// { word: 'Open Claw', weight: 30.00 },
|
|
281
|
+
// { word: 'Deep Learning', weight: 28.00 },
|
|
282
|
+
// { word: 'Machine Learning', weight: 25.00 },
|
|
283
|
+
// ...
|
|
284
|
+
// ]
|
|
223
285
|
```
|
|
224
286
|
|
|
225
287
|
#### 词典条目格式
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
// Diagnostic script: walks through how keyword-extraction weights are formed
// for a sample text, before and after a user-dictionary entry is loaded.
const nodejieba = require("./index.js");

const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";
const TOP_N = 20;

/**
 * Count how often each token occurs.
 * A Map is used so tokens such as "constructor" cannot collide with
 * Object.prototype keys.
 * @param {string[]} words - tokens returned by cut()
 * @returns {Map<string, number>} token -> occurrence count
 */
function countWords(words) {
  const counts = new Map();
  for (const word of words) {
    counts.set(word, (counts.get(word) || 0) + 1);
  }
  return counts;
}

/**
 * Print extracted keywords together with their occurrence counts.
 * @param {{word: string, weight: number}[]} keywords - result of extract()
 * @param {Map<string, number>} counts - counts for the segmentation in effect
 */
function printKeywords(keywords, counts) {
  console.log("关键词及权重:");
  keywords.forEach((item, i) => {
    console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)} (出现${counts.get(item.word) || 1}次)`);
  });
}

console.log("=== 关键词提取权重机制分析 ===\n");

nodejieba.load();

console.log("【问题分析】");
console.log("关键词提取权重 = TF(词频)× IDF(逆文档频率)\n");

console.log("步骤1: 查看分词结果");
const cutResult = nodejieba.cut(content);
console.log("分词结果:", cutResult.slice(0, 30));

console.log("\n步骤2: 统计词频(TF)");
const wordFreq = countWords(cutResult);

console.log("\n高频词(出现2次以上):");
[...wordFreq.entries()]
  .filter(([, freq]) => freq >= 2)
  .sort((a, b) => b[1] - a[1])
  .forEach(([word, freq]) => {
    console.log(` ${word}: ${freq}次`);
  });

console.log("\n步骤3: 提取关键词(未加载用户词典)");
printKeywords(nodejieba.extract(content, TOP_N), wordFreq);

console.log("\n步骤4: 加载用户词典");
nodejieba.loadUserDict("Open Claw 10 n");

console.log("\n步骤5: 再次提取关键词");
// The user dictionary changes the segmentation, so the counts from step 2 are
// stale here: recount, otherwise newly merged tokens are always reported as
// occurring once.
const wordFreqWithDict = countWords(nodejieba.cut(content));
printKeywords(nodejieba.extract(content, TOP_N), wordFreqWithDict);

console.log("\n【核心问题】");
console.log("1. 'Open Claw' 出现了 2 次(TF = 2)");
console.log("2. '互动' 出现了 2 次(TF = 2)");
console.log("3. 但 '互动' 的权重可能更高,因为:");
console.log(" - '互动' 在 IDF 词典中有专门的权重值");
console.log(" - 'Open Claw' 不在 IDF 词典中,使用平均 IDF 值");
console.log("4. 如果 '互动' 的 IDF 值 > 平均 IDF 值,则权重更高");

console.log("\n【解决方案】");
console.log("需要为用户词典中的词设置 IDF 权重!");
console.log("建议:在 loadUserDict 时,自动为用户词典中的词设置较高的 IDF 值");
|
|
Binary file
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
// Diagnostic script: shows that the user dictionary has to be loaded before
// extract() is called for a multi-word entry to show up in the results.
const nodejieba = require("./index.js");

const USER_DICT_LINE = "Open Claw 10 n";
const content = "这是一个Open Claw项目,Open Claw很好玩";

/** Project extract() results down to their keyword strings. */
const wordsOf = (results) => results.map((entry) => entry.word);

console.log("=== 问题诊断:Open Claw 未被识别 ===\n");

console.log("【问题原因分析】");
console.log("1. 用户词典必须在调用 extract() 之前加载");
console.log("2. 用户词典会影响分词结果,进而影响关键词提取");
console.log("3. 关键词提取依赖于分词结果和 IDF 权重\n");

// Scenario 1: extract first, load the dictionary afterwards.
console.log("=== 测试场景 1: 错误用法(先提取后加载)===");
nodejieba.load();

console.log("步骤1: 先提取关键词(未加载用户词典)");
const beforeDict = nodejieba.extract(content, 5);
console.log("关键词:", wordsOf(beforeDict));
console.log("说明: 此时 'Open Claw' 被拆分成单个字母\n");

console.log("步骤2: 再加载用户词典");
nodejieba.loadUserDict(USER_DICT_LINE);
console.log("词典已加载\n");

console.log("步骤3: 再次提取关键词");
const afterDict = nodejieba.extract(content, 5);
console.log("关键词:", wordsOf(afterDict));
console.log("说明: 此时 'Open Claw' 已被正确识别\n");

// Scenario 2: drop the cached module and require it again, then load the
// dictionary before extracting.
console.log("=== 测试场景 2: 正确用法(先加载后提取)===");
console.log("重新初始化...");
delete require.cache[require.resolve('./index.js')];
const nodejieba2 = require("./index.js");

console.log("步骤1: 先加载词典");
nodejieba2.load();
nodejieba2.loadUserDict(USER_DICT_LINE);
console.log("词典已加载\n");

console.log("步骤2: 再提取关键词");
const loadedFirst = nodejieba2.extract(content, 5);
console.log("关键词:", wordsOf(loadedFirst));
console.log("说明: 'Open Claw' 被正确识别\n");

// Scenario 3: segmentation and extraction on a few short inputs.
console.log("=== 测试场景 3: 检查优先级 ===");
console.log("测试: 用户词典 vs 默认词典\n");

const testCases = [
  "Open Claw",
  "Open Claw是一个项目",
  "我喜欢Open Claw这个游戏"
];

for (const [index, text] of testCases.entries()) {
  console.log(`测试 ${index + 1}: "${text}"`);
  const tokens = nodejieba2.cut(text);
  const topKeywords = nodejieba2.extract(text, 3);
  console.log(" 分词:", tokens);
  console.log(" 关键词:", wordsOf(topKeywords));
  console.log(" 包含 'Open Claw':", tokens.includes("Open Claw"));
  console.log();
}

console.log("=== 解决方案 ===");
console.log("✅ 正确做法:");
console.log(" 1. 先调用 nodejieba.load()");
console.log(" 2. 再调用 nodejieba.loadUserDict('Open Claw 10 n')");
console.log(" 3. 最后调用 nodejieba.extract()");
console.log();
console.log("❌ 错误做法:");
console.log(" 1. 先调用 nodejieba.extract()");
console.log(" 2. 再调用 nodejieba.loadUserDict()");
console.log(" (此时词典不会影响已提取的结果)");
|
package/index.js
CHANGED
package/lib/nodejieba.cpp
CHANGED
|
@@ -21,7 +21,8 @@ NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
|
|
|
21
21
|
InstanceMethod("extract", &NodeJieba::extract),
|
|
22
22
|
InstanceMethod("textRankExtract", &NodeJieba::textRankExtract),
|
|
23
23
|
InstanceMethod("insertWord", &NodeJieba::insertWord),
|
|
24
|
-
InstanceMethod("loadUserDict", &NodeJieba::loadUserDict)
|
|
24
|
+
InstanceMethod("loadUserDict", &NodeJieba::loadUserDict),
|
|
25
|
+
InstanceMethod("setIdf", &NodeJieba::setIdf)
|
|
25
26
|
});
|
|
26
27
|
}
|
|
27
28
|
|
|
@@ -53,6 +54,44 @@ Napi::Value NodeJieba::load(const Napi::CallbackInfo& info) {
|
|
|
53
54
|
return Napi::Boolean::New(info.Env(), true);
|
|
54
55
|
}
|
|
55
56
|
|
|
57
|
+
/**
 * JS binding: setIdf(word[, idf[, multiplier]]).
 *
 * - When `idf` is a number, the word's IDF is set to exactly that value and
 *   `multiplier` is ignored.
 * - When `idf` is omitted, undefined or null, the word's IDF is scaled via
 *   SetIdfForWordWithMultiplier using `multiplier` (2.0 when omitted,
 *   undefined or null).
 *
 * Returns false when `word` is missing or not a string, or when `idf` /
 * `multiplier` is present but not a number; returns true otherwise.
 * Throws a JS error when load() has not been called yet.
 */
Napi::Value NodeJieba::setIdf(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();

  if (info.Length() < 1) {
    return Napi::Boolean::New(env, false);
  }

  if (!_jieba_handle) {
    Napi::Error::New(env, "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
    return Napi::Boolean::New(env, false);
  }

  if (!info[0].IsString()) {
    return Napi::Boolean::New(env, false);
  }
  const std::string word = info[0].As<Napi::String>().Utf8Value();

  // null / undefined count as "argument omitted" so that
  // setIdf(word, null, multiplier) keeps working.
  const bool hasIdf =
      info.Length() >= 2 && !info[1].IsUndefined() && !info[1].IsNull();
  if (hasIdf) {
    if (!info[1].IsNumber()) {
      // A wrong-typed idf used to be ignored silently and the word was
      // boosted by the default multiplier; report the bad call instead.
      return Napi::Boolean::New(env, false);
    }
    _jieba_handle->SetIdfForWord(word, info[1].As<Napi::Number>().DoubleValue());
    return Napi::Boolean::New(env, true);
  }

  double multiplier = 2.0;
  const bool hasMultiplier =
      info.Length() >= 3 && !info[2].IsUndefined() && !info[2].IsNull();
  if (hasMultiplier) {
    if (!info[2].IsNumber()) {
      return Napi::Boolean::New(env, false);
    }
    multiplier = info[2].As<Napi::Number>().DoubleValue();
  }

  _jieba_handle->SetIdfForWordWithMultiplier(word, multiplier);
  return Napi::Boolean::New(env, true);
}
|
|
94
|
+
|
|
56
95
|
Napi::Value NodeJieba::insertWord(const Napi::CallbackInfo& info) {
|
|
57
96
|
if(info.Length() < 1) {
|
|
58
97
|
return Napi::Boolean::New(info.Env(), false);
|
|
@@ -254,6 +293,70 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
|
|
|
254
293
|
str = str.substr(start, end - start);
|
|
255
294
|
};
|
|
256
295
|
|
|
296
|
+
// Extracts the keyword part of one user-dictionary line.
//
// The line is split on whitespace. With three or more tokens the trailing
// tokens are treated as "<freq> <tag>" when the second-to-last token is an
// integer, otherwise only the last token is dropped; the remaining tokens are
// re-joined with single spaces. With one or two tokens the first token is
// returned. A line without tokens yields an empty string.
//
// NOTE(review): a two-token line such as "Open Claw" (keyword containing a
// space, no freq/tag) yields "Open", because it cannot be told apart from
// "<word> <tag>" here - confirm the intended behaviour against the dictionary
// loader.
auto extractKeyword = [](const std::string& line) -> std::string {
  // True when the token is an optionally signed run of decimal digits.
  // A digit scan is used instead of std::stoi so that a frequency too large
  // for int is still recognised as a number instead of being folded into the
  // keyword.
  auto isInteger = [](const std::string& token) -> bool {
    if (token.empty()) {
      return false;
    }
    size_t i = (token[0] == '+' || token[0] == '-') ? 1 : 0;
    if (i == token.size()) {
      return false;
    }
    for (; i < token.size(); ++i) {
      if (token[i] < '0' || token[i] > '9') {
        return false;
      }
    }
    return true;
  };

  std::istringstream iss(line);
  std::vector<std::string> parts;
  std::string part;
  while (iss >> part) {
    parts.push_back(part);
  }

  if (parts.empty()) {
    return "";
  }
  if (parts.size() <= 2) {
    return parts[0];
  }

  // Drop "<freq> <tag>" when the second-to-last token is numeric, otherwise
  // drop just the last token.
  const size_t total = parts.size();
  const size_t keywordTokens = isInteger(parts[total - 2]) ? total - 2 : total - 1;

  std::string keyword;
  for (size_t i = 0; i < keywordTokens; ++i) {
    if (i > 0) {
      keyword += " ";
    }
    keyword += parts[i];
  }
  return keyword;
};
|
|
350
|
+
|
|
351
|
+
auto setDefaultIdf = [&](const std::vector<std::string>& dictLines) {
|
|
352
|
+
for (const auto& line : dictLines) {
|
|
353
|
+
std::string keyword = extractKeyword(line);
|
|
354
|
+
if (!keyword.empty()) {
|
|
355
|
+
_jieba_handle->SetIdfForWordWithMultiplier(keyword, 1.3);
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
};
|
|
359
|
+
|
|
257
360
|
if (info[0].IsArray()) {
|
|
258
361
|
Napi::Array arr = info[0].As<Napi::Array>();
|
|
259
362
|
std::vector<std::string> buf;
|
|
@@ -271,6 +374,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
|
|
|
271
374
|
}
|
|
272
375
|
}
|
|
273
376
|
_jieba_handle->LoadUserDict(buf);
|
|
377
|
+
setDefaultIdf(buf);
|
|
274
378
|
} else if (info[0].IsString()) {
|
|
275
379
|
std::string line = info[0].As<Napi::String>().Utf8Value();
|
|
276
380
|
trimString(line);
|
|
@@ -278,6 +382,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
|
|
|
278
382
|
if (!line.empty() && !isBlankString(line)) {
|
|
279
383
|
buf.push_back(line);
|
|
280
384
|
_jieba_handle->LoadUserDict(buf);
|
|
385
|
+
setDefaultIdf(buf);
|
|
281
386
|
}
|
|
282
387
|
} else if (info[0].IsBuffer()) {
|
|
283
388
|
Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
|
|
@@ -292,6 +397,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
|
|
|
292
397
|
}
|
|
293
398
|
}
|
|
294
399
|
_jieba_handle->LoadUserDict(buf);
|
|
400
|
+
setDefaultIdf(buf);
|
|
295
401
|
} else {
|
|
296
402
|
return Napi::Boolean::New(info.Env(), false);
|
|
297
403
|
}
|
package/lib/nodejieba.h
CHANGED
|
@@ -21,6 +21,7 @@ private:
|
|
|
21
21
|
Napi::Value textRankExtract(const Napi::CallbackInfo& info);
|
|
22
22
|
Napi::Value insertWord(const Napi::CallbackInfo& info);
|
|
23
23
|
Napi::Value loadUserDict(const Napi::CallbackInfo& info);
|
|
24
|
+
Napi::Value setIdf(const Napi::CallbackInfo& info);
|
|
24
25
|
|
|
25
26
|
cppjieba::Jieba* _jieba_handle{nullptr};
|
|
26
27
|
cppjieba::TextRankExtractor* _text_rank_extractor_handle{nullptr};
|
package/package.json
CHANGED
|
@@ -116,6 +116,14 @@ class Jieba {
|
|
|
116
116
|
dict_trie_.LoadUserDict(path);
|
|
117
117
|
}
|
|
118
118
|
|
|
119
|
+
void SetIdfForWord(const string& word, double idf) {
|
|
120
|
+
extractor.SetIdf(word, idf);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
void SetIdfForWordWithMultiplier(const string& word, double multiplier = 2.0) {
|
|
124
|
+
extractor.SetIdfWithMultiplier(word, multiplier);
|
|
125
|
+
}
|
|
126
|
+
|
|
119
127
|
private:
|
|
120
128
|
static string pathJoin(const string& dir, const string& filename) {
|
|
121
129
|
if (dir.empty()) {
|
|
@@ -39,6 +39,19 @@ class KeywordExtractor {
|
|
|
39
39
|
~KeywordExtractor() {
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
+
void SetIdf(const string& word, double idf) {
|
|
43
|
+
idfMap_[word] = idf;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
void SetIdfWithMultiplier(const string& word, double multiplier = 2.0) {
|
|
47
|
+
unordered_map<string, double>::const_iterator cit = idfMap_.find(word);
|
|
48
|
+
if (cit != idfMap_.end()) {
|
|
49
|
+
idfMap_[word] = cit->second * multiplier;
|
|
50
|
+
} else {
|
|
51
|
+
idfMap_[word] = idfAverage_ * multiplier;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
42
55
|
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
|
43
56
|
vector<Word> topWords;
|
|
44
57
|
Extract(sentence, topWords, topN);
|
|
@@ -96,25 +109,33 @@ class KeywordExtractor {
|
|
|
96
109
|
ifstream ifs(idfPath.c_str());
|
|
97
110
|
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
|
|
98
111
|
string line ;
|
|
99
|
-
vector<string> buf;
|
|
100
112
|
double idf = 0.0;
|
|
101
113
|
double idfSum = 0.0;
|
|
102
114
|
size_t lineno = 0;
|
|
103
115
|
for (; getline(ifs, line); lineno++) {
|
|
104
|
-
buf.clear();
|
|
105
116
|
if (line.empty()) {
|
|
106
117
|
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
|
107
118
|
continue;
|
|
108
119
|
}
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
120
|
+
|
|
121
|
+
size_t lastSpace = line.find_last_of(" \t");
|
|
122
|
+
if (lastSpace == string::npos) {
|
|
123
|
+
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " format error. skipped.";
|
|
124
|
+
continue;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
string word = line.substr(0, lastSpace);
|
|
128
|
+
string idfStr = line.substr(lastSpace + 1);
|
|
129
|
+
|
|
130
|
+
char* endptr;
|
|
131
|
+
idf = strtod(idfStr.c_str(), &endptr);
|
|
132
|
+
if (endptr == idfStr.c_str()) {
|
|
133
|
+
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " idf format error. skipped.";
|
|
112
134
|
continue;
|
|
113
135
|
}
|
|
114
|
-
|
|
115
|
-
idfMap_[
|
|
136
|
+
|
|
137
|
+
idfMap_[word] = idf;
|
|
116
138
|
idfSum += idf;
|
|
117
|
-
|
|
118
139
|
}
|
|
119
140
|
|
|
120
141
|
assert(lineno);
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
// Manual test script: prints keyword weights before and after loading a user
// dictionary entry, then checks that an IDF dictionary whose keywords contain
// spaces is honoured.
const nodejieba = require("./index.js");
const fs = require("fs");

const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";

/**
 * Print a numbered list of keywords with their weights.
 * @param {{word: string, weight: number}[]} results - result of extract()
 */
function printWeights(results) {
  results.forEach((item, i) => {
    console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
  });
}

console.log("=== 测试 1.3 倍 IDF 权重 ===\n");

console.log("【测试 1: 未加载用户词典】");
nodejieba.load();
const result1 = nodejieba.extract(content, 10);
console.log("关键词及权重:");
printWeights(result1);

console.log("\n【测试 2: 加载用户词典(自动 1.3 倍权重)】");
nodejieba.loadUserDict("Open Claw 10 n");
const result2 = nodejieba.extract(content, 10);
console.log("关键词及权重:");
printWeights(result2);

console.log("\n【验证 1.3 倍权重】");
const openClawWeight1 = result1.find(r => r.word === "Open" || r.word === "Claw");
const openClawWeight2 = result2.find(r => r.word === "Open Claw");
if (openClawWeight1 && openClawWeight2) {
  console.log(`未加载时权重: ${openClawWeight1.weight.toFixed(2)}`);
  console.log(`加载后权重: ${openClawWeight2.weight.toFixed(2)}`);
  console.log(`权重倍数: ${(openClawWeight2.weight / openClawWeight1.weight).toFixed(2)} ✅`);
}

console.log("\n=== 测试 IDF 词典支持空格关键词 ===\n");

console.log("【测试 3: 创建包含空格关键词的 IDF 词典】");
const testIdfPath = "./test_idf_with_spaces.txt";
const testIdfContent = "Open Claw 30.0\nMachine Learning 25.0\nDeep Learning 28.0\n互动 12.0\n";
fs.writeFileSync(testIdfPath, testIdfContent, "utf8");

// Everything that runs while the temporary IDF file exists sits inside
// try/finally, so the file is removed even when load()/extract() throws.
try {
  console.log("测试 IDF 词典内容:");
  console.log(testIdfContent);

  console.log("\n【测试 4: 加载包含空格的 IDF 词典 + 用户词典】");
  delete require.cache[require.resolve('./index.js')];
  const nodejieba2 = require("./index.js");

  nodejieba2.load({
    idfDict: testIdfPath
  });

  nodejieba2.loadUserDict([
    "Open Claw",
    "Machine Learning",
    "Deep Learning"
  ]);

  console.log("\n【测试 5: 验证空格关键词识别】");
  const testContent = "Open Claw和Machine Learning都是Deep Learning的基础";
  const keywords = nodejieba2.extract(testContent, 5);
  console.log("测试文本:", testContent);
  console.log("关键词提取结果:");
  printWeights(keywords);

  console.log("\n【验证结果】");
  const hasOpenClaw = keywords.some(r => r.word === "Open Claw");
  const hasMachineLearning = keywords.some(r => r.word === "Machine Learning");
  const hasDeepLearning = keywords.some(r => r.word === "Deep Learning");

  console.log(`识别到 'Open Claw': ${hasOpenClaw ? '✅' : '❌'}`);
  console.log(`识别到 'Machine Learning': ${hasMachineLearning ? '✅' : '❌'}`);
  console.log(`识别到 'Deep Learning': ${hasDeepLearning ? '✅' : '❌'}`);

  if (hasOpenClaw) {
    const openClaw = keywords.find(r => r.word === "Open Claw");
    console.log(`\n'Open Claw' 权重: ${openClaw.weight.toFixed(2)}`);
    console.log(`IDF 词典中设置的权重: 30.0`);
    console.log(`用户词典自动提升倍数: 1.3`);
    console.log(`理论权重: ${(30.0 * 1.3).toFixed(2)} (30.0 × 1.3)`);
  }
} finally {
  console.log("\n【清理测试文件】");
  fs.unlinkSync(testIdfPath);
  console.log("测试文件已删除");
}

console.log("\n=== 测试完成 ===");
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// Manual test script: compares the rank of "Open Claw" in keyword extraction
// across four states: default dictionaries, user dictionary loaded, explicit
// IDF set, and IDF scaled by a multiplier.
const nodejieba = require("./index.js");

const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";
const TARGET = "Open Claw";
const TOP_N = 20;

/**
 * Extract the top keywords from the sample text and print them with weights.
 * @returns {{word: string, weight: number}[]} the extraction result
 */
function extractAndPrint() {
  const results = nodejieba.extract(content, TOP_N);
  console.log("关键词及权重:");
  results.forEach((item, i) => {
    console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
  });
  return results;
}

/**
 * 1-based rank of TARGET in an extraction result, or "未出现" when absent.
 * Used for every comparison line so a missing keyword is never shown as
 * rank 0.
 * @param {{word: string, weight: number}[]} results
 * @returns {number|string}
 */
function rankOf(results) {
  return results.findIndex(r => r.word === TARGET) + 1 || "未出现";
}

console.log("=== 测试 IDF 权重提升功能 ===\n");

console.log("【测试 1: 默认提取(未加载用户词典)】");
nodejieba.load();
const result1 = extractAndPrint();

console.log("\n【测试 2: 加载用户词典(自动提升 IDF 权重)】");
nodejieba.loadUserDict("Open Claw 10 n");
const result2 = extractAndPrint();

console.log("\n【测试 3: 手动设置 IDF 权重】");
nodejieba.setIdf(TARGET, 30.0);
const result3 = extractAndPrint();

console.log("\n【测试 4: 使用倍数提升 IDF 权重】");
nodejieba.setIdf(TARGET, null, 3.0); // multiplies the IDF set in test 3 by 3
const result4 = extractAndPrint();

console.log("\n【对比结果】");
console.log("未加载词典时 'Open Claw' 排名:", rankOf(result1));
console.log("加载词典后 'Open Claw' 排名:", rankOf(result2));
console.log("手动设置IDF后 'Open Claw' 排名:", rankOf(result3));
console.log("3倍权重后 'Open Claw' 排名:", rankOf(result4));
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
// Manual test script: checks whether "Open Claw" is kept as one token by
// cut() and extract() under the default dictionaries, after loadUserDict(),
// and after insertWord().
const nodejieba = require("./index.js");

const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";

/** Project extract() results down to their keyword strings. */
const wordsOf = (results) => results.map((r) => r.word);

/** True when an extract() result contains the keyword "Open Claw". */
const hasOpenClawKeyword = (results) => results.some((r) => r.word === "Open Claw");

console.log("=== 测试 1: 默认加载词典 ===");
nodejieba.load();

console.log("\n测试 1.1: 分词结果(未加载用户词典)");
const tokensDefault = nodejieba.cut(content);
console.log("分词结果:", tokensDefault);
// Also true when only the separate tokens "Open" or "Claw" are present.
const looseMatch = ["Open Claw", "Open", "Claw"].some((token) => tokensDefault.includes(token));
console.log("包含 'Open Claw':", looseMatch);

console.log("\n测试 1.2: 关键词提取(未加载用户词典)");
const keywordsDefault = nodejieba.extract(content, 20);
console.log("关键词提取结果:", wordsOf(keywordsDefault));

console.log("\n=== 测试 2: 加载用户词典后 ===");
console.log("加载用户词典: 'Open Claw 10 n'");
const loadResult = nodejieba.loadUserDict("Open Claw 10 n");
console.log("加载结果:", loadResult);

console.log("\n测试 2.1: 分词结果(已加载用户词典)");
const tokensWithDict = nodejieba.cut(content);
console.log("分词结果:", tokensWithDict);
console.log("包含 'Open Claw':", tokensWithDict.includes("Open Claw"));

console.log("\n测试 2.2: 关键词提取(已加载用户词典)");
const keywordsWithDict = nodejieba.extract(content, 20);
console.log("关键词提取结果:", wordsOf(keywordsWithDict));
console.log("包含 'Open Claw':", hasOpenClawKeyword(keywordsWithDict));

console.log("\n=== 测试 3: 使用 insertWord ===");
nodejieba.insertWord("Open Claw", "n");
console.log("插入词: 'Open Claw'");

console.log("\n测试 3.1: 分词结果(使用 insertWord)");
const tokensInserted = nodejieba.cut(content);
console.log("分词结果:", tokensInserted);
console.log("包含 'Open Claw':", tokensInserted.includes("Open Claw"));

console.log("\n测试 3.2: 关键词提取(使用 insertWord)");
const keywordsInserted = nodejieba.extract(content, 20);
console.log("关键词提取结果:", wordsOf(keywordsInserted));
console.log("包含 'Open Claw':", hasOpenClawKeyword(keywordsInserted));

console.log("\n=== 测试 4: 检查文本中的 Open Claw ===");
const samples = [
  ["\n测试文本1:", "Open Claw是一个好游戏"],
  ["\n测试文本2:", "我喜欢Open Claw"],
  ["\n测试文本3:", "OpenClaw很好玩"]
];

for (const [label, text] of samples) {
  console.log(label, text);
  console.log("分词:", nodejieba.cut(text));
  console.log("关键词:", wordsOf(nodejieba.extract(text, 5)));
}

console.log("\n=== 测试 5: 检查 IDF 词典 ===");
console.log("说明: 关键词提取需要 IDF 词典支持");
console.log("用户词典只影响分词,不影响关键词提取的权重计算");
|
package/types/index.d.ts
CHANGED
|
@@ -28,4 +28,5 @@ declare module "nodejieba" {
|
|
|
28
28
|
export function insertWord(word: string, tag?: string): boolean;
|
|
29
29
|
export function cutSmall(sentence: string, small: number): string[];
|
|
30
30
|
export function loadUserDict(dict: string | string[] | Set<string> | Buffer): boolean;
|
|
31
|
+
/**
 * Set or boost the IDF weight used for `word` in keyword extraction.
 *
 * @param word - keyword whose IDF is changed
 * @param idf - exact IDF value to store; pass `null` or omit it to scale the
 *   current IDF by `multiplier` instead
 * @param multiplier - scale factor applied when `idf` is not a number
 *   (the native binding defaults to 2.0)
 * @returns `false` when `word` is missing or not a string
 * @throws if `load()` has not been called yet
 */
export function setIdf(word: string, idf?: number | null, multiplier?: number): boolean;
|
|
31
32
|
}
|