nodejieba-plus 3.5.16 → 3.5.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -220,6 +220,68 @@ nodejieba.loadUserDict(dictBuffer);
220
220
  // 分词时会识别用户词典中的词
221
221
  var result = nodejieba.cut("云计算和大数据是人工智能的基础");
222
222
  console.log(result); // ['云计算', '和', '大数据', '是', '人工智能', '的', '基础']
223
+
224
+ // 关键词提取时,用户词典中的词会自动获得更高的权重(默认2倍)
225
+ var keywords = nodejieba.extract("云计算和大数据是人工智能的基础", 5);
226
+ console.log(keywords); // 用户词典中的词排名会显著提升
227
+ ```
228
+
229
+ #### 用户词典权重提升机制(新功能)
230
+
231
+ 从 v3.5.16 开始,加载用户词典时会自动为词典中的词设置更高的 IDF 权重,确保在关键词提取时获得更高的排名:
232
+
233
+ **自动权重提升**:
234
+ - 加载用户词典后,词典中的词会自动获得 **2 倍 IDF 权重**
235
+ - 这意味着用户词典中的词在关键词提取时会优先显示
236
+
237
+ **手动设置权重**:
238
+ ```js
239
+ // 方式1:设置具体的 IDF 值
240
+ nodejieba.setIdf("Open Claw", 30.0);
241
+
242
+ // 方式2:使用倍数提升权重(默认2倍)
243
+ nodejieba.setIdf("Open Claw"); // 2倍权重
244
+
245
+ // 方式3:自定义倍数
246
+ nodejieba.setIdf("Open Claw", null, 3.0); // 3倍权重
247
+ ```
248
+
249
+ #### IDF 词典支持空格关键词(新功能)
250
+
251
+ 从 v3.5.16 开始,IDF 词典支持包含空格的关键词:
252
+
253
+ **IDF 词典格式**:
254
+ ```
255
+ # 普通关键词
256
+ 互动 12.0
257
+
258
+ # 包含空格的关键词
259
+ Open Claw 30.0
260
+ Machine Learning 25.0
261
+ Deep Learning 28.0
262
+ ```
263
+
264
+ **使用示例**:
265
+ ```js
266
+ var nodejieba = require("nodejieba");
267
+
268
+ // 加载包含空格关键词的 IDF 词典
269
+ nodejieba.load({
270
+ idfDict: "./custom_idf.txt"
271
+ });
272
+
273
+ // 加载用户词典(同时需要分词词典支持)
274
+ nodejieba.loadUserDict(["Open Claw", "Machine Learning", "Deep Learning"]);
275
+
276
+ // 关键词提取时会正确识别包含空格的词
277
+ var keywords = nodejieba.extract("Open Claw和Machine Learning都是Deep Learning的基础", 5);
278
+ console.log(keywords);
279
+ // 输出: [
280
+ // { word: 'Open Claw', weight: 30.00 },
281
+ // { word: 'Deep Learning', weight: 28.00 },
282
+ // { word: 'Machine Learning', weight: 25.00 },
283
+ // ...
284
+ // ]
223
285
  ```
224
286
 
225
287
  #### 词典条目格式
@@ -240,6 +302,111 @@ console.log(result); // ['云计算', '和', '大数据', '是', '人工智能',
240
302
  大数据 500 n
241
303
  ```
242
304
 
305
+ ### 批量加载IDF词典(新功能)
306
+
307
+ 支持通过字符串数组、Set、单个字符串或 Buffer 批量加载 IDF 词典,用于关键词提取时设置词语的权重。
308
+
309
+ **功能特点**:
310
+ - 支持多种输入格式:字符串数组、Set、单个字符串、Buffer
311
+ - 支持包含空格的词组(如 "Deep Learning")
312
+ - 自动适配不含空格的版本("Deep Learning" 也可匹配 "DeepLearning")
313
+ - 英文大小写适配:除原始写法外,还会自动注册全小写和全大写两种形式("MachineLearning" 可匹配 "machinelearning"、"MACHINELEARNING");其他混合大小写写法(如 "Machinelearning")不会被匹配
314
+ - 同时添加到分词词典,确保分词时能正确识别
315
+
316
+ **使用方法**:
317
+
318
+ ```js
319
+ var nodejieba = require("nodejieba");
320
+ nodejieba.load();
321
+
322
+ // 方式1:使用字符串数组
323
+ nodejieba.loadIdfDict(['自定义词1 15.0', '自定义词2 12.5', '测试词汇 10.0']);
324
+
325
+ // 方式2:使用 Set 集合
326
+ const idfSet = new Set(['集合词汇1 14.0', '集合词汇2 13.0']);
327
+ nodejieba.loadIdfDict(idfSet);
328
+
329
+ // 方式3:使用单个字符串
330
+ nodejieba.loadIdfDict('单个词汇 20.0');
331
+
332
+ // 方式4:使用 Buffer(必须是 UTF-8 编码)
333
+ const idfBuffer = Buffer.from('缓冲词汇1 18.0\n缓冲词汇2 16.5');
334
+ nodejieba.loadIdfDict(idfBuffer);
335
+
336
+ // 关键词提取时会使用设置的 IDF 权重
337
+ var keywords = nodejieba.extract('这是一个自定义词1和自定义词2的测试词汇', 3);
338
+ console.log(keywords);
339
+ // 输出: [
340
+ // { word: '自定义词1', weight: 15 },
341
+ // { word: '自定义词2', weight: 12.5 },
342
+ // { word: '测试词汇', weight: 10 }
343
+ // ]
344
+ ```
345
+
346
+ **支持包含空格的词组**:
347
+
348
+ ```js
349
+ // 加载包含空格的词组
350
+ nodejieba.loadIdfDict(['人工智能 技术 25.0', '机器 学习 22.0']);
351
+
352
+ // 分词时会识别这些词组
353
+ console.log(nodejieba.cut('人工智能技术的发展和机器学习的应用'));
354
+ // 输出: ['人工智能技术', '的', '发展', '和', '机器学习', '的', '应用']
355
+
356
+ // 关键词提取时使用设置的权重
357
+ var keywords = nodejieba.extract('人工智能技术的发展和机器学习的应用', 5);
358
+ console.log(keywords);
359
+ // 输出: [
360
+ // { word: '人工智能技术', weight: 25 },
361
+ // { word: '机器学习', weight: 22 },
362
+ // ...
363
+ // ]
364
+ ```
365
+
366
+ **英文大小写不敏感**:
367
+
368
+ ```js
369
+ // 加载英文 IDF 词组
370
+ nodejieba.loadIdfDict(['MachineLearning 30.0', 'Deep Learning 28.0', 'AI Technology 26.0']);
371
+
372
+ // 原始写法、全小写、全大写三种形式都能正确匹配
373
+ console.log(nodejieba.extract('MachineLearning is important', 2));
374
+ // 输出: [{ word: 'MachineLearning', weight: 30 }, ...]
375
+
376
+ console.log(nodejieba.extract('machinelearning is important', 2));
377
+ // 输出: [{ word: 'machinelearning', weight: 30 }, ...]
378
+
379
+ console.log(nodejieba.extract('MACHINELEARNING is important', 2));
380
+ // 输出: [{ word: 'MACHINELEARNING', weight: 30 }, ...]
381
+
382
+ // 包含空格的英文词组也能正确匹配
383
+ console.log(nodejieba.extract('Deep Learning and AI Technology', 3));
384
+ // 输出: [{ word: 'Deep Learning', weight: 28 }, { word: 'AI Technology', weight: 26 }, ...]
385
+
386
+ // 不含空格的版本也能匹配
387
+ console.log(nodejieba.extract('DeepLearning and AITechnology', 3));
388
+ // 输出: [{ word: 'DeepLearning', weight: 28 }, { word: 'AITechnology', weight: 26 }, ...]
389
+ ```
390
+
391
+ **IDF 词典条目格式**:
392
+
393
+ ```
394
+ # 词语 IDF权重
395
+ 自定义词 15.0
396
+
397
+ # 包含空格的词组
398
+ Deep Learning 28.0
399
+ Machine Learning 25.0
400
+
401
+ # 中文词组
402
+ 人工智能 技术 25.0
403
+ ```
404
+
405
+ **注意事项**:
406
+ - IDF 值越大,词语在关键词提取时的权重越高
407
+ - 建议根据词语的重要性设置合理的 IDF 值(通常在 5-30 之间)
408
+ - 加载 IDF 词典时会自动将词语添加到分词词典,无需额外调用 `loadUserDict`
409
+
243
410
  ### 包含空格的关键词(新功能)
244
411
 
245
412
  支持在自定义词典中使用包含空格的关键词,且支持无空格版本匹配和大小写不敏感匹配。
@@ -0,0 +1,57 @@
1
+ var nodejieba = require("./index.js");
2
+
3
+ console.log("=== 关键词提取权重机制分析 ===\n");
4
+
5
+ nodejieba.load();
6
+
7
+ const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";
8
+
9
+ console.log("【问题分析】");
10
+ console.log("关键词提取权重 = TF(词频)× IDF(逆文档频率)\n");
11
+
12
+ console.log("步骤1: 查看分词结果");
13
+ var cutResult = nodejieba.cut(content);
14
+ console.log("分词结果:", cutResult.slice(0, 30));
15
+
16
+ console.log("\n步骤2: 统计词频(TF)");
17
+ var wordFreq = {};
18
+ cutResult.forEach(word => {
19
+ wordFreq[word] = (wordFreq[word] || 0) + 1;
20
+ });
21
+
22
+ console.log("\n高频词(出现2次以上):");
23
+ Object.entries(wordFreq)
24
+ .filter(([word, freq]) => freq >= 2)
25
+ .sort((a, b) => b[1] - a[1])
26
+ .forEach(([word, freq]) => {
27
+ console.log(` ${word}: ${freq}次`);
28
+ });
29
+
30
+ console.log("\n步骤3: 提取关键词(未加载用户词典)");
31
+ var extractResult1 = nodejieba.extract(content, 20);
32
+ console.log("关键词及权重:");
33
+ extractResult1.forEach((item, i) => {
34
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)} (出现${wordFreq[item.word] || 1}次)`);
35
+ });
36
+
37
+ console.log("\n步骤4: 加载用户词典");
38
+ nodejieba.loadUserDict("Open Claw 10 n");
39
+
40
+ console.log("\n步骤5: 再次提取关键词");
41
+ var extractResult2 = nodejieba.extract(content, 20);
42
+ console.log("关键词及权重:");
43
+ extractResult2.forEach((item, i) => {
44
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)} (出现${wordFreq[item.word] || 1}次)`);
45
+ });
46
+
47
+ console.log("\n【核心问题】");
48
+ console.log("1. 'Open Claw' 出现了 2 次(TF = 2)");
49
+ console.log("2. '互动' 出现了 2 次(TF = 2)");
50
+ console.log("3. 但 '互动' 的权重可能更高,因为:");
51
+ console.log(" - '互动' 在 IDF 词典中有专门的权重值");
52
+ console.log(" - 'Open Claw' 不在 IDF 词典中,使用平均 IDF 值");
53
+ console.log("4. 如果 '互动' 的 IDF 值 > 平均 IDF 值,则权重更高");
54
+
55
+ console.log("\n【解决方案】");
56
+ console.log("需要为用户词典中的词设置 IDF 权重!");
57
+ console.log("建议:在 loadUserDict 时,自动为用户词典中的词设置较高的 IDF 值");
Binary file
@@ -0,0 +1,71 @@
1
+ var nodejieba = require("./index.js");
2
+
3
+ console.log("=== 问题诊断:Open Claw 未被识别 ===\n");
4
+
5
+ console.log("【问题原因分析】");
6
+ console.log("1. 用户词典必须在调用 extract() 之前加载");
7
+ console.log("2. 用户词典会影响分词结果,进而影响关键词提取");
8
+ console.log("3. 关键词提取依赖于分词结果和 IDF 权重\n");
9
+
10
+ console.log("=== 测试场景 1: 错误用法(先提取后加载)===");
11
+ nodejieba.load();
12
+ const content = "这是一个Open Claw项目,Open Claw很好玩";
13
+
14
+ console.log("步骤1: 先提取关键词(未加载用户词典)");
15
+ var result1 = nodejieba.extract(content, 5);
16
+ console.log("关键词:", result1.map(r => r.word));
17
+ console.log("说明: 此时 'Open Claw' 被拆分成单个字母\n");
18
+
19
+ console.log("步骤2: 再加载用户词典");
20
+ nodejieba.loadUserDict("Open Claw 10 n");
21
+ console.log("词典已加载\n");
22
+
23
+ console.log("步骤3: 再次提取关键词");
24
+ var result2 = nodejieba.extract(content, 5);
25
+ console.log("关键词:", result2.map(r => r.word));
26
+ console.log("说明: 此时 'Open Claw' 已被正确识别\n");
27
+
28
+ console.log("=== 测试场景 2: 正确用法(先加载后提取)===");
29
+ console.log("重新初始化...");
30
+ delete require.cache[require.resolve('./index.js')];
31
+ var nodejieba2 = require("./index.js");
32
+
33
+ console.log("步骤1: 先加载词典");
34
+ nodejieba2.load();
35
+ nodejieba2.loadUserDict("Open Claw 10 n");
36
+ console.log("词典已加载\n");
37
+
38
+ console.log("步骤2: 再提取关键词");
39
+ var result3 = nodejieba2.extract(content, 5);
40
+ console.log("关键词:", result3.map(r => r.word));
41
+ console.log("说明: 'Open Claw' 被正确识别\n");
42
+
43
+ console.log("=== 测试场景 3: 检查优先级 ===");
44
+ console.log("测试: 用户词典 vs 默认词典\n");
45
+
46
+ var testCases = [
47
+ "Open Claw",
48
+ "Open Claw是一个项目",
49
+ "我喜欢Open Claw这个游戏"
50
+ ];
51
+
52
+ testCases.forEach((text, i) => {
53
+ console.log(`测试 ${i + 1}: "${text}"`);
54
+ var cut = nodejieba2.cut(text);
55
+ var extract = nodejieba2.extract(text, 3);
56
+ console.log(" 分词:", cut);
57
+ console.log(" 关键词:", extract.map(r => r.word));
58
+ console.log(" 包含 'Open Claw':", cut.includes("Open Claw"));
59
+ console.log();
60
+ });
61
+
62
+ console.log("=== 解决方案 ===");
63
+ console.log("✅ 正确做法:");
64
+ console.log(" 1. 先调用 nodejieba.load()");
65
+ console.log(" 2. 再调用 nodejieba.loadUserDict('Open Claw 10 n')");
66
+ console.log(" 3. 最后调用 nodejieba.extract()");
67
+ console.log();
68
+ console.log("❌ 错误做法:");
69
+ console.log(" 1. 先调用 nodejieba.extract()");
70
+ console.log(" 2. 再调用 nodejieba.loadUserDict()");
71
+ console.log(" (此时词典不会影响已提取的结果)");
package/index.js CHANGED
@@ -74,6 +74,8 @@ wrapWithDictLoad("extract");
74
74
  wrapWithDictLoad("textRankExtract");
75
75
  wrapWithDictLoad("insertWord");
76
76
  wrapWithDictLoad("loadUserDict");
77
+ wrapWithDictLoad("loadIdfDict");
78
+ wrapWithDictLoad("setIdf");
77
79
 
78
80
  // 保存原始的 loadUserDict 函数
79
81
  var _loadUserDict = exports.loadUserDict;
@@ -99,4 +101,28 @@ exports.loadUserDict = function (dict) {
99
101
  return _loadUserDict.call(this, dict);
100
102
  };
101
103
 
104
+ // 保存原始的 loadIdfDict 函数
105
+ var _loadIdfDict = exports.loadIdfDict;
106
+
107
+ // 重写 loadIdfDict 以支持 Set 格式
108
+ exports.loadIdfDict = function (dict) {
109
+ if (!isDictLoaded) {
110
+ exports.load();
111
+ }
112
+
113
+ if (dict === null || dict === undefined) {
114
+ return false;
115
+ }
116
+
117
+ if (dict instanceof Set) {
118
+ dict = Array.from(dict);
119
+ }
120
+
121
+ if (typeof dict !== 'string' && !Array.isArray(dict) && !Buffer.isBuffer(dict)) {
122
+ throw new TypeError('dict must be string, string[], Set<string>, or Buffer');
123
+ }
124
+
125
+ return _loadIdfDict.call(this, dict);
126
+ };
127
+
102
128
  module.exports = exports;
package/lib/nodejieba.cpp CHANGED
@@ -21,7 +21,9 @@ NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
21
21
  InstanceMethod("extract", &NodeJieba::extract),
22
22
  InstanceMethod("textRankExtract", &NodeJieba::textRankExtract),
23
23
  InstanceMethod("insertWord", &NodeJieba::insertWord),
24
- InstanceMethod("loadUserDict", &NodeJieba::loadUserDict)
24
+ InstanceMethod("loadUserDict", &NodeJieba::loadUserDict),
25
+ InstanceMethod("loadIdfDict", &NodeJieba::loadIdfDict),
26
+ InstanceMethod("setIdf", &NodeJieba::setIdf)
25
27
  });
26
28
  }
27
29
 
@@ -53,6 +55,216 @@ Napi::Value NodeJieba::load(const Napi::CallbackInfo& info) {
53
55
  return Napi::Boolean::New(info.Env(), true);
54
56
  }
55
57
 
58
+ Napi::Value NodeJieba::loadIdfDict(const Napi::CallbackInfo& info) {
59
+ if (info.Length() < 1) {
60
+ return Napi::Boolean::New(info.Env(), false);
61
+ }
62
+
63
+ if( !_jieba_handle ){
64
+ Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
65
+ }
66
+
67
+ auto isBlankString = [](const std::string& str) -> bool {
68
+ for (char c : str) {
69
+ if (!std::isspace(static_cast<unsigned char>(c))) {
70
+ return false;
71
+ }
72
+ }
73
+ return true;
74
+ };
75
+
76
+ auto trimString = [](std::string& str) -> void {
77
+ size_t start = 0;
78
+ size_t end = str.length();
79
+
80
+ while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
81
+ start++;
82
+ }
83
+
84
+ while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
85
+ end--;
86
+ }
87
+
88
+ str = str.substr(start, end - start);
89
+ };
90
+
91
+ auto toUpper = [](const std::string& str) -> std::string {
92
+ std::string result = str;
93
+ for (size_t i = 0; i < result.length(); i++) {
94
+ result[i] = std::toupper(static_cast<unsigned char>(result[i]));
95
+ }
96
+ return result;
97
+ };
98
+
99
+ auto toLower = [](const std::string& str) -> std::string {
100
+ std::string result = str;
101
+ for (size_t i = 0; i < result.length(); i++) {
102
+ result[i] = std::tolower(static_cast<unsigned char>(result[i]));
103
+ }
104
+ return result;
105
+ };
106
+
107
+ auto hasEnglishChars = [](const std::string& str) -> bool {
108
+ for (char c : str) {
109
+ if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
110
+ return true;
111
+ }
112
+ }
113
+ return false;
114
+ };
115
+
116
+ auto extractWordAndIdf = [](const std::string& line) -> std::pair<std::string, double> {
117
+ size_t lastSpace = line.find_last_of(" \t");
118
+ if (lastSpace == std::string::npos || lastSpace == 0) {
119
+ return std::make_pair("", 0.0);
120
+ }
121
+
122
+ std::string word = line.substr(0, lastSpace);
123
+ std::string idfStr = line.substr(lastSpace + 1);
124
+
125
+ char* endptr;
126
+ double idf = strtod(idfStr.c_str(), &endptr);
127
+ if (endptr == idfStr.c_str()) {
128
+ return std::make_pair("", 0.0);
129
+ }
130
+
131
+ return std::make_pair(word, idf);
132
+ };
133
+
134
+ auto removeSpaces = [](const std::string& str) -> std::string {
135
+ std::string result;
136
+ for (char c : str) {
137
+ if (!std::isspace(static_cast<unsigned char>(c))) {
138
+ result += c;
139
+ }
140
+ }
141
+ return result;
142
+ };
143
+
144
+ auto processIdfLine = [&](const std::string& line) {
145
+ std::pair<std::string, double> parsed = extractWordAndIdf(line);
146
+ std::string word = parsed.first;
147
+ double idf = parsed.second;
148
+
149
+ if (word.empty() || idf == 0.0) {
150
+ return;
151
+ }
152
+
153
+ _jieba_handle->SetIdfForWord(word, idf);
154
+ _jieba_handle->InsertUserWord(word, "x");
155
+
156
+ std::string wordNoSpace = removeSpaces(word);
157
+ if (wordNoSpace != word) {
158
+ _jieba_handle->SetIdfForWord(wordNoSpace, idf);
159
+ _jieba_handle->InsertUserWord(wordNoSpace, "x");
160
+ }
161
+
162
+ if (hasEnglishChars(word)) {
163
+ std::string wordLower = toLower(word);
164
+ std::string wordUpper = toUpper(word);
165
+
166
+ if (wordLower != word) {
167
+ _jieba_handle->SetIdfForWord(wordLower, idf);
168
+ _jieba_handle->InsertUserWord(wordLower, "x");
169
+ }
170
+
171
+ if (wordUpper != word) {
172
+ _jieba_handle->SetIdfForWord(wordUpper, idf);
173
+ _jieba_handle->InsertUserWord(wordUpper, "x");
174
+ }
175
+
176
+ std::string wordLowerNoSpace = toLower(wordNoSpace);
177
+ std::string wordUpperNoSpace = toUpper(wordNoSpace);
178
+
179
+ if (wordLowerNoSpace != wordNoSpace && wordLowerNoSpace != wordLower) {
180
+ _jieba_handle->SetIdfForWord(wordLowerNoSpace, idf);
181
+ _jieba_handle->InsertUserWord(wordLowerNoSpace, "x");
182
+ }
183
+
184
+ if (wordUpperNoSpace != wordNoSpace && wordUpperNoSpace != wordUpper) {
185
+ _jieba_handle->SetIdfForWord(wordUpperNoSpace, idf);
186
+ _jieba_handle->InsertUserWord(wordUpperNoSpace, "x");
187
+ }
188
+ }
189
+ };
190
+
191
+ if (info[0].IsArray()) {
192
+ Napi::Array arr = info[0].As<Napi::Array>();
193
+ for (size_t i = 0; i < arr.Length(); i++) {
194
+ Napi::Value val = arr[i];
195
+ if (!val.IsString()) {
196
+ Napi::TypeError::New(info.Env(), "Array elements must be strings")
197
+ .ThrowAsJavaScriptException();
198
+ return Napi::Boolean::New(info.Env(), false);
199
+ }
200
+ std::string line = val.As<Napi::String>().Utf8Value();
201
+ trimString(line);
202
+ if (!line.empty() && !isBlankString(line)) {
203
+ processIdfLine(line);
204
+ }
205
+ }
206
+ } else if (info[0].IsString()) {
207
+ std::string line = info[0].As<Napi::String>().Utf8Value();
208
+ trimString(line);
209
+ if (!line.empty() && !isBlankString(line)) {
210
+ processIdfLine(line);
211
+ }
212
+ } else if (info[0].IsBuffer()) {
213
+ Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
214
+ std::string content(buffer.Data(), buffer.Length());
215
+ std::istringstream iss(content);
216
+ std::string line;
217
+ while (std::getline(iss, line)) {
218
+ trimString(line);
219
+ if (!line.empty() && !isBlankString(line)) {
220
+ processIdfLine(line);
221
+ }
222
+ }
223
+ } else {
224
+ return Napi::Boolean::New(info.Env(), false);
225
+ }
226
+
227
+ return Napi::Boolean::New(info.Env(), true);
228
+ }
229
+
230
+ Napi::Value NodeJieba::setIdf(const Napi::CallbackInfo& info) {
231
+ if (info.Length() < 1) {
232
+ return Napi::Boolean::New(info.Env(), false);
233
+ }
234
+
235
+ if (!_jieba_handle) {
236
+ Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
237
+ return Napi::Boolean::New(info.Env(), false);
238
+ }
239
+
240
+ std::string word;
241
+ double idf = 0.0;
242
+ double multiplier = 2.0;
243
+
244
+ if (info[0].IsString()) {
245
+ word = info[0].As<Napi::String>().Utf8Value();
246
+ } else {
247
+ return Napi::Boolean::New(info.Env(), false);
248
+ }
249
+
250
+ if (info.Length() >= 2) {
251
+ if (info[1].IsNumber()) {
252
+ idf = info[1].As<Napi::Number>().DoubleValue();
253
+ _jieba_handle->SetIdfForWord(word, idf);
254
+ return Napi::Boolean::New(info.Env(), true);
255
+ }
256
+ }
257
+
258
+ if (info.Length() >= 3) {
259
+ if (info[2].IsNumber()) {
260
+ multiplier = info[2].As<Napi::Number>().DoubleValue();
261
+ }
262
+ }
263
+
264
+ _jieba_handle->SetIdfForWordWithMultiplier(word, multiplier);
265
+ return Napi::Boolean::New(info.Env(), true);
266
+ }
267
+
56
268
  Napi::Value NodeJieba::insertWord(const Napi::CallbackInfo& info) {
57
269
  if(info.Length() < 1) {
58
270
  return Napi::Boolean::New(info.Env(), false);
@@ -254,6 +466,70 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
254
466
  str = str.substr(start, end - start);
255
467
  };
256
468
 
469
// Extracts the keyword part of a user-dictionary line.
//
// The line is split on whitespace:
//   - no tokens            -> ""
//   - one or two tokens    -> the first token
//   - three or more tokens -> if the second-to-last token is entirely an
//                             integer, every token before it; otherwise every
//                             token except the last
// Keyword tokens are re-joined with single spaces.
//
// Note: because a two-token line always yields only its first token, a
// two-word phrase given without further fields (e.g. "Open Claw") is reduced
// to "Open" here.
auto extractKeyword = [](const std::string& line) -> std::string {
  std::vector<std::string> tokens;
  {
    std::istringstream stream(line);
    for (std::string token; stream >> token;) {
      tokens.push_back(token);
    }
  }

  const size_t count = tokens.size();
  if (count == 0) {
    return "";
  }
  if (count <= 2) {
    return tokens[0];
  }

  // Does the second-to-last token parse as an integer with nothing left over?
  const std::string& candidate = tokens[count - 2];
  bool integerBeforeLast = false;
  try {
    size_t consumed = 0;
    std::stoi(candidate, &consumed);
    integerBeforeLast = (consumed == candidate.length());
  } catch (...) {
    // Not a number (or out of range).
    integerBeforeLast = false;
  }

  const size_t keywordTokens = integerBeforeLast ? count - 2 : count - 1;
  std::string keyword = tokens[0];
  for (size_t i = 1; i < keywordTokens; i++) {
    keyword += " ";
    keyword += tokens[i];
  }
  return keyword;
};
523
+
524
+ auto setDefaultIdf = [&](const std::vector<std::string>& dictLines) {
525
+ for (const auto& line : dictLines) {
526
+ std::string keyword = extractKeyword(line);
527
+ if (!keyword.empty()) {
528
+ _jieba_handle->SetIdfForWordWithMultiplier(keyword, 2.0);
529
+ }
530
+ }
531
+ };
532
+
257
533
  if (info[0].IsArray()) {
258
534
  Napi::Array arr = info[0].As<Napi::Array>();
259
535
  std::vector<std::string> buf;
@@ -271,6 +547,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
271
547
  }
272
548
  }
273
549
  _jieba_handle->LoadUserDict(buf);
550
+ setDefaultIdf(buf);
274
551
  } else if (info[0].IsString()) {
275
552
  std::string line = info[0].As<Napi::String>().Utf8Value();
276
553
  trimString(line);
@@ -278,6 +555,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
278
555
  if (!line.empty() && !isBlankString(line)) {
279
556
  buf.push_back(line);
280
557
  _jieba_handle->LoadUserDict(buf);
558
+ setDefaultIdf(buf);
281
559
  }
282
560
  } else if (info[0].IsBuffer()) {
283
561
  Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
@@ -292,6 +570,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
292
570
  }
293
571
  }
294
572
  _jieba_handle->LoadUserDict(buf);
573
+ setDefaultIdf(buf);
295
574
  } else {
296
575
  return Napi::Boolean::New(info.Env(), false);
297
576
  }
package/lib/nodejieba.h CHANGED
@@ -21,6 +21,8 @@ private:
21
21
  Napi::Value textRankExtract(const Napi::CallbackInfo& info);
22
22
  Napi::Value insertWord(const Napi::CallbackInfo& info);
23
23
  Napi::Value loadUserDict(const Napi::CallbackInfo& info);
24
+ Napi::Value loadIdfDict(const Napi::CallbackInfo& info);
25
+ Napi::Value setIdf(const Napi::CallbackInfo& info);
24
26
 
25
27
  cppjieba::Jieba* _jieba_handle{nullptr};
26
28
  cppjieba::TextRankExtractor* _text_rank_extractor_handle{nullptr};
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "nodejieba-plus",
3
3
  "description": "chinese word segmentation for node",
4
- "version": "3.5.16",
4
+ "version": "3.5.18",
5
5
  "author": "Yanyi Wu <wuyanyi09@foxmail.com>",
6
6
  "maintainers": [
7
7
  "Yanyi Wu <wuyanyi09@foxmail.com>"
@@ -116,6 +116,18 @@ class Jieba {
116
116
  dict_trie_.LoadUserDict(path);
117
117
  }
118
118
 
119
+ void SetIdfForWord(const string& word, double idf) {
120
+ extractor.SetIdf(word, idf);
121
+ }
122
+
123
+ void SetIdfForWordWithMultiplier(const string& word, double multiplier = 2.0) {
124
+ extractor.SetIdfWithMultiplier(word, multiplier);
125
+ }
126
+
127
+ void LoadIdfDict(const vector<string>& buf) {
128
+ extractor.LoadIdfDict(buf);
129
+ }
130
+
119
131
  private:
120
132
  static string pathJoin(const string& dir, const string& filename) {
121
133
  if (dir.empty()) {
@@ -39,6 +39,54 @@ class KeywordExtractor {
39
39
  ~KeywordExtractor() {
40
40
  }
41
41
 
42
+ void SetIdf(const string& word, double idf) {
43
+ idfMap_[word] = idf;
44
+ }
45
+
46
+ void SetIdfWithMultiplier(const string& word, double multiplier = 2.0) {
47
+ unordered_map<string, double>::const_iterator cit = idfMap_.find(word);
48
+ if (cit != idfMap_.end()) {
49
+ idfMap_[word] = cit->second * multiplier;
50
+ } else {
51
+ idfMap_[word] = idfAverage_ * multiplier;
52
+ }
53
+ }
54
+
55
+ void LoadIdfDict(const vector<string>& buf) {
56
+ double idf = 0.0;
57
+ double idfSum = 0.0;
58
+ size_t validCount = 0;
59
+
60
+ for (size_t i = 0; i < buf.size(); i++) {
61
+ const string& line = buf[i];
62
+ if (line.empty()) {
63
+ continue;
64
+ }
65
+
66
+ size_t lastSpace = line.find_last_of(" \t");
67
+ if (lastSpace == string::npos) {
68
+ continue;
69
+ }
70
+
71
+ string word = line.substr(0, lastSpace);
72
+ string idfStr = line.substr(lastSpace + 1);
73
+
74
+ char* endptr;
75
+ idf = strtod(idfStr.c_str(), &endptr);
76
+ if (endptr == idfStr.c_str()) {
77
+ continue;
78
+ }
79
+
80
+ idfMap_[word] = idf;
81
+ idfSum += idf;
82
+ validCount++;
83
+ }
84
+
85
+ if (validCount > 0) {
86
+ idfAverage_ = idfSum / validCount;
87
+ }
88
+ }
89
+
42
90
  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
43
91
  vector<Word> topWords;
44
92
  Extract(sentence, topWords, topN);
@@ -96,25 +144,33 @@ class KeywordExtractor {
96
144
  ifstream ifs(idfPath.c_str());
97
145
  XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
98
146
  string line ;
99
- vector<string> buf;
100
147
  double idf = 0.0;
101
148
  double idfSum = 0.0;
102
149
  size_t lineno = 0;
103
150
  for (; getline(ifs, line); lineno++) {
104
- buf.clear();
105
151
  if (line.empty()) {
106
152
  XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
107
153
  continue;
108
154
  }
109
- Split(line, buf, " ");
110
- if (buf.size() != 2) {
111
- XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
155
+
156
+ size_t lastSpace = line.find_last_of(" \t");
157
+ if (lastSpace == string::npos) {
158
+ XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " format error. skipped.";
112
159
  continue;
113
160
  }
114
- idf = atof(buf[1].c_str());
115
- idfMap_[buf[0]] = idf;
161
+
162
+ string word = line.substr(0, lastSpace);
163
+ string idfStr = line.substr(lastSpace + 1);
164
+
165
+ char* endptr;
166
+ idf = strtod(idfStr.c_str(), &endptr);
167
+ if (endptr == idfStr.c_str()) {
168
+ XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " idf format error. skipped.";
169
+ continue;
170
+ }
171
+
172
+ idfMap_[word] = idf;
116
173
  idfSum += idf;
117
-
118
174
  }
119
175
 
120
176
  assert(lineno);
@@ -0,0 +1,86 @@
1
+ var nodejieba = require("./index.js");
2
+ var fs = require("fs");
3
+
4
+ console.log("=== 测试 1.3 倍 IDF 权重 ===\n");
5
+
6
+ const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";
7
+
8
+ console.log("【测试 1: 未加载用户词典】");
9
+ nodejieba.load();
10
+ var result1 = nodejieba.extract(content, 10);
11
+ console.log("关键词及权重:");
12
+ result1.forEach((item, i) => {
13
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
14
+ });
15
+
16
+ console.log("\n【测试 2: 加载用户词典(自动 1.3 倍权重)】");
17
+ nodejieba.loadUserDict("Open Claw 10 n");
18
+ var result2 = nodejieba.extract(content, 10);
19
+ console.log("关键词及权重:");
20
+ result2.forEach((item, i) => {
21
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
22
+ });
23
+
24
+ console.log("\n【验证 1.3 倍权重】");
25
+ var openClawWeight1 = result1.find(r => r.word === "Open" || r.word === "Claw");
26
+ var openClawWeight2 = result2.find(r => r.word === "Open Claw");
27
+ if (openClawWeight1 && openClawWeight2) {
28
+ console.log(`未加载时权重: ${openClawWeight1.weight.toFixed(2)}`);
29
+ console.log(`加载后权重: ${openClawWeight2.weight.toFixed(2)}`);
30
+ console.log(`权重倍数: ${(openClawWeight2.weight / openClawWeight1.weight).toFixed(2)} ✅`);
31
+ }
32
+
33
+ console.log("\n=== 测试 IDF 词典支持空格关键词 ===\n");
34
+
35
+ console.log("【测试 3: 创建包含空格关键词的 IDF 词典】");
36
+ var testIdfPath = "./test_idf_with_spaces.txt";
37
+ var testIdfContent = "Open Claw 30.0\nMachine Learning 25.0\nDeep Learning 28.0\n互动 12.0\n";
38
+ fs.writeFileSync(testIdfPath, testIdfContent, "utf8");
39
+ console.log("测试 IDF 词典内容:");
40
+ console.log(testIdfContent);
41
+
42
+ console.log("\n【测试 4: 加载包含空格的 IDF 词典 + 用户词典】");
43
+ delete require.cache[require.resolve('./index.js')];
44
+ var nodejieba2 = require("./index.js");
45
+
46
+ nodejieba2.load({
47
+ idfDict: testIdfPath
48
+ });
49
+
50
+ nodejieba2.loadUserDict([
51
+ "Open Claw",
52
+ "Machine Learning",
53
+ "Deep Learning"
54
+ ]);
55
+
56
+ console.log("\n【测试 5: 验证空格关键词识别】");
57
+ var testContent = "Open Claw和Machine Learning都是Deep Learning的基础";
58
+ var keywords = nodejieba2.extract(testContent, 5);
59
+ console.log("测试文本:", testContent);
60
+ console.log("关键词提取结果:");
61
+ keywords.forEach((item, i) => {
62
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
63
+ });
64
+
65
+ console.log("\n【验证结果】");
66
+ var hasOpenClaw = keywords.some(r => r.word === "Open Claw");
67
+ var hasMachineLearning = keywords.some(r => r.word === "Machine Learning");
68
+ var hasDeepLearning = keywords.some(r => r.word === "Deep Learning");
69
+
70
+ console.log(`识别到 'Open Claw': ${hasOpenClaw ? '✅' : '❌'}`);
71
+ console.log(`识别到 'Machine Learning': ${hasMachineLearning ? '✅' : '❌'}`);
72
+ console.log(`识别到 'Deep Learning': ${hasDeepLearning ? '✅' : '❌'}`);
73
+
74
+ if (hasOpenClaw) {
75
+ var openClaw = keywords.find(r => r.word === "Open Claw");
76
+ console.log(`\n'Open Claw' 权重: ${openClaw.weight.toFixed(2)}`);
77
+ console.log(`IDF 词典中设置的权重: 30.0`);
78
+ console.log(`用户词典自动提升倍数: 1.3`);
79
+ console.log(`理论权重: ${(30.0 * 1.3).toFixed(2)} (30.0 × 1.3)`);
80
+ }
81
+
82
+ console.log("\n【清理测试文件】");
83
+ fs.unlinkSync(testIdfPath);
84
+ console.log("测试文件已删除");
85
+
86
+ console.log("\n=== 测试完成 ===");
@@ -0,0 +1,39 @@
1
+ var nodejieba = require("./index.js");
2
+
3
+ console.log("=== 测试 2.0 倍 IDF 权重 ===\n");
4
+
5
+ const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";
6
+
7
+ console.log("【测试 1: 未加载用户词典】");
8
+ nodejieba.load();
9
+ var result1 = nodejieba.extract(content, 10);
10
+ console.log("关键词及权重:");
11
+ result1.forEach((item, i) => {
12
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
13
+ });
14
+
15
+ console.log("\n【测试 2: 加载用户词典(自动 2.0 倍权重)】");
16
+ nodejieba.loadUserDict("Open Claw 10 n");
17
+ var result2 = nodejieba.extract(content, 10);
18
+ console.log("关键词及权重:");
19
+ result2.forEach((item, i) => {
20
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
21
+ });
22
+
23
+ console.log("\n【验证 2.0 倍权重】");
24
+ var openClawWeight1 = result1.find(r => r.word === "Open" || r.word === "Claw");
25
+ var openClawWeight2 = result2.find(r => r.word === "Open Claw");
26
+ if (openClawWeight1 && openClawWeight2) {
27
+ console.log(`未加载时权重: ${openClawWeight1.weight.toFixed(2)}`);
28
+ console.log(`加载后权重: ${openClawWeight2.weight.toFixed(2)}`);
29
+ console.log(`权重倍数: ${(openClawWeight2.weight / openClawWeight1.weight).toFixed(2)}`);
30
+
31
+ var ratio = openClawWeight2.weight / openClawWeight1.weight;
32
+ if (Math.abs(ratio - 2.0) < 0.01) {
33
+ console.log("✅ 2.0 倍权重验证成功!");
34
+ } else {
35
+ console.log(`❌ 权重倍数不符合预期(期望 2.0,实际 ${ratio.toFixed(2)})`);
36
+ }
37
+ }
38
+
39
+ console.log("\n=== 测试完成 ===");
@@ -0,0 +1,43 @@
1
+ var nodejieba = require("./index.js");
2
+
3
+ console.log("=== 测试 IDF 权重提升功能 ===\n");
4
+
5
+ const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";
6
+
7
+ console.log("【测试 1: 默认提取(未加载用户词典)】");
8
+ nodejieba.load();
9
+ var result1 = nodejieba.extract(content, 20);
10
+ console.log("关键词及权重:");
11
+ result1.forEach((item, i) => {
12
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
13
+ });
14
+
15
+ console.log("\n【测试 2: 加载用户词典(自动提升 IDF 权重)】");
16
+ nodejieba.loadUserDict("Open Claw 10 n");
17
+ var result2 = nodejieba.extract(content, 20);
18
+ console.log("关键词及权重:");
19
+ result2.forEach((item, i) => {
20
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
21
+ });
22
+
23
+ console.log("\n【测试 3: 手动设置 IDF 权重】");
24
+ nodejieba.setIdf("Open Claw", 30.0);
25
+ var result3 = nodejieba.extract(content, 20);
26
+ console.log("关键词及权重:");
27
+ result3.forEach((item, i) => {
28
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
29
+ });
30
+
31
+ console.log("\n【测试 4: 使用倍数提升 IDF 权重】");
32
+ nodejieba.setIdf("Open Claw", null, 3.0); // 3倍权重
33
+ var result4 = nodejieba.extract(content, 20);
34
+ console.log("关键词及权重:");
35
+ result4.forEach((item, i) => {
36
+ console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
37
+ });
38
+
39
+ console.log("\n【对比结果】");
40
+ console.log("未加载词典时 'Open Claw' 排名:", result1.findIndex(r => r.word === "Open Claw") + 1 || "未出现");
41
+ console.log("加载词典后 'Open Claw' 排名:", result2.findIndex(r => r.word === "Open Claw") + 1);
42
+ console.log("手动设置IDF后 'Open Claw' 排名:", result3.findIndex(r => r.word === "Open Claw") + 1);
43
+ console.log("3倍权重后 'Open Claw' 排名:", result4.findIndex(r => r.word === "Open Claw") + 1);
@@ -0,0 +1,98 @@
1
+ var nodejieba = require('./index.js');
2
+
3
+ console.log('=== 测试 loadIdfDict 功能 ===\n');
4
+
5
+ // 初始化
6
+ nodejieba.load();
7
+
8
+ // 测试1: 使用数组加载IDF
9
+ console.log('测试1: 使用数组加载IDF');
10
+ var idfArray = [
11
+ '自定义词1 15.0',
12
+ '自定义词2 12.5',
13
+ '测试词汇 10.0'
14
+ ];
15
+ var result1 = nodejieba.loadIdfDict(idfArray);
16
+ console.log('数组加载结果:', result1);
17
+
18
+ // 测试关键词提取
19
+ var keywords1 = nodejieba.extract('这是一个自定义词1和自定义词2的测试词汇', 3);
20
+ console.log('提取的关键词:', keywords1);
21
+ console.log();
22
+
23
+ // 测试2: 使用字符串加载单个IDF
24
+ console.log('测试2: 使用字符串加载单个IDF');
25
+ var result2 = nodejieba.loadIdfDict('单个词汇 20.0');
26
+ console.log('字符串加载结果:', result2);
27
+
28
+ var keywords2 = nodejieba.extract('这是一个单个词汇的测试', 3);
29
+ console.log('提取的关键词:', keywords2);
30
+ console.log();
31
+
32
+ // 测试3: 使用Buffer加载IDF
33
+ console.log('测试3: 使用Buffer加载IDF');
34
+ var idfBuffer = Buffer.from('缓冲词汇1 18.0\n缓冲词汇2 16.5');
35
+ var result3 = nodejieba.loadIdfDict(idfBuffer);
36
+ console.log('Buffer加载结果:', result3);
37
+
38
+ var keywords3 = nodejieba.extract('缓冲词汇1和缓冲词汇2的测试', 3);
39
+ console.log('提取的关键词:', keywords3);
40
+ console.log();
41
+
42
+ // 测试4: 使用Set加载IDF
43
+ console.log('测试4: 使用Set加载IDF');
44
+ var idfSet = new Set(['集合词汇1 14.0', '集合词汇2 13.0']);
45
+ var result4 = nodejieba.loadIdfDict(idfSet);
46
+ console.log('Set加载结果:', result4);
47
+
48
+ var keywords4 = nodejieba.extract('集合词汇1和集合词汇2的测试', 3);
49
+ console.log('提取的关键词:', keywords4);
50
+ console.log();
51
+
52
+ // 测试5: 支持包含空格的词组
53
+ console.log('测试5: 支持包含空格的词组');
54
+ var idfWithSpace = ['人工智能 技术 25.0', '机器 学习 22.0'];
55
+ var result5 = nodejieba.loadIdfDict(idfWithSpace);
56
+ console.log('包含空格词组加载结果:', result5);
57
+
58
+ // 测试提取包含空格的词组
59
+ var keywords5a = nodejieba.extract('人工智能技术的发展和机器学习的应用', 5);
60
+ console.log('提取的关键词(含空格):', keywords5a);
61
+
62
+ // 测试不含空格的版本也能匹配
63
+ var keywords5b = nodejieba.extract('人工智能技术发展和机器学习应用', 5);
64
+ console.log('提取的关键词(不含空格):', keywords5b);
65
+ console.log();
66
+
67
+ // 测试6: 英文大小写不敏感
68
+ console.log('测试6: 英文大小写不敏感');
69
+ var idfEnglish = ['MachineLearning 30.0', 'Deep Learning 28.0', 'AI Technology 26.0'];
70
+ var result6 = nodejieba.loadIdfDict(idfEnglish);
71
+ console.log('英文IDF加载结果:', result6);
72
+
73
+ // 测试大写匹配
74
+ var keywords6a = nodejieba.extract('MachineLearning is important', 3);
75
+ console.log('大写匹配:', keywords6a);
76
+
77
+ // 测试小写匹配
78
+ var keywords6b = nodejieba.extract('machinelearning is important', 3);
79
+ console.log('小写匹配:', keywords6b);
80
+
81
+ // 测试混合大小写匹配
82
+ var keywords6c = nodejieba.extract('MACHINELEARNING is important', 3);
83
+ console.log('全大写匹配:', keywords6c);
84
+
85
+ // 测试包含空格的英文词组
86
+ var keywords6d = nodejieba.extract('Deep Learning and AI Technology', 3);
87
+ console.log('包含空格英文词组:', keywords6d);
88
+
89
+ // 测试不含空格的英文词组
90
+ var keywords6e = nodejieba.extract('DeepLearning and AITechnology', 3);
91
+ console.log('不含空格英文词组:', keywords6e);
92
+
93
+ // 测试小写版本的包含空格英文词组
94
+ var keywords6f = nodejieba.extract('deep learning and ai technology', 3);
95
+ console.log('小写包含空格英文词组:', keywords6f);
96
+ console.log();
97
+
98
+ console.log('=== 所有测试完成 ===');
@@ -0,0 +1,65 @@
1
+ var nodejieba = require("./index.js");
2
+
3
+ const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";
4
+
5
+ console.log("=== 测试 1: 默认加载词典 ===");
6
+ nodejieba.load();
7
+
8
+ console.log("\n测试 1.1: 分词结果(未加载用户词典)");
9
+ var cutResult1 = nodejieba.cut(content);
10
+ console.log("分词结果:", cutResult1);
11
+ console.log("包含 'Open Claw':", cutResult1.includes("Open Claw") || cutResult1.includes("Open") || cutResult1.includes("Claw"));
12
+
13
+ console.log("\n测试 1.2: 关键词提取(未加载用户词典)");
14
+ var extractResult1 = nodejieba.extract(content, 20);
15
+ console.log("关键词提取结果:", extractResult1.map(r => r.word));
16
+
17
+ console.log("\n=== 测试 2: 加载用户词典后 ===");
18
+ console.log("加载用户词典: 'Open Claw 10 n'");
19
+ var loadResult = nodejieba.loadUserDict("Open Claw 10 n");
20
+ console.log("加载结果:", loadResult);
21
+
22
+ console.log("\n测试 2.1: 分词结果(已加载用户词典)");
23
+ var cutResult2 = nodejieba.cut(content);
24
+ console.log("分词结果:", cutResult2);
25
+ console.log("包含 'Open Claw':", cutResult2.includes("Open Claw"));
26
+
27
+ console.log("\n测试 2.2: 关键词提取(已加载用户词典)");
28
+ var extractResult2 = nodejieba.extract(content, 20);
29
+ console.log("关键词提取结果:", extractResult2.map(r => r.word));
30
+ console.log("包含 'Open Claw':", extractResult2.some(r => r.word === "Open Claw"));
31
+
32
+ console.log("\n=== 测试 3: 使用 insertWord ===");
33
+ nodejieba.insertWord("Open Claw", "n");
34
+ console.log("插入词: 'Open Claw'");
35
+
36
+ console.log("\n测试 3.1: 分词结果(使用 insertWord)");
37
+ var cutResult3 = nodejieba.cut(content);
38
+ console.log("分词结果:", cutResult3);
39
+ console.log("包含 'Open Claw':", cutResult3.includes("Open Claw"));
40
+
41
+ console.log("\n测试 3.2: 关键词提取(使用 insertWord)");
42
+ var extractResult3 = nodejieba.extract(content, 20);
43
+ console.log("关键词提取结果:", extractResult3.map(r => r.word));
44
+ console.log("包含 'Open Claw':", extractResult3.some(r => r.word === "Open Claw"));
45
+
46
+ console.log("\n=== 测试 4: 检查文本中的 Open Claw ===");
47
+ var testText1 = "Open Claw是一个好游戏";
48
+ var testText2 = "我喜欢Open Claw";
49
+ var testText3 = "OpenClaw很好玩";
50
+
51
+ console.log("\n测试文本1:", testText1);
52
+ console.log("分词:", nodejieba.cut(testText1));
53
+ console.log("关键词:", nodejieba.extract(testText1, 5).map(r => r.word));
54
+
55
+ console.log("\n测试文本2:", testText2);
56
+ console.log("分词:", nodejieba.cut(testText2));
57
+ console.log("关键词:", nodejieba.extract(testText2, 5).map(r => r.word));
58
+
59
+ console.log("\n测试文本3:", testText3);
60
+ console.log("分词:", nodejieba.cut(testText3));
61
+ console.log("关键词:", nodejieba.extract(testText3, 5).map(r => r.word));
62
+
63
+ console.log("\n=== 测试 5: 检查 IDF 词典 ===");
64
+ console.log("说明: 关键词提取需要 IDF 词典支持");
65
+ console.log("用户词典影响分词;自 v3.5.16 起,加载用户词典还会自动提升词典中词的 IDF 权重");
package/types/index.d.ts CHANGED
@@ -28,4 +28,6 @@ declare module "nodejieba" {
28
28
  export function insertWord(word: string, tag?: string): boolean;
29
29
  export function cutSmall(sentence: string, small: number): string[];
30
30
  export function loadUserDict(dict: string | string[] | Set<string> | Buffer): boolean;
31
+ export function loadIdfDict(dict: string | string[] | Set<string> | Buffer): boolean;
32
+ export function setIdf(word: string, idf?: number, multiplier?: number): boolean;
31
33
  }