nodejieba-plus 3.5.17 → 3.5.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -231,7 +231,7 @@ console.log(keywords); // 用户词典中的词排名会显著提升
231
231
  从 v3.5.16 开始,加载用户词典时会自动为词典中的词设置更高的 IDF 权重,确保在关键词提取时获得更高的排名:
232
232
 
233
233
  **自动权重提升**:
234
- - 加载用户词典后,词典中的词会自动获得 **1.3 倍 IDF 权重**
234
+ - 加载用户词典后,词典中的词会自动获得 **2 倍 IDF 权重**
235
235
  - 这意味着用户词典中的词在关键词提取时会优先显示
236
236
 
237
237
  **手动设置权重**:
@@ -239,11 +239,11 @@ console.log(keywords); // 用户词典中的词排名会显著提升
239
239
  // 方式1:设置具体的 IDF 值
240
240
  nodejieba.setIdf("Open Claw", 30.0);
241
241
 
242
- // 方式2:使用倍数提升权重(默认1.3倍)
243
- nodejieba.setIdf("Open Claw"); // 1.3倍权重
242
+ // 方式2:使用倍数提升权重(默认2倍)
243
+ nodejieba.setIdf("Open Claw"); // 2倍权重
244
244
 
245
245
  // 方式3:自定义倍数
246
- nodejieba.setIdf("Open Claw", null, 2.0); // 2倍权重
246
+ nodejieba.setIdf("Open Claw", null, 3.0); // 3倍权重
247
247
  ```
248
248
 
249
249
  #### IDF 词典支持空格关键词(新功能)
@@ -302,6 +302,111 @@ console.log(keywords);
302
302
  大数据 500 n
303
303
  ```
304
304
 
305
+ ### 批量加载IDF词典(新功能)
306
+
307
+ 支持通过字符串数组、Set、单个字符串或 Buffer 批量加载 IDF 词典,用于关键词提取时设置词语的权重。
308
+
309
+ **功能特点**:
310
+ - 支持多种输入格式:字符串数组、Set、单个字符串、Buffer
311
+ - 支持包含空格的词组(如 "Deep Learning")
312
+ - 自动适配不含空格的版本("Deep Learning" 也可匹配 "DeepLearning")
313
+ - 英文大小写不敏感("MachineLearning" 可匹配 "machinelearning"、"MACHINELEARNING")
314
+ - 同时添加到分词词典,确保分词时能正确识别
315
+
316
+ **使用方法**:
317
+
318
+ ```js
319
+ var nodejieba = require("nodejieba");
320
+ nodejieba.load();
321
+
322
+ // 方式1:使用字符串数组
323
+ nodejieba.loadIdfDict(['自定义词1 15.0', '自定义词2 12.5', '测试词汇 10.0']);
324
+
325
+ // 方式2:使用 Set 集合
326
+ const idfSet = new Set(['集合词汇1 14.0', '集合词汇2 13.0']);
327
+ nodejieba.loadIdfDict(idfSet);
328
+
329
+ // 方式3:使用单个字符串
330
+ nodejieba.loadIdfDict('单个词汇 20.0');
331
+
332
+ // 方式4:使用 Buffer(必须是 UTF-8 编码)
333
+ const idfBuffer = Buffer.from('缓冲词汇1 18.0\n缓冲词汇2 16.5');
334
+ nodejieba.loadIdfDict(idfBuffer);
335
+
336
+ // 关键词提取时会使用设置的 IDF 权重
337
+ var keywords = nodejieba.extract('这是一个自定义词1和自定义词2的测试词汇', 3);
338
+ console.log(keywords);
339
+ // 输出: [
340
+ // { word: '自定义词1', weight: 15 },
341
+ // { word: '自定义词2', weight: 12.5 },
342
+ // { word: '测试词汇', weight: 10 }
343
+ // ]
344
+ ```
345
+
346
+ **支持包含空格的词组**:
347
+
348
+ ```js
349
+ // 加载包含空格的词组
350
+ nodejieba.loadIdfDict(['人工智能 技术 25.0', '机器 学习 22.0']);
351
+
352
+ // 分词时会识别这些词组
353
+ console.log(nodejieba.cut('人工智能技术的发展和机器学习的应用'));
354
+ // 输出: ['人工智能技术', '的', '发展', '和', '机器学习', '的', '应用']
355
+
356
+ // 关键词提取时使用设置的权重
357
+ var keywords = nodejieba.extract('人工智能技术的发展和机器学习的应用', 5);
358
+ console.log(keywords);
359
+ // 输出: [
360
+ // { word: '人工智能技术', weight: 25 },
361
+ // { word: '机器学习', weight: 22 },
362
+ // ...
363
+ // ]
364
+ ```
365
+
366
+ **英文大小写不敏感**:
367
+
368
+ ```js
369
+ // 加载英文 IDF 词组
370
+ nodejieba.loadIdfDict(['MachineLearning 30.0', 'Deep Learning 28.0', 'AI Technology 26.0']);
371
+
372
+ // 各种大小写都能正确匹配
373
+ console.log(nodejieba.extract('MachineLearning is important', 2));
374
+ // 输出: [{ word: 'MachineLearning', weight: 30 }, ...]
375
+
376
+ console.log(nodejieba.extract('machinelearning is important', 2));
377
+ // 输出: [{ word: 'machinelearning', weight: 30 }, ...]
378
+
379
+ console.log(nodejieba.extract('MACHINELEARNING is important', 2));
380
+ // 输出: [{ word: 'MACHINELEARNING', weight: 30 }, ...]
381
+
382
+ // 包含空格的英文词组也能正确匹配
383
+ console.log(nodejieba.extract('Deep Learning and AI Technology', 3));
384
+ // 输出: [{ word: 'Deep Learning', weight: 28 }, { word: 'AI Technology', weight: 26 }, ...]
385
+
386
+ // 不含空格的版本也能匹配
387
+ console.log(nodejieba.extract('DeepLearning and AITechnology', 3));
388
+ // 输出: [{ word: 'DeepLearning', weight: 28 }, { word: 'AITechnology', weight: 26 }, ...]
389
+ ```
390
+
391
+ **IDF 词典条目格式**:
392
+
393
+ ```
394
+ # 词语 IDF权重
395
+ 自定义词 15.0
396
+
397
+ # 包含空格的词组
398
+ Deep Learning 28.0
399
+ Machine Learning 25.0
400
+
401
+ # 中文词组
402
+ 人工智能 技术 25.0
403
+ ```
404
+
405
+ **注意事项**:
406
+ - IDF 值越大,词语在关键词提取时的权重越高
407
+ - 建议根据词语的重要性设置合理的 IDF 值(通常在 5-30 之间)
408
+ - 加载 IDF 词典时会自动将词语添加到分词词典,无需额外调用 `loadUserDict`
409
+
305
410
  ### 包含空格的关键词(新功能)
306
411
 
307
412
  支持在自定义词典中使用包含空格的关键词,且支持无空格版本匹配和大小写不敏感匹配。
Binary file
package/index.js CHANGED
@@ -74,6 +74,7 @@ wrapWithDictLoad("extract");
74
74
  wrapWithDictLoad("textRankExtract");
75
75
  wrapWithDictLoad("insertWord");
76
76
  wrapWithDictLoad("loadUserDict");
77
+ wrapWithDictLoad("loadIdfDict");
77
78
  wrapWithDictLoad("setIdf");
78
79
 
79
80
  // 保存原始的 loadUserDict 函数
@@ -100,4 +101,28 @@ exports.loadUserDict = function (dict) {
100
101
  return _loadUserDict.call(this, dict);
101
102
  };
102
103
 
104
+ // 保存原始的 loadIdfDict 函数
105
+ var _loadIdfDict = exports.loadIdfDict;
106
+
107
+ // 重写 loadIdfDict 以支持 Set 格式
108
+ exports.loadIdfDict = function (dict) {
109
+ if (!isDictLoaded) {
110
+ exports.load();
111
+ }
112
+
113
+ if (dict === null || dict === undefined) {
114
+ return false;
115
+ }
116
+
117
+ if (dict instanceof Set) {
118
+ dict = Array.from(dict);
119
+ }
120
+
121
+ if (typeof dict !== 'string' && !Array.isArray(dict) && !Buffer.isBuffer(dict)) {
122
+ throw new TypeError('dict must be string, string[], Set<string>, or Buffer');
123
+ }
124
+
125
+ return _loadIdfDict.call(this, dict);
126
+ };
127
+
103
128
  module.exports = exports;
package/lib/nodejieba.cpp CHANGED
@@ -22,6 +22,7 @@ NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
22
22
  InstanceMethod("textRankExtract", &NodeJieba::textRankExtract),
23
23
  InstanceMethod("insertWord", &NodeJieba::insertWord),
24
24
  InstanceMethod("loadUserDict", &NodeJieba::loadUserDict),
25
+ InstanceMethod("loadIdfDict", &NodeJieba::loadIdfDict),
25
26
  InstanceMethod("setIdf", &NodeJieba::setIdf)
26
27
  });
27
28
  }
@@ -54,6 +55,178 @@ Napi::Value NodeJieba::load(const Napi::CallbackInfo& info) {
54
55
  return Napi::Boolean::New(info.Env(), true);
55
56
  }
56
57
 
58
+ Napi::Value NodeJieba::loadIdfDict(const Napi::CallbackInfo& info) {
59
+ if (info.Length() < 1) {
60
+ return Napi::Boolean::New(info.Env(), false);
61
+ }
62
+
63
+ if( !_jieba_handle ){
64
+ Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
65
+ }
66
+
67
+ auto isBlankString = [](const std::string& str) -> bool {
68
+ for (char c : str) {
69
+ if (!std::isspace(static_cast<unsigned char>(c))) {
70
+ return false;
71
+ }
72
+ }
73
+ return true;
74
+ };
75
+
76
+ auto trimString = [](std::string& str) -> void {
77
+ size_t start = 0;
78
+ size_t end = str.length();
79
+
80
+ while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
81
+ start++;
82
+ }
83
+
84
+ while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
85
+ end--;
86
+ }
87
+
88
+ str = str.substr(start, end - start);
89
+ };
90
+
91
+ auto toUpper = [](const std::string& str) -> std::string {
92
+ std::string result = str;
93
+ for (size_t i = 0; i < result.length(); i++) {
94
+ result[i] = std::toupper(static_cast<unsigned char>(result[i]));
95
+ }
96
+ return result;
97
+ };
98
+
99
+ auto toLower = [](const std::string& str) -> std::string {
100
+ std::string result = str;
101
+ for (size_t i = 0; i < result.length(); i++) {
102
+ result[i] = std::tolower(static_cast<unsigned char>(result[i]));
103
+ }
104
+ return result;
105
+ };
106
+
107
+ auto hasEnglishChars = [](const std::string& str) -> bool {
108
+ for (char c : str) {
109
+ if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
110
+ return true;
111
+ }
112
+ }
113
+ return false;
114
+ };
115
+
116
+ auto extractWordAndIdf = [](const std::string& line) -> std::pair<std::string, double> {
117
+ size_t lastSpace = line.find_last_of(" \t");
118
+ if (lastSpace == std::string::npos || lastSpace == 0) {
119
+ return std::make_pair("", 0.0);
120
+ }
121
+
122
+ std::string word = line.substr(0, lastSpace);
123
+ std::string idfStr = line.substr(lastSpace + 1);
124
+
125
+ char* endptr;
126
+ double idf = strtod(idfStr.c_str(), &endptr);
127
+ if (endptr == idfStr.c_str()) {
128
+ return std::make_pair("", 0.0);
129
+ }
130
+
131
+ return std::make_pair(word, idf);
132
+ };
133
+
134
+ auto removeSpaces = [](const std::string& str) -> std::string {
135
+ std::string result;
136
+ for (char c : str) {
137
+ if (!std::isspace(static_cast<unsigned char>(c))) {
138
+ result += c;
139
+ }
140
+ }
141
+ return result;
142
+ };
143
+
144
+ auto processIdfLine = [&](const std::string& line) {
145
+ std::pair<std::string, double> parsed = extractWordAndIdf(line);
146
+ std::string word = parsed.first;
147
+ double idf = parsed.second;
148
+
149
+ if (word.empty() || idf == 0.0) {
150
+ return;
151
+ }
152
+
153
+ _jieba_handle->SetIdfForWord(word, idf);
154
+ _jieba_handle->InsertUserWord(word, "x");
155
+
156
+ std::string wordNoSpace = removeSpaces(word);
157
+ if (wordNoSpace != word) {
158
+ _jieba_handle->SetIdfForWord(wordNoSpace, idf);
159
+ _jieba_handle->InsertUserWord(wordNoSpace, "x");
160
+ }
161
+
162
+ if (hasEnglishChars(word)) {
163
+ std::string wordLower = toLower(word);
164
+ std::string wordUpper = toUpper(word);
165
+
166
+ if (wordLower != word) {
167
+ _jieba_handle->SetIdfForWord(wordLower, idf);
168
+ _jieba_handle->InsertUserWord(wordLower, "x");
169
+ }
170
+
171
+ if (wordUpper != word) {
172
+ _jieba_handle->SetIdfForWord(wordUpper, idf);
173
+ _jieba_handle->InsertUserWord(wordUpper, "x");
174
+ }
175
+
176
+ std::string wordLowerNoSpace = toLower(wordNoSpace);
177
+ std::string wordUpperNoSpace = toUpper(wordNoSpace);
178
+
179
+ if (wordLowerNoSpace != wordNoSpace && wordLowerNoSpace != wordLower) {
180
+ _jieba_handle->SetIdfForWord(wordLowerNoSpace, idf);
181
+ _jieba_handle->InsertUserWord(wordLowerNoSpace, "x");
182
+ }
183
+
184
+ if (wordUpperNoSpace != wordNoSpace && wordUpperNoSpace != wordUpper) {
185
+ _jieba_handle->SetIdfForWord(wordUpperNoSpace, idf);
186
+ _jieba_handle->InsertUserWord(wordUpperNoSpace, "x");
187
+ }
188
+ }
189
+ };
190
+
191
+ if (info[0].IsArray()) {
192
+ Napi::Array arr = info[0].As<Napi::Array>();
193
+ for (size_t i = 0; i < arr.Length(); i++) {
194
+ Napi::Value val = arr[i];
195
+ if (!val.IsString()) {
196
+ Napi::TypeError::New(info.Env(), "Array elements must be strings")
197
+ .ThrowAsJavaScriptException();
198
+ return Napi::Boolean::New(info.Env(), false);
199
+ }
200
+ std::string line = val.As<Napi::String>().Utf8Value();
201
+ trimString(line);
202
+ if (!line.empty() && !isBlankString(line)) {
203
+ processIdfLine(line);
204
+ }
205
+ }
206
+ } else if (info[0].IsString()) {
207
+ std::string line = info[0].As<Napi::String>().Utf8Value();
208
+ trimString(line);
209
+ if (!line.empty() && !isBlankString(line)) {
210
+ processIdfLine(line);
211
+ }
212
+ } else if (info[0].IsBuffer()) {
213
+ Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
214
+ std::string content(buffer.Data(), buffer.Length());
215
+ std::istringstream iss(content);
216
+ std::string line;
217
+ while (std::getline(iss, line)) {
218
+ trimString(line);
219
+ if (!line.empty() && !isBlankString(line)) {
220
+ processIdfLine(line);
221
+ }
222
+ }
223
+ } else {
224
+ return Napi::Boolean::New(info.Env(), false);
225
+ }
226
+
227
+ return Napi::Boolean::New(info.Env(), true);
228
+ }
229
+
57
230
  Napi::Value NodeJieba::setIdf(const Napi::CallbackInfo& info) {
58
231
  if (info.Length() < 1) {
59
232
  return Napi::Boolean::New(info.Env(), false);
@@ -352,7 +525,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
352
525
  for (const auto& line : dictLines) {
353
526
  std::string keyword = extractKeyword(line);
354
527
  if (!keyword.empty()) {
355
- _jieba_handle->SetIdfForWordWithMultiplier(keyword, 1.3);
528
+ _jieba_handle->SetIdfForWordWithMultiplier(keyword, 2.0);
356
529
  }
357
530
  }
358
531
  };
package/lib/nodejieba.h CHANGED
@@ -21,6 +21,7 @@ private:
21
21
  Napi::Value textRankExtract(const Napi::CallbackInfo& info);
22
22
  Napi::Value insertWord(const Napi::CallbackInfo& info);
23
23
  Napi::Value loadUserDict(const Napi::CallbackInfo& info);
24
+ Napi::Value loadIdfDict(const Napi::CallbackInfo& info);
24
25
  Napi::Value setIdf(const Napi::CallbackInfo& info);
25
26
 
26
27
  cppjieba::Jieba* _jieba_handle{nullptr};
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "nodejieba-plus",
3
3
  "description": "chinese word segmentation for node",
4
- "version": "3.5.17",
4
+ "version": "3.5.18",
5
5
  "author": "Yanyi Wu <wuyanyi09@foxmail.com>",
6
6
  "maintainers": [
7
7
  "Yanyi Wu <wuyanyi09@foxmail.com>"
@@ -124,6 +124,10 @@ class Jieba {
124
124
  extractor.SetIdfWithMultiplier(word, multiplier);
125
125
  }
126
126
 
127
+ void LoadIdfDict(const vector<string>& buf) {
128
+ extractor.LoadIdfDict(buf);
129
+ }
130
+
127
131
  private:
128
132
  static string pathJoin(const string& dir, const string& filename) {
129
133
  if (dir.empty()) {
@@ -52,6 +52,41 @@ class KeywordExtractor {
52
52
  }
53
53
  }
54
54
 
55
+ void LoadIdfDict(const vector<string>& buf) {
56
+ double idf = 0.0;
57
+ double idfSum = 0.0;
58
+ size_t validCount = 0;
59
+
60
+ for (size_t i = 0; i < buf.size(); i++) {
61
+ const string& line = buf[i];
62
+ if (line.empty()) {
63
+ continue;
64
+ }
65
+
66
+ size_t lastSpace = line.find_last_of(" \t");
67
+ if (lastSpace == string::npos) {
68
+ continue;
69
+ }
70
+
71
+ string word = line.substr(0, lastSpace);
72
+ string idfStr = line.substr(lastSpace + 1);
73
+
74
+ char* endptr;
75
+ idf = strtod(idfStr.c_str(), &endptr);
76
+ if (endptr == idfStr.c_str()) {
77
+ continue;
78
+ }
79
+
80
+ idfMap_[word] = idf;
81
+ idfSum += idf;
82
+ validCount++;
83
+ }
84
+
85
+ if (validCount > 0) {
86
+ idfAverage_ = idfSum / validCount;
87
+ }
88
+ }
89
+
55
90
  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
56
91
  vector<Word> topWords;
57
92
  Extract(sentence, topWords, topN);
@@ -0,0 +1,39 @@
// Manual check that a word loaded through loadUserDict is ranked with
// 2.0x the weight it had before the dictionary was loaded.
// The process exit code is set to 1 when the check fails or cannot be run.
var nodejieba = require("./index.js");

console.log("=== 测试 2.0 倍 IDF 权重 ===\n");

const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";

/**
 * Prints a ranked keyword list, one "<rank>. <word>: <weight>" line per item.
 * @param {{word: string, weight: number}[]} keywords - result of extract().
 */
function printKeywords(keywords) {
  console.log("关键词及权重:");
  keywords.forEach((item, i) => {
    console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
  });
}

console.log("【测试 1: 未加载用户词典】");
nodejieba.load();
var result1 = nodejieba.extract(content, 10);
printKeywords(result1);

console.log("\n【测试 2: 加载用户词典(自动 2.0 倍权重)】");
nodejieba.loadUserDict("Open Claw 10 n");
var result2 = nodejieba.extract(content, 10);
printKeywords(result2);

console.log("\n【验证 2.0 倍权重】");
// Baseline is the first of "Open"/"Claw" found before the dictionary was
// loaded; it is compared with "Open Claw" after loading.
var openClawWeight1 = result1.find(r => r.word === "Open" || r.word === "Claw");
var openClawWeight2 = result2.find(r => r.word === "Open Claw");
if (openClawWeight1 && openClawWeight2) {
  var ratio = openClawWeight2.weight / openClawWeight1.weight;

  console.log(`未加载时权重: ${openClawWeight1.weight.toFixed(2)}`);
  console.log(`加载后权重: ${openClawWeight2.weight.toFixed(2)}`);
  console.log(`权重倍数: ${ratio.toFixed(2)}`);

  if (Math.abs(ratio - 2.0) < 0.01) {
    console.log("✅ 2.0 倍权重验证成功!");
  } else {
    console.log(`❌ 权重倍数不符合预期(期望 2.0,实际 ${ratio.toFixed(2)})`);
    process.exitCode = 1;
  }
} else {
  // Without both keywords there is nothing to compare; report it instead of
  // finishing as if the check had passed.
  console.log("❌ 无法验证:提取结果中缺少 Open/Claw 或 Open Claw 关键词");
  process.exitCode = 1;
}

console.log("\n=== 测试完成 ===");
@@ -0,0 +1,98 @@
// Manual walkthrough of loadIdfDict: every scenario loads one dictionary,
// prints the load result, then prints the keywords extracted from one or
// more sample sentences.
var nodejieba = require('./index.js');

console.log('=== 测试 loadIdfDict 功能 ===\n');

// Initialise the dictionaries.
nodejieba.load();

// Each extraction is [log label, sentence, topN].
const scenarios = [
  {
    // Scenario 1: array of entries.
    title: '测试1: 使用数组加载IDF',
    dict: ['自定义词1 15.0', '自定义词2 12.5', '测试词汇 10.0'],
    loadLabel: '数组加载结果:',
    extractions: [['提取的关键词:', '这是一个自定义词1和自定义词2的测试词汇', 3]],
  },
  {
    // Scenario 2: a single entry passed as a string.
    title: '测试2: 使用字符串加载单个IDF',
    dict: '单个词汇 20.0',
    loadLabel: '字符串加载结果:',
    extractions: [['提取的关键词:', '这是一个单个词汇的测试', 3]],
  },
  {
    // Scenario 3: newline-separated entries in a Buffer.
    title: '测试3: 使用Buffer加载IDF',
    dict: Buffer.from('缓冲词汇1 18.0\n缓冲词汇2 16.5'),
    loadLabel: 'Buffer加载结果:',
    extractions: [['提取的关键词:', '缓冲词汇1和缓冲词汇2的测试', 3]],
  },
  {
    // Scenario 4: Set of entries.
    title: '测试4: 使用Set加载IDF',
    dict: new Set(['集合词汇1 14.0', '集合词汇2 13.0']),
    loadLabel: 'Set加载结果:',
    extractions: [['提取的关键词:', '集合词汇1和集合词汇2的测试', 3]],
  },
  {
    // Scenario 5: phrases that contain a space, tried on two sentences.
    title: '测试5: 支持包含空格的词组',
    dict: ['人工智能 技术 25.0', '机器 学习 22.0'],
    loadLabel: '包含空格词组加载结果:',
    extractions: [
      ['提取的关键词(含空格):', '人工智能技术的发展和机器学习的应用', 5],
      ['提取的关键词(不含空格):', '人工智能技术发展和机器学习应用', 5],
    ],
  },
  {
    // Scenario 6: English entries matched regardless of letter case.
    title: '测试6: 英文大小写不敏感',
    dict: ['MachineLearning 30.0', 'Deep Learning 28.0', 'AI Technology 26.0'],
    loadLabel: '英文IDF加载结果:',
    extractions: [
      // Same casing as the dictionary entry.
      ['大写匹配:', 'MachineLearning is important', 3],
      // All lower-case.
      ['小写匹配:', 'machinelearning is important', 3],
      // All upper-case.
      ['全大写匹配:', 'MACHINELEARNING is important', 3],
      // Phrases written with the space.
      ['包含空格英文词组:', 'Deep Learning and AI Technology', 3],
      // Phrases written without the space.
      ['不含空格英文词组:', 'DeepLearning and AITechnology', 3],
      // Lower-case phrases written with the space.
      ['小写包含空格英文词组:', 'deep learning and ai technology', 3],
    ],
  },
];

for (const { title, dict, loadLabel, extractions } of scenarios) {
  console.log(title);
  console.log(loadLabel, nodejieba.loadIdfDict(dict));

  for (const [label, sentence, topN] of extractions) {
    console.log(label, nodejieba.extract(sentence, topN));
  }
  console.log();
}

console.log('=== 所有测试完成 ===');
package/types/index.d.ts CHANGED
@@ -28,5 +28,6 @@ declare module "nodejieba" {
28
28
  export function insertWord(word: string, tag?: string): boolean;
29
29
  export function cutSmall(sentence: string, small: number): string[];
30
30
  export function loadUserDict(dict: string | string[] | Set<string> | Buffer): boolean;
31
+ export function loadIdfDict(dict: string | string[] | Set<string> | Buffer): boolean;
31
32
  export function setIdf(word: string, idf?: number, multiplier?: number): boolean;
32
33
  }