nodejieba-plus — diff of published package versions 3.5.11 → 3.5.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -191,7 +191,7 @@ console.log(nodejieba.cut("男默女泪"));
191
191
 
192
192
  ### 批量加载用户词典(新功能)
193
193
 
194
- 支持通过字符串数组、单个字符串或 Buffer 批量加载用户词典:
194
+ 支持通过字符串数组、Set、单个字符串或 Buffer 批量加载用户词典:
195
195
 
196
196
  ```js
197
197
  var nodejieba = require("nodejieba");
@@ -200,10 +200,18 @@ nodejieba.load();
200
200
  // 方式1:使用字符串数组
201
201
  nodejieba.loadUserDict(["云计算", "人工智能 1000 nz", "大数据"]);
202
202
 
203
- // 方式2:使用单个字符串
203
+ // 方式2:使用 Set 集合(自动去重)
204
+ const dictSet = new Set();
205
+ dictSet.add("云计算");
206
+ dictSet.add("人工智能 1000 nz");
207
+ dictSet.add("大数据");
208
+ dictSet.add("云计算"); // 重复添加会被自动去重
209
+ nodejieba.loadUserDict(dictSet);
210
+
211
+ // 方式3:使用单个字符串
204
212
  nodejieba.loadUserDict("区块链");
205
213
 
206
- // 方式3:使用 Buffer
214
+ // 方式4:使用 Buffer
207
215
  const dictBuffer = Buffer.from("新词1\n新词2 100 n\n新词3 nz");
208
216
  nodejieba.loadUserDict(dictBuffer);
209
217
 
Binary file (contents changed; diff not shown)
package/index.js CHANGED
@@ -75,4 +75,21 @@ wrapWithDictLoad("textRankExtract");
75
75
  wrapWithDictLoad("insertWord");
76
76
  wrapWithDictLoad("loadUserDict");
77
77
 
78
+ // 保存原始的 loadUserDict 函数
79
+ var _loadUserDict = exports.loadUserDict;
80
+
81
+ // 重写 loadUserDict 以支持 Set 格式
82
+ exports.loadUserDict = function (dict) {
83
+ if (!isDictLoaded) {
84
+ exports.load();
85
+ }
86
+
87
+ // 如果是 Set 对象,转换为数组
88
+ if (dict instanceof Set) {
89
+ dict = Array.from(dict);
90
+ }
91
+
92
+ return _loadUserDict.call(this, dict);
93
+ };
94
+
78
95
  module.exports = exports;
package/lib/nodejieba.cpp CHANGED
@@ -229,7 +229,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
229
229
  Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
230
230
  }
231
231
 
232
- // 支持传入字符串数组或单个字符串(Buffer
232
+ // 支持传入字符串数组、单个字符串或 Buffer
233
233
  if (info[0].IsArray()) {
234
234
  Napi::Array arr = info[0].As<Napi::Array>();
235
235
  std::vector<std::string> buf;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "nodejieba-plus",
3
3
  "description": "chinese word segmentation for node",
4
- "version": "3.5.11",
4
+ "version": "3.5.12",
5
5
  "author": "Yanyi Wu <wuyanyi09@foxmail.com>",
6
6
  "maintainers": [
7
7
  "Yanyi Wu <wuyanyi09@foxmail.com>"
@@ -10,7 +10,6 @@
10
10
  #include <stdint.h>
11
11
  #include <cmath>
12
12
  #include <limits>
13
- #include <algorithm>
14
13
  #include "limonp/StringUtil.hpp"
15
14
  #include "limonp/Logging.hpp"
16
15
  #include "Unicode.hpp"
@@ -113,97 +112,26 @@ class DictTrie {
113
112
  vector<string> buf;
114
113
  DictUnit node_info;
115
114
  Split(line, buf, " ");
116
-
117
- string word;
118
- string tag = UNKNOWN_TAG;
119
- double weight = user_word_default_weight_;
120
- bool hasSpace = false;
121
-
122
- // 处理包含空格的关键词
123
- // 格式可能是: "word" 或 "word tag" 或 "word freq tag"
124
- // 其中 word 本身可能包含空格
125
- if (buf.size() == 1) {
126
- // 只有关键词,无词频和标签
127
- word = buf[0];
128
- } else if (buf.size() == 2) {
129
- // 可能是 "word tag" "word1 word2"
130
- // 检查第二个字段是否为数字(词频)
131
- bool isNumber = true;
132
- for (char c : buf[1]) {
133
- if (!isdigit(c)) {
134
- isNumber = false;
135
- break;
115
+ if(buf.size() == 1){
116
+ MakeNodeInfo(node_info,
117
+ buf[0],
118
+ user_word_default_weight_,
119
+ UNKNOWN_TAG);
120
+ } else if (buf.size() == 2) {
121
+ MakeNodeInfo(node_info,
122
+ buf[0],
123
+ user_word_default_weight_,
124
+ buf[1]);
125
+ } else if (buf.size() == 3) {
126
+ int freq = atoi(buf[1].c_str());
127
+ assert(freq_sum_ > 0.0);
128
+ double weight = log(1.0 * freq / freq_sum_);
129
+ MakeNodeInfo(node_info, buf[0], weight, buf[2]);
136
130
  }
137
- }
138
- if (isNumber) {
139
- // "word freq" 格式,无标签
140
- int freq = atoi(buf[1].c_str());
141
- assert(freq_sum_ > 0.0);
142
- weight = log(1.0 * freq / freq_sum_);
143
- word = buf[0];
144
- } else {
145
- // "word tag" 格式
146
- word = buf[0];
147
- tag = buf[1];
148
- }
149
- } else {
150
- // 检查最后两个字段:可能是 "... word freq tag" 或 "... word1 word2 tag" 等
151
- // 倒数第二个如果是数字,则认为是词频,最后一个是标签
152
- // 否则认为只有最后一个是标签,前面都是关键词
153
- bool isFreq = true;
154
- for (char c : buf[buf.size() - 2]) {
155
- if (!isdigit(c)) {
156
- isFreq = false;
157
- break;
158
- }
159
- }
160
-
161
- if (isFreq) {
162
- // 格式: "word... freq tag"
163
- int freq = atoi(buf[buf.size() - 2].c_str());
164
- assert(freq_sum_ > 0.0);
165
- weight = log(1.0 * freq / freq_sum_);
166
- // 前面的所有部分(除了最后两个)组成关键词
167
- for (size_t i = 0; i < buf.size() - 2; ++i) {
168
- if (i > 0) word += " ";
169
- word += buf[i];
170
- }
171
- tag = buf[buf.size() - 1];
172
- } else {
173
- // 格式: "word... tag" (无词频)
174
- // 前面的所有部分(除了最后一个)组成关键词
175
- for (size_t i = 0; i < buf.size() - 1; ++i) {
176
- if (i > 0) word += " ";
177
- word += buf[i];
178
- }
179
- tag = buf[buf.size() - 1];
180
- }
181
- }
182
-
183
- // 检查词中是否包含空格
184
- hasSpace = (word.find(' ') != string::npos);
185
-
186
- // 添加原始词(包含空格版本)
187
- MakeNodeInfo(node_info, word, weight, tag);
188
- static_node_infos_.push_back(node_info);
189
- if (node_info.word.size() == 1) {
190
- user_dict_single_chinese_word_.insert(node_info.word[0]);
191
- }
192
-
193
- // 如果词包含空格,同时添加无空格版本
194
- if (hasSpace) {
195
- string wordNoSpace = word;
196
- // 移除所有空格
197
- wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
198
- if (!wordNoSpace.empty() && wordNoSpace != word) {
199
- DictUnit node_info_no_space;
200
- MakeNodeInfo(node_info_no_space, wordNoSpace, weight, tag);
201
- static_node_infos_.push_back(node_info_no_space);
202
- if (node_info_no_space.word.size() == 1) {
203
- user_dict_single_chinese_word_.insert(node_info_no_space.word[0]);
131
+ static_node_infos_.push_back(node_info);
132
+ if (node_info.word.size() == 1) {
133
+ user_dict_single_chinese_word_.insert(node_info.word[0]);
204
134
  }
205
- }
206
- }
207
135
  }
208
136
 
209
137
  void LoadUserDict(const vector<string>& buf) {
@@ -287,36 +215,12 @@ class DictTrie {
287
215
  DictUnit node_info;
288
216
  while (getline(ifs, line)) {
289
217
  Split(line, buf, " ");
290
- // 支持包含空格的关键词
291
- // 格式: "word weight tag" 或 "word1 word2 weight tag" 等
292
- // 最后两个字段是 weight 和 tag,前面的都是关键词
293
- if (buf.size() >= DICT_COLUMN_NUM) {
294
- // 组合前面的字段作为关键词
295
- string word;
296
- for (size_t i = 0; i < buf.size() - 2; ++i) {
297
- if (i > 0) word += " ";
298
- word += buf[i];
299
- }
300
- double weight = atof(buf[buf.size() - 2].c_str());
301
- string tag = buf[buf.size() - 1];
302
-
303
- // 添加原始词(包含空格版本)
304
- MakeNodeInfo(node_info, word, weight, tag);
305
- static_node_infos_.push_back(node_info);
306
-
307
- // 如果词包含空格,同时添加无空格版本
308
- if (word.find(' ') != string::npos) {
309
- string wordNoSpace = word;
310
- wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
311
- if (!wordNoSpace.empty() && wordNoSpace != word) {
312
- DictUnit node_info_no_space;
313
- MakeNodeInfo(node_info_no_space, wordNoSpace, weight, tag);
314
- static_node_infos_.push_back(node_info_no_space);
315
- }
316
- }
317
- } else {
318
- XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
319
- }
218
+ XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
219
+ MakeNodeInfo(node_info,
220
+ buf[0],
221
+ atof(buf[1].c_str()),
222
+ buf[2]);
223
+ static_node_infos_.push_back(node_info);
320
224
  }
321
225
  }
322
226
 
@@ -8,9 +8,7 @@
8
8
 
9
9
  namespace cppjieba {
10
10
 
11
- // 修改分隔符,移除空格,只保留其他分隔符
12
- // 这样英文单词之间的空格不会被当作分隔符
13
- const char* const SPECIAL_SEPARATORS = "\t\n\xEF\xBC\x8C\xE3\x80\x82";
11
+ const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
14
12
 
15
13
  using namespace limonp;
16
14
 
@@ -92,10 +92,6 @@ inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
92
92
  if (!(str[0] & 0x80)) { // 0xxxxxxx
93
93
  // 7bit, total 7bit
94
94
  rp.rune = (uint8_t)(str[0]) & 0x7f;
95
- // 将大写英文字母转换为小写,实现大小写不敏感匹配
96
- if (rp.rune >= 'A' && rp.rune <= 'Z') {
97
- rp.rune = rp.rune - 'A' + 'a';
98
- }
99
95
  rp.len = 1;
100
96
  } else if ((uint8_t)str[0] <= 0xdf && 1 < len) {
101
97
  // 110xxxxxx
@@ -55,4 +55,23 @@ describe("nodejieba.loadUserDict", function() {
55
55
  result.should.containEql('云计算');
56
56
  result.should.containEql('人工智能');
57
57
  });
58
+
59
+ it("nodejieba.loadUserDict with Set should return true", function() {
60
+ const dictSet = new Set();
61
+ dictSet.add("非常独特的测试词123");
62
+ dictSet.add("另一个独特测试词 100 n");
63
+
64
+ var loadResult = nodejieba.loadUserDict(dictSet);
65
+ loadResult.should.eql(true);
66
+ });
67
+
68
+ it("nodejieba.loadUserDict with Set should automatically deduplicate", function() {
69
+ const dictSet = new Set();
70
+ dictSet.add("去重专用测试词");
71
+ dictSet.add("去重专用测试词"); // 重复添加
72
+ dictSet.add("去重专用测试词"); // 再次重复添加
73
+
74
+ var loadResult = nodejieba.loadUserDict(dictSet);
75
+ loadResult.should.eql(true);
76
+ });
58
77
  });
package/types/index.d.ts CHANGED
@@ -27,5 +27,5 @@ declare module "nodejieba" {
27
27
  export function textRankExtract(sentence: string, threshold: number): ExtractResult[];
28
28
  export function insertWord(word: string, tag?: string): boolean;
29
29
  export function cutSmall(sentence: string, small: number): string[];
30
- export function loadUserDict(dict: string | string[] | Buffer): boolean;
30
+ export function loadUserDict(dict: string | string[] | Set<string> | Buffer): boolean;
31
31
  }
package/test_open_claw.js DELETED
@@ -1,65 +0,0 @@
1
- // 测试 "open claw" 关键词提取问题
2
-
3
- var nodejieba = require("./index.js");
4
-
5
- // 测试句子
6
- var sentence = "Node.js在Web开发中的应用与实践Open Claw,这句测试的话,关键词是\"open claw\"";
7
-
8
- console.log("=".repeat(60));
9
- console.log("测试句子:", sentence);
10
- console.log("=".repeat(60));
11
-
12
- // 1. 先进行分词测试
13
- console.log("\n【1. 分词结果】");
14
- var cutResult = nodejieba.cut(sentence);
15
- console.log("cut:", cutResult);
16
-
17
- // 2. 关键词提取测试
18
- console.log("\n【2. 关键词提取 (extract)】");
19
- var keywords = nodejieba.extract(sentence, 10);
20
- console.log("提取的关键词:");
21
- keywords.forEach(function(kw) {
22
- console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
23
- });
24
-
25
- // 3. TextRank 关键词提取测试
26
- console.log("\n【3. TextRank 关键词提取】");
27
- var textRankKeywords = nodejieba.textRankExtract(sentence, 10);
28
- console.log("提取的关键词:");
29
- textRankKeywords.forEach(function(kw) {
30
- console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
31
- });
32
-
33
- // 4. 检查是否包含 "open claw"
34
- console.log("\n【4. 检查结果】");
35
- var hasOpenClaw = keywords.some(function(kw) {
36
- return kw.word.toLowerCase() === "open claw";
37
- });
38
- console.log("是否提取到 'open claw':", hasOpenClaw);
39
-
40
- // 5. 添加自定义词后再次测试
41
- console.log("\n【5. 添加自定义词后测试】");
42
- nodejieba.insertWord("open claw");
43
- console.log("已添加自定义词: open claw");
44
-
45
- var cutResult2 = nodejieba.cut(sentence);
46
- console.log("\n再次分词结果:");
47
- console.log("cut:", cutResult2);
48
-
49
- var keywords2 = nodejieba.extract(sentence, 10);
50
- console.log("\n再次提取关键词:");
51
- keywords2.forEach(function(kw) {
52
- console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
53
- });
54
-
55
- var hasOpenClaw2 = keywords2.some(function(kw) {
56
- return kw.word.toLowerCase() === "open claw";
57
- });
58
- console.log("\n是否提取到 'open claw':", hasOpenClaw2);
59
-
60
- console.log("\n" + "=".repeat(60));
61
- console.log("问题分析:");
62
- console.log("1. jieba 分词器默认基于中文语料训练,对英文词汇识别有限");
63
- console.log("2. 'Open Claw' 作为英文词组,默认词典中不存在");
64
- console.log("3. 解决方案: 使用 insertWord() 方法添加自定义词");
65
- console.log("=".repeat(60));