nodejieba-plus 3.5.10 → 3.5.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -27,6 +27,7 @@
27
27
  + **支持包含空格的关键词**(如 "Open Claw")。
28
28
  + **支持无空格版本匹配**(如 "OpenClaw" 可匹配 "Open Claw")。
29
29
  + **支持英文大小写不敏感匹配**(如 "open claw"、"OPEN CLAW" 都可匹配 "Open Claw")。
30
+ + **支持批量加载用户词典**(字符串数组、单个字符串、Buffer 格式)。
30
31
 
31
32
  对实现细节感兴趣的请看如下博文:
32
33
 
@@ -188,6 +189,47 @@ console.log(nodejieba.cut("男默女泪"));
188
189
  // ["男默女泪"]
189
190
  ```
190
191
 
192
+ ### 批量加载用户词典(新功能)
193
+
194
+ 支持通过字符串数组、单个字符串或 Buffer 批量加载用户词典:
195
+
196
+ ```js
197
+ var nodejieba = require("nodejieba");
198
+ nodejieba.load();
199
+
200
+ // 方式1:使用字符串数组
201
+ nodejieba.loadUserDict(["云计算", "人工智能 1000 nz", "大数据"]);
202
+
203
+ // 方式2:使用单个字符串
204
+ nodejieba.loadUserDict("区块链");
205
+
206
+ // 方式3:使用 Buffer
207
+ const dictBuffer = Buffer.from("新词1\n新词2 100 n\n新词3 nz");
208
+ nodejieba.loadUserDict(dictBuffer);
209
+
210
+ // 分词时会识别用户词典中的词
211
+ var result = nodejieba.cut("云计算和大数据是人工智能的基础");
212
+ console.log(result); // ['云计算', '和', '大数据', '是', '人工智能', '的', '基础']
213
+ ```
214
+
215
+ #### 词典条目格式
216
+
217
+ 词典条目支持以下格式:
218
+
219
+ ```
220
+ # 仅关键词
221
+ 云计算
222
+
223
+ # 关键词 + 词频
224
+ 人工智能 1000
225
+
226
+ # 关键词 + 词性标签
227
+ 区块链 nz
228
+
229
+ # 关键词 + 词频 + 词性标签
230
+ 大数据 500 n
231
+ ```
232
+
191
233
  ### 包含空格的关键词(新功能)
192
234
 
193
235
  支持在自定义词典中使用包含空格的关键词,且支持无空格版本匹配和大小写不敏感匹配。
Binary file
package/index.js CHANGED
@@ -73,5 +73,6 @@ wrapWithDictLoad("tag");
73
73
  wrapWithDictLoad("extract");
74
74
  wrapWithDictLoad("textRankExtract");
75
75
  wrapWithDictLoad("insertWord");
76
+ wrapWithDictLoad("loadUserDict");
76
77
 
77
78
  module.exports = exports;
package/lib/nodejieba.cpp CHANGED
@@ -6,6 +6,8 @@
6
6
  #include "cppjieba/KeywordExtractor.hpp"
7
7
  #include "cppjieba/TextRankExtractor.hpp"
8
8
 
9
+ #include <sstream>
10
+
9
11
  NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
10
12
  DefineAddon(exports, {
11
13
  InstanceMethod("load", &NodeJieba::load),
@@ -17,7 +19,8 @@ NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
17
19
  InstanceMethod("tag", &NodeJieba::tag),
18
20
  InstanceMethod("extract", &NodeJieba::extract),
19
21
  InstanceMethod("textRankExtract", &NodeJieba::textRankExtract),
20
- InstanceMethod("insertWord", &NodeJieba::insertWord)
22
+ InstanceMethod("insertWord", &NodeJieba::insertWord),
23
+ InstanceMethod("loadUserDict", &NodeJieba::loadUserDict)
21
24
  });
22
25
  }
23
26
 
@@ -216,3 +219,49 @@ Napi::Value NodeJieba::textRankExtract(const Napi::CallbackInfo& info) {
216
219
  WrapPairVector(info.Env(), words, outArray);
217
220
  return outArray;
218
221
  }
222
+
223
+ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
224
+ // Loads user-dictionary entries from info[0]: an array of strings, a single string, or a Buffer of
225
+ // newline-separated entries. Returns true once loaded, false if the argument is missing or of another type.
226
+ if (info.Length() < 1) {
227
+ return Napi::Boolean::New(info.Env(), false);
228
+ }
229
+ if (!_jieba_handle) {
230
+ Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
231
+ // ThrowAsJavaScriptException() returns normally, so bail out here instead of dereferencing the null handle below.
232
+ return info.Env().Undefined();
233
+ }
234
+
235
+ std::vector<std::string> buf;
236
+ if (info[0].IsArray()) {
237
+ Napi::Array arr = info[0].As<Napi::Array>();
238
+ for (uint32_t i = 0; i < arr.Length(); i++) {
239
+ Napi::Value val = arr[i];
240
+ if (val.IsString()) {  // non-string elements are ignored
241
+ buf.push_back(val.As<Napi::String>().Utf8Value());
242
+ }
243
+ }
244
+ } else if (info[0].IsString()) {
245
+ // A single string is passed through as one dictionary entry.
246
+ buf.push_back(info[0].As<Napi::String>().Utf8Value());
247
+ } else if (info[0].IsBuffer()) {
248
+ // Split the Buffer contents into lines, dropping empty ones.
249
+ Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
250
+ std::string content(buffer.Data(), buffer.Length());
251
+ std::istringstream iss(content);
252
+ std::string line;
253
+ while (std::getline(iss, line)) {
254
+ if (!line.empty() && line.back() == '\r') {
255
+ line.pop_back();  // tolerate CRLF line endings
256
+ }
257
+ if (!line.empty()) {
258
+ buf.push_back(line);
259
+ }
260
+ }
261
+ } else {
262
+ return Napi::Boolean::New(info.Env(), false);
263
+ }
264
+
265
+ _jieba_handle->LoadUserDict(buf);
266
+ return Napi::Boolean::New(info.Env(), true);
267
+ }
package/lib/nodejieba.h CHANGED
@@ -20,6 +20,7 @@ private:
20
20
  Napi::Value extract(const Napi::CallbackInfo& info);
21
21
  Napi::Value textRankExtract(const Napi::CallbackInfo& info);
22
22
  Napi::Value insertWord(const Napi::CallbackInfo& info);
23
+ Napi::Value loadUserDict(const Napi::CallbackInfo& info);
23
24
 
24
25
  cppjieba::Jieba* _jieba_handle{nullptr};
25
26
  cppjieba::TextRankExtractor* _text_rank_extractor_handle{nullptr};
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "nodejieba-plus",
3
3
  "description": "chinese word segmentation for node",
4
- "version": "3.5.10",
4
+ "version": "3.5.11",
5
5
  "author": "Yanyi Wu <wuyanyi09@foxmail.com>",
6
6
  "maintainers": [
7
7
  "Yanyi Wu <wuyanyi09@foxmail.com>"
@@ -33,7 +33,7 @@
33
33
  "typescript": "^5.0.4"
34
34
  },
35
35
  "scripts": {
36
- "test": "mocha --timeout 10s -R spec test/test.js && mocha --timeout 10s -R spec test/load_dict_test.js && mocha --timeout 10s -R spec test/missing_binding_test.js",
36
+ "test": "mocha --timeout 10s -R spec test/test.js && mocha --timeout 10s -R spec test/load_dict_test.js && mocha --timeout 10s -R spec test/load_user_dict_test.js && mocha --timeout 10s -R spec test/missing_binding_test.js",
37
37
  "install": "npx @mapbox/node-pre-gyp install --fallback-to-build",
38
38
  "rebuild": "npx @mapbox/node-pre-gyp rebuild"
39
39
  },
@@ -0,0 +1,58 @@
1
+ var should = require("should");
2
+ var nodejieba = require("../index.js");
3
+
4
+ describe("nodejieba.loadUserDict", function() {
5
+
6
+ // Make sure the dictionary is loaded before the tests run
7
+ before(function() {
8
+ nodejieba.load();
9
+ });
10
+
11
+ it("nodejieba.loadUserDict with string array should return true", function() {
12
+ // Load the user dictionary from an array of strings
13
+ var dictLines = [
14
+ "测试新词",
15
+ "自定义词汇 10 n"
16
+ ];
17
+ var loadResult = nodejieba.loadUserDict(dictLines);
18
+ loadResult.should.eql(true);
19
+ });
20
+
21
+ it("nodejieba.loadUserDict with single string should return true", function() {
22
+ // Load a single dictionary entry
23
+ var loadResult = nodejieba.loadUserDict("单个词");
24
+ loadResult.should.eql(true);
25
+ });
26
+
27
+ it("nodejieba.loadUserDict with Buffer should return true", function() {
28
+ // Load the user dictionary from a Buffer
29
+ var dictContent = Buffer.from("Buffer词 100 nz\n另一个词");
30
+ var loadResult = nodejieba.loadUserDict(dictContent);
31
+ loadResult.should.eql(true);
32
+ });
33
+
34
+ it("nodejieba.loadUserDict should return false when no argument", function() {
35
+ var loadResult = nodejieba.loadUserDict();
36
+ loadResult.should.eql(false);
37
+ });
38
+
39
+ it("nodejieba.loadUserDict with tag should return true", function() {
40
+ // Exercise a dictionary entry that carries a tag
41
+ var dictLines = [
42
+ "技术术语 500 nz"
43
+ ];
44
+ var result = nodejieba.loadUserDict(dictLines);
45
+ result.should.eql(true);
46
+ });
47
+
48
+ it("nodejieba.loadUserDict should work after loading words", function() {
49
+ // Load some user-dictionary words first - fairly distinctive ones, to keep the test accurate
50
+ nodejieba.loadUserDict(["云计算"]);
51
+ nodejieba.loadUserDict("人工智能 1000 nz");
52
+
53
+ // Verify that cut() returns these words as tokens
54
+ var result = nodejieba.cut("云计算是人工智能的基础");
55
+ result.should.containEql('云计算');
56
+ result.should.containEql('人工智能');
57
+ });
58
+ });
@@ -0,0 +1,65 @@
1
+ // Investigates keyword extraction for the phrase "open claw"
2
+
3
+ var nodejieba = require("./index.js");
4
+
5
+ // Test sentence
6
+ var sentence = "Node.js在Web开发中的应用与实践Open Claw,这句测试的话,关键词是\"open claw\"";
7
+
8
+ console.log("=".repeat(60));
9
+ console.log("测试句子:", sentence);
10
+ console.log("=".repeat(60));
11
+
12
+ // 1. Run word segmentation first
13
+ console.log("\n【1. 分词结果】");
14
+ var cutResult = nodejieba.cut(sentence);
15
+ console.log("cut:", cutResult);
16
+
17
+ // 2. Keyword extraction via extract()
18
+ console.log("\n【2. 关键词提取 (extract)】");
19
+ var keywords = nodejieba.extract(sentence, 10);
20
+ console.log("提取的关键词:");
21
+ keywords.forEach(function(kw) {
22
+ console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
23
+ });
24
+
25
+ // 3. Keyword extraction via textRankExtract()
26
+ console.log("\n【3. TextRank 关键词提取】");
27
+ var textRankKeywords = nodejieba.textRankExtract(sentence, 10);
28
+ console.log("提取的关键词:");
29
+ textRankKeywords.forEach(function(kw) {
30
+ console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
31
+ });
32
+
33
+ // 4. Check whether the extract() keywords include "open claw" (case-insensitive)
34
+ console.log("\n【4. 检查结果】");
35
+ var hasOpenClaw = keywords.some(function(kw) {
36
+ return kw.word.toLowerCase() === "open claw";
37
+ });
38
+ console.log("是否提取到 'open claw':", hasOpenClaw);
39
+
40
+ // 5. Insert the custom word, then repeat segmentation and extraction
41
+ console.log("\n【5. 添加自定义词后测试】");
42
+ nodejieba.insertWord("open claw");
43
+ console.log("已添加自定义词: open claw");
44
+
45
+ var cutResult2 = nodejieba.cut(sentence);
46
+ console.log("\n再次分词结果:");
47
+ console.log("cut:", cutResult2);
48
+
49
+ var keywords2 = nodejieba.extract(sentence, 10);
50
+ console.log("\n再次提取关键词:");
51
+ keywords2.forEach(function(kw) {
52
+ console.log(" - " + kw.word + " (权重: " + kw.weight + ")");
53
+ });
54
+
55
+ var hasOpenClaw2 = keywords2.some(function(kw) {
56
+ return kw.word.toLowerCase() === "open claw";
57
+ });
58
+ console.log("\n是否提取到 'open claw':", hasOpenClaw2);
59
+
60
+ console.log("\n" + "=".repeat(60));
61
+ console.log("问题分析:");
62
+ console.log("1. jieba 分词器默认基于中文语料训练,对英文词汇识别有限");
63
+ console.log("2. 'Open Claw' 作为英文词组,默认词典中不存在");
64
+ console.log("3. 解决方案: 使用 insertWord() 方法添加自定义词");
65
+ console.log("=".repeat(60));
package/types/index.d.ts CHANGED
@@ -27,4 +27,5 @@ declare module "nodejieba" {
27
27
  export function textRankExtract(sentence: string, threshold: number): ExtractResult[];
28
28
  export function insertWord(word: string, tag?: string): boolean;
29
29
  export function cutSmall(sentence: string, small: number): string[];
30
+ export function loadUserDict(dict: string | string[] | Buffer): boolean;
30
31
  }