nodejieba-plus 3.5.13 → 3.5.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +130 -93
- package/analyze_weight.js +57 -0
- package/build/Release/nodejieba.node +0 -0
- package/diagnose_priority.js +71 -0
- package/index.js +9 -1
- package/lib/nodejieba.cpp +145 -13
- package/lib/nodejieba.h +1 -0
- package/package.json +1 -1
- package/submodules/cppjieba/include/cppjieba/DictTrie.hpp +169 -30
- package/submodules/cppjieba/include/cppjieba/Jieba.hpp +8 -0
- package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp +29 -8
- package/submodules/cppjieba/include/cppjieba/SegmentBase.hpp +1 -1
- package/submodules/cppjieba/include/cppjieba/Trie.hpp +10 -13
- package/submodules/cppjieba/include/cppjieba/Unicode.hpp +52 -0
- package/test/load_user_dict_test.js +48 -4
- package/test_1_3x_weight.js +86 -0
- package/test_assertion_fix.js +60 -0
- package/test_idf_feature.js +43 -0
- package/test_open_claw.js +65 -0
- package/test_simple.js +17 -0
- package/test_space_keyword.js +66 -0
- package/types/index.d.ts +1 -0
package/README.md
CHANGED
|
@@ -1,38 +1,38 @@
|
|
|
1
|
-
[
|
|
2
|
-
[
|
|
3
|
-
[
|
|
4
|
-
[
|
|
5
|
-
[
|
|
6
|
-
[
|
|
7
|
-
[
|
|
8
|
-
[
|
|
1
|
+
[!\[Build Status\](https://github.com/yanyiwu/nodejieba/actions/workflows/test.yml/badge.svg null)](https://github.com/yanyiwu/nodejieba/actions/workflows/test.yml)
|
|
2
|
+
[!\[Financial Contributors on Open Collective\](https://opencollective.com/nodejieba/all/badge.svg?label=financial+contributors null)](https://opencollective.com/nodejieba) [!\[Author\](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat null)](https://github.com/yanyiwu/)
|
|
3
|
+
[!\[Platform\](https://img.shields.io/badge/platform-Linux,macOS,Windows-green.svg?style=flat null)](https://github.com/yanyiwu/nodejieba)
|
|
4
|
+
[!\[Performance\](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat null)](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md)
|
|
5
|
+
[!\[License\](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat null)](http://yanyiwu.mit-license.org)
|
|
6
|
+
[!\[NpmDownload Status\](http://img.shields.io/npm/dm/nodejieba.svg null)](https://www.npmjs.org/package/nodejieba)
|
|
7
|
+
[!\[NPM Version\](https://img.shields.io/npm/v/nodejieba.svg?style=flat null)](https://www.npmjs.org/package/nodejieba)
|
|
8
|
+
[!\[Code Climate\](https://codeclimate.com/github/yanyiwu/nodejieba/badges/gpa.svg null)](https://codeclimate.com/github/yanyiwu/nodejieba)
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
***
|
|
11
11
|
|
|
12
12
|
# NodeJieba "结巴"分词的Node.js版本
|
|
13
13
|
|
|
14
|
-
## 介绍
|
|
14
|
+
## 介绍
|
|
15
15
|
|
|
16
16
|
`NodeJieba`是"结巴"中文分词的 Node.js 版本实现,
|
|
17
|
-
由[CppJieba]提供底层分词算法实现,
|
|
17
|
+
由[CppJieba](https://github.com/yanyiwu/cppjieba.git)提供底层分词算法实现,
|
|
18
18
|
是兼具高性能和易用性两者的 Node.js 中文分词组件。
|
|
19
19
|
|
|
20
20
|
## 特点
|
|
21
21
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
22
|
+
- 词典载入方式灵活,无需配置词典路径也可使用,需要定制自己的词典路径时也可灵活定制。
|
|
23
|
+
- 底层算法实现是C++,性能高效。
|
|
24
|
+
- 支持多种分词算法,各种分词算法见[CppJieba](https://github.com/yanyiwu/cppjieba.git)的README.md介绍。
|
|
25
|
+
- 支持动态补充词库。
|
|
26
|
+
- 支持TypeScript,提供完整的类型定义。
|
|
27
|
+
- **支持包含空格的关键词**(如 "Open Claw")。
|
|
28
|
+
- **支持无空格版本匹配**(如 "Open Claw" 可匹配 "OpenClaw")。
|
|
29
|
+
- **支持英文大小写不敏感匹配**(如 "open claw"、"OPEN CLAW" 都可匹配 "Open Claw")。
|
|
30
|
+
- **支持批量加载用户词典**(字符串数组、单个字符串、Buffer 格式)。
|
|
31
31
|
|
|
32
32
|
对实现细节感兴趣的请看如下博文:
|
|
33
33
|
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
- [Node.js的C++扩展初体验之NodeJieba](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2014-02-22-nodejs-cpp-addon-nodejieba.md)
|
|
35
|
+
- [由NodeJieba谈谈Node.js异步实现](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-03-21-nodejs-asynchronous-insight.md)
|
|
36
36
|
|
|
37
37
|
## 安装
|
|
38
38
|
|
|
@@ -88,11 +88,11 @@ nodejieba.load({
|
|
|
88
88
|
|
|
89
89
|
#### 词典说明
|
|
90
90
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
91
|
+
- dict: 主词典,带权重和词性标签,建议使用默认词典。
|
|
92
|
+
- hmmDict: 隐式马尔科夫模型,建议使用默认词典。
|
|
93
|
+
- userDict: 用户词典,建议自己根据需要定制。
|
|
94
|
+
- idfDict: 关键词抽取所需的idf信息。
|
|
95
|
+
- stopWordDict: 关键词抽取所需的停用词列表。
|
|
96
96
|
|
|
97
97
|
## API 文档
|
|
98
98
|
|
|
@@ -211,13 +211,77 @@ nodejieba.loadUserDict(dictSet);
|
|
|
211
211
|
// 方式3:使用单个字符串
|
|
212
212
|
nodejieba.loadUserDict("区块链");
|
|
213
213
|
|
|
214
|
-
// 方式4:使用 Buffer
|
|
214
|
+
// 方式4:使用 Buffer(必须是 UTF-8 编码)
|
|
215
215
|
const dictBuffer = Buffer.from("新词1\n新词2 100 n\n新词3 nz");
|
|
216
216
|
nodejieba.loadUserDict(dictBuffer);
|
|
217
217
|
|
|
218
|
+
// 注意:Buffer 必须是 UTF-8 编码,其他编码可能导致乱码或加载失败
|
|
219
|
+
|
|
218
220
|
// 分词时会识别用户词典中的词
|
|
219
221
|
var result = nodejieba.cut("云计算和大数据是人工智能的基础");
|
|
220
222
|
console.log(result); // ['云计算', '和', '大数据', '是', '人工智能', '的', '基础']
|
|
223
|
+
|
|
224
|
+
// 关键词提取时,用户词典中的词会自动获得更高的 IDF 权重(默认1.3倍)
|
|
225
|
+
var keywords = nodejieba.extract("云计算和大数据是人工智能的基础", 5);
|
|
226
|
+
console.log(keywords); // 用户词典中的词排名会显著提升
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
#### 用户词典权重提升机制(新功能)
|
|
230
|
+
|
|
231
|
+
从 v3.5.16 开始,加载用户词典时会自动为词典中的词设置更高的 IDF 权重,确保在关键词提取时获得更高的排名:
|
|
232
|
+
|
|
233
|
+
**自动权重提升**:
|
|
234
|
+
- 加载用户词典后,词典中的词会自动获得 **1.3 倍 IDF 权重**
|
|
235
|
+
- 这意味着用户词典中的词在关键词提取时会优先显示
|
|
236
|
+
|
|
237
|
+
**手动设置权重**:
|
|
238
|
+
```js
|
|
239
|
+
// 方式1:设置具体的 IDF 值
|
|
240
|
+
nodejieba.setIdf("Open Claw", 30.0);
|
|
241
|
+
|
|
242
|
+
// 方式2:使用倍数提升权重(默认1.3倍)
|
|
243
|
+
nodejieba.setIdf("Open Claw"); // 1.3倍权重
|
|
244
|
+
|
|
245
|
+
// 方式3:自定义倍数
|
|
246
|
+
nodejieba.setIdf("Open Claw", null, 2.0); // 2倍权重
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
#### IDF 词典支持空格关键词(新功能)
|
|
250
|
+
|
|
251
|
+
从 v3.5.16 开始,IDF 词典支持包含空格的关键词:
|
|
252
|
+
|
|
253
|
+
**IDF 词典格式**:
|
|
254
|
+
```
|
|
255
|
+
# 普通关键词
|
|
256
|
+
互动 12.0
|
|
257
|
+
|
|
258
|
+
# 包含空格的关键词
|
|
259
|
+
Open Claw 30.0
|
|
260
|
+
Machine Learning 25.0
|
|
261
|
+
Deep Learning 28.0
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
**使用示例**:
|
|
265
|
+
```js
|
|
266
|
+
var nodejieba = require("nodejieba");
|
|
267
|
+
|
|
268
|
+
// 加载包含空格关键词的 IDF 词典
|
|
269
|
+
nodejieba.load({
|
|
270
|
+
idfDict: "./custom_idf.txt"
|
|
271
|
+
});
|
|
272
|
+
|
|
273
|
+
// 加载用户词典(同时需要分词词典支持)
|
|
274
|
+
nodejieba.loadUserDict(["Open Claw", "Machine Learning", "Deep Learning"]);
|
|
275
|
+
|
|
276
|
+
// 关键词提取时会正确识别包含空格的词
|
|
277
|
+
var keywords = nodejieba.extract("Open Claw和Machine Learning都是Deep Learning的基础", 5);
|
|
278
|
+
console.log(keywords);
|
|
279
|
+
// 输出: [
|
|
280
|
+
// { word: 'Open Claw', weight: 30.00 },
|
|
281
|
+
// { word: 'Deep Learning', weight: 28.00 },
|
|
282
|
+
// { word: 'Machine Learning', weight: 25.00 },
|
|
283
|
+
// ...
|
|
284
|
+
// ]
|
|
221
285
|
```
|
|
222
286
|
|
|
223
287
|
#### 词典条目格式
|
|
@@ -242,76 +306,72 @@ console.log(result); // ['云计算', '和', '大数据', '是', '人工智能',
|
|
|
242
306
|
|
|
243
307
|
支持在自定义词典中使用包含空格的关键词,且支持无空格版本匹配和大小写不敏感匹配。
|
|
244
308
|
|
|
309
|
+
**注意**:本版本已移除空格作为默认分隔符,因此包含空格的关键词可以正确匹配文本中的对应内容,不会被分割。
|
|
310
|
+
|
|
245
311
|
#### 用户词典格式
|
|
246
312
|
|
|
247
313
|
用户词典支持以下格式:
|
|
248
314
|
|
|
249
315
|
```
|
|
250
|
-
#
|
|
316
|
+
# 只有关键词(包含空格)
|
|
251
317
|
Open Claw
|
|
318
|
+
Game Master
|
|
252
319
|
|
|
253
|
-
# 关键词 +
|
|
254
|
-
|
|
320
|
+
# 关键词 + 词频(仅支持单关键词,不支持包含空格的关键词+词频)
|
|
321
|
+
人工智能 1000
|
|
255
322
|
|
|
256
|
-
# 关键词 + 词频 +
|
|
323
|
+
# 关键词 + 词频 + 词性标签(支持包含空格的关键词)
|
|
257
324
|
Open Claw 100 n
|
|
325
|
+
Machine Learning 200 n
|
|
258
326
|
|
|
259
327
|
# 包含多个空格的关键词
|
|
260
|
-
Machine Learning 200 n
|
|
261
328
|
Artificial Intelligence 300 n
|
|
329
|
+
Deep Learning 400 n
|
|
262
330
|
```
|
|
263
331
|
|
|
332
|
+
**格式说明**:
|
|
333
|
+
|
|
334
|
+
- 当词典行只有关键词时(如 `Open Claw`),整个字符串作为关键词
|
|
335
|
+
- 当词典行有词频时(如 `人工智能 1000`),第一个部分是关键词,第二个是词频
|
|
336
|
+
- 当词典行至少有三个部分且倒数第二个是数字时(如 `Open Claw 100 n`),该数字之前的所有部分(可包含空格)组成关键词,后面依次是词频和词性
|
|
337
|
+
|
|
264
338
|
#### 使用示例
|
|
265
339
|
|
|
266
340
|
```js
|
|
267
341
|
var nodejieba = require("nodejieba");
|
|
268
|
-
|
|
269
|
-
var path = require('path');
|
|
270
|
-
|
|
271
|
-
// 创建包含空格关键词的用户词典
|
|
272
|
-
var dictContent = `Open Claw 100 n
|
|
273
|
-
Machine Learning 200 n
|
|
274
|
-
Artificial Intelligence 300 n
|
|
275
|
-
`;
|
|
342
|
+
nodejieba.load();
|
|
276
343
|
|
|
277
|
-
|
|
278
|
-
|
|
344
|
+
// 方式1:使用 loadUserDict 加载包含空格的关键词
|
|
345
|
+
nodejieba.loadUserDict(["Open Claw 100 n", "Game Master"]);
|
|
279
346
|
|
|
280
|
-
//
|
|
281
|
-
nodejieba.
|
|
282
|
-
userDict: testDictPath,
|
|
283
|
-
});
|
|
347
|
+
// 方式2:使用 insertWord 添加包含空格的关键词
|
|
348
|
+
nodejieba.insertWord("Deep Learning");
|
|
284
349
|
|
|
285
350
|
// 测试1: 包含空格的关键词匹配
|
|
286
|
-
console.log(nodejieba.cut("I
|
|
287
|
-
//
|
|
351
|
+
console.log(nodejieba.cut("I like Open Claw game"));
|
|
352
|
+
// 输出: ['I', ' ', 'l', 'i', 'k', 'e', ' ', 'Open Claw', ' ', 'g', 'a', 'm', 'e']
|
|
288
353
|
|
|
289
|
-
// 测试2:
|
|
354
|
+
// 测试2: 在中文句子中匹配
|
|
355
|
+
console.log(nodejieba.cut("Open Claw和Game Master都是好游戏"));
|
|
356
|
+
// 输出: ['Open Claw', '和', 'Game Master', '都', '是', '好', '游戏']
|
|
357
|
+
|
|
358
|
+
// 测试3: 大小写不敏感匹配
|
|
290
359
|
console.log(nodejieba.cut("open claw")); // 匹配 Open Claw
|
|
291
360
|
console.log(nodejieba.cut("OPEN CLAW")); // 匹配 Open Claw
|
|
292
361
|
console.log(nodejieba.cut("Open Claw")); // 匹配 Open Claw
|
|
293
362
|
|
|
294
|
-
// 测试
|
|
363
|
+
// 测试4: 无空格版本匹配
|
|
295
364
|
console.log(nodejieba.cut("OpenClaw")); // 匹配 Open Claw
|
|
296
365
|
console.log(nodejieba.cut("openclaw")); // 匹配 Open Claw
|
|
297
366
|
console.log(nodejieba.cut("OPENCLAW")); // 匹配 Open Claw
|
|
298
|
-
|
|
299
|
-
// 测试4: 其他包含空格的关键词
|
|
300
|
-
console.log(nodejieba.cut("Machine Learning is great"));
|
|
301
|
-
// 输出包含: ['Machine Learning']
|
|
302
|
-
|
|
303
|
-
console.log(nodejieba.cut("Artificial Intelligence will change the world"));
|
|
304
|
-
// 输出包含: ['Artificial Intelligence']
|
|
305
|
-
|
|
306
|
-
// 清理测试文件
|
|
307
|
-
fs.unlinkSync(testDictPath);
|
|
308
367
|
```
|
|
309
368
|
|
|
310
369
|
#### 功能说明
|
|
311
370
|
|
|
312
|
-
1. **包含空格的关键词**: 词典中的 "Open Claw" 可以匹配文本中的 "Open Claw"
|
|
371
|
+
1. **包含空格的关键词**: 词典中的 "Open Claw" 可以匹配文本中的 "Open Claw"(不会被分割)
|
|
313
372
|
2. **无空格版本匹配**: 词典中的 "Open Claw" 也可以匹配文本中的 "OpenClaw"
|
|
314
373
|
3. **大小写不敏感**: 词典中的 "Open Claw" 可以匹配 "open claw"、"OPEN CLAW"、"Open Claw" 等任意大小写组合
|
|
374
|
+
4. **自动生成变体**: 添加包含空格的关键词时,会自动生成无空格版本和小写版本,确保各种变体都能匹配
|
|
315
375
|
|
|
316
376
|
More details in the [demo](https://github.com/yanyiwu/nodejieba-demo)
|
|
317
377
|
|
|
@@ -347,37 +407,23 @@ npm test
|
|
|
347
407
|
|
|
348
408
|
## 应用
|
|
349
409
|
|
|
350
|
-
|
|
351
|
-
|
|
410
|
+
- 支持中文搜索的 gitbook 插件: [gitbook-plugin-search-pro](https://plugins.gitbook.com/plugin/search-pro)
|
|
411
|
+
- 汉字拼音转换工具: [pinyin](https://github.com/hotoo/pinyin)
|
|
352
412
|
|
|
353
413
|
## 性能评测
|
|
354
414
|
|
|
355
415
|
应该是目前性能最好的 Node.js 中文分词库
|
|
356
|
-
详见: [Jieba中文分词系列性能评测]
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
[由NodeJieba谈谈Node.js异步实现]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-03-21-nodejs-asynchronous-insight.md
|
|
360
|
-
[Node.js的C++扩展初体验之NodeJieba]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2014-02-22-nodejs-cpp-addon-nodejieba.md
|
|
361
|
-
[CppJieba]:https://github.com/yanyiwu/cppjieba.git
|
|
362
|
-
[cnpm]:http://cnpmjs.org
|
|
363
|
-
[Jieba中文分词]:https://github.com/fxsjy/jieba
|
|
364
|
-
|
|
365
|
-
[Jieba中文分词系列性能评测]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md
|
|
366
|
-
[contributors]:https://github.com/yanyiwu/nodejieba/graphs/contributors
|
|
367
|
-
[YanyiWu]:http://github.com/yanyiwu
|
|
368
|
-
[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
|
|
369
|
-
[pinyin]:https://github.com/hotoo/pinyin
|
|
416
|
+
详见: [Jieba中文分词系列性能评测](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md)
|
|
370
417
|
|
|
371
418
|
## Contributors
|
|
372
419
|
|
|
373
420
|
### Code Contributors
|
|
374
421
|
|
|
375
|
-
This project exists thanks to all the people who contribute. [[Contribute](CONTRIBUTING.md)].
|
|
376
|
-
<a href="https://github.com/yanyiwu/nodejieba/graphs/contributors"><img src="https://opencollective.com/nodejieba/contributors.svg?width=890&button=false" /></a>
|
|
422
|
+
This project exists thanks to all the people who contribute. \[[Contribute](CONTRIBUTING.md)]. <a href="https://github.com/yanyiwu/nodejieba/graphs/contributors"><img src="https://opencollective.com/nodejieba/contributors.svg?width=890&button=false" /></a>
|
|
377
423
|
|
|
378
424
|
### Financial Contributors
|
|
379
425
|
|
|
380
|
-
Become a financial contributor and help us sustain our community. [[Contribute](https://opencollective.com/nodejieba/contribute)]
|
|
426
|
+
Become a financial contributor and help us sustain our community. \[[Contribute](https://opencollective.com/nodejieba/contribute)]
|
|
381
427
|
|
|
382
428
|
#### Individuals
|
|
383
429
|
|
|
@@ -385,15 +431,6 @@ Become a financial contributor and help us sustain our community. [[Contribute](
|
|
|
385
431
|
|
|
386
432
|
#### Organizations
|
|
387
433
|
|
|
388
|
-
Support this project with your organization. Your logo will show up here with a link to your website. [[Contribute](https://opencollective.com/nodejieba/contribute)]
|
|
389
|
-
|
|
390
|
-
<a href="https://opencollective.com/nodejieba/organization/0/website"><img src="https://opencollective.com/nodejieba/organization/0/avatar.svg"></a>
|
|
391
|
-
<a href="https://opencollective.com/nodejieba/organization/1/website"><img src="https://opencollective.com/nodejieba/organization/1/avatar.svg"></a>
|
|
392
|
-
<a href="https://opencollective.com/nodejieba/organization/2/website"><img src="https://opencollective.com/nodejieba/organization/2/avatar.svg"></a>
|
|
393
|
-
<a href="https://opencollective.com/nodejieba/organization/3/website"><img src="https://opencollective.com/nodejieba/organization/3/avatar.svg"></a>
|
|
394
|
-
<a href="https://opencollective.com/nodejieba/organization/4/website"><img src="https://opencollective.com/nodejieba/organization/4/avatar.svg"></a>
|
|
395
|
-
<a href="https://opencollective.com/nodejieba/organization/5/website"><img src="https://opencollective.com/nodejieba/organization/5/avatar.svg"></a>
|
|
396
|
-
<a href="https://opencollective.com/nodejieba/organization/6/website"><img src="https://opencollective.com/nodejieba/organization/6/avatar.svg"></a>
|
|
397
|
-
<a href="https://opencollective.com/nodejieba/organization/7/website"><img src="https://opencollective.com/nodejieba/organization/7/avatar.svg"></a>
|
|
398
|
-
<a href="https://opencollective.com/nodejieba/organization/8/website"><img src="https://opencollective.com/nodejieba/organization/8/avatar.svg"></a>
|
|
399
|
-
<a href="https://opencollective.com/nodejieba/organization/9/website"><img src="https://opencollective.com/nodejieba/organization/9/avatar.svg"></a>
|
|
434
|
+
Support this project with your organization. Your logo will show up here with a link to your website. \[[Contribute](https://opencollective.com/nodejieba/contribute)]
|
|
435
|
+
|
|
436
|
+
<a href="https://opencollective.com/nodejieba/organization/0/website"><img src="https://opencollective.com/nodejieba/organization/0/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/1/website"><img src="https://opencollective.com/nodejieba/organization/1/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/2/website"><img src="https://opencollective.com/nodejieba/organization/2/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/3/website"><img src="https://opencollective.com/nodejieba/organization/3/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/4/website"><img src="https://opencollective.com/nodejieba/organization/4/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/5/website"><img src="https://opencollective.com/nodejieba/organization/5/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/6/website"><img src="https://opencollective.com/nodejieba/organization/6/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/7/website"><img src="https://opencollective.com/nodejieba/organization/7/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/8/website"><img src="https://opencollective.com/nodejieba/organization/8/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/9/website"><img src="https://opencollective.com/nodejieba/organization/9/avatar.svg"></a>
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
var nodejieba = require("./index.js");
|
|
2
|
+
|
|
3
|
+
console.log("=== 关键词提取权重机制分析 ===\n");
|
|
4
|
+
|
|
5
|
+
nodejieba.load();
|
|
6
|
+
|
|
7
|
+
const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目,整合了原版的动画及Open Claw打包制作了MAC安装包,它可以出Open Claw现在你的系统桌面的任何地方,也会随互动有特定的动作,还蛮有意思的项目地址:https://github.com/justaLoli/VPet-Mac云盘:https://pan.quark.cn/s/62596470429a功能:✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动,即睡觉、学习、工作等(带计时器,但没有经验、金钱加成)✅自动事件(发呆、待机、睡觉等)✅桌宠自动移动✅摸头预览";
|
|
8
|
+
|
|
9
|
+
console.log("【问题分析】");
|
|
10
|
+
console.log("关键词提取权重 = TF(词频)× IDF(逆文档频率)\n");
|
|
11
|
+
|
|
12
|
+
console.log("步骤1: 查看分词结果");
|
|
13
|
+
var cutResult = nodejieba.cut(content);
|
|
14
|
+
console.log("分词结果:", cutResult.slice(0, 30));
|
|
15
|
+
|
|
16
|
+
console.log("\n步骤2: 统计词频(TF)");
|
|
17
|
+
var wordFreq = {};
|
|
18
|
+
cutResult.forEach(word => {
|
|
19
|
+
wordFreq[word] = (wordFreq[word] || 0) + 1;
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
console.log("\n高频词(出现2次以上):");
|
|
23
|
+
Object.entries(wordFreq)
|
|
24
|
+
.filter(([word, freq]) => freq >= 2)
|
|
25
|
+
.sort((a, b) => b[1] - a[1])
|
|
26
|
+
.forEach(([word, freq]) => {
|
|
27
|
+
console.log(` ${word}: ${freq}次`);
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
console.log("\n步骤3: 提取关键词(未加载用户词典)");
|
|
31
|
+
var extractResult1 = nodejieba.extract(content, 20);
|
|
32
|
+
console.log("关键词及权重:");
|
|
33
|
+
extractResult1.forEach((item, i) => {
|
|
34
|
+
console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)} (出现${wordFreq[item.word] || 1}次)`);
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
console.log("\n步骤4: 加载用户词典");
|
|
38
|
+
nodejieba.loadUserDict("Open Claw 10 n");
|
|
39
|
+
|
|
40
|
+
console.log("\n步骤5: 再次提取关键词");
|
|
41
|
+
var extractResult2 = nodejieba.extract(content, 20);
|
|
42
|
+
console.log("关键词及权重:");
|
|
43
|
+
extractResult2.forEach((item, i) => {
|
|
44
|
+
console.log(` ${i + 1}. ${item.word}: ${item.weight.toFixed(2)} (出现${wordFreq[item.word] || 1}次)`);
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
console.log("\n【核心问题】");
|
|
48
|
+
console.log("1. 'Open Claw' 出现了 2 次(TF = 2)");
|
|
49
|
+
console.log("2. '互动' 出现了 2 次(TF = 2)");
|
|
50
|
+
console.log("3. 但 '互动' 的权重可能更高,因为:");
|
|
51
|
+
console.log(" - '互动' 在 IDF 词典中有专门的权重值");
|
|
52
|
+
console.log(" - 'Open Claw' 不在 IDF 词典中,使用平均 IDF 值");
|
|
53
|
+
console.log("4. 如果 '互动' 的 IDF 值 > 平均 IDF 值,则权重更高");
|
|
54
|
+
|
|
55
|
+
console.log("\n【解决方案】");
|
|
56
|
+
console.log("需要为用户词典中的词设置 IDF 权重!");
|
|
57
|
+
console.log("建议:在 loadUserDict 时,自动为用户词典中的词设置较高的 IDF 值");
|
|
Binary file
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
var nodejieba = require("./index.js");
|
|
2
|
+
|
|
3
|
+
console.log("=== 问题诊断:Open Claw 未被识别 ===\n");
|
|
4
|
+
|
|
5
|
+
console.log("【问题原因分析】");
|
|
6
|
+
console.log("1. 用户词典必须在调用 extract() 之前加载");
|
|
7
|
+
console.log("2. 用户词典会影响分词结果,进而影响关键词提取");
|
|
8
|
+
console.log("3. 关键词提取依赖于分词结果和 IDF 权重\n");
|
|
9
|
+
|
|
10
|
+
console.log("=== 测试场景 1: 错误用法(先提取后加载)===");
|
|
11
|
+
nodejieba.load();
|
|
12
|
+
const content = "这是一个Open Claw项目,Open Claw很好玩";
|
|
13
|
+
|
|
14
|
+
console.log("步骤1: 先提取关键词(未加载用户词典)");
|
|
15
|
+
var result1 = nodejieba.extract(content, 5);
|
|
16
|
+
console.log("关键词:", result1.map(r => r.word));
|
|
17
|
+
console.log("说明: 此时 'Open Claw' 被拆分成单个字母\n");
|
|
18
|
+
|
|
19
|
+
console.log("步骤2: 再加载用户词典");
|
|
20
|
+
nodejieba.loadUserDict("Open Claw 10 n");
|
|
21
|
+
console.log("词典已加载\n");
|
|
22
|
+
|
|
23
|
+
console.log("步骤3: 再次提取关键词");
|
|
24
|
+
var result2 = nodejieba.extract(content, 5);
|
|
25
|
+
console.log("关键词:", result2.map(r => r.word));
|
|
26
|
+
console.log("说明: 此时 'Open Claw' 已被正确识别\n");
|
|
27
|
+
|
|
28
|
+
console.log("=== 测试场景 2: 正确用法(先加载后提取)===");
|
|
29
|
+
console.log("重新初始化...");
|
|
30
|
+
delete require.cache[require.resolve('./index.js')];
|
|
31
|
+
var nodejieba2 = require("./index.js");
|
|
32
|
+
|
|
33
|
+
console.log("步骤1: 先加载词典");
|
|
34
|
+
nodejieba2.load();
|
|
35
|
+
nodejieba2.loadUserDict("Open Claw 10 n");
|
|
36
|
+
console.log("词典已加载\n");
|
|
37
|
+
|
|
38
|
+
console.log("步骤2: 再提取关键词");
|
|
39
|
+
var result3 = nodejieba2.extract(content, 5);
|
|
40
|
+
console.log("关键词:", result3.map(r => r.word));
|
|
41
|
+
console.log("说明: 'Open Claw' 被正确识别\n");
|
|
42
|
+
|
|
43
|
+
console.log("=== 测试场景 3: 检查优先级 ===");
|
|
44
|
+
console.log("测试: 用户词典 vs 默认词典\n");
|
|
45
|
+
|
|
46
|
+
var testCases = [
|
|
47
|
+
"Open Claw",
|
|
48
|
+
"Open Claw是一个项目",
|
|
49
|
+
"我喜欢Open Claw这个游戏"
|
|
50
|
+
];
|
|
51
|
+
|
|
52
|
+
testCases.forEach((text, i) => {
|
|
53
|
+
console.log(`测试 ${i + 1}: "${text}"`);
|
|
54
|
+
var cut = nodejieba2.cut(text);
|
|
55
|
+
var extract = nodejieba2.extract(text, 3);
|
|
56
|
+
console.log(" 分词:", cut);
|
|
57
|
+
console.log(" 关键词:", extract.map(r => r.word));
|
|
58
|
+
console.log(" 包含 'Open Claw':", cut.includes("Open Claw"));
|
|
59
|
+
console.log();
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
console.log("=== 解决方案 ===");
|
|
63
|
+
console.log("✅ 正确做法:");
|
|
64
|
+
console.log(" 1. 先调用 nodejieba.load()");
|
|
65
|
+
console.log(" 2. 再调用 nodejieba.loadUserDict('Open Claw 10 n')");
|
|
66
|
+
console.log(" 3. 最后调用 nodejieba.extract()");
|
|
67
|
+
console.log();
|
|
68
|
+
console.log("❌ 错误做法:");
|
|
69
|
+
console.log(" 1. 先调用 nodejieba.extract()");
|
|
70
|
+
console.log(" 2. 再调用 nodejieba.loadUserDict()");
|
|
71
|
+
console.log(" (此时词典不会影响已提取的结果)");
|
package/index.js
CHANGED
|
@@ -74,6 +74,7 @@ wrapWithDictLoad("extract");
|
|
|
74
74
|
wrapWithDictLoad("textRankExtract");
|
|
75
75
|
wrapWithDictLoad("insertWord");
|
|
76
76
|
wrapWithDictLoad("loadUserDict");
|
|
77
|
+
wrapWithDictLoad("setIdf");
|
|
77
78
|
|
|
78
79
|
// 保存原始的 loadUserDict 函数
|
|
79
80
|
var _loadUserDict = exports.loadUserDict;
|
|
@@ -84,11 +85,18 @@ exports.loadUserDict = function (dict) {
|
|
|
84
85
|
exports.load();
|
|
85
86
|
}
|
|
86
87
|
|
|
87
|
-
|
|
88
|
+
if (dict === null || dict === undefined) {
|
|
89
|
+
return false;
|
|
90
|
+
}
|
|
91
|
+
|
|
88
92
|
if (dict instanceof Set) {
|
|
89
93
|
dict = Array.from(dict);
|
|
90
94
|
}
|
|
91
95
|
|
|
96
|
+
if (typeof dict !== 'string' && !Array.isArray(dict) && !Buffer.isBuffer(dict)) {
|
|
97
|
+
throw new TypeError('dict must be string, string[], Set<string>, or Buffer');
|
|
98
|
+
}
|
|
99
|
+
|
|
92
100
|
return _loadUserDict.call(this, dict);
|
|
93
101
|
};
|
|
94
102
|
|