nodejieba-plus 3.5.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/FUNDING.yml +12 -0
- package/.github/workflows/github_release.yml +61 -0
- package/.github/workflows/npm_publish.yml +24 -0
- package/.github/workflows/stale-issues.yml +24 -0
- package/.github/workflows/test.yml +42 -0
- package/.gitmodules +3 -0
- package/.npmignore +15 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +78 -0
- package/LICENSE +21 -0
- package/README.md +349 -0
- package/binding.gyp +63 -0
- package/index.js +77 -0
- package/lib/index.cpp +3 -0
- package/lib/nodejieba.cpp +218 -0
- package/lib/nodejieba.h +28 -0
- package/lib/utils.h +47 -0
- package/package.json +48 -0
- package/submodules/cppjieba/.github/workflows/cmake.yml +51 -0
- package/submodules/cppjieba/.github/workflows/stale-issues.yml +24 -0
- package/submodules/cppjieba/.gitmodules +3 -0
- package/submodules/cppjieba/CHANGELOG.md +305 -0
- package/submodules/cppjieba/CMakeLists.txt +42 -0
- package/submodules/cppjieba/LICENSE +20 -0
- package/submodules/cppjieba/README.md +280 -0
- package/submodules/cppjieba/deps/limonp/.github/workflows/cmake.yml +43 -0
- package/submodules/cppjieba/deps/limonp/.gitmodules +0 -0
- package/submodules/cppjieba/deps/limonp/CHANGELOG.md +160 -0
- package/submodules/cppjieba/deps/limonp/CMakeLists.txt +61 -0
- package/submodules/cppjieba/deps/limonp/LICENSE +20 -0
- package/submodules/cppjieba/deps/limonp/README.md +38 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/ArgvContext.hpp +70 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Closure.hpp +206 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Colors.hpp +31 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Condition.hpp +38 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Config.hpp +103 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/ForcePublic.hpp +7 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/LocalVector.hpp +139 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/Logging.hpp +90 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/NonCopyable.hpp +21 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/StdExtension.hpp +157 -0
- package/submodules/cppjieba/deps/limonp/include/limonp/StringUtil.hpp +386 -0
- package/submodules/cppjieba/deps/limonp/test/CMakeLists.txt +8 -0
- package/submodules/cppjieba/deps/limonp/test/demo.cpp +40 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/1.conf +5 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/StdExtension.data +3 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/dict.gbk +50 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/dict.utf8 +50 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/io_testfile +2 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.0.1.utf8 +93 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.0.utf8 +93 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.1.utf8 +67 -0
- package/submodules/cppjieba/deps/limonp/test/testdata/jieba.dict.2.utf8 +64 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/CMakeLists.txt +30 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TArgvContext.cpp +16 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TCastFloat.cpp +19 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TClosure.cpp +85 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TColorPrint.cpp +20 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TConfig.cpp +17 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TLocalVector.cpp +41 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TLogging.cpp +12 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TStdExtension.cpp +95 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/TStringUtil.cpp +183 -0
- package/submodules/cppjieba/deps/limonp/test/unittest/gtest_main.cpp +39 -0
- package/submodules/cppjieba/dict/README.md +31 -0
- package/submodules/cppjieba/dict/hmm_model.utf8 +34 -0
- package/submodules/cppjieba/dict/idf.utf8 +258826 -0
- package/submodules/cppjieba/dict/jieba.dict.utf8 +348982 -0
- package/submodules/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- package/submodules/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- package/submodules/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- package/submodules/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- package/submodules/cppjieba/dict/stop_words.utf8 +1534 -0
- package/submodules/cppjieba/dict/user.dict.utf8 +4 -0
- package/submodules/cppjieba/include/cppjieba/DictTrie.hpp +381 -0
- package/submodules/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- package/submodules/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- package/submodules/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- package/submodules/cppjieba/include/cppjieba/Jieba.hpp +169 -0
- package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- package/submodules/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- package/submodules/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- package/submodules/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- package/submodules/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- package/submodules/cppjieba/include/cppjieba/QuerySegment.hpp +89 -0
- package/submodules/cppjieba/include/cppjieba/SegmentBase.hpp +48 -0
- package/submodules/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- package/submodules/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- package/submodules/cppjieba/include/cppjieba/Trie.hpp +200 -0
- package/submodules/cppjieba/include/cppjieba/Unicode.hpp +231 -0
- package/submodules/cppjieba/test/CMakeLists.txt +4 -0
- package/submodules/cppjieba/test/load_test.cpp +54 -0
- package/submodules/cppjieba/test/testdata/curl.res +1 -0
- package/submodules/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- package/submodules/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- package/submodules/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- package/submodules/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- package/submodules/cppjieba/test/testdata/load_test.urls +2 -0
- package/submodules/cppjieba/test/testdata/review.100 +100 -0
- package/submodules/cppjieba/test/testdata/review.100.res +200 -0
- package/submodules/cppjieba/test/testdata/server.conf +19 -0
- package/submodules/cppjieba/test/testdata/testlines.gbk +9 -0
- package/submodules/cppjieba/test/testdata/testlines.utf8 +8 -0
- package/submodules/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- package/submodules/cppjieba/test/testdata/userdict.english +2 -0
- package/submodules/cppjieba/test/testdata/userdict.utf8 +8 -0
- package/submodules/cppjieba/test/testdata/weicheng.utf8 +247 -0
- package/submodules/cppjieba/test/unittest/CMakeLists.txt +33 -0
- package/submodules/cppjieba/test/unittest/gtest_main.cpp +39 -0
- package/submodules/cppjieba/test/unittest/jieba_test.cpp +166 -0
- package/submodules/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- package/submodules/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- package/submodules/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- package/submodules/cppjieba/test/unittest/segments_test.cpp +256 -0
- package/submodules/cppjieba/test/unittest/textrank_test.cpp +86 -0
- package/submodules/cppjieba/test/unittest/trie_test.cpp +177 -0
- package/submodules/cppjieba/test/unittest/unicode_test.cpp +43 -0
- package/test/debug_split +0 -0
- package/test/debug_split2 +0 -0
- package/test/debug_split3 +0 -0
- package/test/load_dict_test.js +14 -0
- package/test/missing_binding_test.js +42 -0
- package/test/test.js +366 -0
- package/test/testdata/userdict.utf8 +1 -0
- package/tsconfig.json +59 -0
- package/types/index.d.ts +30 -0
- package/typescript_demo.ts +38 -0
package/README.md
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
[](https://github.com/yanyiwu/nodejieba/actions/workflows/test.yml)
|
|
2
|
+
[](https://opencollective.com/nodejieba) [](https://github.com/yanyiwu/)
|
|
3
|
+
[](https://github.com/yanyiwu/nodejieba)
|
|
4
|
+
[](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md)
|
|
5
|
+
[](http://yanyiwu.mit-license.org)
|
|
6
|
+
[](https://www.npmjs.org/package/nodejieba)
|
|
7
|
+
[](https://www.npmjs.org/package/nodejieba)
|
|
8
|
+
[](https://codeclimate.com/github/yanyiwu/nodejieba)
|
|
9
|
+
|
|
10
|
+
- - -
|
|
11
|
+
|
|
12
|
+
# NodeJieba "结巴"分词的Node.js版本
|
|
13
|
+
|
|
14
|
+
## 介绍
|
|
15
|
+
|
|
16
|
+
`NodeJieba`是"结巴"中文分词的 Node.js 版本实现,
|
|
17
|
+
由[CppJieba]提供底层分词算法实现,
|
|
18
|
+
是兼具高性能和易用性两者的 Node.js 中文分词组件。
|
|
19
|
+
|
|
20
|
+
## 特点
|
|
21
|
+
|
|
22
|
+
+ 词典载入方式灵活,无需配置词典路径也可使用,需要定制自己的词典路径时也可灵活定制。
|
|
23
|
+
+ 底层算法实现是C++,性能高效。
|
|
24
|
+
+ 支持多种分词算法,各种分词算法见[CppJieba]的README.md介绍。
|
|
25
|
+
+ 支持动态补充词库。
|
|
26
|
+
+ 支持TypeScript,提供完整的类型定义。
|
|
27
|
+
+ **支持包含空格的关键词**(如 "Open Claw")。
|
|
28
|
+
+ **支持无空格版本匹配**(如 "OpenClaw" 可匹配 "Open Claw")。
|
|
29
|
+
+ **支持英文大小写不敏感匹配**(如 "open claw"、"OPEN CLAW" 都可匹配 "Open Claw")。
|
|
30
|
+
|
|
31
|
+
对实现细节感兴趣的请看如下博文:
|
|
32
|
+
|
|
33
|
+
+ [Node.js的C++扩展初体验之NodeJieba]
|
|
34
|
+
+ [由NodeJieba谈谈Node.js异步实现]
|
|
35
|
+
|
|
36
|
+
## 安装
|
|
37
|
+
|
|
38
|
+
```sh
|
|
39
|
+
npm install nodejieba
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## 快速开始
|
|
43
|
+
|
|
44
|
+
```js
|
|
45
|
+
var nodejieba = require("nodejieba");
|
|
46
|
+
var result = nodejieba.cut("南京市长江大桥");
|
|
47
|
+
console.log(result);
|
|
48
|
+
//["南京市","长江大桥"]
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
更多示例请参考 [demo](https://github.com/yanyiwu/nodejieba-demo)
|
|
52
|
+
|
|
53
|
+
### 词典载入可灵活配置
|
|
54
|
+
|
|
55
|
+
如果没有主动调用词典载入函数,
|
|
56
|
+
则会在第一次调用cut等功能函数时,自动载入默认词典。
|
|
57
|
+
|
|
58
|
+
如果要主动触发词典载入,则使用以下函数主动触发。
|
|
59
|
+
|
|
60
|
+
```js
|
|
61
|
+
nodejieba.load();
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
以上用法会自动载入所有默认词典,
|
|
65
|
+
如果需要载入自己的词典,而不是默认词典。
|
|
66
|
+
比如想要载入自己的用户词典,则使用以下函数:
|
|
67
|
+
|
|
68
|
+
```js
|
|
69
|
+
nodejieba.load({
|
|
70
|
+
userDict: './test/testdata/userdict.utf8',
|
|
71
|
+
});
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
字典载入函数load的参数项都是可选的,
|
|
75
|
+
如果没有对应的项则自动填充默认参数。
|
|
76
|
+
所以上面这段代码和下面这段代码是等价的。
|
|
77
|
+
|
|
78
|
+
```js
|
|
79
|
+
nodejieba.load({
|
|
80
|
+
dict: nodejieba.DEFAULT_DICT,
|
|
81
|
+
hmmDict: nodejieba.DEFAULT_HMM_DICT,
|
|
82
|
+
userDict: './test/testdata/userdict.utf8',
|
|
83
|
+
idfDict: nodejieba.DEFAULT_IDF_DICT,
|
|
84
|
+
stopWordDict: nodejieba.DEFAULT_STOP_WORD_DICT,
|
|
85
|
+
});
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
#### 词典说明
|
|
89
|
+
|
|
90
|
+
+ dict: 主词典,带权重和词性标签,建议使用默认词典。
|
|
91
|
+
+ hmmDict: 隐马尔可夫模型,建议使用默认词典。
|
|
92
|
+
+ userDict: 用户词典,建议自己根据需要定制。
|
|
93
|
+
+ idfDict: 关键词抽取所需的idf信息。
|
|
94
|
+
+ stopWordDict: 关键词抽取所需的停用词列表。
|
|
95
|
+
|
|
96
|
+
## API 文档
|
|
97
|
+
|
|
98
|
+
### 分词
|
|
99
|
+
|
|
100
|
+
#### 1. 默认分词
|
|
101
|
+
|
|
102
|
+
```js
|
|
103
|
+
var nodejieba = require("nodejieba");
|
|
104
|
+
var result = nodejieba.cut("南京市长江大桥");
|
|
105
|
+
console.log(result);
|
|
106
|
+
// ["南京市", "长江大桥"]
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
#### 2. 使用HMM模型分词
|
|
110
|
+
|
|
111
|
+
```js
|
|
112
|
+
var result = nodejieba.cutHMM("南京市长江大桥");
|
|
113
|
+
console.log(result);
|
|
114
|
+
// ["南京市", "长江大桥"]
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
#### 3. 全模式分词
|
|
118
|
+
|
|
119
|
+
```js
|
|
120
|
+
var result = nodejieba.cutAll("南京市长江大桥");
|
|
121
|
+
console.log(result);
|
|
122
|
+
// ["南京", "南京市", "市长", "长江", "长江大桥", "大桥"]
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
#### 4. 搜索引擎模式分词
|
|
126
|
+
|
|
127
|
+
```js
|
|
128
|
+
var result = nodejieba.cutForSearch("南京市长江大桥");
|
|
129
|
+
console.log(result);
|
|
130
|
+
// ["南京", "市", "长江", "大桥", "南京市", "长江大桥"]
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
#### 5. 小粒度分词
|
|
134
|
+
|
|
135
|
+
```js
|
|
136
|
+
var result = nodejieba.cutSmall("南京市长江大桥", 3);
|
|
137
|
+
console.log(result);
|
|
138
|
+
// ["南京市", "长江大桥"]
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### 词性标注
|
|
142
|
+
|
|
143
|
+
```js
|
|
144
|
+
var nodejieba = require("nodejieba");
|
|
145
|
+
var result = nodejieba.tag("红掌拨清波");
|
|
146
|
+
console.log(result);
|
|
147
|
+
// [ { word: '红掌', tag: 'n' },
|
|
148
|
+
// { word: '拨', tag: 'v' },
|
|
149
|
+
// { word: '清波', tag: 'n' } ]
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### 关键词提取
|
|
153
|
+
|
|
154
|
+
```js
|
|
155
|
+
var nodejieba = require("nodejieba");
|
|
156
|
+
var sentence = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
|
|
157
|
+
var result = nodejieba.extract(sentence, 5);
|
|
158
|
+
console.log(result);
|
|
159
|
+
// [ { word: '升职', weight: 11.739204307083542 },
|
|
160
|
+
// { word: '加薪', weight: 10.8561552143 },
|
|
161
|
+
// { word: 'CEO', weight: 10.642581114 },
|
|
162
|
+
// { word: '手扶拖拉机', weight: 10.0088573539 },
|
|
163
|
+
// { word: '巅峰', weight: 9.49395840471 } ]
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### TextRank关键词提取
|
|
167
|
+
|
|
168
|
+
```js
|
|
169
|
+
var nodejieba = require("nodejieba");
|
|
170
|
+
var sentence = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
|
|
171
|
+
var result = nodejieba.textRankExtract(sentence, 5);
|
|
172
|
+
console.log(result);
|
|
173
|
+
// [ { word: '当上', weight: 1 },
|
|
174
|
+
// { word: '不用', weight: 0.9897190043 },
|
|
175
|
+
// { word: '多久', weight: 0.9897190043 },
|
|
176
|
+
// { word: '加薪', weight: 0.9897190043 },
|
|
177
|
+
// { word: '升职', weight: 0.9897190043 } ]
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### 添加自定义词语
|
|
181
|
+
|
|
182
|
+
```js
|
|
183
|
+
var nodejieba = require("nodejieba");
|
|
184
|
+
console.log(nodejieba.cut("男默女泪"));
|
|
185
|
+
// ["男默", "女泪"]
|
|
186
|
+
nodejieba.insertWord("男默女泪");
|
|
187
|
+
console.log(nodejieba.cut("男默女泪"));
|
|
188
|
+
// ["男默女泪"]
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### 包含空格的关键词(新功能)
|
|
192
|
+
|
|
193
|
+
支持在自定义词典中使用包含空格的关键词,且支持无空格版本匹配和大小写不敏感匹配。
|
|
194
|
+
|
|
195
|
+
#### 用户词典格式
|
|
196
|
+
|
|
197
|
+
用户词典支持以下格式:
|
|
198
|
+
|
|
199
|
+
```
|
|
200
|
+
# 只有关键词
|
|
201
|
+
Open Claw
|
|
202
|
+
|
|
203
|
+
# 关键词 + 词性标签
|
|
204
|
+
Open Claw n
|
|
205
|
+
|
|
206
|
+
# 关键词 + 词频 + 词性标签
|
|
207
|
+
Open Claw 100 n
|
|
208
|
+
|
|
209
|
+
# 包含多个空格的关键词
|
|
210
|
+
Machine Learning 200 n
|
|
211
|
+
Artificial Intelligence 300 n
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
#### 使用示例
|
|
215
|
+
|
|
216
|
+
```js
|
|
217
|
+
var nodejieba = require("nodejieba");
|
|
218
|
+
var fs = require('fs');
|
|
219
|
+
var path = require('path');
|
|
220
|
+
|
|
221
|
+
// 创建包含空格关键词的用户词典
|
|
222
|
+
var dictContent = `Open Claw 100 n
|
|
223
|
+
Machine Learning 200 n
|
|
224
|
+
Artificial Intelligence 300 n
|
|
225
|
+
`;
|
|
226
|
+
|
|
227
|
+
var testDictPath = path.join(__dirname, 'user_dict.utf8');
|
|
228
|
+
fs.writeFileSync(testDictPath, dictContent);
|
|
229
|
+
|
|
230
|
+
// 加载词典
|
|
231
|
+
nodejieba.load({
|
|
232
|
+
userDict: testDictPath,
|
|
233
|
+
});
|
|
234
|
+
|
|
235
|
+
// 测试1: 包含空格的关键词匹配
|
|
236
|
+
console.log(nodejieba.cut("I want to use Open Claw tool"));
|
|
237
|
+
// 输出包含: ['Open Claw']
|
|
238
|
+
|
|
239
|
+
// 测试2: 大小写不敏感匹配
|
|
240
|
+
console.log(nodejieba.cut("open claw")); // 匹配 Open Claw
|
|
241
|
+
console.log(nodejieba.cut("OPEN CLAW")); // 匹配 Open Claw
|
|
242
|
+
console.log(nodejieba.cut("Open Claw")); // 匹配 Open Claw
|
|
243
|
+
|
|
244
|
+
// 测试3: 无空格版本匹配
|
|
245
|
+
console.log(nodejieba.cut("OpenClaw")); // 匹配 Open Claw
|
|
246
|
+
console.log(nodejieba.cut("openclaw")); // 匹配 Open Claw
|
|
247
|
+
console.log(nodejieba.cut("OPENCLAW")); // 匹配 Open Claw
|
|
248
|
+
|
|
249
|
+
// 测试4: 其他包含空格的关键词
|
|
250
|
+
console.log(nodejieba.cut("Machine Learning is great"));
|
|
251
|
+
// 输出包含: ['Machine Learning']
|
|
252
|
+
|
|
253
|
+
console.log(nodejieba.cut("Artificial Intelligence will change the world"));
|
|
254
|
+
// 输出包含: ['Artificial Intelligence']
|
|
255
|
+
|
|
256
|
+
// 清理测试文件
|
|
257
|
+
fs.unlinkSync(testDictPath);
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
#### 功能说明
|
|
261
|
+
|
|
262
|
+
1. **包含空格的关键词**: 词典中的 "Open Claw" 可以匹配文本中的 "Open Claw"
|
|
263
|
+
2. **无空格版本匹配**: 词典中的 "Open Claw" 也可以匹配文本中的 "OpenClaw"
|
|
264
|
+
3. **大小写不敏感**: 词典中的 "Open Claw" 可以匹配 "open claw"、"OPEN CLAW"、"Open Claw" 等任意大小写组合
|
|
265
|
+
|
|
266
|
+
More Details in [demo](https://github.com/yanyiwu/nodejieba-demo)
|
|
267
|
+
|
|
268
|
+
### 关键词抽取
|
|
269
|
+
|
|
270
|
+
```js
|
|
271
|
+
var nodejieba = require("nodejieba");
|
|
272
|
+
var topN = 4;
|
|
273
|
+
console.log(nodejieba.extract("升职加薪,当上CEO,走上人生巅峰。", topN));
|
|
274
|
+
//[ { word: 'CEO', weight: 11.739204307083542 },
|
|
275
|
+
// { word: '升职', weight: 10.8561552143 },
|
|
276
|
+
// { word: '加薪', weight: 10.642581114 },
|
|
277
|
+
// { word: '巅峰', weight: 9.49395840471 } ]
|
|
278
|
+
|
|
279
|
+
console.log(nodejieba.textRankExtract("升职加薪,当上CEO,走上人生巅峰。", topN));
|
|
280
|
+
//[ { word: '当上', weight: 1 },
|
|
281
|
+
// { word: '不用', weight: 0.9898479330698993 },
|
|
282
|
+
// { word: '多久', weight: 0.9851260595435759 },
|
|
283
|
+
// { word: '加薪', weight: 0.9830464899847804 },
|
|
284
|
+
// { word: '升职', weight: 0.9802777682279076 } ]
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
More Details in [demo](https://github.com/yanyiwu/nodejieba-demo)
|
|
288
|
+
|
|
289
|
+
## Develop NodeJieba
|
|
290
|
+
|
|
291
|
+
```sh
|
|
292
|
+
git clone --recurse-submodules https://github.com/yanyiwu/nodejieba.git
|
|
293
|
+
cd nodejieba
|
|
294
|
+
npm install
|
|
295
|
+
npm test
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
## 应用
|
|
299
|
+
|
|
300
|
+
+ 支持中文搜索的 gitbook 插件: [gitbook-plugin-search-pro]
|
|
301
|
+
+ 汉字拼音转换工具: [pinyin]
|
|
302
|
+
|
|
303
|
+
## 性能评测
|
|
304
|
+
|
|
305
|
+
应该是目前性能最好的 Node.js 中文分词库
|
|
306
|
+
详见: [Jieba中文分词系列性能评测]
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
[由NodeJieba谈谈Node.js异步实现]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-03-21-nodejs-asynchronous-insight.md
|
|
310
|
+
[Node.js的C++扩展初体验之NodeJieba]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2014-02-22-nodejs-cpp-addon-nodejieba.md
|
|
311
|
+
[CppJieba]:https://github.com/yanyiwu/cppjieba.git
|
|
312
|
+
[cnpm]:http://cnpmjs.org
|
|
313
|
+
[Jieba中文分词]:https://github.com/fxsjy/jieba
|
|
314
|
+
|
|
315
|
+
[Jieba中文分词系列性能评测]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md
|
|
316
|
+
[contributors]:https://github.com/yanyiwu/nodejieba/graphs/contributors
|
|
317
|
+
[YanyiWu]:http://github.com/yanyiwu
|
|
318
|
+
[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
|
|
319
|
+
[pinyin]:https://github.com/hotoo/pinyin
|
|
320
|
+
|
|
321
|
+
## Contributors
|
|
322
|
+
|
|
323
|
+
### Code Contributors
|
|
324
|
+
|
|
325
|
+
This project exists thanks to all the people who contribute. [[Contribute](CONTRIBUTING.md)].
|
|
326
|
+
<a href="https://github.com/yanyiwu/nodejieba/graphs/contributors"><img src="https://opencollective.com/nodejieba/contributors.svg?width=890&button=false" /></a>
|
|
327
|
+
|
|
328
|
+
### Financial Contributors
|
|
329
|
+
|
|
330
|
+
Become a financial contributor and help us sustain our community. [[Contribute](https://opencollective.com/nodejieba/contribute)]
|
|
331
|
+
|
|
332
|
+
#### Individuals
|
|
333
|
+
|
|
334
|
+
<a href="https://opencollective.com/nodejieba"><img src="https://opencollective.com/nodejieba/individuals.svg?width=890"></a>
|
|
335
|
+
|
|
336
|
+
#### Organizations
|
|
337
|
+
|
|
338
|
+
Support this project with your organization. Your logo will show up here with a link to your website. [[Contribute](https://opencollective.com/nodejieba/contribute)]
|
|
339
|
+
|
|
340
|
+
<a href="https://opencollective.com/nodejieba/organization/0/website"><img src="https://opencollective.com/nodejieba/organization/0/avatar.svg"></a>
|
|
341
|
+
<a href="https://opencollective.com/nodejieba/organization/1/website"><img src="https://opencollective.com/nodejieba/organization/1/avatar.svg"></a>
|
|
342
|
+
<a href="https://opencollective.com/nodejieba/organization/2/website"><img src="https://opencollective.com/nodejieba/organization/2/avatar.svg"></a>
|
|
343
|
+
<a href="https://opencollective.com/nodejieba/organization/3/website"><img src="https://opencollective.com/nodejieba/organization/3/avatar.svg"></a>
|
|
344
|
+
<a href="https://opencollective.com/nodejieba/organization/4/website"><img src="https://opencollective.com/nodejieba/organization/4/avatar.svg"></a>
|
|
345
|
+
<a href="https://opencollective.com/nodejieba/organization/5/website"><img src="https://opencollective.com/nodejieba/organization/5/avatar.svg"></a>
|
|
346
|
+
<a href="https://opencollective.com/nodejieba/organization/6/website"><img src="https://opencollective.com/nodejieba/organization/6/avatar.svg"></a>
|
|
347
|
+
<a href="https://opencollective.com/nodejieba/organization/7/website"><img src="https://opencollective.com/nodejieba/organization/7/avatar.svg"></a>
|
|
348
|
+
<a href="https://opencollective.com/nodejieba/organization/8/website"><img src="https://opencollective.com/nodejieba/organization/8/avatar.svg"></a>
|
|
349
|
+
<a href="https://opencollective.com/nodejieba/organization/9/website"><img src="https://opencollective.com/nodejieba/organization/9/avatar.svg"></a>
|
package/binding.gyp
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"targets": [
|
|
3
|
+
{
|
|
4
|
+
"target_name": "nodejieba",
|
|
5
|
+
"cflags!": [ "-fno-exceptions" ],
|
|
6
|
+
"cflags_cc!": [ "-fno-exceptions" ],
|
|
7
|
+
"xcode_settings": {
|
|
8
|
+
"GCC_ENABLE_CPP_EXCEPTIONS": "YES",
|
|
9
|
+
"CLANG_CXX_LIBRARY": "libc++",
|
|
10
|
+
"MACOSX_DEPLOYMENT_TARGET": "10.13",
|
|
11
|
+
},
|
|
12
|
+
"msvs_settings": {
|
|
13
|
+
"VCCLCompilerTool": {
|
|
14
|
+
"ExceptionHandling": 1,
|
|
15
|
+
"AdditionalOptions": ["/std:c++14","/utf-8"]
|
|
16
|
+
},
|
|
17
|
+
},
|
|
18
|
+
"win_delay_load_hook": "true",
|
|
19
|
+
"sources": [
|
|
20
|
+
"./lib/index.cpp",
|
|
21
|
+
"./lib/nodejieba.cpp",
|
|
22
|
+
],
|
|
23
|
+
"cflags": [
|
|
24
|
+
"-DLOGGING_LEVEL=LL_WARNING"
|
|
25
|
+
],
|
|
26
|
+
"include_dirs" : [
|
|
27
|
+
"<!(node -p \"require('node-addon-api').include_dir\")",
|
|
28
|
+
"./submodules/cppjieba/include",
|
|
29
|
+
"./submodules/cppjieba/deps/limonp/include",
|
|
30
|
+
],
|
|
31
|
+
'configurations': {
|
|
32
|
+
'Release': {
|
|
33
|
+
'msvs_settings': {
|
|
34
|
+
'VCCLCompilerTool': {
|
|
35
|
+
'ExceptionHandling': '1',
|
|
36
|
+
'PreprocessorDefinitions': ['LOGGING_LEVEL=LL_WARNING'],
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
"conditions": [
|
|
42
|
+
[ "OS == 'mac'", {
|
|
43
|
+
"xcode_settings": {
|
|
44
|
+
"OTHER_CPLUSPLUSFLAGS":[
|
|
45
|
+
"-mmacosx-version-min=10.13",
|
|
46
|
+
"-std=c++14",
|
|
47
|
+
"-stdlib=libc++",
|
|
48
|
+
"-DLOGGING_LEVEL=LL_WARNING",
|
|
49
|
+
]
|
|
50
|
+
}
|
|
51
|
+
}],
|
|
52
|
+
# 添加Windows特定条件
|
|
53
|
+
[ "OS == 'win'", {
|
|
54
|
+
"msvs_settings": {
|
|
55
|
+
"VCCLCompilerTool": {
|
|
56
|
+
"AdditionalOptions": ["/std:c++14"]
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}]
|
|
60
|
+
],
|
|
61
|
+
}
|
|
62
|
+
]
|
|
63
|
+
}
|
package/index.js
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
var path = require('path');
|
|
2
|
+
var BINARY_PATH = path.join(__dirname, 'build/Release/nodejieba.node');
|
|
3
|
+
var nodejieba;
|
|
4
|
+
var bindingLoadError;
|
|
5
|
+
|
|
6
|
+
/**
 * Lazily loads the compiled native addon and memoizes the outcome.
 *
 * A successful load is stored in the module-level `nodejieba` variable and
 * returned directly on later calls. When the binary is missing, a descriptive
 * error (code `BINDING_NOT_FOUND`, original error attached as `cause`) is
 * stored in `bindingLoadError` and re-thrown on every later call. Any other
 * load failure is re-thrown unchanged and is not cached.
 *
 * @returns {object} The native binding loaded from BINARY_PATH.
 * @throws {Error} When the native binding cannot be loaded.
 */
function getNodejieba() {
  if (!nodejieba) {
    // A previous attempt already established that the binary is missing.
    if (bindingLoadError) {
      throw bindingLoadError;
    }
    try {
      nodejieba = require(BINARY_PATH);
    } catch (loadFailure) {
      var binaryMissing = Boolean(loadFailure) && loadFailure.code === 'MODULE_NOT_FOUND';
      if (!binaryMissing) {
        // Not a "file not found" problem: surface the original error untouched.
        throw loadFailure;
      }
      var message =
        "nodejieba native binding was not found at " + BINARY_PATH + ". " +
        "This usually means install scripts were skipped or the native binary failed to download/build. " +
        "Try reinstalling without --ignore-scripts or run `npm rebuild nodejieba`.";
      bindingLoadError = new Error(message);
      bindingLoadError.code = 'BINDING_NOT_FOUND';
      bindingLoadError.cause = loadFailure;
      throw bindingLoadError;
    }
  }
  return nodejieba;
}
|
|
30
|
+
|
|
31
|
+
var isDictLoaded = false;
|
|
32
|
+
|
|
33
|
+
var DICT_DIR = __dirname + "/submodules/cppjieba/dict/";
|
|
34
|
+
|
|
35
|
+
/**
 * Public API object of the module (assigned to `module.exports` at the end of
 * the file). It carries the default dictionary paths, all located under
 * DICT_DIR, and the `load` function; the remaining API functions are attached
 * later by `wrapWithDictLoad`.
 */
var exports = {
  DEFAULT_DICT: DICT_DIR + "jieba.dict.utf8",
  DEFAULT_HMM_DICT: DICT_DIR + "hmm_model.utf8",
  DEFAULT_USER_DICT: DICT_DIR + "user.dict.utf8",
  DEFAULT_IDF_DICT: DICT_DIR + "idf.utf8",
  DEFAULT_STOP_WORD_DICT: DICT_DIR + "stop_words.utf8",

  /**
   * Loads the dictionaries into the native binding and marks them as loaded.
   *
   * Every option is optional; a missing (or falsy) entry falls back to the
   * corresponding DEFAULT_* path.
   *
   * @param {object} [dictJson] Dictionary paths to use.
   * @param {string} [dictJson.dict] Main dictionary path.
   * @param {string} [dictJson.hmmDict] HMM model path.
   * @param {string} [dictJson.userDict] User dictionary path.
   * @param {string} [dictJson.idfDict] IDF dictionary path.
   * @param {string} [dictJson.stopWordDict] Stop-word list path.
   * @returns {*} Whatever the native `load` call returns.
   * @throws {Error} When the native binding cannot be loaded.
   */
  load: function (dictJson) {
    var options = dictJson || {};

    // Declared with `var` so the paths stay local: the previous undeclared
    // assignments created implicit globals (and throw in strict mode).
    var dict = options.dict || exports.DEFAULT_DICT;
    var hmmDict = options.hmmDict || exports.DEFAULT_HMM_DICT;
    var userDict = options.userDict || exports.DEFAULT_USER_DICT;
    var idfDict = options.idfDict || exports.DEFAULT_IDF_DICT;
    var stopWordDict = options.stopWordDict || exports.DEFAULT_STOP_WORD_DICT;

    var result = getNodejieba().load(dict, hmmDict, userDict, idfDict, stopWordDict);
    // Only reached when the native load did not throw.
    isDictLoaded = true;
    return result;
  }
};
|
|
57
|
+
|
|
58
|
+
/**
 * Attaches a function named `functName` to the exported API. The attached
 * function first triggers `exports.load()` with default options if no
 * dictionary has been loaded yet, then forwards the call (same `this`, same
 * arguments) to the native binding's function of the same name.
 *
 * @param {string} functName Name of the native binding function to expose.
 */
function wrapWithDictLoad(functName) {
  exports[functName] = function () {
    // Load the default dictionaries on first use.
    if (!isDictLoaded) {
      exports.load();
    }
    var binding = getNodejieba();
    var nativeFunction = binding[functName];
    return nativeFunction.apply(this, arguments);
  };
}
|
|
66
|
+
|
|
67
|
+
// Native binding functions exposed on the public API; each one is wrapped so
// that the dictionaries are loaded before its first use.
var dictBackedFunctions = [
  "cut",
  "cutAll",
  "cutHMM",
  "cutForSearch",
  "cutSmall",
  "tag",
  "extract",
  "textRankExtract",
  "insertWord"
];
dictBackedFunctions.forEach(function (functName) {
  wrapWithDictLoad(functName);
});
|
|
76
|
+
|
|
77
|
+
module.exports = exports;
|
package/lib/index.cpp
ADDED