cppjieba_rb 0.2.3 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +6 -2
- data/README.md +8 -4
- data/Rakefile +1 -1
- data/cppjieba_rb.gemspec +5 -5
- data/ext/cppjieba/.travis.yml +0 -1
- data/ext/cppjieba/ChangeLog.md +13 -0
- data/ext/cppjieba/README.md +21 -17
- data/ext/cppjieba/README_EN.md +4 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +1 -3
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +1 -1
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +89 -39
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +6 -6
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +22 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -1
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +17 -5
- data/ext/cppjieba/test/demo.cpp +1 -1
- data/ext/cppjieba_rb/cppjieba_rb.c +1 -0
- data/ext/cppjieba_rb/internal.cc +28 -1
- data/lib/cppjieba_rb.rb +4 -0
- data/lib/cppjieba_rb/version.rb +1 -1
- data/test/test_stop_word_filter.rb +10 -0
- metadata +26 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 74faf109bce12cd386acb5d2ad03ab4fde55a68cdb07a723a5af1f5b2528f3d5
|
4
|
+
data.tar.gz: c4a58470ef8e352cad688b9814080c37b332e46a93987e536387a94304ad1383
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 925ba793289f09a922451f6717b18a769080be4ab132a23edb236e4b91760251b67237dc2b6505c1aa39988d60294ff1ac07638c40a46916f6064be2cd71a425
|
7
|
+
data.tar.gz: e5f9dfcd2d341dd880bfc9aceee265abf6cf1e1d1a3428f807b1cd810691bab8e24946c974fef1260f89aebf03e82e85e518fd961905fd1150eac3ad6108c7ca
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -2,10 +2,14 @@
|
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/cppjieba_rb.svg)](http://badge.fury.io/rb/cppjieba_rb)
|
4
4
|
|
5
|
-
[![Build Status](https://travis-ci.
|
5
|
+
[![Build Status](https://travis-ci.com/erickguan/cppjieba_rb.svg?branch=master)](https://travis-ci.com/erickguan/cppjieba_rb)
|
6
|
+
|
7
|
+
[![Patreon](https://img.shields.io/badge/back_on-patreon-red.svg)](https://www.patreon.com/fantasticfears)
|
6
8
|
|
7
9
|
Ruby bindings for [Cppjieba](https://github.com/yanyiwu/cppjieba). C++11 required. (gcc 4.8+)
|
8
10
|
|
11
|
+
The TRIE tree has high memory usage. For default dict, it uses ~120 MB memory.
|
12
|
+
|
9
13
|
## Installation
|
10
14
|
|
11
15
|
Add this line to your application's Gemfile:
|
@@ -26,7 +30,7 @@ Mix Segment mode (HMM with Max Prob, default):
|
|
26
30
|
|
27
31
|
```ruby
|
28
32
|
require 'cppjieba_rb'
|
29
|
-
seg = CppjiebaRb::Segment.new # equivalent to "
|
33
|
+
seg = CppjiebaRb::Segment.new # equivalent to "CppjiebaRb::Segment.new mode: :mix"
|
30
34
|
words = seg.segment "令狐冲是云计算行业的专家"
|
31
35
|
# 令狐冲 是 云 计算 行业 的 专家
|
32
36
|
```
|
@@ -70,7 +74,7 @@ CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不
|
|
70
74
|
|
71
75
|
## Contributing
|
72
76
|
|
73
|
-
1. Fork it ( http://github.com
|
77
|
+
1. Fork it ( http://github.com/fantasticfears/cppjieba_rb/fork )
|
74
78
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
75
79
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
76
80
|
4. Push to the branch (`git push origin my-new-feature`)
|
@@ -79,4 +83,4 @@ CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不
|
|
79
83
|
## TODO
|
80
84
|
|
81
85
|
- including 367w dict and provide the option for it.
|
82
|
-
- cppjieba implements trie tree, it's memory consuming
|
86
|
+
- cppjieba implements trie tree, it's memory consuming
|
data/Rakefile
CHANGED
@@ -3,7 +3,7 @@ require 'rake/testtask'
|
|
3
3
|
require 'rake/extensiontask'
|
4
4
|
|
5
5
|
gem = Gem::Specification.load(File.dirname(__FILE__) + '/cppjieba_rb.gemspec')
|
6
|
-
Rake::ExtensionTask.new(
|
6
|
+
Rake::ExtensionTask.new("cppjieba_rb", gem) do |ext|
|
7
7
|
ext.lib_dir = File.join('lib', 'cppjieba_rb')
|
8
8
|
end
|
9
9
|
|
data/cppjieba_rb.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ['Erick Guan']
|
10
10
|
spec.email = ['fantasticfears@gmail.com']
|
11
11
|
spec.summary = 'cppjieba binding for ruby'
|
12
|
-
spec.description = 'cppjieba binding for ruby'
|
12
|
+
spec.description = 'cppjieba binding for ruby. Mainly used by Discourse.'
|
13
13
|
spec.homepage = 'https://github.com/fantasticfears/cppjieba_rb'
|
14
14
|
spec.required_ruby_version = '>=2.3.0'
|
15
15
|
spec.license = 'MIT'
|
@@ -43,8 +43,8 @@ Gem::Specification.new do |spec|
|
|
43
43
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
44
44
|
spec.require_paths = ['lib']
|
45
45
|
|
46
|
-
spec.add_development_dependency 'bundler', '~>
|
47
|
-
spec.add_development_dependency 'rake'
|
48
|
-
spec.add_development_dependency 'rake-compiler'
|
49
|
-
spec.add_development_dependency 'minitest'
|
46
|
+
spec.add_development_dependency 'bundler', '~> 2.2', '>= 2.2.10'
|
47
|
+
spec.add_development_dependency 'rake', '~> 13'
|
48
|
+
spec.add_development_dependency 'rake-compiler', '~> 1.1'
|
49
|
+
spec.add_development_dependency 'minitest', '~> 5.14'
|
50
50
|
end
|
data/ext/cppjieba/.travis.yml
CHANGED
data/ext/cppjieba/ChangeLog.md
CHANGED
@@ -1,5 +1,18 @@
|
|
1
1
|
# CppJieba ChangeLog
|
2
2
|
|
3
|
+
## v5.0.3
|
4
|
+
|
5
|
+
+ Upgrade [limonp](https://github.com/yanyiwu/limonp) -> v0.6.3
|
6
|
+
|
7
|
+
## v5.0.2
|
8
|
+
|
9
|
+
+ Upgrade [limonp](https://github.com/yanyiwu/limonp) -> v0.6.1
|
10
|
+
|
11
|
+
## v5.0.1
|
12
|
+
|
13
|
+
+ Make Compiler Happier.
|
14
|
+
+ Add PHP, DLang Links.
|
15
|
+
|
3
16
|
## v5.0.0
|
4
17
|
|
5
18
|
+ Notice(**api changed**) : Jieba class 3 arguments -> 5 arguments, and use KeywordExtractor in Jieba
|
data/ext/cppjieba/README.md
CHANGED
@@ -2,12 +2,15 @@
|
|
2
2
|
|
3
3
|
[![Build Status](https://travis-ci.org/yanyiwu/cppjieba.png?branch=master)](https://travis-ci.org/yanyiwu/cppjieba)
|
4
4
|
[![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/)
|
5
|
+
[![Donate](https://img.shields.io/badge/donate-eos_gitdeveloper-orange.svg)](https://eosflare.io/account/gitdeveloper)
|
5
6
|
[![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba)
|
6
7
|
[![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
|
8
|
+
[![Tag](https://img.shields.io/github/v/tag/yanyiwu/cppjieba.svg)](https://github.com/yanyiwu/cppjieba/releases)
|
7
9
|
[![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
|
8
10
|
[![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)
|
9
11
|
|
10
|
-
|
12
|
+
|
13
|
+
[![logo](http://images.yanyiwu.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
|
11
14
|
|
12
15
|
## 简介
|
13
16
|
|
@@ -223,11 +226,16 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
|
|
223
226
|
+ [iosjieba] iOS 版本的结巴分词。
|
224
227
|
+ [SqlJieba] MySQL 全文索引的结巴中文分词插件。
|
225
228
|
+ [pg_jieba] PostgreSQL 数据库的分词插件。
|
229
|
+
+ [simple] SQLite3 FTS5 数据库的分词插件。
|
226
230
|
+ [gitbook-plugin-search-pro] 支持中文搜索的 gitbook 插件。
|
227
231
|
+ [ngx_http_cppjieba_module] Nginx 分词插件。
|
228
232
|
+ [cppjiebapy] 由 [jannson] 开发的供 python 模块调用的项目 [cppjiebapy], 相关讨论 [cppjiebapy_discussion] .
|
233
|
+
+ [cppjieba-py] 由 [bung87] 基于 pybind11 封装的 python 模块,使用体验上接近于原jieba。
|
229
234
|
+ [KeywordServer] 50行搭建一个中文关键词抽取服务。
|
230
235
|
+ [cppjieba-server] CppJieba HTTP 服务器。
|
236
|
+
+ [phpjieba] php版本的结巴分词扩展。
|
237
|
+
+ [perl5-jieba] Perl版本的结巴分词扩展。
|
238
|
+
+ [jieba-dlang] D 语言的结巴分词 Deimos Bindings。
|
231
239
|
|
232
240
|
## 线上演示
|
233
241
|
|
@@ -238,29 +246,23 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
|
|
238
246
|
|
239
247
|
[Jieba中文分词系列性能评测]
|
240
248
|
|
241
|
-
##
|
242
|
-
|
243
|
-
+ Email: `i@yanyiwu.com`
|
244
|
-
+ QQ: 64162451
|
245
|
-
+ WeChat: ![image](http://7viirv.com1.z0.glb.clouddn.com/5a7d1b5c0d_yanyiwu_personal_qrcodes.jpg)
|
246
|
-
|
247
|
-
## 鸣谢
|
249
|
+
## Sponsorship
|
248
250
|
|
249
|
-
|
251
|
+
[![sponsorship](http://images.gitads.io/cppjieba)](https://tracking.gitads.io/?campaign=gitads&repo=cppjieba&redirect=gitads.io)
|
250
252
|
|
251
|
-
##
|
253
|
+
## Contributors
|
252
254
|
|
253
|
-
|
255
|
+
### Code Contributors
|
254
256
|
|
255
|
-
|
256
|
-
|
257
|
-
- [yanyiwu](yanyiwu.com)
|
258
|
-
- [aholic](https://github.com/aholic)
|
257
|
+
This project exists thanks to all the people who contribute.
|
258
|
+
<a href="https://github.com/yanyiwu/cppjieba/graphs/contributors"><img src="https://opencollective.com/cppjieba/contributors.svg?width=890&button=false" /></a>
|
259
259
|
|
260
260
|
[GoJieba]:https://github.com/yanyiwu/gojieba
|
261
261
|
[CppJieba]:https://github.com/yanyiwu/cppjieba
|
262
262
|
[jannson]:https://github.com/jannson
|
263
263
|
[cppjiebapy]:https://github.com/jannson/cppjiebapy
|
264
|
+
[bung87]:https://github.com/bung87
|
265
|
+
[cppjieba-py]:https://github.com/bung87/cppjieba-py
|
264
266
|
[cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
|
265
267
|
[NodeJieba]:https://github.com/yanyiwu/nodejieba
|
266
268
|
[jiebaR]:https://github.com/qinwf/jiebaR
|
@@ -279,7 +281,9 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
|
|
279
281
|
[pg_jieba]:https://github.com/jaiminpan/pg_jieba
|
280
282
|
[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
|
281
283
|
[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
|
284
|
+
[phpjieba]:https://github.com/jonnywang/phpjieba
|
285
|
+
[perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
|
286
|
+
[jieba-dlang]:https://github.com/shove70/jieba
|
287
|
+
[simple]:https://github.com/wangfenjin/simple
|
282
288
|
|
283
289
|
|
284
|
-
[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/yanyiwu/cppjieba/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
|
285
|
-
|
data/ext/cppjieba/README_EN.md
CHANGED
@@ -86,6 +86,8 @@ Please see details in `test/demo.cpp`.
|
|
86
86
|
+ [ngx_http_cppjieba_module]
|
87
87
|
+ [gitbook-plugin-search-pro]
|
88
88
|
+ [cppjieba-server]
|
89
|
+
+ [perl5-jieba]
|
90
|
+
+ [jieba-dlang]
|
89
91
|
|
90
92
|
## Contact
|
91
93
|
|
@@ -109,3 +111,5 @@ Please see details in `test/demo.cpp`.
|
|
109
111
|
[pg_jieba]:https://github.com/jaiminpan/pg_jieba
|
110
112
|
[gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
|
111
113
|
[cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
|
114
|
+
[perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
|
115
|
+
[jieba-dlang]:https://github.com/shove70/jieba
|
@@ -6,7 +6,7 @@
|
|
6
6
|
#ifdef __APPLE__
|
7
7
|
#include <unordered_map>
|
8
8
|
#include <unordered_set>
|
9
|
-
#elif(__cplusplus
|
9
|
+
#elif(__cplusplus >= 201103L)
|
10
10
|
#include <unordered_map>
|
11
11
|
#include <unordered_set>
|
12
12
|
#elif defined _MSC_VER
|
@@ -29,8 +29,6 @@ using std::tr1::unordered_set;
|
|
29
29
|
#include <fstream>
|
30
30
|
#include <sstream>
|
31
31
|
|
32
|
-
#define print(x) std::cout << x << std::endl
|
33
|
-
|
34
32
|
namespace std {
|
35
33
|
|
36
34
|
template<typename T>
|
@@ -80,7 +80,7 @@ inline string& Lower(string& str) {
|
|
80
80
|
|
81
81
|
inline bool IsSpace(unsigned c) {
|
82
82
|
// when passing large int as the argument of isspace, it core dump, so here need a type cast.
|
83
|
-
return c > 0xff ? false : std::isspace(c & 0xff);
|
83
|
+
return c > 0xff ? false : std::isspace(c & 0xff) != 0;
|
84
84
|
}
|
85
85
|
|
86
86
|
inline std::string& LTrim(std::string &s) {
|
@@ -50,6 +50,17 @@ class DictTrie {
|
|
50
50
|
return true;
|
51
51
|
}
|
52
52
|
|
53
|
+
bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
|
54
|
+
DictUnit node_info;
|
55
|
+
double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
|
56
|
+
if (!MakeNodeInfo(node_info, word, weight , tag)) {
|
57
|
+
return false;
|
58
|
+
}
|
59
|
+
active_node_infos_.push_back(node_info);
|
60
|
+
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
61
|
+
return true;
|
62
|
+
}
|
63
|
+
|
53
64
|
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
54
65
|
return trie_->Find(begin, end);
|
55
66
|
}
|
@@ -61,6 +72,25 @@ class DictTrie {
|
|
61
72
|
trie_->Find(begin, end, res, max_word_len);
|
62
73
|
}
|
63
74
|
|
75
|
+
bool Find(const string& word)
|
76
|
+
{
|
77
|
+
const DictUnit *tmp = NULL;
|
78
|
+
RuneStrArray runes;
|
79
|
+
if (!DecodeRunesInString(word, runes))
|
80
|
+
{
|
81
|
+
XLOG(ERROR) << "Decode failed.";
|
82
|
+
}
|
83
|
+
tmp = Find(runes.begin(), runes.end());
|
84
|
+
if (tmp == NULL)
|
85
|
+
{
|
86
|
+
return false;
|
87
|
+
}
|
88
|
+
else
|
89
|
+
{
|
90
|
+
return true;
|
91
|
+
}
|
92
|
+
}
|
93
|
+
|
64
94
|
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
65
95
|
return IsIn(user_dict_single_chinese_word_, word);
|
66
96
|
}
|
@@ -69,6 +99,63 @@ class DictTrie {
|
|
69
99
|
return min_weight_;
|
70
100
|
}
|
71
101
|
|
102
|
+
void InserUserDictNode(const string& line) {
|
103
|
+
vector<string> buf;
|
104
|
+
DictUnit node_info;
|
105
|
+
Split(line, buf, " ");
|
106
|
+
if(buf.size() == 1){
|
107
|
+
MakeNodeInfo(node_info,
|
108
|
+
buf[0],
|
109
|
+
user_word_default_weight_,
|
110
|
+
UNKNOWN_TAG);
|
111
|
+
} else if (buf.size() == 2) {
|
112
|
+
MakeNodeInfo(node_info,
|
113
|
+
buf[0],
|
114
|
+
user_word_default_weight_,
|
115
|
+
buf[1]);
|
116
|
+
} else if (buf.size() == 3) {
|
117
|
+
int freq = atoi(buf[1].c_str());
|
118
|
+
assert(freq_sum_ > 0.0);
|
119
|
+
double weight = log(1.0 * freq / freq_sum_);
|
120
|
+
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
|
121
|
+
}
|
122
|
+
static_node_infos_.push_back(node_info);
|
123
|
+
if (node_info.word.size() == 1) {
|
124
|
+
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
125
|
+
}
|
126
|
+
}
|
127
|
+
|
128
|
+
void LoadUserDict(const vector<string>& buf) {
|
129
|
+
for (size_t i = 0; i < buf.size(); i++) {
|
130
|
+
InserUserDictNode(buf[i]);
|
131
|
+
}
|
132
|
+
}
|
133
|
+
|
134
|
+
void LoadUserDict(const set<string>& buf) {
|
135
|
+
std::set<string>::const_iterator iter;
|
136
|
+
for (iter = buf.begin(); iter != buf.end(); iter++){
|
137
|
+
InserUserDictNode(*iter);
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
void LoadUserDict(const string& filePaths) {
|
142
|
+
vector<string> files = limonp::Split(filePaths, "|;");
|
143
|
+
size_t lineno = 0;
|
144
|
+
for (size_t i = 0; i < files.size(); i++) {
|
145
|
+
ifstream ifs(files[i].c_str());
|
146
|
+
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
|
147
|
+
string line;
|
148
|
+
|
149
|
+
for (; getline(ifs, line); lineno++) {
|
150
|
+
if (line.size() == 0) {
|
151
|
+
continue;
|
152
|
+
}
|
153
|
+
InserUserDictNode(line);
|
154
|
+
}
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
|
72
159
|
private:
|
73
160
|
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
|
74
161
|
LoadDict(dict_path);
|
@@ -95,45 +182,8 @@ class DictTrie {
|
|
95
182
|
trie_ = new Trie(words, valuePointers);
|
96
183
|
}
|
97
184
|
|
98
|
-
|
99
|
-
|
100
|
-
size_t lineno = 0;
|
101
|
-
for (size_t i = 0; i < files.size(); i++) {
|
102
|
-
ifstream ifs(files[i].c_str());
|
103
|
-
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
|
104
|
-
string line;
|
105
|
-
DictUnit node_info;
|
106
|
-
vector<string> buf;
|
107
|
-
for (; getline(ifs, line); lineno++) {
|
108
|
-
if (line.size() == 0) {
|
109
|
-
continue;
|
110
|
-
}
|
111
|
-
buf.clear();
|
112
|
-
Split(line, buf, " ");
|
113
|
-
DictUnit node_info;
|
114
|
-
if(buf.size() == 1){
|
115
|
-
MakeNodeInfo(node_info,
|
116
|
-
buf[0],
|
117
|
-
user_word_default_weight_,
|
118
|
-
UNKNOWN_TAG);
|
119
|
-
} else if (buf.size() == 2) {
|
120
|
-
MakeNodeInfo(node_info,
|
121
|
-
buf[0],
|
122
|
-
user_word_default_weight_,
|
123
|
-
buf[1]);
|
124
|
-
} else if (buf.size() == 3) {
|
125
|
-
int freq = atoi(buf[1].c_str());
|
126
|
-
assert(freq_sum_ > 0.0);
|
127
|
-
double weight = log(1.0 * freq / freq_sum_);
|
128
|
-
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
|
129
|
-
}
|
130
|
-
static_node_infos_.push_back(node_info);
|
131
|
-
if (node_info.word.size() == 1) {
|
132
|
-
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
133
|
-
}
|
134
|
-
}
|
135
|
-
}
|
136
|
-
}
|
185
|
+
|
186
|
+
|
137
187
|
|
138
188
|
bool MakeNodeInfo(DictUnit& node_info,
|
139
189
|
const string& word,
|
@@ -48,17 +48,17 @@ class FullSegment: public SegmentBase {
|
|
48
48
|
void Cut(RuneStrArray::const_iterator begin,
|
49
49
|
RuneStrArray::const_iterator end,
|
50
50
|
vector<WordRange>& res) const {
|
51
|
-
//
|
51
|
+
// result of searching in trie tree
|
52
52
|
LocalVector<pair<size_t, const DictUnit*> > tRes;
|
53
53
|
|
54
|
-
//max index of res's words
|
55
|
-
|
54
|
+
// max index of res's words
|
55
|
+
size_t maxIdx = 0;
|
56
56
|
|
57
57
|
// always equals to (uItr - begin)
|
58
|
-
|
58
|
+
size_t uIdx = 0;
|
59
59
|
|
60
|
-
//tmp variables
|
61
|
-
|
60
|
+
// tmp variables
|
61
|
+
size_t wordLen = 0;
|
62
62
|
assert(dictTrie_);
|
63
63
|
vector<struct Dag> dags;
|
64
64
|
dictTrie_->Find(begin, end, dags);
|
@@ -72,6 +72,15 @@ class Jieba {
|
|
72
72
|
return dict_trie_.InsertUserWord(word, tag);
|
73
73
|
}
|
74
74
|
|
75
|
+
bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
|
76
|
+
return dict_trie_.InsertUserWord(word,freq, tag);
|
77
|
+
}
|
78
|
+
|
79
|
+
bool Find(const string& word)
|
80
|
+
{
|
81
|
+
return dict_trie_.Find(word);
|
82
|
+
}
|
83
|
+
|
75
84
|
void ResetSeparators(const string& s) {
|
76
85
|
//TODO
|
77
86
|
mp_seg_.ResetSeparators(s);
|
@@ -84,10 +93,23 @@ class Jieba {
|
|
84
93
|
const DictTrie* GetDictTrie() const {
|
85
94
|
return &dict_trie_;
|
86
95
|
}
|
96
|
+
|
87
97
|
const HMMModel* GetHMMModel() const {
|
88
98
|
return &model_;
|
89
99
|
}
|
90
100
|
|
101
|
+
void LoadUserDict(const vector<string>& buf) {
|
102
|
+
dict_trie_.LoadUserDict(buf);
|
103
|
+
}
|
104
|
+
|
105
|
+
void LoadUserDict(const set<string>& buf) {
|
106
|
+
dict_trie_.LoadUserDict(buf);
|
107
|
+
}
|
108
|
+
|
109
|
+
void LoadUserDict(const string& path) {
|
110
|
+
dict_trie_.LoadUserDict(path);
|
111
|
+
}
|
112
|
+
|
91
113
|
private:
|
92
114
|
DictTrie dict_trie_;
|
93
115
|
HMMModel model_;
|
@@ -18,9 +18,14 @@ typedef uint32_t Rune;
|
|
18
18
|
struct Word {
|
19
19
|
string word;
|
20
20
|
uint32_t offset;
|
21
|
+
uint32_t unicode_offset;
|
22
|
+
uint32_t unicode_length;
|
21
23
|
Word(const string& w, uint32_t o)
|
22
24
|
: word(w), offset(o) {
|
23
25
|
}
|
26
|
+
Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
|
27
|
+
: word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
28
|
+
}
|
24
29
|
}; // struct Word
|
25
30
|
|
26
31
|
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
@@ -31,10 +36,15 @@ struct RuneStr {
|
|
31
36
|
Rune rune;
|
32
37
|
uint32_t offset;
|
33
38
|
uint32_t len;
|
34
|
-
|
39
|
+
uint32_t unicode_offset;
|
40
|
+
uint32_t unicode_length;
|
41
|
+
RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
|
35
42
|
}
|
36
43
|
RuneStr(Rune r, uint32_t o, uint32_t l)
|
37
|
-
: rune(r), offset(o), len(l) {
|
44
|
+
: rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
|
45
|
+
}
|
46
|
+
RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
|
47
|
+
: rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
38
48
|
}
|
39
49
|
}; // struct RuneStr
|
40
50
|
|
@@ -132,15 +142,16 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
|
|
132
142
|
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
133
143
|
runes.clear();
|
134
144
|
runes.reserve(len / 2);
|
135
|
-
for (
|
145
|
+
for (uint32_t i = 0, j = 0; i < len;) {
|
136
146
|
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
137
147
|
if (rp.len == 0) {
|
138
148
|
runes.clear();
|
139
149
|
return false;
|
140
150
|
}
|
141
|
-
RuneStr x(rp.rune, i, rp.len);
|
151
|
+
RuneStr x(rp.rune, i, rp.len, j, 1);
|
142
152
|
runes.push_back(x);
|
143
153
|
i += rp.len;
|
154
|
+
++j;
|
144
155
|
}
|
145
156
|
return true;
|
146
157
|
}
|
@@ -182,7 +193,8 @@ inline Unicode DecodeRunesInString(const string& s) {
|
|
182
193
|
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
|
183
194
|
assert(right->offset >= left->offset);
|
184
195
|
uint32_t len = right->offset - left->offset + right->len;
|
185
|
-
|
196
|
+
uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
|
197
|
+
return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length);
|
186
198
|
}
|
187
199
|
|
188
200
|
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
|
data/ext/cppjieba/test/demo.cpp
CHANGED
@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
|
|
68
68
|
s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
|
69
69
|
jieba.Tag(s, tagres);
|
70
70
|
cout << s << endl;
|
71
|
-
cout << tagres << endl
|
71
|
+
cout << tagres << endl;
|
72
72
|
|
73
73
|
cout << "[demo] Keyword Extraction" << endl;
|
74
74
|
const size_t topk = 5;
|
data/ext/cppjieba_rb/internal.cc
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
#include <ruby.h>
|
2
2
|
#include <ruby/encoding.h>
|
3
|
+
|
4
|
+
#include <string>
|
5
|
+
#include <iostream>
|
6
|
+
#include <unordered_set>
|
7
|
+
|
3
8
|
#include "cppjieba/Jieba.hpp"
|
4
9
|
|
5
10
|
#define GET_CPPJIEBA(_data) jieba_cpp_data* _data; \
|
@@ -7,6 +12,7 @@
|
|
7
12
|
|
8
13
|
typedef struct {
|
9
14
|
cppjieba::Jieba* jieba;
|
15
|
+
std::unordered_set<std::string>* stop_words;
|
10
16
|
} jieba_cpp_data;
|
11
17
|
|
12
18
|
// make compiler happy
|
@@ -26,6 +32,8 @@ static void jieba_cpp_free(void* _this)
|
|
26
32
|
jieba_cpp_data* data = static_cast<jieba_cpp_data*>(_this);
|
27
33
|
delete data->jieba;
|
28
34
|
data->jieba = nullptr;
|
35
|
+
delete data->stop_words;
|
36
|
+
data->stop_words = nullptr;
|
29
37
|
}
|
30
38
|
|
31
39
|
static size_t jieba_cpp_memsize(const void* _)
|
@@ -61,6 +69,14 @@ VALUE internal_initialize(VALUE self,
|
|
61
69
|
StringValueCStr(user_dict_path),
|
62
70
|
StringValueCStr(idf_path),
|
63
71
|
StringValueCStr(stop_word_path));
|
72
|
+
data->stop_words = new std::unordered_set<std::string>();
|
73
|
+
std::ifstream ifs(StringValueCStr(stop_word_path));
|
74
|
+
std::string line;
|
75
|
+
while (getline(ifs, line)) {
|
76
|
+
data->stop_words->insert(line);
|
77
|
+
}
|
78
|
+
assert(data->stop_words->size() != 0);
|
79
|
+
return self;
|
64
80
|
}
|
65
81
|
|
66
82
|
VALUE internal_extract_keyword(VALUE self, VALUE text_rbs, VALUE topN)
|
@@ -127,6 +143,16 @@ static VALUE internal_segment_tag(VALUE self, VALUE text_rbs)
|
|
127
143
|
return result;
|
128
144
|
}
|
129
145
|
|
146
|
+
static VALUE internal_stop_word(VALUE self, VALUE word)
|
147
|
+
{
|
148
|
+
std::string test(StringValueCStr(word));
|
149
|
+
GET_CPPJIEBA(data);
|
150
|
+
if (data->stop_words->find(test) != data->stop_words->end()) {
|
151
|
+
return Qtrue;
|
152
|
+
} else {
|
153
|
+
return Qfalse;
|
154
|
+
}
|
155
|
+
}
|
130
156
|
|
131
157
|
void Init_internal()
|
132
158
|
{
|
@@ -143,6 +169,7 @@ void Init_internal()
|
|
143
169
|
rb_define_method(rb_cCppjiebaRb_Internal, "extract_keyword", (ruby_method*) &internal_extract_keyword, 2);
|
144
170
|
rb_define_method(rb_cCppjiebaRb_Internal, "segment", (ruby_method*) &internal_segment, 4);
|
145
171
|
rb_define_method(rb_cCppjiebaRb_Internal, "segment_tag", (ruby_method*) &internal_segment_tag, 1);
|
172
|
+
rb_define_method(rb_cCppjiebaRb_Internal, "stop_word?", (ruby_method*) &internal_stop_word, 1);
|
146
173
|
}
|
147
174
|
|
148
|
-
}
|
175
|
+
}
|
data/lib/cppjieba_rb.rb
CHANGED
data/lib/cppjieba_rb/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cppjieba_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erick Guan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,57 +16,63 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.2'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 2.2.10
|
20
23
|
type: :development
|
21
24
|
prerelease: false
|
22
25
|
version_requirements: !ruby/object:Gem::Requirement
|
23
26
|
requirements:
|
24
27
|
- - "~>"
|
25
28
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
29
|
+
version: '2.2'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 2.2.10
|
27
33
|
- !ruby/object:Gem::Dependency
|
28
34
|
name: rake
|
29
35
|
requirement: !ruby/object:Gem::Requirement
|
30
36
|
requirements:
|
31
|
-
- - "
|
37
|
+
- - "~>"
|
32
38
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
39
|
+
version: '13'
|
34
40
|
type: :development
|
35
41
|
prerelease: false
|
36
42
|
version_requirements: !ruby/object:Gem::Requirement
|
37
43
|
requirements:
|
38
|
-
- - "
|
44
|
+
- - "~>"
|
39
45
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
46
|
+
version: '13'
|
41
47
|
- !ruby/object:Gem::Dependency
|
42
48
|
name: rake-compiler
|
43
49
|
requirement: !ruby/object:Gem::Requirement
|
44
50
|
requirements:
|
45
|
-
- - "
|
51
|
+
- - "~>"
|
46
52
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
53
|
+
version: '1.1'
|
48
54
|
type: :development
|
49
55
|
prerelease: false
|
50
56
|
version_requirements: !ruby/object:Gem::Requirement
|
51
57
|
requirements:
|
52
|
-
- - "
|
58
|
+
- - "~>"
|
53
59
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
60
|
+
version: '1.1'
|
55
61
|
- !ruby/object:Gem::Dependency
|
56
62
|
name: minitest
|
57
63
|
requirement: !ruby/object:Gem::Requirement
|
58
64
|
requirements:
|
59
|
-
- - "
|
65
|
+
- - "~>"
|
60
66
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
67
|
+
version: '5.14'
|
62
68
|
type: :development
|
63
69
|
prerelease: false
|
64
70
|
version_requirements: !ruby/object:Gem::Requirement
|
65
71
|
requirements:
|
66
|
-
- - "
|
72
|
+
- - "~>"
|
67
73
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
69
|
-
description: cppjieba binding for ruby
|
74
|
+
version: '5.14'
|
75
|
+
description: cppjieba binding for ruby. Mainly used by Discourse.
|
70
76
|
email:
|
71
77
|
- fantasticfears@gmail.com
|
72
78
|
executables: []
|
@@ -213,6 +219,7 @@ files:
|
|
213
219
|
- lib/cppjieba_rb/version.rb
|
214
220
|
- test/test_keyword.rb
|
215
221
|
- test/test_segment.rb
|
222
|
+
- test/test_stop_word_filter.rb
|
216
223
|
- test/test_tagging.rb
|
217
224
|
homepage: https://github.com/fantasticfears/cppjieba_rb
|
218
225
|
licenses:
|
@@ -233,12 +240,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
233
240
|
- !ruby/object:Gem::Version
|
234
241
|
version: '0'
|
235
242
|
requirements: []
|
236
|
-
|
237
|
-
rubygems_version: 2.6.12
|
243
|
+
rubygems_version: 3.2.15
|
238
244
|
signing_key:
|
239
245
|
specification_version: 4
|
240
246
|
summary: cppjieba binding for ruby
|
241
247
|
test_files:
|
242
248
|
- test/test_keyword.rb
|
243
249
|
- test/test_segment.rb
|
250
|
+
- test/test_stop_word_filter.rb
|
244
251
|
- test/test_tagging.rb
|