cppjieba_rb 0.2.3 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 9761a257960334fdb5e3cad3dd80be9f669b7f65
4
- data.tar.gz: e62980a6590c4e2d141b5583af34737152dc4343
2
+ SHA256:
3
+ metadata.gz: 74faf109bce12cd386acb5d2ad03ab4fde55a68cdb07a723a5af1f5b2528f3d5
4
+ data.tar.gz: c4a58470ef8e352cad688b9814080c37b332e46a93987e536387a94304ad1383
5
5
  SHA512:
6
- metadata.gz: b631d51e9ad4f79ef34baed3c2149a872b0a5e4a512c4d2bd8269c1df6d86d1a870e9824483855c881a2844250096bd2017ec34068af91b8629acc4910aa347d
7
- data.tar.gz: 29f1619beb0cca01ded498afd16d6fd081ca308d1d371b01998557fddf57ef870c31c5b51c0cda609f8e28a083a0ab945d08c98906153b11119f881946064acb
6
+ metadata.gz: 925ba793289f09a922451f6717b18a769080be4ab132a23edb236e4b91760251b67237dc2b6505c1aa39988d60294ff1ac07638c40a46916f6064be2cd71a425
7
+ data.tar.gz: e5f9dfcd2d341dd880bfc9aceee265abf6cf1e1d1a3428f807b1cd810691bab8e24946c974fef1260f89aebf03e82e85e518fd961905fd1150eac3ad6108c7ca
data/.travis.yml CHANGED
@@ -1,7 +1,11 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.3.3
4
- - 2.4.1
3
+ - 2.3
4
+ - 2.4
5
+ - 2.5
6
+ - 2.6
7
+ - 2.7
8
+ - 3.0
5
9
  - ruby-head
6
10
  - rbx-2
7
11
 
data/README.md CHANGED
@@ -2,10 +2,14 @@
2
2
 
3
3
  [![Gem Version](https://badge.fury.io/rb/cppjieba_rb.svg)](http://badge.fury.io/rb/cppjieba_rb)
4
4
 
5
- [![Build Status](https://travis-ci.org/fantasticfears/cppjieba_rb.png?branch=master)](https://travis-ci.org/fantasticfears/cppjieba_rb)
5
+ [![Build Status](https://travis-ci.com/erickguan/cppjieba_rb.svg?branch=master)](https://travis-ci.com/erickguan/cppjieba_rb)
6
+
7
+ [![Patreon](https://img.shields.io/badge/back_on-patreon-red.svg)](https://www.patreon.com/fantasticfears)
6
8
 
7
9
  Ruby bindings for [Cppjieba](https://github.com/yanyiwu/cppjieba). C++11 required. (gcc 4.8+)
8
10
 
11
+ The trie has high memory usage: with the default dictionary, it uses ~120 MB of memory.
12
+
9
13
  ## Installation
10
14
 
11
15
  Add this line to your application's Gemfile:
@@ -26,7 +30,7 @@ Mix Segment mode (HMM with Max Prob, default):
26
30
 
27
31
  ```ruby
28
32
  require 'cppjieba_rb'
29
- seg = CppjiebaRb::Segment.new # equivalent to "JiebaRb::Segment.new mode: :mix"
33
+ seg = CppjiebaRb::Segment.new # equivalent to "CppjiebaRb::Segment.new mode: :mix"
30
34
  words = seg.segment "令狐冲是云计算行业的专家"
31
35
  # 令狐冲 是 云 计算 行业 的 专家
32
36
  ```
@@ -70,7 +74,7 @@ CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不
70
74
 
71
75
  ## Contributing
72
76
 
73
- 1. Fork it ( http://github.com/<my-github-username>/cppjieba_rb/fork )
77
+ 1. Fork it ( http://github.com/fantasticfears/cppjieba_rb/fork )
74
78
  2. Create your feature branch (`git checkout -b my-new-feature`)
75
79
  3. Commit your changes (`git commit -am 'Add some feature'`)
76
80
  4. Push to the branch (`git push origin my-new-feature`)
@@ -79,4 +83,4 @@ CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不
79
83
  ## TODO
80
84
 
81
85
  - including 367w dict and provide the option for it.
82
- - cppjieba implements trie tree, it's memory consuming
86
+ - cppjieba implements trie tree, it's memory consuming
data/Rakefile CHANGED
@@ -3,7 +3,7 @@ require 'rake/testtask'
3
3
  require 'rake/extensiontask'
4
4
 
5
5
  gem = Gem::Specification.load(File.dirname(__FILE__) + '/cppjieba_rb.gemspec')
6
- Rake::ExtensionTask.new('cppjieba_rb', gem) do |ext|
6
+ Rake::ExtensionTask.new("cppjieba_rb", gem) do |ext|
7
7
  ext.lib_dir = File.join('lib', 'cppjieba_rb')
8
8
  end
9
9
 
data/cppjieba_rb.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ['Erick Guan']
10
10
  spec.email = ['fantasticfears@gmail.com']
11
11
  spec.summary = 'cppjieba binding for ruby'
12
- spec.description = 'cppjieba binding for ruby'
12
+ spec.description = 'cppjieba binding for ruby. Mainly used by Discourse.'
13
13
  spec.homepage = 'https://github.com/fantasticfears/cppjieba_rb'
14
14
  spec.required_ruby_version = '>=2.3.0'
15
15
  spec.license = 'MIT'
@@ -43,8 +43,8 @@ Gem::Specification.new do |spec|
43
43
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
44
44
  spec.require_paths = ['lib']
45
45
 
46
- spec.add_development_dependency 'bundler', '~> 1.5'
47
- spec.add_development_dependency 'rake'
48
- spec.add_development_dependency 'rake-compiler'
49
- spec.add_development_dependency 'minitest'
46
+ spec.add_development_dependency 'bundler', '~> 2.2', '>= 2.2.10'
47
+ spec.add_development_dependency 'rake', '~> 13'
48
+ spec.add_development_dependency 'rake-compiler', '~> 1.1'
49
+ spec.add_development_dependency 'minitest', '~> 5.14'
50
50
  end
@@ -1,7 +1,6 @@
1
1
  language: cpp
2
2
  before_install:
3
3
  - if [ $TRAVIS_OS_NAME == linux ]; then sudo apt-get install cmake; fi
4
- - if [ $TRAVIS_OS_NAME == osx ]; then brew install cmake; fi
5
4
  script:
6
5
  - mkdir build
7
6
  - cd build
@@ -1,5 +1,18 @@
1
1
  # CppJieba ChangeLog
2
2
 
3
+ ## v5.0.3
4
+
5
+ + Upgrade [limonp](https://github.com/yanyiwu/limonp) -> v0.6.3
6
+
7
+ ## v5.0.2
8
+
9
+ + Upgrade [limonp](https://github.com/yanyiwu/limonp) -> v0.6.1
10
+
11
+ ## v5.0.1
12
+
13
+ + Make Compiler Happier.
14
+ + Add PHP, DLang Links.
15
+
3
16
  ## v5.0.0
4
17
 
5
18
  + Notice(**api changed**) : Jieba class 3 arguments -> 5 arguments, and use KeywordExtractor in Jieba
@@ -2,12 +2,15 @@
2
2
 
3
3
  [![Build Status](https://travis-ci.org/yanyiwu/cppjieba.png?branch=master)](https://travis-ci.org/yanyiwu/cppjieba)
4
4
  [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/)
5
+ [![Donate](https://img.shields.io/badge/donate-eos_gitdeveloper-orange.svg)](https://eosflare.io/account/gitdeveloper)
5
6
  [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba)
6
7
  [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
8
+ [![Tag](https://img.shields.io/github/v/tag/yanyiwu/cppjieba.svg)](https://github.com/yanyiwu/cppjieba/releases)
7
9
  [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
8
10
  [![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)
9
11
 
10
- [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
12
+
13
+ [![logo](http://images.yanyiwu.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
11
14
 
12
15
  ## 简介
13
16
 
@@ -223,11 +226,16 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
223
226
  + [iosjieba] iOS 版本的结巴分词。
224
227
  + [SqlJieba] MySQL 全文索引的结巴中文分词插件。
225
228
  + [pg_jieba] PostgreSQL 数据库的分词插件。
229
+ + [simple] SQLite3 FTS5 数据库的分词插件。
226
230
  + [gitbook-plugin-search-pro] 支持中文搜索的 gitbook 插件。
227
231
  + [ngx_http_cppjieba_module] Nginx 分词插件。
228
232
  + [cppjiebapy] 由 [jannson] 开发的供 python 模块调用的项目 [cppjiebapy], 相关讨论 [cppjiebapy_discussion] .
233
+ + [cppjieba-py] 由 [bung87] 基于 pybind11 封装的 python 模块,使用体验上接近于原jieba。
229
234
  + [KeywordServer] 50行搭建一个中文关键词抽取服务。
230
235
  + [cppjieba-server] CppJieba HTTP 服务器。
236
+ + [phpjieba] php版本的结巴分词扩展。
237
+ + [perl5-jieba] Perl版本的结巴分词扩展。
238
+ + [jieba-dlang] D 语言的结巴分词 Deimos Bindings。
231
239
 
232
240
  ## 线上演示
233
241
 
@@ -238,29 +246,23 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
238
246
 
239
247
  [Jieba中文分词系列性能评测]
240
248
 
241
- ## 客服
242
-
243
- + Email: `i@yanyiwu.com`
244
- + QQ: 64162451
245
- + WeChat: ![image](http://7viirv.com1.z0.glb.clouddn.com/5a7d1b5c0d_yanyiwu_personal_qrcodes.jpg)
246
-
247
- ## 鸣谢
249
+ ## Sponsorship
248
250
 
249
- "结巴"中文分词作者: [SunJunyi](https://github.com/fxsjy)
251
+ [![sponsorship](http://images.gitads.io/cppjieba)](https://tracking.gitads.io/?campaign=gitads&repo=cppjieba&redirect=gitads.io)
250
252
 
251
- ## 许可证
253
+ ## Contributors
252
254
 
253
- [MIT](http://yanyiwu.mit-license.org)
255
+ ### Code Contributors
254
256
 
255
- ## 作者
256
-
257
- - [yanyiwu](yanyiwu.com)
258
- - [aholic](https://github.com/aholic)
257
+ This project exists thanks to all the people who contribute.
258
+ <a href="https://github.com/yanyiwu/cppjieba/graphs/contributors"><img src="https://opencollective.com/cppjieba/contributors.svg?width=890&button=false" /></a>
259
259
 
260
260
  [GoJieba]:https://github.com/yanyiwu/gojieba
261
261
  [CppJieba]:https://github.com/yanyiwu/cppjieba
262
262
  [jannson]:https://github.com/jannson
263
263
  [cppjiebapy]:https://github.com/jannson/cppjiebapy
264
+ [bung87]:https://github.com/bung87
265
+ [cppjieba-py]:https://github.com/bung87/cppjieba-py
264
266
  [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
265
267
  [NodeJieba]:https://github.com/yanyiwu/nodejieba
266
268
  [jiebaR]:https://github.com/qinwf/jiebaR
@@ -279,7 +281,9 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
279
281
  [pg_jieba]:https://github.com/jaiminpan/pg_jieba
280
282
  [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
281
283
  [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
284
+ [phpjieba]:https://github.com/jonnywang/phpjieba
285
+ [perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
286
+ [jieba-dlang]:https://github.com/shove70/jieba
287
+ [simple]:https://github.com/wangfenjin/simple
282
288
 
283
289
 
284
- [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/yanyiwu/cppjieba/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
285
-
@@ -86,6 +86,8 @@ Please see details in `test/demo.cpp`.
86
86
  + [ngx_http_cppjieba_module]
87
87
  + [gitbook-plugin-search-pro]
88
88
  + [cppjieba-server]
89
+ + [perl5-jieba]
90
+ + [jieba-dlang]
89
91
 
90
92
  ## Contact
91
93
 
@@ -109,3 +111,5 @@ Please see details in `test/demo.cpp`.
109
111
  [pg_jieba]:https://github.com/jaiminpan/pg_jieba
110
112
  [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
111
113
  [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
114
+ [perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
115
+ [jieba-dlang]:https://github.com/shove70/jieba
@@ -6,7 +6,7 @@
6
6
  #ifdef __APPLE__
7
7
  #include <unordered_map>
8
8
  #include <unordered_set>
9
- #elif(__cplusplus == 201103L)
9
+ #elif(__cplusplus >= 201103L)
10
10
  #include <unordered_map>
11
11
  #include <unordered_set>
12
12
  #elif defined _MSC_VER
@@ -29,8 +29,6 @@ using std::tr1::unordered_set;
29
29
  #include <fstream>
30
30
  #include <sstream>
31
31
 
32
- #define print(x) std::cout << x << std::endl
33
-
34
32
  namespace std {
35
33
 
36
34
  template<typename T>
@@ -80,7 +80,7 @@ inline string& Lower(string& str) {
80
80
 
81
81
  inline bool IsSpace(unsigned c) {
82
82
  // Passing a large int to isspace can crash (core dump), so values above 0xff are rejected and the argument is masked to one byte.
83
- return c > 0xff ? false : std::isspace(c & 0xff);
83
+ return c > 0xff ? false : std::isspace(c & 0xff) != 0;
84
84
  }
85
85
 
86
86
  inline std::string& LTrim(std::string &s) {
@@ -50,6 +50,17 @@ class DictTrie {
50
50
  return true;
51
51
  }
52
52
 
53
+ bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
54
+ DictUnit node_info;
55
+ double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
56
+ if (!MakeNodeInfo(node_info, word, weight , tag)) {
57
+ return false;
58
+ }
59
+ active_node_infos_.push_back(node_info);
60
+ trie_->InsertNode(node_info.word, &active_node_infos_.back());
61
+ return true;
62
+ }
63
+
53
64
  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
54
65
  return trie_->Find(begin, end);
55
66
  }
@@ -61,6 +72,25 @@ class DictTrie {
61
72
  trie_->Find(begin, end, res, max_word_len);
62
73
  }
63
74
 
75
+ bool Find(const string& word)
76
+ {
77
+ const DictUnit *tmp = NULL;
78
+ RuneStrArray runes;
79
+ if (!DecodeRunesInString(word, runes))
80
+ {
81
+ XLOG(ERROR) << "Decode failed.";
82
+ }
83
+ tmp = Find(runes.begin(), runes.end());
84
+ if (tmp == NULL)
85
+ {
86
+ return false;
87
+ }
88
+ else
89
+ {
90
+ return true;
91
+ }
92
+ }
93
+
64
94
  bool IsUserDictSingleChineseWord(const Rune& word) const {
65
95
  return IsIn(user_dict_single_chinese_word_, word);
66
96
  }
@@ -69,6 +99,63 @@ class DictTrie {
69
99
  return min_weight_;
70
100
  }
71
101
 
102
+ void InserUserDictNode(const string& line) {
103
+ vector<string> buf;
104
+ DictUnit node_info;
105
+ Split(line, buf, " ");
106
+ if(buf.size() == 1){
107
+ MakeNodeInfo(node_info,
108
+ buf[0],
109
+ user_word_default_weight_,
110
+ UNKNOWN_TAG);
111
+ } else if (buf.size() == 2) {
112
+ MakeNodeInfo(node_info,
113
+ buf[0],
114
+ user_word_default_weight_,
115
+ buf[1]);
116
+ } else if (buf.size() == 3) {
117
+ int freq = atoi(buf[1].c_str());
118
+ assert(freq_sum_ > 0.0);
119
+ double weight = log(1.0 * freq / freq_sum_);
120
+ MakeNodeInfo(node_info, buf[0], weight, buf[2]);
121
+ }
122
+ static_node_infos_.push_back(node_info);
123
+ if (node_info.word.size() == 1) {
124
+ user_dict_single_chinese_word_.insert(node_info.word[0]);
125
+ }
126
+ }
127
+
128
+ void LoadUserDict(const vector<string>& buf) {
129
+ for (size_t i = 0; i < buf.size(); i++) {
130
+ InserUserDictNode(buf[i]);
131
+ }
132
+ }
133
+
134
+ void LoadUserDict(const set<string>& buf) {
135
+ std::set<string>::const_iterator iter;
136
+ for (iter = buf.begin(); iter != buf.end(); iter++){
137
+ InserUserDictNode(*iter);
138
+ }
139
+ }
140
+
141
+ void LoadUserDict(const string& filePaths) {
142
+ vector<string> files = limonp::Split(filePaths, "|;");
143
+ size_t lineno = 0;
144
+ for (size_t i = 0; i < files.size(); i++) {
145
+ ifstream ifs(files[i].c_str());
146
+ XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
147
+ string line;
148
+
149
+ for (; getline(ifs, line); lineno++) {
150
+ if (line.size() == 0) {
151
+ continue;
152
+ }
153
+ InserUserDictNode(line);
154
+ }
155
+ }
156
+ }
157
+
158
+
72
159
  private:
73
160
  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
74
161
  LoadDict(dict_path);
@@ -95,45 +182,8 @@ class DictTrie {
95
182
  trie_ = new Trie(words, valuePointers);
96
183
  }
97
184
 
98
- void LoadUserDict(const string& filePaths) {
99
- vector<string> files = limonp::Split(filePaths, "|;");
100
- size_t lineno = 0;
101
- for (size_t i = 0; i < files.size(); i++) {
102
- ifstream ifs(files[i].c_str());
103
- XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
104
- string line;
105
- DictUnit node_info;
106
- vector<string> buf;
107
- for (; getline(ifs, line); lineno++) {
108
- if (line.size() == 0) {
109
- continue;
110
- }
111
- buf.clear();
112
- Split(line, buf, " ");
113
- DictUnit node_info;
114
- if(buf.size() == 1){
115
- MakeNodeInfo(node_info,
116
- buf[0],
117
- user_word_default_weight_,
118
- UNKNOWN_TAG);
119
- } else if (buf.size() == 2) {
120
- MakeNodeInfo(node_info,
121
- buf[0],
122
- user_word_default_weight_,
123
- buf[1]);
124
- } else if (buf.size() == 3) {
125
- int freq = atoi(buf[1].c_str());
126
- assert(freq_sum_ > 0.0);
127
- double weight = log(1.0 * freq / freq_sum_);
128
- MakeNodeInfo(node_info, buf[0], weight, buf[2]);
129
- }
130
- static_node_infos_.push_back(node_info);
131
- if (node_info.word.size() == 1) {
132
- user_dict_single_chinese_word_.insert(node_info.word[0]);
133
- }
134
- }
135
- }
136
- }
185
+
186
+
137
187
 
138
188
  bool MakeNodeInfo(DictUnit& node_info,
139
189
  const string& word,
@@ -48,17 +48,17 @@ class FullSegment: public SegmentBase {
48
48
  void Cut(RuneStrArray::const_iterator begin,
49
49
  RuneStrArray::const_iterator end,
50
50
  vector<WordRange>& res) const {
51
- //resut of searching in trie tree
51
+ // result of searching in trie tree
52
52
  LocalVector<pair<size_t, const DictUnit*> > tRes;
53
53
 
54
- //max index of res's words
55
- int maxIdx = 0;
54
+ // max index of res's words
55
+ size_t maxIdx = 0;
56
56
 
57
57
  // always equals to (uItr - begin)
58
- int uIdx = 0;
58
+ size_t uIdx = 0;
59
59
 
60
- //tmp variables
61
- int wordLen = 0;
60
+ // tmp variables
61
+ size_t wordLen = 0;
62
62
  assert(dictTrie_);
63
63
  vector<struct Dag> dags;
64
64
  dictTrie_->Find(begin, end, dags);
@@ -72,6 +72,15 @@ class Jieba {
72
72
  return dict_trie_.InsertUserWord(word, tag);
73
73
  }
74
74
 
75
+ bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
76
+ return dict_trie_.InsertUserWord(word,freq, tag);
77
+ }
78
+
79
+ bool Find(const string& word)
80
+ {
81
+ return dict_trie_.Find(word);
82
+ }
83
+
75
84
  void ResetSeparators(const string& s) {
76
85
  //TODO
77
86
  mp_seg_.ResetSeparators(s);
@@ -84,10 +93,23 @@ class Jieba {
84
93
  const DictTrie* GetDictTrie() const {
85
94
  return &dict_trie_;
86
95
  }
96
+
87
97
  const HMMModel* GetHMMModel() const {
88
98
  return &model_;
89
99
  }
90
100
 
101
+ void LoadUserDict(const vector<string>& buf) {
102
+ dict_trie_.LoadUserDict(buf);
103
+ }
104
+
105
+ void LoadUserDict(const set<string>& buf) {
106
+ dict_trie_.LoadUserDict(buf);
107
+ }
108
+
109
+ void LoadUserDict(const string& path) {
110
+ dict_trie_.LoadUserDict(path);
111
+ }
112
+
91
113
  private:
92
114
  DictTrie dict_trie_;
93
115
  HMMModel model_;
@@ -10,7 +10,6 @@
10
10
  #include "FullSegment.hpp"
11
11
  #include "MixSegment.hpp"
12
12
  #include "Unicode.hpp"
13
- #include "DictTrie.hpp"
14
13
 
15
14
  namespace cppjieba {
16
15
  class QuerySegment: public SegmentBase {
@@ -18,9 +18,14 @@ typedef uint32_t Rune;
18
18
  struct Word {
19
19
  string word;
20
20
  uint32_t offset;
21
+ uint32_t unicode_offset;
22
+ uint32_t unicode_length;
21
23
  Word(const string& w, uint32_t o)
22
24
  : word(w), offset(o) {
23
25
  }
26
+ Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
27
+ : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
28
+ }
24
29
  }; // struct Word
25
30
 
26
31
  inline std::ostream& operator << (std::ostream& os, const Word& w) {
@@ -31,10 +36,15 @@ struct RuneStr {
31
36
  Rune rune;
32
37
  uint32_t offset;
33
38
  uint32_t len;
34
- RuneStr(): rune(0), offset(0), len(0) {
39
+ uint32_t unicode_offset;
40
+ uint32_t unicode_length;
41
+ RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
35
42
  }
36
43
  RuneStr(Rune r, uint32_t o, uint32_t l)
37
- : rune(r), offset(o), len(l) {
44
+ : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
45
+ }
46
+ RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
47
+ : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
38
48
  }
39
49
  }; // struct RuneStr
40
50
 
@@ -132,15 +142,16 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
132
142
  inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
133
143
  runes.clear();
134
144
  runes.reserve(len / 2);
135
- for (size_t i = 0; i < len;) {
145
+ for (uint32_t i = 0, j = 0; i < len;) {
136
146
  RuneStrLite rp = DecodeRuneInString(s + i, len - i);
137
147
  if (rp.len == 0) {
138
148
  runes.clear();
139
149
  return false;
140
150
  }
141
- RuneStr x(rp.rune, i, rp.len);
151
+ RuneStr x(rp.rune, i, rp.len, j, 1);
142
152
  runes.push_back(x);
143
153
  i += rp.len;
154
+ ++j;
144
155
  }
145
156
  return true;
146
157
  }
@@ -182,7 +193,8 @@ inline Unicode DecodeRunesInString(const string& s) {
182
193
  inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
183
194
  assert(right->offset >= left->offset);
184
195
  uint32_t len = right->offset - left->offset + right->len;
185
- return Word(s.substr(left->offset, len), left->offset);
196
+ uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
197
+ return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length);
186
198
  }
187
199
 
188
200
  inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
68
68
  s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
69
69
  jieba.Tag(s, tagres);
70
70
  cout << s << endl;
71
- cout << tagres << endl;;
71
+ cout << tagres << endl;
72
72
 
73
73
  cout << "[demo] Keyword Extraction" << endl;
74
74
  const size_t topk = 5;
@@ -1,5 +1,6 @@
1
1
  #include <ruby.h>
2
2
 
3
+ void Init_internal();
3
4
  VALUE rb_mCppjiebaRb;
4
5
 
5
6
  void Init_cppjieba_rb()
@@ -1,5 +1,10 @@
1
1
  #include <ruby.h>
2
2
  #include <ruby/encoding.h>
3
+
4
+ #include <string>
5
+ #include <iostream>
6
+ #include <unordered_set>
7
+
3
8
  #include "cppjieba/Jieba.hpp"
4
9
 
5
10
  #define GET_CPPJIEBA(_data) jieba_cpp_data* _data; \
@@ -7,6 +12,7 @@
7
12
 
8
13
  typedef struct {
9
14
  cppjieba::Jieba* jieba;
15
+ std::unordered_set<std::string>* stop_words;
10
16
  } jieba_cpp_data;
11
17
 
12
18
  // make compiler happy
@@ -26,6 +32,8 @@ static void jieba_cpp_free(void* _this)
26
32
  jieba_cpp_data* data = static_cast<jieba_cpp_data*>(_this);
27
33
  delete data->jieba;
28
34
  data->jieba = nullptr;
35
+ delete data->stop_words;
36
+ data->stop_words = nullptr;
29
37
  }
30
38
 
31
39
  static size_t jieba_cpp_memsize(const void* _)
@@ -61,6 +69,14 @@ VALUE internal_initialize(VALUE self,
61
69
  StringValueCStr(user_dict_path),
62
70
  StringValueCStr(idf_path),
63
71
  StringValueCStr(stop_word_path));
72
+ data->stop_words = new std::unordered_set<std::string>();
73
+ std::ifstream ifs(StringValueCStr(stop_word_path));
74
+ std::string line;
75
+ while (getline(ifs, line)) {
76
+ data->stop_words->insert(line);
77
+ }
78
+ assert(data->stop_words->size() != 0);
79
+ return self;
64
80
  }
65
81
 
66
82
  VALUE internal_extract_keyword(VALUE self, VALUE text_rbs, VALUE topN)
@@ -127,6 +143,16 @@ static VALUE internal_segment_tag(VALUE self, VALUE text_rbs)
127
143
  return result;
128
144
  }
129
145
 
146
+ static VALUE internal_stop_word(VALUE self, VALUE word)
147
+ {
148
+ std::string test(StringValueCStr(word));
149
+ GET_CPPJIEBA(data);
150
+ if (data->stop_words->find(test) != data->stop_words->end()) {
151
+ return Qtrue;
152
+ } else {
153
+ return Qfalse;
154
+ }
155
+ }
130
156
 
131
157
  void Init_internal()
132
158
  {
@@ -143,6 +169,7 @@ void Init_internal()
143
169
  rb_define_method(rb_cCppjiebaRb_Internal, "extract_keyword", (ruby_method*) &internal_extract_keyword, 2);
144
170
  rb_define_method(rb_cCppjiebaRb_Internal, "segment", (ruby_method*) &internal_segment, 4);
145
171
  rb_define_method(rb_cCppjiebaRb_Internal, "segment_tag", (ruby_method*) &internal_segment_tag, 1);
172
+ rb_define_method(rb_cCppjiebaRb_Internal, "stop_word?", (ruby_method*) &internal_stop_word, 1);
146
173
  }
147
174
 
148
- }
175
+ }
data/lib/cppjieba_rb.rb CHANGED
@@ -22,6 +22,10 @@ module CppjiebaRb
22
22
  internal.segment_tag(str)
23
23
  end
24
24
 
25
+ def self.filter_stop_word(arr)
26
+ arr.reject { |w| internal.stop_word?(w) }
27
+ end
28
+
25
29
  class << self
26
30
  def internal
27
31
  @backend ||= CppjiebaRb::Internal.new(DICT_PATH,
@@ -1,3 +1,3 @@
1
1
  module CppjiebaRb
2
- VERSION = '0.2.3'
2
+ VERSION = '0.4.1'
3
3
  end
@@ -0,0 +1,10 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'cppjieba_rb'
4
+
5
+ class JiebaTest < Minitest::Test
6
+ def test_filter
7
+ words = CppjiebaRb.filter_stop_word %w(令狐冲 是 云计算 行业 的 专家)
8
+ assert_equal %w(令狐冲 云计算 行业 专家), words
9
+ end
10
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cppjieba_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erick Guan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-26 00:00:00.000000000 Z
11
+ date: 2021-06-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,57 +16,63 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.5'
19
+ version: '2.2'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 2.2.10
20
23
  type: :development
21
24
  prerelease: false
22
25
  version_requirements: !ruby/object:Gem::Requirement
23
26
  requirements:
24
27
  - - "~>"
25
28
  - !ruby/object:Gem::Version
26
- version: '1.5'
29
+ version: '2.2'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 2.2.10
27
33
  - !ruby/object:Gem::Dependency
28
34
  name: rake
29
35
  requirement: !ruby/object:Gem::Requirement
30
36
  requirements:
31
- - - ">="
37
+ - - "~>"
32
38
  - !ruby/object:Gem::Version
33
- version: '0'
39
+ version: '13'
34
40
  type: :development
35
41
  prerelease: false
36
42
  version_requirements: !ruby/object:Gem::Requirement
37
43
  requirements:
38
- - - ">="
44
+ - - "~>"
39
45
  - !ruby/object:Gem::Version
40
- version: '0'
46
+ version: '13'
41
47
  - !ruby/object:Gem::Dependency
42
48
  name: rake-compiler
43
49
  requirement: !ruby/object:Gem::Requirement
44
50
  requirements:
45
- - - ">="
51
+ - - "~>"
46
52
  - !ruby/object:Gem::Version
47
- version: '0'
53
+ version: '1.1'
48
54
  type: :development
49
55
  prerelease: false
50
56
  version_requirements: !ruby/object:Gem::Requirement
51
57
  requirements:
52
- - - ">="
58
+ - - "~>"
53
59
  - !ruby/object:Gem::Version
54
- version: '0'
60
+ version: '1.1'
55
61
  - !ruby/object:Gem::Dependency
56
62
  name: minitest
57
63
  requirement: !ruby/object:Gem::Requirement
58
64
  requirements:
59
- - - ">="
65
+ - - "~>"
60
66
  - !ruby/object:Gem::Version
61
- version: '0'
67
+ version: '5.14'
62
68
  type: :development
63
69
  prerelease: false
64
70
  version_requirements: !ruby/object:Gem::Requirement
65
71
  requirements:
66
- - - ">="
72
+ - - "~>"
67
73
  - !ruby/object:Gem::Version
68
- version: '0'
69
- description: cppjieba binding for ruby
74
+ version: '5.14'
75
+ description: cppjieba binding for ruby. Mainly used by Discourse.
70
76
  email:
71
77
  - fantasticfears@gmail.com
72
78
  executables: []
@@ -213,6 +219,7 @@ files:
213
219
  - lib/cppjieba_rb/version.rb
214
220
  - test/test_keyword.rb
215
221
  - test/test_segment.rb
222
+ - test/test_stop_word_filter.rb
216
223
  - test/test_tagging.rb
217
224
  homepage: https://github.com/fantasticfears/cppjieba_rb
218
225
  licenses:
@@ -233,12 +240,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
233
240
  - !ruby/object:Gem::Version
234
241
  version: '0'
235
242
  requirements: []
236
- rubyforge_project:
237
- rubygems_version: 2.6.12
243
+ rubygems_version: 3.2.15
238
244
  signing_key:
239
245
  specification_version: 4
240
246
  summary: cppjieba binding for ruby
241
247
  test_files:
242
248
  - test/test_keyword.rb
243
249
  - test/test_segment.rb
250
+ - test/test_stop_word_filter.rb
244
251
  - test/test_tagging.rb