cppjieba_rb 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4ab9b4adf8e5ddccbe36570f3492b4ae438c7b6a
4
- data.tar.gz: 72987214a678eaf3505f804bd7c9db86f94c6178
3
+ metadata.gz: 8c0c15dd47a0f3fe66f57e4f7d37fd5eca5c2959
4
+ data.tar.gz: 204f0898edcbe685ec297a4259c5a8bdac29eb3a
5
5
  SHA512:
6
- metadata.gz: 1cd2f4cdd703464e0672ea034ae5bc875d4754ce3e45d859b66d2cf65759e32bbca499221ef4ca70239bcf3606eafeee13b9c6723d5a626a8aa2bf06fdfc970e
7
- data.tar.gz: 796eacaca074e602a080a94a2532e4f051e2b9e874ee6412cf45b683522afc36f2ba219be6a375c728ce6f3eb9b225c7fff783643cac3732dd4992bf263a1e4a
6
+ metadata.gz: 9697b467d66427704cbb1b2944c0ac0f04b66153d704f3efa72cf0f6b226fc50c3510b688abf267b5aa4678d261ce18baf491713d39ebe6b2be9de6f2c33c3d5
7
+ data.tar.gz: 7bb56503a56bb7d305eecea1d4671365fc6c736204256e93d2113adf6f43344bab9cdf2b8122f80e0ef7bb42eb2ed01776070e235b81a902caabe89f341f869c
data/.travis.yml CHANGED
@@ -1,7 +1,8 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.3.3
4
- - 2.4.1
3
+ - 2.3
4
+ - 2.4
5
+ - 2.5
5
6
  - ruby-head
6
7
  - rbx-2
7
8
 
data/README.md CHANGED
@@ -4,10 +4,12 @@
4
4
 
5
5
  [![Build Status](https://travis-ci.org/fantasticfears/cppjieba_rb.png?branch=master)](https://travis-ci.org/fantasticfears/cppjieba_rb)
6
6
 
7
- +[![Patreon](https://img.shields.io/badge/back_on-patreon-red.svg)](https://www.patreon.com/fantasticfears)
7
+ [![Patreon](https://img.shields.io/badge/back_on-patreon-red.svg)](https://www.patreon.com/fantasticfears)
8
8
 
9
9
  Ruby bindings for [Cppjieba](https://github.com/yanyiwu/cppjieba). C++11 required. (gcc 4.8+)
10
10
 
11
+ The trie used by Cppjieba has high memory usage. With the default dictionary, it uses about 120 MB of memory.
12
+
11
13
  ## Installation
12
14
 
13
15
  Add this line to your application's Gemfile:
@@ -28,7 +30,7 @@ Mix Segment mode (HMM with Max Prob, default):
28
30
 
29
31
  ```ruby
30
32
  require 'cppjieba_rb'
31
- seg = CppjiebaRb::Segment.new # equivalent to "JiebaRb::Segment.new mode: :mix"
33
+ seg = CppjiebaRb::Segment.new # equivalent to "CppjiebaRb::Segment.new mode: :mix"
32
34
  words = seg.segment "令狐冲是云计算行业的专家"
33
35
  # 令狐冲 是 云 计算 行业 的 专家
34
36
  ```
@@ -72,7 +74,7 @@ CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不
72
74
 
73
75
  ## Contributing
74
76
 
75
- 1. Fork it ( http://github.com/<my-github-username>/cppjieba_rb/fork )
77
+ 1. Fork it ( http://github.com/fantasticfears/cppjieba_rb/fork )
76
78
  2. Create your feature branch (`git checkout -b my-new-feature`)
77
79
  3. Commit your changes (`git commit -am 'Add some feature'`)
78
80
  4. Push to the branch (`git push origin my-new-feature`)
@@ -81,4 +83,4 @@ CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不
81
83
  ## TODO
82
84
 
83
85
  - including 367w dict and provide the option for it.
84
- - cppjieba implements trie tree, it's memory consuming
86
+ - cppjieba implements a trie tree, which is memory-consuming
data/Rakefile CHANGED
@@ -2,7 +2,7 @@ require "bundler/gem_tasks"
2
2
  require 'rake/testtask'
3
3
  require 'rake/extensiontask'
4
4
 
5
- gem = Gem::Specification.load(File.dirname(__FILE__) + '/cppjieba_rb.gemspec')
5
+ gem = Gem::Specification.load(File.dirname(__FILE__) + '/cppjieba-rb.gemspec.gemspec')
6
6
  Rake::ExtensionTask.new('cppjieba_rb', gem) do |ext|
7
7
  ext.lib_dir = File.join('lib', 'cppjieba_rb')
8
8
  end
@@ -1,7 +1,6 @@
1
1
  language: cpp
2
2
  before_install:
3
3
  - if [ $TRAVIS_OS_NAME == linux ]; then sudo apt-get install cmake; fi
4
- - if [ $TRAVIS_OS_NAME == osx ]; then brew install cmake; fi
5
4
  script:
6
5
  - mkdir build
7
6
  - cd build
@@ -226,8 +226,11 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
226
226
  + [gitbook-plugin-search-pro] 支持中文搜索的 gitbook 插件。
227
227
  + [ngx_http_cppjieba_module] Nginx 分词插件。
228
228
  + [cppjiebapy] 由 [jannson] 开发的供 python 模块调用的项目 [cppjiebapy], 相关讨论 [cppjiebapy_discussion] .
229
+ + [cppjieba-py] 由 [bung87] 基于 pybind11 封装的 python 模块,使用体验上接近于原jieba。
229
230
  + [KeywordServer] 50行搭建一个中文关键词抽取服务。
230
231
  + [cppjieba-server] CppJieba HTTP 服务器。
232
+ + [phpjieba] php版本的结巴分词扩展。
233
+ + [perl5-jieba] Perl版本的结巴分词扩展。
231
234
 
232
235
  ## 线上演示
233
236
 
@@ -261,6 +264,8 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
261
264
  [CppJieba]:https://github.com/yanyiwu/cppjieba
262
265
  [jannson]:https://github.com/jannson
263
266
  [cppjiebapy]:https://github.com/jannson/cppjiebapy
267
+ [bung87]:https://github.com/bung87
268
+ [cppjieba-py]:https://github.com/bung87/cppjieba-py
264
269
  [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
265
270
  [NodeJieba]:https://github.com/yanyiwu/nodejieba
266
271
  [jiebaR]:https://github.com/qinwf/jiebaR
@@ -279,6 +284,8 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
279
284
  [pg_jieba]:https://github.com/jaiminpan/pg_jieba
280
285
  [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
281
286
  [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
287
+ [phpjieba]:https://github.com/jonnywang/phpjieba
288
+ [perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
282
289
 
283
290
 
284
291
  [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/yanyiwu/cppjieba/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
@@ -86,6 +86,7 @@ Please see details in `test/demo.cpp`.
86
86
  + [ngx_http_cppjieba_module]
87
87
  + [gitbook-plugin-search-pro]
88
88
  + [cppjieba-server]
89
+ + [perl5-jieba]
89
90
 
90
91
  ## Contact
91
92
 
@@ -109,3 +110,4 @@ Please see details in `test/demo.cpp`.
109
110
  [pg_jieba]:https://github.com/jaiminpan/pg_jieba
110
111
  [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
111
112
  [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
113
+ [perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
@@ -50,6 +50,17 @@ class DictTrie {
50
50
  return true;
51
51
  }
52
52
 
53
+ bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
54
+ DictUnit node_info;
55
+ double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
56
+ if (!MakeNodeInfo(node_info, word, weight , tag)) {
57
+ return false;
58
+ }
59
+ active_node_infos_.push_back(node_info);
60
+ trie_->InsertNode(node_info.word, &active_node_infos_.back());
61
+ return true;
62
+ }
63
+
53
64
  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
54
65
  return trie_->Find(begin, end);
55
66
  }
@@ -61,6 +72,25 @@ class DictTrie {
61
72
  trie_->Find(begin, end, res, max_word_len);
62
73
  }
63
74
 
75
+ bool Find(const string& word)
76
+ {
77
+ const DictUnit *tmp = NULL;
78
+ RuneStrArray runes;
79
+ if (!DecodeRunesInString(word, runes))
80
+ {
81
+ XLOG(ERROR) << "Decode failed.";
82
+ }
83
+ tmp = Find(runes.begin(), runes.end());
84
+ if (tmp == NULL)
85
+ {
86
+ return false;
87
+ }
88
+ else
89
+ {
90
+ return true;
91
+ }
92
+ }
93
+
64
94
  bool IsUserDictSingleChineseWord(const Rune& word) const {
65
95
  return IsIn(user_dict_single_chinese_word_, word);
66
96
  }
@@ -69,6 +99,63 @@ class DictTrie {
69
99
  return min_weight_;
70
100
  }
71
101
 
102
+ void InserUserDictNode(const string& line) {
103
+ vector<string> buf;
104
+ DictUnit node_info;
105
+ Split(line, buf, " ");
106
+ if(buf.size() == 1){
107
+ MakeNodeInfo(node_info,
108
+ buf[0],
109
+ user_word_default_weight_,
110
+ UNKNOWN_TAG);
111
+ } else if (buf.size() == 2) {
112
+ MakeNodeInfo(node_info,
113
+ buf[0],
114
+ user_word_default_weight_,
115
+ buf[1]);
116
+ } else if (buf.size() == 3) {
117
+ int freq = atoi(buf[1].c_str());
118
+ assert(freq_sum_ > 0.0);
119
+ double weight = log(1.0 * freq / freq_sum_);
120
+ MakeNodeInfo(node_info, buf[0], weight, buf[2]);
121
+ }
122
+ static_node_infos_.push_back(node_info);
123
+ if (node_info.word.size() == 1) {
124
+ user_dict_single_chinese_word_.insert(node_info.word[0]);
125
+ }
126
+ }
127
+
128
+ void LoadUserDict(const vector<string>& buf) {
129
+ for (size_t i = 0; i < buf.size(); i++) {
130
+ InserUserDictNode(buf[i]);
131
+ }
132
+ }
133
+
134
+ void LoadUserDict(const set<string>& buf) {
135
+ std::set<string>::const_iterator iter;
136
+ for (iter = buf.begin(); iter != buf.end(); iter++){
137
+ InserUserDictNode(*iter);
138
+ }
139
+ }
140
+
141
+ void LoadUserDict(const string& filePaths) {
142
+ vector<string> files = limonp::Split(filePaths, "|;");
143
+ size_t lineno = 0;
144
+ for (size_t i = 0; i < files.size(); i++) {
145
+ ifstream ifs(files[i].c_str());
146
+ XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
147
+ string line;
148
+
149
+ for (; getline(ifs, line); lineno++) {
150
+ if (line.size() == 0) {
151
+ continue;
152
+ }
153
+ InserUserDictNode(line);
154
+ }
155
+ }
156
+ }
157
+
158
+
72
159
  private:
73
160
  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
74
161
  LoadDict(dict_path);
@@ -95,45 +182,8 @@ class DictTrie {
95
182
  trie_ = new Trie(words, valuePointers);
96
183
  }
97
184
 
98
- void LoadUserDict(const string& filePaths) {
99
- vector<string> files = limonp::Split(filePaths, "|;");
100
- size_t lineno = 0;
101
- for (size_t i = 0; i < files.size(); i++) {
102
- ifstream ifs(files[i].c_str());
103
- XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
104
- string line;
105
- DictUnit node_info;
106
- vector<string> buf;
107
- for (; getline(ifs, line); lineno++) {
108
- if (line.size() == 0) {
109
- continue;
110
- }
111
- buf.clear();
112
- Split(line, buf, " ");
113
- DictUnit node_info;
114
- if(buf.size() == 1){
115
- MakeNodeInfo(node_info,
116
- buf[0],
117
- user_word_default_weight_,
118
- UNKNOWN_TAG);
119
- } else if (buf.size() == 2) {
120
- MakeNodeInfo(node_info,
121
- buf[0],
122
- user_word_default_weight_,
123
- buf[1]);
124
- } else if (buf.size() == 3) {
125
- int freq = atoi(buf[1].c_str());
126
- assert(freq_sum_ > 0.0);
127
- double weight = log(1.0 * freq / freq_sum_);
128
- MakeNodeInfo(node_info, buf[0], weight, buf[2]);
129
- }
130
- static_node_infos_.push_back(node_info);
131
- if (node_info.word.size() == 1) {
132
- user_dict_single_chinese_word_.insert(node_info.word[0]);
133
- }
134
- }
135
- }
136
- }
185
+
186
+
137
187
 
138
188
  bool MakeNodeInfo(DictUnit& node_info,
139
189
  const string& word,
@@ -48,17 +48,17 @@ class FullSegment: public SegmentBase {
48
48
  void Cut(RuneStrArray::const_iterator begin,
49
49
  RuneStrArray::const_iterator end,
50
50
  vector<WordRange>& res) const {
51
- //resut of searching in trie tree
51
+ // result of searching in trie tree
52
52
  LocalVector<pair<size_t, const DictUnit*> > tRes;
53
53
 
54
- //max index of res's words
55
- int maxIdx = 0;
54
+ // max index of res's words
55
+ size_t maxIdx = 0;
56
56
 
57
57
  // always equals to (uItr - begin)
58
- int uIdx = 0;
58
+ size_t uIdx = 0;
59
59
 
60
- //tmp variables
61
- int wordLen = 0;
60
+ // tmp variables
61
+ size_t wordLen = 0;
62
62
  assert(dictTrie_);
63
63
  vector<struct Dag> dags;
64
64
  dictTrie_->Find(begin, end, dags);
@@ -72,6 +72,15 @@ class Jieba {
72
72
  return dict_trie_.InsertUserWord(word, tag);
73
73
  }
74
74
 
75
+ bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
76
+ return dict_trie_.InsertUserWord(word,freq, tag);
77
+ }
78
+
79
+ bool Find(const string& word)
80
+ {
81
+ return dict_trie_.Find(word);
82
+ }
83
+
75
84
  void ResetSeparators(const string& s) {
76
85
  //TODO
77
86
  mp_seg_.ResetSeparators(s);
@@ -84,10 +93,23 @@ class Jieba {
84
93
  const DictTrie* GetDictTrie() const {
85
94
  return &dict_trie_;
86
95
  }
96
+
87
97
  const HMMModel* GetHMMModel() const {
88
98
  return &model_;
89
99
  }
90
100
 
101
+ void LoadUserDict(const vector<string>& buf) {
102
+ dict_trie_.LoadUserDict(buf);
103
+ }
104
+
105
+ void LoadUserDict(const set<string>& buf) {
106
+ dict_trie_.LoadUserDict(buf);
107
+ }
108
+
109
+ void LoadUserDict(const string& path) {
110
+ dict_trie_.LoadUserDict(path);
111
+ }
112
+
91
113
  private:
92
114
  DictTrie dict_trie_;
93
115
  HMMModel model_;
@@ -18,9 +18,14 @@ typedef uint32_t Rune;
18
18
  struct Word {
19
19
  string word;
20
20
  uint32_t offset;
21
+ uint32_t unicode_offset;
22
+ uint32_t unicode_length;
21
23
  Word(const string& w, uint32_t o)
22
24
  : word(w), offset(o) {
23
25
  }
26
+ Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
27
+ : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
28
+ }
24
29
  }; // struct Word
25
30
 
26
31
  inline std::ostream& operator << (std::ostream& os, const Word& w) {
@@ -31,11 +36,16 @@ struct RuneStr {
31
36
  Rune rune;
32
37
  uint32_t offset;
33
38
  uint32_t len;
39
+ uint32_t unicode_offset;
40
+ uint32_t unicode_length;
34
41
  RuneStr(): rune(0), offset(0), len(0) {
35
42
  }
36
43
  RuneStr(Rune r, uint32_t o, uint32_t l)
37
44
  : rune(r), offset(o), len(l) {
38
45
  }
46
+ RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
47
+ : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
48
+ }
39
49
  }; // struct RuneStr
40
50
 
41
51
  inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
@@ -132,15 +142,16 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
132
142
  inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
133
143
  runes.clear();
134
144
  runes.reserve(len / 2);
135
- for (size_t i = 0; i < len;) {
145
+ for (uint32_t i = 0, j = 0; i < len;) {
136
146
  RuneStrLite rp = DecodeRuneInString(s + i, len - i);
137
147
  if (rp.len == 0) {
138
148
  runes.clear();
139
149
  return false;
140
150
  }
141
- RuneStr x(rp.rune, i, rp.len);
151
+ RuneStr x(rp.rune, i, rp.len, j, 1);
142
152
  runes.push_back(x);
143
153
  i += rp.len;
154
+ ++j;
144
155
  }
145
156
  return true;
146
157
  }
@@ -182,7 +193,8 @@ inline Unicode DecodeRunesInString(const string& s) {
182
193
  inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
183
194
  assert(right->offset >= left->offset);
184
195
  uint32_t len = right->offset - left->offset + right->len;
185
- return Word(s.substr(left->offset, len), left->offset);
196
+ uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
197
+ return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length);
186
198
  }
187
199
 
188
200
  inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
68
68
  s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
69
69
  jieba.Tag(s, tagres);
70
70
  cout << s << endl;
71
- cout << tagres << endl;;
71
+ cout << tagres << endl;
72
72
 
73
73
  cout << "[demo] Keyword Extraction" << endl;
74
74
  const size_t topk = 5;
@@ -75,7 +75,8 @@ VALUE internal_initialize(VALUE self,
75
75
  while (getline(ifs, line)) {
76
76
  data->stop_words->insert(line);
77
77
  }
78
- assert(data->stop_words->size());
78
+ assert(data->stop_words->size() != 0);
79
+ return self;
79
80
  }
80
81
 
81
82
  VALUE internal_extract_keyword(VALUE self, VALUE text_rbs, VALUE topN)
@@ -171,4 +172,4 @@ void Init_internal()
171
172
  rb_define_method(rb_cCppjiebaRb_Internal, "stop_word?", (ruby_method*) &internal_stop_word, 1);
172
173
  }
173
174
 
174
- }
175
+ }
@@ -1,3 +1,3 @@
1
1
  module CppjiebaRb
2
- VERSION = '0.3.0'
2
+ VERSION = '0.3.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cppjieba_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erick Guan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-26 00:00:00.000000000 Z
11
+ date: 2018-09-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -235,7 +235,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
235
235
  version: '0'
236
236
  requirements: []
237
237
  rubyforge_project:
238
- rubygems_version: 2.6.12
238
+ rubygems_version: 2.6.14
239
239
  signing_key:
240
240
  specification_version: 4
241
241
  summary: cppjieba binding for ruby