cppjieba_rb 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4ab9b4adf8e5ddccbe36570f3492b4ae438c7b6a
4
- data.tar.gz: 72987214a678eaf3505f804bd7c9db86f94c6178
3
+ metadata.gz: 8c0c15dd47a0f3fe66f57e4f7d37fd5eca5c2959
4
+ data.tar.gz: 204f0898edcbe685ec297a4259c5a8bdac29eb3a
5
5
  SHA512:
6
- metadata.gz: 1cd2f4cdd703464e0672ea034ae5bc875d4754ce3e45d859b66d2cf65759e32bbca499221ef4ca70239bcf3606eafeee13b9c6723d5a626a8aa2bf06fdfc970e
7
- data.tar.gz: 796eacaca074e602a080a94a2532e4f051e2b9e874ee6412cf45b683522afc36f2ba219be6a375c728ce6f3eb9b225c7fff783643cac3732dd4992bf263a1e4a
6
+ metadata.gz: 9697b467d66427704cbb1b2944c0ac0f04b66153d704f3efa72cf0f6b226fc50c3510b688abf267b5aa4678d261ce18baf491713d39ebe6b2be9de6f2c33c3d5
7
+ data.tar.gz: 7bb56503a56bb7d305eecea1d4671365fc6c736204256e93d2113adf6f43344bab9cdf2b8122f80e0ef7bb42eb2ed01776070e235b81a902caabe89f341f869c
data/.travis.yml CHANGED
@@ -1,7 +1,8 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.3.3
4
- - 2.4.1
3
+ - 2.3
4
+ - 2.4
5
+ - 2.5
5
6
  - ruby-head
6
7
  - rbx-2
7
8
 
data/README.md CHANGED
@@ -4,10 +4,12 @@
4
4
 
5
5
  [![Build Status](https://travis-ci.org/fantasticfears/cppjieba_rb.png?branch=master)](https://travis-ci.org/fantasticfears/cppjieba_rb)
6
6
 
7
- +[![Patreon](https://img.shields.io/badge/back_on-patreon-red.svg)](https://www.patreon.com/fantasticfears)
7
+ [![Patreon](https://img.shields.io/badge/back_on-patreon-red.svg)](https://www.patreon.com/fantasticfears)
8
8
 
9
9
  Ruby bindings for [Cppjieba](https://github.com/yanyiwu/cppjieba). C++11 required. (gcc 4.8+)
10
10
 
11
+ The trie has high memory usage. With the default dict, it uses ~120 MB of memory.
12
+
11
13
  ## Installation
12
14
 
13
15
  Add this line to your application's Gemfile:
@@ -28,7 +30,7 @@ Mix Segment mode (HMM with Max Prob, default):
28
30
 
29
31
  ```ruby
30
32
  require 'cppjieba_rb'
31
- seg = CppjiebaRb::Segment.new # equivalent to "JiebaRb::Segment.new mode: :mix"
33
+ seg = CppjiebaRb::Segment.new # equivalent to "CppjiebaRb::Segment.new mode: :mix"
32
34
  words = seg.segment "令狐冲是云计算行业的专家"
33
35
  # 令狐冲 是 云 计算 行业 的 专家
34
36
  ```
@@ -72,7 +74,7 @@ CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不
72
74
 
73
75
  ## Contributing
74
76
 
75
- 1. Fork it ( http://github.com/<my-github-username>/cppjieba_rb/fork )
77
+ 1. Fork it ( http://github.com/fantasticfears/cppjieba_rb/fork )
76
78
  2. Create your feature branch (`git checkout -b my-new-feature`)
77
79
  3. Commit your changes (`git commit -am 'Add some feature'`)
78
80
  4. Push to the branch (`git push origin my-new-feature`)
@@ -81,4 +83,4 @@ CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不
81
83
  ## TODO
82
84
 
83
85
  - including 367w dict and provide the option for it.
84
- - cppjieba implements trie tree, it's memory consuming
86
+ - cppjieba implements trie tree, it's memory consuming
data/Rakefile CHANGED
@@ -2,7 +2,7 @@ require "bundler/gem_tasks"
2
2
  require 'rake/testtask'
3
3
  require 'rake/extensiontask'
4
4
 
5
- gem = Gem::Specification.load(File.dirname(__FILE__) + '/cppjieba_rb.gemspec')
5
+ gem = Gem::Specification.load(File.dirname(__FILE__) + '/cppjieba-rb.gemspec.gemspec')
6
6
  Rake::ExtensionTask.new('cppjieba_rb', gem) do |ext|
7
7
  ext.lib_dir = File.join('lib', 'cppjieba_rb')
8
8
  end
@@ -1,7 +1,6 @@
1
1
  language: cpp
2
2
  before_install:
3
3
  - if [ $TRAVIS_OS_NAME == linux ]; then sudo apt-get install cmake; fi
4
- - if [ $TRAVIS_OS_NAME == osx ]; then brew install cmake; fi
5
4
  script:
6
5
  - mkdir build
7
6
  - cd build
@@ -226,8 +226,11 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
226
226
  + [gitbook-plugin-search-pro] 支持中文搜索的 gitbook 插件。
227
227
  + [ngx_http_cppjieba_module] Nginx 分词插件。
228
228
  + [cppjiebapy] 由 [jannson] 开发的供 python 模块调用的项目 [cppjiebapy], 相关讨论 [cppjiebapy_discussion] .
229
+ + [cppjieba-py] 由 [bung87] 基于 pybind11 封装的 python 模块,使用体验上接近于原jieba。
229
230
  + [KeywordServer] 50行搭建一个中文关键词抽取服务。
230
231
  + [cppjieba-server] CppJieba HTTP 服务器。
232
+ + [phpjieba] php版本的结巴分词扩展。
233
+ + [perl5-jieba] Perl版本的结巴分词扩展。
231
234
 
232
235
  ## 线上演示
233
236
 
@@ -261,6 +264,8 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
261
264
  [CppJieba]:https://github.com/yanyiwu/cppjieba
262
265
  [jannson]:https://github.com/jannson
263
266
  [cppjiebapy]:https://github.com/jannson/cppjiebapy
267
+ [bung87]:https://github.com/bung87
268
+ [cppjieba-py]:https://github.com/bung87/cppjieba-py
264
269
  [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
265
270
  [NodeJieba]:https://github.com/yanyiwu/nodejieba
266
271
  [jiebaR]:https://github.com/qinwf/jiebaR
@@ -279,6 +284,8 @@ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Ful
279
284
  [pg_jieba]:https://github.com/jaiminpan/pg_jieba
280
285
  [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
281
286
  [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
287
+ [phpjieba]:https://github.com/jonnywang/phpjieba
288
+ [perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
282
289
 
283
290
 
284
291
  [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/yanyiwu/cppjieba/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
@@ -86,6 +86,7 @@ Please see details in `test/demo.cpp`.
86
86
  + [ngx_http_cppjieba_module]
87
87
  + [gitbook-plugin-search-pro]
88
88
  + [cppjieba-server]
89
+ + [perl5-jieba]
89
90
 
90
91
  ## Contact
91
92
 
@@ -109,3 +110,4 @@ Please see details in `test/demo.cpp`.
109
110
  [pg_jieba]:https://github.com/jaiminpan/pg_jieba
110
111
  [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
111
112
  [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
113
+ [perl5-jieba]:https://metacpan.org/pod/distribution/Lingua-ZH-Jieba/lib/Lingua/ZH/Jieba.pod
@@ -50,6 +50,17 @@ class DictTrie {
50
50
  return true;
51
51
  }
52
52
 
53
+ bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
54
+ DictUnit node_info;
55
+ double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
56
+ if (!MakeNodeInfo(node_info, word, weight , tag)) {
57
+ return false;
58
+ }
59
+ active_node_infos_.push_back(node_info);
60
+ trie_->InsertNode(node_info.word, &active_node_infos_.back());
61
+ return true;
62
+ }
63
+
53
64
  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
54
65
  return trie_->Find(begin, end);
55
66
  }
@@ -61,6 +72,25 @@ class DictTrie {
61
72
  trie_->Find(begin, end, res, max_word_len);
62
73
  }
63
74
 
75
+ bool Find(const string& word)
76
+ {
77
+ const DictUnit *tmp = NULL;
78
+ RuneStrArray runes;
79
+ if (!DecodeRunesInString(word, runes))
80
+ {
81
+ XLOG(ERROR) << "Decode failed.";
82
+ }
83
+ tmp = Find(runes.begin(), runes.end());
84
+ if (tmp == NULL)
85
+ {
86
+ return false;
87
+ }
88
+ else
89
+ {
90
+ return true;
91
+ }
92
+ }
93
+
64
94
  bool IsUserDictSingleChineseWord(const Rune& word) const {
65
95
  return IsIn(user_dict_single_chinese_word_, word);
66
96
  }
@@ -69,6 +99,63 @@ class DictTrie {
69
99
  return min_weight_;
70
100
  }
71
101
 
102
+ void InserUserDictNode(const string& line) {
103
+ vector<string> buf;
104
+ DictUnit node_info;
105
+ Split(line, buf, " ");
106
+ if(buf.size() == 1){
107
+ MakeNodeInfo(node_info,
108
+ buf[0],
109
+ user_word_default_weight_,
110
+ UNKNOWN_TAG);
111
+ } else if (buf.size() == 2) {
112
+ MakeNodeInfo(node_info,
113
+ buf[0],
114
+ user_word_default_weight_,
115
+ buf[1]);
116
+ } else if (buf.size() == 3) {
117
+ int freq = atoi(buf[1].c_str());
118
+ assert(freq_sum_ > 0.0);
119
+ double weight = log(1.0 * freq / freq_sum_);
120
+ MakeNodeInfo(node_info, buf[0], weight, buf[2]);
121
+ }
122
+ static_node_infos_.push_back(node_info);
123
+ if (node_info.word.size() == 1) {
124
+ user_dict_single_chinese_word_.insert(node_info.word[0]);
125
+ }
126
+ }
127
+
128
+ void LoadUserDict(const vector<string>& buf) {
129
+ for (size_t i = 0; i < buf.size(); i++) {
130
+ InserUserDictNode(buf[i]);
131
+ }
132
+ }
133
+
134
+ void LoadUserDict(const set<string>& buf) {
135
+ std::set<string>::const_iterator iter;
136
+ for (iter = buf.begin(); iter != buf.end(); iter++){
137
+ InserUserDictNode(*iter);
138
+ }
139
+ }
140
+
141
+ void LoadUserDict(const string& filePaths) {
142
+ vector<string> files = limonp::Split(filePaths, "|;");
143
+ size_t lineno = 0;
144
+ for (size_t i = 0; i < files.size(); i++) {
145
+ ifstream ifs(files[i].c_str());
146
+ XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
147
+ string line;
148
+
149
+ for (; getline(ifs, line); lineno++) {
150
+ if (line.size() == 0) {
151
+ continue;
152
+ }
153
+ InserUserDictNode(line);
154
+ }
155
+ }
156
+ }
157
+
158
+
72
159
  private:
73
160
  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
74
161
  LoadDict(dict_path);
@@ -95,45 +182,8 @@ class DictTrie {
95
182
  trie_ = new Trie(words, valuePointers);
96
183
  }
97
184
 
98
- void LoadUserDict(const string& filePaths) {
99
- vector<string> files = limonp::Split(filePaths, "|;");
100
- size_t lineno = 0;
101
- for (size_t i = 0; i < files.size(); i++) {
102
- ifstream ifs(files[i].c_str());
103
- XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
104
- string line;
105
- DictUnit node_info;
106
- vector<string> buf;
107
- for (; getline(ifs, line); lineno++) {
108
- if (line.size() == 0) {
109
- continue;
110
- }
111
- buf.clear();
112
- Split(line, buf, " ");
113
- DictUnit node_info;
114
- if(buf.size() == 1){
115
- MakeNodeInfo(node_info,
116
- buf[0],
117
- user_word_default_weight_,
118
- UNKNOWN_TAG);
119
- } else if (buf.size() == 2) {
120
- MakeNodeInfo(node_info,
121
- buf[0],
122
- user_word_default_weight_,
123
- buf[1]);
124
- } else if (buf.size() == 3) {
125
- int freq = atoi(buf[1].c_str());
126
- assert(freq_sum_ > 0.0);
127
- double weight = log(1.0 * freq / freq_sum_);
128
- MakeNodeInfo(node_info, buf[0], weight, buf[2]);
129
- }
130
- static_node_infos_.push_back(node_info);
131
- if (node_info.word.size() == 1) {
132
- user_dict_single_chinese_word_.insert(node_info.word[0]);
133
- }
134
- }
135
- }
136
- }
185
+
186
+
137
187
 
138
188
  bool MakeNodeInfo(DictUnit& node_info,
139
189
  const string& word,
@@ -48,17 +48,17 @@ class FullSegment: public SegmentBase {
48
48
  void Cut(RuneStrArray::const_iterator begin,
49
49
  RuneStrArray::const_iterator end,
50
50
  vector<WordRange>& res) const {
51
- //resut of searching in trie tree
51
+ // resut of searching in trie tree
52
52
  LocalVector<pair<size_t, const DictUnit*> > tRes;
53
53
 
54
- //max index of res's words
55
- int maxIdx = 0;
54
+ // max index of res's words
55
+ size_t maxIdx = 0;
56
56
 
57
57
  // always equals to (uItr - begin)
58
- int uIdx = 0;
58
+ size_t uIdx = 0;
59
59
 
60
- //tmp variables
61
- int wordLen = 0;
60
+ // tmp variables
61
+ size_t wordLen = 0;
62
62
  assert(dictTrie_);
63
63
  vector<struct Dag> dags;
64
64
  dictTrie_->Find(begin, end, dags);
@@ -72,6 +72,15 @@ class Jieba {
72
72
  return dict_trie_.InsertUserWord(word, tag);
73
73
  }
74
74
 
75
+ bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
76
+ return dict_trie_.InsertUserWord(word,freq, tag);
77
+ }
78
+
79
+ bool Find(const string& word)
80
+ {
81
+ return dict_trie_.Find(word);
82
+ }
83
+
75
84
  void ResetSeparators(const string& s) {
76
85
  //TODO
77
86
  mp_seg_.ResetSeparators(s);
@@ -84,10 +93,23 @@ class Jieba {
84
93
  const DictTrie* GetDictTrie() const {
85
94
  return &dict_trie_;
86
95
  }
96
+
87
97
  const HMMModel* GetHMMModel() const {
88
98
  return &model_;
89
99
  }
90
100
 
101
+ void LoadUserDict(const vector<string>& buf) {
102
+ dict_trie_.LoadUserDict(buf);
103
+ }
104
+
105
+ void LoadUserDict(const set<string>& buf) {
106
+ dict_trie_.LoadUserDict(buf);
107
+ }
108
+
109
+ void LoadUserDict(const string& path) {
110
+ dict_trie_.LoadUserDict(path);
111
+ }
112
+
91
113
  private:
92
114
  DictTrie dict_trie_;
93
115
  HMMModel model_;
@@ -18,9 +18,14 @@ typedef uint32_t Rune;
18
18
  struct Word {
19
19
  string word;
20
20
  uint32_t offset;
21
+ uint32_t unicode_offset;
22
+ uint32_t unicode_length;
21
23
  Word(const string& w, uint32_t o)
22
24
  : word(w), offset(o) {
23
25
  }
26
+ Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
27
+ : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
28
+ }
24
29
  }; // struct Word
25
30
 
26
31
  inline std::ostream& operator << (std::ostream& os, const Word& w) {
@@ -31,11 +36,16 @@ struct RuneStr {
31
36
  Rune rune;
32
37
  uint32_t offset;
33
38
  uint32_t len;
39
+ uint32_t unicode_offset;
40
+ uint32_t unicode_length;
34
41
  RuneStr(): rune(0), offset(0), len(0) {
35
42
  }
36
43
  RuneStr(Rune r, uint32_t o, uint32_t l)
37
44
  : rune(r), offset(o), len(l) {
38
45
  }
46
+ RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
47
+ : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
48
+ }
39
49
  }; // struct RuneStr
40
50
 
41
51
  inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
@@ -132,15 +142,16 @@ inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
132
142
  inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
133
143
  runes.clear();
134
144
  runes.reserve(len / 2);
135
- for (size_t i = 0; i < len;) {
145
+ for (uint32_t i = 0, j = 0; i < len;) {
136
146
  RuneStrLite rp = DecodeRuneInString(s + i, len - i);
137
147
  if (rp.len == 0) {
138
148
  runes.clear();
139
149
  return false;
140
150
  }
141
- RuneStr x(rp.rune, i, rp.len);
151
+ RuneStr x(rp.rune, i, rp.len, j, 1);
142
152
  runes.push_back(x);
143
153
  i += rp.len;
154
+ ++j;
144
155
  }
145
156
  return true;
146
157
  }
@@ -182,7 +193,8 @@ inline Unicode DecodeRunesInString(const string& s) {
182
193
  inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
183
194
  assert(right->offset >= left->offset);
184
195
  uint32_t len = right->offset - left->offset + right->len;
185
- return Word(s.substr(left->offset, len), left->offset);
196
+ uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
197
+ return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length);
186
198
  }
187
199
 
188
200
  inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
68
68
  s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
69
69
  jieba.Tag(s, tagres);
70
70
  cout << s << endl;
71
- cout << tagres << endl;;
71
+ cout << tagres << endl;
72
72
 
73
73
  cout << "[demo] Keyword Extraction" << endl;
74
74
  const size_t topk = 5;
@@ -75,7 +75,8 @@ VALUE internal_initialize(VALUE self,
75
75
  while (getline(ifs, line)) {
76
76
  data->stop_words->insert(line);
77
77
  }
78
- assert(data->stop_words->size());
78
+ assert(data->stop_words->size() != 0);
79
+ return self;
79
80
  }
80
81
 
81
82
  VALUE internal_extract_keyword(VALUE self, VALUE text_rbs, VALUE topN)
@@ -171,4 +172,4 @@ void Init_internal()
171
172
  rb_define_method(rb_cCppjiebaRb_Internal, "stop_word?", (ruby_method*) &internal_stop_word, 1);
172
173
  }
173
174
 
174
- }
175
+ }
@@ -1,3 +1,3 @@
1
1
  module CppjiebaRb
2
- VERSION = '0.3.0'
2
+ VERSION = '0.3.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cppjieba_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erick Guan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-26 00:00:00.000000000 Z
11
+ date: 2018-09-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -235,7 +235,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
235
235
  version: '0'
236
236
  requirements: []
237
237
  rubyforge_project:
238
- rubygems_version: 2.6.12
238
+ rubygems_version: 2.6.14
239
239
  signing_key:
240
240
  specification_version: 4
241
241
  summary: cppjieba binding for ruby