cppjieba_rb 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +3 -0
  3. data/README.md +1 -1
  4. data/Rakefile +2 -2
  5. data/cppjieba_rb.gemspec +4 -4
  6. data/lib/cppjieba_rb/version.rb +1 -1
  7. metadata +17 -135
  8. data/ext/cppjieba/.gitignore +0 -17
  9. data/ext/cppjieba/.travis.yml +0 -21
  10. data/ext/cppjieba/CMakeLists.txt +0 -28
  11. data/ext/cppjieba/ChangeLog.md +0 -236
  12. data/ext/cppjieba/README.md +0 -292
  13. data/ext/cppjieba/README_EN.md +0 -113
  14. data/ext/cppjieba/appveyor.yml +0 -32
  15. data/ext/cppjieba/deps/CMakeLists.txt +0 -1
  16. data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
  17. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
  28. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
  41. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
  44. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  45. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
  46. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
  47. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
  48. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
  49. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
  50. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
  51. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
  52. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
  53. data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
  54. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
  55. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
  56. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
  57. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
  58. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
  59. data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
  60. data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
  61. data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
  62. data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
  63. data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
  64. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
  65. data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
  66. data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
  67. data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
  68. data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
  69. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
  70. data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
  71. data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
  72. data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
  73. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
  74. data/ext/cppjieba/dict/README.md +0 -31
  75. data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
  76. data/ext/cppjieba/dict/idf.utf8 +0 -258826
  77. data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
  78. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
  79. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
  80. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
  81. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
  82. data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
  83. data/ext/cppjieba/dict/user.dict.utf8 +0 -4
  84. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
  85. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
  86. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
  87. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
  88. data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
  89. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
  90. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
  91. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
  92. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
  93. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
  94. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
  95. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
  96. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
  97. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
  98. data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
  99. data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
  100. data/ext/cppjieba/test/CMakeLists.txt +0 -5
  101. data/ext/cppjieba/test/demo.cpp +0 -80
  102. data/ext/cppjieba/test/load_test.cpp +0 -54
  103. data/ext/cppjieba/test/testdata/curl.res +0 -1
  104. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
  105. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
  106. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
  107. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
  108. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
  109. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
  110. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
  111. data/ext/cppjieba/test/testdata/load_test.urls +0 -2
  112. data/ext/cppjieba/test/testdata/review.100 +0 -100
  113. data/ext/cppjieba/test/testdata/review.100.res +0 -200
  114. data/ext/cppjieba/test/testdata/server.conf +0 -19
  115. data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
  116. data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
  117. data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
  118. data/ext/cppjieba/test/testdata/userdict.english +0 -2
  119. data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
  120. data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
  121. data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
  122. data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
  123. data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
  124. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
  125. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
  126. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
  127. data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
  128. data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
  129. data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
  130. data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
@@ -1,4 +0,0 @@
1
- 云计算
2
- 韩玉鉴赏
3
- 蓝翔 nz
4
- 区块链 10 nz
@@ -1,277 +0,0 @@
1
- #ifndef CPPJIEBA_DICT_TRIE_HPP
2
- #define CPPJIEBA_DICT_TRIE_HPP
3
-
4
- #include <iostream>
5
- #include <fstream>
6
- #include <map>
7
- #include <string>
8
- #include <cstring>
9
- #include <cstdlib>
10
- #include <stdint.h>
11
- #include <cmath>
12
- #include <limits>
13
- #include "limonp/StringUtil.hpp"
14
- #include "limonp/Logging.hpp"
15
- #include "Unicode.hpp"
16
- #include "Trie.hpp"
17
-
18
- namespace cppjieba {
19
-
20
- using namespace limonp;
21
-
22
- const double MIN_DOUBLE = -3.14e+100;
23
- const double MAX_DOUBLE = 3.14e+100;
24
- const size_t DICT_COLUMN_NUM = 3;
25
- const char* const UNKNOWN_TAG = "";
26
-
27
- class DictTrie {
28
- public:
29
- enum UserWordWeightOption {
30
- WordWeightMin,
31
- WordWeightMedian,
32
- WordWeightMax,
33
- }; // enum UserWordWeightOption
34
-
35
- DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
36
- Init(dict_path, user_dict_paths, user_word_weight_opt);
37
- }
38
-
39
- ~DictTrie() {
40
- delete trie_;
41
- }
42
-
43
- bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
44
- DictUnit node_info;
45
- if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
46
- return false;
47
- }
48
- active_node_infos_.push_back(node_info);
49
- trie_->InsertNode(node_info.word, &active_node_infos_.back());
50
- return true;
51
- }
52
-
53
- bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
54
- DictUnit node_info;
55
- double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
56
- if (!MakeNodeInfo(node_info, word, weight , tag)) {
57
- return false;
58
- }
59
- active_node_infos_.push_back(node_info);
60
- trie_->InsertNode(node_info.word, &active_node_infos_.back());
61
- return true;
62
- }
63
-
64
- const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
65
- return trie_->Find(begin, end);
66
- }
67
-
68
- void Find(RuneStrArray::const_iterator begin,
69
- RuneStrArray::const_iterator end,
70
- vector<struct Dag>&res,
71
- size_t max_word_len = MAX_WORD_LENGTH) const {
72
- trie_->Find(begin, end, res, max_word_len);
73
- }
74
-
75
- bool Find(const string& word)
76
- {
77
- const DictUnit *tmp = NULL;
78
- RuneStrArray runes;
79
- if (!DecodeRunesInString(word, runes))
80
- {
81
- XLOG(ERROR) << "Decode failed.";
82
- }
83
- tmp = Find(runes.begin(), runes.end());
84
- if (tmp == NULL)
85
- {
86
- return false;
87
- }
88
- else
89
- {
90
- return true;
91
- }
92
- }
93
-
94
- bool IsUserDictSingleChineseWord(const Rune& word) const {
95
- return IsIn(user_dict_single_chinese_word_, word);
96
- }
97
-
98
- double GetMinWeight() const {
99
- return min_weight_;
100
- }
101
-
102
- void InserUserDictNode(const string& line) {
103
- vector<string> buf;
104
- DictUnit node_info;
105
- Split(line, buf, " ");
106
- if(buf.size() == 1){
107
- MakeNodeInfo(node_info,
108
- buf[0],
109
- user_word_default_weight_,
110
- UNKNOWN_TAG);
111
- } else if (buf.size() == 2) {
112
- MakeNodeInfo(node_info,
113
- buf[0],
114
- user_word_default_weight_,
115
- buf[1]);
116
- } else if (buf.size() == 3) {
117
- int freq = atoi(buf[1].c_str());
118
- assert(freq_sum_ > 0.0);
119
- double weight = log(1.0 * freq / freq_sum_);
120
- MakeNodeInfo(node_info, buf[0], weight, buf[2]);
121
- }
122
- static_node_infos_.push_back(node_info);
123
- if (node_info.word.size() == 1) {
124
- user_dict_single_chinese_word_.insert(node_info.word[0]);
125
- }
126
- }
127
-
128
- void LoadUserDict(const vector<string>& buf) {
129
- for (size_t i = 0; i < buf.size(); i++) {
130
- InserUserDictNode(buf[i]);
131
- }
132
- }
133
-
134
- void LoadUserDict(const set<string>& buf) {
135
- std::set<string>::const_iterator iter;
136
- for (iter = buf.begin(); iter != buf.end(); iter++){
137
- InserUserDictNode(*iter);
138
- }
139
- }
140
-
141
- void LoadUserDict(const string& filePaths) {
142
- vector<string> files = limonp::Split(filePaths, "|;");
143
- size_t lineno = 0;
144
- for (size_t i = 0; i < files.size(); i++) {
145
- ifstream ifs(files[i].c_str());
146
- XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
147
- string line;
148
-
149
- for (; getline(ifs, line); lineno++) {
150
- if (line.size() == 0) {
151
- continue;
152
- }
153
- InserUserDictNode(line);
154
- }
155
- }
156
- }
157
-
158
-
159
- private:
160
- void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
161
- LoadDict(dict_path);
162
- freq_sum_ = CalcFreqSum(static_node_infos_);
163
- CalculateWeight(static_node_infos_, freq_sum_);
164
- SetStaticWordWeights(user_word_weight_opt);
165
-
166
- if (user_dict_paths.size()) {
167
- LoadUserDict(user_dict_paths);
168
- }
169
- Shrink(static_node_infos_);
170
- CreateTrie(static_node_infos_);
171
- }
172
-
173
- void CreateTrie(const vector<DictUnit>& dictUnits) {
174
- assert(dictUnits.size());
175
- vector<Unicode> words;
176
- vector<const DictUnit*> valuePointers;
177
- for (size_t i = 0 ; i < dictUnits.size(); i ++) {
178
- words.push_back(dictUnits[i].word);
179
- valuePointers.push_back(&dictUnits[i]);
180
- }
181
-
182
- trie_ = new Trie(words, valuePointers);
183
- }
184
-
185
-
186
-
187
-
188
- bool MakeNodeInfo(DictUnit& node_info,
189
- const string& word,
190
- double weight,
191
- const string& tag) {
192
- if (!DecodeRunesInString(word, node_info.word)) {
193
- XLOG(ERROR) << "Decode " << word << " failed.";
194
- return false;
195
- }
196
- node_info.weight = weight;
197
- node_info.tag = tag;
198
- return true;
199
- }
200
-
201
- void LoadDict(const string& filePath) {
202
- ifstream ifs(filePath.c_str());
203
- XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
204
- string line;
205
- vector<string> buf;
206
-
207
- DictUnit node_info;
208
- for (size_t lineno = 0; getline(ifs, line); lineno++) {
209
- Split(line, buf, " ");
210
- XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
211
- MakeNodeInfo(node_info,
212
- buf[0],
213
- atof(buf[1].c_str()),
214
- buf[2]);
215
- static_node_infos_.push_back(node_info);
216
- }
217
- }
218
-
219
- static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
220
- return lhs.weight < rhs.weight;
221
- }
222
-
223
- void SetStaticWordWeights(UserWordWeightOption option) {
224
- XCHECK(!static_node_infos_.empty());
225
- vector<DictUnit> x = static_node_infos_;
226
- sort(x.begin(), x.end(), WeightCompare);
227
- min_weight_ = x[0].weight;
228
- max_weight_ = x[x.size() - 1].weight;
229
- median_weight_ = x[x.size() / 2].weight;
230
- switch (option) {
231
- case WordWeightMin:
232
- user_word_default_weight_ = min_weight_;
233
- break;
234
- case WordWeightMedian:
235
- user_word_default_weight_ = median_weight_;
236
- break;
237
- default:
238
- user_word_default_weight_ = max_weight_;
239
- break;
240
- }
241
- }
242
-
243
- double CalcFreqSum(const vector<DictUnit>& node_infos) const {
244
- double sum = 0.0;
245
- for (size_t i = 0; i < node_infos.size(); i++) {
246
- sum += node_infos[i].weight;
247
- }
248
- return sum;
249
- }
250
-
251
- void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
252
- assert(sum > 0.0);
253
- for (size_t i = 0; i < node_infos.size(); i++) {
254
- DictUnit& node_info = node_infos[i];
255
- assert(node_info.weight > 0.0);
256
- node_info.weight = log(double(node_info.weight)/sum);
257
- }
258
- }
259
-
260
- void Shrink(vector<DictUnit>& units) const {
261
- vector<DictUnit>(units.begin(), units.end()).swap(units);
262
- }
263
-
264
- vector<DictUnit> static_node_infos_;
265
- deque<DictUnit> active_node_infos_; // must not be vector
266
- Trie * trie_;
267
-
268
- double freq_sum_;
269
- double min_weight_;
270
- double max_weight_;
271
- double median_weight_;
272
- double user_word_default_weight_;
273
- unordered_set<Rune> user_dict_single_chinese_word_;
274
- };
275
- }
276
-
277
- #endif
@@ -1,93 +0,0 @@
1
- #ifndef CPPJIEBA_FULLSEGMENT_H
2
- #define CPPJIEBA_FULLSEGMENT_H
3
-
4
- #include <algorithm>
5
- #include <set>
6
- #include <cassert>
7
- #include "limonp/Logging.hpp"
8
- #include "DictTrie.hpp"
9
- #include "SegmentBase.hpp"
10
- #include "Unicode.hpp"
11
-
12
- namespace cppjieba {
13
- class FullSegment: public SegmentBase {
14
- public:
15
- FullSegment(const string& dictPath) {
16
- dictTrie_ = new DictTrie(dictPath);
17
- isNeedDestroy_ = true;
18
- }
19
- FullSegment(const DictTrie* dictTrie)
20
- : dictTrie_(dictTrie), isNeedDestroy_(false) {
21
- assert(dictTrie_);
22
- }
23
- ~FullSegment() {
24
- if (isNeedDestroy_) {
25
- delete dictTrie_;
26
- }
27
- }
28
- void Cut(const string& sentence,
29
- vector<string>& words) const {
30
- vector<Word> tmp;
31
- Cut(sentence, tmp);
32
- GetStringsFromWords(tmp, words);
33
- }
34
- void Cut(const string& sentence,
35
- vector<Word>& words) const {
36
- PreFilter pre_filter(symbols_, sentence);
37
- PreFilter::Range range;
38
- vector<WordRange> wrs;
39
- wrs.reserve(sentence.size()/2);
40
- while (pre_filter.HasNext()) {
41
- range = pre_filter.Next();
42
- Cut(range.begin, range.end, wrs);
43
- }
44
- words.clear();
45
- words.reserve(wrs.size());
46
- GetWordsFromWordRanges(sentence, wrs, words);
47
- }
48
- void Cut(RuneStrArray::const_iterator begin,
49
- RuneStrArray::const_iterator end,
50
- vector<WordRange>& res) const {
51
- // resut of searching in trie tree
52
- LocalVector<pair<size_t, const DictUnit*> > tRes;
53
-
54
- // max index of res's words
55
- size_t maxIdx = 0;
56
-
57
- // always equals to (uItr - begin)
58
- size_t uIdx = 0;
59
-
60
- // tmp variables
61
- size_t wordLen = 0;
62
- assert(dictTrie_);
63
- vector<struct Dag> dags;
64
- dictTrie_->Find(begin, end, dags);
65
- for (size_t i = 0; i < dags.size(); i++) {
66
- for (size_t j = 0; j < dags[i].nexts.size(); j++) {
67
- size_t nextoffset = dags[i].nexts[j].first;
68
- assert(nextoffset < dags.size());
69
- const DictUnit* du = dags[i].nexts[j].second;
70
- if (du == NULL) {
71
- if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
72
- WordRange wr(begin + i, begin + nextoffset);
73
- res.push_back(wr);
74
- }
75
- } else {
76
- wordLen = du->word.size();
77
- if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
78
- WordRange wr(begin + i, begin + nextoffset);
79
- res.push_back(wr);
80
- }
81
- }
82
- maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
83
- }
84
- uIdx++;
85
- }
86
- }
87
- private:
88
- const DictTrie* dictTrie_;
89
- bool isNeedDestroy_;
90
- };
91
- }
92
-
93
- #endif
@@ -1,129 +0,0 @@
1
- #ifndef CPPJIEBA_HMMMODEL_H
2
- #define CPPJIEBA_HMMMODEL_H
3
-
4
- #include "limonp/StringUtil.hpp"
5
- #include "Trie.hpp"
6
-
7
- namespace cppjieba {
8
-
9
- using namespace limonp;
10
- typedef unordered_map<Rune, double> EmitProbMap;
11
-
12
- struct HMMModel {
13
- /*
14
- * STATUS:
15
- * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
16
- * */
17
- enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
18
-
19
- HMMModel(const string& modelPath) {
20
- memset(startProb, 0, sizeof(startProb));
21
- memset(transProb, 0, sizeof(transProb));
22
- statMap[0] = 'B';
23
- statMap[1] = 'E';
24
- statMap[2] = 'M';
25
- statMap[3] = 'S';
26
- emitProbVec.push_back(&emitProbB);
27
- emitProbVec.push_back(&emitProbE);
28
- emitProbVec.push_back(&emitProbM);
29
- emitProbVec.push_back(&emitProbS);
30
- LoadModel(modelPath);
31
- }
32
- ~HMMModel() {
33
- }
34
- void LoadModel(const string& filePath) {
35
- ifstream ifile(filePath.c_str());
36
- XCHECK(ifile.is_open()) << "open " << filePath << " failed";
37
- string line;
38
- vector<string> tmp;
39
- vector<string> tmp2;
40
- //Load startProb
41
- XCHECK(GetLine(ifile, line));
42
- Split(line, tmp, " ");
43
- XCHECK(tmp.size() == STATUS_SUM);
44
- for (size_t j = 0; j< tmp.size(); j++) {
45
- startProb[j] = atof(tmp[j].c_str());
46
- }
47
-
48
- //Load transProb
49
- for (size_t i = 0; i < STATUS_SUM; i++) {
50
- XCHECK(GetLine(ifile, line));
51
- Split(line, tmp, " ");
52
- XCHECK(tmp.size() == STATUS_SUM);
53
- for (size_t j =0; j < STATUS_SUM; j++) {
54
- transProb[i][j] = atof(tmp[j].c_str());
55
- }
56
- }
57
-
58
- //Load emitProbB
59
- XCHECK(GetLine(ifile, line));
60
- XCHECK(LoadEmitProb(line, emitProbB));
61
-
62
- //Load emitProbE
63
- XCHECK(GetLine(ifile, line));
64
- XCHECK(LoadEmitProb(line, emitProbE));
65
-
66
- //Load emitProbM
67
- XCHECK(GetLine(ifile, line));
68
- XCHECK(LoadEmitProb(line, emitProbM));
69
-
70
- //Load emitProbS
71
- XCHECK(GetLine(ifile, line));
72
- XCHECK(LoadEmitProb(line, emitProbS));
73
- }
74
- double GetEmitProb(const EmitProbMap* ptMp, Rune key,
75
- double defVal)const {
76
- EmitProbMap::const_iterator cit = ptMp->find(key);
77
- if (cit == ptMp->end()) {
78
- return defVal;
79
- }
80
- return cit->second;
81
- }
82
- bool GetLine(ifstream& ifile, string& line) {
83
- while (getline(ifile, line)) {
84
- Trim(line);
85
- if (line.empty()) {
86
- continue;
87
- }
88
- if (StartsWith(line, "#")) {
89
- continue;
90
- }
91
- return true;
92
- }
93
- return false;
94
- }
95
- bool LoadEmitProb(const string& line, EmitProbMap& mp) {
96
- if (line.empty()) {
97
- return false;
98
- }
99
- vector<string> tmp, tmp2;
100
- Unicode unicode;
101
- Split(line, tmp, ",");
102
- for (size_t i = 0; i < tmp.size(); i++) {
103
- Split(tmp[i], tmp2, ":");
104
- if (2 != tmp2.size()) {
105
- XLOG(ERROR) << "emitProb illegal.";
106
- return false;
107
- }
108
- if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
109
- XLOG(ERROR) << "TransCode failed.";
110
- return false;
111
- }
112
- mp[unicode[0]] = atof(tmp2[1].c_str());
113
- }
114
- return true;
115
- }
116
-
117
- char statMap[STATUS_SUM];
118
- double startProb[STATUS_SUM];
119
- double transProb[STATUS_SUM][STATUS_SUM];
120
- EmitProbMap emitProbB;
121
- EmitProbMap emitProbE;
122
- EmitProbMap emitProbM;
123
- EmitProbMap emitProbS;
124
- vector<EmitProbMap* > emitProbVec;
125
- }; // struct HMMModel
126
-
127
- } // namespace cppjieba
128
-
129
- #endif