cppjieba_rb 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (130) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +3 -0
  3. data/README.md +1 -1
  4. data/Rakefile +2 -2
  5. data/cppjieba_rb.gemspec +4 -4
  6. data/lib/cppjieba_rb/version.rb +1 -1
  7. metadata +17 -135
  8. data/ext/cppjieba/.gitignore +0 -17
  9. data/ext/cppjieba/.travis.yml +0 -21
  10. data/ext/cppjieba/CMakeLists.txt +0 -28
  11. data/ext/cppjieba/ChangeLog.md +0 -236
  12. data/ext/cppjieba/README.md +0 -292
  13. data/ext/cppjieba/README_EN.md +0 -113
  14. data/ext/cppjieba/appveyor.yml +0 -32
  15. data/ext/cppjieba/deps/CMakeLists.txt +0 -1
  16. data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
  17. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
  28. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
  41. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
  44. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  45. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
  46. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
  47. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
  48. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
  49. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
  50. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
  51. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
  52. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
  53. data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
  54. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
  55. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
  56. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
  57. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
  58. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
  59. data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
  60. data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
  61. data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
  62. data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
  63. data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
  64. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
  65. data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
  66. data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
  67. data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
  68. data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
  69. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
  70. data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
  71. data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
  72. data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
  73. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
  74. data/ext/cppjieba/dict/README.md +0 -31
  75. data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
  76. data/ext/cppjieba/dict/idf.utf8 +0 -258826
  77. data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
  78. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
  79. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
  80. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
  81. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
  82. data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
  83. data/ext/cppjieba/dict/user.dict.utf8 +0 -4
  84. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
  85. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
  86. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
  87. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
  88. data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
  89. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
  90. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
  91. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
  92. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
  93. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
  94. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
  95. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
  96. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
  97. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
  98. data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
  99. data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
  100. data/ext/cppjieba/test/CMakeLists.txt +0 -5
  101. data/ext/cppjieba/test/demo.cpp +0 -80
  102. data/ext/cppjieba/test/load_test.cpp +0 -54
  103. data/ext/cppjieba/test/testdata/curl.res +0 -1
  104. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
  105. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
  106. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
  107. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
  108. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
  109. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
  110. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
  111. data/ext/cppjieba/test/testdata/load_test.urls +0 -2
  112. data/ext/cppjieba/test/testdata/review.100 +0 -100
  113. data/ext/cppjieba/test/testdata/review.100.res +0 -200
  114. data/ext/cppjieba/test/testdata/server.conf +0 -19
  115. data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
  116. data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
  117. data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
  118. data/ext/cppjieba/test/testdata/userdict.english +0 -2
  119. data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
  120. data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
  121. data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
  122. data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
  123. data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
  124. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
  125. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
  126. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
  127. data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
  128. data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
  129. data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
  130. data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
@@ -1,4 +0,0 @@
1
- 云计算
2
- 韩玉鉴赏
3
- 蓝翔 nz
4
- 区块链 10 nz
@@ -1,277 +0,0 @@
1
- #ifndef CPPJIEBA_DICT_TRIE_HPP
2
- #define CPPJIEBA_DICT_TRIE_HPP
3
-
4
- #include <iostream>
5
- #include <fstream>
6
- #include <map>
7
- #include <string>
8
- #include <cstring>
9
- #include <cstdlib>
10
- #include <stdint.h>
11
- #include <cmath>
12
- #include <limits>
13
- #include "limonp/StringUtil.hpp"
14
- #include "limonp/Logging.hpp"
15
- #include "Unicode.hpp"
16
- #include "Trie.hpp"
17
-
18
- namespace cppjieba {
19
-
20
- using namespace limonp;
21
-
// Namespace-level constants for the dictionary loader.
// MIN_DOUBLE / MAX_DOUBLE: large-magnitude sentinel values; they are not
// referenced by the DictTrie code shown in this file.
22
- const double MIN_DOUBLE = -3.14e+100;
23
- const double MAX_DOUBLE = 3.14e+100;
// Number of space-separated columns every main-dictionary line must have;
// DictTrie::LoadDict checks each line against it.
24
- const size_t DICT_COLUMN_NUM = 3;
// Empty tag used as the default `tag` argument of InsertUserWord and for
// one-column user-dictionary lines.
25
- const char* const UNKNOWN_TAG = "";
26
-
27
- class DictTrie {
28
- public:
29
- enum UserWordWeightOption {
30
- WordWeightMin,
31
- WordWeightMedian,
32
- WordWeightMax,
33
- }; // enum UserWordWeightOption
34
-
35
- DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
36
- Init(dict_path, user_dict_paths, user_word_weight_opt);
37
- }
38
-
39
- ~DictTrie() {
40
- delete trie_;
41
- }
42
-
43
- bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
44
- DictUnit node_info;
45
- if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
46
- return false;
47
- }
48
- active_node_infos_.push_back(node_info);
49
- trie_->InsertNode(node_info.word, &active_node_infos_.back());
50
- return true;
51
- }
52
-
53
- bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
54
- DictUnit node_info;
55
- double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
56
- if (!MakeNodeInfo(node_info, word, weight , tag)) {
57
- return false;
58
- }
59
- active_node_infos_.push_back(node_info);
60
- trie_->InsertNode(node_info.word, &active_node_infos_.back());
61
- return true;
62
- }
63
-
64
- const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
65
- return trie_->Find(begin, end);
66
- }
67
-
68
- void Find(RuneStrArray::const_iterator begin,
69
- RuneStrArray::const_iterator end,
70
- vector<struct Dag>&res,
71
- size_t max_word_len = MAX_WORD_LENGTH) const {
72
- trie_->Find(begin, end, res, max_word_len);
73
- }
74
-
75
- bool Find(const string& word)
76
- {
77
- const DictUnit *tmp = NULL;
78
- RuneStrArray runes;
79
- if (!DecodeRunesInString(word, runes))
80
- {
81
- XLOG(ERROR) << "Decode failed.";
82
- }
83
- tmp = Find(runes.begin(), runes.end());
84
- if (tmp == NULL)
85
- {
86
- return false;
87
- }
88
- else
89
- {
90
- return true;
91
- }
92
- }
93
-
94
- bool IsUserDictSingleChineseWord(const Rune& word) const {
95
- return IsIn(user_dict_single_chinese_word_, word);
96
- }
97
-
98
- double GetMinWeight() const {
99
- return min_weight_;
100
- }
101
-
102
- void InserUserDictNode(const string& line) {
103
- vector<string> buf;
104
- DictUnit node_info;
105
- Split(line, buf, " ");
106
- if(buf.size() == 1){
107
- MakeNodeInfo(node_info,
108
- buf[0],
109
- user_word_default_weight_,
110
- UNKNOWN_TAG);
111
- } else if (buf.size() == 2) {
112
- MakeNodeInfo(node_info,
113
- buf[0],
114
- user_word_default_weight_,
115
- buf[1]);
116
- } else if (buf.size() == 3) {
117
- int freq = atoi(buf[1].c_str());
118
- assert(freq_sum_ > 0.0);
119
- double weight = log(1.0 * freq / freq_sum_);
120
- MakeNodeInfo(node_info, buf[0], weight, buf[2]);
121
- }
122
- static_node_infos_.push_back(node_info);
123
- if (node_info.word.size() == 1) {
124
- user_dict_single_chinese_word_.insert(node_info.word[0]);
125
- }
126
- }
127
-
128
- void LoadUserDict(const vector<string>& buf) {
129
- for (size_t i = 0; i < buf.size(); i++) {
130
- InserUserDictNode(buf[i]);
131
- }
132
- }
133
-
134
- void LoadUserDict(const set<string>& buf) {
135
- std::set<string>::const_iterator iter;
136
- for (iter = buf.begin(); iter != buf.end(); iter++){
137
- InserUserDictNode(*iter);
138
- }
139
- }
140
-
141
- void LoadUserDict(const string& filePaths) {
142
- vector<string> files = limonp::Split(filePaths, "|;");
143
- size_t lineno = 0;
144
- for (size_t i = 0; i < files.size(); i++) {
145
- ifstream ifs(files[i].c_str());
146
- XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
147
- string line;
148
-
149
- for (; getline(ifs, line); lineno++) {
150
- if (line.size() == 0) {
151
- continue;
152
- }
153
- InserUserDictNode(line);
154
- }
155
- }
156
- }
157
-
158
-
159
- private:
160
- void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
161
- LoadDict(dict_path);
162
- freq_sum_ = CalcFreqSum(static_node_infos_);
163
- CalculateWeight(static_node_infos_, freq_sum_);
164
- SetStaticWordWeights(user_word_weight_opt);
165
-
166
- if (user_dict_paths.size()) {
167
- LoadUserDict(user_dict_paths);
168
- }
169
- Shrink(static_node_infos_);
170
- CreateTrie(static_node_infos_);
171
- }
172
-
173
- void CreateTrie(const vector<DictUnit>& dictUnits) {
174
- assert(dictUnits.size());
175
- vector<Unicode> words;
176
- vector<const DictUnit*> valuePointers;
177
- for (size_t i = 0 ; i < dictUnits.size(); i ++) {
178
- words.push_back(dictUnits[i].word);
179
- valuePointers.push_back(&dictUnits[i]);
180
- }
181
-
182
- trie_ = new Trie(words, valuePointers);
183
- }
184
-
185
-
186
-
187
-
188
- bool MakeNodeInfo(DictUnit& node_info,
189
- const string& word,
190
- double weight,
191
- const string& tag) {
192
- if (!DecodeRunesInString(word, node_info.word)) {
193
- XLOG(ERROR) << "Decode " << word << " failed.";
194
- return false;
195
- }
196
- node_info.weight = weight;
197
- node_info.tag = tag;
198
- return true;
199
- }
200
-
201
- void LoadDict(const string& filePath) {
202
- ifstream ifs(filePath.c_str());
203
- XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
204
- string line;
205
- vector<string> buf;
206
-
207
- DictUnit node_info;
208
- for (size_t lineno = 0; getline(ifs, line); lineno++) {
209
- Split(line, buf, " ");
210
- XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
211
- MakeNodeInfo(node_info,
212
- buf[0],
213
- atof(buf[1].c_str()),
214
- buf[2]);
215
- static_node_infos_.push_back(node_info);
216
- }
217
- }
218
-
219
- static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
220
- return lhs.weight < rhs.weight;
221
- }
222
-
223
- void SetStaticWordWeights(UserWordWeightOption option) {
224
- XCHECK(!static_node_infos_.empty());
225
- vector<DictUnit> x = static_node_infos_;
226
- sort(x.begin(), x.end(), WeightCompare);
227
- min_weight_ = x[0].weight;
228
- max_weight_ = x[x.size() - 1].weight;
229
- median_weight_ = x[x.size() / 2].weight;
230
- switch (option) {
231
- case WordWeightMin:
232
- user_word_default_weight_ = min_weight_;
233
- break;
234
- case WordWeightMedian:
235
- user_word_default_weight_ = median_weight_;
236
- break;
237
- default:
238
- user_word_default_weight_ = max_weight_;
239
- break;
240
- }
241
- }
242
-
243
- double CalcFreqSum(const vector<DictUnit>& node_infos) const {
244
- double sum = 0.0;
245
- for (size_t i = 0; i < node_infos.size(); i++) {
246
- sum += node_infos[i].weight;
247
- }
248
- return sum;
249
- }
250
-
251
- void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
252
- assert(sum > 0.0);
253
- for (size_t i = 0; i < node_infos.size(); i++) {
254
- DictUnit& node_info = node_infos[i];
255
- assert(node_info.weight > 0.0);
256
- node_info.weight = log(double(node_info.weight)/sum);
257
- }
258
- }
259
-
260
- void Shrink(vector<DictUnit>& units) const {
261
- vector<DictUnit>(units.begin(), units.end()).swap(units);
262
- }
263
-
264
- vector<DictUnit> static_node_infos_;
265
- deque<DictUnit> active_node_infos_; // must not be vector
266
- Trie * trie_;
267
-
268
- double freq_sum_;
269
- double min_weight_;
270
- double max_weight_;
271
- double median_weight_;
272
- double user_word_default_weight_;
273
- unordered_set<Rune> user_dict_single_chinese_word_;
274
- };
275
- }
276
-
277
- #endif
@@ -1,93 +0,0 @@
1
- #ifndef CPPJIEBA_FULLSEGMENT_H
2
- #define CPPJIEBA_FULLSEGMENT_H
3
-
4
- #include <algorithm>
5
- #include <set>
6
- #include <cassert>
7
- #include "limonp/Logging.hpp"
8
- #include "DictTrie.hpp"
9
- #include "SegmentBase.hpp"
10
- #include "Unicode.hpp"
11
-
12
- namespace cppjieba {
13
- class FullSegment: public SegmentBase {
14
- public:
15
- FullSegment(const string& dictPath) {
16
- dictTrie_ = new DictTrie(dictPath);
17
- isNeedDestroy_ = true;
18
- }
19
- FullSegment(const DictTrie* dictTrie)
20
- : dictTrie_(dictTrie), isNeedDestroy_(false) {
21
- assert(dictTrie_);
22
- }
23
- ~FullSegment() {
24
- if (isNeedDestroy_) {
25
- delete dictTrie_;
26
- }
27
- }
28
- void Cut(const string& sentence,
29
- vector<string>& words) const {
30
- vector<Word> tmp;
31
- Cut(sentence, tmp);
32
- GetStringsFromWords(tmp, words);
33
- }
34
- void Cut(const string& sentence,
35
- vector<Word>& words) const {
36
- PreFilter pre_filter(symbols_, sentence);
37
- PreFilter::Range range;
38
- vector<WordRange> wrs;
39
- wrs.reserve(sentence.size()/2);
40
- while (pre_filter.HasNext()) {
41
- range = pre_filter.Next();
42
- Cut(range.begin, range.end, wrs);
43
- }
44
- words.clear();
45
- words.reserve(wrs.size());
46
- GetWordsFromWordRanges(sentence, wrs, words);
47
- }
48
- void Cut(RuneStrArray::const_iterator begin,
49
- RuneStrArray::const_iterator end,
50
- vector<WordRange>& res) const {
51
- // resut of searching in trie tree
52
- LocalVector<pair<size_t, const DictUnit*> > tRes;
53
-
54
- // max index of res's words
55
- size_t maxIdx = 0;
56
-
57
- // always equals to (uItr - begin)
58
- size_t uIdx = 0;
59
-
60
- // tmp variables
61
- size_t wordLen = 0;
62
- assert(dictTrie_);
63
- vector<struct Dag> dags;
64
- dictTrie_->Find(begin, end, dags);
65
- for (size_t i = 0; i < dags.size(); i++) {
66
- for (size_t j = 0; j < dags[i].nexts.size(); j++) {
67
- size_t nextoffset = dags[i].nexts[j].first;
68
- assert(nextoffset < dags.size());
69
- const DictUnit* du = dags[i].nexts[j].second;
70
- if (du == NULL) {
71
- if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
72
- WordRange wr(begin + i, begin + nextoffset);
73
- res.push_back(wr);
74
- }
75
- } else {
76
- wordLen = du->word.size();
77
- if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
78
- WordRange wr(begin + i, begin + nextoffset);
79
- res.push_back(wr);
80
- }
81
- }
82
- maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
83
- }
84
- uIdx++;
85
- }
86
- }
87
- private:
88
- const DictTrie* dictTrie_;
89
- bool isNeedDestroy_;
90
- };
91
- }
92
-
93
- #endif
@@ -1,129 +0,0 @@
1
- #ifndef CPPJIEBA_HMMMODEL_H
2
- #define CPPJIEBA_HMMMODEL_H
3
-
4
- #include "limonp/StringUtil.hpp"
5
- #include "Trie.hpp"
6
-
7
- namespace cppjieba {
8
-
9
- using namespace limonp;
// Map from a Rune to a double; HMMModel uses one such map per status
// (emitProbB / emitProbE / emitProbM / emitProbS).
10
- typedef unordered_map<Rune, double> EmitProbMap;
11
-
12
- struct HMMModel {
13
- /*
14
- * STATUS:
15
- * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
16
- * */
17
- enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
18
-
19
- HMMModel(const string& modelPath) {
20
- memset(startProb, 0, sizeof(startProb));
21
- memset(transProb, 0, sizeof(transProb));
22
- statMap[0] = 'B';
23
- statMap[1] = 'E';
24
- statMap[2] = 'M';
25
- statMap[3] = 'S';
26
- emitProbVec.push_back(&emitProbB);
27
- emitProbVec.push_back(&emitProbE);
28
- emitProbVec.push_back(&emitProbM);
29
- emitProbVec.push_back(&emitProbS);
30
- LoadModel(modelPath);
31
- }
32
- ~HMMModel() {
33
- }
34
- void LoadModel(const string& filePath) {
35
- ifstream ifile(filePath.c_str());
36
- XCHECK(ifile.is_open()) << "open " << filePath << " failed";
37
- string line;
38
- vector<string> tmp;
39
- vector<string> tmp2;
40
- //Load startProb
41
- XCHECK(GetLine(ifile, line));
42
- Split(line, tmp, " ");
43
- XCHECK(tmp.size() == STATUS_SUM);
44
- for (size_t j = 0; j< tmp.size(); j++) {
45
- startProb[j] = atof(tmp[j].c_str());
46
- }
47
-
48
- //Load transProb
49
- for (size_t i = 0; i < STATUS_SUM; i++) {
50
- XCHECK(GetLine(ifile, line));
51
- Split(line, tmp, " ");
52
- XCHECK(tmp.size() == STATUS_SUM);
53
- for (size_t j =0; j < STATUS_SUM; j++) {
54
- transProb[i][j] = atof(tmp[j].c_str());
55
- }
56
- }
57
-
58
- //Load emitProbB
59
- XCHECK(GetLine(ifile, line));
60
- XCHECK(LoadEmitProb(line, emitProbB));
61
-
62
- //Load emitProbE
63
- XCHECK(GetLine(ifile, line));
64
- XCHECK(LoadEmitProb(line, emitProbE));
65
-
66
- //Load emitProbM
67
- XCHECK(GetLine(ifile, line));
68
- XCHECK(LoadEmitProb(line, emitProbM));
69
-
70
- //Load emitProbS
71
- XCHECK(GetLine(ifile, line));
72
- XCHECK(LoadEmitProb(line, emitProbS));
73
- }
74
- double GetEmitProb(const EmitProbMap* ptMp, Rune key,
75
- double defVal)const {
76
- EmitProbMap::const_iterator cit = ptMp->find(key);
77
- if (cit == ptMp->end()) {
78
- return defVal;
79
- }
80
- return cit->second;
81
- }
82
- bool GetLine(ifstream& ifile, string& line) {
83
- while (getline(ifile, line)) {
84
- Trim(line);
85
- if (line.empty()) {
86
- continue;
87
- }
88
- if (StartsWith(line, "#")) {
89
- continue;
90
- }
91
- return true;
92
- }
93
- return false;
94
- }
95
- bool LoadEmitProb(const string& line, EmitProbMap& mp) {
96
- if (line.empty()) {
97
- return false;
98
- }
99
- vector<string> tmp, tmp2;
100
- Unicode unicode;
101
- Split(line, tmp, ",");
102
- for (size_t i = 0; i < tmp.size(); i++) {
103
- Split(tmp[i], tmp2, ":");
104
- if (2 != tmp2.size()) {
105
- XLOG(ERROR) << "emitProb illegal.";
106
- return false;
107
- }
108
- if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
109
- XLOG(ERROR) << "TransCode failed.";
110
- return false;
111
- }
112
- mp[unicode[0]] = atof(tmp2[1].c_str());
113
- }
114
- return true;
115
- }
116
-
117
- char statMap[STATUS_SUM];
118
- double startProb[STATUS_SUM];
119
- double transProb[STATUS_SUM][STATUS_SUM];
120
- EmitProbMap emitProbB;
121
- EmitProbMap emitProbE;
122
- EmitProbMap emitProbM;
123
- EmitProbMap emitProbS;
124
- vector<EmitProbMap* > emitProbVec;
125
- }; // struct HMMModel
126
-
127
- } // namespace cppjieba
128
-
129
- #endif