cppjieba_rb 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +26 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +81 -0
  8. data/Rakefile +20 -0
  9. data/cppjieba_rb.gemspec +50 -0
  10. data/ext/cppjieba/.gitignore +17 -0
  11. data/ext/cppjieba/.travis.yml +22 -0
  12. data/ext/cppjieba/CMakeLists.txt +28 -0
  13. data/ext/cppjieba/ChangeLog.md +236 -0
  14. data/ext/cppjieba/README.md +285 -0
  15. data/ext/cppjieba/README_EN.md +111 -0
  16. data/ext/cppjieba/appveyor.yml +32 -0
  17. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  18. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  42. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  45. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  46. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  56. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  57. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  58. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  59. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  60. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  61. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  62. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  63. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  64. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  65. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  66. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  67. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  68. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  69. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  70. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  71. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  72. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  73. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  74. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  75. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  76. data/ext/cppjieba/dict/README.md +31 -0
  77. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  78. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  79. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  80. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  83. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  84. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  85. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  86. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  87. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  88. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  89. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  90. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  91. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  92. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  93. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  94. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  95. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  96. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  98. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  99. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  100. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  101. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  102. data/ext/cppjieba/test/CMakeLists.txt +5 -0
  103. data/ext/cppjieba/test/demo.cpp +80 -0
  104. data/ext/cppjieba/test/load_test.cpp +54 -0
  105. data/ext/cppjieba/test/testdata/curl.res +1 -0
  106. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  107. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  108. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  109. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  110. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  111. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  112. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  113. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  114. data/ext/cppjieba/test/testdata/review.100 +100 -0
  115. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  116. data/ext/cppjieba/test/testdata/server.conf +19 -0
  117. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  118. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  119. data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  120. data/ext/cppjieba/test/testdata/userdict.english +2 -0
  121. data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
  122. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  123. data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
  124. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  125. data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
  126. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  127. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  128. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  129. data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
  130. data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
  131. data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
  132. data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
  133. data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
  134. data/ext/cppjieba_rb/extconf.rb +26 -0
  135. data/ext/cppjieba_rb/internal.cc +148 -0
  136. data/lib/cppjieba_rb/segment.rb +20 -0
  137. data/lib/cppjieba_rb/version.rb +3 -0
  138. data/lib/cppjieba_rb.rb +34 -0
  139. data/test/test_keyword.rb +17 -0
  140. data/test/test_segment.rb +24 -0
  141. data/test/test_tagging.rb +19 -0
  142. metadata +244 -0
@@ -0,0 +1,4 @@
1
+ 云计算
2
+ 韩玉鉴赏
3
+ 蓝翔 nz
4
+ 区块链 10 nz
@@ -0,0 +1,227 @@
1
+ #ifndef CPPJIEBA_DICT_TRIE_HPP
2
+ #define CPPJIEBA_DICT_TRIE_HPP
3
+
4
+ #include <iostream>
5
+ #include <fstream>
6
+ #include <map>
7
+ #include <string>
8
+ #include <cstring>
9
+ #include <cstdlib>
10
+ #include <stdint.h>
11
+ #include <cmath>
12
+ #include <limits>
13
+ #include "limonp/StringUtil.hpp"
14
+ #include "limonp/Logging.hpp"
15
+ #include "Unicode.hpp"
16
+ #include "Trie.hpp"
17
+
18
+ namespace cppjieba {
19
+
20
+ using namespace limonp;
21
+
22
+ const double MIN_DOUBLE = -3.14e+100;
23
+ const double MAX_DOUBLE = 3.14e+100;
24
+ const size_t DICT_COLUMN_NUM = 3;
25
+ const char* const UNKNOWN_TAG = "";
26
+
27
+ class DictTrie {
28
+ public:
29
+ enum UserWordWeightOption {
30
+ WordWeightMin,
31
+ WordWeightMedian,
32
+ WordWeightMax,
33
+ }; // enum UserWordWeightOption
34
+
35
+ DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
36
+ Init(dict_path, user_dict_paths, user_word_weight_opt);
37
+ }
38
+
39
+ ~DictTrie() {
40
+ delete trie_;
41
+ }
42
+
43
+ bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
44
+ DictUnit node_info;
45
+ if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
46
+ return false;
47
+ }
48
+ active_node_infos_.push_back(node_info);
49
+ trie_->InsertNode(node_info.word, &active_node_infos_.back());
50
+ return true;
51
+ }
52
+
53
+ const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
54
+ return trie_->Find(begin, end);
55
+ }
56
+
57
+ void Find(RuneStrArray::const_iterator begin,
58
+ RuneStrArray::const_iterator end,
59
+ vector<struct Dag>&res,
60
+ size_t max_word_len = MAX_WORD_LENGTH) const {
61
+ trie_->Find(begin, end, res, max_word_len);
62
+ }
63
+
64
+ bool IsUserDictSingleChineseWord(const Rune& word) const {
65
+ return IsIn(user_dict_single_chinese_word_, word);
66
+ }
67
+
68
+ double GetMinWeight() const {
69
+ return min_weight_;
70
+ }
71
+
72
+ private:
73
+ void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
74
+ LoadDict(dict_path);
75
+ freq_sum_ = CalcFreqSum(static_node_infos_);
76
+ CalculateWeight(static_node_infos_, freq_sum_);
77
+ SetStaticWordWeights(user_word_weight_opt);
78
+
79
+ if (user_dict_paths.size()) {
80
+ LoadUserDict(user_dict_paths);
81
+ }
82
+ Shrink(static_node_infos_);
83
+ CreateTrie(static_node_infos_);
84
+ }
85
+
86
+ void CreateTrie(const vector<DictUnit>& dictUnits) {
87
+ assert(dictUnits.size());
88
+ vector<Unicode> words;
89
+ vector<const DictUnit*> valuePointers;
90
+ for (size_t i = 0 ; i < dictUnits.size(); i ++) {
91
+ words.push_back(dictUnits[i].word);
92
+ valuePointers.push_back(&dictUnits[i]);
93
+ }
94
+
95
+ trie_ = new Trie(words, valuePointers);
96
+ }
97
+
98
+ void LoadUserDict(const string& filePaths) {
99
+ vector<string> files = limonp::Split(filePaths, "|;");
100
+ size_t lineno = 0;
101
+ for (size_t i = 0; i < files.size(); i++) {
102
+ ifstream ifs(files[i].c_str());
103
+ XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
104
+ string line;
105
+ DictUnit node_info;
106
+ vector<string> buf;
107
+ for (; getline(ifs, line); lineno++) {
108
+ if (line.size() == 0) {
109
+ continue;
110
+ }
111
+ buf.clear();
112
+ Split(line, buf, " ");
113
+ DictUnit node_info;
114
+ if(buf.size() == 1){
115
+ MakeNodeInfo(node_info,
116
+ buf[0],
117
+ user_word_default_weight_,
118
+ UNKNOWN_TAG);
119
+ } else if (buf.size() == 2) {
120
+ MakeNodeInfo(node_info,
121
+ buf[0],
122
+ user_word_default_weight_,
123
+ buf[1]);
124
+ } else if (buf.size() == 3) {
125
+ int freq = atoi(buf[1].c_str());
126
+ assert(freq_sum_ > 0.0);
127
+ double weight = log(1.0 * freq / freq_sum_);
128
+ MakeNodeInfo(node_info, buf[0], weight, buf[2]);
129
+ }
130
+ static_node_infos_.push_back(node_info);
131
+ if (node_info.word.size() == 1) {
132
+ user_dict_single_chinese_word_.insert(node_info.word[0]);
133
+ }
134
+ }
135
+ }
136
+ }
137
+
138
+ bool MakeNodeInfo(DictUnit& node_info,
139
+ const string& word,
140
+ double weight,
141
+ const string& tag) {
142
+ if (!DecodeRunesInString(word, node_info.word)) {
143
+ XLOG(ERROR) << "Decode " << word << " failed.";
144
+ return false;
145
+ }
146
+ node_info.weight = weight;
147
+ node_info.tag = tag;
148
+ return true;
149
+ }
150
+
151
+ void LoadDict(const string& filePath) {
152
+ ifstream ifs(filePath.c_str());
153
+ XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
154
+ string line;
155
+ vector<string> buf;
156
+
157
+ DictUnit node_info;
158
+ for (size_t lineno = 0; getline(ifs, line); lineno++) {
159
+ Split(line, buf, " ");
160
+ XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
161
+ MakeNodeInfo(node_info,
162
+ buf[0],
163
+ atof(buf[1].c_str()),
164
+ buf[2]);
165
+ static_node_infos_.push_back(node_info);
166
+ }
167
+ }
168
+
169
+ static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
170
+ return lhs.weight < rhs.weight;
171
+ }
172
+
173
+ void SetStaticWordWeights(UserWordWeightOption option) {
174
+ XCHECK(!static_node_infos_.empty());
175
+ vector<DictUnit> x = static_node_infos_;
176
+ sort(x.begin(), x.end(), WeightCompare);
177
+ min_weight_ = x[0].weight;
178
+ max_weight_ = x[x.size() - 1].weight;
179
+ median_weight_ = x[x.size() / 2].weight;
180
+ switch (option) {
181
+ case WordWeightMin:
182
+ user_word_default_weight_ = min_weight_;
183
+ break;
184
+ case WordWeightMedian:
185
+ user_word_default_weight_ = median_weight_;
186
+ break;
187
+ default:
188
+ user_word_default_weight_ = max_weight_;
189
+ break;
190
+ }
191
+ }
192
+
193
+ double CalcFreqSum(const vector<DictUnit>& node_infos) const {
194
+ double sum = 0.0;
195
+ for (size_t i = 0; i < node_infos.size(); i++) {
196
+ sum += node_infos[i].weight;
197
+ }
198
+ return sum;
199
+ }
200
+
201
+ void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
202
+ assert(sum > 0.0);
203
+ for (size_t i = 0; i < node_infos.size(); i++) {
204
+ DictUnit& node_info = node_infos[i];
205
+ assert(node_info.weight > 0.0);
206
+ node_info.weight = log(double(node_info.weight)/sum);
207
+ }
208
+ }
209
+
210
+ void Shrink(vector<DictUnit>& units) const {
211
+ vector<DictUnit>(units.begin(), units.end()).swap(units);
212
+ }
213
+
214
+ vector<DictUnit> static_node_infos_;
215
+ deque<DictUnit> active_node_infos_; // must not be vector
216
+ Trie * trie_;
217
+
218
+ double freq_sum_;
219
+ double min_weight_;
220
+ double max_weight_;
221
+ double median_weight_;
222
+ double user_word_default_weight_;
223
+ unordered_set<Rune> user_dict_single_chinese_word_;
224
+ };
225
+ }
226
+
227
+ #endif
@@ -0,0 +1,93 @@
1
+ #ifndef CPPJIEBA_FULLSEGMENT_H
2
+ #define CPPJIEBA_FULLSEGMENT_H
3
+
4
+ #include <algorithm>
5
+ #include <set>
6
+ #include <cassert>
7
+ #include "limonp/Logging.hpp"
8
+ #include "DictTrie.hpp"
9
+ #include "SegmentBase.hpp"
10
+ #include "Unicode.hpp"
11
+
12
+ namespace cppjieba {
13
+ class FullSegment: public SegmentBase {
14
+ public:
15
+ FullSegment(const string& dictPath) {
16
+ dictTrie_ = new DictTrie(dictPath);
17
+ isNeedDestroy_ = true;
18
+ }
19
+ FullSegment(const DictTrie* dictTrie)
20
+ : dictTrie_(dictTrie), isNeedDestroy_(false) {
21
+ assert(dictTrie_);
22
+ }
23
+ ~FullSegment() {
24
+ if (isNeedDestroy_) {
25
+ delete dictTrie_;
26
+ }
27
+ }
28
+ void Cut(const string& sentence,
29
+ vector<string>& words) const {
30
+ vector<Word> tmp;
31
+ Cut(sentence, tmp);
32
+ GetStringsFromWords(tmp, words);
33
+ }
34
+ void Cut(const string& sentence,
35
+ vector<Word>& words) const {
36
+ PreFilter pre_filter(symbols_, sentence);
37
+ PreFilter::Range range;
38
+ vector<WordRange> wrs;
39
+ wrs.reserve(sentence.size()/2);
40
+ while (pre_filter.HasNext()) {
41
+ range = pre_filter.Next();
42
+ Cut(range.begin, range.end, wrs);
43
+ }
44
+ words.clear();
45
+ words.reserve(wrs.size());
46
+ GetWordsFromWordRanges(sentence, wrs, words);
47
+ }
48
+ void Cut(RuneStrArray::const_iterator begin,
49
+ RuneStrArray::const_iterator end,
50
+ vector<WordRange>& res) const {
51
+ //resut of searching in trie tree
52
+ LocalVector<pair<size_t, const DictUnit*> > tRes;
53
+
54
+ //max index of res's words
55
+ int maxIdx = 0;
56
+
57
+ // always equals to (uItr - begin)
58
+ int uIdx = 0;
59
+
60
+ //tmp variables
61
+ int wordLen = 0;
62
+ assert(dictTrie_);
63
+ vector<struct Dag> dags;
64
+ dictTrie_->Find(begin, end, dags);
65
+ for (size_t i = 0; i < dags.size(); i++) {
66
+ for (size_t j = 0; j < dags[i].nexts.size(); j++) {
67
+ size_t nextoffset = dags[i].nexts[j].first;
68
+ assert(nextoffset < dags.size());
69
+ const DictUnit* du = dags[i].nexts[j].second;
70
+ if (du == NULL) {
71
+ if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
72
+ WordRange wr(begin + i, begin + nextoffset);
73
+ res.push_back(wr);
74
+ }
75
+ } else {
76
+ wordLen = du->word.size();
77
+ if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
78
+ WordRange wr(begin + i, begin + nextoffset);
79
+ res.push_back(wr);
80
+ }
81
+ }
82
+ maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
83
+ }
84
+ uIdx++;
85
+ }
86
+ }
87
+ private:
88
+ const DictTrie* dictTrie_;
89
+ bool isNeedDestroy_;
90
+ };
91
+ }
92
+
93
+ #endif
@@ -0,0 +1,129 @@
1
+ #ifndef CPPJIEBA_HMMMODEL_H
2
+ #define CPPJIEBA_HMMMODEL_H
3
+
4
+ #include "limonp/StringUtil.hpp"
5
+ #include "Trie.hpp"
6
+
7
+ namespace cppjieba {
8
+
9
+ using namespace limonp;
10
+ typedef unordered_map<Rune, double> EmitProbMap;
11
+
12
+ struct HMMModel {
13
+ /*
14
+ * STATUS:
15
+ * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
16
+ * */
17
+ enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
18
+
19
+ HMMModel(const string& modelPath) {
20
+ memset(startProb, 0, sizeof(startProb));
21
+ memset(transProb, 0, sizeof(transProb));
22
+ statMap[0] = 'B';
23
+ statMap[1] = 'E';
24
+ statMap[2] = 'M';
25
+ statMap[3] = 'S';
26
+ emitProbVec.push_back(&emitProbB);
27
+ emitProbVec.push_back(&emitProbE);
28
+ emitProbVec.push_back(&emitProbM);
29
+ emitProbVec.push_back(&emitProbS);
30
+ LoadModel(modelPath);
31
+ }
32
+ ~HMMModel() {
33
+ }
34
+ void LoadModel(const string& filePath) {
35
+ ifstream ifile(filePath.c_str());
36
+ XCHECK(ifile.is_open()) << "open " << filePath << " failed";
37
+ string line;
38
+ vector<string> tmp;
39
+ vector<string> tmp2;
40
+ //Load startProb
41
+ XCHECK(GetLine(ifile, line));
42
+ Split(line, tmp, " ");
43
+ XCHECK(tmp.size() == STATUS_SUM);
44
+ for (size_t j = 0; j< tmp.size(); j++) {
45
+ startProb[j] = atof(tmp[j].c_str());
46
+ }
47
+
48
+ //Load transProb
49
+ for (size_t i = 0; i < STATUS_SUM; i++) {
50
+ XCHECK(GetLine(ifile, line));
51
+ Split(line, tmp, " ");
52
+ XCHECK(tmp.size() == STATUS_SUM);
53
+ for (size_t j =0; j < STATUS_SUM; j++) {
54
+ transProb[i][j] = atof(tmp[j].c_str());
55
+ }
56
+ }
57
+
58
+ //Load emitProbB
59
+ XCHECK(GetLine(ifile, line));
60
+ XCHECK(LoadEmitProb(line, emitProbB));
61
+
62
+ //Load emitProbE
63
+ XCHECK(GetLine(ifile, line));
64
+ XCHECK(LoadEmitProb(line, emitProbE));
65
+
66
+ //Load emitProbM
67
+ XCHECK(GetLine(ifile, line));
68
+ XCHECK(LoadEmitProb(line, emitProbM));
69
+
70
+ //Load emitProbS
71
+ XCHECK(GetLine(ifile, line));
72
+ XCHECK(LoadEmitProb(line, emitProbS));
73
+ }
74
+ double GetEmitProb(const EmitProbMap* ptMp, Rune key,
75
+ double defVal)const {
76
+ EmitProbMap::const_iterator cit = ptMp->find(key);
77
+ if (cit == ptMp->end()) {
78
+ return defVal;
79
+ }
80
+ return cit->second;
81
+ }
82
+ bool GetLine(ifstream& ifile, string& line) {
83
+ while (getline(ifile, line)) {
84
+ Trim(line);
85
+ if (line.empty()) {
86
+ continue;
87
+ }
88
+ if (StartsWith(line, "#")) {
89
+ continue;
90
+ }
91
+ return true;
92
+ }
93
+ return false;
94
+ }
95
+ bool LoadEmitProb(const string& line, EmitProbMap& mp) {
96
+ if (line.empty()) {
97
+ return false;
98
+ }
99
+ vector<string> tmp, tmp2;
100
+ Unicode unicode;
101
+ Split(line, tmp, ",");
102
+ for (size_t i = 0; i < tmp.size(); i++) {
103
+ Split(tmp[i], tmp2, ":");
104
+ if (2 != tmp2.size()) {
105
+ XLOG(ERROR) << "emitProb illegal.";
106
+ return false;
107
+ }
108
+ if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
109
+ XLOG(ERROR) << "TransCode failed.";
110
+ return false;
111
+ }
112
+ mp[unicode[0]] = atof(tmp2[1].c_str());
113
+ }
114
+ return true;
115
+ }
116
+
117
+ char statMap[STATUS_SUM];
118
+ double startProb[STATUS_SUM];
119
+ double transProb[STATUS_SUM][STATUS_SUM];
120
+ EmitProbMap emitProbB;
121
+ EmitProbMap emitProbE;
122
+ EmitProbMap emitProbM;
123
+ EmitProbMap emitProbS;
124
+ vector<EmitProbMap* > emitProbVec;
125
+ }; // struct HMMModel
126
+
127
+ } // namespace cppjieba
128
+
129
+ #endif
@@ -0,0 +1,190 @@
1
+ #ifndef CPPJIBEA_HMMSEGMENT_H
2
+ #define CPPJIBEA_HMMSEGMENT_H
3
+
4
+ #include <iostream>
5
+ #include <fstream>
6
+ #include <memory.h>
7
+ #include <cassert>
8
+ #include "HMMModel.hpp"
9
+ #include "SegmentBase.hpp"
10
+
11
+ namespace cppjieba {
12
+ class HMMSegment: public SegmentBase {
13
+ public:
14
+ HMMSegment(const string& filePath)
15
+ : model_(new HMMModel(filePath)), isNeedDestroy_(true) {
16
+ }
17
+ HMMSegment(const HMMModel* model)
18
+ : model_(model), isNeedDestroy_(false) {
19
+ }
20
+ ~HMMSegment() {
21
+ if (isNeedDestroy_) {
22
+ delete model_;
23
+ }
24
+ }
25
+
26
+ void Cut(const string& sentence,
27
+ vector<string>& words) const {
28
+ vector<Word> tmp;
29
+ Cut(sentence, tmp);
30
+ GetStringsFromWords(tmp, words);
31
+ }
32
+ void Cut(const string& sentence,
33
+ vector<Word>& words) const {
34
+ PreFilter pre_filter(symbols_, sentence);
35
+ PreFilter::Range range;
36
+ vector<WordRange> wrs;
37
+ wrs.reserve(sentence.size()/2);
38
+ while (pre_filter.HasNext()) {
39
+ range = pre_filter.Next();
40
+ Cut(range.begin, range.end, wrs);
41
+ }
42
+ words.clear();
43
+ words.reserve(wrs.size());
44
+ GetWordsFromWordRanges(sentence, wrs, words);
45
+ }
46
+ void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
47
+ RuneStrArray::const_iterator left = begin;
48
+ RuneStrArray::const_iterator right = begin;
49
+ while (right != end) {
50
+ if (right->rune < 0x80) {
51
+ if (left != right) {
52
+ InternalCut(left, right, res);
53
+ }
54
+ left = right;
55
+ do {
56
+ right = SequentialLetterRule(left, end);
57
+ if (right != left) {
58
+ break;
59
+ }
60
+ right = NumbersRule(left, end);
61
+ if (right != left) {
62
+ break;
63
+ }
64
+ right ++;
65
+ } while (false);
66
+ WordRange wr(left, right - 1);
67
+ res.push_back(wr);
68
+ left = right;
69
+ } else {
70
+ right++;
71
+ }
72
+ }
73
+ if (left != right) {
74
+ InternalCut(left, right, res);
75
+ }
76
+ }
77
+ private:
78
+ // sequential letters rule
79
+ RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
80
+ Rune x = begin->rune;
81
+ if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
82
+ begin ++;
83
+ } else {
84
+ return begin;
85
+ }
86
+ while (begin != end) {
87
+ x = begin->rune;
88
+ if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
89
+ begin ++;
90
+ } else {
91
+ break;
92
+ }
93
+ }
94
+ return begin;
95
+ }
96
+ //
97
+ RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
98
+ Rune x = begin->rune;
99
+ if ('0' <= x && x <= '9') {
100
+ begin ++;
101
+ } else {
102
+ return begin;
103
+ }
104
+ while (begin != end) {
105
+ x = begin->rune;
106
+ if ( ('0' <= x && x <= '9') || x == '.') {
107
+ begin++;
108
+ } else {
109
+ break;
110
+ }
111
+ }
112
+ return begin;
113
+ }
114
+ void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
115
+ vector<size_t> status;
116
+ Viterbi(begin, end, status);
117
+
118
+ RuneStrArray::const_iterator left = begin;
119
+ RuneStrArray::const_iterator right;
120
+ for (size_t i = 0; i < status.size(); i++) {
121
+ if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
122
+ right = begin + i + 1;
123
+ WordRange wr(left, right - 1);
124
+ res.push_back(wr);
125
+ left = right;
126
+ }
127
+ }
128
+ }
129
+
130
+ void Viterbi(RuneStrArray::const_iterator begin,
131
+ RuneStrArray::const_iterator end,
132
+ vector<size_t>& status) const {
133
+ size_t Y = HMMModel::STATUS_SUM;
134
+ size_t X = end - begin;
135
+
136
+ size_t XYSize = X * Y;
137
+ size_t now, old, stat;
138
+ double tmp, endE, endS;
139
+
140
+ vector<int> path(XYSize);
141
+ vector<double> weight(XYSize);
142
+
143
+ //start
144
+ for (size_t y = 0; y < Y; y++) {
145
+ weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
146
+ path[0 + y * X] = -1;
147
+ }
148
+
149
+ double emitProb;
150
+
151
+ for (size_t x = 1; x < X; x++) {
152
+ for (size_t y = 0; y < Y; y++) {
153
+ now = x + y*X;
154
+ weight[now] = MIN_DOUBLE;
155
+ path[now] = HMMModel::E; // warning
156
+ emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
157
+ for (size_t preY = 0; preY < Y; preY++) {
158
+ old = x - 1 + preY * X;
159
+ tmp = weight[old] + model_->transProb[preY][y] + emitProb;
160
+ if (tmp > weight[now]) {
161
+ weight[now] = tmp;
162
+ path[now] = preY;
163
+ }
164
+ }
165
+ }
166
+ }
167
+
168
+ endE = weight[X-1+HMMModel::E*X];
169
+ endS = weight[X-1+HMMModel::S*X];
170
+ stat = 0;
171
+ if (endE >= endS) {
172
+ stat = HMMModel::E;
173
+ } else {
174
+ stat = HMMModel::S;
175
+ }
176
+
177
+ status.resize(X);
178
+ for (int x = X -1 ; x >= 0; x--) {
179
+ status[x] = stat;
180
+ stat = path[x + stat*X];
181
+ }
182
+ }
183
+
184
+ const HMMModel* model_;
185
+ bool isNeedDestroy_;
186
+ }; // class HMMSegment
187
+
188
+ } // namespace cppjieba
189
+
190
+ #endif