cppjieba_rb 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142)
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +26 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +81 -0
  8. data/Rakefile +20 -0
  9. data/cppjieba_rb.gemspec +50 -0
  10. data/ext/cppjieba/.gitignore +17 -0
  11. data/ext/cppjieba/.travis.yml +22 -0
  12. data/ext/cppjieba/CMakeLists.txt +28 -0
  13. data/ext/cppjieba/ChangeLog.md +236 -0
  14. data/ext/cppjieba/README.md +285 -0
  15. data/ext/cppjieba/README_EN.md +111 -0
  16. data/ext/cppjieba/appveyor.yml +32 -0
  17. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  18. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  42. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  45. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  46. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  56. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  57. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  58. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  59. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  60. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  61. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  62. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  63. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  64. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  65. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  66. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  67. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  68. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  69. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  70. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  71. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  72. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  73. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  74. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  75. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  76. data/ext/cppjieba/dict/README.md +31 -0
  77. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  78. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  79. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  80. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  83. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  84. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  85. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  86. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  87. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  88. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  89. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  90. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  91. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  92. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  93. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  94. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  95. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  96. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  98. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  99. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  100. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  101. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  102. data/ext/cppjieba/test/CMakeLists.txt +5 -0
  103. data/ext/cppjieba/test/demo.cpp +80 -0
  104. data/ext/cppjieba/test/load_test.cpp +54 -0
  105. data/ext/cppjieba/test/testdata/curl.res +1 -0
  106. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  107. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  108. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  109. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  110. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  111. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  112. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  113. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  114. data/ext/cppjieba/test/testdata/review.100 +100 -0
  115. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  116. data/ext/cppjieba/test/testdata/server.conf +19 -0
  117. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  118. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  119. data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  120. data/ext/cppjieba/test/testdata/userdict.english +2 -0
  121. data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
  122. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  123. data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
  124. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  125. data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
  126. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  127. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  128. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  129. data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
  130. data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
  131. data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
  132. data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
  133. data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
  134. data/ext/cppjieba_rb/extconf.rb +26 -0
  135. data/ext/cppjieba_rb/internal.cc +148 -0
  136. data/lib/cppjieba_rb/segment.rb +20 -0
  137. data/lib/cppjieba_rb/version.rb +3 -0
  138. data/lib/cppjieba_rb.rb +34 -0
  139. data/test/test_keyword.rb +17 -0
  140. data/test/test_segment.rb +24 -0
  141. data/test/test_tagging.rb +19 -0
  142. metadata +244 -0
@@ -0,0 +1,4 @@
1
+ 云计算
2
+ 韩玉鉴赏
3
+ 蓝翔 nz
4
+ 区块链 10 nz
@@ -0,0 +1,227 @@
1
+ #ifndef CPPJIEBA_DICT_TRIE_HPP
2
+ #define CPPJIEBA_DICT_TRIE_HPP
3
+
4
+ #include <iostream>
5
+ #include <fstream>
6
+ #include <map>
7
+ #include <string>
8
+ #include <cstring>
9
+ #include <cstdlib>
10
+ #include <stdint.h>
11
+ #include <cmath>
12
+ #include <limits>
13
+ #include "limonp/StringUtil.hpp"
14
+ #include "limonp/Logging.hpp"
15
+ #include "Unicode.hpp"
16
+ #include "Trie.hpp"
17
+
18
+ namespace cppjieba {
19
+
20
+ using namespace limonp;
21
+
22
+ const double MIN_DOUBLE = -3.14e+100;
23
+ const double MAX_DOUBLE = 3.14e+100;
24
+ const size_t DICT_COLUMN_NUM = 3;
25
+ const char* const UNKNOWN_TAG = "";
26
+
27
+ class DictTrie {
28
+ public:
29
+ enum UserWordWeightOption {
30
+ WordWeightMin,
31
+ WordWeightMedian,
32
+ WordWeightMax,
33
+ }; // enum UserWordWeightOption
34
+
35
+ DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
36
+ Init(dict_path, user_dict_paths, user_word_weight_opt);
37
+ }
38
+
39
+ ~DictTrie() {
40
+ delete trie_;
41
+ }
42
+
43
+ bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
44
+ DictUnit node_info;
45
+ if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
46
+ return false;
47
+ }
48
+ active_node_infos_.push_back(node_info);
49
+ trie_->InsertNode(node_info.word, &active_node_infos_.back());
50
+ return true;
51
+ }
52
+
53
+ const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
54
+ return trie_->Find(begin, end);
55
+ }
56
+
57
+ void Find(RuneStrArray::const_iterator begin,
58
+ RuneStrArray::const_iterator end,
59
+ vector<struct Dag>&res,
60
+ size_t max_word_len = MAX_WORD_LENGTH) const {
61
+ trie_->Find(begin, end, res, max_word_len);
62
+ }
63
+
64
+ bool IsUserDictSingleChineseWord(const Rune& word) const {
65
+ return IsIn(user_dict_single_chinese_word_, word);
66
+ }
67
+
68
+ double GetMinWeight() const {
69
+ return min_weight_;
70
+ }
71
+
72
+ private:
73
+ void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
74
+ LoadDict(dict_path);
75
+ freq_sum_ = CalcFreqSum(static_node_infos_);
76
+ CalculateWeight(static_node_infos_, freq_sum_);
77
+ SetStaticWordWeights(user_word_weight_opt);
78
+
79
+ if (user_dict_paths.size()) {
80
+ LoadUserDict(user_dict_paths);
81
+ }
82
+ Shrink(static_node_infos_);
83
+ CreateTrie(static_node_infos_);
84
+ }
85
+
86
+ void CreateTrie(const vector<DictUnit>& dictUnits) {
87
+ assert(dictUnits.size());
88
+ vector<Unicode> words;
89
+ vector<const DictUnit*> valuePointers;
90
+ for (size_t i = 0 ; i < dictUnits.size(); i ++) {
91
+ words.push_back(dictUnits[i].word);
92
+ valuePointers.push_back(&dictUnits[i]);
93
+ }
94
+
95
+ trie_ = new Trie(words, valuePointers);
96
+ }
97
+
98
+ void LoadUserDict(const string& filePaths) {
99
+ vector<string> files = limonp::Split(filePaths, "|;");
100
+ size_t lineno = 0;
101
+ for (size_t i = 0; i < files.size(); i++) {
102
+ ifstream ifs(files[i].c_str());
103
+ XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
104
+ string line;
105
+ DictUnit node_info;
106
+ vector<string> buf;
107
+ for (; getline(ifs, line); lineno++) {
108
+ if (line.size() == 0) {
109
+ continue;
110
+ }
111
+ buf.clear();
112
+ Split(line, buf, " ");
113
+ DictUnit node_info;
114
+ if(buf.size() == 1){
115
+ MakeNodeInfo(node_info,
116
+ buf[0],
117
+ user_word_default_weight_,
118
+ UNKNOWN_TAG);
119
+ } else if (buf.size() == 2) {
120
+ MakeNodeInfo(node_info,
121
+ buf[0],
122
+ user_word_default_weight_,
123
+ buf[1]);
124
+ } else if (buf.size() == 3) {
125
+ int freq = atoi(buf[1].c_str());
126
+ assert(freq_sum_ > 0.0);
127
+ double weight = log(1.0 * freq / freq_sum_);
128
+ MakeNodeInfo(node_info, buf[0], weight, buf[2]);
129
+ }
130
+ static_node_infos_.push_back(node_info);
131
+ if (node_info.word.size() == 1) {
132
+ user_dict_single_chinese_word_.insert(node_info.word[0]);
133
+ }
134
+ }
135
+ }
136
+ }
137
+
138
+ bool MakeNodeInfo(DictUnit& node_info,
139
+ const string& word,
140
+ double weight,
141
+ const string& tag) {
142
+ if (!DecodeRunesInString(word, node_info.word)) {
143
+ XLOG(ERROR) << "Decode " << word << " failed.";
144
+ return false;
145
+ }
146
+ node_info.weight = weight;
147
+ node_info.tag = tag;
148
+ return true;
149
+ }
150
+
151
+ void LoadDict(const string& filePath) {
152
+ ifstream ifs(filePath.c_str());
153
+ XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
154
+ string line;
155
+ vector<string> buf;
156
+
157
+ DictUnit node_info;
158
+ for (size_t lineno = 0; getline(ifs, line); lineno++) {
159
+ Split(line, buf, " ");
160
+ XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
161
+ MakeNodeInfo(node_info,
162
+ buf[0],
163
+ atof(buf[1].c_str()),
164
+ buf[2]);
165
+ static_node_infos_.push_back(node_info);
166
+ }
167
+ }
168
+
169
+ static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
170
+ return lhs.weight < rhs.weight;
171
+ }
172
+
173
+ void SetStaticWordWeights(UserWordWeightOption option) {
174
+ XCHECK(!static_node_infos_.empty());
175
+ vector<DictUnit> x = static_node_infos_;
176
+ sort(x.begin(), x.end(), WeightCompare);
177
+ min_weight_ = x[0].weight;
178
+ max_weight_ = x[x.size() - 1].weight;
179
+ median_weight_ = x[x.size() / 2].weight;
180
+ switch (option) {
181
+ case WordWeightMin:
182
+ user_word_default_weight_ = min_weight_;
183
+ break;
184
+ case WordWeightMedian:
185
+ user_word_default_weight_ = median_weight_;
186
+ break;
187
+ default:
188
+ user_word_default_weight_ = max_weight_;
189
+ break;
190
+ }
191
+ }
192
+
193
+ double CalcFreqSum(const vector<DictUnit>& node_infos) const {
194
+ double sum = 0.0;
195
+ for (size_t i = 0; i < node_infos.size(); i++) {
196
+ sum += node_infos[i].weight;
197
+ }
198
+ return sum;
199
+ }
200
+
201
+ void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
202
+ assert(sum > 0.0);
203
+ for (size_t i = 0; i < node_infos.size(); i++) {
204
+ DictUnit& node_info = node_infos[i];
205
+ assert(node_info.weight > 0.0);
206
+ node_info.weight = log(double(node_info.weight)/sum);
207
+ }
208
+ }
209
+
210
+ void Shrink(vector<DictUnit>& units) const {
211
+ vector<DictUnit>(units.begin(), units.end()).swap(units);
212
+ }
213
+
214
+ vector<DictUnit> static_node_infos_;
215
+ deque<DictUnit> active_node_infos_; // must not be vector
216
+ Trie * trie_;
217
+
218
+ double freq_sum_;
219
+ double min_weight_;
220
+ double max_weight_;
221
+ double median_weight_;
222
+ double user_word_default_weight_;
223
+ unordered_set<Rune> user_dict_single_chinese_word_;
224
+ };
225
+ }
226
+
227
+ #endif
@@ -0,0 +1,93 @@
1
+ #ifndef CPPJIEBA_FULLSEGMENT_H
2
+ #define CPPJIEBA_FULLSEGMENT_H
3
+
4
+ #include <algorithm>
5
+ #include <set>
6
+ #include <cassert>
7
+ #include "limonp/Logging.hpp"
8
+ #include "DictTrie.hpp"
9
+ #include "SegmentBase.hpp"
10
+ #include "Unicode.hpp"
11
+
12
+ namespace cppjieba {
13
+ class FullSegment: public SegmentBase {
14
+ public:
15
+ FullSegment(const string& dictPath) {
16
+ dictTrie_ = new DictTrie(dictPath);
17
+ isNeedDestroy_ = true;
18
+ }
19
+ FullSegment(const DictTrie* dictTrie)
20
+ : dictTrie_(dictTrie), isNeedDestroy_(false) {
21
+ assert(dictTrie_);
22
+ }
23
+ ~FullSegment() {
24
+ if (isNeedDestroy_) {
25
+ delete dictTrie_;
26
+ }
27
+ }
28
+ void Cut(const string& sentence,
29
+ vector<string>& words) const {
30
+ vector<Word> tmp;
31
+ Cut(sentence, tmp);
32
+ GetStringsFromWords(tmp, words);
33
+ }
34
+ void Cut(const string& sentence,
35
+ vector<Word>& words) const {
36
+ PreFilter pre_filter(symbols_, sentence);
37
+ PreFilter::Range range;
38
+ vector<WordRange> wrs;
39
+ wrs.reserve(sentence.size()/2);
40
+ while (pre_filter.HasNext()) {
41
+ range = pre_filter.Next();
42
+ Cut(range.begin, range.end, wrs);
43
+ }
44
+ words.clear();
45
+ words.reserve(wrs.size());
46
+ GetWordsFromWordRanges(sentence, wrs, words);
47
+ }
48
+ void Cut(RuneStrArray::const_iterator begin,
49
+ RuneStrArray::const_iterator end,
50
+ vector<WordRange>& res) const {
51
+ //resut of searching in trie tree
52
+ LocalVector<pair<size_t, const DictUnit*> > tRes;
53
+
54
+ //max index of res's words
55
+ int maxIdx = 0;
56
+
57
+ // always equals to (uItr - begin)
58
+ int uIdx = 0;
59
+
60
+ //tmp variables
61
+ int wordLen = 0;
62
+ assert(dictTrie_);
63
+ vector<struct Dag> dags;
64
+ dictTrie_->Find(begin, end, dags);
65
+ for (size_t i = 0; i < dags.size(); i++) {
66
+ for (size_t j = 0; j < dags[i].nexts.size(); j++) {
67
+ size_t nextoffset = dags[i].nexts[j].first;
68
+ assert(nextoffset < dags.size());
69
+ const DictUnit* du = dags[i].nexts[j].second;
70
+ if (du == NULL) {
71
+ if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
72
+ WordRange wr(begin + i, begin + nextoffset);
73
+ res.push_back(wr);
74
+ }
75
+ } else {
76
+ wordLen = du->word.size();
77
+ if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
78
+ WordRange wr(begin + i, begin + nextoffset);
79
+ res.push_back(wr);
80
+ }
81
+ }
82
+ maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
83
+ }
84
+ uIdx++;
85
+ }
86
+ }
87
+ private:
88
+ const DictTrie* dictTrie_;
89
+ bool isNeedDestroy_;
90
+ };
91
+ }
92
+
93
+ #endif
@@ -0,0 +1,129 @@
1
+ #ifndef CPPJIEBA_HMMMODEL_H
2
+ #define CPPJIEBA_HMMMODEL_H
3
+
4
+ #include "limonp/StringUtil.hpp"
5
+ #include "Trie.hpp"
6
+
7
+ namespace cppjieba {
8
+
9
+ using namespace limonp;
10
+ typedef unordered_map<Rune, double> EmitProbMap;
11
+
12
+ struct HMMModel {
13
+ /*
14
+ * STATUS:
15
+ * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
16
+ * */
17
+ enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
18
+
19
+ HMMModel(const string& modelPath) {
20
+ memset(startProb, 0, sizeof(startProb));
21
+ memset(transProb, 0, sizeof(transProb));
22
+ statMap[0] = 'B';
23
+ statMap[1] = 'E';
24
+ statMap[2] = 'M';
25
+ statMap[3] = 'S';
26
+ emitProbVec.push_back(&emitProbB);
27
+ emitProbVec.push_back(&emitProbE);
28
+ emitProbVec.push_back(&emitProbM);
29
+ emitProbVec.push_back(&emitProbS);
30
+ LoadModel(modelPath);
31
+ }
32
+ ~HMMModel() {
33
+ }
34
+ void LoadModel(const string& filePath) {
35
+ ifstream ifile(filePath.c_str());
36
+ XCHECK(ifile.is_open()) << "open " << filePath << " failed";
37
+ string line;
38
+ vector<string> tmp;
39
+ vector<string> tmp2;
40
+ //Load startProb
41
+ XCHECK(GetLine(ifile, line));
42
+ Split(line, tmp, " ");
43
+ XCHECK(tmp.size() == STATUS_SUM);
44
+ for (size_t j = 0; j< tmp.size(); j++) {
45
+ startProb[j] = atof(tmp[j].c_str());
46
+ }
47
+
48
+ //Load transProb
49
+ for (size_t i = 0; i < STATUS_SUM; i++) {
50
+ XCHECK(GetLine(ifile, line));
51
+ Split(line, tmp, " ");
52
+ XCHECK(tmp.size() == STATUS_SUM);
53
+ for (size_t j =0; j < STATUS_SUM; j++) {
54
+ transProb[i][j] = atof(tmp[j].c_str());
55
+ }
56
+ }
57
+
58
+ //Load emitProbB
59
+ XCHECK(GetLine(ifile, line));
60
+ XCHECK(LoadEmitProb(line, emitProbB));
61
+
62
+ //Load emitProbE
63
+ XCHECK(GetLine(ifile, line));
64
+ XCHECK(LoadEmitProb(line, emitProbE));
65
+
66
+ //Load emitProbM
67
+ XCHECK(GetLine(ifile, line));
68
+ XCHECK(LoadEmitProb(line, emitProbM));
69
+
70
+ //Load emitProbS
71
+ XCHECK(GetLine(ifile, line));
72
+ XCHECK(LoadEmitProb(line, emitProbS));
73
+ }
74
+ double GetEmitProb(const EmitProbMap* ptMp, Rune key,
75
+ double defVal)const {
76
+ EmitProbMap::const_iterator cit = ptMp->find(key);
77
+ if (cit == ptMp->end()) {
78
+ return defVal;
79
+ }
80
+ return cit->second;
81
+ }
82
+ bool GetLine(ifstream& ifile, string& line) {
83
+ while (getline(ifile, line)) {
84
+ Trim(line);
85
+ if (line.empty()) {
86
+ continue;
87
+ }
88
+ if (StartsWith(line, "#")) {
89
+ continue;
90
+ }
91
+ return true;
92
+ }
93
+ return false;
94
+ }
95
+ bool LoadEmitProb(const string& line, EmitProbMap& mp) {
96
+ if (line.empty()) {
97
+ return false;
98
+ }
99
+ vector<string> tmp, tmp2;
100
+ Unicode unicode;
101
+ Split(line, tmp, ",");
102
+ for (size_t i = 0; i < tmp.size(); i++) {
103
+ Split(tmp[i], tmp2, ":");
104
+ if (2 != tmp2.size()) {
105
+ XLOG(ERROR) << "emitProb illegal.";
106
+ return false;
107
+ }
108
+ if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
109
+ XLOG(ERROR) << "TransCode failed.";
110
+ return false;
111
+ }
112
+ mp[unicode[0]] = atof(tmp2[1].c_str());
113
+ }
114
+ return true;
115
+ }
116
+
117
+ char statMap[STATUS_SUM];
118
+ double startProb[STATUS_SUM];
119
+ double transProb[STATUS_SUM][STATUS_SUM];
120
+ EmitProbMap emitProbB;
121
+ EmitProbMap emitProbE;
122
+ EmitProbMap emitProbM;
123
+ EmitProbMap emitProbS;
124
+ vector<EmitProbMap* > emitProbVec;
125
+ }; // struct HMMModel
126
+
127
+ } // namespace cppjieba
128
+
129
+ #endif
@@ -0,0 +1,190 @@
1
+ #ifndef CPPJIBEA_HMMSEGMENT_H
2
+ #define CPPJIBEA_HMMSEGMENT_H
3
+
4
+ #include <iostream>
5
+ #include <fstream>
6
+ #include <memory.h>
7
+ #include <cassert>
8
+ #include "HMMModel.hpp"
9
+ #include "SegmentBase.hpp"
10
+
11
+ namespace cppjieba {
12
+ class HMMSegment: public SegmentBase {
13
+ public:
14
+ HMMSegment(const string& filePath)
15
+ : model_(new HMMModel(filePath)), isNeedDestroy_(true) {
16
+ }
17
+ HMMSegment(const HMMModel* model)
18
+ : model_(model), isNeedDestroy_(false) {
19
+ }
20
+ ~HMMSegment() {
21
+ if (isNeedDestroy_) {
22
+ delete model_;
23
+ }
24
+ }
25
+
26
+ void Cut(const string& sentence,
27
+ vector<string>& words) const {
28
+ vector<Word> tmp;
29
+ Cut(sentence, tmp);
30
+ GetStringsFromWords(tmp, words);
31
+ }
32
+ void Cut(const string& sentence,
33
+ vector<Word>& words) const {
34
+ PreFilter pre_filter(symbols_, sentence);
35
+ PreFilter::Range range;
36
+ vector<WordRange> wrs;
37
+ wrs.reserve(sentence.size()/2);
38
+ while (pre_filter.HasNext()) {
39
+ range = pre_filter.Next();
40
+ Cut(range.begin, range.end, wrs);
41
+ }
42
+ words.clear();
43
+ words.reserve(wrs.size());
44
+ GetWordsFromWordRanges(sentence, wrs, words);
45
+ }
46
+ void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
47
+ RuneStrArray::const_iterator left = begin;
48
+ RuneStrArray::const_iterator right = begin;
49
+ while (right != end) {
50
+ if (right->rune < 0x80) {
51
+ if (left != right) {
52
+ InternalCut(left, right, res);
53
+ }
54
+ left = right;
55
+ do {
56
+ right = SequentialLetterRule(left, end);
57
+ if (right != left) {
58
+ break;
59
+ }
60
+ right = NumbersRule(left, end);
61
+ if (right != left) {
62
+ break;
63
+ }
64
+ right ++;
65
+ } while (false);
66
+ WordRange wr(left, right - 1);
67
+ res.push_back(wr);
68
+ left = right;
69
+ } else {
70
+ right++;
71
+ }
72
+ }
73
+ if (left != right) {
74
+ InternalCut(left, right, res);
75
+ }
76
+ }
77
+ private:
78
+ // sequential letters rule
79
+ RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
80
+ Rune x = begin->rune;
81
+ if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
82
+ begin ++;
83
+ } else {
84
+ return begin;
85
+ }
86
+ while (begin != end) {
87
+ x = begin->rune;
88
+ if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
89
+ begin ++;
90
+ } else {
91
+ break;
92
+ }
93
+ }
94
+ return begin;
95
+ }
96
+ //
97
+ RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
98
+ Rune x = begin->rune;
99
+ if ('0' <= x && x <= '9') {
100
+ begin ++;
101
+ } else {
102
+ return begin;
103
+ }
104
+ while (begin != end) {
105
+ x = begin->rune;
106
+ if ( ('0' <= x && x <= '9') || x == '.') {
107
+ begin++;
108
+ } else {
109
+ break;
110
+ }
111
+ }
112
+ return begin;
113
+ }
114
+ void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
115
+ vector<size_t> status;
116
+ Viterbi(begin, end, status);
117
+
118
+ RuneStrArray::const_iterator left = begin;
119
+ RuneStrArray::const_iterator right;
120
+ for (size_t i = 0; i < status.size(); i++) {
121
+ if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
122
+ right = begin + i + 1;
123
+ WordRange wr(left, right - 1);
124
+ res.push_back(wr);
125
+ left = right;
126
+ }
127
+ }
128
+ }
129
+
130
+ void Viterbi(RuneStrArray::const_iterator begin,
131
+ RuneStrArray::const_iterator end,
132
+ vector<size_t>& status) const {
133
+ size_t Y = HMMModel::STATUS_SUM;
134
+ size_t X = end - begin;
135
+
136
+ size_t XYSize = X * Y;
137
+ size_t now, old, stat;
138
+ double tmp, endE, endS;
139
+
140
+ vector<int> path(XYSize);
141
+ vector<double> weight(XYSize);
142
+
143
+ //start
144
+ for (size_t y = 0; y < Y; y++) {
145
+ weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
146
+ path[0 + y * X] = -1;
147
+ }
148
+
149
+ double emitProb;
150
+
151
+ for (size_t x = 1; x < X; x++) {
152
+ for (size_t y = 0; y < Y; y++) {
153
+ now = x + y*X;
154
+ weight[now] = MIN_DOUBLE;
155
+ path[now] = HMMModel::E; // warning
156
+ emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
157
+ for (size_t preY = 0; preY < Y; preY++) {
158
+ old = x - 1 + preY * X;
159
+ tmp = weight[old] + model_->transProb[preY][y] + emitProb;
160
+ if (tmp > weight[now]) {
161
+ weight[now] = tmp;
162
+ path[now] = preY;
163
+ }
164
+ }
165
+ }
166
+ }
167
+
168
+ endE = weight[X-1+HMMModel::E*X];
169
+ endS = weight[X-1+HMMModel::S*X];
170
+ stat = 0;
171
+ if (endE >= endS) {
172
+ stat = HMMModel::E;
173
+ } else {
174
+ stat = HMMModel::S;
175
+ }
176
+
177
+ status.resize(X);
178
+ for (int x = X -1 ; x >= 0; x--) {
179
+ status[x] = stat;
180
+ stat = path[x + stat*X];
181
+ }
182
+ }
183
+
184
+ const HMMModel* model_;
185
+ bool isNeedDestroy_;
186
+ }; // class HMMSegment
187
+
188
+ } // namespace cppjieba
189
+
190
+ #endif