jieba_rb 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (145) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +51 -0
  8. data/Rakefile +11 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +27 -0
  12. data/ext/cppjieba/ChangeLog.md +81 -0
  13. data/ext/cppjieba/Dockerfile +11 -0
  14. data/ext/cppjieba/LICENSE +20 -0
  15. data/ext/cppjieba/README.md +359 -0
  16. data/ext/cppjieba/conf/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/conf/server.conf +16 -0
  18. data/ext/cppjieba/dict/CMakeLists.txt +1 -0
  19. data/ext/cppjieba/dict/README.md +31 -0
  20. data/ext/cppjieba/dict/extra_dict/jieba.dict.small.utf8 +109750 -0
  21. data/ext/cppjieba/dict/gbk_dict/hmm_model.gbk +34 -0
  22. data/ext/cppjieba/dict/gbk_dict/jieba.dict.gbk +348982 -0
  23. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  24. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  25. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  26. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  27. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  28. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  29. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  30. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  31. data/ext/cppjieba/dict/user.dict.utf8 +3 -0
  32. data/ext/cppjieba/script/CMakeLists.txt +1 -0
  33. data/ext/cppjieba/script/cjserver.start +12 -0
  34. data/ext/cppjieba/script/cjserver.stop +13 -0
  35. data/ext/cppjieba/server/CMakeLists.txt +9 -0
  36. data/ext/cppjieba/server/Husky/HttpReqInfo.hpp +294 -0
  37. data/ext/cppjieba/server/Husky/IRequestHandler.hpp +18 -0
  38. data/ext/cppjieba/server/Husky/ThreadPoolServer.hpp +108 -0
  39. data/ext/cppjieba/server/Husky/WorkerThread.hpp +133 -0
  40. data/ext/cppjieba/server/server.cpp +91 -0
  41. data/ext/cppjieba/src/DictTrie.hpp +211 -0
  42. data/ext/cppjieba/src/FullSegment.hpp +153 -0
  43. data/ext/cppjieba/src/HMMSegment.hpp +394 -0
  44. data/ext/cppjieba/src/ISegment.hpp +17 -0
  45. data/ext/cppjieba/src/KeywordExtractor.hpp +173 -0
  46. data/ext/cppjieba/src/Limonp/ArgvContext.hpp +84 -0
  47. data/ext/cppjieba/src/Limonp/BlockingQueue.hpp +128 -0
  48. data/ext/cppjieba/src/Limonp/BoundedQueue.hpp +73 -0
  49. data/ext/cppjieba/src/Limonp/CastFloat.hpp +90 -0
  50. data/ext/cppjieba/src/Limonp/Condition.hpp +48 -0
  51. data/ext/cppjieba/src/Limonp/Config.hpp +118 -0
  52. data/ext/cppjieba/src/Limonp/HandyMacro.hpp +31 -0
  53. data/ext/cppjieba/src/Limonp/InitOnOff.hpp +21 -0
  54. data/ext/cppjieba/src/Limonp/LocalVector.hpp +171 -0
  55. data/ext/cppjieba/src/Limonp/Logger.hpp +74 -0
  56. data/ext/cppjieba/src/Limonp/Md5.hpp +432 -0
  57. data/ext/cppjieba/src/Limonp/MutexLock.hpp +57 -0
  58. data/ext/cppjieba/src/Limonp/MysqlClient.hpp +125 -0
  59. data/ext/cppjieba/src/Limonp/NonCopyable.hpp +22 -0
  60. data/ext/cppjieba/src/Limonp/StdExtension.hpp +139 -0
  61. data/ext/cppjieba/src/Limonp/StringUtil.hpp +349 -0
  62. data/ext/cppjieba/src/Limonp/Thread.hpp +50 -0
  63. data/ext/cppjieba/src/Limonp/ThreadPool.hpp +105 -0
  64. data/ext/cppjieba/src/MPSegment.hpp +148 -0
  65. data/ext/cppjieba/src/MixSegment.hpp +121 -0
  66. data/ext/cppjieba/src/PosTagger.hpp +109 -0
  67. data/ext/cppjieba/src/QuerySegment.hpp +123 -0
  68. data/ext/cppjieba/src/SegmentBase.hpp +78 -0
  69. data/ext/cppjieba/src/TransCode.hpp +63 -0
  70. data/ext/cppjieba/src/Trie.hpp +298 -0
  71. data/ext/cppjieba/test/CMakeLists.txt +7 -0
  72. data/ext/cppjieba/test/keyword_demo.cpp +16 -0
  73. data/ext/cppjieba/test/load_test.cpp +56 -0
  74. data/ext/cppjieba/test/segment_demo.cpp +59 -0
  75. data/ext/cppjieba/test/servertest/go_load_test.sh +2 -0
  76. data/ext/cppjieba/test/servertest/load_test.py +91 -0
  77. data/ext/cppjieba/test/servertest/run_curl.sh +11 -0
  78. data/ext/cppjieba/test/tagging_demo.cpp +12 -0
  79. data/ext/cppjieba/test/testdata/curl.res +1 -0
  80. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  81. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  82. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  83. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  84. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  85. data/ext/cppjieba/test/testdata/review.100 +100 -0
  86. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  87. data/ext/cppjieba/test/testdata/server.conf +13 -0
  88. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  89. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  90. data/ext/cppjieba/test/testdata/userdict.utf8 +6 -0
  91. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  92. data/ext/cppjieba/test/unittest/CMakeLists.txt +28 -0
  93. data/ext/cppjieba/test/unittest/TKeywordExtractor.cpp +18 -0
  94. data/ext/cppjieba/test/unittest/TPosTagger.cpp +43 -0
  95. data/ext/cppjieba/test/unittest/TSegments.cpp +187 -0
  96. data/ext/cppjieba/test/unittest/TTrie.cpp +80 -0
  97. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-death-test.h +283 -0
  98. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-message.h +230 -0
  99. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h +1421 -0
  100. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h.pump +487 -0
  101. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-printers.h +796 -0
  102. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-spi.h +232 -0
  103. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-test-part.h +176 -0
  104. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-typed-test.h +259 -0
  105. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest.h +2155 -0
  106. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_pred_impl.h +358 -0
  107. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_prod.h +58 -0
  108. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-death-test-internal.h +308 -0
  109. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-filepath.h +210 -0
  110. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-internal.h +1226 -0
  111. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-linked_ptr.h +233 -0
  112. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  113. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  114. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util.h +619 -0
  115. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h +1788 -0
  116. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-string.h +350 -0
  117. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h +968 -0
  118. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h.pump +336 -0
  119. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h +3330 -0
  120. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h.pump +296 -0
  121. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/.dirstamp +0 -0
  122. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest-all.Plo +681 -0
  123. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest_main.Plo +509 -0
  124. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.dirstamp +0 -0
  125. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-all.cc +48 -0
  126. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-death-test.cc +1234 -0
  127. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-filepath.cc +380 -0
  128. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-internal-inl.h +1038 -0
  129. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-port.cc +746 -0
  130. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-printers.cc +356 -0
  131. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-test-part.cc +110 -0
  132. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-typed-test.cc +110 -0
  133. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc +4898 -0
  134. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc +39 -0
  135. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  136. data/ext/jieba/extconf.rb +26 -0
  137. data/ext/jieba/jieba.c +9 -0
  138. data/ext/jieba/jieba.h +9 -0
  139. data/ext/jieba/segment.cc +88 -0
  140. data/ext/jieba/segment.h +17 -0
  141. data/jieba_rb.gemspec +51 -0
  142. data/lib/jieba_rb/version.rb +3 -0
  143. data/lib/jieba_rb.rb +28 -0
  144. data/test/test_segment.rb +32 -0
  145. metadata +246 -0
@@ -0,0 +1,394 @@
1
+ #ifndef CPPJIBEA_HMMSEGMENT_H
2
+ #define CPPJIBEA_HMMSEGMENT_H
3
+
4
+ #include <iostream>
5
+ #include <fstream>
6
+ #include <memory.h>
7
+ #include <cassert>
8
+ #include "Limonp/StringUtil.hpp"
9
+ #include "Limonp/Logger.hpp"
10
+ #include "TransCode.hpp"
11
+ #include "ISegment.hpp"
12
+ #include "SegmentBase.hpp"
13
+ #include "DictTrie.hpp"
14
+
15
+ namespace CppJieba
16
+ {
17
+ using namespace Limonp;
18
+ typedef unordered_map<uint16_t, double> EmitProbMap;
19
+ class HMMSegment: public SegmentBase
20
+ {
21
+ public:
22
+ /*
23
+ * STATUS:
24
+ * 0:B, 1:E, 2:M, 3:S
25
+ * */
26
+ enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
27
+ private:
28
+ char _statMap[STATUS_SUM];
29
+ double _startProb[STATUS_SUM];
30
+ double _transProb[STATUS_SUM][STATUS_SUM];
31
+ EmitProbMap _emitProbB;
32
+ EmitProbMap _emitProbE;
33
+ EmitProbMap _emitProbM;
34
+ EmitProbMap _emitProbS;
35
+ vector<EmitProbMap* > _emitProbVec;
36
+
37
+ public:
38
+ HMMSegment(){}
39
+ explicit HMMSegment(const string& filePath)
40
+ {
41
+ LIMONP_CHECK(init(filePath));
42
+ }
43
+ virtual ~HMMSegment(){}
44
+ public:
45
+ bool init(const string& filePath)
46
+ {
47
+ memset(_startProb, 0, sizeof(_startProb));
48
+ memset(_transProb, 0, sizeof(_transProb));
49
+ _statMap[0] = 'B';
50
+ _statMap[1] = 'E';
51
+ _statMap[2] = 'M';
52
+ _statMap[3] = 'S';
53
+ _emitProbVec.push_back(&_emitProbB);
54
+ _emitProbVec.push_back(&_emitProbE);
55
+ _emitProbVec.push_back(&_emitProbM);
56
+ _emitProbVec.push_back(&_emitProbS);
57
+ LIMONP_CHECK(_loadModel(filePath.c_str()));
58
+ LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
59
+ return true;
60
+ }
61
+ public:
62
+ using SegmentBase::cut;
63
+ public:
64
+ bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const
65
+ {
66
+ Unicode::const_iterator left = begin;
67
+ Unicode::const_iterator right = begin;
68
+ while(right != end)
69
+ {
70
+ if(*right < 0x80)
71
+ {
72
+ if(left != right && !_cut(left, right, res))
73
+ {
74
+ return false;
75
+ }
76
+ left = right;
77
+ do {
78
+ right = _sequentialLetterRule(left, end);
79
+ if(right != left)
80
+ {
81
+ break;
82
+ }
83
+ right = _numbersRule(left, end);
84
+ if(right != left)
85
+ {
86
+ break;
87
+ }
88
+ right ++;
89
+ } while(false);
90
+ res.push_back(Unicode(left, right));
91
+ left = right;
92
+ }
93
+ else
94
+ {
95
+ right++;
96
+ }
97
+ }
98
+ if(left != right && !_cut(left, right, res))
99
+ {
100
+ return false;
101
+ }
102
+ return true;
103
+ }
104
+ private:
105
+ // sequential letters rule
106
+ Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
107
+ {
108
+ Unicode::value_type x;
109
+ while(begin != end)
110
+ {
111
+ x = *begin;
112
+ if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
113
+ {
114
+ begin ++;
115
+ }
116
+ else
117
+ {
118
+ break;
119
+ }
120
+ }
121
+ return begin;
122
+ }
123
+ //
124
+ Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
125
+ {
126
+ Unicode::value_type x = *begin;
127
+ if('0' <= x && x <= '9')
128
+ {
129
+ begin ++;
130
+ }
131
+ else
132
+ {
133
+ return begin;
134
+ }
135
+ while(begin != end)
136
+ {
137
+ x = *begin;
138
+ if( ('0' <= x && x <= '9') || x == '.')
139
+ {
140
+ begin++;
141
+ }
142
+ else
143
+ {
144
+ break;
145
+ }
146
+ }
147
+ return begin;
148
+ }
149
+ bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
150
+ {
151
+ vector<size_t> status;
152
+ if(!_viterbi(begin, end, status))
153
+ {
154
+ LogError("_viterbi failed.");
155
+ return false;
156
+ }
157
+
158
+ Unicode::const_iterator left = begin;
159
+ Unicode::const_iterator right;
160
+ for(size_t i = 0; i < status.size(); i++)
161
+ {
162
+ if(status[i] % 2) //if(E == status[i] || S == status[i])
163
+ {
164
+ right = begin + i + 1;
165
+ res.push_back(Unicode(left, right));
166
+ left = right;
167
+ }
168
+ }
169
+ return true;
170
+ }
171
+ public:
172
+ virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
173
+ {
174
+ if(begin == end)
175
+ {
176
+ return false;
177
+ }
178
+ vector<Unicode> words;
179
+ words.reserve(end - begin);
180
+ if(!cut(begin, end, words))
181
+ {
182
+ return false;
183
+ }
184
+ size_t offset = res.size();
185
+ res.resize(res.size() + words.size());
186
+ for(size_t i = 0; i < words.size(); i++)
187
+ {
188
+ if(!TransCode::encode(words[i], res[offset + i]))
189
+ {
190
+ LogError("encode failed.");
191
+ }
192
+ }
193
+ return true;
194
+ }
195
+
196
+ private:
197
+ bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
198
+ {
199
+ if(begin == end)
200
+ {
201
+ return false;
202
+ }
203
+
204
+ size_t Y = STATUS_SUM;
205
+ size_t X = end - begin;
206
+
207
+ size_t XYSize = X * Y;
208
+ size_t now, old, stat;
209
+ double tmp, endE, endS;
210
+
211
+ vector<int> path(XYSize);
212
+ vector<double> weight(XYSize);
213
+
214
+ //start
215
+ for(size_t y = 0; y < Y; y++)
216
+ {
217
+ weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
218
+ path[0 + y * X] = -1;
219
+ }
220
+
221
+
222
+ double emitProb;
223
+
224
+ for(size_t x = 1; x < X; x++)
225
+ {
226
+ for(size_t y = 0; y < Y; y++)
227
+ {
228
+ now = x + y*X;
229
+ weight[now] = MIN_DOUBLE;
230
+ path[now] = E; // warning
231
+ emitProb = _getEmitProb(_emitProbVec[y], *(begin+x), MIN_DOUBLE);
232
+ for(size_t preY = 0; preY < Y; preY++)
233
+ {
234
+ old = x - 1 + preY * X;
235
+ tmp = weight[old] + _transProb[preY][y] + emitProb;
236
+ if(tmp > weight[now])
237
+ {
238
+ weight[now] = tmp;
239
+ path[now] = preY;
240
+ }
241
+ }
242
+ }
243
+ }
244
+
245
+ endE = weight[X-1+E*X];
246
+ endS = weight[X-1+S*X];
247
+ stat = 0;
248
+ if(endE >= endS)
249
+ {
250
+ stat = E;
251
+ }
252
+ else
253
+ {
254
+ stat = S;
255
+ }
256
+
257
+ status.resize(X);
258
+ for(int x = X -1 ; x >= 0; x--)
259
+ {
260
+ status[x] = stat;
261
+ stat = path[x + stat*X];
262
+ }
263
+
264
+ return true;
265
+ }
266
+ bool _loadModel(const char* const filePath)
267
+ {
268
+ LogDebug("loadModel [%s] start ...", filePath);
269
+ ifstream ifile(filePath);
270
+ string line;
271
+ vector<string> tmp;
272
+ vector<string> tmp2;
273
+ //load _startProb
274
+ if(!_getLine(ifile, line))
275
+ {
276
+ return false;
277
+ }
278
+ split(line, tmp, " ");
279
+ if(tmp.size() != STATUS_SUM)
280
+ {
281
+ LogError("start_p illegal");
282
+ return false;
283
+ }
284
+ for(size_t j = 0; j< tmp.size(); j++)
285
+ {
286
+ _startProb[j] = atof(tmp[j].c_str());
287
+ }
288
+
289
+ //load _transProb
290
+ for(size_t i = 0; i < STATUS_SUM; i++)
291
+ {
292
+ if(!_getLine(ifile, line))
293
+ {
294
+ return false;
295
+ }
296
+ split(line, tmp, " ");
297
+ if(tmp.size() != STATUS_SUM)
298
+ {
299
+ LogError("trans_p illegal");
300
+ return false;
301
+ }
302
+ for(size_t j =0; j < STATUS_SUM; j++)
303
+ {
304
+ _transProb[i][j] = atof(tmp[j].c_str());
305
+ }
306
+ }
307
+
308
+ //load _emitProbB
309
+ if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
310
+ {
311
+ return false;
312
+ }
313
+
314
+ //load _emitProbE
315
+ if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
316
+ {
317
+ return false;
318
+ }
319
+
320
+ //load _emitProbM
321
+ if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
322
+ {
323
+ return false;
324
+ }
325
+
326
+ //load _emitProbS
327
+ if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
328
+ {
329
+ return false;
330
+ }
331
+
332
+ LogDebug("loadModel [%s] end.", filePath);
333
+
334
+ return true;
335
+ }
336
+ bool _getLine(ifstream& ifile, string& line)
337
+ {
338
+ while(getline(ifile, line))
339
+ {
340
+ trim(line);
341
+ if(line.empty())
342
+ {
343
+ continue;
344
+ }
345
+ if(startsWith(line, "#"))
346
+ {
347
+ continue;
348
+ }
349
+ return true;
350
+ }
351
+ return false;
352
+ }
353
+ bool _loadEmitProb(const string& line, EmitProbMap& mp)
354
+ {
355
+ if(line.empty())
356
+ {
357
+ return false;
358
+ }
359
+ vector<string> tmp, tmp2;
360
+ Unicode unicode;
361
+ split(line, tmp, ",");
362
+ for(size_t i = 0; i < tmp.size(); i++)
363
+ {
364
+ split(tmp[i], tmp2, ":");
365
+ if(2 != tmp2.size())
366
+ {
367
+ LogError("_emitProb illegal.");
368
+ return false;
369
+ }
370
+ if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1)
371
+ {
372
+ LogError("TransCode failed.");
373
+ return false;
374
+ }
375
+ mp[unicode[0]] = atof(tmp2[1].c_str());
376
+ }
377
+ return true;
378
+ }
379
+ double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const
380
+ {
381
+ EmitProbMap::const_iterator cit = ptMp->find(key);
382
+ if(cit == ptMp->end())
383
+ {
384
+ return defVal;
385
+ }
386
+ return cit->second;
387
+
388
+ }
389
+
390
+
391
+ };
392
+ }
393
+
394
+ #endif
@@ -0,0 +1,17 @@
1
+ #ifndef CPPJIEBA_SEGMENTINTERFACE_H
2
+ #define CPPJIEBA_SEGMENTINTERFACE_H
3
+
4
+
5
+ namespace CppJieba
6
+ {
7
+ class ISegment
8
+ {
9
+ public:
10
+ virtual ~ISegment(){};
11
+ public:
12
+ virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
13
+ virtual bool cut(const string& str, vector<string>& res) const = 0;
14
+ };
15
+ }
16
+
17
+ #endif
@@ -0,0 +1,173 @@
1
+ #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
2
+ #define CPPJIEBA_KEYWORD_EXTRACTOR_H
3
+
4
+ #include "MixSegment.hpp"
5
+ #include <cmath>
6
+ #include <set>
7
+
8
+ namespace CppJieba
9
+ {
10
+ using namespace Limonp;
11
+
12
+ /*utf8*/
13
+ class KeywordExtractor
14
+ {
15
+ private:
16
+ MixSegment _segment;
17
+ private:
18
+ unordered_map<string, double> _idfMap;
19
+ double _idfAverage;
20
+
21
+ unordered_set<string> _stopWords;
22
+ public:
23
+ KeywordExtractor(){};
24
+ KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
25
+ {
26
+ LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
27
+ };
28
+ ~KeywordExtractor(){};
29
+
30
+ public:
31
+ bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
32
+ {
33
+ _loadIdfDict(idfPath);
34
+ _loadStopWordDict(stopWordPath);
35
+ LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
36
+ return true;
37
+ };
38
+ public:
39
+
40
+ bool extract(const string& str, vector<string>& keywords, size_t topN) const
41
+ {
42
+ vector<pair<string, double> > topWords;
43
+ if(!extract(str, topWords, topN))
44
+ {
45
+ return false;
46
+ }
47
+ for(size_t i = 0; i < topWords.size(); i++)
48
+ {
49
+ keywords.push_back(topWords[i].first);
50
+ }
51
+ return true;
52
+ }
53
+
54
+ bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
55
+ {
56
+ vector<string> words;
57
+ if(!_segment.cut(str, words))
58
+ {
59
+ LogError("segment cut(%s) failed.", str.c_str());
60
+ return false;
61
+ }
62
+
63
+ map<string, double> wordmap;
64
+ for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++)
65
+ {
66
+ if(_isSingleWord(*iter))
67
+ {
68
+ continue;
69
+ }
70
+ wordmap[*iter] += 1.0;
71
+ }
72
+
73
+ for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
74
+ {
75
+ if(_stopWords.end() != _stopWords.find(itr->first))
76
+ {
77
+ wordmap.erase(itr++);
78
+ continue;
79
+ }
80
+
81
+ unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
82
+ if(cit != _idfMap.end())
83
+ {
84
+ itr->second *= cit->second;
85
+ }
86
+ else
87
+ {
88
+ itr->second *= _idfAverage;
89
+ }
90
+ itr ++;
91
+ }
92
+
93
+ keywords.clear();
94
+ std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
95
+ topN = min(topN, keywords.size());
96
+ partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
97
+ keywords.resize(topN);
98
+ return true;
99
+ }
100
+ private:
101
+ void _loadIdfDict(const string& idfPath)
102
+ {
103
+ ifstream ifs(idfPath.c_str());
104
+ if(!ifs)
105
+ {
106
+ LogError("open %s failed.", idfPath.c_str());
107
+ assert(false);
108
+ }
109
+ string line ;
110
+ vector<string> buf;
111
+ double idf = 0.0;
112
+ double idfSum = 0.0;
113
+ size_t lineno = 0;
114
+ for(;getline(ifs, line); lineno++)
115
+ {
116
+ buf.clear();
117
+ if(line.empty())
118
+ {
119
+ LogError("line[%d] empty. skipped.", lineno);
120
+ continue;
121
+ }
122
+ if(!split(line, buf, " ") || buf.size() != 2)
123
+ {
124
+ LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
125
+ continue;
126
+ }
127
+ idf = atof(buf[1].c_str());
128
+ _idfMap[buf[0]] = idf;
129
+ idfSum += idf;
130
+
131
+ }
132
+
133
+ assert(lineno);
134
+ _idfAverage = idfSum / lineno;
135
+ assert(_idfAverage > 0.0);
136
+ }
137
+ void _loadStopWordDict(const string& filePath)
138
+ {
139
+ ifstream ifs(filePath.c_str());
140
+ if(!ifs)
141
+ {
142
+ LogError("open %s failed.", filePath.c_str());
143
+ assert(false);
144
+ }
145
+ string line ;
146
+ while(getline(ifs, line))
147
+ {
148
+ _stopWords.insert(line);
149
+ }
150
+ assert(_stopWords.size());
151
+ }
152
+ private:
153
+ bool _isSingleWord(const string& str) const
154
+ {
155
+ Unicode unicode;
156
+ TransCode::decode(str, unicode);
157
+ if(unicode.size() == 1)
158
+ return true;
159
+ return false;
160
+ }
161
+
162
+ private:
163
+ static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
164
+ {
165
+ return lhs.second > rhs.second;
166
+ }
167
+
168
+ };
169
+ }
170
+
171
+ #endif
172
+
173
+
@@ -0,0 +1,84 @@
1
+ /************************************
2
+ * file enc : ascii
3
+ * author : wuyanyi09@gmail.com
4
+ ************************************/
5
+
6
+ #ifndef LIMONP_ARGV_FUNCTS_H
7
+ #define LIMONP_ARGV_FUNCTS_H
8
+
9
+ #include <set>
10
+ #include <sstream>
11
+ #include "StringUtil.hpp"
12
+
13
+ namespace Limonp
14
+ {
15
+ using namespace std;
16
+ class ArgvContext
17
+ {
18
+ public :
19
+ ArgvContext(int argc, const char* const * argv)
20
+ {
21
+
22
+ for(int i = 0; i < argc; i++)
23
+ {
24
+ if(startsWith(argv[i], "-"))
25
+ {
26
+ if(i + 1 < argc && !startsWith(argv[i + 1], "-"))
27
+ {
28
+ mpss_[argv[i]] = argv[i+1];
29
+ i++;
30
+ }
31
+ else
32
+ {
33
+ sset_.insert(argv[i]);
34
+ }
35
+ }
36
+ else
37
+ {
38
+ args_.push_back(argv[i]);
39
+ }
40
+ }
41
+ }
42
+ ~ArgvContext(){};
43
+ public:
44
+ friend ostream& operator << (ostream& os, const ArgvContext& args);
45
+ string operator [](size_t i) const
46
+ {
47
+ if(i < args_.size())
48
+ {
49
+ return args_[i];
50
+ }
51
+ return "";
52
+ }
53
+ string operator [](const string& key) const
54
+ {
55
+ map<string, string>::const_iterator it = mpss_.find(key);
56
+ if(it != mpss_.end())
57
+ {
58
+ return it->second;
59
+ }
60
+ return "";
61
+ }
62
+ public:
63
+ bool hasKey(const string& key) const
64
+ {
65
+ if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end())
66
+ {
67
+ return true;
68
+ }
69
+ return false;
70
+ }
71
+ private:
72
+ vector<string> args_;
73
+ map<string, string> mpss_;
74
+ set<string> sset_;
75
+
76
+ };
77
+
78
+ inline ostream& operator << (ostream& os, const ArgvContext& args)
79
+ {
80
+ return os<<args.args_<<args.mpss_<<args.sset_;
81
+ }
82
+ }
83
+
84
+ #endif