jieba_rb 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +51 -0
- data/Rakefile +11 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +27 -0
- data/ext/cppjieba/ChangeLog.md +81 -0
- data/ext/cppjieba/Dockerfile +11 -0
- data/ext/cppjieba/LICENSE +20 -0
- data/ext/cppjieba/README.md +359 -0
- data/ext/cppjieba/conf/CMakeLists.txt +1 -0
- data/ext/cppjieba/conf/server.conf +16 -0
- data/ext/cppjieba/dict/CMakeLists.txt +1 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/dict/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/dict/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +3 -0
- data/ext/cppjieba/script/CMakeLists.txt +1 -0
- data/ext/cppjieba/script/cjserver.start +12 -0
- data/ext/cppjieba/script/cjserver.stop +13 -0
- data/ext/cppjieba/server/CMakeLists.txt +9 -0
- data/ext/cppjieba/server/Husky/HttpReqInfo.hpp +294 -0
- data/ext/cppjieba/server/Husky/IRequestHandler.hpp +18 -0
- data/ext/cppjieba/server/Husky/ThreadPoolServer.hpp +108 -0
- data/ext/cppjieba/server/Husky/WorkerThread.hpp +133 -0
- data/ext/cppjieba/server/server.cpp +91 -0
- data/ext/cppjieba/src/DictTrie.hpp +211 -0
- data/ext/cppjieba/src/FullSegment.hpp +153 -0
- data/ext/cppjieba/src/HMMSegment.hpp +394 -0
- data/ext/cppjieba/src/ISegment.hpp +17 -0
- data/ext/cppjieba/src/KeywordExtractor.hpp +173 -0
- data/ext/cppjieba/src/Limonp/ArgvContext.hpp +84 -0
- data/ext/cppjieba/src/Limonp/BlockingQueue.hpp +128 -0
- data/ext/cppjieba/src/Limonp/BoundedQueue.hpp +73 -0
- data/ext/cppjieba/src/Limonp/CastFloat.hpp +90 -0
- data/ext/cppjieba/src/Limonp/Condition.hpp +48 -0
- data/ext/cppjieba/src/Limonp/Config.hpp +118 -0
- data/ext/cppjieba/src/Limonp/HandyMacro.hpp +31 -0
- data/ext/cppjieba/src/Limonp/InitOnOff.hpp +21 -0
- data/ext/cppjieba/src/Limonp/LocalVector.hpp +171 -0
- data/ext/cppjieba/src/Limonp/Logger.hpp +74 -0
- data/ext/cppjieba/src/Limonp/Md5.hpp +432 -0
- data/ext/cppjieba/src/Limonp/MutexLock.hpp +57 -0
- data/ext/cppjieba/src/Limonp/MysqlClient.hpp +125 -0
- data/ext/cppjieba/src/Limonp/NonCopyable.hpp +22 -0
- data/ext/cppjieba/src/Limonp/StdExtension.hpp +139 -0
- data/ext/cppjieba/src/Limonp/StringUtil.hpp +349 -0
- data/ext/cppjieba/src/Limonp/Thread.hpp +50 -0
- data/ext/cppjieba/src/Limonp/ThreadPool.hpp +105 -0
- data/ext/cppjieba/src/MPSegment.hpp +148 -0
- data/ext/cppjieba/src/MixSegment.hpp +121 -0
- data/ext/cppjieba/src/PosTagger.hpp +109 -0
- data/ext/cppjieba/src/QuerySegment.hpp +123 -0
- data/ext/cppjieba/src/SegmentBase.hpp +78 -0
- data/ext/cppjieba/src/TransCode.hpp +63 -0
- data/ext/cppjieba/src/Trie.hpp +298 -0
- data/ext/cppjieba/test/CMakeLists.txt +7 -0
- data/ext/cppjieba/test/keyword_demo.cpp +16 -0
- data/ext/cppjieba/test/load_test.cpp +56 -0
- data/ext/cppjieba/test/segment_demo.cpp +59 -0
- data/ext/cppjieba/test/servertest/go_load_test.sh +2 -0
- data/ext/cppjieba/test/servertest/load_test.py +91 -0
- data/ext/cppjieba/test/servertest/run_curl.sh +11 -0
- data/ext/cppjieba/test/tagging_demo.cpp +12 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +13 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +6 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +28 -0
- data/ext/cppjieba/test/unittest/TKeywordExtractor.cpp +18 -0
- data/ext/cppjieba/test/unittest/TPosTagger.cpp +43 -0
- data/ext/cppjieba/test/unittest/TSegments.cpp +187 -0
- data/ext/cppjieba/test/unittest/TTrie.cpp +80 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.dirstamp +0 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-all.cc +48 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-port.cc +746 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc +4898 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc +39 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/jieba/extconf.rb +26 -0
- data/ext/jieba/jieba.c +9 -0
- data/ext/jieba/jieba.h +9 -0
- data/ext/jieba/segment.cc +88 -0
- data/ext/jieba/segment.h +17 -0
- data/jieba_rb.gemspec +51 -0
- data/lib/jieba_rb/version.rb +3 -0
- data/lib/jieba_rb.rb +28 -0
- data/test/test_segment.rb +32 -0
- metadata +246 -0
@@ -0,0 +1,394 @@
|
|
1
|
+
#ifndef CPPJIBEA_HMMSEGMENT_H
|
2
|
+
#define CPPJIBEA_HMMSEGMENT_H
|
3
|
+
|
4
|
+
#include <iostream>
|
5
|
+
#include <fstream>
|
6
|
+
#include <memory.h>
|
7
|
+
#include <cassert>
|
8
|
+
#include "Limonp/StringUtil.hpp"
|
9
|
+
#include "Limonp/Logger.hpp"
|
10
|
+
#include "TransCode.hpp"
|
11
|
+
#include "ISegment.hpp"
|
12
|
+
#include "SegmentBase.hpp"
|
13
|
+
#include "DictTrie.hpp"
|
14
|
+
|
15
|
+
namespace CppJieba
|
16
|
+
{
|
17
|
+
using namespace Limonp;
|
18
|
+
// Per-state emission table: maps a 16-bit code unit to a score.
// Scores are combined additively in _viterbi (presumably log-probabilities -- confirm against the model file).
typedef unordered_map<uint16_t, double> EmitProbMap;
|
19
|
+
class HMMSegment: public SegmentBase
|
20
|
+
{
|
21
|
+
public:
|
22
|
+
/*
|
23
|
+
* STATUS:
|
24
|
+
* 0:B, 1:E, 2:M, 3:S
|
25
|
+
* */
|
26
|
+
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
27
|
+
private:
|
28
|
+
char _statMap[STATUS_SUM];
|
29
|
+
double _startProb[STATUS_SUM];
|
30
|
+
double _transProb[STATUS_SUM][STATUS_SUM];
|
31
|
+
EmitProbMap _emitProbB;
|
32
|
+
EmitProbMap _emitProbE;
|
33
|
+
EmitProbMap _emitProbM;
|
34
|
+
EmitProbMap _emitProbS;
|
35
|
+
vector<EmitProbMap* > _emitProbVec;
|
36
|
+
|
37
|
+
public:
|
38
|
+
HMMSegment(){}
|
39
|
+
explicit HMMSegment(const string& filePath)
|
40
|
+
{
|
41
|
+
LIMONP_CHECK(init(filePath));
|
42
|
+
}
|
43
|
+
virtual ~HMMSegment(){}
|
44
|
+
public:
|
45
|
+
bool init(const string& filePath)
|
46
|
+
{
|
47
|
+
memset(_startProb, 0, sizeof(_startProb));
|
48
|
+
memset(_transProb, 0, sizeof(_transProb));
|
49
|
+
_statMap[0] = 'B';
|
50
|
+
_statMap[1] = 'E';
|
51
|
+
_statMap[2] = 'M';
|
52
|
+
_statMap[3] = 'S';
|
53
|
+
_emitProbVec.push_back(&_emitProbB);
|
54
|
+
_emitProbVec.push_back(&_emitProbE);
|
55
|
+
_emitProbVec.push_back(&_emitProbM);
|
56
|
+
_emitProbVec.push_back(&_emitProbS);
|
57
|
+
LIMONP_CHECK(_loadModel(filePath.c_str()));
|
58
|
+
LogInfo("HMMSegment init(%s) ok.", filePath.c_str());
|
59
|
+
return true;
|
60
|
+
}
|
61
|
+
public:
|
62
|
+
using SegmentBase::cut;
|
63
|
+
public:
|
64
|
+
// Splits [begin, end) into words and appends them to res.
//
// Code units >= 0x80 are accumulated into runs and segmented by _cut.
// Each code unit < 0x80 starts a token that is, in order of preference:
// a run of ASCII letters, a number (see _numbersRule), or that single unit.
//
// Returns false as soon as _cut fails for one of the runs, true otherwise.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res)const
{
    Unicode::const_iterator pending = begin; // start of the run not yet handed to _cut
    Unicode::const_iterator cur = begin;
    while(cur != end)
    {
        if(!(*cur < 0x80))
        {
            // Still inside a non-ASCII run; keep extending it.
            ++cur;
            continue;
        }

        // Flush the non-ASCII run that precedes this ASCII unit, if any.
        if(pending != cur && !_cut(pending, cur, res))
        {
            return false;
        }

        // Try the rules in order; fall back to a single code unit.
        Unicode::const_iterator tokenEnd = _sequentialLetterRule(cur, end);
        if(tokenEnd == cur)
        {
            tokenEnd = _numbersRule(cur, end);
        }
        if(tokenEnd == cur)
        {
            ++tokenEnd;
        }

        res.push_back(Unicode(cur, tokenEnd));
        cur = tokenEnd;
        pending = tokenEnd;
    }

    // Flush the trailing non-ASCII run, if any.
    if(pending != cur && !_cut(pending, cur, res))
    {
        return false;
    }
    return true;
}
|
104
|
+
private:
|
105
|
+
// sequential letters rule
|
106
|
+
Unicode::const_iterator _sequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
107
|
+
{
|
108
|
+
Unicode::value_type x;
|
109
|
+
while(begin != end)
|
110
|
+
{
|
111
|
+
x = *begin;
|
112
|
+
if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z'))
|
113
|
+
{
|
114
|
+
begin ++;
|
115
|
+
}
|
116
|
+
else
|
117
|
+
{
|
118
|
+
break;
|
119
|
+
}
|
120
|
+
}
|
121
|
+
return begin;
|
122
|
+
}
|
123
|
+
//
|
124
|
+
Unicode::const_iterator _numbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const
|
125
|
+
{
|
126
|
+
Unicode::value_type x = *begin;
|
127
|
+
if('0' <= x && x <= '9')
|
128
|
+
{
|
129
|
+
begin ++;
|
130
|
+
}
|
131
|
+
else
|
132
|
+
{
|
133
|
+
return begin;
|
134
|
+
}
|
135
|
+
while(begin != end)
|
136
|
+
{
|
137
|
+
x = *begin;
|
138
|
+
if( ('0' <= x && x <= '9') || x == '.')
|
139
|
+
{
|
140
|
+
begin++;
|
141
|
+
}
|
142
|
+
else
|
143
|
+
{
|
144
|
+
break;
|
145
|
+
}
|
146
|
+
}
|
147
|
+
return begin;
|
148
|
+
}
|
149
|
+
// Segments [begin, end) using the state sequence produced by _viterbi and
// appends each word to res. A word ends at every position whose state is
// odd, i.e. E (1) or S (3).
// Returns false (after logging) when _viterbi fails.
bool _cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
    vector<size_t> states;
    if(!_viterbi(begin, end, states))
    {
        LogError("_viterbi failed.");
        return false;
    }

    Unicode::const_iterator wordStart = begin;
    for(size_t pos = 0; pos != states.size(); ++pos)
    {
        if((states[pos] & 1) == 0)
        {
            continue; // B or M: the current word goes on
        }
        const Unicode::const_iterator wordEnd = begin + pos + 1;
        res.push_back(Unicode(wordStart, wordEnd));
        wordStart = wordEnd;
    }
    return true;
}
|
171
|
+
public:
|
172
|
+
// Splits [begin, end) into words, encodes each one with TransCode::encode
// and appends the resulting strings to res.
// Returns false for an empty range or when the Unicode-level cut fails.
// An encode failure is logged; the corresponding slot in res is left as
// whatever TransCode::encode wrote into it, and true is still returned.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
    if(begin == end)
    {
        return false;
    }

    vector<Unicode> pieces;
    pieces.reserve(end - begin);
    if(!cut(begin, end, pieces))
    {
        return false;
    }

    // Grow res once, then encode each word directly into its slot.
    const size_t base = res.size();
    res.resize(base + pieces.size());
    for(size_t k = 0; k < pieces.size(); ++k)
    {
        if(!TransCode::encode(pieces[k], res[base + k]))
        {
            LogError("encode failed.");
        }
    }
    return true;
}
|
195
|
+
|
196
|
+
private:
|
197
|
+
// Computes the best-scoring state sequence for [begin, end).
//
// weight and path are flat X-by-Y tables indexed as [x + y * X], where
// X is the input length and Y the number of states. Scores are sums of
// start, transition and emission values; a code unit missing from an
// emission table scores MIN_DOUBLE.
//
// status: resized to X and filled with one state (B/E/M/S) per position.
// Returns false for an empty range or when the emission tables have not
// been set up by init(); true otherwise.
bool _viterbi(Unicode::const_iterator begin, Unicode::const_iterator end, vector<size_t>& status)const
{
    if(begin == end)
    {
        return false;
    }
    // Without init() _emitProbVec is empty and indexing it below would be
    // out of bounds.
    if(_emitProbVec.size() != STATUS_SUM)
    {
        LogError("_viterbi called before init.");
        return false;
    }

    const size_t Y = STATUS_SUM;
    const size_t X = end - begin;

    vector<int> path(X * Y);
    vector<double> weight(X * Y);

    // First position: start score plus emission, no predecessor.
    for(size_t y = 0; y < Y; y++)
    {
        weight[0 + y * X] = _startProb[y] + _getEmitProb(_emitProbVec[y], *begin, MIN_DOUBLE);
        path[0 + y * X] = -1;
    }

    // Remaining positions: keep the best predecessor for every state.
    for(size_t x = 1; x < X; x++)
    {
        for(size_t y = 0; y < Y; y++)
        {
            const size_t now = x + y * X;
            const double emitProb = _getEmitProb(_emitProbVec[y], *(begin + x), MIN_DOUBLE);
            weight[now] = MIN_DOUBLE;
            path[now] = E; // fallback predecessor when no candidate beats MIN_DOUBLE
            for(size_t preY = 0; preY < Y; preY++)
            {
                const size_t old = x - 1 + preY * X;
                const double candidate = weight[old] + _transProb[preY][y] + emitProb;
                if(candidate > weight[now])
                {
                    weight[now] = candidate;
                    path[now] = static_cast<int>(preY);
                }
            }
        }
    }

    // The sequence must finish in E or S; E wins ties.
    const double endE = weight[X - 1 + E * X];
    const double endS = weight[X - 1 + S * X];
    size_t stat = (endE >= endS) ? static_cast<size_t>(E) : static_cast<size_t>(S);

    // Walk the predecessor table backwards. The -1 marker stored for
    // position 0 is never read into stat.
    status.resize(X);
    for(size_t x = X; x-- > 0; )
    {
        status[x] = stat;
        if(x > 0)
        {
            stat = static_cast<size_t>(path[x + stat * X]);
        }
    }

    return true;
}
|
266
|
+
bool _loadModel(const char* const filePath)
|
267
|
+
{
|
268
|
+
LogDebug("loadModel [%s] start ...", filePath);
|
269
|
+
ifstream ifile(filePath);
|
270
|
+
string line;
|
271
|
+
vector<string> tmp;
|
272
|
+
vector<string> tmp2;
|
273
|
+
//load _startProb
|
274
|
+
if(!_getLine(ifile, line))
|
275
|
+
{
|
276
|
+
return false;
|
277
|
+
}
|
278
|
+
split(line, tmp, " ");
|
279
|
+
if(tmp.size() != STATUS_SUM)
|
280
|
+
{
|
281
|
+
LogError("start_p illegal");
|
282
|
+
return false;
|
283
|
+
}
|
284
|
+
for(size_t j = 0; j< tmp.size(); j++)
|
285
|
+
{
|
286
|
+
_startProb[j] = atof(tmp[j].c_str());
|
287
|
+
}
|
288
|
+
|
289
|
+
//load _transProb
|
290
|
+
for(size_t i = 0; i < STATUS_SUM; i++)
|
291
|
+
{
|
292
|
+
if(!_getLine(ifile, line))
|
293
|
+
{
|
294
|
+
return false;
|
295
|
+
}
|
296
|
+
split(line, tmp, " ");
|
297
|
+
if(tmp.size() != STATUS_SUM)
|
298
|
+
{
|
299
|
+
LogError("trans_p illegal");
|
300
|
+
return false;
|
301
|
+
}
|
302
|
+
for(size_t j =0; j < STATUS_SUM; j++)
|
303
|
+
{
|
304
|
+
_transProb[i][j] = atof(tmp[j].c_str());
|
305
|
+
}
|
306
|
+
}
|
307
|
+
|
308
|
+
//load _emitProbB
|
309
|
+
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbB))
|
310
|
+
{
|
311
|
+
return false;
|
312
|
+
}
|
313
|
+
|
314
|
+
//load _emitProbE
|
315
|
+
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbE))
|
316
|
+
{
|
317
|
+
return false;
|
318
|
+
}
|
319
|
+
|
320
|
+
//load _emitProbM
|
321
|
+
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbM))
|
322
|
+
{
|
323
|
+
return false;
|
324
|
+
}
|
325
|
+
|
326
|
+
//load _emitProbS
|
327
|
+
if(!_getLine(ifile, line) || !_loadEmitProb(line, _emitProbS))
|
328
|
+
{
|
329
|
+
return false;
|
330
|
+
}
|
331
|
+
|
332
|
+
LogDebug("loadModel [%s] end.", filePath);
|
333
|
+
|
334
|
+
return true;
|
335
|
+
}
|
336
|
+
// Reads the next meaningful line from ifile into line.
// Every line is trimmed; lines that are then empty or start with '#' are skipped.
// Returns false when the stream runs out before such a line is found.
bool _getLine(ifstream& ifile, string& line)
{
    while(getline(ifile, line))
    {
        trim(line);
        const bool skip = line.empty() || startsWith(line, "#");
        if(!skip)
        {
            return true;
        }
    }
    return false;
}
|
353
|
+
bool _loadEmitProb(const string& line, EmitProbMap& mp)
|
354
|
+
{
|
355
|
+
if(line.empty())
|
356
|
+
{
|
357
|
+
return false;
|
358
|
+
}
|
359
|
+
vector<string> tmp, tmp2;
|
360
|
+
Unicode unicode;
|
361
|
+
split(line, tmp, ",");
|
362
|
+
for(size_t i = 0; i < tmp.size(); i++)
|
363
|
+
{
|
364
|
+
split(tmp[i], tmp2, ":");
|
365
|
+
if(2 != tmp2.size())
|
366
|
+
{
|
367
|
+
LogError("_emitProb illegal.");
|
368
|
+
return false;
|
369
|
+
}
|
370
|
+
if(!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1)
|
371
|
+
{
|
372
|
+
LogError("TransCode failed.");
|
373
|
+
return false;
|
374
|
+
}
|
375
|
+
mp[unicode[0]] = atof(tmp2[1].c_str());
|
376
|
+
}
|
377
|
+
return true;
|
378
|
+
}
|
379
|
+
double _getEmitProb(const EmitProbMap* ptMp, uint16_t key, double defVal)const
|
380
|
+
{
|
381
|
+
EmitProbMap::const_iterator cit = ptMp->find(key);
|
382
|
+
if(cit == ptMp->end())
|
383
|
+
{
|
384
|
+
return defVal;
|
385
|
+
}
|
386
|
+
return cit->second;
|
387
|
+
|
388
|
+
}
|
389
|
+
|
390
|
+
|
391
|
+
};
|
392
|
+
}
|
393
|
+
|
394
|
+
#endif
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef CPPJIEBA_SEGMENTINTERFACE_H
|
2
|
+
#define CPPJIEBA_SEGMENTINTERFACE_H
|
3
|
+
|
4
|
+
|
5
|
+
namespace CppJieba
|
6
|
+
{
|
7
|
+
class ISegment
|
8
|
+
{
|
9
|
+
public:
|
10
|
+
virtual ~ISegment(){};
|
11
|
+
public:
|
12
|
+
virtual bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<string>& res) const = 0;
|
13
|
+
virtual bool cut(const string& str, vector<string>& res) const = 0;
|
14
|
+
};
|
15
|
+
}
|
16
|
+
|
17
|
+
#endif
|
@@ -0,0 +1,173 @@
|
|
1
|
+
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
2
|
+
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
3
|
+
|
4
|
+
#include "MixSegment.hpp"
|
5
|
+
#include <cmath>
|
6
|
+
#include <set>
|
7
|
+
|
8
|
+
namespace CppJieba
|
9
|
+
{
|
10
|
+
using namespace Limonp;
|
11
|
+
|
12
|
+
/*utf8*/
|
13
|
+
class KeywordExtractor
|
14
|
+
{
|
15
|
+
private:
|
16
|
+
MixSegment _segment;
|
17
|
+
private:
|
18
|
+
unordered_map<string, double> _idfMap;
|
19
|
+
double _idfAverage;
|
20
|
+
|
21
|
+
unordered_set<string> _stopWords;
|
22
|
+
public:
|
23
|
+
KeywordExtractor(){};
|
24
|
+
KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
25
|
+
{
|
26
|
+
LIMONP_CHECK(init(dictPath, hmmFilePath, idfPath, stopWordPath));
|
27
|
+
};
|
28
|
+
~KeywordExtractor(){};
|
29
|
+
|
30
|
+
public:
|
31
|
+
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
|
32
|
+
{
|
33
|
+
_loadIdfDict(idfPath);
|
34
|
+
_loadStopWordDict(stopWordPath);
|
35
|
+
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));
|
36
|
+
return true;
|
37
|
+
};
|
38
|
+
public:
|
39
|
+
|
40
|
+
bool extract(const string& str, vector<string>& keywords, size_t topN) const
|
41
|
+
{
|
42
|
+
vector<pair<string, double> > topWords;
|
43
|
+
if(!extract(str, topWords, topN))
|
44
|
+
{
|
45
|
+
return false;
|
46
|
+
}
|
47
|
+
for(size_t i = 0; i < topWords.size(); i++)
|
48
|
+
{
|
49
|
+
keywords.push_back(topWords[i].first);
|
50
|
+
}
|
51
|
+
return true;
|
52
|
+
}
|
53
|
+
|
54
|
+
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const
|
55
|
+
{
|
56
|
+
vector<string> words;
|
57
|
+
if(!_segment.cut(str, words))
|
58
|
+
{
|
59
|
+
LogError("segment cut(%s) failed.", str.c_str());
|
60
|
+
return false;
|
61
|
+
}
|
62
|
+
|
63
|
+
map<string, double> wordmap;
|
64
|
+
for(vector<string>::iterator iter = words.begin(); iter != words.end(); iter++)
|
65
|
+
{
|
66
|
+
if(_isSingleWord(*iter))
|
67
|
+
{
|
68
|
+
continue;
|
69
|
+
}
|
70
|
+
wordmap[*iter] += 1.0;
|
71
|
+
}
|
72
|
+
|
73
|
+
for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); )
|
74
|
+
{
|
75
|
+
if(_stopWords.end() != _stopWords.find(itr->first))
|
76
|
+
{
|
77
|
+
wordmap.erase(itr++);
|
78
|
+
continue;
|
79
|
+
}
|
80
|
+
|
81
|
+
unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first);
|
82
|
+
if(cit != _idfMap.end())
|
83
|
+
{
|
84
|
+
itr->second *= cit->second;
|
85
|
+
}
|
86
|
+
else
|
87
|
+
{
|
88
|
+
itr->second *= _idfAverage;
|
89
|
+
}
|
90
|
+
itr ++;
|
91
|
+
}
|
92
|
+
|
93
|
+
keywords.clear();
|
94
|
+
std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin()));
|
95
|
+
topN = min(topN, keywords.size());
|
96
|
+
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp);
|
97
|
+
keywords.resize(topN);
|
98
|
+
return true;
|
99
|
+
}
|
100
|
+
private:
|
101
|
+
void _loadIdfDict(const string& idfPath)
|
102
|
+
{
|
103
|
+
ifstream ifs(idfPath.c_str());
|
104
|
+
if(!ifs)
|
105
|
+
{
|
106
|
+
LogError("open %s failed.", idfPath.c_str());
|
107
|
+
assert(false);
|
108
|
+
}
|
109
|
+
string line ;
|
110
|
+
vector<string> buf;
|
111
|
+
double idf = 0.0;
|
112
|
+
double idfSum = 0.0;
|
113
|
+
size_t lineno = 0;
|
114
|
+
for(;getline(ifs, line); lineno++)
|
115
|
+
{
|
116
|
+
buf.clear();
|
117
|
+
if(line.empty())
|
118
|
+
{
|
119
|
+
LogError("line[%d] empty. skipped.", lineno);
|
120
|
+
continue;
|
121
|
+
}
|
122
|
+
if(!split(line, buf, " ") || buf.size() != 2)
|
123
|
+
{
|
124
|
+
LogError("line %d [%s] illegal. skipped.", lineno, line.c_str());
|
125
|
+
continue;
|
126
|
+
}
|
127
|
+
idf = atof(buf[1].c_str());
|
128
|
+
_idfMap[buf[0]] = idf;
|
129
|
+
idfSum += idf;
|
130
|
+
|
131
|
+
}
|
132
|
+
|
133
|
+
assert(lineno);
|
134
|
+
_idfAverage = idfSum / lineno;
|
135
|
+
assert(_idfAverage > 0.0);
|
136
|
+
}
|
137
|
+
void _loadStopWordDict(const string& filePath)
|
138
|
+
{
|
139
|
+
ifstream ifs(filePath.c_str());
|
140
|
+
if(!ifs)
|
141
|
+
{
|
142
|
+
LogError("open %s failed.", filePath.c_str());
|
143
|
+
assert(false);
|
144
|
+
}
|
145
|
+
string line ;
|
146
|
+
while(getline(ifs, line))
|
147
|
+
{
|
148
|
+
_stopWords.insert(line);
|
149
|
+
}
|
150
|
+
assert(_stopWords.size());
|
151
|
+
}
|
152
|
+
private:
|
153
|
+
bool _isSingleWord(const string& str) const
|
154
|
+
{
|
155
|
+
Unicode unicode;
|
156
|
+
TransCode::decode(str, unicode);
|
157
|
+
if(unicode.size() == 1)
|
158
|
+
return true;
|
159
|
+
return false;
|
160
|
+
}
|
161
|
+
|
162
|
+
private:
|
163
|
+
static bool _cmp(const pair<string, double>& lhs, const pair<string, double>& rhs)
|
164
|
+
{
|
165
|
+
return lhs.second > rhs.second;
|
166
|
+
}
|
167
|
+
|
168
|
+
};
|
169
|
+
}
|
170
|
+
|
171
|
+
#endif
|
172
|
+
|
173
|
+
|
@@ -0,0 +1,84 @@
|
|
1
|
+
/************************************
|
2
|
+
* file enc : ascii
|
3
|
+
* author : wuyanyi09@gmail.com
|
4
|
+
************************************/
|
5
|
+
|
6
|
+
#ifndef LIMONP_ARGV_FUNCTS_H
|
7
|
+
#define LIMONP_ARGV_FUNCTS_H
|
8
|
+
|
9
|
+
#include <set>
|
10
|
+
#include <sstream>
|
11
|
+
#include "StringUtil.hpp"
|
12
|
+
|
13
|
+
namespace Limonp
|
14
|
+
{
|
15
|
+
using namespace std;
|
16
|
+
class ArgvContext
|
17
|
+
{
|
18
|
+
public :
|
19
|
+
ArgvContext(int argc, const char* const * argv)
|
20
|
+
{
|
21
|
+
|
22
|
+
for(int i = 0; i < argc; i++)
|
23
|
+
{
|
24
|
+
if(startsWith(argv[i], "-"))
|
25
|
+
{
|
26
|
+
if(i + 1 < argc && !startsWith(argv[i + 1], "-"))
|
27
|
+
{
|
28
|
+
mpss_[argv[i]] = argv[i+1];
|
29
|
+
i++;
|
30
|
+
}
|
31
|
+
else
|
32
|
+
{
|
33
|
+
sset_.insert(argv[i]);
|
34
|
+
}
|
35
|
+
}
|
36
|
+
else
|
37
|
+
{
|
38
|
+
args_.push_back(argv[i]);
|
39
|
+
}
|
40
|
+
}
|
41
|
+
}
|
42
|
+
~ArgvContext(){};
|
43
|
+
public:
|
44
|
+
friend ostream& operator << (ostream& os, const ArgvContext& args);
|
45
|
+
string operator [](size_t i) const
|
46
|
+
{
|
47
|
+
if(i < args_.size())
|
48
|
+
{
|
49
|
+
return args_[i];
|
50
|
+
}
|
51
|
+
return "";
|
52
|
+
}
|
53
|
+
string operator [](const string& key) const
|
54
|
+
{
|
55
|
+
map<string, string>::const_iterator it = mpss_.find(key);
|
56
|
+
if(it != mpss_.end())
|
57
|
+
{
|
58
|
+
return it->second;
|
59
|
+
}
|
60
|
+
return "";
|
61
|
+
}
|
62
|
+
public:
|
63
|
+
bool hasKey(const string& key) const
|
64
|
+
{
|
65
|
+
if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end())
|
66
|
+
{
|
67
|
+
return true;
|
68
|
+
}
|
69
|
+
return false;
|
70
|
+
}
|
71
|
+
private:
|
72
|
+
vector<string> args_;
|
73
|
+
map<string, string> mpss_;
|
74
|
+
set<string> sset_;
|
75
|
+
|
76
|
+
};
|
77
|
+
|
78
|
+
// Streams the three argument groups of args to os, in the order
// plain arguments, key/value pairs, bare options.
// NOTE(review): relies on container operator<< overloads declared elsewhere;
// assumes each returns the stream it was given -- confirm.
inline ostream& operator << (ostream& os, const ArgvContext& args)
{
    os << args.args_;
    os << args.mpss_;
    os << args.sset_;
    return os;
}
|
82
|
+
}
|
83
|
+
|
84
|
+
#endif
|