jieba_rb 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +51 -0
- data/Rakefile +11 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +27 -0
- data/ext/cppjieba/ChangeLog.md +81 -0
- data/ext/cppjieba/Dockerfile +11 -0
- data/ext/cppjieba/LICENSE +20 -0
- data/ext/cppjieba/README.md +359 -0
- data/ext/cppjieba/conf/CMakeLists.txt +1 -0
- data/ext/cppjieba/conf/server.conf +16 -0
- data/ext/cppjieba/dict/CMakeLists.txt +1 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/dict/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/dict/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +3 -0
- data/ext/cppjieba/script/CMakeLists.txt +1 -0
- data/ext/cppjieba/script/cjserver.start +12 -0
- data/ext/cppjieba/script/cjserver.stop +13 -0
- data/ext/cppjieba/server/CMakeLists.txt +9 -0
- data/ext/cppjieba/server/Husky/HttpReqInfo.hpp +294 -0
- data/ext/cppjieba/server/Husky/IRequestHandler.hpp +18 -0
- data/ext/cppjieba/server/Husky/ThreadPoolServer.hpp +108 -0
- data/ext/cppjieba/server/Husky/WorkerThread.hpp +133 -0
- data/ext/cppjieba/server/server.cpp +91 -0
- data/ext/cppjieba/src/DictTrie.hpp +211 -0
- data/ext/cppjieba/src/FullSegment.hpp +153 -0
- data/ext/cppjieba/src/HMMSegment.hpp +394 -0
- data/ext/cppjieba/src/ISegment.hpp +17 -0
- data/ext/cppjieba/src/KeywordExtractor.hpp +173 -0
- data/ext/cppjieba/src/Limonp/ArgvContext.hpp +84 -0
- data/ext/cppjieba/src/Limonp/BlockingQueue.hpp +128 -0
- data/ext/cppjieba/src/Limonp/BoundedQueue.hpp +73 -0
- data/ext/cppjieba/src/Limonp/CastFloat.hpp +90 -0
- data/ext/cppjieba/src/Limonp/Condition.hpp +48 -0
- data/ext/cppjieba/src/Limonp/Config.hpp +118 -0
- data/ext/cppjieba/src/Limonp/HandyMacro.hpp +31 -0
- data/ext/cppjieba/src/Limonp/InitOnOff.hpp +21 -0
- data/ext/cppjieba/src/Limonp/LocalVector.hpp +171 -0
- data/ext/cppjieba/src/Limonp/Logger.hpp +74 -0
- data/ext/cppjieba/src/Limonp/Md5.hpp +432 -0
- data/ext/cppjieba/src/Limonp/MutexLock.hpp +57 -0
- data/ext/cppjieba/src/Limonp/MysqlClient.hpp +125 -0
- data/ext/cppjieba/src/Limonp/NonCopyable.hpp +22 -0
- data/ext/cppjieba/src/Limonp/StdExtension.hpp +139 -0
- data/ext/cppjieba/src/Limonp/StringUtil.hpp +349 -0
- data/ext/cppjieba/src/Limonp/Thread.hpp +50 -0
- data/ext/cppjieba/src/Limonp/ThreadPool.hpp +105 -0
- data/ext/cppjieba/src/MPSegment.hpp +148 -0
- data/ext/cppjieba/src/MixSegment.hpp +121 -0
- data/ext/cppjieba/src/PosTagger.hpp +109 -0
- data/ext/cppjieba/src/QuerySegment.hpp +123 -0
- data/ext/cppjieba/src/SegmentBase.hpp +78 -0
- data/ext/cppjieba/src/TransCode.hpp +63 -0
- data/ext/cppjieba/src/Trie.hpp +298 -0
- data/ext/cppjieba/test/CMakeLists.txt +7 -0
- data/ext/cppjieba/test/keyword_demo.cpp +16 -0
- data/ext/cppjieba/test/load_test.cpp +56 -0
- data/ext/cppjieba/test/segment_demo.cpp +59 -0
- data/ext/cppjieba/test/servertest/go_load_test.sh +2 -0
- data/ext/cppjieba/test/servertest/load_test.py +91 -0
- data/ext/cppjieba/test/servertest/run_curl.sh +11 -0
- data/ext/cppjieba/test/tagging_demo.cpp +12 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +13 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +6 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +28 -0
- data/ext/cppjieba/test/unittest/TKeywordExtractor.cpp +18 -0
- data/ext/cppjieba/test/unittest/TPosTagger.cpp +43 -0
- data/ext/cppjieba/test/unittest/TSegments.cpp +187 -0
- data/ext/cppjieba/test/unittest/TTrie.cpp +80 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.dirstamp +0 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-all.cc +48 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-port.cc +746 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc +4898 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc +39 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/jieba/extconf.rb +26 -0
- data/ext/jieba/jieba.c +9 -0
- data/ext/jieba/jieba.h +9 -0
- data/ext/jieba/segment.cc +88 -0
- data/ext/jieba/segment.h +17 -0
- data/jieba_rb.gemspec +51 -0
- data/lib/jieba_rb/version.rb +3 -0
- data/lib/jieba_rb.rb +28 -0
- data/test/test_segment.rb +32 -0
- metadata +246 -0
@@ -0,0 +1,298 @@
|
|
1
|
+
#ifndef CPPJIEBA_TRIE_HPP
|
2
|
+
#define CPPJIEBA_TRIE_HPP
|
3
|
+
|
4
|
+
#include "Limonp/StdExtension.hpp"
|
5
|
+
#include <vector>
|
6
|
+
#include <queue>
|
7
|
+
|
8
|
+
namespace CppJieba
|
9
|
+
{
|
10
|
+
using namespace std;
|
11
|
+
|
12
|
+
struct DictUnit
|
13
|
+
{
|
14
|
+
Unicode word;
|
15
|
+
double weight;
|
16
|
+
string tag;
|
17
|
+
};
|
18
|
+
|
19
|
+
// for debugging
|
20
|
+
inline ostream & operator << (ostream& os, const DictUnit& unit)
|
21
|
+
{
|
22
|
+
string s;
|
23
|
+
s << unit.word;
|
24
|
+
return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
|
25
|
+
}
|
26
|
+
|
27
|
+
typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
|
28
|
+
|
29
|
+
struct SegmentChar
|
30
|
+
{
|
31
|
+
uint16_t uniCh;
|
32
|
+
DagType dag;
|
33
|
+
const DictUnit * pInfo;
|
34
|
+
double weight;
|
35
|
+
size_t nextPos;
|
36
|
+
SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
|
37
|
+
{}
|
38
|
+
~SegmentChar()
|
39
|
+
{}
|
40
|
+
};
|
41
|
+
|
42
|
+
typedef Unicode::value_type TrieKey;
|
43
|
+
|
44
|
+
class TrieNode
|
45
|
+
{
|
46
|
+
public:
|
47
|
+
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
48
|
+
public:
|
49
|
+
TrieNode * fail;
|
50
|
+
NextMap * next;
|
51
|
+
const DictUnit * ptValue;
|
52
|
+
public:
|
53
|
+
TrieNode(): fail(NULL), next(NULL), ptValue(NULL)
|
54
|
+
{}
|
55
|
+
const TrieNode * findNext(TrieKey key) const
|
56
|
+
{
|
57
|
+
if(next == NULL)
|
58
|
+
{
|
59
|
+
return NULL;
|
60
|
+
}
|
61
|
+
typename NextMap::const_iterator iter = next->find(key);
|
62
|
+
if(iter == next->end())
|
63
|
+
{
|
64
|
+
return NULL;
|
65
|
+
}
|
66
|
+
return iter->second;
|
67
|
+
}
|
68
|
+
};
|
69
|
+
|
70
|
+
class Trie
|
71
|
+
{
|
72
|
+
private:
|
73
|
+
TrieNode* _root;
|
74
|
+
public:
|
75
|
+
Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
|
76
|
+
{
|
77
|
+
_root = new TrieNode;
|
78
|
+
_createTrie(keys, valuePointers);
|
79
|
+
_build();// build automation
|
80
|
+
}
|
81
|
+
~Trie()
|
82
|
+
{
|
83
|
+
if(_root)
|
84
|
+
{
|
85
|
+
_deleteNode(_root);
|
86
|
+
}
|
87
|
+
}
|
88
|
+
public:
|
89
|
+
const DictUnit* find(typename Unicode::const_iterator begin, typename Unicode::const_iterator end) const
|
90
|
+
{
|
91
|
+
typename TrieNode::NextMap::const_iterator citer;
|
92
|
+
const TrieNode* ptNode = _root;
|
93
|
+
for(typename Unicode::const_iterator it = begin; it != end; it++)
|
94
|
+
{// build automation
|
95
|
+
assert(ptNode);
|
96
|
+
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it)))
|
97
|
+
{
|
98
|
+
return NULL;
|
99
|
+
}
|
100
|
+
ptNode = citer->second;
|
101
|
+
}
|
102
|
+
return ptNode->ptValue;
|
103
|
+
}
|
104
|
+
// aho-corasick-automation
|
105
|
+
void find(
|
106
|
+
typename Unicode::const_iterator begin,
|
107
|
+
typename Unicode::const_iterator end,
|
108
|
+
vector<struct SegmentChar>& res
|
109
|
+
) const
|
110
|
+
{
|
111
|
+
res.resize(end - begin);
|
112
|
+
const TrieNode * now = _root;
|
113
|
+
//typename TrieNode::NextMap::const_iterator iter;
|
114
|
+
const TrieNode* node;
|
115
|
+
// compiler will complain warnings if only "i < end - begin" .
|
116
|
+
for (size_t i = 0; i < size_t(end - begin); i++)
|
117
|
+
{
|
118
|
+
Unicode::value_type ch = *(begin + i);
|
119
|
+
res[i].uniCh = ch;
|
120
|
+
assert(res[i].dag.empty());
|
121
|
+
res[i].dag.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(i, NULL));
|
122
|
+
bool flag = false;
|
123
|
+
|
124
|
+
// rollback
|
125
|
+
while( now != _root )
|
126
|
+
{
|
127
|
+
node = now->findNext(ch);
|
128
|
+
if (node != NULL)
|
129
|
+
{
|
130
|
+
flag = true;
|
131
|
+
break;
|
132
|
+
}
|
133
|
+
else
|
134
|
+
{
|
135
|
+
now = now->fail;
|
136
|
+
}
|
137
|
+
}
|
138
|
+
|
139
|
+
if(!flag)
|
140
|
+
{
|
141
|
+
node = now->findNext(ch);
|
142
|
+
}
|
143
|
+
if(node == NULL)
|
144
|
+
{
|
145
|
+
now = _root;
|
146
|
+
}
|
147
|
+
else
|
148
|
+
{
|
149
|
+
now = node;
|
150
|
+
const TrieNode * temp = now;
|
151
|
+
while(temp != _root)
|
152
|
+
{
|
153
|
+
if (temp->ptValue)
|
154
|
+
{
|
155
|
+
size_t pos = i - temp->ptValue->word.size() + 1;
|
156
|
+
res[pos].dag.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
|
157
|
+
if(pos == i)
|
158
|
+
{
|
159
|
+
res[pos].dag[0].second = temp->ptValue;
|
160
|
+
}
|
161
|
+
}
|
162
|
+
temp = temp->fail;
|
163
|
+
assert(temp);
|
164
|
+
}
|
165
|
+
}
|
166
|
+
}
|
167
|
+
}
|
168
|
+
bool find(
|
169
|
+
typename Unicode::const_iterator begin,
|
170
|
+
typename Unicode::const_iterator end,
|
171
|
+
DagType & res,
|
172
|
+
size_t offset = 0) const
|
173
|
+
{
|
174
|
+
const TrieNode * ptNode = _root;
|
175
|
+
typename TrieNode::NextMap::const_iterator citer;
|
176
|
+
for(typename Unicode::const_iterator itr = begin; itr != end ; itr++)
|
177
|
+
{
|
178
|
+
assert(ptNode);
|
179
|
+
if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr)))
|
180
|
+
{
|
181
|
+
break;
|
182
|
+
}
|
183
|
+
ptNode = citer->second;
|
184
|
+
if(ptNode->ptValue)
|
185
|
+
{
|
186
|
+
if(itr == begin && res.size() == 1) // first singleword
|
187
|
+
{
|
188
|
+
res[0].second = ptNode->ptValue;
|
189
|
+
}
|
190
|
+
else
|
191
|
+
{
|
192
|
+
res.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue));
|
193
|
+
}
|
194
|
+
}
|
195
|
+
}
|
196
|
+
return !res.empty();
|
197
|
+
}
|
198
|
+
private:
|
199
|
+
void _build()
|
200
|
+
{
|
201
|
+
queue<TrieNode*> que;
|
202
|
+
assert(_root->ptValue == NULL);
|
203
|
+
assert(_root->next);
|
204
|
+
_root->fail = NULL;
|
205
|
+
for(typename TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
|
206
|
+
iter->second->fail = _root;
|
207
|
+
que.push(iter->second);
|
208
|
+
}
|
209
|
+
TrieNode* back = NULL;
|
210
|
+
typename TrieNode::NextMap::iterator backiter;
|
211
|
+
while(!que.empty()) {
|
212
|
+
TrieNode * now = que.front();
|
213
|
+
que.pop();
|
214
|
+
if(now->next == NULL) {
|
215
|
+
continue;
|
216
|
+
}
|
217
|
+
for(typename TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) {
|
218
|
+
back = now->fail;
|
219
|
+
while(back != NULL) {
|
220
|
+
if(back->next && (backiter = back->next->find(iter->first)) != back->next->end())
|
221
|
+
{
|
222
|
+
iter->second->fail = backiter->second;
|
223
|
+
break;
|
224
|
+
}
|
225
|
+
back = back->fail;
|
226
|
+
}
|
227
|
+
if(back == NULL) {
|
228
|
+
iter->second->fail = _root;
|
229
|
+
}
|
230
|
+
que.push(iter->second);
|
231
|
+
}
|
232
|
+
}
|
233
|
+
}
|
234
|
+
private:
|
235
|
+
void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
|
236
|
+
{
|
237
|
+
if(valuePointers.empty() || keys.empty())
|
238
|
+
{
|
239
|
+
return;
|
240
|
+
}
|
241
|
+
assert(keys.size() == valuePointers.size());
|
242
|
+
|
243
|
+
for(size_t i = 0; i < keys.size(); i++)
|
244
|
+
{
|
245
|
+
_insertNode(keys[i], valuePointers[i]);
|
246
|
+
}
|
247
|
+
}
|
248
|
+
private:
|
249
|
+
void _insertNode(const Unicode& key, const DictUnit* ptValue)
|
250
|
+
{
|
251
|
+
TrieNode* ptNode = _root;
|
252
|
+
|
253
|
+
typename TrieNode::NextMap::const_iterator kmIter;
|
254
|
+
|
255
|
+
for(typename Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++)
|
256
|
+
{
|
257
|
+
if(NULL == ptNode->next)
|
258
|
+
{
|
259
|
+
ptNode->next = new typename TrieNode::NextMap;
|
260
|
+
}
|
261
|
+
kmIter = ptNode->next->find(*citer);
|
262
|
+
if(ptNode->next->end() == kmIter)
|
263
|
+
{
|
264
|
+
TrieNode * nextNode = new TrieNode;
|
265
|
+
nextNode->next = NULL;
|
266
|
+
nextNode->ptValue = NULL;
|
267
|
+
|
268
|
+
(*ptNode->next)[*citer] = nextNode;
|
269
|
+
ptNode = nextNode;
|
270
|
+
}
|
271
|
+
else
|
272
|
+
{
|
273
|
+
ptNode = kmIter->second;
|
274
|
+
}
|
275
|
+
}
|
276
|
+
ptNode->ptValue = ptValue;
|
277
|
+
}
|
278
|
+
void _deleteNode(TrieNode* node)
|
279
|
+
{
|
280
|
+
if(!node)
|
281
|
+
{
|
282
|
+
return;
|
283
|
+
}
|
284
|
+
if(node->next)
|
285
|
+
{
|
286
|
+
typename TrieNode::NextMap::iterator it;
|
287
|
+
for(it = node->next->begin(); it != node->next->end(); it++)
|
288
|
+
{
|
289
|
+
_deleteNode(it->second);
|
290
|
+
}
|
291
|
+
delete node->next;
|
292
|
+
}
|
293
|
+
delete node;
|
294
|
+
}
|
295
|
+
};
|
296
|
+
}
|
297
|
+
|
298
|
+
#endif
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#include "../src/KeywordExtractor.hpp"
|
2
|
+
using namespace CppJieba;
|
3
|
+
|
4
|
+
int main(int argc, char ** argv)
|
5
|
+
{
|
6
|
+
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
7
|
+
string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
|
8
|
+
vector<pair<string, double> > wordweights;
|
9
|
+
vector<string> words;
|
10
|
+
size_t topN = 5;
|
11
|
+
extractor.extract(s, wordweights, topN);
|
12
|
+
cout<< s << '\n' << wordweights << endl;
|
13
|
+
extractor.extract(s, words, topN);
|
14
|
+
cout<< s << '\n' << words << endl;
|
15
|
+
return EXIT_SUCCESS;
|
16
|
+
}
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include "../src/MPSegment.hpp"
#include "../src/HMMSegment.hpp"
#include "../src/MixSegment.hpp"
#include "../src/KeywordExtractor.hpp"
|
8
|
+
|
9
|
+
using namespace CppJieba;
|
10
|
+
|
11
|
+
void cut(size_t times = 50)
|
12
|
+
{
|
13
|
+
MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
14
|
+
vector<string> res;
|
15
|
+
string doc;
|
16
|
+
ifstream ifs("../test/testdata/weicheng.utf8");
|
17
|
+
assert(ifs);
|
18
|
+
doc << ifs;
|
19
|
+
long beginTime = clock();
|
20
|
+
for(size_t i = 0; i < times; i ++)
|
21
|
+
{
|
22
|
+
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
23
|
+
fflush(stdout);
|
24
|
+
res.clear();
|
25
|
+
seg.cut(doc, res);
|
26
|
+
}
|
27
|
+
long endTime = clock();
|
28
|
+
printf("\ncut: [%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
|
29
|
+
}
|
30
|
+
|
31
|
+
void extract(size_t times = 400)
|
32
|
+
{
|
33
|
+
KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
34
|
+
vector<string> words;
|
35
|
+
string doc;
|
36
|
+
ifstream ifs("../test/testdata/review.100");
|
37
|
+
assert(ifs);
|
38
|
+
doc << ifs;
|
39
|
+
long beginTime = clock();
|
40
|
+
for(size_t i = 0; i < times; i ++)
|
41
|
+
{
|
42
|
+
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
43
|
+
fflush(stdout);
|
44
|
+
words.clear();
|
45
|
+
extractor.extract(doc, words, 5);
|
46
|
+
}
|
47
|
+
long endTime = clock();
|
48
|
+
printf("\nextract: [%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
|
49
|
+
}
|
50
|
+
|
51
|
+
int main(int argc, char ** argv)
|
52
|
+
{
|
53
|
+
cut();
|
54
|
+
extract();
|
55
|
+
return EXIT_SUCCESS;
|
56
|
+
}
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#include <iostream>
|
2
|
+
#include <fstream>
|
3
|
+
|
4
|
+
#define LOGGER_LEVEL LL_WARN
|
5
|
+
|
6
|
+
#include "../src/MPSegment.hpp"
|
7
|
+
#include "../src/HMMSegment.hpp"
|
8
|
+
#include "../src/MixSegment.hpp"
|
9
|
+
|
10
|
+
using namespace CppJieba;
|
11
|
+
|
12
|
+
const char * const TEST_FILE = "../test/testdata/testlines.utf8";
|
13
|
+
const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
|
14
|
+
const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
|
15
|
+
const char * const USER_DICT_FILE = "../dict/user.dict.utf8";
|
16
|
+
|
17
|
+
void cut(const ISegment& seg, const char * const filePath)
|
18
|
+
{
|
19
|
+
ifstream ifile(filePath);
|
20
|
+
vector<string> words;
|
21
|
+
string line;
|
22
|
+
string res;
|
23
|
+
while(getline(ifile, line))
|
24
|
+
{
|
25
|
+
if(!line.empty())
|
26
|
+
{
|
27
|
+
words.clear();
|
28
|
+
seg.cut(line, words);
|
29
|
+
join(words.begin(), words.end(), res, "/");
|
30
|
+
cout<< res <<endl;
|
31
|
+
}
|
32
|
+
}
|
33
|
+
}
|
34
|
+
|
35
|
+
|
36
|
+
int main(int argc, char ** argv)
|
37
|
+
{
|
38
|
+
{
|
39
|
+
printf("\e[32m%s\e[0m\n", "[demo] MPSegment"); // colorful
|
40
|
+
MPSegment seg(JIEBA_DICT_FILE);
|
41
|
+
cut(seg, TEST_FILE);
|
42
|
+
}
|
43
|
+
{
|
44
|
+
printf("\e[32m%s\e[0m\n", "[demo] HMMSegment"); // colorful
|
45
|
+
HMMSegment seg(HMM_DICT_FILE);
|
46
|
+
cut(seg, TEST_FILE);
|
47
|
+
}
|
48
|
+
{
|
49
|
+
printf("\e[32m%s\e[0m\n", "[demo] MixSegment"); // colorful
|
50
|
+
MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE);
|
51
|
+
cut(seg, TEST_FILE);
|
52
|
+
}
|
53
|
+
{
|
54
|
+
printf("\e[32m%s\e[0m\n", "[demo] MixSegment with UserDict"); // colorful
|
55
|
+
MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE, USER_DICT_FILE);
|
56
|
+
cut(seg, TEST_FILE);
|
57
|
+
}
|
58
|
+
return EXIT_SUCCESS;
|
59
|
+
}
|
@@ -0,0 +1,91 @@
|
|
1
|
+
#!/usr/bin/python
|
2
|
+
# coding:utf-8
|
3
|
+
import time
|
4
|
+
import urllib2
|
5
|
+
import threading
|
6
|
+
from Queue import Queue
|
7
|
+
from time import sleep
|
8
|
+
import sys
|
9
|
+
|
10
|
+
# Performance test page
#PERF_TEST_URL = "http://10.2.66.38/?yyid=-1&suv=1309231700203264&callback=xxxxx"

# Target URLs, one per line of the file. Surrounding whitespace (including the
# trailing newline) is removed and blank lines are skipped, so an empty line
# in the file cannot turn into a request for an empty URL. The file is closed
# as soon as it has been read.
with open("../test/testdata/load_test.urls", "r") as _url_file:
    URLS = [line.strip() for line in _url_file if line.strip()]

# Configuration: stress test
THREAD_NUM = 10        # total number of concurrent threads
ONE_WORKER_NUM = 500   # number of requests issued by each thread
LOOP_SLEEP = 0.01      # pause between requests (seconds)

# Configuration: simulate normal operation
#THREAD_NUM = 10       # total number of concurrent threads
#ONE_WORKER_NUM = 10   # number of requests issued by each thread
#LOOP_SLEEP = 0        # pause between requests (seconds)


# Number of failed requests
ERROR_NUM = 0
|
27
|
+
|
28
|
+
|
29
|
+
#具体的处理函数,负责处理单个任务
|
30
|
+
def doWork(index, url):
|
31
|
+
t = threading.currentThread()
|
32
|
+
#print "["+t.name+" "+str(index)+"] "+PERF_TEST_URL
|
33
|
+
|
34
|
+
try:
|
35
|
+
html = urllib2.urlopen(url).read()
|
36
|
+
except urllib2.URLError, e:
|
37
|
+
print "["+t.name+" "+str(index)+"] "
|
38
|
+
print e
|
39
|
+
global ERROR_NUM
|
40
|
+
ERROR_NUM += 1
|
41
|
+
|
42
|
+
|
43
|
+
#这个是工作进程,负责不断从队列取数据并处理
|
44
|
+
def working():
|
45
|
+
t = threading.currentThread()
|
46
|
+
print "["+t.name+"] Sub Thread Begin"
|
47
|
+
|
48
|
+
i = 0
|
49
|
+
while i < ONE_WORKER_NUM:
|
50
|
+
i += 1
|
51
|
+
doWork(i, URLS[i % len(URLS)])
|
52
|
+
sleep(LOOP_SLEEP)
|
53
|
+
|
54
|
+
print "["+t.name+"] Sub Thread End"
|
55
|
+
|
56
|
+
|
57
|
+
def main():
|
58
|
+
#doWork(0)
|
59
|
+
#return
|
60
|
+
|
61
|
+
t1 = time.time()
|
62
|
+
|
63
|
+
Threads = []
|
64
|
+
|
65
|
+
# 创建线程
|
66
|
+
for i in range(THREAD_NUM):
|
67
|
+
t = threading.Thread(target=working, name="T"+str(i))
|
68
|
+
t.setDaemon(True)
|
69
|
+
Threads.append(t)
|
70
|
+
|
71
|
+
for t in Threads:
|
72
|
+
t.start()
|
73
|
+
|
74
|
+
for t in Threads:
|
75
|
+
t.join()
|
76
|
+
|
77
|
+
print "main thread end"
|
78
|
+
|
79
|
+
t2 = time.time()
|
80
|
+
print "========================================"
|
81
|
+
#print "URL:", PERF_TEST_URL
|
82
|
+
print "任务数量:", THREAD_NUM, "*", ONE_WORKER_NUM, "=", THREAD_NUM*ONE_WORKER_NUM
|
83
|
+
print "总耗时(秒):", t2-t1
|
84
|
+
print "每次请求耗时(秒):", (t2-t1) / (THREAD_NUM*ONE_WORKER_NUM)
|
85
|
+
print "每秒承载请求数:", 1 / ((t2-t1) / (THREAD_NUM*ONE_WORKER_NUM))
|
86
|
+
print "错误数量:", ERROR_NUM
|
87
|
+
|
88
|
+
|
89
|
+
if __name__ == "__main__":
|
90
|
+
main()
|
91
|
+
|
@@ -0,0 +1,12 @@
|
|
1
|
+
#include "../src/PosTagger.hpp"
|
2
|
+
using namespace CppJieba;
|
3
|
+
|
4
|
+
int main(int argc, char ** argv)
|
5
|
+
{
|
6
|
+
PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
|
7
|
+
string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。");
|
8
|
+
vector<pair<string, string> > res;
|
9
|
+
tagger.tag(s, res);
|
10
|
+
cout << res << endl;
|
11
|
+
return EXIT_SUCCESS;
|
12
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
["南京市", "长江大桥"]
|
@@ -0,0 +1,93 @@
|
|
1
|
+
龙鸣狮吼 3 nr
|
2
|
+
龙齐诺 2 nr
|
3
|
+
龙齿 3 n
|
4
|
+
龚 176 nr
|
5
|
+
龚世萍 2 nr
|
6
|
+
龚书铎 2 nr
|
7
|
+
龚二人 2 nr
|
8
|
+
龚云甫 3 nr
|
9
|
+
龚伟强 5 nr
|
10
|
+
龚先生 4 nr
|
11
|
+
龚光杰 44 nr
|
12
|
+
龚古尔 24 nr
|
13
|
+
龚子敬 2 nr
|
14
|
+
龚孝升 12 nr
|
15
|
+
龚学平 2 nr
|
16
|
+
龚完敬 5 nr
|
17
|
+
龚定庵 3 nr
|
18
|
+
龚定敬 2 nr
|
19
|
+
龚宝铨 5 nr
|
20
|
+
龚家村 3 nr
|
21
|
+
龚建国 29 nr
|
22
|
+
龚德俊 6 nr
|
23
|
+
龚心瀚 3 nr
|
24
|
+
龚志国 2 nr
|
25
|
+
龚意田 3 nr
|
26
|
+
龚慈恩 3 nr
|
27
|
+
龚施茜 3 nr
|
28
|
+
龚晓犁 2 nr
|
29
|
+
龚普洛 3 nr
|
30
|
+
龚智超 7 nr
|
31
|
+
龚松林 10 nr
|
32
|
+
龚永明 3 nr
|
33
|
+
龚永泉 5 nr
|
34
|
+
龚泽艺 256 nr
|
35
|
+
龚睿 8 nrfg
|
36
|
+
龚祖同 2 nr
|
37
|
+
龚秋婷 3 nr
|
38
|
+
龚老爷 2 nr
|
39
|
+
龚育之 19 nr
|
40
|
+
龚自珍 28 nr
|
41
|
+
龚蓓苾 3 nr
|
42
|
+
龚虹嘉 3 nr
|
43
|
+
龚诗嘉 3 nr
|
44
|
+
龛 223 ng
|
45
|
+
龜 2 zg
|
46
|
+
龟 903 ns
|
47
|
+
龟儿子 123 n
|
48
|
+
龟兆 3 nz
|
49
|
+
龟兹 215 ns
|
50
|
+
龟兹王 3 nrt
|
51
|
+
龟冷搘床 3 v
|
52
|
+
龟冷支床 3 n
|
53
|
+
龟卜 3 n
|
54
|
+
龟厌不告 3 l
|
55
|
+
龟壳 33 n
|
56
|
+
龟壳花 3 n
|
57
|
+
龟头 34 n
|
58
|
+
龟头炎 3 n
|
59
|
+
龟山 23 ns
|
60
|
+
龟山乡 3 ns
|
61
|
+
龟山岛 3 ns
|
62
|
+
龟年鹤寿 3 ns
|
63
|
+
龟年鹤算 3 l
|
64
|
+
龟文 3 nz
|
65
|
+
龟文写迹 3 n
|
66
|
+
龟文鸟迹 3 n
|
67
|
+
龟板 10 n
|
68
|
+
龟毛免角 3 n
|
69
|
+
龟毛兔角 3 n
|
70
|
+
龟溪 3 ns
|
71
|
+
龟玉 3 nz
|
72
|
+
龟王 3 nz
|
73
|
+
龟甲 92 ns
|
74
|
+
龟甲胶 3 nz
|
75
|
+
龟筮 3 n
|
76
|
+
龟纹 3 n
|
77
|
+
龟缩 29 v
|
78
|
+
龟肉 3 n
|
79
|
+
龟背 21 n
|
80
|
+
龟背竹 3 n
|
81
|
+
龟苓膏 3 n
|
82
|
+
龟苗 3 n
|
83
|
+
龟裂 34 v
|
84
|
+
龟足 5 v
|
85
|
+
龟鉴 2 n
|
86
|
+
龟镜 3 nz
|
87
|
+
龟鳖 3 n
|
88
|
+
龟鹤遐寿 3 l
|
89
|
+
龟龄鹤算 3 n
|
90
|
+
龟龙片甲 3 nz
|
91
|
+
龟龙麟凤 3 ns
|
92
|
+
龠 5 g
|
93
|
+
龢 732 zg
|