jieba_rb 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (145) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +51 -0
  8. data/Rakefile +11 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +27 -0
  12. data/ext/cppjieba/ChangeLog.md +81 -0
  13. data/ext/cppjieba/Dockerfile +11 -0
  14. data/ext/cppjieba/LICENSE +20 -0
  15. data/ext/cppjieba/README.md +359 -0
  16. data/ext/cppjieba/conf/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/conf/server.conf +16 -0
  18. data/ext/cppjieba/dict/CMakeLists.txt +1 -0
  19. data/ext/cppjieba/dict/README.md +31 -0
  20. data/ext/cppjieba/dict/extra_dict/jieba.dict.small.utf8 +109750 -0
  21. data/ext/cppjieba/dict/gbk_dict/hmm_model.gbk +34 -0
  22. data/ext/cppjieba/dict/gbk_dict/jieba.dict.gbk +348982 -0
  23. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  24. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  25. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  26. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  27. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  28. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  29. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  30. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  31. data/ext/cppjieba/dict/user.dict.utf8 +3 -0
  32. data/ext/cppjieba/script/CMakeLists.txt +1 -0
  33. data/ext/cppjieba/script/cjserver.start +12 -0
  34. data/ext/cppjieba/script/cjserver.stop +13 -0
  35. data/ext/cppjieba/server/CMakeLists.txt +9 -0
  36. data/ext/cppjieba/server/Husky/HttpReqInfo.hpp +294 -0
  37. data/ext/cppjieba/server/Husky/IRequestHandler.hpp +18 -0
  38. data/ext/cppjieba/server/Husky/ThreadPoolServer.hpp +108 -0
  39. data/ext/cppjieba/server/Husky/WorkerThread.hpp +133 -0
  40. data/ext/cppjieba/server/server.cpp +91 -0
  41. data/ext/cppjieba/src/DictTrie.hpp +211 -0
  42. data/ext/cppjieba/src/FullSegment.hpp +153 -0
  43. data/ext/cppjieba/src/HMMSegment.hpp +394 -0
  44. data/ext/cppjieba/src/ISegment.hpp +17 -0
  45. data/ext/cppjieba/src/KeywordExtractor.hpp +173 -0
  46. data/ext/cppjieba/src/Limonp/ArgvContext.hpp +84 -0
  47. data/ext/cppjieba/src/Limonp/BlockingQueue.hpp +128 -0
  48. data/ext/cppjieba/src/Limonp/BoundedQueue.hpp +73 -0
  49. data/ext/cppjieba/src/Limonp/CastFloat.hpp +90 -0
  50. data/ext/cppjieba/src/Limonp/Condition.hpp +48 -0
  51. data/ext/cppjieba/src/Limonp/Config.hpp +118 -0
  52. data/ext/cppjieba/src/Limonp/HandyMacro.hpp +31 -0
  53. data/ext/cppjieba/src/Limonp/InitOnOff.hpp +21 -0
  54. data/ext/cppjieba/src/Limonp/LocalVector.hpp +171 -0
  55. data/ext/cppjieba/src/Limonp/Logger.hpp +74 -0
  56. data/ext/cppjieba/src/Limonp/Md5.hpp +432 -0
  57. data/ext/cppjieba/src/Limonp/MutexLock.hpp +57 -0
  58. data/ext/cppjieba/src/Limonp/MysqlClient.hpp +125 -0
  59. data/ext/cppjieba/src/Limonp/NonCopyable.hpp +22 -0
  60. data/ext/cppjieba/src/Limonp/StdExtension.hpp +139 -0
  61. data/ext/cppjieba/src/Limonp/StringUtil.hpp +349 -0
  62. data/ext/cppjieba/src/Limonp/Thread.hpp +50 -0
  63. data/ext/cppjieba/src/Limonp/ThreadPool.hpp +105 -0
  64. data/ext/cppjieba/src/MPSegment.hpp +148 -0
  65. data/ext/cppjieba/src/MixSegment.hpp +121 -0
  66. data/ext/cppjieba/src/PosTagger.hpp +109 -0
  67. data/ext/cppjieba/src/QuerySegment.hpp +123 -0
  68. data/ext/cppjieba/src/SegmentBase.hpp +78 -0
  69. data/ext/cppjieba/src/TransCode.hpp +63 -0
  70. data/ext/cppjieba/src/Trie.hpp +298 -0
  71. data/ext/cppjieba/test/CMakeLists.txt +7 -0
  72. data/ext/cppjieba/test/keyword_demo.cpp +16 -0
  73. data/ext/cppjieba/test/load_test.cpp +56 -0
  74. data/ext/cppjieba/test/segment_demo.cpp +59 -0
  75. data/ext/cppjieba/test/servertest/go_load_test.sh +2 -0
  76. data/ext/cppjieba/test/servertest/load_test.py +91 -0
  77. data/ext/cppjieba/test/servertest/run_curl.sh +11 -0
  78. data/ext/cppjieba/test/tagging_demo.cpp +12 -0
  79. data/ext/cppjieba/test/testdata/curl.res +1 -0
  80. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  81. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  82. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  83. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  84. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  85. data/ext/cppjieba/test/testdata/review.100 +100 -0
  86. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  87. data/ext/cppjieba/test/testdata/server.conf +13 -0
  88. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  89. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  90. data/ext/cppjieba/test/testdata/userdict.utf8 +6 -0
  91. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  92. data/ext/cppjieba/test/unittest/CMakeLists.txt +28 -0
  93. data/ext/cppjieba/test/unittest/TKeywordExtractor.cpp +18 -0
  94. data/ext/cppjieba/test/unittest/TPosTagger.cpp +43 -0
  95. data/ext/cppjieba/test/unittest/TSegments.cpp +187 -0
  96. data/ext/cppjieba/test/unittest/TTrie.cpp +80 -0
  97. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-death-test.h +283 -0
  98. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-message.h +230 -0
  99. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h +1421 -0
  100. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h.pump +487 -0
  101. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-printers.h +796 -0
  102. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-spi.h +232 -0
  103. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-test-part.h +176 -0
  104. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-typed-test.h +259 -0
  105. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest.h +2155 -0
  106. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_pred_impl.h +358 -0
  107. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_prod.h +58 -0
  108. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-death-test-internal.h +308 -0
  109. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-filepath.h +210 -0
  110. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-internal.h +1226 -0
  111. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-linked_ptr.h +233 -0
  112. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  113. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  114. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util.h +619 -0
  115. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h +1788 -0
  116. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-string.h +350 -0
  117. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h +968 -0
  118. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h.pump +336 -0
  119. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h +3330 -0
  120. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h.pump +296 -0
  121. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/.dirstamp +0 -0
  122. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest-all.Plo +681 -0
  123. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest_main.Plo +509 -0
  124. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.dirstamp +0 -0
  125. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-all.cc +48 -0
  126. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-death-test.cc +1234 -0
  127. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-filepath.cc +380 -0
  128. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-internal-inl.h +1038 -0
  129. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-port.cc +746 -0
  130. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-printers.cc +356 -0
  131. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-test-part.cc +110 -0
  132. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-typed-test.cc +110 -0
  133. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc +4898 -0
  134. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc +39 -0
  135. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  136. data/ext/jieba/extconf.rb +26 -0
  137. data/ext/jieba/jieba.c +9 -0
  138. data/ext/jieba/jieba.h +9 -0
  139. data/ext/jieba/segment.cc +88 -0
  140. data/ext/jieba/segment.h +17 -0
  141. data/jieba_rb.gemspec +51 -0
  142. data/lib/jieba_rb/version.rb +3 -0
  143. data/lib/jieba_rb.rb +28 -0
  144. data/test/test_segment.rb +32 -0
  145. metadata +246 -0
@@ -0,0 +1,298 @@
1
+ #ifndef CPPJIEBA_TRIE_HPP
2
+ #define CPPJIEBA_TRIE_HPP
3
+
4
+ #include "Limonp/StdExtension.hpp"
5
+ #include <vector>
6
+ #include <queue>
7
+
8
+ namespace CppJieba
9
+ {
10
+ using namespace std;
11
+
12
+ struct DictUnit
13
+ {
14
+ Unicode word;
15
+ double weight;
16
+ string tag;
17
+ };
18
+
19
+ // for debugging
20
+ inline ostream & operator << (ostream& os, const DictUnit& unit)
21
+ {
22
+ string s;
23
+ s << unit.word;
24
+ return os << string_format("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
25
+ }
26
+
27
+ typedef LocalVector<std::pair<size_t, const DictUnit*> > DagType;
28
+
29
+ struct SegmentChar
30
+ {
31
+ uint16_t uniCh;
32
+ DagType dag;
33
+ const DictUnit * pInfo;
34
+ double weight;
35
+ size_t nextPos;
36
+ SegmentChar():uniCh(0), pInfo(NULL), weight(0.0), nextPos(0)
37
+ {}
38
+ ~SegmentChar()
39
+ {}
40
+ };
41
+
42
+ typedef Unicode::value_type TrieKey;
43
+
44
+ class TrieNode
45
+ {
46
+ public:
47
+ typedef unordered_map<TrieKey, TrieNode*> NextMap;
48
+ public:
49
+ TrieNode * fail;
50
+ NextMap * next;
51
+ const DictUnit * ptValue;
52
+ public:
53
+ TrieNode(): fail(NULL), next(NULL), ptValue(NULL)
54
+ {}
55
+ const TrieNode * findNext(TrieKey key) const
56
+ {
57
+ if(next == NULL)
58
+ {
59
+ return NULL;
60
+ }
61
+ typename NextMap::const_iterator iter = next->find(key);
62
+ if(iter == next->end())
63
+ {
64
+ return NULL;
65
+ }
66
+ return iter->second;
67
+ }
68
+ };
69
+
70
+ class Trie
71
+ {
72
+ private:
73
+ TrieNode* _root;
74
+ public:
75
+ Trie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
76
+ {
77
+ _root = new TrieNode;
78
+ _createTrie(keys, valuePointers);
79
+ _build();// build automation
80
+ }
81
+ ~Trie()
82
+ {
83
+ if(_root)
84
+ {
85
+ _deleteNode(_root);
86
+ }
87
+ }
88
+ public:
89
+ const DictUnit* find(typename Unicode::const_iterator begin, typename Unicode::const_iterator end) const
90
+ {
91
+ typename TrieNode::NextMap::const_iterator citer;
92
+ const TrieNode* ptNode = _root;
93
+ for(typename Unicode::const_iterator it = begin; it != end; it++)
94
+ {// build automation
95
+ assert(ptNode);
96
+ if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*it)))
97
+ {
98
+ return NULL;
99
+ }
100
+ ptNode = citer->second;
101
+ }
102
+ return ptNode->ptValue;
103
+ }
104
+ // Aho-Corasick automaton
105
+ void find(
106
+ typename Unicode::const_iterator begin,
107
+ typename Unicode::const_iterator end,
108
+ vector<struct SegmentChar>& res
109
+ ) const
110
+ {
111
+ res.resize(end - begin);
112
+ const TrieNode * now = _root;
113
+ //typename TrieNode::NextMap::const_iterator iter;
114
+ const TrieNode* node;
115
+ // compiler will complain warnings if only "i < end - begin" .
116
+ for (size_t i = 0; i < size_t(end - begin); i++)
117
+ {
118
+ Unicode::value_type ch = *(begin + i);
119
+ res[i].uniCh = ch;
120
+ assert(res[i].dag.empty());
121
+ res[i].dag.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(i, NULL));
122
+ bool flag = false;
123
+
124
+ // rollback
125
+ while( now != _root )
126
+ {
127
+ node = now->findNext(ch);
128
+ if (node != NULL)
129
+ {
130
+ flag = true;
131
+ break;
132
+ }
133
+ else
134
+ {
135
+ now = now->fail;
136
+ }
137
+ }
138
+
139
+ if(!flag)
140
+ {
141
+ node = now->findNext(ch);
142
+ }
143
+ if(node == NULL)
144
+ {
145
+ now = _root;
146
+ }
147
+ else
148
+ {
149
+ now = node;
150
+ const TrieNode * temp = now;
151
+ while(temp != _root)
152
+ {
153
+ if (temp->ptValue)
154
+ {
155
+ size_t pos = i - temp->ptValue->word.size() + 1;
156
+ res[pos].dag.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(i, temp->ptValue));
157
+ if(pos == i)
158
+ {
159
+ res[pos].dag[0].second = temp->ptValue;
160
+ }
161
+ }
162
+ temp = temp->fail;
163
+ assert(temp);
164
+ }
165
+ }
166
+ }
167
+ }
168
+ bool find(
169
+ typename Unicode::const_iterator begin,
170
+ typename Unicode::const_iterator end,
171
+ DagType & res,
172
+ size_t offset = 0) const
173
+ {
174
+ const TrieNode * ptNode = _root;
175
+ typename TrieNode::NextMap::const_iterator citer;
176
+ for(typename Unicode::const_iterator itr = begin; itr != end ; itr++)
177
+ {
178
+ assert(ptNode);
179
+ if(NULL == ptNode->next || ptNode->next->end() == (citer = ptNode->next->find(*itr)))
180
+ {
181
+ break;
182
+ }
183
+ ptNode = citer->second;
184
+ if(ptNode->ptValue)
185
+ {
186
+ if(itr == begin && res.size() == 1) // first singleword
187
+ {
188
+ res[0].second = ptNode->ptValue;
189
+ }
190
+ else
191
+ {
192
+ res.push_back(pair<typename vector<Unicode >::size_type, const DictUnit* >(itr - begin + offset, ptNode->ptValue));
193
+ }
194
+ }
195
+ }
196
+ return !res.empty();
197
+ }
198
+ private:
199
+ void _build()
200
+ {
201
+ queue<TrieNode*> que;
202
+ assert(_root->ptValue == NULL);
203
+ assert(_root->next);
204
+ _root->fail = NULL;
205
+ for(typename TrieNode::NextMap::iterator iter = _root->next->begin(); iter != _root->next->end(); iter++) {
206
+ iter->second->fail = _root;
207
+ que.push(iter->second);
208
+ }
209
+ TrieNode* back = NULL;
210
+ typename TrieNode::NextMap::iterator backiter;
211
+ while(!que.empty()) {
212
+ TrieNode * now = que.front();
213
+ que.pop();
214
+ if(now->next == NULL) {
215
+ continue;
216
+ }
217
+ for(typename TrieNode::NextMap::iterator iter = now->next->begin(); iter != now->next->end(); iter++) {
218
+ back = now->fail;
219
+ while(back != NULL) {
220
+ if(back->next && (backiter = back->next->find(iter->first)) != back->next->end())
221
+ {
222
+ iter->second->fail = backiter->second;
223
+ break;
224
+ }
225
+ back = back->fail;
226
+ }
227
+ if(back == NULL) {
228
+ iter->second->fail = _root;
229
+ }
230
+ que.push(iter->second);
231
+ }
232
+ }
233
+ }
234
+ private:
235
+ void _createTrie(const vector<Unicode>& keys, const vector<const DictUnit*> & valuePointers)
236
+ {
237
+ if(valuePointers.empty() || keys.empty())
238
+ {
239
+ return;
240
+ }
241
+ assert(keys.size() == valuePointers.size());
242
+
243
+ for(size_t i = 0; i < keys.size(); i++)
244
+ {
245
+ _insertNode(keys[i], valuePointers[i]);
246
+ }
247
+ }
248
+ private:
249
+ void _insertNode(const Unicode& key, const DictUnit* ptValue)
250
+ {
251
+ TrieNode* ptNode = _root;
252
+
253
+ typename TrieNode::NextMap::const_iterator kmIter;
254
+
255
+ for(typename Unicode::const_iterator citer = key.begin(); citer != key.end(); citer++)
256
+ {
257
+ if(NULL == ptNode->next)
258
+ {
259
+ ptNode->next = new typename TrieNode::NextMap;
260
+ }
261
+ kmIter = ptNode->next->find(*citer);
262
+ if(ptNode->next->end() == kmIter)
263
+ {
264
+ TrieNode * nextNode = new TrieNode;
265
+ nextNode->next = NULL;
266
+ nextNode->ptValue = NULL;
267
+
268
+ (*ptNode->next)[*citer] = nextNode;
269
+ ptNode = nextNode;
270
+ }
271
+ else
272
+ {
273
+ ptNode = kmIter->second;
274
+ }
275
+ }
276
+ ptNode->ptValue = ptValue;
277
+ }
278
+ void _deleteNode(TrieNode* node)
279
+ {
280
+ if(!node)
281
+ {
282
+ return;
283
+ }
284
+ if(node->next)
285
+ {
286
+ typename TrieNode::NextMap::iterator it;
287
+ for(it = node->next->begin(); it != node->next->end(); it++)
288
+ {
289
+ _deleteNode(it->second);
290
+ }
291
+ delete node->next;
292
+ }
293
+ delete node;
294
+ }
295
+ };
296
+ }
297
+
298
+ #endif
@@ -0,0 +1,7 @@
1
# Place the built executables directly in the top-level build directory.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})

# Demo programs and the load test: one executable per source file.
add_executable(segment.demo segment_demo.cpp)
add_executable(keyword.demo keyword_demo.cpp)
add_executable(tagging.demo tagging_demo.cpp)
add_executable(load_test load_test.cpp)

# The unit tests have their own CMakeLists.txt in unittest/.
add_subdirectory(unittest)
@@ -0,0 +1,16 @@
1
+ #include "../src/KeywordExtractor.hpp"
2
+ using namespace CppJieba;
3
+
4
+ int main(int argc, char ** argv)
5
+ {
6
+ KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
7
+ string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
8
+ vector<pair<string, double> > wordweights;
9
+ vector<string> words;
10
+ size_t topN = 5;
11
+ extractor.extract(s, wordweights, topN);
12
+ cout<< s << '\n' << wordweights << endl;
13
+ extractor.extract(s, words, topN);
14
+ cout<< s << '\n' << words << endl;
15
+ return EXIT_SUCCESS;
16
+ }
@@ -0,0 +1,56 @@
1
+ #include <iostream>
2
+ #include <ctime>
3
+ #include <fstream>
4
+ #include "../src/MPSegment.hpp"
5
+ #include "../src/HMMSegment.hpp"
6
+ #include "../src/MixSegment.hpp"
7
+ #include "../src/KeywordExtractor.hpp"
8
+
9
+ using namespace CppJieba;
10
+
11
+ void cut(size_t times = 50)
12
+ {
13
+ MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
14
+ vector<string> res;
15
+ string doc;
16
+ ifstream ifs("../test/testdata/weicheng.utf8");
17
+ assert(ifs);
18
+ doc << ifs;
19
+ long beginTime = clock();
20
+ for(size_t i = 0; i < times; i ++)
21
+ {
22
+ printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
23
+ fflush(stdout);
24
+ res.clear();
25
+ seg.cut(doc, res);
26
+ }
27
+ long endTime = clock();
28
+ printf("\ncut: [%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
29
+ }
30
+
31
+ void extract(size_t times = 400)
32
+ {
33
+ KeywordExtractor extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
34
+ vector<string> words;
35
+ string doc;
36
+ ifstream ifs("../test/testdata/review.100");
37
+ assert(ifs);
38
+ doc << ifs;
39
+ long beginTime = clock();
40
+ for(size_t i = 0; i < times; i ++)
41
+ {
42
+ printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
43
+ fflush(stdout);
44
+ words.clear();
45
+ extractor.extract(doc, words, 5);
46
+ }
47
+ long endTime = clock();
48
+ printf("\nextract: [%.3lf seconds]time consumed.\n", double(endTime - beginTime)/CLOCKS_PER_SEC);
49
+ }
50
+
51
+ int main(int argc, char ** argv)
52
+ {
53
+ cut();
54
+ extract();
55
+ return EXIT_SUCCESS;
56
+ }
@@ -0,0 +1,59 @@
1
+ #include <iostream>
2
+ #include <fstream>
3
+
4
+ #define LOGGER_LEVEL LL_WARN
5
+
6
+ #include "../src/MPSegment.hpp"
7
+ #include "../src/HMMSegment.hpp"
8
+ #include "../src/MixSegment.hpp"
9
+
10
+ using namespace CppJieba;
11
+
12
+ const char * const TEST_FILE = "../test/testdata/testlines.utf8";
13
+ const char * const JIEBA_DICT_FILE = "../dict/jieba.dict.utf8";
14
+ const char * const HMM_DICT_FILE = "../dict/hmm_model.utf8";
15
+ const char * const USER_DICT_FILE = "../dict/user.dict.utf8";
16
+
17
+ void cut(const ISegment& seg, const char * const filePath)
18
+ {
19
+ ifstream ifile(filePath);
20
+ vector<string> words;
21
+ string line;
22
+ string res;
23
+ while(getline(ifile, line))
24
+ {
25
+ if(!line.empty())
26
+ {
27
+ words.clear();
28
+ seg.cut(line, words);
29
+ join(words.begin(), words.end(), res, "/");
30
+ cout<< res <<endl;
31
+ }
32
+ }
33
+ }
34
+
35
+
36
+ int main(int argc, char ** argv)
37
+ {
38
+ {
39
+ printf("\e[32m%s\e[0m\n", "[demo] MPSegment"); // colorful
40
+ MPSegment seg(JIEBA_DICT_FILE);
41
+ cut(seg, TEST_FILE);
42
+ }
43
+ {
44
+ printf("\e[32m%s\e[0m\n", "[demo] HMMSegment"); // colorful
45
+ HMMSegment seg(HMM_DICT_FILE);
46
+ cut(seg, TEST_FILE);
47
+ }
48
+ {
49
+ printf("\e[32m%s\e[0m\n", "[demo] MixSegment"); // colorful
50
+ MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE);
51
+ cut(seg, TEST_FILE);
52
+ }
53
+ {
54
+ printf("\e[32m%s\e[0m\n", "[demo] MixSegment with UserDict"); // colorful
55
+ MixSegment seg(JIEBA_DICT_FILE, HMM_DICT_FILE, USER_DICT_FILE);
56
+ cut(seg, TEST_FILE);
57
+ }
58
+ return EXIT_SUCCESS;
59
+ }
@@ -0,0 +1,2 @@
1
+ # go get github.com/aszxqw/go_http_load
2
+ go_http_load -method=GET -get_urls="../test/testdata/load_test.urls" -loop_count=500 -goroutines=2
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/python
2
+ # coding:utf-8
3
+ import time
4
+ import urllib2
5
+ import threading
6
+ from Queue import Queue
7
+ from time import sleep
8
+ import sys
9
+
10
+ # 性能测试页面
11
+ #PERF_TEST_URL = "http://10.2.66.38/?yyid=-1&suv=1309231700203264&callback=xxxxx"
12
# Target URLs, one per line. Surrounding whitespace (including the trailing
# newline) is stripped and blank lines are skipped, and the file is closed as
# soon as it has been read.
with open("../test/testdata/load_test.urls", "r") as url_file:
    URLS = [line.strip() for line in url_file if line.strip()]
13
+
14
+ # 配置:压力测试
15
+ THREAD_NUM = 10 # 并发线程总数
16
+ ONE_WORKER_NUM = 500 # 每个线程的循环次数
17
+ LOOP_SLEEP = 0.01 # 每次请求时间间隔(秒)
18
+
19
+ # 配置:模拟运行状态
20
+ #THREAD_NUM = 10 # 并发线程总数
21
+ #ONE_WORKER_NUM = 10 # 每个线程的循环次数
22
+ #LOOP_SLEEP = 0 # 每次请求时间间隔(秒)
23
+
24
+
25
+ # 出错数
26
+ ERROR_NUM = 0
27
+
28
+
29
+ #具体的处理函数,负责处理单个任务
30
+ def doWork(index, url):
31
+ t = threading.currentThread()
32
+ #print "["+t.name+" "+str(index)+"] "+PERF_TEST_URL
33
+
34
+ try:
35
+ html = urllib2.urlopen(url).read()
36
+ except urllib2.URLError, e:
37
+ print "["+t.name+" "+str(index)+"] "
38
+ print e
39
+ global ERROR_NUM
40
+ ERROR_NUM += 1
41
+
42
+
43
+ #这个是工作进程,负责不断从队列取数据并处理
44
+ def working():
45
+ t = threading.currentThread()
46
+ print "["+t.name+"] Sub Thread Begin"
47
+
48
+ i = 0
49
+ while i < ONE_WORKER_NUM:
50
+ i += 1
51
+ doWork(i, URLS[i % len(URLS)])
52
+ sleep(LOOP_SLEEP)
53
+
54
+ print "["+t.name+"] Sub Thread End"
55
+
56
+
57
+ def main():
58
+ #doWork(0)
59
+ #return
60
+
61
+ t1 = time.time()
62
+
63
+ Threads = []
64
+
65
+ # 创建线程
66
+ for i in range(THREAD_NUM):
67
+ t = threading.Thread(target=working, name="T"+str(i))
68
+ t.setDaemon(True)
69
+ Threads.append(t)
70
+
71
+ for t in Threads:
72
+ t.start()
73
+
74
+ for t in Threads:
75
+ t.join()
76
+
77
+ print "main thread end"
78
+
79
+ t2 = time.time()
80
+ print "========================================"
81
+ #print "URL:", PERF_TEST_URL
82
+ print "任务数量:", THREAD_NUM, "*", ONE_WORKER_NUM, "=", THREAD_NUM*ONE_WORKER_NUM
83
+ print "总耗时(秒):", t2-t1
84
+ print "每次请求耗时(秒):", (t2-t1) / (THREAD_NUM*ONE_WORKER_NUM)
85
+ print "每秒承载请求数:", 1 / ((t2-t1) / (THREAD_NUM*ONE_WORKER_NUM))
86
+ print "错误数量:", ERROR_NUM
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
91
+
@@ -0,0 +1,11 @@
1
# Smoke test: query the server on 127.0.0.1:11200 and compare its reply with
# the expected output stored in the test data directory.
CURL_RES=../test/testdata/curl.res
TMP=curl.res.tmp

# Truncate (>) instead of appending (>>) so that a temp file left behind by an
# interrupted run cannot be mixed into this run's comparison.
curl -s "http://127.0.0.1:11200/?key=南京市长江大桥" > "$TMP"

if diff "$TMP" "$CURL_RES" > /dev/null
then
    echo "ok";
else
    echo "failed."
fi

rm -f "$TMP"
@@ -0,0 +1,12 @@
1
+ #include "../src/PosTagger.hpp"
2
+ using namespace CppJieba;
3
+
4
+ int main(int argc, char ** argv)
5
+ {
6
+ PosTagger tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
7
+ string s("我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。");
8
+ vector<pair<string, string> > res;
9
+ tagger.tag(s, res);
10
+ cout << res << endl;
11
+ return EXIT_SUCCESS;
12
+ }
@@ -0,0 +1 @@
1
+ ["南京市", "长江大桥"]
@@ -0,0 +1,93 @@
1
+ 龙鸣狮吼 3 nr
2
+ 龙齐诺 2 nr
3
+ 龙齿 3 n
4
+ 龚 176 nr
5
+ 龚世萍 2 nr
6
+ 龚书铎 2 nr
7
+ 龚二人 2 nr
8
+ 龚云甫 3 nr
9
+ 龚伟强 5 nr
10
+ 龚先生 4 nr
11
+ 龚光杰 44 nr
12
+ 龚古尔 24 nr
13
+ 龚子敬 2 nr
14
+ 龚孝升 12 nr
15
+ 龚学平 2 nr
16
+ 龚完敬 5 nr
17
+ 龚定庵 3 nr
18
+ 龚定敬 2 nr
19
+ 龚宝铨 5 nr
20
+ 龚家村 3 nr
21
+ 龚建国 29 nr
22
+ 龚德俊 6 nr
23
+ 龚心瀚 3 nr
24
+ 龚志国 2 nr
25
+ 龚意田 3 nr
26
+ 龚慈恩 3 nr
27
+ 龚施茜 3 nr
28
+ 龚晓犁 2 nr
29
+ 龚普洛 3 nr
30
+ 龚智超 7 nr
31
+ 龚松林 10 nr
32
+ 龚永明 3 nr
33
+ 龚永泉 5 nr
34
+ 龚泽艺 256 nr
35
+ 龚睿 8 nrfg
36
+ 龚祖同 2 nr
37
+ 龚秋婷 3 nr
38
+ 龚老爷 2 nr
39
+ 龚育之 19 nr
40
+ 龚自珍 28 nr
41
+ 龚蓓苾 3 nr
42
+ 龚虹嘉 3 nr
43
+ 龚诗嘉 3 nr
44
+ 龛 223 ng
45
+ 龜 2 zg
46
+ 龟 903 ns
47
+ 龟儿子 123 n
48
+ 龟兆 3 nz
49
+ 龟兹 215 ns
50
+ 龟兹王 3 nrt
51
+ 龟冷搘床 3 v
52
+ 龟冷支床 3 n
53
+ 龟卜 3 n
54
+ 龟厌不告 3 l
55
+ 龟壳 33 n
56
+ 龟壳花 3 n
57
+ 龟头 34 n
58
+ 龟头炎 3 n
59
+ 龟山 23 ns
60
+ 龟山乡 3 ns
61
+ 龟山岛 3 ns
62
+ 龟年鹤寿 3 ns
63
+ 龟年鹤算 3 l
64
+ 龟文 3 nz
65
+ 龟文写迹 3 n
66
+ 龟文鸟迹 3 n
67
+ 龟板 10 n
68
+ 龟毛免角 3 n
69
+ 龟毛兔角 3 n
70
+ 龟溪 3 ns
71
+ 龟玉 3 nz
72
+ 龟王 3 nz
73
+ 龟甲 92 ns
74
+ 龟甲胶 3 nz
75
+ 龟筮 3 n
76
+ 龟纹 3 n
77
+ 龟缩 29 v
78
+ 龟肉 3 n
79
+ 龟背 21 n
80
+ 龟背竹 3 n
81
+ 龟苓膏 3 n
82
+ 龟苗 3 n
83
+ 龟裂 34 v
84
+ 龟足 5 v
85
+ 龟鉴 2 n
86
+ 龟镜 3 nz
87
+ 龟鳖 3 n
88
+ 龟鹤遐寿 3 l
89
+ 龟龄鹤算 3 n
90
+ 龟龙片甲 3 nz
91
+ 龟龙麟凤 3 ns
92
+ 龠 5 g
93
+ 龢 732 zg