cppjieba_rb 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +26 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +81 -0
  8. data/Rakefile +20 -0
  9. data/cppjieba_rb.gemspec +50 -0
  10. data/ext/cppjieba/.gitignore +17 -0
  11. data/ext/cppjieba/.travis.yml +22 -0
  12. data/ext/cppjieba/CMakeLists.txt +28 -0
  13. data/ext/cppjieba/ChangeLog.md +236 -0
  14. data/ext/cppjieba/README.md +285 -0
  15. data/ext/cppjieba/README_EN.md +111 -0
  16. data/ext/cppjieba/appveyor.yml +32 -0
  17. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  18. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  42. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  45. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  46. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  56. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  57. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  58. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  59. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  60. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  61. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  62. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  63. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  64. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  65. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  66. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  67. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  68. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  69. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  70. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  71. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  72. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  73. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  74. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  75. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  76. data/ext/cppjieba/dict/README.md +31 -0
  77. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  78. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  79. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  80. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  83. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  84. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  85. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  86. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  87. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  88. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  89. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  90. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  91. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  92. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  93. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  94. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  95. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  96. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  98. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  99. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  100. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  101. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  102. data/ext/cppjieba/test/CMakeLists.txt +5 -0
  103. data/ext/cppjieba/test/demo.cpp +80 -0
  104. data/ext/cppjieba/test/load_test.cpp +54 -0
  105. data/ext/cppjieba/test/testdata/curl.res +1 -0
  106. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  107. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  108. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  109. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  110. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  111. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  112. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  113. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  114. data/ext/cppjieba/test/testdata/review.100 +100 -0
  115. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  116. data/ext/cppjieba/test/testdata/server.conf +19 -0
  117. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  118. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  119. data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  120. data/ext/cppjieba/test/testdata/userdict.english +2 -0
  121. data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
  122. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  123. data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
  124. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  125. data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
  126. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  127. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  128. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  129. data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
  130. data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
  131. data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
  132. data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
  133. data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
  134. data/ext/cppjieba_rb/extconf.rb +26 -0
  135. data/ext/cppjieba_rb/internal.cc +148 -0
  136. data/lib/cppjieba_rb/segment.rb +20 -0
  137. data/lib/cppjieba_rb/version.rb +3 -0
  138. data/lib/cppjieba_rb.rb +34 -0
  139. data/test/test_keyword.rb +17 -0
  140. data/test/test_segment.rb +24 -0
  141. data/test/test_tagging.rb +19 -0
  142. metadata +244 -0
@@ -0,0 +1,215 @@
1
+ #ifndef CPPJIEBA_UNICODE_H
2
+ #define CPPJIEBA_UNICODE_H
3
+
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string>
7
+ #include <vector>
8
+ #include <ostream>
9
+ #include "limonp/LocalVector.hpp"
10
+
11
+ namespace cppjieba {
12
+
13
+ using std::string;
14
+ using std::vector;
15
+
16
+ typedef uint32_t Rune;
17
+
18
// A token together with the offset recorded for it. GetWordFromRunes fills
// `offset` with the position of the token's first byte in the source string.
struct Word {
  std::string word;   // token text
  uint32_t offset;    // offset associated with the token

  Word(const std::string& w, uint32_t o) : word(w), offset(o) {}
}; // struct Word
25
+
26
+ inline std::ostream& operator << (std::ostream& os, const Word& w) {
27
+ return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
28
+ }
29
+
30
+ struct RuneStr {
31
+ Rune rune;
32
+ uint32_t offset;
33
+ uint32_t len;
34
+ RuneStr(): rune(0), offset(0), len(0) {
35
+ }
36
+ RuneStr(Rune r, uint32_t o, uint32_t l)
37
+ : rune(r), offset(o), len(l) {
38
+ }
39
+ }; // struct RuneStr
40
+
41
+ inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
42
+ return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
43
+ }
44
+
45
+ typedef limonp::LocalVector<Rune> Unicode;
46
+ typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
47
+
48
+ // [left, right]
49
+ struct WordRange {
50
+ RuneStrArray::const_iterator left;
51
+ RuneStrArray::const_iterator right;
52
+ WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
53
+ : left(l), right(r) {
54
+ }
55
+ size_t Length() const {
56
+ return right - left + 1;
57
+ }
58
+ bool IsAllAscii() const {
59
+ for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
60
+ if (iter->rune >= 0x80) {
61
+ return false;
62
+ }
63
+ }
64
+ return true;
65
+ }
66
+ }; // struct WordRange
67
+
68
// Result of decoding one UTF-8 sequence: the code point and the number of
// bytes it occupied. len == 0 means nothing could be decoded.
struct RuneStrLite {
  uint32_t rune;
  uint32_t len;
  RuneStrLite(): rune(0), len(0) {
  }
  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
  }
}; // struct RuneStrLite

// Decodes the UTF-8 sequence that starts at str[0].
//
// str: input bytes; may be NULL.
// len: number of readable bytes at str.
//
// Returns the code point and its encoded length (1 to 4 bytes). Returns
// RuneStrLite(0, 0) when:
//   - str is NULL or len is 0;
//   - the first byte is not a valid lead byte (a stray continuation byte
//     0x80-0xBF, or 0xF8-0xFF);
//   - fewer than the required number of bytes are available;
//   - any trailing byte is not of the form 10xxxxxx.
//
// Overlong encodings, surrogate code points and values above U+10FFFF are
// not rejected here.
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
  if (str == NULL || len == 0) {
    return RuneStrLite(0, 0);
  }

  const uint8_t lead = static_cast<uint8_t>(str[0]);
  uint32_t rune = 0;
  uint32_t need = 0;
  if (lead < 0x80) {                    // 0xxxxxxx: 7 payload bits
    rune = lead;
    need = 1;
  } else if ((lead & 0xe0) == 0xc0) {   // 110xxxxx: 5 + 6 = 11 bits
    rune = lead & 0x1f;
    need = 2;
  } else if ((lead & 0xf0) == 0xe0) {   // 1110xxxx: 4 + 6 + 6 = 16 bits
    rune = lead & 0x0f;
    need = 3;
  } else if ((lead & 0xf8) == 0xf0) {   // 11110xxx: 3 + 6 + 6 + 6 = 21 bits
    rune = lead & 0x07;
    need = 4;
  } else {
    // 10xxxxxx is a continuation byte and cannot start a sequence;
    // 11111xxx is never valid.
    return RuneStrLite(0, 0);
  }

  if (len < need) {
    return RuneStrLite(0, 0);           // truncated sequence
  }

  for (uint32_t i = 1; i < need; ++i) {
    const uint8_t trail = static_cast<uint8_t>(str[i]);
    if ((trail & 0xc0) != 0x80) {
      // Not a continuation byte: fail instead of consuming it as payload.
      return RuneStrLite(0, 0);
    }
    rune = (rune << 6) | (trail & 0x3f);
  }
  return RuneStrLite(rune, need);
}
131
+
132
+ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
133
+ runes.clear();
134
+ runes.reserve(len / 2);
135
+ for (size_t i = 0; i < len;) {
136
+ RuneStrLite rp = DecodeRuneInString(s + i, len - i);
137
+ if (rp.len == 0) {
138
+ runes.clear();
139
+ return false;
140
+ }
141
+ RuneStr x(rp.rune, i, rp.len);
142
+ runes.push_back(x);
143
+ i += rp.len;
144
+ }
145
+ return true;
146
+ }
147
+
148
+ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
149
+ return DecodeRunesInString(s.c_str(), s.size(), runes);
150
+ }
151
+
152
+ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
153
+ unicode.clear();
154
+ RuneStrArray runes;
155
+ if (!DecodeRunesInString(s, len, runes)) {
156
+ return false;
157
+ }
158
+ unicode.reserve(runes.size());
159
+ for (size_t i = 0; i < runes.size(); i++) {
160
+ unicode.push_back(runes[i].rune);
161
+ }
162
+ return true;
163
+ }
164
+
165
+ inline bool IsSingleWord(const string& str) {
166
+ RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
167
+ return rp.len == str.size();
168
+ }
169
+
170
+ inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
171
+ return DecodeRunesInString(s.c_str(), s.size(), unicode);
172
+ }
173
+
174
+ inline Unicode DecodeRunesInString(const string& s) {
175
+ Unicode result;
176
+ DecodeRunesInString(s, result);
177
+ return result;
178
+ }
179
+
180
+
181
+ // [left, right]
182
+ inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
183
+ assert(right->offset >= left->offset);
184
+ uint32_t len = right->offset - left->offset + right->len;
185
+ return Word(s.substr(left->offset, len), left->offset);
186
+ }
187
+
188
+ inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
189
+ assert(right->offset >= left->offset);
190
+ uint32_t len = right->offset - left->offset + right->len;
191
+ return s.substr(left->offset, len);
192
+ }
193
+
194
+ inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
195
+ for (size_t i = 0; i < wrs.size(); i++) {
196
+ words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
197
+ }
198
+ }
199
+
200
+ inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
201
+ vector<Word> result;
202
+ GetWordsFromWordRanges(s, wrs, result);
203
+ return result;
204
+ }
205
+
206
+ inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
207
+ strs.resize(words.size());
208
+ for (size_t i = 0; i < words.size(); ++i) {
209
+ strs[i] = words[i].word;
210
+ }
211
+ }
212
+
213
+ } // namespace cppjieba
214
+
215
+ #endif // CPPJIEBA_UNICODE_H
@@ -0,0 +1,5 @@
1
# Place the executables built here in the project's binary directory.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})

# Stand-alone demo and load-test programs.
add_executable(demo demo.cpp)
add_executable(load_test load_test.cpp)

# Unit tests live in their own subdirectory.
add_subdirectory(unittest)
@@ -0,0 +1,80 @@
1
+ #include "cppjieba/Jieba.hpp"
2
+
3
+ using namespace std;
4
+
5
+ const char* const DICT_PATH = "../dict/jieba.dict.utf8";
6
+ const char* const HMM_PATH = "../dict/hmm_model.utf8";
7
+ const char* const USER_DICT_PATH = "../dict/user.dict.utf8";
8
+ const char* const IDF_PATH = "../dict/idf.utf8";
9
+ const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
10
+
11
+ int main(int argc, char** argv) {
12
+ cppjieba::Jieba jieba(DICT_PATH,
13
+ HMM_PATH,
14
+ USER_DICT_PATH,
15
+ IDF_PATH,
16
+ STOP_WORD_PATH);
17
+ vector<string> words;
18
+ vector<cppjieba::Word> jiebawords;
19
+ string s;
20
+ string result;
21
+
22
+ s = "他来到了网易杭研大厦";
23
+ cout << s << endl;
24
+ cout << "[demo] Cut With HMM" << endl;
25
+ jieba.Cut(s, words, true);
26
+ cout << limonp::Join(words.begin(), words.end(), "/") << endl;
27
+
28
+ cout << "[demo] Cut Without HMM " << endl;
29
+ jieba.Cut(s, words, false);
30
+ cout << limonp::Join(words.begin(), words.end(), "/") << endl;
31
+
32
+ s = "我来到北京清华大学";
33
+ cout << s << endl;
34
+ cout << "[demo] CutAll" << endl;
35
+ jieba.CutAll(s, words);
36
+ cout << limonp::Join(words.begin(), words.end(), "/") << endl;
37
+
38
+ s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
39
+ cout << s << endl;
40
+ cout << "[demo] CutForSearch" << endl;
41
+ jieba.CutForSearch(s, words);
42
+ cout << limonp::Join(words.begin(), words.end(), "/") << endl;
43
+
44
+ cout << "[demo] Insert User Word" << endl;
45
+ jieba.Cut("男默女泪", words);
46
+ cout << limonp::Join(words.begin(), words.end(), "/") << endl;
47
+ jieba.InsertUserWord("男默女泪");
48
+ jieba.Cut("男默女泪", words);
49
+ cout << limonp::Join(words.begin(), words.end(), "/") << endl;
50
+
51
+ cout << "[demo] CutForSearch Word With Offset" << endl;
52
+ jieba.CutForSearch(s, jiebawords, true);
53
+ cout << jiebawords << endl;
54
+
55
+ cout << "[demo] Lookup Tag for Single Token" << endl;
56
+ const int DemoTokenMaxLen = 32;
57
+ char DemoTokens[][DemoTokenMaxLen] = {"拖拉机", "CEO", "123", "。"};
58
+ vector<pair<string, string> > LookupTagres(sizeof(DemoTokens) / DemoTokenMaxLen);
59
+ vector<pair<string, string> >::iterator it;
60
+ for (it = LookupTagres.begin(); it != LookupTagres.end(); it++) {
61
+ it->first = DemoTokens[it - LookupTagres.begin()];
62
+ it->second = jieba.LookupTag(it->first);
63
+ }
64
+ cout << LookupTagres << endl;
65
+
66
+ cout << "[demo] Tagging" << endl;
67
+ vector<pair<string, string> > tagres;
68
+ s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
69
+ jieba.Tag(s, tagres);
70
+ cout << s << endl;
71
+ cout << tagres << endl;;
72
+
73
+ cout << "[demo] Keyword Extraction" << endl;
74
+ const size_t topk = 5;
75
+ vector<cppjieba::KeywordExtractor::Word> keywordres;
76
+ jieba.extractor.Extract(s, keywordres, topk);
77
+ cout << s << endl;
78
+ cout << keywordres << endl;
79
+ return EXIT_SUCCESS;
80
+ }
@@ -0,0 +1,54 @@
1
+ #include <iostream>
2
+ #include <ctime>
3
+ #include <fstream>
4
+ #include "cppjieba/MPSegment.hpp"
5
+ #include "cppjieba/HMMSegment.hpp"
6
+ #include "cppjieba/MixSegment.hpp"
7
+ #include "cppjieba/KeywordExtractor.hpp"
8
+ #include "limonp/Colors.hpp"
9
+
10
+ using namespace cppjieba;
11
+
12
+ void Cut(size_t times = 50) {
13
+ MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
14
+ vector<string> res;
15
+ string doc;
16
+ ifstream ifs("../test/testdata/weicheng.utf8");
17
+ assert(ifs);
18
+ doc << ifs;
19
+ long beginTime = clock();
20
+ for (size_t i = 0; i < times; i ++) {
21
+ printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
22
+ fflush(stdout);
23
+ res.clear();
24
+ seg.Cut(doc, res);
25
+ }
26
+ printf("\n");
27
+ long endTime = clock();
28
+ ColorPrintln(GREEN, "Cut: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
29
+ }
30
+
31
+ void Extract(size_t times = 400) {
32
+ KeywordExtractor Extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
33
+ vector<string> words;
34
+ string doc;
35
+ ifstream ifs("../test/testdata/review.100");
36
+ assert(ifs);
37
+ doc << ifs;
38
+ long beginTime = clock();
39
+ for (size_t i = 0; i < times; i ++) {
40
+ printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
41
+ fflush(stdout);
42
+ words.clear();
43
+ Extractor.Extract(doc, words, 5);
44
+ }
45
+ printf("\n");
46
+ long endTime = clock();
47
+ ColorPrintln(GREEN, "Extract: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
48
+ }
49
+
50
+ int main(int argc, char ** argv) {
51
+ Cut();
52
+ Extract();
53
+ return EXIT_SUCCESS;
54
+ }
@@ -0,0 +1 @@
1
+ ["南京市", "长江大桥"]