cppjieba_rb 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +26 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +81 -0
- data/Rakefile +20 -0
- data/cppjieba_rb.gemspec +50 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/cppjieba/test/CMakeLists.txt +5 -0
- data/ext/cppjieba/test/demo.cpp +80 -0
- data/ext/cppjieba/test/load_test.cpp +54 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +19 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- data/ext/cppjieba/test/testdata/userdict.english +2 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
- data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
- data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
- data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
- data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
- data/ext/cppjieba_rb/extconf.rb +26 -0
- data/ext/cppjieba_rb/internal.cc +148 -0
- data/lib/cppjieba_rb/segment.rb +20 -0
- data/lib/cppjieba_rb/version.rb +3 -0
- data/lib/cppjieba_rb.rb +34 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +24 -0
- data/test/test_tagging.rb +19 -0
- metadata +244 -0
@@ -0,0 +1,215 @@
|
|
1
|
+
#ifndef CPPJIEBA_UNICODE_H
|
2
|
+
#define CPPJIEBA_UNICODE_H
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
#include <string>
|
7
|
+
#include <vector>
|
8
|
+
#include <ostream>
|
9
|
+
#include "limonp/LocalVector.hpp"
|
10
|
+
|
11
|
+
namespace cppjieba {
|
12
|
+
|
13
|
+
using std::string;
|
14
|
+
using std::vector;
|
15
|
+
|
16
|
+
typedef uint32_t Rune;
|
17
|
+
|
18
|
+
struct Word {
|
19
|
+
string word;
|
20
|
+
uint32_t offset;
|
21
|
+
Word(const string& w, uint32_t o)
|
22
|
+
: word(w), offset(o) {
|
23
|
+
}
|
24
|
+
}; // struct Word
|
25
|
+
|
26
|
+
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
27
|
+
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
|
28
|
+
}
|
29
|
+
|
30
|
+
struct RuneStr {
|
31
|
+
Rune rune;
|
32
|
+
uint32_t offset;
|
33
|
+
uint32_t len;
|
34
|
+
RuneStr(): rune(0), offset(0), len(0) {
|
35
|
+
}
|
36
|
+
RuneStr(Rune r, uint32_t o, uint32_t l)
|
37
|
+
: rune(r), offset(o), len(l) {
|
38
|
+
}
|
39
|
+
}; // struct RuneStr
|
40
|
+
|
41
|
+
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
42
|
+
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
|
43
|
+
}
|
44
|
+
|
45
|
+
typedef limonp::LocalVector<Rune> Unicode;
|
46
|
+
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
|
47
|
+
|
48
|
+
// [left, right]
|
49
|
+
struct WordRange {
|
50
|
+
RuneStrArray::const_iterator left;
|
51
|
+
RuneStrArray::const_iterator right;
|
52
|
+
WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
|
53
|
+
: left(l), right(r) {
|
54
|
+
}
|
55
|
+
size_t Length() const {
|
56
|
+
return right - left + 1;
|
57
|
+
}
|
58
|
+
bool IsAllAscii() const {
|
59
|
+
for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
|
60
|
+
if (iter->rune >= 0x80) {
|
61
|
+
return false;
|
62
|
+
}
|
63
|
+
}
|
64
|
+
return true;
|
65
|
+
}
|
66
|
+
}; // struct WordRange
|
67
|
+
|
68
|
+
// Result of decoding a single UTF-8 sequence: the code point and the number
// of bytes it occupied. len == 0 means nothing could be decoded.
struct RuneStrLite {
  uint32_t rune;
  uint32_t len;
  RuneStrLite(): rune(0), len(0) {
  }
  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
  }
}; // struct RuneStrLite

// Decodes the UTF-8 sequence starting at str[0], reading at most `len` bytes.
//
// Returns the code point and its byte length, or {0, 0} when:
//   - str is NULL or len is 0,
//   - the first byte is not a valid lead byte (a stray continuation byte
//     0x80..0xBF, or 0xF8..0xFF),
//   - the sequence is truncated (fewer than the required bytes remain),
//   - a trailing byte does not have the 10xxxxxx continuation pattern.
//
// Overlong encodings and the surrogate range are not rejected here.
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
  RuneStrLite rp(0, 0);
  if (str == NULL || len == 0) {
    return rp;
  }

  const uint8_t lead = static_cast<uint8_t>(str[0]);
  uint32_t total = 0;  // bytes in the whole sequence
  uint32_t rune = 0;   // payload bits accumulated so far

  if (lead < 0x80) {           // 0xxxxxxx: 7 payload bits
    total = 1;
    rune = lead;
  } else if (lead < 0xc0) {    // 10xxxxxx: continuation byte cannot start a rune
    return rp;
  } else if (lead <= 0xdf) {   // 110xxxxx: 5 + 6 = 11 payload bits
    total = 2;
    rune = lead & 0x1f;
  } else if (lead <= 0xef) {   // 1110xxxx: 4 + 6 + 6 = 16 payload bits
    total = 3;
    rune = lead & 0x0f;
  } else if (lead <= 0xf7) {   // 11110xxx: 3 + 6 + 6 + 6 = 21 payload bits
    total = 4;
    rune = lead & 0x07;
  } else {                     // 0xf8..0xff never appear in UTF-8
    return rp;
  }

  if (len < total) {
    return rp;  // truncated sequence
  }

  for (uint32_t i = 1; i < total; ++i) {
    const uint8_t next = static_cast<uint8_t>(str[i]);
    if ((next & 0xc0) != 0x80) {
      return rp;  // not a continuation byte: malformed input
    }
    rune = (rune << 6) | (next & 0x3f);
  }

  rp.rune = rune;
  rp.len = total;
  return rp;
}
|
131
|
+
|
132
|
+
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
133
|
+
runes.clear();
|
134
|
+
runes.reserve(len / 2);
|
135
|
+
for (size_t i = 0; i < len;) {
|
136
|
+
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
137
|
+
if (rp.len == 0) {
|
138
|
+
runes.clear();
|
139
|
+
return false;
|
140
|
+
}
|
141
|
+
RuneStr x(rp.rune, i, rp.len);
|
142
|
+
runes.push_back(x);
|
143
|
+
i += rp.len;
|
144
|
+
}
|
145
|
+
return true;
|
146
|
+
}
|
147
|
+
|
148
|
+
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
|
149
|
+
return DecodeRunesInString(s.c_str(), s.size(), runes);
|
150
|
+
}
|
151
|
+
|
152
|
+
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
|
153
|
+
unicode.clear();
|
154
|
+
RuneStrArray runes;
|
155
|
+
if (!DecodeRunesInString(s, len, runes)) {
|
156
|
+
return false;
|
157
|
+
}
|
158
|
+
unicode.reserve(runes.size());
|
159
|
+
for (size_t i = 0; i < runes.size(); i++) {
|
160
|
+
unicode.push_back(runes[i].rune);
|
161
|
+
}
|
162
|
+
return true;
|
163
|
+
}
|
164
|
+
|
165
|
+
inline bool IsSingleWord(const string& str) {
|
166
|
+
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
|
167
|
+
return rp.len == str.size();
|
168
|
+
}
|
169
|
+
|
170
|
+
inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
|
171
|
+
return DecodeRunesInString(s.c_str(), s.size(), unicode);
|
172
|
+
}
|
173
|
+
|
174
|
+
// Decodes `s` and returns the rune values by value.
// The success flag of the underlying overload is discarded; when decoding
// fails that overload leaves its output empty, so an empty result is returned.
inline Unicode DecodeRunesInString(const string& s) {
  Unicode decoded;
  DecodeRunesInString(s, decoded);
  return decoded;
}
|
179
|
+
|
180
|
+
|
181
|
+
// [left, right]
|
182
|
+
// Builds a Word from the inclusive rune range [left, right] of `s`.
// The word text is the substring running from left's offset through the end
// of right's rune; the Word's offset is left's offset.
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  const uint32_t first = left->offset;
  assert(right->offset >= first);
  const uint32_t past = right->offset + right->len;  // one past the last byte
  return Word(s.substr(first, past - first), first);
}
|
187
|
+
|
188
|
+
// Returns the substring of `s` covered by the inclusive rune range
// [left, right]: from left's offset through the end of right's rune.
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  const uint32_t first = left->offset;
  assert(right->offset >= first);
  const uint32_t past = right->offset + right->len;  // one past the last byte
  return s.substr(first, past - first);
}
|
193
|
+
|
194
|
+
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
|
195
|
+
for (size_t i = 0; i < wrs.size(); i++) {
|
196
|
+
words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
|
197
|
+
}
|
198
|
+
}
|
199
|
+
|
200
|
+
// Returns the Words for every range in `wrs`, in order, as a new vector.
inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
  vector<Word> words;
  GetWordsFromWordRanges(s, wrs, words);
  return words;
}
|
205
|
+
|
206
|
+
inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
|
207
|
+
strs.resize(words.size());
|
208
|
+
for (size_t i = 0; i < words.size(); ++i) {
|
209
|
+
strs[i] = words[i].word;
|
210
|
+
}
|
211
|
+
}
|
212
|
+
|
213
|
+
} // namespace cppjieba
|
214
|
+
|
215
|
+
#endif // CPPJIEBA_UNICODE_H
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#include "cppjieba/Jieba.hpp"
|
2
|
+
|
3
|
+
using namespace std;
|
4
|
+
|
5
|
+
const char* const DICT_PATH = "../dict/jieba.dict.utf8";
|
6
|
+
const char* const HMM_PATH = "../dict/hmm_model.utf8";
|
7
|
+
const char* const USER_DICT_PATH = "../dict/user.dict.utf8";
|
8
|
+
const char* const IDF_PATH = "../dict/idf.utf8";
|
9
|
+
const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
|
10
|
+
|
11
|
+
int main(int argc, char** argv) {
|
12
|
+
cppjieba::Jieba jieba(DICT_PATH,
|
13
|
+
HMM_PATH,
|
14
|
+
USER_DICT_PATH,
|
15
|
+
IDF_PATH,
|
16
|
+
STOP_WORD_PATH);
|
17
|
+
vector<string> words;
|
18
|
+
vector<cppjieba::Word> jiebawords;
|
19
|
+
string s;
|
20
|
+
string result;
|
21
|
+
|
22
|
+
s = "他来到了网易杭研大厦";
|
23
|
+
cout << s << endl;
|
24
|
+
cout << "[demo] Cut With HMM" << endl;
|
25
|
+
jieba.Cut(s, words, true);
|
26
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
27
|
+
|
28
|
+
cout << "[demo] Cut Without HMM " << endl;
|
29
|
+
jieba.Cut(s, words, false);
|
30
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
31
|
+
|
32
|
+
s = "我来到北京清华大学";
|
33
|
+
cout << s << endl;
|
34
|
+
cout << "[demo] CutAll" << endl;
|
35
|
+
jieba.CutAll(s, words);
|
36
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
37
|
+
|
38
|
+
s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
39
|
+
cout << s << endl;
|
40
|
+
cout << "[demo] CutForSearch" << endl;
|
41
|
+
jieba.CutForSearch(s, words);
|
42
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
43
|
+
|
44
|
+
cout << "[demo] Insert User Word" << endl;
|
45
|
+
jieba.Cut("男默女泪", words);
|
46
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
47
|
+
jieba.InsertUserWord("男默女泪");
|
48
|
+
jieba.Cut("男默女泪", words);
|
49
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
50
|
+
|
51
|
+
cout << "[demo] CutForSearch Word With Offset" << endl;
|
52
|
+
jieba.CutForSearch(s, jiebawords, true);
|
53
|
+
cout << jiebawords << endl;
|
54
|
+
|
55
|
+
cout << "[demo] Lookup Tag for Single Token" << endl;
|
56
|
+
const int DemoTokenMaxLen = 32;
|
57
|
+
char DemoTokens[][DemoTokenMaxLen] = {"拖拉机", "CEO", "123", "。"};
|
58
|
+
vector<pair<string, string> > LookupTagres(sizeof(DemoTokens) / DemoTokenMaxLen);
|
59
|
+
vector<pair<string, string> >::iterator it;
|
60
|
+
for (it = LookupTagres.begin(); it != LookupTagres.end(); it++) {
|
61
|
+
it->first = DemoTokens[it - LookupTagres.begin()];
|
62
|
+
it->second = jieba.LookupTag(it->first);
|
63
|
+
}
|
64
|
+
cout << LookupTagres << endl;
|
65
|
+
|
66
|
+
cout << "[demo] Tagging" << endl;
|
67
|
+
vector<pair<string, string> > tagres;
|
68
|
+
s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
|
69
|
+
jieba.Tag(s, tagres);
|
70
|
+
cout << s << endl;
|
71
|
+
cout << tagres << endl;;
|
72
|
+
|
73
|
+
cout << "[demo] Keyword Extraction" << endl;
|
74
|
+
const size_t topk = 5;
|
75
|
+
vector<cppjieba::KeywordExtractor::Word> keywordres;
|
76
|
+
jieba.extractor.Extract(s, keywordres, topk);
|
77
|
+
cout << s << endl;
|
78
|
+
cout << keywordres << endl;
|
79
|
+
return EXIT_SUCCESS;
|
80
|
+
}
|
@@ -0,0 +1,54 @@
|
|
1
|
+
#include <iostream>
|
2
|
+
#include <ctime>
|
3
|
+
#include <fstream>
|
4
|
+
#include "cppjieba/MPSegment.hpp"
|
5
|
+
#include "cppjieba/HMMSegment.hpp"
|
6
|
+
#include "cppjieba/MixSegment.hpp"
|
7
|
+
#include "cppjieba/KeywordExtractor.hpp"
|
8
|
+
#include "limonp/Colors.hpp"
|
9
|
+
|
10
|
+
using namespace cppjieba;
|
11
|
+
|
12
|
+
void Cut(size_t times = 50) {
|
13
|
+
MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
14
|
+
vector<string> res;
|
15
|
+
string doc;
|
16
|
+
ifstream ifs("../test/testdata/weicheng.utf8");
|
17
|
+
assert(ifs);
|
18
|
+
doc << ifs;
|
19
|
+
long beginTime = clock();
|
20
|
+
for (size_t i = 0; i < times; i ++) {
|
21
|
+
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
22
|
+
fflush(stdout);
|
23
|
+
res.clear();
|
24
|
+
seg.Cut(doc, res);
|
25
|
+
}
|
26
|
+
printf("\n");
|
27
|
+
long endTime = clock();
|
28
|
+
ColorPrintln(GREEN, "Cut: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
|
29
|
+
}
|
30
|
+
|
31
|
+
void Extract(size_t times = 400) {
|
32
|
+
KeywordExtractor Extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
33
|
+
vector<string> words;
|
34
|
+
string doc;
|
35
|
+
ifstream ifs("../test/testdata/review.100");
|
36
|
+
assert(ifs);
|
37
|
+
doc << ifs;
|
38
|
+
long beginTime = clock();
|
39
|
+
for (size_t i = 0; i < times; i ++) {
|
40
|
+
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
41
|
+
fflush(stdout);
|
42
|
+
words.clear();
|
43
|
+
Extractor.Extract(doc, words, 5);
|
44
|
+
}
|
45
|
+
printf("\n");
|
46
|
+
long endTime = clock();
|
47
|
+
ColorPrintln(GREEN, "Extract: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
|
48
|
+
}
|
49
|
+
|
50
|
+
// Runs both load tests with their default iteration counts.
// Command-line arguments are accepted but not read.
int main(int argc, char ** argv) {
  Cut();      // segmentation benchmark
  Extract();  // keyword-extraction benchmark
  return EXIT_SUCCESS;
}
|
@@ -0,0 +1 @@
|
|
1
|
+
["南京市", "长江大桥"]
|