cppjieba_rb 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +26 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +81 -0
- data/Rakefile +20 -0
- data/cppjieba_rb.gemspec +50 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/cppjieba/test/CMakeLists.txt +5 -0
- data/ext/cppjieba/test/demo.cpp +80 -0
- data/ext/cppjieba/test/load_test.cpp +54 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +19 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- data/ext/cppjieba/test/testdata/userdict.english +2 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
- data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
- data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
- data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
- data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
- data/ext/cppjieba_rb/extconf.rb +26 -0
- data/ext/cppjieba_rb/internal.cc +148 -0
- data/lib/cppjieba_rb/segment.rb +20 -0
- data/lib/cppjieba_rb/version.rb +3 -0
- data/lib/cppjieba_rb.rb +34 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +24 -0
- data/test/test_tagging.rb +19 -0
- metadata +244 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
#ifndef CPPJIEBA_UNICODE_H
|
|
2
|
+
#define CPPJIEBA_UNICODE_H
|
|
3
|
+
|
|
4
|
+
#include <stdint.h>
|
|
5
|
+
#include <stdlib.h>
|
|
6
|
+
#include <string>
|
|
7
|
+
#include <vector>
|
|
8
|
+
#include <ostream>
|
|
9
|
+
#include "limonp/LocalVector.hpp"
|
|
10
|
+
|
|
11
|
+
namespace cppjieba {
|
|
12
|
+
|
|
13
|
+
using std::string;
|
|
14
|
+
using std::vector;
|
|
15
|
+
|
|
16
|
+
// One decoded code point, stored as an unsigned 32-bit value.
typedef uint32_t Rune;
|
|
17
|
+
|
|
18
|
+
// A piece of text cut out of a source string, paired with the offset at
// which it starts in that source string.
struct Word {
  std::string word;  // the extracted text
  uint32_t offset;   // start position of `word` in the source string

  Word(const std::string& text, uint32_t position)
    : word(text),
      offset(position) {}
};  // struct Word
|
|
25
|
+
|
|
26
|
+
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
|
27
|
+
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
struct RuneStr {
|
|
31
|
+
Rune rune;
|
|
32
|
+
uint32_t offset;
|
|
33
|
+
uint32_t len;
|
|
34
|
+
RuneStr(): rune(0), offset(0), len(0) {
|
|
35
|
+
}
|
|
36
|
+
RuneStr(Rune r, uint32_t o, uint32_t l)
|
|
37
|
+
: rune(r), offset(o), len(l) {
|
|
38
|
+
}
|
|
39
|
+
}; // struct RuneStr
|
|
40
|
+
|
|
41
|
+
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
|
42
|
+
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Sequence of bare code points.
typedef limonp::LocalVector<Rune> Unicode;
// Sequence of code points that also remember their byte offset and length.
typedef limonp::LocalVector<RuneStr> RuneStrArray;
|
|
47
|
+
|
|
48
|
+
// [left, right]
|
|
49
|
+
struct WordRange {
|
|
50
|
+
RuneStrArray::const_iterator left;
|
|
51
|
+
RuneStrArray::const_iterator right;
|
|
52
|
+
WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
|
|
53
|
+
: left(l), right(r) {
|
|
54
|
+
}
|
|
55
|
+
size_t Length() const {
|
|
56
|
+
return right - left + 1;
|
|
57
|
+
}
|
|
58
|
+
bool IsAllAscii() const {
|
|
59
|
+
for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
|
|
60
|
+
if (iter->rune >= 0x80) {
|
|
61
|
+
return false;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
return true;
|
|
65
|
+
}
|
|
66
|
+
}; // struct WordRange
|
|
67
|
+
|
|
68
|
+
// Result of decoding a single UTF-8 sequence: the code point and the number
// of bytes it occupied. len == 0 means nothing could be decoded.
struct RuneStrLite {
  uint32_t rune;
  uint32_t len;
  RuneStrLite(): rune(0), len(0) {
  }
  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
  }
}; // struct RuneStrLite

// Decodes the UTF-8 sequence that starts at str[0].
//
// str: buffer to read from; may be NULL.
// len: number of readable bytes in str.
//
// Returns the code point and its encoded length (1 to 4 bytes). Returns
// {0, 0} when str is NULL, len is 0, the lead byte is not a valid sequence
// start (a stray continuation byte 0x80-0xBF, or 0xF8 and above), the
// sequence is cut short by the end of the buffer, or one of the trailing
// bytes is not of the form 10xxxxxx.
//
// Overlong encodings and surrogate code points are not rejected.
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
  const RuneStrLite failure(0, 0);
  if (str == NULL || len == 0) {
    return failure;
  }

  const uint8_t lead = static_cast<uint8_t>(str[0]);
  uint32_t rune = 0;
  uint32_t width = 0;
  if (lead < 0x80) {          // 0xxxxxxx: 7 payload bits
    rune = lead;
    width = 1;
  } else if (lead < 0xc0) {   // 10xxxxxx: continuation byte, cannot start a sequence
    return failure;
  } else if (lead <= 0xdf) {  // 110xxxxx: 5 + 6 = 11 payload bits
    rune = lead & 0x1f;
    width = 2;
  } else if (lead <= 0xef) {  // 1110xxxx: 4 + 6 + 6 = 16 payload bits
    rune = lead & 0x0f;
    width = 3;
  } else if (lead <= 0xf7) {  // 11110xxx: 3 + 6 + 6 + 6 = 21 payload bits
    rune = lead & 0x07;
    width = 4;
  } else {
    return failure;
  }

  // The whole sequence must fit in the buffer.
  if (len < width) {
    return failure;
  }

  for (uint32_t i = 1; i < width; ++i) {
    const uint8_t tail = static_cast<uint8_t>(str[i]);
    // Every trailing byte must be 10xxxxxx; otherwise an unrelated byte
    // (e.g. ASCII) would be folded into this code point.
    if ((tail & 0xc0) != 0x80) {
      return failure;
    }
    rune = (rune << 6) | (tail & 0x3f);
  }
  return RuneStrLite(rune, width);
}
|
|
131
|
+
|
|
132
|
+
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
|
133
|
+
runes.clear();
|
|
134
|
+
runes.reserve(len / 2);
|
|
135
|
+
for (size_t i = 0; i < len;) {
|
|
136
|
+
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
|
137
|
+
if (rp.len == 0) {
|
|
138
|
+
runes.clear();
|
|
139
|
+
return false;
|
|
140
|
+
}
|
|
141
|
+
RuneStr x(rp.rune, i, rp.len);
|
|
142
|
+
runes.push_back(x);
|
|
143
|
+
i += rp.len;
|
|
144
|
+
}
|
|
145
|
+
return true;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
|
|
149
|
+
return DecodeRunesInString(s.c_str(), s.size(), runes);
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
|
|
153
|
+
unicode.clear();
|
|
154
|
+
RuneStrArray runes;
|
|
155
|
+
if (!DecodeRunesInString(s, len, runes)) {
|
|
156
|
+
return false;
|
|
157
|
+
}
|
|
158
|
+
unicode.reserve(runes.size());
|
|
159
|
+
for (size_t i = 0; i < runes.size(); i++) {
|
|
160
|
+
unicode.push_back(runes[i].rune);
|
|
161
|
+
}
|
|
162
|
+
return true;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
inline bool IsSingleWord(const string& str) {
|
|
166
|
+
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
|
|
167
|
+
return rp.len == str.size();
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
|
|
171
|
+
return DecodeRunesInString(s.c_str(), s.size(), unicode);
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// Returns the code points of `s` by value. The success flag of the
// underlying decode is not reported; a failed decode gives an empty result.
inline Unicode DecodeRunesInString(const string& s) {
  Unicode codePoints;
  DecodeRunesInString(s, codePoints);
  return codePoints;
}
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
// [left, right]
|
|
182
|
+
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
|
|
183
|
+
assert(right->offset >= left->offset);
|
|
184
|
+
uint32_t len = right->offset - left->offset + right->len;
|
|
185
|
+
return Word(s.substr(left->offset, len), left->offset);
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
// Returns the substring of `s` covered by the closed rune range
// [left, right], i.e. from left's offset through the last byte of right.
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);
  const uint32_t start = left->offset;
  const uint32_t span = right->offset - start + right->len;
  return s.substr(start, span);
}
|
|
193
|
+
|
|
194
|
+
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
|
|
195
|
+
for (size_t i = 0; i < wrs.size(); i++) {
|
|
196
|
+
words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// Value-returning form: converts every range in `wrs` into a Word.
inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
  vector<Word> collected;
  GetWordsFromWordRanges(s, wrs, collected);
  return collected;
}
|
|
205
|
+
|
|
206
|
+
inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
|
|
207
|
+
strs.resize(words.size());
|
|
208
|
+
for (size_t i = 0; i < words.size(); ++i) {
|
|
209
|
+
strs[i] = words[i].word;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
} // namespace cppjieba
|
|
214
|
+
|
|
215
|
+
#endif // CPPJIEBA_UNICODE_H
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
#include "cppjieba/Jieba.hpp"
|
|
2
|
+
|
|
3
|
+
using namespace std;
|
|
4
|
+
|
|
5
|
+
// Dictionary and model files handed to cppjieba::Jieba by the demo.
// The paths are relative, so they resolve against the working directory.
const char* const DICT_PATH      = "../dict/jieba.dict.utf8";
const char* const HMM_PATH       = "../dict/hmm_model.utf8";
const char* const USER_DICT_PATH = "../dict/user.dict.utf8";
const char* const IDF_PATH       = "../dict/idf.utf8";
const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
|
|
10
|
+
|
|
11
|
+
int main(int argc, char** argv) {
|
|
12
|
+
cppjieba::Jieba jieba(DICT_PATH,
|
|
13
|
+
HMM_PATH,
|
|
14
|
+
USER_DICT_PATH,
|
|
15
|
+
IDF_PATH,
|
|
16
|
+
STOP_WORD_PATH);
|
|
17
|
+
vector<string> words;
|
|
18
|
+
vector<cppjieba::Word> jiebawords;
|
|
19
|
+
string s;
|
|
20
|
+
string result;
|
|
21
|
+
|
|
22
|
+
s = "他来到了网易杭研大厦";
|
|
23
|
+
cout << s << endl;
|
|
24
|
+
cout << "[demo] Cut With HMM" << endl;
|
|
25
|
+
jieba.Cut(s, words, true);
|
|
26
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
|
27
|
+
|
|
28
|
+
cout << "[demo] Cut Without HMM " << endl;
|
|
29
|
+
jieba.Cut(s, words, false);
|
|
30
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
|
31
|
+
|
|
32
|
+
s = "我来到北京清华大学";
|
|
33
|
+
cout << s << endl;
|
|
34
|
+
cout << "[demo] CutAll" << endl;
|
|
35
|
+
jieba.CutAll(s, words);
|
|
36
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
|
37
|
+
|
|
38
|
+
s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
|
|
39
|
+
cout << s << endl;
|
|
40
|
+
cout << "[demo] CutForSearch" << endl;
|
|
41
|
+
jieba.CutForSearch(s, words);
|
|
42
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
|
43
|
+
|
|
44
|
+
cout << "[demo] Insert User Word" << endl;
|
|
45
|
+
jieba.Cut("男默女泪", words);
|
|
46
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
|
47
|
+
jieba.InsertUserWord("男默女泪");
|
|
48
|
+
jieba.Cut("男默女泪", words);
|
|
49
|
+
cout << limonp::Join(words.begin(), words.end(), "/") << endl;
|
|
50
|
+
|
|
51
|
+
cout << "[demo] CutForSearch Word With Offset" << endl;
|
|
52
|
+
jieba.CutForSearch(s, jiebawords, true);
|
|
53
|
+
cout << jiebawords << endl;
|
|
54
|
+
|
|
55
|
+
cout << "[demo] Lookup Tag for Single Token" << endl;
|
|
56
|
+
const int DemoTokenMaxLen = 32;
|
|
57
|
+
char DemoTokens[][DemoTokenMaxLen] = {"拖拉机", "CEO", "123", "。"};
|
|
58
|
+
vector<pair<string, string> > LookupTagres(sizeof(DemoTokens) / DemoTokenMaxLen);
|
|
59
|
+
vector<pair<string, string> >::iterator it;
|
|
60
|
+
for (it = LookupTagres.begin(); it != LookupTagres.end(); it++) {
|
|
61
|
+
it->first = DemoTokens[it - LookupTagres.begin()];
|
|
62
|
+
it->second = jieba.LookupTag(it->first);
|
|
63
|
+
}
|
|
64
|
+
cout << LookupTagres << endl;
|
|
65
|
+
|
|
66
|
+
cout << "[demo] Tagging" << endl;
|
|
67
|
+
vector<pair<string, string> > tagres;
|
|
68
|
+
s = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";
|
|
69
|
+
jieba.Tag(s, tagres);
|
|
70
|
+
cout << s << endl;
|
|
71
|
+
cout << tagres << endl;;
|
|
72
|
+
|
|
73
|
+
cout << "[demo] Keyword Extraction" << endl;
|
|
74
|
+
const size_t topk = 5;
|
|
75
|
+
vector<cppjieba::KeywordExtractor::Word> keywordres;
|
|
76
|
+
jieba.extractor.Extract(s, keywordres, topk);
|
|
77
|
+
cout << s << endl;
|
|
78
|
+
cout << keywordres << endl;
|
|
79
|
+
return EXIT_SUCCESS;
|
|
80
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#include <iostream>
|
|
2
|
+
#include <ctime>
|
|
3
|
+
#include <fstream>
|
|
4
|
+
#include "cppjieba/MPSegment.hpp"
|
|
5
|
+
#include "cppjieba/HMMSegment.hpp"
|
|
6
|
+
#include "cppjieba/MixSegment.hpp"
|
|
7
|
+
#include "cppjieba/KeywordExtractor.hpp"
|
|
8
|
+
#include "limonp/Colors.hpp"
|
|
9
|
+
|
|
10
|
+
using namespace cppjieba;
|
|
11
|
+
|
|
12
|
+
void Cut(size_t times = 50) {
|
|
13
|
+
MixSegment seg("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
|
|
14
|
+
vector<string> res;
|
|
15
|
+
string doc;
|
|
16
|
+
ifstream ifs("../test/testdata/weicheng.utf8");
|
|
17
|
+
assert(ifs);
|
|
18
|
+
doc << ifs;
|
|
19
|
+
long beginTime = clock();
|
|
20
|
+
for (size_t i = 0; i < times; i ++) {
|
|
21
|
+
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
|
22
|
+
fflush(stdout);
|
|
23
|
+
res.clear();
|
|
24
|
+
seg.Cut(doc, res);
|
|
25
|
+
}
|
|
26
|
+
printf("\n");
|
|
27
|
+
long endTime = clock();
|
|
28
|
+
ColorPrintln(GREEN, "Cut: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
void Extract(size_t times = 400) {
|
|
32
|
+
KeywordExtractor Extractor("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
|
|
33
|
+
vector<string> words;
|
|
34
|
+
string doc;
|
|
35
|
+
ifstream ifs("../test/testdata/review.100");
|
|
36
|
+
assert(ifs);
|
|
37
|
+
doc << ifs;
|
|
38
|
+
long beginTime = clock();
|
|
39
|
+
for (size_t i = 0; i < times; i ++) {
|
|
40
|
+
printf("process [%3.0lf %%]\r", 100.0*(i+1)/times);
|
|
41
|
+
fflush(stdout);
|
|
42
|
+
words.clear();
|
|
43
|
+
Extractor.Extract(doc, words, 5);
|
|
44
|
+
}
|
|
45
|
+
printf("\n");
|
|
46
|
+
long endTime = clock();
|
|
47
|
+
ColorPrintln(GREEN, "Extract: [%.3lf seconds]time consumed.", double(endTime - beginTime)/CLOCKS_PER_SEC);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Runs both timing loops with their default iteration counts.
// Command-line arguments are not consulted.
int main(int argc, char ** argv) {
  Cut();      // segmentation benchmark
  Extract();  // keyword-extraction benchmark
  return EXIT_SUCCESS;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
["南京市", "长江大桥"]
|