cppjieba_rb 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +3 -0
- data/README.md +1 -1
- data/Rakefile +2 -2
- data/cppjieba_rb.gemspec +4 -4
- data/lib/cppjieba_rb/version.rb +1 -1
- metadata +17 -135
- data/ext/cppjieba/.gitignore +0 -17
- data/ext/cppjieba/.travis.yml +0 -21
- data/ext/cppjieba/CMakeLists.txt +0 -28
- data/ext/cppjieba/ChangeLog.md +0 -236
- data/ext/cppjieba/README.md +0 -292
- data/ext/cppjieba/README_EN.md +0 -113
- data/ext/cppjieba/appveyor.yml +0 -32
- data/ext/cppjieba/deps/CMakeLists.txt +0 -1
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
- data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
- data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
- data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
- data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
- data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
- data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
- data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
- data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
- data/ext/cppjieba/dict/README.md +0 -31
- data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
- data/ext/cppjieba/dict/idf.utf8 +0 -258826
- data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
- data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
- data/ext/cppjieba/dict/user.dict.utf8 +0 -4
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
- data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
- data/ext/cppjieba/test/CMakeLists.txt +0 -5
- data/ext/cppjieba/test/demo.cpp +0 -80
- data/ext/cppjieba/test/load_test.cpp +0 -54
- data/ext/cppjieba/test/testdata/curl.res +0 -1
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
- data/ext/cppjieba/test/testdata/load_test.urls +0 -2
- data/ext/cppjieba/test/testdata/review.100 +0 -100
- data/ext/cppjieba/test/testdata/review.100.res +0 -200
- data/ext/cppjieba/test/testdata/server.conf +0 -19
- data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
- data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
- data/ext/cppjieba/test/testdata/userdict.english +0 -2
- data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
- data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
- data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
- data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
- data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
- data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
- data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
- data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
- data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
@@ -1,190 +0,0 @@
|
|
1
|
-
#ifndef CPPJIBEA_HMMSEGMENT_H
|
2
|
-
#define CPPJIBEA_HMMSEGMENT_H
|
3
|
-
|
4
|
-
#include <iostream>
|
5
|
-
#include <fstream>
|
6
|
-
#include <memory.h>
|
7
|
-
#include <cassert>
|
8
|
-
#include "HMMModel.hpp"
|
9
|
-
#include "SegmentBase.hpp"
|
10
|
-
|
11
|
-
namespace cppjieba {
|
12
|
-
class HMMSegment: public SegmentBase {
|
13
|
-
public:
|
14
|
-
HMMSegment(const string& filePath)
|
15
|
-
: model_(new HMMModel(filePath)), isNeedDestroy_(true) {
|
16
|
-
}
|
17
|
-
HMMSegment(const HMMModel* model)
|
18
|
-
: model_(model), isNeedDestroy_(false) {
|
19
|
-
}
|
20
|
-
~HMMSegment() {
|
21
|
-
if (isNeedDestroy_) {
|
22
|
-
delete model_;
|
23
|
-
}
|
24
|
-
}
|
25
|
-
|
26
|
-
void Cut(const string& sentence,
|
27
|
-
vector<string>& words) const {
|
28
|
-
vector<Word> tmp;
|
29
|
-
Cut(sentence, tmp);
|
30
|
-
GetStringsFromWords(tmp, words);
|
31
|
-
}
|
32
|
-
void Cut(const string& sentence,
|
33
|
-
vector<Word>& words) const {
|
34
|
-
PreFilter pre_filter(symbols_, sentence);
|
35
|
-
PreFilter::Range range;
|
36
|
-
vector<WordRange> wrs;
|
37
|
-
wrs.reserve(sentence.size()/2);
|
38
|
-
while (pre_filter.HasNext()) {
|
39
|
-
range = pre_filter.Next();
|
40
|
-
Cut(range.begin, range.end, wrs);
|
41
|
-
}
|
42
|
-
words.clear();
|
43
|
-
words.reserve(wrs.size());
|
44
|
-
GetWordsFromWordRanges(sentence, wrs, words);
|
45
|
-
}
|
46
|
-
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
|
47
|
-
RuneStrArray::const_iterator left = begin;
|
48
|
-
RuneStrArray::const_iterator right = begin;
|
49
|
-
while (right != end) {
|
50
|
-
if (right->rune < 0x80) {
|
51
|
-
if (left != right) {
|
52
|
-
InternalCut(left, right, res);
|
53
|
-
}
|
54
|
-
left = right;
|
55
|
-
do {
|
56
|
-
right = SequentialLetterRule(left, end);
|
57
|
-
if (right != left) {
|
58
|
-
break;
|
59
|
-
}
|
60
|
-
right = NumbersRule(left, end);
|
61
|
-
if (right != left) {
|
62
|
-
break;
|
63
|
-
}
|
64
|
-
right ++;
|
65
|
-
} while (false);
|
66
|
-
WordRange wr(left, right - 1);
|
67
|
-
res.push_back(wr);
|
68
|
-
left = right;
|
69
|
-
} else {
|
70
|
-
right++;
|
71
|
-
}
|
72
|
-
}
|
73
|
-
if (left != right) {
|
74
|
-
InternalCut(left, right, res);
|
75
|
-
}
|
76
|
-
}
|
77
|
-
private:
|
78
|
-
// sequential letters rule
|
79
|
-
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
80
|
-
Rune x = begin->rune;
|
81
|
-
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
82
|
-
begin ++;
|
83
|
-
} else {
|
84
|
-
return begin;
|
85
|
-
}
|
86
|
-
while (begin != end) {
|
87
|
-
x = begin->rune;
|
88
|
-
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
|
89
|
-
begin ++;
|
90
|
-
} else {
|
91
|
-
break;
|
92
|
-
}
|
93
|
-
}
|
94
|
-
return begin;
|
95
|
-
}
|
96
|
-
//
|
97
|
-
RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
98
|
-
Rune x = begin->rune;
|
99
|
-
if ('0' <= x && x <= '9') {
|
100
|
-
begin ++;
|
101
|
-
} else {
|
102
|
-
return begin;
|
103
|
-
}
|
104
|
-
while (begin != end) {
|
105
|
-
x = begin->rune;
|
106
|
-
if ( ('0' <= x && x <= '9') || x == '.') {
|
107
|
-
begin++;
|
108
|
-
} else {
|
109
|
-
break;
|
110
|
-
}
|
111
|
-
}
|
112
|
-
return begin;
|
113
|
-
}
|
114
|
-
void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
|
115
|
-
vector<size_t> status;
|
116
|
-
Viterbi(begin, end, status);
|
117
|
-
|
118
|
-
RuneStrArray::const_iterator left = begin;
|
119
|
-
RuneStrArray::const_iterator right;
|
120
|
-
for (size_t i = 0; i < status.size(); i++) {
|
121
|
-
if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
|
122
|
-
right = begin + i + 1;
|
123
|
-
WordRange wr(left, right - 1);
|
124
|
-
res.push_back(wr);
|
125
|
-
left = right;
|
126
|
-
}
|
127
|
-
}
|
128
|
-
}
|
129
|
-
|
130
|
-
void Viterbi(RuneStrArray::const_iterator begin,
|
131
|
-
RuneStrArray::const_iterator end,
|
132
|
-
vector<size_t>& status) const {
|
133
|
-
size_t Y = HMMModel::STATUS_SUM;
|
134
|
-
size_t X = end - begin;
|
135
|
-
|
136
|
-
size_t XYSize = X * Y;
|
137
|
-
size_t now, old, stat;
|
138
|
-
double tmp, endE, endS;
|
139
|
-
|
140
|
-
vector<int> path(XYSize);
|
141
|
-
vector<double> weight(XYSize);
|
142
|
-
|
143
|
-
//start
|
144
|
-
for (size_t y = 0; y < Y; y++) {
|
145
|
-
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
|
146
|
-
path[0 + y * X] = -1;
|
147
|
-
}
|
148
|
-
|
149
|
-
double emitProb;
|
150
|
-
|
151
|
-
for (size_t x = 1; x < X; x++) {
|
152
|
-
for (size_t y = 0; y < Y; y++) {
|
153
|
-
now = x + y*X;
|
154
|
-
weight[now] = MIN_DOUBLE;
|
155
|
-
path[now] = HMMModel::E; // warning
|
156
|
-
emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
|
157
|
-
for (size_t preY = 0; preY < Y; preY++) {
|
158
|
-
old = x - 1 + preY * X;
|
159
|
-
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
160
|
-
if (tmp > weight[now]) {
|
161
|
-
weight[now] = tmp;
|
162
|
-
path[now] = preY;
|
163
|
-
}
|
164
|
-
}
|
165
|
-
}
|
166
|
-
}
|
167
|
-
|
168
|
-
endE = weight[X-1+HMMModel::E*X];
|
169
|
-
endS = weight[X-1+HMMModel::S*X];
|
170
|
-
stat = 0;
|
171
|
-
if (endE >= endS) {
|
172
|
-
stat = HMMModel::E;
|
173
|
-
} else {
|
174
|
-
stat = HMMModel::S;
|
175
|
-
}
|
176
|
-
|
177
|
-
status.resize(X);
|
178
|
-
for (int x = X -1 ; x >= 0; x--) {
|
179
|
-
status[x] = stat;
|
180
|
-
stat = path[x + stat*X];
|
181
|
-
}
|
182
|
-
}
|
183
|
-
|
184
|
-
const HMMModel* model_;
|
185
|
-
bool isNeedDestroy_;
|
186
|
-
}; // class HMMSegment
|
187
|
-
|
188
|
-
} // namespace cppjieba
|
189
|
-
|
190
|
-
#endif
|
@@ -1,130 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEAB_JIEBA_H
|
2
|
-
#define CPPJIEAB_JIEBA_H
|
3
|
-
|
4
|
-
#include "QuerySegment.hpp"
|
5
|
-
#include "KeywordExtractor.hpp"
|
6
|
-
|
7
|
-
namespace cppjieba {
|
8
|
-
|
9
|
-
class Jieba {
|
10
|
-
public:
|
11
|
-
Jieba(const string& dict_path,
|
12
|
-
const string& model_path,
|
13
|
-
const string& user_dict_path,
|
14
|
-
const string& idfPath,
|
15
|
-
const string& stopWordPath)
|
16
|
-
: dict_trie_(dict_path, user_dict_path),
|
17
|
-
model_(model_path),
|
18
|
-
mp_seg_(&dict_trie_),
|
19
|
-
hmm_seg_(&model_),
|
20
|
-
mix_seg_(&dict_trie_, &model_),
|
21
|
-
full_seg_(&dict_trie_),
|
22
|
-
query_seg_(&dict_trie_, &model_),
|
23
|
-
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
|
24
|
-
}
|
25
|
-
~Jieba() {
|
26
|
-
}
|
27
|
-
|
28
|
-
struct LocWord {
|
29
|
-
string word;
|
30
|
-
size_t begin;
|
31
|
-
size_t end;
|
32
|
-
}; // struct LocWord
|
33
|
-
|
34
|
-
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
35
|
-
mix_seg_.Cut(sentence, words, hmm);
|
36
|
-
}
|
37
|
-
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
38
|
-
mix_seg_.Cut(sentence, words, hmm);
|
39
|
-
}
|
40
|
-
void CutAll(const string& sentence, vector<string>& words) const {
|
41
|
-
full_seg_.Cut(sentence, words);
|
42
|
-
}
|
43
|
-
void CutAll(const string& sentence, vector<Word>& words) const {
|
44
|
-
full_seg_.Cut(sentence, words);
|
45
|
-
}
|
46
|
-
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
|
47
|
-
query_seg_.Cut(sentence, words, hmm);
|
48
|
-
}
|
49
|
-
void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
50
|
-
query_seg_.Cut(sentence, words, hmm);
|
51
|
-
}
|
52
|
-
void CutHMM(const string& sentence, vector<string>& words) const {
|
53
|
-
hmm_seg_.Cut(sentence, words);
|
54
|
-
}
|
55
|
-
void CutHMM(const string& sentence, vector<Word>& words) const {
|
56
|
-
hmm_seg_.Cut(sentence, words);
|
57
|
-
}
|
58
|
-
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
|
59
|
-
mp_seg_.Cut(sentence, words, max_word_len);
|
60
|
-
}
|
61
|
-
void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
|
62
|
-
mp_seg_.Cut(sentence, words, max_word_len);
|
63
|
-
}
|
64
|
-
|
65
|
-
void Tag(const string& sentence, vector<pair<string, string> >& words) const {
|
66
|
-
mix_seg_.Tag(sentence, words);
|
67
|
-
}
|
68
|
-
string LookupTag(const string &str) const {
|
69
|
-
return mix_seg_.LookupTag(str);
|
70
|
-
}
|
71
|
-
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
72
|
-
return dict_trie_.InsertUserWord(word, tag);
|
73
|
-
}
|
74
|
-
|
75
|
-
bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
|
76
|
-
return dict_trie_.InsertUserWord(word,freq, tag);
|
77
|
-
}
|
78
|
-
|
79
|
-
bool Find(const string& word)
|
80
|
-
{
|
81
|
-
return dict_trie_.Find(word);
|
82
|
-
}
|
83
|
-
|
84
|
-
void ResetSeparators(const string& s) {
|
85
|
-
//TODO
|
86
|
-
mp_seg_.ResetSeparators(s);
|
87
|
-
hmm_seg_.ResetSeparators(s);
|
88
|
-
mix_seg_.ResetSeparators(s);
|
89
|
-
full_seg_.ResetSeparators(s);
|
90
|
-
query_seg_.ResetSeparators(s);
|
91
|
-
}
|
92
|
-
|
93
|
-
const DictTrie* GetDictTrie() const {
|
94
|
-
return &dict_trie_;
|
95
|
-
}
|
96
|
-
|
97
|
-
const HMMModel* GetHMMModel() const {
|
98
|
-
return &model_;
|
99
|
-
}
|
100
|
-
|
101
|
-
void LoadUserDict(const vector<string>& buf) {
|
102
|
-
dict_trie_.LoadUserDict(buf);
|
103
|
-
}
|
104
|
-
|
105
|
-
void LoadUserDict(const set<string>& buf) {
|
106
|
-
dict_trie_.LoadUserDict(buf);
|
107
|
-
}
|
108
|
-
|
109
|
-
void LoadUserDict(const string& path) {
|
110
|
-
dict_trie_.LoadUserDict(path);
|
111
|
-
}
|
112
|
-
|
113
|
-
private:
|
114
|
-
DictTrie dict_trie_;
|
115
|
-
HMMModel model_;
|
116
|
-
|
117
|
-
// They share the same dict trie and model
|
118
|
-
MPSegment mp_seg_;
|
119
|
-
HMMSegment hmm_seg_;
|
120
|
-
MixSegment mix_seg_;
|
121
|
-
FullSegment full_seg_;
|
122
|
-
QuerySegment query_seg_;
|
123
|
-
|
124
|
-
public:
|
125
|
-
KeywordExtractor extractor;
|
126
|
-
}; // class Jieba
|
127
|
-
|
128
|
-
} // namespace cppjieba
|
129
|
-
|
130
|
-
#endif // CPPJIEAB_JIEBA_H
|
@@ -1,153 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
|
2
|
-
#define CPPJIEBA_KEYWORD_EXTRACTOR_H
|
3
|
-
|
4
|
-
#include <cmath>
|
5
|
-
#include <set>
|
6
|
-
#include "MixSegment.hpp"
|
7
|
-
|
8
|
-
namespace cppjieba {
|
9
|
-
|
10
|
-
using namespace limonp;
|
11
|
-
using namespace std;
|
12
|
-
|
13
|
-
/*utf8*/
|
14
|
-
class KeywordExtractor {
|
15
|
-
public:
|
16
|
-
struct Word {
|
17
|
-
string word;
|
18
|
-
vector<size_t> offsets;
|
19
|
-
double weight;
|
20
|
-
}; // struct Word
|
21
|
-
|
22
|
-
KeywordExtractor(const string& dictPath,
|
23
|
-
const string& hmmFilePath,
|
24
|
-
const string& idfPath,
|
25
|
-
const string& stopWordPath,
|
26
|
-
const string& userDict = "")
|
27
|
-
: segment_(dictPath, hmmFilePath, userDict) {
|
28
|
-
LoadIdfDict(idfPath);
|
29
|
-
LoadStopWordDict(stopWordPath);
|
30
|
-
}
|
31
|
-
KeywordExtractor(const DictTrie* dictTrie,
|
32
|
-
const HMMModel* model,
|
33
|
-
const string& idfPath,
|
34
|
-
const string& stopWordPath)
|
35
|
-
: segment_(dictTrie, model) {
|
36
|
-
LoadIdfDict(idfPath);
|
37
|
-
LoadStopWordDict(stopWordPath);
|
38
|
-
}
|
39
|
-
~KeywordExtractor() {
|
40
|
-
}
|
41
|
-
|
42
|
-
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
43
|
-
vector<Word> topWords;
|
44
|
-
Extract(sentence, topWords, topN);
|
45
|
-
for (size_t i = 0; i < topWords.size(); i++) {
|
46
|
-
keywords.push_back(topWords[i].word);
|
47
|
-
}
|
48
|
-
}
|
49
|
-
|
50
|
-
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
51
|
-
vector<Word> topWords;
|
52
|
-
Extract(sentence, topWords, topN);
|
53
|
-
for (size_t i = 0; i < topWords.size(); i++) {
|
54
|
-
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
55
|
-
}
|
56
|
-
}
|
57
|
-
|
58
|
-
void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
|
59
|
-
vector<string> words;
|
60
|
-
segment_.Cut(sentence, words);
|
61
|
-
|
62
|
-
map<string, Word> wordmap;
|
63
|
-
size_t offset = 0;
|
64
|
-
for (size_t i = 0; i < words.size(); ++i) {
|
65
|
-
size_t t = offset;
|
66
|
-
offset += words[i].size();
|
67
|
-
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
68
|
-
continue;
|
69
|
-
}
|
70
|
-
wordmap[words[i]].offsets.push_back(t);
|
71
|
-
wordmap[words[i]].weight += 1.0;
|
72
|
-
}
|
73
|
-
if (offset != sentence.size()) {
|
74
|
-
XLOG(ERROR) << "words illegal";
|
75
|
-
return;
|
76
|
-
}
|
77
|
-
|
78
|
-
keywords.clear();
|
79
|
-
keywords.reserve(wordmap.size());
|
80
|
-
for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
81
|
-
unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
|
82
|
-
if (cit != idfMap_.end()) {
|
83
|
-
itr->second.weight *= cit->second;
|
84
|
-
} else {
|
85
|
-
itr->second.weight *= idfAverage_;
|
86
|
-
}
|
87
|
-
itr->second.word = itr->first;
|
88
|
-
keywords.push_back(itr->second);
|
89
|
-
}
|
90
|
-
topN = min(topN, keywords.size());
|
91
|
-
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
92
|
-
keywords.resize(topN);
|
93
|
-
}
|
94
|
-
private:
|
95
|
-
void LoadIdfDict(const string& idfPath) {
|
96
|
-
ifstream ifs(idfPath.c_str());
|
97
|
-
XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
|
98
|
-
string line ;
|
99
|
-
vector<string> buf;
|
100
|
-
double idf = 0.0;
|
101
|
-
double idfSum = 0.0;
|
102
|
-
size_t lineno = 0;
|
103
|
-
for (; getline(ifs, line); lineno++) {
|
104
|
-
buf.clear();
|
105
|
-
if (line.empty()) {
|
106
|
-
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
107
|
-
continue;
|
108
|
-
}
|
109
|
-
Split(line, buf, " ");
|
110
|
-
if (buf.size() != 2) {
|
111
|
-
XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
|
112
|
-
continue;
|
113
|
-
}
|
114
|
-
idf = atof(buf[1].c_str());
|
115
|
-
idfMap_[buf[0]] = idf;
|
116
|
-
idfSum += idf;
|
117
|
-
|
118
|
-
}
|
119
|
-
|
120
|
-
assert(lineno);
|
121
|
-
idfAverage_ = idfSum / lineno;
|
122
|
-
assert(idfAverage_ > 0.0);
|
123
|
-
}
|
124
|
-
void LoadStopWordDict(const string& filePath) {
|
125
|
-
ifstream ifs(filePath.c_str());
|
126
|
-
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
127
|
-
string line ;
|
128
|
-
while (getline(ifs, line)) {
|
129
|
-
stopWords_.insert(line);
|
130
|
-
}
|
131
|
-
assert(stopWords_.size());
|
132
|
-
}
|
133
|
-
|
134
|
-
static bool Compare(const Word& lhs, const Word& rhs) {
|
135
|
-
return lhs.weight > rhs.weight;
|
136
|
-
}
|
137
|
-
|
138
|
-
MixSegment segment_;
|
139
|
-
unordered_map<string, double> idfMap_;
|
140
|
-
double idfAverage_;
|
141
|
-
|
142
|
-
unordered_set<string> stopWords_;
|
143
|
-
}; // class KeywordExtractor
|
144
|
-
|
145
|
-
// Writes a keyword as a JSON-like object with "word", "offset" and "weight"
// fields. The word text is written as-is, without escaping.
// NOTE(review): streaming word.offsets relies on an operator<< for
// vector<size_t> declared outside this view -- confirm where it comes from.
inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
  os << "{\"word\": \"" << word.word;
  os << "\", \"offset\": " << word.offsets;
  os << ", \"weight\": " << word.weight << "}";
  return os;
}
|
148
|
-
|
149
|
-
} // namespace cppjieba
|
150
|
-
|
151
|
-
#endif
|
152
|
-
|
153
|
-
|