cppjieba_rb 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +26 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +81 -0
- data/Rakefile +20 -0
- data/cppjieba_rb.gemspec +50 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/cppjieba/test/CMakeLists.txt +5 -0
- data/ext/cppjieba/test/demo.cpp +80 -0
- data/ext/cppjieba/test/load_test.cpp +54 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +19 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- data/ext/cppjieba/test/testdata/userdict.english +2 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
- data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
- data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
- data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
- data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
- data/ext/cppjieba_rb/extconf.rb +26 -0
- data/ext/cppjieba_rb/internal.cc +148 -0
- data/lib/cppjieba_rb/segment.rb +20 -0
- data/lib/cppjieba_rb/version.rb +3 -0
- data/lib/cppjieba_rb.rb +34 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +24 -0
- data/test/test_tagging.rb +19 -0
- metadata +244 -0
@@ -0,0 +1,227 @@
|
|
1
|
+
#ifndef CPPJIEBA_DICT_TRIE_HPP
|
2
|
+
#define CPPJIEBA_DICT_TRIE_HPP
|
3
|
+
|
4
|
+
#include <iostream>
|
5
|
+
#include <fstream>
|
6
|
+
#include <map>
|
7
|
+
#include <string>
|
8
|
+
#include <cstring>
|
9
|
+
#include <cstdlib>
|
10
|
+
#include <stdint.h>
|
11
|
+
#include <cmath>
|
12
|
+
#include <limits>
|
13
|
+
#include "limonp/StringUtil.hpp"
|
14
|
+
#include "limonp/Logging.hpp"
|
15
|
+
#include "Unicode.hpp"
|
16
|
+
#include "Trie.hpp"
|
17
|
+
|
18
|
+
namespace cppjieba {
|
19
|
+
|
20
|
+
using namespace limonp;
|
21
|
+
|
22
|
+
const double MIN_DOUBLE = -3.14e+100;
|
23
|
+
const double MAX_DOUBLE = 3.14e+100;
|
24
|
+
const size_t DICT_COLUMN_NUM = 3;
|
25
|
+
const char* const UNKNOWN_TAG = "";
|
26
|
+
|
27
|
+
class DictTrie {
|
28
|
+
public:
|
29
|
+
enum UserWordWeightOption {
|
30
|
+
WordWeightMin,
|
31
|
+
WordWeightMedian,
|
32
|
+
WordWeightMax,
|
33
|
+
}; // enum UserWordWeightOption
|
34
|
+
|
35
|
+
DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
36
|
+
Init(dict_path, user_dict_paths, user_word_weight_opt);
|
37
|
+
}
|
38
|
+
|
39
|
+
~DictTrie() {
|
40
|
+
delete trie_;
|
41
|
+
}
|
42
|
+
|
43
|
+
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
44
|
+
DictUnit node_info;
|
45
|
+
if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
|
46
|
+
return false;
|
47
|
+
}
|
48
|
+
active_node_infos_.push_back(node_info);
|
49
|
+
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
50
|
+
return true;
|
51
|
+
}
|
52
|
+
|
53
|
+
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
54
|
+
return trie_->Find(begin, end);
|
55
|
+
}
|
56
|
+
|
57
|
+
void Find(RuneStrArray::const_iterator begin,
|
58
|
+
RuneStrArray::const_iterator end,
|
59
|
+
vector<struct Dag>&res,
|
60
|
+
size_t max_word_len = MAX_WORD_LENGTH) const {
|
61
|
+
trie_->Find(begin, end, res, max_word_len);
|
62
|
+
}
|
63
|
+
|
64
|
+
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
65
|
+
return IsIn(user_dict_single_chinese_word_, word);
|
66
|
+
}
|
67
|
+
|
68
|
+
double GetMinWeight() const {
|
69
|
+
return min_weight_;
|
70
|
+
}
|
71
|
+
|
72
|
+
private:
|
73
|
+
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
|
74
|
+
LoadDict(dict_path);
|
75
|
+
freq_sum_ = CalcFreqSum(static_node_infos_);
|
76
|
+
CalculateWeight(static_node_infos_, freq_sum_);
|
77
|
+
SetStaticWordWeights(user_word_weight_opt);
|
78
|
+
|
79
|
+
if (user_dict_paths.size()) {
|
80
|
+
LoadUserDict(user_dict_paths);
|
81
|
+
}
|
82
|
+
Shrink(static_node_infos_);
|
83
|
+
CreateTrie(static_node_infos_);
|
84
|
+
}
|
85
|
+
|
86
|
+
void CreateTrie(const vector<DictUnit>& dictUnits) {
|
87
|
+
assert(dictUnits.size());
|
88
|
+
vector<Unicode> words;
|
89
|
+
vector<const DictUnit*> valuePointers;
|
90
|
+
for (size_t i = 0 ; i < dictUnits.size(); i ++) {
|
91
|
+
words.push_back(dictUnits[i].word);
|
92
|
+
valuePointers.push_back(&dictUnits[i]);
|
93
|
+
}
|
94
|
+
|
95
|
+
trie_ = new Trie(words, valuePointers);
|
96
|
+
}
|
97
|
+
|
98
|
+
void LoadUserDict(const string& filePaths) {
|
99
|
+
vector<string> files = limonp::Split(filePaths, "|;");
|
100
|
+
size_t lineno = 0;
|
101
|
+
for (size_t i = 0; i < files.size(); i++) {
|
102
|
+
ifstream ifs(files[i].c_str());
|
103
|
+
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
|
104
|
+
string line;
|
105
|
+
DictUnit node_info;
|
106
|
+
vector<string> buf;
|
107
|
+
for (; getline(ifs, line); lineno++) {
|
108
|
+
if (line.size() == 0) {
|
109
|
+
continue;
|
110
|
+
}
|
111
|
+
buf.clear();
|
112
|
+
Split(line, buf, " ");
|
113
|
+
DictUnit node_info;
|
114
|
+
if(buf.size() == 1){
|
115
|
+
MakeNodeInfo(node_info,
|
116
|
+
buf[0],
|
117
|
+
user_word_default_weight_,
|
118
|
+
UNKNOWN_TAG);
|
119
|
+
} else if (buf.size() == 2) {
|
120
|
+
MakeNodeInfo(node_info,
|
121
|
+
buf[0],
|
122
|
+
user_word_default_weight_,
|
123
|
+
buf[1]);
|
124
|
+
} else if (buf.size() == 3) {
|
125
|
+
int freq = atoi(buf[1].c_str());
|
126
|
+
assert(freq_sum_ > 0.0);
|
127
|
+
double weight = log(1.0 * freq / freq_sum_);
|
128
|
+
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
|
129
|
+
}
|
130
|
+
static_node_infos_.push_back(node_info);
|
131
|
+
if (node_info.word.size() == 1) {
|
132
|
+
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
133
|
+
}
|
134
|
+
}
|
135
|
+
}
|
136
|
+
}
|
137
|
+
|
138
|
+
bool MakeNodeInfo(DictUnit& node_info,
|
139
|
+
const string& word,
|
140
|
+
double weight,
|
141
|
+
const string& tag) {
|
142
|
+
if (!DecodeRunesInString(word, node_info.word)) {
|
143
|
+
XLOG(ERROR) << "Decode " << word << " failed.";
|
144
|
+
return false;
|
145
|
+
}
|
146
|
+
node_info.weight = weight;
|
147
|
+
node_info.tag = tag;
|
148
|
+
return true;
|
149
|
+
}
|
150
|
+
|
151
|
+
void LoadDict(const string& filePath) {
|
152
|
+
ifstream ifs(filePath.c_str());
|
153
|
+
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
154
|
+
string line;
|
155
|
+
vector<string> buf;
|
156
|
+
|
157
|
+
DictUnit node_info;
|
158
|
+
for (size_t lineno = 0; getline(ifs, line); lineno++) {
|
159
|
+
Split(line, buf, " ");
|
160
|
+
XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
|
161
|
+
MakeNodeInfo(node_info,
|
162
|
+
buf[0],
|
163
|
+
atof(buf[1].c_str()),
|
164
|
+
buf[2]);
|
165
|
+
static_node_infos_.push_back(node_info);
|
166
|
+
}
|
167
|
+
}
|
168
|
+
|
169
|
+
static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
|
170
|
+
return lhs.weight < rhs.weight;
|
171
|
+
}
|
172
|
+
|
173
|
+
void SetStaticWordWeights(UserWordWeightOption option) {
|
174
|
+
XCHECK(!static_node_infos_.empty());
|
175
|
+
vector<DictUnit> x = static_node_infos_;
|
176
|
+
sort(x.begin(), x.end(), WeightCompare);
|
177
|
+
min_weight_ = x[0].weight;
|
178
|
+
max_weight_ = x[x.size() - 1].weight;
|
179
|
+
median_weight_ = x[x.size() / 2].weight;
|
180
|
+
switch (option) {
|
181
|
+
case WordWeightMin:
|
182
|
+
user_word_default_weight_ = min_weight_;
|
183
|
+
break;
|
184
|
+
case WordWeightMedian:
|
185
|
+
user_word_default_weight_ = median_weight_;
|
186
|
+
break;
|
187
|
+
default:
|
188
|
+
user_word_default_weight_ = max_weight_;
|
189
|
+
break;
|
190
|
+
}
|
191
|
+
}
|
192
|
+
|
193
|
+
double CalcFreqSum(const vector<DictUnit>& node_infos) const {
|
194
|
+
double sum = 0.0;
|
195
|
+
for (size_t i = 0; i < node_infos.size(); i++) {
|
196
|
+
sum += node_infos[i].weight;
|
197
|
+
}
|
198
|
+
return sum;
|
199
|
+
}
|
200
|
+
|
201
|
+
void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
|
202
|
+
assert(sum > 0.0);
|
203
|
+
for (size_t i = 0; i < node_infos.size(); i++) {
|
204
|
+
DictUnit& node_info = node_infos[i];
|
205
|
+
assert(node_info.weight > 0.0);
|
206
|
+
node_info.weight = log(double(node_info.weight)/sum);
|
207
|
+
}
|
208
|
+
}
|
209
|
+
|
210
|
+
void Shrink(vector<DictUnit>& units) const {
|
211
|
+
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
212
|
+
}
|
213
|
+
|
214
|
+
vector<DictUnit> static_node_infos_;
|
215
|
+
deque<DictUnit> active_node_infos_; // must not be vector
|
216
|
+
Trie * trie_;
|
217
|
+
|
218
|
+
double freq_sum_;
|
219
|
+
double min_weight_;
|
220
|
+
double max_weight_;
|
221
|
+
double median_weight_;
|
222
|
+
double user_word_default_weight_;
|
223
|
+
unordered_set<Rune> user_dict_single_chinese_word_;
|
224
|
+
};
|
225
|
+
}
|
226
|
+
|
227
|
+
#endif
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#ifndef CPPJIEBA_FULLSEGMENT_H
|
2
|
+
#define CPPJIEBA_FULLSEGMENT_H
|
3
|
+
|
4
|
+
#include <algorithm>
|
5
|
+
#include <set>
|
6
|
+
#include <cassert>
|
7
|
+
#include "limonp/Logging.hpp"
|
8
|
+
#include "DictTrie.hpp"
|
9
|
+
#include "SegmentBase.hpp"
|
10
|
+
#include "Unicode.hpp"
|
11
|
+
|
12
|
+
namespace cppjieba {
|
13
|
+
class FullSegment: public SegmentBase {
|
14
|
+
public:
|
15
|
+
FullSegment(const string& dictPath) {
|
16
|
+
dictTrie_ = new DictTrie(dictPath);
|
17
|
+
isNeedDestroy_ = true;
|
18
|
+
}
|
19
|
+
FullSegment(const DictTrie* dictTrie)
|
20
|
+
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
21
|
+
assert(dictTrie_);
|
22
|
+
}
|
23
|
+
~FullSegment() {
|
24
|
+
if (isNeedDestroy_) {
|
25
|
+
delete dictTrie_;
|
26
|
+
}
|
27
|
+
}
|
28
|
+
void Cut(const string& sentence,
|
29
|
+
vector<string>& words) const {
|
30
|
+
vector<Word> tmp;
|
31
|
+
Cut(sentence, tmp);
|
32
|
+
GetStringsFromWords(tmp, words);
|
33
|
+
}
|
34
|
+
void Cut(const string& sentence,
|
35
|
+
vector<Word>& words) const {
|
36
|
+
PreFilter pre_filter(symbols_, sentence);
|
37
|
+
PreFilter::Range range;
|
38
|
+
vector<WordRange> wrs;
|
39
|
+
wrs.reserve(sentence.size()/2);
|
40
|
+
while (pre_filter.HasNext()) {
|
41
|
+
range = pre_filter.Next();
|
42
|
+
Cut(range.begin, range.end, wrs);
|
43
|
+
}
|
44
|
+
words.clear();
|
45
|
+
words.reserve(wrs.size());
|
46
|
+
GetWordsFromWordRanges(sentence, wrs, words);
|
47
|
+
}
|
48
|
+
void Cut(RuneStrArray::const_iterator begin,
|
49
|
+
RuneStrArray::const_iterator end,
|
50
|
+
vector<WordRange>& res) const {
|
51
|
+
//resut of searching in trie tree
|
52
|
+
LocalVector<pair<size_t, const DictUnit*> > tRes;
|
53
|
+
|
54
|
+
//max index of res's words
|
55
|
+
int maxIdx = 0;
|
56
|
+
|
57
|
+
// always equals to (uItr - begin)
|
58
|
+
int uIdx = 0;
|
59
|
+
|
60
|
+
//tmp variables
|
61
|
+
int wordLen = 0;
|
62
|
+
assert(dictTrie_);
|
63
|
+
vector<struct Dag> dags;
|
64
|
+
dictTrie_->Find(begin, end, dags);
|
65
|
+
for (size_t i = 0; i < dags.size(); i++) {
|
66
|
+
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
|
67
|
+
size_t nextoffset = dags[i].nexts[j].first;
|
68
|
+
assert(nextoffset < dags.size());
|
69
|
+
const DictUnit* du = dags[i].nexts[j].second;
|
70
|
+
if (du == NULL) {
|
71
|
+
if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
|
72
|
+
WordRange wr(begin + i, begin + nextoffset);
|
73
|
+
res.push_back(wr);
|
74
|
+
}
|
75
|
+
} else {
|
76
|
+
wordLen = du->word.size();
|
77
|
+
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
|
78
|
+
WordRange wr(begin + i, begin + nextoffset);
|
79
|
+
res.push_back(wr);
|
80
|
+
}
|
81
|
+
}
|
82
|
+
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
|
83
|
+
}
|
84
|
+
uIdx++;
|
85
|
+
}
|
86
|
+
}
|
87
|
+
private:
|
88
|
+
const DictTrie* dictTrie_;
|
89
|
+
bool isNeedDestroy_;
|
90
|
+
};
|
91
|
+
}
|
92
|
+
|
93
|
+
#endif
|
@@ -0,0 +1,129 @@
|
|
1
|
+
#ifndef CPPJIEBA_HMMMODEL_H
|
2
|
+
#define CPPJIEBA_HMMMODEL_H
|
3
|
+
|
4
|
+
#include "limonp/StringUtil.hpp"
|
5
|
+
#include "Trie.hpp"
|
6
|
+
|
7
|
+
namespace cppjieba {
|
8
|
+
|
9
|
+
using namespace limonp;
|
10
|
+
typedef unordered_map<Rune, double> EmitProbMap;
|
11
|
+
|
12
|
+
struct HMMModel {
|
13
|
+
/*
|
14
|
+
* STATUS:
|
15
|
+
* 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
|
16
|
+
* */
|
17
|
+
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
18
|
+
|
19
|
+
HMMModel(const string& modelPath) {
|
20
|
+
memset(startProb, 0, sizeof(startProb));
|
21
|
+
memset(transProb, 0, sizeof(transProb));
|
22
|
+
statMap[0] = 'B';
|
23
|
+
statMap[1] = 'E';
|
24
|
+
statMap[2] = 'M';
|
25
|
+
statMap[3] = 'S';
|
26
|
+
emitProbVec.push_back(&emitProbB);
|
27
|
+
emitProbVec.push_back(&emitProbE);
|
28
|
+
emitProbVec.push_back(&emitProbM);
|
29
|
+
emitProbVec.push_back(&emitProbS);
|
30
|
+
LoadModel(modelPath);
|
31
|
+
}
|
32
|
+
~HMMModel() {
|
33
|
+
}
|
34
|
+
void LoadModel(const string& filePath) {
|
35
|
+
ifstream ifile(filePath.c_str());
|
36
|
+
XCHECK(ifile.is_open()) << "open " << filePath << " failed";
|
37
|
+
string line;
|
38
|
+
vector<string> tmp;
|
39
|
+
vector<string> tmp2;
|
40
|
+
//Load startProb
|
41
|
+
XCHECK(GetLine(ifile, line));
|
42
|
+
Split(line, tmp, " ");
|
43
|
+
XCHECK(tmp.size() == STATUS_SUM);
|
44
|
+
for (size_t j = 0; j< tmp.size(); j++) {
|
45
|
+
startProb[j] = atof(tmp[j].c_str());
|
46
|
+
}
|
47
|
+
|
48
|
+
//Load transProb
|
49
|
+
for (size_t i = 0; i < STATUS_SUM; i++) {
|
50
|
+
XCHECK(GetLine(ifile, line));
|
51
|
+
Split(line, tmp, " ");
|
52
|
+
XCHECK(tmp.size() == STATUS_SUM);
|
53
|
+
for (size_t j =0; j < STATUS_SUM; j++) {
|
54
|
+
transProb[i][j] = atof(tmp[j].c_str());
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
//Load emitProbB
|
59
|
+
XCHECK(GetLine(ifile, line));
|
60
|
+
XCHECK(LoadEmitProb(line, emitProbB));
|
61
|
+
|
62
|
+
//Load emitProbE
|
63
|
+
XCHECK(GetLine(ifile, line));
|
64
|
+
XCHECK(LoadEmitProb(line, emitProbE));
|
65
|
+
|
66
|
+
//Load emitProbM
|
67
|
+
XCHECK(GetLine(ifile, line));
|
68
|
+
XCHECK(LoadEmitProb(line, emitProbM));
|
69
|
+
|
70
|
+
//Load emitProbS
|
71
|
+
XCHECK(GetLine(ifile, line));
|
72
|
+
XCHECK(LoadEmitProb(line, emitProbS));
|
73
|
+
}
|
74
|
+
double GetEmitProb(const EmitProbMap* ptMp, Rune key,
|
75
|
+
double defVal)const {
|
76
|
+
EmitProbMap::const_iterator cit = ptMp->find(key);
|
77
|
+
if (cit == ptMp->end()) {
|
78
|
+
return defVal;
|
79
|
+
}
|
80
|
+
return cit->second;
|
81
|
+
}
|
82
|
+
bool GetLine(ifstream& ifile, string& line) {
|
83
|
+
while (getline(ifile, line)) {
|
84
|
+
Trim(line);
|
85
|
+
if (line.empty()) {
|
86
|
+
continue;
|
87
|
+
}
|
88
|
+
if (StartsWith(line, "#")) {
|
89
|
+
continue;
|
90
|
+
}
|
91
|
+
return true;
|
92
|
+
}
|
93
|
+
return false;
|
94
|
+
}
|
95
|
+
bool LoadEmitProb(const string& line, EmitProbMap& mp) {
|
96
|
+
if (line.empty()) {
|
97
|
+
return false;
|
98
|
+
}
|
99
|
+
vector<string> tmp, tmp2;
|
100
|
+
Unicode unicode;
|
101
|
+
Split(line, tmp, ",");
|
102
|
+
for (size_t i = 0; i < tmp.size(); i++) {
|
103
|
+
Split(tmp[i], tmp2, ":");
|
104
|
+
if (2 != tmp2.size()) {
|
105
|
+
XLOG(ERROR) << "emitProb illegal.";
|
106
|
+
return false;
|
107
|
+
}
|
108
|
+
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
|
109
|
+
XLOG(ERROR) << "TransCode failed.";
|
110
|
+
return false;
|
111
|
+
}
|
112
|
+
mp[unicode[0]] = atof(tmp2[1].c_str());
|
113
|
+
}
|
114
|
+
return true;
|
115
|
+
}
|
116
|
+
|
117
|
+
char statMap[STATUS_SUM];
|
118
|
+
double startProb[STATUS_SUM];
|
119
|
+
double transProb[STATUS_SUM][STATUS_SUM];
|
120
|
+
EmitProbMap emitProbB;
|
121
|
+
EmitProbMap emitProbE;
|
122
|
+
EmitProbMap emitProbM;
|
123
|
+
EmitProbMap emitProbS;
|
124
|
+
vector<EmitProbMap* > emitProbVec;
|
125
|
+
}; // struct HMMModel
|
126
|
+
|
127
|
+
} // namespace cppjieba
|
128
|
+
|
129
|
+
#endif
|
@@ -0,0 +1,190 @@
|
|
1
|
+
#ifndef CPPJIBEA_HMMSEGMENT_H
|
2
|
+
#define CPPJIBEA_HMMSEGMENT_H
|
3
|
+
|
4
|
+
#include <iostream>
|
5
|
+
#include <fstream>
|
6
|
+
#include <memory.h>
|
7
|
+
#include <cassert>
|
8
|
+
#include "HMMModel.hpp"
|
9
|
+
#include "SegmentBase.hpp"
|
10
|
+
|
11
|
+
namespace cppjieba {
|
12
|
+
class HMMSegment: public SegmentBase {
|
13
|
+
public:
|
14
|
+
HMMSegment(const string& filePath)
|
15
|
+
: model_(new HMMModel(filePath)), isNeedDestroy_(true) {
|
16
|
+
}
|
17
|
+
HMMSegment(const HMMModel* model)
|
18
|
+
: model_(model), isNeedDestroy_(false) {
|
19
|
+
}
|
20
|
+
~HMMSegment() {
|
21
|
+
if (isNeedDestroy_) {
|
22
|
+
delete model_;
|
23
|
+
}
|
24
|
+
}
|
25
|
+
|
26
|
+
void Cut(const string& sentence,
|
27
|
+
vector<string>& words) const {
|
28
|
+
vector<Word> tmp;
|
29
|
+
Cut(sentence, tmp);
|
30
|
+
GetStringsFromWords(tmp, words);
|
31
|
+
}
|
32
|
+
void Cut(const string& sentence,
|
33
|
+
vector<Word>& words) const {
|
34
|
+
PreFilter pre_filter(symbols_, sentence);
|
35
|
+
PreFilter::Range range;
|
36
|
+
vector<WordRange> wrs;
|
37
|
+
wrs.reserve(sentence.size()/2);
|
38
|
+
while (pre_filter.HasNext()) {
|
39
|
+
range = pre_filter.Next();
|
40
|
+
Cut(range.begin, range.end, wrs);
|
41
|
+
}
|
42
|
+
words.clear();
|
43
|
+
words.reserve(wrs.size());
|
44
|
+
GetWordsFromWordRanges(sentence, wrs, words);
|
45
|
+
}
|
46
|
+
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
|
47
|
+
RuneStrArray::const_iterator left = begin;
|
48
|
+
RuneStrArray::const_iterator right = begin;
|
49
|
+
while (right != end) {
|
50
|
+
if (right->rune < 0x80) {
|
51
|
+
if (left != right) {
|
52
|
+
InternalCut(left, right, res);
|
53
|
+
}
|
54
|
+
left = right;
|
55
|
+
do {
|
56
|
+
right = SequentialLetterRule(left, end);
|
57
|
+
if (right != left) {
|
58
|
+
break;
|
59
|
+
}
|
60
|
+
right = NumbersRule(left, end);
|
61
|
+
if (right != left) {
|
62
|
+
break;
|
63
|
+
}
|
64
|
+
right ++;
|
65
|
+
} while (false);
|
66
|
+
WordRange wr(left, right - 1);
|
67
|
+
res.push_back(wr);
|
68
|
+
left = right;
|
69
|
+
} else {
|
70
|
+
right++;
|
71
|
+
}
|
72
|
+
}
|
73
|
+
if (left != right) {
|
74
|
+
InternalCut(left, right, res);
|
75
|
+
}
|
76
|
+
}
|
77
|
+
private:
|
78
|
+
// sequential letters rule
|
79
|
+
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
80
|
+
Rune x = begin->rune;
|
81
|
+
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
|
82
|
+
begin ++;
|
83
|
+
} else {
|
84
|
+
return begin;
|
85
|
+
}
|
86
|
+
while (begin != end) {
|
87
|
+
x = begin->rune;
|
88
|
+
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
|
89
|
+
begin ++;
|
90
|
+
} else {
|
91
|
+
break;
|
92
|
+
}
|
93
|
+
}
|
94
|
+
return begin;
|
95
|
+
}
|
96
|
+
//
|
97
|
+
RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
98
|
+
Rune x = begin->rune;
|
99
|
+
if ('0' <= x && x <= '9') {
|
100
|
+
begin ++;
|
101
|
+
} else {
|
102
|
+
return begin;
|
103
|
+
}
|
104
|
+
while (begin != end) {
|
105
|
+
x = begin->rune;
|
106
|
+
if ( ('0' <= x && x <= '9') || x == '.') {
|
107
|
+
begin++;
|
108
|
+
} else {
|
109
|
+
break;
|
110
|
+
}
|
111
|
+
}
|
112
|
+
return begin;
|
113
|
+
}
|
114
|
+
void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
|
115
|
+
vector<size_t> status;
|
116
|
+
Viterbi(begin, end, status);
|
117
|
+
|
118
|
+
RuneStrArray::const_iterator left = begin;
|
119
|
+
RuneStrArray::const_iterator right;
|
120
|
+
for (size_t i = 0; i < status.size(); i++) {
|
121
|
+
if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
|
122
|
+
right = begin + i + 1;
|
123
|
+
WordRange wr(left, right - 1);
|
124
|
+
res.push_back(wr);
|
125
|
+
left = right;
|
126
|
+
}
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
130
|
+
void Viterbi(RuneStrArray::const_iterator begin,
|
131
|
+
RuneStrArray::const_iterator end,
|
132
|
+
vector<size_t>& status) const {
|
133
|
+
size_t Y = HMMModel::STATUS_SUM;
|
134
|
+
size_t X = end - begin;
|
135
|
+
|
136
|
+
size_t XYSize = X * Y;
|
137
|
+
size_t now, old, stat;
|
138
|
+
double tmp, endE, endS;
|
139
|
+
|
140
|
+
vector<int> path(XYSize);
|
141
|
+
vector<double> weight(XYSize);
|
142
|
+
|
143
|
+
//start
|
144
|
+
for (size_t y = 0; y < Y; y++) {
|
145
|
+
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
|
146
|
+
path[0 + y * X] = -1;
|
147
|
+
}
|
148
|
+
|
149
|
+
double emitProb;
|
150
|
+
|
151
|
+
for (size_t x = 1; x < X; x++) {
|
152
|
+
for (size_t y = 0; y < Y; y++) {
|
153
|
+
now = x + y*X;
|
154
|
+
weight[now] = MIN_DOUBLE;
|
155
|
+
path[now] = HMMModel::E; // warning
|
156
|
+
emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
|
157
|
+
for (size_t preY = 0; preY < Y; preY++) {
|
158
|
+
old = x - 1 + preY * X;
|
159
|
+
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
|
160
|
+
if (tmp > weight[now]) {
|
161
|
+
weight[now] = tmp;
|
162
|
+
path[now] = preY;
|
163
|
+
}
|
164
|
+
}
|
165
|
+
}
|
166
|
+
}
|
167
|
+
|
168
|
+
endE = weight[X-1+HMMModel::E*X];
|
169
|
+
endS = weight[X-1+HMMModel::S*X];
|
170
|
+
stat = 0;
|
171
|
+
if (endE >= endS) {
|
172
|
+
stat = HMMModel::E;
|
173
|
+
} else {
|
174
|
+
stat = HMMModel::S;
|
175
|
+
}
|
176
|
+
|
177
|
+
status.resize(X);
|
178
|
+
for (int x = X -1 ; x >= 0; x--) {
|
179
|
+
status[x] = stat;
|
180
|
+
stat = path[x + stat*X];
|
181
|
+
}
|
182
|
+
}
|
183
|
+
|
184
|
+
const HMMModel* model_;
|
185
|
+
bool isNeedDestroy_;
|
186
|
+
}; // class HMMSegment
|
187
|
+
|
188
|
+
} // namespace cppjieba
|
189
|
+
|
190
|
+
#endif
|