cppjieba_rb 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +3 -0
- data/README.md +1 -1
- data/Rakefile +2 -2
- data/cppjieba_rb.gemspec +4 -4
- data/lib/cppjieba_rb/version.rb +1 -1
- metadata +17 -135
- data/ext/cppjieba/.gitignore +0 -17
- data/ext/cppjieba/.travis.yml +0 -21
- data/ext/cppjieba/CMakeLists.txt +0 -28
- data/ext/cppjieba/ChangeLog.md +0 -236
- data/ext/cppjieba/README.md +0 -292
- data/ext/cppjieba/README_EN.md +0 -113
- data/ext/cppjieba/appveyor.yml +0 -32
- data/ext/cppjieba/deps/CMakeLists.txt +0 -1
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
- data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
- data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
- data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
- data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
- data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
- data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
- data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
- data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
- data/ext/cppjieba/dict/README.md +0 -31
- data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
- data/ext/cppjieba/dict/idf.utf8 +0 -258826
- data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
- data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
- data/ext/cppjieba/dict/user.dict.utf8 +0 -4
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
- data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
- data/ext/cppjieba/test/CMakeLists.txt +0 -5
- data/ext/cppjieba/test/demo.cpp +0 -80
- data/ext/cppjieba/test/load_test.cpp +0 -54
- data/ext/cppjieba/test/testdata/curl.res +0 -1
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
- data/ext/cppjieba/test/testdata/load_test.urls +0 -2
- data/ext/cppjieba/test/testdata/review.100 +0 -100
- data/ext/cppjieba/test/testdata/review.100.res +0 -200
- data/ext/cppjieba/test/testdata/server.conf +0 -19
- data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
- data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
- data/ext/cppjieba/test/testdata/userdict.english +0 -2
- data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
- data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
- data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
- data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
- data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
- data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
- data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
- data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
- data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
|
@@ -1,277 +0,0 @@
|
|
|
1
|
-
#ifndef CPPJIEBA_DICT_TRIE_HPP
|
|
2
|
-
#define CPPJIEBA_DICT_TRIE_HPP
|
|
3
|
-
|
|
4
|
-
#include <iostream>
|
|
5
|
-
#include <fstream>
|
|
6
|
-
#include <map>
|
|
7
|
-
#include <string>
|
|
8
|
-
#include <cstring>
|
|
9
|
-
#include <cstdlib>
|
|
10
|
-
#include <stdint.h>
|
|
11
|
-
#include <cmath>
|
|
12
|
-
#include <limits>
|
|
13
|
-
#include "limonp/StringUtil.hpp"
|
|
14
|
-
#include "limonp/Logging.hpp"
|
|
15
|
-
#include "Unicode.hpp"
|
|
16
|
-
#include "Trie.hpp"
|
|
17
|
-
|
|
18
|
-
namespace cppjieba {
|
|
19
|
-
|
|
20
|
-
using namespace limonp;
|
|
21
|
-
|
|
22
|
-
// Symmetric sentinel bounds for weights; MIN_DOUBLE is the exact negation of
// MAX_DOUBLE.
const double MAX_DOUBLE = 3.14e+100;
const double MIN_DOUBLE = -MAX_DOUBLE;

// Number of space-separated columns every line of the main dictionary file
// must contain (checked while loading the dictionary).
const size_t DICT_COLUMN_NUM = 3;

// Tag given to a word when the caller or the dictionary line supplies none.
const char* const UNKNOWN_TAG = "";
|
|
26
|
-
|
|
27
|
-
class DictTrie {
|
|
28
|
-
public:
|
|
29
|
-
enum UserWordWeightOption {
|
|
30
|
-
WordWeightMin,
|
|
31
|
-
WordWeightMedian,
|
|
32
|
-
WordWeightMax,
|
|
33
|
-
}; // enum UserWordWeightOption
|
|
34
|
-
|
|
35
|
-
DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
|
36
|
-
Init(dict_path, user_dict_paths, user_word_weight_opt);
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
~DictTrie() {
|
|
40
|
-
delete trie_;
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
|
44
|
-
DictUnit node_info;
|
|
45
|
-
if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
|
|
46
|
-
return false;
|
|
47
|
-
}
|
|
48
|
-
active_node_infos_.push_back(node_info);
|
|
49
|
-
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
|
50
|
-
return true;
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
|
|
54
|
-
DictUnit node_info;
|
|
55
|
-
double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
|
|
56
|
-
if (!MakeNodeInfo(node_info, word, weight , tag)) {
|
|
57
|
-
return false;
|
|
58
|
-
}
|
|
59
|
-
active_node_infos_.push_back(node_info);
|
|
60
|
-
trie_->InsertNode(node_info.word, &active_node_infos_.back());
|
|
61
|
-
return true;
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
|
65
|
-
return trie_->Find(begin, end);
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
void Find(RuneStrArray::const_iterator begin,
|
|
69
|
-
RuneStrArray::const_iterator end,
|
|
70
|
-
vector<struct Dag>&res,
|
|
71
|
-
size_t max_word_len = MAX_WORD_LENGTH) const {
|
|
72
|
-
trie_->Find(begin, end, res, max_word_len);
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
bool Find(const string& word)
|
|
76
|
-
{
|
|
77
|
-
const DictUnit *tmp = NULL;
|
|
78
|
-
RuneStrArray runes;
|
|
79
|
-
if (!DecodeRunesInString(word, runes))
|
|
80
|
-
{
|
|
81
|
-
XLOG(ERROR) << "Decode failed.";
|
|
82
|
-
}
|
|
83
|
-
tmp = Find(runes.begin(), runes.end());
|
|
84
|
-
if (tmp == NULL)
|
|
85
|
-
{
|
|
86
|
-
return false;
|
|
87
|
-
}
|
|
88
|
-
else
|
|
89
|
-
{
|
|
90
|
-
return true;
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
|
95
|
-
return IsIn(user_dict_single_chinese_word_, word);
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
double GetMinWeight() const {
|
|
99
|
-
return min_weight_;
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
void InserUserDictNode(const string& line) {
|
|
103
|
-
vector<string> buf;
|
|
104
|
-
DictUnit node_info;
|
|
105
|
-
Split(line, buf, " ");
|
|
106
|
-
if(buf.size() == 1){
|
|
107
|
-
MakeNodeInfo(node_info,
|
|
108
|
-
buf[0],
|
|
109
|
-
user_word_default_weight_,
|
|
110
|
-
UNKNOWN_TAG);
|
|
111
|
-
} else if (buf.size() == 2) {
|
|
112
|
-
MakeNodeInfo(node_info,
|
|
113
|
-
buf[0],
|
|
114
|
-
user_word_default_weight_,
|
|
115
|
-
buf[1]);
|
|
116
|
-
} else if (buf.size() == 3) {
|
|
117
|
-
int freq = atoi(buf[1].c_str());
|
|
118
|
-
assert(freq_sum_ > 0.0);
|
|
119
|
-
double weight = log(1.0 * freq / freq_sum_);
|
|
120
|
-
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
|
|
121
|
-
}
|
|
122
|
-
static_node_infos_.push_back(node_info);
|
|
123
|
-
if (node_info.word.size() == 1) {
|
|
124
|
-
user_dict_single_chinese_word_.insert(node_info.word[0]);
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
void LoadUserDict(const vector<string>& buf) {
|
|
129
|
-
for (size_t i = 0; i < buf.size(); i++) {
|
|
130
|
-
InserUserDictNode(buf[i]);
|
|
131
|
-
}
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
void LoadUserDict(const set<string>& buf) {
|
|
135
|
-
std::set<string>::const_iterator iter;
|
|
136
|
-
for (iter = buf.begin(); iter != buf.end(); iter++){
|
|
137
|
-
InserUserDictNode(*iter);
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
void LoadUserDict(const string& filePaths) {
|
|
142
|
-
vector<string> files = limonp::Split(filePaths, "|;");
|
|
143
|
-
size_t lineno = 0;
|
|
144
|
-
for (size_t i = 0; i < files.size(); i++) {
|
|
145
|
-
ifstream ifs(files[i].c_str());
|
|
146
|
-
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
|
|
147
|
-
string line;
|
|
148
|
-
|
|
149
|
-
for (; getline(ifs, line); lineno++) {
|
|
150
|
-
if (line.size() == 0) {
|
|
151
|
-
continue;
|
|
152
|
-
}
|
|
153
|
-
InserUserDictNode(line);
|
|
154
|
-
}
|
|
155
|
-
}
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
private:
|
|
160
|
-
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
|
|
161
|
-
LoadDict(dict_path);
|
|
162
|
-
freq_sum_ = CalcFreqSum(static_node_infos_);
|
|
163
|
-
CalculateWeight(static_node_infos_, freq_sum_);
|
|
164
|
-
SetStaticWordWeights(user_word_weight_opt);
|
|
165
|
-
|
|
166
|
-
if (user_dict_paths.size()) {
|
|
167
|
-
LoadUserDict(user_dict_paths);
|
|
168
|
-
}
|
|
169
|
-
Shrink(static_node_infos_);
|
|
170
|
-
CreateTrie(static_node_infos_);
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
void CreateTrie(const vector<DictUnit>& dictUnits) {
|
|
174
|
-
assert(dictUnits.size());
|
|
175
|
-
vector<Unicode> words;
|
|
176
|
-
vector<const DictUnit*> valuePointers;
|
|
177
|
-
for (size_t i = 0 ; i < dictUnits.size(); i ++) {
|
|
178
|
-
words.push_back(dictUnits[i].word);
|
|
179
|
-
valuePointers.push_back(&dictUnits[i]);
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
trie_ = new Trie(words, valuePointers);
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
bool MakeNodeInfo(DictUnit& node_info,
|
|
189
|
-
const string& word,
|
|
190
|
-
double weight,
|
|
191
|
-
const string& tag) {
|
|
192
|
-
if (!DecodeRunesInString(word, node_info.word)) {
|
|
193
|
-
XLOG(ERROR) << "Decode " << word << " failed.";
|
|
194
|
-
return false;
|
|
195
|
-
}
|
|
196
|
-
node_info.weight = weight;
|
|
197
|
-
node_info.tag = tag;
|
|
198
|
-
return true;
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
void LoadDict(const string& filePath) {
|
|
202
|
-
ifstream ifs(filePath.c_str());
|
|
203
|
-
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
|
204
|
-
string line;
|
|
205
|
-
vector<string> buf;
|
|
206
|
-
|
|
207
|
-
DictUnit node_info;
|
|
208
|
-
for (size_t lineno = 0; getline(ifs, line); lineno++) {
|
|
209
|
-
Split(line, buf, " ");
|
|
210
|
-
XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
|
|
211
|
-
MakeNodeInfo(node_info,
|
|
212
|
-
buf[0],
|
|
213
|
-
atof(buf[1].c_str()),
|
|
214
|
-
buf[2]);
|
|
215
|
-
static_node_infos_.push_back(node_info);
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
|
|
220
|
-
return lhs.weight < rhs.weight;
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
void SetStaticWordWeights(UserWordWeightOption option) {
|
|
224
|
-
XCHECK(!static_node_infos_.empty());
|
|
225
|
-
vector<DictUnit> x = static_node_infos_;
|
|
226
|
-
sort(x.begin(), x.end(), WeightCompare);
|
|
227
|
-
min_weight_ = x[0].weight;
|
|
228
|
-
max_weight_ = x[x.size() - 1].weight;
|
|
229
|
-
median_weight_ = x[x.size() / 2].weight;
|
|
230
|
-
switch (option) {
|
|
231
|
-
case WordWeightMin:
|
|
232
|
-
user_word_default_weight_ = min_weight_;
|
|
233
|
-
break;
|
|
234
|
-
case WordWeightMedian:
|
|
235
|
-
user_word_default_weight_ = median_weight_;
|
|
236
|
-
break;
|
|
237
|
-
default:
|
|
238
|
-
user_word_default_weight_ = max_weight_;
|
|
239
|
-
break;
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
double CalcFreqSum(const vector<DictUnit>& node_infos) const {
|
|
244
|
-
double sum = 0.0;
|
|
245
|
-
for (size_t i = 0; i < node_infos.size(); i++) {
|
|
246
|
-
sum += node_infos[i].weight;
|
|
247
|
-
}
|
|
248
|
-
return sum;
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
|
|
252
|
-
assert(sum > 0.0);
|
|
253
|
-
for (size_t i = 0; i < node_infos.size(); i++) {
|
|
254
|
-
DictUnit& node_info = node_infos[i];
|
|
255
|
-
assert(node_info.weight > 0.0);
|
|
256
|
-
node_info.weight = log(double(node_info.weight)/sum);
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
void Shrink(vector<DictUnit>& units) const {
|
|
261
|
-
vector<DictUnit>(units.begin(), units.end()).swap(units);
|
|
262
|
-
}
|
|
263
|
-
|
|
264
|
-
vector<DictUnit> static_node_infos_;
|
|
265
|
-
deque<DictUnit> active_node_infos_; // must not be vector
|
|
266
|
-
Trie * trie_;
|
|
267
|
-
|
|
268
|
-
double freq_sum_;
|
|
269
|
-
double min_weight_;
|
|
270
|
-
double max_weight_;
|
|
271
|
-
double median_weight_;
|
|
272
|
-
double user_word_default_weight_;
|
|
273
|
-
unordered_set<Rune> user_dict_single_chinese_word_;
|
|
274
|
-
};
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
#endif
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
#ifndef CPPJIEBA_FULLSEGMENT_H
|
|
2
|
-
#define CPPJIEBA_FULLSEGMENT_H
|
|
3
|
-
|
|
4
|
-
#include <algorithm>
|
|
5
|
-
#include <set>
|
|
6
|
-
#include <cassert>
|
|
7
|
-
#include "limonp/Logging.hpp"
|
|
8
|
-
#include "DictTrie.hpp"
|
|
9
|
-
#include "SegmentBase.hpp"
|
|
10
|
-
#include "Unicode.hpp"
|
|
11
|
-
|
|
12
|
-
namespace cppjieba {
|
|
13
|
-
class FullSegment: public SegmentBase {
|
|
14
|
-
public:
|
|
15
|
-
FullSegment(const string& dictPath) {
|
|
16
|
-
dictTrie_ = new DictTrie(dictPath);
|
|
17
|
-
isNeedDestroy_ = true;
|
|
18
|
-
}
|
|
19
|
-
FullSegment(const DictTrie* dictTrie)
|
|
20
|
-
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
|
21
|
-
assert(dictTrie_);
|
|
22
|
-
}
|
|
23
|
-
~FullSegment() {
|
|
24
|
-
if (isNeedDestroy_) {
|
|
25
|
-
delete dictTrie_;
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
void Cut(const string& sentence,
|
|
29
|
-
vector<string>& words) const {
|
|
30
|
-
vector<Word> tmp;
|
|
31
|
-
Cut(sentence, tmp);
|
|
32
|
-
GetStringsFromWords(tmp, words);
|
|
33
|
-
}
|
|
34
|
-
void Cut(const string& sentence,
|
|
35
|
-
vector<Word>& words) const {
|
|
36
|
-
PreFilter pre_filter(symbols_, sentence);
|
|
37
|
-
PreFilter::Range range;
|
|
38
|
-
vector<WordRange> wrs;
|
|
39
|
-
wrs.reserve(sentence.size()/2);
|
|
40
|
-
while (pre_filter.HasNext()) {
|
|
41
|
-
range = pre_filter.Next();
|
|
42
|
-
Cut(range.begin, range.end, wrs);
|
|
43
|
-
}
|
|
44
|
-
words.clear();
|
|
45
|
-
words.reserve(wrs.size());
|
|
46
|
-
GetWordsFromWordRanges(sentence, wrs, words);
|
|
47
|
-
}
|
|
48
|
-
void Cut(RuneStrArray::const_iterator begin,
|
|
49
|
-
RuneStrArray::const_iterator end,
|
|
50
|
-
vector<WordRange>& res) const {
|
|
51
|
-
// resut of searching in trie tree
|
|
52
|
-
LocalVector<pair<size_t, const DictUnit*> > tRes;
|
|
53
|
-
|
|
54
|
-
// max index of res's words
|
|
55
|
-
size_t maxIdx = 0;
|
|
56
|
-
|
|
57
|
-
// always equals to (uItr - begin)
|
|
58
|
-
size_t uIdx = 0;
|
|
59
|
-
|
|
60
|
-
// tmp variables
|
|
61
|
-
size_t wordLen = 0;
|
|
62
|
-
assert(dictTrie_);
|
|
63
|
-
vector<struct Dag> dags;
|
|
64
|
-
dictTrie_->Find(begin, end, dags);
|
|
65
|
-
for (size_t i = 0; i < dags.size(); i++) {
|
|
66
|
-
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
|
|
67
|
-
size_t nextoffset = dags[i].nexts[j].first;
|
|
68
|
-
assert(nextoffset < dags.size());
|
|
69
|
-
const DictUnit* du = dags[i].nexts[j].second;
|
|
70
|
-
if (du == NULL) {
|
|
71
|
-
if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
|
|
72
|
-
WordRange wr(begin + i, begin + nextoffset);
|
|
73
|
-
res.push_back(wr);
|
|
74
|
-
}
|
|
75
|
-
} else {
|
|
76
|
-
wordLen = du->word.size();
|
|
77
|
-
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
|
|
78
|
-
WordRange wr(begin + i, begin + nextoffset);
|
|
79
|
-
res.push_back(wr);
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
|
|
83
|
-
}
|
|
84
|
-
uIdx++;
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
private:
|
|
88
|
-
const DictTrie* dictTrie_;
|
|
89
|
-
bool isNeedDestroy_;
|
|
90
|
-
};
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
#endif
|
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
#ifndef CPPJIEBA_HMMMODEL_H
|
|
2
|
-
#define CPPJIEBA_HMMMODEL_H
|
|
3
|
-
|
|
4
|
-
#include "limonp/StringUtil.hpp"
|
|
5
|
-
#include "Trie.hpp"
|
|
6
|
-
|
|
7
|
-
namespace cppjieba {
|
|
8
|
-
|
|
9
|
-
using namespace limonp;
|
|
10
|
-
typedef unordered_map<Rune, double> EmitProbMap;
|
|
11
|
-
|
|
12
|
-
struct HMMModel {
|
|
13
|
-
/*
|
|
14
|
-
* STATUS:
|
|
15
|
-
* 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
|
|
16
|
-
* */
|
|
17
|
-
enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
|
|
18
|
-
|
|
19
|
-
HMMModel(const string& modelPath) {
|
|
20
|
-
memset(startProb, 0, sizeof(startProb));
|
|
21
|
-
memset(transProb, 0, sizeof(transProb));
|
|
22
|
-
statMap[0] = 'B';
|
|
23
|
-
statMap[1] = 'E';
|
|
24
|
-
statMap[2] = 'M';
|
|
25
|
-
statMap[3] = 'S';
|
|
26
|
-
emitProbVec.push_back(&emitProbB);
|
|
27
|
-
emitProbVec.push_back(&emitProbE);
|
|
28
|
-
emitProbVec.push_back(&emitProbM);
|
|
29
|
-
emitProbVec.push_back(&emitProbS);
|
|
30
|
-
LoadModel(modelPath);
|
|
31
|
-
}
|
|
32
|
-
~HMMModel() {
|
|
33
|
-
}
|
|
34
|
-
void LoadModel(const string& filePath) {
|
|
35
|
-
ifstream ifile(filePath.c_str());
|
|
36
|
-
XCHECK(ifile.is_open()) << "open " << filePath << " failed";
|
|
37
|
-
string line;
|
|
38
|
-
vector<string> tmp;
|
|
39
|
-
vector<string> tmp2;
|
|
40
|
-
//Load startProb
|
|
41
|
-
XCHECK(GetLine(ifile, line));
|
|
42
|
-
Split(line, tmp, " ");
|
|
43
|
-
XCHECK(tmp.size() == STATUS_SUM);
|
|
44
|
-
for (size_t j = 0; j< tmp.size(); j++) {
|
|
45
|
-
startProb[j] = atof(tmp[j].c_str());
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
//Load transProb
|
|
49
|
-
for (size_t i = 0; i < STATUS_SUM; i++) {
|
|
50
|
-
XCHECK(GetLine(ifile, line));
|
|
51
|
-
Split(line, tmp, " ");
|
|
52
|
-
XCHECK(tmp.size() == STATUS_SUM);
|
|
53
|
-
for (size_t j =0; j < STATUS_SUM; j++) {
|
|
54
|
-
transProb[i][j] = atof(tmp[j].c_str());
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
//Load emitProbB
|
|
59
|
-
XCHECK(GetLine(ifile, line));
|
|
60
|
-
XCHECK(LoadEmitProb(line, emitProbB));
|
|
61
|
-
|
|
62
|
-
//Load emitProbE
|
|
63
|
-
XCHECK(GetLine(ifile, line));
|
|
64
|
-
XCHECK(LoadEmitProb(line, emitProbE));
|
|
65
|
-
|
|
66
|
-
//Load emitProbM
|
|
67
|
-
XCHECK(GetLine(ifile, line));
|
|
68
|
-
XCHECK(LoadEmitProb(line, emitProbM));
|
|
69
|
-
|
|
70
|
-
//Load emitProbS
|
|
71
|
-
XCHECK(GetLine(ifile, line));
|
|
72
|
-
XCHECK(LoadEmitProb(line, emitProbS));
|
|
73
|
-
}
|
|
74
|
-
double GetEmitProb(const EmitProbMap* ptMp, Rune key,
|
|
75
|
-
double defVal)const {
|
|
76
|
-
EmitProbMap::const_iterator cit = ptMp->find(key);
|
|
77
|
-
if (cit == ptMp->end()) {
|
|
78
|
-
return defVal;
|
|
79
|
-
}
|
|
80
|
-
return cit->second;
|
|
81
|
-
}
|
|
82
|
-
bool GetLine(ifstream& ifile, string& line) {
|
|
83
|
-
while (getline(ifile, line)) {
|
|
84
|
-
Trim(line);
|
|
85
|
-
if (line.empty()) {
|
|
86
|
-
continue;
|
|
87
|
-
}
|
|
88
|
-
if (StartsWith(line, "#")) {
|
|
89
|
-
continue;
|
|
90
|
-
}
|
|
91
|
-
return true;
|
|
92
|
-
}
|
|
93
|
-
return false;
|
|
94
|
-
}
|
|
95
|
-
bool LoadEmitProb(const string& line, EmitProbMap& mp) {
|
|
96
|
-
if (line.empty()) {
|
|
97
|
-
return false;
|
|
98
|
-
}
|
|
99
|
-
vector<string> tmp, tmp2;
|
|
100
|
-
Unicode unicode;
|
|
101
|
-
Split(line, tmp, ",");
|
|
102
|
-
for (size_t i = 0; i < tmp.size(); i++) {
|
|
103
|
-
Split(tmp[i], tmp2, ":");
|
|
104
|
-
if (2 != tmp2.size()) {
|
|
105
|
-
XLOG(ERROR) << "emitProb illegal.";
|
|
106
|
-
return false;
|
|
107
|
-
}
|
|
108
|
-
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
|
|
109
|
-
XLOG(ERROR) << "TransCode failed.";
|
|
110
|
-
return false;
|
|
111
|
-
}
|
|
112
|
-
mp[unicode[0]] = atof(tmp2[1].c_str());
|
|
113
|
-
}
|
|
114
|
-
return true;
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
char statMap[STATUS_SUM];
|
|
118
|
-
double startProb[STATUS_SUM];
|
|
119
|
-
double transProb[STATUS_SUM][STATUS_SUM];
|
|
120
|
-
EmitProbMap emitProbB;
|
|
121
|
-
EmitProbMap emitProbE;
|
|
122
|
-
EmitProbMap emitProbM;
|
|
123
|
-
EmitProbMap emitProbS;
|
|
124
|
-
vector<EmitProbMap* > emitProbVec;
|
|
125
|
-
}; // struct HMMModel
|
|
126
|
-
|
|
127
|
-
} // namespace cppjieba
|
|
128
|
-
|
|
129
|
-
#endif
|