cppjieba_rb 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +3 -0
- data/README.md +1 -1
- data/Rakefile +2 -2
- data/cppjieba_rb.gemspec +4 -4
- data/lib/cppjieba_rb/version.rb +1 -1
- metadata +17 -135
- data/ext/cppjieba/.gitignore +0 -17
- data/ext/cppjieba/.travis.yml +0 -21
- data/ext/cppjieba/CMakeLists.txt +0 -28
- data/ext/cppjieba/ChangeLog.md +0 -236
- data/ext/cppjieba/README.md +0 -292
- data/ext/cppjieba/README_EN.md +0 -113
- data/ext/cppjieba/appveyor.yml +0 -32
- data/ext/cppjieba/deps/CMakeLists.txt +0 -1
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
- data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
- data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
- data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
- data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
- data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
- data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
- data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
- data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
- data/ext/cppjieba/dict/README.md +0 -31
- data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
- data/ext/cppjieba/dict/idf.utf8 +0 -258826
- data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
- data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
- data/ext/cppjieba/dict/user.dict.utf8 +0 -4
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
- data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
- data/ext/cppjieba/test/CMakeLists.txt +0 -5
- data/ext/cppjieba/test/demo.cpp +0 -80
- data/ext/cppjieba/test/load_test.cpp +0 -54
- data/ext/cppjieba/test/testdata/curl.res +0 -1
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
- data/ext/cppjieba/test/testdata/load_test.urls +0 -2
- data/ext/cppjieba/test/testdata/review.100 +0 -100
- data/ext/cppjieba/test/testdata/review.100.res +0 -200
- data/ext/cppjieba/test/testdata/server.conf +0 -19
- data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
- data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
- data/ext/cppjieba/test/testdata/userdict.english +0 -2
- data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
- data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
- data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
- data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
- data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
- data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
- data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
- data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
- data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
@@ -1,137 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_MPSEGMENT_H
|
2
|
-
#define CPPJIEBA_MPSEGMENT_H
|
3
|
-
|
4
|
-
#include <algorithm>
|
5
|
-
#include <set>
|
6
|
-
#include <cassert>
|
7
|
-
#include "limonp/Logging.hpp"
|
8
|
-
#include "DictTrie.hpp"
|
9
|
-
#include "SegmentTagged.hpp"
|
10
|
-
#include "PosTagger.hpp"
|
11
|
-
|
12
|
-
namespace cppjieba {
|
13
|
-
|
14
|
-
class MPSegment: public SegmentTagged {
|
15
|
-
public:
|
16
|
-
MPSegment(const string& dictPath, const string& userDictPath = "")
|
17
|
-
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
|
18
|
-
}
|
19
|
-
MPSegment(const DictTrie* dictTrie)
|
20
|
-
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
21
|
-
assert(dictTrie_);
|
22
|
-
}
|
23
|
-
~MPSegment() {
|
24
|
-
if (isNeedDestroy_) {
|
25
|
-
delete dictTrie_;
|
26
|
-
}
|
27
|
-
}
|
28
|
-
|
29
|
-
void Cut(const string& sentence, vector<string>& words) const {
|
30
|
-
Cut(sentence, words, MAX_WORD_LENGTH);
|
31
|
-
}
|
32
|
-
|
33
|
-
void Cut(const string& sentence,
|
34
|
-
vector<string>& words,
|
35
|
-
size_t max_word_len) const {
|
36
|
-
vector<Word> tmp;
|
37
|
-
Cut(sentence, tmp, max_word_len);
|
38
|
-
GetStringsFromWords(tmp, words);
|
39
|
-
}
|
40
|
-
void Cut(const string& sentence,
|
41
|
-
vector<Word>& words,
|
42
|
-
size_t max_word_len = MAX_WORD_LENGTH) const {
|
43
|
-
PreFilter pre_filter(symbols_, sentence);
|
44
|
-
PreFilter::Range range;
|
45
|
-
vector<WordRange> wrs;
|
46
|
-
wrs.reserve(sentence.size()/2);
|
47
|
-
while (pre_filter.HasNext()) {
|
48
|
-
range = pre_filter.Next();
|
49
|
-
Cut(range.begin, range.end, wrs, max_word_len);
|
50
|
-
}
|
51
|
-
words.clear();
|
52
|
-
words.reserve(wrs.size());
|
53
|
-
GetWordsFromWordRanges(sentence, wrs, words);
|
54
|
-
}
|
55
|
-
void Cut(RuneStrArray::const_iterator begin,
|
56
|
-
RuneStrArray::const_iterator end,
|
57
|
-
vector<WordRange>& words,
|
58
|
-
size_t max_word_len = MAX_WORD_LENGTH) const {
|
59
|
-
vector<Dag> dags;
|
60
|
-
dictTrie_->Find(begin,
|
61
|
-
end,
|
62
|
-
dags,
|
63
|
-
max_word_len);
|
64
|
-
CalcDP(dags);
|
65
|
-
CutByDag(begin, end, dags, words);
|
66
|
-
}
|
67
|
-
|
68
|
-
const DictTrie* GetDictTrie() const {
|
69
|
-
return dictTrie_;
|
70
|
-
}
|
71
|
-
|
72
|
-
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
73
|
-
return tagger_.Tag(src, res, *this);
|
74
|
-
}
|
75
|
-
|
76
|
-
bool IsUserDictSingleChineseWord(const Rune& value) const {
|
77
|
-
return dictTrie_->IsUserDictSingleChineseWord(value);
|
78
|
-
}
|
79
|
-
private:
|
80
|
-
void CalcDP(vector<Dag>& dags) const {
|
81
|
-
size_t nextPos;
|
82
|
-
const DictUnit* p;
|
83
|
-
double val;
|
84
|
-
|
85
|
-
for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
86
|
-
rit->pInfo = NULL;
|
87
|
-
rit->weight = MIN_DOUBLE;
|
88
|
-
assert(!rit->nexts.empty());
|
89
|
-
for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
|
90
|
-
nextPos = it->first;
|
91
|
-
p = it->second;
|
92
|
-
val = 0.0;
|
93
|
-
if (nextPos + 1 < dags.size()) {
|
94
|
-
val += dags[nextPos + 1].weight;
|
95
|
-
}
|
96
|
-
|
97
|
-
if (p) {
|
98
|
-
val += p->weight;
|
99
|
-
} else {
|
100
|
-
val += dictTrie_->GetMinWeight();
|
101
|
-
}
|
102
|
-
if (val > rit->weight) {
|
103
|
-
rit->pInfo = p;
|
104
|
-
rit->weight = val;
|
105
|
-
}
|
106
|
-
}
|
107
|
-
}
|
108
|
-
}
|
109
|
-
void CutByDag(RuneStrArray::const_iterator begin,
|
110
|
-
RuneStrArray::const_iterator end,
|
111
|
-
const vector<Dag>& dags,
|
112
|
-
vector<WordRange>& words) const {
|
113
|
-
size_t i = 0;
|
114
|
-
while (i < dags.size()) {
|
115
|
-
const DictUnit* p = dags[i].pInfo;
|
116
|
-
if (p) {
|
117
|
-
assert(p->word.size() >= 1);
|
118
|
-
WordRange wr(begin + i, begin + i + p->word.size() - 1);
|
119
|
-
words.push_back(wr);
|
120
|
-
i += p->word.size();
|
121
|
-
} else { //single chinese word
|
122
|
-
WordRange wr(begin + i, begin + i);
|
123
|
-
words.push_back(wr);
|
124
|
-
i++;
|
125
|
-
}
|
126
|
-
}
|
127
|
-
}
|
128
|
-
|
129
|
-
const DictTrie* dictTrie_;
|
130
|
-
bool isNeedDestroy_;
|
131
|
-
PosTagger tagger_;
|
132
|
-
|
133
|
-
}; // class MPSegment
|
134
|
-
|
135
|
-
} // namespace cppjieba
|
136
|
-
|
137
|
-
#endif
|
@@ -1,109 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_MIXSEGMENT_H
|
2
|
-
#define CPPJIEBA_MIXSEGMENT_H
|
3
|
-
|
4
|
-
#include <cassert>
|
5
|
-
#include "MPSegment.hpp"
|
6
|
-
#include "HMMSegment.hpp"
|
7
|
-
#include "limonp/StringUtil.hpp"
|
8
|
-
#include "PosTagger.hpp"
|
9
|
-
|
10
|
-
namespace cppjieba {
|
11
|
-
class MixSegment: public SegmentTagged {
|
12
|
-
public:
|
13
|
-
MixSegment(const string& mpSegDict, const string& hmmSegDict,
|
14
|
-
const string& userDict = "")
|
15
|
-
: mpSeg_(mpSegDict, userDict),
|
16
|
-
hmmSeg_(hmmSegDict) {
|
17
|
-
}
|
18
|
-
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
|
19
|
-
: mpSeg_(dictTrie), hmmSeg_(model) {
|
20
|
-
}
|
21
|
-
~MixSegment() {
|
22
|
-
}
|
23
|
-
|
24
|
-
void Cut(const string& sentence, vector<string>& words) const {
|
25
|
-
Cut(sentence, words, true);
|
26
|
-
}
|
27
|
-
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
28
|
-
vector<Word> tmp;
|
29
|
-
Cut(sentence, tmp, hmm);
|
30
|
-
GetStringsFromWords(tmp, words);
|
31
|
-
}
|
32
|
-
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
33
|
-
PreFilter pre_filter(symbols_, sentence);
|
34
|
-
PreFilter::Range range;
|
35
|
-
vector<WordRange> wrs;
|
36
|
-
wrs.reserve(sentence.size() / 2);
|
37
|
-
while (pre_filter.HasNext()) {
|
38
|
-
range = pre_filter.Next();
|
39
|
-
Cut(range.begin, range.end, wrs, hmm);
|
40
|
-
}
|
41
|
-
words.clear();
|
42
|
-
words.reserve(wrs.size());
|
43
|
-
GetWordsFromWordRanges(sentence, wrs, words);
|
44
|
-
}
|
45
|
-
|
46
|
-
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
47
|
-
if (!hmm) {
|
48
|
-
mpSeg_.Cut(begin, end, res);
|
49
|
-
return;
|
50
|
-
}
|
51
|
-
vector<WordRange> words;
|
52
|
-
assert(end >= begin);
|
53
|
-
words.reserve(end - begin);
|
54
|
-
mpSeg_.Cut(begin, end, words);
|
55
|
-
|
56
|
-
vector<WordRange> hmmRes;
|
57
|
-
hmmRes.reserve(end - begin);
|
58
|
-
for (size_t i = 0; i < words.size(); i++) {
|
59
|
-
//if mp Get a word, it's ok, put it into result
|
60
|
-
if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
|
61
|
-
res.push_back(words[i]);
|
62
|
-
continue;
|
63
|
-
}
|
64
|
-
|
65
|
-
// if mp Get a single one and it is not in userdict, collect it in sequence
|
66
|
-
size_t j = i;
|
67
|
-
while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
68
|
-
j++;
|
69
|
-
}
|
70
|
-
|
71
|
-
// Cut the sequence with hmm
|
72
|
-
assert(j - 1 >= i);
|
73
|
-
// TODO
|
74
|
-
hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
|
75
|
-
//put hmm result to result
|
76
|
-
for (size_t k = 0; k < hmmRes.size(); k++) {
|
77
|
-
res.push_back(hmmRes[k]);
|
78
|
-
}
|
79
|
-
|
80
|
-
//clear tmp vars
|
81
|
-
hmmRes.clear();
|
82
|
-
|
83
|
-
//let i jump over this piece
|
84
|
-
i = j - 1;
|
85
|
-
}
|
86
|
-
}
|
87
|
-
|
88
|
-
const DictTrie* GetDictTrie() const {
|
89
|
-
return mpSeg_.GetDictTrie();
|
90
|
-
}
|
91
|
-
|
92
|
-
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
93
|
-
return tagger_.Tag(src, res, *this);
|
94
|
-
}
|
95
|
-
|
96
|
-
string LookupTag(const string &str) const {
|
97
|
-
return tagger_.LookupTag(str, *this);
|
98
|
-
}
|
99
|
-
|
100
|
-
private:
|
101
|
-
MPSegment mpSeg_;
|
102
|
-
HMMSegment hmmSeg_;
|
103
|
-
PosTagger tagger_;
|
104
|
-
|
105
|
-
}; // class MixSegment
|
106
|
-
|
107
|
-
} // namespace cppjieba
|
108
|
-
|
109
|
-
#endif
|
@@ -1,77 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_POS_TAGGING_H
|
2
|
-
#define CPPJIEBA_POS_TAGGING_H
|
3
|
-
|
4
|
-
#include "limonp/StringUtil.hpp"
|
5
|
-
#include "SegmentTagged.hpp"
|
6
|
-
#include "DictTrie.hpp"
|
7
|
-
|
8
|
-
namespace cppjieba {
|
9
|
-
using namespace limonp;
|
10
|
-
|
11
|
-
static const char* const POS_M = "m";
|
12
|
-
static const char* const POS_ENG = "eng";
|
13
|
-
static const char* const POS_X = "x";
|
14
|
-
|
15
|
-
class PosTagger {
|
16
|
-
public:
|
17
|
-
PosTagger() {
|
18
|
-
}
|
19
|
-
~PosTagger() {
|
20
|
-
}
|
21
|
-
|
22
|
-
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
|
23
|
-
vector<string> CutRes;
|
24
|
-
segment.Cut(src, CutRes);
|
25
|
-
|
26
|
-
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
27
|
-
res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
|
28
|
-
}
|
29
|
-
return !res.empty();
|
30
|
-
}
|
31
|
-
|
32
|
-
string LookupTag(const string &str, const SegmentTagged& segment) const {
|
33
|
-
const DictUnit *tmp = NULL;
|
34
|
-
RuneStrArray runes;
|
35
|
-
const DictTrie * dict = segment.GetDictTrie();
|
36
|
-
assert(dict != NULL);
|
37
|
-
if (!DecodeRunesInString(str, runes)) {
|
38
|
-
XLOG(ERROR) << "Decode failed.";
|
39
|
-
return POS_X;
|
40
|
-
}
|
41
|
-
tmp = dict->Find(runes.begin(), runes.end());
|
42
|
-
if (tmp == NULL || tmp->tag.empty()) {
|
43
|
-
return SpecialRule(runes);
|
44
|
-
} else {
|
45
|
-
return tmp->tag;
|
46
|
-
}
|
47
|
-
}
|
48
|
-
|
49
|
-
private:
|
50
|
-
const char* SpecialRule(const RuneStrArray& unicode) const {
|
51
|
-
size_t m = 0;
|
52
|
-
size_t eng = 0;
|
53
|
-
for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
|
54
|
-
if (unicode[i].rune < 0x80) {
|
55
|
-
eng ++;
|
56
|
-
if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
|
57
|
-
m++;
|
58
|
-
}
|
59
|
-
}
|
60
|
-
}
|
61
|
-
// ascii char is not found
|
62
|
-
if (eng == 0) {
|
63
|
-
return POS_X;
|
64
|
-
}
|
65
|
-
// all the ascii is number char
|
66
|
-
if (m == eng) {
|
67
|
-
return POS_M;
|
68
|
-
}
|
69
|
-
// the ascii chars contain english letter
|
70
|
-
return POS_ENG;
|
71
|
-
}
|
72
|
-
|
73
|
-
}; // class PosTagger
|
74
|
-
|
75
|
-
} // namespace cppjieba
|
76
|
-
|
77
|
-
#endif
|
@@ -1,54 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_PRE_FILTER_H
|
2
|
-
#define CPPJIEBA_PRE_FILTER_H
|
3
|
-
|
4
|
-
#include "Trie.hpp"
|
5
|
-
#include "limonp/Logging.hpp"
|
6
|
-
|
7
|
-
namespace cppjieba {
|
8
|
-
|
9
|
-
class PreFilter {
|
10
|
-
public:
|
11
|
-
//TODO use WordRange instead of Range
|
12
|
-
struct Range {
|
13
|
-
RuneStrArray::const_iterator begin;
|
14
|
-
RuneStrArray::const_iterator end;
|
15
|
-
}; // struct Range
|
16
|
-
|
17
|
-
PreFilter(const unordered_set<Rune>& symbols,
|
18
|
-
const string& sentence)
|
19
|
-
: symbols_(symbols) {
|
20
|
-
if (!DecodeRunesInString(sentence, sentence_)) {
|
21
|
-
XLOG(ERROR) << "decode failed. ";
|
22
|
-
}
|
23
|
-
cursor_ = sentence_.begin();
|
24
|
-
}
|
25
|
-
~PreFilter() {
|
26
|
-
}
|
27
|
-
bool HasNext() const {
|
28
|
-
return cursor_ != sentence_.end();
|
29
|
-
}
|
30
|
-
Range Next() {
|
31
|
-
Range range;
|
32
|
-
range.begin = cursor_;
|
33
|
-
while (cursor_ != sentence_.end()) {
|
34
|
-
if (IsIn(symbols_, cursor_->rune)) {
|
35
|
-
if (range.begin == cursor_) {
|
36
|
-
cursor_ ++;
|
37
|
-
}
|
38
|
-
range.end = cursor_;
|
39
|
-
return range;
|
40
|
-
}
|
41
|
-
cursor_ ++;
|
42
|
-
}
|
43
|
-
range.end = sentence_.end();
|
44
|
-
return range;
|
45
|
-
}
|
46
|
-
private:
|
47
|
-
RuneStrArray::const_iterator cursor_;
|
48
|
-
RuneStrArray sentence_;
|
49
|
-
const unordered_set<Rune>& symbols_;
|
50
|
-
}; // class PreFilter
|
51
|
-
|
52
|
-
} // namespace cppjieba
|
53
|
-
|
54
|
-
#endif // CPPJIEBA_PRE_FILTER_H
|
@@ -1,90 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_QUERYSEGMENT_H
|
2
|
-
#define CPPJIEBA_QUERYSEGMENT_H
|
3
|
-
|
4
|
-
#include <algorithm>
|
5
|
-
#include <set>
|
6
|
-
#include <cassert>
|
7
|
-
#include "limonp/Logging.hpp"
|
8
|
-
#include "DictTrie.hpp"
|
9
|
-
#include "SegmentBase.hpp"
|
10
|
-
#include "FullSegment.hpp"
|
11
|
-
#include "MixSegment.hpp"
|
12
|
-
#include "Unicode.hpp"
|
13
|
-
#include "DictTrie.hpp"
|
14
|
-
|
15
|
-
namespace cppjieba {
|
16
|
-
class QuerySegment: public SegmentBase {
|
17
|
-
public:
|
18
|
-
QuerySegment(const string& dict, const string& model, const string& userDict = "")
|
19
|
-
: mixSeg_(dict, model, userDict),
|
20
|
-
trie_(mixSeg_.GetDictTrie()) {
|
21
|
-
}
|
22
|
-
QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
|
23
|
-
: mixSeg_(dictTrie, model), trie_(dictTrie) {
|
24
|
-
}
|
25
|
-
~QuerySegment() {
|
26
|
-
}
|
27
|
-
|
28
|
-
void Cut(const string& sentence, vector<string>& words) const {
|
29
|
-
Cut(sentence, words, true);
|
30
|
-
}
|
31
|
-
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
32
|
-
vector<Word> tmp;
|
33
|
-
Cut(sentence, tmp, hmm);
|
34
|
-
GetStringsFromWords(tmp, words);
|
35
|
-
}
|
36
|
-
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
37
|
-
PreFilter pre_filter(symbols_, sentence);
|
38
|
-
PreFilter::Range range;
|
39
|
-
vector<WordRange> wrs;
|
40
|
-
wrs.reserve(sentence.size()/2);
|
41
|
-
while (pre_filter.HasNext()) {
|
42
|
-
range = pre_filter.Next();
|
43
|
-
Cut(range.begin, range.end, wrs, hmm);
|
44
|
-
}
|
45
|
-
words.clear();
|
46
|
-
words.reserve(wrs.size());
|
47
|
-
GetWordsFromWordRanges(sentence, wrs, words);
|
48
|
-
}
|
49
|
-
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
50
|
-
//use mix Cut first
|
51
|
-
vector<WordRange> mixRes;
|
52
|
-
mixSeg_.Cut(begin, end, mixRes, hmm);
|
53
|
-
|
54
|
-
vector<WordRange> fullRes;
|
55
|
-
for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
56
|
-
if (mixResItr->Length() > 2) {
|
57
|
-
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
|
58
|
-
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
|
59
|
-
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
|
60
|
-
res.push_back(wr);
|
61
|
-
}
|
62
|
-
}
|
63
|
-
}
|
64
|
-
if (mixResItr->Length() > 3) {
|
65
|
-
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
|
66
|
-
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
|
67
|
-
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
|
68
|
-
res.push_back(wr);
|
69
|
-
}
|
70
|
-
}
|
71
|
-
}
|
72
|
-
res.push_back(*mixResItr);
|
73
|
-
}
|
74
|
-
}
|
75
|
-
private:
|
76
|
-
bool IsAllAscii(const Unicode& s) const {
|
77
|
-
for(size_t i = 0; i < s.size(); i++) {
|
78
|
-
if (s[i] >= 0x80) {
|
79
|
-
return false;
|
80
|
-
}
|
81
|
-
}
|
82
|
-
return true;
|
83
|
-
}
|
84
|
-
MixSegment mixSeg_;
|
85
|
-
const DictTrie* trie_;
|
86
|
-
}; // QuerySegment
|
87
|
-
|
88
|
-
} // namespace cppjieba
|
89
|
-
|
90
|
-
#endif
|