jieba-rb 5.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +15 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/jieba/extconf.rb +28 -0
- data/ext/jieba/jieba.c +11 -0
- data/ext/jieba/jieba.h +11 -0
- data/ext/jieba/keyword.cc +92 -0
- data/ext/jieba/keyword.h +17 -0
- data/ext/jieba/segment.cc +107 -0
- data/ext/jieba/segment.h +17 -0
- data/ext/jieba/tagging.cc +76 -0
- data/ext/jieba/tagging.h +17 -0
- data/jieba_rb.gemspec +51 -0
- data/lib/jieba-rb.rb +66 -0
- data/lib/jieba_rb/version.rb +3 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +32 -0
- data/test/test_tagging.rb +22 -0
- data/test/user.dict.utf8 +23 -0
- metadata +219 -0
@@ -0,0 +1,137 @@
|
|
1
|
+
#ifndef CPPJIEBA_MPSEGMENT_H
|
2
|
+
#define CPPJIEBA_MPSEGMENT_H
|
3
|
+
|
4
|
+
#include <algorithm>
|
5
|
+
#include <set>
|
6
|
+
#include <cassert>
|
7
|
+
#include "limonp/Logging.hpp"
|
8
|
+
#include "DictTrie.hpp"
|
9
|
+
#include "SegmentTagged.hpp"
|
10
|
+
#include "PosTagger.hpp"
|
11
|
+
|
12
|
+
namespace cppjieba {
|
13
|
+
|
14
|
+
class MPSegment: public SegmentTagged {
|
15
|
+
public:
|
16
|
+
MPSegment(const string& dictPath, const string& userDictPath = "")
|
17
|
+
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
|
18
|
+
}
|
19
|
+
MPSegment(const DictTrie* dictTrie)
|
20
|
+
: dictTrie_(dictTrie), isNeedDestroy_(false) {
|
21
|
+
assert(dictTrie_);
|
22
|
+
}
|
23
|
+
~MPSegment() {
|
24
|
+
if (isNeedDestroy_) {
|
25
|
+
delete dictTrie_;
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
void Cut(const string& sentence, vector<string>& words) const {
|
30
|
+
Cut(sentence, words, MAX_WORD_LENGTH);
|
31
|
+
}
|
32
|
+
|
33
|
+
void Cut(const string& sentence,
|
34
|
+
vector<string>& words,
|
35
|
+
size_t max_word_len) const {
|
36
|
+
vector<Word> tmp;
|
37
|
+
Cut(sentence, tmp, max_word_len);
|
38
|
+
GetStringsFromWords(tmp, words);
|
39
|
+
}
|
40
|
+
void Cut(const string& sentence,
|
41
|
+
vector<Word>& words,
|
42
|
+
size_t max_word_len = MAX_WORD_LENGTH) const {
|
43
|
+
PreFilter pre_filter(symbols_, sentence);
|
44
|
+
PreFilter::Range range;
|
45
|
+
vector<WordRange> wrs;
|
46
|
+
wrs.reserve(sentence.size()/2);
|
47
|
+
while (pre_filter.HasNext()) {
|
48
|
+
range = pre_filter.Next();
|
49
|
+
Cut(range.begin, range.end, wrs, max_word_len);
|
50
|
+
}
|
51
|
+
words.clear();
|
52
|
+
words.reserve(wrs.size());
|
53
|
+
GetWordsFromWordRanges(sentence, wrs, words);
|
54
|
+
}
|
55
|
+
void Cut(RuneStrArray::const_iterator begin,
|
56
|
+
RuneStrArray::const_iterator end,
|
57
|
+
vector<WordRange>& words,
|
58
|
+
size_t max_word_len = MAX_WORD_LENGTH) const {
|
59
|
+
vector<Dag> dags;
|
60
|
+
dictTrie_->Find(begin,
|
61
|
+
end,
|
62
|
+
dags,
|
63
|
+
max_word_len);
|
64
|
+
CalcDP(dags);
|
65
|
+
CutByDag(begin, end, dags, words);
|
66
|
+
}
|
67
|
+
|
68
|
+
const DictTrie* GetDictTrie() const {
|
69
|
+
return dictTrie_;
|
70
|
+
}
|
71
|
+
|
72
|
+
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
73
|
+
return tagger_.Tag(src, res, *this);
|
74
|
+
}
|
75
|
+
|
76
|
+
bool IsUserDictSingleChineseWord(const Rune& value) const {
|
77
|
+
return dictTrie_->IsUserDictSingleChineseWord(value);
|
78
|
+
}
|
79
|
+
private:
|
80
|
+
void CalcDP(vector<Dag>& dags) const {
|
81
|
+
size_t nextPos;
|
82
|
+
const DictUnit* p;
|
83
|
+
double val;
|
84
|
+
|
85
|
+
for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
|
86
|
+
rit->pInfo = NULL;
|
87
|
+
rit->weight = MIN_DOUBLE;
|
88
|
+
assert(!rit->nexts.empty());
|
89
|
+
for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
|
90
|
+
nextPos = it->first;
|
91
|
+
p = it->second;
|
92
|
+
val = 0.0;
|
93
|
+
if (nextPos + 1 < dags.size()) {
|
94
|
+
val += dags[nextPos + 1].weight;
|
95
|
+
}
|
96
|
+
|
97
|
+
if (p) {
|
98
|
+
val += p->weight;
|
99
|
+
} else {
|
100
|
+
val += dictTrie_->GetMinWeight();
|
101
|
+
}
|
102
|
+
if (val > rit->weight) {
|
103
|
+
rit->pInfo = p;
|
104
|
+
rit->weight = val;
|
105
|
+
}
|
106
|
+
}
|
107
|
+
}
|
108
|
+
}
|
109
|
+
void CutByDag(RuneStrArray::const_iterator begin,
|
110
|
+
RuneStrArray::const_iterator end,
|
111
|
+
const vector<Dag>& dags,
|
112
|
+
vector<WordRange>& words) const {
|
113
|
+
size_t i = 0;
|
114
|
+
while (i < dags.size()) {
|
115
|
+
const DictUnit* p = dags[i].pInfo;
|
116
|
+
if (p) {
|
117
|
+
assert(p->word.size() >= 1);
|
118
|
+
WordRange wr(begin + i, begin + i + p->word.size() - 1);
|
119
|
+
words.push_back(wr);
|
120
|
+
i += p->word.size();
|
121
|
+
} else { //single chinese word
|
122
|
+
WordRange wr(begin + i, begin + i);
|
123
|
+
words.push_back(wr);
|
124
|
+
i++;
|
125
|
+
}
|
126
|
+
}
|
127
|
+
}
|
128
|
+
|
129
|
+
const DictTrie* dictTrie_;
|
130
|
+
bool isNeedDestroy_;
|
131
|
+
PosTagger tagger_;
|
132
|
+
|
133
|
+
}; // class MPSegment
|
134
|
+
|
135
|
+
} // namespace cppjieba
|
136
|
+
|
137
|
+
#endif
|
@@ -0,0 +1,109 @@
|
|
1
|
+
#ifndef CPPJIEBA_MIXSEGMENT_H
|
2
|
+
#define CPPJIEBA_MIXSEGMENT_H
|
3
|
+
|
4
|
+
#include <cassert>
|
5
|
+
#include "MPSegment.hpp"
|
6
|
+
#include "HMMSegment.hpp"
|
7
|
+
#include "limonp/StringUtil.hpp"
|
8
|
+
#include "PosTagger.hpp"
|
9
|
+
|
10
|
+
namespace cppjieba {
|
11
|
+
class MixSegment: public SegmentTagged {
|
12
|
+
public:
|
13
|
+
MixSegment(const string& mpSegDict, const string& hmmSegDict,
|
14
|
+
const string& userDict = "")
|
15
|
+
: mpSeg_(mpSegDict, userDict),
|
16
|
+
hmmSeg_(hmmSegDict) {
|
17
|
+
}
|
18
|
+
MixSegment(const DictTrie* dictTrie, const HMMModel* model)
|
19
|
+
: mpSeg_(dictTrie), hmmSeg_(model) {
|
20
|
+
}
|
21
|
+
~MixSegment() {
|
22
|
+
}
|
23
|
+
|
24
|
+
void Cut(const string& sentence, vector<string>& words) const {
|
25
|
+
Cut(sentence, words, true);
|
26
|
+
}
|
27
|
+
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
28
|
+
vector<Word> tmp;
|
29
|
+
Cut(sentence, tmp, hmm);
|
30
|
+
GetStringsFromWords(tmp, words);
|
31
|
+
}
|
32
|
+
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
33
|
+
PreFilter pre_filter(symbols_, sentence);
|
34
|
+
PreFilter::Range range;
|
35
|
+
vector<WordRange> wrs;
|
36
|
+
wrs.reserve(sentence.size() / 2);
|
37
|
+
while (pre_filter.HasNext()) {
|
38
|
+
range = pre_filter.Next();
|
39
|
+
Cut(range.begin, range.end, wrs, hmm);
|
40
|
+
}
|
41
|
+
words.clear();
|
42
|
+
words.reserve(wrs.size());
|
43
|
+
GetWordsFromWordRanges(sentence, wrs, words);
|
44
|
+
}
|
45
|
+
|
46
|
+
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
47
|
+
if (!hmm) {
|
48
|
+
mpSeg_.Cut(begin, end, res);
|
49
|
+
return;
|
50
|
+
}
|
51
|
+
vector<WordRange> words;
|
52
|
+
assert(end >= begin);
|
53
|
+
words.reserve(end - begin);
|
54
|
+
mpSeg_.Cut(begin, end, words);
|
55
|
+
|
56
|
+
vector<WordRange> hmmRes;
|
57
|
+
hmmRes.reserve(end - begin);
|
58
|
+
for (size_t i = 0; i < words.size(); i++) {
|
59
|
+
//if mp Get a word, it's ok, put it into result
|
60
|
+
if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
|
61
|
+
res.push_back(words[i]);
|
62
|
+
continue;
|
63
|
+
}
|
64
|
+
|
65
|
+
// if mp Get a single one and it is not in userdict, collect it in sequence
|
66
|
+
size_t j = i;
|
67
|
+
while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
|
68
|
+
j++;
|
69
|
+
}
|
70
|
+
|
71
|
+
// Cut the sequence with hmm
|
72
|
+
assert(j - 1 >= i);
|
73
|
+
// TODO
|
74
|
+
hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
|
75
|
+
//put hmm result to result
|
76
|
+
for (size_t k = 0; k < hmmRes.size(); k++) {
|
77
|
+
res.push_back(hmmRes[k]);
|
78
|
+
}
|
79
|
+
|
80
|
+
//clear tmp vars
|
81
|
+
hmmRes.clear();
|
82
|
+
|
83
|
+
//let i jump over this piece
|
84
|
+
i = j - 1;
|
85
|
+
}
|
86
|
+
}
|
87
|
+
|
88
|
+
const DictTrie* GetDictTrie() const {
|
89
|
+
return mpSeg_.GetDictTrie();
|
90
|
+
}
|
91
|
+
|
92
|
+
bool Tag(const string& src, vector<pair<string, string> >& res) const {
|
93
|
+
return tagger_.Tag(src, res, *this);
|
94
|
+
}
|
95
|
+
|
96
|
+
string LookupTag(const string &str) const {
|
97
|
+
return tagger_.LookupTag(str, *this);
|
98
|
+
}
|
99
|
+
|
100
|
+
private:
|
101
|
+
MPSegment mpSeg_;
|
102
|
+
HMMSegment hmmSeg_;
|
103
|
+
PosTagger tagger_;
|
104
|
+
|
105
|
+
}; // class MixSegment
|
106
|
+
|
107
|
+
} // namespace cppjieba
|
108
|
+
|
109
|
+
#endif
|
@@ -0,0 +1,77 @@
|
|
1
|
+
#ifndef CPPJIEBA_POS_TAGGING_H
|
2
|
+
#define CPPJIEBA_POS_TAGGING_H
|
3
|
+
|
4
|
+
#include "limonp/StringUtil.hpp"
|
5
|
+
#include "SegmentTagged.hpp"
|
6
|
+
#include "DictTrie.hpp"
|
7
|
+
|
8
|
+
namespace cppjieba {
|
9
|
+
using namespace limonp;
|
10
|
+
|
11
|
+
static const char* const POS_M = "m";
|
12
|
+
static const char* const POS_ENG = "eng";
|
13
|
+
static const char* const POS_X = "x";
|
14
|
+
|
15
|
+
class PosTagger {
|
16
|
+
public:
|
17
|
+
PosTagger() {
|
18
|
+
}
|
19
|
+
~PosTagger() {
|
20
|
+
}
|
21
|
+
|
22
|
+
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
|
23
|
+
vector<string> CutRes;
|
24
|
+
segment.Cut(src, CutRes);
|
25
|
+
|
26
|
+
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
|
27
|
+
res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
|
28
|
+
}
|
29
|
+
return !res.empty();
|
30
|
+
}
|
31
|
+
|
32
|
+
string LookupTag(const string &str, const SegmentTagged& segment) const {
|
33
|
+
const DictUnit *tmp = NULL;
|
34
|
+
RuneStrArray runes;
|
35
|
+
const DictTrie * dict = segment.GetDictTrie();
|
36
|
+
assert(dict != NULL);
|
37
|
+
if (!DecodeRunesInString(str, runes)) {
|
38
|
+
XLOG(ERROR) << "Decode failed.";
|
39
|
+
return POS_X;
|
40
|
+
}
|
41
|
+
tmp = dict->Find(runes.begin(), runes.end());
|
42
|
+
if (tmp == NULL || tmp->tag.empty()) {
|
43
|
+
return SpecialRule(runes);
|
44
|
+
} else {
|
45
|
+
return tmp->tag;
|
46
|
+
}
|
47
|
+
}
|
48
|
+
|
49
|
+
private:
|
50
|
+
const char* SpecialRule(const RuneStrArray& unicode) const {
|
51
|
+
size_t m = 0;
|
52
|
+
size_t eng = 0;
|
53
|
+
for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
|
54
|
+
if (unicode[i].rune < 0x80) {
|
55
|
+
eng ++;
|
56
|
+
if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
|
57
|
+
m++;
|
58
|
+
}
|
59
|
+
}
|
60
|
+
}
|
61
|
+
// ascii char is not found
|
62
|
+
if (eng == 0) {
|
63
|
+
return POS_X;
|
64
|
+
}
|
65
|
+
// all the ascii is number char
|
66
|
+
if (m == eng) {
|
67
|
+
return POS_M;
|
68
|
+
}
|
69
|
+
// the ascii chars contain english letter
|
70
|
+
return POS_ENG;
|
71
|
+
}
|
72
|
+
|
73
|
+
}; // class PosTagger
|
74
|
+
|
75
|
+
} // namespace cppjieba
|
76
|
+
|
77
|
+
#endif
|
@@ -0,0 +1,54 @@
|
|
1
|
+
#ifndef CPPJIEBA_PRE_FILTER_H
|
2
|
+
#define CPPJIEBA_PRE_FILTER_H
|
3
|
+
|
4
|
+
#include "Trie.hpp"
|
5
|
+
#include "limonp/Logging.hpp"
|
6
|
+
|
7
|
+
namespace cppjieba {
|
8
|
+
|
9
|
+
class PreFilter {
|
10
|
+
public:
|
11
|
+
//TODO use WordRange instead of Range
|
12
|
+
struct Range {
|
13
|
+
RuneStrArray::const_iterator begin;
|
14
|
+
RuneStrArray::const_iterator end;
|
15
|
+
}; // struct Range
|
16
|
+
|
17
|
+
PreFilter(const unordered_set<Rune>& symbols,
|
18
|
+
const string& sentence)
|
19
|
+
: symbols_(symbols) {
|
20
|
+
if (!DecodeRunesInString(sentence, sentence_)) {
|
21
|
+
XLOG(ERROR) << "decode failed. ";
|
22
|
+
}
|
23
|
+
cursor_ = sentence_.begin();
|
24
|
+
}
|
25
|
+
~PreFilter() {
|
26
|
+
}
|
27
|
+
bool HasNext() const {
|
28
|
+
return cursor_ != sentence_.end();
|
29
|
+
}
|
30
|
+
Range Next() {
|
31
|
+
Range range;
|
32
|
+
range.begin = cursor_;
|
33
|
+
while (cursor_ != sentence_.end()) {
|
34
|
+
if (IsIn(symbols_, cursor_->rune)) {
|
35
|
+
if (range.begin == cursor_) {
|
36
|
+
cursor_ ++;
|
37
|
+
}
|
38
|
+
range.end = cursor_;
|
39
|
+
return range;
|
40
|
+
}
|
41
|
+
cursor_ ++;
|
42
|
+
}
|
43
|
+
range.end = sentence_.end();
|
44
|
+
return range;
|
45
|
+
}
|
46
|
+
private:
|
47
|
+
RuneStrArray::const_iterator cursor_;
|
48
|
+
RuneStrArray sentence_;
|
49
|
+
const unordered_set<Rune>& symbols_;
|
50
|
+
}; // class PreFilter
|
51
|
+
|
52
|
+
} // namespace cppjieba
|
53
|
+
|
54
|
+
#endif // CPPJIEBA_PRE_FILTER_H
|
@@ -0,0 +1,90 @@
|
|
1
|
+
#ifndef CPPJIEBA_QUERYSEGMENT_H
|
2
|
+
#define CPPJIEBA_QUERYSEGMENT_H
|
3
|
+
|
4
|
+
#include <algorithm>
|
5
|
+
#include <set>
|
6
|
+
#include <cassert>
|
7
|
+
#include "limonp/Logging.hpp"
|
8
|
+
#include "DictTrie.hpp"
|
9
|
+
#include "SegmentBase.hpp"
|
10
|
+
#include "FullSegment.hpp"
|
11
|
+
#include "MixSegment.hpp"
|
12
|
+
#include "Unicode.hpp"
|
13
|
+
#include "DictTrie.hpp"
|
14
|
+
|
15
|
+
namespace cppjieba {
|
16
|
+
class QuerySegment: public SegmentBase {
|
17
|
+
public:
|
18
|
+
QuerySegment(const string& dict, const string& model, const string& userDict = "")
|
19
|
+
: mixSeg_(dict, model, userDict),
|
20
|
+
trie_(mixSeg_.GetDictTrie()) {
|
21
|
+
}
|
22
|
+
QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
|
23
|
+
: mixSeg_(dictTrie, model), trie_(dictTrie) {
|
24
|
+
}
|
25
|
+
~QuerySegment() {
|
26
|
+
}
|
27
|
+
|
28
|
+
void Cut(const string& sentence, vector<string>& words) const {
|
29
|
+
Cut(sentence, words, true);
|
30
|
+
}
|
31
|
+
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
32
|
+
vector<Word> tmp;
|
33
|
+
Cut(sentence, tmp, hmm);
|
34
|
+
GetStringsFromWords(tmp, words);
|
35
|
+
}
|
36
|
+
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
37
|
+
PreFilter pre_filter(symbols_, sentence);
|
38
|
+
PreFilter::Range range;
|
39
|
+
vector<WordRange> wrs;
|
40
|
+
wrs.reserve(sentence.size()/2);
|
41
|
+
while (pre_filter.HasNext()) {
|
42
|
+
range = pre_filter.Next();
|
43
|
+
Cut(range.begin, range.end, wrs, hmm);
|
44
|
+
}
|
45
|
+
words.clear();
|
46
|
+
words.reserve(wrs.size());
|
47
|
+
GetWordsFromWordRanges(sentence, wrs, words);
|
48
|
+
}
|
49
|
+
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
50
|
+
//use mix Cut first
|
51
|
+
vector<WordRange> mixRes;
|
52
|
+
mixSeg_.Cut(begin, end, mixRes, hmm);
|
53
|
+
|
54
|
+
vector<WordRange> fullRes;
|
55
|
+
for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
56
|
+
if (mixResItr->Length() > 2) {
|
57
|
+
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
|
58
|
+
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
|
59
|
+
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
|
60
|
+
res.push_back(wr);
|
61
|
+
}
|
62
|
+
}
|
63
|
+
}
|
64
|
+
if (mixResItr->Length() > 3) {
|
65
|
+
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
|
66
|
+
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
|
67
|
+
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
|
68
|
+
res.push_back(wr);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
}
|
72
|
+
res.push_back(*mixResItr);
|
73
|
+
}
|
74
|
+
}
|
75
|
+
private:
|
76
|
+
bool IsAllAscii(const Unicode& s) const {
|
77
|
+
for(size_t i = 0; i < s.size(); i++) {
|
78
|
+
if (s[i] >= 0x80) {
|
79
|
+
return false;
|
80
|
+
}
|
81
|
+
}
|
82
|
+
return true;
|
83
|
+
}
|
84
|
+
MixSegment mixSeg_;
|
85
|
+
const DictTrie* trie_;
|
86
|
+
}; // QuerySegment
|
87
|
+
|
88
|
+
} // namespace cppjieba
|
89
|
+
|
90
|
+
#endif
|