jieba-rb 5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +19 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +85 -0
  8. data/Rakefile +15 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +28 -0
  12. data/ext/cppjieba/ChangeLog.md +236 -0
  13. data/ext/cppjieba/README.md +285 -0
  14. data/ext/cppjieba/README_EN.md +111 -0
  15. data/ext/cppjieba/appveyor.yml +32 -0
  16. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  45. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  46. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  56. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  57. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  58. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  59. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  60. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  61. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  62. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  63. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  64. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  65. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  66. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  67. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  68. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  69. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  70. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  71. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  72. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  73. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  74. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  75. data/ext/cppjieba/dict/README.md +31 -0
  76. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  77. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  78. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  79. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  80. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  83. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  84. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  85. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  86. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  87. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  88. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  89. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  90. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  91. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  92. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  93. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  94. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  95. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  96. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
  98. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  99. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  100. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  101. data/ext/jieba/extconf.rb +28 -0
  102. data/ext/jieba/jieba.c +11 -0
  103. data/ext/jieba/jieba.h +11 -0
  104. data/ext/jieba/keyword.cc +92 -0
  105. data/ext/jieba/keyword.h +17 -0
  106. data/ext/jieba/segment.cc +107 -0
  107. data/ext/jieba/segment.h +17 -0
  108. data/ext/jieba/tagging.cc +76 -0
  109. data/ext/jieba/tagging.h +17 -0
  110. data/jieba_rb.gemspec +51 -0
  111. data/lib/jieba-rb.rb +66 -0
  112. data/lib/jieba_rb/version.rb +3 -0
  113. data/test/test_keyword.rb +17 -0
  114. data/test/test_segment.rb +32 -0
  115. data/test/test_tagging.rb +22 -0
  116. data/test/user.dict.utf8 +23 -0
  117. metadata +219 -0
@@ -0,0 +1,137 @@
1
+ #ifndef CPPJIEBA_MPSEGMENT_H
2
+ #define CPPJIEBA_MPSEGMENT_H
3
+
4
+ #include <algorithm>
5
+ #include <set>
6
+ #include <cassert>
7
+ #include "limonp/Logging.hpp"
8
+ #include "DictTrie.hpp"
9
+ #include "SegmentTagged.hpp"
10
+ #include "PosTagger.hpp"
11
+
12
+ namespace cppjieba {
13
+
14
+ class MPSegment: public SegmentTagged {
15
+ public:
16
+ MPSegment(const string& dictPath, const string& userDictPath = "")
17
+ : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
18
+ }
19
+ MPSegment(const DictTrie* dictTrie)
20
+ : dictTrie_(dictTrie), isNeedDestroy_(false) {
21
+ assert(dictTrie_);
22
+ }
23
+ ~MPSegment() {
24
+ if (isNeedDestroy_) {
25
+ delete dictTrie_;
26
+ }
27
+ }
28
+
29
+ void Cut(const string& sentence, vector<string>& words) const {
30
+ Cut(sentence, words, MAX_WORD_LENGTH);
31
+ }
32
+
33
+ void Cut(const string& sentence,
34
+ vector<string>& words,
35
+ size_t max_word_len) const {
36
+ vector<Word> tmp;
37
+ Cut(sentence, tmp, max_word_len);
38
+ GetStringsFromWords(tmp, words);
39
+ }
40
+ void Cut(const string& sentence,
41
+ vector<Word>& words,
42
+ size_t max_word_len = MAX_WORD_LENGTH) const {
43
+ PreFilter pre_filter(symbols_, sentence);
44
+ PreFilter::Range range;
45
+ vector<WordRange> wrs;
46
+ wrs.reserve(sentence.size()/2);
47
+ while (pre_filter.HasNext()) {
48
+ range = pre_filter.Next();
49
+ Cut(range.begin, range.end, wrs, max_word_len);
50
+ }
51
+ words.clear();
52
+ words.reserve(wrs.size());
53
+ GetWordsFromWordRanges(sentence, wrs, words);
54
+ }
55
+ void Cut(RuneStrArray::const_iterator begin,
56
+ RuneStrArray::const_iterator end,
57
+ vector<WordRange>& words,
58
+ size_t max_word_len = MAX_WORD_LENGTH) const {
59
+ vector<Dag> dags;
60
+ dictTrie_->Find(begin,
61
+ end,
62
+ dags,
63
+ max_word_len);
64
+ CalcDP(dags);
65
+ CutByDag(begin, end, dags, words);
66
+ }
67
+
68
+ const DictTrie* GetDictTrie() const {
69
+ return dictTrie_;
70
+ }
71
+
72
+ bool Tag(const string& src, vector<pair<string, string> >& res) const {
73
+ return tagger_.Tag(src, res, *this);
74
+ }
75
+
76
+ bool IsUserDictSingleChineseWord(const Rune& value) const {
77
+ return dictTrie_->IsUserDictSingleChineseWord(value);
78
+ }
79
+ private:
80
+ void CalcDP(vector<Dag>& dags) const {
81
+ size_t nextPos;
82
+ const DictUnit* p;
83
+ double val;
84
+
85
+ for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
86
+ rit->pInfo = NULL;
87
+ rit->weight = MIN_DOUBLE;
88
+ assert(!rit->nexts.empty());
89
+ for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
90
+ nextPos = it->first;
91
+ p = it->second;
92
+ val = 0.0;
93
+ if (nextPos + 1 < dags.size()) {
94
+ val += dags[nextPos + 1].weight;
95
+ }
96
+
97
+ if (p) {
98
+ val += p->weight;
99
+ } else {
100
+ val += dictTrie_->GetMinWeight();
101
+ }
102
+ if (val > rit->weight) {
103
+ rit->pInfo = p;
104
+ rit->weight = val;
105
+ }
106
+ }
107
+ }
108
+ }
109
+ void CutByDag(RuneStrArray::const_iterator begin,
110
+ RuneStrArray::const_iterator end,
111
+ const vector<Dag>& dags,
112
+ vector<WordRange>& words) const {
113
+ size_t i = 0;
114
+ while (i < dags.size()) {
115
+ const DictUnit* p = dags[i].pInfo;
116
+ if (p) {
117
+ assert(p->word.size() >= 1);
118
+ WordRange wr(begin + i, begin + i + p->word.size() - 1);
119
+ words.push_back(wr);
120
+ i += p->word.size();
121
+ } else { //single chinese word
122
+ WordRange wr(begin + i, begin + i);
123
+ words.push_back(wr);
124
+ i++;
125
+ }
126
+ }
127
+ }
128
+
129
+ const DictTrie* dictTrie_;
130
+ bool isNeedDestroy_;
131
+ PosTagger tagger_;
132
+
133
+ }; // class MPSegment
134
+
135
+ } // namespace cppjieba
136
+
137
+ #endif
@@ -0,0 +1,109 @@
1
+ #ifndef CPPJIEBA_MIXSEGMENT_H
2
+ #define CPPJIEBA_MIXSEGMENT_H
3
+
4
+ #include <cassert>
5
+ #include "MPSegment.hpp"
6
+ #include "HMMSegment.hpp"
7
+ #include "limonp/StringUtil.hpp"
8
+ #include "PosTagger.hpp"
9
+
10
+ namespace cppjieba {
11
+ class MixSegment: public SegmentTagged {
12
+ public:
13
+ MixSegment(const string& mpSegDict, const string& hmmSegDict,
14
+ const string& userDict = "")
15
+ : mpSeg_(mpSegDict, userDict),
16
+ hmmSeg_(hmmSegDict) {
17
+ }
18
+ MixSegment(const DictTrie* dictTrie, const HMMModel* model)
19
+ : mpSeg_(dictTrie), hmmSeg_(model) {
20
+ }
21
+ ~MixSegment() {
22
+ }
23
+
24
+ void Cut(const string& sentence, vector<string>& words) const {
25
+ Cut(sentence, words, true);
26
+ }
27
+ void Cut(const string& sentence, vector<string>& words, bool hmm) const {
28
+ vector<Word> tmp;
29
+ Cut(sentence, tmp, hmm);
30
+ GetStringsFromWords(tmp, words);
31
+ }
32
+ void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
33
+ PreFilter pre_filter(symbols_, sentence);
34
+ PreFilter::Range range;
35
+ vector<WordRange> wrs;
36
+ wrs.reserve(sentence.size() / 2);
37
+ while (pre_filter.HasNext()) {
38
+ range = pre_filter.Next();
39
+ Cut(range.begin, range.end, wrs, hmm);
40
+ }
41
+ words.clear();
42
+ words.reserve(wrs.size());
43
+ GetWordsFromWordRanges(sentence, wrs, words);
44
+ }
45
+
46
+ void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
47
+ if (!hmm) {
48
+ mpSeg_.Cut(begin, end, res);
49
+ return;
50
+ }
51
+ vector<WordRange> words;
52
+ assert(end >= begin);
53
+ words.reserve(end - begin);
54
+ mpSeg_.Cut(begin, end, words);
55
+
56
+ vector<WordRange> hmmRes;
57
+ hmmRes.reserve(end - begin);
58
+ for (size_t i = 0; i < words.size(); i++) {
59
+ //if mp Get a word, it's ok, put it into result
60
+ if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
61
+ res.push_back(words[i]);
62
+ continue;
63
+ }
64
+
65
+ // if mp Get a single one and it is not in userdict, collect it in sequence
66
+ size_t j = i;
67
+ while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
68
+ j++;
69
+ }
70
+
71
+ // Cut the sequence with hmm
72
+ assert(j - 1 >= i);
73
+ // TODO
74
+ hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
75
+ //put hmm result to result
76
+ for (size_t k = 0; k < hmmRes.size(); k++) {
77
+ res.push_back(hmmRes[k]);
78
+ }
79
+
80
+ //clear tmp vars
81
+ hmmRes.clear();
82
+
83
+ //let i jump over this piece
84
+ i = j - 1;
85
+ }
86
+ }
87
+
88
+ const DictTrie* GetDictTrie() const {
89
+ return mpSeg_.GetDictTrie();
90
+ }
91
+
92
+ bool Tag(const string& src, vector<pair<string, string> >& res) const {
93
+ return tagger_.Tag(src, res, *this);
94
+ }
95
+
96
+ string LookupTag(const string &str) const {
97
+ return tagger_.LookupTag(str, *this);
98
+ }
99
+
100
+ private:
101
+ MPSegment mpSeg_;
102
+ HMMSegment hmmSeg_;
103
+ PosTagger tagger_;
104
+
105
+ }; // class MixSegment
106
+
107
+ } // namespace cppjieba
108
+
109
+ #endif
@@ -0,0 +1,77 @@
1
+ #ifndef CPPJIEBA_POS_TAGGING_H
2
+ #define CPPJIEBA_POS_TAGGING_H
3
+
4
+ #include "limonp/StringUtil.hpp"
5
+ #include "SegmentTagged.hpp"
6
+ #include "DictTrie.hpp"
7
+
8
+ namespace cppjieba {
9
+ using namespace limonp;
10
+
11
+ static const char* const POS_M = "m";
12
+ static const char* const POS_ENG = "eng";
13
+ static const char* const POS_X = "x";
14
+
15
+ class PosTagger {
16
+ public:
17
+ PosTagger() {
18
+ }
19
+ ~PosTagger() {
20
+ }
21
+
22
+ bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
23
+ vector<string> CutRes;
24
+ segment.Cut(src, CutRes);
25
+
26
+ for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
27
+ res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
28
+ }
29
+ return !res.empty();
30
+ }
31
+
32
+ string LookupTag(const string &str, const SegmentTagged& segment) const {
33
+ const DictUnit *tmp = NULL;
34
+ RuneStrArray runes;
35
+ const DictTrie * dict = segment.GetDictTrie();
36
+ assert(dict != NULL);
37
+ if (!DecodeRunesInString(str, runes)) {
38
+ XLOG(ERROR) << "Decode failed.";
39
+ return POS_X;
40
+ }
41
+ tmp = dict->Find(runes.begin(), runes.end());
42
+ if (tmp == NULL || tmp->tag.empty()) {
43
+ return SpecialRule(runes);
44
+ } else {
45
+ return tmp->tag;
46
+ }
47
+ }
48
+
49
+ private:
50
+ const char* SpecialRule(const RuneStrArray& unicode) const {
51
+ size_t m = 0;
52
+ size_t eng = 0;
53
+ for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
54
+ if (unicode[i].rune < 0x80) {
55
+ eng ++;
56
+ if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
57
+ m++;
58
+ }
59
+ }
60
+ }
61
+ // ascii char is not found
62
+ if (eng == 0) {
63
+ return POS_X;
64
+ }
65
+ // all the ascii is number char
66
+ if (m == eng) {
67
+ return POS_M;
68
+ }
69
+ // the ascii chars contain english letter
70
+ return POS_ENG;
71
+ }
72
+
73
+ }; // class PosTagger
74
+
75
+ } // namespace cppjieba
76
+
77
+ #endif
@@ -0,0 +1,54 @@
1
+ #ifndef CPPJIEBA_PRE_FILTER_H
2
+ #define CPPJIEBA_PRE_FILTER_H
3
+
4
+ #include "Trie.hpp"
5
+ #include "limonp/Logging.hpp"
6
+
7
+ namespace cppjieba {
8
+
9
+ class PreFilter {
10
+ public:
11
+ //TODO use WordRange instead of Range
12
+ struct Range {
13
+ RuneStrArray::const_iterator begin;
14
+ RuneStrArray::const_iterator end;
15
+ }; // struct Range
16
+
17
+ PreFilter(const unordered_set<Rune>& symbols,
18
+ const string& sentence)
19
+ : symbols_(symbols) {
20
+ if (!DecodeRunesInString(sentence, sentence_)) {
21
+ XLOG(ERROR) << "decode failed. ";
22
+ }
23
+ cursor_ = sentence_.begin();
24
+ }
25
+ ~PreFilter() {
26
+ }
27
+ bool HasNext() const {
28
+ return cursor_ != sentence_.end();
29
+ }
30
+ Range Next() {
31
+ Range range;
32
+ range.begin = cursor_;
33
+ while (cursor_ != sentence_.end()) {
34
+ if (IsIn(symbols_, cursor_->rune)) {
35
+ if (range.begin == cursor_) {
36
+ cursor_ ++;
37
+ }
38
+ range.end = cursor_;
39
+ return range;
40
+ }
41
+ cursor_ ++;
42
+ }
43
+ range.end = sentence_.end();
44
+ return range;
45
+ }
46
+ private:
47
+ RuneStrArray::const_iterator cursor_;
48
+ RuneStrArray sentence_;
49
+ const unordered_set<Rune>& symbols_;
50
+ }; // class PreFilter
51
+
52
+ } // namespace cppjieba
53
+
54
+ #endif // CPPJIEBA_PRE_FILTER_H
@@ -0,0 +1,90 @@
1
+ #ifndef CPPJIEBA_QUERYSEGMENT_H
2
+ #define CPPJIEBA_QUERYSEGMENT_H
3
+
4
+ #include <algorithm>
5
+ #include <set>
6
+ #include <cassert>
7
+ #include "limonp/Logging.hpp"
8
+ #include "DictTrie.hpp"
9
+ #include "SegmentBase.hpp"
10
+ #include "FullSegment.hpp"
11
+ #include "MixSegment.hpp"
12
+ #include "Unicode.hpp"
13
+ #include "DictTrie.hpp"
14
+
15
+ namespace cppjieba {
16
+ class QuerySegment: public SegmentBase {
17
+ public:
18
+ QuerySegment(const string& dict, const string& model, const string& userDict = "")
19
+ : mixSeg_(dict, model, userDict),
20
+ trie_(mixSeg_.GetDictTrie()) {
21
+ }
22
+ QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
23
+ : mixSeg_(dictTrie, model), trie_(dictTrie) {
24
+ }
25
+ ~QuerySegment() {
26
+ }
27
+
28
+ void Cut(const string& sentence, vector<string>& words) const {
29
+ Cut(sentence, words, true);
30
+ }
31
+ void Cut(const string& sentence, vector<string>& words, bool hmm) const {
32
+ vector<Word> tmp;
33
+ Cut(sentence, tmp, hmm);
34
+ GetStringsFromWords(tmp, words);
35
+ }
36
+ void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
37
+ PreFilter pre_filter(symbols_, sentence);
38
+ PreFilter::Range range;
39
+ vector<WordRange> wrs;
40
+ wrs.reserve(sentence.size()/2);
41
+ while (pre_filter.HasNext()) {
42
+ range = pre_filter.Next();
43
+ Cut(range.begin, range.end, wrs, hmm);
44
+ }
45
+ words.clear();
46
+ words.reserve(wrs.size());
47
+ GetWordsFromWordRanges(sentence, wrs, words);
48
+ }
49
+ void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
50
+ //use mix Cut first
51
+ vector<WordRange> mixRes;
52
+ mixSeg_.Cut(begin, end, mixRes, hmm);
53
+
54
+ vector<WordRange> fullRes;
55
+ for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
56
+ if (mixResItr->Length() > 2) {
57
+ for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
58
+ WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
59
+ if (trie_->Find(wr.left, wr.right + 1) != NULL) {
60
+ res.push_back(wr);
61
+ }
62
+ }
63
+ }
64
+ if (mixResItr->Length() > 3) {
65
+ for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
66
+ WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
67
+ if (trie_->Find(wr.left, wr.right + 1) != NULL) {
68
+ res.push_back(wr);
69
+ }
70
+ }
71
+ }
72
+ res.push_back(*mixResItr);
73
+ }
74
+ }
75
+ private:
76
+ bool IsAllAscii(const Unicode& s) const {
77
+ for(size_t i = 0; i < s.size(); i++) {
78
+ if (s[i] >= 0x80) {
79
+ return false;
80
+ }
81
+ }
82
+ return true;
83
+ }
84
+ MixSegment mixSeg_;
85
+ const DictTrie* trie_;
86
+ }; // QuerySegment
87
+
88
+ } // namespace cppjieba
89
+
90
+ #endif