cppjieba_rb 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (130) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +3 -0
  3. data/README.md +1 -1
  4. data/Rakefile +2 -2
  5. data/cppjieba_rb.gemspec +4 -4
  6. data/lib/cppjieba_rb/version.rb +1 -1
  7. metadata +17 -135
  8. data/ext/cppjieba/.gitignore +0 -17
  9. data/ext/cppjieba/.travis.yml +0 -21
  10. data/ext/cppjieba/CMakeLists.txt +0 -28
  11. data/ext/cppjieba/ChangeLog.md +0 -236
  12. data/ext/cppjieba/README.md +0 -292
  13. data/ext/cppjieba/README_EN.md +0 -113
  14. data/ext/cppjieba/appveyor.yml +0 -32
  15. data/ext/cppjieba/deps/CMakeLists.txt +0 -1
  16. data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
  17. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
  28. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
  41. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
  44. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  45. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
  46. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
  47. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
  48. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
  49. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
  50. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
  51. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
  52. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
  53. data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
  54. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
  55. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
  56. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
  57. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
  58. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
  59. data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
  60. data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
  61. data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
  62. data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
  63. data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
  64. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
  65. data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
  66. data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
  67. data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
  68. data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
  69. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
  70. data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
  71. data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
  72. data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
  73. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
  74. data/ext/cppjieba/dict/README.md +0 -31
  75. data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
  76. data/ext/cppjieba/dict/idf.utf8 +0 -258826
  77. data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
  78. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
  79. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
  80. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
  81. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
  82. data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
  83. data/ext/cppjieba/dict/user.dict.utf8 +0 -4
  84. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
  85. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
  86. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
  87. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
  88. data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
  89. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
  90. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
  91. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
  92. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
  93. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
  94. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
  95. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
  96. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
  97. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
  98. data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
  99. data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
  100. data/ext/cppjieba/test/CMakeLists.txt +0 -5
  101. data/ext/cppjieba/test/demo.cpp +0 -80
  102. data/ext/cppjieba/test/load_test.cpp +0 -54
  103. data/ext/cppjieba/test/testdata/curl.res +0 -1
  104. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
  105. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
  106. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
  107. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
  108. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
  109. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
  110. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
  111. data/ext/cppjieba/test/testdata/load_test.urls +0 -2
  112. data/ext/cppjieba/test/testdata/review.100 +0 -100
  113. data/ext/cppjieba/test/testdata/review.100.res +0 -200
  114. data/ext/cppjieba/test/testdata/server.conf +0 -19
  115. data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
  116. data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
  117. data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
  118. data/ext/cppjieba/test/testdata/userdict.english +0 -2
  119. data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
  120. data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
  121. data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
  122. data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
  123. data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
  124. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
  125. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
  126. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
  127. data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
  128. data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
  129. data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
  130. data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
@@ -1,137 +0,0 @@
1
- #ifndef CPPJIEBA_MPSEGMENT_H
2
- #define CPPJIEBA_MPSEGMENT_H
3
-
4
- #include <algorithm>
5
- #include <set>
6
- #include <cassert>
7
- #include "limonp/Logging.hpp"
8
- #include "DictTrie.hpp"
9
- #include "SegmentTagged.hpp"
10
- #include "PosTagger.hpp"
11
-
12
- namespace cppjieba {
13
-
14
- class MPSegment: public SegmentTagged {
15
- public:
16
- MPSegment(const string& dictPath, const string& userDictPath = "")
17
- : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
18
- }
19
- MPSegment(const DictTrie* dictTrie)
20
- : dictTrie_(dictTrie), isNeedDestroy_(false) {
21
- assert(dictTrie_);
22
- }
23
- ~MPSegment() {
24
- if (isNeedDestroy_) {
25
- delete dictTrie_;
26
- }
27
- }
28
-
29
- void Cut(const string& sentence, vector<string>& words) const {
30
- Cut(sentence, words, MAX_WORD_LENGTH);
31
- }
32
-
33
- void Cut(const string& sentence,
34
- vector<string>& words,
35
- size_t max_word_len) const {
36
- vector<Word> tmp;
37
- Cut(sentence, tmp, max_word_len);
38
- GetStringsFromWords(tmp, words);
39
- }
40
- void Cut(const string& sentence,
41
- vector<Word>& words,
42
- size_t max_word_len = MAX_WORD_LENGTH) const {
43
- PreFilter pre_filter(symbols_, sentence);
44
- PreFilter::Range range;
45
- vector<WordRange> wrs;
46
- wrs.reserve(sentence.size()/2);
47
- while (pre_filter.HasNext()) {
48
- range = pre_filter.Next();
49
- Cut(range.begin, range.end, wrs, max_word_len);
50
- }
51
- words.clear();
52
- words.reserve(wrs.size());
53
- GetWordsFromWordRanges(sentence, wrs, words);
54
- }
55
- void Cut(RuneStrArray::const_iterator begin,
56
- RuneStrArray::const_iterator end,
57
- vector<WordRange>& words,
58
- size_t max_word_len = MAX_WORD_LENGTH) const {
59
- vector<Dag> dags;
60
- dictTrie_->Find(begin,
61
- end,
62
- dags,
63
- max_word_len);
64
- CalcDP(dags);
65
- CutByDag(begin, end, dags, words);
66
- }
67
-
68
- const DictTrie* GetDictTrie() const {
69
- return dictTrie_;
70
- }
71
-
72
- bool Tag(const string& src, vector<pair<string, string> >& res) const {
73
- return tagger_.Tag(src, res, *this);
74
- }
75
-
76
- bool IsUserDictSingleChineseWord(const Rune& value) const {
77
- return dictTrie_->IsUserDictSingleChineseWord(value);
78
- }
79
- private:
80
- void CalcDP(vector<Dag>& dags) const {
81
- size_t nextPos;
82
- const DictUnit* p;
83
- double val;
84
-
85
- for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
86
- rit->pInfo = NULL;
87
- rit->weight = MIN_DOUBLE;
88
- assert(!rit->nexts.empty());
89
- for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
90
- nextPos = it->first;
91
- p = it->second;
92
- val = 0.0;
93
- if (nextPos + 1 < dags.size()) {
94
- val += dags[nextPos + 1].weight;
95
- }
96
-
97
- if (p) {
98
- val += p->weight;
99
- } else {
100
- val += dictTrie_->GetMinWeight();
101
- }
102
- if (val > rit->weight) {
103
- rit->pInfo = p;
104
- rit->weight = val;
105
- }
106
- }
107
- }
108
- }
109
- void CutByDag(RuneStrArray::const_iterator begin,
110
- RuneStrArray::const_iterator end,
111
- const vector<Dag>& dags,
112
- vector<WordRange>& words) const {
113
- size_t i = 0;
114
- while (i < dags.size()) {
115
- const DictUnit* p = dags[i].pInfo;
116
- if (p) {
117
- assert(p->word.size() >= 1);
118
- WordRange wr(begin + i, begin + i + p->word.size() - 1);
119
- words.push_back(wr);
120
- i += p->word.size();
121
- } else { //single chinese word
122
- WordRange wr(begin + i, begin + i);
123
- words.push_back(wr);
124
- i++;
125
- }
126
- }
127
- }
128
-
129
- const DictTrie* dictTrie_;
130
- bool isNeedDestroy_;
131
- PosTagger tagger_;
132
-
133
- }; // class MPSegment
134
-
135
- } // namespace cppjieba
136
-
137
- #endif
@@ -1,109 +0,0 @@
1
- #ifndef CPPJIEBA_MIXSEGMENT_H
2
- #define CPPJIEBA_MIXSEGMENT_H
3
-
4
- #include <cassert>
5
- #include "MPSegment.hpp"
6
- #include "HMMSegment.hpp"
7
- #include "limonp/StringUtil.hpp"
8
- #include "PosTagger.hpp"
9
-
10
- namespace cppjieba {
11
- class MixSegment: public SegmentTagged {
12
- public:
13
- MixSegment(const string& mpSegDict, const string& hmmSegDict,
14
- const string& userDict = "")
15
- : mpSeg_(mpSegDict, userDict),
16
- hmmSeg_(hmmSegDict) {
17
- }
18
- MixSegment(const DictTrie* dictTrie, const HMMModel* model)
19
- : mpSeg_(dictTrie), hmmSeg_(model) {
20
- }
21
- ~MixSegment() {
22
- }
23
-
24
- void Cut(const string& sentence, vector<string>& words) const {
25
- Cut(sentence, words, true);
26
- }
27
- void Cut(const string& sentence, vector<string>& words, bool hmm) const {
28
- vector<Word> tmp;
29
- Cut(sentence, tmp, hmm);
30
- GetStringsFromWords(tmp, words);
31
- }
32
- void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
33
- PreFilter pre_filter(symbols_, sentence);
34
- PreFilter::Range range;
35
- vector<WordRange> wrs;
36
- wrs.reserve(sentence.size() / 2);
37
- while (pre_filter.HasNext()) {
38
- range = pre_filter.Next();
39
- Cut(range.begin, range.end, wrs, hmm);
40
- }
41
- words.clear();
42
- words.reserve(wrs.size());
43
- GetWordsFromWordRanges(sentence, wrs, words);
44
- }
45
-
46
- void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
47
- if (!hmm) {
48
- mpSeg_.Cut(begin, end, res);
49
- return;
50
- }
51
- vector<WordRange> words;
52
- assert(end >= begin);
53
- words.reserve(end - begin);
54
- mpSeg_.Cut(begin, end, words);
55
-
56
- vector<WordRange> hmmRes;
57
- hmmRes.reserve(end - begin);
58
- for (size_t i = 0; i < words.size(); i++) {
59
- //if mp Get a word, it's ok, put it into result
60
- if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
61
- res.push_back(words[i]);
62
- continue;
63
- }
64
-
65
- // if mp Get a single one and it is not in userdict, collect it in sequence
66
- size_t j = i;
67
- while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
68
- j++;
69
- }
70
-
71
- // Cut the sequence with hmm
72
- assert(j - 1 >= i);
73
- // TODO
74
- hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
75
- //put hmm result to result
76
- for (size_t k = 0; k < hmmRes.size(); k++) {
77
- res.push_back(hmmRes[k]);
78
- }
79
-
80
- //clear tmp vars
81
- hmmRes.clear();
82
-
83
- //let i jump over this piece
84
- i = j - 1;
85
- }
86
- }
87
-
88
- const DictTrie* GetDictTrie() const {
89
- return mpSeg_.GetDictTrie();
90
- }
91
-
92
- bool Tag(const string& src, vector<pair<string, string> >& res) const {
93
- return tagger_.Tag(src, res, *this);
94
- }
95
-
96
- string LookupTag(const string &str) const {
97
- return tagger_.LookupTag(str, *this);
98
- }
99
-
100
- private:
101
- MPSegment mpSeg_;
102
- HMMSegment hmmSeg_;
103
- PosTagger tagger_;
104
-
105
- }; // class MixSegment
106
-
107
- } // namespace cppjieba
108
-
109
- #endif
@@ -1,77 +0,0 @@
1
- #ifndef CPPJIEBA_POS_TAGGING_H
2
- #define CPPJIEBA_POS_TAGGING_H
3
-
4
- #include "limonp/StringUtil.hpp"
5
- #include "SegmentTagged.hpp"
6
- #include "DictTrie.hpp"
7
-
8
- namespace cppjieba {
9
- using namespace limonp;
10
-
11
- static const char* const POS_M = "m";
12
- static const char* const POS_ENG = "eng";
13
- static const char* const POS_X = "x";
14
-
15
- class PosTagger {
16
- public:
17
- PosTagger() {
18
- }
19
- ~PosTagger() {
20
- }
21
-
22
- bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
23
- vector<string> CutRes;
24
- segment.Cut(src, CutRes);
25
-
26
- for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
27
- res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
28
- }
29
- return !res.empty();
30
- }
31
-
32
- string LookupTag(const string &str, const SegmentTagged& segment) const {
33
- const DictUnit *tmp = NULL;
34
- RuneStrArray runes;
35
- const DictTrie * dict = segment.GetDictTrie();
36
- assert(dict != NULL);
37
- if (!DecodeRunesInString(str, runes)) {
38
- XLOG(ERROR) << "Decode failed.";
39
- return POS_X;
40
- }
41
- tmp = dict->Find(runes.begin(), runes.end());
42
- if (tmp == NULL || tmp->tag.empty()) {
43
- return SpecialRule(runes);
44
- } else {
45
- return tmp->tag;
46
- }
47
- }
48
-
49
- private:
50
- const char* SpecialRule(const RuneStrArray& unicode) const {
51
- size_t m = 0;
52
- size_t eng = 0;
53
- for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
54
- if (unicode[i].rune < 0x80) {
55
- eng ++;
56
- if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
57
- m++;
58
- }
59
- }
60
- }
61
- // ascii char is not found
62
- if (eng == 0) {
63
- return POS_X;
64
- }
65
- // all the ascii is number char
66
- if (m == eng) {
67
- return POS_M;
68
- }
69
- // the ascii chars contain english letter
70
- return POS_ENG;
71
- }
72
-
73
- }; // class PosTagger
74
-
75
- } // namespace cppjieba
76
-
77
- #endif
@@ -1,54 +0,0 @@
1
- #ifndef CPPJIEBA_PRE_FILTER_H
2
- #define CPPJIEBA_PRE_FILTER_H
3
-
4
- #include "Trie.hpp"
5
- #include "limonp/Logging.hpp"
6
-
7
- namespace cppjieba {
8
-
9
- class PreFilter {
10
- public:
11
- //TODO use WordRange instead of Range
12
- struct Range {
13
- RuneStrArray::const_iterator begin;
14
- RuneStrArray::const_iterator end;
15
- }; // struct Range
16
-
17
- PreFilter(const unordered_set<Rune>& symbols,
18
- const string& sentence)
19
- : symbols_(symbols) {
20
- if (!DecodeRunesInString(sentence, sentence_)) {
21
- XLOG(ERROR) << "decode failed. ";
22
- }
23
- cursor_ = sentence_.begin();
24
- }
25
- ~PreFilter() {
26
- }
27
- bool HasNext() const {
28
- return cursor_ != sentence_.end();
29
- }
30
- Range Next() {
31
- Range range;
32
- range.begin = cursor_;
33
- while (cursor_ != sentence_.end()) {
34
- if (IsIn(symbols_, cursor_->rune)) {
35
- if (range.begin == cursor_) {
36
- cursor_ ++;
37
- }
38
- range.end = cursor_;
39
- return range;
40
- }
41
- cursor_ ++;
42
- }
43
- range.end = sentence_.end();
44
- return range;
45
- }
46
- private:
47
- RuneStrArray::const_iterator cursor_;
48
- RuneStrArray sentence_;
49
- const unordered_set<Rune>& symbols_;
50
- }; // class PreFilter
51
-
52
- } // namespace cppjieba
53
-
54
- #endif // CPPJIEBA_PRE_FILTER_H
@@ -1,90 +0,0 @@
1
- #ifndef CPPJIEBA_QUERYSEGMENT_H
2
- #define CPPJIEBA_QUERYSEGMENT_H
3
-
4
- #include <algorithm>
5
- #include <set>
6
- #include <cassert>
7
- #include "limonp/Logging.hpp"
8
- #include "DictTrie.hpp"
9
- #include "SegmentBase.hpp"
10
- #include "FullSegment.hpp"
11
- #include "MixSegment.hpp"
12
- #include "Unicode.hpp"
13
- #include "DictTrie.hpp"
14
-
15
- namespace cppjieba {
16
- class QuerySegment: public SegmentBase {
17
- public:
18
- QuerySegment(const string& dict, const string& model, const string& userDict = "")
19
- : mixSeg_(dict, model, userDict),
20
- trie_(mixSeg_.GetDictTrie()) {
21
- }
22
- QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
23
- : mixSeg_(dictTrie, model), trie_(dictTrie) {
24
- }
25
- ~QuerySegment() {
26
- }
27
-
28
- void Cut(const string& sentence, vector<string>& words) const {
29
- Cut(sentence, words, true);
30
- }
31
- void Cut(const string& sentence, vector<string>& words, bool hmm) const {
32
- vector<Word> tmp;
33
- Cut(sentence, tmp, hmm);
34
- GetStringsFromWords(tmp, words);
35
- }
36
- void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
37
- PreFilter pre_filter(symbols_, sentence);
38
- PreFilter::Range range;
39
- vector<WordRange> wrs;
40
- wrs.reserve(sentence.size()/2);
41
- while (pre_filter.HasNext()) {
42
- range = pre_filter.Next();
43
- Cut(range.begin, range.end, wrs, hmm);
44
- }
45
- words.clear();
46
- words.reserve(wrs.size());
47
- GetWordsFromWordRanges(sentence, wrs, words);
48
- }
49
- void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
50
- //use mix Cut first
51
- vector<WordRange> mixRes;
52
- mixSeg_.Cut(begin, end, mixRes, hmm);
53
-
54
- vector<WordRange> fullRes;
55
- for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
56
- if (mixResItr->Length() > 2) {
57
- for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
58
- WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
59
- if (trie_->Find(wr.left, wr.right + 1) != NULL) {
60
- res.push_back(wr);
61
- }
62
- }
63
- }
64
- if (mixResItr->Length() > 3) {
65
- for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
66
- WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
67
- if (trie_->Find(wr.left, wr.right + 1) != NULL) {
68
- res.push_back(wr);
69
- }
70
- }
71
- }
72
- res.push_back(*mixResItr);
73
- }
74
- }
75
- private:
76
- bool IsAllAscii(const Unicode& s) const {
77
- for(size_t i = 0; i < s.size(); i++) {
78
- if (s[i] >= 0x80) {
79
- return false;
80
- }
81
- }
82
- return true;
83
- }
84
- MixSegment mixSeg_;
85
- const DictTrie* trie_;
86
- }; // QuerySegment
87
-
88
- } // namespace cppjieba
89
-
90
- #endif