cppjieba_rb 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +26 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +81 -0
  8. data/Rakefile +20 -0
  9. data/cppjieba_rb.gemspec +50 -0
  10. data/ext/cppjieba/.gitignore +17 -0
  11. data/ext/cppjieba/.travis.yml +22 -0
  12. data/ext/cppjieba/CMakeLists.txt +28 -0
  13. data/ext/cppjieba/ChangeLog.md +236 -0
  14. data/ext/cppjieba/README.md +285 -0
  15. data/ext/cppjieba/README_EN.md +111 -0
  16. data/ext/cppjieba/appveyor.yml +32 -0
  17. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  18. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  42. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  45. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  46. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  56. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  57. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  58. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  59. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  60. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  61. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  62. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  63. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  64. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  65. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  66. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  67. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  68. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  69. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  70. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  71. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  72. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  73. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  74. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  75. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  76. data/ext/cppjieba/dict/README.md +31 -0
  77. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  78. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  79. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  80. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  83. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  84. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  85. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  86. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  87. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  88. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  89. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  90. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  91. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  92. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  93. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  94. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  95. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  96. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  98. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  99. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  100. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  101. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  102. data/ext/cppjieba/test/CMakeLists.txt +5 -0
  103. data/ext/cppjieba/test/demo.cpp +80 -0
  104. data/ext/cppjieba/test/load_test.cpp +54 -0
  105. data/ext/cppjieba/test/testdata/curl.res +1 -0
  106. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  107. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  108. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  109. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  110. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  111. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  112. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  113. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  114. data/ext/cppjieba/test/testdata/review.100 +100 -0
  115. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  116. data/ext/cppjieba/test/testdata/server.conf +19 -0
  117. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  118. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  119. data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  120. data/ext/cppjieba/test/testdata/userdict.english +2 -0
  121. data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
  122. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  123. data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
  124. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  125. data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
  126. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  127. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  128. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  129. data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
  130. data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
  131. data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
  132. data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
  133. data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
  134. data/ext/cppjieba_rb/extconf.rb +26 -0
  135. data/ext/cppjieba_rb/internal.cc +148 -0
  136. data/lib/cppjieba_rb/segment.rb +20 -0
  137. data/lib/cppjieba_rb/version.rb +3 -0
  138. data/lib/cppjieba_rb.rb +34 -0
  139. data/test/test_keyword.rb +17 -0
  140. data/test/test_segment.rb +24 -0
  141. data/test/test_tagging.rb +19 -0
  142. metadata +244 -0
@@ -0,0 +1,108 @@
1
+ #ifndef CPPJIEAB_JIEBA_H
2
+ #define CPPJIEAB_JIEBA_H
3
+
4
+ #include "QuerySegment.hpp"
5
+ #include "KeywordExtractor.hpp"
6
+
7
+ namespace cppjieba {
8
+
9
+ class Jieba {
10
+ public:
11
+ Jieba(const string& dict_path,
12
+ const string& model_path,
13
+ const string& user_dict_path,
14
+ const string& idfPath,
15
+ const string& stopWordPath)
16
+ : dict_trie_(dict_path, user_dict_path),
17
+ model_(model_path),
18
+ mp_seg_(&dict_trie_),
19
+ hmm_seg_(&model_),
20
+ mix_seg_(&dict_trie_, &model_),
21
+ full_seg_(&dict_trie_),
22
+ query_seg_(&dict_trie_, &model_),
23
+ extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
24
+ }
25
+ ~Jieba() {
26
+ }
27
+
28
+ struct LocWord {
29
+ string word;
30
+ size_t begin;
31
+ size_t end;
32
+ }; // struct LocWord
33
+
34
+ void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
35
+ mix_seg_.Cut(sentence, words, hmm);
36
+ }
37
+ void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
38
+ mix_seg_.Cut(sentence, words, hmm);
39
+ }
40
+ void CutAll(const string& sentence, vector<string>& words) const {
41
+ full_seg_.Cut(sentence, words);
42
+ }
43
+ void CutAll(const string& sentence, vector<Word>& words) const {
44
+ full_seg_.Cut(sentence, words);
45
+ }
46
+ void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
47
+ query_seg_.Cut(sentence, words, hmm);
48
+ }
49
+ void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
50
+ query_seg_.Cut(sentence, words, hmm);
51
+ }
52
+ void CutHMM(const string& sentence, vector<string>& words) const {
53
+ hmm_seg_.Cut(sentence, words);
54
+ }
55
+ void CutHMM(const string& sentence, vector<Word>& words) const {
56
+ hmm_seg_.Cut(sentence, words);
57
+ }
58
+ void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
59
+ mp_seg_.Cut(sentence, words, max_word_len);
60
+ }
61
+ void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
62
+ mp_seg_.Cut(sentence, words, max_word_len);
63
+ }
64
+
65
+ void Tag(const string& sentence, vector<pair<string, string> >& words) const {
66
+ mix_seg_.Tag(sentence, words);
67
+ }
68
+ string LookupTag(const string &str) const {
69
+ return mix_seg_.LookupTag(str);
70
+ }
71
+ bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
72
+ return dict_trie_.InsertUserWord(word, tag);
73
+ }
74
+
75
+ void ResetSeparators(const string& s) {
76
+ //TODO
77
+ mp_seg_.ResetSeparators(s);
78
+ hmm_seg_.ResetSeparators(s);
79
+ mix_seg_.ResetSeparators(s);
80
+ full_seg_.ResetSeparators(s);
81
+ query_seg_.ResetSeparators(s);
82
+ }
83
+
84
+ const DictTrie* GetDictTrie() const {
85
+ return &dict_trie_;
86
+ }
87
+ const HMMModel* GetHMMModel() const {
88
+ return &model_;
89
+ }
90
+
91
+ private:
92
+ DictTrie dict_trie_;
93
+ HMMModel model_;
94
+
95
+ // They share the same dict trie and model
96
+ MPSegment mp_seg_;
97
+ HMMSegment hmm_seg_;
98
+ MixSegment mix_seg_;
99
+ FullSegment full_seg_;
100
+ QuerySegment query_seg_;
101
+
102
+ public:
103
+ KeywordExtractor extractor;
104
+ }; // class Jieba
105
+
106
+ } // namespace cppjieba
107
+
108
+ #endif // CPPJIEAB_JIEBA_H
@@ -0,0 +1,153 @@
1
+ #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
2
+ #define CPPJIEBA_KEYWORD_EXTRACTOR_H
3
+
4
+ #include <cmath>
5
+ #include <set>
6
+ #include "MixSegment.hpp"
7
+
8
+ namespace cppjieba {
9
+
10
+ using namespace limonp;
11
+ using namespace std;
12
+
13
+ /*utf8*/
14
+ class KeywordExtractor {
15
+ public:
16
+ struct Word {
17
+ string word;
18
+ vector<size_t> offsets;
19
+ double weight;
20
+ }; // struct Word
21
+
22
+ KeywordExtractor(const string& dictPath,
23
+ const string& hmmFilePath,
24
+ const string& idfPath,
25
+ const string& stopWordPath,
26
+ const string& userDict = "")
27
+ : segment_(dictPath, hmmFilePath, userDict) {
28
+ LoadIdfDict(idfPath);
29
+ LoadStopWordDict(stopWordPath);
30
+ }
31
+ KeywordExtractor(const DictTrie* dictTrie,
32
+ const HMMModel* model,
33
+ const string& idfPath,
34
+ const string& stopWordPath)
35
+ : segment_(dictTrie, model) {
36
+ LoadIdfDict(idfPath);
37
+ LoadStopWordDict(stopWordPath);
38
+ }
39
+ ~KeywordExtractor() {
40
+ }
41
+
42
+ void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
43
+ vector<Word> topWords;
44
+ Extract(sentence, topWords, topN);
45
+ for (size_t i = 0; i < topWords.size(); i++) {
46
+ keywords.push_back(topWords[i].word);
47
+ }
48
+ }
49
+
50
+ void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
51
+ vector<Word> topWords;
52
+ Extract(sentence, topWords, topN);
53
+ for (size_t i = 0; i < topWords.size(); i++) {
54
+ keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
55
+ }
56
+ }
57
+
58
+ void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
59
+ vector<string> words;
60
+ segment_.Cut(sentence, words);
61
+
62
+ map<string, Word> wordmap;
63
+ size_t offset = 0;
64
+ for (size_t i = 0; i < words.size(); ++i) {
65
+ size_t t = offset;
66
+ offset += words[i].size();
67
+ if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
68
+ continue;
69
+ }
70
+ wordmap[words[i]].offsets.push_back(t);
71
+ wordmap[words[i]].weight += 1.0;
72
+ }
73
+ if (offset != sentence.size()) {
74
+ XLOG(ERROR) << "words illegal";
75
+ return;
76
+ }
77
+
78
+ keywords.clear();
79
+ keywords.reserve(wordmap.size());
80
+ for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
81
+ unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
82
+ if (cit != idfMap_.end()) {
83
+ itr->second.weight *= cit->second;
84
+ } else {
85
+ itr->second.weight *= idfAverage_;
86
+ }
87
+ itr->second.word = itr->first;
88
+ keywords.push_back(itr->second);
89
+ }
90
+ topN = min(topN, keywords.size());
91
+ partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
92
+ keywords.resize(topN);
93
+ }
94
+ private:
95
+ void LoadIdfDict(const string& idfPath) {
96
+ ifstream ifs(idfPath.c_str());
97
+ XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
98
+ string line ;
99
+ vector<string> buf;
100
+ double idf = 0.0;
101
+ double idfSum = 0.0;
102
+ size_t lineno = 0;
103
+ for (; getline(ifs, line); lineno++) {
104
+ buf.clear();
105
+ if (line.empty()) {
106
+ XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
107
+ continue;
108
+ }
109
+ Split(line, buf, " ");
110
+ if (buf.size() != 2) {
111
+ XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
112
+ continue;
113
+ }
114
+ idf = atof(buf[1].c_str());
115
+ idfMap_[buf[0]] = idf;
116
+ idfSum += idf;
117
+
118
+ }
119
+
120
+ assert(lineno);
121
+ idfAverage_ = idfSum / lineno;
122
+ assert(idfAverage_ > 0.0);
123
+ }
124
+ void LoadStopWordDict(const string& filePath) {
125
+ ifstream ifs(filePath.c_str());
126
+ XCHECK(ifs.is_open()) << "open " << filePath << " failed";
127
+ string line ;
128
+ while (getline(ifs, line)) {
129
+ stopWords_.insert(line);
130
+ }
131
+ assert(stopWords_.size());
132
+ }
133
+
134
+ static bool Compare(const Word& lhs, const Word& rhs) {
135
+ return lhs.weight > rhs.weight;
136
+ }
137
+
138
+ MixSegment segment_;
139
+ unordered_map<string, double> idfMap_;
140
+ double idfAverage_;
141
+
142
+ unordered_set<string> stopWords_;
143
+ }; // class KeywordExtractor
144
+
145
+ inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
146
+ return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
147
+ }
148
+
149
+ } // namespace cppjieba
150
+
151
+ #endif
152
+
153
+
@@ -0,0 +1,137 @@
1
+ #ifndef CPPJIEBA_MPSEGMENT_H
2
+ #define CPPJIEBA_MPSEGMENT_H
3
+
4
+ #include <algorithm>
5
+ #include <set>
6
+ #include <cassert>
7
+ #include "limonp/Logging.hpp"
8
+ #include "DictTrie.hpp"
9
+ #include "SegmentTagged.hpp"
10
+ #include "PosTagger.hpp"
11
+
12
+ namespace cppjieba {
13
+
14
+ class MPSegment: public SegmentTagged {
15
+ public:
16
+ MPSegment(const string& dictPath, const string& userDictPath = "")
17
+ : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
18
+ }
19
+ MPSegment(const DictTrie* dictTrie)
20
+ : dictTrie_(dictTrie), isNeedDestroy_(false) {
21
+ assert(dictTrie_);
22
+ }
23
+ ~MPSegment() {
24
+ if (isNeedDestroy_) {
25
+ delete dictTrie_;
26
+ }
27
+ }
28
+
29
+ void Cut(const string& sentence, vector<string>& words) const {
30
+ Cut(sentence, words, MAX_WORD_LENGTH);
31
+ }
32
+
33
+ void Cut(const string& sentence,
34
+ vector<string>& words,
35
+ size_t max_word_len) const {
36
+ vector<Word> tmp;
37
+ Cut(sentence, tmp, max_word_len);
38
+ GetStringsFromWords(tmp, words);
39
+ }
40
+ void Cut(const string& sentence,
41
+ vector<Word>& words,
42
+ size_t max_word_len = MAX_WORD_LENGTH) const {
43
+ PreFilter pre_filter(symbols_, sentence);
44
+ PreFilter::Range range;
45
+ vector<WordRange> wrs;
46
+ wrs.reserve(sentence.size()/2);
47
+ while (pre_filter.HasNext()) {
48
+ range = pre_filter.Next();
49
+ Cut(range.begin, range.end, wrs, max_word_len);
50
+ }
51
+ words.clear();
52
+ words.reserve(wrs.size());
53
+ GetWordsFromWordRanges(sentence, wrs, words);
54
+ }
55
+ void Cut(RuneStrArray::const_iterator begin,
56
+ RuneStrArray::const_iterator end,
57
+ vector<WordRange>& words,
58
+ size_t max_word_len = MAX_WORD_LENGTH) const {
59
+ vector<Dag> dags;
60
+ dictTrie_->Find(begin,
61
+ end,
62
+ dags,
63
+ max_word_len);
64
+ CalcDP(dags);
65
+ CutByDag(begin, end, dags, words);
66
+ }
67
+
68
+ const DictTrie* GetDictTrie() const {
69
+ return dictTrie_;
70
+ }
71
+
72
+ bool Tag(const string& src, vector<pair<string, string> >& res) const {
73
+ return tagger_.Tag(src, res, *this);
74
+ }
75
+
76
+ bool IsUserDictSingleChineseWord(const Rune& value) const {
77
+ return dictTrie_->IsUserDictSingleChineseWord(value);
78
+ }
79
+ private:
80
+ void CalcDP(vector<Dag>& dags) const {
81
+ size_t nextPos;
82
+ const DictUnit* p;
83
+ double val;
84
+
85
+ for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
86
+ rit->pInfo = NULL;
87
+ rit->weight = MIN_DOUBLE;
88
+ assert(!rit->nexts.empty());
89
+ for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
90
+ nextPos = it->first;
91
+ p = it->second;
92
+ val = 0.0;
93
+ if (nextPos + 1 < dags.size()) {
94
+ val += dags[nextPos + 1].weight;
95
+ }
96
+
97
+ if (p) {
98
+ val += p->weight;
99
+ } else {
100
+ val += dictTrie_->GetMinWeight();
101
+ }
102
+ if (val > rit->weight) {
103
+ rit->pInfo = p;
104
+ rit->weight = val;
105
+ }
106
+ }
107
+ }
108
+ }
109
+ void CutByDag(RuneStrArray::const_iterator begin,
110
+ RuneStrArray::const_iterator end,
111
+ const vector<Dag>& dags,
112
+ vector<WordRange>& words) const {
113
+ size_t i = 0;
114
+ while (i < dags.size()) {
115
+ const DictUnit* p = dags[i].pInfo;
116
+ if (p) {
117
+ assert(p->word.size() >= 1);
118
+ WordRange wr(begin + i, begin + i + p->word.size() - 1);
119
+ words.push_back(wr);
120
+ i += p->word.size();
121
+ } else { //single chinese word
122
+ WordRange wr(begin + i, begin + i);
123
+ words.push_back(wr);
124
+ i++;
125
+ }
126
+ }
127
+ }
128
+
129
+ const DictTrie* dictTrie_;
130
+ bool isNeedDestroy_;
131
+ PosTagger tagger_;
132
+
133
+ }; // class MPSegment
134
+
135
+ } // namespace cppjieba
136
+
137
+ #endif
@@ -0,0 +1,109 @@
1
+ #ifndef CPPJIEBA_MIXSEGMENT_H
2
+ #define CPPJIEBA_MIXSEGMENT_H
3
+
4
+ #include <cassert>
5
+ #include "MPSegment.hpp"
6
+ #include "HMMSegment.hpp"
7
+ #include "limonp/StringUtil.hpp"
8
+ #include "PosTagger.hpp"
9
+
10
+ namespace cppjieba {
11
+ class MixSegment: public SegmentTagged {
12
+ public:
13
+ MixSegment(const string& mpSegDict, const string& hmmSegDict,
14
+ const string& userDict = "")
15
+ : mpSeg_(mpSegDict, userDict),
16
+ hmmSeg_(hmmSegDict) {
17
+ }
18
+ MixSegment(const DictTrie* dictTrie, const HMMModel* model)
19
+ : mpSeg_(dictTrie), hmmSeg_(model) {
20
+ }
21
+ ~MixSegment() {
22
+ }
23
+
24
+ void Cut(const string& sentence, vector<string>& words) const {
25
+ Cut(sentence, words, true);
26
+ }
27
+ void Cut(const string& sentence, vector<string>& words, bool hmm) const {
28
+ vector<Word> tmp;
29
+ Cut(sentence, tmp, hmm);
30
+ GetStringsFromWords(tmp, words);
31
+ }
32
+ void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
33
+ PreFilter pre_filter(symbols_, sentence);
34
+ PreFilter::Range range;
35
+ vector<WordRange> wrs;
36
+ wrs.reserve(sentence.size() / 2);
37
+ while (pre_filter.HasNext()) {
38
+ range = pre_filter.Next();
39
+ Cut(range.begin, range.end, wrs, hmm);
40
+ }
41
+ words.clear();
42
+ words.reserve(wrs.size());
43
+ GetWordsFromWordRanges(sentence, wrs, words);
44
+ }
45
+
46
+ void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
47
+ if (!hmm) {
48
+ mpSeg_.Cut(begin, end, res);
49
+ return;
50
+ }
51
+ vector<WordRange> words;
52
+ assert(end >= begin);
53
+ words.reserve(end - begin);
54
+ mpSeg_.Cut(begin, end, words);
55
+
56
+ vector<WordRange> hmmRes;
57
+ hmmRes.reserve(end - begin);
58
+ for (size_t i = 0; i < words.size(); i++) {
59
+ //if mp Get a word, it's ok, put it into result
60
+ if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
61
+ res.push_back(words[i]);
62
+ continue;
63
+ }
64
+
65
+ // if mp Get a single one and it is not in userdict, collect it in sequence
66
+ size_t j = i;
67
+ while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
68
+ j++;
69
+ }
70
+
71
+ // Cut the sequence with hmm
72
+ assert(j - 1 >= i);
73
+ // TODO
74
+ hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
75
+ //put hmm result to result
76
+ for (size_t k = 0; k < hmmRes.size(); k++) {
77
+ res.push_back(hmmRes[k]);
78
+ }
79
+
80
+ //clear tmp vars
81
+ hmmRes.clear();
82
+
83
+ //let i jump over this piece
84
+ i = j - 1;
85
+ }
86
+ }
87
+
88
+ const DictTrie* GetDictTrie() const {
89
+ return mpSeg_.GetDictTrie();
90
+ }
91
+
92
+ bool Tag(const string& src, vector<pair<string, string> >& res) const {
93
+ return tagger_.Tag(src, res, *this);
94
+ }
95
+
96
+ string LookupTag(const string &str) const {
97
+ return tagger_.LookupTag(str, *this);
98
+ }
99
+
100
+ private:
101
+ MPSegment mpSeg_;
102
+ HMMSegment hmmSeg_;
103
+ PosTagger tagger_;
104
+
105
+ }; // class MixSegment
106
+
107
+ } // namespace cppjieba
108
+
109
+ #endif
@@ -0,0 +1,77 @@
1
+ #ifndef CPPJIEBA_POS_TAGGING_H
2
+ #define CPPJIEBA_POS_TAGGING_H
3
+
4
+ #include "limonp/StringUtil.hpp"
5
+ #include "SegmentTagged.hpp"
6
+ #include "DictTrie.hpp"
7
+
8
+ namespace cppjieba {
9
+ using namespace limonp;
10
+
11
// Fallback part-of-speech tags returned by PosTagger when the dictionary
// gives no tag for a word.
// "m": every ASCII rune counted by SpecialRule was a digit.
+ static const char* const POS_M = "m";
12
// "eng": SpecialRule counted at least one ASCII rune that was not a digit.
+ static const char* const POS_ENG = "eng";
13
// "x": SpecialRule counted no ASCII rune, or the word could not be decoded.
+ static const char* const POS_X = "x";
14
+
15
+ class PosTagger {
16
+ public:
17
+ PosTagger() {
18
+ }
19
+ ~PosTagger() {
20
+ }
21
+
22
+ bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
23
+ vector<string> CutRes;
24
+ segment.Cut(src, CutRes);
25
+
26
+ for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
27
+ res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
28
+ }
29
+ return !res.empty();
30
+ }
31
+
32
+ string LookupTag(const string &str, const SegmentTagged& segment) const {
33
+ const DictUnit *tmp = NULL;
34
+ RuneStrArray runes;
35
+ const DictTrie * dict = segment.GetDictTrie();
36
+ assert(dict != NULL);
37
+ if (!DecodeRunesInString(str, runes)) {
38
+ XLOG(ERROR) << "Decode failed.";
39
+ return POS_X;
40
+ }
41
+ tmp = dict->Find(runes.begin(), runes.end());
42
+ if (tmp == NULL || tmp->tag.empty()) {
43
+ return SpecialRule(runes);
44
+ } else {
45
+ return tmp->tag;
46
+ }
47
+ }
48
+
49
+ private:
50
+ const char* SpecialRule(const RuneStrArray& unicode) const {
51
+ size_t m = 0;
52
+ size_t eng = 0;
53
+ for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
54
+ if (unicode[i].rune < 0x80) {
55
+ eng ++;
56
+ if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
57
+ m++;
58
+ }
59
+ }
60
+ }
61
+ // ascii char is not found
62
+ if (eng == 0) {
63
+ return POS_X;
64
+ }
65
+ // all the ascii is number char
66
+ if (m == eng) {
67
+ return POS_M;
68
+ }
69
+ // the ascii chars contain english letter
70
+ return POS_ENG;
71
+ }
72
+
73
+ }; // class PosTagger
74
+
75
+ } // namespace cppjieba
76
+
77
+ #endif
@@ -0,0 +1,54 @@
1
+ #ifndef CPPJIEBA_PRE_FILTER_H
2
+ #define CPPJIEBA_PRE_FILTER_H
3
+
4
+ #include "Trie.hpp"
5
+ #include "limonp/Logging.hpp"
6
+
7
+ namespace cppjieba {
8
+
9
+ class PreFilter {
10
+ public:
11
+ //TODO use WordRange instead of Range
12
+ struct Range {
13
+ RuneStrArray::const_iterator begin;
14
+ RuneStrArray::const_iterator end;
15
+ }; // struct Range
16
+
17
+ PreFilter(const unordered_set<Rune>& symbols,
18
+ const string& sentence)
19
+ : symbols_(symbols) {
20
+ if (!DecodeRunesInString(sentence, sentence_)) {
21
+ XLOG(ERROR) << "decode failed. ";
22
+ }
23
+ cursor_ = sentence_.begin();
24
+ }
25
+ ~PreFilter() {
26
+ }
27
+ bool HasNext() const {
28
+ return cursor_ != sentence_.end();
29
+ }
30
+ Range Next() {
31
+ Range range;
32
+ range.begin = cursor_;
33
+ while (cursor_ != sentence_.end()) {
34
+ if (IsIn(symbols_, cursor_->rune)) {
35
+ if (range.begin == cursor_) {
36
+ cursor_ ++;
37
+ }
38
+ range.end = cursor_;
39
+ return range;
40
+ }
41
+ cursor_ ++;
42
+ }
43
+ range.end = sentence_.end();
44
+ return range;
45
+ }
46
+ private:
47
+ RuneStrArray::const_iterator cursor_;
48
+ RuneStrArray sentence_;
49
+ const unordered_set<Rune>& symbols_;
50
+ }; // class PreFilter
51
+
52
+ } // namespace cppjieba
53
+
54
+ #endif // CPPJIEBA_PRE_FILTER_H