jieba-rb 5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +19 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +85 -0
  8. data/Rakefile +15 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +28 -0
  12. data/ext/cppjieba/ChangeLog.md +236 -0
  13. data/ext/cppjieba/README.md +285 -0
  14. data/ext/cppjieba/README_EN.md +111 -0
  15. data/ext/cppjieba/appveyor.yml +32 -0
  16. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  45. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  46. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  56. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  57. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  58. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  59. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  60. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  61. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  62. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  63. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  64. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  65. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  66. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  67. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  68. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  69. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  70. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  71. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  72. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  73. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  74. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  75. data/ext/cppjieba/dict/README.md +31 -0
  76. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  77. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  78. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  79. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  80. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  83. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  84. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  85. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  86. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  87. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  88. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  89. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  90. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  91. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  92. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  93. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  94. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  95. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  96. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
  98. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  99. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  100. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  101. data/ext/jieba/extconf.rb +28 -0
  102. data/ext/jieba/jieba.c +11 -0
  103. data/ext/jieba/jieba.h +11 -0
  104. data/ext/jieba/keyword.cc +92 -0
  105. data/ext/jieba/keyword.h +17 -0
  106. data/ext/jieba/segment.cc +107 -0
  107. data/ext/jieba/segment.h +17 -0
  108. data/ext/jieba/tagging.cc +76 -0
  109. data/ext/jieba/tagging.h +17 -0
  110. data/jieba_rb.gemspec +51 -0
  111. data/lib/jieba-rb.rb +66 -0
  112. data/lib/jieba_rb/version.rb +3 -0
  113. data/test/test_keyword.rb +17 -0
  114. data/test/test_segment.rb +32 -0
  115. data/test/test_tagging.rb +22 -0
  116. data/test/user.dict.utf8 +23 -0
  117. metadata +219 -0
@@ -0,0 +1,46 @@
1
+ #ifndef CPPJIEBA_SEGMENTBASE_H
2
+ #define CPPJIEBA_SEGMENTBASE_H
3
+
4
+ #include "limonp/Logging.hpp"
5
+ #include "PreFilter.hpp"
6
+ #include <cassert>
7
+
8
+
9
+ namespace cppjieba {
10
+
11
+ const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
12
+
13
+ using namespace limonp;
14
+
15
+ class SegmentBase {
16
+ public:
17
+ SegmentBase() {
18
+ XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
19
+ }
20
+ virtual ~SegmentBase() {
21
+ }
22
+
23
+ virtual void Cut(const string& sentence, vector<string>& words) const = 0;
24
+
25
+ bool ResetSeparators(const string& s) {
26
+ symbols_.clear();
27
+ RuneStrArray runes;
28
+ if (!DecodeRunesInString(s, runes)) {
29
+ XLOG(ERROR) << "decode " << s << " failed";
30
+ return false;
31
+ }
32
+ for (size_t i = 0; i < runes.size(); i++) {
33
+ if (!symbols_.insert(runes[i].rune).second) {
34
+ XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
35
+ return false;
36
+ }
37
+ }
38
+ return true;
39
+ }
40
+ protected:
41
+ unordered_set<Rune> symbols_;
42
+ }; // class SegmentBase
43
+
44
+ } // cppjieba
45
+
46
+ #endif
@@ -0,0 +1,24 @@
1
+ #ifndef CPPJIEBA_SEGMENTTAGGED_H
2
+ #define CPPJIEBA_SEGMENTTAGGED_H
3
+
4
+ #include "SegmentBase.hpp"
5
+ #include "DictTrie.hpp"
6
+
7
+ namespace cppjieba {
8
+
9
+ class SegmentTagged : public SegmentBase{
10
+ public:
11
+ SegmentTagged() {
12
+ }
13
+ virtual ~SegmentTagged() {
14
+ }
15
+
16
+ virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
17
+
18
+ virtual const DictTrie* GetDictTrie() const = 0;
19
+
20
+ }; // class SegmentTagged
21
+
22
+ } // cppjieba
23
+
24
+ #endif
@@ -0,0 +1,190 @@
1
+ #ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
2
+ #define CPPJIEBA_TEXTRANK_EXTRACTOR_H
3
+
4
+ #include <cmath>
5
+ #include "Jieba.hpp"
6
+
7
+ namespace cppjieba {
8
+ using namespace limonp;
9
+ using namespace std;
10
+
11
+ class TextRankExtractor {
12
+ public:
13
+ typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
14
+ private:
15
+ typedef std::map<string,Word> WordMap;
16
+
17
+ class WordGraph{
18
+ private:
19
+ typedef double Score;
20
+ typedef string Node;
21
+ typedef std::set<Node> NodeSet;
22
+
23
+ typedef std::map<Node,double> Edges;
24
+ typedef std::map<Node,Edges> Graph;
25
+ //typedef std::unordered_map<Node,double> Edges;
26
+ //typedef std::unordered_map<Node,Edges> Graph;
27
+
28
+ double d;
29
+ Graph graph;
30
+ NodeSet nodeSet;
31
+ public:
32
+ WordGraph(): d(0.85) {};
33
+ WordGraph(double in_d): d(in_d) {};
34
+
35
+ void addEdge(Node start,Node end,double weight){
36
+ Edges temp;
37
+ Edges::iterator gotEdges;
38
+ nodeSet.insert(start);
39
+ nodeSet.insert(end);
40
+ graph[start][end]+=weight;
41
+ graph[end][start]+=weight;
42
+ }
43
+
44
+ void rank(WordMap &ws,size_t rankTime=10){
45
+ WordMap outSum;
46
+ Score wsdef, min_rank, max_rank;
47
+
48
+ if( graph.size() == 0)
49
+ return;
50
+
51
+ wsdef = 1.0 / graph.size();
52
+
53
+ for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
54
+ // edges->first start节点;edge->first end节点;edge->second 权重
55
+ ws[edges->first].word=edges->first;
56
+ ws[edges->first].weight=wsdef;
57
+ outSum[edges->first].weight=0;
58
+ for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
59
+ outSum[edges->first].weight+=edge->second;
60
+ }
61
+ }
62
+ //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
63
+ for( size_t i=0; i<rankTime; i++ ){
64
+ for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
65
+ double s = 0;
66
+ for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
67
+ // edge->first end节点;edge->second 权重
68
+ s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
69
+ ws[*node].weight = (1 - d) + d * s;
70
+ }
71
+ }
72
+
73
+ min_rank=max_rank=ws.begin()->second.weight;
74
+ for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
75
+ if( i->second.weight < min_rank ){
76
+ min_rank = i->second.weight;
77
+ }
78
+ if( i->second.weight > max_rank ){
79
+ max_rank = i->second.weight;
80
+ }
81
+ }
82
+ for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
83
+ ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
84
+ }
85
+ }
86
+ };
87
+
88
+ public:
89
+ TextRankExtractor(const string& dictPath,
90
+ const string& hmmFilePath,
91
+ const string& stopWordPath,
92
+ const string& userDict = "")
93
+ : segment_(dictPath, hmmFilePath, userDict) {
94
+ LoadStopWordDict(stopWordPath);
95
+ }
96
+ TextRankExtractor(const DictTrie* dictTrie,
97
+ const HMMModel* model,
98
+ const string& stopWordPath)
99
+ : segment_(dictTrie, model) {
100
+ LoadStopWordDict(stopWordPath);
101
+ }
102
+ TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
103
+ LoadStopWordDict(stopWordPath);
104
+ }
105
+ ~TextRankExtractor() {
106
+ }
107
+
108
+ void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
109
+ vector<Word> topWords;
110
+ Extract(sentence, topWords, topN);
111
+ for (size_t i = 0; i < topWords.size(); i++) {
112
+ keywords.push_back(topWords[i].word);
113
+ }
114
+ }
115
+
116
+ void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
117
+ vector<Word> topWords;
118
+ Extract(sentence, topWords, topN);
119
+ for (size_t i = 0; i < topWords.size(); i++) {
120
+ keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
121
+ }
122
+ }
123
+
124
+ void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
125
+ vector<string> words;
126
+ segment_.Cut(sentence, words);
127
+
128
+ TextRankExtractor::WordGraph graph;
129
+ WordMap wordmap;
130
+ size_t offset = 0;
131
+
132
+ for(size_t i=0; i < words.size(); i++){
133
+ size_t t = offset;
134
+ offset += words[i].size();
135
+ if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
136
+ continue;
137
+ }
138
+ for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
139
+ if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
140
+ skip++;
141
+ continue;
142
+ }
143
+ graph.addEdge(words[i],words[j],1);
144
+ }
145
+ wordmap[words[i]].offsets.push_back(t);
146
+ }
147
+ if (offset != sentence.size()) {
148
+ XLOG(ERROR) << "words illegal";
149
+ return;
150
+ }
151
+
152
+ graph.rank(wordmap,rankTime);
153
+
154
+ keywords.clear();
155
+ keywords.reserve(wordmap.size());
156
+ for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
157
+ keywords.push_back(itr->second);
158
+ }
159
+
160
+ topN = min(topN, keywords.size());
161
+ partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
162
+ keywords.resize(topN);
163
+ }
164
+ private:
165
+ void LoadStopWordDict(const string& filePath) {
166
+ ifstream ifs(filePath.c_str());
167
+ XCHECK(ifs.is_open()) << "open " << filePath << " failed";
168
+ string line ;
169
+ while (getline(ifs, line)) {
170
+ stopWords_.insert(line);
171
+ }
172
+ assert(stopWords_.size());
173
+ }
174
+
175
+ static bool Compare(const Word &x,const Word &y){
176
+ return x.weight > y.weight;
177
+ }
178
+
179
+ MixSegment segment_;
180
+ unordered_set<string> stopWords_;
181
+ }; // class TextRankExtractor
182
+
183
+ inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
184
+ return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
185
+ }
186
+ } // namespace cppjieba
187
+
188
+ #endif
189
+
190
+
@@ -0,0 +1,174 @@
1
+ #ifndef CPPJIEBA_TRIE_HPP
2
+ #define CPPJIEBA_TRIE_HPP
3
+
4
+ #include <vector>
5
+ #include <queue>
6
+ #include "limonp/StdExtension.hpp"
7
+ #include "Unicode.hpp"
8
+
9
+ namespace cppjieba {
10
+
11
+ using namespace std;
12
+
13
+ const size_t MAX_WORD_LENGTH = 512;
14
+
15
+ struct DictUnit {
16
+ Unicode word;
17
+ double weight;
18
+ string tag;
19
+ }; // struct DictUnit
20
+
21
+ // for debugging
22
+ // inline ostream & operator << (ostream& os, const DictUnit& unit) {
23
+ // string s;
24
+ // s << unit.word;
25
+ // return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
26
+ // }
27
+
28
+ struct Dag {
29
+ RuneStr runestr;
30
+ // [offset, nexts.first]
31
+ limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
32
+ const DictUnit * pInfo;
33
+ double weight;
34
+ size_t nextPos; // TODO
35
+ Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
36
+ }
37
+ }; // struct Dag
38
+
39
+ typedef Rune TrieKey;
40
+
41
+ class TrieNode {
42
+ public :
43
+ TrieNode(): next(NULL), ptValue(NULL) {
44
+ }
45
+ public:
46
+ typedef unordered_map<TrieKey, TrieNode*> NextMap;
47
+ NextMap *next;
48
+ const DictUnit *ptValue;
49
+ };
50
+
51
+ class Trie {
52
+ public:
53
+ Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
54
+ : root_(new TrieNode) {
55
+ CreateTrie(keys, valuePointers);
56
+ }
57
+ ~Trie() {
58
+ DeleteNode(root_);
59
+ }
60
+
61
+ const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
62
+ if (begin == end) {
63
+ return NULL;
64
+ }
65
+
66
+ const TrieNode* ptNode = root_;
67
+ TrieNode::NextMap::const_iterator citer;
68
+ for (RuneStrArray::const_iterator it = begin; it != end; it++) {
69
+ if (NULL == ptNode->next) {
70
+ return NULL;
71
+ }
72
+ citer = ptNode->next->find(it->rune);
73
+ if (ptNode->next->end() == citer) {
74
+ return NULL;
75
+ }
76
+ ptNode = citer->second;
77
+ }
78
+ return ptNode->ptValue;
79
+ }
80
+
81
+ void Find(RuneStrArray::const_iterator begin,
82
+ RuneStrArray::const_iterator end,
83
+ vector<struct Dag>&res,
84
+ size_t max_word_len = MAX_WORD_LENGTH) const {
85
+ assert(root_ != NULL);
86
+ res.resize(end - begin);
87
+
88
+ const TrieNode *ptNode = NULL;
89
+ TrieNode::NextMap::const_iterator citer;
90
+ for (size_t i = 0; i < size_t(end - begin); i++) {
91
+ res[i].runestr = *(begin + i);
92
+
93
+ if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
94
+ ptNode = citer->second;
95
+ } else {
96
+ ptNode = NULL;
97
+ }
98
+ if (ptNode != NULL) {
99
+ res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
100
+ } else {
101
+ res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
102
+ }
103
+
104
+ for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
105
+ if (ptNode == NULL || ptNode->next == NULL) {
106
+ break;
107
+ }
108
+ citer = ptNode->next->find((begin + j)->rune);
109
+ if (ptNode->next->end() == citer) {
110
+ break;
111
+ }
112
+ ptNode = citer->second;
113
+ if (NULL != ptNode->ptValue) {
114
+ res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
115
+ }
116
+ }
117
+ }
118
+ }
119
+
120
+ void InsertNode(const Unicode& key, const DictUnit* ptValue) {
121
+ if (key.begin() == key.end()) {
122
+ return;
123
+ }
124
+
125
+ TrieNode::NextMap::const_iterator kmIter;
126
+ TrieNode *ptNode = root_;
127
+ for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
128
+ if (NULL == ptNode->next) {
129
+ ptNode->next = new TrieNode::NextMap;
130
+ }
131
+ kmIter = ptNode->next->find(*citer);
132
+ if (ptNode->next->end() == kmIter) {
133
+ TrieNode *nextNode = new TrieNode;
134
+
135
+ ptNode->next->insert(make_pair(*citer, nextNode));
136
+ ptNode = nextNode;
137
+ } else {
138
+ ptNode = kmIter->second;
139
+ }
140
+ }
141
+ assert(ptNode != NULL);
142
+ ptNode->ptValue = ptValue;
143
+ }
144
+
145
+ private:
146
+ void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
147
+ if (valuePointers.empty() || keys.empty()) {
148
+ return;
149
+ }
150
+ assert(keys.size() == valuePointers.size());
151
+
152
+ for (size_t i = 0; i < keys.size(); i++) {
153
+ InsertNode(keys[i], valuePointers[i]);
154
+ }
155
+ }
156
+
157
+ void DeleteNode(TrieNode* node) {
158
+ if (NULL == node) {
159
+ return;
160
+ }
161
+ if (NULL != node->next) {
162
+ for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
163
+ DeleteNode(it->second);
164
+ }
165
+ delete node->next;
166
+ }
167
+ delete node;
168
+ }
169
+
170
+ TrieNode* root_;
171
+ }; // class Trie
172
+ } // namespace cppjieba
173
+
174
+ #endif // CPPJIEBA_TRIE_HPP
@@ -0,0 +1,215 @@
1
+ #ifndef CPPJIEBA_UNICODE_H
2
+ #define CPPJIEBA_UNICODE_H
3
+
4
+ #include <stdint.h>
5
+ #include <stdlib.h>
6
+ #include <string>
7
+ #include <vector>
8
+ #include <ostream>
9
+ #include "limonp/LocalVector.hpp"
10
+
11
+ namespace cppjieba {
12
+
13
+ using std::string;
14
+ using std::vector;
15
+
16
+ typedef uint32_t Rune;
17
+
18
// A piece of text together with the offset it was taken from.
struct Word {
  std::string word;
  uint32_t offset;

  // Stores a copy of `w` and the offset `o`.
  Word(const std::string& w, uint32_t o) : word(w), offset(o) {}
}; // struct Word
25
+
26
+ inline std::ostream& operator << (std::ostream& os, const Word& w) {
27
+ return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
28
+ }
29
+
30
+ struct RuneStr {
31
+ Rune rune;
32
+ uint32_t offset;
33
+ uint32_t len;
34
+ RuneStr(): rune(0), offset(0), len(0) {
35
+ }
36
+ RuneStr(Rune r, uint32_t o, uint32_t l)
37
+ : rune(r), offset(o), len(l) {
38
+ }
39
+ }; // struct RuneStr
40
+
41
+ inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
42
+ return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
43
+ }
44
+
45
+ typedef limonp::LocalVector<Rune> Unicode;
46
+ typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
47
+
48
+ // [left, right]
49
+ struct WordRange {
50
+ RuneStrArray::const_iterator left;
51
+ RuneStrArray::const_iterator right;
52
+ WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
53
+ : left(l), right(r) {
54
+ }
55
+ size_t Length() const {
56
+ return right - left + 1;
57
+ }
58
+ bool IsAllAscii() const {
59
+ for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
60
+ if (iter->rune >= 0x80) {
61
+ return false;
62
+ }
63
+ }
64
+ return true;
65
+ }
66
+ }; // struct WordRange
67
+
68
// Result of decoding one UTF-8 sequence: the code point and the number of
// bytes it occupied. len == 0 means "nothing decoded".
struct RuneStrLite {
  uint32_t rune;
  uint32_t len;
  RuneStrLite(): rune(0), len(0) {
  }
  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
  }
}; // struct RuneStrLite

// Decodes the UTF-8 sequence at the start of the `len` bytes at `str`.
//
// Returns {code point, bytes consumed}, or {0, 0} when:
//   - `str` is NULL or `len` is 0,
//   - the first byte is not a valid lead byte (a stray continuation byte
//     10xxxxxx, or 0xF8 and above),
//   - the sequence is truncated, or
//   - any following byte is not a continuation byte (10xxxxxx).
//
// The lead byte is matched against its exact bit pattern and every
// continuation byte is checked, so malformed input is rejected instead of
// being decoded into a bogus rune that swallows the next character.
// Overlong encodings and surrogate code points are not rejected.
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
  if (str == NULL || len == 0) {
    return RuneStrLite();
  }

  const uint8_t lead = static_cast<uint8_t>(str[0]);
  uint32_t rune = 0;
  uint32_t width = 0;
  if (lead < 0x80) {                  // 0xxxxxxx: 7 payload bits
    rune = lead;
    width = 1;
  } else if ((lead & 0xe0) == 0xc0) { // 110xxxxx: 5 bits, 11 in total
    rune = lead & 0x1f;
    width = 2;
  } else if ((lead & 0xf0) == 0xe0) { // 1110xxxx: 4 bits, 16 in total
    rune = lead & 0x0f;
    width = 3;
  } else if ((lead & 0xf8) == 0xf0) { // 11110xxx: 3 bits, 21 in total
    rune = lead & 0x07;
    width = 4;
  } else {
    return RuneStrLite();
  }

  if (len < width) {
    return RuneStrLite();
  }
  // Each continuation byte contributes its low 6 bits.
  for (uint32_t i = 1; i < width; i++) {
    const uint8_t tail = static_cast<uint8_t>(str[i]);
    if ((tail & 0xc0) != 0x80) {
      return RuneStrLite();
    }
    rune = (rune << 6) | (tail & 0x3f);
  }
  return RuneStrLite(rune, width);
}
131
+
132
+ inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
133
+ runes.clear();
134
+ runes.reserve(len / 2);
135
+ for (size_t i = 0; i < len;) {
136
+ RuneStrLite rp = DecodeRuneInString(s + i, len - i);
137
+ if (rp.len == 0) {
138
+ runes.clear();
139
+ return false;
140
+ }
141
+ RuneStr x(rp.rune, i, rp.len);
142
+ runes.push_back(x);
143
+ i += rp.len;
144
+ }
145
+ return true;
146
+ }
147
+
148
+ inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
149
+ return DecodeRunesInString(s.c_str(), s.size(), runes);
150
+ }
151
+
152
+ inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
153
+ unicode.clear();
154
+ RuneStrArray runes;
155
+ if (!DecodeRunesInString(s, len, runes)) {
156
+ return false;
157
+ }
158
+ unicode.reserve(runes.size());
159
+ for (size_t i = 0; i < runes.size(); i++) {
160
+ unicode.push_back(runes[i].rune);
161
+ }
162
+ return true;
163
+ }
164
+
165
+ inline bool IsSingleWord(const string& str) {
166
+ RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
167
+ return rp.len == str.size();
168
+ }
169
+
170
+ inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
171
+ return DecodeRunesInString(s.c_str(), s.size(), unicode);
172
+ }
173
+
174
+ inline Unicode DecodeRunesInString(const string& s) {
175
+ Unicode result;
176
+ DecodeRunesInString(s, result);
177
+ return result;
178
+ }
179
+
180
+
181
+ // [left, right]
182
+ inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
183
+ assert(right->offset >= left->offset);
184
+ uint32_t len = right->offset - left->offset + right->len;
185
+ return Word(s.substr(left->offset, len), left->offset);
186
+ }
187
+
188
+ inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
189
+ assert(right->offset >= left->offset);
190
+ uint32_t len = right->offset - left->offset + right->len;
191
+ return s.substr(left->offset, len);
192
+ }
193
+
194
+ inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
195
+ for (size_t i = 0; i < wrs.size(); i++) {
196
+ words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
197
+ }
198
+ }
199
+
200
+ inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
201
+ vector<Word> result;
202
+ GetWordsFromWordRanges(s, wrs, result);
203
+ return result;
204
+ }
205
+
206
+ inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
207
+ strs.resize(words.size());
208
+ for (size_t i = 0; i < words.size(); ++i) {
209
+ strs[i] = words[i].word;
210
+ }
211
+ }
212
+
213
+ } // namespace cppjieba
214
+
215
+ #endif // CPPJIEBA_UNICODE_H