cppjieba_rb 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +26 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +81 -0
  8. data/Rakefile +20 -0
  9. data/cppjieba_rb.gemspec +50 -0
  10. data/ext/cppjieba/.gitignore +17 -0
  11. data/ext/cppjieba/.travis.yml +22 -0
  12. data/ext/cppjieba/CMakeLists.txt +28 -0
  13. data/ext/cppjieba/ChangeLog.md +236 -0
  14. data/ext/cppjieba/README.md +285 -0
  15. data/ext/cppjieba/README_EN.md +111 -0
  16. data/ext/cppjieba/appveyor.yml +32 -0
  17. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  18. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  42. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  45. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  46. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  56. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  57. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  58. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  59. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  60. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  61. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  62. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  63. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  64. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  65. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  66. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  67. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  68. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  69. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  70. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  71. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  72. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  73. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  74. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  75. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  76. data/ext/cppjieba/dict/README.md +31 -0
  77. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  78. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  79. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  80. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  83. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  84. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  85. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  86. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  87. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  88. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  89. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  90. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  91. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  92. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  93. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  94. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  95. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  96. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  98. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  99. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  100. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  101. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  102. data/ext/cppjieba/test/CMakeLists.txt +5 -0
  103. data/ext/cppjieba/test/demo.cpp +80 -0
  104. data/ext/cppjieba/test/load_test.cpp +54 -0
  105. data/ext/cppjieba/test/testdata/curl.res +1 -0
  106. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  107. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  108. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  109. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  110. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  111. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  112. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  113. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  114. data/ext/cppjieba/test/testdata/review.100 +100 -0
  115. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  116. data/ext/cppjieba/test/testdata/server.conf +19 -0
  117. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  118. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  119. data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  120. data/ext/cppjieba/test/testdata/userdict.english +2 -0
  121. data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
  122. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  123. data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
  124. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  125. data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
  126. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  127. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  128. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  129. data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
  130. data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
  131. data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
  132. data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
  133. data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
  134. data/ext/cppjieba_rb/extconf.rb +26 -0
  135. data/ext/cppjieba_rb/internal.cc +148 -0
  136. data/lib/cppjieba_rb/segment.rb +20 -0
  137. data/lib/cppjieba_rb/version.rb +3 -0
  138. data/lib/cppjieba_rb.rb +34 -0
  139. data/test/test_keyword.rb +17 -0
  140. data/test/test_segment.rb +24 -0
  141. data/test/test_tagging.rb +19 -0
  142. metadata +244 -0
@@ -0,0 +1,90 @@
1
+ #ifndef CPPJIEBA_QUERYSEGMENT_H
2
+ #define CPPJIEBA_QUERYSEGMENT_H
3
+
4
+ #include <algorithm>
5
+ #include <set>
6
+ #include <cassert>
7
+ #include "limonp/Logging.hpp"
8
+ #include "DictTrie.hpp"
9
+ #include "SegmentBase.hpp"
10
+ #include "FullSegment.hpp"
11
+ #include "MixSegment.hpp"
12
+ #include "Unicode.hpp"
13
+ #include "DictTrie.hpp"
14
+
15
+ namespace cppjieba {
16
+ class QuerySegment: public SegmentBase {
17
+ public:
18
+ QuerySegment(const string& dict, const string& model, const string& userDict = "")
19
+ : mixSeg_(dict, model, userDict),
20
+ trie_(mixSeg_.GetDictTrie()) {
21
+ }
22
+ QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
23
+ : mixSeg_(dictTrie, model), trie_(dictTrie) {
24
+ }
25
+ ~QuerySegment() {
26
+ }
27
+
28
+ void Cut(const string& sentence, vector<string>& words) const {
29
+ Cut(sentence, words, true);
30
+ }
31
+ void Cut(const string& sentence, vector<string>& words, bool hmm) const {
32
+ vector<Word> tmp;
33
+ Cut(sentence, tmp, hmm);
34
+ GetStringsFromWords(tmp, words);
35
+ }
36
+ void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
37
+ PreFilter pre_filter(symbols_, sentence);
38
+ PreFilter::Range range;
39
+ vector<WordRange> wrs;
40
+ wrs.reserve(sentence.size()/2);
41
+ while (pre_filter.HasNext()) {
42
+ range = pre_filter.Next();
43
+ Cut(range.begin, range.end, wrs, hmm);
44
+ }
45
+ words.clear();
46
+ words.reserve(wrs.size());
47
+ GetWordsFromWordRanges(sentence, wrs, words);
48
+ }
49
+ void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
50
+ //use mix Cut first
51
+ vector<WordRange> mixRes;
52
+ mixSeg_.Cut(begin, end, mixRes, hmm);
53
+
54
+ vector<WordRange> fullRes;
55
+ for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
56
+ if (mixResItr->Length() > 2) {
57
+ for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
58
+ WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
59
+ if (trie_->Find(wr.left, wr.right + 1) != NULL) {
60
+ res.push_back(wr);
61
+ }
62
+ }
63
+ }
64
+ if (mixResItr->Length() > 3) {
65
+ for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
66
+ WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
67
+ if (trie_->Find(wr.left, wr.right + 1) != NULL) {
68
+ res.push_back(wr);
69
+ }
70
+ }
71
+ }
72
+ res.push_back(*mixResItr);
73
+ }
74
+ }
75
+ private:
76
+ bool IsAllAscii(const Unicode& s) const {
77
+ for(size_t i = 0; i < s.size(); i++) {
78
+ if (s[i] >= 0x80) {
79
+ return false;
80
+ }
81
+ }
82
+ return true;
83
+ }
84
+ MixSegment mixSeg_;
85
+ const DictTrie* trie_;
86
+ }; // QuerySegment
87
+
88
+ } // namespace cppjieba
89
+
90
+ #endif
@@ -0,0 +1,46 @@
1
+ #ifndef CPPJIEBA_SEGMENTBASE_H
2
+ #define CPPJIEBA_SEGMENTBASE_H
3
+
4
+ #include "limonp/Logging.hpp"
5
+ #include "PreFilter.hpp"
6
+ #include <cassert>
7
+
8
+
9
+ namespace cppjieba {
10
+
11
// Separator set that SegmentBase's constructor passes to ResetSeparators:
// space, tab, newline, then the UTF-8 bytes of U+FF0C (fullwidth comma) and
// U+3002 (ideographic full stop).
+ const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
12
+
13
+ using namespace limonp;
14
+
15
+ class SegmentBase {
16
+ public:
17
+ SegmentBase() {
18
+ XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
19
+ }
20
+ virtual ~SegmentBase() {
21
+ }
22
+
23
+ virtual void Cut(const string& sentence, vector<string>& words) const = 0;
24
+
25
+ bool ResetSeparators(const string& s) {
26
+ symbols_.clear();
27
+ RuneStrArray runes;
28
+ if (!DecodeRunesInString(s, runes)) {
29
+ XLOG(ERROR) << "decode " << s << " failed";
30
+ return false;
31
+ }
32
+ for (size_t i = 0; i < runes.size(); i++) {
33
+ if (!symbols_.insert(runes[i].rune).second) {
34
+ XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
35
+ return false;
36
+ }
37
+ }
38
+ return true;
39
+ }
40
+ protected:
41
+ unordered_set<Rune> symbols_;
42
+ }; // class SegmentBase
43
+
44
+ } // cppjieba
45
+
46
+ #endif
@@ -0,0 +1,23 @@
1
+ #ifndef CPPJIEBA_SEGMENTTAGGED_H
2
+ #define CPPJIEBA_SEGMENTTAGGED_H
3
+
4
+ #include "SegmentBase.hpp"
5
+
6
+ namespace cppjieba {
7
+
8
+ class SegmentTagged : public SegmentBase{
9
+ public:
10
+ SegmentTagged() {
11
+ }
12
+ virtual ~SegmentTagged() {
13
+ }
14
+
15
+ virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
16
+
17
+ virtual const DictTrie* GetDictTrie() const = 0;
18
+
19
+ }; // class SegmentTagged
20
+
21
+ } // cppjieba
22
+
23
+ #endif
@@ -0,0 +1,190 @@
1
+ #ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
2
+ #define CPPJIEBA_TEXTRANK_EXTRACTOR_H
3
+
4
+ #include <cmath>
5
+ #include "Jieba.hpp"
6
+
7
+ namespace cppjieba {
8
+ using namespace limonp;
9
+ using namespace std;
10
+
11
+ class TextRankExtractor {
12
+ public:
13
+ typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
14
+ private:
15
+ typedef std::map<string,Word> WordMap;
16
+
17
+ class WordGraph{
18
+ private:
19
+ typedef double Score;
20
+ typedef string Node;
21
+ typedef std::set<Node> NodeSet;
22
+
23
+ typedef std::map<Node,double> Edges;
24
+ typedef std::map<Node,Edges> Graph;
25
+ //typedef std::unordered_map<Node,double> Edges;
26
+ //typedef std::unordered_map<Node,Edges> Graph;
27
+
28
+ double d;
29
+ Graph graph;
30
+ NodeSet nodeSet;
31
+ public:
32
+ WordGraph(): d(0.85) {};
33
+ WordGraph(double in_d): d(in_d) {};
34
+
35
+ void addEdge(Node start,Node end,double weight){
36
+ Edges temp;
37
+ Edges::iterator gotEdges;
38
+ nodeSet.insert(start);
39
+ nodeSet.insert(end);
40
+ graph[start][end]+=weight;
41
+ graph[end][start]+=weight;
42
+ }
43
+
44
+ void rank(WordMap &ws,size_t rankTime=10){
45
+ WordMap outSum;
46
+ Score wsdef, min_rank, max_rank;
47
+
48
+ if( graph.size() == 0)
49
+ return;
50
+
51
+ wsdef = 1.0 / graph.size();
52
+
53
+ for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
54
+ // edges->first start节点;edge->first end节点;edge->second 权重
55
+ ws[edges->first].word=edges->first;
56
+ ws[edges->first].weight=wsdef;
57
+ outSum[edges->first].weight=0;
58
+ for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
59
+ outSum[edges->first].weight+=edge->second;
60
+ }
61
+ }
62
+ //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
63
+ for( size_t i=0; i<rankTime; i++ ){
64
+ for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
65
+ double s = 0;
66
+ for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
67
+ // edge->first end节点;edge->second 权重
68
+ s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
69
+ ws[*node].weight = (1 - d) + d * s;
70
+ }
71
+ }
72
+
73
+ min_rank=max_rank=ws.begin()->second.weight;
74
+ for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
75
+ if( i->second.weight < min_rank ){
76
+ min_rank = i->second.weight;
77
+ }
78
+ if( i->second.weight > max_rank ){
79
+ max_rank = i->second.weight;
80
+ }
81
+ }
82
+ for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
83
+ ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
84
+ }
85
+ }
86
+ };
87
+
88
+ public:
89
+ TextRankExtractor(const string& dictPath,
90
+ const string& hmmFilePath,
91
+ const string& stopWordPath,
92
+ const string& userDict = "")
93
+ : segment_(dictPath, hmmFilePath, userDict) {
94
+ LoadStopWordDict(stopWordPath);
95
+ }
96
+ TextRankExtractor(const DictTrie* dictTrie,
97
+ const HMMModel* model,
98
+ const string& stopWordPath)
99
+ : segment_(dictTrie, model) {
100
+ LoadStopWordDict(stopWordPath);
101
+ }
102
+ TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
103
+ LoadStopWordDict(stopWordPath);
104
+ }
105
+ ~TextRankExtractor() {
106
+ }
107
+
108
+ void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
109
+ vector<Word> topWords;
110
+ Extract(sentence, topWords, topN);
111
+ for (size_t i = 0; i < topWords.size(); i++) {
112
+ keywords.push_back(topWords[i].word);
113
+ }
114
+ }
115
+
116
+ void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
117
+ vector<Word> topWords;
118
+ Extract(sentence, topWords, topN);
119
+ for (size_t i = 0; i < topWords.size(); i++) {
120
+ keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
121
+ }
122
+ }
123
+
124
+ void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
125
+ vector<string> words;
126
+ segment_.Cut(sentence, words);
127
+
128
+ TextRankExtractor::WordGraph graph;
129
+ WordMap wordmap;
130
+ size_t offset = 0;
131
+
132
+ for(size_t i=0; i < words.size(); i++){
133
+ size_t t = offset;
134
+ offset += words[i].size();
135
+ if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
136
+ continue;
137
+ }
138
+ for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
139
+ if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
140
+ skip++;
141
+ continue;
142
+ }
143
+ graph.addEdge(words[i],words[j],1);
144
+ }
145
+ wordmap[words[i]].offsets.push_back(t);
146
+ }
147
+ if (offset != sentence.size()) {
148
+ XLOG(ERROR) << "words illegal";
149
+ return;
150
+ }
151
+
152
+ graph.rank(wordmap,rankTime);
153
+
154
+ keywords.clear();
155
+ keywords.reserve(wordmap.size());
156
+ for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
157
+ keywords.push_back(itr->second);
158
+ }
159
+
160
+ topN = min(topN, keywords.size());
161
+ partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
162
+ keywords.resize(topN);
163
+ }
164
+ private:
165
+ void LoadStopWordDict(const string& filePath) {
166
+ ifstream ifs(filePath.c_str());
167
+ XCHECK(ifs.is_open()) << "open " << filePath << " failed";
168
+ string line ;
169
+ while (getline(ifs, line)) {
170
+ stopWords_.insert(line);
171
+ }
172
+ assert(stopWords_.size());
173
+ }
174
+
175
+ static bool Compare(const Word &x,const Word &y){
176
+ return x.weight > y.weight;
177
+ }
178
+
179
+ MixSegment segment_;
180
+ unordered_set<string> stopWords_;
181
+ }; // class TextRankExtractor
182
+
183
+ inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
184
+ return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
185
+ }
186
+ } // namespace cppjieba
187
+
188
+ #endif
189
+
190
+
@@ -0,0 +1,174 @@
1
+ #ifndef CPPJIEBA_TRIE_HPP
2
+ #define CPPJIEBA_TRIE_HPP
3
+
4
+ #include <vector>
5
+ #include <queue>
6
+ #include "limonp/StdExtension.hpp"
7
+ #include "Unicode.hpp"
8
+
9
+ namespace cppjieba {
10
+
11
+ using namespace std;
12
+
13
// Default value of the max_word_len parameter of Trie::Find's DAG overload,
// which bounds the length (counted in runes) of the matches it records.
+ const size_t MAX_WORD_LENGTH = 512;
14
+
15
+ struct DictUnit {
16
+ Unicode word;
17
+ double weight;
18
+ string tag;
19
+ }; // struct DictUnit
20
+
21
+ // for debugging
22
+ // inline ostream & operator << (ostream& os, const DictUnit& unit) {
23
+ // string s;
24
+ // s << unit.word;
25
+ // return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
26
+ // }
27
+
28
+ struct Dag {
29
+ RuneStr runestr;
30
+ // [offset, nexts.first]
31
+ limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
32
+ const DictUnit * pInfo;
33
+ double weight;
34
+ size_t nextPos; // TODO
35
+ Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
36
+ }
37
+ }; // struct Dag
38
+
39
+ typedef Rune TrieKey;
40
+
41
+ class TrieNode {
42
+ public :
43
+ TrieNode(): next(NULL), ptValue(NULL) {
44
+ }
45
+ public:
46
+ typedef unordered_map<TrieKey, TrieNode*> NextMap;
47
+ NextMap *next;
48
+ const DictUnit *ptValue;
49
+ };
50
+
51
+ class Trie {
52
+ public:
53
+ Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
54
+ : root_(new TrieNode) {
55
+ CreateTrie(keys, valuePointers);
56
+ }
57
+ ~Trie() {
58
+ DeleteNode(root_);
59
+ }
60
+
61
+ const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
62
+ if (begin == end) {
63
+ return NULL;
64
+ }
65
+
66
+ const TrieNode* ptNode = root_;
67
+ TrieNode::NextMap::const_iterator citer;
68
+ for (RuneStrArray::const_iterator it = begin; it != end; it++) {
69
+ if (NULL == ptNode->next) {
70
+ return NULL;
71
+ }
72
+ citer = ptNode->next->find(it->rune);
73
+ if (ptNode->next->end() == citer) {
74
+ return NULL;
75
+ }
76
+ ptNode = citer->second;
77
+ }
78
+ return ptNode->ptValue;
79
+ }
80
+
81
+ void Find(RuneStrArray::const_iterator begin,
82
+ RuneStrArray::const_iterator end,
83
+ vector<struct Dag>&res,
84
+ size_t max_word_len = MAX_WORD_LENGTH) const {
85
+ assert(root_ != NULL);
86
+ res.resize(end - begin);
87
+
88
+ const TrieNode *ptNode = NULL;
89
+ TrieNode::NextMap::const_iterator citer;
90
+ for (size_t i = 0; i < size_t(end - begin); i++) {
91
+ res[i].runestr = *(begin + i);
92
+
93
+ if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
94
+ ptNode = citer->second;
95
+ } else {
96
+ ptNode = NULL;
97
+ }
98
+ if (ptNode != NULL) {
99
+ res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
100
+ } else {
101
+ res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
102
+ }
103
+
104
+ for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
105
+ if (ptNode == NULL || ptNode->next == NULL) {
106
+ break;
107
+ }
108
+ citer = ptNode->next->find((begin + j)->rune);
109
+ if (ptNode->next->end() == citer) {
110
+ break;
111
+ }
112
+ ptNode = citer->second;
113
+ if (NULL != ptNode->ptValue) {
114
+ res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
115
+ }
116
+ }
117
+ }
118
+ }
119
+
120
+ void InsertNode(const Unicode& key, const DictUnit* ptValue) {
121
+ if (key.begin() == key.end()) {
122
+ return;
123
+ }
124
+
125
+ TrieNode::NextMap::const_iterator kmIter;
126
+ TrieNode *ptNode = root_;
127
+ for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
128
+ if (NULL == ptNode->next) {
129
+ ptNode->next = new TrieNode::NextMap;
130
+ }
131
+ kmIter = ptNode->next->find(*citer);
132
+ if (ptNode->next->end() == kmIter) {
133
+ TrieNode *nextNode = new TrieNode;
134
+
135
+ ptNode->next->insert(make_pair(*citer, nextNode));
136
+ ptNode = nextNode;
137
+ } else {
138
+ ptNode = kmIter->second;
139
+ }
140
+ }
141
+ assert(ptNode != NULL);
142
+ ptNode->ptValue = ptValue;
143
+ }
144
+
145
+ private:
146
+ void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
147
+ if (valuePointers.empty() || keys.empty()) {
148
+ return;
149
+ }
150
+ assert(keys.size() == valuePointers.size());
151
+
152
+ for (size_t i = 0; i < keys.size(); i++) {
153
+ InsertNode(keys[i], valuePointers[i]);
154
+ }
155
+ }
156
+
157
+ void DeleteNode(TrieNode* node) {
158
+ if (NULL == node) {
159
+ return;
160
+ }
161
+ if (NULL != node->next) {
162
+ for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
163
+ DeleteNode(it->second);
164
+ }
165
+ delete node->next;
166
+ }
167
+ delete node;
168
+ }
169
+
170
+ TrieNode* root_;
171
+ }; // class Trie
172
+ } // namespace cppjieba
173
+
174
+ #endif // CPPJIEBA_TRIE_HPP