jieba-rb 5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +19 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +85 -0
  8. data/Rakefile +15 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +28 -0
  12. data/ext/cppjieba/ChangeLog.md +236 -0
  13. data/ext/cppjieba/README.md +285 -0
  14. data/ext/cppjieba/README_EN.md +111 -0
  15. data/ext/cppjieba/appveyor.yml +32 -0
  16. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  45. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  46. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  56. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  57. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  58. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  59. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  60. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  61. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  62. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  63. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  64. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  65. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  66. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  67. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  68. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  69. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  70. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  71. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  72. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  73. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  74. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  75. data/ext/cppjieba/dict/README.md +31 -0
  76. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  77. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  78. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  79. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  80. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  83. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  84. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  85. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  86. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  87. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  88. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  89. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  90. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  91. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  92. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  93. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  94. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  95. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  96. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
  98. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  99. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  100. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  101. data/ext/jieba/extconf.rb +28 -0
  102. data/ext/jieba/jieba.c +11 -0
  103. data/ext/jieba/jieba.h +11 -0
  104. data/ext/jieba/keyword.cc +92 -0
  105. data/ext/jieba/keyword.h +17 -0
  106. data/ext/jieba/segment.cc +107 -0
  107. data/ext/jieba/segment.h +17 -0
  108. data/ext/jieba/tagging.cc +76 -0
  109. data/ext/jieba/tagging.h +17 -0
  110. data/jieba_rb.gemspec +51 -0
  111. data/lib/jieba-rb.rb +66 -0
  112. data/lib/jieba_rb/version.rb +3 -0
  113. data/test/test_keyword.rb +17 -0
  114. data/test/test_segment.rb +32 -0
  115. data/test/test_tagging.rb +22 -0
  116. data/test/user.dict.utf8 +23 -0
  117. metadata +219 -0
@@ -0,0 +1,190 @@
1
+ #ifndef CPPJIBEA_HMMSEGMENT_H
2
+ #define CPPJIBEA_HMMSEGMENT_H
3
+
4
+ #include <iostream>
5
+ #include <fstream>
6
+ #include <memory.h>
7
+ #include <cassert>
8
+ #include "HMMModel.hpp"
9
+ #include "SegmentBase.hpp"
10
+
11
+ namespace cppjieba {
12
+ class HMMSegment: public SegmentBase {
13
+ public:
14
+ HMMSegment(const string& filePath)
15
+ : model_(new HMMModel(filePath)), isNeedDestroy_(true) {
16
+ }
17
+ HMMSegment(const HMMModel* model)
18
+ : model_(model), isNeedDestroy_(false) {
19
+ }
20
+ ~HMMSegment() {
21
+ if (isNeedDestroy_) {
22
+ delete model_;
23
+ }
24
+ }
25
+
26
+ void Cut(const string& sentence,
27
+ vector<string>& words) const {
28
+ vector<Word> tmp;
29
+ Cut(sentence, tmp);
30
+ GetStringsFromWords(tmp, words);
31
+ }
32
+ void Cut(const string& sentence,
33
+ vector<Word>& words) const {
34
+ PreFilter pre_filter(symbols_, sentence);
35
+ PreFilter::Range range;
36
+ vector<WordRange> wrs;
37
+ wrs.reserve(sentence.size()/2);
38
+ while (pre_filter.HasNext()) {
39
+ range = pre_filter.Next();
40
+ Cut(range.begin, range.end, wrs);
41
+ }
42
+ words.clear();
43
+ words.reserve(wrs.size());
44
+ GetWordsFromWordRanges(sentence, wrs, words);
45
+ }
46
+ void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
47
+ RuneStrArray::const_iterator left = begin;
48
+ RuneStrArray::const_iterator right = begin;
49
+ while (right != end) {
50
+ if (right->rune < 0x80) {
51
+ if (left != right) {
52
+ InternalCut(left, right, res);
53
+ }
54
+ left = right;
55
+ do {
56
+ right = SequentialLetterRule(left, end);
57
+ if (right != left) {
58
+ break;
59
+ }
60
+ right = NumbersRule(left, end);
61
+ if (right != left) {
62
+ break;
63
+ }
64
+ right ++;
65
+ } while (false);
66
+ WordRange wr(left, right - 1);
67
+ res.push_back(wr);
68
+ left = right;
69
+ } else {
70
+ right++;
71
+ }
72
+ }
73
+ if (left != right) {
74
+ InternalCut(left, right, res);
75
+ }
76
+ }
77
+ private:
78
+ // sequential letters rule
79
+ RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
80
+ Rune x = begin->rune;
81
+ if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
82
+ begin ++;
83
+ } else {
84
+ return begin;
85
+ }
86
+ while (begin != end) {
87
+ x = begin->rune;
88
+ if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
89
+ begin ++;
90
+ } else {
91
+ break;
92
+ }
93
+ }
94
+ return begin;
95
+ }
96
+ //
97
+ RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
98
+ Rune x = begin->rune;
99
+ if ('0' <= x && x <= '9') {
100
+ begin ++;
101
+ } else {
102
+ return begin;
103
+ }
104
+ while (begin != end) {
105
+ x = begin->rune;
106
+ if ( ('0' <= x && x <= '9') || x == '.') {
107
+ begin++;
108
+ } else {
109
+ break;
110
+ }
111
+ }
112
+ return begin;
113
+ }
114
+ void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
115
+ vector<size_t> status;
116
+ Viterbi(begin, end, status);
117
+
118
+ RuneStrArray::const_iterator left = begin;
119
+ RuneStrArray::const_iterator right;
120
+ for (size_t i = 0; i < status.size(); i++) {
121
+ if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
122
+ right = begin + i + 1;
123
+ WordRange wr(left, right - 1);
124
+ res.push_back(wr);
125
+ left = right;
126
+ }
127
+ }
128
+ }
129
+
130
+ void Viterbi(RuneStrArray::const_iterator begin,
131
+ RuneStrArray::const_iterator end,
132
+ vector<size_t>& status) const {
133
+ size_t Y = HMMModel::STATUS_SUM;
134
+ size_t X = end - begin;
135
+
136
+ size_t XYSize = X * Y;
137
+ size_t now, old, stat;
138
+ double tmp, endE, endS;
139
+
140
+ vector<int> path(XYSize);
141
+ vector<double> weight(XYSize);
142
+
143
+ //start
144
+ for (size_t y = 0; y < Y; y++) {
145
+ weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
146
+ path[0 + y * X] = -1;
147
+ }
148
+
149
+ double emitProb;
150
+
151
+ for (size_t x = 1; x < X; x++) {
152
+ for (size_t y = 0; y < Y; y++) {
153
+ now = x + y*X;
154
+ weight[now] = MIN_DOUBLE;
155
+ path[now] = HMMModel::E; // warning
156
+ emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
157
+ for (size_t preY = 0; preY < Y; preY++) {
158
+ old = x - 1 + preY * X;
159
+ tmp = weight[old] + model_->transProb[preY][y] + emitProb;
160
+ if (tmp > weight[now]) {
161
+ weight[now] = tmp;
162
+ path[now] = preY;
163
+ }
164
+ }
165
+ }
166
+ }
167
+
168
+ endE = weight[X-1+HMMModel::E*X];
169
+ endS = weight[X-1+HMMModel::S*X];
170
+ stat = 0;
171
+ if (endE >= endS) {
172
+ stat = HMMModel::E;
173
+ } else {
174
+ stat = HMMModel::S;
175
+ }
176
+
177
+ status.resize(X);
178
+ for (int x = X -1 ; x >= 0; x--) {
179
+ status[x] = stat;
180
+ stat = path[x + stat*X];
181
+ }
182
+ }
183
+
184
+ const HMMModel* model_;
185
+ bool isNeedDestroy_;
186
+ }; // class HMMSegment
187
+
188
+ } // namespace cppjieba
189
+
190
+ #endif
@@ -0,0 +1,108 @@
1
+ #ifndef CPPJIEAB_JIEBA_H
2
+ #define CPPJIEAB_JIEBA_H
3
+
4
+ #include "QuerySegment.hpp"
5
+ #include "KeywordExtractor.hpp"
6
+
7
+ namespace cppjieba {
8
+
9
+ class Jieba {
10
+ public:
11
+ Jieba(const string& dict_path,
12
+ const string& model_path,
13
+ const string& user_dict_path,
14
+ const string& idfPath,
15
+ const string& stopWordPath)
16
+ : dict_trie_(dict_path, user_dict_path),
17
+ model_(model_path),
18
+ mp_seg_(&dict_trie_),
19
+ hmm_seg_(&model_),
20
+ mix_seg_(&dict_trie_, &model_),
21
+ full_seg_(&dict_trie_),
22
+ query_seg_(&dict_trie_, &model_),
23
+ extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
24
+ }
25
+ ~Jieba() {
26
+ }
27
+
28
+ struct LocWord {
29
+ string word;
30
+ size_t begin;
31
+ size_t end;
32
+ }; // struct LocWord
33
+
34
+ void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
35
+ mix_seg_.Cut(sentence, words, hmm);
36
+ }
37
+ void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
38
+ mix_seg_.Cut(sentence, words, hmm);
39
+ }
40
+ void CutAll(const string& sentence, vector<string>& words) const {
41
+ full_seg_.Cut(sentence, words);
42
+ }
43
+ void CutAll(const string& sentence, vector<Word>& words) const {
44
+ full_seg_.Cut(sentence, words);
45
+ }
46
+ void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
47
+ query_seg_.Cut(sentence, words, hmm);
48
+ }
49
+ void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
50
+ query_seg_.Cut(sentence, words, hmm);
51
+ }
52
+ void CutHMM(const string& sentence, vector<string>& words) const {
53
+ hmm_seg_.Cut(sentence, words);
54
+ }
55
+ void CutHMM(const string& sentence, vector<Word>& words) const {
56
+ hmm_seg_.Cut(sentence, words);
57
+ }
58
+ void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
59
+ mp_seg_.Cut(sentence, words, max_word_len);
60
+ }
61
+ void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
62
+ mp_seg_.Cut(sentence, words, max_word_len);
63
+ }
64
+
65
+ void Tag(const string& sentence, vector<pair<string, string> >& words) const {
66
+ mix_seg_.Tag(sentence, words);
67
+ }
68
+ string LookupTag(const string &str) const {
69
+ return mix_seg_.LookupTag(str);
70
+ }
71
+ bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
72
+ return dict_trie_.InsertUserWord(word, tag);
73
+ }
74
+
75
+ void ResetSeparators(const string& s) {
76
+ //TODO
77
+ mp_seg_.ResetSeparators(s);
78
+ hmm_seg_.ResetSeparators(s);
79
+ mix_seg_.ResetSeparators(s);
80
+ full_seg_.ResetSeparators(s);
81
+ query_seg_.ResetSeparators(s);
82
+ }
83
+
84
+ const DictTrie* GetDictTrie() const {
85
+ return &dict_trie_;
86
+ }
87
+ const HMMModel* GetHMMModel() const {
88
+ return &model_;
89
+ }
90
+
91
+ private:
92
+ DictTrie dict_trie_;
93
+ HMMModel model_;
94
+
95
+ // They share the same dict trie and model
96
+ MPSegment mp_seg_;
97
+ HMMSegment hmm_seg_;
98
+ MixSegment mix_seg_;
99
+ FullSegment full_seg_;
100
+ QuerySegment query_seg_;
101
+
102
+ public:
103
+ KeywordExtractor extractor;
104
+ }; // class Jieba
105
+
106
+ } // namespace cppjieba
107
+
108
+ #endif // CPPJIEAB_JIEBA_H
@@ -0,0 +1,153 @@
1
+ #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
2
+ #define CPPJIEBA_KEYWORD_EXTRACTOR_H
3
+
4
+ #include <cmath>
5
+ #include <set>
6
+ #include "MixSegment.hpp"
7
+
8
+ namespace cppjieba {
9
+
10
+ using namespace limonp;
11
+ using namespace std;
12
+
13
+ /*utf8*/
14
+ class KeywordExtractor {
15
+ public:
16
+ struct Word {
17
+ string word;
18
+ vector<size_t> offsets;
19
+ double weight;
20
+ }; // struct Word
21
+
22
+ KeywordExtractor(const string& dictPath,
23
+ const string& hmmFilePath,
24
+ const string& idfPath,
25
+ const string& stopWordPath,
26
+ const string& userDict = "")
27
+ : segment_(dictPath, hmmFilePath, userDict) {
28
+ LoadIdfDict(idfPath);
29
+ LoadStopWordDict(stopWordPath);
30
+ }
31
+ KeywordExtractor(const DictTrie* dictTrie,
32
+ const HMMModel* model,
33
+ const string& idfPath,
34
+ const string& stopWordPath)
35
+ : segment_(dictTrie, model) {
36
+ LoadIdfDict(idfPath);
37
+ LoadStopWordDict(stopWordPath);
38
+ }
39
+ ~KeywordExtractor() {
40
+ }
41
+
42
+ void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
43
+ vector<Word> topWords;
44
+ Extract(sentence, topWords, topN);
45
+ for (size_t i = 0; i < topWords.size(); i++) {
46
+ keywords.push_back(topWords[i].word);
47
+ }
48
+ }
49
+
50
+ void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
51
+ vector<Word> topWords;
52
+ Extract(sentence, topWords, topN);
53
+ for (size_t i = 0; i < topWords.size(); i++) {
54
+ keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
55
+ }
56
+ }
57
+
58
+ void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
59
+ vector<string> words;
60
+ segment_.Cut(sentence, words);
61
+
62
+ map<string, Word> wordmap;
63
+ size_t offset = 0;
64
+ for (size_t i = 0; i < words.size(); ++i) {
65
+ size_t t = offset;
66
+ offset += words[i].size();
67
+ if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
68
+ continue;
69
+ }
70
+ wordmap[words[i]].offsets.push_back(t);
71
+ wordmap[words[i]].weight += 1.0;
72
+ }
73
+ if (offset != sentence.size()) {
74
+ XLOG(ERROR) << "words illegal";
75
+ return;
76
+ }
77
+
78
+ keywords.clear();
79
+ keywords.reserve(wordmap.size());
80
+ for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
81
+ unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
82
+ if (cit != idfMap_.end()) {
83
+ itr->second.weight *= cit->second;
84
+ } else {
85
+ itr->second.weight *= idfAverage_;
86
+ }
87
+ itr->second.word = itr->first;
88
+ keywords.push_back(itr->second);
89
+ }
90
+ topN = min(topN, keywords.size());
91
+ partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
92
+ keywords.resize(topN);
93
+ }
94
+ private:
95
+ void LoadIdfDict(const string& idfPath) {
96
+ ifstream ifs(idfPath.c_str());
97
+ XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
98
+ string line ;
99
+ vector<string> buf;
100
+ double idf = 0.0;
101
+ double idfSum = 0.0;
102
+ size_t lineno = 0;
103
+ for (; getline(ifs, line); lineno++) {
104
+ buf.clear();
105
+ if (line.empty()) {
106
+ XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
107
+ continue;
108
+ }
109
+ Split(line, buf, " ");
110
+ if (buf.size() != 2) {
111
+ XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
112
+ continue;
113
+ }
114
+ idf = atof(buf[1].c_str());
115
+ idfMap_[buf[0]] = idf;
116
+ idfSum += idf;
117
+
118
+ }
119
+
120
+ assert(lineno);
121
+ idfAverage_ = idfSum / lineno;
122
+ assert(idfAverage_ > 0.0);
123
+ }
124
+ void LoadStopWordDict(const string& filePath) {
125
+ ifstream ifs(filePath.c_str());
126
+ XCHECK(ifs.is_open()) << "open " << filePath << " failed";
127
+ string line ;
128
+ while (getline(ifs, line)) {
129
+ stopWords_.insert(line);
130
+ }
131
+ assert(stopWords_.size());
132
+ }
133
+
134
+ static bool Compare(const Word& lhs, const Word& rhs) {
135
+ return lhs.weight > rhs.weight;
136
+ }
137
+
138
+ MixSegment segment_;
139
+ unordered_map<string, double> idfMap_;
140
+ double idfAverage_;
141
+
142
+ unordered_set<string> stopWords_;
143
+ }; // class KeywordExtractor
144
+
145
+ inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
146
+ return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
147
+ }
148
+
149
+ } // namespace cppjieba
150
+
151
+ #endif
152
+
153
+