cppjieba_rb 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (130) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +3 -0
  3. data/README.md +1 -1
  4. data/Rakefile +2 -2
  5. data/cppjieba_rb.gemspec +4 -4
  6. data/lib/cppjieba_rb/version.rb +1 -1
  7. metadata +17 -135
  8. data/ext/cppjieba/.gitignore +0 -17
  9. data/ext/cppjieba/.travis.yml +0 -21
  10. data/ext/cppjieba/CMakeLists.txt +0 -28
  11. data/ext/cppjieba/ChangeLog.md +0 -236
  12. data/ext/cppjieba/README.md +0 -292
  13. data/ext/cppjieba/README_EN.md +0 -113
  14. data/ext/cppjieba/appveyor.yml +0 -32
  15. data/ext/cppjieba/deps/CMakeLists.txt +0 -1
  16. data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
  17. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
  28. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
  41. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
  44. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  45. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
  46. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
  47. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
  48. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
  49. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
  50. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
  51. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
  52. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
  53. data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
  54. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
  55. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
  56. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
  57. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
  58. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
  59. data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
  60. data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
  61. data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
  62. data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
  63. data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
  64. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
  65. data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
  66. data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
  67. data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
  68. data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
  69. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
  70. data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
  71. data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
  72. data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
  73. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
  74. data/ext/cppjieba/dict/README.md +0 -31
  75. data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
  76. data/ext/cppjieba/dict/idf.utf8 +0 -258826
  77. data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
  78. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
  79. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
  80. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
  81. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
  82. data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
  83. data/ext/cppjieba/dict/user.dict.utf8 +0 -4
  84. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
  85. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
  86. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
  87. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
  88. data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
  89. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
  90. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
  91. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
  92. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
  93. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
  94. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
  95. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
  96. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
  97. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
  98. data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
  99. data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
  100. data/ext/cppjieba/test/CMakeLists.txt +0 -5
  101. data/ext/cppjieba/test/demo.cpp +0 -80
  102. data/ext/cppjieba/test/load_test.cpp +0 -54
  103. data/ext/cppjieba/test/testdata/curl.res +0 -1
  104. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
  105. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
  106. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
  107. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
  108. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
  109. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
  110. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
  111. data/ext/cppjieba/test/testdata/load_test.urls +0 -2
  112. data/ext/cppjieba/test/testdata/review.100 +0 -100
  113. data/ext/cppjieba/test/testdata/review.100.res +0 -200
  114. data/ext/cppjieba/test/testdata/server.conf +0 -19
  115. data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
  116. data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
  117. data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
  118. data/ext/cppjieba/test/testdata/userdict.english +0 -2
  119. data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
  120. data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
  121. data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
  122. data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
  123. data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
  124. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
  125. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
  126. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
  127. data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
  128. data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
  129. data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
  130. data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
@@ -1,190 +0,0 @@
1
- #ifndef CPPJIBEA_HMMSEGMENT_H
2
- #define CPPJIBEA_HMMSEGMENT_H
3
-
4
- #include <iostream>
5
- #include <fstream>
6
- #include <memory.h>
7
- #include <cassert>
8
- #include "HMMModel.hpp"
9
- #include "SegmentBase.hpp"
10
-
11
- namespace cppjieba {
12
- class HMMSegment: public SegmentBase {
13
- public:
14
- HMMSegment(const string& filePath)
15
- : model_(new HMMModel(filePath)), isNeedDestroy_(true) {
16
- }
17
- HMMSegment(const HMMModel* model)
18
- : model_(model), isNeedDestroy_(false) {
19
- }
20
- ~HMMSegment() {
21
- if (isNeedDestroy_) {
22
- delete model_;
23
- }
24
- }
25
-
26
- void Cut(const string& sentence,
27
- vector<string>& words) const {
28
- vector<Word> tmp;
29
- Cut(sentence, tmp);
30
- GetStringsFromWords(tmp, words);
31
- }
32
- void Cut(const string& sentence,
33
- vector<Word>& words) const {
34
- PreFilter pre_filter(symbols_, sentence);
35
- PreFilter::Range range;
36
- vector<WordRange> wrs;
37
- wrs.reserve(sentence.size()/2);
38
- while (pre_filter.HasNext()) {
39
- range = pre_filter.Next();
40
- Cut(range.begin, range.end, wrs);
41
- }
42
- words.clear();
43
- words.reserve(wrs.size());
44
- GetWordsFromWordRanges(sentence, wrs, words);
45
- }
46
- void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
47
- RuneStrArray::const_iterator left = begin;
48
- RuneStrArray::const_iterator right = begin;
49
- while (right != end) {
50
- if (right->rune < 0x80) {
51
- if (left != right) {
52
- InternalCut(left, right, res);
53
- }
54
- left = right;
55
- do {
56
- right = SequentialLetterRule(left, end);
57
- if (right != left) {
58
- break;
59
- }
60
- right = NumbersRule(left, end);
61
- if (right != left) {
62
- break;
63
- }
64
- right ++;
65
- } while (false);
66
- WordRange wr(left, right - 1);
67
- res.push_back(wr);
68
- left = right;
69
- } else {
70
- right++;
71
- }
72
- }
73
- if (left != right) {
74
- InternalCut(left, right, res);
75
- }
76
- }
77
- private:
78
- // sequential letters rule
79
- RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
80
- Rune x = begin->rune;
81
- if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
82
- begin ++;
83
- } else {
84
- return begin;
85
- }
86
- while (begin != end) {
87
- x = begin->rune;
88
- if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
89
- begin ++;
90
- } else {
91
- break;
92
- }
93
- }
94
- return begin;
95
- }
96
- //
97
- RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
98
- Rune x = begin->rune;
99
- if ('0' <= x && x <= '9') {
100
- begin ++;
101
- } else {
102
- return begin;
103
- }
104
- while (begin != end) {
105
- x = begin->rune;
106
- if ( ('0' <= x && x <= '9') || x == '.') {
107
- begin++;
108
- } else {
109
- break;
110
- }
111
- }
112
- return begin;
113
- }
114
- void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
115
- vector<size_t> status;
116
- Viterbi(begin, end, status);
117
-
118
- RuneStrArray::const_iterator left = begin;
119
- RuneStrArray::const_iterator right;
120
- for (size_t i = 0; i < status.size(); i++) {
121
- if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
122
- right = begin + i + 1;
123
- WordRange wr(left, right - 1);
124
- res.push_back(wr);
125
- left = right;
126
- }
127
- }
128
- }
129
-
130
- void Viterbi(RuneStrArray::const_iterator begin,
131
- RuneStrArray::const_iterator end,
132
- vector<size_t>& status) const {
133
- size_t Y = HMMModel::STATUS_SUM;
134
- size_t X = end - begin;
135
-
136
- size_t XYSize = X * Y;
137
- size_t now, old, stat;
138
- double tmp, endE, endS;
139
-
140
- vector<int> path(XYSize);
141
- vector<double> weight(XYSize);
142
-
143
- //start
144
- for (size_t y = 0; y < Y; y++) {
145
- weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
146
- path[0 + y * X] = -1;
147
- }
148
-
149
- double emitProb;
150
-
151
- for (size_t x = 1; x < X; x++) {
152
- for (size_t y = 0; y < Y; y++) {
153
- now = x + y*X;
154
- weight[now] = MIN_DOUBLE;
155
- path[now] = HMMModel::E; // warning
156
- emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
157
- for (size_t preY = 0; preY < Y; preY++) {
158
- old = x - 1 + preY * X;
159
- tmp = weight[old] + model_->transProb[preY][y] + emitProb;
160
- if (tmp > weight[now]) {
161
- weight[now] = tmp;
162
- path[now] = preY;
163
- }
164
- }
165
- }
166
- }
167
-
168
- endE = weight[X-1+HMMModel::E*X];
169
- endS = weight[X-1+HMMModel::S*X];
170
- stat = 0;
171
- if (endE >= endS) {
172
- stat = HMMModel::E;
173
- } else {
174
- stat = HMMModel::S;
175
- }
176
-
177
- status.resize(X);
178
- for (int x = X -1 ; x >= 0; x--) {
179
- status[x] = stat;
180
- stat = path[x + stat*X];
181
- }
182
- }
183
-
184
- const HMMModel* model_;
185
- bool isNeedDestroy_;
186
- }; // class HMMSegment
187
-
188
- } // namespace cppjieba
189
-
190
- #endif
@@ -1,130 +0,0 @@
1
- #ifndef CPPJIEAB_JIEBA_H
2
- #define CPPJIEAB_JIEBA_H
3
-
4
- #include "QuerySegment.hpp"
5
- #include "KeywordExtractor.hpp"
6
-
7
- namespace cppjieba {
8
-
9
- class Jieba {
10
- public:
11
- Jieba(const string& dict_path,
12
- const string& model_path,
13
- const string& user_dict_path,
14
- const string& idfPath,
15
- const string& stopWordPath)
16
- : dict_trie_(dict_path, user_dict_path),
17
- model_(model_path),
18
- mp_seg_(&dict_trie_),
19
- hmm_seg_(&model_),
20
- mix_seg_(&dict_trie_, &model_),
21
- full_seg_(&dict_trie_),
22
- query_seg_(&dict_trie_, &model_),
23
- extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
24
- }
25
- ~Jieba() {
26
- }
27
-
28
- struct LocWord {
29
- string word;
30
- size_t begin;
31
- size_t end;
32
- }; // struct LocWord
33
-
34
- void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
35
- mix_seg_.Cut(sentence, words, hmm);
36
- }
37
- void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
38
- mix_seg_.Cut(sentence, words, hmm);
39
- }
40
- void CutAll(const string& sentence, vector<string>& words) const {
41
- full_seg_.Cut(sentence, words);
42
- }
43
- void CutAll(const string& sentence, vector<Word>& words) const {
44
- full_seg_.Cut(sentence, words);
45
- }
46
- void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
47
- query_seg_.Cut(sentence, words, hmm);
48
- }
49
- void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
50
- query_seg_.Cut(sentence, words, hmm);
51
- }
52
- void CutHMM(const string& sentence, vector<string>& words) const {
53
- hmm_seg_.Cut(sentence, words);
54
- }
55
- void CutHMM(const string& sentence, vector<Word>& words) const {
56
- hmm_seg_.Cut(sentence, words);
57
- }
58
- void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
59
- mp_seg_.Cut(sentence, words, max_word_len);
60
- }
61
- void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
62
- mp_seg_.Cut(sentence, words, max_word_len);
63
- }
64
-
65
- void Tag(const string& sentence, vector<pair<string, string> >& words) const {
66
- mix_seg_.Tag(sentence, words);
67
- }
68
- string LookupTag(const string &str) const {
69
- return mix_seg_.LookupTag(str);
70
- }
71
- bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
72
- return dict_trie_.InsertUserWord(word, tag);
73
- }
74
-
75
- bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
76
- return dict_trie_.InsertUserWord(word,freq, tag);
77
- }
78
-
79
- bool Find(const string& word)
80
- {
81
- return dict_trie_.Find(word);
82
- }
83
-
84
- void ResetSeparators(const string& s) {
85
- //TODO
86
- mp_seg_.ResetSeparators(s);
87
- hmm_seg_.ResetSeparators(s);
88
- mix_seg_.ResetSeparators(s);
89
- full_seg_.ResetSeparators(s);
90
- query_seg_.ResetSeparators(s);
91
- }
92
-
93
- const DictTrie* GetDictTrie() const {
94
- return &dict_trie_;
95
- }
96
-
97
- const HMMModel* GetHMMModel() const {
98
- return &model_;
99
- }
100
-
101
- void LoadUserDict(const vector<string>& buf) {
102
- dict_trie_.LoadUserDict(buf);
103
- }
104
-
105
- void LoadUserDict(const set<string>& buf) {
106
- dict_trie_.LoadUserDict(buf);
107
- }
108
-
109
- void LoadUserDict(const string& path) {
110
- dict_trie_.LoadUserDict(path);
111
- }
112
-
113
- private:
114
- DictTrie dict_trie_;
115
- HMMModel model_;
116
-
117
- // They share the same dict trie and model
118
- MPSegment mp_seg_;
119
- HMMSegment hmm_seg_;
120
- MixSegment mix_seg_;
121
- FullSegment full_seg_;
122
- QuerySegment query_seg_;
123
-
124
- public:
125
- KeywordExtractor extractor;
126
- }; // class Jieba
127
-
128
- } // namespace cppjieba
129
-
130
- #endif // CPPJIEAB_JIEBA_H
@@ -1,153 +0,0 @@
1
- #ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
2
- #define CPPJIEBA_KEYWORD_EXTRACTOR_H
3
-
4
- #include <cmath>
5
- #include <set>
6
- #include "MixSegment.hpp"
7
-
8
- namespace cppjieba {
9
-
10
- using namespace limonp;
11
- using namespace std;
12
-
13
- /*utf8*/
14
- class KeywordExtractor {
15
- public:
16
- struct Word {
17
- string word;
18
- vector<size_t> offsets;
19
- double weight;
20
- }; // struct Word
21
-
22
- KeywordExtractor(const string& dictPath,
23
- const string& hmmFilePath,
24
- const string& idfPath,
25
- const string& stopWordPath,
26
- const string& userDict = "")
27
- : segment_(dictPath, hmmFilePath, userDict) {
28
- LoadIdfDict(idfPath);
29
- LoadStopWordDict(stopWordPath);
30
- }
31
- KeywordExtractor(const DictTrie* dictTrie,
32
- const HMMModel* model,
33
- const string& idfPath,
34
- const string& stopWordPath)
35
- : segment_(dictTrie, model) {
36
- LoadIdfDict(idfPath);
37
- LoadStopWordDict(stopWordPath);
38
- }
39
- ~KeywordExtractor() {
40
- }
41
-
42
- void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
43
- vector<Word> topWords;
44
- Extract(sentence, topWords, topN);
45
- for (size_t i = 0; i < topWords.size(); i++) {
46
- keywords.push_back(topWords[i].word);
47
- }
48
- }
49
-
50
- void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
51
- vector<Word> topWords;
52
- Extract(sentence, topWords, topN);
53
- for (size_t i = 0; i < topWords.size(); i++) {
54
- keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
55
- }
56
- }
57
-
58
- void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
59
- vector<string> words;
60
- segment_.Cut(sentence, words);
61
-
62
- map<string, Word> wordmap;
63
- size_t offset = 0;
64
- for (size_t i = 0; i < words.size(); ++i) {
65
- size_t t = offset;
66
- offset += words[i].size();
67
- if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
68
- continue;
69
- }
70
- wordmap[words[i]].offsets.push_back(t);
71
- wordmap[words[i]].weight += 1.0;
72
- }
73
- if (offset != sentence.size()) {
74
- XLOG(ERROR) << "words illegal";
75
- return;
76
- }
77
-
78
- keywords.clear();
79
- keywords.reserve(wordmap.size());
80
- for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
81
- unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
82
- if (cit != idfMap_.end()) {
83
- itr->second.weight *= cit->second;
84
- } else {
85
- itr->second.weight *= idfAverage_;
86
- }
87
- itr->second.word = itr->first;
88
- keywords.push_back(itr->second);
89
- }
90
- topN = min(topN, keywords.size());
91
- partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
92
- keywords.resize(topN);
93
- }
94
- private:
95
- void LoadIdfDict(const string& idfPath) {
96
- ifstream ifs(idfPath.c_str());
97
- XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
98
- string line ;
99
- vector<string> buf;
100
- double idf = 0.0;
101
- double idfSum = 0.0;
102
- size_t lineno = 0;
103
- for (; getline(ifs, line); lineno++) {
104
- buf.clear();
105
- if (line.empty()) {
106
- XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
107
- continue;
108
- }
109
- Split(line, buf, " ");
110
- if (buf.size() != 2) {
111
- XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
112
- continue;
113
- }
114
- idf = atof(buf[1].c_str());
115
- idfMap_[buf[0]] = idf;
116
- idfSum += idf;
117
-
118
- }
119
-
120
- assert(lineno);
121
- idfAverage_ = idfSum / lineno;
122
- assert(idfAverage_ > 0.0);
123
- }
124
- void LoadStopWordDict(const string& filePath) {
125
- ifstream ifs(filePath.c_str());
126
- XCHECK(ifs.is_open()) << "open " << filePath << " failed";
127
- string line ;
128
- while (getline(ifs, line)) {
129
- stopWords_.insert(line);
130
- }
131
- assert(stopWords_.size());
132
- }
133
-
134
- static bool Compare(const Word& lhs, const Word& rhs) {
135
- return lhs.weight > rhs.weight;
136
- }
137
-
138
- MixSegment segment_;
139
- unordered_map<string, double> idfMap_;
140
- double idfAverage_;
141
-
142
- unordered_set<string> stopWords_;
143
- }; // class KeywordExtractor
144
-
145
- inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
146
- return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
147
- }
148
-
149
- } // namespace cppjieba
150
-
151
- #endif
152
-
153
-