cppjieba_rb 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +26 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +81 -0
- data/Rakefile +20 -0
- data/cppjieba_rb.gemspec +50 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/cppjieba/test/CMakeLists.txt +5 -0
- data/ext/cppjieba/test/demo.cpp +80 -0
- data/ext/cppjieba/test/load_test.cpp +54 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +19 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- data/ext/cppjieba/test/testdata/userdict.english +2 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
- data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
- data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
- data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
- data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
- data/ext/cppjieba_rb/extconf.rb +26 -0
- data/ext/cppjieba_rb/internal.cc +148 -0
- data/lib/cppjieba_rb/segment.rb +20 -0
- data/lib/cppjieba_rb/version.rb +3 -0
- data/lib/cppjieba_rb.rb +34 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +24 -0
- data/test/test_tagging.rb +19 -0
- metadata +244 -0
@@ -0,0 +1,90 @@
|
|
1
|
+
#ifndef CPPJIEBA_QUERYSEGMENT_H
|
2
|
+
#define CPPJIEBA_QUERYSEGMENT_H
|
3
|
+
|
4
|
+
#include <algorithm>
|
5
|
+
#include <set>
|
6
|
+
#include <cassert>
|
7
|
+
#include "limonp/Logging.hpp"
|
8
|
+
#include "DictTrie.hpp"
|
9
|
+
#include "SegmentBase.hpp"
|
10
|
+
#include "FullSegment.hpp"
|
11
|
+
#include "MixSegment.hpp"
|
12
|
+
#include "Unicode.hpp"
|
13
|
+
#include "DictTrie.hpp"
|
14
|
+
|
15
|
+
namespace cppjieba {
|
16
|
+
class QuerySegment: public SegmentBase {
|
17
|
+
public:
|
18
|
+
QuerySegment(const string& dict, const string& model, const string& userDict = "")
|
19
|
+
: mixSeg_(dict, model, userDict),
|
20
|
+
trie_(mixSeg_.GetDictTrie()) {
|
21
|
+
}
|
22
|
+
QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
|
23
|
+
: mixSeg_(dictTrie, model), trie_(dictTrie) {
|
24
|
+
}
|
25
|
+
~QuerySegment() {
|
26
|
+
}
|
27
|
+
|
28
|
+
void Cut(const string& sentence, vector<string>& words) const {
|
29
|
+
Cut(sentence, words, true);
|
30
|
+
}
|
31
|
+
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
|
32
|
+
vector<Word> tmp;
|
33
|
+
Cut(sentence, tmp, hmm);
|
34
|
+
GetStringsFromWords(tmp, words);
|
35
|
+
}
|
36
|
+
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
|
37
|
+
PreFilter pre_filter(symbols_, sentence);
|
38
|
+
PreFilter::Range range;
|
39
|
+
vector<WordRange> wrs;
|
40
|
+
wrs.reserve(sentence.size()/2);
|
41
|
+
while (pre_filter.HasNext()) {
|
42
|
+
range = pre_filter.Next();
|
43
|
+
Cut(range.begin, range.end, wrs, hmm);
|
44
|
+
}
|
45
|
+
words.clear();
|
46
|
+
words.reserve(wrs.size());
|
47
|
+
GetWordsFromWordRanges(sentence, wrs, words);
|
48
|
+
}
|
49
|
+
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
|
50
|
+
//use mix Cut first
|
51
|
+
vector<WordRange> mixRes;
|
52
|
+
mixSeg_.Cut(begin, end, mixRes, hmm);
|
53
|
+
|
54
|
+
vector<WordRange> fullRes;
|
55
|
+
for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
|
56
|
+
if (mixResItr->Length() > 2) {
|
57
|
+
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
|
58
|
+
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
|
59
|
+
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
|
60
|
+
res.push_back(wr);
|
61
|
+
}
|
62
|
+
}
|
63
|
+
}
|
64
|
+
if (mixResItr->Length() > 3) {
|
65
|
+
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
|
66
|
+
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
|
67
|
+
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
|
68
|
+
res.push_back(wr);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
}
|
72
|
+
res.push_back(*mixResItr);
|
73
|
+
}
|
74
|
+
}
|
75
|
+
private:
|
76
|
+
bool IsAllAscii(const Unicode& s) const {
|
77
|
+
for(size_t i = 0; i < s.size(); i++) {
|
78
|
+
if (s[i] >= 0x80) {
|
79
|
+
return false;
|
80
|
+
}
|
81
|
+
}
|
82
|
+
return true;
|
83
|
+
}
|
84
|
+
MixSegment mixSeg_;
|
85
|
+
const DictTrie* trie_;
|
86
|
+
}; // QuerySegment
|
87
|
+
|
88
|
+
} // namespace cppjieba
|
89
|
+
|
90
|
+
#endif
|
@@ -0,0 +1,46 @@
|
|
1
|
+
#ifndef CPPJIEBA_SEGMENTBASE_H
|
2
|
+
#define CPPJIEBA_SEGMENTBASE_H
|
3
|
+
|
4
|
+
#include "limonp/Logging.hpp"
|
5
|
+
#include "PreFilter.hpp"
|
6
|
+
#include <cassert>
|
7
|
+
|
8
|
+
|
9
|
+
namespace cppjieba {
|
10
|
+
|
11
|
+
const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
|
12
|
+
|
13
|
+
using namespace limonp;
|
14
|
+
|
15
|
+
class SegmentBase {
|
16
|
+
public:
|
17
|
+
SegmentBase() {
|
18
|
+
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
|
19
|
+
}
|
20
|
+
virtual ~SegmentBase() {
|
21
|
+
}
|
22
|
+
|
23
|
+
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
|
24
|
+
|
25
|
+
bool ResetSeparators(const string& s) {
|
26
|
+
symbols_.clear();
|
27
|
+
RuneStrArray runes;
|
28
|
+
if (!DecodeRunesInString(s, runes)) {
|
29
|
+
XLOG(ERROR) << "decode " << s << " failed";
|
30
|
+
return false;
|
31
|
+
}
|
32
|
+
for (size_t i = 0; i < runes.size(); i++) {
|
33
|
+
if (!symbols_.insert(runes[i].rune).second) {
|
34
|
+
XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
|
35
|
+
return false;
|
36
|
+
}
|
37
|
+
}
|
38
|
+
return true;
|
39
|
+
}
|
40
|
+
protected:
|
41
|
+
unordered_set<Rune> symbols_;
|
42
|
+
}; // class SegmentBase
|
43
|
+
|
44
|
+
} // cppjieba
|
45
|
+
|
46
|
+
#endif
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#ifndef CPPJIEBA_SEGMENTTAGGED_H
|
2
|
+
#define CPPJIEBA_SEGMENTTAGGED_H
|
3
|
+
|
4
|
+
#include "SegmentBase.hpp"
|
5
|
+
|
6
|
+
namespace cppjieba {
|
7
|
+
|
8
|
+
class SegmentTagged : public SegmentBase{
|
9
|
+
public:
|
10
|
+
SegmentTagged() {
|
11
|
+
}
|
12
|
+
virtual ~SegmentTagged() {
|
13
|
+
}
|
14
|
+
|
15
|
+
virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
|
16
|
+
|
17
|
+
virtual const DictTrie* GetDictTrie() const = 0;
|
18
|
+
|
19
|
+
}; // class SegmentTagged
|
20
|
+
|
21
|
+
} // cppjieba
|
22
|
+
|
23
|
+
#endif
|
@@ -0,0 +1,190 @@
|
|
1
|
+
#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
|
2
|
+
#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
|
3
|
+
|
4
|
+
#include <cmath>
|
5
|
+
#include "Jieba.hpp"
|
6
|
+
|
7
|
+
namespace cppjieba {
|
8
|
+
using namespace limonp;
|
9
|
+
using namespace std;
|
10
|
+
|
11
|
+
class TextRankExtractor {
|
12
|
+
public:
|
13
|
+
typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
|
14
|
+
private:
|
15
|
+
typedef std::map<string,Word> WordMap;
|
16
|
+
|
17
|
+
class WordGraph{
|
18
|
+
private:
|
19
|
+
typedef double Score;
|
20
|
+
typedef string Node;
|
21
|
+
typedef std::set<Node> NodeSet;
|
22
|
+
|
23
|
+
typedef std::map<Node,double> Edges;
|
24
|
+
typedef std::map<Node,Edges> Graph;
|
25
|
+
//typedef std::unordered_map<Node,double> Edges;
|
26
|
+
//typedef std::unordered_map<Node,Edges> Graph;
|
27
|
+
|
28
|
+
double d;
|
29
|
+
Graph graph;
|
30
|
+
NodeSet nodeSet;
|
31
|
+
public:
|
32
|
+
WordGraph(): d(0.85) {};
|
33
|
+
WordGraph(double in_d): d(in_d) {};
|
34
|
+
|
35
|
+
void addEdge(Node start,Node end,double weight){
|
36
|
+
Edges temp;
|
37
|
+
Edges::iterator gotEdges;
|
38
|
+
nodeSet.insert(start);
|
39
|
+
nodeSet.insert(end);
|
40
|
+
graph[start][end]+=weight;
|
41
|
+
graph[end][start]+=weight;
|
42
|
+
}
|
43
|
+
|
44
|
+
void rank(WordMap &ws,size_t rankTime=10){
|
45
|
+
WordMap outSum;
|
46
|
+
Score wsdef, min_rank, max_rank;
|
47
|
+
|
48
|
+
if( graph.size() == 0)
|
49
|
+
return;
|
50
|
+
|
51
|
+
wsdef = 1.0 / graph.size();
|
52
|
+
|
53
|
+
for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
|
54
|
+
// edges->first start节点;edge->first end节点;edge->second 权重
|
55
|
+
ws[edges->first].word=edges->first;
|
56
|
+
ws[edges->first].weight=wsdef;
|
57
|
+
outSum[edges->first].weight=0;
|
58
|
+
for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
|
59
|
+
outSum[edges->first].weight+=edge->second;
|
60
|
+
}
|
61
|
+
}
|
62
|
+
//sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
|
63
|
+
for( size_t i=0; i<rankTime; i++ ){
|
64
|
+
for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
|
65
|
+
double s = 0;
|
66
|
+
for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
|
67
|
+
// edge->first end节点;edge->second 权重
|
68
|
+
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
|
69
|
+
ws[*node].weight = (1 - d) + d * s;
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
min_rank=max_rank=ws.begin()->second.weight;
|
74
|
+
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
|
75
|
+
if( i->second.weight < min_rank ){
|
76
|
+
min_rank = i->second.weight;
|
77
|
+
}
|
78
|
+
if( i->second.weight > max_rank ){
|
79
|
+
max_rank = i->second.weight;
|
80
|
+
}
|
81
|
+
}
|
82
|
+
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
|
83
|
+
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
|
84
|
+
}
|
85
|
+
}
|
86
|
+
};
|
87
|
+
|
88
|
+
public:
|
89
|
+
TextRankExtractor(const string& dictPath,
|
90
|
+
const string& hmmFilePath,
|
91
|
+
const string& stopWordPath,
|
92
|
+
const string& userDict = "")
|
93
|
+
: segment_(dictPath, hmmFilePath, userDict) {
|
94
|
+
LoadStopWordDict(stopWordPath);
|
95
|
+
}
|
96
|
+
TextRankExtractor(const DictTrie* dictTrie,
|
97
|
+
const HMMModel* model,
|
98
|
+
const string& stopWordPath)
|
99
|
+
: segment_(dictTrie, model) {
|
100
|
+
LoadStopWordDict(stopWordPath);
|
101
|
+
}
|
102
|
+
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
103
|
+
LoadStopWordDict(stopWordPath);
|
104
|
+
}
|
105
|
+
~TextRankExtractor() {
|
106
|
+
}
|
107
|
+
|
108
|
+
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
109
|
+
vector<Word> topWords;
|
110
|
+
Extract(sentence, topWords, topN);
|
111
|
+
for (size_t i = 0; i < topWords.size(); i++) {
|
112
|
+
keywords.push_back(topWords[i].word);
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
117
|
+
vector<Word> topWords;
|
118
|
+
Extract(sentence, topWords, topN);
|
119
|
+
for (size_t i = 0; i < topWords.size(); i++) {
|
120
|
+
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
121
|
+
}
|
122
|
+
}
|
123
|
+
|
124
|
+
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
|
125
|
+
vector<string> words;
|
126
|
+
segment_.Cut(sentence, words);
|
127
|
+
|
128
|
+
TextRankExtractor::WordGraph graph;
|
129
|
+
WordMap wordmap;
|
130
|
+
size_t offset = 0;
|
131
|
+
|
132
|
+
for(size_t i=0; i < words.size(); i++){
|
133
|
+
size_t t = offset;
|
134
|
+
offset += words[i].size();
|
135
|
+
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
136
|
+
continue;
|
137
|
+
}
|
138
|
+
for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
|
139
|
+
if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
|
140
|
+
skip++;
|
141
|
+
continue;
|
142
|
+
}
|
143
|
+
graph.addEdge(words[i],words[j],1);
|
144
|
+
}
|
145
|
+
wordmap[words[i]].offsets.push_back(t);
|
146
|
+
}
|
147
|
+
if (offset != sentence.size()) {
|
148
|
+
XLOG(ERROR) << "words illegal";
|
149
|
+
return;
|
150
|
+
}
|
151
|
+
|
152
|
+
graph.rank(wordmap,rankTime);
|
153
|
+
|
154
|
+
keywords.clear();
|
155
|
+
keywords.reserve(wordmap.size());
|
156
|
+
for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
157
|
+
keywords.push_back(itr->second);
|
158
|
+
}
|
159
|
+
|
160
|
+
topN = min(topN, keywords.size());
|
161
|
+
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
162
|
+
keywords.resize(topN);
|
163
|
+
}
|
164
|
+
private:
|
165
|
+
void LoadStopWordDict(const string& filePath) {
|
166
|
+
ifstream ifs(filePath.c_str());
|
167
|
+
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
168
|
+
string line ;
|
169
|
+
while (getline(ifs, line)) {
|
170
|
+
stopWords_.insert(line);
|
171
|
+
}
|
172
|
+
assert(stopWords_.size());
|
173
|
+
}
|
174
|
+
|
175
|
+
static bool Compare(const Word &x,const Word &y){
|
176
|
+
return x.weight > y.weight;
|
177
|
+
}
|
178
|
+
|
179
|
+
MixSegment segment_;
|
180
|
+
unordered_set<string> stopWords_;
|
181
|
+
}; // class TextRankExtractor
|
182
|
+
|
183
|
+
// Streams a keyword as {"word": "...", "offset": <offsets>, "weight": <w>}.
// The offsets vector is written with whatever operator<< is in scope for
// vector<size_t>.
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
  os << "{\"word\": \"" << word.word;
  os << "\", \"offset\": " << word.offsets;
  os << ", \"weight\": " << word.weight << "}";
  return os;
}
|
186
|
+
} // namespace cppjieba
|
187
|
+
|
188
|
+
#endif
|
189
|
+
|
190
|
+
|
@@ -0,0 +1,174 @@
|
|
1
|
+
#ifndef CPPJIEBA_TRIE_HPP
|
2
|
+
#define CPPJIEBA_TRIE_HPP
|
3
|
+
|
4
|
+
#include <vector>
|
5
|
+
#include <queue>
|
6
|
+
#include "limonp/StdExtension.hpp"
|
7
|
+
#include "Unicode.hpp"
|
8
|
+
|
9
|
+
namespace cppjieba {
|
10
|
+
|
11
|
+
using namespace std;
|
12
|
+
|
13
|
+
const size_t MAX_WORD_LENGTH = 512;
|
14
|
+
|
15
|
+
struct DictUnit {
|
16
|
+
Unicode word;
|
17
|
+
double weight;
|
18
|
+
string tag;
|
19
|
+
}; // struct DictUnit
|
20
|
+
|
21
|
+
// for debugging
|
22
|
+
// inline ostream & operator << (ostream& os, const DictUnit& unit) {
|
23
|
+
// string s;
|
24
|
+
// s << unit.word;
|
25
|
+
// return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
|
26
|
+
// }
|
27
|
+
|
28
|
+
struct Dag {
|
29
|
+
RuneStr runestr;
|
30
|
+
// [offset, nexts.first]
|
31
|
+
limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
|
32
|
+
const DictUnit * pInfo;
|
33
|
+
double weight;
|
34
|
+
size_t nextPos; // TODO
|
35
|
+
Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
|
36
|
+
}
|
37
|
+
}; // struct Dag
|
38
|
+
|
39
|
+
typedef Rune TrieKey;
|
40
|
+
|
41
|
+
class TrieNode {
|
42
|
+
public :
|
43
|
+
TrieNode(): next(NULL), ptValue(NULL) {
|
44
|
+
}
|
45
|
+
public:
|
46
|
+
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
47
|
+
NextMap *next;
|
48
|
+
const DictUnit *ptValue;
|
49
|
+
};
|
50
|
+
|
51
|
+
class Trie {
|
52
|
+
public:
|
53
|
+
Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
|
54
|
+
: root_(new TrieNode) {
|
55
|
+
CreateTrie(keys, valuePointers);
|
56
|
+
}
|
57
|
+
~Trie() {
|
58
|
+
DeleteNode(root_);
|
59
|
+
}
|
60
|
+
|
61
|
+
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
62
|
+
if (begin == end) {
|
63
|
+
return NULL;
|
64
|
+
}
|
65
|
+
|
66
|
+
const TrieNode* ptNode = root_;
|
67
|
+
TrieNode::NextMap::const_iterator citer;
|
68
|
+
for (RuneStrArray::const_iterator it = begin; it != end; it++) {
|
69
|
+
if (NULL == ptNode->next) {
|
70
|
+
return NULL;
|
71
|
+
}
|
72
|
+
citer = ptNode->next->find(it->rune);
|
73
|
+
if (ptNode->next->end() == citer) {
|
74
|
+
return NULL;
|
75
|
+
}
|
76
|
+
ptNode = citer->second;
|
77
|
+
}
|
78
|
+
return ptNode->ptValue;
|
79
|
+
}
|
80
|
+
|
81
|
+
void Find(RuneStrArray::const_iterator begin,
|
82
|
+
RuneStrArray::const_iterator end,
|
83
|
+
vector<struct Dag>&res,
|
84
|
+
size_t max_word_len = MAX_WORD_LENGTH) const {
|
85
|
+
assert(root_ != NULL);
|
86
|
+
res.resize(end - begin);
|
87
|
+
|
88
|
+
const TrieNode *ptNode = NULL;
|
89
|
+
TrieNode::NextMap::const_iterator citer;
|
90
|
+
for (size_t i = 0; i < size_t(end - begin); i++) {
|
91
|
+
res[i].runestr = *(begin + i);
|
92
|
+
|
93
|
+
if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
|
94
|
+
ptNode = citer->second;
|
95
|
+
} else {
|
96
|
+
ptNode = NULL;
|
97
|
+
}
|
98
|
+
if (ptNode != NULL) {
|
99
|
+
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
100
|
+
} else {
|
101
|
+
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
|
102
|
+
}
|
103
|
+
|
104
|
+
for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
|
105
|
+
if (ptNode == NULL || ptNode->next == NULL) {
|
106
|
+
break;
|
107
|
+
}
|
108
|
+
citer = ptNode->next->find((begin + j)->rune);
|
109
|
+
if (ptNode->next->end() == citer) {
|
110
|
+
break;
|
111
|
+
}
|
112
|
+
ptNode = citer->second;
|
113
|
+
if (NULL != ptNode->ptValue) {
|
114
|
+
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
115
|
+
}
|
116
|
+
}
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
void InsertNode(const Unicode& key, const DictUnit* ptValue) {
|
121
|
+
if (key.begin() == key.end()) {
|
122
|
+
return;
|
123
|
+
}
|
124
|
+
|
125
|
+
TrieNode::NextMap::const_iterator kmIter;
|
126
|
+
TrieNode *ptNode = root_;
|
127
|
+
for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
|
128
|
+
if (NULL == ptNode->next) {
|
129
|
+
ptNode->next = new TrieNode::NextMap;
|
130
|
+
}
|
131
|
+
kmIter = ptNode->next->find(*citer);
|
132
|
+
if (ptNode->next->end() == kmIter) {
|
133
|
+
TrieNode *nextNode = new TrieNode;
|
134
|
+
|
135
|
+
ptNode->next->insert(make_pair(*citer, nextNode));
|
136
|
+
ptNode = nextNode;
|
137
|
+
} else {
|
138
|
+
ptNode = kmIter->second;
|
139
|
+
}
|
140
|
+
}
|
141
|
+
assert(ptNode != NULL);
|
142
|
+
ptNode->ptValue = ptValue;
|
143
|
+
}
|
144
|
+
|
145
|
+
private:
|
146
|
+
void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
|
147
|
+
if (valuePointers.empty() || keys.empty()) {
|
148
|
+
return;
|
149
|
+
}
|
150
|
+
assert(keys.size() == valuePointers.size());
|
151
|
+
|
152
|
+
for (size_t i = 0; i < keys.size(); i++) {
|
153
|
+
InsertNode(keys[i], valuePointers[i]);
|
154
|
+
}
|
155
|
+
}
|
156
|
+
|
157
|
+
void DeleteNode(TrieNode* node) {
|
158
|
+
if (NULL == node) {
|
159
|
+
return;
|
160
|
+
}
|
161
|
+
if (NULL != node->next) {
|
162
|
+
for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
|
163
|
+
DeleteNode(it->second);
|
164
|
+
}
|
165
|
+
delete node->next;
|
166
|
+
}
|
167
|
+
delete node;
|
168
|
+
}
|
169
|
+
|
170
|
+
TrieNode* root_;
|
171
|
+
}; // class Trie
|
172
|
+
} // namespace cppjieba
|
173
|
+
|
174
|
+
#endif // CPPJIEBA_TRIE_HPP
|