cppjieba_rb 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +3 -0
- data/README.md +1 -1
- data/Rakefile +2 -2
- data/cppjieba_rb.gemspec +4 -4
- data/lib/cppjieba_rb/version.rb +1 -1
- metadata +17 -135
- data/ext/cppjieba/.gitignore +0 -17
- data/ext/cppjieba/.travis.yml +0 -21
- data/ext/cppjieba/CMakeLists.txt +0 -28
- data/ext/cppjieba/ChangeLog.md +0 -236
- data/ext/cppjieba/README.md +0 -292
- data/ext/cppjieba/README_EN.md +0 -113
- data/ext/cppjieba/appveyor.yml +0 -32
- data/ext/cppjieba/deps/CMakeLists.txt +0 -1
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
- data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
- data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
- data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
- data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
- data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
- data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
- data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
- data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
- data/ext/cppjieba/dict/README.md +0 -31
- data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
- data/ext/cppjieba/dict/idf.utf8 +0 -258826
- data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
- data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
- data/ext/cppjieba/dict/user.dict.utf8 +0 -4
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
- data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
- data/ext/cppjieba/test/CMakeLists.txt +0 -5
- data/ext/cppjieba/test/demo.cpp +0 -80
- data/ext/cppjieba/test/load_test.cpp +0 -54
- data/ext/cppjieba/test/testdata/curl.res +0 -1
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
- data/ext/cppjieba/test/testdata/load_test.urls +0 -2
- data/ext/cppjieba/test/testdata/review.100 +0 -100
- data/ext/cppjieba/test/testdata/review.100.res +0 -200
- data/ext/cppjieba/test/testdata/server.conf +0 -19
- data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
- data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
- data/ext/cppjieba/test/testdata/userdict.english +0 -2
- data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
- data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
- data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
- data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
- data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
- data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
- data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
- data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
- data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
@@ -1,46 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_SEGMENTBASE_H
|
2
|
-
#define CPPJIEBA_SEGMENTBASE_H
|
3
|
-
|
4
|
-
#include "limonp/Logging.hpp"
|
5
|
-
#include "PreFilter.hpp"
|
6
|
-
#include <cassert>
|
7
|
-
|
8
|
-
|
9
|
-
namespace cppjieba {
|
10
|
-
|
11
|
-
// Separator string installed by the SegmentBase constructor: space, tab,
// newline, plus the UTF-8 byte sequences EF BC 8C (U+FF0C, fullwidth comma)
// and E3 80 82 (U+3002, ideographic full stop).
const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
|
12
|
-
|
13
|
-
using namespace limonp;
|
14
|
-
|
15
|
-
class SegmentBase {
|
16
|
-
public:
|
17
|
-
SegmentBase() {
|
18
|
-
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
|
19
|
-
}
|
20
|
-
virtual ~SegmentBase() {
|
21
|
-
}
|
22
|
-
|
23
|
-
virtual void Cut(const string& sentence, vector<string>& words) const = 0;
|
24
|
-
|
25
|
-
bool ResetSeparators(const string& s) {
|
26
|
-
symbols_.clear();
|
27
|
-
RuneStrArray runes;
|
28
|
-
if (!DecodeRunesInString(s, runes)) {
|
29
|
-
XLOG(ERROR) << "decode " << s << " failed";
|
30
|
-
return false;
|
31
|
-
}
|
32
|
-
for (size_t i = 0; i < runes.size(); i++) {
|
33
|
-
if (!symbols_.insert(runes[i].rune).second) {
|
34
|
-
XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
|
35
|
-
return false;
|
36
|
-
}
|
37
|
-
}
|
38
|
-
return true;
|
39
|
-
}
|
40
|
-
protected:
|
41
|
-
unordered_set<Rune> symbols_;
|
42
|
-
}; // class SegmentBase
|
43
|
-
|
44
|
-
} // cppjieba
|
45
|
-
|
46
|
-
#endif
|
@@ -1,23 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_SEGMENTTAGGED_H
|
2
|
-
#define CPPJIEBA_SEGMENTTAGGED_H
|
3
|
-
|
4
|
-
#include "SegmentBase.hpp"
|
5
|
-
|
6
|
-
namespace cppjieba {
|
7
|
-
|
8
|
-
class SegmentTagged : public SegmentBase{
|
9
|
-
public:
|
10
|
-
SegmentTagged() {
|
11
|
-
}
|
12
|
-
virtual ~SegmentTagged() {
|
13
|
-
}
|
14
|
-
|
15
|
-
virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
|
16
|
-
|
17
|
-
virtual const DictTrie* GetDictTrie() const = 0;
|
18
|
-
|
19
|
-
}; // class SegmentTagged
|
20
|
-
|
21
|
-
} // cppjieba
|
22
|
-
|
23
|
-
#endif
|
@@ -1,190 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
|
2
|
-
#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
|
3
|
-
|
4
|
-
#include <cmath>
|
5
|
-
#include "Jieba.hpp"
|
6
|
-
|
7
|
-
namespace cppjieba {
|
8
|
-
using namespace limonp;
|
9
|
-
using namespace std;
|
10
|
-
|
11
|
-
class TextRankExtractor {
|
12
|
-
public:
|
13
|
-
typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
|
14
|
-
private:
|
15
|
-
typedef std::map<string,Word> WordMap;
|
16
|
-
|
17
|
-
class WordGraph{
|
18
|
-
private:
|
19
|
-
typedef double Score;
|
20
|
-
typedef string Node;
|
21
|
-
typedef std::set<Node> NodeSet;
|
22
|
-
|
23
|
-
typedef std::map<Node,double> Edges;
|
24
|
-
typedef std::map<Node,Edges> Graph;
|
25
|
-
//typedef std::unordered_map<Node,double> Edges;
|
26
|
-
//typedef std::unordered_map<Node,Edges> Graph;
|
27
|
-
|
28
|
-
double d;
|
29
|
-
Graph graph;
|
30
|
-
NodeSet nodeSet;
|
31
|
-
public:
|
32
|
-
WordGraph(): d(0.85) {};
|
33
|
-
WordGraph(double in_d): d(in_d) {};
|
34
|
-
|
35
|
-
void addEdge(Node start,Node end,double weight){
|
36
|
-
Edges temp;
|
37
|
-
Edges::iterator gotEdges;
|
38
|
-
nodeSet.insert(start);
|
39
|
-
nodeSet.insert(end);
|
40
|
-
graph[start][end]+=weight;
|
41
|
-
graph[end][start]+=weight;
|
42
|
-
}
|
43
|
-
|
44
|
-
void rank(WordMap &ws,size_t rankTime=10){
|
45
|
-
WordMap outSum;
|
46
|
-
Score wsdef, min_rank, max_rank;
|
47
|
-
|
48
|
-
if( graph.size() == 0)
|
49
|
-
return;
|
50
|
-
|
51
|
-
wsdef = 1.0 / graph.size();
|
52
|
-
|
53
|
-
for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
|
54
|
-
// edges->first start节点;edge->first end节点;edge->second 权重
|
55
|
-
ws[edges->first].word=edges->first;
|
56
|
-
ws[edges->first].weight=wsdef;
|
57
|
-
outSum[edges->first].weight=0;
|
58
|
-
for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
|
59
|
-
outSum[edges->first].weight+=edge->second;
|
60
|
-
}
|
61
|
-
}
|
62
|
-
//sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
|
63
|
-
for( size_t i=0; i<rankTime; i++ ){
|
64
|
-
for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
|
65
|
-
double s = 0;
|
66
|
-
for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
|
67
|
-
// edge->first end节点;edge->second 权重
|
68
|
-
s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
|
69
|
-
ws[*node].weight = (1 - d) + d * s;
|
70
|
-
}
|
71
|
-
}
|
72
|
-
|
73
|
-
min_rank=max_rank=ws.begin()->second.weight;
|
74
|
-
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
|
75
|
-
if( i->second.weight < min_rank ){
|
76
|
-
min_rank = i->second.weight;
|
77
|
-
}
|
78
|
-
if( i->second.weight > max_rank ){
|
79
|
-
max_rank = i->second.weight;
|
80
|
-
}
|
81
|
-
}
|
82
|
-
for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
|
83
|
-
ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
|
84
|
-
}
|
85
|
-
}
|
86
|
-
};
|
87
|
-
|
88
|
-
public:
|
89
|
-
TextRankExtractor(const string& dictPath,
|
90
|
-
const string& hmmFilePath,
|
91
|
-
const string& stopWordPath,
|
92
|
-
const string& userDict = "")
|
93
|
-
: segment_(dictPath, hmmFilePath, userDict) {
|
94
|
-
LoadStopWordDict(stopWordPath);
|
95
|
-
}
|
96
|
-
TextRankExtractor(const DictTrie* dictTrie,
|
97
|
-
const HMMModel* model,
|
98
|
-
const string& stopWordPath)
|
99
|
-
: segment_(dictTrie, model) {
|
100
|
-
LoadStopWordDict(stopWordPath);
|
101
|
-
}
|
102
|
-
TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
|
103
|
-
LoadStopWordDict(stopWordPath);
|
104
|
-
}
|
105
|
-
~TextRankExtractor() {
|
106
|
-
}
|
107
|
-
|
108
|
-
void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
|
109
|
-
vector<Word> topWords;
|
110
|
-
Extract(sentence, topWords, topN);
|
111
|
-
for (size_t i = 0; i < topWords.size(); i++) {
|
112
|
-
keywords.push_back(topWords[i].word);
|
113
|
-
}
|
114
|
-
}
|
115
|
-
|
116
|
-
void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
|
117
|
-
vector<Word> topWords;
|
118
|
-
Extract(sentence, topWords, topN);
|
119
|
-
for (size_t i = 0; i < topWords.size(); i++) {
|
120
|
-
keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
|
121
|
-
}
|
122
|
-
}
|
123
|
-
|
124
|
-
void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
|
125
|
-
vector<string> words;
|
126
|
-
segment_.Cut(sentence, words);
|
127
|
-
|
128
|
-
TextRankExtractor::WordGraph graph;
|
129
|
-
WordMap wordmap;
|
130
|
-
size_t offset = 0;
|
131
|
-
|
132
|
-
for(size_t i=0; i < words.size(); i++){
|
133
|
-
size_t t = offset;
|
134
|
-
offset += words[i].size();
|
135
|
-
if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
|
136
|
-
continue;
|
137
|
-
}
|
138
|
-
for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
|
139
|
-
if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
|
140
|
-
skip++;
|
141
|
-
continue;
|
142
|
-
}
|
143
|
-
graph.addEdge(words[i],words[j],1);
|
144
|
-
}
|
145
|
-
wordmap[words[i]].offsets.push_back(t);
|
146
|
-
}
|
147
|
-
if (offset != sentence.size()) {
|
148
|
-
XLOG(ERROR) << "words illegal";
|
149
|
-
return;
|
150
|
-
}
|
151
|
-
|
152
|
-
graph.rank(wordmap,rankTime);
|
153
|
-
|
154
|
-
keywords.clear();
|
155
|
-
keywords.reserve(wordmap.size());
|
156
|
-
for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
|
157
|
-
keywords.push_back(itr->second);
|
158
|
-
}
|
159
|
-
|
160
|
-
topN = min(topN, keywords.size());
|
161
|
-
partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
|
162
|
-
keywords.resize(topN);
|
163
|
-
}
|
164
|
-
private:
|
165
|
-
void LoadStopWordDict(const string& filePath) {
|
166
|
-
ifstream ifs(filePath.c_str());
|
167
|
-
XCHECK(ifs.is_open()) << "open " << filePath << " failed";
|
168
|
-
string line ;
|
169
|
-
while (getline(ifs, line)) {
|
170
|
-
stopWords_.insert(line);
|
171
|
-
}
|
172
|
-
assert(stopWords_.size());
|
173
|
-
}
|
174
|
-
|
175
|
-
static bool Compare(const Word &x,const Word &y){
|
176
|
-
return x.weight > y.weight;
|
177
|
-
}
|
178
|
-
|
179
|
-
MixSegment segment_;
|
180
|
-
unordered_set<string> stopWords_;
|
181
|
-
}; // class TextRankExtractor
|
182
|
-
|
183
|
-
// Writes `word` to `os` as {"word": "<text>", "offset": <offsets>, "weight": <weight>}.
// The offsets vector is streamed with whatever operator<< is in scope for
// vector<size_t>.
inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
  os << "{\"word\": \"" << word.word;
  os << "\", \"offset\": " << word.offsets;
  os << ", \"weight\": " << word.weight << "}";
  return os;
}
|
186
|
-
} // namespace cppjieba
|
187
|
-
|
188
|
-
#endif
|
189
|
-
|
190
|
-
|
@@ -1,174 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_TRIE_HPP
|
2
|
-
#define CPPJIEBA_TRIE_HPP
|
3
|
-
|
4
|
-
#include <vector>
|
5
|
-
#include <queue>
|
6
|
-
#include "limonp/StdExtension.hpp"
|
7
|
-
#include "Unicode.hpp"
|
8
|
-
|
9
|
-
namespace cppjieba {
|
10
|
-
|
11
|
-
using namespace std;
|
12
|
-
|
13
|
-
// Default for the max_word_len argument of Trie::Find (DAG overload): the
// largest number of runes a single matched word may span.
const size_t MAX_WORD_LENGTH = 512;
|
14
|
-
|
15
|
-
struct DictUnit {
|
16
|
-
Unicode word;
|
17
|
-
double weight;
|
18
|
-
string tag;
|
19
|
-
}; // struct DictUnit
|
20
|
-
|
21
|
-
// for debugging
|
22
|
-
// inline ostream & operator << (ostream& os, const DictUnit& unit) {
|
23
|
-
// string s;
|
24
|
-
// s << unit.word;
|
25
|
-
// return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
|
26
|
-
// }
|
27
|
-
|
28
|
-
struct Dag {
|
29
|
-
RuneStr runestr;
|
30
|
-
// [offset, nexts.first]
|
31
|
-
limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
|
32
|
-
const DictUnit * pInfo;
|
33
|
-
double weight;
|
34
|
-
size_t nextPos; // TODO
|
35
|
-
Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
|
36
|
-
}
|
37
|
-
}; // struct Dag
|
38
|
-
|
39
|
-
// Key type of the per-node child map (TrieNode::NextMap): one rune per edge.
typedef Rune TrieKey;
|
40
|
-
|
41
|
-
class TrieNode {
|
42
|
-
public :
|
43
|
-
TrieNode(): next(NULL), ptValue(NULL) {
|
44
|
-
}
|
45
|
-
public:
|
46
|
-
typedef unordered_map<TrieKey, TrieNode*> NextMap;
|
47
|
-
NextMap *next;
|
48
|
-
const DictUnit *ptValue;
|
49
|
-
};
|
50
|
-
|
51
|
-
class Trie {
|
52
|
-
public:
|
53
|
-
Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
|
54
|
-
: root_(new TrieNode) {
|
55
|
-
CreateTrie(keys, valuePointers);
|
56
|
-
}
|
57
|
-
~Trie() {
|
58
|
-
DeleteNode(root_);
|
59
|
-
}
|
60
|
-
|
61
|
-
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
62
|
-
if (begin == end) {
|
63
|
-
return NULL;
|
64
|
-
}
|
65
|
-
|
66
|
-
const TrieNode* ptNode = root_;
|
67
|
-
TrieNode::NextMap::const_iterator citer;
|
68
|
-
for (RuneStrArray::const_iterator it = begin; it != end; it++) {
|
69
|
-
if (NULL == ptNode->next) {
|
70
|
-
return NULL;
|
71
|
-
}
|
72
|
-
citer = ptNode->next->find(it->rune);
|
73
|
-
if (ptNode->next->end() == citer) {
|
74
|
-
return NULL;
|
75
|
-
}
|
76
|
-
ptNode = citer->second;
|
77
|
-
}
|
78
|
-
return ptNode->ptValue;
|
79
|
-
}
|
80
|
-
|
81
|
-
void Find(RuneStrArray::const_iterator begin,
|
82
|
-
RuneStrArray::const_iterator end,
|
83
|
-
vector<struct Dag>&res,
|
84
|
-
size_t max_word_len = MAX_WORD_LENGTH) const {
|
85
|
-
assert(root_ != NULL);
|
86
|
-
res.resize(end - begin);
|
87
|
-
|
88
|
-
const TrieNode *ptNode = NULL;
|
89
|
-
TrieNode::NextMap::const_iterator citer;
|
90
|
-
for (size_t i = 0; i < size_t(end - begin); i++) {
|
91
|
-
res[i].runestr = *(begin + i);
|
92
|
-
|
93
|
-
if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
|
94
|
-
ptNode = citer->second;
|
95
|
-
} else {
|
96
|
-
ptNode = NULL;
|
97
|
-
}
|
98
|
-
if (ptNode != NULL) {
|
99
|
-
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
|
100
|
-
} else {
|
101
|
-
res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
|
102
|
-
}
|
103
|
-
|
104
|
-
for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
|
105
|
-
if (ptNode == NULL || ptNode->next == NULL) {
|
106
|
-
break;
|
107
|
-
}
|
108
|
-
citer = ptNode->next->find((begin + j)->rune);
|
109
|
-
if (ptNode->next->end() == citer) {
|
110
|
-
break;
|
111
|
-
}
|
112
|
-
ptNode = citer->second;
|
113
|
-
if (NULL != ptNode->ptValue) {
|
114
|
-
res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
|
115
|
-
}
|
116
|
-
}
|
117
|
-
}
|
118
|
-
}
|
119
|
-
|
120
|
-
void InsertNode(const Unicode& key, const DictUnit* ptValue) {
|
121
|
-
if (key.begin() == key.end()) {
|
122
|
-
return;
|
123
|
-
}
|
124
|
-
|
125
|
-
TrieNode::NextMap::const_iterator kmIter;
|
126
|
-
TrieNode *ptNode = root_;
|
127
|
-
for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
|
128
|
-
if (NULL == ptNode->next) {
|
129
|
-
ptNode->next = new TrieNode::NextMap;
|
130
|
-
}
|
131
|
-
kmIter = ptNode->next->find(*citer);
|
132
|
-
if (ptNode->next->end() == kmIter) {
|
133
|
-
TrieNode *nextNode = new TrieNode;
|
134
|
-
|
135
|
-
ptNode->next->insert(make_pair(*citer, nextNode));
|
136
|
-
ptNode = nextNode;
|
137
|
-
} else {
|
138
|
-
ptNode = kmIter->second;
|
139
|
-
}
|
140
|
-
}
|
141
|
-
assert(ptNode != NULL);
|
142
|
-
ptNode->ptValue = ptValue;
|
143
|
-
}
|
144
|
-
|
145
|
-
private:
|
146
|
-
void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
|
147
|
-
if (valuePointers.empty() || keys.empty()) {
|
148
|
-
return;
|
149
|
-
}
|
150
|
-
assert(keys.size() == valuePointers.size());
|
151
|
-
|
152
|
-
for (size_t i = 0; i < keys.size(); i++) {
|
153
|
-
InsertNode(keys[i], valuePointers[i]);
|
154
|
-
}
|
155
|
-
}
|
156
|
-
|
157
|
-
void DeleteNode(TrieNode* node) {
|
158
|
-
if (NULL == node) {
|
159
|
-
return;
|
160
|
-
}
|
161
|
-
if (NULL != node->next) {
|
162
|
-
for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
|
163
|
-
DeleteNode(it->second);
|
164
|
-
}
|
165
|
-
delete node->next;
|
166
|
-
}
|
167
|
-
delete node;
|
168
|
-
}
|
169
|
-
|
170
|
-
TrieNode* root_;
|
171
|
-
}; // class Trie
|
172
|
-
} // namespace cppjieba
|
173
|
-
|
174
|
-
#endif // CPPJIEBA_TRIE_HPP
|
@@ -1,227 +0,0 @@
|
|
1
|
-
#ifndef CPPJIEBA_UNICODE_H
|
2
|
-
#define CPPJIEBA_UNICODE_H
|
3
|
-
|
4
|
-
#include <stdint.h>
|
5
|
-
#include <stdlib.h>
|
6
|
-
#include <string>
|
7
|
-
#include <vector>
|
8
|
-
#include <ostream>
|
9
|
-
#include "limonp/LocalVector.hpp"
|
10
|
-
|
11
|
-
namespace cppjieba {
|
12
|
-
|
13
|
-
using std::string;
|
14
|
-
using std::vector;
|
15
|
-
|
16
|
-
typedef uint32_t Rune;
|
17
|
-
|
18
|
-
// A segmented word together with its position in the source string.
//   offset          - byte offset of the word
//   unicode_offset  - offset of the word counted in runes
//   unicode_length  - length of the word counted in runes
struct Word {
  string word;
  uint32_t offset;
  uint32_t unicode_offset;
  uint32_t unicode_length;
  // Byte-position-only constructor. The rune-based fields are set to 0 so
  // that they never hold indeterminate values.
  Word(const string& w, uint32_t o)
   : word(w), offset(o), unicode_offset(0), unicode_length(0) {
  }
  Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
   : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
  }
}; // struct Word
|
30
|
-
|
31
|
-
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
32
|
-
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
|
33
|
-
}
|
34
|
-
|
35
|
-
struct RuneStr {
|
36
|
-
Rune rune;
|
37
|
-
uint32_t offset;
|
38
|
-
uint32_t len;
|
39
|
-
uint32_t unicode_offset;
|
40
|
-
uint32_t unicode_length;
|
41
|
-
RuneStr(): rune(0), offset(0), len(0) {
|
42
|
-
}
|
43
|
-
RuneStr(Rune r, uint32_t o, uint32_t l)
|
44
|
-
: rune(r), offset(o), len(l) {
|
45
|
-
}
|
46
|
-
RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
|
47
|
-
: rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
48
|
-
}
|
49
|
-
}; // struct RuneStr
|
50
|
-
|
51
|
-
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
52
|
-
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
|
53
|
-
}
|
54
|
-
|
55
|
-
typedef limonp::LocalVector<Rune> Unicode;
|
56
|
-
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
|
57
|
-
|
58
|
-
// [left, right]
|
59
|
-
struct WordRange {
|
60
|
-
RuneStrArray::const_iterator left;
|
61
|
-
RuneStrArray::const_iterator right;
|
62
|
-
WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
|
63
|
-
: left(l), right(r) {
|
64
|
-
}
|
65
|
-
size_t Length() const {
|
66
|
-
return right - left + 1;
|
67
|
-
}
|
68
|
-
bool IsAllAscii() const {
|
69
|
-
for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
|
70
|
-
if (iter->rune >= 0x80) {
|
71
|
-
return false;
|
72
|
-
}
|
73
|
-
}
|
74
|
-
return true;
|
75
|
-
}
|
76
|
-
}; // struct WordRange
|
77
|
-
|
78
|
-
// Result of decoding a single rune: the code point and the number of bytes
// it occupied. len == 0 means that nothing could be decoded.
struct RuneStrLite {
  uint32_t rune;
  uint32_t len;
  RuneStrLite(): rune(0), len(0) {
  }
  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
  }
}; // struct RuneStrLite

// Decodes the first UTF-8 encoded rune of the `len` bytes starting at `str`.
//
// Returns the code point and its encoded size (1 to 4 bytes). Returns
// {0, 0} when `str` is NULL, `len` is 0, the sequence is truncated, the
// first byte is not a valid lead byte, or a following byte is not a
// continuation byte.
//
// The lead byte is matched against its exact bit pattern (0xxxxxxx,
// 110xxxxx, 1110xxxx, 11110xxx) and every following byte against 10xxxxxx.
// A plain "<= 0xdf" style comparison would also accept the continuation
// bytes 0x80..0xBF as 2-byte lead bytes and would fold arbitrary bytes into
// the code point.
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
  RuneStrLite rp(0, 0);
  if (str == NULL || len == 0) {
    return rp;
  }

  const uint8_t lead = (uint8_t)(str[0]);
  uint32_t width = 0;
  uint32_t rune = 0;
  if (lead < 0x80) {                  // 0xxxxxxx: 7 payload bits
    width = 1;
    rune = lead;
  } else if ((lead & 0xe0) == 0xc0) { // 110xxxxx: 5 payload bits, 11 in total
    width = 2;
    rune = lead & 0x1f;
  } else if ((lead & 0xf0) == 0xe0) { // 1110xxxx: 4 payload bits, 16 in total
    width = 3;
    rune = lead & 0x0f;
  } else if ((lead & 0xf8) == 0xf0) { // 11110xxx: 3 payload bits, 21 in total
    width = 4;
    rune = lead & 0x07;
  } else {
    // Stray continuation byte (10xxxxxx) or a byte >= 0xf8.
    return rp;
  }

  if (len < width) {
    // Truncated sequence.
    return rp;
  }

  // Every following byte contributes 6 payload bits.
  for (uint32_t i = 1; i < width; ++i) {
    const uint8_t tail = (uint8_t)(str[i]);
    if ((tail & 0xc0) != 0x80) {
      return rp;
    }
    rune = (rune << 6) | (tail & 0x3f);
  }

  rp.rune = rune;
  rp.len = width;
  return rp;
}
|
141
|
-
|
142
|
-
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
143
|
-
runes.clear();
|
144
|
-
runes.reserve(len / 2);
|
145
|
-
for (uint32_t i = 0, j = 0; i < len;) {
|
146
|
-
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
147
|
-
if (rp.len == 0) {
|
148
|
-
runes.clear();
|
149
|
-
return false;
|
150
|
-
}
|
151
|
-
RuneStr x(rp.rune, i, rp.len, j, 1);
|
152
|
-
runes.push_back(x);
|
153
|
-
i += rp.len;
|
154
|
-
++j;
|
155
|
-
}
|
156
|
-
return true;
|
157
|
-
}
|
158
|
-
|
159
|
-
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
|
160
|
-
return DecodeRunesInString(s.c_str(), s.size(), runes);
|
161
|
-
}
|
162
|
-
|
163
|
-
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
|
164
|
-
unicode.clear();
|
165
|
-
RuneStrArray runes;
|
166
|
-
if (!DecodeRunesInString(s, len, runes)) {
|
167
|
-
return false;
|
168
|
-
}
|
169
|
-
unicode.reserve(runes.size());
|
170
|
-
for (size_t i = 0; i < runes.size(); i++) {
|
171
|
-
unicode.push_back(runes[i].rune);
|
172
|
-
}
|
173
|
-
return true;
|
174
|
-
}
|
175
|
-
|
176
|
-
inline bool IsSingleWord(const string& str) {
|
177
|
-
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
|
178
|
-
return rp.len == str.size();
|
179
|
-
}
|
180
|
-
|
181
|
-
inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
|
182
|
-
return DecodeRunesInString(s.c_str(), s.size(), unicode);
|
183
|
-
}
|
184
|
-
|
185
|
-
// Returns the code points of `s`. The success flag of the decoding is
// ignored: when `s` cannot be decoded the returned sequence is empty.
inline Unicode DecodeRunesInString(const string& s) {
  Unicode decoded;
  DecodeRunesInString(s.c_str(), s.size(), decoded);
  return decoded;
}
|
190
|
-
|
191
|
-
|
192
|
-
// [left, right]
|
193
|
-
// Builds the Word covered by the runes left..right (both inclusive) of `s`:
// the substring itself plus its byte offset, rune offset and rune length.
// `right` must not start before `left` (asserted).
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);
  const uint32_t byteStart = left->offset;
  const uint32_t byteLen = right->offset - byteStart + right->len;
  const uint32_t runeStart = left->unicode_offset;
  const uint32_t runeLen = right->unicode_offset - runeStart + right->unicode_length;
  return Word(s.substr(byteStart, byteLen), byteStart, runeStart, runeLen);
}
|
199
|
-
|
200
|
-
// Returns the bytes of `s` covered by the runes left..right (both
// inclusive). `right` must not start before `left` (asserted).
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
  assert(right->offset >= left->offset);
  const uint32_t byteStart = left->offset;
  const uint32_t byteLen = right->offset - byteStart + right->len;
  return s.substr(byteStart, byteLen);
}
|
205
|
-
|
206
|
-
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
|
207
|
-
for (size_t i = 0; i < wrs.size(); i++) {
|
208
|
-
words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
|
209
|
-
}
|
210
|
-
}
|
211
|
-
|
212
|
-
// Returns one Word per range of `wrs`, in order.
inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
  vector<Word> collected;
  GetWordsFromWordRanges(s, wrs, collected);
  return collected;
}
|
217
|
-
|
218
|
-
inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
|
219
|
-
strs.resize(words.size());
|
220
|
-
for (size_t i = 0; i < words.size(); ++i) {
|
221
|
-
strs[i] = words[i].word;
|
222
|
-
}
|
223
|
-
}
|
224
|
-
|
225
|
-
} // namespace cppjieba
|
226
|
-
|
227
|
-
#endif // CPPJIEBA_UNICODE_H
|