jieba_rb 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +51 -0
- data/Rakefile +11 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +27 -0
- data/ext/cppjieba/ChangeLog.md +81 -0
- data/ext/cppjieba/Dockerfile +11 -0
- data/ext/cppjieba/LICENSE +20 -0
- data/ext/cppjieba/README.md +359 -0
- data/ext/cppjieba/conf/CMakeLists.txt +1 -0
- data/ext/cppjieba/conf/server.conf +16 -0
- data/ext/cppjieba/dict/CMakeLists.txt +1 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/dict/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/dict/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +3 -0
- data/ext/cppjieba/script/CMakeLists.txt +1 -0
- data/ext/cppjieba/script/cjserver.start +12 -0
- data/ext/cppjieba/script/cjserver.stop +13 -0
- data/ext/cppjieba/server/CMakeLists.txt +9 -0
- data/ext/cppjieba/server/Husky/HttpReqInfo.hpp +294 -0
- data/ext/cppjieba/server/Husky/IRequestHandler.hpp +18 -0
- data/ext/cppjieba/server/Husky/ThreadPoolServer.hpp +108 -0
- data/ext/cppjieba/server/Husky/WorkerThread.hpp +133 -0
- data/ext/cppjieba/server/server.cpp +91 -0
- data/ext/cppjieba/src/DictTrie.hpp +211 -0
- data/ext/cppjieba/src/FullSegment.hpp +153 -0
- data/ext/cppjieba/src/HMMSegment.hpp +394 -0
- data/ext/cppjieba/src/ISegment.hpp +17 -0
- data/ext/cppjieba/src/KeywordExtractor.hpp +173 -0
- data/ext/cppjieba/src/Limonp/ArgvContext.hpp +84 -0
- data/ext/cppjieba/src/Limonp/BlockingQueue.hpp +128 -0
- data/ext/cppjieba/src/Limonp/BoundedQueue.hpp +73 -0
- data/ext/cppjieba/src/Limonp/CastFloat.hpp +90 -0
- data/ext/cppjieba/src/Limonp/Condition.hpp +48 -0
- data/ext/cppjieba/src/Limonp/Config.hpp +118 -0
- data/ext/cppjieba/src/Limonp/HandyMacro.hpp +31 -0
- data/ext/cppjieba/src/Limonp/InitOnOff.hpp +21 -0
- data/ext/cppjieba/src/Limonp/LocalVector.hpp +171 -0
- data/ext/cppjieba/src/Limonp/Logger.hpp +74 -0
- data/ext/cppjieba/src/Limonp/Md5.hpp +432 -0
- data/ext/cppjieba/src/Limonp/MutexLock.hpp +57 -0
- data/ext/cppjieba/src/Limonp/MysqlClient.hpp +125 -0
- data/ext/cppjieba/src/Limonp/NonCopyable.hpp +22 -0
- data/ext/cppjieba/src/Limonp/StdExtension.hpp +139 -0
- data/ext/cppjieba/src/Limonp/StringUtil.hpp +349 -0
- data/ext/cppjieba/src/Limonp/Thread.hpp +50 -0
- data/ext/cppjieba/src/Limonp/ThreadPool.hpp +105 -0
- data/ext/cppjieba/src/MPSegment.hpp +148 -0
- data/ext/cppjieba/src/MixSegment.hpp +121 -0
- data/ext/cppjieba/src/PosTagger.hpp +109 -0
- data/ext/cppjieba/src/QuerySegment.hpp +123 -0
- data/ext/cppjieba/src/SegmentBase.hpp +78 -0
- data/ext/cppjieba/src/TransCode.hpp +63 -0
- data/ext/cppjieba/src/Trie.hpp +298 -0
- data/ext/cppjieba/test/CMakeLists.txt +7 -0
- data/ext/cppjieba/test/keyword_demo.cpp +16 -0
- data/ext/cppjieba/test/load_test.cpp +56 -0
- data/ext/cppjieba/test/segment_demo.cpp +59 -0
- data/ext/cppjieba/test/servertest/go_load_test.sh +2 -0
- data/ext/cppjieba/test/servertest/load_test.py +91 -0
- data/ext/cppjieba/test/servertest/run_curl.sh +11 -0
- data/ext/cppjieba/test/tagging_demo.cpp +12 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +13 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +6 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +28 -0
- data/ext/cppjieba/test/unittest/TKeywordExtractor.cpp +18 -0
- data/ext/cppjieba/test/unittest/TPosTagger.cpp +43 -0
- data/ext/cppjieba/test/unittest/TSegments.cpp +187 -0
- data/ext/cppjieba/test/unittest/TTrie.cpp +80 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.dirstamp +0 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-all.cc +48 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-port.cc +746 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc +4898 -0
- data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc +39 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/jieba/extconf.rb +26 -0
- data/ext/jieba/jieba.c +9 -0
- data/ext/jieba/jieba.h +9 -0
- data/ext/jieba/segment.cc +88 -0
- data/ext/jieba/segment.h +17 -0
- data/jieba_rb.gemspec +51 -0
- data/lib/jieba_rb/version.rb +3 -0
- data/lib/jieba_rb.rb +28 -0
- data/test/test_segment.rb +32 -0
- metadata +246 -0
@@ -0,0 +1,148 @@
|
|
1
|
+
#ifndef CPPJIEBA_MPSEGMENT_H
|
2
|
+
#define CPPJIEBA_MPSEGMENT_H
|
3
|
+
|
4
|
+
#include <algorithm>
|
5
|
+
#include <set>
|
6
|
+
#include <cassert>
|
7
|
+
#include "Limonp/Logger.hpp"
|
8
|
+
#include "DictTrie.hpp"
|
9
|
+
#include "ISegment.hpp"
|
10
|
+
#include "SegmentBase.hpp"
|
11
|
+
|
12
|
+
namespace CppJieba
|
13
|
+
{
|
14
|
+
|
15
|
+
class MPSegment: public SegmentBase
|
16
|
+
{
|
17
|
+
private:
|
18
|
+
DictTrie _dictTrie;
|
19
|
+
|
20
|
+
public:
|
21
|
+
MPSegment(){};
|
22
|
+
MPSegment(const string& dictPath, const string& userDictPath = "")
|
23
|
+
{
|
24
|
+
LIMONP_CHECK(init(dictPath, userDictPath));
|
25
|
+
};
|
26
|
+
virtual ~MPSegment(){};
|
27
|
+
public:
|
28
|
+
bool init(const string& dictPath, const string& userDictPath = "")
|
29
|
+
{
|
30
|
+
LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
|
31
|
+
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
|
32
|
+
return true;
|
33
|
+
}
|
34
|
+
bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
|
35
|
+
{
|
36
|
+
return _dictTrie.isUserDictSingleChineseWord(value);
|
37
|
+
}
|
38
|
+
public:
|
39
|
+
using SegmentBase::cut;
|
40
|
+
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
41
|
+
{
|
42
|
+
if(begin == end)
|
43
|
+
{
|
44
|
+
return false;
|
45
|
+
}
|
46
|
+
|
47
|
+
vector<Unicode> words;
|
48
|
+
words.reserve(end - begin);
|
49
|
+
if(!cut(begin, end, words))
|
50
|
+
{
|
51
|
+
return false;
|
52
|
+
}
|
53
|
+
size_t offset = res.size();
|
54
|
+
res.resize(res.size() + words.size());
|
55
|
+
for(size_t i = 0; i < words.size(); i++)
|
56
|
+
{
|
57
|
+
if(!TransCode::encode(words[i], res[i + offset]))
|
58
|
+
{
|
59
|
+
LogError("encode failed.");
|
60
|
+
res[i + offset].clear();
|
61
|
+
}
|
62
|
+
}
|
63
|
+
return true;
|
64
|
+
}
|
65
|
+
|
66
|
+
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const
|
67
|
+
{
|
68
|
+
if(end == begin)
|
69
|
+
{
|
70
|
+
return false;
|
71
|
+
}
|
72
|
+
vector<SegmentChar> segmentChars;
|
73
|
+
|
74
|
+
_dictTrie.find(begin, end, segmentChars);
|
75
|
+
|
76
|
+
_calcDP(segmentChars);
|
77
|
+
|
78
|
+
_cut(segmentChars, res);
|
79
|
+
|
80
|
+
return true;
|
81
|
+
}
|
82
|
+
const DictTrie* getDictTrie() const
|
83
|
+
{
|
84
|
+
return &_dictTrie;
|
85
|
+
}
|
86
|
+
|
87
|
+
private:
|
88
|
+
void _calcDP(vector<SegmentChar>& segmentChars) const
|
89
|
+
{
|
90
|
+
size_t nextPos;
|
91
|
+
const DictUnit* p;
|
92
|
+
double val;
|
93
|
+
|
94
|
+
for(ssize_t i = segmentChars.size() - 1; i >= 0; i--)
|
95
|
+
{
|
96
|
+
segmentChars[i].pInfo = NULL;
|
97
|
+
segmentChars[i].weight = MIN_DOUBLE;
|
98
|
+
assert(!segmentChars[i].dag.empty());
|
99
|
+
for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++)
|
100
|
+
{
|
101
|
+
nextPos = it->first;
|
102
|
+
p = it->second;
|
103
|
+
val = 0.0;
|
104
|
+
if(nextPos + 1 < segmentChars.size())
|
105
|
+
{
|
106
|
+
val += segmentChars[nextPos + 1].weight;
|
107
|
+
}
|
108
|
+
|
109
|
+
if(p)
|
110
|
+
{
|
111
|
+
val += p->weight;
|
112
|
+
}
|
113
|
+
else
|
114
|
+
{
|
115
|
+
val += _dictTrie.getMinWeight();
|
116
|
+
}
|
117
|
+
if(val > segmentChars[i].weight)
|
118
|
+
{
|
119
|
+
segmentChars[i].pInfo = p;
|
120
|
+
segmentChars[i].weight = val;
|
121
|
+
}
|
122
|
+
}
|
123
|
+
}
|
124
|
+
}
|
125
|
+
void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const
|
126
|
+
{
|
127
|
+
size_t i = 0;
|
128
|
+
while(i < segmentChars.size())
|
129
|
+
{
|
130
|
+
const DictUnit* p = segmentChars[i].pInfo;
|
131
|
+
if(p)
|
132
|
+
{
|
133
|
+
res.push_back(p->word);
|
134
|
+
i += p->word.size();
|
135
|
+
}
|
136
|
+
else//single chinese word
|
137
|
+
{
|
138
|
+
res.push_back(Unicode(1, segmentChars[i].uniCh));
|
139
|
+
i++;
|
140
|
+
}
|
141
|
+
}
|
142
|
+
}
|
143
|
+
|
144
|
+
|
145
|
+
};
|
146
|
+
}
|
147
|
+
|
148
|
+
#endif
|
@@ -0,0 +1,121 @@
|
|
1
|
+
#ifndef CPPJIEBA_MIXSEGMENT_H
|
2
|
+
#define CPPJIEBA_MIXSEGMENT_H
|
3
|
+
|
4
|
+
#include <cassert>
|
5
|
+
#include "MPSegment.hpp"
|
6
|
+
#include "HMMSegment.hpp"
|
7
|
+
#include "Limonp/StringUtil.hpp"
|
8
|
+
|
9
|
+
namespace CppJieba
|
10
|
+
{
|
11
|
+
class MixSegment: public SegmentBase
|
12
|
+
{
|
13
|
+
private:
|
14
|
+
MPSegment _mpSeg;
|
15
|
+
HMMSegment _hmmSeg;
|
16
|
+
public:
|
17
|
+
MixSegment(){};
|
18
|
+
MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
19
|
+
{
|
20
|
+
LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
|
21
|
+
}
|
22
|
+
virtual ~MixSegment(){}
|
23
|
+
public:
|
24
|
+
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
|
25
|
+
{
|
26
|
+
LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
|
27
|
+
LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
|
28
|
+
LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
|
29
|
+
return true;
|
30
|
+
}
|
31
|
+
public:
|
32
|
+
using SegmentBase::cut;
|
33
|
+
public:
|
34
|
+
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
35
|
+
{
|
36
|
+
vector<Unicode> words;
|
37
|
+
words.reserve(end - begin);
|
38
|
+
if(!_mpSeg.cut(begin, end, words))
|
39
|
+
{
|
40
|
+
LogError("mpSeg cutDAG failed.");
|
41
|
+
return false;
|
42
|
+
}
|
43
|
+
|
44
|
+
vector<Unicode> hmmRes;
|
45
|
+
hmmRes.reserve(end - begin);
|
46
|
+
Unicode piece;
|
47
|
+
piece.reserve(end - begin);
|
48
|
+
for (size_t i = 0, j = 0; i < words.size(); i++)
|
49
|
+
{
|
50
|
+
//if mp get a word, it's ok, put it into result
|
51
|
+
if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0])))
|
52
|
+
{
|
53
|
+
res.push_back(words[i]);
|
54
|
+
continue;
|
55
|
+
}
|
56
|
+
|
57
|
+
// if mp get a single one and it is not in userdict, collect it in sequence
|
58
|
+
j = i;
|
59
|
+
while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0]))
|
60
|
+
{
|
61
|
+
piece.push_back(words[j][0]);
|
62
|
+
j++;
|
63
|
+
}
|
64
|
+
|
65
|
+
// cut the sequence with hmm
|
66
|
+
if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes))
|
67
|
+
{
|
68
|
+
LogError("_hmmSeg cut failed.");
|
69
|
+
return false;
|
70
|
+
}
|
71
|
+
|
72
|
+
//put hmm result to result
|
73
|
+
for (size_t k = 0; k < hmmRes.size(); k++)
|
74
|
+
{
|
75
|
+
res.push_back(hmmRes[k]);
|
76
|
+
}
|
77
|
+
|
78
|
+
//clear tmp vars
|
79
|
+
piece.clear();
|
80
|
+
hmmRes.clear();
|
81
|
+
|
82
|
+
//let i jump over this piece
|
83
|
+
i = j - 1;
|
84
|
+
}
|
85
|
+
return true;
|
86
|
+
}
|
87
|
+
|
88
|
+
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
|
89
|
+
{
|
90
|
+
if(begin == end)
|
91
|
+
{
|
92
|
+
return false;
|
93
|
+
}
|
94
|
+
|
95
|
+
vector<Unicode> uRes;
|
96
|
+
uRes.reserve(end - begin);
|
97
|
+
if (!cut(begin, end, uRes))
|
98
|
+
{
|
99
|
+
return false;
|
100
|
+
}
|
101
|
+
|
102
|
+
size_t offset = res.size();
|
103
|
+
res.resize(res.size() + uRes.size());
|
104
|
+
for(size_t i = 0; i < uRes.size(); i ++, offset++)
|
105
|
+
{
|
106
|
+
if(!TransCode::encode(uRes[i], res[offset]))
|
107
|
+
{
|
108
|
+
LogError("encode failed.");
|
109
|
+
}
|
110
|
+
}
|
111
|
+
return true;
|
112
|
+
}
|
113
|
+
|
114
|
+
const DictTrie* getDictTrie() const
|
115
|
+
{
|
116
|
+
return _mpSeg.getDictTrie();
|
117
|
+
}
|
118
|
+
};
|
119
|
+
}
|
120
|
+
|
121
|
+
#endif
|
@@ -0,0 +1,109 @@
|
|
1
|
+
#ifndef CPPJIEBA_POS_TAGGING_H
|
2
|
+
#define CPPJIEBA_POS_TAGGING_H
|
3
|
+
|
4
|
+
#include "MixSegment.hpp"
|
5
|
+
#include "Limonp/StringUtil.hpp"
|
6
|
+
#include "DictTrie.hpp"
|
7
|
+
|
8
|
+
namespace CppJieba
|
9
|
+
{
|
10
|
+
using namespace Limonp;
|
11
|
+
|
12
|
+
static const char* const POS_M = "m";
|
13
|
+
static const char* const POS_ENG = "eng";
|
14
|
+
static const char* const POS_X = "x";
|
15
|
+
|
16
|
+
class PosTagger
|
17
|
+
{
|
18
|
+
private:
|
19
|
+
MixSegment _segment;
|
20
|
+
const DictTrie * _dictTrie;
|
21
|
+
|
22
|
+
public:
|
23
|
+
PosTagger()
|
24
|
+
{}
|
25
|
+
PosTagger(
|
26
|
+
const string& dictPath,
|
27
|
+
const string& hmmFilePath,
|
28
|
+
const string& userDictPath = ""
|
29
|
+
)
|
30
|
+
{
|
31
|
+
init(dictPath, hmmFilePath, userDictPath);
|
32
|
+
};
|
33
|
+
~PosTagger(){};
|
34
|
+
public:
|
35
|
+
void init(
|
36
|
+
const string& dictPath,
|
37
|
+
const string& hmmFilePath,
|
38
|
+
const string& userDictPath = ""
|
39
|
+
)
|
40
|
+
{
|
41
|
+
LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
|
42
|
+
_dictTrie = _segment.getDictTrie();
|
43
|
+
LIMONP_CHECK(_dictTrie);
|
44
|
+
};
|
45
|
+
|
46
|
+
|
47
|
+
bool tag(const string& src, vector<pair<string, string> >& res) const
|
48
|
+
{
|
49
|
+
vector<string> cutRes;
|
50
|
+
if (!_segment.cut(src, cutRes))
|
51
|
+
{
|
52
|
+
LogError("_mixSegment cut failed");
|
53
|
+
return false;
|
54
|
+
}
|
55
|
+
|
56
|
+
const DictUnit *tmp = NULL;
|
57
|
+
Unicode unico;
|
58
|
+
for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
|
59
|
+
{
|
60
|
+
if (!TransCode::decode(*itr, unico))
|
61
|
+
{
|
62
|
+
LogError("decode failed.");
|
63
|
+
return false;
|
64
|
+
}
|
65
|
+
tmp = _dictTrie->find(unico.begin(), unico.end());
|
66
|
+
if(tmp == NULL || tmp->tag.empty())
|
67
|
+
{
|
68
|
+
res.push_back(make_pair(*itr, _specialRule(unico)));
|
69
|
+
}
|
70
|
+
else
|
71
|
+
{
|
72
|
+
res.push_back(make_pair(*itr, tmp->tag));
|
73
|
+
}
|
74
|
+
}
|
75
|
+
return !res.empty();
|
76
|
+
}
|
77
|
+
private:
|
78
|
+
const char* _specialRule(const Unicode& unicode) const
|
79
|
+
{
|
80
|
+
size_t m = 0;
|
81
|
+
size_t eng = 0;
|
82
|
+
for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++)
|
83
|
+
{
|
84
|
+
if(unicode[i] < 0x80)
|
85
|
+
{
|
86
|
+
eng ++;
|
87
|
+
if('0' <= unicode[i] && unicode[i] <= '9')
|
88
|
+
{
|
89
|
+
m++;
|
90
|
+
}
|
91
|
+
}
|
92
|
+
}
|
93
|
+
// ascii char is not found
|
94
|
+
if(eng == 0)
|
95
|
+
{
|
96
|
+
return POS_X;
|
97
|
+
}
|
98
|
+
// all the ascii is number char
|
99
|
+
if(m == eng)
|
100
|
+
{
|
101
|
+
return POS_M;
|
102
|
+
}
|
103
|
+
// the ascii chars contain english letter
|
104
|
+
return POS_ENG;
|
105
|
+
}
|
106
|
+
};
|
107
|
+
}
|
108
|
+
|
109
|
+
#endif
|
@@ -0,0 +1,123 @@
|
|
1
|
+
#ifndef CPPJIEBA_QUERYSEGMENT_H
|
2
|
+
#define CPPJIEBA_QUERYSEGMENT_H
|
3
|
+
|
4
|
+
#include <algorithm>
|
5
|
+
#include <set>
|
6
|
+
#include <cassert>
|
7
|
+
#include "Limonp/Logger.hpp"
|
8
|
+
#include "DictTrie.hpp"
|
9
|
+
#include "ISegment.hpp"
|
10
|
+
#include "SegmentBase.hpp"
|
11
|
+
#include "FullSegment.hpp"
|
12
|
+
#include "MixSegment.hpp"
|
13
|
+
#include "TransCode.hpp"
|
14
|
+
#include "DictTrie.hpp"
|
15
|
+
|
16
|
+
namespace CppJieba
|
17
|
+
{
|
18
|
+
class QuerySegment: public SegmentBase
|
19
|
+
{
|
20
|
+
private:
|
21
|
+
MixSegment _mixSeg;
|
22
|
+
FullSegment _fullSeg;
|
23
|
+
size_t _maxWordLen;
|
24
|
+
|
25
|
+
public:
|
26
|
+
QuerySegment(){};
|
27
|
+
QuerySegment(const string& dict, const string& model, size_t maxWordLen)
|
28
|
+
{
|
29
|
+
init(dict, model, maxWordLen);
|
30
|
+
};
|
31
|
+
virtual ~QuerySegment(){};
|
32
|
+
public:
|
33
|
+
bool init(const string& dict, const string& model, size_t maxWordLen)
|
34
|
+
{
|
35
|
+
LIMONP_CHECK(_mixSeg.init(dict, model));
|
36
|
+
LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
|
37
|
+
assert(maxWordLen);
|
38
|
+
_maxWordLen = maxWordLen;
|
39
|
+
return true;
|
40
|
+
}
|
41
|
+
|
42
|
+
public:
|
43
|
+
using SegmentBase::cut;
|
44
|
+
|
45
|
+
public:
|
46
|
+
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
|
47
|
+
{
|
48
|
+
if (begin >= end)
|
49
|
+
{
|
50
|
+
LogError("begin >= end");
|
51
|
+
return false;
|
52
|
+
}
|
53
|
+
|
54
|
+
//use mix cut first
|
55
|
+
vector<Unicode> mixRes;
|
56
|
+
if (!_mixSeg.cut(begin, end, mixRes))
|
57
|
+
{
|
58
|
+
LogError("_mixSeg cut failed.");
|
59
|
+
return false;
|
60
|
+
}
|
61
|
+
|
62
|
+
vector<Unicode> fullRes;
|
63
|
+
for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++)
|
64
|
+
{
|
65
|
+
|
66
|
+
// if it's too long, cut with _fullSeg, put fullRes in res
|
67
|
+
if (mixResItr->size() > _maxWordLen)
|
68
|
+
{
|
69
|
+
if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes))
|
70
|
+
{
|
71
|
+
for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
|
72
|
+
{
|
73
|
+
res.push_back(*fullResItr);
|
74
|
+
}
|
75
|
+
|
76
|
+
//clear tmp res
|
77
|
+
fullRes.clear();
|
78
|
+
}
|
79
|
+
}
|
80
|
+
else // just use the mix result
|
81
|
+
{
|
82
|
+
res.push_back(*mixResItr);
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
return true;
|
87
|
+
}
|
88
|
+
|
89
|
+
|
90
|
+
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
|
91
|
+
{
|
92
|
+
if (begin >= end)
|
93
|
+
{
|
94
|
+
LogError("begin >= end");
|
95
|
+
return false;
|
96
|
+
}
|
97
|
+
|
98
|
+
vector<Unicode> uRes;
|
99
|
+
if (!cut(begin, end, uRes))
|
100
|
+
{
|
101
|
+
LogError("get unicode cut result error.");
|
102
|
+
return false;
|
103
|
+
}
|
104
|
+
|
105
|
+
string tmp;
|
106
|
+
for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
|
107
|
+
{
|
108
|
+
if (TransCode::encode(*uItr, tmp))
|
109
|
+
{
|
110
|
+
res.push_back(tmp);
|
111
|
+
}
|
112
|
+
else
|
113
|
+
{
|
114
|
+
LogError("encode failed.");
|
115
|
+
}
|
116
|
+
}
|
117
|
+
|
118
|
+
return true;
|
119
|
+
}
|
120
|
+
};
|
121
|
+
}
|
122
|
+
|
123
|
+
#endif
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#ifndef CPPJIEBA_SEGMENTBASE_H
|
2
|
+
#define CPPJIEBA_SEGMENTBASE_H
|
3
|
+
|
4
|
+
#include "TransCode.hpp"
|
5
|
+
#include "Limonp/Logger.hpp"
|
6
|
+
#include "Limonp/NonCopyable.hpp"
|
7
|
+
#include "Limonp/HandyMacro.hpp"
|
8
|
+
#include "ISegment.hpp"
|
9
|
+
#include <cassert>
|
10
|
+
|
11
|
+
|
12
|
+
namespace CppJieba
|
13
|
+
{
|
14
|
+
using namespace Limonp;
|
15
|
+
|
16
|
+
//const char* const SPECIAL_CHARS = " \t\n";
|
17
|
+
#ifndef CPPJIEBA_GBK
|
18
|
+
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
|
19
|
+
#else
|
20
|
+
const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
|
21
|
+
#endif
|
22
|
+
|
23
|
+
class SegmentBase: public ISegment, public NonCopyable
|
24
|
+
{
|
25
|
+
public:
|
26
|
+
SegmentBase(){_loadSpecialSymbols();};
|
27
|
+
virtual ~SegmentBase(){};
|
28
|
+
private:
|
29
|
+
unordered_set<UnicodeValueType> _specialSymbols;
|
30
|
+
private:
|
31
|
+
void _loadSpecialSymbols()
|
32
|
+
{
|
33
|
+
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
|
34
|
+
for(size_t i = 0; i < size; i ++)
|
35
|
+
{
|
36
|
+
_specialSymbols.insert(SPECIAL_SYMBOL[i]);
|
37
|
+
}
|
38
|
+
assert(_specialSymbols.size());
|
39
|
+
}
|
40
|
+
|
41
|
+
public:
|
42
|
+
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
|
43
|
+
virtual bool cut(const string& str, vector<string>& res) const
|
44
|
+
{
|
45
|
+
res.clear();
|
46
|
+
|
47
|
+
Unicode unicode;
|
48
|
+
unicode.reserve(str.size());
|
49
|
+
|
50
|
+
TransCode::decode(str, unicode);
|
51
|
+
|
52
|
+
Unicode::const_iterator left = unicode.begin();
|
53
|
+
Unicode::const_iterator right;
|
54
|
+
|
55
|
+
for(right = unicode.begin(); right != unicode.end(); right++)
|
56
|
+
{
|
57
|
+
if(isIn(_specialSymbols, *right))
|
58
|
+
{
|
59
|
+
if(left != right)
|
60
|
+
{
|
61
|
+
cut(left, right, res);
|
62
|
+
}
|
63
|
+
res.resize(res.size() + 1);
|
64
|
+
TransCode::encode(right, right + 1, res.back());
|
65
|
+
left = right + 1;
|
66
|
+
}
|
67
|
+
}
|
68
|
+
if(left != right)
|
69
|
+
{
|
70
|
+
cut(left, right, res);
|
71
|
+
}
|
72
|
+
|
73
|
+
return true;
|
74
|
+
}
|
75
|
+
};
|
76
|
+
}
|
77
|
+
|
78
|
+
#endif
|
@@ -0,0 +1,63 @@
|
|
1
|
+
/************************************
|
2
|
+
* file enc : utf-8
|
3
|
+
* author : wuyanyi09@gmail.com
|
4
|
+
************************************/
|
5
|
+
#ifndef CPPJIEBA_TRANSCODE_H
|
6
|
+
#define CPPJIEBA_TRANSCODE_H
|
7
|
+
|
8
|
+
|
9
|
+
#include "Limonp/StringUtil.hpp"
|
10
|
+
#include "Limonp/LocalVector.hpp"
|
11
|
+
|
12
|
+
namespace CppJieba
|
13
|
+
{
|
14
|
+
|
15
|
+
using namespace Limonp;
|
16
|
+
typedef uint16_t UnicodeValueType;
|
17
|
+
typedef Limonp::LocalVector<UnicodeValueType> Unicode;
|
18
|
+
namespace TransCode
|
19
|
+
{
|
20
|
+
inline bool decode(const string& str, Unicode& res)
|
21
|
+
{
|
22
|
+
#ifdef CPPJIEBA_GBK
|
23
|
+
return gbkTrans(str, res);
|
24
|
+
#else
|
25
|
+
return utf8ToUnicode(str, res);
|
26
|
+
#endif
|
27
|
+
}
|
28
|
+
|
29
|
+
inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
|
30
|
+
{
|
31
|
+
#ifdef CPPJIEBA_GBK
|
32
|
+
return gbkTrans(begin, end, res);
|
33
|
+
#else
|
34
|
+
return unicodeToUtf8(begin, end, res);
|
35
|
+
#endif
|
36
|
+
}
|
37
|
+
|
38
|
+
inline bool encode(const Unicode& uni, string& res)
|
39
|
+
{
|
40
|
+
return encode(uni.begin(), uni.end(), res);
|
41
|
+
}
|
42
|
+
|
43
|
+
// compiler is expected to optimized this function to avoid return value copy
|
44
|
+
inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end)
|
45
|
+
{
|
46
|
+
string res;
|
47
|
+
res.reserve(end - begin);
|
48
|
+
encode(begin, end, res);
|
49
|
+
return res;
|
50
|
+
}
|
51
|
+
|
52
|
+
// compiler is expected to optimized this function to avoid return value copy
|
53
|
+
inline Unicode decode(const string& str)
|
54
|
+
{
|
55
|
+
Unicode unicode;
|
56
|
+
unicode.reserve(str.size());
|
57
|
+
decode(str, unicode);
|
58
|
+
return unicode;
|
59
|
+
}
|
60
|
+
}
|
61
|
+
}
|
62
|
+
|
63
|
+
#endif
|