jieba_rb 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (145) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +51 -0
  8. data/Rakefile +11 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +27 -0
  12. data/ext/cppjieba/ChangeLog.md +81 -0
  13. data/ext/cppjieba/Dockerfile +11 -0
  14. data/ext/cppjieba/LICENSE +20 -0
  15. data/ext/cppjieba/README.md +359 -0
  16. data/ext/cppjieba/conf/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/conf/server.conf +16 -0
  18. data/ext/cppjieba/dict/CMakeLists.txt +1 -0
  19. data/ext/cppjieba/dict/README.md +31 -0
  20. data/ext/cppjieba/dict/extra_dict/jieba.dict.small.utf8 +109750 -0
  21. data/ext/cppjieba/dict/gbk_dict/hmm_model.gbk +34 -0
  22. data/ext/cppjieba/dict/gbk_dict/jieba.dict.gbk +348982 -0
  23. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  24. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  25. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  26. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  27. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  28. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  29. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  30. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  31. data/ext/cppjieba/dict/user.dict.utf8 +3 -0
  32. data/ext/cppjieba/script/CMakeLists.txt +1 -0
  33. data/ext/cppjieba/script/cjserver.start +12 -0
  34. data/ext/cppjieba/script/cjserver.stop +13 -0
  35. data/ext/cppjieba/server/CMakeLists.txt +9 -0
  36. data/ext/cppjieba/server/Husky/HttpReqInfo.hpp +294 -0
  37. data/ext/cppjieba/server/Husky/IRequestHandler.hpp +18 -0
  38. data/ext/cppjieba/server/Husky/ThreadPoolServer.hpp +108 -0
  39. data/ext/cppjieba/server/Husky/WorkerThread.hpp +133 -0
  40. data/ext/cppjieba/server/server.cpp +91 -0
  41. data/ext/cppjieba/src/DictTrie.hpp +211 -0
  42. data/ext/cppjieba/src/FullSegment.hpp +153 -0
  43. data/ext/cppjieba/src/HMMSegment.hpp +394 -0
  44. data/ext/cppjieba/src/ISegment.hpp +17 -0
  45. data/ext/cppjieba/src/KeywordExtractor.hpp +173 -0
  46. data/ext/cppjieba/src/Limonp/ArgvContext.hpp +84 -0
  47. data/ext/cppjieba/src/Limonp/BlockingQueue.hpp +128 -0
  48. data/ext/cppjieba/src/Limonp/BoundedQueue.hpp +73 -0
  49. data/ext/cppjieba/src/Limonp/CastFloat.hpp +90 -0
  50. data/ext/cppjieba/src/Limonp/Condition.hpp +48 -0
  51. data/ext/cppjieba/src/Limonp/Config.hpp +118 -0
  52. data/ext/cppjieba/src/Limonp/HandyMacro.hpp +31 -0
  53. data/ext/cppjieba/src/Limonp/InitOnOff.hpp +21 -0
  54. data/ext/cppjieba/src/Limonp/LocalVector.hpp +171 -0
  55. data/ext/cppjieba/src/Limonp/Logger.hpp +74 -0
  56. data/ext/cppjieba/src/Limonp/Md5.hpp +432 -0
  57. data/ext/cppjieba/src/Limonp/MutexLock.hpp +57 -0
  58. data/ext/cppjieba/src/Limonp/MysqlClient.hpp +125 -0
  59. data/ext/cppjieba/src/Limonp/NonCopyable.hpp +22 -0
  60. data/ext/cppjieba/src/Limonp/StdExtension.hpp +139 -0
  61. data/ext/cppjieba/src/Limonp/StringUtil.hpp +349 -0
  62. data/ext/cppjieba/src/Limonp/Thread.hpp +50 -0
  63. data/ext/cppjieba/src/Limonp/ThreadPool.hpp +105 -0
  64. data/ext/cppjieba/src/MPSegment.hpp +148 -0
  65. data/ext/cppjieba/src/MixSegment.hpp +121 -0
  66. data/ext/cppjieba/src/PosTagger.hpp +109 -0
  67. data/ext/cppjieba/src/QuerySegment.hpp +123 -0
  68. data/ext/cppjieba/src/SegmentBase.hpp +78 -0
  69. data/ext/cppjieba/src/TransCode.hpp +63 -0
  70. data/ext/cppjieba/src/Trie.hpp +298 -0
  71. data/ext/cppjieba/test/CMakeLists.txt +7 -0
  72. data/ext/cppjieba/test/keyword_demo.cpp +16 -0
  73. data/ext/cppjieba/test/load_test.cpp +56 -0
  74. data/ext/cppjieba/test/segment_demo.cpp +59 -0
  75. data/ext/cppjieba/test/servertest/go_load_test.sh +2 -0
  76. data/ext/cppjieba/test/servertest/load_test.py +91 -0
  77. data/ext/cppjieba/test/servertest/run_curl.sh +11 -0
  78. data/ext/cppjieba/test/tagging_demo.cpp +12 -0
  79. data/ext/cppjieba/test/testdata/curl.res +1 -0
  80. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  81. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  82. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  83. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  84. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  85. data/ext/cppjieba/test/testdata/review.100 +100 -0
  86. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  87. data/ext/cppjieba/test/testdata/server.conf +13 -0
  88. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  89. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  90. data/ext/cppjieba/test/testdata/userdict.utf8 +6 -0
  91. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  92. data/ext/cppjieba/test/unittest/CMakeLists.txt +28 -0
  93. data/ext/cppjieba/test/unittest/TKeywordExtractor.cpp +18 -0
  94. data/ext/cppjieba/test/unittest/TPosTagger.cpp +43 -0
  95. data/ext/cppjieba/test/unittest/TSegments.cpp +187 -0
  96. data/ext/cppjieba/test/unittest/TTrie.cpp +80 -0
  97. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-death-test.h +283 -0
  98. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-message.h +230 -0
  99. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h +1421 -0
  100. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h.pump +487 -0
  101. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-printers.h +796 -0
  102. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-spi.h +232 -0
  103. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-test-part.h +176 -0
  104. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-typed-test.h +259 -0
  105. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest.h +2155 -0
  106. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_pred_impl.h +358 -0
  107. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_prod.h +58 -0
  108. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-death-test-internal.h +308 -0
  109. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-filepath.h +210 -0
  110. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-internal.h +1226 -0
  111. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-linked_ptr.h +233 -0
  112. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  113. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  114. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util.h +619 -0
  115. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h +1788 -0
  116. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-string.h +350 -0
  117. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h +968 -0
  118. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h.pump +336 -0
  119. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h +3330 -0
  120. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h.pump +296 -0
  121. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/.dirstamp +0 -0
  122. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest-all.Plo +681 -0
  123. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest_main.Plo +509 -0
  124. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.dirstamp +0 -0
  125. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-all.cc +48 -0
  126. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-death-test.cc +1234 -0
  127. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-filepath.cc +380 -0
  128. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-internal-inl.h +1038 -0
  129. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-port.cc +746 -0
  130. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-printers.cc +356 -0
  131. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-test-part.cc +110 -0
  132. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-typed-test.cc +110 -0
  133. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc +4898 -0
  134. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc +39 -0
  135. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  136. data/ext/jieba/extconf.rb +26 -0
  137. data/ext/jieba/jieba.c +9 -0
  138. data/ext/jieba/jieba.h +9 -0
  139. data/ext/jieba/segment.cc +88 -0
  140. data/ext/jieba/segment.h +17 -0
  141. data/jieba_rb.gemspec +51 -0
  142. data/lib/jieba_rb/version.rb +3 -0
  143. data/lib/jieba_rb.rb +28 -0
  144. data/test/test_segment.rb +32 -0
  145. metadata +246 -0
@@ -0,0 +1,148 @@
1
+ #ifndef CPPJIEBA_MPSEGMENT_H
2
+ #define CPPJIEBA_MPSEGMENT_H
3
+
4
+ #include <algorithm>
5
+ #include <set>
6
+ #include <cassert>
7
+ #include "Limonp/Logger.hpp"
8
+ #include "DictTrie.hpp"
9
+ #include "ISegment.hpp"
10
+ #include "SegmentBase.hpp"
11
+
12
+ namespace CppJieba
13
+ {
14
+
15
+ class MPSegment: public SegmentBase
16
+ {
17
+ private:
18
+ DictTrie _dictTrie;
19
+
20
+ public:
21
+ MPSegment(){};
22
+ MPSegment(const string& dictPath, const string& userDictPath = "")
23
+ {
24
+ LIMONP_CHECK(init(dictPath, userDictPath));
25
+ };
26
+ virtual ~MPSegment(){};
27
+ public:
28
+ bool init(const string& dictPath, const string& userDictPath = "")
29
+ {
30
+ LIMONP_CHECK(_dictTrie.init(dictPath, userDictPath));
31
+ LogInfo("MPSegment init(%s) ok", dictPath.c_str());
32
+ return true;
33
+ }
34
+ bool isUserDictSingleChineseWord(const Unicode::value_type & value) const
35
+ {
36
+ return _dictTrie.isUserDictSingleChineseWord(value);
37
+ }
38
+ public:
39
+ using SegmentBase::cut;
40
+ virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
41
+ {
42
+ if(begin == end)
43
+ {
44
+ return false;
45
+ }
46
+
47
+ vector<Unicode> words;
48
+ words.reserve(end - begin);
49
+ if(!cut(begin, end, words))
50
+ {
51
+ return false;
52
+ }
53
+ size_t offset = res.size();
54
+ res.resize(res.size() + words.size());
55
+ for(size_t i = 0; i < words.size(); i++)
56
+ {
57
+ if(!TransCode::encode(words[i], res[i + offset]))
58
+ {
59
+ LogError("encode failed.");
60
+ res[i + offset].clear();
61
+ }
62
+ }
63
+ return true;
64
+ }
65
+
66
+ bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<Unicode>& res) const
67
+ {
68
+ if(end == begin)
69
+ {
70
+ return false;
71
+ }
72
+ vector<SegmentChar> segmentChars;
73
+
74
+ _dictTrie.find(begin, end, segmentChars);
75
+
76
+ _calcDP(segmentChars);
77
+
78
+ _cut(segmentChars, res);
79
+
80
+ return true;
81
+ }
82
+ const DictTrie* getDictTrie() const
83
+ {
84
+ return &_dictTrie;
85
+ }
86
+
87
+ private:
88
+ void _calcDP(vector<SegmentChar>& segmentChars) const
89
+ {
90
+ size_t nextPos;
91
+ const DictUnit* p;
92
+ double val;
93
+
94
+ for(ssize_t i = segmentChars.size() - 1; i >= 0; i--)
95
+ {
96
+ segmentChars[i].pInfo = NULL;
97
+ segmentChars[i].weight = MIN_DOUBLE;
98
+ assert(!segmentChars[i].dag.empty());
99
+ for(DagType::const_iterator it = segmentChars[i].dag.begin(); it != segmentChars[i].dag.end(); it++)
100
+ {
101
+ nextPos = it->first;
102
+ p = it->second;
103
+ val = 0.0;
104
+ if(nextPos + 1 < segmentChars.size())
105
+ {
106
+ val += segmentChars[nextPos + 1].weight;
107
+ }
108
+
109
+ if(p)
110
+ {
111
+ val += p->weight;
112
+ }
113
+ else
114
+ {
115
+ val += _dictTrie.getMinWeight();
116
+ }
117
+ if(val > segmentChars[i].weight)
118
+ {
119
+ segmentChars[i].pInfo = p;
120
+ segmentChars[i].weight = val;
121
+ }
122
+ }
123
+ }
124
+ }
125
+ void _cut(const vector<SegmentChar>& segmentChars, vector<Unicode>& res) const
126
+ {
127
+ size_t i = 0;
128
+ while(i < segmentChars.size())
129
+ {
130
+ const DictUnit* p = segmentChars[i].pInfo;
131
+ if(p)
132
+ {
133
+ res.push_back(p->word);
134
+ i += p->word.size();
135
+ }
136
+ else//single chinese word
137
+ {
138
+ res.push_back(Unicode(1, segmentChars[i].uniCh));
139
+ i++;
140
+ }
141
+ }
142
+ }
143
+
144
+
145
+ };
146
+ }
147
+
148
+ #endif
@@ -0,0 +1,121 @@
1
+ #ifndef CPPJIEBA_MIXSEGMENT_H
2
+ #define CPPJIEBA_MIXSEGMENT_H
3
+
4
+ #include <cassert>
5
+ #include "MPSegment.hpp"
6
+ #include "HMMSegment.hpp"
7
+ #include "Limonp/StringUtil.hpp"
8
+
9
+ namespace CppJieba
10
+ {
11
+ class MixSegment: public SegmentBase
12
+ {
13
+ private:
14
+ MPSegment _mpSeg;
15
+ HMMSegment _hmmSeg;
16
+ public:
17
+ MixSegment(){};
18
+ MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
19
+ {
20
+ LIMONP_CHECK(init(mpSegDict, hmmSegDict, userDict));
21
+ }
22
+ virtual ~MixSegment(){}
23
+ public:
24
+ bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
25
+ {
26
+ LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
27
+ LIMONP_CHECK(_hmmSeg.init(hmmSegDict));
28
+ LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());
29
+ return true;
30
+ }
31
+ public:
32
+ using SegmentBase::cut;
33
+ public:
34
+ virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
35
+ {
36
+ vector<Unicode> words;
37
+ words.reserve(end - begin);
38
+ if(!_mpSeg.cut(begin, end, words))
39
+ {
40
+ LogError("mpSeg cutDAG failed.");
41
+ return false;
42
+ }
43
+
44
+ vector<Unicode> hmmRes;
45
+ hmmRes.reserve(end - begin);
46
+ Unicode piece;
47
+ piece.reserve(end - begin);
48
+ for (size_t i = 0, j = 0; i < words.size(); i++)
49
+ {
50
+ //if mp get a word, it's ok, put it into result
51
+ if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0])))
52
+ {
53
+ res.push_back(words[i]);
54
+ continue;
55
+ }
56
+
57
+ // if mp get a single one and it is not in userdict, collect it in sequence
58
+ j = i;
59
+ while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0]))
60
+ {
61
+ piece.push_back(words[j][0]);
62
+ j++;
63
+ }
64
+
65
+ // cut the sequence with hmm
66
+ if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes))
67
+ {
68
+ LogError("_hmmSeg cut failed.");
69
+ return false;
70
+ }
71
+
72
+ //put hmm result to result
73
+ for (size_t k = 0; k < hmmRes.size(); k++)
74
+ {
75
+ res.push_back(hmmRes[k]);
76
+ }
77
+
78
+ //clear tmp vars
79
+ piece.clear();
80
+ hmmRes.clear();
81
+
82
+ //let i jump over this piece
83
+ i = j - 1;
84
+ }
85
+ return true;
86
+ }
87
+
88
+ virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
89
+ {
90
+ if(begin == end)
91
+ {
92
+ return false;
93
+ }
94
+
95
+ vector<Unicode> uRes;
96
+ uRes.reserve(end - begin);
97
+ if (!cut(begin, end, uRes))
98
+ {
99
+ return false;
100
+ }
101
+
102
+ size_t offset = res.size();
103
+ res.resize(res.size() + uRes.size());
104
+ for(size_t i = 0; i < uRes.size(); i ++, offset++)
105
+ {
106
+ if(!TransCode::encode(uRes[i], res[offset]))
107
+ {
108
+ LogError("encode failed.");
109
+ }
110
+ }
111
+ return true;
112
+ }
113
+
114
+ const DictTrie* getDictTrie() const
115
+ {
116
+ return _mpSeg.getDictTrie();
117
+ }
118
+ };
119
+ }
120
+
121
+ #endif
@@ -0,0 +1,109 @@
1
+ #ifndef CPPJIEBA_POS_TAGGING_H
2
+ #define CPPJIEBA_POS_TAGGING_H
3
+
4
+ #include "MixSegment.hpp"
5
+ #include "Limonp/StringUtil.hpp"
6
+ #include "DictTrie.hpp"
7
+
8
+ namespace CppJieba
9
+ {
10
+ using namespace Limonp;
11
+
12
+ static const char* const POS_M = "m";
13
+ static const char* const POS_ENG = "eng";
14
+ static const char* const POS_X = "x";
15
+
16
+ class PosTagger
17
+ {
18
+ private:
19
+ MixSegment _segment;
20
+ const DictTrie * _dictTrie;
21
+
22
+ public:
23
+ PosTagger()
24
+ {}
25
+ PosTagger(
26
+ const string& dictPath,
27
+ const string& hmmFilePath,
28
+ const string& userDictPath = ""
29
+ )
30
+ {
31
+ init(dictPath, hmmFilePath, userDictPath);
32
+ };
33
+ ~PosTagger(){};
34
+ public:
35
+ void init(
36
+ const string& dictPath,
37
+ const string& hmmFilePath,
38
+ const string& userDictPath = ""
39
+ )
40
+ {
41
+ LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));
42
+ _dictTrie = _segment.getDictTrie();
43
+ LIMONP_CHECK(_dictTrie);
44
+ };
45
+
46
+
47
+ bool tag(const string& src, vector<pair<string, string> >& res) const
48
+ {
49
+ vector<string> cutRes;
50
+ if (!_segment.cut(src, cutRes))
51
+ {
52
+ LogError("_mixSegment cut failed");
53
+ return false;
54
+ }
55
+
56
+ const DictUnit *tmp = NULL;
57
+ Unicode unico;
58
+ for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr)
59
+ {
60
+ if (!TransCode::decode(*itr, unico))
61
+ {
62
+ LogError("decode failed.");
63
+ return false;
64
+ }
65
+ tmp = _dictTrie->find(unico.begin(), unico.end());
66
+ if(tmp == NULL || tmp->tag.empty())
67
+ {
68
+ res.push_back(make_pair(*itr, _specialRule(unico)));
69
+ }
70
+ else
71
+ {
72
+ res.push_back(make_pair(*itr, tmp->tag));
73
+ }
74
+ }
75
+ return !res.empty();
76
+ }
77
+ private:
78
+ const char* _specialRule(const Unicode& unicode) const
79
+ {
80
+ size_t m = 0;
81
+ size_t eng = 0;
82
+ for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++)
83
+ {
84
+ if(unicode[i] < 0x80)
85
+ {
86
+ eng ++;
87
+ if('0' <= unicode[i] && unicode[i] <= '9')
88
+ {
89
+ m++;
90
+ }
91
+ }
92
+ }
93
+ // ascii char is not found
94
+ if(eng == 0)
95
+ {
96
+ return POS_X;
97
+ }
98
+ // all the ascii is number char
99
+ if(m == eng)
100
+ {
101
+ return POS_M;
102
+ }
103
+ // the ascii chars contain english letter
104
+ return POS_ENG;
105
+ }
106
+ };
107
+ }
108
+
109
+ #endif
@@ -0,0 +1,123 @@
1
+ #ifndef CPPJIEBA_QUERYSEGMENT_H
2
+ #define CPPJIEBA_QUERYSEGMENT_H
3
+
4
+ #include <algorithm>
5
+ #include <set>
6
+ #include <cassert>
7
+ #include "Limonp/Logger.hpp"
8
+ #include "DictTrie.hpp"
9
+ #include "ISegment.hpp"
10
+ #include "SegmentBase.hpp"
11
+ #include "FullSegment.hpp"
12
+ #include "MixSegment.hpp"
13
+ #include "TransCode.hpp"
14
+ #include "DictTrie.hpp"
15
+
16
+ namespace CppJieba
17
+ {
18
+ class QuerySegment: public SegmentBase
19
+ {
20
+ private:
21
+ MixSegment _mixSeg;
22
+ FullSegment _fullSeg;
23
+ size_t _maxWordLen;
24
+
25
+ public:
26
+ QuerySegment(){};
27
+ QuerySegment(const string& dict, const string& model, size_t maxWordLen)
28
+ {
29
+ init(dict, model, maxWordLen);
30
+ };
31
+ virtual ~QuerySegment(){};
32
+ public:
33
+ bool init(const string& dict, const string& model, size_t maxWordLen)
34
+ {
35
+ LIMONP_CHECK(_mixSeg.init(dict, model));
36
+ LIMONP_CHECK(_fullSeg.init(_mixSeg.getDictTrie()));
37
+ assert(maxWordLen);
38
+ _maxWordLen = maxWordLen;
39
+ return true;
40
+ }
41
+
42
+ public:
43
+ using SegmentBase::cut;
44
+
45
+ public:
46
+ bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
47
+ {
48
+ if (begin >= end)
49
+ {
50
+ LogError("begin >= end");
51
+ return false;
52
+ }
53
+
54
+ //use mix cut first
55
+ vector<Unicode> mixRes;
56
+ if (!_mixSeg.cut(begin, end, mixRes))
57
+ {
58
+ LogError("_mixSeg cut failed.");
59
+ return false;
60
+ }
61
+
62
+ vector<Unicode> fullRes;
63
+ for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++)
64
+ {
65
+
66
+ // if it's too long, cut with _fullSeg, put fullRes in res
67
+ if (mixResItr->size() > _maxWordLen)
68
+ {
69
+ if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes))
70
+ {
71
+ for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++)
72
+ {
73
+ res.push_back(*fullResItr);
74
+ }
75
+
76
+ //clear tmp res
77
+ fullRes.clear();
78
+ }
79
+ }
80
+ else // just use the mix result
81
+ {
82
+ res.push_back(*mixResItr);
83
+ }
84
+ }
85
+
86
+ return true;
87
+ }
88
+
89
+
90
+ bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const
91
+ {
92
+ if (begin >= end)
93
+ {
94
+ LogError("begin >= end");
95
+ return false;
96
+ }
97
+
98
+ vector<Unicode> uRes;
99
+ if (!cut(begin, end, uRes))
100
+ {
101
+ LogError("get unicode cut result error.");
102
+ return false;
103
+ }
104
+
105
+ string tmp;
106
+ for (vector<Unicode>::const_iterator uItr = uRes.begin(); uItr != uRes.end(); uItr++)
107
+ {
108
+ if (TransCode::encode(*uItr, tmp))
109
+ {
110
+ res.push_back(tmp);
111
+ }
112
+ else
113
+ {
114
+ LogError("encode failed.");
115
+ }
116
+ }
117
+
118
+ return true;
119
+ }
120
+ };
121
+ }
122
+
123
+ #endif
@@ -0,0 +1,78 @@
1
+ #ifndef CPPJIEBA_SEGMENTBASE_H
2
+ #define CPPJIEBA_SEGMENTBASE_H
3
+
4
+ #include "TransCode.hpp"
5
+ #include "Limonp/Logger.hpp"
6
+ #include "Limonp/NonCopyable.hpp"
7
+ #include "Limonp/HandyMacro.hpp"
8
+ #include "ISegment.hpp"
9
+ #include <cassert>
10
+
11
+
12
+ namespace CppJieba
13
+ {
14
+ using namespace Limonp;
15
+
16
+ //const char* const SPECIAL_CHARS = " \t\n";
17
+ #ifndef CPPJIEBA_GBK
18
+ const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u, 12290u, 65292u};
19
+ #else
20
+ const UnicodeValueType SPECIAL_SYMBOL[] = {32u, 9u, 10u};
21
+ #endif
22
+
23
+ class SegmentBase: public ISegment, public NonCopyable
24
+ {
25
+ public:
26
+ SegmentBase(){_loadSpecialSymbols();};
27
+ virtual ~SegmentBase(){};
28
+ private:
29
+ unordered_set<UnicodeValueType> _specialSymbols;
30
+ private:
31
+ void _loadSpecialSymbols()
32
+ {
33
+ size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
34
+ for(size_t i = 0; i < size; i ++)
35
+ {
36
+ _specialSymbols.insert(SPECIAL_SYMBOL[i]);
37
+ }
38
+ assert(_specialSymbols.size());
39
+ }
40
+
41
+ public:
42
+ virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res) const = 0;
43
+ virtual bool cut(const string& str, vector<string>& res) const
44
+ {
45
+ res.clear();
46
+
47
+ Unicode unicode;
48
+ unicode.reserve(str.size());
49
+
50
+ TransCode::decode(str, unicode);
51
+
52
+ Unicode::const_iterator left = unicode.begin();
53
+ Unicode::const_iterator right;
54
+
55
+ for(right = unicode.begin(); right != unicode.end(); right++)
56
+ {
57
+ if(isIn(_specialSymbols, *right))
58
+ {
59
+ if(left != right)
60
+ {
61
+ cut(left, right, res);
62
+ }
63
+ res.resize(res.size() + 1);
64
+ TransCode::encode(right, right + 1, res.back());
65
+ left = right + 1;
66
+ }
67
+ }
68
+ if(left != right)
69
+ {
70
+ cut(left, right, res);
71
+ }
72
+
73
+ return true;
74
+ }
75
+ };
76
+ }
77
+
78
+ #endif
@@ -0,0 +1,63 @@
1
+ /************************************
2
+ * file enc : utf-8
3
+ * author : wuyanyi09@gmail.com
4
+ ************************************/
5
+ #ifndef CPPJIEBA_TRANSCODE_H
6
+ #define CPPJIEBA_TRANSCODE_H
7
+
8
+
9
+ #include "Limonp/StringUtil.hpp"
10
+ #include "Limonp/LocalVector.hpp"
11
+
12
+ namespace CppJieba
13
+ {
14
+
15
+ using namespace Limonp;
16
+ typedef uint16_t UnicodeValueType;
17
+ typedef Limonp::LocalVector<UnicodeValueType> Unicode;
18
+ namespace TransCode
19
+ {
20
+ inline bool decode(const string& str, Unicode& res)
21
+ {
22
+ #ifdef CPPJIEBA_GBK
23
+ return gbkTrans(str, res);
24
+ #else
25
+ return utf8ToUnicode(str, res);
26
+ #endif
27
+ }
28
+
29
+ inline bool encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res)
30
+ {
31
+ #ifdef CPPJIEBA_GBK
32
+ return gbkTrans(begin, end, res);
33
+ #else
34
+ return unicodeToUtf8(begin, end, res);
35
+ #endif
36
+ }
37
+
38
+ inline bool encode(const Unicode& uni, string& res)
39
+ {
40
+ return encode(uni.begin(), uni.end(), res);
41
+ }
42
+
43
+ // compiler is expected to optimized this function to avoid return value copy
44
+ inline string encode(Unicode::const_iterator begin, Unicode::const_iterator end)
45
+ {
46
+ string res;
47
+ res.reserve(end - begin);
48
+ encode(begin, end, res);
49
+ return res;
50
+ }
51
+
52
+ // compiler is expected to optimized this function to avoid return value copy
53
+ inline Unicode decode(const string& str)
54
+ {
55
+ Unicode unicode;
56
+ unicode.reserve(str.size());
57
+ decode(str, unicode);
58
+ return unicode;
59
+ }
60
+ }
61
+ }
62
+
63
+ #endif