cppjieba_rb 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +26 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +81 -0
  8. data/Rakefile +20 -0
  9. data/cppjieba_rb.gemspec +50 -0
  10. data/ext/cppjieba/.gitignore +17 -0
  11. data/ext/cppjieba/.travis.yml +22 -0
  12. data/ext/cppjieba/CMakeLists.txt +28 -0
  13. data/ext/cppjieba/ChangeLog.md +236 -0
  14. data/ext/cppjieba/README.md +285 -0
  15. data/ext/cppjieba/README_EN.md +111 -0
  16. data/ext/cppjieba/appveyor.yml +32 -0
  17. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  18. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  42. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  45. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  46. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  56. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  57. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  58. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  59. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  60. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  61. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  62. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  63. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  64. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  65. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  66. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  67. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  68. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  69. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  70. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  71. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  72. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  73. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  74. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  75. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  76. data/ext/cppjieba/dict/README.md +31 -0
  77. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  78. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  79. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  80. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  83. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  84. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  85. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  86. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  87. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  88. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  89. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  90. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  91. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  92. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  93. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  94. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  95. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  96. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  98. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  99. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  100. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  101. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  102. data/ext/cppjieba/test/CMakeLists.txt +5 -0
  103. data/ext/cppjieba/test/demo.cpp +80 -0
  104. data/ext/cppjieba/test/load_test.cpp +54 -0
  105. data/ext/cppjieba/test/testdata/curl.res +1 -0
  106. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  107. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  108. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  109. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  110. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  111. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  112. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  113. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  114. data/ext/cppjieba/test/testdata/review.100 +100 -0
  115. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  116. data/ext/cppjieba/test/testdata/server.conf +19 -0
  117. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  118. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  119. data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  120. data/ext/cppjieba/test/testdata/userdict.english +2 -0
  121. data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
  122. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  123. data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
  124. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  125. data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
  126. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  127. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  128. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  129. data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
  130. data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
  131. data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
  132. data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
  133. data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
  134. data/ext/cppjieba_rb/extconf.rb +26 -0
  135. data/ext/cppjieba_rb/internal.cc +148 -0
  136. data/lib/cppjieba_rb/segment.rb +20 -0
  137. data/lib/cppjieba_rb/version.rb +3 -0
  138. data/lib/cppjieba_rb.rb +34 -0
  139. data/test/test_keyword.rb +17 -0
  140. data/test/test_segment.rb +24 -0
  141. data/test/test_tagging.rb +19 -0
  142. metadata +244 -0
@@ -0,0 +1,285 @@
1
+ # CppJieba [English](README_EN.md)
2
+
3
+ [![Build Status](https://travis-ci.org/yanyiwu/cppjieba.png?branch=master)](https://travis-ci.org/yanyiwu/cppjieba)
4
+ [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/)
5
+ [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba)
6
+ [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
7
+ [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
8
+ [![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)
9
+
10
+ [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
11
+
12
+ ## 简介
13
+
14
+ CppJieba是"结巴(Jieba)"中文分词的C++版本
15
+
16
+ ## 特性
17
+
18
+ + 源代码都写进头文件`include/cppjieba/*.hpp`里,`include`即可使用。
19
+ + 支持`utf8`编码。
20
+ + 项目自带较为完善的单元测试,核心功能中文分词(utf8)的稳定性接受过线上环境检验。
21
+ + 支持加载自定义用户词典,多路径时使用'|'或者';'分隔。
22
+ + 支持 `Linux` , `Mac OSX`, `Windows` 操作系统。
23
+
24
+ ## 用法
25
+
26
+ ### 依赖软件
27
+
28
+ * `g++ (version >= 4.1 is recommended) or clang++`;
29
+ * `cmake (version >= 2.6 is recommended)`;
30
+
31
+ ### 下载和编译
32
+
33
+ ```sh
34
+ git clone --depth=10 --branch=master https://github.com/yanyiwu/cppjieba.git
35
+ cd cppjieba
36
+ mkdir build
37
+ cd build
38
+ cmake ..
39
+ make
40
+ ```
41
+
42
+ 有兴趣的可以跑跑测试(可选):
43
+
44
+ ```
45
+ make test
46
+ ```
47
+
48
+ ## Demo
49
+
50
+ ```
51
+ ./demo
52
+ ```
53
+
54
+ 结果示例:
55
+
56
+ ```
57
+ [demo] Cut With HMM
58
+ 他/来到/了/网易/杭研/大厦
59
+ [demo] Cut Without HMM
60
+ 他/来到/了/网易/杭/研/大厦
61
+ 我来到北京清华大学
62
+ [demo] CutAll
63
+ 我/来到/北京/清华/清华大学/华大/大学
64
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
65
+ [demo] CutForSearch
66
+ 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
67
+ [demo] Insert User Word
68
+ 男默/女泪
69
+ 男默女泪
70
+ [demo] CutForSearch Word With Offset
71
+ [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
72
+ [demo] Tagging
73
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
74
+ [我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]
75
+ [demo] Keyword Extraction
76
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
77
+ [{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}]
78
+ ```
79
+
80
+ 详细请看 `test/demo.cpp`.
81
+
82
+ ### 分词结果示例
83
+
84
+ **MPSegment**
85
+
86
+ Output:
87
+ ```
88
+ 我来到北京清华大学
89
+ 我/来到/北京/清华大学
90
+
91
+ 他来到了网易杭研大厦
92
+ 他/来到/了/网易/杭/研/大厦
93
+
94
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
95
+ 小/明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造
96
+
97
+ ```
98
+
99
+ **HMMSegment**
100
+
101
+ ```
102
+ 我来到北京清华大学
103
+ 我来/到/北京/清华大学
104
+
105
+ 他来到了网易杭研大厦
106
+ 他来/到/了/网易/杭/研大厦
107
+
108
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
109
+ 小明/硕士/毕业于/中国/科学院/计算所/,/后/在/日/本/京/都/大/学/深/造
110
+
111
+ ```
112
+
113
+ **MixSegment**
114
+
115
+ ```
116
+ 我来到北京清华大学
117
+ 我/来到/北京/清华大学
118
+
119
+ 他来到了网易杭研大厦
120
+ 他/来到/了/网易/杭研/大厦
121
+
122
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
123
+ 小明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造
124
+
125
+ ```
126
+
127
+ **FullSegment**
128
+
129
+ ```
130
+ 我来到北京清华大学
131
+ 我/来到/北京/清华/清华大学/华大/大学
132
+
133
+ 他来到了网易杭研大厦
134
+ 他/来到/了/网易/杭/研/大厦
135
+
136
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
137
+ 小/明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算/计算所/,/后/在/日本/日本京都大学/京都/京都大学/大学/深造
138
+
139
+ ```
140
+
141
+ **QuerySegment**
142
+
143
+ ```
144
+ 我来到北京清华大学
145
+ 我/来到/北京/清华/清华大学/华大/大学
146
+
147
+ 他来到了网易杭研大厦
148
+ 他/来到/了/网易/杭研/大厦
149
+
150
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
151
+ 小明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算所/,/后/在/中国/中国科学院/科学/科学院/学院/日本/日本京都大学/京都/京都大学/大学/深造
152
+
153
+ ```
154
+
155
+ 以上依次是MP,HMM,Mix三种方法的效果。
156
+
157
+ 可以看出效果最好的是Mix,也就是融合MP和HMM的切词算法。既可以准确切出词典已有的词,又可以切出像"杭研"这样的未登录词。
158
+
159
+ Full方法切出所有字典里的词语。
160
+
161
+ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。
162
+
163
+ ### 自定义用户词典
164
+
165
+ 自定义词典示例请看`dict/user.dict.utf8`。
166
+
167
+ 没有使用自定义用户词典时的结果:
168
+
169
+ ```
170
+ 令狐冲/是/云/计算/行业/的/专家
171
+ ```
172
+
173
+ 使用自定义用户词典时的结果:
174
+
175
+ ```
176
+ 令狐冲/是/云计算/行业/的/专家
177
+ ```
178
+
179
+ ### 关键词抽取
180
+
181
+ ```
182
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
183
+ ["CEO:11.7392", "升职:10.8562", "加薪:10.6426", "手扶拖拉机:10.0089", "巅峰:9.49396"]
184
+ ```
185
+
186
+ 详细请见 `test/demo.cpp`.
187
+
188
+ ### 词性标注
189
+
190
+ ```
191
+ 我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。
192
+ ["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
193
+ ```
194
+
195
+ 详细请看 `test/demo.cpp`.
196
+
197
+ 支持自定义词性。
198
+ 比如在(`dict/user.dict.utf8`)增加一行
199
+
200
+ ```
201
+ 蓝翔 nz
202
+ ```
203
+
204
+ 结果如下:
205
+
206
+ ```
207
+ ["我:r", "是:v", "蓝翔:nz", "技工:n", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当:t", "上:f", "总经理:n", ",:x", "出任:v", "CEO:eng", ",:x", "迎娶:v", "白富美:x", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
208
+ ```
209
+
210
+ ## 其它词典资料分享
211
+
212
+ + [dict.367W.utf8] iLife(562193561 at qq.com)
213
+
214
+ ## 应用
215
+
216
+ + [GoJieba] go语言版本的结巴中文分词。
217
+ + [NodeJieba] Node.js 版本的结巴中文分词。
218
+ + [simhash] 中文文档的相似度计算
219
+ + [exjieba] Erlang 版本的结巴中文分词。
220
+ + [jiebaR] R语言版本的结巴中文分词。
221
+ + [cjieba] C语言版本的结巴分词。
222
+ + [jieba_rb] Ruby 版本的结巴分词。
223
+ + [iosjieba] iOS 版本的结巴分词。
224
+ + [SqlJieba] MySQL 全文索引的结巴中文分词插件。
225
+ + [pg_jieba] PostgreSQL 数据库的分词插件。
226
+ + [gitbook-plugin-search-pro] 支持中文搜索的 gitbook 插件。
227
+ + [ngx_http_cppjieba_module] Nginx 分词插件。
228
+ + [cppjiebapy] 由 [jannson] 开发的供 python 模块调用的项目 [cppjiebapy], 相关讨论 [cppjiebapy_discussion] .
229
+ + [KeywordServer] 50行搭建一个中文关键词抽取服务。
230
+ + [cppjieba-server] CppJieba HTTP 服务器。
231
+
232
+ ## 线上演示
233
+
234
+ [Web-Demo](http://cppjieba-webdemo.herokuapp.com/)
235
+ (建议使用chrome打开)
236
+
237
+ ## 性能评测
238
+
239
+ [Jieba中文分词系列性能评测]
240
+
241
+ ## 客服
242
+
243
+ + Email: `i@yanyiwu.com`
244
+ + QQ: 64162451
245
+ + WeChat: ![image](http://7viirv.com1.z0.glb.clouddn.com/5a7d1b5c0d_yanyiwu_personal_qrcodes.jpg)
246
+
247
+ ## 鸣谢
248
+
249
+ "结巴"中文分词作者: [SunJunyi](https://github.com/fxsjy)
250
+
251
+ ## 许可证
252
+
253
+ [MIT](http://yanyiwu.mit-license.org)
254
+
255
+ ## 作者
256
+
257
+ - [yanyiwu](http://yanyiwu.com)
258
+ - [aholic](https://github.com/aholic)
259
+
260
+ [GoJieba]:https://github.com/yanyiwu/gojieba
261
+ [CppJieba]:https://github.com/yanyiwu/cppjieba
262
+ [jannson]:https://github.com/jannson
263
+ [cppjiebapy]:https://github.com/jannson/cppjiebapy
264
+ [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
265
+ [NodeJieba]:https://github.com/yanyiwu/nodejieba
266
+ [jiebaR]:https://github.com/qinwf/jiebaR
267
+ [simhash]:https://github.com/yanyiwu/simhash
268
+ [代码详解]:https://github.com/yanyiwu/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
269
+ [issue25]:https://github.com/yanyiwu/cppjieba/issues/25
270
+ [exjieba]:https://github.com/falood/exjieba
271
+ [KeywordServer]:https://github.com/yanyiwu/keyword_server
272
+ [ngx_http_cppjieba_module]:https://github.com/yanyiwu/ngx_http_cppjieba_module
273
+ [dict.367W.utf8]:https://github.com/qinwf/BigDict
274
+ [cjieba]:http://github.com/yanyiwu/cjieba
275
+ [jieba_rb]:https://github.com/altkatz/jieba_rb
276
+ [iosjieba]:https://github.com/yanyiwu/iosjieba
277
+ [SqlJieba]:https://github.com/yanyiwu/sqljieba
278
+ [Jieba中文分词系列性能评测]:http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html
279
+ [pg_jieba]:https://github.com/jaiminpan/pg_jieba
280
+ [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
281
+ [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
282
+
283
+
284
+ [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/yanyiwu/cppjieba/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
285
+
@@ -0,0 +1,111 @@
1
+ # CppJieba [简体中文](README.md)
2
+
3
+ [![Build Status](https://travis-ci.org/yanyiwu/cppjieba.png?branch=master)](https://travis-ci.org/yanyiwu/cppjieba)
4
+ [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/)
5
+ [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba)
6
+ [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
7
+ [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
8
+ [![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)
9
+
10
+ [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
11
+
12
+ ## Introduction
13
+
14
+ CppJieba is a C++ implementation of Jieba, the Chinese word segmentation library.
15
+
16
+ ## Usage
17
+
18
+ ### Dependencies
19
+
20
+ + `g++ (version >= 4.1 is recommended) or clang++`;
21
+ + `cmake (version >= 2.6 is recommended)`;
22
+
23
+ ### Download & Compile
24
+
25
+ ```sh
26
+ git clone --depth=10 --branch=master https://github.com/yanyiwu/cppjieba.git
27
+ cd cppjieba
28
+ mkdir build
29
+ cd build
30
+ cmake ..
31
+ make
32
+ ```
33
+
34
+ ### Unit Testing
35
+
36
+ ```
37
+ make test
38
+ ```
39
+
40
+ ## Demo
41
+
42
+ ```
43
+ ./demo
44
+ ```
45
+
46
+ Output:
47
+
48
+ ```
49
+ [demo] Cut With HMM
50
+ 他/来到/了/网易/杭研/大厦
51
+ [demo] Cut Without HMM
52
+ 他/来到/了/网易/杭/研/大厦
53
+ 我来到北京清华大学
54
+ [demo] CutAll
55
+ 我/来到/北京/清华/清华大学/华大/大学
56
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
57
+ [demo] CutForSearch
58
+ 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
59
+ [demo] Insert User Word
60
+ 男默/女泪
61
+ 男默女泪
62
+ [demo] CutForSearch Word With Offset
63
+ [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
64
+ [demo] Tagging
65
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
66
+ [我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]
67
+ [demo] Keyword Extraction
68
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
69
+ [{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}]
70
+ ```
71
+
72
+ Please see details in `test/demo.cpp`.
73
+
74
+ ## Cases
75
+
76
+ + [GoJieba]
77
+ + [NodeJieba]
78
+ + [simhash]
79
+ + [exjieba]
80
+ + [jiebaR]
81
+ + [cjieba]
82
+ + [jieba_rb]
83
+ + [iosjieba]
84
+ + [SqlJieba]
85
+ + [pg_jieba]
86
+ + [ngx_http_cppjieba_module]
87
+ + [gitbook-plugin-search-pro]
88
+ + [cppjieba-server]
89
+
90
+ ## Contact
91
+
92
+ + Email: `i@yanyiwu.com`
93
+ + QQ: 64162451
94
+ + WeChat: ![image](http://7viirv.com1.z0.glb.clouddn.com/5a7d1b5c0d_yanyiwu_personal_qrcodes.jpg)
95
+
96
+ [GoJieba]:https://github.com/yanyiwu/gojieba
97
+ [CppJieba]:https://github.com/yanyiwu/cppjieba
98
+ [jannson]:https://github.com/jannson
99
+ [cppjiebapy]:https://github.com/jannson/cppjiebapy
100
+ [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
101
+ [NodeJieba]:https://github.com/yanyiwu/nodejieba
102
+ [jiebaR]:https://github.com/qinwf/jiebaR
103
+ [simhash]:https://github.com/yanyiwu/simhash
104
+ [exjieba]:https://github.com/falood/exjieba
105
+ [cjieba]:http://github.com/yanyiwu/cjieba
106
+ [jieba_rb]:https://github.com/altkatz/jieba_rb
107
+ [iosjieba]:https://github.com/yanyiwu/iosjieba
108
+ [SqlJieba]:https://github.com/yanyiwu/sqljieba
109
+ [pg_jieba]:https://github.com/jaiminpan/pg_jieba
110
+ [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
111
+ [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
@@ -0,0 +1,32 @@
1
+ os: Visual Studio 2015
2
+
3
+ platform: x64
4
+
5
+ # clone directory
6
+ clone_folder: c:\projects\cppjieba
7
+
8
+ # scripts to run before build
9
+ before_build:
10
+ - echo Running cmake...
11
+ - cd c:\projects\cppjieba
12
+ - cmake .
13
+
14
+ build:
15
+ project: ALL_BUILD.vcxproj # path to Visual Studio solution or project
16
+
17
+ # scripts to run after build
18
+ after_build:
19
+ - cd Debug
20
+ - demo.exe
21
+ - load_test.exe
22
+ - cd ..
23
+ - COPY .\test\Debug\test.run.exe .\test\test.run.exe
24
+ - cd test
25
+ - test.run.exe
26
+ - cd ..
27
+ - 7z a c:\projects\all.zip * -tzip
28
+ - cd c:\projects
29
+
30
+ artifacts:
31
+ - path: all.zip
32
+ name: all.zip
@@ -0,0 +1 @@
1
+ ADD_SUBDIRECTORY(gtest)
@@ -0,0 +1,5 @@
1
+ INCLUDE_DIRECTORIES(./ include)
2
+ ADD_LIBRARY(gtest STATIC src/gtest-all.cc)
3
+ if(NOT MSVC)
4
+ TARGET_LINK_LIBRARIES(gtest pthread)
5
+ endif()