jieba-rb 5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +19 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +85 -0
  8. data/Rakefile +15 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +28 -0
  12. data/ext/cppjieba/ChangeLog.md +236 -0
  13. data/ext/cppjieba/README.md +285 -0
  14. data/ext/cppjieba/README_EN.md +111 -0
  15. data/ext/cppjieba/appveyor.yml +32 -0
  16. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  45. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  46. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  56. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  57. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  58. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  59. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  60. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  61. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  62. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  63. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  64. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  65. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  66. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  67. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  68. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  69. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  70. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  71. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  72. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  73. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  74. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  75. data/ext/cppjieba/dict/README.md +31 -0
  76. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  77. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  78. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  79. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  80. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  83. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  84. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  85. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  86. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  87. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  88. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  89. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  90. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  91. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  92. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  93. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  94. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  95. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  96. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
  98. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  99. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  100. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  101. data/ext/jieba/extconf.rb +28 -0
  102. data/ext/jieba/jieba.c +11 -0
  103. data/ext/jieba/jieba.h +11 -0
  104. data/ext/jieba/keyword.cc +92 -0
  105. data/ext/jieba/keyword.h +17 -0
  106. data/ext/jieba/segment.cc +107 -0
  107. data/ext/jieba/segment.h +17 -0
  108. data/ext/jieba/tagging.cc +76 -0
  109. data/ext/jieba/tagging.h +17 -0
  110. data/jieba_rb.gemspec +51 -0
  111. data/lib/jieba-rb.rb +66 -0
  112. data/lib/jieba_rb/version.rb +3 -0
  113. data/test/test_keyword.rb +17 -0
  114. data/test/test_segment.rb +32 -0
  115. data/test/test_tagging.rb +22 -0
  116. data/test/user.dict.utf8 +23 -0
  117. metadata +219 -0
@@ -0,0 +1,285 @@
1
+ # CppJieba [English](README_EN.md)
2
+
3
+ [![Build Status](https://travis-ci.org/yanyiwu/cppjieba.png?branch=master)](https://travis-ci.org/yanyiwu/cppjieba)
4
+ [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/)
5
+ [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba)
6
+ [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
7
+ [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
8
+ [![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)
9
+
10
+ [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
11
+
12
+ ## 简介
13
+
14
+ CppJieba是"结巴(Jieba)"中文分词的C++版本
15
+
16
+ ## 特性
17
+
18
+ + 源代码都写进头文件`include/cppjieba/*.hpp`里,`include`即可使用。
19
+ + 支持`utf8`编码。
20
+ + 项目自带较为完善的单元测试,核心功能中文分词(utf8)的稳定性接受过线上环境检验。
21
+ + 支持加载自定义用户词典,多路径时支持分隔符'|'或者';'分隔。
22
+ + 支持 `Linux` , `Mac OSX`, `Windows` 操作系统。
23
+
24
+ ## 用法
25
+
26
+ ### 依赖软件
27
+
28
+ * `g++ (version >= 4.1 is recommended) or clang++`;
29
+ * `cmake (version >= 2.6 is recommended)`;
30
+
31
+ ### 下载和编译
32
+
33
+ ```sh
34
+ git clone --depth=10 --branch=master git://github.com/yanyiwu/cppjieba.git
35
+ cd cppjieba
36
+ mkdir build
37
+ cd build
38
+ cmake ..
39
+ make
40
+ ```
41
+
42
+ 有兴趣的可以跑跑测试(可选):
43
+
44
+ ```
45
+ make test
46
+ ```
47
+
48
+ ## Demo
49
+
50
+ ```
51
+ ./demo
52
+ ```
53
+
54
+ 结果示例:
55
+
56
+ ```
57
+ [demo] Cut With HMM
58
+ 他/来到/了/网易/杭研/大厦
59
+ [demo] Cut Without HMM
60
+ 他/来到/了/网易/杭/研/大厦
61
+ 我来到北京清华大学
62
+ [demo] CutAll
63
+ 我/来到/北京/清华/清华大学/华大/大学
64
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
65
+ [demo] CutForSearch
66
+ 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
67
+ [demo] Insert User Word
68
+ 男默/女泪
69
+ 男默女泪
70
+ [demo] CutForSearch Word With Offset
71
+ [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
72
+ [demo] Tagging
73
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
74
+ [我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]
75
+ [demo] Keyword Extraction
76
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
77
+ [{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}]
78
+ ```
79
+
80
+ 详细请看 `test/demo.cpp`.
81
+
82
+ ### 分词结果示例
83
+
84
+ **MPSegment**
85
+
86
+ Output:
87
+ ```
88
+ 我来到北京清华大学
89
+ 我/来到/北京/清华大学
90
+
91
+ 他来到了网易杭研大厦
92
+ 他/来到/了/网易/杭/研/大厦
93
+
94
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
95
+ 小/明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造
96
+
97
+ ```
98
+
99
+ **HMMSegment**
100
+
101
+ ```
102
+ 我来到北京清华大学
103
+ 我来/到/北京/清华大学
104
+
105
+ 他来到了网易杭研大厦
106
+ 他来/到/了/网易/杭/研大厦
107
+
108
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
109
+ 小明/硕士/毕业于/中国/科学院/计算所/,/后/在/日/本/京/都/大/学/深/造
110
+
111
+ ```
112
+
113
+ **MixSegment**
114
+
115
+ ```
116
+ 我来到北京清华大学
117
+ 我/来到/北京/清华大学
118
+
119
+ 他来到了网易杭研大厦
120
+ 他/来到/了/网易/杭研/大厦
121
+
122
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
123
+ 小明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造
124
+
125
+ ```
126
+
127
+ **FullSegment**
128
+
129
+ ```
130
+ 我来到北京清华大学
131
+ 我/来到/北京/清华/清华大学/华大/大学
132
+
133
+ 他来到了网易杭研大厦
134
+ 他/来到/了/网易/杭/研/大厦
135
+
136
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
137
+ 小/明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算/计算所/,/后/在/日本/日本京都大学/京都/京都大学/大学/深造
138
+
139
+ ```
140
+
141
+ **QuerySegment**
142
+
143
+ ```
144
+ 我来到北京清华大学
145
+ 我/来到/北京/清华/清华大学/华大/大学
146
+
147
+ 他来到了网易杭研大厦
148
+ 他/来到/了/网易/杭研/大厦
149
+
150
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
151
+ 小明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算所/,/后/在/中国/中国科学院/科学/科学院/学院/日本/日本京都大学/京都/京都大学/大学/深造
152
+
153
+ ```
154
+
155
+ 以上依次是MP,HMM,Mix三种方法的效果。
156
+
157
+ 可以看出效果最好的是Mix,也就是融合MP和HMM的切词算法。既可以准确切出词典已有的词,又可以切出像"杭研"这样的未登录词。
158
+
159
+ Full方法切出所有字典里的词语。
160
+
161
+ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。
162
+
163
+ ### 自定义用户词典
164
+
165
+ 自定义词典示例请看`dict/user.dict.utf8`。
166
+
167
+ 没有使用自定义用户词典时的结果:
168
+
169
+ ```
170
+ 令狐冲/是/云/计算/行业/的/专家
171
+ ```
172
+
173
+ 使用自定义用户词典时的结果:
174
+
175
+ ```
176
+ 令狐冲/是/云计算/行业/的/专家
177
+ ```
178
+
179
+ ### 关键词抽取
180
+
181
+ ```
182
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
183
+ ["CEO:11.7392", "升职:10.8562", "加薪:10.6426", "手扶拖拉机:10.0089", "巅峰:9.49396"]
184
+ ```
185
+
186
+ 详细请见 `test/demo.cpp`.
187
+
188
+ ### 词性标注
189
+
190
+ ```
191
+ 我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。
192
+ ["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
193
+ ```
194
+
195
+ 详细请看 `test/demo.cpp`.
196
+
197
+ 支持自定义词性。
198
+ 比如在(`dict/user.dict.utf8`)增加一行
199
+
200
+ ```
201
+ 蓝翔 nz
202
+ ```
203
+
204
+ 结果如下:
205
+
206
+ ```
207
+ ["我:r", "是:v", "蓝翔:nz", "技工:n", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当:t", "上:f", "总经理:n", ",:x", "出任:v", "CEO:eng", ",:x", "迎娶:v", "白富美:x", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
208
+ ```
209
+
210
+ ## 其它词典资料分享
211
+
212
+ + [dict.367W.utf8] iLife(562193561 at qq.com)
213
+
214
+ ## 应用
215
+
216
+ + [GoJieba] go语言版本的结巴中文分词。
217
+ + [NodeJieba] Node.js 版本的结巴中文分词。
218
+ + [simhash] 中文文档的相似度计算。
219
+ + [exjieba] Erlang 版本的结巴中文分词。
220
+ + [jiebaR] R语言版本的结巴中文分词。
221
+ + [cjieba] C语言版本的结巴分词。
222
+ + [jieba_rb] Ruby 版本的结巴分词。
223
+ + [iosjieba] iOS 版本的结巴分词。
224
+ + [SqlJieba] MySQL 全文索引的结巴中文分词插件。
225
+ + [pg_jieba] PostgreSQL 数据库的分词插件。
226
+ + [gitbook-plugin-search-pro] 支持中文搜索的 gitbook 插件。
227
+ + [ngx_http_cppjieba_module] Nginx 分词插件。
228
+ + [cppjiebapy] 由 [jannson] 开发的供 python 模块调用的项目 [cppjiebapy], 相关讨论 [cppjiebapy_discussion] .
229
+ + [KeywordServer] 50行搭建一个中文关键词抽取服务。
230
+ + [cppjieba-server] CppJieba HTTP 服务器。
231
+
232
+ ## 线上演示
233
+
234
+ [Web-Demo](http://cppjieba-webdemo.herokuapp.com/)
235
+ (建议使用chrome打开)
236
+
237
+ ## 性能评测
238
+
239
+ [Jieba中文分词系列性能评测]
240
+
241
+ ## 客服
242
+
243
+ + Email: `i@yanyiwu.com`
244
+ + QQ: 64162451
245
+ + WeChat: ![image](http://7viirv.com1.z0.glb.clouddn.com/5a7d1b5c0d_yanyiwu_personal_qrcodes.jpg)
246
+
247
+ ## 鸣谢
248
+
249
+ "结巴"中文分词作者: [SunJunyi](https://github.com/fxsjy)
250
+
251
+ ## 许可证
252
+
253
+ [MIT](http://yanyiwu.mit-license.org)
254
+
255
+ ## 作者
256
+
257
+ - [yanyiwu](http://yanyiwu.com)
258
+ - [aholic](https://github.com/aholic)
259
+
260
+ [GoJieba]:https://github.com/yanyiwu/gojieba
261
+ [CppJieba]:https://github.com/yanyiwu/cppjieba
262
+ [jannson]:https://github.com/jannson
263
+ [cppjiebapy]:https://github.com/jannson/cppjiebapy
264
+ [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
265
+ [NodeJieba]:https://github.com/yanyiwu/nodejieba
266
+ [jiebaR]:https://github.com/qinwf/jiebaR
267
+ [simhash]:https://github.com/yanyiwu/simhash
268
+ [代码详解]:https://github.com/yanyiwu/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
269
+ [issue25]:https://github.com/yanyiwu/cppjieba/issues/25
270
+ [exjieba]:https://github.com/falood/exjieba
271
+ [KeywordServer]:https://github.com/yanyiwu/keyword_server
272
+ [ngx_http_cppjieba_module]:https://github.com/yanyiwu/ngx_http_cppjieba_module
273
+ [dict.367W.utf8]:https://github.com/qinwf/BigDict
274
+ [cjieba]:http://github.com/yanyiwu/cjieba
275
+ [jieba_rb]:https://github.com/altkatz/jieba_rb
276
+ [iosjieba]:https://github.com/yanyiwu/iosjieba
277
+ [SqlJieba]:https://github.com/yanyiwu/sqljieba
278
+ [Jieba中文分词系列性能评测]:http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html
279
+ [pg_jieba]:https://github.com/jaiminpan/pg_jieba
280
+ [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
281
+ [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
282
+
283
+
284
+ [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/yanyiwu/cppjieba/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
285
+
@@ -0,0 +1,111 @@
1
+ # CppJieba [简体中文](README.md)
2
+
3
+ [![Build Status](https://travis-ci.org/yanyiwu/cppjieba.png?branch=master)](https://travis-ci.org/yanyiwu/cppjieba)
4
+ [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/)
5
+ [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba)
6
+ [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
7
+ [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
8
+ [![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)
9
+
10
+ [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
11
+
12
+ ## Introduction
13
+
14
+ CppJieba is the C++ implementation of the Jieba Chinese word segmentation library.
15
+
16
+ ## Usage
17
+
18
+ ### Dependencies
19
+
20
+ + `g++ (version >= 4.1 is recommended) or clang++`;
21
+ + `cmake (version >= 2.6 is recommended)`;
22
+
23
+ ### Download & Compile
24
+
25
+ ```sh
26
+ git clone --depth=10 --branch=master git://github.com/yanyiwu/cppjieba.git
27
+ cd cppjieba
28
+ mkdir build
29
+ cd build
30
+ cmake ..
31
+ make
32
+ ```
33
+
34
+ ### Unit Testing
35
+
36
+ ```
37
+ make test
38
+ ```
39
+
40
+ ## Demo
41
+
42
+ ```
43
+ ./demo
44
+ ```
45
+
46
+ Output:
47
+
48
+ ```
49
+ [demo] Cut With HMM
50
+ 他/来到/了/网易/杭研/大厦
51
+ [demo] Cut Without HMM
52
+ 他/来到/了/网易/杭/研/大厦
53
+ 我来到北京清华大学
54
+ [demo] CutAll
55
+ 我/来到/北京/清华/清华大学/华大/大学
56
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
57
+ [demo] CutForSearch
58
+ 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
59
+ [demo] Insert User Word
60
+ 男默/女泪
61
+ 男默女泪
62
+ [demo] CutForSearch Word With Offset
63
+ [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
64
+ [demo] Tagging
65
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
66
+ [我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]
67
+ [demo] Keyword Extraction
68
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
69
+ [{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}]
70
+ ```
71
+
72
+ Please see details in `test/demo.cpp`.
73
+
74
+ ## Cases
75
+
76
+ + [GoJieba]
77
+ + [NodeJieba]
78
+ + [simhash]
79
+ + [exjieba]
80
+ + [jiebaR]
81
+ + [cjieba]
82
+ + [jieba_rb]
83
+ + [iosjieba]
84
+ + [SqlJieba]
85
+ + [pg_jieba]
86
+ + [ngx_http_cppjieba_module]
87
+ + [gitbook-plugin-search-pro]
88
+ + [cppjieba-server]
89
+
90
+ ## Contact
91
+
92
+ + Email: `i@yanyiwu.com`
93
+ + QQ: 64162451
94
+ + WeChat: ![image](http://7viirv.com1.z0.glb.clouddn.com/5a7d1b5c0d_yanyiwu_personal_qrcodes.jpg)
95
+
96
+ [GoJieba]:https://github.com/yanyiwu/gojieba
97
+ [CppJieba]:https://github.com/yanyiwu/cppjieba
98
+ [jannson]:https://github.com/jannson
99
+ [cppjiebapy]:https://github.com/jannson/cppjiebapy
100
+ [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
101
+ [NodeJieba]:https://github.com/yanyiwu/nodejieba
102
+ [jiebaR]:https://github.com/qinwf/jiebaR
103
+ [simhash]:https://github.com/yanyiwu/simhash
104
+ [exjieba]:https://github.com/falood/exjieba
105
+ [cjieba]:http://github.com/yanyiwu/cjieba
106
+ [jieba_rb]:https://github.com/altkatz/jieba_rb
107
+ [iosjieba]:https://github.com/yanyiwu/iosjieba
108
+ [SqlJieba]:https://github.com/yanyiwu/sqljieba
109
+ [pg_jieba]:https://github.com/jaiminpan/pg_jieba
110
+ [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
111
+ [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
@@ -0,0 +1,32 @@
1
+ os: Visual Studio 2015
2
+
3
+ platform: x64
4
+
5
+ # clone directory
6
+ clone_folder: c:\projects\cppjieba
7
+
8
+ # scripts to run before build
9
+ before_build:
10
+ - echo Running cmake...
11
+ - cd c:\projects\cppjieba
12
+ - cmake .
13
+
14
+ build:
15
+ project: ALL_BUILD.vcxproj # path to Visual Studio solution or project
16
+
17
+ # scripts to run after build
18
+ after_build:
19
+ - cd Debug
20
+ - demo.exe
21
+ - load_test.exe
22
+ - cd ..
23
+ - COPY .\test\Debug\test.run.exe .\test\test.run.exe
24
+ - cd test
25
+ - test.run.exe
26
+ - cd ..
27
+ - 7z a c:\projects\all.zip * -tzip
28
+ - cd c:\projects
29
+
30
+ artifacts:
31
+ - path: all.zip
32
+ name: all.zip
@@ -0,0 +1 @@
1
+ ADD_SUBDIRECTORY(gtest)
@@ -0,0 +1,5 @@
1
+ INCLUDE_DIRECTORIES(./ include)
2
+ ADD_LIBRARY(gtest STATIC src/gtest-all.cc)
3
+ if(NOT MSVC)
4
+ TARGET_LINK_LIBRARIES(gtest pthread)
5
+ endif()
@@ -0,0 +1,283 @@
1
+ // Copyright 2005, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ //
30
+ // Author: wan@google.com (Zhanyong Wan)
31
+ //
32
+ // The Google C++ Testing Framework (Google Test)
33
+ //
34
+ // This header file defines the public API for death tests. It is
35
+ // #included by gtest.h so a user doesn't need to include this
36
+ // directly.
37
+
38
+ #ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
39
+ #define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
40
+
41
+ #include "gtest/internal/gtest-death-test-internal.h"
42
+
43
+ namespace testing {
44
+
45
+ // This flag controls the style of death tests. Valid values are "threadsafe",
46
+ // meaning that the death test child process will re-execute the test binary
47
+ // from the start, running only a single death test, or "fast",
48
+ // meaning that the child process will execute the test logic immediately
49
+ // after forking.
50
+ GTEST_DECLARE_string_(death_test_style);
51
+
52
+ #if GTEST_HAS_DEATH_TEST
53
+
54
+ // The following macros are useful for writing death tests.
55
+
56
+ // Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
57
+ // executed:
58
+ //
59
+ // 1. It generates a warning if there is more than one active
60
+ // thread. This is because it's safe to fork() or clone() only
61
+ // when there is a single thread.
62
+ //
63
+ // 2. The parent process clone()s a sub-process and runs the death
64
+ // test in it; the sub-process exits with code 0 at the end of the
65
+ // death test, if it hasn't exited already.
66
+ //
67
+ // 3. The parent process waits for the sub-process to terminate.
68
+ //
69
+ // 4. The parent process checks the exit code and error message of
70
+ // the sub-process.
71
+ //
72
+ // Examples:
73
+ //
74
+ // ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
75
+ // for (int i = 0; i < 5; i++) {
76
+ // EXPECT_DEATH(server.ProcessRequest(i),
77
+ // "Invalid request .* in ProcessRequest()")
78
+ // << "Failed to die on request " << i;
79
+ // }
80
+ //
81
+ // ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
82
+ //
83
+ // bool KilledBySIGHUP(int exit_code) {
84
+ // return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
85
+ // }
86
+ //
87
+ // ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
88
+ //
89
+ // On the regular expressions used in death tests:
90
+ //
91
+ // On POSIX-compliant systems (*nix), we use the <regex.h> library,
92
+ // which uses the POSIX extended regex syntax.
93
+ //
94
+ // On other platforms (e.g. Windows), we only support a simple regex
95
+ // syntax implemented as part of Google Test. This limited
96
+ // implementation should be enough most of the time when writing
97
+ // death tests; though it lacks many features you can find in PCRE
98
+ // or POSIX extended regex syntax. For example, we don't support
99
+ // union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
100
+ // repetition count ("x{5,7}"), among others.
101
+ //
102
+ // Below is the syntax that we do support. We chose it to be a
103
+ // subset of both PCRE and POSIX extended regex, so it's easy to
104
+ // learn wherever you come from. In the following: 'A' denotes a
105
+ // literal character, period (.), or a single \\ escape sequence;
106
+ // 'x' and 'y' denote regular expressions; 'm' and 'n' are for
107
+ // natural numbers.
108
+ //
109
+ // c matches any literal character c
110
+ // \\d matches any decimal digit
111
+ // \\D matches any character that's not a decimal digit
112
+ // \\f matches \f
113
+ // \\n matches \n
114
+ // \\r matches \r
115
+ // \\s matches any ASCII whitespace, including \n
116
+ // \\S matches any character that's not a whitespace
117
+ // \\t matches \t
118
+ // \\v matches \v
119
+ // \\w matches any letter, _, or decimal digit
120
+ // \\W matches any character that \\w doesn't match
121
+ // \\c matches any literal character c, which must be a punctuation
122
+ // . matches any single character except \n
123
+ // A? matches 0 or 1 occurrences of A
124
+ // A* matches 0 or many occurrences of A
125
+ // A+ matches 1 or many occurrences of A
126
+ // ^ matches the beginning of a string (not that of each line)
127
+ // $ matches the end of a string (not that of each line)
128
+ // xy matches x followed by y
129
+ //
130
+ // If you accidentally use PCRE or POSIX extended regex features
131
+ // not implemented by us, you will get a run-time failure. In that
132
+ // case, please try to rewrite your regular expression within the
133
+ // above syntax.
134
+ //
135
+ // This implementation is *not* meant to be as highly tuned or robust
136
+ // as a compiled regex library, but should perform well enough for a
137
+ // death test, which already incurs significant overhead by launching
138
+ // a child process.
139
+ //
140
+ // Known caveats:
141
+ //
142
+ // A "threadsafe" style death test obtains the path to the test
143
+ // program from argv[0] and re-executes it in the sub-process. For
144
+ // simplicity, the current implementation doesn't search the PATH
145
+ // when launching the sub-process. This means that the user must
146
+ // invoke the test program via a path that contains at least one
147
+ // path separator (e.g. path/to/foo_test and
148
+ // /absolute/path/to/bar_test are fine, but foo_test is not). This
149
+ // is rarely a problem as people usually don't put the test binary
150
+ // directory in PATH.
151
+ //
152
+ // TODO(wan@google.com): make thread-safe death tests search the PATH.
153
+
154
+ // Asserts that a given statement causes the program to exit, with an
155
+ // integer exit status that satisfies predicate, and emitting error output
156
+ // that matches regex.
157
+ # define ASSERT_EXIT(statement, predicate, regex) \
158
+ GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
159
+
160
+ // Like ASSERT_EXIT, but continues on to successive tests in the
161
+ // test case, if any:
162
+ # define EXPECT_EXIT(statement, predicate, regex) \
163
+ GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
164
+
165
+ // Asserts that a given statement causes the program to exit, either by
166
+ // explicitly exiting with a nonzero exit code or being killed by a
167
+ // signal, and emitting error output that matches regex.
168
+ # define ASSERT_DEATH(statement, regex) \
169
+ ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
170
+
171
+ // Like ASSERT_DEATH, but continues on to successive tests in the
172
+ // test case, if any:
173
+ # define EXPECT_DEATH(statement, regex) \
174
+ EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
175
+
176
+ // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
177
+
178
+ // Tests that an exit code describes a normal exit with a given exit code.
179
+ class GTEST_API_ ExitedWithCode {
180
+ public:
181
+ explicit ExitedWithCode(int exit_code);
182
+ bool operator()(int exit_status) const;
183
+ private:
184
+ // No implementation - assignment is unsupported.
185
+ void operator=(const ExitedWithCode& other);
186
+
187
+ const int exit_code_;
188
+ };
189
+
190
+ # if !GTEST_OS_WINDOWS
191
+ // Tests that an exit code describes an exit due to termination by a
192
+ // given signal.
193
+ class GTEST_API_ KilledBySignal {
194
+ public:
195
+ explicit KilledBySignal(int signum);
196
+ bool operator()(int exit_status) const;
197
+ private:
198
+ const int signum_;
199
+ };
200
+ # endif // !GTEST_OS_WINDOWS
201
+
202
+ // EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
203
+ // The death testing framework causes this to have interesting semantics,
204
+ // since the sideeffects of the call are only visible in opt mode, and not
205
+ // in debug mode.
206
+ //
207
+ // In practice, this can be used to test functions that utilize the
208
+ // LOG(DFATAL) macro using the following style:
209
+ //
210
+ // int DieInDebugOr12(int* sideeffect) {
211
+ // if (sideeffect) {
212
+ // *sideeffect = 12;
213
+ // }
214
+ // LOG(DFATAL) << "death";
215
+ // return 12;
216
+ // }
217
+ //
218
+ // TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
219
+ // int sideeffect = 0;
220
+ // // Only asserts in dbg.
221
+ // EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
222
+ //
223
+ // #ifdef NDEBUG
224
+ // // opt-mode has sideeffect visible.
225
+ // EXPECT_EQ(12, sideeffect);
226
+ // #else
227
+ // // dbg-mode no visible sideeffect.
228
+ // EXPECT_EQ(0, sideeffect);
229
+ // #endif
230
+ // }
231
+ //
232
+ // This will assert that DieInDebugOr12() crashes in debug
233
+ // mode, usually due to a DCHECK or LOG(DFATAL), but returns the
234
+ // appropriate fallback value (12 in this case) in opt mode. If you
235
+ // need to test that a function has appropriate side-effects in opt
236
+ // mode, include assertions against the side-effects. A general
237
+ // pattern for this is:
238
+ //
239
+ // EXPECT_DEBUG_DEATH({
240
+ // // Side-effects here will have an effect after this statement in
241
+ // // opt mode, but none in debug mode.
242
+ // EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
243
+ // }, "death");
244
+ //
245
+ # ifdef NDEBUG
246
+
247
+ # define EXPECT_DEBUG_DEATH(statement, regex) \
248
+ do { statement; } while (::testing::internal::AlwaysFalse())
249
+
250
+ # define ASSERT_DEBUG_DEATH(statement, regex) \
251
+ do { statement; } while (::testing::internal::AlwaysFalse())
252
+
253
+ # else
254
+
255
+ # define EXPECT_DEBUG_DEATH(statement, regex) \
256
+ EXPECT_DEATH(statement, regex)
257
+
258
+ # define ASSERT_DEBUG_DEATH(statement, regex) \
259
+ ASSERT_DEATH(statement, regex)
260
+
261
+ # endif // NDEBUG for EXPECT_DEBUG_DEATH
262
+ #endif // GTEST_HAS_DEATH_TEST
263
+
264
+ // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
265
+ // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
266
+ // death tests are supported; otherwise they just issue a warning. This is
267
+ // useful when you are combining death test assertions with normal test
268
+ // assertions in one test.
269
+ #if GTEST_HAS_DEATH_TEST
270
+ # define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
271
+ EXPECT_DEATH(statement, regex)
272
+ # define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
273
+ ASSERT_DEATH(statement, regex)
274
+ #else
275
+ # define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
276
+ GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
277
+ # define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
278
+ GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
279
+ #endif
280
+
281
+ } // namespace testing
282
+
283
+ #endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_