jieba-rb 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +19 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +85 -0
  8. data/Rakefile +15 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +28 -0
  12. data/ext/cppjieba/ChangeLog.md +236 -0
  13. data/ext/cppjieba/README.md +285 -0
  14. data/ext/cppjieba/README_EN.md +111 -0
  15. data/ext/cppjieba/appveyor.yml +32 -0
  16. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  45. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  46. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  56. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  57. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  58. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  59. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  60. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  61. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  62. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  63. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  64. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  65. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  66. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  67. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  68. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  69. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  70. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  71. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  72. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  73. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  74. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  75. data/ext/cppjieba/dict/README.md +31 -0
  76. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  77. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  78. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  79. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  80. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  83. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  84. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  85. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  86. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  87. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  88. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  89. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  90. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  91. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  92. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  93. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  94. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  95. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  96. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
  98. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  99. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  100. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  101. data/ext/jieba/extconf.rb +28 -0
  102. data/ext/jieba/jieba.c +11 -0
  103. data/ext/jieba/jieba.h +11 -0
  104. data/ext/jieba/keyword.cc +92 -0
  105. data/ext/jieba/keyword.h +17 -0
  106. data/ext/jieba/segment.cc +107 -0
  107. data/ext/jieba/segment.h +17 -0
  108. data/ext/jieba/tagging.cc +76 -0
  109. data/ext/jieba/tagging.h +17 -0
  110. data/jieba_rb.gemspec +51 -0
  111. data/lib/jieba-rb.rb +66 -0
  112. data/lib/jieba_rb/version.rb +3 -0
  113. data/test/test_keyword.rb +17 -0
  114. data/test/test_segment.rb +32 -0
  115. data/test/test_tagging.rb +22 -0
  116. data/test/user.dict.utf8 +23 -0
  117. metadata +219 -0
@@ -0,0 +1,285 @@
1
+ # CppJieba [English](README_EN.md)
2
+
3
+ [![Build Status](https://travis-ci.org/yanyiwu/cppjieba.png?branch=master)](https://travis-ci.org/yanyiwu/cppjieba)
4
+ [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/)
5
+ [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba)
6
+ [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
7
+ [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
8
+ [![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)
9
+
10
+ [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
11
+
12
+ ## 简介
13
+
14
+ CppJieba是"结巴(Jieba)"中文分词的C++版本
15
+
16
+ ## 特性
17
+
18
+ + 源代码都写进头文件`include/cppjieba/*.hpp`里,`include`即可使用。
19
+ + 支持`utf8`编码。
20
+ + 项目自带较为完善的单元测试,核心功能中文分词(utf8)的稳定性接受过线上环境检验。
21
+ + 支持加载自定义用户词典,多路径时支持分隔符'|'或者';'分隔。
22
+ + 支持 `Linux` , `Mac OSX`, `Windows` 操作系统。
23
+
24
+ ## 用法
25
+
26
+ ### 依赖软件
27
+
28
+ * `g++ (version >= 4.1 is recommended) or clang++`;
29
+ * `cmake (version >= 2.6 is recommended)`;
30
+
31
+ ### 下载和编译
32
+
33
+ ```sh
34
+ git clone --depth=10 --branch=master https://github.com/yanyiwu/cppjieba.git
35
+ cd cppjieba
36
+ mkdir build
37
+ cd build
38
+ cmake ..
39
+ make
40
+ ```
41
+
42
+ 有兴趣的可以跑跑测试(可选):
43
+
44
+ ```
45
+ make test
46
+ ```
47
+
48
+ ## Demo
49
+
50
+ ```
51
+ ./demo
52
+ ```
53
+
54
+ 结果示例:
55
+
56
+ ```
57
+ [demo] Cut With HMM
58
+ 他/来到/了/网易/杭研/大厦
59
+ [demo] Cut Without HMM
60
+ 他/来到/了/网易/杭/研/大厦
61
+ 我来到北京清华大学
62
+ [demo] CutAll
63
+ 我/来到/北京/清华/清华大学/华大/大学
64
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
65
+ [demo] CutForSearch
66
+ 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
67
+ [demo] Insert User Word
68
+ 男默/女泪
69
+ 男默女泪
70
+ [demo] CutForSearch Word With Offset
71
+ [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
72
+ [demo] Tagging
73
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
74
+ [我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]
75
+ [demo] Keyword Extraction
76
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
77
+ [{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}]
78
+ ```
79
+
80
+ 详细请看 `test/demo.cpp`.
81
+
82
+ ### 分词结果示例
83
+
84
+ **MPSegment**
85
+
86
+ Output:
87
+ ```
88
+ 我来到北京清华大学
89
+ 我/来到/北京/清华大学
90
+
91
+ 他来到了网易杭研大厦
92
+ 他/来到/了/网易/杭/研/大厦
93
+
94
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
95
+ 小/明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造
96
+
97
+ ```
98
+
99
+ **HMMSegment**
100
+
101
+ ```
102
+ 我来到北京清华大学
103
+ 我来/到/北京/清华大学
104
+
105
+ 他来到了网易杭研大厦
106
+ 他来/到/了/网易/杭/研大厦
107
+
108
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
109
+ 小明/硕士/毕业于/中国/科学院/计算所/,/后/在/日/本/京/都/大/学/深/造
110
+
111
+ ```
112
+
113
+ **MixSegment**
114
+
115
+ ```
116
+ 我来到北京清华大学
117
+ 我/来到/北京/清华大学
118
+
119
+ 他来到了网易杭研大厦
120
+ 他/来到/了/网易/杭研/大厦
121
+
122
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
123
+ 小明/硕士/毕业/于/中国科学院/计算所/,/后/在/日本京都大学/深造
124
+
125
+ ```
126
+
127
+ **FullSegment**
128
+
129
+ ```
130
+ 我来到北京清华大学
131
+ 我/来到/北京/清华/清华大学/华大/大学
132
+
133
+ 他来到了网易杭研大厦
134
+ 他/来到/了/网易/杭/研/大厦
135
+
136
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
137
+ 小/明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算/计算所/,/后/在/日本/日本京都大学/京都/京都大学/大学/深造
138
+
139
+ ```
140
+
141
+ **QuerySegment**
142
+
143
+ ```
144
+ 我来到北京清华大学
145
+ 我/来到/北京/清华/清华大学/华大/大学
146
+
147
+ 他来到了网易杭研大厦
148
+ 他/来到/了/网易/杭研/大厦
149
+
150
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
151
+ 小明/硕士/毕业/于/中国/中国科学院/科学/科学院/学院/计算所/,/后/在/中国/中国科学院/科学/科学院/学院/日本/日本京都大学/京都/京都大学/大学/深造
152
+
153
+ ```
154
+
155
+ 以上依次是MP,HMM,Mix,Full,Query五种方法的效果。
156
+
157
+ 可以看出效果最好的是Mix,也就是融合MP和HMM的切词算法。既可以准确切出词典已有的词,又可以切出像"杭研"这样的未登录词。
158
+
159
+ Full方法切出所有字典里的词语。
160
+
161
+ Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。
162
+
163
+ ### 自定义用户词典
164
+
165
+ 自定义词典示例请看`dict/user.dict.utf8`。
166
+
167
+ 没有使用自定义用户词典时的结果:
168
+
169
+ ```
170
+ 令狐冲/是/云/计算/行业/的/专家
171
+ ```
172
+
173
+ 使用自定义用户词典时的结果:
174
+
175
+ ```
176
+ 令狐冲/是/云计算/行业/的/专家
177
+ ```
178
+
179
+ ### 关键词抽取
180
+
181
+ ```
182
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
183
+ ["CEO:11.7392", "升职:10.8562", "加薪:10.6426", "手扶拖拉机:10.0089", "巅峰:9.49396"]
184
+ ```
185
+
186
+ 详细请见 `test/demo.cpp`.
187
+
188
+ ### 词性标注
189
+
190
+ ```
191
+ 我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。
192
+ ["我:r", "是:v", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当上:t", "CEO:eng", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
193
+ ```
194
+
195
+ 详细请看 `test/demo.cpp`.
196
+
197
+ 支持自定义词性。
198
+ 比如在(`dict/user.dict.utf8`)增加一行
199
+
200
+ ```
201
+ 蓝翔 nz
202
+ ```
203
+
204
+ 结果如下:
205
+
206
+ ```
207
+ ["我:r", "是:v", "蓝翔:nz", "技工:n", "拖拉机:n", "学院:n", "手扶拖拉机:n", "专业:n", "的:uj", "。:x", "不用:v", "多久:m", ",:x", "我:r", "就:d", "会:v", "升职:v", "加薪:nr", ",:x", "当:t", "上:f", "总经理:n", ",:x", "出任:v", "CEO:eng", ",:x", "迎娶:v", "白富美:x", ",:x", "走上:v", "人生:n", "巅峰:n", "。:x"]
208
+ ```
209
+
210
+ ## 其它词典资料分享
211
+
212
+ + [dict.367W.utf8] iLife(562193561 at qq.com)
213
+
214
+ ## 应用
215
+
216
+ + [GoJieba] go语言版本的结巴中文分词。
217
+ + [NodeJieba] Node.js 版本的结巴中文分词。
218
+ + [simhash] 中文文档的相似度计算
219
+ + [exjieba] Erlang 版本的结巴中文分词。
220
+ + [jiebaR] R语言版本的结巴中文分词。
221
+ + [cjieba] C语言版本的结巴分词。
222
+ + [jieba_rb] Ruby 版本的结巴分词。
223
+ + [iosjieba] iOS 版本的结巴分词。
224
+ + [SqlJieba] MySQL 全文索引的结巴中文分词插件。
225
+ + [pg_jieba] PostgreSQL 数据库的分词插件。
226
+ + [gitbook-plugin-search-pro] 支持中文搜索的 gitbook 插件。
227
+ + [ngx_http_cppjieba_module] Nginx 分词插件。
228
+ + [cppjiebapy] 由 [jannson] 开发的供 python 模块调用的项目 [cppjiebapy], 相关讨论 [cppjiebapy_discussion] .
229
+ + [KeywordServer] 50行搭建一个中文关键词抽取服务。
230
+ + [cppjieba-server] CppJieba HTTP 服务器。
231
+
232
+ ## 线上演示
233
+
234
+ [Web-Demo](http://cppjieba-webdemo.herokuapp.com/)
235
+ (建议使用chrome打开)
236
+
237
+ ## 性能评测
238
+
239
+ [Jieba中文分词系列性能评测]
240
+
241
+ ## 客服
242
+
243
+ + Email: `i@yanyiwu.com`
244
+ + QQ: 64162451
245
+ + WeChat: ![image](http://7viirv.com1.z0.glb.clouddn.com/5a7d1b5c0d_yanyiwu_personal_qrcodes.jpg)
246
+
247
+ ## 鸣谢
248
+
249
+ "结巴"中文分词作者: [SunJunyi](https://github.com/fxsjy)
250
+
251
+ ## 许可证
252
+
253
+ [MIT](http://yanyiwu.mit-license.org)
254
+
255
+ ## 作者
256
+
257
+ - [yanyiwu](http://yanyiwu.com)
258
+ - [aholic](https://github.com/aholic)
259
+
260
+ [GoJieba]:https://github.com/yanyiwu/gojieba
261
+ [CppJieba]:https://github.com/yanyiwu/cppjieba
262
+ [jannson]:https://github.com/jannson
263
+ [cppjiebapy]:https://github.com/jannson/cppjiebapy
264
+ [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
265
+ [NodeJieba]:https://github.com/yanyiwu/nodejieba
266
+ [jiebaR]:https://github.com/qinwf/jiebaR
267
+ [simhash]:https://github.com/yanyiwu/simhash
268
+ [代码详解]:https://github.com/yanyiwu/cppjieba/wiki/CppJieba%E4%BB%A3%E7%A0%81%E8%AF%A6%E8%A7%A3
269
+ [issue25]:https://github.com/yanyiwu/cppjieba/issues/25
270
+ [exjieba]:https://github.com/falood/exjieba
271
+ [KeywordServer]:https://github.com/yanyiwu/keyword_server
272
+ [ngx_http_cppjieba_module]:https://github.com/yanyiwu/ngx_http_cppjieba_module
273
+ [dict.367W.utf8]:https://github.com/qinwf/BigDict
274
+ [cjieba]:http://github.com/yanyiwu/cjieba
275
+ [jieba_rb]:https://github.com/altkatz/jieba_rb
276
+ [iosjieba]:https://github.com/yanyiwu/iosjieba
277
+ [SqlJieba]:https://github.com/yanyiwu/sqljieba
278
+ [Jieba中文分词系列性能评测]:http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html
279
+ [pg_jieba]:https://github.com/jaiminpan/pg_jieba
280
+ [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
281
+ [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
282
+
283
+
284
+ [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/yanyiwu/cppjieba/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
285
+
@@ -0,0 +1,111 @@
1
+ # CppJieba [简体中文](README.md)
2
+
3
+ [![Build Status](https://travis-ci.org/yanyiwu/cppjieba.png?branch=master)](https://travis-ci.org/yanyiwu/cppjieba)
4
+ [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](http://yanyiwu.com/)
5
+ [![Platform](https://img.shields.io/badge/platform-Linux,%20OS%20X,%20Windows-green.svg?style=flat)](https://github.com/yanyiwu/cppjieba)
6
+ [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](http://yanyiwu.com/work/2015/06/14/jieba-series-performance-test.html)
7
+ [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
8
+ [![Build status](https://ci.appveyor.com/api/projects/status/wl30fjnm2rhft6ta/branch/master?svg=true)](https://ci.appveyor.com/project/yanyiwu/cppjieba/branch/master)
9
+
10
+ [![logo](http://7viirv.com1.z0.glb.clouddn.com/CppJiebaLogo-v1.png)](https://github.com/yanyiwu/cppjieba)
11
+
12
+ ## Introduction
13
+
14
+ CppJieba is the C++ implementation of the Jieba Chinese word segmentation library.
15
+
16
+ ## Usage
17
+
18
+ ### Dependencies
19
+
20
+ + `g++ (version >= 4.1 is recommended) or clang++`;
21
+ + `cmake (version >= 2.6 is recommended)`;
22
+
23
+ ### Download & Compile
24
+
25
+ ```sh
26
+ git clone --depth=10 --branch=master https://github.com/yanyiwu/cppjieba.git
27
+ cd cppjieba
28
+ mkdir build
29
+ cd build
30
+ cmake ..
31
+ make
32
+ ```
33
+
34
+ ### Unit Testing
35
+
36
+ ```
37
+ make test
38
+ ```
39
+
40
+ ## Demo
41
+
42
+ ```
43
+ ./demo
44
+ ```
45
+
46
+ Output:
47
+
48
+ ```
49
+ [demo] Cut With HMM
50
+ 他/来到/了/网易/杭研/大厦
51
+ [demo] Cut Without HMM
52
+ 他/来到/了/网易/杭/研/大厦
53
+ 我来到北京清华大学
54
+ [demo] CutAll
55
+ 我/来到/北京/清华/清华大学/华大/大学
56
+ 小明硕士毕业于中国科学院计算所,后在日本京都大学深造
57
+ [demo] CutForSearch
58
+ 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
59
+ [demo] Insert User Word
60
+ 男默/女泪
61
+ 男默女泪
62
+ [demo] CutForSearch Word With Offset
63
+ [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, {"word": "日本京都大学", "offset": 54}, {"word": "深造", "offset": 72}]
64
+ [demo] Tagging
65
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
66
+ [我:r, 是:v, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, CEO:eng, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]
67
+ [demo] Keyword Extraction
68
+ 我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。
69
+ [{"word": "CEO", "offset": [93], "weight": 11.7392}, {"word": "升职", "offset": [72], "weight": 10.8562}, {"word": "加薪", "offset": [78], "weight": 10.6426}, {"word": "手扶拖拉机", "offset": [21], "weight": 10.0089}, {"word": "巅峰", "offset": [111], "weight": 9.49396}]
70
+ ```
71
+
72
+ Please see details in `test/demo.cpp`.
73
+
74
+ ## Cases
75
+
76
+ + [GoJieba]
77
+ + [NodeJieba]
78
+ + [simhash]
79
+ + [exjieba]
80
+ + [jiebaR]
81
+ + [cjieba]
82
+ + [jieba_rb]
83
+ + [iosjieba]
84
+ + [SqlJieba]
85
+ + [pg_jieba]
86
+ + [ngx_http_cppjieba_module]
87
+ + [gitbook-plugin-search-pro]
88
+ + [cppjieba-server]
89
+
90
+ ## Contact
91
+
92
+ + Email: `i@yanyiwu.com`
93
+ + QQ: 64162451
94
+ + WeChat: ![image](http://7viirv.com1.z0.glb.clouddn.com/5a7d1b5c0d_yanyiwu_personal_qrcodes.jpg)
95
+
96
+ [GoJieba]:https://github.com/yanyiwu/gojieba
97
+ [CppJieba]:https://github.com/yanyiwu/cppjieba
98
+ [jannson]:https://github.com/jannson
99
+ [cppjiebapy]:https://github.com/jannson/cppjiebapy
100
+ [cppjiebapy_discussion]:https://github.com/yanyiwu/cppjieba/issues/1
101
+ [NodeJieba]:https://github.com/yanyiwu/nodejieba
102
+ [jiebaR]:https://github.com/qinwf/jiebaR
103
+ [simhash]:https://github.com/yanyiwu/simhash
104
+ [exjieba]:https://github.com/falood/exjieba
105
+ [cjieba]:http://github.com/yanyiwu/cjieba
106
+ [jieba_rb]:https://github.com/altkatz/jieba_rb
107
+ [iosjieba]:https://github.com/yanyiwu/iosjieba
108
+ [SqlJieba]:https://github.com/yanyiwu/sqljieba
109
+ [pg_jieba]:https://github.com/jaiminpan/pg_jieba
110
+ [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
111
+ [cppjieba-server]:https://github.com/yanyiwu/cppjieba-server
@@ -0,0 +1,32 @@
1
+ os: Visual Studio 2015
2
+
3
+ platform: x64
4
+
5
+ # clone directory
6
+ clone_folder: c:\projects\cppjieba
7
+
8
+ # scripts to run before build
9
+ before_build:
10
+ - echo Running cmake...
11
+ - cd c:\projects\cppjieba
12
+ - cmake .
13
+
14
+ build:
15
+ project: ALL_BUILD.vcxproj # path to Visual Studio solution or project
16
+
17
+ # scripts to run after build
18
+ after_build:
19
+ - cd Debug
20
+ - demo.exe
21
+ - load_test.exe
22
+ - cd ..
23
+ - COPY .\test\Debug\test.run.exe .\test\test.run.exe
24
+ - cd test
25
+ - test.run.exe
26
+ - cd ..
27
+ - 7z a c:\projects\all.zip * -tzip
28
+ - cd c:\projects
29
+
30
+ artifacts:
31
+ - path: all.zip
32
+ name: all.zip
@@ -0,0 +1 @@
1
+ ADD_SUBDIRECTORY(gtest)
@@ -0,0 +1,5 @@
1
+ INCLUDE_DIRECTORIES(./ include)
2
+ ADD_LIBRARY(gtest STATIC src/gtest-all.cc)
3
+ if(NOT MSVC)
4
+ TARGET_LINK_LIBRARIES(gtest pthread)
5
+ endif()
@@ -0,0 +1,283 @@
1
+ // Copyright 2005, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+ //
30
+ // Author: wan@google.com (Zhanyong Wan)
31
+ //
32
+ // The Google C++ Testing Framework (Google Test)
33
+ //
34
+ // This header file defines the public API for death tests. It is
35
+ // #included by gtest.h so a user doesn't need to include this
36
+ // directly.
37
+
38
+ #ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
39
+ #define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
40
+
41
+ #include "gtest/internal/gtest-death-test-internal.h"
42
+
43
+ namespace testing {
44
+
45
+ // This flag controls the style of death tests. Valid values are "threadsafe",
46
+ // meaning that the death test child process will re-execute the test binary
47
+ // from the start, running only a single death test, or "fast",
48
+ // meaning that the child process will execute the test logic immediately
49
+ // after forking.
50
+ GTEST_DECLARE_string_(death_test_style);
51
+
52
+ #if GTEST_HAS_DEATH_TEST
53
+
54
+ // The following macros are useful for writing death tests.
55
+
56
+ // Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
57
+ // executed:
58
+ //
59
+ // 1. It generates a warning if there is more than one active
60
+ // thread. This is because it's safe to fork() or clone() only
61
+ // when there is a single thread.
62
+ //
63
+ // 2. The parent process clone()s a sub-process and runs the death
64
+ // test in it; the sub-process exits with code 0 at the end of the
65
+ // death test, if it hasn't exited already.
66
+ //
67
+ // 3. The parent process waits for the sub-process to terminate.
68
+ //
69
+ // 4. The parent process checks the exit code and error message of
70
+ // the sub-process.
71
+ //
72
+ // Examples:
73
+ //
74
+ // ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
75
+ // for (int i = 0; i < 5; i++) {
76
+ // EXPECT_DEATH(server.ProcessRequest(i),
77
+ // "Invalid request .* in ProcessRequest()")
78
+ // << "Failed to die on request " << i;
79
+ // }
80
+ //
81
+ // ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
82
+ //
83
+ // bool KilledBySIGHUP(int exit_code) {
84
+ // return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
85
+ // }
86
+ //
87
+ // ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
88
+ //
89
+ // On the regular expressions used in death tests:
90
+ //
91
+ // On POSIX-compliant systems (*nix), we use the <regex.h> library,
92
+ // which uses the POSIX extended regex syntax.
93
+ //
94
+ // On other platforms (e.g. Windows), we only support a simple regex
95
+ // syntax implemented as part of Google Test. This limited
96
+ // implementation should be enough most of the time when writing
97
+ // death tests; though it lacks many features you can find in PCRE
98
+ // or POSIX extended regex syntax. For example, we don't support
99
+ // union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
100
+ // repetition count ("x{5,7}"), among others.
101
+ //
102
+ // Below is the syntax that we do support. We chose it to be a
103
+ // subset of both PCRE and POSIX extended regex, so it's easy to
104
+ // learn wherever you come from. In the following: 'A' denotes a
105
+ // literal character, period (.), or a single \\ escape sequence;
106
+ // 'x' and 'y' denote regular expressions; 'm' and 'n' are for
107
+ // natural numbers.
108
+ //
109
+ // c matches any literal character c
110
+ // \\d matches any decimal digit
111
+ // \\D matches any character that's not a decimal digit
112
+ // \\f matches \f
113
+ // \\n matches \n
114
+ // \\r matches \r
115
+ // \\s matches any ASCII whitespace, including \n
116
+ // \\S matches any character that's not a whitespace
117
+ // \\t matches \t
118
+ // \\v matches \v
119
+ // \\w matches any letter, _, or decimal digit
120
+ // \\W matches any character that \\w doesn't match
121
+ // \\c matches any literal character c, which must be a punctuation
122
+ // . matches any single character except \n
123
+ // A? matches 0 or 1 occurrences of A
124
+ // A* matches 0 or many occurrences of A
125
+ // A+ matches 1 or many occurrences of A
126
+ // ^ matches the beginning of a string (not that of each line)
127
+ // $ matches the end of a string (not that of each line)
128
+ // xy matches x followed by y
129
+ //
130
+ // If you accidentally use PCRE or POSIX extended regex features
131
+ // not implemented by us, you will get a run-time failure. In that
132
+ // case, please try to rewrite your regular expression within the
133
+ // above syntax.
134
+ //
135
+ // This implementation is *not* meant to be as highly tuned or robust
136
+ // as a compiled regex library, but should perform well enough for a
137
+ // death test, which already incurs significant overhead by launching
138
+ // a child process.
139
+ //
140
+ // Known caveats:
141
+ //
142
+ // A "threadsafe" style death test obtains the path to the test
143
+ // program from argv[0] and re-executes it in the sub-process. For
144
+ // simplicity, the current implementation doesn't search the PATH
145
+ // when launching the sub-process. This means that the user must
146
+ // invoke the test program via a path that contains at least one
147
+ // path separator (e.g. path/to/foo_test and
148
+ // /absolute/path/to/bar_test are fine, but foo_test is not). This
149
+ // is rarely a problem as people usually don't put the test binary
150
+ // directory in PATH.
151
+ //
152
+ // TODO(wan@google.com): make thread-safe death tests search the PATH.
153
+
154
+ // Asserts that a given statement causes the program to exit, with an
155
+ // integer exit status that satisfies predicate, and emitting error output
156
+ // that matches regex. Forwards to GTEST_DEATH_TEST_ with GTEST_FATAL_FAILURE_.
157
+ # define ASSERT_EXIT(statement, predicate, regex) \
158
+ GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
159
+
160
+ // Like ASSERT_EXIT, but forwards GTEST_NONFATAL_FAILURE_ instead, so it
161
+ // continues on to successive tests in the test case, if any:
162
+ # define EXPECT_EXIT(statement, predicate, regex) \
163
+ GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
164
+
165
+ // Asserts that a given statement causes the program to exit, either by
166
+ // explicitly exiting with a nonzero exit code or being killed by a
167
+ // signal, and emitting error output that matches regex. Implemented as ASSERT_EXIT with the ExitedUnsuccessfully predicate.
168
+ # define ASSERT_DEATH(statement, regex) \
169
+ ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
170
+
171
+ // Like ASSERT_DEATH, but built on EXPECT_EXIT, so it continues on to
172
+ // successive tests in the test case, if any:
173
+ # define EXPECT_DEATH(statement, regex) \
174
+ EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
175
+
176
+ // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
177
+
178
+ // Predicate usable in {ASSERT,EXPECT}_EXIT: tests that an exit status describes a normal exit with the exit code given to the constructor.
179
+ class GTEST_API_ ExitedWithCode {
180
+ public:
181
+ explicit ExitedWithCode(int exit_code);
182
+ bool operator()(int exit_status) const;
183
+ private:
184
+ // Declared private with no implementation - assignment is unsupported (exit_code_ is const).
185
+ void operator=(const ExitedWithCode& other);
186
+
187
+ const int exit_code_;
188
+ };
189
+
190
+ # if !GTEST_OS_WINDOWS
191
+ // Predicate usable in {ASSERT,EXPECT}_EXIT: tests that an exit status describes an exit due to
192
+ // termination by the signal number given to the constructor. Only declared when !GTEST_OS_WINDOWS.
193
+ class GTEST_API_ KilledBySignal {
194
+ public:
195
+ explicit KilledBySignal(int signum);
196
+ bool operator()(int exit_status) const;
197
+ private:
198
+ const int signum_;
199
+ };
200
+ # endif // !GTEST_OS_WINDOWS
201
+
202
+ // EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
203
+ // The death testing framework causes this to have interesting semantics,
204
+ // since the sideeffects of the call are only visible in opt mode, and not
205
+ // in debug mode.
206
+ //
207
+ // In practice, this can be used to test functions that utilize the
208
+ // LOG(DFATAL) macro using the following style:
209
+ //
210
+ // int DieInDebugOr12(int* sideeffect) {
211
+ // if (sideeffect) {
212
+ // *sideeffect = 12;
213
+ // }
214
+ // LOG(DFATAL) << "death";
215
+ // return 12;
216
+ // }
217
+ //
218
+ // TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
219
+ // int sideeffect = 0;
220
+ // // Only asserts in dbg.
221
+ // EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
222
+ //
223
+ // #ifdef NDEBUG
224
+ // // opt-mode has sideeffect visible.
225
+ // EXPECT_EQ(12, sideeffect);
226
+ // #else
227
+ // // dbg-mode no visible sideeffect.
228
+ // EXPECT_EQ(0, sideeffect);
229
+ // #endif
230
+ // }
231
+ //
232
+ // This will assert that DieInDebugOr12() crashes in debug
233
+ // mode, usually due to a DCHECK or LOG(DFATAL), but returns the
234
+ // appropriate fallback value (12 in this case) in opt mode. If you
235
+ // need to test that a function has appropriate side-effects in opt
236
+ // mode, include assertions against the side-effects. A general
237
+ // pattern for this is:
238
+ //
239
+ // EXPECT_DEBUG_DEATH({
240
+ // // Side-effects here will have an effect after this statement in
241
+ // // opt mode, but none in debug mode.
242
+ // EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
243
+ // }, "death");
244
+ //
245
+ # ifdef NDEBUG // opt mode: no death test, the macros only execute the statement
246
+
247
+ # define EXPECT_DEBUG_DEATH(statement, regex) \
248
+ do { statement; } while (::testing::internal::AlwaysFalse())
249
+
250
+ # define ASSERT_DEBUG_DEATH(statement, regex) \
251
+ do { statement; } while (::testing::internal::AlwaysFalse())
252
+
253
+ # else // debug mode: forward to the real death-test macros
254
+
255
+ # define EXPECT_DEBUG_DEATH(statement, regex) \
256
+ EXPECT_DEATH(statement, regex)
257
+
258
+ # define ASSERT_DEBUG_DEATH(statement, regex) \
259
+ ASSERT_DEATH(statement, regex)
260
+
261
+ # endif // NDEBUG for EXPECT_DEBUG_DEATH and ASSERT_DEBUG_DEATH
262
+ #endif // GTEST_HAS_DEATH_TEST
263
+
264
+ // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
265
+ // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
266
+ // death tests are supported (GTEST_HAS_DEATH_TEST); otherwise they just issue a warning. This is
267
+ // useful when you are combining death test assertions with normal test
268
+ // assertions in one test. In the unsupported case the ASSERT_ form passes `return` as the last argument of GTEST_UNSUPPORTED_DEATH_TEST_, the EXPECT_ form passes nothing.
269
+ #if GTEST_HAS_DEATH_TEST
270
+ # define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
271
+ EXPECT_DEATH(statement, regex)
272
+ # define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
273
+ ASSERT_DEATH(statement, regex)
274
+ #else
275
+ # define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
276
+ GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
277
+ # define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
278
+ GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
279
+ #endif // GTEST_HAS_DEATH_TEST
280
+
281
+ } // namespace testing
282
+
283
+ #endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_