scws4r 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 2f35634967bfc1105d2a014dc66399455e821a4de9e9f7f47fe805711f20239f
4
+ data.tar.gz: 608912dbd14c16576e1dd14465a5beea562e677a6e37b075465d2fe6b4262804
5
+ SHA512:
6
+ metadata.gz: a176641cdf8518528591e5e11f803d6d99cd0cb7087af6a2830771436e45cea0b68c383dd7321abe2f1c8e7f996df436f13d58b8c3357086ac70d708100c799e
7
+ data.tar.gz: 26963a99702b3e1cfe82b81edf0400155fbfe3b8639633ad25902e7b221046d366200e4b7d951c0f7a72d2d077825786aa5a444cc8ef74931a8eed3977a96550
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,13 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.6
3
+
4
+ Style/StringLiterals:
5
+ Enabled: true
6
+ EnforcedStyle: double_quotes
7
+
8
+ Style/StringLiteralsInInterpolation:
9
+ Enabled: true
10
+ EnforcedStyle: double_quotes
11
+
12
+ Layout/LineLength:
13
+ Max: 120
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2022-04-08
4
+
5
+ - Initial release
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in scws.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+ gem "rake-compiler"
10
+ gem "rspec", "~> 3.0"
11
+ gem "rubocop", "~> 1.21"
12
+
13
+ gem "pry", "~> 0.14.1"
data/Gemfile.lock ADDED
@@ -0,0 +1,65 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ scws4r (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ast (2.4.2)
10
+ coderay (1.1.3)
11
+ diff-lcs (1.5.0)
12
+ method_source (1.0.0)
13
+ parallel (1.22.1)
14
+ parser (3.1.1.0)
15
+ ast (~> 2.4.1)
16
+ pry (0.14.1)
17
+ coderay (~> 1.1)
18
+ method_source (~> 1.0)
19
+ rainbow (3.1.1)
20
+ rake (13.0.6)
21
+ rake-compiler (1.2.0)
22
+ rake
23
+ regexp_parser (2.3.0)
24
+ rexml (3.2.5)
25
+ rspec (3.11.0)
26
+ rspec-core (~> 3.11.0)
27
+ rspec-expectations (~> 3.11.0)
28
+ rspec-mocks (~> 3.11.0)
29
+ rspec-core (3.11.0)
30
+ rspec-support (~> 3.11.0)
31
+ rspec-expectations (3.11.0)
32
+ diff-lcs (>= 1.2.0, < 2.0)
33
+ rspec-support (~> 3.11.0)
34
+ rspec-mocks (3.11.1)
35
+ diff-lcs (>= 1.2.0, < 2.0)
36
+ rspec-support (~> 3.11.0)
37
+ rspec-support (3.11.0)
38
+ rubocop (1.27.0)
39
+ parallel (~> 1.10)
40
+ parser (>= 3.1.0.0)
41
+ rainbow (>= 2.2.2, < 4.0)
42
+ regexp_parser (>= 1.8, < 3.0)
43
+ rexml
44
+ rubocop-ast (>= 1.16.0, < 2.0)
45
+ ruby-progressbar (~> 1.7)
46
+ unicode-display_width (>= 1.4.0, < 3.0)
47
+ rubocop-ast (1.17.0)
48
+ parser (>= 3.1.1.0)
49
+ ruby-progressbar (1.11.0)
50
+ unicode-display_width (2.1.0)
51
+
52
+ PLATFORMS
53
+ ruby
54
+ x86_64-darwin-21
55
+
56
+ DEPENDENCIES
57
+ pry (~> 0.14.1)
58
+ rake (~> 13.0)
59
+ rake-compiler
60
+ rspec (~> 3.0)
61
+ rubocop (~> 1.21)
62
+ scws4r!
63
+
64
+ BUNDLED WITH
65
+ 2.3.17
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2022 张小辉
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # SCWS(Simple Chinese Word Segmentation) for Ruby
2
+
3
+ This is a Ruby gem that wraps the [scws](https://github.com/hightman/scws/blob/master/API.md) C library.
4
+
5
+ ## Installation
6
+
7
+ Install the gem and add it to the application's Gemfile by executing:
8
+
9
+ $ bundle add scws4r
10
+
11
+ If bundler is not being used to manage dependencies, install the gem by executing:
12
+
13
+ $ gem install scws4r
14
+
15
+ ## Usage
16
+
17
+ ```ruby
18
+ 2.7.2 :001 > require 'scws4r'
19
+ => true
20
+ 2.7.2 :002 > s = Scws4r.new
21
+ => #<Scws4r:0x00007fbc2b1bc878>
22
+ 2.7.2 :003 > s.load_defaults
23
+ => false
24
+ 2.7.2 :004 > puts s.split('保障房资金压力')
25
+ {"offset"=>0, "idf"=>4.889999866485596, "length"=>6, "text"=>"保障", "attr"=>"vn"}
26
+ {"offset"=>6, "idf"=>0.0, "length"=>3, "text"=>"房", "attr"=>"n"}
27
+ {"offset"=>9, "idf"=>4.880000114440918, "length"=>6, "text"=>"资金", "attr"=>"n"}
28
+ {"offset"=>15, "idf"=>4.900000095367432, "length"=>6, "text"=>"压力", "attr"=>"n"}
29
+ => nil
30
+ 2.7.2 :005 > puts s.tops('保障房资金压力', 10)
31
+ {"times"=>1, "weight"=>4.900000095367432, "word"=>"压力", "attr"=>"n"}
32
+ {"times"=>1, "weight"=>4.889999866485596, "word"=>"保障", "attr"=>"vn"}
33
+ {"times"=>1, "weight"=>4.880000114440918, "word"=>"资金", "attr"=>"n"}
34
+ => nil
35
+ 2.7.2 :006 >
36
+ ```
37
+
38
+ ## Development
39
+
40
+ After checking out the repo, run `bin/setup` to install dependencies. Then run `rake compile` to compile the C extension, and `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
41
+
42
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
43
+
44
+ ## References
45
+
46
+ - https://docs.ruby-lang.org/en/2.4.0/extension_rdoc.html
47
+ - https://github.com/hightman/scws
48
+ - https://github.com/amutu/zhparser
49
+
50
+ ## Contributing
51
+
52
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/scws.
53
+
54
+ ## License
55
+
56
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ require "rake/extensiontask"
13
+
14
+ task build: :compile
15
+
16
+ Rake::ExtensionTask.new("scws4r") do |ext|
17
+ ext.lib_dir = "lib/scws4r"
18
+ end
19
+
20
+ task default: %i[clobber compile spec rubocop]
Binary file
@@ -0,0 +1,291 @@
1
+ ;
2
+ ; auto regular(utf-8)
3
+ ; $Id$
4
+ ;
5
+ ; special word, 特殊词汇
6
+ ;
7
+
8
+ [special]
9
+ C++
10
+ C#
11
+ R&B
12
+ P&G
13
+ J++
14
+ J#
15
+ UTF-8
16
+ PS/2
17
+
18
+ ;
19
+ ; nostats
20
+ ;
21
+ [nostats]
22
+ about
23
+ all
24
+ also
25
+ an
26
+ and
27
+ any
28
+ are
29
+ as
30
+ at
31
+ be
32
+ but
33
+ by
34
+ both
35
+ can
36
+ for
37
+ from
38
+ have
39
+ here
40
+ if
41
+ in
42
+ is
43
+ it
44
+ no
45
+ not
46
+ of
47
+ on
48
+ or
49
+ our
50
+ out
51
+ that
52
+ the
53
+ this
54
+ to
55
+ up
56
+ us
57
+
58
+ ;
59
+ ; 词性语法规则表
60
+ ;
61
+ [attrs]
62
+ ; c 是连词
63
+ n + f(1) = 300
64
+ n + m(1) = 500
65
+ n(1) + v = 100
66
+ n + v(1) = 10
67
+ r + n(1) = 1000
68
+ r(1) + n = 100
69
+ d(1) + r = 100
70
+ d(1) + v = 100
71
+ v(1) + r = 100
72
+ n + m(1) = 500
73
+ v + f(1) = 30
74
+ v(1) + m = 100
75
+ v(1) + n = 3
76
+ a + u(1) = 5
77
+ v + n(1) = 5
78
+ u(1) + a = 2
79
+ c(1) + * = 50
80
+ * + c(1) = 50
81
+
82
+ ;
83
+ ; 名字停用词表
84
+ ;
85
+ [noname]
86
+ :line = no
87
+ 给的说对在和是被最所那这有将
88
+ 你会与他为不没很了啊哦呵把去
89
+
90
+ ;
91
+ ; 双字节符号
92
+ ;
93
+ [symbol]
94
+ :type = none
95
+ :line = no
96
+ `-=[]、‘;/。,|?》《:“{}+—)(*…%¥#·!~
97
+ ’”〕〈〉「」『』〖〗【】<>
98
+
99
+ ;
100
+ ; 姓和外文名共同部分
101
+ ;
102
+ [pubname]
103
+ :type = prefix
104
+ :line = no
105
+ :exclude = noname,symbol,alpha,chnum2
106
+ :znum = 1,2
107
+ :tf = 5.0
108
+ :idf = 3.5
109
+ :attr = nr
110
+ 艾安贝卜戴费福盖戈古赫华霍吉贾金柯赖劳雷黎利林卢
111
+ 鲁伦罗洛马麦米莫穆齐乔冉萨沙史斯温谢尤詹诸
112
+
113
+
114
+ [pubname2]
115
+ :type = prefix
116
+ :line = no
117
+ :exclude = noname,symbol,alpha,chnum2
118
+ :tf = 5.0
119
+ :idf = 3.5
120
+ :attr = nr
121
+ 伍陆
122
+
123
+ [pubname3]
124
+ :type = prefix
125
+ :line = no
126
+ :exclude = noname,symbol,alpha,chnum2
127
+ :tf = 5.0
128
+ :idf = 3.5
129
+ :attr = nr
130
+ 万章
131
+
132
+ ;
133
+ ; 单姓
134
+ ;
135
+ [surname]
136
+ :type = prefix
137
+ :line = no
138
+ :exclude = noname,symbol,alpha,chnum2
139
+ :tf = 5.0
140
+ :idf = 3.5
141
+ :attr = nr
142
+ :znum = 1,2
143
+
144
+ 敖白班包宝保鲍毕边卞柏蔡曹岑柴昌常车陈成程迟池褚
145
+ 楚储淳崔刀邓狄刁丁董窦杜端段樊范方房斐丰封冯凤伏
146
+ 傅甘高耿龚宫勾苟辜谷顾官关管桂郭韩杭郝禾何贺衡洪
147
+ 侯胡花黄稽姬纪季简翦姜江蒋焦晋靳荆居康空孔匡邝况
148
+ 蓝郎朗乐冷李理厉励连廉练良梁廖凌刘柳隆龙楼娄吕路
149
+ 骆麻满茅毛梅孟苗缪闵明牟倪聂牛钮农潘庞裴彭皮朴平
150
+ 蒲溥浦戚祁钱强秦丘邱仇裘屈瞿权饶任荣容阮瑞芮赛单
151
+ 商邵佘申沈盛石寿舒宋苏孙邰谭谈汤唐陶滕田佟仝屠涂
152
+ 汪王危韦魏卫蔚闻翁巫邬武吴奚习夏鲜席冼项萧解辛邢
153
+ 幸熊徐许宣薛荀颜阎言严彦晏燕杨阳姚叶蚁易殷银尹应
154
+ 英游于於鱼虞俞余禹喻郁尉袁岳云臧曾查翟湛张赵甄郑
155
+ 钟周朱竺祝庄卓宗邹祖左肖
156
+
157
+ ;
158
+ ; 复姓
159
+ ;
160
+ [surname2]
161
+ :type = prefix
162
+ :line = yes
163
+ :exclude = noname,symbol,alpha,chnum2
164
+ :tf = 5.0
165
+ :idf = 3.5
166
+ :attr = nr
167
+ :znum = 1, 2
168
+ 东郭
169
+ 公孙
170
+ 皇甫
171
+ 慕容
172
+ 欧阳
173
+ 单于
174
+ 司空
175
+ 司马
176
+ 司徒
177
+ 澹台
178
+ 诸葛
179
+
180
+ ;
181
+ ; 地点名称
182
+ ;
183
+ [areaname]
184
+ :type = suffix
185
+ :znum = 2
186
+ :exclude = noname,symbol,alpha,chnum2
187
+ :tf = 4.5
188
+ :idf = 3.0
189
+ :attr = ns
190
+ :line = no
191
+
192
+ 县市镇村乡区
193
+
194
+ ;
195
+ ; 双字地点名称
196
+ ;
197
+ [areaname2]
198
+ :type = suffix
199
+ :znum = 2
200
+ :exclude = noname,symbol,alpha,chnum2
201
+ :tf = 4.5
202
+ :idf = 3.0
203
+ :attr = ns
204
+ :line = yes
205
+ 东路
206
+ 西路
207
+ 支路
208
+ 街道
209
+ 南路
210
+ 北路
211
+
212
+
213
+ [munit]
214
+ :type = none
215
+ :line = no
216
+ 萬亿零年点分秒回节名个多届次集
217
+
218
+ [chnum0]
219
+ :type = prefix
220
+ :line = no
221
+ :tf = 2.5
222
+ :idf = 1.0
223
+ :attr = mt
224
+ :include = chnum2,chnum3,munit,pubname3
225
+
226
+
227
+ [chnum1]
228
+ :type = prefix
229
+ :include = chnum0,chnum1,munit,pubname3
230
+ :tf = 3.0
231
+ :idf = 1.0
232
+ :attr = mt
233
+ :line = no
234
+ 一二三四五六七八九十百千
235
+
236
+ [chnum2]
237
+ :type = prefix
238
+ :line = no
239
+ :tf = 3.0
240
+ :idf = 1.0
241
+ :attr = mt
242
+ :include = chnum0,chnum2,chnum3,munit,pubname3
243
+ 123456789
244
+
245
+ [chnum3]
246
+ :type = none
247
+ :line = no
248
+
249
+
250
+ [chnum4]
251
+ :type = prefix
252
+ :line = no
253
+ :tf = 3.0
254
+ :idf = 1.0
255
+ :attr = mt
256
+ :include = chnum4,munit,pubname2,pubname3
257
+
258
+ 壹贰叁肆柒捌玖拾佰仟
259
+
260
+ [chnum5]
261
+ :type = prefix
262
+ :line = no
263
+ :tf = 3.5
264
+ :idf = 2.0
265
+ :attr = nz
266
+ :include = chnum1,munit,pubname3,chnum2
267
+
268
+ 第每
269
+
270
+ [alpha]
271
+ :type = prefix
272
+ :line = no
273
+ :tf = 2.5
274
+ :idf = 1.0
275
+ :attr = en
276
+ :include = alpha
277
+
278
+ abcdefghijklmnopqrstuvwxyz
279
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ'
280
+
281
+ [foregin]
282
+ :type = prefix
283
+ :line = no
284
+ :tf = 4.0
285
+ :idf = 3.0
286
+ :attr = nr
287
+ :include = foregin,pubname,pubname2,pubname3
288
+ 阿克拉加内亚巴尔姆爱兰西伊杰纳布可夫勒特坦芬尼根登都
289
+ 伯泰胥俄科索沃森奥瓦茨普蒂塞维大莱德冈墨哥弗库澳哈兹
290
+ 乌奇切诺里基延达塔卡雅来波迈蓬什比摩曼乃休合娜迪凯帕
291
+ 桑佩蒙博托格泽及希匹印埃努烈累法图喀土腓耶逊宾