scws4r 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 2f35634967bfc1105d2a014dc66399455e821a4de9e9f7f47fe805711f20239f
4
+ data.tar.gz: 608912dbd14c16576e1dd14465a5beea562e677a6e37b075465d2fe6b4262804
5
+ SHA512:
6
+ metadata.gz: a176641cdf8518528591e5e11f803d6d99cd0cb7087af6a2830771436e45cea0b68c383dd7321abe2f1c8e7f996df436f13d58b8c3357086ac70d708100c799e
7
+ data.tar.gz: 26963a99702b3e1cfe82b81edf0400155fbfe3b8639633ad25902e7b221046d366200e4b7d951c0f7a72d2d077825786aa5a444cc8ef74931a8eed3977a96550
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,13 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.6
3
+
4
+ Style/StringLiterals:
5
+ Enabled: true
6
+ EnforcedStyle: double_quotes
7
+
8
+ Style/StringLiteralsInInterpolation:
9
+ Enabled: true
10
+ EnforcedStyle: double_quotes
11
+
12
+ Layout/LineLength:
13
+ Max: 120
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2022-04-08
4
+
5
+ - Initial release
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in scws.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+ gem "rake-compiler"
10
+ gem "rspec", "~> 3.0"
11
+ gem "rubocop", "~> 1.21"
12
+
13
+ gem "pry", "~> 0.14.1"
data/Gemfile.lock ADDED
@@ -0,0 +1,65 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ scws4r (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ast (2.4.2)
10
+ coderay (1.1.3)
11
+ diff-lcs (1.5.0)
12
+ method_source (1.0.0)
13
+ parallel (1.22.1)
14
+ parser (3.1.1.0)
15
+ ast (~> 2.4.1)
16
+ pry (0.14.1)
17
+ coderay (~> 1.1)
18
+ method_source (~> 1.0)
19
+ rainbow (3.1.1)
20
+ rake (13.0.6)
21
+ rake-compiler (1.2.0)
22
+ rake
23
+ regexp_parser (2.3.0)
24
+ rexml (3.2.5)
25
+ rspec (3.11.0)
26
+ rspec-core (~> 3.11.0)
27
+ rspec-expectations (~> 3.11.0)
28
+ rspec-mocks (~> 3.11.0)
29
+ rspec-core (3.11.0)
30
+ rspec-support (~> 3.11.0)
31
+ rspec-expectations (3.11.0)
32
+ diff-lcs (>= 1.2.0, < 2.0)
33
+ rspec-support (~> 3.11.0)
34
+ rspec-mocks (3.11.1)
35
+ diff-lcs (>= 1.2.0, < 2.0)
36
+ rspec-support (~> 3.11.0)
37
+ rspec-support (3.11.0)
38
+ rubocop (1.27.0)
39
+ parallel (~> 1.10)
40
+ parser (>= 3.1.0.0)
41
+ rainbow (>= 2.2.2, < 4.0)
42
+ regexp_parser (>= 1.8, < 3.0)
43
+ rexml
44
+ rubocop-ast (>= 1.16.0, < 2.0)
45
+ ruby-progressbar (~> 1.7)
46
+ unicode-display_width (>= 1.4.0, < 3.0)
47
+ rubocop-ast (1.17.0)
48
+ parser (>= 3.1.1.0)
49
+ ruby-progressbar (1.11.0)
50
+ unicode-display_width (2.1.0)
51
+
52
+ PLATFORMS
53
+ ruby
54
+ x86_64-darwin-21
55
+
56
+ DEPENDENCIES
57
+ pry (~> 0.14.1)
58
+ rake (~> 13.0)
59
+ rake-compiler
60
+ rspec (~> 3.0)
61
+ rubocop (~> 1.21)
62
+ scws4r!
63
+
64
+ BUNDLED WITH
65
+ 2.3.17
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2022 张小辉
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # SCWS (Simple Chinese Word Segmentation) for Ruby
2
+
3
+ This is a Ruby gem that integrates the [scws](https://github.com/hightman/scws/blob/master/API.md) C library.
4
+
5
+ ## Installation
6
+
7
+ Install the gem and add to the application's Gemfile by executing:
8
+
9
+ $ bundle add scws4r
10
+
11
+ If bundler is not being used to manage dependencies, install the gem by executing:
12
+
13
+ $ gem install scws4r
14
+
15
+ ## Usage
16
+
17
+ ```ruby
18
+ 2.7.2 :001 > require 'scws4r'
19
+ => true
20
+ 2.7.2 :002 > s = Scws4r.new
21
+ => #<Scws4r:0x00007fbc2b1bc878>
22
+ 2.7.2 :003 > s.load_defaults
23
+ => false
24
+ 2.7.2 :004 > puts s.split('保障房资金压力')
25
+ {"offset"=>0, "idf"=>4.889999866485596, "length"=>6, "text"=>"保障", "attr"=>"vn"}
26
+ {"offset"=>6, "idf"=>0.0, "length"=>3, "text"=>"房", "attr"=>"n"}
27
+ {"offset"=>9, "idf"=>4.880000114440918, "length"=>6, "text"=>"资金", "attr"=>"n"}
28
+ {"offset"=>15, "idf"=>4.900000095367432, "length"=>6, "text"=>"压力", "attr"=>"n"}
29
+ => nil
30
+ 2.7.2 :005 > puts s.tops('保障房资金压力', 10)
31
+ {"times"=>1, "weight"=>4.900000095367432, "word"=>"压力", "attr"=>"n"}
32
+ {"times"=>1, "weight"=>4.889999866485596, "word"=>"保障", "attr"=>"vn"}
33
+ {"times"=>1, "weight"=>4.880000114440918, "word"=>"资金", "attr"=>"n"}
34
+ => nil
35
+ 2.7.2 :006 >
36
+ ```
37
+
38
+ ## Development
39
+
40
+ After checking out the repo, run `bin/setup` to install dependencies. Then run `rake compile` to build the C extension, and run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
41
+
42
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
43
+
44
+ ## References
45
+
46
+ - https://docs.ruby-lang.org/en/2.4.0/extension_rdoc.html
47
+ - https://github.com/hightman/scws
48
+ - https://github.com/amutu/zhparser
49
+
50
+ ## Contributing
51
+
52
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/scws.
53
+
54
+ ## License
55
+
56
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ require "rake/extensiontask"
13
+
14
+ task build: :compile
15
+
16
+ Rake::ExtensionTask.new("scws4r") do |ext|
17
+ ext.lib_dir = "lib/scws4r"
18
+ end
19
+
20
+ task default: %i[clobber compile spec rubocop]
Binary file
@@ -0,0 +1,291 @@
1
+ ;
2
+ ; auto regular(utf-8)
3
+ ; $Id$
4
+ ;
5
+ ; special word, 特殊词汇
6
+ ;
7
+
8
+ [special]
9
+ C++
10
+ C#
11
+ R&B
12
+ P&G
13
+ J++
14
+ J#
15
+ UTF-8
16
+ PS/2
17
+
18
+ ;
19
+ ; nostats
20
+ ;
21
+ [nostats]
22
+ about
23
+ all
24
+ also
25
+ an
26
+ and
27
+ any
28
+ are
29
+ as
30
+ at
31
+ be
32
+ but
33
+ by
34
+ both
35
+ can
36
+ for
37
+ from
38
+ have
39
+ here
40
+ if
41
+ in
42
+ is
43
+ it
44
+ no
45
+ not
46
+ of
47
+ on
48
+ or
49
+ our
50
+ out
51
+ that
52
+ the
53
+ this
54
+ to
55
+ up
56
+ us
57
+
58
+ ;
59
+ ; 词性语法规则表
60
+ ;
61
+ [attrs]
62
+ ; c 是连词
63
+ n + f(1) = 300
64
+ n + m(1) = 500
65
+ n(1) + v = 100
66
+ n + v(1) = 10
67
+ r + n(1) = 1000
68
+ r(1) + n = 100
69
+ d(1) + r = 100
70
+ d(1) + v = 100
71
+ v(1) + r = 100
72
+ n + m(1) = 500
73
+ v + f(1) = 30
74
+ v(1) + m = 100
75
+ v(1) + n = 3
76
+ a + u(1) = 5
77
+ v + n(1) = 5
78
+ u(1) + a = 2
79
+ c(1) + * = 50
80
+ * + c(1) = 50
81
+
82
+ ;
83
+ ; 名字停用词表
84
+ ;
85
+ [noname]
86
+ :line = no
87
+ 给的说对在和是被最所那这有将
88
+ 你会与他为不没很了啊哦呵把去
89
+
90
+ ;
91
+ ; 双字节符号
92
+ ;
93
+ [symbol]
94
+ :type = none
95
+ :line = no
96
+ `-=[]、‘;/。,|?》《:“{}+—)(*…%¥#·!~
97
+ ’”〕〈〉「」『』〖〗【】<>
98
+
99
+ ;
100
+ ; 姓和外文名共同部分
101
+ ;
102
+ [pubname]
103
+ :type = prefix
104
+ :line = no
105
+ :exclude = noname,symbol,alpha,chnum2
106
+ :znum = 1,2
107
+ :tf = 5.0
108
+ :idf = 3.5
109
+ :attr = nr
110
+ 艾安贝卜戴费福盖戈古赫华霍吉贾金柯赖劳雷黎利林卢
111
+ 鲁伦罗洛马麦米莫穆齐乔冉萨沙史斯温谢尤詹诸
112
+
113
+
114
+ [pubname2]
115
+ :type = prefix
116
+ :line = no
117
+ :exclude = noname,symbol,alpha,chnum2
118
+ :tf = 5.0
119
+ :idf = 3.5
120
+ :attr = nr
121
+ 伍陆
122
+
123
+ [pubname3]
124
+ :type = prefix
125
+ :line = no
126
+ :exclude = noname,symbol,alpha,chnum2
127
+ :tf = 5.0
128
+ :idf = 3.5
129
+ :attr = nr
130
+ 万章
131
+
132
+ ;
133
+ ; 单姓
134
+ ;
135
+ [surname]
136
+ :type = prefix
137
+ :line = no
138
+ :exclude = noname,symbol,alpha,chnum2
139
+ :tf = 5.0
140
+ :idf = 3.5
141
+ :attr = nr
142
+ :znum = 1,2
143
+
144
+ 敖白班包宝保鲍毕边卞柏蔡曹岑柴昌常车陈成程迟池褚
145
+ 楚储淳崔刀邓狄刁丁董窦杜端段樊范方房斐丰封冯凤伏
146
+ 傅甘高耿龚宫勾苟辜谷顾官关管桂郭韩杭郝禾何贺衡洪
147
+ 侯胡花黄稽姬纪季简翦姜江蒋焦晋靳荆居康空孔匡邝况
148
+ 蓝郎朗乐冷李理厉励连廉练良梁廖凌刘柳隆龙楼娄吕路
149
+ 骆麻满茅毛梅孟苗缪闵明牟倪聂牛钮农潘庞裴彭皮朴平
150
+ 蒲溥浦戚祁钱强秦丘邱仇裘屈瞿权饶任荣容阮瑞芮赛单
151
+ 商邵佘申沈盛石寿舒宋苏孙邰谭谈汤唐陶滕田佟仝屠涂
152
+ 汪王危韦魏卫蔚闻翁巫邬武吴奚习夏鲜席冼项萧解辛邢
153
+ 幸熊徐许宣薛荀颜阎言严彦晏燕杨阳姚叶蚁易殷银尹应
154
+ 英游于於鱼虞俞余禹喻郁尉袁岳云臧曾查翟湛张赵甄郑
155
+ 钟周朱竺祝庄卓宗邹祖左肖
156
+
157
+ ;
158
+ ; 复姓
159
+ ;
160
+ [surname2]
161
+ :type = prefix
162
+ :line = yes
163
+ :exclude = noname,symbol,alpha,chnum2
164
+ :tf = 5.0
165
+ :idf = 3.5
166
+ :attr = nr
167
+ :znum = 1, 2
168
+ 东郭
169
+ 公孙
170
+ 皇甫
171
+ 慕容
172
+ 欧阳
173
+ 单于
174
+ 司空
175
+ 司马
176
+ 司徒
177
+ 澹台
178
+ 诸葛
179
+
180
+ ;
181
+ ; 地点名称
182
+ ;
183
+ [areaname]
184
+ :type = suffix
185
+ :znum = 2
186
+ :exclude = noname,symbol,alpha,chnum2
187
+ :tf = 4.5
188
+ :idf = 3.0
189
+ :attr = ns
190
+ :line = no
191
+
192
+ 县市镇村乡区
193
+
194
+ ;
195
+ ; 双字地点名称
196
+ ;
197
+ [areaname2]
198
+ :type = suffix
199
+ :znum = 2
200
+ :exclude = noname,symbol,alpha,chnum2
201
+ :tf = 4.5
202
+ :idf = 3.0
203
+ :attr = ns
204
+ :line = yes
205
+ 东路
206
+ 西路
207
+ 支路
208
+ 街道
209
+ 南路
210
+ 北路
211
+
212
+
213
+ [munit]
214
+ :type = none
215
+ :line = no
216
+ 萬亿零年点分秒回节名个多届次集
217
+
218
+ [chnum0]
219
+ :type = prefix
220
+ :line = no
221
+ :tf = 2.5
222
+ :idf = 1.0
223
+ :attr = mt
224
+ :include = chnum2,chnum3,munit,pubname3
225
+
226
+
227
+ [chnum1]
228
+ :type = prefix
229
+ :include = chnum0,chnum1,munit,pubname3
230
+ :tf = 3.0
231
+ :idf = 1.0
232
+ :attr = mt
233
+ :line = no
234
+ 一二三四五六七八九十百千
235
+
236
+ [chnum2]
237
+ :type = prefix
238
+ :line = no
239
+ :tf = 3.0
240
+ :idf = 1.0
241
+ :attr = mt
242
+ :include = chnum0,chnum2,chnum3,munit,pubname3
243
+ 123456789
244
+
245
+ [chnum3]
246
+ :type = none
247
+ :line = no
248
+
249
+
250
+ [chnum4]
251
+ :type = prefix
252
+ :line = no
253
+ :tf = 3.0
254
+ :idf = 1.0
255
+ :attr = mt
256
+ :include = chnum4,munit,pubname2,pubname3
257
+
258
+ 壹贰叁肆柒捌玖拾佰仟
259
+
260
+ [chnum5]
261
+ :type = prefix
262
+ :line = no
263
+ :tf = 3.5
264
+ :idf = 2.0
265
+ :attr = nz
266
+ :include = chnum1,munit,pubname3,chnum2
267
+
268
+ 第每
269
+
270
+ [alpha]
271
+ :type = prefix
272
+ :line = no
273
+ :tf = 2.5
274
+ :idf = 1.0
275
+ :attr = en
276
+ :include = alpha
277
+
278
+ abcdefghijklmnopqrstuvwxyz
279
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ'
280
+
281
+ [foregin]
282
+ :type = prefix
283
+ :line = no
284
+ :tf = 4.0
285
+ :idf = 3.0
286
+ :attr = nr
287
+ :include = foregin,pubname,pubname2,pubname3
288
+ 阿克拉加内亚巴尔姆爱兰西伊杰纳布可夫勒特坦芬尼根登都
289
+ 伯泰胥俄科索沃森奥瓦茨普蒂塞维大莱德冈墨哥弗库澳哈兹
290
+ 乌奇切诺里基延达塔卡雅来波迈蓬什比摩曼乃休合娜迪凯帕
291
+ 桑佩蒙博托格泽及希匹印埃努烈累法图喀土腓耶逊宾