pluskid-rmmseg-cpp 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/data/words.dic +0 -22
- data/ext/rmmseg/rmmseg.cpp +1 -0
- data/lib/rmmseg/dictionary.rb +3 -1
- data/lib/rmmseg/ferret.rb +1 -52
- data/tools/convert.rb +114 -0
- metadata +2 -2
- data/data/punctuation.dic +0 -79
data/data/words.dic
CHANGED
@@ -2602,7 +2602,6 @@
|
|
2602
2602
|
4 犬牙相临
|
2603
2603
|
2 变参
|
2604
2604
|
2 大典
|
2605
|
-
3 #的话
|
2606
2605
|
2 标准
|
2607
2606
|
2 守护
|
2608
2607
|
2 过失
|
@@ -15090,7 +15089,6 @@
|
|
15090
15089
|
2 分开
|
15091
15090
|
4 中外观众
|
15092
15091
|
2 只只
|
15093
|
-
4 #的意思
|
15094
15092
|
4 悔过自责
|
15095
15093
|
2 为重
|
15096
15094
|
4 面带笑容
|
@@ -17189,7 +17187,6 @@
|
|
17189
17187
|
4 神怡心醉
|
17190
17188
|
4 沉吟不语
|
17191
17189
|
2 全岛
|
17192
|
-
3 #的卡
|
17193
17190
|
3 统计员
|
17194
17191
|
2 中师
|
17195
17192
|
2 不尽
|
@@ -20816,7 +20813,6 @@
|
|
20816
20813
|
2 极佳
|
20817
20814
|
2 夜闭
|
20818
20815
|
3 立体镜
|
20819
|
-
3 #的士
|
20820
20816
|
2 冥冥
|
20821
20817
|
4 拨号网络
|
20822
20818
|
2 水师
|
@@ -20838,7 +20834,6 @@
|
|
20838
20834
|
2 半透
|
20839
20835
|
2 或许
|
20840
20836
|
2 邻水
|
20841
|
-
3 #我刚
|
20842
20837
|
2 短局
|
20843
20838
|
3 近卫军
|
20844
20839
|
2 调卷
|
@@ -26193,7 +26188,6 @@
|
|
26193
26188
|
3 印刷体
|
26194
26189
|
3 饰头巾
|
26195
26190
|
2 参预
|
26196
|
-
4 #的确良
|
26197
26191
|
2 预设
|
26198
26192
|
2 一着
|
26199
26193
|
4 反战运动
|
@@ -27457,7 +27451,6 @@
|
|
27457
27451
|
2 家姓
|
27458
27452
|
2 绝妙
|
27459
27453
|
2 触网
|
27460
|
-
4 #的方向
|
27461
27454
|
3 厘米波
|
27462
27455
|
2 赌马
|
27463
27456
|
4 涅而不缁
|
@@ -29576,7 +29569,6 @@
|
|
29576
29569
|
2 阅读
|
29577
29570
|
3 超短波
|
29578
29571
|
2 论件
|
29579
|
-
3 #的当
|
29580
29572
|
4 鼎足之势
|
29581
29573
|
5 少壮不努力
|
29582
29574
|
4 八方呼应
|
@@ -36413,7 +36405,6 @@
|
|
36413
36405
|
2 午狮
|
36414
36406
|
2 人学
|
36415
36407
|
2 舒兰
|
36416
|
-
4 #的中部
|
36417
36408
|
2 弄弄
|
36418
36409
|
2 幼教
|
36419
36410
|
2 强行
|
@@ -41703,7 +41694,6 @@
|
|
41703
41694
|
3 受精卵
|
41704
41695
|
4 强拉硬扯
|
41705
41696
|
2 福特
|
41706
|
-
3 #的呢
|
41707
41697
|
2 舒卷
|
41708
41698
|
4 凌杂米盐
|
41709
41699
|
2 跃至
|
@@ -43930,7 +43920,6 @@
|
|
43930
43920
|
4 狂轰滥炸
|
43931
43921
|
2 火神
|
43932
43922
|
2 结节
|
43933
|
-
4 #的结果
|
43934
43923
|
2 铆钉
|
43935
43924
|
2 名目
|
43936
43925
|
4 甘言厚币
|
@@ -57228,7 +57217,6 @@
|
|
57228
57217
|
2 榆木
|
57229
57218
|
4 首恶必办
|
57230
57219
|
2 河池
|
57231
|
-
4 #的下方
|
57232
57220
|
2 滑动
|
57233
57221
|
4 克爱克威
|
57234
57222
|
4 言出患入
|
@@ -61452,7 +61440,6 @@
|
|
61452
61440
|
2 敲定
|
61453
61441
|
2 改动
|
61454
61442
|
2 复摆
|
61455
|
-
3 #而不
|
61456
61443
|
2 治乱
|
61457
61444
|
2 门外
|
61458
61445
|
3 粗制品
|
@@ -67612,7 +67599,6 @@
|
|
67612
67599
|
2 云块
|
67613
67600
|
2 氮族
|
67614
67601
|
2 基值
|
67615
|
-
4 #的样子
|
67616
67602
|
2 举手
|
67617
67603
|
5 从群众中来
|
67618
67604
|
3 触发器
|
@@ -74495,7 +74481,6 @@
|
|
74495
74481
|
3 八一队
|
74496
74482
|
3 讽刺文
|
74497
74483
|
2 桅帆
|
74498
|
-
3 #的心
|
74499
74484
|
2 茶房
|
74500
74485
|
2 要塌
|
74501
74486
|
2 附识
|
@@ -78987,7 +78972,6 @@
|
|
78987
78972
|
4 不复存在
|
78988
78973
|
3 减速器
|
78989
78974
|
3 触摸者
|
78990
|
-
4 #的年代
|
78991
78975
|
2 唇形
|
78992
78976
|
4 势难两全
|
78993
78977
|
2 恕不
|
@@ -79343,7 +79327,6 @@
|
|
79343
79327
|
4 晚生后学
|
79344
79328
|
4 不经一事
|
79345
79329
|
2 刺激
|
79346
|
-
3 #的人
|
79347
79330
|
2 空检
|
79348
79331
|
2 代职
|
79349
79332
|
4 既往不咎
|
@@ -95875,7 +95858,6 @@
|
|
95875
95858
|
2 好生
|
95876
95859
|
4 抽样调查
|
95877
95860
|
4 闻雷失箸
|
95878
|
-
4 #的时候
|
95879
95861
|
4 抱蔓摘瓜
|
95880
95862
|
2 变干
|
95881
95863
|
2 监测
|
@@ -102186,7 +102168,6 @@
|
|
102186
102168
|
2 英史
|
102187
102169
|
3 适配器
|
102188
102170
|
4 长驱直入
|
102189
|
-
3 #的死
|
102190
102171
|
2 鸥类
|
102191
102172
|
4 遁入空门
|
102192
102173
|
4 言听计从
|
@@ -111611,7 +111592,6 @@
|
|
111611
111592
|
4 摩天大厦
|
111612
111593
|
2 代营
|
111613
111594
|
2 相辅
|
111614
|
-
4 #的意义
|
111615
111595
|
2 钢管
|
111616
111596
|
2 行贿
|
111617
111597
|
4 雕章镂句
|
@@ -114283,7 +114263,6 @@
|
|
114283
114263
|
4 神经纤维
|
114284
114264
|
2 暗补
|
114285
114265
|
2 工矿
|
114286
|
-
3 #的确
|
114287
114266
|
4 新硎初试
|
114288
114267
|
2 猛醒
|
114289
114268
|
3 指定者
|
@@ -115249,7 +115228,6 @@
|
|
115249
115228
|
2 厚板
|
115250
115229
|
4 拥兵玩寇
|
115251
115230
|
2 互通
|
115252
|
-
4 #的一端
|
115253
115231
|
2 词法
|
115254
115232
|
4 互为表里
|
115255
115233
|
3 该记住
|
data/ext/rmmseg/rmmseg.cpp
CHANGED
data/lib/rmmseg/dictionary.rb
CHANGED
@@ -31,7 +31,9 @@ module RMMSeg
|
|
31
31
|
#
|
32
32
|
# except the first number is not the frequency, but the number of
|
33
33
|
# characters (not number of bytes) in the word.
|
34
|
-
#
|
34
|
+
#
|
35
|
+
# There's a script (convert.rb) in the tools directory that can be used
|
36
|
+
# to convert and normalize dictionaries.
|
35
37
|
attr_accessor :dictionaries
|
36
38
|
|
37
39
|
# Add a user defined dictionary, +type+ can be
|
data/lib/rmmseg/ferret.rb
CHANGED
@@ -20,7 +20,7 @@ module RMMSeg
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def token_stream(field, text)
|
23
|
-
t =
|
23
|
+
t = Tokenizer.new(text)
|
24
24
|
if @brk
|
25
25
|
@brk.call(t)
|
26
26
|
else
|
@@ -61,56 +61,5 @@ module RMMSeg
|
|
61
61
|
@algor = Algorithm.new(@text)
|
62
62
|
end
|
63
63
|
end
|
64
|
-
|
65
|
-
# PunctuationFilter filter out the stand alone Chinese
|
66
|
-
# punctuation tokens.
|
67
|
-
class PunctuationFilter < ::Ferret::Analysis::TokenStream
|
68
|
-
# The punctuation dictionary.
|
69
|
-
class Dictionary
|
70
|
-
include Singleton
|
71
|
-
|
72
|
-
DIC_FILE = File.join(File.dirname(__FILE__),
|
73
|
-
"..",
|
74
|
-
"..",
|
75
|
-
"data",
|
76
|
-
"punctuation.dic")
|
77
|
-
def initialize
|
78
|
-
@dic = Hash.new
|
79
|
-
File.open(DIC_FILE, "r") do |f|
|
80
|
-
f.each_line { |line|
|
81
|
-
@dic[line.chomp.freeze] = nil
|
82
|
-
}
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
def include?(str)
|
87
|
-
@dic.has_key?(str)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def initialize(stream)
|
92
|
-
@stream = stream
|
93
|
-
end
|
94
|
-
|
95
|
-
# Get next token, skip stand alone Chinese punctuations.
|
96
|
-
def next
|
97
|
-
token = @stream.next
|
98
|
-
dic = Dictionary.instance
|
99
|
-
|
100
|
-
until token.nil? || !(dic.include? token.text)
|
101
|
-
token = @stream.next
|
102
|
-
end
|
103
|
-
|
104
|
-
token
|
105
|
-
end
|
106
|
-
|
107
|
-
def text
|
108
|
-
@stream.text
|
109
|
-
end
|
110
|
-
|
111
|
-
def text=(str)
|
112
|
-
@stream.text = str
|
113
|
-
end
|
114
|
-
end
|
115
64
|
end
|
116
65
|
end
|
data/tools/convert.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# A utility used to convert the old RMMSeg dictionary
|
4
|
+
# to rmmseg-cpp format.
|
5
|
+
|
6
|
+
# There are several constraints for the new rmmseg-cpp
|
7
|
+
# dictionary format.
|
8
|
+
# - length of word should be specified in the dict
|
9
|
+
# - number and string should be separated by ONE space
|
10
|
+
# - there should be a newline at the end of file
|
11
|
+
|
12
|
+
$KCODE='u'
|
13
|
+
require 'jcode'
|
14
|
+
|
15
|
+
def usage(msg=nil)
|
16
|
+
puts "***ERROR: #{msg}\n\n" if msg
|
17
|
+
puts <<EOT
|
18
|
+
Usage:
|
19
|
+
|
20
|
+
#{$0} action type input.dic output.dic
|
21
|
+
|
22
|
+
action: either 'convert' or 'normalize'
|
23
|
+
- 'convert' is used to convert the dict from
|
24
|
+
old RMMSeg format.
|
25
|
+
- 'normalize' is used to normalize an existing
|
26
|
+
rmmseg-cpp dict.
|
27
|
+
|
28
|
+
type: either 'words' or 'chars'
|
29
|
+
|
30
|
+
EOT
|
31
|
+
exit(0)
|
32
|
+
end
|
33
|
+
|
34
|
+
usage if ARGV.size != 4
|
35
|
+
usage("unknown action #{ARGV[0]}") if ! ['convert', 'normalize'].include? ARGV[0]
|
36
|
+
usage("unknown type #{ARGV[1]}") if ! ['words', 'chars'].include? ARGV[1]
|
37
|
+
|
38
|
+
def output(data)
|
39
|
+
File.open(ARGV[3], "w") do |f|
|
40
|
+
data.each do |num, word|
|
41
|
+
f.puts "#{num} #{word}" if word
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def read_RMMSeg_chars
|
47
|
+
max = 0
|
48
|
+
File.readlines(ARGV[2]).map do |line|
|
49
|
+
if line =~ /^(.)\s+(\d+)$/
|
50
|
+
n = $2.to_i
|
51
|
+
max = n if n > max
|
52
|
+
[n, $1]
|
53
|
+
else
|
54
|
+
[nil, nil]
|
55
|
+
end
|
56
|
+
end.map do |num, word|
|
57
|
+
if word
|
58
|
+
[num*65535/max, word]
|
59
|
+
else
|
60
|
+
[nil, nil]
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def read_RMMSeg_words
|
66
|
+
File.readlines(ARGV[2]).map do |line|
|
67
|
+
line.chomp!
|
68
|
+
if !line.empty?
|
69
|
+
[line.jlength, line]
|
70
|
+
else
|
71
|
+
[nil, nil]
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def read_rmmseg_cpp_chars
|
77
|
+
max = 0
|
78
|
+
File.readlines(ARGV[2]).map do |line|
|
79
|
+
if line =~ /^(\d+)\s+(.)$/
|
80
|
+
n = $1.to_i
|
81
|
+
max = n if n > max
|
82
|
+
[n, $2]
|
83
|
+
else
|
84
|
+
[nil, nil]
|
85
|
+
end
|
86
|
+
end.map do |num, word|
|
87
|
+
if word
|
88
|
+
[num*65535/max, word]
|
89
|
+
else
|
90
|
+
[nil, nil]
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
def read_rmmseg_cpp_words
|
96
|
+
File.readlines(ARGV[2]).map do |line|
|
97
|
+
if line =~ /^(\d+)\s+(\w+)$/
|
98
|
+
[$1, $2]
|
99
|
+
else
|
100
|
+
[nil, nil]
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
case ARGV[0,2]
|
106
|
+
when ['convert', 'chars']
|
107
|
+
output(read_RMMSeg_chars)
|
108
|
+
when ['convert', 'words']
|
109
|
+
output(read_RMMSeg_words)
|
110
|
+
when ['normalize', 'chars']
|
111
|
+
output(read_rmmseg_cpp_chars)
|
112
|
+
when ['normalize', 'words']
|
113
|
+
output(read_rmmseg_cpp_words)
|
114
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pluskid-rmmseg-cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pluskid
|
@@ -36,9 +36,9 @@ files:
|
|
36
36
|
- ext/rmmseg/word.h
|
37
37
|
- ext/rmmseg/chunk.h
|
38
38
|
- ext/rmmseg/memory.h
|
39
|
-
- data/punctuation.dic
|
40
39
|
- data/words.dic
|
41
40
|
- data/chars.dic
|
41
|
+
- tools/convert.rb
|
42
42
|
- README
|
43
43
|
- ext/rmmseg/extconf.rb
|
44
44
|
has_rdoc: true
|
data/data/punctuation.dic
DELETED
@@ -1,79 +0,0 @@
|
|
1
|
-
{
|
2
|
-
×
|
3
|
-
π
|
4
|
-
)
|
5
|
-
〖
|
6
|
-
;
|
7
|
-
〗
|
8
|
-
<
|
9
|
-
°
|
10
|
-
“
|
11
|
-
+
|
12
|
-
◆
|
13
|
-
♀
|
14
|
-
=
|
15
|
-
±
|
16
|
-
←
|
17
|
-
}
|
18
|
-
,
|
19
|
-
”
|
20
|
-
㎡
|
21
|
-
◇
|
22
|
-
>
|
23
|
-
↑
|
24
|
-
~
|
25
|
-
△
|
26
|
-
?
|
27
|
-
♂
|
28
|
-
‰
|
29
|
-
——
|
30
|
-
→
|
31
|
-
■
|
32
|
-
¥
|
33
|
-
-
|
34
|
-
@
|
35
|
-
≈
|
36
|
-
↓
|
37
|
-
□
|
38
|
-
〈
|
39
|
-
′
|
40
|
-
〉
|
41
|
-
/
|
42
|
-
★
|
43
|
-
《
|
44
|
-
○
|
45
|
-
″
|
46
|
-
☆
|
47
|
-
》
|
48
|
-
·
|
49
|
-
∶
|
50
|
-
!
|
51
|
-
『
|
52
|
-
§
|
53
|
-
●
|
54
|
-
』
|
55
|
-
…
|
56
|
-
【
|
57
|
-
℃
|
58
|
-
#
|
59
|
-
÷
|
60
|
-
】
|
61
|
-
№
|
62
|
-
$
|
63
|
-
※
|
64
|
-
≤
|
65
|
-
‖
|
66
|
-
%
|
67
|
-
≥
|
68
|
-
|
69
|
-
&
|
70
|
-
、
|
71
|
-
‘
|
72
|
-
〔
|
73
|
-
。
|
74
|
-
’
|
75
|
-
√
|
76
|
-
(
|
77
|
-
〕
|
78
|
-
£
|
79
|
-
:
|