pluskid-rmmseg-cpp 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/data/words.dic CHANGED
@@ -2602,7 +2602,6 @@
2602
2602
  4 犬牙相临
2603
2603
  2 变参
2604
2604
  2 大典
2605
- 3 #的话
2606
2605
  2 标准
2607
2606
  2 守护
2608
2607
  2 过失
@@ -15090,7 +15089,6 @@
15090
15089
  2 分开
15091
15090
  4 中外观众
15092
15091
  2 只只
15093
- 4 #的意思
15094
15092
  4 悔过自责
15095
15093
  2 为重
15096
15094
  4 面带笑容
@@ -17189,7 +17187,6 @@
17189
17187
  4 神怡心醉
17190
17188
  4 沉吟不语
17191
17189
  2 全岛
17192
- 3 #的卡
17193
17190
  3 统计员
17194
17191
  2 中师
17195
17192
  2 不尽
@@ -20816,7 +20813,6 @@
20816
20813
  2 极佳
20817
20814
  2 夜闭
20818
20815
  3 立体镜
20819
- 3 #的士
20820
20816
  2 冥冥
20821
20817
  4 拨号网络
20822
20818
  2 水师
@@ -20838,7 +20834,6 @@
20838
20834
  2 半透
20839
20835
  2 或许
20840
20836
  2 邻水
20841
- 3 #我刚
20842
20837
  2 短局
20843
20838
  3 近卫军
20844
20839
  2 调卷
@@ -26193,7 +26188,6 @@
26193
26188
  3 印刷体
26194
26189
  3 饰头巾
26195
26190
  2 参预
26196
- 4 #的确良
26197
26191
  2 预设
26198
26192
  2 一着
26199
26193
  4 反战运动
@@ -27457,7 +27451,6 @@
27457
27451
  2 家姓
27458
27452
  2 绝妙
27459
27453
  2 触网
27460
- 4 #的方向
27461
27454
  3 厘米波
27462
27455
  2 赌马
27463
27456
  4 涅而不缁
@@ -29576,7 +29569,6 @@
29576
29569
  2 阅读
29577
29570
  3 超短波
29578
29571
  2 论件
29579
- 3 #的当
29580
29572
  4 鼎足之势
29581
29573
  5 少壮不努力
29582
29574
  4 八方呼应
@@ -36413,7 +36405,6 @@
36413
36405
  2 午狮
36414
36406
  2 人学
36415
36407
  2 舒兰
36416
- 4 #的中部
36417
36408
  2 弄弄
36418
36409
  2 幼教
36419
36410
  2 强行
@@ -41703,7 +41694,6 @@
41703
41694
  3 受精卵
41704
41695
  4 强拉硬扯
41705
41696
  2 福特
41706
- 3 #的呢
41707
41697
  2 舒卷
41708
41698
  4 凌杂米盐
41709
41699
  2 跃至
@@ -43930,7 +43920,6 @@
43930
43920
  4 狂轰滥炸
43931
43921
  2 火神
43932
43922
  2 结节
43933
- 4 #的结果
43934
43923
  2 铆钉
43935
43924
  2 名目
43936
43925
  4 甘言厚币
@@ -57228,7 +57217,6 @@
57228
57217
  2 榆木
57229
57218
  4 首恶必办
57230
57219
  2 河池
57231
- 4 #的下方
57232
57220
  2 滑动
57233
57221
  4 克爱克威
57234
57222
  4 言出患入
@@ -61452,7 +61440,6 @@
61452
61440
  2 敲定
61453
61441
  2 改动
61454
61442
  2 复摆
61455
- 3 #而不
61456
61443
  2 治乱
61457
61444
  2 门外
61458
61445
  3 粗制品
@@ -67612,7 +67599,6 @@
67612
67599
  2 云块
67613
67600
  2 氮族
67614
67601
  2 基值
67615
- 4 #的样子
67616
67602
  2 举手
67617
67603
  5 从群众中来
67618
67604
  3 触发器
@@ -74495,7 +74481,6 @@
74495
74481
  3 八一队
74496
74482
  3 讽刺文
74497
74483
  2 桅帆
74498
- 3 #的心
74499
74484
  2 茶房
74500
74485
  2 要塌
74501
74486
  2 附识
@@ -78987,7 +78972,6 @@
78987
78972
  4 不复存在
78988
78973
  3 减速器
78989
78974
  3 触摸者
78990
- 4 #的年代
78991
78975
  2 唇形
78992
78976
  4 势难两全
78993
78977
  2 恕不
@@ -79343,7 +79327,6 @@
79343
79327
  4 晚生后学
79344
79328
  4 不经一事
79345
79329
  2 刺激
79346
- 3 #的人
79347
79330
  2 空检
79348
79331
  2 代职
79349
79332
  4 既往不咎
@@ -95875,7 +95858,6 @@
95875
95858
  2 好生
95876
95859
  4 抽样调查
95877
95860
  4 闻雷失箸
95878
- 4 #的时候
95879
95861
  4 抱蔓摘瓜
95880
95862
  2 变干
95881
95863
  2 监测
@@ -102186,7 +102168,6 @@
102186
102168
  2 英史
102187
102169
  3 适配器
102188
102170
  4 长驱直入
102189
- 3 #的死
102190
102171
  2 鸥类
102191
102172
  4 遁入空门
102192
102173
  4 言听计从
@@ -111611,7 +111592,6 @@
111611
111592
  4 摩天大厦
111612
111593
  2 代营
111613
111594
  2 相辅
111614
- 4 #的意义
111615
111595
  2 钢管
111616
111596
  2 行贿
111617
111597
  4 雕章镂句
@@ -114283,7 +114263,6 @@
114283
114263
  4 神经纤维
114284
114264
  2 暗补
114285
114265
  2 工矿
114286
- 3 #的确
114287
114266
  4 新硎初试
114288
114267
  2 猛醒
114289
114268
  3 指定者
@@ -115249,7 +115228,6 @@
115249
115228
  2 厚板
115250
115229
  4 拥兵玩寇
115251
115230
  2 互通
115252
- 4 #的一端
115253
115231
  2 词法
115254
115232
  4 互为表里
115255
115233
  3 该记住
@@ -68,6 +68,7 @@ extern "C" {
68
68
 
69
69
  static void tk_mark(Token *t)
70
70
  {
71
+ // start and end are Fixnums, no need to mark
71
72
  rb_gc_mark(t->text);
72
73
  }
73
74
  static void tk_free(Token *t)
@@ -31,7 +31,9 @@ module RMMSeg
31
31
  #
32
32
  # except the first number is not the frequency, but the number of
33
33
  # characters (not number of bytes) in the word.
34
- #
34
+ #
35
+ # There's a script (convert.rb) in the tools directory that can be used
36
+ # to convert and normalize dictionaries.
35
37
  attr_accessor :dictionaries
36
38
 
37
39
  # Add a user defined dictionary, +type+ can be
data/lib/rmmseg/ferret.rb CHANGED
@@ -20,7 +20,7 @@ module RMMSeg
20
20
  end
21
21
 
22
22
  def token_stream(field, text)
23
- t = PunctuationFilter.new(Tokenizer.new(text))
23
+ t = Tokenizer.new(text)
24
24
  if @brk
25
25
  @brk.call(t)
26
26
  else
@@ -61,56 +61,5 @@ module RMMSeg
61
61
  @algor = Algorithm.new(@text)
62
62
  end
63
63
  end
64
-
65
- # PunctuationFilter filter out the stand alone Chinese
66
- # punctuation tokens.
67
- class PunctuationFilter < ::Ferret::Analysis::TokenStream
68
- # The punctuation dictionary.
69
- class Dictionary
70
- include Singleton
71
-
72
- DIC_FILE = File.join(File.dirname(__FILE__),
73
- "..",
74
- "..",
75
- "data",
76
- "punctuation.dic")
77
- def initialize
78
- @dic = Hash.new
79
- File.open(DIC_FILE, "r") do |f|
80
- f.each_line { |line|
81
- @dic[line.chomp.freeze] = nil
82
- }
83
- end
84
- end
85
-
86
- def include?(str)
87
- @dic.has_key?(str)
88
- end
89
- end
90
-
91
- def initialize(stream)
92
- @stream = stream
93
- end
94
-
95
- # Get next token, skip stand alone Chinese punctuations.
96
- def next
97
- token = @stream.next
98
- dic = Dictionary.instance
99
-
100
- until token.nil? || !(dic.include? token.text)
101
- token = @stream.next
102
- end
103
-
104
- token
105
- end
106
-
107
- def text
108
- @stream.text
109
- end
110
-
111
- def text=(str)
112
- @stream.text = str
113
- end
114
- end
115
64
  end
116
65
  end
data/tools/convert.rb ADDED
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # A utility used to convert the old RMMSeg dictionary
4
+ # to rmmseg-cpp format.
5
+
6
+ # There are several constrains for the new rmmseg-cpp
7
+ # dictionary format.
8
+ # - length of word should be specified in the dict
9
+ # - number and string should be separated by ONE space
10
+ # - there should be a newline at the end of file
11
+
12
+ $KCODE='u'
13
+ require 'jcode'
14
+
15
+ def usage(msg=nil)
16
+ puts "***ERROR: #{msg}\n\n" if msg
17
+ puts <<EOT
18
+ Usage:
19
+
20
+ #{$0} action type input.dic output.dic
21
+
22
+ action: either 'convert' or 'normalize'
23
+ - 'convert' is used to convert the dict from
24
+ old RMMSeg format.
25
+ - 'normalize' is used to normalize an existing
26
+ rmmseg-cpp dict.
27
+
28
+ type: either 'words' or 'chars'
29
+
30
+ EOT
31
+ exit(0)
32
+ end
33
+
34
+ usage if ARGV.size != 4
35
+ usage("unknown action #{ARGV[0]}") if ! ['convert', 'normalize'].include? ARGV[0]
36
+ usage("unknown type #{ARGV[1]}") if ! ['words', 'chars'].include? ARGV[1]
37
+
38
+ def output(data)
39
+ File.open(ARGV[3], "w") do |f|
40
+ data.each do |num, word|
41
+ f.puts "#{num} #{word}" if word
42
+ end
43
+ end
44
+ end
45
+
46
+ def read_RMMSeg_chars
47
+ max = 0
48
+ File.readlines(ARGV[2]).map do |line|
49
+ if line =~ /^(.)\s+(\d+)$/
50
+ n = $2.to_i
51
+ max = n if n > max
52
+ [n, $1]
53
+ else
54
+ [nil, nil]
55
+ end
56
+ end.map do |num, word|
57
+ if word
58
+ [num*65535/max, word]
59
+ else
60
+ [nil, nil]
61
+ end
62
+ end
63
+ end
64
+
65
+ def read_RMMSeg_words
66
+ File.readlines(ARGV[2]).map do |line|
67
+ line.chomp!
68
+ if !line.empty?
69
+ [line.jlength, line]
70
+ else
71
+ [nil, nil]
72
+ end
73
+ end
74
+ end
75
+
76
+ def read_rmmseg_cpp_chars
77
+ max = 0
78
+ File.readlines(ARGV[2]).map do |line|
79
+ if line =~ /^(\d+)\s+(.)$/
80
+ n = $1.to_i
81
+ max = n if n > max
82
+ [n, $2]
83
+ else
84
+ [nil, nil]
85
+ end
86
+ end.map do |num, word|
87
+ if word
88
+ [num*65535/max, word]
89
+ else
90
+ [nil, nil]
91
+ end
92
+ end
93
+ end
94
+
95
+ def read_rmmseg_cpp_words
96
+ File.readlines(ARGV[2]).map do |line|
97
+ if line =~ /^(\d+)\s+(\w+)$/
98
+ [$1, $2]
99
+ else
100
+ [nil, nil]
101
+ end
102
+ end
103
+ end
104
+
105
+ case ARGV[0,2]
106
+ when ['convert', 'chars']
107
+ output(read_RMMSeg_chars)
108
+ when ['convert', 'words']
109
+ output(read_RMMSeg_words)
110
+ when ['normalize', 'chars']
111
+ output(read_rmmseg_cpp_chars)
112
+ when ['normalize', 'words']
113
+ output(read_rmmseg_cpp_words)
114
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pluskid-rmmseg-cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -36,9 +36,9 @@ files:
36
36
  - ext/rmmseg/word.h
37
37
  - ext/rmmseg/chunk.h
38
38
  - ext/rmmseg/memory.h
39
- - data/punctuation.dic
40
39
  - data/words.dic
41
40
  - data/chars.dic
41
+ - tools/convert.rb
42
42
  - README
43
43
  - ext/rmmseg/extconf.rb
44
44
  has_rdoc: true
data/data/punctuation.dic DELETED
@@ -1,79 +0,0 @@
1
-
2
- ×
3
- π
4
-
5
-
6
-
7
-
8
-
9
- °
10
-
11
-
12
-
13
-
14
-
15
- ±
16
-
17
-
18
-
19
-
20
-
21
-
22
-
23
-
24
-
25
-
26
-
27
-
28
-
29
- ——
30
-
31
-
32
-
33
-
34
-
35
-
36
-
37
-
38
-
39
-
40
-
41
-
42
-
43
-
44
-
45
-
46
-
47
-
48
- ·
49
-
50
-
51
-
52
- §
53
-
54
-
55
-
56
-
57
-
58
-
59
- ÷
60
-
61
-
62
-
63
-
64
-
65
-
66
-
67
-
68
-  
69
-
70
-
71
-
72
-
73
-
74
-
75
-
76
-
77
-
78
-
79
-