tyccl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
class Tyccl
  # Version string of the gem (read by tyccl.gemspec as Tyccl::VERSION).
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/tyccl.rb ADDED
@@ -0,0 +1,374 @@
1
+ # coding: utf-8
2
+
3
+ # = this gem is a tool for analysing similarity
4
+ # = between Chinese words. It is based on <em>HIT Tongyici Cilin (Extended)</em> (同义词词林扩展版).
5
+ # This gem exposes a single singleton class; obtain its instance once and reuse it.
6
+ #
7
+ # learn more about Tongyici Cilin(同义词词林) http://vdisk.weibo.com/s/qGrIviGdExvx
8
+ #
9
+ # Author:: Joe Woo (https://github.com/JoeWoo)
10
+ # License:: MIT
11
+ #
12
+
13
+ require File.expand_path("../tyccl/version", __FILE__)
14
+ require "algorithms"
15
+ require "yaml"
16
+ require "singleton"
17
+ require "logger"
18
+
19
+
20
+ # this struct is used to return analysing result
21
+ # * field 'value' stores the computed value (distance or similarity)
22
+ # * fields 'x_id' and 'y_id' store the IDs matched for word X and word Y
23
+ Result_t = Struct.new(:value,:x_id,:y_id)
24
+
25
# Tyccl measures semantic similarity between Chinese words using the
# <em>HIT Tongyici Cilin (Extended)</em> thesaurus (同义词词林扩展版).
#
# The class includes Singleton, so there is no public Tyccl.new; call
# Tyccl.instance so that the thesaurus data is loaded only once.
#
# A Cilin ID is a code such as "Aa01A01=" that is cut into five segments
# (levels 0..4): "A", "a", "01", "A", "01=". The final character marks the
# entry type: "=" for synonyms, "#" for equivalent words and "@" for an
# entry whose word list holds a single word.
class Tyccl

  include Singleton

  # Number of segments in an ID. #compare_id returns this value when two
  # IDs are identical.
  ID_LEVELS = 5

  # Upper bound of the semantic distance scale used by #dist.
  MAX_DISTANCE = 10.0

  # Weights of the similarity formula. Index 0..4 is the first segment at
  # which two IDs differ; index 5 is used for identical IDs ending in "="
  # or "@", index 6 for identical IDs ending in "#".
  SIM_FACTORS = [0.02, 0.65, 0.8, 0.9, 0.96, 1, 0.5].freeze

  # Reads cilin.txt and Inverted.yaml (both located next to this file)
  # into memory:
  # * the trie maps every ID to its array of words,
  # * the ID index maps every ID to its line position in cilin.txt,
  # * the inverted index maps every word to the IDs it belongs to.
  def initialize # :notnew: stops RDoc from seeing the initialize method as the new method
    @logger = Logger.new(STDOUT)
    @logger.level = Logger::WARN
    @ids_index = {}
    @ids_trie = Containers::Trie.new
    load_cilin(File.expand_path("../cilin.txt", __FILE__))
    # load_file opens and closes the file itself, so no handle is leaked.
    @index = YAML.load_file(File.expand_path("../Inverted.yaml", __FILE__))
  end

  # Given id(string) such as "Aa01A01=" or "Aa01A03#".
  # Returns an array containing the words(string) stored under this id.
  # If no match is found, nil is returned.
  def get_words_by_id(id)
    @ids_trie[id]
  end

  # Returns a sorted array containing the IDs(string) that match the
  # parameter wildcard(string). The characters '*' and '.' match any single
  # character, e.g. "Aa01A..=" or "Aa**A...".
  # If no match is found, an empty array is returned.
  def get_ids_by_wildcard(wildcard)
    @ids_trie.wildcard(wildcard)
  end

  # Returns an array containing the IDs(string) that the parameter
  # word(string) belongs to, or nil (after logging an error) when the word
  # is not listed.
  #
  # tips: the same word may have several meanings, so a word can match
  # many IDs.
  def get_ids_by_word(word)
    ids = @index[word]
    @logger.error("#{word} is an unlisted word!") if ids.nil?
    ids
  end

  # Given a word(string).
  # Returns true if any ID of the word is a synonym entry (ends in "=").
  # Returns false otherwise, including for unlisted words.
  def has_same?(word)
    id_with_marker?(word, "=")
  end

  # Given a word(string).
  # Returns true if any ID of the word is an equivalent-word entry (ends
  # in "#"). Returns false otherwise, including for unlisted words.
  def has_equal?(word)
    id_with_marker?(word, "#")
  end

  # Given a word(string).
  # Returns true if any ID of the word is a single-word entry (ends in
  # "@"). Returns false otherwise, including for unlisted words.
  def has_single?(word)
    id_with_marker?(word, "@")
  end

  # Given a word(string).
  # Returns a two dimensional array with one word list per synonym ID
  # ("=") the word belongs to.
  # If the word has no synonym entry (or is unlisted), nil is returned.
  def get_same(word)
    words_by_marker(word, "=")
  end

  # Given a word(string).
  # Returns a two dimensional array with one word list per equivalent-word
  # ID ("#") the word belongs to.
  # If the word has no such entry (or is unlisted), nil is returned.
  def get_equal(word)
    words_by_marker(word, "#")
  end

  # Given a word(string) and a level(int) in [0,4] (default 4).
  # For every ID of the word, the segments after +level+ are replaced by
  # wildcards and the words of all matching IDs are collected, so a smaller
  # level yields a wider, less similar group.
  # Returns a two dimensional array with one word list per ID of the word.
  # If the word has no ID (or is unlisted), nil is returned.
  #
  # tips: levels 0,1,2,3,4 correspond to the ID segments A,a,01,A,01=.
  def get_similar(word, level=4)
    ids = get_ids_by_word(word)
    return nil if ids.nil? || ids.empty?

    ids.map do |id|
      pattern = gen_findstring(id, level + 1)
      @ids_trie.wildcard(pattern).flat_map { |similar_id| get_words_by_id(similar_id) }
    end
  end

  # Given idA(string) and idB(string).
  # Returns the semantic distance(int) between them, in [0,10]:
  # 0 for identical IDs, 10 when they already differ in the first segment.
  def get_dist_by_id(idA, idB)
    step = MAX_DISTANCE / ID_LEVELS
    (step * (ID_LEVELS - compare_id(idA, idB))).round
  end

  # Given idA(string) and idB(string).
  # Returns the similarity between them, in [0,1].
  # * IDs differing in the first segment get the fixed weight SIM_FACTORS[0].
  # * Identical IDs get SIM_FACTORS[5] ("=" or "@") or SIM_FACTORS[6] ("#");
  #   nil is returned if idA ends in none of these markers.
  # * Otherwise the level weight is scaled by the number of IDs sharing the
  #   common prefix (node_num) and by how far apart the two IDs are in
  #   cilin.txt (k).
  def get_sim_by_id(idA, idB)
    level = compare_id(idA, idB)
    return SIM_FACTORS[0] if level == 0

    if level == ID_LEVELS
      case idA[-1]
      when "=", "@" then return SIM_FACTORS[5]
      when "#" then return SIM_FACTORS[6]
      else return nil
      end
    end

    node_num = @ids_trie.wildcard(gen_findstring(idA, level)).size
    k = (@ids_index[idA] - @ids_index[idB]).abs
    SIM_FACTORS[level] * Math.cos(node_num * Math::PI / 180).abs * ((node_num - k + 1) * 1.0 / node_num)
  end

  # Given wordA(string) and wordB(string).
  # Returns a Result_t holding the shortest semantic distance(int) over all
  # ID pairs of the two words, together with the first pair reaching it.
  # If either word has no ID (or is unlisted), Result_t.new(100,"","") is
  # returned.
  def dist(wordA, wordB)
    shortest = Result_t.new(100, "", "")
    each_id_pair(wordA, wordB) do |id_a, id_b|
      distance = get_dist_by_id(id_a, id_b)
      next unless distance < shortest.value

      shortest.value = distance
      shortest.x_id = id_a
      shortest.y_id = id_b
    end
    shortest
  end

  # Given wordA(string) and wordB(string).
  # Returns a Result_t holding the highest similarity(float, rounded to 5
  # decimals) over all ID pairs of the two words, together with the first
  # pair reaching it.
  # If either word has no ID (or is unlisted), the value is -1.0 and both
  # IDs are "".
  def sim(wordA, wordB)
    closest = Result_t.new(-1, "", "")
    each_id_pair(wordA, wordB) do |id_a, id_b|
      similarity = get_sim_by_id(id_a, id_b)
      next unless similarity && similarity > closest.value

      closest.value = similarity
      closest.x_id = id_a
      closest.y_id = id_b
    end
    closest.value = format("%1.5f", closest.value).to_f
    closest
  end

  # Given a code(string) and start_index(int) in [0,5].
  # Returns a copy of the code in which every character of the segments
  # start_index..4 is replaced by '.', ready to be used as a wildcard.
  # With start_index 5 the code is returned unchanged.
  def gen_findstring(code, start_index)
    frame = cut_id(code)
    start_index.upto(ID_LEVELS - 1) { |i| frame[i] = "." * frame[i].size }
    combine_id(frame)
  end

  # Given an id(string).
  # Returns an array holding the five segments of the ID,
  # e.g. "Aa01A01=" => ["A", "a", "01", "A", "01="].
  def cut_id(id)
    [id[0], id[1], id[2..3], id[4], id[5..7]]
  end

  # Inverse of #cut_id: joins the segments back into one ID string.
  def combine_id(frame)
    frame.join
  end

  # Given idA(string) and idB(string).
  # Returns the index (0..4) of the first segment in which they differ,
  # or 5 if all segments are equal.
  def compare_id(idA, idB)
    frame_a = cut_id(idA)
    frame_b = cut_id(idB)
    frame_a.each_index.find { |i| !frame_a[i].eql?(frame_b[i]) } || ID_LEVELS
  end

  # Returns the total number of different IDs in Cilin.
  def get_id_sum
    @ids_index.size
  end

  # Returns the total number of different words in Cilin.
  def get_index_sum
    @index.size
  end

  private

  # Fills the trie and the ID index from the Cilin file at +path+. Each
  # non-blank line is "<ID> <word> <word> ..."; blank lines are skipped.
  def load_cilin(path)
    position = 0
    # Block form guarantees the file handle is closed.
    File.open(path, "r:UTF-8") do |file|
      file.each_line do |line|
        code, *words = line.split(" ")
        next if code.nil?

        @ids_index[code] = position
        @ids_trie[code] = words
        position += 1
      end
    end
  end

  # True if the word is listed and one of its IDs ends in +marker+.
  def id_with_marker?(word, marker)
    ids = get_ids_by_word(word)
    return false if ids.nil?

    ids.any? { |id| id[-1] == marker }
  end

  # Word lists of every ID of the word that ends in +marker+, or nil if
  # there is none.
  def words_by_marker(word, marker)
    ids = get_ids_by_word(word)
    return nil if ids.nil?

    groups = ids.select { |id| id[-1] == marker }.map { |id| get_words_by_id(id) }
    groups.empty? ? nil : groups
  end

  # Yields every (idA, idB) combination of the IDs of the two words, in
  # listing order; an unlisted word contributes no IDs.
  def each_id_pair(wordA, wordB)
    ids_a = get_ids_by_word(wordA) || []
    ids_b = get_ids_by_word(wordB) || []
    ids_a.each do |id_a|
      ids_b.each { |id_b| yield id_a, id_b }
    end
  end

end
@@ -0,0 +1,151 @@
1
+ # coding: utf-8
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'test/unit'
5
+ require File.expand_path('../../lib/tyccl', __FILE__)
6
+
7
+
8
+ $tyc=Tyccl.instance
9
+
10
# Unit tests for Tyccl. They run against the bundled Cilin data through the
# shared $tyc instance created above this class.
class TycclTest < Test::Unit::TestCase

  # The loaded data set has the expected number of IDs and words.
  def test_instance
    assert_equal 17809, $tyc.get_id_sum
    assert_equal 77457, $tyc.get_index_sum
  end

  def test_get_words_by_id
    assert_equal ["人","士","人物","人士","人氏","人选"],
                 $tyc.get_words_by_id("Aa01A01=")
    assert_nil $tyc.get_words_by_id("dfdf")
  end

  def test_get_ids_by_wildcard
    assert_equal 9, $tyc.get_ids_by_wildcard("Aa01A...").size
    assert_equal 32, $tyc.get_ids_by_wildcard("Aa**A...").size
  end

  def test_get_ids_by_word
    assert_nil $tyc.get_ids_by_word("屌丝")
    assert_equal 1, $tyc.get_ids_by_word("桅顶").size
    assert_equal 7, $tyc.get_ids_by_word("底").size
  end

  def test_has_same
    assert_equal true, $tyc.has_same?("人")
    assert_equal false, $tyc.has_same?("顺民")
    assert_equal false, $tyc.has_same?("众学生")
  end

  def test_has_equal
    assert_equal true, $tyc.has_equal?("良民")
    assert_equal false, $tyc.has_equal?("众学生")
    assert_equal false, $tyc.has_equal?("人")
  end

  def test_has_single
    assert_equal false, $tyc.has_single?("良民")
    assert_equal true, $tyc.has_single?("众学生")
    assert_equal false, $tyc.has_single?("人")
  end

  def test_get_same
    assert_nil $tyc.get_same("顺民")
    assert_nil $tyc.get_same("众学生")

    # One word list per synonym ID of the word, in ID order.
    groups = $tyc.get_same("人")
    assert_equal 5, groups.size
    assert_equal [6, 8, 2, 9, 9], groups.map(&:size)
  end

  def test_get_equal
    assert_nil $tyc.get_equal("人")
    assert_nil $tyc.get_equal("众学生")
    assert_equal 1, $tyc.get_equal("流民").size
    assert_equal 9, $tyc.get_equal("流民")[0].size
  end

  def test_get_similar
    assert_equal [ ["人", "士", "人物", "人士", "人氏", "人选"],
                   ["成年人", "壮年人", "大人", "人", "丁", "壮丁", "佬", "中年人"],
                   ["身体", "人"],
                   ["人格", "人品", "人头", "人", "品质", "质地", "格调", "灵魂", "为人"],
                   ["人数", "人头", "人口", "人", "口", "丁", "家口", "食指", "总人口"] ],
                 $tyc.get_similar("人")
  end

  # dist ranges [0,10];
  # if dist<7 then we believe that the two words are related
  def test_dist
    assert_equal Result_t.new(0,"Aa01A01=","Aa01A01="),
                 $tyc.dist("人","士")
    assert_equal Result_t.new(2,"Bh06A32=","Bh06A34="),
                 $tyc.dist("西红柿","黄瓜")
    assert_equal Result_t.new(4,"Aa01A05=","Aa01B03#"),
                 $tyc.dist("匹夫","良民")
    assert_equal Result_t.new(6,"Bh07A14=","Bh06A32="),
                 $tyc.dist("苹果","西红柿")
    assert_equal Result_t.new(8,"Aa01B02=","Ab01B10="),
                 $tyc.dist("群众","村姑")
    assert_equal Result_t.new(10,"Aa01A01=","Kd04C01="),
                 $tyc.dist("人","哟")
  end

  # Each word is compared with "人民"; expected results are listed in the
  # same order as the words.
  def test_sim
    expected = [ Result_t.new(1.0,"Aa01B01=","Aa01B01="),
                 Result_t.new(0.95766,"Aa01B01=","Aa01B02="),
                 Result_t.new(0.71825,"Aa01B01=","Aa01B03#"),
                 Result_t.new(0.48013,"Aa01B01=","Aa01C07#"),
                 Result_t.new(0.40396,"Aa01B01=","Ab02B01="),
                 Result_t.new(0.39028,"Aa01B01=","Ad01A02="),
                 Result_t.new(0.21692,"Aa01B01=","Aa03A05="),
                 Result_t.new(0.20361,"Aa01B01=","Ah01A01="),
                 Result_t.new(0.08112,"Aa01B01=","Ak03A03#"),
                 Result_t.new(0.04007,"Aa01B01=","Al05B01=") ]

    words = ["国民","群众","良民","党群","成年人","市民","同志","亲属","志愿者","先锋"]
    words.zip(expected).each do |word, result|
      assert_equal result, $tyc.sim("人民", word)
    end
  end

end
151
+
data/tyccl.gemspec ADDED
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'tyccl/version'
5
+
6
# Gem packaging description for tyccl.
Gem::Specification.new do |spec|
  spec.name          = "tyccl"
  spec.version       = Tyccl::VERSION
  spec.authors       = ["JoeWoo"]
  spec.email         = ["0wujian0@gmail.com"]
  # No quote characters inside %q{}: they would become part of the text
  # shown in the published gem metadata.
  spec.summary       = %q{tools of analysing similarity between Chinese Words.}
  spec.description   = %q{tyccl(同义词词林 哈工大扩展版) is a ruby gem that provides friendly functions to analyse similarity between Chinese Words.}
  spec.homepage      = "https://github.com/JoeWoo/tyccl"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # picked out of that list by path.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # lib/tyccl.rb requires "algorithms" (Containers::Trie) at load time, so
  # it must be installed together with the gem.
  spec.add_dependency "algorithms"

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
end
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tyccl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - JoeWoo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-01-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: "\"tyccl(同义词词林 哈工大扩展版) is a ruby gem that provides friendly functions
42
+ to analyse similarity between Chinese Words.\""
43
+ email:
44
+ - 0wujian0@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - Gemfile
51
+ - LICENSE
52
+ - README.md
53
+ - Rakefile
54
+ - lib/Inverted.yaml
55
+ - lib/cilin.txt
56
+ - lib/tyccl.rb
57
+ - lib/tyccl/version.rb
58
+ - test/test_tyccl.rb
59
+ - tyccl.gemspec
60
+ homepage: https://github.com/JoeWoo/tyccl
61
+ licenses:
62
+ - MIT
63
+ metadata: {}
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 2.1.9
81
+ signing_key:
82
+ specification_version: 4
83
+ summary: "\"tools of analysing similarity between Chinese Words.\""
84
+ test_files:
85
+ - test/test_tyccl.rb