tyccl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
# Holds the gem's version constant. The same class is opened again in
# lib/tyccl.rb, which requires this file and adds the actual behaviour.
class Tyccl
  # Version string read by tyccl.gemspec. Frozen so that the shared constant
  # cannot be mutated in place by a caller.
  VERSION = "0.0.1".freeze
end
data/lib/tyccl.rb ADDED
@@ -0,0 +1,374 @@
1
+ # coding: utf-8
2
+
3
+ # = this gem is a tool for analysing similarity
4
+ # = between Chinese words. It is based on <em>HIT Tongyici Cilin (Extended)</em> (同义词词林(扩展版)).
5
+ # This gem exposes a single singleton class; obtain its instance once and reuse it.
6
+ #
7
+ # learn more about Tongyici Cilin(同义词词林) http://vdisk.weibo.com/s/qGrIviGdExvx
8
+ #
9
+ # Author:: Joe Woo (https://github.com/JoeWoo)
10
+ # License:: MIT
11
+ #
12
+
13
+ require File.expand_path("../tyccl/version", __FILE__)
14
+ require "algorithms"
15
+ require "yaml"
16
+ require "singleton"
17
+ require "logger"
18
+
19
+
20
# Value object handed back by the word-level analysis methods (Tyccl#dist and
# Tyccl#sim).
# * +value+ holds the computed figure (a distance or a similarity)
# * +x_id+ and +y_id+ hold the IDs selected for word X and word Y
Result_t = Struct.new(*%i[value x_id y_id])
24
+
25
+ # Tyccl is a singleton class: Tyccl.new is not available, use Tyccl.instance instead,
26
+ # so that only one Tyccl object ever exists.
27
+ class Tyccl
28
+
29
+ include Singleton
30
+
31
# Loads the Cilin data files that sit next to this source file into memory.
#
# Builds three lookup structures:
# * @IDsTrie  - trie mapping each ID (the first whitespace-separated token of
#   a cilin.txt line) to the array of words that follow it on that line
# * @IDsIndex - hash mapping each ID to its zero-based position among the
#   entries read from cilin.txt
# * @index    - the object loaded from Inverted.yaml; it is queried by word
#   in #get_ids_by_word
#
# Also creates a logger on STDOUT that reports WARN and above.
def initialize # :notnew: stops RDoc from seeing the initialize method as the new method
  @logger = Logger.new(STDOUT)
  @logger.level = Logger::WARN

  @IDsIndex = {}
  @IDsTrie = Containers::Trie.new
  position = 0

  # File.foreach closes the file once iteration finishes (or raises); the
  # handle opened with File.new here before was never closed.
  File.foreach(File.expand_path("../cilin.txt", __FILE__)) do |line|
    line.force_encoding('utf-8')
    id, *words = line.split(" ")
    # A blank line has no ID; skip it instead of failing on a nil word list.
    next if id.nil?

    @IDsIndex[id] = position
    @IDsTrie[id] = words
    position += 1
  end

  # YAML.load_file opens and closes the file itself, unlike passing an open
  # File object to YAML.load.
  @index = YAML.load_file(File.expand_path("../Inverted.yaml", __FILE__))
end
71
+
72
+
73
+
74
# Looks up the words filed under one ID.
#
# id:: complete ID string, for example "Aa01A01=" or "Aa01A03#"
#
# Returns what the ID trie stores for +id+ (the array of words loaded for
# that ID), or nil when the ID is not present.
def get_words_by_id(id)
  @IDsTrie[id]
end
80
+
81
# Finds the IDs that match a wildcard pattern.
#
# wildcard:: pattern string handed unchanged to the trie's wildcard search;
#            per the original documentation '*' and '.' each stand for any
#            single character, e.g. "Aa01A..=" or "Aa**A..."
#
# Returns the trie's result: per the original documentation a sorted array
# of ID strings, empty when nothing matches.
def get_ids_by_wildcard(wildcard)
  @IDsTrie.wildcard(wildcard)
end
87
+
88
# Finds every ID under which a word is listed.
#
# word:: the word to look up in the inverted index
#
# Returns the inverted-index entry for +word+ (the IDs the word belongs to).
# When the word is not in the index, an error is logged and nil is returned.
#
# Tip: one word may carry several meanings, so it can map to many IDs.
def get_ids_by_word(word)
  ids = @index[word]
  # Interpolation rather than String#+ so that a nil or non-String argument
  # is reported as unlisted instead of raising while building the message.
  @logger.error("#{word} is an unlisted word!") if ids.nil?
  ids
end
100
+
101
# Tells whether a word has any synonym.
#
# word:: the word to test
#
# Returns true when at least one of the word's IDs ends with "=" (the suffix
# this class treats as the synonym marker), false otherwise. An unlisted
# word also yields false.
def has_same?(word)
  ids = get_ids_by_word(word)
  # get_ids_by_word returns nil for an unlisted word; such a word has no IDs
  # and therefore no synonyms (calling #size on nil used to raise here).
  return false if ids.nil?

  ids.any? { |id| id[-1] == "=" }
end
118
+
119
# Tells whether a word has any equivalent word.
#
# word:: the word to test
#
# Returns true when at least one of the word's IDs ends with "#" (the suffix
# this class treats as the equivalent-word marker), false otherwise. An
# unlisted word also yields false.
def has_equal?(word)
  ids = get_ids_by_word(word)
  # get_ids_by_word returns nil for an unlisted word; such a word has no IDs
  # and therefore no equivalents (calling #size on nil used to raise here).
  return false if ids.nil?

  ids.any? { |id| id[-1] == "#" }
end
136
+
137
# Tells whether a word belongs to any single-word ID.
#
# word:: the word to test
#
# Returns true when at least one of the word's IDs ends with "@" (per the
# original documentation, the marker of an ID whose word list has exactly
# one element), false otherwise. An unlisted word also yields false.
def has_single?(word)
  ids = get_ids_by_word(word)
  # get_ids_by_word returns nil for an unlisted word; such a word has no IDs
  # at all (calling #size on nil used to raise here).
  return false if ids.nil?

  ids.any? { |id| id[-1] == "@" }
end
155
+
156
# Collects the synonyms of a word.
#
# word:: the word to look up
#
# Returns a two-dimensional array: one inner array of words (as returned by
# #get_words_by_id) for each of the word's IDs that ends with "=".
# Returns nil when the word has no such ID or is unlisted.
def get_same(word)
  ids = get_ids_by_word(word)
  # Unlisted word: nothing to report. Looking the word up once here also
  # avoids the second lookup (and second error log) of the previous version.
  return nil if ids.nil?

  groups = ids.select { |id| id[-1] == "=" }.map { |id| get_words_by_id(id) }
  groups.empty? ? nil : groups
end
173
+
174
# Collects the equivalent words of a word.
#
# word:: the word to look up
#
# Returns a two-dimensional array: one inner array of words (as returned by
# #get_words_by_id) for each of the word's IDs that ends with "#".
# Returns nil when the word has no such ID or is unlisted.
def get_equal(word)
  ids = get_ids_by_word(word)
  # Unlisted word: nothing to report. Looking the word up once here also
  # avoids the second lookup (and second error log) of the previous version.
  return nil if ids.nil?

  groups = ids.select { |id| id[-1] == "#" }.map { |id| get_words_by_id(id) }
  groups.empty? ? nil : groups
end
191
+
192
# Collects words similar to a word, grouped by the word's IDs.
#
# word::  the word to look up
# level:: integer in [0,4], default 4. ID segments 0..level are kept and all
#         later segments are turned into wildcards, so a larger level gives
#         a narrower search (level 4 matches only the ID itself) and a
#         smaller level a broader one. Levels 0,1,2,3,4 correspond to the ID
#         segments A, a, 01, A, 01= .
#
# Returns a two-dimensional array with one inner array per ID of the word;
# each inner array holds the words of every ID matching that ID's wildcard
# pattern. Returns nil when the word is unlisted or has no IDs.
def get_similar(word, level=4)
  ids = get_ids_by_word(word)
  # get_ids_by_word returns nil for an unlisted word; iterating nil used to
  # raise NoMethodError here.
  return nil if ids.nil? || ids.empty?

  ids.map do |id|
    pattern = gen_findstring(id, level + 1)
    @IDsTrie.wildcard(pattern).flat_map { |similar_id| get_words_by_id(similar_id) }
  end
end
221
+
222
# Semantic distance between two IDs.
#
# idA, idB:: ID strings
#
# The distance is 2 for every ID segment from the first differing one
# onwards: 0 when #compare_id reports identical IDs (5), 10 when the very
# first segment differs (0).
#
# Returns an Integer.
def get_dist_by_id(idA, idB)
  differing_segments = 5 - compare_id(idA, idB)
  ((10.0 / 5) * differing_segments).round
end
229
+
230
# Similarity between two IDs.
#
# idA, idB:: ID strings; when they differ after the first segment both must
#            be present in @IDsIndex
#
# With n = compare_id(idA, idB):
# * n == 0 (first segment differs): the level-0 weight
# * n == 5 (identical IDs): 1 for IDs ending in "=" or "@", 0.5 for "#"
# * otherwise: weight[n] * |cos(node_num * PI / 180)| * (node_num - k + 1) / node_num,
#   where node_num is the number of IDs sharing the first n segments with
#   idA and k is the distance between the two IDs' positions in @IDsIndex
#
# Returns a Float or Integer weight (the original documentation states the
# range [0,1]), or nil when the IDs are identical but end in none of
# "=", "#", "@".
def get_sim_by_id(idA, idB)
  # Weights for levels 0..4, then identical IDs (index 5) and identical "#"
  # IDs (index 6). This table was never defined in this method before, so
  # every call raised NameError.
  factor = [0.02, 0.65, 0.8, 0.9, 0.96, 1, 0.5]
  n = compare_id(idA, idB)

  if n == 0
    factor[0]
  elsif n == 5
    case idA[-1]
    when "=", "@" then factor[5]
    when "#" then factor[6]
    end
  elsif n < 5
    findstring = gen_findstring(idA, n)
    node_num = @IDsTrie.wildcard(findstring).size
    k = (@IDsIndex[idA] - @IDsIndex[idB]).abs
    factor[n] * Math.cos(node_num * Math::PI / 180).abs * ((node_num - k + 1) * 1.0 / node_num)
  end
end
253
+
254
# Shortest semantic distance between two words.
#
# wordA, wordB:: the words to compare
#
# Every ID of wordA is compared with every ID of wordB; the distance of a
# pair is 2 * (5 - compare_id(idA, idB)), i.e. 0 for identical IDs and 10
# when the first segment differs.
#
# Returns a Result_t whose +value+ is the smallest distance found and whose
# +x_id+ / +y_id+ are the first pair of IDs reaching it. If either word has
# an empty ID list the initial Result_t(100, "", "") is returned.
# Returns nil when either word is unlisted.
def dist(wordA, wordB)
  idAs = get_ids_by_word(wordA)
  idBs = get_ids_by_word(wordB)
  # get_ids_by_word returns nil for an unlisted word; iterating nil used to
  # raise NoMethodError here.
  return nil if idAs.nil? || idBs.nil?

  alpha = 10.0 / 5
  shortest_pair = Result_t.new(100, "", "")
  idAs.each do |idA|
    idBs.each do |idB|
      distance = (alpha * (5 - compare_id(idA, idB))).round
      next unless distance < shortest_pair.value

      shortest_pair.value = distance
      shortest_pair.x_id = idA
      shortest_pair.y_id = idB
    end
  end
  shortest_pair
end
276
+
277
# Highest similarity between two words.
#
# wordA, wordB:: the words to compare
#
# Every ID of wordA is compared with every ID of wordB. With
# n = compare_id(idA, idB) the similarity of a pair is:
# * n == 0 (first segment differs): the level-0 weight
# * n == 5 (identical IDs): 1 for IDs ending in "=" or "@", 0.5 for "#"
# * otherwise: weight[n] * |cos(node_num * PI / 180)| * (node_num - k + 1) / node_num,
#   where node_num is the number of IDs sharing the first n segments with
#   idA and k is the distance between the two IDs' positions in @IDsIndex
#
# Returns a Result_t whose +value+ is the largest similarity found, rounded
# to five decimals, and whose +x_id+ / +y_id+ are the first pair of IDs
# reaching it. If either word has an empty ID list the result is
# Result_t(-1.0, "", ""). Returns nil when either word is unlisted.
def sim(wordA, wordB)
  # Weights for levels 0..4, then identical IDs (index 5) and identical "#"
  # IDs (index 6).
  factor = [0.02, 0.65, 0.8, 0.9, 0.96, 1, 0.5]
  idAs = get_ids_by_word(wordA)
  idBs = get_ids_by_word(wordB)
  # get_ids_by_word returns nil for an unlisted word; iterating nil used to
  # raise NoMethodError here.
  return nil if idAs.nil? || idBs.nil?

  longest_pair = Result_t.new(-1, "", "")
  idAs.each do |idA|
    idBs.each do |idB|
      n = compare_id(idA, idB)
      similarity =
        if n == 0
          factor[0]
        elsif n == 5
          case idA[-1]
          when "=", "@" then factor[5]
          when "#" then factor[6]
          end
        elsif n < 5
          findstring = gen_findstring(idA, n)
          node_num = @IDsTrie.wildcard(findstring).size
          k = (@IDsIndex[idA] - @IDsIndex[idB]).abs
          factor[n] * Math.cos(node_num * Math::PI / 180).abs * ((node_num - k + 1) * 1.0 / node_num)
        end
      next unless similarity > longest_pair.value

      longest_pair.value = similarity
      longest_pair.x_id = idA
      longest_pair.y_id = idB
    end
  end
  # Round-trip through a format string to keep five decimals.
  longest_pair.value = ("%1.5f" % longest_pair.value).to_f
  longest_pair
end
317
+
318
# Builds a wildcard search pattern from an ID.
#
# code::        ID string, split into its five segments (A, a, 01, A, 01=)
#               with #cut_id
# start_index:: index of the first segment to blank out; segments
#               start_index..4 have every character replaced by '.', so a
#               value of 5 leaves the ID unchanged
#
# Returns the recombined pattern string; +code+ itself is not modified.
def gen_findstring(code, start_index)
  segments = cut_id(code)
  start_index.upto(4) do |position|
    segments[position] = "." * segments[position].size
  end
  combine_id(segments)
end
332
+
333
# Splits an ID into its five segments.
#
# id:: ID string such as "Aa01A01="
#
# Returns a five-element array: characters 0, 1, 2..3, 4 and 5..7 of +id+,
# e.g. ["A", "a", "01", "A", "01="].
def cut_id(id)
  [0, 1, 2..3, 4, 5..7].map { |part| id[part] }
end
340
+
341
# Inverse of #cut_id: glues ID segments back together.
#
# frame:: array of segment strings
#
# Returns a new string made of the segments appended in order.
def combine_id(frame)
  frame.inject(+"") { |joined, segment| joined << segment }
end
349
+
350
# Locates the first segment in which two IDs differ.
#
# idA, idB:: ID strings, each split into five segments with #cut_id
#
# Returns the index (0..4) of the first differing segment, or 5 when all
# five segments are equal.
def compare_id(idA, idB)
  segments_a = cut_id(idA)
  segments_b = cut_id(idB)
  first_difference = segments_a.each_index.find do |level|
    !segments_a[level].eql?(segments_b[level])
  end
  first_difference || 5
end
363
+
364
# Number of distinct IDs loaded from the Cilin file, i.e. the number of
# entries in the ID-to-position hash.
def get_id_sum
  @IDsIndex.length
end
368
+
369
# Number of distinct words known to the inverted index, i.e. the number of
# entries in the word-to-IDs index.
def get_index_sum
  @index.length
end
373
+
374
+ end
@@ -0,0 +1,151 @@
1
+ # coding: utf-8
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'test/unit'
5
+ require File.expand_path('../../lib/tyccl', __FILE__)
6
+
7
+
8
# Shared Tyccl singleton used by every test below.
$tyc = Tyccl.instance

# Exercises the public Tyccl API against the Cilin data bundled with the gem.
class TycclTest < Test::Unit::TestCase

  # The loaded data set has the expected number of IDs and words.
  def test_instance
    assert_equal(17809, $tyc.get_id_sum)
    assert_equal(77457, $tyc.get_index_sum)
  end

  # A known ID yields its words; an unknown ID yields nil.
  def test_get_words_by_id
    assert_equal(["人","士","人物","人士","人氏","人选"], $tyc.get_words_by_id("Aa01A01="))
    assert_nil($tyc.get_words_by_id("dfdf"))
  end

  # Both '.' and '*' act as single-character wildcards.
  def test_get_ids_by_wildcard
    assert_equal(9, $tyc.get_ids_by_wildcard("Aa01A...").size)
    assert_equal(32, $tyc.get_ids_by_wildcard("Aa**A...").size)
  end

  # Unlisted words give nil; listed words give one ID per meaning.
  def test_get_ids_by_word
    assert_nil($tyc.get_ids_by_word("屌丝"))
    assert_equal(1, $tyc.get_ids_by_word("桅顶").size)
    assert_equal(7, $tyc.get_ids_by_word("底").size)
  end

  def test_has_same
    assert_equal(true, $tyc.has_same?("人"))
    assert_equal(false, $tyc.has_same?("顺民"))
    assert_equal(false, $tyc.has_same?("众学生"))
  end

  def test_has_equal
    assert_equal(true, $tyc.has_equal?("良民"))
    assert_equal(false, $tyc.has_equal?("众学生"))
    assert_equal(false, $tyc.has_equal?("人"))
  end

  def test_has_single
    assert_equal(false, $tyc.has_single?("良民"))
    assert_equal(true, $tyc.has_single?("众学生"))
    assert_equal(false, $tyc.has_single?("人"))
  end

  # Words without a synonym ID give nil; otherwise one group per synonym ID.
  def test_get_same
    groups = $tyc.get_same("人")

    assert_nil($tyc.get_same("顺民"))
    assert_nil($tyc.get_same("众学生"))
    assert_equal(5, groups.size)
    assert_equal([6, 8, 2, 9, 9], groups.map(&:size))
  end

  # Words without an equivalent ID give nil; otherwise one group per such ID.
  def test_get_equal
    assert_nil($tyc.get_equal("人"))
    assert_nil($tyc.get_equal("众学生"))

    groups = $tyc.get_equal("流民")
    assert_equal(1, groups.size)
    assert_equal(9, groups[0].size)
  end

  # With the default level each group holds exactly the words of one ID.
  def test_get_similar
    expected = [
      ["人", "士", "人物", "人士", "人氏", "人选"],
      ["成年人", "壮年人", "大人", "人", "丁", "壮丁", "佬", "中年人"],
      ["身体", "人"],
      ["人格", "人品", "人头", "人", "品质", "质地", "格调", "灵魂", "为人"],
      ["人数", "人头", "人口", "人", "口", "丁", "家口", "食指", "总人口"]
    ]
    assert_equal(expected, $tyc.get_similar("人"))
  end

  # dist ranges [0,10];
  # if dist<7 then we believe that the two words are related
  def test_dist
    cases = [
      [Result_t.new(0,"Aa01A01=","Aa01A01="), "人", "士"],
      [Result_t.new(2,"Bh06A32=","Bh06A34="), "西红柿", "黄瓜"],
      [Result_t.new(4,"Aa01A05=","Aa01B03#"), "匹夫", "良民"],
      [Result_t.new(6,"Bh07A14=","Bh06A32="), "苹果", "西红柿"],
      [Result_t.new(8,"Aa01B02=","Ab01B10="), "群众", "村姑"],
      [Result_t.new(10,"Aa01A01=","Kd04C01="), "人", "哟"]
    ]
    cases.each do |expected, word_a, word_b|
      assert_equal(expected, $tyc.dist(word_a, word_b))
    end
  end

  # Similarity of "人民" to ten words, listed from most to least similar.
  def test_sim
    expected = [
      Result_t.new(1.0,"Aa01B01=","Aa01B01="),
      Result_t.new(0.95766,"Aa01B01=","Aa01B02="),
      Result_t.new(0.71825,"Aa01B01=","Aa01B03#"),
      Result_t.new(0.48013,"Aa01B01=","Aa01C07#"),
      Result_t.new(0.40396,"Aa01B01=","Ab02B01="),
      Result_t.new(0.39028,"Aa01B01=","Ad01A02="),
      Result_t.new(0.21692,"Aa01B01=","Aa03A05="),
      Result_t.new(0.20361,"Aa01B01=","Ah01A01="),
      Result_t.new(0.08112,"Aa01B01=","Ak03A03#"),
      Result_t.new(0.04007,"Aa01B01=","Al05B01=")
    ]
    words = ["国民","群众","良民","党群","成年人","市民","同志","亲属","志愿者","先锋"]

    words.zip(expected).each do |word, result|
      assert_equal(result, $tyc.sim("人民", word))
    end
  end

end
151
+
data/tyccl.gemspec ADDED
@@ -0,0 +1,23 @@
1
# coding: utf-8
# Gem specification for tyccl. The version is read from lib/tyccl/version.rb,
# so lib/ is put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'tyccl/version'

Gem::Specification.new do |spec|
  spec.name          = "tyccl"
  spec.version       = Tyccl::VERSION
  spec.authors       = ["JoeWoo"]
  spec.email         = ["0wujian0@gmail.com"]
  # %q{} already delimits the string; the double quotes that used to sit
  # inside it became literal characters of the published summary/description.
  spec.summary       = %q{tools of analysing similarity between Chinese Words.}
  spec.description   = %q{tyccl(同义词词林 哈工大扩展版) is a ruby gem that provides friendly functions to analyse similarity between Chinese Words.}
  spec.homepage      = "https://github.com/JoeWoo/tyccl"
  spec.license       = "MIT"

  # Package every file tracked by git.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # lib/tyccl.rb requires "algorithms" (for Containers::Trie), so that gem
  # has to be installed together with tyccl; it was not declared before.
  spec.add_runtime_dependency "algorithms"

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
end
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tyccl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - JoeWoo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-01-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: "\"tyccl(同义词词林 哈工大扩展版) is a ruby gem that provides friendly functions
42
+ to analyse similarity between Chinese Words.\""
43
+ email:
44
+ - 0wujian0@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - Gemfile
51
+ - LICENSE
52
+ - README.md
53
+ - Rakefile
54
+ - lib/Inverted.yaml
55
+ - lib/cilin.txt
56
+ - lib/tyccl.rb
57
+ - lib/tyccl/version.rb
58
+ - test/test_tyccl.rb
59
+ - tyccl.gemspec
60
+ homepage: https://github.com/JoeWoo/tyccl
61
+ licenses:
62
+ - MIT
63
+ metadata: {}
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ requirements:
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 2.1.9
81
+ signing_key:
82
+ specification_version: 4
83
+ summary: "\"tools of analysing similarity between Chinese Words.\""
84
+ test_files:
85
+ - test/test_tyccl.rb