tyccl_tim_fixed 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +6 -0
  3. data/LICENSE +20 -0
  4. data/README.md +75 -0
  5. data/Rakefile +11 -0
  6. data/doc/Object.html +154 -0
  7. data/doc/README_md.html +182 -0
  8. data/doc/Tyccl/Containers.html +130 -0
  9. data/doc/Tyccl/Logger.html +130 -0
  10. data/doc/Tyccl/YAML.html +130 -0
  11. data/doc/Tyccl.html +1020 -0
  12. data/doc/created.rid +3 -0
  13. data/doc/images/add.png +0 -0
  14. data/doc/images/arrow_up.png +0 -0
  15. data/doc/images/brick.png +0 -0
  16. data/doc/images/brick_link.png +0 -0
  17. data/doc/images/bug.png +0 -0
  18. data/doc/images/bullet_black.png +0 -0
  19. data/doc/images/bullet_toggle_minus.png +0 -0
  20. data/doc/images/bullet_toggle_plus.png +0 -0
  21. data/doc/images/date.png +0 -0
  22. data/doc/images/delete.png +0 -0
  23. data/doc/images/find.png +0 -0
  24. data/doc/images/loadingAnimation.gif +0 -0
  25. data/doc/images/macFFBgHack.png +0 -0
  26. data/doc/images/package.png +0 -0
  27. data/doc/images/page_green.png +0 -0
  28. data/doc/images/page_white_text.png +0 -0
  29. data/doc/images/page_white_width.png +0 -0
  30. data/doc/images/plugin.png +0 -0
  31. data/doc/images/ruby.png +0 -0
  32. data/doc/images/tag_blue.png +0 -0
  33. data/doc/images/tag_green.png +0 -0
  34. data/doc/images/transparent.png +0 -0
  35. data/doc/images/wrench.png +0 -0
  36. data/doc/images/wrench_orange.png +0 -0
  37. data/doc/images/zoom.png +0 -0
  38. data/doc/index.html +166 -0
  39. data/doc/js/darkfish.js +155 -0
  40. data/doc/js/jquery.js +18 -0
  41. data/doc/js/navigation.js +142 -0
  42. data/doc/js/search.js +94 -0
  43. data/doc/js/search_index.js +1 -0
  44. data/doc/js/searcher.js +228 -0
  45. data/doc/rdoc.css +595 -0
  46. data/doc/table_of_contents.html +111 -0
  47. data/lib/Inverted.yaml +77458 -0
  48. data/lib/cilin.txt +17817 -0
  49. data/lib/tyccl/version.rb +3 -0
  50. data/lib/tyccl.rb +371 -0
  51. data/test/test_tyccl.rb +151 -0
  52. data/tyccl.gemspec +23 -0
  53. metadata +133 -0
@@ -0,0 +1,3 @@
1
+ class Tyccl #:nodoc:all
2
+ VERSION = "0.0.3"
3
+ end
data/lib/tyccl.rb ADDED
@@ -0,0 +1,371 @@
1
+ # coding: utf-8
2
+
3
+ # = This gem is a tool for analysing similarity
4
+ # = between Chinese words. It is based on <em>HIT Tongyici Cilin (Extended)</em> (同义词词林扩展版)
5
+ #
6
+ # * learn more about Tongyici Cilin(同义词词林) http://vdisk.weibo.com/s/qGrIviGdExvx
7
+ #
8
+ # * Author:: Joe Woo (https://github.com/JoeWoo)
9
+ # * License:: MIT
10
+ #
11
+
12
+ require File.expand_path("../tyccl/version", __FILE__)#:nodoc:all
13
+ require "algorithms"#:nodoc:all
14
+ require "yaml"#:nodoc:all
15
+ require "logger"#:nodoc:all
16
+
17
+
18
+ # Result record built by Tyccl.dist and Tyccl.sim.
19
+ # * field 'value' stores the computed distance or similarity
20
+ # * fields 'x_id' and 'y_id' store the IDs selected for word X and word Y
21
+ Result_t = Struct.new(:value,:x_id,:y_id)
22
+
23
+ # class Tyccl exposes its whole API as class methods (Tyccl.sim, Tyccl.dist, ...);
24
+ # the data is loaded once while the class body is evaluated, so there is no need to call Tyccl.new.
25
+ class Tyccl
26
+
27
+ #--
28
+ # Read the Cilin file to memory.
29
+ # Format the data structure \#@IDsTire.
30
+ # Index the hash \#@IDsIndex.
31
+ #++
32
+ #--
33
+ #read the cilin.txt to ids[] and items[]
34
+ #++
35
# Class-level setup, executed once while the class body is evaluated.
# It fills the class instance variables that the lookup methods read:
#   @logger   - Logger writing to STDOUT, WARN level and above.
#   @IDsIndex - Hash mapping each ID to its 0-based position in cilin.txt.
#   @IDsTrie  - Containers::Trie mapping each ID to its Array of words.
#   @index    - data parsed from Inverted.yaml (looked up by word elsewhere).
@logger = Logger.new(STDOUT)
@logger.level = Logger::WARN

@IDsIndex = {}
@IDsTrie = Containers::Trie.new
position = 0
# File.foreach closes cilin.txt once the last line has been read.
File.foreach(File.expand_path("../cilin.txt", __FILE__)) do |line|
  # Each line is "<ID> <word> <word> ...", separated by whitespace.
  id, *words = line.force_encoding('utf-8').split(" ")
  next if id.nil? # blank line: nothing to index, and it must not shift positions

  @IDsIndex[id] = position
  @IDsTrie[id] = words
  position += 1
end

# The block form closes Inverted.yaml as soon as it has been parsed.
@index = File.open(File.expand_path("../Inverted.yaml", __FILE__)) { |io| YAML::load(io) }
67
+
68
+
69
+
70
+
71
# Looks up the word list stored under an ID.
#
# id - ID string such as "Aa01A01=" or "Aa01A03#".
#
# Returns whatever the ID trie holds for that key: the Array of words
# for a known ID, or nil when no entry matches.
def self.get_words_by_id(id)
  trie = @IDsTrie
  trie[id]
end
77
+
78
# Finds every ID that matches a wildcard pattern.
#
# wildcard - pattern string in which '*' and '.' each stand for any single
#            character, e.g. "Aa01A..=" or "Aa**A...".
#
# Returns the result of the trie's wildcard search (documented by the
# original author as a sorted Array of ID strings, empty when nothing matches).
def self.get_ids_by_wildcard(wildcard)
  matching_ids = @IDsTrie.wildcard(wildcard)
  matching_ids
end
84
+
85
# Looks a word up in the inverted index.
#
# word - the word to look up.
#
# Returns the Array of IDs the word belongs to. The same word may have
# several meanings, so it can map to several IDs. For a word that is not
# in the index an error is logged and an empty Array is returned, so that
# callers can iterate the result without inspecting its type first.
def self.get_ids_by_word(word)
  ids = @index[word]
  return ids unless ids.nil?

  # Interpolation keeps the log call from raising when word is not a String.
  @logger.error("#{word} is an unlisted word!")
  []
end
97
+
98
# Tests whether a word has any synonym.
#
# word - the word to test.
#
# Returns true when at least one of the word's IDs ends with "=",
# false otherwise (including when the lookup yields no ID list).
def self.has_same?(word)
  ids = get_ids_by_word(word)
  # A failed lookup may hand back something other than an ID list;
  # never scan the characters of the word itself.
  return false unless ids.is_a?(Array)

  ids.any? { |id| id.end_with?("=") }
end
115
+
116
# Tests whether a word has any equivalent word.
#
# word - the word to test.
#
# Returns true when at least one of the word's IDs ends with "#",
# false otherwise (including when the lookup yields no ID list).
def self.has_equal?(word)
  ids = get_ids_by_word(word)
  # A failed lookup may hand back something other than an ID list;
  # never scan the characters of the word itself.
  return false unless ids.is_a?(Array)

  ids.any? { |id| id.end_with?("#") }
end
133
+
134
# Tests whether a word has any ID whose word list holds just one element.
#
# word - the word to test.
#
# Returns true when at least one of the word's IDs ends with "@",
# false otherwise (including when the lookup yields no ID list).
def self.has_single?(word)
  ids = get_ids_by_word(word)
  # A failed lookup may hand back something other than an ID list;
  # never scan the characters of the word itself.
  return false unless ids.is_a?(Array)

  ids.any? { |id| id.end_with?("@") }
end
152
+
153
# Collects the synonyms of a word.
#
# word - the word to look up.
#
# Returns a two dimensional Array: one word list per ID of the word that
# ends with "=", in the order the IDs are listed.
# Returns nil when the word has no such ID.
def self.get_same(word)
  # Single lookup, so an unlisted word is reported only once.
  ids = get_ids_by_word(word)
  return nil unless ids.is_a?(Array)

  groups = ids.select { |id| id.end_with?("=") }.map { |id| get_words_by_id(id) }
  groups.empty? ? nil : groups
end
170
+
171
# Collects the equivalent words of a word.
#
# word - the word to look up.
#
# Returns a two dimensional Array: one word list per ID of the word that
# ends with "#", in the order the IDs are listed.
# Returns nil when the word has no such ID.
def self.get_equal(word)
  # Single lookup, so an unlisted word is reported only once.
  ids = get_ids_by_word(word)
  return nil unless ids.is_a?(Array)

  groups = ids.select { |id| id.end_with?("#") }.map { |id| get_words_by_id(id) }
  groups.empty? ? nil : groups
end
188
+
189
# Collects the words similar to a word.
#
# word  - the word to look up.
# level - Integer in [0,4], default 4. It is the last ID segment
#         (A, a, 01, A, 01=) that must still match; every segment after
#         it is wildcarded. Level 4 therefore keeps the full ID and gives
#         the closest words, while lower levels widen the search.
#
# Returns a two dimensional Array holding, for each ID of the word, the
# words of every ID matching that wildcard pattern.
# Returns nil when the word has no IDs.
def self.get_similar(word, level=4)
  ids = get_ids_by_word(word)
  # A failed lookup may not be an ID list; there is nothing to search then.
  return nil unless ids.is_a?(Array) && !ids.empty?

  ids.map do |code|
    pattern = gen_findstring(code, level + 1)
    @IDsTrie.wildcard(pattern).flat_map { |id| get_words_by_id(id) }
  end
end
218
+
219
# Computes the semantic distance between two IDs.
#
# idA, idB - ID strings.
#
# Returns an Integer: two points for every segment position from the
# first differing one onward, i.e. 0 for identical IDs up to 10 when the
# very first segment differs.
def self.get_dist_by_id(idA, idB)
  levels_apart = 5 - compare_id(idA, idB)
  (2.0 * levels_apart).round
end
226
+
227
# Computes the similarity between two IDs.
#
# idA, idB - ID strings; both must be present in the ID index
#            unless they are identical or differ in the first segment.
#
# Returns a Float:
# * identical IDs score 1.0 when the ID ends with "=" or "@" and 0.5 when
#   it ends with "#" (nil for any other trailing flag);
# * IDs differing in the first segment score 0.02;
# * otherwise the weight of the first differing level is scaled by the
#   number of IDs sharing the common prefix and by how far apart the two
#   IDs sit in the ID index.
def self.get_sim_by_id(idA, idB)
  # Weights for a first difference at level 0..4. The original body read
  # an undefined local here and raised NameError on every call.
  level_weights = [0.02, 0.65, 0.8, 0.9, 0.96]
  first_diff = compare_id(idA, idB)

  if first_diff == 0
    level_weights[0]
  elsif first_diff == 5
    case idA[-1]
    when "=", "@" then 1.0
    when "#" then 0.5
    end
  elsif first_diff < 5
    pattern = gen_findstring(idA, first_diff)
    node_num = @IDsTrie.wildcard(pattern).size
    gap = (@IDsIndex[idA] - @IDsIndex[idB]).abs
    level_weights[first_diff] * Math.cos(node_num * Math::PI / 180).abs * ((node_num - gap + 1) * 1.0 / node_num)
  end
end
250
+
251
# Finds the shortest semantic distance between two words.
#
# wordA, wordB - the words to compare.
#
# Every ID of wordA is compared with every ID of wordB; the first pair
# with the smallest distance wins.
#
# Returns a Result_t holding that distance (Integer in [0,10]) in value
# and the winning IDs in x_id and y_id. When either word has no IDs the
# untouched starting record Result_t.new(100, "", "") is returned.
def self.dist(wordA, wordB)
  shortest = Result_t.new(100, "", "")
  idAs = get_ids_by_word(wordA)
  idBs = get_ids_by_word(wordB)
  # A failed lookup may not be an ID list; there is nothing to compare then.
  return shortest unless idAs.is_a?(Array) && idBs.is_a?(Array)

  idAs.each do |idA|
    idBs.each do |idB|
      # Two points for every segment position from the first differing one on.
      distance = (2.0 * (5 - compare_id(idA, idB))).round
      next unless distance < shortest.value

      shortest.value = distance
      shortest.x_id = idA
      shortest.y_id = idB
    end
  end
  shortest
end
273
+
274
# Finds the most similar pair of IDs for two words.
#
# wordA, wordB - the words to compare.
#
# Every ID of wordA is scored against every ID of wordB; the first pair
# with the highest score wins. Scoring per pair:
# * identical IDs score 1.0 when the ID ends with "=" or "@" and 0.5 when
#   it ends with "#" (a pair with any other trailing flag is skipped);
# * IDs differing in the first segment score 0.02;
# * otherwise the weight of the first differing level is scaled by the
#   number of IDs sharing the common prefix and by how far apart the two
#   IDs sit in the ID index.
#
# Returns a Result_t holding the score rounded to 5 decimals in value and
# the winning IDs in x_id and y_id. When either word has no IDs the result
# is Result_t.new(-1.0, "", "").
def self.sim(wordA, wordB)
  # Weights for a first difference at level 0..4.
  level_weights = [0.02, 0.65, 0.8, 0.9, 0.96]
  best = Result_t.new(-1, "", "")
  idAs = get_ids_by_word(wordA)
  idBs = get_ids_by_word(wordB)

  # A failed lookup may not be an ID list; there is nothing to compare then.
  if idAs.is_a?(Array) && idBs.is_a?(Array)
    idAs.each do |idA|
      idBs.each do |idB|
        first_diff = compare_id(idA, idB)
        score =
          if first_diff == 0
            level_weights[0]
          elsif first_diff == 5
            case idA[-1]
            when "=", "@" then 1.0
            when "#" then 0.5
            end
          elsif first_diff < 5
            pattern = gen_findstring(idA, first_diff)
            node_num = @IDsTrie.wildcard(pattern).size
            gap = (@IDsIndex[idA] - @IDsIndex[idB]).abs
            level_weights[first_diff] * Math.cos(node_num * Math::PI / 180).abs * ((node_num - gap + 1) * 1.0 / node_num)
          end
        # Skip unscored pairs so that nil is never compared with a number.
        next unless score && score > best.value

        best.value = score
        best.x_id = idA
        best.y_id = idB
      end
    end
  end

  best.value = ("%1.5f" % best.value).to_f
  best
end
314
+
315
# Builds a wildcard pattern from an ID.
#
# code        - ID string such as "Aa01A01=".
# start_index - Integer position, in the five ID segments
#               (A, a, 01, A, 01=), of the first segment to mask.
#
# Returns the ID with every character of the segments from start_index
# through the last one replaced by '.'; a start_index above 4 leaves
# the ID unchanged.
def self.gen_findstring(code, start_index)
  segments = cut_id(code)
  masked = segments.each_with_index.map do |segment, position|
    position >= start_index ? "." * segment.size : segment
  end
  combine_id(masked)
end
329
+
330
# Splits an ID into its five segments.
#
# id - ID string such as "Aa01A01=".
#
# Returns an Array of five strings taken from character positions
# 0, 1, 2-3, 4 and 5-7, e.g. ["A", "a", "01", "A", "01="].
def self.cut_id(id)
  [id[0], id[1], id[2, 2], id[4], id[5, 3]]
end
337
+
338
# Inverse of cut_id: glues ID segments back together.
#
# frame - Array of segment strings, in order.
#
# Returns a new String made of the segments appended one after another.
def self.combine_id(frame)
  frame.reduce("") { |joined, segment| joined << segment }
end
346
+
347
# Compares two IDs segment by segment.
#
# idA, idB - ID strings.
#
# Returns the position (0..4) of the first segment in which the two IDs
# differ, or 5 when all segments are the same.
def self.compare_id(idA, idB)
  pairs = cut_id(idA).zip(cut_id(idB))
  pairs.index { |left, right| !left.eql?(right) } || 5
end
360
+
361
# Returns the number of entries in the ID index, i.e. how many
# different IDs were loaded.
def self.get_id_sum
  @IDsIndex.length
end
365
+
366
# Returns the number of entries in the inverted index, i.e. how many
# different words were loaded.
def self.get_index_sum
  @index.length
end
370
+
371
+ end
@@ -0,0 +1,151 @@
1
+ # coding: utf-8
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'test/unit'
5
+ require File.expand_path('../../lib/tyccl', __FILE__)
6
+
7
+
8
+
9
+
10
# Exercises the Tyccl class methods against the data files shipped with the gem.
class TycclTest < Test::Unit::TestCase #:nodoc:all

  # Sizes of the ID index and of the inverted word index.
  def test_instance
    assert_equal(17809, Tyccl.get_id_sum)
    assert_equal(77457, Tyccl.get_index_sum)
  end

  # A known ID yields its words; an unknown one yields nil.
  def test_get_words_by_id
    assert_equal(["人","士","人物","人士","人氏","人选"], Tyccl.get_words_by_id("Aa01A01="))
    assert_nil(Tyccl.get_words_by_id("dfdf"))
  end

  # Both '.' and '*' act as single-character wildcards.
  def test_get_ids_by_wildcard
    assert_equal(9, Tyccl.get_ids_by_wildcard("Aa01A...").size)
    assert_equal(32, Tyccl.get_ids_by_wildcard("Aa**A...").size)
  end

  def test_get_ids_by_word
    assert_nil(Tyccl.get_ids_by_word("屌丝"))
    assert_equal(1, Tyccl.get_ids_by_word("桅顶").size)
    assert_equal(7, Tyccl.get_ids_by_word("底").size)
  end

  def test_has_same
    assert_equal(true, Tyccl.has_same?("人"))
    assert_equal(false, Tyccl.has_same?("顺民"))
    assert_equal(false, Tyccl.has_same?("众学生"))
  end

  def test_has_equal
    assert_equal(true, Tyccl.has_equal?("良民"))
    assert_equal(false, Tyccl.has_equal?("众学生"))
    assert_equal(false, Tyccl.has_equal?("人"))
  end

  def test_has_single
    assert_equal(false, Tyccl.has_single?("良民"))
    assert_equal(true, Tyccl.has_single?("众学生"))
    assert_equal(false, Tyccl.has_single?("人"))
  end

  # Words without synonyms give nil; "人" gives five groups of known sizes.
  def test_get_same
    groups = Tyccl.get_same("人")

    assert_nil(Tyccl.get_same("顺民"))
    assert_nil(Tyccl.get_same("众学生"))
    assert_equal(5, groups.size)
    [6, 8, 2, 9, 9].each_with_index do |expected_size, position|
      assert_equal(expected_size, groups[position].size)
    end
  end

  def test_get_equal
    assert_nil(Tyccl.get_equal("人"))
    assert_nil(Tyccl.get_equal("众学生"))
    assert_equal(1, Tyccl.get_equal("流民").size)
    assert_equal(9, Tyccl.get_equal("流民")[0].size)
  end

  def test_get_similar
    expected = [
      ["人", "士", "人物", "人士", "人氏", "人选"],
      ["成年人", "壮年人", "大人", "人", "丁", "壮丁", "佬", "中年人"],
      ["身体", "人"],
      ["人格", "人品", "人头", "人", "品质", "质地", "格调", "灵魂", "为人"],
      ["人数", "人头", "人口", "人", "口", "丁", "家口", "食指", "总人口"]
    ]
    assert_equal(expected, Tyccl.get_similar("人"))
  end

  # dist ranges [0,10];
  # if dist<7 then we believe that the two words are related
  def test_dist
    cases = [
      [Result_t.new(0,"Aa01A01=","Aa01A01="), "人", "士"],
      [Result_t.new(2,"Bh06A32=","Bh06A34="), "西红柿", "黄瓜"],
      [Result_t.new(4,"Aa01A05=","Aa01B03#"), "匹夫", "良民"],
      [Result_t.new(6,"Bh07A14=","Bh06A32="), "苹果", "西红柿"],
      [Result_t.new(8,"Aa01B02=","Ab01B10="), "群众", "村姑"],
      [Result_t.new(10,"Aa01A01=","Kd04C01="), "人", "哟"]
    ]
    cases.each do |expected, word_a, word_b|
      assert_equal(expected, Tyccl.dist(word_a, word_b))
    end
  end

  # Similarity of "人民" to ten words, listed from most to least similar.
  def test_sim
    expected_results = [
      Result_t.new(1.0,"Aa01B01=","Aa01B01="),
      Result_t.new(0.95766,"Aa01B01=","Aa01B02="),
      Result_t.new(0.71825,"Aa01B01=","Aa01B03#"),
      Result_t.new(0.48013,"Aa01B01=","Aa01C07#"),
      Result_t.new(0.40396,"Aa01B01=","Ab02B01="),
      Result_t.new(0.39028,"Aa01B01=","Ad01A02="),
      Result_t.new(0.21692,"Aa01B01=","Aa03A05="),
      Result_t.new(0.20361,"Aa01B01=","Ah01A01="),
      Result_t.new(0.08112,"Aa01B01=","Ak03A03#"),
      Result_t.new(0.04007,"Aa01B01=","Al05B01=")
    ]
    words = ["国民","群众","良民","党群","成年人","市民","同志","亲属","志愿者","先锋"]

    words.zip(expected_results).each do |word, expected|
      assert_equal(expected, Tyccl.sim("人民", word))
    end
  end

end
151
+