tyccl_tim_fixed 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +6 -0
  3. data/LICENSE +20 -0
  4. data/README.md +75 -0
  5. data/Rakefile +11 -0
  6. data/doc/Object.html +154 -0
  7. data/doc/README_md.html +182 -0
  8. data/doc/Tyccl/Containers.html +130 -0
  9. data/doc/Tyccl/Logger.html +130 -0
  10. data/doc/Tyccl/YAML.html +130 -0
  11. data/doc/Tyccl.html +1020 -0
  12. data/doc/created.rid +3 -0
  13. data/doc/images/add.png +0 -0
  14. data/doc/images/arrow_up.png +0 -0
  15. data/doc/images/brick.png +0 -0
  16. data/doc/images/brick_link.png +0 -0
  17. data/doc/images/bug.png +0 -0
  18. data/doc/images/bullet_black.png +0 -0
  19. data/doc/images/bullet_toggle_minus.png +0 -0
  20. data/doc/images/bullet_toggle_plus.png +0 -0
  21. data/doc/images/date.png +0 -0
  22. data/doc/images/delete.png +0 -0
  23. data/doc/images/find.png +0 -0
  24. data/doc/images/loadingAnimation.gif +0 -0
  25. data/doc/images/macFFBgHack.png +0 -0
  26. data/doc/images/package.png +0 -0
  27. data/doc/images/page_green.png +0 -0
  28. data/doc/images/page_white_text.png +0 -0
  29. data/doc/images/page_white_width.png +0 -0
  30. data/doc/images/plugin.png +0 -0
  31. data/doc/images/ruby.png +0 -0
  32. data/doc/images/tag_blue.png +0 -0
  33. data/doc/images/tag_green.png +0 -0
  34. data/doc/images/transparent.png +0 -0
  35. data/doc/images/wrench.png +0 -0
  36. data/doc/images/wrench_orange.png +0 -0
  37. data/doc/images/zoom.png +0 -0
  38. data/doc/index.html +166 -0
  39. data/doc/js/darkfish.js +155 -0
  40. data/doc/js/jquery.js +18 -0
  41. data/doc/js/navigation.js +142 -0
  42. data/doc/js/search.js +94 -0
  43. data/doc/js/search_index.js +1 -0
  44. data/doc/js/searcher.js +228 -0
  45. data/doc/rdoc.css +595 -0
  46. data/doc/table_of_contents.html +111 -0
  47. data/lib/Inverted.yaml +77458 -0
  48. data/lib/cilin.txt +17817 -0
  49. data/lib/tyccl/version.rb +3 -0
  50. data/lib/tyccl.rb +371 -0
  51. data/test/test_tyccl.rb +151 -0
  52. data/tyccl.gemspec +23 -0
  53. metadata +133 -0
@@ -0,0 +1,3 @@
1
# Version holder for the gem; reopened by lib/tyccl.rb with the real API.
class Tyccl #:nodoc:all
  # Release number of the gem. Frozen so the shared string cannot be
  # modified in place by accident.
  VERSION = "0.0.3".freeze
end
data/lib/tyccl.rb ADDED
@@ -0,0 +1,371 @@
1
+ # coding: utf-8
2
+
3
+ # = this gem is a tool for analysing similarity
4
+ # = between Chinese words. It is based on <em>HIT Tongyici Cilin (Extended)</em> (同义词词林)
5
+ #
6
+ # * learn more about Tongyici Cilin(同义词词林) http://vdisk.weibo.com/s/qGrIviGdExvx
7
+ #
8
+ # * Author:: Joe Woo (https://github.com/JoeWoo)
9
+ # * License:: MIT
10
+ #
11
+
12
+ require File.expand_path("../tyccl/version", __FILE__)#:nodoc:all
13
+ require "algorithms"#:nodoc:all
14
+ require "yaml"#:nodoc:all
15
+ require "logger"#:nodoc:all
16
+
17
+
18
# Value object handed back by Tyccl.dist and Tyccl.sim.
# * 'value' holds the computed distance or similarity
# * 'x_id' and 'y_id' hold the IDs selected for word X and word Y
Result_t = Struct.new(:value, :x_id, :y_id)
22
+
23
+ # Tyccl exposes its whole API as class-level methods (for example Tyccl.sim);
24
+ # the dictionary data is loaded once, when this file is required.
25
+ class Tyccl
26
+
27
#--
# Class-level setup, executed once when this file is loaded:
#   * @logger   - STDOUT logger that only reports WARN and above.
#   * @IDsIndex - Hash mapping each ID to the 0-based number of the
#                 cilin.txt line it was read from.
#   * @IDsTrie  - Containers::Trie mapping each ID to its list of words.
#   * @index    - inverted index loaded from Inverted.yaml; it is looked
#                 up by word in get_ids_by_word.
#++
@logger = Logger.new(STDOUT)
@logger.level = Logger::WARN

@IDsIndex = {}
@IDsTrie = Containers::Trie.new

#--
# Every line of cilin.txt is "<ID> <word> <word> ...". File.foreach closes
# the file after the last line; the handle created with File.new before
# was never closed.
#++
cilin_path = File.expand_path("../cilin.txt", __FILE__)
File.foreach(cilin_path).with_index do |line, line_no|
  id, *words = line.force_encoding('utf-8').split(" ")
  @IDsIndex[id] = line_no
  @IDsTrie[id] = words
end

#--
# YAML.load_file opens and closes the file itself, so no handle is leaked.
#++
@index = YAML.load_file(File.expand_path("../Inverted.yaml", __FILE__))
67
+
68
+
69
+
70
+
71
# Looks up the word list stored under an ID.
#
# id:: full ID string such as "Aa01A01=" or "Aa01A03#".
#
# Returns the array of words(string) filed under that ID, or nil when
# the ID is not present.
def self.get_words_by_id(id)
  words = @IDsTrie[id]
  words
end
77
+
78
# Finds every ID that matches a wildcard pattern.
#
# wildcard:: pattern(string) in which '*' and '.' each stand for any single
#            character, e.g. "Aa01A..=" or "Aa**A...".
#
# Returns whatever the trie's wildcard search yields: a sorted array of
# matching IDs(string), empty when nothing matches.
def self.get_ids_by_wildcard(wildcard)
  matching_ids = @IDsTrie.wildcard(wildcard)
  matching_ids
end
84
+
85
# Looks a word up in the inverted index.
#
# word:: the word(string) to look up.
#
# Returns the array of IDs(string) the word is filed under. One word can
# carry several meanings and therefore several IDs.
#
# When the word is unlisted an error is logged and the word itself is
# returned, so callers have to be ready for a non-array result.
def self.get_ids_by_word(word)
  ids = @index[word]
  return ids unless ids.nil?

  @logger.error(word+" is an unlisted word!")
  word
end
97
+
98
# Tells whether the word has any synonym, i.e. whether at least one of
# its IDs ends with "=".
#
# word:: the word(string) to test.
#
# Returns true or false. An unlisted word yields false.
def self.has_same?(word)
  ids = get_ids_by_word(word)
  # For an unlisted word get_ids_by_word hands back the word itself rather
  # than an array. Scanning that string character by character would report
  # true for any unlisted word that contains "=".
  return false unless ids.is_a?(Array)

  ids.any? { |id| id[-1] == "=" }
end
115
+
116
# Tells whether the word has any equivalent word, i.e. whether at least
# one of its IDs ends with "#".
#
# word:: the word(string) to test.
#
# Returns true or false. An unlisted word yields false.
def self.has_equal?(word)
  ids = get_ids_by_word(word)
  # For an unlisted word get_ids_by_word hands back the word itself rather
  # than an array. Scanning that string character by character would report
  # true for any unlisted word that contains "#".
  return false unless ids.is_a?(Array)

  ids.any? { |id| id[-1] == "#" }
end
133
+
134
# Tells whether the word has any ID whose word list holds just that one
# word, i.e. whether at least one of its IDs ends with "@".
#
# word:: the word(string) to test.
#
# Returns true or false. An unlisted word yields false.
def self.has_single?(word)
  ids = get_ids_by_word(word)
  # For an unlisted word get_ids_by_word hands back the word itself rather
  # than an array. Scanning that string character by character would report
  # true for any unlisted word that contains "@".
  return false unless ids.is_a?(Array)

  ids.any? { |id| id[-1] == "@" }
end
152
+
153
# Collects the synonyms of a word.
#
# word:: the word(string) to look up.
#
# Returns a two dimensional array: one word list per ID of the word that
# ends with "=", in the order the IDs are listed for the word.
# Returns nil when the word has no such ID or is unlisted.
def self.get_same(word)
  ids = get_ids_by_word(word)
  # An unlisted word comes back as the word itself, not as an array of IDs.
  # Treat it as "no synonym" instead of iterating over a String.
  return nil unless ids.is_a?(Array)

  groups = ids.select { |id| id[-1] == "=" }.map { |id| get_words_by_id(id) }
  groups.empty? ? nil : groups
end
170
+
171
# Collects the equivalent words of a word.
#
# word:: the word(string) to look up.
#
# Returns a two dimensional array: one word list per ID of the word that
# ends with "#", in the order the IDs are listed for the word.
# Returns nil when the word has no such ID or is unlisted.
def self.get_equal(word)
  ids = get_ids_by_word(word)
  # An unlisted word comes back as the word itself, not as an array of IDs.
  # Treat it as "no equivalent word" instead of iterating over a String.
  return nil unless ids.is_a?(Array)

  groups = ids.select { |id| id[-1] == "#" }.map { |id| get_words_by_id(id) }
  groups.empty? ? nil : groups
end
188
+
189
# Collects the words that are similar to a word.
#
# word::  the word(string) to look up.
# level:: int in [0,4], 4 by default. It selects how many leading segments
#         of each ID (A, a, 01, A, 01=) must match: segments 0..level are
#         kept and the rest is replaced by wildcards. Level 0 is therefore
#         the broadest search; level 4 keeps the whole ID and returns only
#         the word's own groups. The larger the level, the more similar
#         the returned words are.
#
# Returns a two dimensional array holding, for every ID of the word, the
# words of all IDs that match at the requested level.
# Returns nil when the word has no ID or is unlisted.
def self.get_similar(word, level=4)
  ids = get_ids_by_word(word)
  # An unlisted word comes back as the word itself, not as an array of IDs.
  return nil unless ids.is_a?(Array) && !ids.empty?

  ids.map do |code|
    pattern = gen_findstring(code, level + 1)
    @IDsTrie.wildcard(pattern).flat_map { |id| get_words_by_id(id) || [] }
  end
end
218
+
219
# Semantic distance between two IDs.
#
# idA, idB:: ID strings.
#
# The number of leading segments the IDs share (0..5, from compare_id)
# is subtracted from 5 and scaled by 10.0/5.
# Returns the distance as an int in [0,10]; 0 means identical IDs.
def self.get_dist_by_id(idA, idB)
  step = 10.0/5
  differing_segments = 5 - compare_id(idA, idB)
  (step * differing_segments).round
end
226
+
227
# Similarity between two IDs.
#
# idA, idB:: ID strings. For IDs that differ after the first segment both
#            must be present in the ID index.
#
# Returns the similarity(float):
# * IDs that differ in the first segment get the level-0 weight;
# * identical IDs get 1.0 when they end with "=" or "@" and 0.5 when they
#   end with "#" (nil for any other ending);
# * otherwise the weight of the first differing segment is scaled by the
#   number of IDs sharing the common prefix and by how far apart the two
#   IDs are in the ID index.
def self.get_sim_by_id(idA, idB)
  # Weights for a first difference at segment 0..4; index 5 is used for
  # identical "=" / "@" IDs and index 6 for identical "#" IDs. This table
  # was referenced here without ever being defined, so every call raised
  # NameError.
  factor = [0.02, 0.65, 0.8, 0.9, 0.96, 1.0, 0.5]
  n = compare_id(idA, idB)

  if n == 0
    factor[0]
  elsif n == 5
    case idA[-1]
    when "=", "@" then factor[5]
    when "#" then factor[6]
    end
  elsif n < 5
    # Number of IDs that share the first n segments with idA.
    findstring = gen_findstring(idA, n)
    node_num = @IDsTrie.wildcard(findstring).size
    # Distance between the two IDs in the ID index.
    k = (@IDsIndex[idA] - @IDsIndex[idB]).abs
    factor[n] * (Math.cos(node_num * Math::PI / 180)).abs * ((node_num - k + 1) * 1.0 / node_num)
  end
end
250
+
251
# Shortest semantic distance between two words.
#
# wordA, wordB:: the words(string) to compare.
#
# Every ID of wordA is compared with every ID of wordB; the first pair
# with the smallest distance wins.
#
# Returns a Result_t whose value is the distance(int, in [0,10]) and whose
# x_id / y_id are the IDs of that pair. When either word is unlisted or
# has no IDs the result is Result_t.new(100, "", "").
def self.dist(wordA, wordB)
  step = 10.0/5
  shortest = Result_t.new(100, "", "")
  idAs = get_ids_by_word(wordA)
  idBs = get_ids_by_word(wordB)
  # An unlisted word comes back as the word itself, not as an array of IDs.
  # Report "no pair found" instead of failing with NoMethodError on a String.
  return shortest unless idAs.is_a?(Array) && idBs.is_a?(Array)

  idAs.each do |idA|
    idBs.each do |idB|
      distance = (step * (5 - compare_id(idA, idB))).round
      next unless distance < shortest.value

      shortest.value = distance
      shortest.x_id = idA
      shortest.y_id = idB
    end
  end
  shortest
end
273
+
274
# Highest similarity between two words.
#
# wordA, wordB:: the words(string) to compare.
#
# Every ID of wordA is compared with every ID of wordB; the first pair
# with the highest similarity wins.
#
# Returns a Result_t whose value is the similarity(float, rounded to five
# decimals) and whose x_id / y_id are the IDs of that pair. When either
# word is unlisted or has no IDs the result is Result_t.new(-1.0, "", "").
def self.sim(wordA, wordB)
  # Weights for a first difference at segment 0..4; index 5 is used for
  # identical "=" / "@" IDs and index 6 for identical "#" IDs.
  factor = [0.02, 0.65, 0.8, 0.9, 0.96, 1, 0.5]
  best = Result_t.new(-1, "", "")
  idAs = get_ids_by_word(wordA)
  idBs = get_ids_by_word(wordB)

  # An unlisted word comes back as the word itself, not as an array of IDs.
  # Skip the comparison then instead of failing with NoMethodError.
  if idAs.is_a?(Array) && idBs.is_a?(Array)
    idAs.each do |idA|
      idBs.each do |idB|
        n = compare_id(idA, idB)
        score =
          if n == 0
            factor[0]
          elsif n == 5
            case idA[-1]
            when "=", "@" then factor[5]
            when "#" then factor[6]
            end
          elsif n < 5
            # Number of IDs that share the first n segments with idA.
            findstring = gen_findstring(idA, n)
            node_num = @IDsTrie.wildcard(findstring).size
            # Distance between the two IDs in the ID index.
            k = (@IDsIndex[idA] - @IDsIndex[idB]).abs
            factor[n] * (Math.cos(node_num * Math::PI / 180)).abs * ((node_num - k + 1) * 1.0 / node_num)
          end
        # A pair without a score (identical IDs with an unknown ending)
        # is ignored rather than compared as nil.
        next unless score && score > best.value

        best.value = score
        best.x_id = idA
        best.y_id = idB
      end
    end
  end

  best.value = ("%1.5f" % best.value).to_f
  best
end
314
+
315
# Builds the wildcard pattern used to search for related IDs.
#
# code::        an ID string.
# start_index:: int in [0,4] naming one of the five ID segments
#               (A, a, 01, A, 01=).
#
# Returns a copy of the ID in which every character of the segments from
# start_index up to the last one is replaced by '.'. A start_index above 4
# leaves the ID unchanged.
def self.gen_findstring(code, start_index)
  segments = cut_id(code)
  start_index.upto(4) do |pos|
    segments[pos] = "." * segments[pos].size
  end
  combine_id(segments)
end
329
+
330
# Splits an ID into its five segments.
#
# id:: an ID string such as "Aa01A01=".
#
# Returns an array of 5 strings taken from character positions 0, 1, 2-3,
# 4 and 5-7, e.g. ["A", "a", "01", "A", "01="].
def self.cut_id(id)
  [
    id[0],    # segment 0
    id[1],    # segment 1
    id[2, 2], # segment 2
    id[4],    # segment 3
    id[5, 3]  # segment 4, including the trailing marker
  ]
end
337
+
338
# Inverse of #cut_id: glues the segments of an ID back together.
#
# frame:: array of segment strings.
#
# Returns one new string made of all segments appended in order.
def self.combine_id(frame)
  frame.each_with_object("") { |segment, joined| joined << segment }
end
346
+
347
# Finds where two IDs start to differ.
#
# idA, idB:: ID strings.
#
# Returns the position(int, in [0,4]) of the first segment in which the
# two IDs differ, or 5 when all segments are the same.
def self.compare_id(idA, idB)
  partsA = cut_id(idA)
  partsB = cut_id(idB)
  first_mismatch = partsA.each_index.find { |pos| !partsA[pos].eql?(partsB[pos]) }
  first_mismatch || 5
end
360
+
361
# Counts the distinct IDs that were loaded.
#
# Returns the number of entries(int) in the ID index.
def self.get_id_sum
  @IDsIndex.length
end
365
+
366
# Counts the distinct words that were loaded.
#
# Returns the number of entries(int) in the inverted word index.
def self.get_index_sum
  @index.length
end
370
+
371
+ end
@@ -0,0 +1,151 @@
1
+ # coding: utf-8
2
+ require 'rake'
3
+ require 'rake/testtask'
4
+ require 'test/unit'
5
+ require File.expand_path('../../lib/tyccl', __FILE__)
6
+
7
+
8
+
9
+
10
# Exercises the public class-level API of Tyccl against the bundled
# dictionary files.
class TycclTest < Test::Unit::TestCase #:nodoc:all

  # The loaded data exposes the expected number of IDs and of words.
  def test_instance
    assert_equal 17809, Tyccl.get_id_sum
    assert_equal 77457, Tyccl.get_index_sum
  end

  def test_get_words_by_id
    assert_equal ["人","士","人物","人士","人氏","人选"],
                 Tyccl.get_words_by_id("Aa01A01=")
    assert_nil Tyccl.get_words_by_id("dfdf")
  end

  def test_get_ids_by_wildcard
    assert_equal 9, Tyccl.get_ids_by_wildcard("Aa01A...").size
    assert_equal 32, Tyccl.get_ids_by_wildcard("Aa**A...").size
  end

  def test_get_ids_by_word
    # Tyccl.get_ids_by_word logs an error and hands the word itself back
    # when the word is unlisted, so nil is not the value to expect here.
    assert_equal "屌丝", Tyccl.get_ids_by_word("屌丝")
    assert_equal 1, Tyccl.get_ids_by_word("桅顶").size
    assert_equal 7, Tyccl.get_ids_by_word("底").size
  end

  def test_has_same
    assert_equal true, Tyccl.has_same?("人")
    assert_equal false, Tyccl.has_same?("顺民")
    assert_equal false, Tyccl.has_same?("众学生")
  end

  def test_has_equal
    assert_equal true, Tyccl.has_equal?("良民")
    assert_equal false, Tyccl.has_equal?("众学生")
    assert_equal false, Tyccl.has_equal?("人")
  end

  def test_has_single
    assert_equal false, Tyccl.has_single?("良民")
    assert_equal true, Tyccl.has_single?("众学生")
    assert_equal false, Tyccl.has_single?("人")
  end

  def test_get_same
    groups = Tyccl.get_same("人")

    assert_nil Tyccl.get_same("顺民")
    assert_nil Tyccl.get_same("众学生")
    assert_equal 5, groups.size
    assert_equal [6, 8, 2, 9, 9], groups.map(&:size)
  end

  def test_get_equal
    assert_nil Tyccl.get_equal("人")
    assert_nil Tyccl.get_equal("众学生")
    assert_equal 1, Tyccl.get_equal("流民").size
    assert_equal 9, Tyccl.get_equal("流民")[0].size
  end

  def test_get_similar
    assert_equal [ ["人", "士", "人物", "人士", "人氏", "人选"],
                   ["成年人", "壮年人", "大人", "人", "丁", "壮丁", "佬", "中年人"],
                   ["身体", "人"],
                   ["人格", "人品", "人头", "人", "品质", "质地", "格调", "灵魂", "为人"],
                   ["人数", "人头", "人口", "人", "口", "丁", "家口", "食指", "总人口"] ],
                 Tyccl.get_similar("人")
  end

  # dist ranges [0,10];
  # if dist<7 then we believe that the two words are related
  def test_dist
    assert_equal Result_t.new(0,"Aa01A01=","Aa01A01="),
                 Tyccl.dist("人","士")
    assert_equal Result_t.new(2,"Bh06A32=","Bh06A34="),
                 Tyccl.dist("西红柿","黄瓜")
    assert_equal Result_t.new(4,"Aa01A05=","Aa01B03#"),
                 Tyccl.dist("匹夫","良民")
    assert_equal Result_t.new(6,"Bh07A14=","Bh06A32="),
                 Tyccl.dist("苹果","西红柿")
    assert_equal Result_t.new(8,"Aa01B02=","Ab01B10="),
                 Tyccl.dist("群众","村姑")
    assert_equal Result_t.new(10,"Aa01A01=","Kd04C01="),
                 Tyccl.dist("人","哟")
  end

  # Each word is compared with "人民"; expectations are listed in the same
  # order as the words.
  def test_sim
    expected = [ Result_t.new(1.0,"Aa01B01=","Aa01B01="),
                 Result_t.new(0.95766,"Aa01B01=","Aa01B02="),
                 Result_t.new(0.71825,"Aa01B01=","Aa01B03#"),
                 Result_t.new(0.48013,"Aa01B01=","Aa01C07#"),
                 Result_t.new(0.40396,"Aa01B01=","Ab02B01="),
                 Result_t.new(0.39028,"Aa01B01=","Ad01A02="),
                 Result_t.new(0.21692,"Aa01B01=","Aa03A05="),
                 Result_t.new(0.20361,"Aa01B01=","Ah01A01="),
                 Result_t.new(0.08112,"Aa01B01=","Ak03A03#"),
                 Result_t.new(0.04007,"Aa01B01=","Al05B01=") ]

    words = ["国民","群众","良民","党群","成年人","市民","同志","亲属","志愿者","先锋"]
    words.zip(expected).each do |word, result|
      assert_equal result, Tyccl.sim("人民", word)
    end
  end

end
151
+