RubyGems - tyccl_tim_fixed - Versions diffs - 0.0.3 - Mend

tyccl_tim_fixed 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

data/.gitignore +17 -0
data/Gemfile +6 -0
data/LICENSE +20 -0
data/README.md +75 -0
data/Rakefile +11 -0
data/doc/Object.html +154 -0
data/doc/README_md.html +182 -0
data/doc/Tyccl/Containers.html +130 -0
data/doc/Tyccl/Logger.html +130 -0
data/doc/Tyccl/YAML.html +130 -0
data/doc/Tyccl.html +1020 -0
data/doc/created.rid +3 -0
data/doc/images/add.png +0 -0
data/doc/images/arrow_up.png +0 -0
data/doc/images/brick.png +0 -0
data/doc/images/brick_link.png +0 -0
data/doc/images/bug.png +0 -0
data/doc/images/bullet_black.png +0 -0
data/doc/images/bullet_toggle_minus.png +0 -0
data/doc/images/bullet_toggle_plus.png +0 -0
data/doc/images/date.png +0 -0
data/doc/images/delete.png +0 -0
data/doc/images/find.png +0 -0
data/doc/images/loadingAnimation.gif +0 -0
data/doc/images/macFFBgHack.png +0 -0
data/doc/images/package.png +0 -0
data/doc/images/page_green.png +0 -0
data/doc/images/page_white_text.png +0 -0
data/doc/images/page_white_width.png +0 -0
data/doc/images/plugin.png +0 -0
data/doc/images/ruby.png +0 -0
data/doc/images/tag_blue.png +0 -0
data/doc/images/tag_green.png +0 -0
data/doc/images/transparent.png +0 -0
data/doc/images/wrench.png +0 -0
data/doc/images/wrench_orange.png +0 -0
data/doc/images/zoom.png +0 -0
data/doc/index.html +166 -0
data/doc/js/darkfish.js +155 -0
data/doc/js/jquery.js +18 -0
data/doc/js/navigation.js +142 -0
data/doc/js/search.js +94 -0
data/doc/js/search_index.js +1 -0
data/doc/js/searcher.js +228 -0
data/doc/rdoc.css +595 -0
data/doc/table_of_contents.html +111 -0
data/lib/Inverted.yaml +77458 -0
data/lib/cilin.txt +17817 -0
data/lib/tyccl/version.rb +3 -0
data/lib/tyccl.rb +371 -0
data/test/test_tyccl.rb +151 -0
data/tyccl.gemspec +23 -0
metadata +133 -0

data/lib/tyccl/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Holds the version constant of the tyccl gem.
+class Tyccl  #:nodoc:all
+  # Version string of this release.
+  VERSION = "0.0.3"
+end

data/lib/tyccl.rb ADDED Viewed

@@ -0,0 +1,371 @@
+# coding: utf-8
+# = This gem is a tool for analysing the similarity
+# = between Chinese words. It is based on <em>HIT Tongyici Cilin (Extended)</em> (同义词词林)
+#
+# * learn more about Tongyici Cilin(同义词词林) http://vdisk.weibo.com/s/qGrIviGdExvx
+#
+# * Author::    Joe Woo  (https://github.com/JoeWoo)
+# * License::   MIT
+#
+require File.expand_path("../tyccl/version", __FILE__)#:nodoc:all
+require "algorithms"#:nodoc:all
+require "yaml"#:nodoc:all
+require "logger"#:nodoc:all
+# Result_t is the struct returned by Tyccl.dist and Tyccl.sim.
+# * field 'value' stores the computed distance or similarity
+# * fields 'x_id' and 'y_id' store the IDs selected for word X and word Y
+Result_t = Struct.new(:value,:x_id,:y_id)
+# class Tyccl is a singleton class, no Tyccl.new() method instead of Tyccl.instance()
+# to keep Tyccl object just only one.
+class Tyccl
+  #--
+  # Read the Cilin file to memory.
+  # Format the data structure \#@IDsTire.
+  # Index the hash \#@IDsIndex.
+  #++
+	#--
+  #read the cilin.txt to ids[] and items[]
+  #++
+  @logger = Logger.new(STDOUT)
+  @logger.level = Logger::WARN
+  codes=[]
+  items=[]
+  @IDsIndex = Hash.new
+  f = File.new(File.expand_path("../cilin.txt", __FILE__))
+  i=0
+  f.each { |line|
+    line.force_encoding('utf-8')
+    m=line.split(" ")
+    codes << m[0]
+    @IDsIndex[m[0]] = i
+    i += 1
+    word = Array.new
+    m[1..-1].each{ |term|
+      word << term
+    }
+    items << word
+  }
+  #--
+  #init Trie of cilin.txt
+  #++
+  @IDsTrie = Containers::Trie.new
+  i=0
+  codes.each{ |key|
+    @IDsTrie[key]=items[i]
+    i+=1
+  }
+  #--
+  #init index of cilin.txt
+  #++
+  @index = YAML::load(File.open(File.expand_path("../Inverted.yaml", __FILE__)))
+  # Given id(string) such as:"Aa01A01=" "Aa01A03#"
+  # Returns an array containing words(string) that match this id
+  # If no match is found, nil is returned.
+  def self.get_words_by_id(id)
+    @IDsTrie[id]
+  end
+  # Returns a sorted array containing IDs(string) that match the parameter Wildcard(string).
+  # The wildcard characters that match any character are ‘*’ and ‘.’ such as "Aa01A..=","Aa**A..."
+  # If no match is found, an empty array is returned.
+  def self.get_ids_by_wildcard(wildcard)
+    @IDsTrie.wildcard(wildcard)
+  end
+  # Returns an array containing IDs(string) that the parameter Word(string) matchs.
+  #
+  # tips: the same word may have a few semantic meanings, so a word can match many IDs.
+  def self.get_ids_by_word(word)
+    m = @index[word]
+  	if(m==nil)
+  		@logger.error(word+" is an unlisted word!")
+  		return word
+  	else
+  		return m
+  	end
+  end
+  # Given a word(string).
+  # Test to see if the parameter Word has any synonym.
+  # Returns true or false.
+  def self.has_same?(word)
+    ids = get_ids_by_word(word)
+    i=0
+    flag=false
+    while i < ids.size && flag==false  do
+      if ids[i][-1]=="="
+        flag=true
+      else
+        flag=false
+      end
+      i+=1
+    end
+    return flag
+  end
+  # Given a word(string).
+  # Test to see if the parameter Word has any equivalent word.
+  # Returns true or false.
+  def self.has_equal?(word)
+    ids = get_ids_by_word(word)
+    i=0
+    flag=false
+    while i < ids.size && flag==false  do
+      if ids[i][-1]=="#"
+        flag=true
+      else
+        flag=false
+      end
+      i+=1
+    end
+    return flag
+  end
+  # Given a word(string).
+  # Test to see if the parameter Word has any ID whose corresponding
+  # words list just has only one element.
+  # Returns true or false.
+  def self.has_single?(word)
+  	ids = get_ids_by_word(word)
+    i=0
+    flag=false
+    while i < ids.size && flag==false  do
+      if ids[i][-1]=="@"
+        flag=true
+      else
+        flag=false
+      end
+      i+=1
+    end
+    return flag
+  end
+  # Given a word(string).
+  # Returns a two dimensional array that contains the parameter Word`s
+  # synonym which divided by different ID that the word matchs.
+  # If the word has no synonym, nil is returned.
+  def self.get_same(word)
+    if has_same?(word)
+      same_words=[]
+      ids = get_ids_by_word(word)
+      ids.each{ |code|
+        if code[-1]=="="
+         same_words << get_words_by_id(code)
+        end
+      }
+      return same_words
+    end
+    return nil
+  end
+  # Given a word(string).
+  # Returns a two dimensional array that contains the parameter Word`s
+  # equivalent words which divided by different ID that the word matchs.
+  # If the word has no synonym, nil is returned.
+  def self.get_equal(word)
+    if has_equal?(word)
+      equal_words=[]
+      ids = get_ids_by_word(word)
+      ids.each{ |code|
+        if code[-1]=="#"
+          equal_words << get_words_by_id(code)
+        end
+      }
+      return equal_words
+    end
+    return nil
+  end
+  # Given a word(string) and a level(int),level`s value range is [0,4],
+  # 4 is default, value of level is more bigger, the similarity between
+  # returned words and the given word is more less.
+  # Returns a two dimensional array that contains the parameter Word`s
+  # similar words which divided by different ID that the word matchs.
+  # If the word has no similar, nil is returned.
+  #
+  # tips: level 0,1,2,3,4 correspond Cilin(同义词词林) ID`s different
+  # segment: A，a，01，A，01=.
+  def self.get_similar(word, level=4)
+  	ids = get_ids_by_word(word)
+    similar=[]
+    ids.each{ |code|
+      mini_similar=[]
+      findstring = gen_findstring(code, level+1)
+      similar_IDs=@IDsTrie.wildcard(findstring)
+      similar_IDs.each{|item|
+        get_words_by_id(item).each{|term|
+          mini_similar << term
+        }
+      }
+      similar << mini_similar
+    }
+    if similar.size > 0
+    	return similar
+    else
+    	return nil
+    end
+  end
+  # Given idA(string) and idB(string).
+  # Returns semantic distance(int) between idA and idB, values in [0,10].
+  def self.get_dist_by_id(idA, idB)
+  	alpha=10.0/5
+  	n = compare_id(idA,idB)
+  	(alpha*(5-n)).round
+  end
+  # Given idA(string) and idB(string).
+  # Returns similarity(float) between idA and idB, values in [0,1].
+  def self.get_sim_by_id(idA, idB)
+   	n = compare_id(idA,idB)
+    str = idA.clone
+    if n==0
+      _sim = factor[0]
+    elsif n==5
+      if idA[-1] == "="
+        _sim = factor[5]
+      elsif idA[-1] == "#"
+        _sim = factor[6]
+      elsif idA[-1] == "@"
+        _sim = factor[5]
+      end
+  	elsif n < 5
+  	  findstring=gen_findstring(str,n)
+      node_num = @IDsTrie.wildcard(findstring).size
+      k = (@IDsIndex[idA]-@IDsIndex[idB]).abs
+      _sim = factor[n]*(Math.cos(node_num*Math::PI/180)).abs*((node_num-k+1)*1.0/node_num)
+    end
+    return _sim
+  end
+  # Given wordA(string) and wordB(string).
+  # Returns a Struct Result_t which contains idA, idB, and shortest
+  # semantic distance(int) between wordA and wordB.
+  def self.dist(wordA, wordB)
+    alpha=10.0/5
+    shortest_Pair = Result_t.new(100,"","")
+    idAs = get_ids_by_word(wordA)
+    idBs = get_ids_by_word(wordB)
+    idAs.each{ |idA|
+      idBs.each{ |idB|
+        n = compare_id(idA,idB)
+          distance = (alpha*(5-n)).round
+        if distance < shortest_Pair.value
+          shortest_Pair.value = distance
+          shortest_Pair.x_id = idA
+          shortest_Pair.y_id = idB
+        end
+      }
+    }
+    return shortest_Pair
+  end
+  # Given wordA(string) and wordB(string).
+  # Returns a Struct Result_t which contains the most similar Pairs
+  # wordA`s ID and wordB`s ID, and similarity(float) between idA and idB.
+  def self.sim(wordA, wordB)
+    factor=[0.02,0.65,0.8,0.9,0.96,1,0.5]#0,1,2,3,4,5各层参数
+    longest_Pair = Result_t.new(-1,"","")
+    idAs = get_ids_by_word(wordA)
+    idBs = get_ids_by_word(wordB)
+    idAs.each{ |idA|
+      idBs.each{ |idB|
+        n = compare_id(idA,idB)
+        str = idA.clone
+        if n==0
+          _sim = factor[0]
+        elsif n==5
+          if idA[-1] == "="
+            _sim = factor[5]
+          elsif idA[-1] == "#"
+            _sim = factor[6]
+          elsif idA[-1] == "@"
+            _sim = factor[5]
+          end
+      	elsif n < 5
+      	  findstring=gen_findstring(str,n)
+          node_num = @IDsTrie.wildcard(findstring).size
+          k = (@IDsIndex[idA]-@IDsIndex[idB]).abs
+          _sim = factor[n]*(Math.cos(node_num*Math::PI/180)).abs*((node_num-k+1)*1.0/node_num)
+        end
+        if _sim > longest_Pair.value
+          longest_Pair.value = _sim
+          longest_Pair.x_id = idA
+          longest_Pair.y_id = idB
+        end
+      }
+    }
+    longest_Pair.value = ("%1.5f" % longest_Pair.value).to_f
+    return longest_Pair
+  end
+  # Given a word(string) and start_index(int),start_index`s value
+  # range is [0,4], corresponding Cilin(同义词词林) ID`s different
+  # segment: A，a，01，A，01=.
+  # Returns a string that is used '.' to explace every char from
+  # the start_index to the string`s end.
+  def self.gen_findstring(code, start_index)
+    frame = cut_id(code)
+    (start_index).upto(4){|i|
+    	0.upto(frame[i].size-1){ |j|
+    		frame[i][j]='.'
+    	}
+    }
+    combine_id(frame)
+  end
+  # Given a id(string).
+  # Returns an array that contains 5 strings which are ID`s
+  # diffrent segment, like: A，a，01，A，01= .
+  def self.cut_id(id)
+    frame=[id[0],id[1],id[2..3],id[4],id[5..7]]
+    return frame
+  end
+  # the method #cut_id`s inverse process.
+  def self.combine_id(frame)
+    m=""
+    frame.each{|seg|
+      m << seg
+    }
+    return m
+  end
+  # Given idA(string) and idB(string).
+  # Returns fisrt diffrent place of their segment, place vlaues in[0,4].
+  # if they are the same , returns 5.
+  def self.compare_id(idA, idB)
+    frameA=cut_id(idA)
+    frameB=cut_id(idB)
+    0.upto(frameA.length-1){ |i|
+      if frameA[i].eql?(frameB[i]) == false
+        return i
+      end
+    }
+    return 5
+  end
+  # Returns the total number of different ID in Cilin.
+  def self.get_id_sum
+  	@IDsIndex.size
+  end
+  # Returns the total number of different words in Cilin.
+  def self.get_index_sum
+  	@index.size
+  end
+end

data/test/test_tyccl.rb ADDED Viewed

@@ -0,0 +1,151 @@
+# coding: utf-8
+require 'rake'
+require 'rake/testtask'
+require 'test/unit'
+require File.expand_path('../../lib/tyccl', __FILE__)
+class TycclTest < Test::Unit::TestCase  #:nodoc:all
+  def test_instance
+    assert_equal 17809,
+        Tyccl.get_id_sum
+    assert_equal 77457,
+    	Tyccl.get_index_sum
+  end
+  def test_get_words_by_id
+  	assert_equal ["人","士","人物","人士","人氏","人选"],
+  		Tyccl.get_words_by_id("Aa01A01=")
+  	assert_equal nil,
+  		Tyccl.get_words_by_id("dfdf")
+  end
+  def test_get_ids_by_wildcard
+  	assert_equal 9,
+  		Tyccl.get_ids_by_wildcard("Aa01A...").size
+  	assert_equal 32,
+  		Tyccl.get_ids_by_wildcard("Aa**A...").size
+  end
+  def test_get_ids_by_word
+  	assert_equal nil,
+  		Tyccl.get_ids_by_word("屌丝")
+  	assert_equal 1,
+  		Tyccl.get_ids_by_word("桅顶").size
+  	assert_equal 7,
+  		Tyccl.get_ids_by_word("底").size
+  end
+  def test_has_same
+  	assert_equal true,
+  		Tyccl.has_same?("人")
+  	assert_equal false,
+  		Tyccl.has_same?("顺民")
+  	assert_equal false,
+  		Tyccl.has_same?("众学生")
+  end
+  def test_has_equal
+  	assert_equal true,
+  		Tyccl.has_equal?("良民")
+  	assert_equal false,
+  		Tyccl.has_equal?("众学生")
+  	assert_equal false,
+  		Tyccl.has_equal?("人")
+  end
+  def test_has_single
+  	assert_equal false,
+  		Tyccl.has_single?("良民")
+  	assert_equal true,
+  		Tyccl.has_single?("众学生")
+  	assert_equal false,
+  		Tyccl.has_single?("人")
+  end
+  def test_get_same
+  	m=Tyccl.get_same("人")
+  	assert_equal nil,
+  		Tyccl.get_same("顺民")
+  	assert_equal nil,
+  		Tyccl.get_same("众学生")
+  	assert_equal 5,
+  		m.size
+  	assert_equal 6,
+  		m[0].size
+  	assert_equal 8,
+  		m[1].size
+  	assert_equal 2,
+  		m[2].size
+  	assert_equal 9,
+  		m[3].size
+  	assert_equal 9,
+  		m[4].size
+  end
+  def test_get_equal
+  	assert_equal nil,
+  		Tyccl.get_equal("人")
+  	assert_equal nil,
+  		Tyccl.get_equal("众学生")
+  	assert_equal 1,
+  		Tyccl.get_equal("流民").size
+  	assert_equal 9,
+  		Tyccl.get_equal("流民")[0].size
+  end
+  def test_get_similar
+   	assert_equal [	["人", "士", "人物", "人士", "人氏", "人选"],
+ 					["成年人", "壮年人", "大人", "人", "丁", "壮丁", "佬", "中年人"],
+ 					["身体", "人"],
+ 					["人格", "人品", "人头", "人", "品质", "质地", "格调", "灵魂", "为人"],
+ 					["人数", "人头", "人口", "人", "口", "丁", "家口", "食指", "总人口"]	],
+  		Tyccl.get_similar("人")
+  end
+# dist ranges [0,10];
+# if dist<7 then we believe that the two words are related
+  def test_dist
+  	assert_equal Result_t.new(0,"Aa01A01=","Aa01A01="),
+  		Tyccl.dist("人","士")
+  	assert_equal Result_t.new(2,"Bh06A32=","Bh06A34="),
+  		Tyccl.dist("西红柿","黄瓜")
+  	assert_equal Result_t.new(4,"Aa01A05=","Aa01B03#"),
+  		Tyccl.dist("匹夫","良民")
+  	assert_equal Result_t.new(6,"Bh07A14=","Bh06A32="),
+  		Tyccl.dist("苹果","西红柿")
+  	assert_equal Result_t.new(8,"Aa01B02=","Ab01B10="),
+  		Tyccl.dist("群众","村姑")
+  	assert_equal Result_t.new(10,"Aa01A01=","Kd04C01="),
+  		Tyccl.dist("人","哟")
+  end
+  def test_sim
+	result=[	Result_t.new(1.0,"Aa01B01=","Aa01B01="),
+				Result_t.new(0.95766,"Aa01B01=","Aa01B02="),
+				Result_t.new(0.71825,"Aa01B01=","Aa01B03#"),
+				Result_t.new(0.48013,"Aa01B01=","Aa01C07#"),
+				Result_t.new(0.40396,"Aa01B01=","Ab02B01="),
+				Result_t.new(0.39028,"Aa01B01=","Ad01A02="),
+				Result_t.new(0.21692,"Aa01B01=","Aa03A05="),
+				Result_t.new(0.20361,"Aa01B01=","Ah01A01="),
+				Result_t.new(0.08112,"Aa01B01=","Ak03A03#"),
+				Result_t.new(0.04007,"Aa01B01=","Al05B01=") 	]
+  	words=["国民","群众","良民","党群","成年人","市民","同志","亲属","志愿者","先锋"]
+  	i=0
+  	words.each{  |word|
+  		assert_equal result[i],
+  			Tyccl.sim("人民",word)
+  		i+=1
+  	}
+  end
+end