RubyGems - tyccl - Versions diffs - 0.0.1 - Mend

tyccl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/lib/tyccl/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Holds the gem's version string; the gemspec reads it as Tyccl::VERSION.
+class Tyccl
+  # Frozen so the shared constant cannot be mutated in place.
+  VERSION = "0.0.1".freeze
+end

data/lib/tyccl.rb ADDED Viewed

@@ -0,0 +1,374 @@
+# coding: utf-8
+# = This gem is a tool for analysing similarity
+# = between Chinese words. It is based on <em>HIT Tongyici Cilin (Extended)</em> (同义词词林 哈工大扩展版).
+# This gem has only one singleton class; instantiate it once and reuse that instance.
+#
+# learn more about Tongyici Cilin(同义词词林) http://vdisk.weibo.com/s/qGrIviGdExvx
+#
+# Author::    Joe Woo  (https://github.com/JoeWoo)
+# License::   MIT
+#
+require File.expand_path("../tyccl/version", __FILE__)
+require "algorithms"
+require "yaml"
+require "singleton"
+require "logger"
+# Result_t is the struct returned by Tyccl#dist and Tyccl#sim.
+# * field 'value' stores the computed distance or similarity
+# * fields 'x_id' and 'y_id' store the IDs chosen for word X and word Y
+Result_t = Struct.new(:value,:x_id,:y_id)
+# Tyccl is the gem's only class. It includes Singleton, so Tyccl.new is not
+# available; call Tyccl.instance to obtain the single shared object.
+class Tyccl
+  include Singleton
+  # Weights used by #get_sim_by_id and #sim. Entries 0..4 apply when two IDs
+  # first differ at that segment; entry 5 applies to identical IDs ending in
+  # "=" or "@", and entry 6 to identical IDs ending in "#".
+  FACTOR = [0.02, 0.65, 0.8, 0.9, 0.96, 1, 0.5].freeze
+  # Reads cilin.txt (one "ID word word ..." entry per line) into memory:
+  # * @ids_trie maps each ID to its array of words,
+  # * @ids_index maps each ID to its zero-based line position,
+  # * @word_index is loaded from Inverted.yaml and is queried by word.
+  # Both files are opened in block form so their handles are closed.
+  def initialize #:notnew: stops RDoc from seeing the initialize method as the new method
+    @logger = Logger.new(STDOUT)
+    @logger.level = Logger::WARN
+    @ids_index = {}
+    @ids_trie = Containers::Trie.new
+    File.open(File.expand_path("../cilin.txt", __FILE__)) do |file|
+      file.each_line.with_index do |line, position|
+        line.force_encoding('utf-8')
+        code, *words = line.split(" ")
+        # A blank line carries no ID; skip it instead of failing.
+        next if code.nil?
+        @ids_index[code] = position
+        @ids_trie[code] = words
+      end
+    end
+    @word_index = File.open(File.expand_path("../Inverted.yaml", __FILE__)) { |file| YAML::load(file) }
+  end
+  # Given id(string) such as "Aa01A01=" or "Aa01A03#".
+  # Returns the array of words(string) stored under this id,
+  # or nil if no match is found.
+  def get_words_by_id(id)
+    @ids_trie[id]
+  end
+  # Returns the array of IDs(string) that match the parameter wildcard(string),
+  # as produced by the trie's wildcard lookup.
+  # The wildcard characters that match any character are '*' and '.',
+  # such as "Aa01A..=" or "Aa**A...".
+  def get_ids_by_wildcard(wildcard)
+    @ids_trie.wildcard(wildcard)
+  end
+  # Returns the array of IDs(string) that the parameter word(string) matches.
+  # If the word is unlisted, an error is logged and nil is returned.
+  #
+  # tips: the same word may have several meanings, so a word can match many IDs.
+  def get_ids_by_word(word)
+    ids = @word_index[word]
+    @logger.error("#{word} is an unlisted word!") if ids.nil?
+    ids
+  end
+  # Given a word(string).
+  # Returns true if any ID of the word ends with "=" (synonym group),
+  # false otherwise, including when the word is unlisted.
+  def has_same?(word)
+    marked_id?(word, "=")
+  end
+  # Given a word(string).
+  # Returns true if any ID of the word ends with "#" (equivalent-word group),
+  # false otherwise, including when the word is unlisted.
+  def has_equal?(word)
+    marked_id?(word, "#")
+  end
+  # Given a word(string).
+  # Returns true if any ID of the word ends with "@" (single-word group),
+  # false otherwise, including when the word is unlisted.
+  def has_single?(word)
+    marked_id?(word, "@")
+  end
+  # Given a word(string).
+  # Returns a two dimensional array holding one word list per ID of the word
+  # that ends with "=".
+  # Returns nil if the word has no such ID or is unlisted.
+  def get_same(word)
+    words_with_mark(word, "=")
+  end
+  # Given a word(string).
+  # Returns a two dimensional array holding one word list per ID of the word
+  # that ends with "#".
+  # Returns nil if the word has no such ID or is unlisted.
+  def get_equal(word)
+    words_with_mark(word, "#")
+  end
+  # Given a word(string) and a level(int) in [0,4]; 4 is the default.
+  # The ID segments after position +level+ are replaced by wildcards, so a
+  # smaller level matches more IDs and returns less closely related words.
+  # Returns a two dimensional array with one word list per ID of the word.
+  # Returns nil if the word is unlisted or has no IDs.
+  #
+  # tips: levels 0,1,2,3,4 correspond to the ID segments A, a, 01, A, 01=.
+  def get_similar(word, level=4)
+    ids = get_ids_by_word(word)
+    return nil if ids.nil?
+    similar = ids.map do |code|
+      pattern = gen_findstring(code, level + 1)
+      @ids_trie.wildcard(pattern).flat_map { |id| get_words_by_id(id) }
+    end
+    similar.empty? ? nil : similar
+  end
+  # Given idA(string) and idB(string).
+  # Returns the semantic distance(int) between them: 2 for every segment from
+  # the first differing one onwards, so values are 0,2,4,6,8 or 10.
+  def get_dist_by_id(idA, idB)
+    alpha = 10.0 / 5
+    (alpha * (5 - compare_id(idA, idB))).round
+  end
+  # Given idA(string) and idB(string).
+  # Returns the similarity between idA and idB:
+  # * FACTOR[0] when they differ in the first segment,
+  # * FACTOR[5] or FACTOR[6] when they are identical (chosen by idA's last
+  #   character; nil if that character is none of "=", "#", "@"),
+  # * otherwise FACTOR[n] scaled by the number of IDs sharing the first n
+  #   segments and by how far apart the two IDs are in cilin.txt.
+  # Both IDs must exist in the loaded data for the last case.
+  def get_sim_by_id(idA, idB)
+    n = compare_id(idA, idB)
+    return FACTOR[0] if n == 0
+    if n == 5
+      case idA[-1]
+      when "=", "@" then return FACTOR[5]
+      when "#" then return FACTOR[6]
+      else return nil
+      end
+    end
+    node_num = @ids_trie.wildcard(gen_findstring(idA, n)).size
+    k = (@ids_index[idA] - @ids_index[idB]).abs
+    FACTOR[n]*(Math.cos(node_num*Math::PI/180)).abs*((node_num-k+1)*1.0/node_num)
+  end
+  # Given wordA(string) and wordB(string).
+  # Returns a Result_t holding the smallest distance over every pair of their
+  # IDs, together with the first pair of IDs that reaches it.
+  # Returns nil if either word is unlisted.
+  def dist(wordA, wordB)
+    idAs = get_ids_by_word(wordA)
+    idBs = get_ids_by_word(wordB)
+    return nil if idAs.nil? || idBs.nil?
+    shortest = Result_t.new(100, "", "")
+    idAs.each do |idA|
+      idBs.each do |idB|
+        distance = get_dist_by_id(idA, idB)
+        # Strict comparison keeps the first pair when several tie.
+        next unless distance < shortest.value
+        shortest.value = distance
+        shortest.x_id = idA
+        shortest.y_id = idB
+      end
+    end
+    shortest
+  end
+  # Given wordA(string) and wordB(string).
+  # Returns a Result_t holding the largest similarity over every pair of their
+  # IDs (formatted to 5 decimal places), together with the first pair of IDs
+  # that reaches it.
+  # Returns nil if either word is unlisted.
+  def sim(wordA, wordB)
+    idAs = get_ids_by_word(wordA)
+    idBs = get_ids_by_word(wordB)
+    return nil if idAs.nil? || idBs.nil?
+    best = Result_t.new(-1, "", "")
+    idAs.each do |idA|
+      idBs.each do |idB|
+        similarity = get_sim_by_id(idA, idB)
+        # nil means idA has an unrecognised last character; such a pair cannot win.
+        next if similarity.nil? || similarity <= best.value
+        best.value = similarity
+        best.x_id = idA
+        best.y_id = idB
+      end
+    end
+    best.value = ("%1.5f" % best.value).to_f
+    best
+  end
+  # Given a code(string, an ID) and start_index(int) in [0,4], the position of
+  # one of the ID segments A, a, 01, A, 01=.
+  # Returns a copy of the ID in which every character from that segment to the
+  # end is replaced by '.'; a start_index above 4 returns the ID unchanged.
+  def gen_findstring(code, start_index)
+    masked = cut_id(code).each_with_index.map do |segment, i|
+      i >= start_index ? "." * segment.size : segment
+    end
+    combine_id(masked)
+  end
+  # Given an id(string).
+  # Returns an array of its 5 segments, like: A, a, 01, A, 01= .
+  def cut_id(id)
+    [id[0], id[1], id[2..3], id[4], id[5..7]]
+  end
+  # The inverse of #cut_id: joins the segments back into one string.
+  def combine_id(frame)
+    frame.join
+  end
+  # Given idA(string) and idB(string).
+  # Returns the position, in [0,4], of the first segment in which they differ,
+  # or 5 if all segments are the same.
+  def compare_id(idA, idB)
+    frame_b = cut_id(idB)
+    cut_id(idA).each_with_index do |segment, i|
+      return i unless segment == frame_b[i]
+    end
+    5
+  end
+  # Returns the total number of different IDs in Cilin.
+  def get_id_sum
+    @ids_index.size
+  end
+  # Returns the total number of entries in the word index.
+  def get_index_sum
+    @word_index.size
+  end
+  private
+  # Returns true if the word is listed and any of its IDs ends with +mark+.
+  def marked_id?(word, mark)
+    ids = get_ids_by_word(word)
+    !ids.nil? && ids.any? { |id| id.end_with?(mark) }
+  end
+  # Returns the word lists of every ID of the word that ends with +mark+,
+  # or nil if there is none or the word is unlisted.
+  def words_with_mark(word, mark)
+    ids = get_ids_by_word(word)
+    return nil if ids.nil?
+    groups = ids.select { |id| id.end_with?(mark) }.map { |id| get_words_by_id(id) }
+    groups.empty? ? nil : groups
+  end
+end

data/test/test_tyccl.rb ADDED Viewed

@@ -0,0 +1,151 @@
+# coding: utf-8
+require 'rake'
+require 'rake/testtask'
+require 'test/unit'
+require File.expand_path('../../lib/tyccl', __FILE__)
+$tyc=Tyccl.instance
+class TycclTest < Test::Unit::TestCase
+  # Checks the totals reported for the loaded data.
+  def test_instance
+    assert_equal(17809, $tyc.get_id_sum)
+    assert_equal(77457, $tyc.get_index_sum)
+  end
+  # A known ID yields its word list; an unknown ID yields nil.
+  def test_get_words_by_id
+    expected = ["人","士","人物","人士","人氏","人选"]
+    assert_equal(expected, $tyc.get_words_by_id("Aa01A01="))
+    assert_equal(nil, $tyc.get_words_by_id("dfdf"))
+  end
+  # Both '.' and '*' act as single-character wildcards.
+  def test_get_ids_by_wildcard
+    assert_equal(9, $tyc.get_ids_by_wildcard("Aa01A...").size)
+    assert_equal(32, $tyc.get_ids_by_wildcard("Aa**A...").size)
+  end
+  # An unlisted word yields nil; listed words yield one entry per ID.
+  def test_get_ids_by_word
+    assert_equal(nil, $tyc.get_ids_by_word("屌丝"))
+    assert_equal(1, $tyc.get_ids_by_word("桅顶").size)
+    assert_equal(7, $tyc.get_ids_by_word("底").size)
+  end
+  def test_has_same
+    assert_equal(true, $tyc.has_same?("人"))
+    assert_equal(false, $tyc.has_same?("顺民"))
+    assert_equal(false, $tyc.has_same?("众学生"))
+  end
+  def test_has_equal
+    assert_equal(true, $tyc.has_equal?("良民"))
+    assert_equal(false, $tyc.has_equal?("众学生"))
+    assert_equal(false, $tyc.has_equal?("人"))
+  end
+  def test_has_single
+    assert_equal(false, $tyc.has_single?("良民"))
+    assert_equal(true, $tyc.has_single?("众学生"))
+    assert_equal(false, $tyc.has_single?("人"))
+  end
+  # Words without a "=" ID yield nil; "人" yields five groups of known sizes.
+  def test_get_same
+    groups = $tyc.get_same("人")
+    assert_equal(nil, $tyc.get_same("顺民"))
+    assert_equal(nil, $tyc.get_same("众学生"))
+    assert_equal(5, groups.size)
+    [6, 8, 2, 9, 9].each_with_index do |expected_size, position|
+      assert_equal(expected_size, groups[position].size)
+    end
+  end
+  # Words without a "#" ID yield nil; "流民" yields one group of nine words.
+  def test_get_equal
+    assert_equal(nil, $tyc.get_equal("人"))
+    assert_equal(nil, $tyc.get_equal("众学生"))
+    assert_equal(1, $tyc.get_equal("流民").size)
+    assert_equal(9, $tyc.get_equal("流民")[0].size)
+  end
+  # With the default level, each group is the word list of one ID of "人".
+  def test_get_similar
+    expected = [
+      ["人", "士", "人物", "人士", "人氏", "人选"],
+      ["成年人", "壮年人", "大人", "人", "丁", "壮丁", "佬", "中年人"],
+      ["身体", "人"],
+      ["人格", "人品", "人头", "人", "品质", "质地", "格调", "灵魂", "为人"],
+      ["人数", "人头", "人口", "人", "口", "丁", "家口", "食指", "总人口"]
+    ]
+    assert_equal(expected, $tyc.get_similar("人"))
+  end
+  # dist ranges over [0,10];
+  # if dist<7 then we believe that the two words are related
+  def test_dist
+    cases = [
+      [Result_t.new(0,"Aa01A01=","Aa01A01="), "人", "士"],
+      [Result_t.new(2,"Bh06A32=","Bh06A34="), "西红柿", "黄瓜"],
+      [Result_t.new(4,"Aa01A05=","Aa01B03#"), "匹夫", "良民"],
+      [Result_t.new(6,"Bh07A14=","Bh06A32="), "苹果", "西红柿"],
+      [Result_t.new(8,"Aa01B02=","Ab01B10="), "群众", "村姑"],
+      [Result_t.new(10,"Aa01A01=","Kd04C01="), "人", "哟"]
+    ]
+    cases.each do |expected, word_a, word_b|
+      assert_equal(expected, $tyc.dist(word_a, word_b))
+    end
+  end
+  # Compares "人民" with each word; expected results are listed in the same order.
+  def test_sim
+    expected = [
+      Result_t.new(1.0,"Aa01B01=","Aa01B01="),
+      Result_t.new(0.95766,"Aa01B01=","Aa01B02="),
+      Result_t.new(0.71825,"Aa01B01=","Aa01B03#"),
+      Result_t.new(0.48013,"Aa01B01=","Aa01C07#"),
+      Result_t.new(0.40396,"Aa01B01=","Ab02B01="),
+      Result_t.new(0.39028,"Aa01B01=","Ad01A02="),
+      Result_t.new(0.21692,"Aa01B01=","Aa03A05="),
+      Result_t.new(0.20361,"Aa01B01=","Ah01A01="),
+      Result_t.new(0.08112,"Aa01B01=","Ak03A03#"),
+      Result_t.new(0.04007,"Aa01B01=","Al05B01=")
+    ]
+    words = ["国民","群众","良民","党群","成年人","市民","同志","亲属","志愿者","先锋"]
+    words.each_with_index do |word, position|
+      assert_equal(expected[position], $tyc.sim("人民", word))
+    end
+  end
+end

data/tyccl.gemspec ADDED Viewed

@@ -0,0 +1,23 @@
+# coding: utf-8
+# Put lib/ on the load path so the version file can be required below.
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'tyccl/version'
+Gem::Specification.new do |spec|
+  spec.name          = "tyccl"
+  spec.version       = Tyccl::VERSION
+  spec.authors       = ["JoeWoo"]
+  spec.email         = ["0wujian0@gmail.com"]
+  # No inner double quotes: they would become part of the published text.
+  spec.summary       = %q{tools of analysing similarity between Chinese Words.}
+  spec.description   = %q{tyccl(同义词词林 哈工大扩展版) is a ruby gem that provides friendly functions to analyse similarity between Chinese Words.}
+  spec.homepage      = "https://github.com/JoeWoo/tyccl"
+  spec.license       = "MIT"
+  # Package every file tracked by git.
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  # lib/tyccl.rb requires "algorithms" (Containers::Trie) at load time.
+  spec.add_runtime_dependency "algorithms"
+  spec.add_development_dependency "bundler", "~> 1.5"
+  spec.add_development_dependency "rake"
+end

metadata ADDED Viewed

@@ -0,0 +1,85 @@
+--- !ruby/object:Gem::Specification
+name: tyccl
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- JoeWoo
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-01-24 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: "\"tyccl(同义词词林 哈工大扩展版) is a ruby gem that provides friendly functions
+  to analyse similarity between Chinese Words.\""
+email:
+- 0wujian0@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- Gemfile
+- LICENSE
+- README.md
+- Rakefile
+- lib/Inverted.yaml
+- lib/cilin.txt
+- lib/tyccl.rb
+- lib/tyccl/version.rb
+- test/test_tyccl.rb
+- tyccl.gemspec
+homepage: https://github.com/JoeWoo/tyccl
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.1.9
+signing_key:
+specification_version: 4
+summary: "\"tools of analysing similarity between Chinese Words.\""
+test_files:
+- test/test_tyccl.rb