RubyGems - imatch - Versions diffs - 0.1.0 - Mend

imatch 0.1.0

Files changed (11) hide show

data/lib/imatch.rb ADDED

@@ -0,0 +1,85 @@
+require 'set'
+require 'digest/sha1'
+# gem install stemmer (Porter stemmer implementation)
+require 'stemmer'
+require 'lexicon'
+class IMatch
+  VERSION = '0.1.0'
+  DEFAULT_LEXICON_FILE = File.join(File.dirname(__FILE__), 'data', 'en.dat')
+  DEFAULT_NUMBER_OF_LEXICONS = 0
+  DEFAULT_LEXICON_FRACTION = 0.66
+  def initialize(file = DEFAULT_LEXICON_FILE, options = {})
+    @lexicon = IMatch::Lexicon.new(file).freeze
+    @stop_words = (options[:stop_words] || []).to_set
+    @should_stem = !!options[:stemming]
+    @number_of_lexicons = (options[:lexicons] || DEFAULT_NUMBER_OF_LEXICONS).to_i
+    @lexicon_fraction = (options[:lexicon_fraction] || DEFAULT_LEXICON_FRACTION).to_f
+    @subsets = []
+    if @number_of_lexicons > 0
+      @number_of_lexicons.times { @subsets << @lexicon.subset(@lexicon_fraction) }
+    end
+  end
+  def multiple_signatures(string, tokenize = /\s+/)
+    signatures = Set.new
+    if sig = signature(string, tokenize)
+      signatures << sig
+    end
+    @subsets.each do |lex|
+      if sig = signature(string, tokenize, lex)
+        signatures << sig
+      end
+    end
+    signatures
+  end
+  def signature(string, tokenize = /\s+/, lexicon = nil)
+    return nil unless string
+    tokens = string.split(tokenize)
+    return nil if tokens.empty?
+    current_lexicon = lexicon || @lexicon
+    usable_tokens = Set.new
+    tokens.each do |t|
+      token = t.downcase
+      token = token.stem if @should_stem && token.respond_to?(:stem)
+      next if @stop_words.include?(token)
+      next unless current_lexicon.include?(token)
+      usable_tokens << token
+    end
+    return nil if usable_tokens.empty?
+    finger_print(usable_tokens.to_a.sort) unless tokens.empty?
+  end
+  def lexicon
+    @lexicon
+  end
+  def to_s
+    %Q{<IMatch stemming="#{@should_stem}" stop_word_count="#{@stop_words.size}">#{@lexicon.to_s}</IMatch>}
+  end
+  private
+  def finger_print(tokens)
+    digest = Digest::SHA1.new
+    tokens.each{|t| digest.update(t) }
+    digest.to_s
+  end
+end

data/lib/lexicon.rb ADDED

@@ -0,0 +1,48 @@
+class IMatch
+  class Lexicon
+    def initialize(file_or_set)
+      if file_or_set.kind_of?(Set)
+        @file = "N/A"
+        @data = file_or_set.clone.freeze
+      elsif file_or_set.kind_of?(File)
+        @file = File.expand_path(file_or_set.path)
+        @data = IO.read(@file).split(/\r?\n/).to_set.freeze
+      elsif file_or_set.kind_of?(String)
+        raise(InvalidLexiconError, "Invalid/missing lexicon file: #{file_or_set}") unless File.exist?(file_or_set)
+        @file = File.expand_path(file_or_set)
+        @data = IO.read(@file).split(/\r?\n/).to_set.freeze
+      else
+        raise(InvalidLexiconError, "Invalid/missing lexicon argument: #{file_or_set}")
+      end
+      raise(InvalidLexiconError, "Empty lexicon file: #{file_or_set}") if @data.empty?
+    end
+    def include?(key)
+      @data.include?(key)
+    end
+    def size
+      @data.size
+    end
+    def to_s
+      %Q{<IMatch::Lexicon size="#{size}" file="#{@file}" />}
+    end
+    # percentage should be between 0.0 and 1.0
+    def subset(percentage)
+      subset = Set.new
+      @data.each do |term|
+        if rand > percentage
+          subset << term
+        end
+      end
+      self.class.new(subset)
+    end
+  end
+end

data/test/test_imatch.rb ADDED

@@ -0,0 +1,81 @@
+require "test/unit"
+require "imatch"
+class InvalidLexiconError < Exception; end;
+class TestIMatch < Test::Unit::TestCase
+  def test_defines_imatch_class
+    assert IMatch
+    assert IMatch.kind_of?(Class)
+  end
+  def test_initalize_with_no_args_loads_the_default_lexicon
+    imatch = IMatch.new
+    assert imatch
+    assert imatch.lexicon, "expected a lexicon"
+    assert imatch.lexicon.size > 0, "Didn't expect a blank lexicon"
+  end
+  def test_nil_input_creates_nil_output
+    assert_nil IMatch.new.signature(nil)
+  end
+  def test_known_imatch_score
+    signature = IMatch.new.signature('foo bar')
+    assert signature.kind_of?(String)
+    assert_equal '60518c1c11dc0452be71a7118a43ab68e3451b82', signature
+  end
+  def test_imatch_consistent
+    assert_equal IMatch.new.signature('foo bar'), IMatch.new.signature('foo bar')
+  end
+  def test_imatch_unordered
+    assert_equal IMatch.new.signature('foo bar'), IMatch.new.signature('bar foo')
+  end
+  def test_imatch_simple_plurals_if_stemming_enabled
+    imatch_stemming = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :stemming => true)
+    imatch_non_stemming = IMatch.new
+    assert_equal imatch_stemming.signature('follower'), imatch_stemming.signature('followers'), "Failed to stem when enabled"
+    assert_not_equal imatch_non_stemming.signature('follower'), imatch_non_stemming.signature('followers')
+  end
+  def test_stop_words_skipped
+    imatch = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :stop_words => ['a'])
+    assert_nil imatch.signature("a")
+    assert_equal imatch.signature("foo"), imatch.signature("a foo")
+  end
+  def test_skipping_unknown_terms
+    imatch = IMatch.new
+    assert !imatch.lexicon.include?('{{example}}')
+    assert_nil imatch.signature('{{example}}')
+    assert_equal imatch.signature("string"), imatch.signature("{{example}} string")
+  end
+  def test_alternate_splitting
+    assert_equal IMatch.new.signature('F 16'), IMatch.new.signature('F-16', /\W+/)
+  end
+  def test_to_s
+    imatch = IMatch.new
+    str = imatch.to_s
+    assert str.include?("stemming=\"false\"")
+    assert str.include?("stop_word_count=\"0\"")
+    assert str.include?(imatch.lexicon.to_s)
+  end
+  def test_multiple_lexicon_signatures
+    string = "this is a test"
+    imatch = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :lexicons => 5)
+    default = imatch.signature(string)
+    signatures = imatch.multiple_signatures(string)
+    assert signatures.kind_of?(Set)
+    assert !signatures.empty?
+    assert signatures.include?(default)
+  end
+end

data/test/test_lexicon.rb ADDED

@@ -0,0 +1,76 @@
+require "test/unit"
+require "imatch"
+class TestIMatchLexicon < Test::Unit::TestCase
+  def test_defines_lexicon_class
+    assert IMatch::Lexicon
+    assert IMatch::Lexicon.kind_of?(Class)
+  end
+  def test_nil_file_raises_error
+    assert_raise InvalidLexiconError do
+      IMatch::Lexicon.new(nil)
+    end
+  end
+  def test_missing_file_raises_error
+    assert_raise InvalidLexiconError do
+      IMatch::Lexicon.new(File.join(File.dirname(__FILE__), 'lexicons', 'not_such_file'))
+    end
+  end
+  def test_empty_file_raises_error
+    assert_raise InvalidLexiconError do
+      IMatch::Lexicon.new(File.join(File.dirname(__FILE__), 'lexicons', 'empty.dat'))
+    end
+  end
+  def test_lexicon_size
+    lexicon = IMatch::Lexicon.new(File.join(File.dirname(__FILE__), 'lexicons', 'ten.dat'))
+    assert_equal 10, lexicon.size
+  end
+  def test_lexicon_duplicates_ignored
+    lexicon = IMatch::Lexicon.new(File.join(File.dirname(__FILE__), 'lexicons', 'duplicates.dat'))
+    assert_equal 7, lexicon.size
+  end
+  def test_lexicon_include
+    lexicon = IMatch::Lexicon.new(File.join(File.dirname(__FILE__), 'lexicons', 'ten.dat'))
+    %w(this file has ten terms in the lexicon for testing).each do |term|
+      assert lexicon.include?(term), "Lexicon did not include test term: #{term}"
+    end
+  end
+  def test_to_s
+    filename = File.expand_path(File.join(File.dirname(__FILE__), 'lexicons', 'ten.dat'))
+    lexicon = IMatch::Lexicon.new(File.join(File.dirname(__FILE__), 'lexicons', 'ten.dat'))
+    assert_match(/#{filename}/, lexicon.to_s)
+    assert_match(/#{lexicon.size}/, lexicon.to_s)
+  end
+  def test_new_with_file_argument
+    file = File.new(File.join(File.dirname(__FILE__), 'lexicons', 'ten.dat'))
+    lexicon = IMatch::Lexicon.new(file)
+    assert_equal 10, lexicon.size
+  end
+  def test_new_with_set_argument
+    lexicon = IMatch::Lexicon.new(%w(a b c d).to_set)
+    assert_equal 4, lexicon.size
+    assert_match(/N\/A/, lexicon.to_s)
+  end
+  def test_random_subset
+    lexicon = IMatch::Lexicon.new(IMatch::DEFAULT_LEXICON_FILE)
+    assert lexicon.size > 10000, "Default lexicon is too small for this test"
+    subset = lexicon.subset(0.5)
+    portion = (subset.size.to_f / lexicon.size.to_f).to_f
+    assert portion > 0.4, "A 50% subset should be >40% of the size or else random is not working (#{portion})"
+    assert portion < 0.6, "A 50% subset should be <60% of the size or else random is not working (#{portion})"
+  end
+end

metadata ADDED

@@ -0,0 +1,128 @@
+--- !ruby/object:Gem::Specification
+name: imatch
+version: !ruby/object:Gem::Version
+  hash: 27
+  prerelease: false
+  segments:
+  - 0
+  - 1
+  - 0
+  version: 0.1.0
+platform: ruby
+authors:
+- Matt Sanford
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-09-01 00:00:00 -07:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: stemmer
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 21
+        segments:
+        - 1
+        - 0
+        - 1
+        version: 1.0.1
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rubyforge
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 7
+        segments:
+        - 2
+        - 0
+        - 4
+        version: 2.0.4
+  type: :development
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: hoe
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 21
+        segments:
+        - 2
+        - 6
+        - 1
+        version: 2.6.1
+  type: :development
+  version_requirements: *id003
+description: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
+email:
+- matt@twitter.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- History.txt
+- Manifest.txt
+- README.txt
+files:
+- .autotest
+- History.txt
+- Manifest.txt
+- README.txt
+- Rakefile
+- lib/imatch.rb
+- lib/lexicon.rb
+- lib/data/en.dat
+- test/test_imatch.rb
+- test/test_lexicon.rb
+has_rdoc: true
+homepage: http://twitter.com/mzsanford
+licenses: []
+post_install_message:
+rdoc_options:
+- --main
+- README.txt
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project: imatch
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
+test_files:
+- test/test_imatch.rb
+- test/test_lexicon.rb