RubyGems - perseus_match - Versions diffs - 0.0.6 → 0.0.7 - Mend

perseus_match 0.0.6 → 0.0.7

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (12)

data/README +1 -1
data/Rakefile +1 -1
data/bin/perseus_match +22 -8
data/lib/perseus_match/core_ext.rb +14 -0
data/lib/perseus_match/token.rb +43 -0
data/lib/perseus_match/token_set.rb +171 -116
data/lib/perseus_match/version.rb +1 -1
data/lib/perseus_match.rb +39 -13
data/spec/perseus_match/token_set_spec.rb +80 -56
data/spec/perseus_match/token_spec.rb +23 -0
data/spec/perseus_match_spec.rb +3 -8
metadata +15 -2

data/README CHANGED Viewed

@@ -2,7 +2,7 @@
 == VERSION
-This documentation refers to perseus_match version 0.0.6
+This documentation refers to perseus_match version 0.0.7
 == DESCRIPTION

data/Rakefile CHANGED Viewed

@@ -14,7 +14,7 @@ begin
       :summary      => %q{Fuzzy string matching based on linguistic analysis},
       :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
       :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
-      :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
+      :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0'], ['unicode', '>= 0.1.1']]
     }
   }}
 rescue LoadError

data/bin/perseus_match CHANGED Viewed

@@ -130,7 +130,7 @@ end
 unknowns = Set.new if options[:unknowns]
-PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
+PerseusMatch.tokenize(file, unknowns || !options[:silent])
 if unknowns
   File.open(options[:unknowns], 'w') { |f|
@@ -154,7 +154,12 @@ list_options = { :minimal => options[:minimal] }
 threshold, count, count_all = options[:threshold], 0, 0
 action = if options[:check]
-  require 'fastercsv'
+  require 'csv'
+  if CSV.const_defined?(:Reader)
+    require 'fastercsv'
+    CSV = FasterCSV
+  end
   format = if options[:align]
     require 'jcode'
@@ -184,23 +189,23 @@ action = if options[:check]
     positives = negatives = false_positives = false_negatives = 0.0
     phrases.each { |line|
-      phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
+      phrase, target, threshold, operator, _ = *CSV.parse_line(line)
       threshold ||= global_threshold
       operator  ||= '>'
       assign      = operator =~ />/ || operator == '=='
       begin
-        PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
+        res = PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
         count += 1
         assign ? positives += 1 : negatives += 1
-        puts format[line, 'OK'] unless adjust_coeff || failed_only
+        puts format[line, "OK -- #{res.value} (#{res.pm.distance})"] unless adjust_coeff || failed_only
       rescue PerseusMatch::CheckFailedError => err
         assign ? false_negatives += 1 : false_positives += 1
-        puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
+        puts format[line, "FAILED -- #{err.value} (#{err.pm.distance})"] unless adjust_coeff
       end
       count_all += 1
@@ -222,8 +227,17 @@ action = if options[:check]
       precision = divide[positives, positives + false_positives]
       f1        = divide[2 * recall * precision, recall + precision]
-      stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
-        recall * 100, precision * 100, f1, error
+      error_all = divide[  # trivial: assign all
+        negatives + false_positives,
+        positives + negatives + false_positives + false_negatives
+      ]
+      error_none = divide[  # trivial: assign none
+        positives + false_negatives,
+        positives + negatives + false_positives + false_negatives
+      ]
+      stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f (ALL = %0.4f, NONE = %0.4f)' % [
+        recall * 100, precision * 100, f1, error, error_all, error_none
       ]
       stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff

data/lib/perseus_match/core_ext.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require 'rubygems'
+require 'unicode'
+class String
+  def downcase
+    Unicode.downcase(self)
+  end
+  def downcase!
+    replace downcase
+  end
+end

data/lib/perseus_match/token.rb ADDED Viewed

@@ -0,0 +1,43 @@
+class PerseusMatch
+  class Token < String
+    WC_RE = %r{[/|]([^/|]*)\z}
+    ANY_WC = '*'.freeze
+    attr_reader :form, :wc
+    def initialize(form, wc = nil)
+      @form = form.sub(WC_RE, '')
+      @wc   = wc || $1
+      super(@form)
+    end
+    def match?(wcs)
+      wcs = [*wcs].compact
+      wcs.include?(wc) || wcs.include?(ANY_WC)
+    end
+    def unk?
+      wc == '?'
+    end
+    def ==(other)
+      other.is_a?(self.class) ? form == other.form : form == other
+    end
+    def eql?(other)
+      self == other && wc == other.wc
+    end
+    def inspect
+      "#{super}/#{wc}"
+    end
+    alias_method :to_s, :inspect
+  end
+end

data/lib/perseus_match/token_set.rb CHANGED Viewed

@@ -40,20 +40,32 @@ require 'nuggets/util/i18n'
 begin
   require 'text/soundex'
 rescue LoadError
-  warn "could not load the Text gem -- soundex functionality will not be available"
+  warn "Could not load the Text gem -- Soundex functionality will not be available"
 end
 LINGO_BASE = ENV['PM_LINGO_BASE'] || (
   File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
 )
-LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
-warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway" unless LINGO_FOUND
+if LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
+  begin
+    require File.join(LINGO_BASE, 'lib', 'const')
+  rescue LoadError
+  end
+else
+  warn "Lingo installation not found at #{LINGO_BASE} -- proceeding anyway"
+end
+unless Object.const_defined?(:PRINTABLE_CHAR)
+  PRINTABLE_CHAR = '[\w-]'
+end
+PRINTABLE_CHAR_RE = %r{(?:#{PRINTABLE_CHAR})+}
 lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
   YAML.load_file(file)
 else
-  warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
+  warn "Lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
   {
     'meeting' => {
@@ -78,30 +90,81 @@ class PerseusMatch
   class TokenSet < Array
-    def self.tokenize(form, unknowns = false)
-      return @tokens[form] if @tokens
+    class << self
-      @_tokens, @tokens = {}, Hash.new { |h, k|
-        h[k] = new(
-          k, (@_tokens[k] || []) | (
-            k.scan(/\w+/) + k.scan(/[\w-]+/)
-          ).map { |i| @_tokens[i] }.flatten.compact
-        )
-      }
+      def tokenize(form, unknowns = false)
+        form.downcase!
+        return @tokens[form] if @tokens ||= nil
+        @_tokens = Hash.new
+        @tokens  = Hash.new { |h, k| h[k] = new(k, @_tokens[k] || []) }
+        tokens_file = ENV['PM_TOKENS_FILE'] || 'perseus.tokens'
+        if File.readable?(tokens_file)
+          File.open(tokens_file) { |f| parse(f, unknowns, @_tokens) }
+          @tokens[form]
+        else
+          raise "Lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
+          cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
+            YAML.dump(LINGO_CONFIG, t)
+          }
+          file = file?(form) || begin
+            temp = Tempfile.open('perseus_match_temp') { |t| t.puts form }
+            temp.path
+          end
+          ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
+          if keep = ENV['PM_KEEP_TOKENS']
+            keep = File.expand_path(keep =~ /\A(?:1|y(?:es)?|true)\z/i ? tokens_file : keep)
+          end
+          begin
+            Dir.chdir(LINGO_BASE) {
+              tokens = %x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}
+              File.open(keep, 'w') { |f| f.puts tokens } if keep
+              parse(tokens, unknowns, @_tokens)
+            }
+          ensure
+            cfg.unlink
+            temp.unlink if temp
+          end
+          if temp
+            tokens, @tokens = @tokens[form], nil
+            tokens
+          end
+        end
+      end
+      def file?(form)
+        file = Pathname.new(form).absolute? ? form : File.expand_path(form)
+        file if File.file?(file) && File.readable?(file)
+      end
+      private
-      parse = lambda { |x|
-        x.each_line { |res|
+      def parse(output, unknowns = false, tokens = {})
+        sanitize = lambda { |a|
+          a.sub!(Token::WC_RE, '')
+          a.downcase!
+        }
+        output.each_line { |res|
           case res
             when /<(.*?)\s=\s\[(.*)\]>/
               a, b = $1, $2
-              a.sub!(/\|.*/, '')
+              sanitize[a]
-              @_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
+              tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten.map { |t| Token.new(t) }
             when /<(.*)>/, /:(.*):/
-              a, b = $1, $1.dup
-              a.sub!(/[\/|].*/, '')
+              a, b = $1, Token.new($1.downcase)
+              sanitize[a]
-              if unknowns && b =~ /\|\?\z/
+              if unknowns && b.unk?
                 if unknowns.respond_to?(:<<)
                   unknowns << a
                 else
@@ -109,134 +172,65 @@ class PerseusMatch
                 end
               end
-              @_tokens[a] ||= [b.replace_diacritics.downcase]
+              tokens[a] ||= [b]
           end
         }
-      }
-      if File.readable?(t = 'perseus.tokens')
-        File.open(t) { |f| parse[f] }
-        @tokens[form]
-      else
-        raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
-        cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
-          YAML.dump(LINGO_CONFIG, t)
-        }
-        file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
-        unless File.file?(file) && File.readable?(file)
-          temp = Tempfile.open('perseus_match_temp') { |t|
-            t.puts form
-          }
-          file = temp.path
-        end
-        ruby = Config::CONFIG.values_at('RUBY_INSTALL_NAME', 'EXEEXT').join
-        begin
-          Dir.chdir(LINGO_BASE) {
-            parse[%x{#{ruby} lingo.rb -c "#{cfg.path}" < "#{file}"}]
-          }
-        ensure
-          cfg.unlink
-          temp.unlink if temp
-        end
-        if temp
-          tokens, @tokens = @tokens[form], nil
-          tokens
-        else
-          @tokens[form]
-        end
+        tokens
       end
     end
     private :push, :<<, :[]=  # maybe more...
-    attr_reader :form
+    attr_reader :form, :tokens
     def initialize(form, tokens = nil)
       super(tokens || self.class.tokenize(form))
       @form   = form
-      @tokens = to_a.flatten
+      @tokens = to_a
     end
     def distance(other)
-      tokens1, tokens2 = tokens, other.tokens
-      size1, size2 = tokens1.size, tokens2.size
-      return size2 if tokens1.empty?
-      return size1 if tokens2.empty?
-      distance, costs = nil, (0..size2).to_a
-      0.upto(size1 - 1) { |index1|
-        token1, cost = tokens1[index1], index1 + 1
-        0.upto(size2 - 1) { |index2|
-          penalty = token1 == tokens2[index2] ? 0 : 1
-          # rcov hack :-(
-          _ = [
-            costs[index2 + 1] + 1,   # insertion
-            cost + 1,                # deletion
-            costs[index2] + penalty  # substitution
-          ]
-          distance = _.min
-          costs[index2], cost = cost, distance
-        }
-        costs[size2] = distance
-      }
-      distance + 1  # > 0 !?!
+      (forms | other.forms).size - (forms & other.forms).size
     end
-    def tokens(wc = true)
-      wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
-        token.sub(%r{[/|].*?\z}, '')
-      }
+    def forms
+      @forms ||= map { |token| token.form }
     end
     def disjoint?(other)
-      (tokens(false) & other.tokens(false)).empty?
+      (forms.flatten & other.forms.flatten).flatten.empty?
     end
     def inclexcl(inclexcl = {})
-      incl(inclexcl[:incl] || '.*').excl(inclexcl[:excl])
+      incl(inclexcl[:incl] || Token::ANY_WC).excl(inclexcl[:excl])
     end
-    def incl(*wc)
-      (@incl ||= {})[wc = [*wc].compact] ||= select { |token|
-        match?(token, wc)
-      }.to_token_set(form)
+    def incl(wcs)
+      self.class.new(form, select { |token| token.match?(wcs) })
     end
-    def excl(*wc)
-      (@excl ||= {})[wc = [*wc].compact] ||= reject { |token|
-        match?(token, wc)
-      }.to_token_set(form)
+    def excl(wcs)
+      self.class.new(form, reject { |token| token.match?(wcs) })
     end
     def soundex
-      raise "soundex functionality not available" unless defined?(Text::Soundex)
+      ensure_soundex!
-      @soundex ||= map { |token|
-        token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
-      }.to_token_set(form)
+      @soundex ||= self.class.new(form, map { |token|
+        form = token.form.replace_diacritics.sub(/\W+/, '')
+        Token.new(Text::Soundex.soundex(form) || '', token.wc)
+      })
     end
-    def soundex!
-      replace soundex
+    def ==(other)
+      tokens == other.tokens
     end
     def eql?(other)
-      tokens == other.tokens && form == other.form
+      self == other && form == other.form
     end
     def inspect
@@ -247,16 +241,77 @@ class PerseusMatch
     private
-    def match?(token, wc)
-      token =~ %r{[/|](?:#{wc.join('|')})\z}
+    def ensure_soundex!
+      unless defined?(Text::Soundex)
+        raise RuntimeError, "Soundex functionality not available", caller(1)
+      end
     end
   end
-  class ::Array
+  class PhraseTokenSet < TokenSet
+    class << self
+      def tokenize(form, unknowns = false)
+        (@tokens ||= {})[form] ||= new(form, form.scan(PRINTABLE_CHAR_RE).map { |i|
+          TokenSet.tokenize(i, unknowns)
+        })
+      end
+    end
+    alias_method :phrase, :form
+    alias_method :token_sets, :tokens
+    # (size1 - size2).abs <= distance <= [size1, size2].max
+    def distance(other)
+      token_sets1, token_sets2 = token_sets, other.token_sets
+      size1, size2 = token_sets1.size, token_sets2.size
+      return size2 if size1 == 0
+      return size1 if size2 == 0
+      distance, costs = nil, (0..size2).to_a
+      0.upto(size1 - 1) { |index1|
+        token_set1, cost = token_sets1[index1], index1 + 1
+        0.upto(size2 - 1) { |index2|
+          penalty = token_set1.distance(token_sets2[index2])
+          # rcov hack :-(
+          _ = [
+            costs[index2 + 1] + 1,   # insertion
+            cost + 1,                # deletion
+            costs[index2] + penalty  # substitution
+          ]
+          distance = _.min
-    def to_token_set(form)
-      TokenSet.new(form, self)
+          costs[index2], cost = cost, distance
+        }
+        costs[size2] = distance
+      }
+      distance
+    end
+    def forms
+      @forms ||= map { |token_set| token_set.forms }
+    end
+    def incl(wcs)
+      self.class.new(form, map { |token_set| token_set.incl(wcs) })
+    end
+    def excl(wcs)
+      self.class.new(form, map { |token_set| token_set.excl(wcs) })
+    end
+    def soundex
+      ensure_soundex!
+      @soundex ||= self.class.new(form, map { |token_set| token_set.soundex })
     end
   end

data/lib/perseus_match/version.rb CHANGED Viewed

@@ -4,7 +4,7 @@ class PerseusMatch
     MAJOR = 0
     MINOR = 0
-    TINY  = 6
+    TINY  = 7
     class << self

data/lib/perseus_match.rb CHANGED Viewed

@@ -26,8 +26,11 @@
 ###############################################################################
 #++
+require 'perseus_match/core_ext'
 require 'perseus_match/list'
 require 'perseus_match/cluster'
+require 'perseus_match/token'
 require 'perseus_match/token_set'
 require 'perseus_match/version'
@@ -36,7 +39,7 @@ class PerseusMatch
   Infinity = 1.0 / 0
-  DEFAULT_COEFF = 20
+  DEFAULT_COEFF = 2
   DISTANCE_SPEC = [                # {
     [{},                      1],  #   {}                      => 1,
@@ -68,8 +71,22 @@ class PerseusMatch
     end
     def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
-      value = new(phrase, target, pm_options).send(attribute)
-      value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
+      pm = new(phrase, target, pm_options)
+      value = pm.send(attribute)
+      if value.send(operator, threshold)
+        Struct.new(:pm, :value, :threshold, :operator).new(pm, value, threshold, operator)
+      else
+        raise CheckFailedError.new(pm, value, threshold, operator)
+      end
+    end
+    def tokenize(form, unknowns = false)
+      if file = TokenSet.file?(form)
+        TokenSet.tokenize(file, unknowns)
+      else
+        PhraseTokenSet.tokenize(form, unknowns)
+      end
     end
   end
@@ -77,8 +94,8 @@ class PerseusMatch
   attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
   def initialize(phrase, target, options = {})
-    @phrase = phrase.to_s
-    @target = target.to_s
+    @phrase = sanitize(phrase.to_s)
+    @target = sanitize(target.to_s)
     @default_coeff = options[:default_coeff] || DEFAULT_COEFF
     @distance_spec = options[:distance_spec] || DISTANCE_SPEC
@@ -89,11 +106,11 @@ class PerseusMatch
   end
   def phrase_tokens
-    @phrase_tokens ||= tokenize(phrase)
+    @phrase_tokens ||= self.class.tokenize(phrase)
   end
   def target_tokens
-    @target_tokens ||= tokenize(target)
+    @target_tokens ||= self.class.tokenize(target)
   end
   # 0 <= distance <= Infinity
@@ -104,13 +121,13 @@ class PerseusMatch
   # 1 >= similarity >= 0
   def similarity(coeff = nil)
     coeff ||= default_coeff  # passed arg may be nil
-    @similarity[coeff] ||= 1 / Math.exp(distance / (coeff * total_weight))
+    @similarity[coeff] ||= normalize_distance(coeff)
   end
   private
-  def tokenize(str)
-    TokenSet.new(str)
+  def sanitize(str)
+    str.gsub(/\s*\(.*?\)|\s*\[.*?\]/, '').sub(/\s*[\/:].*/, '')
   end
   def calculate_distance
@@ -148,16 +165,25 @@ class PerseusMatch
     distance
   end
+  def normalize_distance(coeff)
+    length = phrase_tokens.size + target_tokens.size
+    return 0 if length == 0
+    norm = Math.log(length ** Math.sqrt(2)) * coeff * total_weight * Math::E
+    1 / Math.exp(distance / norm)
+  end
   def total_weight
     @total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
   end
   class CheckFailedError < StandardError
-    attr_reader :value, :threshold, :operator
+    attr_reader :pm, :value, :threshold, :operator
-    def initialize(value, threshold, operator)
-      @value, @threshold, @operator = value, threshold, operator
+    def initialize(pm, value, threshold, operator)
+      @pm, @value, @threshold, @operator = pm, value, threshold, operator
     end
     def to_s

data/spec/perseus_match/token_set_spec.rb CHANGED Viewed

@@ -1,81 +1,105 @@
-describe PerseusMatch::TokenSet, ' with lingo' do
+describe PerseusMatch::PhraseTokenSet do
-  before :each do
-    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
-  end
+  describe 'with lingo' do
-  before :all do
-    @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
-  end
+    before :all do
+      @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
+      @original_phrase_tokens = PerseusMatch::PhraseTokenSet.instance_variable_get(:@tokens)
+    end
-  after :all do
-    PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
-  end
+    after :all do
+      PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
+      PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, @original_phrase_tokens)
+    end
-  it 'should tokenize a string' do
-    PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
-  end
+    before :each do
+      PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
+      PerseusMatch::PhraseTokenSet.instance_variable_set(:@tokens, nil)
+    end
-  it 'should report strictly equal TokenSets as ==' do
-    PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('foo bar')
-  end
+    it 'should tokenize a string' do
+      PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
+    end
-  it 'should report strictly equal TokenSets as eql' do
-    PerseusMatch::TokenSet.new('foo bar').should be_eql(PerseusMatch::TokenSet.new('foo bar'))
-  end
+    it 'should report strictly equal PhraseTokenSets as ==' do
+      PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('foo bar')
+    end
-  it 'should report slightly equal TokenSets as ==' do
-    PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('Foo Bar')
-  end
+    it 'should report strictly equal PhraseTokenSets as eql' do
+      PerseusMatch::PhraseTokenSet.new('foo bar').should be_eql(PerseusMatch::PhraseTokenSet.new('foo bar'))
+    end
-  it 'should *not* report slightly equal TokenSets as eql' do
-    PerseusMatch::TokenSet.new('foo bar').should_not be_eql(PerseusMatch::TokenSet.new('Foo Bar'))
-  end
+    it 'should report slightly equal PhraseTokenSets as ==' do
+      PerseusMatch::PhraseTokenSet.new('foo bar').should == PerseusMatch::PhraseTokenSet.new('Foo Bar')
+    end
-  it 'should include form in inspect' do
-    PerseusMatch::TokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
-  end
+    it 'should *not* report slightly equal PhraseTokenSets as eql' do
+      PerseusMatch::PhraseTokenSet.new('foo bar').should_not be_eql(PerseusMatch::PhraseTokenSet.new('Foo Bar'))
+    end
-end if LINGO_FOUND
+    it 'should collect unknown tokens' do
+      unknowns = []
+      PerseusMatch::PhraseTokenSet.tokenize('foo bar', unknowns)
+      unknowns.should == %w[foo]
+    end
-describe PerseusMatch::TokenSet, ' without lingo' do
+    it 'should include form in inspect' do
+      PerseusMatch::PhraseTokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
+    end
-  before :each do
-    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
-  end
+  end if LINGO_FOUND
-  before :all do
-    @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
-  end
+  describe 'without lingo' do
-  after :all do
-    PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
-  end
+    before :all do
+      @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
+    end
+    after :all do
+      PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
+    end
-  it 'should take a prepared file for tokenization' do
-    # prevent lingo from being used
-    lingo_base = LINGO_BASE.dup
-    LINGO_BASE.replace('')
+    before :each do
+      PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
+    end
-    temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
-      t.puts *%w[<foo|?> <bar|?>]
-    }
+    it 'should take a prepared file for tokenization' do
+      # prevent lingo from being used
+      lingo_base = LINGO_BASE.dup
+      LINGO_BASE.replace('')
-    path = temp.path
-    link = 'perseus.tokens'
+      temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
+        t.puts *%w[<foo|?> <bar|?>]
+      }
-    Dir.chdir(File.dirname(path)) {
-      File.symlink(path, link)
+      path = temp.path
+      link = 'perseus.tokens'
-      PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
+      Dir.chdir(File.dirname(path)) {
+        begin
+          File.symlink(path, link)
+          PerseusMatch::PhraseTokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::PhraseTokenSet)
+        ensure
+          File.unlink(link) if File.symlink?(link) && File.readlink(link) == path
+        end
+      }
+      temp.unlink
+      # reset lingo base
+      LINGO_BASE.replace(lingo_base)
+    end
+  end
-      File.unlink(link)
-    }
+  it 'should raise an error if asked for Soundex but is not available' do
+    soundex = Text.send(:remove_const, :Soundex)
-    temp.unlink
+    lambda {
+      PerseusMatch::PhraseTokenSet.new('foo bar').soundex
+    }.should raise_error(RuntimeError, /soundex/i)
-    # reset lingo base
-    LINGO_BASE.replace(lingo_base)
+    Text::Soundex = soundex
   end
 end

data/spec/perseus_match/token_spec.rb ADDED Viewed

@@ -0,0 +1,23 @@
+describe PerseusMatch::Token do
+  it 'should report strictly equal Tokens as ==' do
+    PerseusMatch::Token.new('foo', 'a').should == PerseusMatch::Token.new('foo', 'a')
+  end
+  it 'should report strictly equal Tokens as eql' do
+    PerseusMatch::Token.new('foo', 'a').should be_eql(PerseusMatch::Token.new('foo', 'a'))
+  end
+  it 'should report slightly equal Tokens as ==' do
+    PerseusMatch::Token.new('foo', 'a').should == PerseusMatch::Token.new('foo', 'b')
+  end
+  it 'should *not* report slightly equal Tokens as eql' do
+    PerseusMatch::Token.new('foo', 'a').should_not be_eql(PerseusMatch::Token.new('foo', 'b'))
+  end
+  it 'should include the word class in inspect' do
+    PerseusMatch::Token.new('foo', 'a').inspect.to_s.should =~ /\/a\z/
+  end
+end

data/spec/perseus_match_spec.rb CHANGED Viewed

@@ -37,7 +37,7 @@ describe PerseusMatch do
       t.puts *phrases
     }
-    PerseusMatch::TokenSet.tokenize(temp.path)
+    PerseusMatch.tokenize(temp.path)
     temp.unlink
@@ -158,13 +158,8 @@ describe PerseusMatch do
   it 'should be checkable (2)' do
     lambda {
-      begin
-        PerseusMatch.check!('foo', 'bar', 0, :>)
-      rescue PerseusMatch::CheckFailedError => err
-        err.to_s.should =~ /0/
-        raise err
-      end
-    }.should raise_error(PerseusMatch::CheckFailedError)
+      PerseusMatch.check!('foo', 'bar', 0, :>)
+    }.should raise_error(PerseusMatch::CheckFailedError, /0/)
   end
 end if LINGO_FOUND

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: perseus_match
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 0.0.7
 platform: ruby
 authors:
 - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-01-26 00:00:00 +01:00
+date: 2009-02-24 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -32,6 +32,16 @@ dependencies:
       - !ruby/object:Gem::Version
         version: 0.4.0
     version:
+- !ruby/object:Gem::Dependency
+  name: unicode
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.1
+    version:
 description: Fuzzy string matching based on linguistic analysis
 email: jens.wille@uni-koeln.de
 executables:
@@ -43,6 +53,8 @@ extra_rdoc_files:
 - ChangeLog
 - README
 files:
+- lib/perseus_match/token.rb
+- lib/perseus_match/core_ext.rb
 - lib/perseus_match/list.rb
 - lib/perseus_match/version.rb
 - lib/perseus_match/token_set.rb
@@ -56,6 +68,7 @@ files:
 - spec/spec_helper.rb
 - spec/perseus_match/list_spec.rb
 - spec/perseus_match/cluster_spec.rb
+- spec/perseus_match/token_spec.rb
 - spec/perseus_match/token_set_spec.rb
 - spec/perseus_match_spec.rb
 - sample/config.yaml