RubyGems - perseus_match - Versions diffs - 0.0.2 → 0.0.3 - Mend

perseus_match 0.0.2 → 0.0.3

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

data/LINGO_BASE +1 -0
data/README +8 -6
data/Rakefile +2 -2
data/bin/perseus_match +226 -22
data/lib/perseus_match/cluster.rb +9 -8
data/lib/perseus_match/list.rb +31 -9
data/lib/perseus_match/token_set.rb +105 -90
data/lib/perseus_match/version.rb +1 -1
data/lib/perseus_match.rb +67 -21
data/spec/perseus_match/cluster_spec.rb +45 -0
data/spec/perseus_match/list_spec.rb +16 -0
data/spec/perseus_match/token_set_spec.rb +65 -0
data/spec/perseus_match_spec.rb +168 -0
data/spec/spec_helper.rb +18 -0
metadata +28 -12

data/LINGO_BASE ADDED Viewed

	@@ -0,0 +1 @@
1	+ /home/jw/devel/lingo/trunk

data/README CHANGED Viewed

@@ -2,7 +2,7 @@
 == VERSION
-This documentation refers to perseus_match version 0.0.2
+This documentation refers to perseus_match version 0.0.3
 == DESCRIPTION
@@ -10,15 +10,17 @@ This documentation refers to perseus_match version 0.0.2
 Fuzzy string matching based on linguistic analysis.
-== AUTHORS
+== LINKS
-* Jens Wille <mailto:jens.wille@uni-koeln.de>
+<b></b>
+Documentation::     <http://prometheus.rubyforge.org/perseus_match>
+Source code::       <http://github.com/blackwinter/perseus_match>
+Rubyforge project:: <http://rubyforge.org/projects/prometheus>
-== LINKS
+== AUTHORS
-* <http://prometheus.rubyforge.org/perseus_match>
-* <http://github.com/blackwinter/perseus_match>
+* Jens Wille <mailto:jens.wille@uni-koeln.de>
 == LICENSE AND COPYRIGHT

data/Rakefile CHANGED Viewed

@@ -13,8 +13,8 @@ begin
       :version      => PerseusMatch::VERSION,
       :summary      => %q{Fuzzy string matching based on linguistic analysis},
       :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
-      :extra_files  => FileList['[A-Z]*'].to_a,
-      :dependencies => [['ruby-nuggets', '>= 0.3.0']]
+      :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
+      :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
     }
   }}
 rescue LoadError

data/bin/perseus_match CHANGED Viewed

@@ -2,8 +2,10 @@
 require 'optparse'
 require 'benchmark'
+require 'yaml'
 require 'rubygems'
+require 'nuggets/enumerable/minmax'
 require 'nuggets/numeric/duration'
 $: << File.join(File.dirname(__FILE__), '..', 'lib')
@@ -14,19 +16,29 @@ USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
 abort USAGE if ARGV.empty?
 options = {
-  :stats     => false,
-  :threshold => 0,
-  :sort      => false
+  :config       => nil,
+  :threshold    => 0,
+  :sort         => false,
+  :stats        => false,
+  :lingo        => false,
+  :minimal      => false,
+  :separate     => false,
+  :check        => false,
+  :failed_only  => false,
+  :align        => false,
+  :adjust_coeff => false
 }
 OptionParser.new { |opts|
   opts.banner = USAGE
-  opts.separator ''
+  opts.separator ' '
   opts.separator 'Options:'
-  opts.on('--stats', 'Output some statistics at the end') {
-    options[:stats] = true
+  opts.on('-c', '--config YAML', 'Config file in YAML format') { |f|
+    abort "Can't find config file: #{f}." unless File.readable?(f)
+    options[:config] = f
   }
   opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
@@ -37,7 +49,51 @@ OptionParser.new { |opts|
     options[:sort] = true
   }
-  opts.separator ''
+  opts.on('-S', '--stats', 'Output some statistics at the end') {
+    options[:stats] = true
+  }
+  opts.on('-v', '--verbose', 'Print additional information during processing') {
+    options[:verbose] = true
+  }
+  opts.separator ' '
+  opts.separator '  * Calculating similarities (default)'
+  opts.separator ' '
+  opts.on('-m', '--minimal', 'Produce minimal pairs only') {
+    options[:minimal] = true
+  }
+  opts.on('-p', '--separate [SEPARATOR]', 'Separate phrase blocks by SEPARATOR', '[Default: empty line]') { |p|
+    options[:separate] = p || ''
+  }
+  opts.on('-l', '--lingo', 'Output results in lingo dictionary format') {
+    options[:lingo] = true
+  }
+  opts.separator ' '
+  opts.separator '  * Checking pairs'
+  opts.separator ' '
+  opts.on('-C', '--check', 'Check pairs in input file (CSV) against', 'specified (or global) threshold') {
+    options[:check] = true
+  }
+  opts.on('-f', '--failed', 'Print only failed checks') {
+    options[:failed_only] = true
+  }
+  opts.on('-a', '--align', 'Align check results') {
+    options[:align] = true
+  }
+  opts.on('-A', '--adjust-coeff', 'Adjust coefficient to minimize the error') {
+    options[:adjust_coeff] = true
+  }
+  opts.separator ' '
   opts.separator 'Generic options:'
   opts.on('-h', '--help', 'Print this help message and exit') {
@@ -57,33 +113,179 @@ end
 PerseusMatch::TokenSet.tokenize(file)
-phrases = File.readlines(file).map { |line| line.chomp }
+skip_re = %r{\A\s*(?:#|\z)}o
+phrases = []
+File.open(file).each { |line|
+  phrases << line.chomp unless line =~ skip_re
+}.close
+pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
+pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
+list_options = { :minimal => options[:minimal] }
 threshold, count, count_all = options[:threshold], 0, 0
-time = Benchmark.realtime {
-  if options[:sort]
-    require 'pp'
+action = if options[:check]
+  require 'fastercsv'
-    pp PerseusMatch::Cluster.new(phrases).sort { |pm|
-      if pm.similarity >= threshold
-        [pm.target, pm.distance, pm.similarity]
-        count += 1
-      end
-      count_all += 1
-    }.compact
+  format = if options[:align]
+    require 'jcode'
+    width = phrases.max(:jlength) + 3
+    lambda { |line, res|
+      "#{line} #{'.' * (width - line.jlength)} [#{res}]"
+    }
   else
-    PerseusMatch::List.pair(phrases) { |pm|
-      if pm.similarity >= threshold
-        p [pm.phrase, pm.target, pm.distance, pm.similarity]
+    lambda { |line, res|
+      "#{line} [#{res}]"
+    }
+  end
+  phrases.sort! if options[:sort]
+  phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
+  global_threshold = options[:threshold]
+  failed_only      = options[:failed_only]
+  collect_stats    = options[:stats]
+  adjust_coeff     = options[:adjust_coeff]
+  _action = lambda { |*args|
+    pm_options[:default_coeff] = args.first unless args.empty?
+    count, count_all = 0, 0
+    positives = negatives = false_positives = false_negatives = 0.0
+    phrases.each { |line, spec|
+      phrase, target, threshold, operator, _ = *spec
+      threshold ||= global_threshold
+      operator  ||= '>'
+      assign      = operator =~ />/
+      begin
+        PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
         count += 1
+        assign ? positives += 1 : negatives += 1
+        puts format[line, 'OK'] unless adjust_coeff || failed_only
+      rescue PerseusMatch::CheckFailedError => err
+        assign ? false_negatives += 1 : false_positives += 1
+        puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
       end
       count_all += 1
     }
+    divide = lambda { |numerator, denominator|
+      denominator == 0 ? 0 : numerator / denominator
+    }
+    if collect_stats || adjust_coeff
+      error = divide[
+        false_positives + false_negatives,
+        positives + negatives + false_positives + false_negatives
+      ]
+    end
+    if collect_stats
+      recall    = divide[positives, positives + false_negatives]
+      precision = divide[positives, positives + false_positives]
+      f1        = divide[2 * recall * precision, recall + precision]
+      warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
+        recall * 100, precision * 100, f1, error
+      ]
+    end
+    error if adjust_coeff
+  }
+  if adjust_coeff
+    lambda {
+      step, max = 1, 100
+      start_coeff = pm_options[:default_coeff] || PerseusMatch::DEFAULT_COEFF
+      start_err   = _action[start_coeff]
+      previous_coeff = next_coeff = start_coeff
+      previous_err   = next_err   = start_err
+      max.times { break if (previous_err = _action[previous_coeff -= step]) != start_err }
+      max.times { break if (next_err     = _action[next_coeff     += step]) != start_err }
+      best_err = [start_err, previous_err, next_err].min
+      if best_err == start_err
+        best_coeff = start_coeff
+      else
+        if best_err == previous_err
+          max.times {
+            break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
+            previous_err, previous_coeff = current_err, current_coeff
+          }
+          best_err, best_coeff = previous_err, previous_coeff
+        else
+          max.times {
+            break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
+            next_err, next_coeff = current_err, current_coeff
+          }
+          best_err, best_coeff = next_err, next_coeff
+        end
+      end
+      puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
+    }
+  else
+    _action
   end
-}
+else
+  format =
+    options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
+    options[:sort]  ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
+                      lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
+  if options[:sort]
+    require 'pp'
+    lambda {
+      pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
+        if pm.similarity >= threshold
+          res = format[pm]
+          count += 1
+        end
+        count_all += 1
+        res
+      }.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
+    }
+  else
+    lambda {
+      separator, previous_phrase = options[:separate], nil
+      PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
+        if separator && pm.phrase != previous_phrase ||= pm.phrase
+          puts separator
+          previous_phrase = pm.phrase
+        end
+        if pm.similarity >= threshold
+          puts format[pm]
+          count += 1
+        end
+        count_all += 1
+      }
+    }
+  end
+end
 if options[:stats]
+  time = Benchmark.realtime(&action)
   hms, x, y = time.to_hms(2), time / count, time / count_all
   precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
@@ -91,4 +293,6 @@ if options[:stats]
   warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
     phrases.size, count, count_all, hms, x, y
   ]
+else
+  action.call
 end

data/lib/perseus_match/cluster.rb CHANGED Viewed

@@ -30,10 +30,10 @@ class PerseusMatch
   class Cluster < Hash
-    def initialize(phrases = [])
+    def initialize(phrases = [], pm_options = {}, list_options = {})
       super() { |h, k| h[k] = [] }
-      List.new(phrases).each { |pm| add(pm) }
+      List.pair(phrases, pm_options, list_options) { |pm| add(pm) }
     end
     def add(pm)
@@ -42,7 +42,7 @@ class PerseusMatch
     alias_method :<<, :add
-    def sort_by(attribute, *args, &block)
+    def sort_by(attribute, *args)
       options = args.last.is_a?(Hash) ? args.pop : {}
       _ = map { |phrase, matches|
@@ -63,15 +63,15 @@ class PerseusMatch
             lambda { |match| res[match] < threshold } :
             lambda { |match| res[match] > threshold }
-          matches.reject! { |match| condition[match] }
+          matches.reject!(&condition)
         end
         if limit = options[:limit]
-          matches.slice!(limit..-1)
+          matches.slice!(limit..-1) if matches.size > limit
         end
         # transform entries if so requested
-        matches.map!(&block) if block
+        matches.map! { |match| yield(match) } if block_given?
         [phrase, matches]
       }.sort
@@ -79,8 +79,9 @@ class PerseusMatch
       _  # rcov hack :-(
     end
-    def sort(options = {}, &block)
-      sort_by(:similarity, options.delete(:coeff), options, &block)
+    def sort(options = {})
+      args = [:similarity, options.delete(:coeff), options]
+      block_given? ? sort_by(*args) { |*a| yield(*a) } : sort_by(*args)
     end
     def rank(options = {})

data/lib/perseus_match/list.rb CHANGED Viewed

@@ -32,24 +32,46 @@ class PerseusMatch
     class << self
-      def pair(phrases)
+      def pair(phrases, pm_options = {}, list_options = {})
         phrases.uniq!
-        phrases.each { |phrase|
-          phrases.each { |target|
-            yield PerseusMatch.new(phrase, target)
+        pairs = [] unless block_given?
+        unless list_options[:minimal]
+          # => pairs.size = phrases.size ** 2
+          phrases.each { |phrase|
+            phrases.each { |target|
+              pm = PerseusMatch.new(phrase, target, pm_options)
+              block_given? ? yield(pm) : pairs << pm
+            }
+          }
+        else
+          # => pairs.size = (phrases.size ** 2 - phrases.size) / 2
+          size = phrases.size
+          1.upto(size) { |i|
+            phrase = phrases[i - 1]
+            i.upto(size - 1) { |j|
+              pm = PerseusMatch.new(phrase, phrases[j], pm_options)
+              block_given? ? yield(pm) : pairs << pm
+            }
           }
-        }
+        end
+        pairs || phrases
       end
     end
-    alias_method :add, :push
-    def initialize(phrases = [])
-      self.class.pair(phrases) { |pm| add(pm) }
+    def initialize(phrases = [], pm_options = {}, list_options = {})
+      self.class.pair(phrases, pm_options, list_options) { |pm| add(pm) }
     end
+    alias_method :add, :push
   end
 end

data/lib/perseus_match/token_set.rb CHANGED Viewed

@@ -28,44 +28,52 @@
 $KCODE = 'u'
-LINGO_BASE = '/home/jw/devel/lingo/trunk'
-LINGO_CONFIG = {
-  'meeting' => {
-    'attendees' => [
-      { 'textreader'   => { 'files'=> 'STDIN' } },
-      { 'tokenizer'    => {  } },
-      { 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
-      { 'decomposer'   => { 'source' => 'sys-dic' } },
-      { 'multiworder'  => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
-      { 'synonymer'    => { 'source' => 'sys-syn', 'out' => 'syn', 'skip'=>'?,t' } },
-      { 'debugger'     => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } }
-    ]
-  }
-}
-require 'tempfile'
+require 'pathname'
+require 'rbconfig'
 require 'yaml'
-# use enhanced Tempfile#make_tmpname, as of r13631
-if RUBY_RELEASE_DATE < '2007-10-05'
-  class Tempfile
-    def make_tmpname(basename, n)
-      case basename
-      when Array
-        prefix, suffix = *basename
-      else
-        prefix, suffix = basename, ''
-      end
+require 'rubygems'
+require 'backports/tempfile'
+require 'nuggets/tempfile/open'
+require 'nuggets/util/i18n'
-      t = Time.now.strftime("%Y%m%d")
-      path = "#{prefix}#{t}-#{$$}-#{rand(0x100000000).to_s(36)}-#{n}#{suffix}"
-    end
+begin
+  require 'text/soundex'
+rescue LoadError
+  warn "could not load the Text gem -- soundex functionality will not be available"
+end
-  end
+LINGO_BASE = ENV['PM_LINGO_BASE'] || (
+  File.readable?('LINGO_BASE') ? File.read('LINGO_BASE').chomp : '.'
+)
+LINGO_FOUND = File.readable?(File.join(LINGO_BASE, 'lingo.rb'))
+warn "lingo installation not found at #{LINGO_BASE} -- proceeding anyway" unless LINGO_FOUND
+lingo_config = if File.readable?(file = ENV['PM_LINGO_CONFIG'] || 'lingo.cfg')
+  YAML.load_file(file)
+else
+  warn "lingo config not found at #{ENV['PM_LINGO_CONFIG']} -- using default" if ENV.has_key?('PM_LINGO_CONFIG')
+  {
+    'meeting' => {
+      'attendees' => [
+        { 'tokenizer'    => {  } },
+        { 'wordsearcher' => { 'source' => 'sys-dic', 'mode' => 'first' } },
+        { 'decomposer'   => { 'source' => 'sys-dic' } },
+        { 'multiworder'  => { 'source' => 'sys-mul', 'stopper' => 'PUNC,OTHR' } },
+        { 'synonymer'    => { 'source' => 'sys-syn', 'skip' => '?,t' } },
+      ]
+    }
+  }
 end
+lingo_config['meeting']['attendees'].
+  unshift({ 'textreader' => { 'files'=> 'STDIN' } }).
+  push({ 'debugger' => { 'prompt' => '', 'eval' => 'true', 'ceval' => 'false' } })
+LINGO_CONFIG = lingo_config
 class PerseusMatch
   class TokenSet < Array
@@ -73,10 +81,8 @@ class PerseusMatch
     def self.tokenize(form)
       return @tokens[form] if @tokens
-      @_tokens = {}
-      @tokens  = Hash.new { |h, k| h[k] = new(
-        k, @_tokens.has_key?(k) ? @_tokens[k] :
-          k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
+      @_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
+        k, (@_tokens[k] || []) | k.scan(/\w+/).map { |i| @_tokens[i] }.flatten.compact
       )}
       parse = lambda { |x|
@@ -85,8 +91,11 @@ class PerseusMatch
             when /<(.*?)\s=\s\[(.*)\]>/
               a, b = $1, $2
               @_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
-            #when /<(.*)>/, /:(.*):/
-            #  # ignore
+            when /<(.*)>/, /:(.*):/
+              a, b = $1, $1.dup
+              @_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
+              warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
           end
         }
       }
@@ -95,29 +104,32 @@ class PerseusMatch
         File.open(t) { |f| parse[f] }
         @tokens[form]
       else
-        cfg = Tempfile.new(['perseus_match_lingo', '.cfg'])
-        YAML.dump(LINGO_CONFIG, cfg)
-        cfg.close
+        raise "lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND
+        cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
+          YAML.dump(LINGO_CONFIG, t)
+        }
-        file = form[0] == ?/ ? form : File.join(Dir.pwd, form)
+        file = Pathname.new(form).absolute? ? form : File.join(Dir.pwd, form)
         unless File.file?(file) && File.readable?(file)
-          temp = Tempfile.new('perseus_match_temp')
-          temp.puts form
-          temp.close
+          temp = Tempfile.open('perseus_match_temp') { |t|
+            t.puts form
+          }
           file = temp.path
         end
-        Dir.chdir(LINGO_BASE) { parse[%x{
-          ./lingo.rb -c #{cfg.path} < #{file}
-        }] }
-        cfg.unlink
+        begin
+          Dir.chdir(LINGO_BASE) { parse[%x{
+            #{Config::CONFIG['ruby_install_name']} lingo.rb -c "#{cfg.path}" < "#{file}"
+          }] }
+        ensure
+          cfg.unlink
+          temp.unlink if temp
+        end
         if temp
-          temp.unlink
           tokens, @tokens = @tokens[form], nil
           tokens
         else
@@ -138,42 +150,40 @@ class PerseusMatch
     end
     def distance(other)
-      distance, index, max = xor(other).size, -1, size
+      tokens1, tokens2 = tokens, other.tokens
+      size1, size2 = tokens1.size, tokens2.size
-      intersect(other).each { |token|
-        while current = other.tokens[index += 1] and current != token
-          distance += 1
+      return size2 if tokens1.empty?
+      return size1 if tokens2.empty?
-          break if index > max
-        end
-      }
+      distance, costs = nil, (0..size2).to_a
-      distance
-    end
+      0.upto(size1 - 1) { |index1|
+        token1, cost = tokens1[index1], index1 + 1
-    def tokens(wc = true)
-      wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
-        token.sub(%r{[/|].*?\z}, '')
-      }
-    end
+        0.upto(size2 - 1) { |index2|
+          penalty = token1 == tokens2[index2] ? 0 : 1
-    def &(other)
-      tokens & other.tokens
-    end
+          # rcov hack :-(
+          _ = [
+            costs[index2 + 1] + 1,   # insertion
+            cost + 1,                # deletion
+            costs[index2] + penalty  # substitution
+          ]
+          distance = _.min
-    def |(other)
-      tokens | other.tokens
-    end
+          costs[index2], cost = cost, distance
+        }
-    def intersect(other)
-      (self & other).inject([]) { |memo, token|
-        memo + [token] * [count(token), other.count(token)].max
+        costs[size2] = distance
       }
+      distance + 1  # > 0 !?!
     end
-    def xor(other)
-      ((self | other) - (self & other)).inject([]) { |memo, token|
-        memo + [token] * (count(token) + other.count(token))
+    def tokens(wc = true)
+      wc ? @tokens : @tokens_sans_wc ||= @tokens.map { |token|
+        token.sub(%r{[/|].*?\z}, '')
       }
     end
@@ -186,26 +196,31 @@ class PerseusMatch
     end
     def incl(*wc)
-      (@incl ||= {})[wc = [*wc].compact] ||= map { |tokens|
-        tokens.reject { |token| !match?(token, wc) }
+      (@incl ||= {})[wc = [*wc].compact] ||= select { |token|
+        match?(token, wc)
       }.to_token_set(form)
     end
     def excl(*wc)
-      (@excl ||= {})[wc = [*wc].compact] ||= map { |tokens|
-        tokens.reject { |token| match?(token, wc) }
+      (@excl ||= {})[wc = [*wc].compact] ||= reject { |token|
+        match?(token, wc)
       }.to_token_set(form)
     end
-    def count(token)
-      counts[token]
+    def soundex
+      raise "soundex functionality not available" unless defined?(Text::Soundex)
+      @soundex ||= map { |token|
+        token.sub(/(.*)(?=[\/|])/) { |m| Text::Soundex.soundex(m.replace_diacritics) }
+      }.to_token_set(form)
     end
-    def counts
-      @counts ||= tokens.inject(Hash.new(0)) { |counts, token|
-        counts[token] += 1
-        counts
-      }
+    def soundex!
+      replace soundex
+    end
+    def eql?(other)
+      tokens == other.tokens && form == other.form
     end
     def inspect

data/lib/perseus_match/version.rb CHANGED Viewed

@@ -4,7 +4,7 @@ class PerseusMatch
     MAJOR = 0
     MINOR = 0
-    TINY  = 2
+    TINY  = 3
     class << self

data/lib/perseus_match.rb CHANGED Viewed

@@ -38,35 +38,53 @@ class PerseusMatch
   DEFAULT_COEFF = 20
-  DISTANCE_SPEC = {
-    {}                   => 1,
-    { :excl => %w[a t] } => 1,
-    { :incl => 's'     } => 2,
-    { :incl => 'y'     } => 4,
-    { :sort => true    } => 4
-  }
+  DISTANCE_SPEC = [                # {
+    [{},                      1],  #   {}                      => 1,
+    [{ :excl    => %w[a t] }, 2],  #   { :excl    => %w[a t] } => 1,
+    [{ :incl    => 's'     }, 3],  #   { :incl    => 's'     } => 2,
+    [{ :incl    => 'y'     }, 4],  #   { :incl    => 'y'     } => 4,
+    [{ :sort    => true    }, 4],  #   { :sort    => true    } => 4,
+    [{ :soundex => true    }, 4]   #   { :soundex => true    } => 8
+  ]                                # }
   class << self
-    def match(phrases)
-      List.new(phrases)
+    def distance(*args)
+      new(*args).distance
     end
-    def cluster(phrases, options = {})
-      Cluster.new(phrases).rank(options)
+    def match(phrases, pm_options = {})
+      List.new(phrases, pm_options)
+    end
+    def cluster(phrases, options = {}, pm_options = {})
+      Cluster.new(phrases, pm_options).rank(options)
+    end
+    def check(*args)
+      check!(*args)
+    rescue CheckFailedError
+      false
+    end
+    def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
+      value = new(phrase, target, pm_options).send(attribute)
+      value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
     end
   end
-  attr_reader :phrase, :target, :distance_spec, :default_coeff
+  attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
   def initialize(phrase, target, options = {})
-    @phrase = phrase
-    @target = target
+    @phrase = phrase.to_s
+    @target = target.to_s
     @default_coeff = options[:default_coeff] || DEFAULT_COEFF
     @distance_spec = options[:distance_spec] || DISTANCE_SPEC
+    @verbose = options[:verbose]
     @similarity = {}
   end
@@ -97,7 +115,7 @@ class PerseusMatch
   def calculate_distance
     return Infinity if phrase_tokens.disjoint?(target_tokens)
-    return 0        if phrase_tokens == target_tokens
+    return 0        if phrase_tokens.eql?(target_tokens)
     distance_spec.inject(0) { |distance, (options, weight)|
       distance + token_distance(options) * weight
@@ -105,19 +123,47 @@ class PerseusMatch
   end
   def token_distance(options = {})
-    phrase_tokens = self.phrase_tokens.inclexcl(options)
-    target_tokens = self.target_tokens.inclexcl(options)
+    tokens1 = phrase_tokens.inclexcl(options)
+    tokens2 = target_tokens.inclexcl(options)
     if options[:sort]
-      phrase_tokens.sort!
-      target_tokens.sort!
+      tokens1 = tokens1.sort
+      tokens2 = tokens2.sort
     end
-    (phrase_tokens.distance(target_tokens) + target_tokens.distance(phrase_tokens)) / 2.0
+    if options[:soundex]
+      tokens1 = tokens1.soundex
+      tokens2 = tokens2.soundex
+    end
+    distance = tokens1.distance(tokens2)
+    warn <<-EOT if verbose
+#{options.inspect}:
+  #{tokens1.inspect}
+  #{tokens2.inspect}
+=> #{distance}
+    EOT
+    distance
   end
   def total_weight
-    distance_spec.values.inject(0.0) { |total, weight| total + weight }
+    @total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
+  end
+  class CheckFailedError < StandardError
+    attr_reader :value, :threshold, :operator
+    def initialize(value, threshold, operator)
+      @value, @threshold, @operator = value, threshold, operator
+    end
+    def to_s
+      "FAILED: #{value} #{operator} #{threshold}"
+    end
   end
 end

data/spec/perseus_match/cluster_spec.rb ADDED Viewed

@@ -0,0 +1,45 @@
+describe PerseusMatch::Cluster do
+  it 'should accept limit option in sort_by' do
+    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :limit => 1).all? { |phrase, matches|
+      matches.size.should == 1
+      matches.size.should == matches.nitems
+    }
+  end
+  it 'should accept threshold option in sort_by (1a)' do
+    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
+      matches.size.should be_zero
+      matches.size.should == matches.nitems
+    }
+  end
+  it 'should accept threshold option in sort_by (1b)' do
+    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0).all? { |phrase, matches|
+      matches.size.should == 2
+      matches.size.should == matches.nitems
+    }
+  end
+  it 'should accept threshold option in sort_by (2)' do
+    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'c').all? { |phrase, matches|
+      matches.size.should == 1
+      matches.size.should == matches.nitems
+    }
+  end
+  it 'should accept both limit and threshold options in sort_by (1)' do
+    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'z', :limit => 1).all? { |phrase, matches|
+      matches.size.should == 1
+      matches.size.should == matches.nitems
+    }
+  end
+  it 'should accept both limit and threshold options in sort_by (2)' do
+    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:target, :threshold => 'a', :limit => 1).all? { |phrase, matches|
+      matches.size.should be_zero
+      matches.size.should == matches.nitems
+    }
+  end
+end if LINGO_FOUND

data/spec/perseus_match/list_spec.rb ADDED Viewed

@@ -0,0 +1,16 @@
+describe PerseusMatch::List, '::pair' do
+  before :all do
+    @phrases = %w[foo bar baz]
+    @size = @phrases.size
+  end
+  it 'should produce full list of pairs with correct size' do
+    PerseusMatch::List.pair(@phrases).size.should == @size ** 2
+  end
+  it 'should produce minimal list of pairs with correct size' do
+    PerseusMatch::List.pair(@phrases, {}, :minimal => true).size.should == (@size ** 2 - @size) / 2
+  end
+end

data/spec/perseus_match/token_set_spec.rb ADDED Viewed

@@ -0,0 +1,65 @@
+describe PerseusMatch::TokenSet, ' with lingo' do
+  before :each do
+    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
+  end
+  it 'should tokenize a string' do
+    PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
+  end
+  it 'should report strictly equal TokenSets as ==' do
+    PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('foo bar')
+  end
+  it 'should report strictly equal TokenSets as eql' do
+    PerseusMatch::TokenSet.new('foo bar').should be_eql(PerseusMatch::TokenSet.new('foo bar'))
+  end
+  it 'should report slightly equal TokenSets as ==' do
+    PerseusMatch::TokenSet.new('foo bar').should == PerseusMatch::TokenSet.new('Foo Bar')
+  end
+  it 'should *not* report slightly equal TokenSets as eql' do
+    PerseusMatch::TokenSet.new('foo bar').should_not be_eql(PerseusMatch::TokenSet.new('Foo Bar'))
+  end
+  it 'should include form in inspect' do
+    PerseusMatch::TokenSet.new('foo', []).inspect.to_s.should =~ /<foo>/
+  end
+end if LINGO_FOUND
+describe PerseusMatch::TokenSet, ' without lingo' do
+  before :each do
+    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
+  end
+  it 'should take a prepared file for tokenization' do
+    # prevent lingo from being used
+    lingo_base = LINGO_BASE.dup
+    LINGO_BASE.replace('')
+    temp = Tempfile.open('perseus_match_spec_tokens_temp') { |t|
+      t.puts *%w[<foo|?> <bar|?>]
+    }
+    path = temp.path
+    link = 'perseus.tokens'
+    Dir.chdir(File.dirname(path)) {
+      File.symlink(path, link)
+      PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
+      File.unlink(link)
+    }
+    temp.unlink
+    # reset lingo base
+    LINGO_BASE.replace(lingo_base)
+  end
+end

data/spec/perseus_match_spec.rb ADDED Viewed

@@ -0,0 +1,168 @@
+require 'rubygems'
+require 'nuggets/tempfile/open'
+require 'nuggets/util/i18n'
+describe PerseusMatch do
+  before :all do
+    @highly_similar = [
+      'Anbetung der Könige',
+      'Die Anbetung der Könige'
+    ]  # ok
+    @similar = [
+      # @highly_similar + ...
+      'Die Anbetung der Heiligen Drei Könige',
+      'dIE AnBeTuNg der heILIGen dREI KÖniGE'
+    ]  # ok
+    @unfortunately_similar = [
+      # @similar + ...
+      'Die Die Die Anbetung der Könige',
+      'Die Könige der Anbetung',
+      'Königsanbetung hoch drei'
+    ]  # *not* ok -- eventually try to drop these below the threshold
+    @somewhat_similar = @highly_similar + @similar + @unfortunately_similar
+    phrases = @somewhat_similar + [
+      'Drei mal drei macht sechs',
+      'Das Ende dieses Blödsinns',
+      ''
+    ]
+    temp = Tempfile.open('perseus_match_spec_temp') { |t|
+      t.puts *phrases
+    }
+    PerseusMatch::TokenSet.tokenize(temp.path)
+    temp.unlink
+    @matchings = PerseusMatch.match(phrases)
+  end
+  it 'should identify identical (non-empty) strings as identical' do
+    @matchings.each { |matching|
+      if !matching.phrase.empty? && matching.phrase == matching.target
+        inform_on_error(matching) { matching.similarity.should == 1.0 }
+      end
+    }
+  end
+  it 'should identify case-insensitively identical (non-empty) strings as nearly identical' do
+    @matchings.each { |matching|
+      if !matching.phrase.empty? && matching.phrase.replace_diacritics.downcase == matching.target.replace_diacritics.downcase
+        inform_on_error(matching) { matching.similarity.should > 0.95 }
+      end
+    }
+  end
+  it 'should identify *only* case-insensitively identical (non-empty) strings as nearly identical' do
+    @matchings.each { |matching|
+      if !matching.phrase.empty? && matching.phrase.replace_diacritics.downcase != matching.target.replace_diacritics.downcase
+        inform_on_error(matching) { matching.similarity.should < 0.98 }
+      end
+    }
+  end
+  it 'should identify disjunct (non-empty) strings as disjunct' do
+    @matchings.each { |matching|
+      if !matching.phrase.empty? && matching.phrase_tokens.disjoint?(matching.target_tokens)
+        inform_on_error(matching) { matching.similarity.should == 0.0 }
+      end
+    }
+  end
+  it 'should identify empty string as disjunct with anything, even with itself' do
+    @matchings.each { |matching|
+      if matching.phrase.empty? || matching.target.empty?
+        inform_on_error(matching) { matching.similarity.should == 0.0 }
+      end
+    }
+  end
+  it 'should identify certain strings as highly similar (1)' do
+    @matchings.each { |matching|
+      if @highly_similar.include?(matching.phrase) && @highly_similar.include?(matching.target)
+        inform_on_error(matching) { matching.similarity.should > 0.9 }
+      end
+    }
+  end
+  it 'should identify certain strings as highly similar (2)' do
+    @highly_similar.each { |phrase|
+      @highly_similar.each { |target|
+        inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.9).should be_true }
+      }
+    }
+  end
+  it 'should identify certain strings as similar (1)' do
+    @matchings.each { |matching|
+      if @similar.include?(matching.phrase) && @similar.include?(matching.target)
+        inform_on_error(matching) { matching.similarity.should > 0.8 }
+      end
+    }
+  end
+  it 'should identify certain strings as similar (2)' do
+    @similar.each { |phrase|
+      @similar.each { |target|
+        inform_on_error([phrase, target]) { PerseusMatch.check(phrase, target, 0.8).should be_true }
+      }
+    }
+  end
+  it 'should *not* identify other strings as similar (1)' do
+    @matchings.each { |matching|
+      if @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)
+        inform_on_error(matching) { matching.similarity.should_not > 0.8 }
+      end
+    }
+  end
+  it 'should *not* identify other strings as similar (2)' do
+    @matchings.each { |matching|
+      if @somewhat_similar.include?(matching.phrase) && !@somewhat_similar.include?(matching.target)
+        inform_on_error(matching) { PerseusMatch.check(matching.phrase, matching.target, 0.8).should be_false }
+      end
+    }
+  end
+  it 'should be symmetrical' do
+    similarities = {}
+    @matchings.each { |matching|
+      if similarity = similarities[[matching.target, matching.phrase]]
+        inform_on_error(matching) { similarity.should == matching.similarity }
+      else
+        similarities[[matching.phrase, matching.target]] = matching.similarity
+      end
+    }
+  end
+  it 'should calculate pair distance' do
+    PerseusMatch.distance('foo', 'bar').class.should < Numeric
+  end
+  it 'should be clusterable' do
+    PerseusMatch.cluster(@somewhat_similar).should be_an_instance_of(Array)
+  end
+  it 'should be checkable (1)' do
+    PerseusMatch.check('foo', 'bar', 0, :>=).should be_true
+  end
+  it 'should be checkable (2)' do
+    lambda {
+      begin
+        PerseusMatch.check!('foo', 'bar', 0, :>)
+      rescue PerseusMatch::CheckFailedError => err
+        err.to_s.should =~ /0/
+        raise err
+      end
+    }.should raise_error(PerseusMatch::CheckFailedError)
+  end
+end if LINGO_FOUND

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,18 @@
+unless Object.const_defined?(:PerseusMatch)
+  $: << File.join(File.dirname(__FILE__), '..', 'lib')
+  require 'perseus_match'
+end
+def inform_on_error(*args)
+  begin
+    yield
+  rescue Spec::Expectations::ExpectationNotMetError => err
+    unless args.empty?
+      puts
+      p *args
+      puts
+    end
+    raise
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: perseus_match
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Jens Wille
@@ -9,9 +9,19 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-08-15 00:00:00 +02:00
+date: 2008-12-09 00:00:00 +01:00
 default_executable:
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: ruby-backports
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
 - !ruby/object:Gem::Dependency
   name: ruby-nuggets
   type: :runtime
@@ -20,7 +30,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.3.0
+        version: 0.4.0
     version:
 description: Fuzzy string matching based on linguistic analysis
 email: jens.wille@uni-koeln.de
@@ -33,29 +43,35 @@ extra_rdoc_files:
 - ChangeLog
 - README
 files:
-- lib/perseus_match.rb
+- lib/perseus_match/list.rb
 - lib/perseus_match/version.rb
 - lib/perseus_match/token_set.rb
-- lib/perseus_match/list.rb
 - lib/perseus_match/cluster.rb
+- lib/perseus_match.rb
 - bin/perseus_match
+- Rakefile
 - COPYING
-- README
 - ChangeLog
-- Rakefile
+- LINGO_BASE
+- README
+- spec/spec_helper.rb
+- spec/perseus_match/list_spec.rb
+- spec/perseus_match/cluster_spec.rb
+- spec/perseus_match/token_set_spec.rb
+- spec/perseus_match_spec.rb
 has_rdoc: true
 homepage: http://prometheus.rubyforge.org/perseus_match
 post_install_message:
 rdoc_options:
-- --charset
-- UTF-8
+- --line-numbers
+- --inline-source
 - --title
 - perseus_match Application documentation
 - --main
 - README
+- --charset
+- UTF-8
 - --all
-- --line-numbers
-- --inline-source
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
@@ -73,7 +89,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: prometheus
-rubygems_version: 1.2.0
+rubygems_version: 1.3.1
 signing_key:
 specification_version: 2
 summary: Fuzzy string matching based on linguistic analysis