RubyGems - blackwinter-perseus_match - Versions diffs - 0.0.3 - Mend

blackwinter-perseus_match 0.0.3

Files changed (16) hide show

data/COPYING +676 -0
data/ChangeLog +5 -0
data/README +41 -0
data/Rakefile +24 -0
data/bin/perseus_match +298 -0
data/lib/perseus_match.rb +169 -0
data/lib/perseus_match/cluster.rb +94 -0
data/lib/perseus_match/list.rb +77 -0
data/lib/perseus_match/token_set.rb +248 -0
data/lib/perseus_match/version.rb +27 -0
data/spec/perseus_match/cluster_spec.rb +45 -0
data/spec/perseus_match/list_spec.rb +16 -0
data/spec/perseus_match/token_set_spec.rb +65 -0
data/spec/perseus_match_spec.rb +168 -0
data/spec/spec_helper.rb +18 -0
metadata +95 -0

data/ChangeLog ADDED

@@ -0,0 +1,5 @@
+= Revision history for perseus_match
+== 0.0.1 [2008-08-11]
+* Birthday :-)

data/README ADDED

@@ -0,0 +1,41 @@
+= perseus_match - Fuzzy string matching based on linguistic analysis
+== VERSION
+This documentation refers to perseus_match version 0.0.3
+== DESCRIPTION
+Fuzzy string matching based on linguistic analysis.
+== LINKS
+<b></b>
+Documentation::     <http://prometheus.rubyforge.org/perseus_match>
+Source code::       <http://github.com/blackwinter/perseus_match>
+Rubyforge project:: <http://rubyforge.org/projects/prometheus>
+== AUTHORS
+* Jens Wille <mailto:jens.wille@uni-koeln.de>
+== LICENSE AND COPYRIGHT
+Copyright (C) 2008 Cologne University of Applied Sciences,
+Claudiusstr. 1, 50678 Cologne, Germany
+perseus_match is free software: you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, either version 3 of the License, or (at your option) any later
+version.
+perseus_match is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+You should have received a copy of the GNU General Public License along with
+perseus_match. If not, see <http://www.gnu.org/licenses/>.

data/Rakefile ADDED

@@ -0,0 +1,24 @@
+require %q{lib/perseus_match/version}
+begin
+  require 'hen'
+  Hen.lay! {{
+    :rubyforge => {
+      :project => %q{prometheus},
+      :package => %q{perseus_match}
+    },
+    :gem => {
+      :version      => PerseusMatch::VERSION,
+      :summary      => %q{Fuzzy string matching based on linguistic analysis},
+      :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
+      :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
+      :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
+    }
+  }}
+rescue LoadError
+  abort "Please install the 'hen' gem first."
+end
+### Place your custom Rake tasks here.

data/bin/perseus_match ADDED

@@ -0,0 +1,298 @@
+#! /usr/bin/ruby
+require 'optparse'
+require 'benchmark'
+require 'yaml'
+require 'rubygems'
+require 'nuggets/enumerable/minmax'
+require 'nuggets/numeric/duration'
+$: << File.join(File.dirname(__FILE__), '..', 'lib')
+require 'perseus_match'
+USAGE = "Usage: #{$0} [-h|--help] [options] <file>"
+abort USAGE if ARGV.empty?
+options = {
+  :config       => nil,
+  :threshold    => 0,
+  :sort         => false,
+  :stats        => false,
+  :lingo        => false,
+  :minimal      => false,
+  :separate     => false,
+  :check        => false,
+  :failed_only  => false,
+  :align        => false,
+  :adjust_coeff => false
+}
+OptionParser.new { |opts|
+  opts.banner = USAGE
+  opts.separator ' '
+  opts.separator 'Options:'
+  opts.on('-c', '--config YAML', 'Config file in YAML format') { |f|
+    abort "Can't find config file: #{f}." unless File.readable?(f)
+    options[:config] = f
+  }
+  opts.on('-t', '--threshold NUM', Float, "Similarity threshold [Default: #{options[:threshold]}]") { |t|
+    options[:threshold] = t
+  }
+  opts.on('-s', '--sort', 'Sort results (considerably slower!)') {
+    options[:sort] = true
+  }
+  opts.on('-S', '--stats', 'Output some statistics at the end') {
+    options[:stats] = true
+  }
+  opts.on('-v', '--verbose', 'Print additional information during processing') {
+    options[:verbose] = true
+  }
+  opts.separator ' '
+  opts.separator '  * Calculating similarities (default)'
+  opts.separator ' '
+  opts.on('-m', '--minimal', 'Produce minimal pairs only') {
+    options[:minimal] = true
+  }
+  opts.on('-p', '--separate [SEPARATOR]', 'Separate phrase blocks by SEPARATOR', '[Default: empty line]') { |p|
+    options[:separate] = p || ''
+  }
+  opts.on('-l', '--lingo', 'Output results in lingo dictionary format') {
+    options[:lingo] = true
+  }
+  opts.separator ' '
+  opts.separator '  * Checking pairs'
+  opts.separator ' '
+  opts.on('-C', '--check', 'Check pairs in input file (CSV) against', 'specified (or global) threshold') {
+    options[:check] = true
+  }
+  opts.on('-f', '--failed', 'Print only failed checks') {
+    options[:failed_only] = true
+  }
+  opts.on('-a', '--align', 'Align check results') {
+    options[:align] = true
+  }
+  opts.on('-A', '--adjust-coeff', 'Adjust coefficient to minimize the error') {
+    options[:adjust_coeff] = true
+  }
+  opts.separator ' '
+  opts.separator 'Generic options:'
+  opts.on('-h', '--help', 'Print this help message and exit') {
+    abort opts.to_s
+  }
+  opts.on('--version', 'Print program version and exit') {
+    abort "#{File.basename($0)} v#{PerseusMatch::VERSION}"
+  }
+}.parse!
+unless file = ARGV.shift
+  abort "No input file specified.\n#{USAGE}"
+else
+  abort "Input file not found: #{file}" unless File.readable?(file)
+end
+PerseusMatch::TokenSet.tokenize(file)
+skip_re = %r{\A\s*(?:#|\z)}o
+phrases = []
+File.open(file).each { |line|
+  phrases << line.chomp unless line =~ skip_re
+}.close
+pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
+pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
+list_options = { :minimal => options[:minimal] }
+threshold, count, count_all = options[:threshold], 0, 0
+action = if options[:check]  # build the lambda to run: -C checks CSV pairs, otherwise similarities are calculated
+  require 'fastercsv'
+  format = if options[:align]  # -a: pad each line with dots so the results line up
+    require 'jcode'
+    width = phrases.max(:jlength) + 3  # NOTE(review): presumably the longest line's character length plus padding -- confirm nuggets' max(:jlength)
+    lambda { |line, res|
+      "#{line} #{'.' * (width - line.jlength)} [#{res}]"
+    }
+  else
+    lambda { |line, res|
+      "#{line} [#{res}]"
+    }
+  end
+  phrases.sort! if options[:sort]
+  phrases.map! { |line| [line, FasterCSV.parse_line(line)] }  # keep the raw line (for output) next to its parsed fields
+  global_threshold = options[:threshold]
+  failed_only      = options[:failed_only]
+  collect_stats    = options[:stats]
+  adjust_coeff     = options[:adjust_coeff]
+  _action = lambda { |*args|  # one pass over all pairs; an optional argument overrides :default_coeff; returns the error rate if adjust_coeff
+    pm_options[:default_coeff] = args.first unless args.empty?
+    count, count_all = 0, 0  # reset the outer counters for this pass
+    positives = negatives = false_positives = false_negatives = 0.0  # floats, so the ratios below are float divisions
+    phrases.each { |line, spec|
+      phrase, target, threshold, operator, _ = *spec  # CSV fields: phrase, target, optional threshold, optional operator
+      threshold ||= global_threshold
+      operator  ||= '>'
+      assign      = operator =~ />/  # operators containing '>' are tallied as (false) positives' counterpart below, all others as negatives
+      begin
+        PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)  # raises CheckFailedError if the comparison fails
+        count += 1
+        assign ? positives += 1 : negatives += 1
+        puts format[line, 'OK'] unless adjust_coeff || failed_only
+      rescue PerseusMatch::CheckFailedError => err
+        assign ? false_negatives += 1 : false_positives += 1
+        puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff  # err.value is the value that was compared
+      end
+      count_all += 1
+    }
+    divide = lambda { |numerator, denominator|  # division that yields 0 for a zero denominator
+      denominator == 0 ? 0 : numerator / denominator
+    }
+    if collect_stats || adjust_coeff
+      error = divide[  # share of failed checks among all checks
+        false_positives + false_negatives,
+        positives + negatives + false_positives + false_negatives
+      ]
+    end
+    if collect_stats
+      recall    = divide[positives, positives + false_negatives]
+      precision = divide[positives, positives + false_positives]
+      f1        = divide[2 * recall * precision, recall + precision]
+      warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
+        recall * 100, precision * 100, f1, error
+      ]
+    end
+    error if adjust_coeff
+  }
+  if adjust_coeff
+    lambda {  # -A: search around the starting coefficient for the one with the lowest error rate
+      step, max = 1, 100  # step width and maximum number of steps per search loop
+      start_coeff = pm_options[:default_coeff] || PerseusMatch::DEFAULT_COEFF
+      start_err   = _action[start_coeff]
+      previous_coeff = next_coeff = start_coeff
+      previous_err   = next_err   = start_err
+      max.times { break if (previous_err = _action[previous_coeff -= step]) != start_err }  # walk down until the error changes
+      max.times { break if (next_err     = _action[next_coeff     += step]) != start_err }  # walk up until the error changes
+      best_err = [start_err, previous_err, next_err].min
+      if best_err == start_err
+        best_coeff = start_coeff
+      else
+        if best_err == previous_err
+          max.times {  # keep walking down as long as the error does not get worse
+            break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
+            previous_err, previous_coeff = current_err, current_coeff
+          }
+          best_err, best_coeff = previous_err, previous_coeff
+        else
+          max.times {  # keep walking up as long as the error does not get worse
+            break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
+            next_err, next_coeff = current_err, current_coeff
+          }
+          best_err, best_coeff = next_err, next_coeff
+        end
+      end
+      puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
+    }
+  else
+    _action
+  end
+else
+  format =  # per-match output: lingo dictionary line (-l), array for sorted output (-s), or inspected array
+    options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
+    options[:sort]  ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
+                      lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
+  if options[:sort]
+    require 'pp'
+    lambda {
+      pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
+        if pm.similarity >= threshold
+          res = format[pm]
+          count += 1
+        end
+        count_all += 1
+        res  # nil for matches below the threshold
+      }.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }  # strip nils from nested arrays (presumably the below-threshold results)
+    }
+  else
+    lambda {
+      separator, previous_phrase = options[:separate], nil
+      PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
+        if separator && pm.phrase != previous_phrase ||= pm.phrase  # compares against (previous_phrase ||= pm.phrase), so nothing is printed before the first block
+          puts separator
+          previous_phrase = pm.phrase
+        end
+        if pm.similarity >= threshold
+          puts format[pm]
+          count += 1
+        end
+        count_all += 1
+      }
+    }
+  end
+end
+if options[:stats]
+  time = Benchmark.realtime(&action)
+  hms, x, y = time.to_hms(2), time / count, time / count_all
+  precision = lambda { |i| i.to_s.sub(/\./, '')[/\A0*/].length + 2 }
+  warn "%d (%d/%d): %s => %0.#{precision[x]}fs/%0.#{precision[y]}fs" % [
+    phrases.size, count, count_all, hms, x, y
+  ]
+else
+  action.call
+end

data/lib/perseus_match.rb ADDED

@@ -0,0 +1,169 @@
+#--
+###############################################################################
+#                                                                             #
+# perseus_match -- Fuzzy string matching based on linguistic analysis         #
+#                                                                             #
+# Copyright (C) 2008 Cologne University of Applied Sciences                   #
+#                    Claudiusstr. 1                                           #
+#                    50678 Cologne, Germany                                   #
+#                                                                             #
+# Authors:                                                                    #
+#     Jens Wille <jens.wille@uni-koeln.de>                                    #
+#                                                                             #
+# perseus_match is free software: you can redistribute it and/or modify it    #
+# under the terms of the GNU General Public License as published by the Free  #
+# Software Foundation, either version 3 of the License, or (at your option)   #
+# any later version.                                                          #
+#                                                                             #
+# perseus_match is distributed in the hope that it will be useful, but        #
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  #
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License     #
+# for more details.                                                           #
+#                                                                             #
+# You should have received a copy of the GNU General Public License along     #
+# with perseus_match. If not, see <http://www.gnu.org/licenses/>.             #
+#                                                                             #
+###############################################################################
+#++
+require 'perseus_match/list'
+require 'perseus_match/cluster'
+require 'perseus_match/token_set'
+require 'perseus_match/version'
+class PerseusMatch
+  Infinity = 1.0 / 0
+  DEFAULT_COEFF = 20
+  DISTANCE_SPEC = [                # {
+    [{},                      1],  #   {}                      => 1,
+    [{ :excl    => %w[a t] }, 2],  #   { :excl    => %w[a t] } => 1,
+    [{ :incl    => 's'     }, 3],  #   { :incl    => 's'     } => 2,
+    [{ :incl    => 'y'     }, 4],  #   { :incl    => 'y'     } => 4,
+    [{ :sort    => true    }, 4],  #   { :sort    => true    } => 4,
+    [{ :soundex => true    }, 4]   #   { :soundex => true    } => 8
+  ]                                # }
+  class << self
+    def distance(*args)
+      new(*args).distance
+    end
+    def match(phrases, pm_options = {})
+      List.new(phrases, pm_options)
+    end
+    def cluster(phrases, options = {}, pm_options = {})
+      Cluster.new(phrases, pm_options).rank(options)
+    end
+    def check(*args)
+      check!(*args)
+    rescue CheckFailedError
+      false
+    end
+    def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
+      value = new(phrase, target, pm_options).send(attribute)
+      value.send(operator, threshold) or raise CheckFailedError.new(value, threshold, operator)
+    end
+  end
+  attr_reader :phrase, :target, :distance_spec, :default_coeff, :verbose
+  def initialize(phrase, target, options = {})
+    @phrase = phrase.to_s
+    @target = target.to_s
+    @default_coeff = options[:default_coeff] || DEFAULT_COEFF
+    @distance_spec = options[:distance_spec] || DISTANCE_SPEC
+    @verbose = options[:verbose]
+    @similarity = {}
+  end
+  def phrase_tokens
+    @phrase_tokens ||= tokenize(phrase)
+  end
+  def target_tokens
+    @target_tokens ||= tokenize(target)
+  end
+  # 0 <= distance <= Infinity
+  def distance
+    @distance ||= calculate_distance
+  end
+  # 1 >= similarity >= 0
+  def similarity(coeff = nil)
+    coeff ||= default_coeff  # passed arg may be nil
+    @similarity[coeff] ||= 1 / Math.exp(distance / (coeff * total_weight))
+  end
+  private
+  def tokenize(str)
+    TokenSet.new(str)
+  end
+  def calculate_distance
+    return Infinity if phrase_tokens.disjoint?(target_tokens)
+    return 0        if phrase_tokens.eql?(target_tokens)
+    distance_spec.inject(0) { |distance, (options, weight)|
+      distance + token_distance(options) * weight
+    }
+  end
+  def token_distance(options = {})
+    tokens1 = phrase_tokens.inclexcl(options)
+    tokens2 = target_tokens.inclexcl(options)
+    if options[:sort]
+      tokens1 = tokens1.sort
+      tokens2 = tokens2.sort
+    end
+    if options[:soundex]
+      tokens1 = tokens1.soundex
+      tokens2 = tokens2.soundex
+    end
+    distance = tokens1.distance(tokens2)
+    warn <<-EOT if verbose
+#{options.inspect}:
+  #{tokens1.inspect}
+  #{tokens2.inspect}
+=> #{distance}
+    EOT
+    distance
+  end
+  def total_weight
+    @total_weight ||= distance_spec.inject(0.0) { |total, (_, weight)| total + weight }
+  end
+  class CheckFailedError < StandardError
+    attr_reader :value, :threshold, :operator
+    def initialize(value, threshold, operator)
+      @value, @threshold, @operator = value, threshold, operator
+    end
+    def to_s
+      "FAILED: #{value} #{operator} #{threshold}"
+    end
+  end
+end