perseus_match 0.0.3 → 0.0.4

data/README CHANGED
@@ -2,7 +2,7 @@
 
  == VERSION
 
- This documentation refers to perseus_match version 0.0.3
+ This documentation refers to perseus_match version 0.0.4
 
 
  == DESCRIPTION
data/Rakefile CHANGED
@@ -13,7 +13,7 @@ begin
    :version      => PerseusMatch::VERSION,
    :summary      => %q{Fuzzy string matching based on linguistic analysis},
    :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
-   :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
+   :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
    :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
  }
  }}
data/bin/perseus_match CHANGED
@@ -3,6 +3,7 @@
  require 'optparse'
  require 'benchmark'
  require 'yaml'
+ require 'set'
 
  require 'rubygems'
  require 'nuggets/enumerable/minmax'
@@ -20,9 +21,11 @@ options = {
  :threshold   => 0,
  :sort        => false,
  :stats       => false,
- :lingo       => false,
+ :silent      => false,
+ :unknowns    => nil,
  :minimal     => false,
  :separate    => false,
+ :lingo       => false,
  :check       => false,
  :failed_only => false,
  :align       => false,
@@ -57,6 +60,14 @@ OptionParser.new { |opts|
    options[:verbose] = true
  }
 
+ opts.on('-n', '--silent', 'Suppress warnings') {
+   options[:silent] = true
+ }
+
+ opts.on('-u', '--unknowns FILE', 'Write unknown tokens as lingo dictionary to FILE') { |f|
+   options[:unknowns] = f
+ }
+
  opts.separator ' '
  opts.separator ' * Calculating similarities (default)'
  opts.separator ' '
@@ -111,14 +122,24 @@ else
    abort "Input file not found: #{file}" unless File.readable?(file)
  end
 
- PerseusMatch::TokenSet.tokenize(file)
+ unknowns = Set.new if options[:unknowns]
+
+ PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
+
+ if unknowns
+   File.open(options[:unknowns], 'w') { |f|
+     unknowns.sort.each { |unk| f.puts "#{unk}=#{unk} #?" }
+   }
+
+   unknowns = nil
+ end
 
  skip_re = %r{\A\s*(?:#|\z)}o
  phrases = []
 
- File.open(file).each { |line|
-   phrases << line.chomp unless line =~ skip_re
- }.close
+ File.open(file) { |f|
+   f.each { |line| phrases << line.chomp unless line =~ skip_re }
+ }
 
  pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
  pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
@@ -144,7 +165,6 @@ action = if options[:check]
  end
 
  phrases.sort! if options[:sort]
- phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
 
  global_threshold = options[:threshold]
  failed_only = options[:failed_only]
@@ -157,12 +177,12 @@ action = if options[:check]
  count, count_all = 0, 0
  positives = negatives = false_positives = false_negatives = 0.0
 
- phrases.each { |line, spec|
-   phrase, target, threshold, operator, _ = *spec
+ phrases.each { |line|
+   phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
 
    threshold ||= global_threshold
    operator  ||= '>'
-   assign = operator =~ />/
+   assign = operator =~ />/ || operator == '=='
 
    begin
      PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
@@ -176,6 +196,7 @@ action = if options[:check]
 
      puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
    end
+
    count_all += 1
  }
 
@@ -195,9 +216,13 @@ action = if options[:check]
  precision = divide[positives, positives + false_positives]
  f1 = divide[2 * recall * precision, recall + precision]
 
- warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
+ stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
    recall * 100, precision * 100, f1, error
  ]
+
+ stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
+
+ warn stats
  end
 
  error if adjust_coeff
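
The R/P/F1 figures reported above are the usual retrieval metrics computed from the counters kept in the check loop: recall = TP / (TP + FN), precision = TP / (TP + FP), and F1 is their harmonic mean. A standalone restatement with made-up counts, only to spell the formulas out (the divide guard here is local to this sketch; the script defines its own helper elsewhere):

    # illustrative only -- the metrics printed by the --check action
    divide = lambda { |x, y| y.zero? ? 0.0 : x / y }

    positives, false_positives, false_negatives = 42.0, 6.0, 8.0  # example counts

    recall    = divide[positives, positives + false_negatives]
    precision = divide[positives, positives + false_positives]
    f1        = divide[2 * recall * precision, recall + precision]

    puts 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f' % [recall * 100, precision * 100, f1]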
@@ -222,23 +247,19 @@ action = if options[:check]
    best_coeff = start_coeff
  else
    if best_err == previous_err
-     max.times {
-       break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
-       previous_err, previous_coeff = current_err, current_coeff
-     }
-
-     best_err, best_coeff = previous_err, previous_coeff
+     step *= -1
+     best_coeff = previous_coeff
    else
-     max.times {
-       break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
-       next_err, next_coeff = current_err, current_coeff
-     }
-
-     best_err, best_coeff = next_err, next_coeff
+     best_coeff = next_coeff
    end
+
+   max.times {
+     break if (current_err = _action[current_coeff = best_coeff + step]) > best_err
+     best_err, best_coeff = current_err, current_coeff
+   }
  end
 
- puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
+ puts 'Coeff = %d (%d), Err = %0.4f (%0.4f)' % [best_coeff, start_coeff, best_err, start_err]
  }
  else
  _action
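
The rewritten coefficient search above collapses the two former loops into one: if the probe did not improve on the starting error the step direction is flipped, and the single remaining loop walks in that direction while the error keeps dropping. A minimal, self-contained sketch of that idea (`climb` and the block-based error function are illustrative names, not the script's API):

    # hill-climbing sketch (illustrative): keep stepping while the error drops
    def climb(start_coeff, step, max)
      best_coeff = start_coeff
      best_err   = yield(best_coeff)

      # reverse direction if the first probe is no better than the start
      step *= -1 if yield(best_coeff + step) >= best_err

      max.times {
        current_err = yield(current_coeff = best_coeff + step)
        break if current_err > best_err
        best_err, best_coeff = current_err, current_coeff
      }

      [best_coeff, best_err]
    end

    p climb(35, 5, 10) { |coeff| (coeff - 50).abs / 100.0 }  # toy error function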
@@ -246,38 +267,38 @@ action = if options[:check]
  else
  format =
    options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
-   options[:sort] ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
+   options[:sort] ? lambda { |pm| " #{[pm.target, pm.distance, pm.similarity].inspect}" } :
                     lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
 
  if options[:sort]
-   require 'pp'
-
    lambda {
-     pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
+     PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
+       count_all += 1
+
        if pm.similarity >= threshold
-         res = format[pm]
          count += 1
+         format[pm]
        end
-       count_all += 1
-       res
-     }.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
+     }.each { |phrase, matches|
+       puts "#{phrase.inspect}:", matches.compact
+     }
    }
  else
    lambda {
      separator, previous_phrase = options[:separate], nil
 
      PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
+       count_all += 1
+
        if separator && pm.phrase != previous_phrase ||= pm.phrase
          puts separator
          previous_phrase = pm.phrase
        end
 
        if pm.similarity >= threshold
-         puts format[pm]
          count += 1
+         puts format[pm]
        end
-
-       count_all += 1
      }
    }
  end
@@ -78,7 +78,7 @@ class PerseusMatch
 
  class TokenSet < Array
 
-   def self.tokenize(form)
+   def self.tokenize(form, unknowns = false)
      return @tokens[form] if @tokens
 
      @_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
@@ -90,12 +90,22 @@ class PerseusMatch
        case res
          when /<(.*?)\s=\s\[(.*)\]>/
            a, b = $1, $2
-           @_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
+           a.sub!(/\|.*/, '')
+
+           @_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
          when /<(.*)>/, /:(.*):/
            a, b = $1, $1.dup
-           @_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
+           a.sub!(/[\/|].*/, '')
+
+           if unknowns && b =~ /\|\?\z/
+             if unknowns.respond_to?(:<<)
+               unknowns << a
+             else
+               warn "UNK: #{a} [#{res.strip}]"
+             end
+           end
 
-           warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
+           @_tokens[a] ||= [b.replace_diacritics.downcase]
        end
      }
    }
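
The new `unknowns` parameter of `tokenize` is duck-typed: anything that responds to `<<` collects the unknown tokens silently, any other truthy value keeps the old behaviour of printing a `UNK:` warning per token, and `false` suppresses both. A sketch of the collecting mode, mirroring what `bin/perseus_match --unknowns FILE` does above (the require path and file names are assumptions for the sake of the example):

    require 'rubygems'
    require 'set'
    require 'perseus_match'  # assumed require path for the library

    unknowns = Set.new
    PerseusMatch::TokenSet.tokenize('phrases.txt', unknowns)  # collect instead of warn

    # dump the collected tokens as lingo dictionary entries, as --unknowns does
    File.open('unknowns.dic', 'w') { |f|
      unknowns.sort.each { |unk| f.puts "#{unk}=#{unk} #?" }
    }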
@@ -4,7 +4,7 @@ class PerseusMatch
 
  MAJOR = 0
  MINOR = 0
- TINY  = 3
+ TINY  = 4
 
  class << self
 
data/sample/check.csv ADDED
@@ -0,0 +1,27 @@
+ # phrase,target,threshold[,operator (default: >)]
+
+ "Anbetung der Könige","Die Anbetung der Könige",0.95
+ "Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
+ "Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
+ "Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
+ "Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
+
+ "Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
+ "Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
+ "Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
+ "Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
+
+ "Die Anbetung der Heiligen Drei Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95
+
+ "Anbetung der Könige","Die Die Die Anbetung der Könige",0.95,<
+ "Anbetung der Könige","Die Die Die Anbetung der Könige",0.8
+ "Anbetung der Könige","Die Könige der Anbetung",0.95,<
+ "Anbetung der Könige","Die Könige der Anbetung",0.8
+ "Anbetung der Könige","Königsanbetung hoch drei",0.95,<
+ "Anbetung der Könige","Königsanbetung hoch drei",0.8
+
+ "Anbetung der Könige","Drei mal drei macht sechs",0.5,<
+ "Anbetung der Könige","Das Ende dieses Blödsinns",0.5,<
+
+ # test ;-)
+ ,,,
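
Each non-comment line of this file is parsed by the check action shown earlier: FasterCSV splits it, the threshold falls back to the global `-t` value (default 0), the operator defaults to `>`, and both `>` and `==` count as "expected to match". A small standalone illustration of that per-line handling (the `fastercsv` require is an assumption; the real check loop lives in bin/perseus_match):

    require 'rubygems'
    require 'fastercsv'  # assumed; provides the FasterCSV parser used by the script

    line = '"Anbetung der Könige","Die Anbetung der Könige",0.95'
    phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)

    threshold ||= 0   # falls back to the global -t value (default 0)
    operator  ||= '>'
    assign = operator =~ />/ || operator == '=='  # truthy => line is expected to match

    p [phrase, target, threshold.to_f, operator, !!assign]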
data/sample/config.yaml ADDED
@@ -0,0 +1,28 @@
+ ---
+ :distance_spec:
+
+   # default, as is
+   # - - {}
+   #   - 1
+
+   # ignore (exclude) adjectives and particles
+   - - :excl: [a, t]
+     - 2
+
+   # consider (include) only substantives
+   - - :incl: s
+     - 3
+
+   # consider (include) only synonyms
+   # - - :incl: y
+   #   - 4
+
+   # sort the tokens when comparing
+   # - - :sort: true
+   #   - 4
+
+   # replace tokens by their soundex value
+   - - :soundex: true
+     - 4
+
+ :default_coeff: 35
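
This sample is what `bin/perseus_match -c FILE` loads via `YAML.load_file` and passes on as `pm_options`: each active `:distance_spec` entry pairs a token-filter hash with what looks like a numeric weight, and `:default_coeff` seeds the coefficient used by `--check`. A sketch of merely inspecting it (how PerseusMatch consumes the spec internally is not shown in this diff):

    require 'yaml'

    # mirrors the script's YAML.load_file(options[:config]) call
    pm_options = YAML.load_file('sample/config.yaml')

    pm_options[:distance_spec].each { |spec, weight|
      puts 'weighted %s: %s' % [weight.inspect, spec.inspect]
    }
    puts "default coefficient: #{pm_options[:default_coeff]}"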
data/sample/lingo.cfg ADDED
@@ -0,0 +1,8 @@
+ ---
+ meeting:
+   attendees:
+     - tokenizer: { }
+     - wordsearcher: { source: 'sys-dic', mode: 'first' }
+     - decomposer: { source: 'sys-dic' }
+     - multiworder: { source: 'sys-mul', stopper: 'PUNC,OTHR' }
+     - synonymer: { source: 'sys-syn', skip: '?,t' }
data/sample/phrases.txt ADDED
@@ -0,0 +1,9 @@
+ Anbetung der Könige
+ Das Ende dieses Blödsinns
+ dIE AnBeTuNg der heILIGen dREI KÖniGE
+ Die Anbetung der Heiligen Drei Könige
+ Die Anbetung der Könige
+ Die Die Die Anbetung der Könige
+ Die Könige der Anbetung
+ Drei mal drei macht sechs
+ Königsanbetung hoch drei
data/spec/perseus_match/cluster_spec.rb CHANGED
@@ -9,8 +9,9 @@ describe PerseusMatch::Cluster do
 
  it 'should accept threshold option in sort_by (1a)' do
    PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
-     matches.size.should be_zero
+     matches.size.should == 1
      matches.size.should == matches.nitems
+     matches.each { |match| match.target.should == phrase }
    }
  end
 
data/spec/perseus_match/token_set_spec.rb CHANGED
@@ -4,6 +4,14 @@ describe PerseusMatch::TokenSet, ' with lingo' do
    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
  end
 
+ before :all do
+   @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
+ end
+
+ after :all do
+   PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
+ end
+
  it 'should tokenize a string' do
    PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
  end
@@ -36,6 +44,14 @@ describe PerseusMatch::TokenSet, ' without lingo' do
    PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
  end
 
+ before :all do
+   @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
+ end
+
+ after :all do
+   PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
+ end
+
  it 'should take a prepared file for tokenization' do
    # prevent lingo from being used
    lingo_base = LINGO_BASE.dup
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: perseus_match
  version: !ruby/object:Gem::Version
-   version: 0.0.3
+   version: 0.0.4
  platform: ruby
  authors:
  - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2008-12-09 00:00:00 +01:00
+ date: 2009-01-13 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -52,13 +52,16 @@ files:
  - Rakefile
  - COPYING
  - ChangeLog
- - LINGO_BASE
  - README
  - spec/spec_helper.rb
  - spec/perseus_match/list_spec.rb
  - spec/perseus_match/cluster_spec.rb
  - spec/perseus_match/token_set_spec.rb
  - spec/perseus_match_spec.rb
+ - sample/config.yaml
+ - sample/lingo.cfg
+ - sample/phrases.txt
+ - sample/check.csv
  has_rdoc: true
  homepage: http://prometheus.rubyforge.org/perseus_match
  post_install_message:
data/LINGO_BASE DELETED
@@ -1 +0,0 @@
- /home/jw/devel/lingo/trunk