RubyGems - perseus_match - Versions diffs - 0.0.3 → 0.0.4 - Mend

perseus_match 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/README +1 -1
data/Rakefile +1 -1
data/bin/perseus_match +55 -34
data/lib/perseus_match/token_set.rb +14 -4
data/lib/perseus_match/version.rb +1 -1
data/sample/check.csv +27 -0
data/sample/config.yaml +28 -0
data/sample/lingo.cfg +8 -0
data/sample/phrases.txt +9 -0
data/spec/perseus_match/cluster_spec.rb +2 -1
data/spec/perseus_match/token_set_spec.rb +16 -0
metadata +6 -3
data/LINGO_BASE +0 -1

data/README CHANGED Viewed

@@ -2,7 +2,7 @@
 == VERSION
-This documentation refers to perseus_match version 0.0.3
+This documentation refers to perseus_match version 0.0.4
 == DESCRIPTION

data/Rakefile CHANGED Viewed

@@ -13,7 +13,7 @@ begin
       :version      => PerseusMatch::VERSION,
       :summary      => %q{Fuzzy string matching based on linguistic analysis},
       :files        => FileList['lib/**/*.rb', 'bin/*'].to_a,
-      :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb'].to_a,
+      :extra_files  => FileList['[A-Z]*', 'spec/**/*.rb', 'sample/**/*'].to_a - %w[LINGO_BASE],
       :dependencies => ['ruby-backports', ['ruby-nuggets', '>= 0.4.0']]
     }
   }}

data/bin/perseus_match CHANGED Viewed

@@ -3,6 +3,7 @@
 require 'optparse'
 require 'benchmark'
 require 'yaml'
+require 'set'
 require 'rubygems'
 require 'nuggets/enumerable/minmax'
@@ -20,9 +21,11 @@ options = {
   :threshold    => 0,
   :sort         => false,
   :stats        => false,
-  :lingo        => false,
+  :silent       => false,
+  :unknowns     => nil,
   :minimal      => false,
   :separate     => false,
+  :lingo        => false,
   :check        => false,
   :failed_only  => false,
   :align        => false,
@@ -57,6 +60,14 @@ OptionParser.new { |opts|
     options[:verbose] = true
   }
+  opts.on('-n', '--silent', 'Suppress warnings') {
+    options[:silent] = true
+  }
+  opts.on('-u', '--unknowns FILE', 'Write unknown tokens as lingo dictionary to FILE') { |f|
+    options[:unknowns] = f
+  }
   opts.separator ' '
   opts.separator '  * Calculating similarities (default)'
   opts.separator ' '
@@ -111,14 +122,24 @@ else
   abort "Input file not found: #{file}" unless File.readable?(file)
 end
-PerseusMatch::TokenSet.tokenize(file)
+unknowns = Set.new if options[:unknowns]
+PerseusMatch::TokenSet.tokenize(file, unknowns || !options[:silent])
+if unknowns
+  File.open(options[:unknowns], 'w') { |f|
+    unknowns.sort.each { |unk| f.puts "#{unk}=#{unk} #?" }
+  }
+  unknowns = nil
+end
 skip_re = %r{\A\s*(?:#|\z)}o
 phrases = []
-File.open(file).each { |line|
-  phrases << line.chomp unless line =~ skip_re
-}.close
+File.open(file) { |f|
+  f.each { |line| phrases << line.chomp unless line =~ skip_re }
+}
 pm_options = options[:config] ? YAML.load_file(options[:config]) : {}
 pm_options[:verbose] = options[:verbose] if options.has_key?(:verbose)
@@ -144,7 +165,6 @@ action = if options[:check]
   end
   phrases.sort! if options[:sort]
-  phrases.map! { |line| [line, FasterCSV.parse_line(line)] }
   global_threshold = options[:threshold]
   failed_only      = options[:failed_only]
@@ -157,12 +177,12 @@ action = if options[:check]
     count, count_all = 0, 0
     positives = negatives = false_positives = false_negatives = 0.0
-    phrases.each { |line, spec|
-      phrase, target, threshold, operator, _ = *spec
+    phrases.each { |line|
+      phrase, target, threshold, operator, _ = *FasterCSV.parse_line(line)
       threshold ||= global_threshold
       operator  ||= '>'
-      assign      = operator =~ />/
+      assign      = operator =~ />/ || operator == '=='
       begin
         PerseusMatch.check!(phrase, target, threshold.to_f, operator, pm_options)
@@ -176,6 +196,7 @@ action = if options[:check]
         puts format[line, "FAILED -- #{err.value}"] unless adjust_coeff
       end
       count_all += 1
     }
@@ -195,9 +216,13 @@ action = if options[:check]
       precision = divide[positives, positives + false_positives]
       f1        = divide[2 * recall * precision, recall + precision]
-      warn 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
+      stats = 'R = %0.2f%%, P = %0.2f%%, F1 = %0.4f, Err = %0.4f' % [
         recall * 100, precision * 100, f1, error
       ]
+      stats << " (Coeff = #{pm_options[:default_coeff]})" if adjust_coeff
+      warn stats
     end
     error if adjust_coeff
@@ -222,23 +247,19 @@ action = if options[:check]
         best_coeff = start_coeff
       else
         if best_err == previous_err
-          max.times {
-            break if (current_err = _action[current_coeff = previous_coeff - step]) > previous_err
-            previous_err, previous_coeff = current_err, current_coeff
-          }
-          best_err, best_coeff = previous_err, previous_coeff
+          step *= -1
+          best_coeff = previous_coeff
         else
-          max.times {
-            break if (current_err = _action[current_coeff = next_coeff + step]) > next_err
-            next_err, next_coeff = current_err, current_coeff
-          }
-          best_err, best_coeff = next_err, next_coeff
+          best_coeff = next_coeff
         end
+        max.times {
+          break if (current_err = _action[current_coeff = best_coeff + step]) > best_err
+          best_err, best_coeff = current_err, current_coeff
+        }
       end
-      puts 'Coeff = %d, Err = %0.4f' % [best_coeff, best_err]
+      puts 'Coeff = %d (%d), Err = %0.4f (%0.4f)' % [best_coeff, start_coeff, best_err, start_err]
     }
   else
     _action
@@ -246,38 +267,38 @@ action = if options[:check]
 else
   format =
     options[:lingo] ? lambda { |pm| "#{pm.phrase}*#{pm.target}" } :
-    options[:sort]  ? lambda { |pm| [pm.target, pm.distance, pm.similarity] } :
+    options[:sort]  ? lambda { |pm| "  #{[pm.target, pm.distance, pm.similarity].inspect}" } :
                       lambda { |pm| [pm.phrase, pm.target, pm.distance, pm.similarity].inspect }
   if options[:sort]
-    require 'pp'
     lambda {
-      pp PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
+      PerseusMatch::Cluster.new(phrases, pm_options, list_options).sort { |pm|
+        count_all += 1
         if pm.similarity >= threshold
-          res = format[pm]
           count += 1
+          format[pm]
         end
-        count_all += 1
-        res
-      }.map { |i| i.map { |j| j.is_a?(Array) ? j.compact : j } }
+      }.each { |phrase, matches|
+        puts "#{phrase.inspect}:", matches.compact
+      }
     }
   else
     lambda {
       separator, previous_phrase = options[:separate], nil
       PerseusMatch::List.pair(phrases, pm_options, list_options) { |pm|
+        count_all += 1
         if separator && pm.phrase != previous_phrase ||= pm.phrase
           puts separator
           previous_phrase = pm.phrase
         end
         if pm.similarity >= threshold
-          puts format[pm]
           count += 1
+          puts format[pm]
         end
-        count_all += 1
       }
     }
   end

data/lib/perseus_match/token_set.rb CHANGED Viewed

@@ -78,7 +78,7 @@ class PerseusMatch
   class TokenSet < Array
-    def self.tokenize(form)
+    def self.tokenize(form, unknowns = false)
       return @tokens[form] if @tokens
       @_tokens, @tokens = {}, Hash.new { |h, k| h[k] = new(
@@ -90,12 +90,22 @@ class PerseusMatch
           case res
             when /<(.*?)\s=\s\[(.*)\]>/
               a, b = $1, $2
-              @_tokens[a.sub(/\|.*/, '')] ||= b.scan(/\((.*?)\+?\)/).flatten
+              a.sub!(/\|.*/, '')
+              @_tokens[a] ||= b.scan(/\((.*?)\+?\)/).flatten
             when /<(.*)>/, /:(.*):/
               a, b = $1, $1.dup
-              @_tokens[a.sub!(/[\/|].*/, '')] ||= [b.replace_diacritics.downcase]
+              a.sub!(/[\/|].*/, '')
+              if unknowns && b =~ /\|\?\z/
+                if unknowns.respond_to?(:<<)
+                  unknowns << a
+                else
+                  warn "UNK: #{a} [#{res.strip}]"
+                end
+              end
-              warn "UNK: #{a} [#{res.strip}]" if b =~ /\|\?\z/
+              @_tokens[a] ||= [b.replace_diacritics.downcase]
           end
         }
       }

data/lib/perseus_match/version.rb CHANGED Viewed

@@ -4,7 +4,7 @@ class PerseusMatch
     MAJOR = 0
     MINOR = 0
-    TINY  = 3
+    TINY  = 4
     class << self

data/sample/check.csv ADDED Viewed

@@ -0,0 +1,27 @@
+# phrase,target,threshold[,operator (default: >)]
+"Anbetung der Könige","Die Anbetung der Könige",0.95
+"Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
+"Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
+"Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
+"Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
+"Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.9
+"Die Anbetung der Könige","Die Anbetung der Heiligen Drei Könige",0.95,<
+"Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.9
+"Die Anbetung der Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95,<
+"Die Anbetung der Heiligen Drei Könige","dIE AnBeTuNg der heILIGen dREI KÖniGE",0.95
+"Anbetung der Könige","Die Die Die Anbetung der Könige",0.95,<
+"Anbetung der Könige","Die Die Die Anbetung der Könige",0.8
+"Anbetung der Könige","Die Könige der Anbetung",0.95,<
+"Anbetung der Könige","Die Könige der Anbetung",0.8
+"Anbetung der Könige","Königsanbetung hoch drei",0.95,<
+"Anbetung der Könige","Königsanbetung hoch drei",0.8
+"Anbetung der Könige","Drei mal drei macht sechs",0.5,<
+"Anbetung der Könige","Das Ende dieses Blödsinns",0.5,<
+# test ;-)
+,,,

data/sample/config.yaml ADDED Viewed

@@ -0,0 +1,28 @@
+---
+:distance_spec:
+  # default, as is
+#  - - {}
+#    - 1
+  # ignore (exclude) adjectives and particles
+  - - :excl: [a, t]
+    - 2
+  # consider (include) only substantives
+  - - :incl: s
+    - 3
+  # consider (include) only synonyms
+#  - - :incl: y
+#    - 4
+  # sort the tokens when comparing
+#  - - :sort: true
+#    - 4
+  # replace tokens by their soundex value
+  - - :soundex: true
+    - 4
+:default_coeff: 35

data/sample/lingo.cfg ADDED Viewed

@@ -0,0 +1,8 @@
+---
+meeting:
+  attendees:
+    - tokenizer:     { }
+    - wordsearcher:  { source: 'sys-dic', mode: 'first' }
+    - decomposer:    { source: 'sys-dic' }
+    - multiworder:   { source: 'sys-mul', stopper: 'PUNC,OTHR' }
+    - synonymer:     { source: 'sys-syn', skip: '?,t' }

data/sample/phrases.txt ADDED Viewed

@@ -0,0 +1,9 @@
+Anbetung der Könige
+Das Ende dieses Blödsinns
+dIE AnBeTuNg der heILIGen dREI KÖniGE
+Die Anbetung der Heiligen Drei Könige
+Die Anbetung der Könige
+Die Die Die Anbetung der Könige
+Die Könige der Anbetung
+Drei mal drei macht sechs
+Königsanbetung hoch drei

data/spec/perseus_match/cluster_spec.rb CHANGED Viewed

@@ -9,8 +9,9 @@ describe PerseusMatch::Cluster do
   it 'should accept threshold option in sort_by (1a)' do
     PerseusMatch::Cluster.new(%w[foo bar]).sort_by(:similarity, :threshold => 0.1).all? { |phrase, matches|
-      matches.size.should be_zero
+      matches.size.should == 1
       matches.size.should == matches.nitems
+      matches.each { |match| match.target.should == phrase }
     }
   end

data/spec/perseus_match/token_set_spec.rb CHANGED Viewed

@@ -4,6 +4,14 @@ describe PerseusMatch::TokenSet, ' with lingo' do
     PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
   end
+  before :all do
+    @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
+  end
+  after :all do
+    PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
+  end
   it 'should tokenize a string' do
     PerseusMatch::TokenSet.tokenize('foo bar').should be_an_instance_of(PerseusMatch::TokenSet)
   end
@@ -36,6 +44,14 @@ describe PerseusMatch::TokenSet, ' without lingo' do
     PerseusMatch::TokenSet.instance_variable_set(:@tokens, nil)
   end
+  before :all do
+    @original_tokens = PerseusMatch::TokenSet.instance_variable_get(:@tokens)
+  end
+  after :all do
+    PerseusMatch::TokenSet.instance_variable_set(:@tokens, @original_tokens)
+  end
   it 'should take a prepared file for tokenization' do
     # prevent lingo from being used
     lingo_base = LINGO_BASE.dup

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: perseus_match
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Jens Wille
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-12-09 00:00:00 +01:00
+date: 2009-01-13 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -52,13 +52,16 @@ files:
 - Rakefile
 - COPYING
 - ChangeLog
-- LINGO_BASE
 - README
 - spec/spec_helper.rb
 - spec/perseus_match/list_spec.rb
 - spec/perseus_match/cluster_spec.rb
 - spec/perseus_match/token_set_spec.rb
 - spec/perseus_match_spec.rb
+- sample/config.yaml
+- sample/lingo.cfg
+- sample/phrases.txt
+- sample/check.csv
 has_rdoc: true
 homepage: http://prometheus.rubyforge.org/perseus_match
 post_install_message:

data/LINGO_BASE DELETED Viewed

@@ -1 +0,0 @@
-/home/jw/devel/lingo/trunk