RubyGems - dimus-taxamatch_rb - Versions diffs - 0.1.7 → 0.5.0 - Mend

dimus-taxamatch_rb 0.1.7 → 0.5.0

Files changed (11) hide show

data/lib/taxamatch_rb/authmatch.rb +70 -68
data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +87 -85
data/lib/taxamatch_rb/normalizer.rb +44 -40
data/lib/taxamatch_rb/parser.rb +66 -62
data/lib/taxamatch_rb/phonetizer.rb +67 -64
data/lib/taxamatch_rb.rb +74 -67
data/spec/damerau_levenshtein_mod_test.txt +2 -0
data/spec/spec_helper.rb +2 -2
data/spec/taxamatch_rb_spec.rb +53 -16
data/spec/taxamatch_test.txt +1 -1
metadata +1 -1

data/lib/taxamatch_rb/authmatch.rb CHANGED Viewed

@@ -1,85 +1,87 @@
-class Authmatch
+module Taxamatch
+  class Authmatch
-  def self.authmatch(authors1, authors2, years1, years2)
-    unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
-    year_difference = compare_years(years1, years2)
-    get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
-  end
+    def self.authmatch(authors1, authors2, years1, years2)
+      unique_authors1, unique_authors2 = remove_duplicate_authors(authors1, authors2)
+      year_difference = compare_years(years1, years2)
+      get_score(authors1, unique_authors1, authors2, unique_authors2, year_difference)
+    end
-  def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
-    count_before = authors1.size + authors2.size
-    count_after = unique_authors1.size + unique_authors2.size
-    score = 0
-    if count_after == 0
-      if year_diff != nil
-        if year_diff == 0
-          score = 100
-        elsif year_diff == 1
-          score = 54
+    def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
+      count_before = authors1.size + authors2.size
+      count_after = unique_authors1.size + unique_authors2.size
+      score = 0
+      if count_after == 0
+        if year_diff != nil
+          if year_diff == 0
+            score = 100
+          elsif year_diff == 1
+            score = 54
+          end
+        else
+          score = 94
         end
-      else
-        score = 94
-      end
-    elsif unique_authors1.size > 0 || unique_authors2.size > 0
-      if year_diff != nil
-        if year_diff == 0
-          score = 91
-        elsif year_diff == 1
-          score = 51
+      elsif unique_authors1.size == 0 || unique_authors2.size == 0
+        if year_diff != nil
+          if year_diff == 0
+            score = 91
+          elsif year_diff == 1
+            score = 51
+          end
+        else
+          score = 90
         end
       else
-        score = 90
+        score = ((1 - count_after.to_f/count_before.to_f) * 100).round
+        score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
       end
-    else
-      score = ((1 - count_after.to_f/count_before.to_f) * 100).round
-      score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
+      score > 50 ? score : 0
     end
-    score > 50 ? score : 0
-  end
-  def self.remove_duplicate_authors(authors1, authors2)
-    unique_authors1 = authors1.dup
-    unique_authors2 = authors2.dup
-    authors1.each do |au1|
-      au1_match = false
-      authors2.each do |au2|
-        au2_match = false
-        if au1 == au2
-          au1_match = au2_match = true if au1 == au2
-        elsif au1 == au2[0...au1.size]
-          au1_match = true
-        elsif au1[0...au2.size] == au2
-          au2_match = true
-        end
-        if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
-          unique_authors1.delete au1
-          unique_authors2.delete au2
-        elsif au1_match
-          unique_authors1.delete au1
-        elsif au2_match
-          unique_authors2.delete au2
-        else
-          if self.fuzzy_match_authors(au1, au2)
+    def self.remove_duplicate_authors(authors1, authors2)
+      unique_authors1 = authors1.dup
+      unique_authors2 = authors2.dup
+      authors1.each do |au1|
+        authors2.each do |au2|
+          au1_match = au2_match = false
+          if au1 == au2
+            au1_match = au2_match = true
+          elsif au1 == au2[0...au1.size]
+            au1_match = true
+          elsif au1[0...au2.size] == au2
+            au2_match = true
+          end
+          if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
+            unique_authors1.delete au1
+            unique_authors2.delete au2
+          elsif au1_match
             unique_authors1.delete au1
+          elsif au2_match
             unique_authors2.delete au2
+          else
+            #TODO: masking a bug in the Damerau-Levenshtein mod which appears when comparing a 1-letter string to a longer string
+            if au1.size > 1 && au2.size > 1 && self.fuzzy_match_authors(au1, au2)
+              unique_authors1.delete au1
+              unique_authors2.delete au2
+            end
           end
         end
       end
+      [unique_authors1, unique_authors2]
     end
-    [unique_authors1, unique_authors2]
-  end
-  def self.fuzzy_match_authors(author1, author2)
-    au1_length = author1.size
-    au2_length = author2.size
-    dlm = DamerauLevenshteinMod.new
-    ed = dlm.distance(author1, author2,2,3)
-    (ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
-  end
+    def self.fuzzy_match_authors(author1, author2)
+      au1_length = author1.size
+      au2_length = author2.size
+      dlm = Taxamatch::DamerauLevenshteinMod.new
+      ed = dlm.distance(author1, author2,2,3) #get around a bug in C code, but it really has to be fixed
+      (ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
+    end
-  def self.compare_years(years1, years2)
-    return 0 if years1 == [] && years2 == []
-    return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
-    nil
+    def self.compare_years(years1, years2)
+      return 0 if years1 == [] && years2 == []
+      return (years1[0].to_i - years2[0].to_i).abs if years1.size == 1 && years2.size == 1
+      nil
+    end
   end
-end
+end

data/lib/taxamatch_rb/damerau_levenshtein_mod.rb CHANGED Viewed

@@ -2,114 +2,116 @@
 require 'rubygems'
 require 'inline'
 require 'time'
+module Taxamatch
-class DamerauLevenshteinMod
-  def distance(str1, str2, block_size=2, max_distance=10)
-    # puts str1.unpack("U*");
-    distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
-  end
+  class DamerauLevenshteinMod
+    def distance(str1, str2, block_size=2, max_distance=10)
+      # puts str1.unpack("U*");
+      distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
+    end
-  inline do |builder|
-    builder.c "
-    static VALUE distance_utf(VALUE _s, VALUE _t, long block_size, long max_distance){
-      long min, i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block, current_distance;
-      long stop_execution = 0;
+    inline do |builder|
+      builder.c "
+      static VALUE distance_utf(VALUE _s, VALUE _t, long block_size, long max_distance){
+        long min, i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block, current_distance;
+        long stop_execution = 0;
-      VALUE *sv = RARRAY_PTR(_s);
-      VALUE *tv = RARRAY_PTR(_t);
+        VALUE *sv = RARRAY_PTR(_s);
+        VALUE *tv = RARRAY_PTR(_t);
-      sl = RARRAY_LEN(_s);
-      tl = RARRAY_LEN(_t);
+        sl = RARRAY_LEN(_s);
+        tl = RARRAY_LEN(_t);
-      if (sl == 0) return LONG2NUM(tl);
-      if (tl == 0) return LONG2NUM(sl);
-      //the length-1 case must be handled here or it will break further in the code
-      if (sl == 1 && tl == 1 && sv[0] != tv[0]) return LONG2NUM(1);
+        if (sl == 0) return LONG2NUM(tl);
+        if (tl == 0) return LONG2NUM(sl);
+        //the length-1 case must be handled here or it will break further in the code
+        if (sl == 1 && tl == 1 && sv[0] != tv[0]) return LONG2NUM(1);
-      long s[sl];
-      long t[tl];
+        long s[sl];
+        long t[tl];
-      for (i=0; i < sl; i++) s[i] = NUM2LONG(sv[i]);
-      for (i=0; i < tl; i++) t[i] = NUM2LONG(tv[i]);
+        for (i=0; i < sl; i++) s[i] = NUM2LONG(sv[i]);
+        for (i=0; i < tl; i++) t[i] = NUM2LONG(tv[i]);
-      sl++;
-      tl++;
+        sl++;
+        tl++;
-      //one-dimensional representation of a 2-dimensional array of size (len(s)+1) * (len(t)+1)
-      d = malloc((sizeof(long))*(sl)*(tl));
-      //populate 'vertical' row starting from the 2nd position (first one is filled already)
-      for(i = 0; i < tl; i++){
-        d[i*sl] = i;
-      }
+        //one-dimensional representation of a 2-dimensional array of size (len(s)+1) * (len(t)+1)
+        d = malloc((sizeof(long))*(sl)*(tl));
+        //populate 'vertical' row starting from the 2nd position (first one is filled already)
+        for(i = 0; i < tl; i++){
+          d[i*sl] = i;
+        }
-      //fill up array with scores
-      for(i = 1; i<sl; i++){
-        d[i] = i;
-        if (stop_execution == 1) break;
-        current_distance = 10000;
-        for(j = 1; j<tl; j++){
+        //fill up array with scores
+        for(i = 1; i<sl; i++){
+          d[i] = i;
+          if (stop_execution == 1) break;
+          current_distance = 10000;
+          for(j = 1; j<tl; j++){
-          cost = 1;
-          if(s[i-1] == t[j-1]) cost = 0;
+            cost = 1;
+            if(s[i-1] == t[j-1]) cost = 0;
-          half_sl = (sl - 1)/2;
-          half_tl = (tl - 1)/2;
+            half_sl = (sl - 1)/2;
+            half_tl = (tl - 1)/2;
-          block = block_size < half_sl ? block_size : half_sl;
-          block = block < half_tl ? block : half_tl;
+            block = block_size < half_sl ? block_size : half_sl;
+            block = block < half_tl ? block : half_tl;
-          while (block >= 1){
-            long swap1 = 1;
-            long swap2 = 1;
-            i1 = i - (block * 2);
-            j1 = j - (block * 2);
-            for (k = i1; k < i1 + block; k++) {
-              if (s[k] != t[k + block]){
-                swap1 = 0;
-                break;
+            while (block >= 1){
+              long swap1 = 1;
+              long swap2 = 1;
+              i1 = i - (block * 2);
+              j1 = j - (block * 2);
+              for (k = i1; k < i1 + block; k++) {
+                if (s[k] != t[k + block]){
+                  swap1 = 0;
+                  break;
+                }
               }
-            }
-            for (k = j1; k < j1 + block; k++) {
-              if (t[k] != s[k + block]){
-                swap2 = 0;
-                break;
+              for (k = j1; k < j1 + block; k++) {
+                if (t[k] != s[k + block]){
+                  swap2 = 0;
+                  break;
+                }
               }
-            }
-            del = d[j*sl + i - 1] + 1;
-            ins = d[(j-1)*sl + i] + 1;
-            min = del;
-            if (ins < min) min = ins;
-            //if (i == 2 && j==2) return LONG2NUM(swap2+5);
-            if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
-              transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
-              if (transp < min) min = transp;
-              block = 0;
-            } else if (block == 1) {
-              subs = d[(j-1)*sl + i - 1] + cost;
-              if (subs < min) min = subs;
-            }
-            block--;
-          }
-          d[j*sl+i]=min;
-          if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
-        }
-        if (current_distance > max_distance) {
-          stop_execution = 1;
+              del = d[j*sl + i - 1] + 1;
+              ins = d[(j-1)*sl + i] + 1;
+              min = del;
+              if (ins < min) min = ins;
+              //if (i == 2 && j==2) return LONG2NUM(swap2+5);
+              if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
+                transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
+                if (transp < min) min = transp;
+                block = 0;
+              } else if (block == 1) {
+                subs = d[(j-1)*sl + i - 1] + cost;
+                if (subs < min) min = subs;
+              }
+              block--;
+            }
+            d[j*sl+i]=min;
+            if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
+          }
+          if (current_distance > max_distance) {
+            stop_execution = 1;
+          }
         }
-      }
-      distance=d[sl * tl - 1];
-      if (stop_execution == 1) distance = current_distance;
+        distance=d[sl * tl - 1];
+        if (stop_execution == 1) distance = current_distance;
-      free(d);
-      return LONG2NUM(distance);
-    }
-   "
+        free(d);
+        return LONG2NUM(distance);
+      }
+     "
+    end
   end
 end
 if __FILE__ == $0
-  a=DamerauLevenshteinMod.new
+  a=Taxamatch::DamerauLevenshteinMod.new
   s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
   t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')

data/lib/taxamatch_rb/normalizer.rb CHANGED Viewed

@@ -1,47 +1,51 @@
 # encoding: UTF-8
-module Normalizer
-  def self.normalize(string)
-    utf8_to_ascii(string).upcase
-  end
+module Taxamatch
-  def self.normalize_word(word)
-    self.normalize(word).gsub(/[^A-Z0-9\-]/, '')
-  end
+  module Normalizer
+    def self.normalize(string)
+      utf8_to_ascii(string).upcase
+    end
+    def self.normalize_word(word)
+      self.normalize(word).gsub(/[^A-Z0-9\-]/, '')
+    end
+  protected
+    def self.utf8_to_ascii(string)
+      string = string.gsub(/[ÀÂÅÃÄÁẤẠ]/, "A")
+      string = string.gsub(/[ÉÈÊË]/, "E")
+      string = string.gsub(/[ÍÌÎÏ]/, "I")
+      string = string.gsub(/[ÓÒÔØÕÖỚỔ]/, "O")
+      string = string.gsub(/[ÚÙÛÜ]/, "U")
+      string = string.gsub(/[Ý]/, "Y")
+      string = string.gsub(/Æ/, "AE")
+      string = string.gsub(/[ČÇ]/, "C")
+      string = string.gsub(/[ŠŞ]/, "S")
+      string = string.gsub(/[Đ]/, "D")
+      string = string.gsub(/Ž/, "Z")
+      string = string.gsub(/Ñ/, "N")
+      string = string.gsub(/Œ/, "OE")
+      string = string.gsub(/ß/, "B")
+      string = string.gsub(/Ķ/, "K")
+      string = string.gsub(/[áàâåãäăãắảạậầằ]/, "a")
+      string = string.gsub(/[éèêëĕěếệểễềẻ]/, "e")
+      string = string.gsub(/[íìîïǐĭīĩỉï]/, "i")
+      string = string.gsub(/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o")
+      string = string.gsub(/[úùûüůưừựủứụ]/, "u")
+      string = string.gsub(/[žź]/, "z")
+      string = string.gsub(/[ýÿỹ]/, "y")
+      string = string.gsub(/[đ]/, "d")
+      string = string.gsub(/æ/, "ae")
+      string = string.gsub(/[čćç]/, "c")
+      string = string.gsub(/[ñńň]/, "n")
+      string = string.gsub(/œ/, "oe")
+      string = string.gsub(/[śšş]/, "s")
+      string = string.gsub(/ř/, "r")
+      string = string.gsub(/ğ/, "g")
+      string = string.gsub(/Ř/, "R")
+    end
-protected
-  def self.utf8_to_ascii(string)
-    string = string.gsub(/[ÀÂÅÃÄÁẤẠ]/, "A")
-    string = string.gsub(/[ÉÈÊË]/, "E")
-    string = string.gsub(/[ÍÌÎÏ]/, "I")
-    string = string.gsub(/[ÓÒÔØÕÖỚỔ]/, "O")
-    string = string.gsub(/[ÚÙÛÜ]/, "U")
-    string = string.gsub(/[Ý]/, "Y")
-    string = string.gsub(/Æ/, "AE")
-    string = string.gsub(/[ČÇ]/, "C")
-    string = string.gsub(/[ŠŞ]/, "S")
-    string = string.gsub(/[Đ]/, "D")
-    string = string.gsub(/Ž/, "Z")
-    string = string.gsub(/Ñ/, "N")
-    string = string.gsub(/Œ/, "OE")
-    string = string.gsub(/ß/, "B")
-    string = string.gsub(/Ķ/, "K")
-    string = string.gsub(/[áàâåãäăãắảạậầằ]/, "a")
-    string = string.gsub(/[éèêëĕěếệểễềẻ]/, "e")
-    string = string.gsub(/[íìîïǐĭīĩỉï]/, "i")
-    string = string.gsub(/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o")
-    string = string.gsub(/[úùûüůưừựủứụ]/, "u")
-    string = string.gsub(/[žź]/, "z")
-    string = string.gsub(/[ýÿỹ]/, "y")
-    string = string.gsub(/[đ]/, "d")
-    string = string.gsub(/æ/, "ae")
-    string = string.gsub(/[čćç]/, "c")
-    string = string.gsub(/[ñńň]/, "n")
-    string = string.gsub(/œ/, "oe")
-    string = string.gsub(/[śšş]/, "s")
-    string = string.gsub(/ř/, "r")
-    string = string.gsub(/ğ/, "g")
-    string = string.gsub(/Ř/, "R")
   end
 end

data/lib/taxamatch_rb/parser.rb CHANGED Viewed

@@ -1,83 +1,87 @@
 # encoding: UTF-8
 require 'biodiversity'
-class TaxamatchParser
-  def initialize
-    @parser = ScientificNameParser.new
-    @parsed_raw = nil
-    @res = {}
-  end
+module Taxamatch
+  class Parser
+    def initialize
+      @parser = ScientificNameParser.new
+      @parsed_raw = nil
+      @res = {}
+    end
-  def parse(name)
-    @res = {:all_authors => [], :all_years => []}
-    @parsed_raw = JSON.load(@parser.parse(name).to_json)['scientificName']
-    organize_results
-  end
+    def parse(name)
+      @res = {:all_authors => [], :all_years => []}
+      @parsed_raw = JSON.load(@parser.parse(name).to_json)['scientificName']
+      organize_results
+    end
-  def parsed_raw
-    return @parsed_raw
-  end
+    def parsed_raw
+      return @parsed_raw
+    end
-protected
+  protected
-  def organize_results
-    pr = @parsed_raw
-    return nil unless pr['parsed']
-    d = pr['details'][0]
-    process_node(:uninomial, d['uninomial'])
-    process_node(:genus, d['genus'])
-    process_node(:species, d['species'], true)
-    process_infraspecies(d['infraspecies'])
-    @res[:all_authors] = @res[:all_authors].uniq.map {|a| Normalizer.normalize(a)}
-    @res[:all_years].uniq!
-    @res.keys.size > 2 ? @res : nil
-  end
+    def organize_results
+      pr = @parsed_raw
+      return nil unless pr['parsed']
+      d = pr['details'][0]
+      process_node(:uninomial, d['uninomial'])
+      process_node(:genus, d['genus'])
+      process_node(:species, d['species'], true)
+      process_infraspecies(d['infraspecies'])
+      @res[:all_authors] = @res[:all_authors].uniq.map {|a| Taxamatch::Normalizer.normalize(a)}
+      @res[:all_years].uniq!
+      @res.keys.size > 2 ? @res : nil
+    end
-  def process_node(name, node, is_species = false)
-    return unless node
-    @res[name] = {}
-    @res[name][:epitheton] = node['epitheton']
-    @res[name][:normalized] = Normalizer.normalize(node['epitheton'])
-    @res[name][:phonetized] = Phonetizer.near_match(node['epitheton'], is_species)
-    get_authors_years(node, @res[name])
-  end
+    def process_node(name, node, is_species = false)
+      return unless node
+      @res[name] = {}
+      @res[name][:epitheton] = node['epitheton']
+      @res[name][:normalized] = Taxamatch::Normalizer.normalize(node['epitheton'])
+      @res[name][:phonetized] = Taxamatch::Phonetizer.near_match(node['epitheton'], is_species)
+      get_authors_years(node, @res[name])
+    end
-  def process_infraspecies(node)
-    return unless node
-    @res[:infraspecies] = []
-    node.each do |infr|
-      hsh = {}
-      hsh[:epitheton] = infr['epitheton']
-      hsh[:normalized] = Normalizer.normalize(infr['epitheton'])
-      hsh[:phonetized] = Phonetizer.near_match(infr['epitheton'], true)
-      get_authors_years(infr,hsh)
-      @res[:infraspecies] << hsh
+    def process_infraspecies(node)
+      return unless node
+      @res[:infraspecies] = []
+      node.each do |infr|
+        hsh = {}
+        hsh[:epitheton] = infr['epitheton']
+        hsh[:normalized] = Taxamatch::Normalizer.normalize(infr['epitheton'])
+        hsh[:phonetized] = Taxamatch::Phonetizer.near_match(infr['epitheton'], true)
+        get_authors_years(infr,hsh)
+        @res[:infraspecies] << hsh
+      end
     end
-  end
-  def get_authors_years(node, res)
-    res[:authors] = []
-    res[:years] = []
-    ['basionymAuthorTeam','combinationAuthorTeam'].each do |au|
-      if node[au]
-        res[:authors] += node[au]['author']
-        res[:years] << node[au]['year'] if node[au]['year']
-        if node[au]['exAuthorTeam']
-          res[:authors] += node[au]['exAuthorTeam']['author']
-          res[:years] << node[au]['exAuthorTeam']['year'] if node[au]['exAuthorTeam']['year']
+    def get_authors_years(node, res)
+      res[:authors] = []
+      res[:years] = []
+      ['basionymAuthorTeam','combinationAuthorTeam'].each do |au|
+        if node[au]
+          res[:authors] += node[au]['author']
+          res[:years] << node[au]['year'] if node[au]['year']
+          if node[au]['exAuthorTeam']
+            res[:authors] += node[au]['exAuthorTeam']['author']
+            res[:years] << node[au]['exAuthorTeam']['year'] if node[au]['exAuthorTeam']['year']
+          end
         end
       end
+      res[:authors].uniq!
+      res[:years].uniq!
+      @res[:all_authors] += res[:authors] if res[:authors].size > 0
+      @res[:all_years] += res[:years] if res[:years].size > 0
     end
-    res[:authors].uniq!
-    res[:years].uniq!
-    @res[:all_authors] += res[:authors] if res[:authors].size > 0
-    @res[:all_years] += res[:years] if res[:years].size > 0
-  end
+  end
 end
 if __FILE__ == $0
   require 'pp'
   p = Parser.new
   puts p.parse('Salmonella werahensis (Castellani) Hauduroy and Ehringer in Hauduroy 1937')
-end
+end

data/lib/taxamatch_rb/phonetizer.rb CHANGED Viewed

@@ -1,72 +1,75 @@
 # encoding: UTF-8
+module Taxamatch
-class Phonetizer
+  class Phonetizer
-  def self.near_match(a_word, normalize_ending = false)
-    a_word = a_word.strip rescue ''
-    return '' if a_word == ''
-    a_word = Normalizer.normalize a_word
-    case a_word
-      when /^AE/
-        a_word = 'E' + a_word[2..-1]
-      when /^CN/
-        a_word = 'N' + a_word[2..-1]
-      when /^CT/
-        a_word = 'T' + a_word[2..-1]
-      when /^CZ/
-        a_word = 'C' + a_word[2..-1]
-      when /^DJ/
-        a_word = 'J' + a_word[2..-1]
-      when /^EA/
-        a_word = 'E' + a_word[2..-1]
-      when /^EU/
-        a_word = 'U' + a_word[2..-1]
-      when /^GN/
-        a_word = 'N' + a_word[2..-1]
-      when /^KN/
-        a_word = 'N' + a_word[2..-1]
-      when /^MC/
-        a_word = 'MAC' + a_word[2..-1]
-      when /^MN/
-        a_word = 'N' + a_word[2..-1]
-      when /^OE/
-        a_word = 'E' + a_word[2..-1]
-      when /^QU/
-        a_word = 'Q' + a_word[2..-1]
-      when /^PS/
-        a_word = 'S' + a_word[2..-1]
-      when /^PT/
-        a_word = 'T' + a_word[2..-1]
-      when /^TS/
-        a_word = 'S' + a_word[2..-1]
-      when /^WR/
-        a_word = 'R' + a_word[2..-1]
-      when /^X/
-        a_word = 'Z' + a_word[1..-1]
-    end
-    first_char = a_word.split('')[0]
-    rest_chars = a_word.split('')[1..-1].join('')
-    rest_chars.gsub!('AE', 'I')
-    rest_chars.gsub!('IA', 'A')
-    rest_chars.gsub!('OE', 'I')
-    rest_chars.gsub!('OI', 'A')
-    rest_chars.gsub!('SC', 'S')
-    rest_chars.gsub!('H', '')
-    rest_chars.tr!('EOUYKZ', 'IAIICS')
-    a_word = (first_char + rest_chars).squeeze
+    def self.near_match(a_word, normalize_ending = false)
+      a_word = a_word.strip rescue ''
+      return '' if a_word == ''
+      a_word = Taxamatch::Normalizer.normalize a_word
+      case a_word
+        when /^AE/
+          a_word = 'E' + a_word[2..-1]
+        when /^CN/
+          a_word = 'N' + a_word[2..-1]
+        when /^CT/
+          a_word = 'T' + a_word[2..-1]
+        when /^CZ/
+          a_word = 'C' + a_word[2..-1]
+        when /^DJ/
+          a_word = 'J' + a_word[2..-1]
+        when /^EA/
+          a_word = 'E' + a_word[2..-1]
+        when /^EU/
+          a_word = 'U' + a_word[2..-1]
+        when /^GN/
+          a_word = 'N' + a_word[2..-1]
+        when /^KN/
+          a_word = 'N' + a_word[2..-1]
+        when /^MC/
+          a_word = 'MAC' + a_word[2..-1]
+        when /^MN/
+          a_word = 'N' + a_word[2..-1]
+        when /^OE/
+          a_word = 'E' + a_word[2..-1]
+        when /^QU/
+          a_word = 'Q' + a_word[2..-1]
+        when /^PS/
+          a_word = 'S' + a_word[2..-1]
+        when /^PT/
+          a_word = 'T' + a_word[2..-1]
+        when /^TS/
+          a_word = 'S' + a_word[2..-1]
+        when /^WR/
+          a_word = 'R' + a_word[2..-1]
+        when /^X/
+          a_word = 'Z' + a_word[1..-1]
+      end
+      first_char = a_word.split('')[0]
+      rest_chars = a_word.split('')[1..-1].join('')
+      rest_chars.gsub!('AE', 'I')
+      rest_chars.gsub!('IA', 'A')
+      rest_chars.gsub!('OE', 'I')
+      rest_chars.gsub!('OI', 'A')
+      rest_chars.gsub!('SC', 'S')
+      rest_chars.gsub!('H', '')
+      rest_chars.tr!('EOUYKZ', 'IAIICS')
+      a_word = (first_char + rest_chars).squeeze
-    if normalize_ending && a_word.size > 4
-      a_word = self.normalize_ending(a_word)
+      if normalize_ending && a_word.size > 4
+        a_word = self.normalize_ending(a_word)
+      end
+      a_word
     end
-    a_word
-  end
-  def self.normalize_ending(a_word)
-      # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
-      # -- at the end of a string translate all to -a
-      a_word.gsub!(/IS$/, 'A')
-      a_word.gsub!(/IM$/, 'A')
-      a_word.gsub(/AS$/, 'A')
-  end
+    def self.normalize_ending(a_word)
+        # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
+        # -- at the end of a string translate all to -a
+        a_word.gsub!(/IS$/, 'A')
+        a_word.gsub!(/IM$/, 'A')
+        a_word.gsub(/AS$/, 'A')
+    end
+  end
 end

data/lib/taxamatch_rb.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# encoding: UTF-8
 $:.unshift(File.dirname(__FILE__)) unless
    $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
 # $:.unshift('taxamatch_rb')
@@ -7,85 +8,91 @@ require 'taxamatch_rb/normalizer'
 require 'taxamatch_rb/phonetizer'
 require 'taxamatch_rb/authmatch'
-class Taxamatch
+$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
+module Taxamatch
+  class Base
-  def initialize
-    @parser = TaxamatchParser.new
-    @dlm = DamerauLevenshteinMod.new
-  end
+    def initialize
+      @parser = Taxamatch::Parser.new
+      @dlm = Taxamatch::DamerauLevenshteinMod.new
+    end
-  #takes two scientific names and returns true if names match and false if they don't
-  def taxamatch(str1, str2)
-    parsed_data_1 = @parser.parse(str1)
-    parsed_data_2 = @parser.parse(str2)
-    taxamatch_parsed_data(parsed_data_1, parsed_data_2)[:match]
-  end
+    #takes two scientific names and returns true if names match and false if they don't
+    def taxamatch(str1, str2)
+      parsed_data_1 = @parser.parse(str1)
+      parsed_data_2 = @parser.parse(str2)
+      taxamatch_parsed_data(parsed_data_1, parsed_data_2)[:match]
+    end
-  #takes two hashes of parsed scientific names, analyses them and returns back
-  #this function is useful when species strings are preparsed.
-  def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
-    result = nil
-    result =  match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
-    result =  match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
-    if result && result[:match]
-      result[:match] = match_authors(parsed_data_1, parsed_data_2) > 0 ? true : false
+    #takes two hashes of parsed scientific names, analyses them and returns back
+    #this function is useful when species strings are preparsed.
+    def taxamatch_parsed_data(parsed_data_1, parsed_data_2)
+      result = nil
+      result =  match_uninomial(parsed_data_1, parsed_data_2) if parsed_data_1[:uninomial] && parsed_data_2[:uninomial]
+      result =  match_multinomial(parsed_data_1, parsed_data_2) if parsed_data_1[:genus] && parsed_data_2[:genus]
+      if result && result[:match]
+        result[:match] = false if match_authors(parsed_data_1, parsed_data_2) == 0
+      end
+      return result
     end
-    return result
-  end
-  def match_uninomial(parsed_data_1, parsed_data_2)
-    return false
-  end
+    def match_uninomial(parsed_data_1, parsed_data_2)
+      return false
+    end
-  def match_multinomial(parsed_data_1, parsed_data_2)
-    gen_match = match_genera(parsed_data_1[:genus], parsed_data_2[:genus])
-    sp_match = match_species(parsed_data_1[:species], parsed_data_2[:species])
-    au_match = match_authors(parsed_data_1, parsed_data_2)
-    total_length = parsed_data_1[:genus][:epitheton].size + parsed_data_2[:genus][:epitheton].size + parsed_data_1[:species][:epitheton].size + parsed_data_2[:species][:epitheton].size
-    match = match_matches(gen_match, sp_match)
-    match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
-  end
+    def match_multinomial(parsed_data_1, parsed_data_2)
+      gen_match = match_genera(parsed_data_1[:genus], parsed_data_2[:genus])
+      sp_match = match_species(parsed_data_1[:species], parsed_data_2[:species])
+      au_match = match_authors(parsed_data_1, parsed_data_2)
+      total_length = parsed_data_1[:genus][:epitheton].size + parsed_data_2[:genus][:epitheton].size + parsed_data_1[:species][:epitheton].size + parsed_data_2[:species][:epitheton].size
+      match = match_matches(gen_match, sp_match)
+      match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
+    end
-  def match_genera(genus1, genus2)
-    genus1_length = genus1[:normalized].size
-    genus2_length = genus2[:normalized].size
-    match = false
-    ed = @dlm.distance(genus1[:normalized], genus2[:normalized],2,3)
-    return {:edit_distance => ed, :phonetic_match => true, :match => true} if genus1[:phonetized] == genus2[:phonetized]
+    def match_genera(genus1, genus2)
+      genus1_length = genus1[:normalized].size
+      genus2_length = genus2[:normalized].size
+      match = false
+      ed = @dlm.distance(genus1[:normalized], genus2[:normalized],2,3)
+      return {:edit_distance => ed, :phonetic_match => true, :match => true} if genus1[:phonetized] == genus2[:phonetized]
-    match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
-    {:edit_distance => ed, :match => match, :phonetic_match => false}
-  end
+      match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
+      {:edit_distance => ed, :match => match, :phonetic_match => false}
+    end
-  def match_species(sp1, sp2)
-    sp1_length = sp1[:normalized].size
-    sp2_length = sp2[:normalized].size
-    sp1[:phonetized] = Phonetizer.normalize_ending sp1[:phonetized]
-    sp2[:phonetized] = Phonetizer.normalize_ending sp2[:phonetized]
-    match = false
-    ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 4, 4)
-    return {:edit_distance => ed, :phonetic_match => true, :match => true} if sp1[:phonetized] == sp2[:phonetized]
+    def match_species(sp1, sp2)
+      sp1_length = sp1[:normalized].size
+      sp2_length = sp2[:normalized].size
+      sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
+      sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
+      match = false
+      ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 4, 4)
+      return {:edit_distance => ed, :phonetic_match => true, :match => true} if sp1[:phonetized] == sp2[:phonetized]
-    match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
-    {:edit_distance => ed, :match => match, :phonetic_match => false}
-  end
+      match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
+      {:edit_distance => ed, :match => match, :phonetic_match => false}
+    end
-  def match_authors(parsed_data_1, parsed_data_2)
-    au1 = parsed_data_1[:all_authors]
-    au2 = parsed_data_2[:all_authors]
-    yr1 = parsed_data_1[:all_years]
-    yr2 = parsed_data_2[:all_years]
-    Authmatch.authmatch(au1, au2, yr1, yr2)
-  end
+    def match_authors(parsed_data_1, parsed_data_2)
+      au1 = parsed_data_1[:all_authors]
+      au2 = parsed_data_2[:all_authors]
+      yr1 = parsed_data_1[:all_years]
+      yr2 = parsed_data_2[:all_years]
+      Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
+    end
-  def match_matches(genus_match, species_match, infraspecies_matches = [])
-    match = species_match
-    match[:edit_distance] += genus_match[:edit_distance]
-    match[:match] = false if match[:edit_distance] > 4
-    match[:match] &&= genus_match[:match]
-    match[:phonetic_match] &&= genus_match[:phonetic_match]
-    match
+    def match_matches(genus_match, species_match, infraspecies_matches = [])
+      match = species_match
+      match[:edit_distance] += genus_match[:edit_distance]
+      match[:match] = false if match[:edit_distance] > 4
+      match[:match] &&= genus_match[:match]
+      match[:phonetic_match] &&= genus_match[:phonetic_match]
+      match
+    end
   end
-end
+end

data/spec/damerau_levenshtein_mod_test.txt CHANGED Viewed

@@ -28,6 +28,8 @@ Pomatomus|pomatomus|10|1|1
 Pomatomus||10|1|9
 |Pomatomus|10|1|9
 P|p|10|1|1
+#TODO: one letter vs longer string generates a big negative number
+#L|Linneaus|10|1|7
 #it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)

data/spec/spec_helper.rb CHANGED Viewed

@@ -23,6 +23,6 @@ def read_test_file(file, fields_num)
 end
 def make_taxamatch_hash(string)
-  normalized = Normalizer.normalize(string)
-  {:epitheton => string, :normalized => normalized, :phonetized => Phonetizer.near_match(normalized)}
+  normalized = Taxamatch::Normalizer.normalize(string)
+  {:epitheton => string, :normalized => normalized, :phonetized => Taxamatch::Phonetizer.near_match(normalized)}
 end

data/spec/taxamatch_rb_spec.rb CHANGED Viewed

@@ -1,10 +1,10 @@
 # encoding: UTF-8
 require File.dirname(__FILE__) + '/spec_helper.rb'
-describe 'DamerauLevensteinMod' do
+describe 'DamerauLevenshteinMod' do
   it 'should get tests' do
     read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt', 5) do |y|
-      dl = DamerauLevenshteinMod.new
+      dl = Taxamatch::DamerauLevenshteinMod.new
       if y
         res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
         puts y if res != y[4].to_i
@@ -16,7 +16,7 @@ end
 describe 'Parser' do
   before(:all) do
-    @parser =TaxamatchParser.new
+    @parser = Taxamatch::Parser.new
   end
   it 'should parse uninomials' do
@@ -35,27 +35,27 @@ describe 'Parser' do
 end
-describe 'Normalizer' do
+describe 'Taxamatch::Normalizer' do
   it 'should normalize  strings' do
-    Normalizer.normalize('abcd').should == 'ABCD'
-    Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
-    Normalizer.normalize('Ærenea').should == 'AERENEA'
-    Normalizer.normalize('Fallén').should == 'FALLEN'
-    Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
+    Taxamatch::Normalizer.normalize('abcd').should == 'ABCD'
+    Taxamatch::Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
+    Taxamatch::Normalizer.normalize('Ærenea').should == 'AERENEA'
+    Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
+    Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
   end
   it 'should normalize words' do
-    Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
+    Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
   end
 end
-describe 'Taxamatch' do
+describe 'Taxamatch::Base' do
   before(:all) do
-    @tm = Taxamatch.new
+    @tm = Taxamatch::Base.new
   end
   it 'should get txt tests' do
-    dl = DamerauLevenshteinMod.new
+    dl = Taxamatch::DamerauLevenshteinMod.new
     read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 3) do |y|
       if y
         y[2] = y[2] == 'true' ? true : false
@@ -174,14 +174,40 @@ describe 'Taxamatch' do
     @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
   end
-  describe 'Authmatch' do
+  describe 'Taxamatch::Authmatch' do
     before(:all) do
-      @am = Authmatch
+      @am = Taxamatch::Authmatch
     end
     it 'should calculate score' do
-      res = @am.authmatch(['Linnaeus', 'Muller'], ['L', 'Kenn'], [], [1788])
+      res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
       res.should == 90
+      res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
+      res.should == 0
+      #found all authors, same year
+      res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
+      res.should == 100
+      #all authors, 1 year diff
+      res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
+      res.should == 54
+      #year is not counted in
+      res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
+      res.should == 94
+      #found all authors on one side, same year
+      res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
+      res.should == 91
+      #found all authors on one side, 1 year diff
+      res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
+      res.should == 51
+      #found all authors on one side, year does not count
+      res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
+      res.should == 90
+      #found some authors
+      res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
+      res.should == 67
+      #if year does not match or not present no match for previous case
+      res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
+      res.should == 0
     end
     it 'should compare years' do
@@ -205,7 +231,18 @@ describe 'Taxamatch' do
       #fuzzy match
       res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
       res.should == [["Dem"], ["Stepanov"]]
+      res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
+      res.should == [['Linnaeus', 'Muller'], ['Kenn']]
+      res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
+      res.should == [[],['Kurtz']]
+    end
+    it 'should fuzzy match authors' do
+      #TODO: fix the bug revealed by this test
+      # res = @am.fuzzy_match_authors('L', 'Muller')
+      # res.should be_false
     end
   end
 end

data/spec/taxamatch_test.txt CHANGED Viewed

@@ -16,4 +16,4 @@ Pomatomus saltator|Pomatomus saltatrix|true
 Loligo pealeii|Loligo plei|false
 # different authors should not match
-#Puma concolor Linnaeus|Puma concolor Kurtz|false
+Puma concolor Linnaeus|Puma concolor Kurtz|false

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: dimus-taxamatch_rb
 version: !ruby/object:Gem::Version
-  version: 0.1.7
+  version: 0.5.0
 platform: ruby
 authors:
 - Dmitry Mozzherin