RubyGems - taxamatch_rb - Versions diffs - 0.9.10 → 1.0.0 - Mend

taxamatch_rb 0.9.10 → 1.0.0

Files changed (15)

data/CHANGELOG +5 -2
data/Gemfile +14 -16
data/Gemfile.lock +18 -19
data/LICENSE +1 -1
data/{README.rdoc → README.md} +26 -7
data/Rakefile +11 -9
data/VERSION +1 -1
data/lib/taxamatch_rb.rb +76 -43
data/lib/taxamatch_rb/atomizer.rb +19 -10
data/lib/taxamatch_rb/authmatch.rb +29 -16
data/lib/taxamatch_rb/normalizer.rb +4 -4
data/lib/taxamatch_rb/phonetizer.rb +9 -8
data/spec/taxamatch_rb_spec.rb +223 -109
data/taxamatch_rb.gemspec +11 -41
metadata +11 -171

data/CHANGELOG CHANGED

@@ -1,11 +1,14 @@
+1.0.0 - fixed a parsing problem with infraspecies without string,
+upgraded version to 1 because the signature of the gem did stabilized
 0.9.8 - fixed a parsing problem with species nodes without name
 0.9.4 - updated parser (to 1.0.16), updated code to ruby 1.9.3
-0.9.3 - Taxamatch::Normalizer substitutes multiplication sign to 'x'
+0.9.3 - Taxamatch::Normalizer substitutes multiplication sign to 'x'
 (lowcase) instead of '?'
-0.9.2 - Taxamatch::Normalizer.normalize always returns only ASCII
+0.9.2 - Taxamatch::Normalizer.normalize always returns only ASCII
 characters, all utf-8 characters unknown to normalizer are becoming '?'
 0.9.1 - updated gems

data/Gemfile CHANGED

@@ -1,21 +1,19 @@
-source "http://rubygems.org"
+source 'https://rubygems.org'
 require 'yaml'
-# YAML::ENGINE.yamler= 'syck'
-gem "biodiversity19","~> 2.1"
-gem "damerau-levenshtein", ">= 0.5.4"
+gem 'biodiversity','~> 3.0.1'
+gem 'damerau-levenshtein', '~> 0.5.4'
 gem 'json', '~> 1.7.7'
-group :development do
-  gem "rake"
-  gem "rake-compiler"
-  gem "rspec"
-  gem "cucumber", ">= 0"
-  gem "bundler", "~> 1.3"
-  gem "jeweler", "~> 1.6.0"
-  gem "debugger"
-  gem "ruby-prof"
-  gem "shoulda"
-  gem "mocha"
+group :test do
+  gem 'rake', '~> 10.0'
+  gem 'rake-compiler', '~> 0.8'
+  gem 'rspec', '~> 2.13'
+  gem 'cucumber', '~> 1.3'
+  gem 'bundler', '~> 1.3'
+  gem 'jeweler', '~> 1.8'
+  gem 'debugger', '~> 1.5'
+  gem 'ruby-prof', '~> 0.13'
+  gem 'shoulda', '~> 3.5'
+  gem 'mocha', '~> 0.13'
 end

data/Gemfile.lock CHANGED

@@ -1,10 +1,10 @@
 GEM
-  remote: http://rubygems.org/
+  remote: https://rubygems.org/
   specs:
     activesupport (3.2.13)
       i18n (= 0.6.1)
       multi_json (~> 1.0)
-    biodiversity19 (2.1.0)
+    biodiversity (3.0.1)
       parallel
       parallel (~> 0.6)
       rake (~> 10.0)
@@ -18,11 +18,7 @@ GEM
       diff-lcs (>= 1.1.3)
       gherkin (~> 2.12.0)
       multi_json (~> 1.3)
-    damerau-levenshtein (1.0.0)
-      bundler (~> 1)
-      jeweler (~> 1)
-      rake (~> 10)
-      rake-compiler (~> 0.8)
+    damerau-levenshtein (0.5.4)
     debugger (1.5.0)
       columnize (>= 0.3.1)
       debugger-linecache (~> 1.2.0)
@@ -34,10 +30,11 @@ GEM
       multi_json (~> 1.3)
     git (1.2.5)
     i18n (0.6.1)
-    jeweler (1.6.4)
+    jeweler (1.8.4)
       bundler (~> 1.0)
       git (>= 1.2.5)
       rake
+      rdoc
     json (1.7.7)
     metaclass (0.0.1)
     mocha (0.13.3)
@@ -48,6 +45,8 @@ GEM
     rake (10.0.4)
     rake-compiler (0.8.3)
       rake
+    rdoc (4.0.1)
+      json (~> 1.4)
     rspec (2.13.0)
       rspec-core (~> 2.13.0)
       rspec-expectations (~> 2.13.0)
@@ -72,16 +71,16 @@ PLATFORMS
   ruby
 DEPENDENCIES
-  biodiversity19 (~> 2.1)
+  biodiversity (~> 3.0.1)
   bundler (~> 1.3)
-  cucumber
-  damerau-levenshtein (>= 0.5.4)
-  debugger
-  jeweler (~> 1.6.0)
+  cucumber (~> 1.3)
+  damerau-levenshtein (~> 0.5.4)
+  debugger (~> 1.5)
+  jeweler (~> 1.8)
   json (~> 1.7.7)
-  mocha
-  rake
-  rake-compiler
-  rspec
-  ruby-prof
-  shoulda
+  mocha (~> 0.13)
+  rake (~> 10.0)
+  rake-compiler (~> 0.8)
+  rspec (~> 2.13)
+  ruby-prof (~> 0.13)
+  shoulda (~> 3.5)

data/LICENSE CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2009 Dmitry Mozzherin
+Copyright (c) 2009-2013 Marine Biological Laboratory
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/{README.rdoc → README.md} RENAMED

@@ -1,8 +1,16 @@
-= taxamatch_rb
+Taxamatch_Rb
+============
-Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
+[![Gem Version][1]][2]
+[![Continuous Integration Status][3]][4]
+[![Dependency Status][5]][6]
-The purpose of Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out if they actually point to the same scientific name.
+Taxamatch_Rb is a ruby implementation of Taxamatch algorithms
+[developed by Tony Rees][7]:
+The purpose of Taxamatch gem is to facilitate fuzzy comparison of
+two scientific name renderings to find out if they actually point to
+the same scientific name.
     require 'taxamatch_rb'
     tm = Taxamatch::Base.new
@@ -12,11 +20,13 @@ The purpose of Taxamatch gem is to facilitate fuzzy comparison of two scientific
 Taxamatch_Rb is compatible with ruby versions 1.9.1 and higher
-== Installation
+Installation
+------------
     sudo gem install taxamatch_rb
-== Usage
+Usage
+-----
     require 'taxamatch_rb'
@@ -51,6 +61,15 @@ Taxamatch_Rb is compatible with ruby versions 1.9.1 and higher
 You can find more examples in spec section of the code
-== Copyright
+Copyright
+---------
+Copyright (c) 2009-2013 Marine Biological Laboratory. See LICENSE for details.
-Copyright (c) 2009 Dmitry Mozzherin. See LICENSE for details.
+[1]: https://badge.fury.io/rb/taxamatch_rb.png
+[2]: http://badge.fury.io/rb/taxamatch_rb
+[3]: https://secure.travis-ci.org/GlobalNamesArchitecture/taxamatch_rb.png
+[4]: http://travis-ci.org/GlobalNamesArchitecture/taxamatch_rb
+[5]: https://gemnasium.com/GlobalNamesArchitecture/taxamatch_rb.png
+[6]: https://gemnasium.com/GlobalNamesArchitecture/taxamatch_rb
+[7]: http://www.cmar.csiro.au/datacentre/taxamatch.htm

data/Rakefile CHANGED

@@ -5,7 +5,7 @@ begin
   Bundler.setup(:default, :development)
 rescue Bundler::BundlerError => e
   $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
+  $stderr.puts 'Run `bundle install` to install missing gems'
   exit e.status_code
 end
@@ -14,21 +14,23 @@ require 'rake'
 begin
   require 'jeweler'
   Jeweler::Tasks.new do |gem|
-    gem.name = "taxamatch_rb"
+    gem.name = 'taxamatch_rb'
     gem.summary = 'Implementation of Tony Rees Taxamatch algorithms'
-    gem.description = 'This gem implements algorithm for fuzzy matching scientific names developed by Tony Rees'
-    gem.email = "dmozzherin@eol.org"
-    gem.homepage = "http://github.com/GlobalNamesArchitecture/taxamatch_rb"
-    gem.authors = ["Dmitry Mozzherin"]
-    gem.files = FileList["[A-Z]*", "*.gemspec", "{bin,generators,lib,spec}/**/*"]
+    gem.description = 'This gem implements algorithm ' +
+      'for fuzzy matching scientific names developed by Tony Rees'
+    gem.email = 'dmozzherin@gmail.com'
+    gem.homepage = 'http://github.com/GlobalNamesArchitecture/taxamatch_rb'
+    gem.authors = ['Dmitry Mozzherin']
+    gem.files = FileList['[A-Z]*',
+      '*.gemspec', '{bin,generators,lib,spec}/**/*']
     gem.files -= FileList['lib/**/*.bundle', 'lib/**/*.dll', 'lib/**/*.so']
     gem.files += FileList['ext/**/*.c']
     gem.extensions = FileList['ext/**/extconf.rb']
-    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
   end
 rescue LoadError
-  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+  puts 'Jeweler (or a dependency) not available.' +
+  ' Install it with: sudo gem install jeweler'
 end
 require 'rspec/core'

data/VERSION CHANGED

@@ -1 +1 @@
-0.9.10
+1.0.0

data/lib/taxamatch_rb.rb CHANGED

@@ -1,6 +1,7 @@
 # encoding: UTF-8
 $:.unshift(File.dirname(__FILE__)) unless
-   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+   $:.include?(File.dirname(__FILE__)) ||
+   $:.include?(File.expand_path(File.dirname(__FILE__)))
 # $:.unshift('taxamatch_rb')
 require 'damerau-levenshtein'
 require 'taxamatch_rb/atomizer'
@@ -8,8 +9,9 @@ require 'taxamatch_rb/normalizer'
 require 'taxamatch_rb/phonetizer'
 require 'taxamatch_rb/authmatch'
-raise "IMPORTANT: Parsley-store gem  requires ruby >= 1.9.1" if RUBY_VERSION < "1.9.1"
-$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
+if RUBY_VERSION < '1.9.1'
+  raise 'IMPORTANT: Parsley-store gem  requires ruby >= 1.9.1'
+end
 module Taxamatch
@@ -21,7 +23,8 @@ module Taxamatch
     end
-    #takes two scientific names and returns true if names match and false if they don't
+    # takes two scientific names and returns true
+    # if names match and false if they don't
     def taxamatch(str1, str2, return_boolean = true)
       preparsed_1 = @parser.parse(str1)
       preparsed_2 = @parser.parse(str2)
@@ -29,14 +32,19 @@ module Taxamatch
       return_boolean ? (!!match && match['match']) : match
     end
-    #takes two hashes of parsed scientific names, analyses them and returns back
-    #this function is useful when species strings are preparsed.
+    # takes two hashes of parsed scientific names, analyses them and
+    # returns back this function is useful when species strings are preparsed.
     def taxamatch_preparsed(preparsed_1, preparsed_2)
       result = nil
-      result =  match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
-      result =  match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
+      if preparsed_1[:uninomial] && preparsed_2[:uninomial]
+        result =  match_uninomial(preparsed_1, preparsed_2)
+      end
+      if preparsed_1[:genus] && preparsed_2[:genus]
+        result =  match_multinomial(preparsed_1, preparsed_2)
+      end
       if result && result['match']
-        result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ? false : true
+        result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ?
+          false : true
       end
       return result
     end
@@ -48,65 +56,89 @@ module Taxamatch
     def match_multinomial(preparsed_1, preparsed_2)
       gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
       sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
-      total_length = preparsed_1[:genus][:string].size + preparsed_2[:genus][:string].size + preparsed_1[:species][:string].size + preparsed_2[:species][:string].size
+      total_length = preparsed_1[:genus][:string].size +
+        preparsed_2[:genus][:string].size +
+        preparsed_1[:species][:string].size +
+        preparsed_2[:species][:string].size
       if preparsed_1[:infraspecies] && preparsed_2[:infraspecies]
-        infrasp_match = match_species(preparsed_1[:infraspecies][0], preparsed_2[:infraspecies][0])
-        total_length += preparsed_1[:infraspecies][0][:string].size + preparsed_2[:infraspecies][0][:string].size
+        infrasp_match = match_species(preparsed_1[:infraspecies][0],
+                                      preparsed_2[:infraspecies][0])
+        total_length += preparsed_1[:infraspecies][0][:string].size +
+          preparsed_2[:infraspecies][0][:string].size
         match_hash = match_matches(gen_match, sp_match, infrasp_match)
-      elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) || (!preparsed_1[:infraspecies] && preparsed_2[:infraspecies])
-        match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
-        total_length += preparsed_1[:infraspecies] ? preparsed_1[:infraspecies][0][:string].size : preparsed_2[:infraspecies][0][:string].size
+      elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) ||
+        (!preparsed_1[:infraspecies] && preparsed_2[:infraspecies])
+        match_hash = { 'match' => false,
+          'edit_distance' => 5,
+          'phonetic_match' => false }
+        total_length += preparsed_1[:infraspecies] ?
+          preparsed_1[:infraspecies][0][:string].size :
+          preparsed_2[:infraspecies][0][:string].size
       else
         match_hash = match_matches(gen_match, sp_match)
       end
-      match_hash.merge({'score' => (1 - match_hash['edit_distance']/(total_length/2))})
+      match_hash.merge({ 'score' =>
+                       (1 - match_hash['edit_distance']/(total_length/2)) })
       match_hash
     end
     def match_genera(genus1, genus2, opts = {})
       genus1_length = genus1[:normalized].size
       genus2_length = genus2[:normalized].size
-      opts = {:with_phonetic_match => true}.merge(opts)
+      opts = { with_phonetic_match: true }.merge(opts)
       min_length = [genus1_length, genus2_length].min
-      unless opts[:with_phonetic_match]
-        genus1[:phonetized] = "A"
-        genus2[:phonetized] = "B"
+      unless opts[:with_phonetic_match]
+        genus1[:phonetized] = 'A'
+        genus2[:phonetized] = 'B'
       end
       match = false
-      ed = @dlm.distance(genus1[:normalized], genus2[:normalized], 1, 3) #TODO put block = 2
-      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
-      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
-      match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
-      {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
+      ed = @dlm.distance(genus1[:normalized],
+                         genus2[:normalized], 1, 3) #TODO put block = 2
+      return { 'edit_distance' => ed,
+        'phonetic_match' => false,
+        'match' => false } if ed/min_length.to_f > 0.2
+      return { 'edit_distance' => ed,
+        'phonetic_match' => true,
+        'match' => true } if genus1[:phonetized] == genus2[:phonetized]
+      match = true if ed <= 3 && (min_length > ed * 2) &&
+        (ed < 2 || genus1[0] == genus2[0])
+      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false }
     end
     def match_species(sp1, sp2, opts = {})
       sp1_length = sp1[:normalized].size
       sp2_length = sp2[:normalized].size
-      opts = {:with_phonetic_match => true}.merge(opts)
+      opts = { with_phonetic_match: true }.merge(opts)
       min_length = [sp1_length, sp2_length].min
       unless opts[:with_phonetic_match]
-        sp1[:phonetized] = "A"
-        sp2[:phonetized] = "B"
-      end
+        sp1[:phonetized] = 'A'
+        sp2[:phonetized] = 'B'
+      end
       sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
       sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
       match = false
-      ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4
-      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.3334
-      #puts 's: %s, %s, %s' % [sp1[:normalized], sp2[:normalized], ed]
-      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
-      match = true if ed <= 4 && (min_length >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
-      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
+      ed = @dlm.distance(sp1[:normalized],
+                         sp2[:normalized], 1, 4) #TODO put block 4
+      return { 'edit_distance' => ed,
+        'phonetic_match' => false,
+        'match' => false } if ed/min_length.to_f > 0.3334
+      return {'edit_distance' => ed,
+        'phonetic_match' => true,
+        'match' => true} if sp1[:phonetized] == sp2[:phonetized]
+      match = true if ed <= 4 &&
+        (min_length >= ed * 2) &&
+        (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) &&
+        (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
+      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false }
     end
     def match_authors(preparsed_1, preparsed_2)
-      p1 = { :normalized_authors => [], :years => [] }
-      p2 = { :normalized_authors => [], :years => [] }
+      p1 = { normalized_authors: [], years: [] }
+      p2 = { normalized_authors: [], years: [] }
       if preparsed_1[:infraspecies] || preparsed_2[:infraspecies]
-        p1 = preparsed_1[:infraspecies].last if preparsed_1[:infraspecies]
+        p1 = preparsed_1[:infraspecies].last if preparsed_1[:infraspecies]
         p2 = preparsed_2[:infraspecies].last if preparsed_2[:infraspecies]
       elsif preparsed_1[:species] || preparsed_2[:species]
         p1 = preparsed_1[:species] if preparsed_1[:species]
@@ -119,7 +151,7 @@ module Taxamatch
       au2 = p2[:normalized_authors]
       yr1 = p1[:years]
       yr2 = p2[:years]
-      return 0 if au1.empty? || au2.empty?
+      return 0 if au1.empty? || au2.empty?
       score = Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
       score == 0 ? -1 : 1
     end
@@ -132,12 +164,13 @@ module Taxamatch
         match['phonetic_match'] &&= infraspecies_match['phonetic_match']
       end
       match['edit_distance'] += genus_match['edit_distance']
-      match['match'] = false if match['edit_distance'] > (infraspecies_match ? 6 : 4)
+      if match['edit_distance'] > (infraspecies_match ? 6 : 4)
+        match['match'] = false
+      end
       match['match'] &&= genus_match['match']
       match['phonetic_match'] &&= genus_match['phonetic_match']
       match
     end
   end
 end

data/lib/taxamatch_rb/atomizer.rb CHANGED

@@ -9,12 +9,12 @@ module Taxamatch
       @parsed_raw = nil
       @res = {}
     end
     def parse(name)
       @parsed_raw = @parser.parse(name)[:scientificName]
       organize_results(@parsed_raw)
     end
     def parsed_raw
       return @parsed_raw
     end
@@ -29,11 +29,13 @@ module Taxamatch
       process_node(:genus, d[:genus])
       process_node(:species, d[:species], true)
       process_infraspecies(d[:infraspecies])
-      @res[:all_authors] = @res[:all_authors].uniq.map {|a| Taxamatch::Normalizer.normalize(a)}
+      @res[:all_authors] = @res[:all_authors].uniq.map do |a|
+        Taxamatch::Normalizer.normalize(a)
+      end
       @res[:all_years].uniq!
       @res.keys.size > 2 ? @res : nil
     end
     private
     def process_node(name, node, is_species = false)
@@ -41,14 +43,16 @@ module Taxamatch
       @res[name] = {}
       @res[name][:string] = node[:string]
       @res[name][:normalized] = Taxamatch::Normalizer.normalize(node[:string])
-      @res[name][:phonetized] = Taxamatch::Phonetizer.near_match(node[:string], is_species)
+      @res[name][:phonetized] =
+        Taxamatch::Phonetizer.near_match(node[:string], is_species)
       get_authors_years(node, @res[name])
     end
     def process_infraspecies(node)
       return unless node
       @res[:infraspecies] = []
       node.each do |infr|
+        next unless infr[:string]
         hsh = {}
         hsh[:string] = infr[:string]
         hsh[:normalized] = Taxamatch::Normalizer.normalize(infr[:string])
@@ -57,7 +61,7 @@ module Taxamatch
         @res[:infraspecies] << hsh
       end
     end
     def get_authors_years(node, res)
       res[:authors] = []
       res[:years] = []
@@ -71,16 +75,21 @@ module Taxamatch
           if node[au][:exAuthorTeam]
             res[:authors] += node[au][:exAuthorTeam][:author]
             if node[au][:exAuthorTeam][:year]
-              year = Taxamatch::Normalizer.normalize_year(node[au][:exAuthorTeam][:year])
+              year = node[au][:exAuthorTeam][:year]
+              year = Taxamatch::Normalizer.normalize_year(year)
               res[:years] << year if year
             end
           end
         end
       end
       res[:authors].uniq!
-      res[:normalized_authors] = res[:authors].map {|a| Taxamatch::Normalizer.normalize_author(a)}
+      res[:normalized_authors] = res[:authors].map do |a|
+        Taxamatch::Normalizer.normalize_author(a)
+      end
       res[:years].uniq!
-      @res[:all_authors] += res[:normalized_authors] if res[:normalized_authors].size > 0
+      if res[:normalized_authors].size > 0
+        @res[:all_authors] += res[:normalized_authors]
+      end
       @res[:all_years] += res[:years] if res[:years].size > 0
     end