RubyGems - taxamatch_rb - Versions diffs - 0.9.10 → 1.0.0 - Mend

taxamatch_rb 0.9.10 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

data/CHANGELOG +5 -2
data/Gemfile +14 -16
data/Gemfile.lock +18 -19
data/LICENSE +1 -1
data/{README.rdoc → README.md} +26 -7
data/Rakefile +11 -9
data/VERSION +1 -1
data/lib/taxamatch_rb.rb +76 -43
data/lib/taxamatch_rb/atomizer.rb +19 -10
data/lib/taxamatch_rb/authmatch.rb +29 -16
data/lib/taxamatch_rb/normalizer.rb +4 -4
data/lib/taxamatch_rb/phonetizer.rb +9 -8
data/spec/taxamatch_rb_spec.rb +223 -109
data/taxamatch_rb.gemspec +11 -41
metadata +11 -171

data/CHANGELOG CHANGED

@@ -1,11 +1,14 @@
+1.0.0 - fixed a parsing problem with infraspecies without string,
+upgraded version to 1 because the signature of the gem has stabilized
 0.9.8 - fixed a parsing problem with species nodes without name
 0.9.4 - updated parser (to 1.0.16), updated code to ruby 1.9.3
-0.9.3 - Taxamatch::Normalizer substitutes multiplication sign to 'x'
+0.9.3 - Taxamatch::Normalizer substitutes multiplication sign to 'x'
 (lowcase) instead of '?'
-0.9.2 - Taxamatch::Normalizer.normalize always returns only ASCII
+0.9.2 - Taxamatch::Normalizer.normalize always returns only ASCII
 characters, all utf-8 characters unknown to normalizer are becoming '?'
 0.9.1 - updated gems

data/Gemfile CHANGED

@@ -1,21 +1,19 @@
-source "http://rubygems.org"
+source 'https://rubygems.org'
 require 'yaml'
-# YAML::ENGINE.yamler= 'syck'
-gem "biodiversity19","~> 2.1"
-gem "damerau-levenshtein", ">= 0.5.4"
+gem 'biodiversity','~> 3.0.1'
+gem 'damerau-levenshtein', '~> 0.5.4'
 gem 'json', '~> 1.7.7'
-group :development do
-  gem "rake"
-  gem "rake-compiler"
-  gem "rspec"
-  gem "cucumber", ">= 0"
-  gem "bundler", "~> 1.3"
-  gem "jeweler", "~> 1.6.0"
-  gem "debugger"
-  gem "ruby-prof"
-  gem "shoulda"
-  gem "mocha"
+group :test do
+  gem 'rake', '~> 10.0'
+  gem 'rake-compiler', '~> 0.8'
+  gem 'rspec', '~> 2.13'
+  gem 'cucumber', '~> 1.3'
+  gem 'bundler', '~> 1.3'
+  gem 'jeweler', '~> 1.8'
+  gem 'debugger', '~> 1.5'
+  gem 'ruby-prof', '~> 0.13'
+  gem 'shoulda', '~> 3.5'
+  gem 'mocha', '~> 0.13'
 end

data/Gemfile.lock CHANGED

@@ -1,10 +1,10 @@
 GEM
-  remote: http://rubygems.org/
+  remote: https://rubygems.org/
   specs:
     activesupport (3.2.13)
       i18n (= 0.6.1)
       multi_json (~> 1.0)
-    biodiversity19 (2.1.0)
+    biodiversity (3.0.1)
       parallel
       parallel (~> 0.6)
       rake (~> 10.0)
@@ -18,11 +18,7 @@ GEM
       diff-lcs (>= 1.1.3)
       gherkin (~> 2.12.0)
       multi_json (~> 1.3)
-    damerau-levenshtein (1.0.0)
-      bundler (~> 1)
-      jeweler (~> 1)
-      rake (~> 10)
-      rake-compiler (~> 0.8)
+    damerau-levenshtein (0.5.4)
     debugger (1.5.0)
       columnize (>= 0.3.1)
       debugger-linecache (~> 1.2.0)
@@ -34,10 +30,11 @@ GEM
       multi_json (~> 1.3)
     git (1.2.5)
     i18n (0.6.1)
-    jeweler (1.6.4)
+    jeweler (1.8.4)
       bundler (~> 1.0)
       git (>= 1.2.5)
       rake
+      rdoc
     json (1.7.7)
     metaclass (0.0.1)
     mocha (0.13.3)
@@ -48,6 +45,8 @@ GEM
     rake (10.0.4)
     rake-compiler (0.8.3)
       rake
+    rdoc (4.0.1)
+      json (~> 1.4)
     rspec (2.13.0)
       rspec-core (~> 2.13.0)
       rspec-expectations (~> 2.13.0)
@@ -72,16 +71,16 @@ PLATFORMS
   ruby
 DEPENDENCIES
-  biodiversity19 (~> 2.1)
+  biodiversity (~> 3.0.1)
   bundler (~> 1.3)
-  cucumber
-  damerau-levenshtein (>= 0.5.4)
-  debugger
-  jeweler (~> 1.6.0)
+  cucumber (~> 1.3)
+  damerau-levenshtein (~> 0.5.4)
+  debugger (~> 1.5)
+  jeweler (~> 1.8)
   json (~> 1.7.7)
-  mocha
-  rake
-  rake-compiler
-  rspec
-  ruby-prof
-  shoulda
+  mocha (~> 0.13)
+  rake (~> 10.0)
+  rake-compiler (~> 0.8)
+  rspec (~> 2.13)
+  ruby-prof (~> 0.13)
+  shoulda (~> 3.5)

data/LICENSE CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2009 Dmitry Mozzherin
+Copyright (c) 2009-2013 Marine Biological Laboratory
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/{README.rdoc → README.md} RENAMED

@@ -1,8 +1,16 @@
-= taxamatch_rb
+Taxamatch_Rb
+============
-Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
+[![Gem Version][1]][2]
+[![Continuous Integration Status][3]][4]
+[![Dependency Status][5]][6]
-The purpose of Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out if they actually point to the same scientific name.
+Taxamatch_Rb is a ruby implementation of Taxamatch algorithms
+[developed by Tony Rees][7]:
+The purpose of Taxamatch gem is to facilitate fuzzy comparison of
+two scientific name renderings to find out if they actually point to
+the same scientific name.
     require 'taxamatch_rb'
     tm = Taxamatch::Base.new
@@ -12,11 +20,13 @@ The purpose of Taxamatch gem is to facilitate fuzzy comparison of two scientific
 Taxamatch_Rb is compatible with ruby versions 1.9.1 and higher
-== Installation
+Installation
+------------
     sudo gem install taxamatch_rb
-== Usage
+Usage
+-----
     require 'taxamatch_rb'
@@ -51,6 +61,15 @@ Taxamatch_Rb is compatible with ruby versions 1.9.1 and higher
 You can find more examples in spec section of the code
-== Copyright
+Copyright
+---------
+Copyright (c) 2009-2013 Marine Biological Laboratory. See LICENSE for details.
-Copyright (c) 2009 Dmitry Mozzherin. See LICENSE for details.
+[1]: https://badge.fury.io/rb/taxamatch_rb.png
+[2]: http://badge.fury.io/rb/taxamatch_rb
+[3]: https://secure.travis-ci.org/GlobalNamesArchitecture/taxamatch_rb.png
+[4]: http://travis-ci.org/GlobalNamesArchitecture/taxamatch_rb
+[5]: https://gemnasium.com/GlobalNamesArchitecture/taxamatch_rb.png
+[6]: https://gemnasium.com/GlobalNamesArchitecture/taxamatch_rb
+[7]: http://www.cmar.csiro.au/datacentre/taxamatch.htm

data/Rakefile CHANGED

@@ -5,7 +5,7 @@ begin
   Bundler.setup(:default, :development)
 rescue Bundler::BundlerError => e
   $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
+  $stderr.puts 'Run `bundle install` to install missing gems'
   exit e.status_code
 end
@@ -14,21 +14,23 @@ require 'rake'
 begin
   require 'jeweler'
   Jeweler::Tasks.new do |gem|
-    gem.name = "taxamatch_rb"
+    gem.name = 'taxamatch_rb'
     gem.summary = 'Implementation of Tony Rees Taxamatch algorithms'
-    gem.description = 'This gem implements algorithm for fuzzy matching scientific names developed by Tony Rees'
-    gem.email = "dmozzherin@eol.org"
-    gem.homepage = "http://github.com/GlobalNamesArchitecture/taxamatch_rb"
-    gem.authors = ["Dmitry Mozzherin"]
-    gem.files = FileList["[A-Z]*", "*.gemspec", "{bin,generators,lib,spec}/**/*"]
+    gem.description = 'This gem implements algorithm ' +
+      'for fuzzy matching scientific names developed by Tony Rees'
+    gem.email = 'dmozzherin@gmail.com'
+    gem.homepage = 'http://github.com/GlobalNamesArchitecture/taxamatch_rb'
+    gem.authors = ['Dmitry Mozzherin']
+    gem.files = FileList['[A-Z]*',
+      '*.gemspec', '{bin,generators,lib,spec}/**/*']
     gem.files -= FileList['lib/**/*.bundle', 'lib/**/*.dll', 'lib/**/*.so']
     gem.files += FileList['ext/**/*.c']
     gem.extensions = FileList['ext/**/extconf.rb']
-    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
   end
 rescue LoadError
-  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+  puts 'Jeweler (or a dependency) not available.' +
+  ' Install it with: sudo gem install jeweler'
 end
 require 'rspec/core'

data/VERSION CHANGED

@@ -1 +1 @@
-0.9.10
+1.0.0

data/lib/taxamatch_rb.rb CHANGED

@@ -1,6 +1,7 @@
 # encoding: UTF-8
 $:.unshift(File.dirname(__FILE__)) unless
-   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
+   $:.include?(File.dirname(__FILE__)) ||
+   $:.include?(File.expand_path(File.dirname(__FILE__)))
 # $:.unshift('taxamatch_rb')
 require 'damerau-levenshtein'
 require 'taxamatch_rb/atomizer'
@@ -8,8 +9,9 @@ require 'taxamatch_rb/normalizer'
 require 'taxamatch_rb/phonetizer'
 require 'taxamatch_rb/authmatch'
-raise "IMPORTANT: Parsley-store gem  requires ruby >= 1.9.1" if RUBY_VERSION < "1.9.1"
-$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
+if RUBY_VERSION < '1.9.1'
+  raise 'IMPORTANT: Parsley-store gem  requires ruby >= 1.9.1'
+end
 module Taxamatch
@@ -21,7 +23,8 @@ module Taxamatch
     end
-    #takes two scientific names and returns true if names match and false if they don't
+    # takes two scientific names and returns true
+    # if names match and false if they don't
     def taxamatch(str1, str2, return_boolean = true)
       preparsed_1 = @parser.parse(str1)
       preparsed_2 = @parser.parse(str2)
@@ -29,14 +32,19 @@ module Taxamatch
       return_boolean ? (!!match && match['match']) : match
     end
-    #takes two hashes of parsed scientific names, analyses them and returns back
-    #this function is useful when species strings are preparsed.
+    # takes two hashes of parsed scientific names, analyses them and returns
+    # the match result. This function is useful when species strings are preparsed.
     def taxamatch_preparsed(preparsed_1, preparsed_2)
       result = nil
-      result =  match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
-      result =  match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
+      if preparsed_1[:uninomial] && preparsed_2[:uninomial]
+        result =  match_uninomial(preparsed_1, preparsed_2)
+      end
+      if preparsed_1[:genus] && preparsed_2[:genus]
+        result =  match_multinomial(preparsed_1, preparsed_2)
+      end
       if result && result['match']
-        result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ? false : true
+        result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ?
+          false : true
       end
       return result
     end
@@ -48,65 +56,89 @@ module Taxamatch
     def match_multinomial(preparsed_1, preparsed_2)
       gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
       sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
-      total_length = preparsed_1[:genus][:string].size + preparsed_2[:genus][:string].size + preparsed_1[:species][:string].size + preparsed_2[:species][:string].size
+      total_length = preparsed_1[:genus][:string].size +
+        preparsed_2[:genus][:string].size +
+        preparsed_1[:species][:string].size +
+        preparsed_2[:species][:string].size
       if preparsed_1[:infraspecies] && preparsed_2[:infraspecies]
-        infrasp_match = match_species(preparsed_1[:infraspecies][0], preparsed_2[:infraspecies][0])
-        total_length += preparsed_1[:infraspecies][0][:string].size + preparsed_2[:infraspecies][0][:string].size
+        infrasp_match = match_species(preparsed_1[:infraspecies][0],
+                                      preparsed_2[:infraspecies][0])
+        total_length += preparsed_1[:infraspecies][0][:string].size +
+          preparsed_2[:infraspecies][0][:string].size
         match_hash = match_matches(gen_match, sp_match, infrasp_match)
-      elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) || (!preparsed_1[:infraspecies] && preparsed_2[:infraspecies])
-        match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
-        total_length += preparsed_1[:infraspecies] ? preparsed_1[:infraspecies][0][:string].size : preparsed_2[:infraspecies][0][:string].size
+      elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) ||
+        (!preparsed_1[:infraspecies] && preparsed_2[:infraspecies])
+        match_hash = { 'match' => false,
+          'edit_distance' => 5,
+          'phonetic_match' => false }
+        total_length += preparsed_1[:infraspecies] ?
+          preparsed_1[:infraspecies][0][:string].size :
+          preparsed_2[:infraspecies][0][:string].size
       else
         match_hash = match_matches(gen_match, sp_match)
       end
-      match_hash.merge({'score' => (1 - match_hash['edit_distance']/(total_length/2))})
+      match_hash.merge({ 'score' =>
+                       (1 - match_hash['edit_distance']/(total_length/2)) })
       match_hash
     end
     def match_genera(genus1, genus2, opts = {})
       genus1_length = genus1[:normalized].size
       genus2_length = genus2[:normalized].size
-      opts = {:with_phonetic_match => true}.merge(opts)
+      opts = { with_phonetic_match: true }.merge(opts)
       min_length = [genus1_length, genus2_length].min
-      unless opts[:with_phonetic_match]
-        genus1[:phonetized] = "A"
-        genus2[:phonetized] = "B"
+      unless opts[:with_phonetic_match]
+        genus1[:phonetized] = 'A'
+        genus2[:phonetized] = 'B'
       end
       match = false
-      ed = @dlm.distance(genus1[:normalized], genus2[:normalized], 1, 3) #TODO put block = 2
-      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
-      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
-      match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
-      {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
+      ed = @dlm.distance(genus1[:normalized],
+                         genus2[:normalized], 1, 3) #TODO put block = 2
+      return { 'edit_distance' => ed,
+        'phonetic_match' => false,
+        'match' => false } if ed/min_length.to_f > 0.2
+      return { 'edit_distance' => ed,
+        'phonetic_match' => true,
+        'match' => true } if genus1[:phonetized] == genus2[:phonetized]
+      match = true if ed <= 3 && (min_length > ed * 2) &&
+        (ed < 2 || genus1[0] == genus2[0])
+      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false }
     end
     def match_species(sp1, sp2, opts = {})
       sp1_length = sp1[:normalized].size
       sp2_length = sp2[:normalized].size
-      opts = {:with_phonetic_match => true}.merge(opts)
+      opts = { with_phonetic_match: true }.merge(opts)
       min_length = [sp1_length, sp2_length].min
       unless opts[:with_phonetic_match]
-        sp1[:phonetized] = "A"
-        sp2[:phonetized] = "B"
-      end
+        sp1[:phonetized] = 'A'
+        sp2[:phonetized] = 'B'
+      end
       sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
       sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
       match = false
-      ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4
-      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.3334
-      #puts 's: %s, %s, %s' % [sp1[:normalized], sp2[:normalized], ed]
-      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
-      match = true if ed <= 4 && (min_length >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
-      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
+      ed = @dlm.distance(sp1[:normalized],
+                         sp2[:normalized], 1, 4) #TODO put block 4
+      return { 'edit_distance' => ed,
+        'phonetic_match' => false,
+        'match' => false } if ed/min_length.to_f > 0.3334
+      return {'edit_distance' => ed,
+        'phonetic_match' => true,
+        'match' => true} if sp1[:phonetized] == sp2[:phonetized]
+      match = true if ed <= 4 &&
+        (min_length >= ed * 2) &&
+        (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) &&
+        (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
+      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false }
     end
     def match_authors(preparsed_1, preparsed_2)
-      p1 = { :normalized_authors => [], :years => [] }
-      p2 = { :normalized_authors => [], :years => [] }
+      p1 = { normalized_authors: [], years: [] }
+      p2 = { normalized_authors: [], years: [] }
       if preparsed_1[:infraspecies] || preparsed_2[:infraspecies]
-        p1 = preparsed_1[:infraspecies].last if preparsed_1[:infraspecies]
+        p1 = preparsed_1[:infraspecies].last if preparsed_1[:infraspecies]
         p2 = preparsed_2[:infraspecies].last if preparsed_2[:infraspecies]
       elsif preparsed_1[:species] || preparsed_2[:species]
         p1 = preparsed_1[:species] if preparsed_1[:species]
@@ -119,7 +151,7 @@ module Taxamatch
       au2 = p2[:normalized_authors]
       yr1 = p1[:years]
       yr2 = p2[:years]
-      return 0 if au1.empty? || au2.empty?
+      return 0 if au1.empty? || au2.empty?
       score = Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
       score == 0 ? -1 : 1
     end
@@ -132,12 +164,13 @@ module Taxamatch
         match['phonetic_match'] &&= infraspecies_match['phonetic_match']
       end
       match['edit_distance'] += genus_match['edit_distance']
-      match['match'] = false if match['edit_distance'] > (infraspecies_match ? 6 : 4)
+      if match['edit_distance'] > (infraspecies_match ? 6 : 4)
+        match['match'] = false
+      end
       match['match'] &&= genus_match['match']
       match['phonetic_match'] &&= genus_match['phonetic_match']
       match
     end
   end
 end

data/lib/taxamatch_rb/atomizer.rb CHANGED

@@ -9,12 +9,12 @@ module Taxamatch
       @parsed_raw = nil
       @res = {}
     end
     def parse(name)
       @parsed_raw = @parser.parse(name)[:scientificName]
       organize_results(@parsed_raw)
     end
     def parsed_raw
       return @parsed_raw
     end
@@ -29,11 +29,13 @@ module Taxamatch
       process_node(:genus, d[:genus])
       process_node(:species, d[:species], true)
       process_infraspecies(d[:infraspecies])
-      @res[:all_authors] = @res[:all_authors].uniq.map {|a| Taxamatch::Normalizer.normalize(a)}
+      @res[:all_authors] = @res[:all_authors].uniq.map do |a|
+        Taxamatch::Normalizer.normalize(a)
+      end
       @res[:all_years].uniq!
       @res.keys.size > 2 ? @res : nil
     end
     private
     def process_node(name, node, is_species = false)
@@ -41,14 +43,16 @@ module Taxamatch
       @res[name] = {}
       @res[name][:string] = node[:string]
       @res[name][:normalized] = Taxamatch::Normalizer.normalize(node[:string])
-      @res[name][:phonetized] = Taxamatch::Phonetizer.near_match(node[:string], is_species)
+      @res[name][:phonetized] =
+        Taxamatch::Phonetizer.near_match(node[:string], is_species)
       get_authors_years(node, @res[name])
     end
     def process_infraspecies(node)
       return unless node
       @res[:infraspecies] = []
       node.each do |infr|
+        next unless infr[:string]
         hsh = {}
         hsh[:string] = infr[:string]
         hsh[:normalized] = Taxamatch::Normalizer.normalize(infr[:string])
@@ -57,7 +61,7 @@ module Taxamatch
         @res[:infraspecies] << hsh
       end
     end
     def get_authors_years(node, res)
       res[:authors] = []
       res[:years] = []
@@ -71,16 +75,21 @@ module Taxamatch
           if node[au][:exAuthorTeam]
             res[:authors] += node[au][:exAuthorTeam][:author]
             if node[au][:exAuthorTeam][:year]
-              year = Taxamatch::Normalizer.normalize_year(node[au][:exAuthorTeam][:year])
+              year = node[au][:exAuthorTeam][:year]
+              year = Taxamatch::Normalizer.normalize_year(year)
               res[:years] << year if year
             end
           end
         end
       end
       res[:authors].uniq!
-      res[:normalized_authors] = res[:authors].map {|a| Taxamatch::Normalizer.normalize_author(a)}
+      res[:normalized_authors] = res[:authors].map do |a|
+        Taxamatch::Normalizer.normalize_author(a)
+      end
       res[:years].uniq!
-      @res[:all_authors] += res[:normalized_authors] if res[:normalized_authors].size > 0
+      if res[:normalized_authors].size > 0
+        @res[:all_authors] += res[:normalized_authors]
+      end
       @res[:all_years] += res[:years] if res[:years].size > 0
     end