taxamatch_rb 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
 - data/README.rdoc +61 -0
 - data/lib/taxamatch_rb.rb +117 -0
 - data/lib/taxamatch_rb/atomizer.rb +82 -0
 - data/lib/taxamatch_rb/authmatch.rb +89 -0
 - data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +139 -0
 - data/lib/taxamatch_rb/normalizer.rb +55 -0
 - data/lib/taxamatch_rb/phonetizer.rb +79 -0
 - data/spec/damerau_levenshtein_mod_test.txt +63 -0
 - data/spec/spec.opts +1 -0
 - data/spec/spec_helper.rb +28 -0
 - data/spec/taxamatch_rb_spec.rb +254 -0
 - data/spec/taxamatch_test.txt +45 -0
 - metadata +101 -0
 
    
        data/LICENSE
    ADDED
    
    | 
         @@ -0,0 +1,20 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            Copyright (c) 2009 Dmitry Mozzherin
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            Permission is hereby granted, free of charge, to any person obtaining
         
     | 
| 
      
 4 
     | 
    
         
            +
            a copy of this software and associated documentation files (the
         
     | 
| 
      
 5 
     | 
    
         
            +
            "Software"), to deal in the Software without restriction, including
         
     | 
| 
      
 6 
     | 
    
         
            +
            without limitation the rights to use, copy, modify, merge, publish,
         
     | 
| 
      
 7 
     | 
    
         
            +
            distribute, sublicense, and/or sell copies of the Software, and to
         
     | 
| 
      
 8 
     | 
    
         
            +
            permit persons to whom the Software is furnished to do so, subject to
         
     | 
| 
      
 9 
     | 
    
         
            +
            the following conditions:
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            The above copyright notice and this permission notice shall be
         
     | 
| 
      
 12 
     | 
    
         
            +
            included in all copies or substantial portions of the Software.
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         
     | 
| 
      
 15 
     | 
    
         
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         
     | 
| 
      
 16 
     | 
    
         
            +
            MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         
     | 
| 
      
 17 
     | 
    
         
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
         
     | 
| 
      
 18 
     | 
    
         
            +
            LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
         
     | 
| 
      
 19 
     | 
    
         
            +
            OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
         
     | 
| 
      
 20 
     | 
    
         
            +
            WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         
     | 
    
        data/README.rdoc
    ADDED
    
    | 
         @@ -0,0 +1,61 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            = taxamatch_rb
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            Taxamatch_Rb is a Ruby implementation of the Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            The purpose of the Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings, to find out whether they actually refer to the same scientific name.
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
                require 'taxamatch_rb'
         
     | 
| 
      
 8 
     | 
    
         
            +
                tm = Taxamatch::Base.new
         
     | 
| 
      
 9 
     | 
    
         
            +
                tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
         
     | 
| 
      
 10 
     | 
    
         
            +
                tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
         
     | 
| 
      
 11 
     | 
    
         
            +
                tm.taxamatch('Homo sapiens Mozzherin', 'Homo sapiens Linnaeus') #returns false 
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            Taxamatch_Rb is compatible with Ruby 1.8.7 and with Ruby 1.9.1 and higher.
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            == Installation
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
                sudo gem install dimus-taxamatch_rb --source http://gems.github.com
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
            or
         
     | 
| 
      
 20 
     | 
    
         
            +
                sudo gem sources -a http://gems.github.com #(you only have to do this once)
         
     | 
| 
      
 21 
     | 
    
         
            +
                sudo gem install dimus-taxamatch_rb
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            == Usage
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
                require 'rubygems' #not needed for ruby > 1.9.1
         
     | 
| 
      
 26 
     | 
    
         
            +
                require 'taxamatch_rb'
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
                tm = Taxamatch::Base.new
         
     | 
| 
      
 29 
     | 
    
         
            +
                
         
     | 
| 
      
 30 
     | 
    
         
            +
            * compare full scientific names
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
                tm.taxamatch('Hommo sapiens L.', 'Homo sapiens Linnaeus')
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
            * preparse names for the matching (necessary for large databases of scientific names)
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
                p = Taxamatch::Atomizer.new
         
     | 
| 
      
 37 
     | 
    
         
            +
                parsed_name1 = p.parse('Monacanthus fronticinctus Günther 1867 sec. Eschmeyer 2004')
         
     | 
| 
      
 38 
     | 
    
         
            +
                parsed_name2 = p.parse('Monacanthus fronticinctus (Gunther, 1867)')
         
     | 
| 
      
 39 
     | 
    
         
            +
             
     | 
| 
      
 40 
     | 
    
         
            +
            * compare preparsed names
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
                tm.taxamatch_preparsed(parsed_name1, parsed_name2)
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
            * compare genera
         
     | 
| 
      
 45 
     | 
    
         
            +
                
         
     | 
| 
      
 46 
     | 
    
         
            +
                tm.match_genera(parsed_name1[:genus], parsed_name2[:genus])
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
            * compare species
         
     | 
| 
      
 49 
     | 
    
         
            +
                
         
     | 
| 
      
 50 
     | 
    
         
            +
                tm.match_species(parsed_name1[:species], parsed_name2[:species])
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
            * compare authors and years
         
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
                Taxamatch::Authmatch.authmatch(['Linnaeus'], ['L','Muller'], [1786], [1787])
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
            You can find more examples in the spec section of the code.
         
     | 
| 
      
 58 
     | 
    
         
            +
             
     | 
| 
      
 59 
     | 
    
         
            +
            == Copyright
         
     | 
| 
      
 60 
     | 
    
         
            +
             
     | 
| 
      
 61 
     | 
    
         
            +
            Copyright (c) 2009 Dmitry Mozzherin. See LICENSE for details.
         
     | 
    
        data/lib/taxamatch_rb.rb
    ADDED
    
    | 
         @@ -0,0 +1,117 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            $:.unshift(File.dirname(__FILE__)) unless
         
     | 
| 
      
 3 
     | 
    
         
            +
               $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
         
     | 
| 
      
 4 
     | 
    
         
            +
            # $:.unshift('taxamatch_rb')
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'taxamatch_rb/damerau_levenshtein_mod'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require 'taxamatch_rb/atomizer'
         
     | 
| 
      
 7 
     | 
    
         
            +
            require 'taxamatch_rb/normalizer'
         
     | 
| 
      
 8 
     | 
    
         
            +
            require 'taxamatch_rb/phonetizer'
         
     | 
| 
      
 9 
     | 
    
         
            +
            require 'taxamatch_rb/authmatch'
         
     | 
| 
      
 10 
     | 
    
         
            +
            # ruby-debug is only a development aid; the library must still load on
            # systems where that gem is not installed.
            begin
              require 'ruby-debug'
            rescue LoadError
              nil
            end
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            # $KCODE only has an effect on Ruby older than 1.9. Compare major and minor
            # version together; checking the minor number alone also fires on 2.x/3.x.
            $KCODE = 'u' if (RUBY_VERSION.split('.').map { |part| part.to_i } <=> [1, 9]) < 0
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
            module Taxamatch

              # Fuzzy comparison of scientific names.
              #
              # Names are parsed with Taxamatch::Atomizer; genus, species and infraspecies
              # epithets are compared by edit distance (Taxamatch::DamerauLevenshteinMod)
              # and by their phonetized forms; authors and years are compared with
              # Taxamatch::Authmatch.
              #
              # Match results are hashes with the string keys 'match', 'edit_distance'
              # and 'phonetic_match'. Results for multinomial names additionally carry
              # 'score'.
              class Base

                def initialize
                  @parser = Taxamatch::Atomizer.new
                  @dlm = Taxamatch::DamerauLevenshteinMod.new
                end

                # Takes two scientific name strings and compares them.
                #
                # str1, str2     - scientific name strings
                # return_boolean - when true (default) the result is strictly true or
                #                  false; when false the result hash of
                #                  taxamatch_preparsed is returned, or nil when the names
                #                  could not be compared.
                def taxamatch(str1, str2, return_boolean = true)
                  preparsed_1 = @parser.parse(str1)
                  preparsed_2 = @parser.parse(str2)
                  match = begin
                    taxamatch_preparsed(preparsed_1, preparsed_2)
                  rescue StandardError
                    # Best effort: names that cannot be compared count as "no match".
                    nil
                  end
                  return match unless return_boolean
                  match ? !!match['match'] : false
                end

                # Takes two hashes of parsed scientific names (as produced by
                # Taxamatch::Atomizer#parse), analyses them and returns a result hash.
                # Useful when species strings are preparsed.
                #
                # Returns nil when either name is nil or when the names are not of the
                # same kind (both uninomial, or both with a genus).
                def taxamatch_preparsed(preparsed_1, preparsed_2)
                  return nil unless preparsed_1 && preparsed_2
                  result = nil
                  if preparsed_1[:uninomial] && preparsed_2[:uninomial]
                    result = match_uninomial(preparsed_1, preparsed_2)
                  end
                  if preparsed_1[:genus] && preparsed_2[:genus]
                    result = match_multinomial(preparsed_1, preparsed_2)
                  end
                  if result && result['match']
                    # An author comparison result of 0 vetoes an otherwise good match.
                    result['match'] = match_authors(preparsed_1, preparsed_2) != 0
                  end
                  result
                end

                # Compares two uninomial names with the genus rules.
                def match_uninomial(preparsed_1, preparsed_2)
                  match_genera(preparsed_1[:uninomial], preparsed_2[:uninomial])
                end

                # Compares genus, species and (when present) the first infraspecies
                # epithet of two parsed names.
                #
                # The returned hash carries 'score': 1 minus the edit distance divided by
                # half of the summed epithet lengths.
                def match_multinomial(preparsed_1, preparsed_2)
                  gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
                  sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
                  total_length = [preparsed_1, preparsed_2].inject(0) do |sum, name|
                    sum + name[:genus][:epitheton].size + name[:species][:epitheton].size
                  end
                  infra_1 = preparsed_1[:infraspecies]
                  infra_2 = preparsed_2[:infraspecies]
                  if infra_1 && infra_2
                    infrasp_match = match_species(infra_1[0], infra_2[0])
                    total_length += infra_1[0][:epitheton].size + infra_2[0][:epitheton].size
                    match_hash = match_matches(gen_match, sp_match, infrasp_match)
                  elsif infra_1 || infra_2
                    # Only one of the names has an infraspecies: never a match.
                    match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
                    total_length += (infra_1 || infra_2)[0][:epitheton].size
                  else
                    match_hash = match_matches(gen_match, sp_match)
                  end
                  # Float arithmetic on purpose: integer division would truncate the ratio.
                  score = 1 - match_hash['edit_distance'].to_f / (total_length / 2.0)
                  # Hash#merge returns a new hash, so its result has to be the return value.
                  match_hash.merge('score' => score)
                end

                # Compares two genus hashes (keys :normalized and :phonetized).
                #
                # No match when the edit distance exceeds 20% of the shorter name. A
                # phonetic match when the phonetized forms are equal. Otherwise a match
                # needs an edit distance of at most 3 that is less than half of the
                # shorter length and, for distances of 2 or more, the same first
                # character.
                def match_genera(genus1, genus2)
                  name1 = genus1[:normalized]
                  name2 = genus2[:normalized]
                  min_length = [name1.size, name2.size].min
                  ed = @dlm.distance(name1, name2, 1, 3) #TODO put block = 2
                  if ed.to_f / min_length > 0.2
                    return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false}
                  end
                  if genus1[:phonetized] == genus2[:phonetized]
                    return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true}
                  end

                  # Compare the first characters of the normalized strings, not of the
                  # hashes themselves.
                  match = ed <= 3 && min_length > ed * 2 && (ed < 2 || name1[0] == name2[0])
                  {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
                end

                # Compares two species (or infraspecies) hashes (keys :normalized and
                # :phonetized). The hashes passed in are not modified.
                #
                # No match when the edit distance exceeds a third of the shorter name. A
                # phonetic match when the phonetized forms are equal after
                # Taxamatch::Phonetizer.normalize_ending. Otherwise a match needs an edit
                # distance of at most 4 that is at most half of the shorter length, the
                # same first character for distances of 2 or more, and the same first
                # three characters for a distance of 4.
                def match_species(sp1, sp2)
                  name1 = sp1[:normalized]
                  name2 = sp2[:normalized]
                  min_length = [name1.size, name2.size].min
                  phonetized1 = Taxamatch::Phonetizer.normalize_ending(sp1[:phonetized])
                  phonetized2 = Taxamatch::Phonetizer.normalize_ending(sp2[:phonetized])
                  ed = @dlm.distance(name1, name2, 1, 4) #TODO put block 4
                  if ed.to_f / min_length > 0.3334
                    return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false}
                  end
                  if phonetized1 == phonetized2
                    return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true}
                  end

                  match = ed <= 4 && min_length >= ed * 2 &&
                    (ed < 2 || name1[0] == name2[0]) &&
                    (ed < 4 || name1[0...3] == name2[0...3])
                  {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
                end

                # Compares the collected authors and years of two parsed names and
                # returns whatever Taxamatch::Authmatch.authmatch returns.
                def match_authors(preparsed_1, preparsed_2)
                  au1 = preparsed_1[:all_authors]
                  au2 = preparsed_2[:all_authors]
                  yr1 = preparsed_1[:all_years]
                  yr2 = preparsed_2[:all_years]
                  Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
                end

                # Combines genus, species and optional infraspecies match hashes into one
                # result. Edit distances are summed; 'match' and 'phonetic_match' hold
                # only when they hold for every part. A summed edit distance above 4
                # (above 6 with an infraspecies) is never a match.
                #
                # The hashes passed in are left untouched.
                def match_matches(genus_match, species_match, infraspecies_match = nil)
                  match = species_match.dup
                  if infraspecies_match
                    match['edit_distance'] += infraspecies_match['edit_distance']
                    match['match'] &&= infraspecies_match['match']
                    match['phonetic_match'] &&= infraspecies_match['phonetic_match']
                  end
                  match['edit_distance'] += genus_match['edit_distance']
                  match['match'] = false if match['edit_distance'] > (infraspecies_match ? 6 : 4)
                  match['match'] &&= genus_match['match']
                  match['phonetic_match'] &&= genus_match['phonetic_match']
                  match
                end

              end

            end
         
     | 
| 
         @@ -0,0 +1,82 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'biodiversity'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            module Taxamatch
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
              class Atomizer
         
     | 
| 
      
 7 
     | 
    
         
            +
                # Starts with an empty result, no raw parse and a fresh
                # ScientificNameParser used by #parse.
                def initialize
                  @res = {}
                  @parsed_raw = nil
                  @parser = ScientificNameParser.new
                end
         
     | 
| 
      
 12 
     | 
    
         
            +
              
         
     | 
| 
      
 13 
     | 
    
         
            +
                # Parses a scientific name string.
                #
                # Resets the result to empty :all_authors / :all_years lists, keeps the
                # :scientificName part of the parser output (readable through
                # #parsed_raw) and returns whatever organize_results builds from it.
                def parse(name)
                  @res = {:all_authors => [], :all_years => []}
                  parser_output = @parser.parse(name)
                  @parsed_raw = parser_output[:scientificName]
                  organize_results
                end
         
     | 
| 
      
 18 
     | 
    
         
            +
              
         
     | 
| 
      
 19 
     | 
    
         
            +
                # The :scientificName part of the parser output stored by the last
                # #parse call (nil before the first call).
                def parsed_raw
                  @parsed_raw
                end
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
              protected
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
                # Builds the result hash from the raw parse.
                #
                # Returns nil when the raw parse is not flagged as :parsed, or when no
                # name part was added to the result (only :all_authors and :all_years
                # are present); otherwise returns the result hash.
                def organize_results
                  return nil unless @parsed_raw[:parsed]
                  details = @parsed_raw[:details][0]
                  # The flag marks the node as a species for process_node.
                  [[:uninomial, false], [:genus, false], [:species, true]].each do |rank, is_species|
                    process_node(rank, details[rank], is_species)
                  end
                  process_infraspecies(details[:infraspecies])
                  unique_authors = @res[:all_authors].uniq
                  @res[:all_authors] = unique_authors.map { |author| Taxamatch::Normalizer.normalize(author) }
                  @res[:all_years].uniq!
                  return nil unless @res.keys.size > 2
                  @res
                end
         
     | 
| 
      
 37 
     | 
    
         
            +
              
         
     | 
| 
      
 38 
     | 
    
         
            +
                # Stores one name part (e.g. :genus) in the result with its epithet,
                # normalized form and phonetized form, then lets get_authors_years fill
                # in author and year data. Does nothing when the node is missing.
                def process_node(name, node, is_species = false)
                  return unless node
                  epithet = node[:epitheton]
                  @res[name] = {
                    :epitheton => epithet,
                    :normalized => Taxamatch::Normalizer.normalize(epithet),
                    :phonetized => Taxamatch::Phonetizer.near_match(epithet, is_species)
                  }
                  get_authors_years(node, @res[name])
                end
         
     | 
| 
      
 46 
     | 
    
         
            +
              
         
     | 
| 
      
 47 
     | 
    
         
            +
                # Stores every infraspecies entry in the result under :infraspecies, each
                # with its epithet, normalized form, phonetized form and the author/year
                # data added by get_authors_years. Does nothing when the node is missing.
                def process_infraspecies(node)
                  return unless node
                  collected = (@res[:infraspecies] = [])
                  node.each do |infr|
                    epithet = infr[:epitheton]
                    entry = {
                      :epitheton => epithet,
                      :normalized => Taxamatch::Normalizer.normalize(epithet),
                      :phonetized => Taxamatch::Phonetizer.near_match(epithet, true)
                    }
                    get_authors_years(infr, entry)
                    collected << entry
                  end
                end
         
     | 
| 
      
 59 
     | 
    
         
            +
              
         
     | 
| 
      
 60 
     | 
    
         
            +
                # Collects author names and years from +node+ into +res+ and appends
                # them to the running totals kept in @res.
                #
                # Looks at node[:basionymAuthorTeam] and node[:combinationAuthorTeam]
                # (in that order); for each team present, the team itself is read
                # first and then its optional :exAuthorTeam. After the call +res+ has:
                #   :authors            - unique author names, in order of appearance
                #   :years              - unique years, in order of appearance
                #   :normalized_authors - :authors passed through Normalizer.normalize_author
                # Non-empty results are concatenated onto @res[:all_authors] and
                # @res[:all_years] (those are not de-duplicated here).
                def get_authors_years(node, res)
                  res[:authors] = []
                  res[:years] = []
                  [:basionymAuthorTeam, :combinationAuthorTeam].each do |team_key|
                    team = node[team_key]
                    next unless team
                    # the team proper first, then the "ex" authors attached to it
                    [team, team[:exAuthorTeam]].each do |group|
                      next unless group
                      res[:authors] += group[:author]
                      res[:years] << group[:year] if group[:year]
                    end
                  end
                  res[:authors].uniq!
                  res[:normalized_authors] = res[:authors].map do |author|
                    Taxamatch::Normalizer.normalize_author(author)
                  end
                  res[:years].uniq!
                  @res[:all_authors] += res[:normalized_authors] unless res[:normalized_authors].empty?
                  @res[:all_years] += res[:years] unless res[:years].empty?
                end
         
     | 
| 
      
 79 
     | 
    
         
            +
             
     | 
| 
      
 80 
     | 
    
         
            +
              end
         
     | 
| 
      
 81 
     | 
    
         
            +
            end
         
     | 
| 
      
 82 
     | 
    
         
            +
             
     | 
| 
         @@ -0,0 +1,89 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Taxamatch
              # Scores how well two authorship strings (author lists plus years)
              # agree. All methods are class-level; scores are integers 0..100 and
              # anything not above 50 is reported as 0.
              class Authmatch

                # Entry point: compares two author lists and two year lists and
                # returns the match score.
                def self.authmatch(authors1, authors2, years1, years2)
                  leftover1, leftover2 = remove_duplicate_authors(authors1, authors2)
                  get_score(authors1, leftover1, authors2, leftover2, compare_years(years1, years2))
                end

                # Turns the before/after author counts and the year difference into
                # a score.
                #
                # unique_authors1/2 are the authors left unmatched after
                # remove_duplicate_authors; year_diff is the value produced by
                # compare_years (nil when the years cannot be compared).
                def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
                  total = authors1.size + authors2.size
                  unmatched = unique_authors1.size + unique_authors2.size

                  score =
                    if unmatched == 0
                      # every author on both sides found a partner
                      case year_diff
                      when nil then 94
                      when 0 then 100
                      when 1 then 54
                      else 0
                      end
                    elsif unique_authors1.size == 0 || unique_authors2.size == 0
                      # one side is fully covered by the other
                      case year_diff
                      when nil then 90
                      when 0 then 91
                      when 1 then 51
                      else 0
                      end
                    else
                      # partial overlap: share of authors that were matched,
                      # accepted only when years are identical or not comparable
                      share = ((1 - unmatched.to_f / total.to_f) * 100).round
                      (year_diff.nil? || year_diff == 0) ? share : 0
                    end

                  score > 50 ? score : 0
                end

                # Returns [authors1', authors2'] - copies of the inputs with every
                # author that has a counterpart on the other side removed.
                #
                # A pair counts as matching when the names are equal, when one is a
                # prefix of the other, or when fuzzy_match_authors accepts them.
                # For a prefix match where the prefix is shorter than 3 characters
                # only the abbreviated side is removed.
                def self.remove_duplicate_authors(authors1, authors2)
                  rest1 = authors1.dup
                  rest2 = authors2.dup
                  authors1.each do |au1|
                    authors2.each do |au2|
                      # equal names satisfy both prefix tests at once
                      au1_is_prefix = (au1 == au2[0...au1.size])
                      au2_is_prefix = (au1[0...au2.size] == au2)

                      solid_prefix = (au1_is_prefix && au1.size >= 3) || (au2_is_prefix && au2.size >= 3)
                      #TODO: masking a bug in damerau levenshtsin mod which appears comparing 1letter to a longer string
                      fuzzy = !au1_is_prefix && !au2_is_prefix &&
                              au1.size > 1 && au2.size > 1 && fuzzy_match_authors(au1, au2)

                      if solid_prefix || fuzzy
                        rest1.delete au1
                        rest2.delete au2
                      else
                        rest1.delete au1 if au1_is_prefix
                        rest2.delete au2 if au2_is_prefix
                      end
                    end
                  end
                  [rest1, rest2]
                end

                # True when the two names are within edit distance 3, the shorter
                # name is more than twice as long as that distance, and - for
                # distances of 2 or more - both names start with the same character.
                def self.fuzzy_match_authors(author1, author2)
                  shortest = [author1.size, author2.size].min
                  #get around a bug in C code, but it really has to be fixed
                  ed = Taxamatch::DamerauLevenshteinMod.new.distance(author1, author2, 2, 3)
                  ed <= 3 && shortest > ed * 2 && (ed < 2 || author1[0] == author2[0])
                end

                # Returns 0 when both year lists are empty, the absolute difference
                # when each list holds exactly one year, and nil otherwise.
                def self.compare_years(years1, years2)
                  if years1 == [] && years2 == []
                    0
                  elsif years1.size == 1 && years2.size == 1
                    (years1.first.to_i - years2.first.to_i).abs
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,139 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'rubygems'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'inline'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'time'
         
     | 
| 
      
 5 
     | 
    
         
            +
            module Taxamatch

              # Edit distance between two sequences of Unicode code points, counting
              # deletions, insertions, substitutions and transpositions of adjacent
              # blocks of up to +block_size+ symbols. The matrix computation is
              # implemented in C and compiled through the +inline+ builder below.
              class DamerauLevenshteinMod
                # Returns the edit distance between two UTF-8 strings.
                #
                # str1, str2   - strings; they are unpacked with "U*" into code points.
                # block_size   - largest block of symbols that may be transposed.
                # max_distance - once every cell of a matrix row exceeds this value the
                #                calculation stops and the smallest value of that row
                #                is returned instead of the exact distance.
                def distance(str1, str2, block_size=2, max_distance=10)
                  distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
                end

                # Defines distance_utf(s, t, block_size, max_distance), which takes two
                # arrays of integer code points. Compared with the earlier revision:
                #  * cells are now computed correctly when one sequence has a single
                #    symbol (the block size collapsed to 0 and a stale minimum was
                #    written into the matrix);
                #  * the transposition probes no longer read outside of s, t or d;
                #  * a failed malloc raises NoMemoryError instead of crashing.
                inline do |builder|
                  builder.c "
                  static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
                    int i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block;
                    int stop_execution = 0;
                    int min = 0;
                    int current_distance = 0;

                    VALUE *sv = RARRAY_PTR(_s);
                    VALUE *tv = RARRAY_PTR(_t);

                    sl = RARRAY_LEN(_s);
                    tl = RARRAY_LEN(_t);

                    if (sl == 0) return INT2NUM(tl);
                    if (tl == 0) return INT2NUM(sl);
                    //shortcut for two different single symbols
                    if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);

                    int s[sl];
                    int t[tl];

                    for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
                    for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);

                    //from here on sl and tl are matrix dimensions: len(s)+1 and len(t)+1
                    sl++;
                    tl++;

                    //one-dimensional representation of the 2 dimensional array len(s)+1 * len(t)+1
                    //cell for prefix lengths (i, j) lives at d[j*sl + i]
                    d = malloc((sizeof(int))*(sl)*(tl));
                    if (d == NULL) rb_memerror();
                    //populate the first column (i = 0) including the corner cell
                    for(i = 0; i < tl; i++){
                      d[i*sl] = i;
                    }

                    //fill up array with scores
                    for(i = 1; i<sl; i++){
                      d[i] = i;
                      if (stop_execution == 1) break;
                      current_distance = 10000;
                      for(j = 1; j<tl; j++){

                        cost = 1;
                        if(s[i-1] == t[j-1]) cost = 0;

                        half_sl = (sl - 1)/2;
                        half_tl = (tl - 1)/2;

                        //a block can not be longer than half of the shorter sequence
                        block = block_size < half_sl ? block_size : half_sl;
                        block = block < half_tl ? block : half_tl;

                        if (block < 1) {
                          //no transposition is possible (one sequence has a single symbol
                          //or block_size is below 1): plain deletion/insertion/substitution.
                          //Without this branch min kept the value of the previous cell.
                          del = d[j*sl + i - 1] + 1;
                          ins = d[(j-1)*sl + i] + 1;
                          subs = d[(j-1)*sl + i - 1] + cost;
                          min = del;
                          if (ins < min) min = ins;
                          if (subs < min) min = subs;
                        }

                        while (block >= 1){
                          int swap1 = 1;
                          int swap2 = 1;
                          i1 = i - (block * 2);
                          j1 = j - (block * 2);
                          //the probe reads s[i1 .. i1+block-1] and t[i1+block .. i-1];
                          //skip it when any of these indices falls outside of s or t
                          if (i1 < 0 || i > tl - 1) {
                            swap1 = 0;
                          } else {
                            for (k = i1; k < i1 + block; k++) {
                              if (s[k] != t[k + block]){
                                swap1 = 0;
                                break;
                              }
                            }
                          }
                          //the probe reads t[j1 .. j1+block-1] and s[j1+block .. j-1]
                          if (j1 < 0 || j > sl - 1) {
                            swap2 = 0;
                          } else {
                            for (k = j1; k < j1 + block; k++) {
                              if (t[k] != s[k + block]){
                                swap2 = 0;
                                break;
                              }
                            }
                          }

                          del = d[j*sl + i - 1] + 1;
                          ins = d[(j-1)*sl + i] + 1;
                          min = del;
                          if (ins < min) min = ins;
                          if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
                            //transposition of two adjacent blocks; ends the block loop
                            transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
                            if (transp < min) min = transp;
                            block = 0;
                          } else if (block == 1) {
                            //no transposition found for any block size: try substitution
                            subs = d[(j-1)*sl + i - 1] + cost;
                            if (subs < min) min = subs;
                          }
                          block--;
                        }
                        d[j*sl+i]=min;
                        //track the smallest value seen for this i
                        if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
                      }
                      if (current_distance > max_distance) {
                        stop_execution = 1;
                      }
                    }
                    distance=d[sl * tl - 1];
                    if (stop_execution == 1) distance = current_distance;

                    free(d);
                    return INT2NUM(distance);
                  }
                 "
                end
              end
            end
         
     | 
| 
      
 114 
     | 
    
         
            +
             
     | 
| 
      
 115 
     | 
    
         
            +
            # Ad-hoc benchmark, executed only when this file is run directly:
            # times 100000 calls through the string interface (which unpacks on every
            # call) against 100000 calls with pre-unpacked code points, then prints
            # one sample distance.
            if __FILE__ == $0
              dlm = Taxamatch::DamerauLevenshteinMod.new
              name1 = 'Cedarinia scabra Sjöstedt 1921'
              name2 = 'Cedarinia scabra Söjstedt 1921'
              s = name1.unpack('U*')
              t = name2.unpack('U*')

              started_at = Time.now
              100000.times { dlm.distance(name1, name2, 1, 10) }
              elapsed = Time.now - started_at
              puts "with unpack time: " + elapsed.to_s + ' sec'

              started_at = Time.now
              100000.times { dlm.distance_utf(s, t, 1, 10) }
              elapsed = Time.now - started_at
              puts 'utf time: ' + elapsed.to_s + ' sec'

              puts dlm.distance('sub', 'usb', 1, 10)
            end
         
     | 
| 
         @@ -0,0 +1,55 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Taxamatch
         
     | 
| 
      
 4 
     | 
    
         
            +
              
         
     | 
| 
      
 5 
     | 
    
         
            +
              module Normalizer
         
     | 
| 
      
 6 
     | 
    
         
            +
                # Returns +string+ passed through utf8_to_ascii and then upper-cased.
                def self.normalize(string)
                  ascii = utf8_to_ascii(string)
                  ascii.upcase
                end
         
     | 
| 
      
 9 
     | 
    
         
            +
              
         
     | 
| 
      
 10 
     | 
    
         
            +
                # Normalizes +word+ and drops every character that is not an
                # upper-case ASCII letter, a digit or a hyphen.
                def self.normalize_word(word)
                  upper = normalize(word)
                  cleaned = upper.gsub(/[^A-Z0-9\-]/, '')
                  cleaned.strip
                end
         
     | 
| 
      
 13 
     | 
    
         
            +
                
         
     | 
| 
      
 14 
     | 
    
         
            +
                # Normalizes an author string: everything except upper-case ASCII
                # letters becomes a space, runs of whitespace collapse to a single
                # space, and leading/trailing whitespace is removed.
                def self.normalize_author(string)
                  letters_only = normalize(string).gsub(/[^A-Z]/, ' ')
                  collapsed = letters_only.gsub(/[\s]{2,}/, ' ')
                  collapsed.strip
                end
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
              protected
         
     | 
| 
      
 19 
     | 
    
         
            +
                # Transliterates accented letters and ligatures to plain ASCII
                # (for example "Æ" becomes "AE" and "œ" becomes "oe"). The
                # substitutions are applied one after another in table order;
                # characters that appear in no pattern are left as they are.
                # Returns a new string.
                def self.utf8_to_ascii(string)
                  substitutions = [
                    [/[ÀÂÅÃÄÁẤẠ]/, "A"],
                    [/[ÉÈÊË]/, "E"],
                    [/[ÍÌÎÏ]/, "I"],
                    [/[ÓÒÔØÕÖỚỔ]/, "O"],
                    [/[ÚÙÛÜ]/, "U"],
                    [/[Ý]/, "Y"],
                    [/Æ/, "AE"],
                    [/[ČÇ]/, "C"],
                    [/[ŠŞ]/, "S"],
                    [/[Đ]/, "D"],
                    [/Ž/, "Z"],
                    [/Ñ/, "N"],
                    [/Œ/, "OE"],
                    [/ß/, "B"],
                    [/Ķ/, "K"],
                    [/[áàâåãäăãắảạậầằ]/, "a"],
                    [/[éèêëĕěếệểễềẻ]/, "e"],
                    [/[íìîïǐĭīĩỉï]/, "i"],
                    [/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o"],
                    [/[úùûüůưừựủứụ]/, "u"],
                    [/[žź]/, "z"],
                    [/[ýÿỹ]/, "y"],
                    [/[đ]/, "d"],
                    [/æ/, "ae"],
                    [/[čćç]/, "c"],
                    [/[ñńň]/, "n"],
                    [/œ/, "oe"],
                    [/[śšş]/, "s"],
                    [/ř/, "r"],
                    [/ğ/, "g"],
                    [/Ř/, "R"]
                  ]
                  substitutions.inject(string) do |text, (pattern, ascii)|
                    text.gsub(pattern, ascii)
                  end
                end
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
              end
         
     | 
| 
      
 54 
     | 
    
         
            +
             
     | 
| 
      
 55 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,79 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            module Taxamatch

              # Builds a coarse phonetic key for a word so that spelling variants
              # that sound alike reduce to the same string.
              module Phonetizer

                # Word-initial letter groups and the text each one is rewritten to.
                # Only the first entry that matches the start of the word is applied.
                LEADING_REPLACEMENTS = [
                  ['AE', 'E'], ['CN', 'N'], ['CT', 'T'], ['CZ', 'C'], ['DJ', 'J'],
                  ['EA', 'E'], ['EU', 'U'], ['GN', 'N'], ['KN', 'N'], ['MC', 'MAC'],
                  ['MN', 'N'], ['OE', 'E'], ['QU', 'Q'], ['PS', 'S'], ['PT', 'T'],
                  ['TS', 'S'], ['WR', 'R'], ['X', 'Z']
                ].freeze

                # Substitutions applied, in this order, to everything after the
                # first character.
                INNER_REPLACEMENTS = [
                  ['AE', 'I'], ['IA', 'A'], ['OE', 'I'], ['OI', 'A'], ['SC', 'S'], ['H', '']
                ].freeze

                # Public entry point; delegates to near_match with the same arguments.
                def self.phonetize(a_word, normalize_ending = false)
                  self.near_match(a_word, normalize_ending)
                end

                # Computes the phonetic key of a word.
                #
                # a_word           - the word to phonetize; nil or any object without
                #                    #strip is treated as an empty word
                # normalize_ending - when true and the key is longer than 4
                #                    characters, the ending is unified as well
                #                    (see normalize_ending)
                #
                # Returns the key as a new String, or '' for a blank word.
                def self.near_match(a_word, normalize_ending = false)
                  # An explicit capability check instead of a blanket inline rescue.
                  a_word = a_word.respond_to?(:strip) ? a_word.strip : ''
                  return '' if a_word == ''
                  a_word = Taxamatch::Normalizer.normalize a_word

                  # start_with? anchors at the real beginning of the string; a "^"
                  # regexp would also fire after an embedded newline.
                  prefix, replacement = LEADING_REPLACEMENTS.find { |lead, _| a_word.start_with?(lead) }
                  a_word = replacement + a_word[prefix.size..-1] if prefix

                  # The first character is kept as is; only the rest is folded.
                  letters = a_word.split('')
                  first_char = letters[0]
                  rest_chars = INNER_REPLACEMENTS.inject(letters[1..-1].join('')) do |text, (from, to)|
                    text.gsub(from, to)
                  end
                  rest_chars = rest_chars.tr('EOUYKZ', 'IAIICS')
                  # squeeze collapses runs of the same character into one.
                  a_word = (first_char + rest_chars).squeeze

                  a_word = self.normalize_ending(a_word) if normalize_ending && a_word.size > 4
                  a_word
                end

                # Unifies variant endings of an already phonetized word: a final
                # -IS (which after near_match's vowel folding also stands for -us,
                # -ys, -es), -IM (was -um) or -AS (-os) is replaced by -A.
                #
                # The argument is never modified, so frozen strings are accepted,
                # and only the very end of the string is considered.
                #
                # Returns a new String.
                def self.normalize_ending(a_word)
                  a_word.sub(/(?:IS|IM|AS)\z/, 'A')
                end

              end

            end
         
     | 
| 
         @@ -0,0 +1,63 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ######################
         
     | 
| 
      
 2 
     | 
    
         
            +
            # Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
         
     | 
| 
      
 3 
     | 
    
         
            +
            #
         
     | 
| 
      
 4 
     | 
    
         
            +
            # * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
         
     | 
| 
      
 5 
     | 
    
         
            +
            # * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
         
     | 
| 
      
 6 
     | 
    
         
            +
            #
         
     | 
| 
      
 7 
     | 
    
         
            +
            # Fields:
         
     | 
| 
      
 8 
     | 
    
         
            +
            # String1|String2|maximum distance|transposition block size|expected distance
         
     | 
| 
      
 9 
     | 
    
         
            +
            #  - String1, String2
         
     | 
| 
      
 10 
     | 
    
         
            +
            #     compared strings
         
     | 
| 
      
 11 
     | 
    
         
            +
            #  - maximum distance
         
     | 
| 
      
 12 
     | 
    
         
            +
            #     stops execution of the algorithm when calculated distance exceeds the maximum distance number
         
     | 
| 
      
 13 
     | 
    
         
            +
            #  - transposition block size
         
     | 
| 
      
 14 
     | 
    
         
            +
            #     determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
         
     | 
| 
      
 15 
     | 
    
         
            +
            #  - expected distance
         
     | 
| 
      
 16 
     | 
    
         
            +
            #     resulting distance that has to be achieved by the algorithm
         
     | 
| 
      
 17 
     | 
    
         
            +
            #  Note: algorithm does not try to normalize or interpret strings in any way.
         
     | 
| 
      
 18 
     | 
    
         
            +
            ######################
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
            #it should recognize the exact match
         
     | 
| 
      
 21 
     | 
    
         
            +
            Pomatomus|Pomatomus|10|1|0
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            #it should not try to normalize incoming strings
         
     | 
| 
      
 24 
     | 
    
         
            +
             Pomatomus|Pomatomus|10|1|1
         
     | 
| 
      
 25 
     | 
    
         
            +
            Pomatomus|pomatomus|10|1|1
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
            #it should calculate special cases
         
     | 
| 
      
 28 
     | 
    
         
            +
            Pomatomus||10|1|9
         
     | 
| 
      
 29 
     | 
    
         
            +
            |Pomatomus|10|1|9
         
     | 
| 
      
 30 
     | 
    
         
            +
            P|p|10|1|1
         
     | 
| 
      
 31 
     | 
    
         
            +
            #TODO: one letter vs longer string generates a big negative number
         
     | 
| 
      
 32 
     | 
    
         
            +
            #L|Linneaus|10|1|7
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
            #it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
         
     | 
| 
      
 36 
     | 
    
         
            +
            Pomatomus|Pomatomux|10|1|1
         
     | 
| 
      
 37 
     | 
    
         
            +
            Pmatomus|Pomatomus|10|1|1
         
     | 
| 
      
 38 
     | 
    
         
            +
            Pomatomus|Pmatomus|10|1|1
         
     | 
| 
      
 39 
     | 
    
         
            +
            Rpmatomus|Pomatomus|10|1|2
         
     | 
| 
      
 40 
     | 
    
         
            +
            Pommtomus|Pomatomus|10|1|1
         
     | 
| 
      
 41 
     | 
    
         
            +
            Potamomus|Pomatomus|10|1|2
         
     | 
| 
      
 42 
     | 
    
         
            +
            Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
         
     | 
| 
      
 43 
     | 
    
         
            +
            Pomatomus|oPmatomus|10|1|1
         
     | 
| 
      
 44 
     | 
    
         
            +
            Pomatomus|Pomatomsu|10|1|1
         
     | 
| 
      
 45 
     | 
    
         
            +
            Pomtaomus|Pomatomus|10|1|1
         
     | 
| 
      
 46 
     | 
    
         
            +
            Pomatoums|Pomatomus|10|1|1
         
     | 
| 
      
 47 
     | 
    
         
            +
            Potamomus|Pomatomus|10|1|2
         
     | 
| 
      
 48 
     | 
    
         
            +
            Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
            #it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size >= 2)
         
     | 
| 
      
 51 
     | 
    
         
            +
            serrulatus|serratulus|10|2|2
         
     | 
| 
      
 52 
     | 
    
         
            +
            Pomatomus|Poomumats|10|3|3
         
     | 
| 
      
 53 
     | 
    
         
            +
            vesiculosus|vecusilosus|10|1|4
         
     | 
| 
      
 54 
     | 
    
         
            +
            vesiculosus|vecusilosus|10|2|2
         
     | 
| 
      
 55 
     | 
    
         
            +
            trimerophyton|mertriophyton|10|1|6
         
     | 
| 
      
 56 
     | 
    
         
            +
            trimerophyton|mertriophyton|10|3|3
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
            #it should stop trying if distance exceeds maximum allowed distance
         
     | 
| 
      
 59 
     | 
    
         
            +
            Pxxxxomus|Pomatomus|10|1|4
         
     | 
| 
      
 60 
     | 
    
         
            +
            Pxxxxomus|Pomatomus|2|1|3
         
     | 
| 
      
 61 
     | 
    
         
            +
             
     | 
| 
      
 62 
     | 
    
         
            +
            #
         
     | 
| 
      
 63 
     | 
    
         
            +
            PUNCTATA|PUNCTATA|10|1|0
         
     | 
    
        data/spec/spec.opts
    ADDED
    
    | 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --colour
         
     | 
    
        data/spec/spec_helper.rb
    ADDED
    
    | 
         @@ -0,0 +1,28 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            begin
         
     | 
| 
      
 2 
     | 
    
         
            +
              require 'spec'
         
     | 
| 
      
 3 
     | 
    
         
            +
            rescue LoadError
         
     | 
| 
      
 4 
     | 
    
         
            +
              require 'rubygems' unless ENV['NO_RUBYGEMS']
         
     | 
| 
      
 5 
     | 
    
         
            +
              gem 'rspec'
         
     | 
| 
      
 6 
     | 
    
         
            +
              require 'spec'
         
     | 
| 
      
 7 
     | 
    
         
            +
            end
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            $:.unshift(File.dirname(__FILE__) + '/../lib')
         
     | 
| 
      
 10 
     | 
    
         
            +
            require 'taxamatch_rb'
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            # Reads a "|"-delimited fixture file and yields once per line.
            #
            # file       - path of the fixture file
            # fields_num - number of "|"-separated fields a data line must have
            #
            # Yields an Array of the field strings for every line that is not a
            # comment (first non-blank character "#") and has exactly fields_num
            # fields; a trailing "# ..." remark and surrounding whitespace are
            # removed from the last field. Yields nil for every other line.
            #
            # The file is opened with File.open in block form so the handle is
            # closed even when the caller's block raises, and so that a path is
            # never interpreted as a command the way Kernel#open may do.
            #
            # Returns the (closed) File object.
            def read_test_file(file, fields_num)
              File.open(file) do |f|
                f.each do |line|
                  fields = line.split("|")
                  if line !~ /^\s*#/ && fields.size == fields_num
                    # to_s covers a last field that is just "#": split('#') is
                    # then empty and [0] would be nil.
                    fields[-1] = fields[-1].split('#')[0].to_s.strip
                    yield(fields)
                  else
                    yield(nil)
                  end
                end
              end
            end
         
     | 
| 
      
 24 
     | 
    
         
            +
             
     | 
| 
      
 25 
     | 
    
         
            +
            # Wraps a raw name string in a hash with three keys: :epitheton (the
            # string as given), :normalized (Taxamatch::Normalizer.normalize of
            # it) and :phonetized (Taxamatch::Phonetizer.near_match of the
            # normalized form).
            def make_taxamatch_hash(string)
              upper = Taxamatch::Normalizer.normalize(string)
              sound_key = Taxamatch::Phonetizer.near_match(upper)
              {
                :epitheton => string,
                :normalized => upper,
                :phonetized => sound_key
              }
            end
         
     | 
| 
         @@ -0,0 +1,254 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # encoding: UTF-8
         
     | 
| 
      
 2 
     | 
    
         
            +
            require File.dirname(__FILE__) + '/spec_helper.rb'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            # Data-driven check of Taxamatch::DamerauLevenshteinMod#distance: each
            # usable row of damerau_levenshtein_mod_test.txt supplies two strings,
            # the maximum distance, the transposition block size and the expected
            # distance.
            describe 'DamerauLevenshteinMod' do
              it 'should get tests' do
                fixture = File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt'
                read_test_file(fixture, 5) do |row|
                  dl = Taxamatch::DamerauLevenshteinMod.new
                  # Comment and malformed lines arrive as nil and are skipped.
                  next unless row
                  first, second, max_distance, block_size, expected = row
                  expected = expected.to_i
                  # distance takes the block size before the maximum distance.
                  res = dl.distance(first, second, block_size.to_i, max_distance.to_i)
                  # Print the offending row before the expectation fails.
                  puts row if res != expected
                  res.should == expected
                end
              end
            end
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            # Checks the hash returned by Taxamatch::Atomizer#parse for a
            # uninomial, a binomial and a trinomial name string.
            describe 'Atomizer' do
              before(:all) do
                @parser = Taxamatch::Atomizer.new
              end

              it 'should parse uninomials' do
                plain = {
                  :all_authors=>[],
                  :all_years=>[],
                  :uninomial=>{:epitheton=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}
                }
                @parser.parse('Betula').should == plain

                with_author = {
                  :all_authors=>["LACORDAIRE"],
                  :all_years=>["1872"],
                  :uninomial=>{:epitheton=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>["1872"], :normalized_authors=>["LACORDAIRE"]}
                }
                @parser.parse('Ærenea Lacordaire, 1872').should == with_author
              end

              it 'should parse binomials' do
                expected = {
                  :all_authors=>["DOW"],
                  :all_years=>["1913"],
                  :genus=>{:epitheton=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]},
                  :species=>{:epitheton=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>["1913"], :normalized_authors=>["DOW"]}
                }
                @parser.parse('Leœptura laetifica Dow, 1913').should == expected
              end

              it 'should parse trinomials' do
                expected = {
                  :all_authors=>["BANKER", "D HALL", "D E STUNTZ"],
                  :all_years=>["1972"],
                  :genus=>{:epitheton=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]},
                  :species=>{:epitheton=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]},
                  :infraspecies=>[{:epitheton=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>["1972"], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]
                }
                @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == expected
              end
            end
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
      
 37 
     | 
    
         
            +
            # Checks Taxamatch::Normalizer.normalize (upper-casing plus
            # transliteration of accents and ligatures) and normalize_word
            # against fixed input/output pairs.
            describe 'Taxamatch::Normalizer' do
              it 'should normalize  strings' do
                # Pairs of [input, expected]; an array keeps the original order.
                samples = [
                  ['abcd', 'ABCD'],
                  ['Leœptura', 'LEOEPTURA'],
                  ['Ærenea', 'AERENEA'],
                  ['Fallén', 'FALLEN'],
                  ['Choriozopella trägårdhi', 'CHORIOZOPELLA TRAGARDHI']
                ]
                samples.each do |raw, expected|
                  Taxamatch::Normalizer.normalize(raw).should == expected
                end
              end

              it 'should normalize words' do
                cleaned = Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$')
                cleaned.should == 'L-3EOEPTURA'
              end
            end
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
            describe 'Taxamatch::Base' do
         
     | 
| 
      
 52 
     | 
    
         
            +
              before(:all) do
         
     | 
| 
      
 53 
     | 
    
         
            +
                @tm = Taxamatch::Base.new
         
     | 
| 
      
 54 
     | 
    
         
            +
              end
         
     | 
| 
      
 55 
     | 
    
         
            +
              
         
     | 
| 
      
 56 
     | 
    
         
            +
              it 'should get txt tests' do
         
     | 
| 
      
 57 
     | 
    
         
            +
                dl = Taxamatch::DamerauLevenshteinMod.new
         
     | 
| 
      
 58 
     | 
    
         
            +
                read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
         
     | 
| 
      
 59 
     | 
    
         
            +
                  if y
         
     | 
| 
      
 60 
     | 
    
         
            +
                    y[2] = y[2] == 'true' ? true : false
         
     | 
| 
      
 61 
     | 
    
         
            +
                    res = @tm.taxamatch(y[0], y[1], false)
         
     | 
| 
      
 62 
     | 
    
         
            +
                    puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]] 
         
     | 
| 
      
 63 
     | 
    
         
            +
                    res['match'].should == y[2]
         
     | 
| 
      
 64 
     | 
    
         
            +
                    res['edit_distance'].should == y[3].to_i
         
     | 
| 
      
 65 
     | 
    
         
            +
                  end
         
     | 
| 
      
 66 
     | 
    
         
            +
                end
         
     | 
| 
      
 67 
     | 
    
         
            +
              end
         
     | 
| 
      
 68 
     | 
    
         
            +
              
         
     | 
| 
      
 69 
     | 
    
         
            +
              it 'should work with names that cannot be parsed' do
         
     | 
| 
      
 70 
     | 
    
         
            +
                res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
         
     | 
| 
      
 71 
     | 
    
         
            +
                res = false
         
     | 
| 
      
 72 
     | 
    
         
            +
              end
         
     | 
| 
      
 73 
     | 
    
         
            +
              
         
     | 
| 
      
 74 
     | 
    
         
            +
              it 'should compare genera' do
         
     | 
| 
      
 75 
     | 
    
         
            +
                #edit distance 1 always match
         
     | 
| 
      
 76 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Plantago'
         
     | 
| 
      
 77 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'Plantagon'
         
     | 
| 
      
 78 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
         
     | 
| 
      
 79 
     | 
    
         
            +
                #edit_distance above threshold does not match
         
     | 
| 
      
 80 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Plantago'
         
     | 
| 
      
 81 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'This shouldnt match'
         
     | 
| 
      
 82 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
         
     | 
| 
      
 83 
     | 
    
         
            +
                #phonetic_match matches
         
     | 
| 
      
 84 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Plantagi'
         
     | 
| 
      
 85 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'Plantagy'
         
     | 
| 
      
 86 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
         
     | 
| 
      
 87 
     | 
    
         
            +
                #distance 1 in first letter also matches
         
     | 
| 
      
 88 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Xantheri'
         
     | 
| 
      
 89 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'Pantheri'
         
     | 
| 
      
 90 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
         
     | 
| 
      
 91 
     | 
    
         
            +
                #phonetic match trumps everything
         
     | 
| 
      
 92 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
         
     | 
| 
      
 93 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'Zanthery'
         
     | 
| 
      
 94 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
         
     | 
| 
      
 95 
     | 
    
         
            +
                #same first letter and distance 2 should match
         
     | 
| 
      
 96 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Xantherii'
         
     | 
| 
      
 97 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'Xantherrr'
         
     | 
| 
      
 98 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
         
     | 
| 
      
 99 
     | 
    
         
            +
                #First letter is the same and distance is 3 should match, no phonetic match
         
     | 
| 
      
 100 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Xantheriii'
         
     | 
| 
      
 101 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'Xantherrrr'
         
     | 
| 
      
 102 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
         
     | 
| 
      
 103 
     | 
    
         
            +
                #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
         
     | 
| 
      
 104 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Xant'
         
     | 
| 
      
 105 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'Xanthe'
         
     | 
| 
      
 106 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should ==  {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
         
     | 
| 
      
 107 
     | 
    
         
            +
                #Should not match if edit distance > 3 and no phonetic match
         
     | 
| 
      
 108 
     | 
    
         
            +
                g1 = make_taxamatch_hash 'Xantheriiii'
         
     | 
| 
      
 109 
     | 
    
         
            +
                g2 = make_taxamatch_hash 'Xantherrrrr'
         
     | 
| 
      
 110 
     | 
    
         
            +
                @tm.match_genera(g1, g2).should ==  {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
         
     | 
| 
      
 111 
     | 
    
         
            +
              end
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
              it 'should compare species' do
         
     | 
| 
      
 114 
     | 
    
         
            +
                #Exact match
         
     | 
| 
      
 115 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'major'
         
     | 
| 
      
 116 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'major'
         
     | 
| 
      
 117 
     | 
    
         
            +
                @tm.match_species(s1, s2).should ==  {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
         
     | 
| 
      
 118 
     | 
    
         
            +
                #Phonetic match always works
         
     | 
| 
      
 119 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'xanteriiiiiiii'
         
     | 
| 
      
 120 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'zantereeeeeeee'
         
     | 
| 
      
 121 
     | 
    
         
            +
                @tm.match_species(s1, s2).should ==  {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
         
     | 
| 
      
 122 
     | 
    
         
            +
                #Phonetic match works with different endings
         
     | 
| 
      
 123 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'majorum'
         
     | 
| 
      
 124 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'majoris'
         
     | 
| 
      
 125 
     | 
    
         
            +
                @tm.match_species(s1, s2).should ==  {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
         
     | 
| 
      
 126 
     | 
    
         
            +
                #Distance 4 matches if first 3 chars are the same
         
     | 
| 
      
 127 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'majorrrrr'
         
     | 
| 
      
 128 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'majoraaaa'
         
     | 
| 
      
 129 
     | 
    
         
            +
                @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
         
     | 
| 
      
 130 
     | 
    
         
            +
                #Should not match if Distance 4 matches and first 3 chars are not the same
         
     | 
| 
      
 131 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'majorrrrr'
         
     | 
| 
      
 132 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'marorraaa'
         
     | 
| 
      
 133 
     | 
    
         
            +
                @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
         
     | 
| 
      
 134 
     | 
    
         
            +
                #Distance 2 or 3 matches if first 1 char is the same
         
     | 
| 
      
 135 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'morrrr'
         
     | 
| 
      
 136 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'moraaa'
         
     | 
| 
      
 137 
     | 
    
         
            +
                @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
         
     | 
| 
      
 138 
     | 
    
         
            +
                #Should not match if Distance 2 or 3 and first 1 char is not the same
         
     | 
| 
      
 139 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'morrrr'
         
     | 
| 
      
 140 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'torraa'
         
     | 
| 
      
 141 
     | 
    
         
            +
                @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3} 
         
     | 
| 
      
 142 
     | 
    
         
            +
                #Distance 1 will match anywhere
         
     | 
| 
      
 143 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'major'
         
     | 
| 
      
 144 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'rajor'
         
     | 
| 
      
 145 
     | 
    
         
            +
                @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1} 
         
     | 
| 
      
 146 
     | 
    
         
            +
                #Will not match if distance is 3 and length is less than twice the edit distance
         
     | 
| 
      
 147 
     | 
    
         
            +
                s1 = make_taxamatch_hash 'marrr'
         
     | 
| 
      
 148 
     | 
    
         
            +
                s2 = make_taxamatch_hash 'maaaa'
         
     | 
| 
      
 149 
     | 
    
         
            +
                @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
         
     | 
| 
      
 150 
     | 
    
         
            +
              end
         
     | 
| 
      
 151 
     | 
    
         
            +
              
         
     | 
| 
      
 152 
     | 
    
         
            +
              it 'should match mathes' do
         
     | 
| 
      
 153 
     | 
    
         
            +
                #No trouble case
         
     | 
| 
      
 154 
     | 
    
         
            +
                gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
         
     | 
| 
      
 155 
     | 
    
         
            +
                smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
         
     | 
| 
      
 156 
     | 
    
         
            +
                @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
         
     | 
| 
      
 157 
     | 
    
         
            +
                #Will not match if either genus or sp. epithet don't match
         
     | 
| 
      
 158 
     | 
    
         
            +
                gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
         
     | 
| 
      
 159 
     | 
    
         
            +
                smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
         
     | 
| 
      
 160 
     | 
    
         
            +
                @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
         
     | 
| 
      
 161 
     | 
    
         
            +
                gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
         
     | 
| 
      
 162 
     | 
    
         
            +
                smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}    
         
     | 
| 
      
 163 
     | 
    
         
            +
                @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
         
     | 
| 
      
 164 
     | 
    
         
            +
                #Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
         
     | 
| 
      
 165 
     | 
    
         
            +
                gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
         
     | 
| 
      
 166 
     | 
    
         
            +
                smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
         
     | 
| 
      
 167 
     | 
    
         
            +
                @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
         
     | 
| 
      
 168 
     | 
    
         
            +
                #Should not have phonetic match if one of the components does not match phonetically
         
     | 
| 
      
 169 
     | 
    
         
            +
                gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
         
     | 
| 
      
 170 
     | 
    
         
            +
                smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
         
     | 
| 
      
 171 
     | 
    
         
            +
                @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
         
     | 
| 
      
 172 
     | 
    
         
            +
                gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
         
     | 
| 
      
 173 
     | 
    
         
            +
                smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
         
     | 
| 
      
 174 
     | 
    
         
            +
                @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
         
     | 
| 
      
 175 
     | 
    
         
            +
                #edit distance should be equal to the sum of edit distances
         
     | 
| 
      
 176 
     | 
    
         
            +
                gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
         
     | 
| 
      
 177 
     | 
    
         
            +
                smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
         
     | 
| 
      
 178 
     | 
    
         
            +
                @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
         
     | 
| 
      
 179 
     | 
    
         
            +
              end
         
     | 
| 
      
 180 
     | 
    
         
            +
             
     | 
| 
      
 181 
     | 
    
         
            +
              describe 'Taxamatch::Authmatch' do
         
     | 
| 
      
 182 
     | 
    
         
            +
                before(:all) do
         
     | 
| 
      
 183 
     | 
    
         
            +
                  @am = Taxamatch::Authmatch
         
     | 
| 
      
 184 
     | 
    
         
            +
                end
         
     | 
| 
      
 185 
     | 
    
         
            +
                
         
     | 
| 
      
 186 
     | 
    
         
            +
                it 'should calculate score' do
         
     | 
| 
      
 187 
     | 
    
         
            +
                  res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
         
     | 
| 
      
 188 
     | 
    
         
            +
                  res.should == 90
         
     | 
| 
      
 189 
     | 
    
         
            +
                  res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
         
     | 
| 
      
 190 
     | 
    
         
            +
                  res.should == 0
         
     | 
| 
      
 191 
     | 
    
         
            +
                  #found all authors, same year
         
     | 
| 
      
 192 
     | 
    
         
            +
                  res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
         
     | 
| 
      
 193 
     | 
    
         
            +
                  res.should == 100
         
     | 
| 
      
 194 
     | 
    
         
            +
                  #all authors, 1 year diff
         
     | 
| 
      
 195 
     | 
    
         
            +
                  res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
         
     | 
| 
      
 196 
     | 
    
         
            +
                  res.should == 54
         
     | 
| 
      
 197 
     | 
    
         
            +
                  #year is not counted in
         
     | 
| 
      
 198 
     | 
    
         
            +
                  res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
         
     | 
| 
      
 199 
     | 
    
         
            +
                  res.should == 94
         
     | 
| 
      
 200 
     | 
    
         
            +
                  #found all authors on one side, same year
         
     | 
| 
      
 201 
     | 
    
         
            +
                  res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
         
     | 
| 
      
 202 
     | 
    
         
            +
                  res.should == 91
         
     | 
| 
      
 203 
     | 
    
         
            +
                  #found all authors on one side, 1 year diff
         
     | 
| 
      
 204 
     | 
    
         
            +
                  res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
         
     | 
| 
      
 205 
     | 
    
         
            +
                  res.should == 51
         
     | 
| 
      
 206 
     | 
    
         
            +
                  #found all authors on one side, year does not count
         
     | 
| 
      
 207 
     | 
    
         
            +
                  res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
         
     | 
| 
      
 208 
     | 
    
         
            +
                  res.should == 90
         
     | 
| 
      
 209 
     | 
    
         
            +
                  #found some authors
         
     | 
| 
      
 210 
     | 
    
         
            +
                  res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
         
     | 
| 
      
 211 
     | 
    
         
            +
                  res.should == 67
         
     | 
| 
      
 212 
     | 
    
         
            +
                  #if year does not match or not present no match for previous case
         
     | 
| 
      
 213 
     | 
    
         
            +
                  res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
         
     | 
| 
      
 214 
     | 
    
         
            +
                  res.should == 0
         
     | 
| 
      
 215 
     | 
    
         
            +
                end
         
     | 
| 
      
 216 
     | 
    
         
            +
                
         
     | 
| 
      
 217 
     | 
    
         
            +
                it 'should compare years' do
         
     | 
| 
      
 218 
     | 
    
         
            +
                  @am.compare_years([1882],[1880]).should == 2
         
     | 
| 
      
 219 
     | 
    
         
            +
                  @am.compare_years([1882],[]).should == nil
         
     | 
| 
      
 220 
     | 
    
         
            +
                  @am.compare_years([],[]).should == 0
         
     | 
| 
      
 221 
     | 
    
         
            +
                  @am.compare_years([1788,1798], [1788,1798]).should be_nil
         
     | 
| 
      
 222 
     | 
    
         
            +
                end
         
     | 
| 
      
 223 
     | 
    
         
            +
                
         
     | 
| 
      
 224 
     | 
    
         
            +
                it 'should remove duplicate authors' do 
         
     | 
| 
      
 225 
     | 
    
         
            +
                  #Lin submatches Linnaeus and its size 3 is big enough to remove Linnaeus
         
     | 
| 
      
 226 
     | 
    
         
            +
                  #Muller is identical
         
     | 
| 
      
 227 
     | 
    
         
            +
                  res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
         
     | 
| 
      
 228 
     | 
    
         
            +
                  res.should == [[], []]
         
     | 
| 
      
 229 
     | 
    
         
            +
                  #same in different order
         
     | 
| 
      
 230 
     | 
    
         
            +
                  res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
         
     | 
| 
      
 231 
     | 
    
         
            +
                  res.should == [[], []]      
         
     | 
| 
      
 232 
     | 
    
         
            +
                  #auth Li submatches Linnaeus, but Li size is less than the 3 required to remove Linnaeus
         
     | 
| 
      
 233 
     | 
    
         
            +
                  res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
         
     | 
| 
      
 234 
     | 
    
         
            +
                  res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
         
     | 
| 
      
 235 
     | 
    
         
            +
                  #fuzzy match
         
     | 
| 
      
 236 
     | 
    
         
            +
                  res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
         
     | 
| 
      
 237 
     | 
    
         
            +
                  res.should == [["Dem"], ["Stepanov"]]
         
     | 
| 
      
 238 
     | 
    
         
            +
                  res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
         
     | 
| 
      
 239 
     | 
    
         
            +
                  res.should == [['Linnaeus', 'Muller'], ['Kenn']]
         
     | 
| 
      
 240 
     | 
    
         
            +
                  res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
         
     | 
| 
      
 241 
     | 
    
         
            +
                  res.should == [[],['Kurtz']]
         
     | 
| 
      
 242 
     | 
    
         
            +
                end
         
     | 
| 
      
 243 
     | 
    
         
            +
             
     | 
| 
      
 244 
     | 
    
         
            +
                it 'should fuzzy match authors' do
         
     | 
| 
      
 245 
     | 
    
         
            +
                  #TODO: fix the bug revealed by this test
         
     | 
| 
      
 246 
     | 
    
         
            +
                  # res = @am.fuzzy_match_authors('L', 'Muller')
         
     | 
| 
      
 247 
     | 
    
         
            +
                  # res.should be_false
         
     | 
| 
      
 248 
     | 
    
         
            +
                end
         
     | 
| 
      
 249 
     | 
    
         
            +
                
         
     | 
| 
      
 250 
     | 
    
         
            +
              end
         
     | 
| 
      
 251 
     | 
    
         
            +
             
     | 
| 
      
 252 
     | 
    
         
            +
            end
         
     | 
| 
      
 253 
     | 
    
         
            +
             
     | 
| 
      
 254 
     | 
    
         
            +
             
     | 
| 
         @@ -0,0 +1,45 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ###
         
     | 
| 
      
 2 
     | 
    
         
            +
            #
         
     | 
| 
      
 3 
     | 
    
         
            +
            # Tests for string comparison by taxamatch algorithm
         
     | 
| 
      
 4 
     | 
    
         
            +
            # name1|name2|match|edit_distance
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            ##
         
     | 
| 
      
 7 
     | 
    
         
            +
            # Comparing uninomials
         
     | 
| 
      
 8 
     | 
    
         
            +
            Pomatomus|Pomatomas|true|1
         
     | 
| 
      
 9 
     | 
    
         
            +
            Pomatomus L.|Pomatomas Linn.|true|1
         
     | 
| 
      
 10 
     | 
    
         
            +
            Pomatomus Ber|Pomatomas Linn|false|1
         
     | 
| 
      
 11 
     | 
    
         
            +
            Pomatomus L. 1753|Pomatomus Linn. 1800|false|0
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            ## additional authorship should match
         
     | 
| 
      
 14 
     | 
    
         
            +
            Puma concolor|Puma concolor L.|true|0
         
     | 
| 
      
 15 
     | 
    
         
            +
            #
         
     | 
| 
      
 16 
     | 
    
         
            +
            ## one-letter misspelling in species epithet should match
         
     | 
| 
      
 17 
     | 
    
         
            +
            Puma concolor|Puma cancolor|true|1
         
     | 
| 
      
 18 
     | 
    
         
            +
            #
         
     | 
| 
      
 19 
     | 
    
         
            +
            Pomatomus saltatrix|Pomatomus saltratix|true|2
         
     | 
| 
      
 20 
     | 
    
         
            +
            Pomatomus saltator|Pomatomus saltatrix|true|3
         
     | 
| 
      
 21 
     | 
    
         
            +
            #
         
     | 
| 
      
 22 
     | 
    
         
            +
            Loligo pealeii|Loligo plei|false|3
         
     | 
| 
      
 23 
     | 
    
         
            +
            #
         
     | 
| 
      
 24 
     | 
    
         
            +
            ## different authors should not match
         
     | 
| 
      
 25 
     | 
    
         
            +
            Puma concolor Linnaeus|Puma concolor Kurtz|false|0
         
     | 
| 
      
 26 
     | 
    
         
            +
            #
         
     | 
| 
      
 27 
     | 
    
         
            +
            ##real life examples
         
     | 
| 
      
 28 
     | 
    
         
            +
            Biatora borealis|Bactra borealis Diakonoff 1964|false|3
         
     | 
| 
      
 29 
     | 
    
         
            +
            #
         
     | 
| 
      
 30 
     | 
    
         
            +
            Homo sapien|Homo sapiens|true|1
         
     | 
| 
      
 31 
     | 
    
         
            +
            Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true|1
         
     | 
| 
      
 32 
     | 
    
         
            +
            Homo sapiens Mozzherin|Homo sapiens Linneaus|false|0
         
     | 
| 
      
 33 
     | 
    
         
            +
            #
         
     | 
| 
      
 34 
     | 
    
         
            +
            Quinqueloculina punctata|Quinqueloculina punctata d'Orbigny 1905|true|0
         
     | 
| 
      
 35 
     | 
    
         
            +
            Pomatomus saltator (Linnaeus, 1766)|Pomatomus saltatrix (Linnaeus, 1766)|true|0|3
         
     | 
| 
      
 36 
     | 
    
         
            +
            #
         
     | 
| 
      
 37 
     | 
    
         
            +
            #Trinomial names
         
     | 
| 
      
 38 
     | 
    
         
            +
            Homo sapiens stupidus|Homo spiens stupidus|true|1
         
     | 
| 
      
 39 
     | 
    
         
            +
            Pomatomus saltator saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|true|1
         
     | 
| 
      
 40 
     | 
    
         
            +
            Pomatomus saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|false|5
         
     | 
| 
      
 41 
     | 
    
         
            +
            Pomatomus saltator saltator saltatorische|Pomatomus saltator soltator|true|1
         
     | 
| 
      
 42 
     | 
    
         
            +
             
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,101 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification 
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: taxamatch_rb
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version 
         
     | 
| 
      
 4 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 5 
     | 
    
         
            +
              segments: 
         
     | 
| 
      
 6 
     | 
    
         
            +
              - 0
         
     | 
| 
      
 7 
     | 
    
         
            +
              - 6
         
     | 
| 
      
 8 
     | 
    
         
            +
              - 0
         
     | 
| 
      
 9 
     | 
    
         
            +
              version: 0.6.0
         
     | 
| 
      
 10 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 11 
     | 
    
         
            +
            authors: 
         
     | 
| 
      
 12 
     | 
    
         
            +
            - Dmitry Mozzherin
         
     | 
| 
      
 13 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 14 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 15 
     | 
    
         
            +
            cert_chain: []
         
     | 
| 
      
 16 
     | 
    
         
            +
             
     | 
| 
      
 17 
     | 
    
         
            +
            date: 2010-03-19 00:00:00 -04:00
         
     | 
| 
      
 18 
     | 
    
         
            +
            default_executable: 
         
     | 
| 
      
 19 
     | 
    
         
            +
            dependencies: 
         
     | 
| 
      
 20 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency 
         
     | 
| 
      
 21 
     | 
    
         
            +
              name: RubyInline
         
     | 
| 
      
 22 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 23 
     | 
    
         
            +
              requirement: &id001 !ruby/object:Gem::Requirement 
         
     | 
| 
      
 24 
     | 
    
         
            +
                requirements: 
         
     | 
| 
      
 25 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 26 
     | 
    
         
            +
                  - !ruby/object:Gem::Version 
         
     | 
| 
      
 27 
     | 
    
         
            +
                    segments: 
         
     | 
| 
      
 28 
     | 
    
         
            +
                    - 0
         
     | 
| 
      
 29 
     | 
    
         
            +
                    version: "0"
         
     | 
| 
      
 30 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 31 
     | 
    
         
            +
              version_requirements: *id001
         
     | 
| 
      
 32 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency 
         
     | 
| 
      
 33 
     | 
    
         
            +
              name: biodiversity
         
     | 
| 
      
 34 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 35 
     | 
    
         
            +
              requirement: &id002 !ruby/object:Gem::Requirement 
         
     | 
| 
      
 36 
     | 
    
         
            +
                requirements: 
         
     | 
| 
      
 37 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 38 
     | 
    
         
            +
                  - !ruby/object:Gem::Version 
         
     | 
| 
      
 39 
     | 
    
         
            +
                    segments: 
         
     | 
| 
      
 40 
     | 
    
         
            +
                    - 0
         
     | 
| 
      
 41 
     | 
    
         
            +
                    - 5
         
     | 
| 
      
 42 
     | 
    
         
            +
                    - 13
         
     | 
| 
      
 43 
     | 
    
         
            +
                    version: 0.5.13
         
     | 
| 
      
 44 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 45 
     | 
    
         
            +
              version_requirements: *id002
         
     | 
| 
      
 46 
     | 
    
         
            +
            description: This gem implements algorithms for fuzzy matching scientific names developed by Tony Rees
         
     | 
| 
      
 47 
     | 
    
         
            +
            email: dmozzherin@eol.org
         
     | 
| 
      
 48 
     | 
    
         
            +
            executables: []
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
            extra_rdoc_files: 
         
     | 
| 
      
 53 
     | 
    
         
            +
            - LICENSE
         
     | 
| 
      
 54 
     | 
    
         
            +
            - README.rdoc
         
     | 
| 
      
 55 
     | 
    
         
            +
            files: 
         
     | 
| 
      
 56 
     | 
    
         
            +
            - README.rdoc
         
     | 
| 
      
 57 
     | 
    
         
            +
            - lib/taxamatch_rb.rb
         
     | 
| 
      
 58 
     | 
    
         
            +
            - lib/taxamatch_rb/atomizer.rb
         
     | 
| 
      
 59 
     | 
    
         
            +
            - lib/taxamatch_rb/authmatch.rb
         
     | 
| 
      
 60 
     | 
    
         
            +
            - lib/taxamatch_rb/damerau_levenshtein_mod.rb
         
     | 
| 
      
 61 
     | 
    
         
            +
            - lib/taxamatch_rb/normalizer.rb
         
     | 
| 
      
 62 
     | 
    
         
            +
            - lib/taxamatch_rb/phonetizer.rb
         
     | 
| 
      
 63 
     | 
    
         
            +
            - spec/damerau_levenshtein_mod_test.txt
         
     | 
| 
      
 64 
     | 
    
         
            +
            - spec/spec.opts
         
     | 
| 
      
 65 
     | 
    
         
            +
            - spec/spec_helper.rb
         
     | 
| 
      
 66 
     | 
    
         
            +
            - spec/taxamatch_rb_spec.rb
         
     | 
| 
      
 67 
     | 
    
         
            +
            - spec/taxamatch_test.txt
         
     | 
| 
      
 68 
     | 
    
         
            +
            - LICENSE
         
     | 
| 
      
 69 
     | 
    
         
            +
            has_rdoc: true
         
     | 
| 
      
 70 
     | 
    
         
            +
            homepage: http://github.com/dimus/taxamatch_rb
         
     | 
| 
      
 71 
     | 
    
         
            +
            licenses: []
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 74 
     | 
    
         
            +
            rdoc_options: 
         
     | 
| 
      
 75 
     | 
    
         
            +
            - --charset=UTF-8
         
     | 
| 
      
 76 
     | 
    
         
            +
            require_paths: 
         
     | 
| 
      
 77 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 78 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 79 
     | 
    
         
            +
              requirements: 
         
     | 
| 
      
 80 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 81 
     | 
    
         
            +
                - !ruby/object:Gem::Version 
         
     | 
| 
      
 82 
     | 
    
         
            +
                  segments: 
         
     | 
| 
      
 83 
     | 
    
         
            +
                  - 0
         
     | 
| 
      
 84 
     | 
    
         
            +
                  version: "0"
         
     | 
| 
      
 85 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 86 
     | 
    
         
            +
              requirements: 
         
     | 
| 
      
 87 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 88 
     | 
    
         
            +
                - !ruby/object:Gem::Version 
         
     | 
| 
      
 89 
     | 
    
         
            +
                  segments: 
         
     | 
| 
      
 90 
     | 
    
         
            +
                  - 0
         
     | 
| 
      
 91 
     | 
    
         
            +
                  version: "0"
         
     | 
| 
      
 92 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 93 
     | 
    
         
            +
             
     | 
| 
      
 94 
     | 
    
         
            +
            rubyforge_project: 
         
     | 
| 
      
 95 
     | 
    
         
            +
            rubygems_version: 1.3.6
         
     | 
| 
      
 96 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 97 
     | 
    
         
            +
            specification_version: 3
         
     | 
| 
      
 98 
     | 
    
         
            +
            summary: Implementation of Tony Rees Taxamatch algorithms
         
     | 
| 
      
 99 
     | 
    
         
            +
            test_files: 
         
     | 
| 
      
 100 
     | 
    
         
            +
            - spec/spec_helper.rb
         
     | 
| 
      
 101 
     | 
    
         
            +
            - spec/taxamatch_rb_spec.rb
         
     |