taxamatch_rb 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.rdoc +61 -0
- data/lib/taxamatch_rb.rb +117 -0
- data/lib/taxamatch_rb/atomizer.rb +82 -0
- data/lib/taxamatch_rb/authmatch.rb +89 -0
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +139 -0
- data/lib/taxamatch_rb/normalizer.rb +55 -0
- data/lib/taxamatch_rb/phonetizer.rb +79 -0
- data/spec/damerau_levenshtein_mod_test.txt +63 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/taxamatch_rb_spec.rb +254 -0
- data/spec/taxamatch_test.txt +45 -0
- metadata +101 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Dmitry Mozzherin
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
= taxamatch_rb
|
2
|
+
|
3
|
+
Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
|
4
|
+
|
5
|
+
The purpose of the Taxamatch gem is to facilitate fuzzy comparison of two renderings of a scientific name, to find out whether they actually refer to the same scientific name.
|
6
|
+
|
7
|
+
require 'taxamatch_rb'
|
8
|
+
tm = Taxamatch::Base.new
|
9
|
+
tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
|
10
|
+
tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
|
11
|
+
tm.taxamatch('Homo sapiens Mozzherin', 'Homo sapiens Linnaeus') #returns false
|
12
|
+
|
13
|
+
Taxamatch_Rb is compatible with ruby versions 1.8.7 and 1.9.1 and higher
|
14
|
+
|
15
|
+
== Installation
|
16
|
+
|
17
|
+
sudo gem install dimus-taxamatch_rb --source http://gems.github.com
|
18
|
+
|
19
|
+
or
|
20
|
+
sudo gem sources -a http://gems.github.com #(you only have to do this once)
|
21
|
+
sudo gem install dimus-taxamatch_rb
|
22
|
+
|
23
|
+
== Usage
|
24
|
+
|
25
|
+
require 'rubygems' #not needed for ruby > 1.9.1
|
26
|
+
require 'taxamatch_rb'
|
27
|
+
|
28
|
+
tm = Taxamatch::Base.new
|
29
|
+
|
30
|
+
* compare full scientific names
|
31
|
+
|
32
|
+
tm.taxamatch('Hommo sapiens L.', 'Homo sapiens Linnaeus')
|
33
|
+
|
34
|
+
* preparse names for the matching (necessary for large databases of scientific names)
|
35
|
+
|
36
|
+
p = Taxamatch::Atomizer.new
|
37
|
+
parsed_name1 = p.parse('Monacanthus fronticinctus Günther 1867 sec. Eschmeyer 2004')
|
38
|
+
parsed_name2 = p.parse('Monacanthus fronticinctus (Gunther, 1867)')
|
39
|
+
|
40
|
+
* compare preparsed names
|
41
|
+
|
42
|
+
tm.taxamatch_preparsed(parsed_name1, parsed_name2)
|
43
|
+
|
44
|
+
* compare genera
|
45
|
+
|
46
|
+
tm.match_genera('Monacanthus', 'MONOCANTUS')
|
47
|
+
|
48
|
+
* compare species
|
49
|
+
|
50
|
+
tm.match_species('fronticinctus', 'frontecinctus')
|
51
|
+
|
52
|
+
* compare authors and years
|
53
|
+
|
54
|
+
Taxamatch::Authmatch.authmatch(['Linnaeus'], ['L','Muller'], [1786], [1787])
|
55
|
+
|
56
|
+
|
57
|
+
You can find more examples in the spec section of the code.
|
58
|
+
|
59
|
+
== Copyright
|
60
|
+
|
61
|
+
Copyright (c) 2009 Dmitry Mozzherin. See LICENSE for details.
|
data/lib/taxamatch_rb.rb
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
3
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
4
|
+
# $:.unshift('taxamatch_rb')
|
5
|
+
require 'taxamatch_rb/damerau_levenshtein_mod'
|
6
|
+
require 'taxamatch_rb/atomizer'
|
7
|
+
require 'taxamatch_rb/normalizer'
|
8
|
+
require 'taxamatch_rb/phonetizer'
|
9
|
+
require 'taxamatch_rb/authmatch'
|
10
|
+
require 'ruby-debug'
|
11
|
+
|
12
|
+
# $KCODE is only meaningful on Ruby 1.8. Compare the whole version string:
# checking just the minor number mistakes 2.x/3.x (minor < 9) for 1.8.
$KCODE = 'u' if RUBY_VERSION < '1.9'
|
13
|
+
|
14
|
+
module Taxamatch

  # Fuzzy comparison of scientific names.
  #
  # Names are either raw strings (parsed with Taxamatch::Atomizer) or the
  # hashes Atomizer#parse returns. Every match result is a hash with the
  # string keys 'match', 'edit_distance' and 'phonetic_match'; results of
  # match_multinomial additionally carry 'score'.
  class Base

    def initialize
      @parser = Taxamatch::Atomizer.new
      @dlm = Taxamatch::DamerauLevenshteinMod.new
    end

    # Takes two scientific names and returns true if the names match and
    # false if they don't.
    #
    # str1, str2     - name strings, handed to the parser.
    # return_boolean - true (default): return only the boolean verdict;
    #                  false: return the whole result hash, or nil when the
    #                  names could not be compared.
    def taxamatch(str1, str2, return_boolean = true)
      preparsed_1 = @parser.parse(str1)
      preparsed_2 = @parser.parse(str2)
      # Deliberate best effort: a comparison that blows up on oddly shaped
      # parser output counts as "no result" instead of crashing the caller.
      match = begin
        taxamatch_preparsed(preparsed_1, preparsed_2)
      rescue StandardError
        nil
      end
      return match unless return_boolean
      match ? match['match'] : false
    end

    # Takes two hashes of parsed scientific names, analyses them and returns
    # the match result hash. Useful when species strings are preparsed.
    #
    # Returns nil when either name is nil (Atomizer#parse returns nil for
    # names it cannot parse) or when the names share neither a uninomial nor
    # a genus. A positive name match is confirmed only when the author/year
    # score from match_authors is non-zero.
    def taxamatch_preparsed(preparsed_1, preparsed_2)
      return nil unless preparsed_1 && preparsed_2
      result = nil
      result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
      result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
      if result && result['match']
        result['match'] = match_authors(preparsed_1, preparsed_2) != 0
      end
      result
    end

    # Uninomials are compared with the genus rules.
    def match_uninomial(preparsed_1, preparsed_2)
      match_genera(preparsed_1[:uninomial], preparsed_2[:uninomial])
    end

    # Compares genus, species and (first) infraspecies of two parsed names.
    #
    # The returned hash carries 'score' = 1 - edit_distance / (half of the
    # summed epithet lengths), computed in floating point.
    def match_multinomial(preparsed_1, preparsed_2)
      gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
      sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
      total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size +
        preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
      infra_1 = preparsed_1[:infraspecies]
      infra_2 = preparsed_2[:infraspecies]
      if infra_1 && infra_2
        infrasp_match = match_species(infra_1[0], infra_2[0])
        total_length += infra_1[0][:epitheton].size + infra_2[0][:epitheton].size
        match_hash = match_matches(gen_match, sp_match, infrasp_match)
      elsif infra_1 || infra_2
        # Only one of the names has an infraspecies: never a match.
        match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
        total_length += (infra_1 || infra_2)[0][:epitheton].size
      else
        match_hash = match_matches(gen_match, sp_match)
      end
      # Store the score in the returned hash (a non-destructive merge whose
      # result is discarded would lose it) and avoid integer truncation.
      match_hash['score'] = 1 - match_hash['edit_distance'] / (total_length / 2.0)
      match_hash
    end

    # Compares two genus hashes (keys :normalized and :phonetized are read).
    def match_genera(genus1, genus2)
      normalized_1 = genus1[:normalized]
      normalized_2 = genus2[:normalized]
      min_length = [normalized_1.size, normalized_2.size].min
      ed = @dlm.distance(normalized_1, normalized_2, 1, 3) #TODO put block = 2
      # NOTE(review): with an Integer distance this is integer division, so the
      # cut-off only fires once ed >= the shorter length. The spec expects a
      # phonetic match at distance 4, so the truncation is kept as is.
      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed / min_length > 0.2
      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]

      # For distances of 2 or 3 the first letters must agree; compare the
      # normalized strings (indexing the hash itself with 0 is always nil).
      match = ed <= 3 && (min_length > ed * 2) && (ed < 2 || normalized_1[0] == normalized_2[0])
      {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
    end

    # Compares two species (or infraspecies) hashes. The inputs are not
    # modified: ending normalization is applied to copies of :phonetized.
    def match_species(sp1, sp2)
      normalized_1 = sp1[:normalized]
      normalized_2 = sp2[:normalized]
      min_length = [normalized_1.size, normalized_2.size].min
      phonetized_1 = Taxamatch::Phonetizer.normalize_ending(sp1[:phonetized].dup)
      phonetized_2 = Taxamatch::Phonetizer.normalize_ending(sp2[:phonetized].dup)
      ed = @dlm.distance(normalized_1, normalized_2, 1, 4) #TODO put block 4
      # NOTE(review): integer division, see match_genera.
      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed / min_length > 0.3334
      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if phonetized_1 == phonetized_2

      match = ed <= 4 && (min_length >= ed * 2) &&
        (ed < 2 || normalized_1[0] == normalized_2[0]) &&
        (ed < 4 || normalized_1[0...3] == normalized_2[0...3])
      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
    end

    # Returns the author/year score computed by Taxamatch::Authmatch.authmatch
    # from the :all_authors and :all_years entries of both names.
    def match_authors(preparsed_1, preparsed_2)
      au1 = preparsed_1[:all_authors]
      au2 = preparsed_2[:all_authors]
      yr1 = preparsed_1[:all_years]
      yr2 = preparsed_2[:all_years]
      Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
    end

    # Folds genus, species and optional infraspecies results into one hash.
    # The species_match hash is updated in place and returned.
    def match_matches(genus_match, species_match, infraspecies_match = nil)
      match = species_match
      if infraspecies_match
        match['edit_distance'] += infraspecies_match['edit_distance']
        match['match'] &&= infraspecies_match['match']
        match['phonetic_match'] &&= infraspecies_match['phonetic_match']
      end
      match['edit_distance'] += genus_match['edit_distance']
      # Total distance cap: 6 with an infraspecies, 4 without.
      match['match'] = false if match['edit_distance'] > (infraspecies_match ? 6 : 4)
      match['match'] &&= genus_match['match']
      match['phonetic_match'] &&= genus_match['phonetic_match']
      match
    end

  end

end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'biodiversity'
|
3
|
+
|
4
|
+
module Taxamatch

  # Breaks a scientific name string into the pieces used for matching:
  # uninomial/genus/species/infraspecies epithets (raw, normalized and
  # phonetized) plus the authors and years attached to them.
  class Atomizer
    # Parser output (the :scientificName part) of the most recent parse call.
    attr_reader :parsed_raw

    def initialize
      @parser = ScientificNameParser.new
      @parsed_raw = nil
      @res = {}
    end

    # Parses +name+ and returns the atomized hash, or nil when the parser
    # could not parse the name or no name element was found.
    def parse(name)
      @res = {:all_authors => [], :all_years => []}
      @parsed_raw = @parser.parse(name)[:scientificName]
      organize_results
    end

    protected

    # Fills @res from the first :details entry of the parser output.
    def organize_results
      return nil unless @parsed_raw[:parsed]
      details = @parsed_raw[:details][0]
      process_node(:uninomial, details[:uninomial])
      process_node(:genus, details[:genus])
      process_node(:species, details[:species], true)
      process_infraspecies(details[:infraspecies])
      @res[:all_authors] = @res[:all_authors].uniq.map { |author| Taxamatch::Normalizer.normalize(author) }
      @res[:all_years].uniq!
      # :all_authors and :all_years are always present; anything beyond them
      # means at least one name element was stored.
      return nil unless @res.keys.size > 2
      @res
    end

    # Stores one name element under @res[name]; does nothing for a missing node.
    def process_node(name, node, is_species = false)
      return unless node
      epitheton = node[:epitheton]
      entry = {}
      @res[name] = entry
      entry[:epitheton] = epitheton
      entry[:normalized] = Taxamatch::Normalizer.normalize(epitheton)
      entry[:phonetized] = Taxamatch::Phonetizer.near_match(epitheton, is_species)
      get_authors_years(node, entry)
    end

    # Stores every infraspecies element, in order, under @res[:infraspecies].
    def process_infraspecies(node)
      return unless node
      collected = []
      @res[:infraspecies] = collected
      node.each do |infraspecies|
        epitheton = infraspecies[:epitheton]
        entry = {
          :epitheton => epitheton,
          :normalized => Taxamatch::Normalizer.normalize(epitheton),
          :phonetized => Taxamatch::Phonetizer.near_match(epitheton, true)
        }
        get_authors_years(infraspecies, entry)
        collected << entry
      end
    end

    # Collects authors and years of the basionym and combination author teams
    # (including their ex-author teams) into +res+ and into the name-wide
    # :all_authors / :all_years lists.
    def get_authors_years(node, res)
      authors = []
      years = []
      [:basionymAuthorTeam, :combinationAuthorTeam].each do |team_key|
        team = node[team_key]
        next unless team
        authors += team[:author]
        years << team[:year] if team[:year]
        ex_team = team[:exAuthorTeam]
        next unless ex_team
        authors += ex_team[:author]
        years << ex_team[:year] if ex_team[:year]
      end
      res[:authors] = authors.uniq
      res[:years] = years.uniq
      res[:normalized_authors] = res[:authors].map { |author| Taxamatch::Normalizer.normalize_author(author) }
      @res[:all_authors] += res[:normalized_authors] unless res[:normalized_authors].empty?
      @res[:all_years] += res[:years] unless res[:years].empty?
    end

  end
end
|
82
|
+
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
|
2
|
+
|
3
|
+
module Taxamatch
  # Scores how well the authors and years of two names agree.
  # All methods are class methods; scores are integers, 0 meaning "no match".
  class Authmatch

    # Returns the agreement score for two author lists and two year lists.
    def self.authmatch(authors1, authors2, years1, years2)
      leftover1, leftover2 = remove_duplicate_authors(authors1, authors2)
      get_score(authors1, leftover1, authors2, leftover2, compare_years(years1, years2))
    end

    # Turns the author counts before/after duplicate removal and the year
    # difference (nil when years are not comparable) into a score.
    # Scores of 50 or less are reported as 0.
    def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
      count_before = authors1.size + authors2.size
      count_after = unique_authors1.size + unique_authors2.size
      score =
        if count_after == 0
          # every author found a counterpart
          case year_diff
          when nil then 94
          when 0 then 100
          when 1 then 54
          else 0
          end
        elsif unique_authors1.size == 0 || unique_authors2.size == 0
          # one side is fully covered by the other
          case year_diff
          when nil then 90
          when 0 then 91
          when 1 then 51
          else 0
          end
        else
          shared = ((1 - count_after.to_f/count_before.to_f) * 100).round
          (year_diff == nil || year_diff == 0) ? shared : 0
        end
      score > 50 ? score : 0
    end

    # Returns copies of both lists with the authors removed that have a
    # counterpart (equal, prefix/abbreviation, or fuzzy match) in the other list.
    def self.remove_duplicate_authors(authors1, authors2)
      leftover1 = authors1.dup
      leftover2 = authors2.dup
      authors1.each do |au1|
        authors2.each do |au2|
          # au1 is equal to au2 or abbreviates it / au2 is equal to au1 or abbreviates it
          au1_match = (au1 == au2[0...au1.size])
          au2_match = (au1[0...au2.size] == au2)
          strong = (au1_match && au2_match) || (au1_match && au1.size >= 3) || (au2_match && au2.size >= 3)
          #TODO: masking a bug in damerau levenshtsin mod which appears comparing 1letter to a longer string
          fuzzy = !au1_match && !au2_match && au1.size > 1 && au2.size > 1 && fuzzy_match_authors(au1, au2)
          if strong || fuzzy
            leftover1.delete au1
            leftover2.delete au2
          elsif au1_match
            leftover1.delete au1
          elsif au2_match
            leftover2.delete au2
          end
        end
      end
      [leftover1, leftover2]
    end

    # True when the edit distance between two author strings is small enough
    # relative to the shorter string (and, from distance 2, the first
    # characters agree).
    def self.fuzzy_match_authors(author1, author2)
      shortest = [author1.size, author2.size].min
      #get around a bug in C code, but it really has to be fixed
      ed = Taxamatch::DamerauLevenshteinMod.new.distance(author1, author2,2,3)
      return false unless ed <= 3
      return false unless shortest > ed * 2
      ed < 2 || author1[0] == author2[0]
    end

    # Returns 0 when neither name has a year, the absolute difference when
    # both have exactly one year, and nil otherwise.
    def self.compare_years(years1, years2)
      return 0 if [years1, years2] == [[], []]
      if years1.size == 1 && years2.size == 1
        (years1[0].to_i - years2[0].to_i).abs
      end
    end
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'rubygems'
|
3
|
+
require 'inline'
|
4
|
+
require 'time'
|
5
|
+
module Taxamatch

  # Modified Damerau-Levenshtein edit distance. The work is done by a C
  # function compiled at load time through RubyInline (see +inline+ below).
  class DamerauLevenshteinMod
    # Returns the edit distance between two strings as an Integer.
    #
    # str1, str2   - strings; each is unpacked into an array of Unicode
    #                codepoints ("U*") before it is handed to distance_utf.
    # block_size   - passed through to distance_utf (default 2); there it caps
    #                the length of the blocks tested for transposition.
    # max_distance - passed through to distance_utf (default 10); the C loop
    #                stops early once the best value of a pass exceeds it.
    def distance(str1, str2, block_size=2, max_distance=10)
      # puts str1.unpack("U*");
      distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
    end

    # Defines the instance method distance_utf(_s, _t, block_size, max_distance);
    # _s and _t are Ruby arrays of integer codepoints.
    # NOTE(review): TODOs elsewhere in this gem (Authmatch, spec data) report
    # wrong results when a 1-character string is compared with a longer one.
    inline do |builder|
      builder.c "
      static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
        int i, i1, j, j1, k, sl, half_sl, tl, half_tl, cost, *d, distance, del, ins, subs, transp, block;
        int stop_execution = 0;
        int min = 0;
        int current_distance = 0;

        VALUE *sv = RARRAY_PTR(_s);
        VALUE *tv = RARRAY_PTR(_t);

        sl = RARRAY_LEN(_s);
        tl = RARRAY_LEN(_t);

        if (sl == 0) return INT2NUM(tl);
        if (tl == 0) return INT2NUM(sl);
        //case of lengths 1 must present or it will break further in the code
        if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);

        int s[sl];
        int t[tl];

        for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
        for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);

        sl++;
        tl++;

        //one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
        d = malloc((sizeof(int))*(sl)*(tl));
        //populate 'vertical' row starting from the 2nd position (first one is filled already)
        for(i = 0; i < tl; i++){
          d[i*sl] = i;
        }

        //fill up array with scores
        for(i = 1; i<sl; i++){
          d[i] = i;
          if (stop_execution == 1) break;
          current_distance = 10000;
          for(j = 1; j<tl; j++){

            cost = 1;
            if(s[i-1] == t[j-1]) cost = 0;

            half_sl = (sl - 1)/2;
            half_tl = (tl - 1)/2;

            block = block_size < half_sl ? block_size : half_sl;
            block = block < half_tl ? block : half_tl;

            while (block >= 1){
              int swap1 = 1;
              int swap2 = 1;
              i1 = i - (block * 2);
              j1 = j - (block * 2);
              for (k = i1; k < i1 + block; k++) {
                if (s[k] != t[k + block]){
                  swap1 = 0;
                  break;
                }
              }
              for (k = j1; k < j1 + block; k++) {
                if (t[k] != s[k + block]){
                  swap2 = 0;
                  break;
                }
              }

              del = d[j*sl + i - 1] + 1;
              ins = d[(j-1)*sl + i] + 1;
              min = del;
              if (ins < min) min = ins;
              //if (i == 2 && j==2) return INT2NUM(swap2+5);
              if (i >= block && j >= block && swap1 == 1 && swap2 == 1){
                transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
                if (transp < min) min = transp;
                block = 0;
              } else if (block == 1) {
                subs = d[(j-1)*sl + i - 1] + cost;
                if (subs < min) min = subs;
              }
              block--;
            }
            d[j*sl+i]=min;
            if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
          }
          if (current_distance > max_distance) {
            stop_execution = 1;
          }
        }
        distance=d[sl * tl - 1];
        if (stop_execution == 1) distance = current_distance;

        free(d);
        return INT2NUM(distance);
      }
      "
    end
  end
end
|
114
|
+
|
115
|
+
# Ad-hoc benchmark, executed only when this file is run directly: times
# 100000 comparisons through #distance (which unpacks the strings on every
# call) and through #distance_utf (codepoints unpacked once up front).
if __FILE__ == $0
  dlm = Taxamatch::DamerauLevenshteinMod.new
  s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
  t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')

  #puts s.join(",")
  #puts t.join(",")

  started_at = Time.now
  100000.times do
    dlm.distance('Cedarinia scabra Sjöstedt 1921', 'Cedarinia scabra Söjstedt 1921',1,10)
  end
  puts "with unpack time: #{Time.now - started_at} sec"

  started_at = Time.now
  100000.times do
    dlm.distance_utf(s, t, 1, 10)
  end
  puts "utf time: #{Time.now - started_at} sec"

  #puts dlm.distance('Cedarinia scabra Sjöstedt 1921','Cedarinia scabra Söjstedt 1921')
  #puts dlm.distance_utf(s, t, 2, 10)
  #puts dlm.distance('tar','atp',1,10);
  puts dlm.distance('sub', 'usb', 1, 10)
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Taxamatch

  # String normalization helpers: accented characters and ligatures are
  # replaced by ASCII letters, then the text is upcased.
  module Normalizer
    # Ordered [pattern, replacement] pairs applied by utf8_to_ascii.
    ASCII_REPLACEMENTS = [
      [/[ÀÂÅÃÄÁẤẠ]/, "A"],
      [/[ÉÈÊË]/, "E"],
      [/[ÍÌÎÏ]/, "I"],
      [/[ÓÒÔØÕÖỚỔ]/, "O"],
      [/[ÚÙÛÜ]/, "U"],
      [/[Ý]/, "Y"],
      [/Æ/, "AE"],
      [/[ČÇ]/, "C"],
      [/[ŠŞ]/, "S"],
      [/[Đ]/, "D"],
      [/Ž/, "Z"],
      [/Ñ/, "N"],
      [/Œ/, "OE"],
      [/ß/, "B"],
      [/Ķ/, "K"],
      [/[áàâåãäăãắảạậầằ]/, "a"],
      [/[éèêëĕěếệểễềẻ]/, "e"],
      [/[íìîïǐĭīĩỉï]/, "i"],
      [/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o"],
      [/[úùûüůưừựủứụ]/, "u"],
      [/[žź]/, "z"],
      [/[ýÿỹ]/, "y"],
      [/[đ]/, "d"],
      [/æ/, "ae"],
      [/[čćç]/, "c"],
      [/[ñńň]/, "n"],
      [/œ/, "oe"],
      [/[śšş]/, "s"],
      [/ř/, "r"],
      [/ğ/, "g"],
      [/Ř/, "R"]
    ].freeze

    # Transliterates +string+ and returns it upcased.
    def self.normalize(string)
      utf8_to_ascii(string).upcase
    end

    # Normalizes a single word and strips every character that is not
    # A-Z, 0-9 or a hyphen.
    def self.normalize_word(word)
      cleaned = normalize(word).gsub(/[^A-Z0-9\-]/, '')
      cleaned.strip
    end

    # Normalizes an author string: everything that is not A-Z becomes a
    # space, and runs of whitespace collapse to a single space.
    def self.normalize_author(string)
      letters_only = normalize(string).gsub(/[^A-Z]/, ' ')
      letters_only.gsub(/[\s]{2,}/, ' ').strip
    end

    protected

    # Returns a copy of +string+ with the characters listed in
    # ASCII_REPLACEMENTS replaced, one pattern after the other.
    def self.utf8_to_ascii(string)
      ASCII_REPLACEMENTS.inject(string) do |text, (pattern, ascii)|
        text.gsub(pattern, ascii)
      end
    end

  end

end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Taxamatch

  # Produces the "near match" phonetic key of a name element: the word is
  # normalized, a few leading letter combinations are simplified, and
  # similar sounding letters in the rest of the word are merged.
  module Phonetizer

    # Alias for near_match.
    def self.phonetize(a_word, normalize_ending = false)
      self.near_match(a_word, normalize_ending)
    end

    # Returns the phonetic key of +a_word+, or '' for nil/blank input.
    #
    # a_word           - the word to phonetize.
    # normalize_ending - when true, and the key is longer than 4 characters,
    #                    the variant endings are unified (see
    #                    normalize_ending).
    def self.near_match(a_word, normalize_ending = false)
      # Best effort on purpose: anything that cannot be stripped (e.g. nil)
      # is treated like an empty word.
      a_word = a_word.strip rescue ''
      return '' if a_word == ''
      a_word = Taxamatch::Normalizer.normalize a_word
      # Simplify the start of the word.
      case a_word
      when /^AE/
        a_word = 'E' + a_word[2..-1]
      when /^CN/
        a_word = 'N' + a_word[2..-1]
      when /^CT/
        a_word = 'T' + a_word[2..-1]
      when /^CZ/
        a_word = 'C' + a_word[2..-1]
      when /^DJ/
        a_word = 'J' + a_word[2..-1]
      when /^EA/
        a_word = 'E' + a_word[2..-1]
      when /^EU/
        a_word = 'U' + a_word[2..-1]
      when /^GN/
        a_word = 'N' + a_word[2..-1]
      when /^KN/
        a_word = 'N' + a_word[2..-1]
      when /^MC/
        a_word = 'MAC' + a_word[2..-1]
      when /^MN/
        a_word = 'N' + a_word[2..-1]
      when /^OE/
        a_word = 'E' + a_word[2..-1]
      when /^QU/
        a_word = 'Q' + a_word[2..-1]
      when /^PS/
        a_word = 'S' + a_word[2..-1]
      when /^PT/
        a_word = 'T' + a_word[2..-1]
      when /^TS/
        a_word = 'S' + a_word[2..-1]
      when /^WR/
        a_word = 'R' + a_word[2..-1]
      when /^X/
        a_word = 'Z' + a_word[1..-1]
      end
      # The first character is kept as is; only the remainder is merged.
      chars = a_word.split('')
      first_char = chars[0]
      rest_chars = chars[1..-1].join('')
      rest_chars.gsub!('AE', 'I')
      rest_chars.gsub!('IA', 'A')
      rest_chars.gsub!('OE', 'I')
      rest_chars.gsub!('OI', 'A')
      rest_chars.gsub!('SC', 'S')
      rest_chars.gsub!('H', '')
      rest_chars.tr!('EOUYKZ', 'IAIICS')
      a_word = (first_char + rest_chars).squeeze

      if normalize_ending && a_word.size > 4
        a_word = self.normalize_ending(a_word)
      end
      a_word
    end

    # Returns a copy of +a_word+ with a variant ending unified; the argument
    # itself is left untouched, so frozen strings are accepted.
    def self.normalize_ending(a_word)
      # -- deal with variant endings -is (includes -us, -ys, -es), -im (was -um), -as (-os)
      # -- at the end of a string translate all to -a
      a_word.gsub(/IS$/, 'A').gsub(/IM$/, 'A').gsub(/AS$/, 'A')
    end

  end

end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
######################
|
2
|
+
# Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
|
3
|
+
#
|
4
|
+
# * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
|
5
|
+
# * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
|
6
|
+
#
|
7
|
+
# Fields:
|
8
|
+
# String1|String2|maximum distance|transposition block size|expected distance
|
9
|
+
# - String1, String2
|
10
|
+
# compared strings
|
11
|
+
# - maximum distance
|
12
|
+
# stops execution of the algorithm when calculated distance exceeds the maximum distance number
|
13
|
+
# - transposition block size
|
14
|
+
# determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
|
15
|
+
# - expected distance
|
16
|
+
# resulting distance that has to be achieved by the algorithm
|
17
|
+
# Note: algorithm does not try to normalize or interpret strings in any way.
|
18
|
+
######################
|
19
|
+
|
20
|
+
#it should recognize the exact match
|
21
|
+
Pomatomus|Pomatomus|10|1|0
|
22
|
+
|
23
|
+
#it should not try to normalize incoming strings
|
24
|
+
Pomatomus|Pomatomus|10|1|1
|
25
|
+
Pomatomus|pomatomus|10|1|1
|
26
|
+
|
27
|
+
#it should calculate special cases
|
28
|
+
Pomatomus||10|1|9
|
29
|
+
|Pomatomus|10|1|9
|
30
|
+
P|p|10|1|1
|
31
|
+
#TODO: one letter vs longer string generates a big negative number
|
32
|
+
#L|Linneaus|10|1|7
|
33
|
+
|
34
|
+
|
35
|
+
#it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
|
36
|
+
Pomatomus|Pomatomux|10|1|1
|
37
|
+
Pmatomus|Pomatomus|10|1|1
|
38
|
+
Pomatomus|Pmatomus|10|1|1
|
39
|
+
Rpmatomus|Pomatomus|10|1|2
|
40
|
+
Pommtomus|Pomatomus|10|1|1
|
41
|
+
Potamomus|Pomatomus|10|1|2
|
42
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
|
43
|
+
Pomatomus|oPmatomus|10|1|1
|
44
|
+
Pomatomus|Pomatomsu|10|1|1
|
45
|
+
Pomtaomus|Pomatomus|10|1|1
|
46
|
+
Pomatoums|Pomatomus|10|1|1
|
47
|
+
Potamomus|Pomatomus|10|1|2
|
48
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
|
49
|
+
|
50
|
+
#it should calculate Modified Damerau Levenshtein distance with transposition of 2 or more characters (block size >= 2)
|
51
|
+
serrulatus|serratulus|10|2|2
|
52
|
+
Pomatomus|Poomumats|10|3|3
|
53
|
+
vesiculosus|vecusilosus|10|1|4
|
54
|
+
vesiculosus|vecusilosus|10|2|2
|
55
|
+
trimerophyton|mertriophyton|10|1|6
|
56
|
+
trimerophyton|mertriophyton|10|3|3
|
57
|
+
|
58
|
+
#it should stop trying if distance exceeds maximum allowed distance
|
59
|
+
Pxxxxomus|Pomatomus|10|1|4
|
60
|
+
Pxxxxomus|Pomatomus|2|1|3
|
61
|
+
|
62
|
+
#
|
63
|
+
PUNCTATA|PUNCTATA|10|1|0
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
begin
|
2
|
+
require 'spec'
|
3
|
+
rescue LoadError
|
4
|
+
require 'rubygems' unless ENV['NO_RUBYGEMS']
|
5
|
+
gem 'rspec'
|
6
|
+
require 'spec'
|
7
|
+
end
|
8
|
+
|
9
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
10
|
+
require 'taxamatch_rb'
|
11
|
+
|
12
|
+
# Reads a '|'-separated test data file line by line.
#
# file       - path of the data file.
# fields_num - number of fields a line must have to count as a test case.
#
# Yields the array of fields for every non-comment line with exactly
# +fields_num+ fields (a trailing '# ...' remark is cut off the last field
# and the field is stripped), and nil for every other line. The file is
# closed when the method returns, also when the block raises.
def read_test_file(file, fields_num)
  File.open(file) do |f|
    f.each do |line|
      fields = line.split("|")
      if line.match(/^\s*#/) == nil && fields.size == fields_num
        # to_s: a last field consisting only of '#' splits into nothing
        fields[-1] = fields[-1].split('#')[0].to_s.strip
        yield(fields)
      else
        yield(nil)
      end
    end
  end
end
|
24
|
+
|
25
|
+
# Builds the hash shape used by the matchers (:epitheton, :normalized,
# :phonetized) from a plain string; the phonetic key is derived from the
# normalized form.
def make_taxamatch_hash(string)
  upcased = Taxamatch::Normalizer.normalize(string)
  phonetic_key = Taxamatch::Phonetizer.near_match(upcased)
  {:epitheton => string, :normalized => upcased, :phonetized => phonetic_key}
end
|
@@ -0,0 +1,254 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.dirname(__FILE__) + '/spec_helper.rb'
|
3
|
+
|
4
|
+
# Runs every case of damerau_levenshtein_mod_test.txt
# (String1|String2|maximum distance|block size|expected distance).
describe 'DamerauLevenshteinMod' do
  it 'should get tests' do
    data_file = File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt'
    read_test_file(data_file, 5) do |y|
      dl = Taxamatch::DamerauLevenshteinMod.new
      next unless y
      expected = y[4].to_i
      # distance takes the block size before the maximum distance
      res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
      puts y if res != expected
      res.should == expected
    end
  end
end
|
16
|
+
|
17
|
+
# Checks the hashes Atomizer#parse produces for uni-, bi- and trinomials.
describe 'Atomizer' do
  before(:all) do
    @parser = Taxamatch::Atomizer.new
  end

  it 'should parse uninomials' do
    betula = {
      :all_authors=>[], :all_years=>[],
      :uninomial=>{:epitheton=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA", :authors=>[], :years=>[], :normalized_authors=>[]}
    }
    aerenea = {
      :all_authors=>["LACORDAIRE"], :all_years=>["1872"],
      :uninomial=>{:epitheton=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA", :authors=>["Lacordaire"], :years=>["1872"], :normalized_authors=>["LACORDAIRE"]}
    }
    @parser.parse('Betula').should == betula
    @parser.parse('Ærenea Lacordaire, 1872').should == aerenea
  end

  it 'should parse binomials' do
    expected = {
      :all_authors=>["DOW"], :all_years=>["1913"],
      :genus=>{:epitheton=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA", :authors=>[], :years=>[], :normalized_authors=>[]},
      :species=>{:epitheton=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA", :authors=>["Dow"], :years=>["1913"], :normalized_authors=>["DOW"]}
    }
    @parser.parse('Leœptura laetifica Dow, 1913').should == expected
  end

  it 'should parse trinomials' do
    expected = {
      :all_authors=>["BANKER", "D HALL", "D E STUNTZ"], :all_years=>["1972"],
      :genus=>{:epitheton=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM", :authors=>[], :years=>[], :normalized_authors=>[]},
      :species=>{:epitheton=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA", :authors=>[], :years=>[], :normalized_authors=>[]},
      :infraspecies=>[{:epitheton=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA", :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>["1972"], :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}]
    }
    @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == expected
  end
end
|
35
|
+
|
36
|
+
|
37
|
+
# Checks transliteration and upcasing of strings and single words.
describe 'Taxamatch::Normalizer' do
  it 'should normalize strings' do
    [
      ['abcd', 'ABCD'],
      ['Leœptura', 'LEOEPTURA'],
      ['Ærenea', 'AERENEA'],
      ['Fallén', 'FALLEN'],
      ['Choriozopella trägårdhi', 'CHORIOZOPELLA TRAGARDHI']
    ].each do |input, expected|
      Taxamatch::Normalizer.normalize(input).should == expected
    end
  end

  it 'should normalize words' do
    normalized = Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$')
    normalized.should == 'L-3EOEPTURA'
  end
end
|
50
|
+
|
51
|
+
describe 'Taxamatch::Base' do
|
52
|
+
before(:all) do
|
53
|
+
@tm = Taxamatch::Base.new
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should get txt tests' do
|
57
|
+
dl = Taxamatch::DamerauLevenshteinMod.new
|
58
|
+
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
|
59
|
+
if y
|
60
|
+
y[2] = y[2] == 'true' ? true : false
|
61
|
+
res = @tm.taxamatch(y[0], y[1], false)
|
62
|
+
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
63
|
+
res['match'].should == y[2]
|
64
|
+
res['edit_distance'].should == y[3].to_i
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'should work with names that cannot be parsed' do
|
70
|
+
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
|
71
|
+
res = false
|
72
|
+
end
|
73
|
+
|
74
|
+
it 'should compare genera' do
|
75
|
+
#edit distance 1 always match
|
76
|
+
g1 = make_taxamatch_hash 'Plantago'
|
77
|
+
g2 = make_taxamatch_hash 'Plantagon'
|
78
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
79
|
+
#edit_distance above threshold does not match
|
80
|
+
g1 = make_taxamatch_hash 'Plantago'
|
81
|
+
g2 = make_taxamatch_hash 'This shouldnt match'
|
82
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
83
|
+
#phonetic_match matches
|
84
|
+
g1 = make_taxamatch_hash 'Plantagi'
|
85
|
+
g2 = make_taxamatch_hash 'Plantagy'
|
86
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
|
87
|
+
#distance 1 in first letter also matches
|
88
|
+
g1 = make_taxamatch_hash 'Xantheri'
|
89
|
+
g2 = make_taxamatch_hash 'Pantheri'
|
90
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
91
|
+
#phonetic match trumps everything
|
92
|
+
g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
|
93
|
+
g2 = make_taxamatch_hash 'Zanthery'
|
94
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
|
95
|
+
#same first letter and distance 2 should match
|
96
|
+
g1 = make_taxamatch_hash 'Xantherii'
|
97
|
+
g2 = make_taxamatch_hash 'Xantherrr'
|
98
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
|
99
|
+
#First letter is the same and distance is 3 should match, no phonetic match
|
100
|
+
g1 = make_taxamatch_hash 'Xantheriii'
|
101
|
+
g2 = make_taxamatch_hash 'Xantherrrr'
|
102
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
103
|
+
#Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
|
104
|
+
g1 = make_taxamatch_hash 'Xant'
|
105
|
+
g2 = make_taxamatch_hash 'Xanthe'
|
106
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
|
107
|
+
#Should not match if edit distance > 3 and no phonetic match
|
108
|
+
g1 = make_taxamatch_hash 'Xantheriiii'
|
109
|
+
g2 = make_taxamatch_hash 'Xantherrrrr'
|
110
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
111
|
+
end
|
112
|
+
|
113
|
+
it 'should compare species' do
|
114
|
+
#Exact match
|
115
|
+
s1 = make_taxamatch_hash 'major'
|
116
|
+
s2 = make_taxamatch_hash 'major'
|
117
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
|
118
|
+
#Phonetic match always works
|
119
|
+
s1 = make_taxamatch_hash 'xanteriiiiiiii'
|
120
|
+
s2 = make_taxamatch_hash 'zantereeeeeeee'
|
121
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
|
122
|
+
#Phonetic match works with different endings
|
123
|
+
s1 = make_taxamatch_hash 'majorum'
|
124
|
+
s2 = make_taxamatch_hash 'majoris'
|
125
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
|
126
|
+
#Distance 4 matches if first 3 chars are the same
|
127
|
+
s1 = make_taxamatch_hash 'majorrrrr'
|
128
|
+
s2 = make_taxamatch_hash 'majoraaaa'
|
129
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
|
130
|
+
#Should not match if distance is 4 and first 3 chars are not the same
|
131
|
+
s1 = make_taxamatch_hash 'majorrrrr'
|
132
|
+
s2 = make_taxamatch_hash 'marorraaa'
|
133
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
134
|
+
#Distance 2 or 3 matches if first 1 char is the same
|
135
|
+
s1 = make_taxamatch_hash 'morrrr'
|
136
|
+
s2 = make_taxamatch_hash 'moraaa'
|
137
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
138
|
+
#Should not match if Distance 2 or 3 and first 1 char is not the same
|
139
|
+
s1 = make_taxamatch_hash 'morrrr'
|
140
|
+
s2 = make_taxamatch_hash 'torraa'
|
141
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
142
|
+
#Distance 1 will match anywhere
|
143
|
+
s1 = make_taxamatch_hash 'major'
|
144
|
+
s2 = make_taxamatch_hash 'rajor'
|
145
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
|
146
|
+
#Will not match if distance is 3 and length is less than twice the edit distance
|
147
|
+
s1 = make_taxamatch_hash 'marrr'
|
148
|
+
s2 = make_taxamatch_hash 'maaaa'
|
149
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
150
|
+
end
|
151
|
+
|
152
|
+
it 'should match mathes' do
|
153
|
+
#No trouble case
|
154
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
155
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
156
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
|
157
|
+
#Will not match if either genus or sp. epithet doesn't match
|
158
|
+
gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
159
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
160
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
161
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
162
|
+
smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
163
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
164
|
+
#Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
|
165
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
|
166
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
167
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
|
168
|
+
#Should not have phonetic match if one of the components does not match phonetically
|
169
|
+
gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
|
170
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
171
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
|
172
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
173
|
+
smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
|
174
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
|
175
|
+
#edit distance should be equal to the sum of the edit distances
|
176
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
177
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
178
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
|
179
|
+
end
|
180
|
+
|
181
|
+
describe 'Taxamatch::Authmatch' do
|
182
|
+
before(:all) do
|
183
|
+
@am = Taxamatch::Authmatch
|
184
|
+
end
|
185
|
+
|
186
|
+
it 'should calculate score' do
|
187
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
|
188
|
+
res.should == 90
|
189
|
+
res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
|
190
|
+
res.should == 0
|
191
|
+
#found all authors, same year
|
192
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
|
193
|
+
res.should == 100
|
194
|
+
#all authors, 1 year diff
|
195
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
|
196
|
+
res.should == 54
|
197
|
+
#year is not counted in
|
198
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
|
199
|
+
res.should == 94
|
200
|
+
#found all authors on one side, same year
|
201
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
|
202
|
+
res.should == 91
|
203
|
+
#found all authors on one side, 1 year diff
|
204
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
|
205
|
+
res.should == 51
|
206
|
+
#found all authors on one side, year does not count
|
207
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
|
208
|
+
res.should == 90
|
209
|
+
#found some authors
|
210
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
|
211
|
+
res.should == 67
|
212
|
+
#if year does not match or not present no match for previous case
|
213
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
|
214
|
+
res.should == 0
|
215
|
+
end
|
216
|
+
|
217
|
+
it 'should compare years' do
|
218
|
+
@am.compare_years([1882],[1880]).should == 2
|
219
|
+
@am.compare_years([1882],[]).should == nil
|
220
|
+
@am.compare_years([],[]).should == 0
|
221
|
+
@am.compare_years([1788,1798], [1788,1798]).should be_nil
|
222
|
+
end
|
223
|
+
|
224
|
+
it 'should remove duplicate authors' do
|
225
|
+
#Lin submatches Linnaeus and its size 3 is big enough to remove Linnaeus
|
226
|
+
#Muller is identical
|
227
|
+
res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
|
228
|
+
res.should == [[], []]
|
229
|
+
#same in different order
|
230
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
|
231
|
+
res.should == [[], []]
|
232
|
+
#auth Li submatches Linnaeus, but Li size is less than the 3 required to remove Linnaeus
|
233
|
+
res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
|
234
|
+
res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
|
235
|
+
#fuzzy match
|
236
|
+
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
|
237
|
+
res.should == [["Dem"], ["Stepanov"]]
|
238
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
|
239
|
+
res.should == [['Linnaeus', 'Muller'], ['Kenn']]
|
240
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
|
241
|
+
res.should == [[],['Kurtz']]
|
242
|
+
end
|
243
|
+
|
244
|
+
it 'should fuzzy match authors' do
|
245
|
+
#TODO: fix the bug revealed by this test
|
246
|
+
# res = @am.fuzzy_match_authors('L', 'Muller')
|
247
|
+
# res.should be_false
|
248
|
+
end
|
249
|
+
|
250
|
+
end
|
251
|
+
|
252
|
+
end
|
253
|
+
|
254
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
###
|
2
|
+
#
|
3
|
+
# Tests for string comparison by taxamatch algorithm
|
4
|
+
# name1|name2|match|edit_distance
|
5
|
+
#
|
6
|
+
##
|
7
|
+
# Comparing uninomials
|
8
|
+
Pomatomus|Pomatomas|true|1
|
9
|
+
Pomatomus L.|Pomatomas Linn.|true|1
|
10
|
+
Pomatomus Ber|Pomatomas Linn|false|1
|
11
|
+
Pomatomus L. 1753|Pomatomus Linn. 1800|false|0
|
12
|
+
|
13
|
+
## additional authorship should match
|
14
|
+
Puma concolor|Puma concolor L.|true|0
|
15
|
+
#
|
16
|
+
## one-letter misspelling in species epithet should match
|
17
|
+
Puma concolor|Puma cancolor|true|1
|
18
|
+
#
|
19
|
+
Pomatomus saltatrix|Pomatomus saltratix|true|2
|
20
|
+
Pomatomus saltator|Pomatomus saltatrix|true|3
|
21
|
+
#
|
22
|
+
Loligo pealeii|Loligo plei|false|3
|
23
|
+
#
|
24
|
+
## different authors should not match
|
25
|
+
Puma concolor Linnaeus|Puma concolor Kurtz|false|0
|
26
|
+
#
|
27
|
+
##real life examples
|
28
|
+
Biatora borealis|Bactra borealis Diakonoff 1964|false|3
|
29
|
+
#
|
30
|
+
Homo sapien|Homo sapiens|true|1
|
31
|
+
Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true|1
|
32
|
+
Homo sapiens Mozzherin|Homo sapiens Linneaus|false|0
|
33
|
+
#
|
34
|
+
Quinqueloculina punctata|Quinqueloculina punctata d'Orbigny 1905|true|0
|
35
|
+
Pomatomus saltator (Linnaeus, 1766)|Pomatomus saltatrix (Linnaeus, 1766)|true|0|3
|
36
|
+
#
|
37
|
+
#Trinomial names
|
38
|
+
Homo sapiens stupidus|Homo spiens stupidus|true|1
|
39
|
+
Pomatomus saltator saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|true|1
|
40
|
+
Pomatomus saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|false|5
|
41
|
+
Pomatomus saltator saltator saltatorische|Pomatomus saltator soltator|true|1
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: taxamatch_rb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 6
|
8
|
+
- 0
|
9
|
+
version: 0.6.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Dmitry Mozzherin
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-03-19 00:00:00 -04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: RubyInline
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :runtime
|
31
|
+
version_requirements: *id001
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: biodiversity
|
34
|
+
prerelease: false
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
segments:
|
40
|
+
- 0
|
41
|
+
- 5
|
42
|
+
- 13
|
43
|
+
version: 0.5.13
|
44
|
+
type: :runtime
|
45
|
+
version_requirements: *id002
|
46
|
+
description: This gem implements algorithms for fuzzy matching scientific names developed by Tony Rees
|
47
|
+
email: dmozzherin@eol.org
|
48
|
+
executables: []
|
49
|
+
|
50
|
+
extensions: []
|
51
|
+
|
52
|
+
extra_rdoc_files:
|
53
|
+
- LICENSE
|
54
|
+
- README.rdoc
|
55
|
+
files:
|
56
|
+
- README.rdoc
|
57
|
+
- lib/taxamatch_rb.rb
|
58
|
+
- lib/taxamatch_rb/atomizer.rb
|
59
|
+
- lib/taxamatch_rb/authmatch.rb
|
60
|
+
- lib/taxamatch_rb/damerau_levenshtein_mod.rb
|
61
|
+
- lib/taxamatch_rb/normalizer.rb
|
62
|
+
- lib/taxamatch_rb/phonetizer.rb
|
63
|
+
- spec/damerau_levenshtein_mod_test.txt
|
64
|
+
- spec/spec.opts
|
65
|
+
- spec/spec_helper.rb
|
66
|
+
- spec/taxamatch_rb_spec.rb
|
67
|
+
- spec/taxamatch_test.txt
|
68
|
+
- LICENSE
|
69
|
+
has_rdoc: true
|
70
|
+
homepage: http://github.com/dimus/taxamatch_rb
|
71
|
+
licenses: []
|
72
|
+
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options:
|
75
|
+
- --charset=UTF-8
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
segments:
|
83
|
+
- 0
|
84
|
+
version: "0"
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
segments:
|
90
|
+
- 0
|
91
|
+
version: "0"
|
92
|
+
requirements: []
|
93
|
+
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.3.6
|
96
|
+
signing_key:
|
97
|
+
specification_version: 3
|
98
|
+
summary: Implementation of Tony Rees Taxamatch algorithms
|
99
|
+
test_files:
|
100
|
+
- spec/spec_helper.rb
|
101
|
+
- spec/taxamatch_rb_spec.rb
|