taxamatch_rb 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.rdoc +61 -0
- data/lib/taxamatch_rb.rb +117 -0
- data/lib/taxamatch_rb/atomizer.rb +82 -0
- data/lib/taxamatch_rb/authmatch.rb +89 -0
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +139 -0
- data/lib/taxamatch_rb/normalizer.rb +55 -0
- data/lib/taxamatch_rb/phonetizer.rb +79 -0
- data/spec/damerau_levenshtein_mod_test.txt +63 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/taxamatch_rb_spec.rb +254 -0
- data/spec/taxamatch_test.txt +45 -0
- metadata +101 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Dmitry Mozzherin
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
= taxamatch_rb
|
2
|
+
|
3
|
+
Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony Rees: http://www.cmar.csiro.au/datacentre/taxamatch.htm
|
4
|
+
|
5
|
+
The purpose of the Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out if they actually point to the same scientific name.
|
6
|
+
|
7
|
+
require 'taxamatch_rb'
|
8
|
+
tm = Taxamatch::Base.new
|
9
|
+
tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
|
10
|
+
tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
|
11
|
+
tm.taxamatch('Homo sapiens Mozzherin', 'Homo sapiens Linnaeus') #returns false
|
12
|
+
|
13
|
+
Taxamatch_Rb is compatible with ruby versions 1.8.7 and 1.9.1 and higher
|
14
|
+
|
15
|
+
== Installation
|
16
|
+
|
17
|
+
sudo gem install dimus-taxamatch_rb --source http://gems.github.com
|
18
|
+
|
19
|
+
or
|
20
|
+
sudo gem sources -a http://gems.github.com #(you only have to do this once)
|
21
|
+
sudo gem install dimus-taxamatch_rb
|
22
|
+
|
23
|
+
== Usage
|
24
|
+
|
25
|
+
require 'rubygems' #not needed for ruby > 1.9.1
|
26
|
+
require 'taxamatch_rb'
|
27
|
+
|
28
|
+
tm = Taxamatch::Base.new
|
29
|
+
|
30
|
+
* compare full scientific names
|
31
|
+
|
32
|
+
tm.taxamatch('Hommo sapiens L.', 'Homo sapiens Linnaeus')
|
33
|
+
|
34
|
+
* preparse names for the matching (necessary for large databases of scientific names)
|
35
|
+
|
36
|
+
p = Taxamatch::Atomizer.new
|
37
|
+
parsed_name1 = p.parse('Monacanthus fronticinctus Günther 1867 sec. Eschmeyer 2004')
|
38
|
+
parsed_name2 = p.parse('Monacanthus fronticinctus (Gunther, 1867)')
|
39
|
+
|
40
|
+
* compare preparsed names
|
41
|
+
|
42
|
+
tm.taxamatch_preparsed(parsed_name1, parsed_name2)
|
43
|
+
|
44
|
+
* compare genera
|
45
|
+
|
46
|
+
tm.match_genera('Monacanthus', 'MONOCANTUS')
|
47
|
+
|
48
|
+
* compare species
|
49
|
+
|
50
|
+
tm.match_species('fronticinctus', 'frontecinctus')
|
51
|
+
|
52
|
+
* compare authors and years
|
53
|
+
|
54
|
+
Taxamatch::Authmatch.authmatch(['Linnaeus'], ['L','Muller'], [1786], [1787])
|
55
|
+
|
56
|
+
|
57
|
+
You can find more examples in the spec section of the code.
|
58
|
+
|
59
|
+
== Copyright
|
60
|
+
|
61
|
+
Copyright (c) 2009 Dmitry Mozzherin. See LICENSE for details.
|
data/lib/taxamatch_rb.rb
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# Make the library directory loadable when the gem is used straight from a
# checkout rather than through RubyGems.
lib_dir = File.dirname(__FILE__)
unless $:.include?(lib_dir) || $:.include?(File.expand_path(lib_dir))
  $:.unshift(lib_dir)
end

require 'taxamatch_rb/damerau_levenshtein_mod'
require 'taxamatch_rb/atomizer'
require 'taxamatch_rb/normalizer'
require 'taxamatch_rb/phonetizer'
require 'taxamatch_rb/authmatch'

# The debugger is a development aid only: load it when it is installed, but do
# not make the whole library unusable when it is missing.
begin
  require 'ruby-debug'
rescue LoadError
  nil
end

# $KCODE only matters before Ruby 1.9. Compare major AND minor version numbers;
# looking at the minor number alone would treat Ruby 2.x and 3.x as "older than 1.9".
$KCODE = 'u' if (RUBY_VERSION.split('.').map { |part| part.to_i }.first(2) <=> [1, 9]) < 0
|
13
|
+
|
14
|
+
module Taxamatch

  # Fuzzy comparison of two scientific names.
  #
  # Names are parsed by Taxamatch::Atomizer, compared part by part (uninomial or
  # genus/species/infraspecies) with Taxamatch::DamerauLevenshteinMod and the
  # phonetized forms, and finally checked against the authorship with
  # Taxamatch::Authmatch.
  #
  # Match results are hashes with the string keys 'match', 'edit_distance' and
  # 'phonetic_match'; multinomial results additionally carry 'score'.
  class Base

    def initialize
      @parser = Taxamatch::Atomizer.new
      @dlm = Taxamatch::DamerauLevenshteinMod.new
    end

    # Takes two scientific name strings and tells whether they match.
    #
    # str1, str2     - name strings handed to the parser.
    # return_boolean - when true (default) only the value of 'match' is
    #                  returned, otherwise the whole result hash.
    #
    # Returns nil when the names could not be compared (e.g. one of them could
    # not be parsed, or comparing them raised an error).
    def taxamatch(str1, str2, return_boolean = true)
      preparsed_1 = @parser.parse(str1)
      preparsed_2 = @parser.parse(str2)
      match = begin
        taxamatch_preparsed(preparsed_1, preparsed_2)
      rescue StandardError
        # Best effort, as before: a name pair that cannot be compared is "no result".
        nil
      end
      (return_boolean && match) ? match['match'] : match
    end

    # Takes two hashes of parsed scientific names (as produced by
    # Taxamatch::Atomizer#parse) and compares them. Useful when the names are
    # preparsed once and compared many times.
    #
    # Returns the result hash, or nil when either name is nil or when the two
    # names are not both uninomials or both multinomials.
    def taxamatch_preparsed(preparsed_1, preparsed_2)
      return nil unless preparsed_1 && preparsed_2
      result = nil
      result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
      result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
      if result && result['match']
        # An authorship score of 0 vetoes an otherwise matching name.
        result['match'] = match_authors(preparsed_1, preparsed_2) != 0
      end
      result
    end

    # Compares the :uninomial parts of two parsed names with the genus rules.
    def match_uninomial(preparsed_1, preparsed_2)
      match_genera(preparsed_1[:uninomial], preparsed_2[:uninomial])
    end

    # Compares genus, species and (when present) the first infraspecies of two
    # parsed names.
    #
    # Returns the combined result hash including 'score', computed as
    # 1 - edit_distance / (half of the summed epitheton lengths). The score is
    # a Float and is not clamped, so it can drop below zero for very
    # different names.
    def match_multinomial(preparsed_1, preparsed_2)
      gen_match = match_genera(preparsed_1[:genus], preparsed_2[:genus])
      sp_match = match_species(preparsed_1[:species], preparsed_2[:species])
      total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size +
                     preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
      infrasp_1 = preparsed_1[:infraspecies]
      infrasp_2 = preparsed_2[:infraspecies]
      if infrasp_1 && infrasp_2
        infrasp_match = match_species(infrasp_1[0], infrasp_2[0])
        total_length += infrasp_1[0][:epitheton].size + infrasp_2[0][:epitheton].size
        match_hash = match_matches(gen_match, sp_match, infrasp_match)
      elsif infrasp_1 || infrasp_2
        # Only one of the names has an infraspecies: never a match.
        match_hash = { 'match' => false, 'edit_distance' => 5, 'phonetic_match' => false }
        total_length += (infrasp_1 || infrasp_2)[0][:epitheton].size
      else
        match_hash = match_matches(gen_match, sp_match)
      end
      # Hash#merge returns a new hash; it has to be the return value for the
      # score to reach the caller.
      match_hash.merge('score' => 1 - match_hash['edit_distance'].to_f / (total_length / 2.0))
    end

    # Compares two genus (or uninomial) hashes with :normalized and :phonetized keys.
    def match_genera(genus1, genus2)
      name1 = genus1[:normalized]
      name2 = genus2[:normalized]
      min_length = [name1.size, name2.size].min
      ed = @dlm.distance(name1, name2, 1, 3) #TODO put block = 2
      # NOTE(review): this is integer division, so the 0.2 threshold only fires
      # when the distance reaches the shorter length. It is left untouched
      # because a true ratio would reject pairs the README documents as
      # matching ('Hommo' vs 'Homo': distance 1, length 4).
      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length > 0.2
      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]

      # The first-letter rule has to look at the normalized strings; indexing
      # the hashes themselves with 0 yields nil on both sides and always passes.
      match = ed <= 3 && (min_length > ed * 2) && (ed < 2 || name1[0] == name2[0])
      {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
    end

    # Compares two species (or infraspecies) hashes with :normalized and
    # :phonetized keys. The hashes passed in are not modified.
    def match_species(sp1, sp2)
      name1 = sp1[:normalized]
      name2 = sp2[:normalized]
      min_length = [name1.size, name2.size].min
      # Work on copies so that cached, preparsed names are not rewritten by a comparison.
      phonetized1 = Taxamatch::Phonetizer.normalize_ending(sp1[:phonetized].dup)
      phonetized2 = Taxamatch::Phonetizer.normalize_ending(sp2[:phonetized].dup)
      ed = @dlm.distance(name1, name2, 1, 4) #TODO put block 4
      # NOTE(review): integer division, see match_genera.
      return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length > 0.3334
      return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if phonetized1 == phonetized2

      match = ed <= 4 && (min_length >= ed * 2) &&
              (ed < 2 || name1[0] == name2[0]) &&
              (ed < 4 || name1[0...3] == name2[0...3])
      { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
    end

    # Scores the authorship of two parsed names; 0 means the authors disagree.
    def match_authors(preparsed_1, preparsed_2)
      Taxamatch::Authmatch.authmatch(preparsed_1[:all_authors], preparsed_2[:all_authors],
                                     preparsed_1[:all_years], preparsed_2[:all_years])
    end

    # Combines the genus, species and optional infraspecies results: edit
    # distances add up, 'match' and 'phonetic_match' must hold for every part,
    # and the summed distance may not exceed 4 (6 with an infraspecies).
    # Returns a new hash; the hashes passed in are left untouched.
    def match_matches(genus_match, species_match, infraspecies_match = nil)
      match = species_match.dup
      if infraspecies_match
        match['edit_distance'] += infraspecies_match['edit_distance']
        match['match'] &&= infraspecies_match['match']
        match['phonetic_match'] &&= infraspecies_match['phonetic_match']
      end
      match['edit_distance'] += genus_match['edit_distance']
      match['match'] = false if match['edit_distance'] > (infraspecies_match ? 6 : 4)
      match['match'] &&= genus_match['match']
      match['phonetic_match'] &&= genus_match['phonetic_match']
      match
    end

  end

end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'biodiversity'
|
3
|
+
|
4
|
+
module Taxamatch

  # Parses a scientific name string and reshapes the parser output into the
  # hash layout used by the matching code: one entry per name part
  # (:uninomial, :genus, :species, :infraspecies) plus the collected
  # :all_authors and :all_years.
  class Atomizer
    # The :scientificName hash the parser returned for the latest #parse call.
    attr_reader :parsed_raw

    def initialize
      @parser = ScientificNameParser.new
      @parsed_raw = nil
      @res = {}
    end

    # Parses +name+ and returns the reshaped hash, or nil when the parser
    # reports the name as not parsed or no name part was found.
    def parse(name)
      @res = {:all_authors => [], :all_years => []}
      @parsed_raw = @parser.parse(name)[:scientificName]
      organize_results
    end

    protected

    # Builds @res from the first :details element of the raw parser output.
    def organize_results
      return nil unless @parsed_raw[:parsed]
      details = @parsed_raw[:details][0]
      process_node(:uninomial, details[:uninomial])
      process_node(:genus, details[:genus])
      process_node(:species, details[:species], true)
      process_infraspecies(details[:infraspecies])
      @res[:all_authors] = @res[:all_authors].uniq.map { |author| Taxamatch::Normalizer.normalize(author) }
      @res[:all_years].uniq!
      # Only the two bookkeeping keys present means no name part was recognised.
      return nil unless @res.keys.size > 2
      @res
    end

    # Epitheton with its normalized and phonetized forms.
    def describe_epitheton(epitheton, is_species)
      entry = {}
      entry[:epitheton] = epitheton
      entry[:normalized] = Taxamatch::Normalizer.normalize(epitheton)
      entry[:phonetized] = Taxamatch::Phonetizer.near_match(epitheton, is_species)
      entry
    end

    # Stores a single name part under @res[name]; does nothing for a missing node.
    def process_node(name, node, is_species = false)
      return unless node
      entry = describe_epitheton(node[:epitheton], is_species)
      @res[name] = entry
      get_authors_years(node, entry)
    end

    # Stores every infraspecies of the name, in order, under @res[:infraspecies].
    def process_infraspecies(node)
      return unless node
      @res[:infraspecies] = []
      node.each do |infr|
        entry = describe_epitheton(infr[:epitheton], true)
        get_authors_years(infr, entry)
        @res[:infraspecies] << entry
      end
    end

    # Collects authors and years of the basionym and combination author teams
    # (including their ex-author teams) into +res+ and adds them to the
    # name-wide :all_authors / :all_years lists.
    def get_authors_years(node, res)
      res[:authors] = []
      res[:years] = []
      [:basionymAuthorTeam, :combinationAuthorTeam].each do |team_key|
        team = node[team_key]
        next unless team
        [team, team[:exAuthorTeam]].each do |group|
          next unless group
          res[:authors] += group[:author]
          res[:years] << group[:year] if group[:year]
        end
      end
      res[:authors].uniq!
      res[:normalized_authors] = res[:authors].map { |author| Taxamatch::Normalizer.normalize_author(author) }
      res[:years].uniq!
      @res[:all_authors] += res[:normalized_authors] if res[:normalized_authors].size > 0
      @res[:all_years] += res[:years] if res[:years].size > 0
    end

  end
end
|
82
|
+
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
|
2
|
+
|
3
|
+
module Taxamatch
  # Scores how well the authorship (authors and years) of two names agrees.
  class Authmatch

    # Returns an Integer score: 0 when the authorships disagree, otherwise a
    # value above 50 (100 is the best possible agreement).
    def self.authmatch(authors1, authors2, years1, years2)
      leftover1, leftover2 = remove_duplicate_authors(authors1, authors2)
      get_score(authors1, leftover1, authors2, leftover2, compare_years(years1, years2))
    end

    # Turns the authors left unmatched on each side and the year difference
    # (nil when the years cannot be compared) into a score. Scores of 50 or
    # less are reported as 0.
    def self.get_score(authors1, unique_authors1, authors2, unique_authors2, year_diff)
      count_before = authors1.size + authors2.size
      count_after = unique_authors1.size + unique_authors2.size
      # Fixed scores as [years not comparable, same year, one year apart].
      fixed_scores =
        if count_after == 0
          [94, 100, 54]
        elsif unique_authors1.size == 0 || unique_authors2.size == 0
          [90, 91, 51]
        end
      score =
        if fixed_scores
          case year_diff
          when nil then fixed_scores[0]
          when 0 then fixed_scores[1]
          when 1 then fixed_scores[2]
          else 0
          end
        elsif year_diff.nil? || year_diff == 0
          # Unmatched authors on both sides: share of authors that did match.
          ((1 - count_after.to_f/count_before.to_f) * 100).round
        else
          0
        end
      score > 50 ? score : 0
    end

    # Returns copies of both author lists with every author removed that has a
    # counterpart in the other list (same name, abbreviation by prefix, or a
    # fuzzy match).
    def self.remove_duplicate_authors(authors1, authors2)
      unique_authors1 = authors1.dup
      unique_authors2 = authors2.dup
      authors1.each do |au1|
        authors2.each do |au2|
          identical = au1 == au2
          au1_match = identical || au1 == au2[0...au1.size]
          au2_match = identical || (!au1_match && au1[0...au2.size] == au2)
          drop1, drop2 =
            if (au1.size >= 3 && au1_match) || (au2.size >= 3 && au2_match) || (au1_match && au2_match)
              [true, true]
            elsif au1_match
              [true, false]
            elsif au2_match
              [false, true]
            else
              #TODO: masking a bug in damerau levenshtsin mod which appears comparing 1letter to a longer string
              fuzzy = au1.size > 1 && au2.size > 1 && self.fuzzy_match_authors(au1, au2)
              [fuzzy, fuzzy]
            end
          unique_authors1.delete au1 if drop1
          unique_authors2.delete au2 if drop2
        end
      end
      [unique_authors1, unique_authors2]
    end

    # True when two author strings are within edit distance 3, the shorter one
    # is more than twice as long as the distance, and (for distances of 2 or
    # more) both start with the same character.
    def self.fuzzy_match_authors(author1, author2)
      shortest = [author1.size, author2.size].min
      #get around a bug in C code, but it really has to be fixed
      ed = Taxamatch::DamerauLevenshteinMod.new.distance(author1, author2,2,3)
      (ed <= 3 && (shortest > ed * 2) && (ed < 2 || author1[0] == author2[0]))
    end

    # Returns 0 when neither side has years, the absolute difference when each
    # side has exactly one year, and nil otherwise.
    def self.compare_years(years1, years2)
      return 0 if [years1, years2].all? { |years| years == [] }
      if years1.size == 1 && years2.size == 1
        return (years1[0].to_i - years2[0].to_i).abs
      end
      nil
    end
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'rubygems'
|
3
|
+
require 'inline'
|
4
|
+
require 'time'
|
5
|
+
module Taxamatch

  # Modified Damerau-Levenshtein distance working on UTF-8 code points. Besides
  # insertions, deletions and substitutions it recognises transpositions of
  # blocks of up to +block_size+ characters.
  class DamerauLevenshteinMod
    # Returns the edit distance between two strings.
    #
    # str1, str2   - UTF-8 strings; they are compared code point by code point.
    # block_size   - largest block of characters considered for a transposition.
    # max_distance - once every cell of a row exceeds this value the
    #                calculation stops and the smallest value of that row is
    #                returned instead of the exact distance.
    def distance(str1, str2, block_size=2, max_distance=10)
      distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
    end

    inline do |builder|
      builder.c <<-'C_SOURCE'
      static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
        int i, i1, j, j1, k, sl, tl, max_block, cost, *d, distance, del, ins, subs, transp, block;
        int stop_execution = 0;
        int min = 0;
        int current_distance = 0;
        VALUE *sv;
        VALUE *tv;

        //reject anything that is not an array of code points instead of reading random memory
        Check_Type(_s, T_ARRAY);
        Check_Type(_t, T_ARRAY);

        sv = RARRAY_PTR(_s);
        tv = RARRAY_PTR(_t);

        sl = RARRAY_LEN(_s);
        tl = RARRAY_LEN(_t);

        if (sl == 0) return INT2NUM(tl);
        if (tl == 0) return INT2NUM(sl);
        if (sl == 1 && tl == 1 && sv[0] != tv[0]) return INT2NUM(1);

        int s[sl];
        int t[tl];

        for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
        for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);

        sl++;
        tl++;

        //largest usable transposition block: not more than half of the shorter string,
        //but never below 1, otherwise the substitution step is skipped for one-character
        //strings and the cell is filled with a stale value
        max_block = block_size;
        if (max_block > (sl - 1)/2) max_block = (sl - 1)/2;
        if (max_block > (tl - 1)/2) max_block = (tl - 1)/2;
        if (max_block < 1) max_block = 1;

        //one-dimensional representation of a 2 dimensional array len(s)+1 * len(t)+1
        //ALLOC_N raises NoMemoryError on failure instead of handing back NULL
        d = ALLOC_N(int, sl * tl);
        //populate the first column
        for(i = 0; i < tl; i++){
          d[i*sl] = i;
        }

        //fill up array with scores
        for(i = 1; i<sl; i++){
          d[i] = i;
          if (stop_execution == 1) break;
          current_distance = 10000;
          for(j = 1; j<tl; j++){

            cost = 1;
            if(s[i-1] == t[j-1]) cost = 0;

            block = max_block;

            while (block >= 1){
              int swap1 = 1;
              int swap2 = 1;
              i1 = i - (block * 2);
              j1 = j - (block * 2);
              if (i1 < 0 || j1 < 0 || i >= tl || j >= sl) {
                //the blocks to compare would start before the beginning or run past
                //the end of one of the strings: no transposition is possible here
                swap1 = 0;
                swap2 = 0;
              } else {
                for (k = i1; k < i1 + block; k++) {
                  if (s[k] != t[k + block]){
                    swap1 = 0;
                    break;
                  }
                }
                for (k = j1; k < j1 + block; k++) {
                  if (t[k] != s[k + block]){
                    swap2 = 0;
                    break;
                  }
                }
              }

              del = d[j*sl + i - 1] + 1;
              ins = d[(j-1)*sl + i] + 1;
              min = del;
              if (ins < min) min = ins;
              if (swap1 == 1 && swap2 == 1){
                transp = d[j1 * sl + i1] + cost + block -1;
                if (transp < min) min = transp;
                block = 0;
              } else if (block == 1) {
                subs = d[(j-1)*sl + i - 1] + cost;
                if (subs < min) min = subs;
              }
              block--;
            }
            d[j*sl+i]=min;
            if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
          }
          if (current_distance > max_distance) {
            stop_execution = 1;
          }
        }
        distance=d[sl * tl - 1];
        if (stop_execution == 1) distance = current_distance;

        xfree(d);
        return INT2NUM(distance);
      }
      C_SOURCE
    end
  end
end
|
114
|
+
|
115
|
+
# Quick manual benchmark, only executed when this file is run directly:
# compares calling #distance (which unpacks the strings on every call) with
# calling #distance_utf on code point arrays unpacked once up front.
if __FILE__ == $0
  a = Taxamatch::DamerauLevenshteinMod.new
  name1 = 'Cedarinia scabra Sjöstedt 1921'
  name2 = 'Cedarinia scabra Söjstedt 1921'
  s = name1.unpack('U*')
  t = name2.unpack('U*')

  start = Time.now
  100000.times { a.distance(name1, name2, 1, 10) }
  puts "with unpack time: " + (Time.now - start).to_s + ' sec'

  start = Time.now
  100000.times { a.distance_utf(s, t, 1, 10) }
  puts 'utf time: ' + (Time.now - start).to_s + ' sec'

  puts a.distance('sub', 'usb', 1, 10);
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Taxamatch

  # Reduces name strings to upper-case, ASCII-like forms for comparison.
  module Normalizer
    # Ordered [pattern, replacement] pairs used by utf8_to_ascii. Characters
    # that are not listed here are passed through unchanged.
    TRANSLITERATIONS = [
      [/[ÀÂÅÃÄÁẤẠ]/, "A"],
      [/[ÉÈÊË]/, "E"],
      [/[ÍÌÎÏ]/, "I"],
      [/[ÓÒÔØÕÖỚỔ]/, "O"],
      [/[ÚÙÛÜ]/, "U"],
      [/[Ý]/, "Y"],
      [/Æ/, "AE"],
      [/[ČÇ]/, "C"],
      [/[ŠŞ]/, "S"],
      [/[Đ]/, "D"],
      [/Ž/, "Z"],
      [/Ñ/, "N"],
      [/Œ/, "OE"],
      [/ß/, "B"],
      [/Ķ/, "K"],
      [/[áàâåãäăãắảạậầằ]/, "a"],
      [/[éèêëĕěếệểễềẻ]/, "e"],
      [/[íìîïǐĭīĩỉï]/, "i"],
      [/[óòôøõöŏỏỗộơọỡốơồờớổ]/, "o"],
      [/[úùûüůưừựủứụ]/, "u"],
      [/[žź]/, "z"],
      [/[ýÿỹ]/, "y"],
      [/[đ]/, "d"],
      [/æ/, "ae"],
      [/[čćç]/, "c"],
      [/[ñńň]/, "n"],
      [/œ/, "oe"],
      [/[śšş]/, "s"],
      [/ř/, "r"],
      [/ğ/, "g"],
      [/Ř/, "R"]
    ]

    # Transliterates the listed accented characters and upcases the result.
    def self.normalize(string)
      utf8_to_ascii(string).upcase
    end

    # Like normalize, but additionally drops every character that is not an
    # upper-case letter, a digit or a hyphen.
    def self.normalize_word(word)
      cleaned = self.normalize(word).gsub(/[^A-Z0-9\-]/, '')
      cleaned.strip
    end

    # Like normalize, but replaces every non-letter with a space, collapses
    # runs of whitespace and trims the ends.
    def self.normalize_author(string)
      letters_only = self.normalize(string).gsub(/[^A-Z]/, ' ')
      letters_only.gsub(/[\s]{2,}/, ' ').strip
    end

    # Returns a copy of +string+ with the accented characters listed in
    # TRANSLITERATIONS replaced by plain letters. This is a singleton method
    # and is callable from outside the module.
    def self.utf8_to_ascii(string)
      TRANSLITERATIONS.inject(string) do |text, (pattern, replacement)|
        text.gsub(pattern, replacement)
      end
    end

  end

end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Taxamatch

  # Produces the phonetic ("near match") key of a name part, so that names
  # which sound alike compare as equal.
  module Phonetizer
    # Leading letter groups and what they are replaced with. Only the very
    # beginning of the word is examined and at most one rule is applied.
    PREFIX_SUBSTITUTIONS = [
      ['AE', 'E'], ['CN', 'N'], ['CT', 'T'], ['CZ', 'C'], ['DJ', 'J'], ['EA', 'E'],
      ['EU', 'U'], ['GN', 'N'], ['KN', 'N'], ['MC', 'MAC'], ['MN', 'N'], ['OE', 'E'],
      ['QU', 'Q'], ['PS', 'S'], ['PT', 'T'], ['TS', 'S'], ['WR', 'R'], ['X', 'Z']
    ]

    # Alias of near_match.
    def self.phonetize(a_word, normalize_ending = false)
      self.near_match(a_word, normalize_ending)
    end

    # Returns the phonetic key of +a_word+.
    #
    # a_word           - the word to phonetize; nil, blank or non-string input
    #                    yields ''.
    # normalize_ending - when true, keys longer than 4 characters also get
    #                    their ending unified (see normalize_ending).
    #
    # The argument is never modified.
    def self.near_match(a_word, normalize_ending = false)
      word = a_word.respond_to?(:strip) ? a_word.strip : ''
      return '' if word == ''
      word = Taxamatch::Normalizer.normalize word

      prefix, replacement = PREFIX_SUBSTITUTIONS.find { |from, _to| word[0, from.size] == from }
      word = replacement + word[prefix.size..-1] if prefix

      # The first character is kept as it is; only the rest is simplified.
      chars = word.split('')
      first_char = chars.shift
      rest_chars = chars.join('').
        gsub('AE', 'I').
        gsub('IA', 'A').
        gsub('OE', 'I').
        gsub('OI', 'A').
        gsub('SC', 'S').
        gsub('H', '').
        tr('EOUYKZ', 'IAIICS')
      word = (first_char + rest_chars).squeeze

      word = self.normalize_ending(word) if normalize_ending && word.size > 4
      word
    end

    # Returns a copy of +a_word+ with the variant endings -IS (includes -us,
    # -ys, -es), -IM (was -um) and -AS (-os) translated to -A. The argument is
    # left untouched, so frozen or shared strings are safe to pass in.
    def self.normalize_ending(a_word)
      a_word.gsub(/IS$/, 'A').gsub(/IM$/, 'A').gsub(/AS$/, 'A')
    end

  end

end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
######################
|
2
|
+
# Tests for modified Damerau Levenshtein Distance algorithm (UTF-8 compatible)
|
3
|
+
#
|
4
|
+
# * B. Boehmer, T. Rees, Modified Damerau-Levenshtein Distance, Boehmer & Rees 2008
|
5
|
+
# * F.J. Damerau. A technique for computer detection and correction of spelling errors, Communications of the ACM, 1964
|
6
|
+
#
|
7
|
+
# Fields:
|
8
|
+
# String1|String2|maximum distance|transposition block size|expected distance
|
9
|
+
# - String1, String2
|
10
|
+
# compared strings
|
11
|
+
# - maximum distance
|
12
|
+
# stops execution of the algorithm when calculated distance exceeds the maximum distance number
|
13
|
+
# - transposition block size
|
14
|
+
# determines how many characters can be transposed. Block size 1 returns score according to Damerau-Levenshtein algorithm
|
15
|
+
# - expected distance
|
16
|
+
# resulting distance that has to be achieved by the algorithm
|
17
|
+
# Note: algorithm does not try to normalize or interpret strings in any way.
|
18
|
+
######################
|
19
|
+
|
20
|
+
#it should recognize the exact match
|
21
|
+
Pomatomus|Pomatomus|10|1|0
|
22
|
+
|
23
|
+
#it should not try to normalize incoming strings
|
24
|
+
Pomatomus|Pomatomus|10|1|1
|
25
|
+
Pomatomus|pomatomus|10|1|1
|
26
|
+
|
27
|
+
#it should calculate special cases
|
28
|
+
Pomatomus||10|1|9
|
29
|
+
|Pomatomus|10|1|9
|
30
|
+
P|p|10|1|1
|
31
|
+
#TODO: one letter vs longer string generates a big negative number
|
32
|
+
#L|Linneaus|10|1|7
|
33
|
+
|
34
|
+
|
35
|
+
#it should calculate Damerau Levenshtein distance with 1 character transpositions, insertions, deletions, substitutions (block size 1)
|
36
|
+
Pomatomus|Pomatomux|10|1|1
|
37
|
+
Pmatomus|Pomatomus|10|1|1
|
38
|
+
Pomatomus|Pmatomus|10|1|1
|
39
|
+
Rpmatomus|Pomatomus|10|1|2
|
40
|
+
Pommtomus|Pomatomus|10|1|1
|
41
|
+
Potamomus|Pomatomus|10|1|2
|
42
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
|
43
|
+
Pomatomus|oPmatomus|10|1|1
|
44
|
+
Pomatomus|Pomatomsu|10|1|1
|
45
|
+
Pomtaomus|Pomatomus|10|1|1
|
46
|
+
Pomatoums|Pomatomus|10|1|1
|
47
|
+
Potamomus|Pomatomus|10|1|2
|
48
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|2|1
|
49
|
+
|
50
|
+
#it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size >= 2)
|
51
|
+
serrulatus|serratulus|10|2|2
|
52
|
+
Pomatomus|Poomumats|10|3|3
|
53
|
+
vesiculosus|vecusilosus|10|1|4
|
54
|
+
vesiculosus|vecusilosus|10|2|2
|
55
|
+
trimerophyton|mertriophyton|10|1|6
|
56
|
+
trimerophyton|mertriophyton|10|3|3
|
57
|
+
|
58
|
+
#it should stop trying if distance exceeds maximum allowed distance
|
59
|
+
Pxxxxomus|Pomatomus|10|1|4
|
60
|
+
Pxxxxomus|Pomatomus|2|1|3
|
61
|
+
|
62
|
+
#
|
63
|
+
PUNCTATA|PUNCTATA|10|1|0
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
begin
|
2
|
+
require 'spec'
|
3
|
+
rescue LoadError
|
4
|
+
require 'rubygems' unless ENV['NO_RUBYGEMS']
|
5
|
+
gem 'rspec'
|
6
|
+
require 'spec'
|
7
|
+
end
|
8
|
+
|
9
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
10
|
+
require 'taxamatch_rb'
|
11
|
+
|
12
|
+
# Reads a pipe-delimited test fixture and yields once per line.
#
# file       - path of the fixture file.
# fields_num - number of '|'-separated fields a data line must have.
#
# Yields an Array of field strings for every data line; the last field has any
# trailing '#' comment and surrounding whitespace removed. Yields nil for
# comment lines and for lines with a different number of fields.
#
# The file is opened with File.open in block form, so the handle is closed when
# reading finishes or a block raises, and the path is never interpreted as a
# command.
def read_test_file(file, fields_num)
  File.open(file) do |f|
    f.each do |line|
      fields = line.split("|")
      if line !~ /^\s*#/ && fields.size == fields_num
        # sub rather than split('#')[0]: the latter is nil when the field is just '#'.
        fields[-1] = fields[-1].sub(/#.*/m, '').strip
        yield(fields)
      else
        yield(nil)
      end
    end
  end
end
|
24
|
+
|
25
|
+
# Builds the hash that Taxamatch::Base#match_genera and #match_species expect
# for a single name part: the original string, its normalized form and the
# phonetic key of the normalized form.
def make_taxamatch_hash(string)
  name_part = { :epitheton => string }
  name_part[:normalized] = Taxamatch::Normalizer.normalize(string)
  name_part[:phonetized] = Taxamatch::Phonetizer.near_match(name_part[:normalized])
  name_part
end
|
@@ -0,0 +1,254 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.dirname(__FILE__) + '/spec_helper.rb'
|
3
|
+
|
4
|
+
# Runs every data line of damerau_levenshtein_mod_test.txt through
# DamerauLevenshteinMod#distance. Fixture fields are:
# String1|String2|maximum distance|transposition block size|expected distance
describe 'DamerauLevenshteinMod' do
  it 'should get tests' do
    fixture = File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_mod_test.txt'
    read_test_file(fixture, 5) do |fields|
      dl = Taxamatch::DamerauLevenshteinMod.new
      next unless fields
      str1, str2, max_distance, block_size, expected = fields
      res = dl.distance(str1, str2, block_size.to_i, max_distance.to_i)
      # Print the offending fixture line to make a failure easy to locate.
      puts fields if res != expected.to_i
      res.should == expected.to_i
    end
  end
end
|
16
|
+
|
17
|
+
# Checks the hash layout Taxamatch::Atomizer#parse produces for uninomials,
# binomials and trinomials.
describe 'Atomizer' do
  before(:all) do
    @parser = Taxamatch::Atomizer.new
  end

  it 'should parse uninomials' do
    betula = {
      :all_authors=>[],
      :all_years=>[],
      :uninomial=>{:epitheton=>"Betula", :normalized=>"BETULA", :phonetized=>"BITILA",
                   :authors=>[], :years=>[], :normalized_authors=>[]}
    }
    @parser.parse('Betula').should == betula

    aerenea = {
      :all_authors=>["LACORDAIRE"],
      :all_years=>["1872"],
      :uninomial=>{:epitheton=>"Aerenea", :normalized=>"AERENEA", :phonetized=>"ERINIA",
                   :authors=>["Lacordaire"], :years=>["1872"], :normalized_authors=>["LACORDAIRE"]}
    }
    @parser.parse('Ærenea Lacordaire, 1872').should == aerenea
  end

  it 'should parse binomials' do
    expected = {
      :all_authors=>["DOW"],
      :all_years=>["1913"],
      :genus=>{:epitheton=>"Leoeptura", :normalized=>"LEOEPTURA", :phonetized=>"LIPTIRA",
               :authors=>[], :years=>[], :normalized_authors=>[]},
      :species=>{:epitheton=>"laetifica", :normalized=>"LAETIFICA", :phonetized=>"LITIFICA",
                 :authors=>["Dow"], :years=>["1913"], :normalized_authors=>["DOW"]}
    }
    @parser.parse('Leœptura laetifica Dow, 1913').should == expected
  end

  it 'should parse trinomials' do
    expected = {
      :all_authors=>["BANKER", "D HALL", "D E STUNTZ"],
      :all_years=>["1972"],
      :genus=>{:epitheton=>"Hydnellum", :normalized=>"HYDNELLUM", :phonetized=>"HIDNILIM",
               :authors=>[], :years=>[], :normalized_authors=>[]},
      :species=>{:epitheton=>"scrobiculatum", :normalized=>"SCROBICULATUM", :phonetized=>"SCRABICILATA",
                 :authors=>[], :years=>[], :normalized_authors=>[]},
      :infraspecies=>[
        {:epitheton=>"zonatum", :normalized=>"ZONATUM", :phonetized=>"ZANATA",
         :authors=>["Banker", "D. Hall", "D.E. Stuntz"], :years=>["1972"],
         :normalized_authors=>["BANKER", "D HALL", "D E STUNTZ"]}
      ]
    }
    @parser.parse('Hydnellum scrobiculatum zonatum (Banker) D. Hall et D.E. Stuntz 1972').should == expected
  end
end
|
35
|
+
|
36
|
+
|
37
|
+
describe 'Taxamatch::Normalizer' do
|
38
|
+
it 'should normalize strings' do
|
39
|
+
Taxamatch::Normalizer.normalize('abcd').should == 'ABCD'
|
40
|
+
Taxamatch::Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
|
41
|
+
Taxamatch::Normalizer.normalize('Ærenea').should == 'AERENEA'
|
42
|
+
Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
|
43
|
+
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should == 'CHORIOZOPELLA TRAGARDHI'
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'should normalize words' do
|
47
|
+
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should == 'L-3EOEPTURA'
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
describe 'Taxamatch::Base' do
|
52
|
+
before(:all) do
|
53
|
+
@tm = Taxamatch::Base.new
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should get txt tests' do
|
57
|
+
dl = Taxamatch::DamerauLevenshteinMod.new
|
58
|
+
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt', 4) do |y|
|
59
|
+
if y
|
60
|
+
y[2] = y[2] == 'true' ? true : false
|
61
|
+
res = @tm.taxamatch(y[0], y[1], false)
|
62
|
+
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
63
|
+
res['match'].should == y[2]
|
64
|
+
res['edit_distance'].should == y[3].to_i
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'should work with names that cannot be parsed' do
|
70
|
+
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921','Quadraspidiotus ostreaeformis Curtis)')
|
71
|
+
res = false
|
72
|
+
end
|
73
|
+
|
74
|
+
it 'should compare genera' do
|
75
|
+
#edit distance 1 always matches
|
76
|
+
g1 = make_taxamatch_hash 'Plantago'
|
77
|
+
g2 = make_taxamatch_hash 'Plantagon'
|
78
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
79
|
+
#edit_distance above threshold does not match
|
80
|
+
g1 = make_taxamatch_hash 'Plantago'
|
81
|
+
g2 = make_taxamatch_hash 'This shouldnt match'
|
82
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
83
|
+
#phonetic_match matches
|
84
|
+
g1 = make_taxamatch_hash 'Plantagi'
|
85
|
+
g2 = make_taxamatch_hash 'Plantagy'
|
86
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
|
87
|
+
#distance 1 in first letter also matches
|
88
|
+
g1 = make_taxamatch_hash 'Xantheri'
|
89
|
+
g2 = make_taxamatch_hash 'Pantheri'
|
90
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
91
|
+
#phonetic match trumps everything
|
92
|
+
g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
|
93
|
+
g2 = make_taxamatch_hash 'Zanthery'
|
94
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
|
95
|
+
#same first letter and distance 2 should match
|
96
|
+
g1 = make_taxamatch_hash 'Xantherii'
|
97
|
+
g2 = make_taxamatch_hash 'Xantherrr'
|
98
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
|
99
|
+
#First letter is the same and distance is 3 should match, no phonetic match
|
100
|
+
g1 = make_taxamatch_hash 'Xantheriii'
|
101
|
+
g2 = make_taxamatch_hash 'Xantherrrr'
|
102
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
103
|
+
#Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
|
104
|
+
g1 = make_taxamatch_hash 'Xant'
|
105
|
+
g2 = make_taxamatch_hash 'Xanthe'
|
106
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
|
107
|
+
#Should not match if edit distance > 3 and no phonetic match
|
108
|
+
g1 = make_taxamatch_hash 'Xantheriiii'
|
109
|
+
g2 = make_taxamatch_hash 'Xantherrrrr'
|
110
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
111
|
+
end
|
112
|
+
|
113
|
+
it 'should compare species' do
|
114
|
+
#Exact match
|
115
|
+
s1 = make_taxamatch_hash 'major'
|
116
|
+
s2 = make_taxamatch_hash 'major'
|
117
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
|
118
|
+
#Phonetic match always works
|
119
|
+
s1 = make_taxamatch_hash 'xanteriiiiiiii'
|
120
|
+
s2 = make_taxamatch_hash 'zantereeeeeeee'
|
121
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
|
122
|
+
#Phonetic match works with different endings
|
123
|
+
s1 = make_taxamatch_hash 'majorum'
|
124
|
+
s2 = make_taxamatch_hash 'majoris'
|
125
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
|
126
|
+
#Distance 4 matches if first 3 chars are the same
|
127
|
+
s1 = make_taxamatch_hash 'majorrrrr'
|
128
|
+
s2 = make_taxamatch_hash 'majoraaaa'
|
129
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
|
130
|
+
#Should not match if distance is 4 and first 3 chars are not the same
|
131
|
+
s1 = make_taxamatch_hash 'majorrrrr'
|
132
|
+
s2 = make_taxamatch_hash 'marorraaa'
|
133
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
134
|
+
#Distance 2 or 3 matches if first 1 char is the same
|
135
|
+
s1 = make_taxamatch_hash 'morrrr'
|
136
|
+
s2 = make_taxamatch_hash 'moraaa'
|
137
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
138
|
+
#Should not match if Distance 2 or 3 and first 1 char is not the same
|
139
|
+
s1 = make_taxamatch_hash 'morrrr'
|
140
|
+
s2 = make_taxamatch_hash 'torraa'
|
141
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
142
|
+
#Distance 1 will match anywhere
|
143
|
+
s1 = make_taxamatch_hash 'major'
|
144
|
+
s2 = make_taxamatch_hash 'rajor'
|
145
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
|
146
|
+
#Will not match if distance is 3 and length is less than twice the edit distance
|
147
|
+
s1 = make_taxamatch_hash 'marrr'
|
148
|
+
s2 = make_taxamatch_hash 'maaaa'
|
149
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
150
|
+
end
|
151
|
+
|
152
|
+
it 'should match mathes' do
|
153
|
+
#No trouble case
|
154
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
155
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
156
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
|
157
|
+
#Will not match if either genus or sp. epithet don't match
|
158
|
+
gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
159
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
160
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
161
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
162
|
+
smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
163
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
164
|
+
#Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
|
165
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
|
166
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
167
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
|
168
|
+
#Should not have phonetic match if one of the components does not match phonetically
|
169
|
+
gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
|
170
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
171
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
|
172
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
173
|
+
smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
|
174
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
|
175
|
+
#edit distance should be equal to the sum of the edit distances
|
176
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
177
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
178
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
|
179
|
+
end
|
180
|
+
|
181
|
+
describe 'Taxamatch::Authmatch' do
|
182
|
+
before(:all) do
|
183
|
+
@am = Taxamatch::Authmatch
|
184
|
+
end
|
185
|
+
|
186
|
+
it 'should calculate score' do
|
187
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
|
188
|
+
res.should == 90
|
189
|
+
res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
|
190
|
+
res.should == 0
|
191
|
+
#found all authors, same year
|
192
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1766], [1766])
|
193
|
+
res.should == 100
|
194
|
+
#all authors, 1 year diff
|
195
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [1766])
|
196
|
+
res.should == 54
|
197
|
+
#year is not counted in
|
198
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus'], [1767], [])
|
199
|
+
res.should == 94
|
200
|
+
#found all authors on one side, same year
|
201
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1767], [1767])
|
202
|
+
res.should == 91
|
203
|
+
#found all authors on one side, 1 year diff
|
204
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'], ['Muller', 'Linnaeus'], [1766], [1767])
|
205
|
+
res.should == 51
|
206
|
+
#found all authors on one side, year does not count
|
207
|
+
res = @am.authmatch(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
|
208
|
+
res.should == 90
|
209
|
+
#found some authors
|
210
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [])
|
211
|
+
res.should == 67
|
212
|
+
#if year does not match or not present no match for previous case
|
213
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'], ['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
|
214
|
+
res.should == 0
|
215
|
+
end
|
216
|
+
|
217
|
+
it 'should compare years' do
|
218
|
+
@am.compare_years([1882],[1880]).should == 2
|
219
|
+
@am.compare_years([1882],[]).should == nil
|
220
|
+
@am.compare_years([],[]).should == 0
|
221
|
+
@am.compare_years([1788,1798], [1788,1798]).should be_nil
|
222
|
+
end
|
223
|
+
|
224
|
+
it 'should remove duplicate authors' do
|
225
|
+
#Lin submatches Linnaeus and its size 3 is big enough to remove Linnaeus
|
226
|
+
#Muller is identical
|
227
|
+
res = @am.remove_duplicate_authors(['Lin', 'Muller'], ['Linnaeus', 'Muller'])
|
228
|
+
res.should == [[], []]
|
229
|
+
#same in different order
|
230
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Linn', 'Muller'])
|
231
|
+
res.should == [[], []]
|
232
|
+
#auth Li submatches Linnaeus, but Li size is less than the 3 required to remove Linnaeus
|
233
|
+
res = @am.remove_duplicate_authors(['Dem', 'Li'], ['Linnaeus', 'Stepanov'])
|
234
|
+
res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
|
235
|
+
#fuzzy match
|
236
|
+
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'], ['Linnaeus', 'Stepanov'])
|
237
|
+
res.should == [["Dem"], ["Stepanov"]]
|
238
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['L', 'Kenn'])
|
239
|
+
res.should == [['Linnaeus', 'Muller'], ['Kenn']]
|
240
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'], ['Muller', 'Linnaeus', 'Kurtz'])
|
241
|
+
res.should == [[],['Kurtz']]
|
242
|
+
end
|
243
|
+
|
244
|
+
it 'should fuzzy match authors' do
|
245
|
+
#TODO: fix the bug revealed by this test
|
246
|
+
# res = @am.fuzzy_match_authors('L', 'Muller')
|
247
|
+
# res.should be_false
|
248
|
+
end
|
249
|
+
|
250
|
+
end
|
251
|
+
|
252
|
+
end
|
253
|
+
|
254
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
###
|
2
|
+
#
|
3
|
+
# Tests for string comparison by taxamatch algorithm
|
4
|
+
# name1|name2|match|edit_distance
|
5
|
+
#
|
6
|
+
##
|
7
|
+
# Comparing uninomials
|
8
|
+
Pomatomus|Pomatomas|true|1
|
9
|
+
Pomatomus L.|Pomatomas Linn.|true|1
|
10
|
+
Pomatomus Ber|Pomatomas Linn|false|1
|
11
|
+
Pomatomus L. 1753|Pomatomus Linn. 1800|false|0
|
12
|
+
|
13
|
+
## additional authorship should match
|
14
|
+
Puma concolor|Puma concolor L.|true|0
|
15
|
+
#
|
16
|
+
## one-letter misspelling in species epithet should match
|
17
|
+
Puma concolor|Puma cancolor|true|1
|
18
|
+
#
|
19
|
+
Pomatomus saltatrix|Pomatomus saltratix|true|2
|
20
|
+
Pomatomus saltator|Pomatomus saltatrix|true|3
|
21
|
+
#
|
22
|
+
Loligo pealeii|Loligo plei|false|3
|
23
|
+
#
|
24
|
+
## different authors should not match
|
25
|
+
Puma concolor Linnaeus|Puma concolor Kurtz|false|0
|
26
|
+
#
|
27
|
+
##real life examples
|
28
|
+
Biatora borealis|Bactra borealis Diakonoff 1964|false|3
|
29
|
+
#
|
30
|
+
Homo sapien|Homo sapiens|true|1
|
31
|
+
Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true|1
|
32
|
+
Homo sapiens Mozzherin|Homo sapiens Linneaus|false|0
|
33
|
+
#
|
34
|
+
Quinqueloculina punctata|Quinqueloculina punctata d'Orbigny 1905|true|0
|
35
|
+
Pomatomus saltator (Linnaeus, 1766)|Pomatomus saltatrix (Linnaeus, 1766)|true|0|3
|
36
|
+
#
|
37
|
+
#Trinomial names
|
38
|
+
Homo sapiens stupidus|Homo spiens stupidus|true|1
|
39
|
+
Pomatomus saltator saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|true|1
|
40
|
+
Pomatomus saltator L. 1753|Pomatomus saltator var. saltatror L. 1753|false|5
|
41
|
+
Pomatomus saltator saltator saltatorische|Pomatomus saltator soltator|true|1
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: taxamatch_rb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 6
|
8
|
+
- 0
|
9
|
+
version: 0.6.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Dmitry Mozzherin
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-03-19 00:00:00 -04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: RubyInline
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :runtime
|
31
|
+
version_requirements: *id001
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: biodiversity
|
34
|
+
prerelease: false
|
35
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
segments:
|
40
|
+
- 0
|
41
|
+
- 5
|
42
|
+
- 13
|
43
|
+
version: 0.5.13
|
44
|
+
type: :runtime
|
45
|
+
version_requirements: *id002
|
46
|
+
description: This gem implements algorithms for fuzzy matching scientific names developed by Tony Rees
|
47
|
+
email: dmozzherin@eol.org
|
48
|
+
executables: []
|
49
|
+
|
50
|
+
extensions: []
|
51
|
+
|
52
|
+
extra_rdoc_files:
|
53
|
+
- LICENSE
|
54
|
+
- README.rdoc
|
55
|
+
files:
|
56
|
+
- README.rdoc
|
57
|
+
- lib/taxamatch_rb.rb
|
58
|
+
- lib/taxamatch_rb/atomizer.rb
|
59
|
+
- lib/taxamatch_rb/authmatch.rb
|
60
|
+
- lib/taxamatch_rb/damerau_levenshtein_mod.rb
|
61
|
+
- lib/taxamatch_rb/normalizer.rb
|
62
|
+
- lib/taxamatch_rb/phonetizer.rb
|
63
|
+
- spec/damerau_levenshtein_mod_test.txt
|
64
|
+
- spec/spec.opts
|
65
|
+
- spec/spec_helper.rb
|
66
|
+
- spec/taxamatch_rb_spec.rb
|
67
|
+
- spec/taxamatch_test.txt
|
68
|
+
- LICENSE
|
69
|
+
has_rdoc: true
|
70
|
+
homepage: http://github.com/dimus/taxamatch_rb
|
71
|
+
licenses: []
|
72
|
+
|
73
|
+
post_install_message:
|
74
|
+
rdoc_options:
|
75
|
+
- --charset=UTF-8
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
segments:
|
83
|
+
- 0
|
84
|
+
version: "0"
|
85
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
segments:
|
90
|
+
- 0
|
91
|
+
version: "0"
|
92
|
+
requirements: []
|
93
|
+
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.3.6
|
96
|
+
signing_key:
|
97
|
+
specification_version: 3
|
98
|
+
summary: Implementation of Tony Rees Taxamatch algorithms
|
99
|
+
test_files:
|
100
|
+
- spec/spec_helper.rb
|
101
|
+
- spec/taxamatch_rb_spec.rb
|