taxamatch_rb 0.9.10 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +5 -2
- data/Gemfile +14 -16
- data/Gemfile.lock +18 -19
- data/LICENSE +1 -1
- data/{README.rdoc → README.md} +26 -7
- data/Rakefile +11 -9
- data/VERSION +1 -1
- data/lib/taxamatch_rb.rb +76 -43
- data/lib/taxamatch_rb/atomizer.rb +19 -10
- data/lib/taxamatch_rb/authmatch.rb +29 -16
- data/lib/taxamatch_rb/normalizer.rb +4 -4
- data/lib/taxamatch_rb/phonetizer.rb +9 -8
- data/spec/taxamatch_rb_spec.rb +223 -109
- data/taxamatch_rb.gemspec +11 -41
- metadata +11 -171
@@ -1,15 +1,19 @@
|
|
1
|
-
# Algorithms for Taxamatch::Authmatch
|
1
|
+
# Algorithms for Taxamatch::Authmatch
|
2
|
+
# are developed by Patrick Leary of uBio and EOL fame
|
2
3
|
|
3
4
|
module Taxamatch
|
4
5
|
class Authmatch
|
5
6
|
|
6
7
|
def self.authmatch(authors1, authors2, years1, years2)
|
7
|
-
unique_authors1, unique_authors2 =
|
8
|
+
unique_authors1, unique_authors2 =
|
9
|
+
remove_duplicate_authors(authors1, authors2)
|
8
10
|
year_difference = compare_years(years1, years2)
|
9
|
-
get_score(authors1, unique_authors1,
|
11
|
+
get_score(authors1, unique_authors1,
|
12
|
+
authors2, unique_authors2, year_difference)
|
10
13
|
end
|
11
|
-
|
12
|
-
def self.get_score(authors1, unique_authors1,
|
14
|
+
|
15
|
+
def self.get_score(authors1, unique_authors1,
|
16
|
+
authors2, unique_authors2, year_diff)
|
13
17
|
count_before = authors1.size + authors2.size
|
14
18
|
count_after = unique_authors1.size + unique_authors2.size
|
15
19
|
score = 0
|
@@ -18,7 +22,7 @@ module Taxamatch
|
|
18
22
|
if year_diff == 0
|
19
23
|
score = 100
|
20
24
|
elsif year_diff == 1
|
21
|
-
score = 54
|
25
|
+
score = 54
|
22
26
|
end
|
23
27
|
else
|
24
28
|
score = 94
|
@@ -35,11 +39,11 @@ module Taxamatch
|
|
35
39
|
end
|
36
40
|
else
|
37
41
|
score = ((1 - count_after.to_f/count_before.to_f) * 100).round
|
38
|
-
score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
|
42
|
+
score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
|
39
43
|
end
|
40
44
|
score > 50 ? score : 0
|
41
45
|
end
|
42
|
-
|
46
|
+
|
43
47
|
def self.remove_duplicate_authors(authors1, authors2)
|
44
48
|
unique_authors1 = authors1.dup
|
45
49
|
unique_authors2 = authors2.dup
|
@@ -48,12 +52,14 @@ module Taxamatch
|
|
48
52
|
au1_match = au2_match = false
|
49
53
|
if au1 == au2
|
50
54
|
au1_match = au2_match = true
|
51
|
-
elsif au1 == au2[0...au1.size]
|
55
|
+
elsif au1 == au2[0...au1.size]
|
52
56
|
au1_match = true
|
53
57
|
elsif au1[0...au2.size] == au2
|
54
58
|
au2_match = true
|
55
59
|
end
|
56
|
-
if (au1.size >= 3 && au1_match) ||
|
60
|
+
if (au1.size >= 3 && au1_match) ||
|
61
|
+
(au2.size >= 3 && au2_match) ||
|
62
|
+
(au1_match && au2_match)
|
57
63
|
unique_authors1.delete au1
|
58
64
|
unique_authors2.delete au2
|
59
65
|
elsif au1_match
|
@@ -61,8 +67,11 @@ module Taxamatch
|
|
61
67
|
elsif au2_match
|
62
68
|
unique_authors2.delete au2
|
63
69
|
else
|
64
|
-
#TODO: masking a bug in damerau levenshtsin
|
65
|
-
|
70
|
+
#TODO: masking a bug in damerau levenshtein
|
71
|
+
# mod which appears comparing 1letter to a longer string
|
72
|
+
if au1.size > 1 &&
|
73
|
+
au2.size > 1 &&
|
74
|
+
self.fuzzy_match_authors(au1, au2)
|
66
75
|
unique_authors1.delete au1
|
67
76
|
unique_authors2.delete au2
|
68
77
|
end
|
@@ -71,18 +80,22 @@ module Taxamatch
|
|
71
80
|
end
|
72
81
|
[unique_authors1, unique_authors2]
|
73
82
|
end
|
74
|
-
|
83
|
+
|
75
84
|
def self.fuzzy_match_authors(author1, author2)
|
76
85
|
au1_length = author1.size
|
77
86
|
au2_length = author2.size
|
78
87
|
dlm = DamerauLevenshtein
|
79
|
-
|
80
|
-
|
88
|
+
#get around a bug in C code, but it really has to be fixed
|
89
|
+
ed = dlm.distance(author1, author2,1,3)
|
90
|
+
(ed <= 3 && ([au1_length, au2_length].min > ed * 2) &&
|
91
|
+
(ed < 2 || author1[0] == author2[0]))
|
81
92
|
end
|
82
93
|
|
83
94
|
def self.compare_years(years1, years2)
|
84
95
|
return 0 if years1 == [] && years2 == []
|
85
|
-
|
96
|
+
if years1.size == 1 && years2.size == 1
|
97
|
+
return (years1[0].to_i - years2[0].to_i).abs
|
98
|
+
end
|
86
99
|
nil
|
87
100
|
end
|
88
101
|
end
|
@@ -1,16 +1,16 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
module Taxamatch
|
4
|
-
|
4
|
+
|
5
5
|
module Normalizer
|
6
6
|
def self.normalize(string)
|
7
7
|
utf8_to_ascii(string.strip.upcase).gsub(/[^\x00-\x7F]/,'?')
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
def self.normalize_word(word)
|
11
11
|
self.normalize(word).gsub(/[^A-Z0-9\-]/, '').strip
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
14
|
def self.normalize_author(string)
|
15
15
|
self.normalize(string).gsub(/[^A-Z]/, ' ').gsub(/[\s]{2,}/, ' ').strip
|
16
16
|
end
|
@@ -20,7 +20,7 @@ module Taxamatch
|
|
20
20
|
year_int = nil unless year_int.between?(1757, Time.now.year + 1)
|
21
21
|
year_int
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
|
25
25
|
private
|
26
26
|
def self.utf8_to_ascii(string)
|
@@ -2,11 +2,11 @@
|
|
2
2
|
module Taxamatch
|
3
3
|
|
4
4
|
module Phonetizer
|
5
|
-
|
5
|
+
|
6
6
|
def self.phonetize(a_word, normalize_ending = false)
|
7
7
|
self.near_match(a_word, normalize_ending)
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
def self.near_match(a_word, normalize_ending = false)
|
11
11
|
a_word = a_word.strip rescue ''
|
12
12
|
return '' if a_word == ''
|
@@ -50,7 +50,7 @@ module Taxamatch
|
|
50
50
|
a_word = 'Z' + a_word[1..-1]
|
51
51
|
end
|
52
52
|
first_char = a_word.split('')[0]
|
53
|
-
rest_chars = a_word.split('')[1..-1].join('')
|
53
|
+
rest_chars = a_word.split('')[1..-1].join('')
|
54
54
|
rest_chars.gsub!('AE', 'I')
|
55
55
|
rest_chars.gsub!('IA', 'A')
|
56
56
|
rest_chars.gsub!('OE', 'I')
|
@@ -59,21 +59,22 @@ module Taxamatch
|
|
59
59
|
rest_chars.gsub!('H', '')
|
60
60
|
rest_chars.tr!('EOUYKZ', 'IAIICS')
|
61
61
|
a_word = (first_char + rest_chars).squeeze
|
62
|
-
|
62
|
+
|
63
63
|
if normalize_ending && a_word.size > 4
|
64
64
|
a_word = self.normalize_ending(a_word)
|
65
65
|
end
|
66
66
|
a_word
|
67
67
|
end
|
68
|
-
|
68
|
+
|
69
69
|
def self.normalize_ending(a_word)
|
70
|
-
# -- deal with variant endings
|
70
|
+
# -- deal with variant endings
|
71
|
+
# -is (includes -us, -ys, -es), -im (was -um), -as (-os)
|
71
72
|
# -- at the end of a string translate all to -a
|
72
73
|
a_word.gsub!(/IS$/, 'A')
|
73
74
|
a_word.gsub!(/IM$/, 'A')
|
74
75
|
a_word.gsub(/AS$/, 'A')
|
75
76
|
end
|
76
|
-
|
77
|
+
|
77
78
|
end
|
78
79
|
|
79
|
-
end
|
80
|
+
end
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -7,25 +7,81 @@ describe 'Atomizer' do
|
|
7
7
|
end
|
8
8
|
|
9
9
|
it 'should parse uninomials' do
|
10
|
-
@parser.parse('Betula').should == {:all_authors=>[], :all_years
|
11
|
-
|
10
|
+
@parser.parse('Betula').should == { :all_authors => [], :all_years => [],
|
11
|
+
:canonical_form => "Betula", :uninomial => { :string => "Betula",
|
12
|
+
:normalized => 'BETULA', :phonetized => "BITILA", :authors => [],
|
13
|
+
:years => [], :normalized_authors => [] } }
|
14
|
+
@parser.parse('Ærenea Lacordaire, 1872').should == {
|
15
|
+
:all_authors => ["LACORDAIRE"], :all_years => [1872],
|
16
|
+
:canonical_form => "Aerenea", :uninomial => { :string => "Aerenea",
|
17
|
+
:normalized => "AERENEA", :phonetized => "ERINIA",
|
18
|
+
:authors => ["Lacordaire"], :years => [1872],
|
19
|
+
:normalized_authors => ["LACORDAIRE"] } }
|
12
20
|
end
|
13
21
|
|
14
22
|
it 'should parse binomials' do
|
15
|
-
@parser.parse('Leœptura laetifica Dow, 1913').should == {
|
23
|
+
@parser.parse('Leœptura laetifica Dow, 1913').should == {
|
24
|
+
:all_authors => ["DOW"], :all_years => [1913],
|
25
|
+
:canonical_form => "Leoeptura laetifica", :genus => {
|
26
|
+
:string => "Leoeptura", :normalized => "LEOEPTURA",
|
27
|
+
:phonetized => "LIPTIRA", :authors => [], :years => [],
|
28
|
+
:normalized_authors => []}, :species => {
|
29
|
+
:string => "laetifica", :normalized => "LAETIFICA",
|
30
|
+
:phonetized => "LITIFICA", :authors => ["Dow"],
|
31
|
+
:years => [1913], :normalized_authors => ["DOW"] } }
|
16
32
|
end
|
17
33
|
|
18
34
|
it 'should parse trinomials' do
|
19
|
-
@parser.parse('Hydnellum scrobiculatum zonatum
|
35
|
+
@parser.parse('Hydnellum scrobiculatum zonatum ' +
|
36
|
+
'(Banker) D. Hall et D.E. Stuntz 1972').should == {
|
37
|
+
:all_authors => ["BANKER", "D HALL", "D E STUNTZ"], :all_years => [1972],
|
38
|
+
:canonical_form => "Hydnellum scrobiculatum zonatum", :genus=>{
|
39
|
+
:string => "Hydnellum", :normalized => "HYDNELLUM",
|
40
|
+
:phonetized => "HIDNILIM", :authors => [], :years => [],
|
41
|
+
:normalized_authors => [] }, :species => { :string => "scrobiculatum",
|
42
|
+
:normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
|
43
|
+
:authors => [], :years => [], :normalized_authors => [] },
|
44
|
+
:infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
|
45
|
+
:phonetized => "ZANATA", :authors => ["Banker", "D. Hall", "D.E. Stuntz"],
|
46
|
+
:years => [1972], :normalized_authors => ["BANKER", "D HALL",
|
47
|
+
"D E STUNTZ"] }] }
|
20
48
|
end
|
21
49
|
|
22
50
|
it 'should normalize years to integers' do
|
23
51
|
future_year = Time.now.year + 10
|
24
|
-
@parser.parse("Hydnellum scrobiculatum Kern #{future_year}
|
52
|
+
@parser.parse("Hydnellum scrobiculatum Kern #{future_year} " +
|
53
|
+
"zonatum (Banker) D. Hall et D.E. Stuntz 1972?").should == {
|
54
|
+
:all_authors => ["KERN", "BANKER", "D HALL", "D E STUNTZ"],
|
55
|
+
:all_years => [1972],
|
56
|
+
:canonical_form => "Hydnellum scrobiculatum zonatum", :genus => {
|
57
|
+
:string => "Hydnellum", :normalized => "HYDNELLUM",
|
58
|
+
:phonetized => "HIDNILIM", :authors => [], :years => [],
|
59
|
+
:normalized_authors => [] }, :species => { :string => "scrobiculatum",
|
60
|
+
:normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
|
61
|
+
:authors => ["Kern"], :years => [], :normalized_authors => ["KERN"] },
|
62
|
+
:infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
|
63
|
+
:phonetized => "ZANATA", :authors =>
|
64
|
+
["Banker", "D. Hall", "D.E. Stuntz"], :years => [1972],
|
65
|
+
:normalized_authors => ["BANKER", "D HALL", "D E STUNTZ"] }] }
|
25
66
|
end
|
26
67
|
|
27
68
|
it 'should normalize names with abbreviated genus after cf.' do
|
28
|
-
@parser.parse('Unio cf. U. alba').should == {:all_authors
|
69
|
+
@parser.parse('Unio cf. U. alba').should == { :all_authors => [],
|
70
|
+
:all_years => [], :canonical_form => "Unio",
|
71
|
+
:genus => { :string => "Unio", :normalized => "UNIO",
|
72
|
+
:phonetized => "UNIA", :authors => [], :years => [],
|
73
|
+
:normalized_authors => [] } }
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'should parse names which broke it before' do
|
77
|
+
['Parus caeruleus species complex',
|
78
|
+
'Euxoa nr. idahoensis sp. 1clay',
|
79
|
+
'Cetraria islandica ? islandica',
|
80
|
+
'Buteo borealis ? ventralis'].each do |n|
|
81
|
+
res = @parser.parse(n)
|
82
|
+
res.class.should == Hash
|
83
|
+
res.empty?.should be_false
|
84
|
+
end
|
29
85
|
end
|
30
86
|
end
|
31
87
|
|
@@ -38,12 +94,14 @@ describe 'Taxamatch::Normalizer' do
|
|
38
94
|
Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
|
39
95
|
Taxamatch::Normalizer.normalize('Fallé€n').should == 'FALLE?N'
|
40
96
|
Taxamatch::Normalizer.normalize('Fallén привет').should == 'FALLEN ??????'
|
41
|
-
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should ==
|
97
|
+
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should ==
|
98
|
+
'CHORIOZOPELLA TRAGARDHI'
|
42
99
|
Taxamatch::Normalizer.normalize('×Zygomena').should == 'xZYGOMENA'
|
43
100
|
end
|
44
101
|
|
45
102
|
it 'should normalize words' do
|
46
|
-
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should ==
|
103
|
+
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should ==
|
104
|
+
'L-3EOEPTURA'
|
47
105
|
end
|
48
106
|
end
|
49
107
|
|
@@ -53,7 +111,8 @@ describe 'Taxamatch::Base' do
|
|
53
111
|
end
|
54
112
|
|
55
113
|
it 'should get txt tests' do
|
56
|
-
|
114
|
+
test_file = File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt'
|
115
|
+
read_test_file(test_file, 4) do |y|
|
57
116
|
if y
|
58
117
|
y[2] = y[2] == 'true' ? true : false
|
59
118
|
res = @tm.taxamatch(y[0], y[1], false)
|
@@ -65,127 +124,169 @@ describe 'Taxamatch::Base' do
|
|
65
124
|
end
|
66
125
|
|
67
126
|
it 'should work with names that cannot be parsed' do
|
68
|
-
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921',
|
127
|
+
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921',
|
128
|
+
'Quadraspidiotus ostreaeformis Curtis)')
|
69
129
|
res = false
|
70
130
|
end
|
71
131
|
|
72
132
|
it 'should compare genera' do
|
73
|
-
#edit distance 1 always match
|
133
|
+
# edit distance 1 always match
|
74
134
|
g1 = make_taxamatch_hash 'Plantago'
|
75
135
|
g2 = make_taxamatch_hash 'Plantagon'
|
76
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
77
|
-
|
136
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
137
|
+
'edit_distance' => 1, 'match' => true }
|
138
|
+
# edit_distance above threshold does not match
|
78
139
|
g1 = make_taxamatch_hash 'Plantago'
|
79
140
|
g2 = make_taxamatch_hash 'This shouldnt match'
|
80
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
81
|
-
|
141
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
142
|
+
'match' => false, 'edit_distance' => 4 }
|
143
|
+
# phonetic_match matches
|
82
144
|
g1 = make_taxamatch_hash 'Plantagi'
|
83
145
|
g2 = make_taxamatch_hash 'Plantagy'
|
84
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => true,
|
85
|
-
|
86
|
-
|
146
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
|
147
|
+
'edit_distance' => 1, 'match' => true }
|
148
|
+
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
|
149
|
+
'phonetic_match' => false, 'edit_distance' => 1, 'match' => true }
|
150
|
+
# distance 1 in first letter also matches
|
87
151
|
g1 = make_taxamatch_hash 'Xantheri'
|
88
152
|
g2 = make_taxamatch_hash 'Pantheri'
|
89
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
90
|
-
|
153
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
154
|
+
'edit_distance' => 1, 'match' => true }
|
155
|
+
# phonetic match trumps everything
|
91
156
|
g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
|
92
157
|
g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
|
93
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => true,
|
94
|
-
|
95
|
-
|
158
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
|
159
|
+
'edit_distance' => 4, 'match' => true }
|
160
|
+
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
|
161
|
+
'phonetic_match' => false, 'edit_distance' => 4, 'match' => false }
|
162
|
+
# same first letter and distance 2 should match
|
96
163
|
g1 = make_taxamatch_hash 'Xaaaantherii'
|
97
164
|
g2 = make_taxamatch_hash 'Xaaaantherrr'
|
98
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
99
|
-
|
165
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
166
|
+
'match' => true, 'edit_distance' => 2 }
|
167
|
+
# First letter is the same and distance is 3 should match, no phonetic match
|
100
168
|
g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
|
101
169
|
g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
|
102
|
-
@tm.match_genera(g1, g2).should ==
|
103
|
-
|
170
|
+
@tm.match_genera(g1, g2).should ==
|
171
|
+
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 3 }
|
172
|
+
# Should not match if one of the words is shorter than 2x edit
|
173
|
+
# distance and distance is 2 or 3
|
104
174
|
g1 = make_taxamatch_hash 'Xant'
|
105
175
|
g2 = make_taxamatch_hash 'Xanthe'
|
106
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
107
|
-
|
176
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
177
|
+
'match' => false, 'edit_distance' => 2 }
|
178
|
+
# Should not match if edit distance > 3 and no phonetic match
|
108
179
|
g1 = make_taxamatch_hash 'Xantheriiii'
|
109
180
|
g2 = make_taxamatch_hash 'Xantherrrrr'
|
110
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
181
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
182
|
+
'match' => false, 'edit_distance' => 4 }
|
111
183
|
end
|
112
184
|
|
113
185
|
it 'should compare species' do
|
114
|
-
#Exact match
|
186
|
+
# Exact match
|
115
187
|
s1 = make_taxamatch_hash 'major'
|
116
188
|
s2 = make_taxamatch_hash 'major'
|
117
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => true,
|
118
|
-
|
119
|
-
|
189
|
+
@tm.match_species(s1, s2).should == { 'phonetic_match' => true,
|
190
|
+
'match' => true, 'edit_distance' => 0 }
|
191
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {
|
192
|
+
'phonetic_match' => false, 'match' => true, 'edit_distance' => 0 }
|
193
|
+
# Phonetic match always works
|
120
194
|
s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
|
121
195
|
s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
|
122
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => true,
|
123
|
-
|
124
|
-
|
196
|
+
@tm.match_species(s1, s2).should == { 'phonetic_match' => true,
|
197
|
+
'match' => true, 'edit_distance' => 4 }
|
198
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should ==
|
199
|
+
{ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
|
200
|
+
# Phonetic match works with different endings
|
125
201
|
s1 = make_taxamatch_hash 'majorum'
|
126
202
|
s2 = make_taxamatch_hash 'majoris'
|
127
|
-
@tm.match_species(s1, s2).should == {
|
128
|
-
|
129
|
-
|
203
|
+
@tm.match_species(s1, s2).should == {
|
204
|
+
'phonetic_match' => true, 'match' => true, 'edit_distance' => 2 }
|
205
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should ==
|
206
|
+
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 2 }
|
207
|
+
# Distance 4 matches if first 3 chars are the same
|
130
208
|
s1 = make_taxamatch_hash 'majjjjorrrrr'
|
131
209
|
s2 = make_taxamatch_hash 'majjjjoraaaa'
|
132
|
-
@tm.match_species(s1, s2).should ==
|
133
|
-
|
210
|
+
@tm.match_species(s1, s2).should ==
|
211
|
+
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 4 }
|
212
|
+
# Should not match if Distance 4 matches and first 3 chars are not the same
|
134
213
|
s1 = make_taxamatch_hash 'majorrrrr'
|
135
214
|
s2 = make_taxamatch_hash 'marorraaa'
|
136
|
-
@tm.match_species(s1, s2).should == {
|
137
|
-
|
215
|
+
@tm.match_species(s1, s2).should == {
|
216
|
+
'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
|
217
|
+
# Distance 2 or 3 matches if first 1 char is the same
|
138
218
|
s1 = make_taxamatch_hash 'moooorrrr'
|
139
219
|
s2 = make_taxamatch_hash 'mooooraaa'
|
140
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => false,
|
141
|
-
|
220
|
+
@tm.match_species(s1, s2).should == { 'phonetic_match' => false,
|
221
|
+
'match' => true, 'edit_distance' => 3 }
|
222
|
+
# Should not match if Distance 2 or 3 and first 1 char is not the same
|
142
223
|
s1 = make_taxamatch_hash 'morrrr'
|
143
224
|
s2 = make_taxamatch_hash 'torraa'
|
144
|
-
@tm.match_species(s1, s2).should == {
|
145
|
-
|
225
|
+
@tm.match_species(s1, s2).should == {
|
226
|
+
'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
|
227
|
+
# Distance 1 will match anywhere
|
146
228
|
s1 = make_taxamatch_hash 'major'
|
147
229
|
s2 = make_taxamatch_hash 'rajor'
|
148
|
-
@tm.match_species(s1, s2).should == {
|
149
|
-
|
230
|
+
@tm.match_species(s1, s2).should == {
|
231
|
+
'phonetic_match' => false, 'match' => true, 'edit_distance' => 1 }
|
232
|
+
# Will not match if distance 3 and length is less than twice
|
233
|
+
# of the edit distance
|
150
234
|
s1 = make_taxamatch_hash 'marrr'
|
151
235
|
s2 = make_taxamatch_hash 'maaaa'
|
152
|
-
@tm.match_species(s1, s2).should == {
|
236
|
+
@tm.match_species(s1, s2).should == {
|
237
|
+
'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
|
153
238
|
end
|
154
239
|
|
155
240
|
it 'should match matches' do
|
156
|
-
#No trobule case
|
157
|
-
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
158
|
-
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
159
|
-
@tm.match_matches(gmatch, smatch).should ==
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'
|
167
|
-
|
168
|
-
gmatch = {'match' => true, 'phonetic_match' => true,
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
gmatch
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
241
|
+
# No trouble case
|
242
|
+
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
243
|
+
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
244
|
+
@tm.match_matches(gmatch, smatch).should ==
|
245
|
+
{ 'phonetic_match' => true, 'edit_distance' => 2, 'match' => true }
|
246
|
+
# Will not match if either genus or sp. epithet don't match
|
247
|
+
gmatch = { 'match' => false,
|
248
|
+
'phonetic_match' => false, 'edit_distance' => 1 }
|
249
|
+
smatch = { 'match' => true,
|
250
|
+
'phonetic_match' => true, 'edit_distance' => 1 }
|
251
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
252
|
+
'edit_distance' => 2, 'match' => false }
|
253
|
+
gmatch = { 'match' => true, 'phonetic_match' => true,
|
254
|
+
'edit_distance' => 1 }
|
255
|
+
smatch = { 'match' => false, 'phonetic_match' => false,
|
256
|
+
'edit_distance' => 1 }
|
257
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
258
|
+
'edit_distance' => 2, 'match' => false }
|
259
|
+
# Should not match if binomial edit distance > 4
|
260
|
+
# NOTE: EVEN with full phonetic match
|
261
|
+
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 3 }
|
262
|
+
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
263
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => true,
|
264
|
+
'edit_distance' => 5, 'match' => false }
|
265
|
+
# Should not have phonetic match if one of the components
|
266
|
+
# does not match phonetically
|
267
|
+
gmatch = { 'match' => true,
|
268
|
+
'phonetic_match' => false, 'edit_distance' => 1 }
|
269
|
+
smatch = { 'match' => true,
|
270
|
+
'phonetic_match' => true, 'edit_distance' => 1 }
|
271
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
272
|
+
'edit_distance' => 2, 'match' => true }
|
273
|
+
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
274
|
+
smatch = { 'match' => true,
|
275
|
+
'phonetic_match' => false, 'edit_distance' => 1 }
|
276
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
277
|
+
'edit_distance' => 2, 'match' => true }
|
278
|
+
# edit distance should be equal to the sum of the edit distances
|
279
|
+
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
280
|
+
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
281
|
+
@tm.match_matches(gmatch, smatch).should == {
|
282
|
+
'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true }
|
182
283
|
end
|
183
284
|
|
184
285
|
it 'should return only boolean values' do
|
185
286
|
@tm.taxamatch("AJLJljljlj", "sls").should_not be_nil
|
186
287
|
@tm.taxamatch('Olsl','a')
|
187
288
|
end
|
188
|
-
|
289
|
+
|
189
290
|
it "should not match authors from different parts of name" do
|
190
291
|
parser = Taxamatch::Atomizer.new
|
191
292
|
t = Taxamatch::Base.new
|
@@ -199,11 +300,11 @@ describe 'Taxamatch::Base' do
|
|
199
300
|
n8 = parser.parse "Betula alba Linnaeus alba Smith"
|
200
301
|
n9 = parser.parse "Betula alba Smith alba L."
|
201
302
|
n10 = parser.parse "Betula Linn."
|
202
|
-
#if one authorship is empty, return 0
|
303
|
+
# if one authorship is empty, return 0
|
203
304
|
t.match_authors(n1, n5).should == 0
|
204
305
|
t.match_authors(n5, n1).should == 0
|
205
306
|
t.match_authors(n5, n6).should == 0
|
206
|
-
#if authorship matches on different levels ignore
|
307
|
+
# if authorship matches on different levels ignore
|
207
308
|
t.match_authors(n7, n3).should == 0
|
208
309
|
t.match_authors(n8, n3).should == -1
|
209
310
|
t.match_authors(n2, n8).should == 0
|
@@ -227,29 +328,37 @@ describe 'Taxamatch::Base' do
|
|
227
328
|
res.should == 90
|
228
329
|
res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
|
229
330
|
res.should == 0
|
230
|
-
#found all authors, same year
|
231
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
331
|
+
# found all authors, same year
|
332
|
+
res = @am.authmatch(['Linnaeus', 'Muller'],
|
333
|
+
['Muller', 'Linnaeus'], [1766], [1766])
|
232
334
|
res.should == 100
|
233
|
-
#all authors, 1 year diff
|
234
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
335
|
+
# all authors, 1 year diff
|
336
|
+
res = @am.authmatch(['Linnaeus', 'Muller'],
|
337
|
+
['Muller', 'Linnaeus'], [1767], [1766])
|
235
338
|
res.should == 54
|
236
|
-
#year is not counted in
|
237
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
339
|
+
# year is not counted in
|
340
|
+
res = @am.authmatch(['Linnaeus', 'Muller'],
|
341
|
+
['Muller', 'Linnaeus'], [1767], [])
|
238
342
|
res.should == 94
|
239
|
-
#found all authors on one side, same year
|
240
|
-
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
343
|
+
# found all authors on one side, same year
|
344
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
345
|
+
['Muller', 'Linnaeus'], [1767], [1767])
|
241
346
|
res.should == 91
|
242
|
-
#found all authors on one side, 1 year diff
|
243
|
-
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
347
|
+
# found all authors on one side, 1 year diff
|
348
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
349
|
+
['Muller', 'Linnaeus'], [1766], [1767])
|
244
350
|
res.should == 51
|
245
|
-
#found all authors on one side, year does not count
|
246
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
351
|
+
# found all authors on one side, year does not count
|
352
|
+
res = @am.authmatch(['Linnaeus', 'Muller'],
|
353
|
+
['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
|
247
354
|
res.should == 90
|
248
|
-
#found some authors
|
249
|
-
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
355
|
+
# found some authors
|
356
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
357
|
+
['Muller', 'Kurtz', 'Stepanov'], [1766], [])
|
250
358
|
res.should == 67
|
251
|
-
#if year does not match or not present no match for previous case
|
252
|
-
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
359
|
+
# if year does not match or not present no match for previous case
|
360
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
361
|
+
['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
|
253
362
|
res.should == 0
|
254
363
|
end
|
255
364
|
|
@@ -261,22 +370,29 @@ describe 'Taxamatch::Base' do
|
|
261
370
|
end
|
262
371
|
|
263
372
|
it 'should remove duplicate authors' do
|
264
|
-
#Li submatches Linnaeus and it its size 3 is big enought to remove
|
265
|
-
#Muller is identical
|
266
|
-
res = @am.remove_duplicate_authors(['Lin', 'Muller'],
|
373
|
+
# Lin submatches Linnaeus and its size 3 is big enough to remove
|
374
|
+
# Linnaeus; Muller is identical
|
375
|
+
res = @am.remove_duplicate_authors(['Lin', 'Muller'],
|
376
|
+
['Linnaeus', 'Muller'])
|
267
377
|
res.should == [[], []]
|
268
|
-
#same in different order
|
269
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
378
|
+
# same in different order
|
379
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
380
|
+
['Linn', 'Muller'])
|
270
381
|
res.should == [[], []]
|
271
|
-
#auth Li submatches Linnaeus, but Li size less then 3
|
272
|
-
|
382
|
+
# auth Li submatches Linnaeus, but Li size less than 3
|
383
|
+
# required to remove Linnaeus
|
384
|
+
res = @am.remove_duplicate_authors(['Dem', 'Li'],
|
385
|
+
['Linnaeus', 'Stepanov'])
|
273
386
|
res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
|
274
|
-
#fuzzy match
|
275
|
-
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'],
|
387
|
+
# fuzzy match
|
388
|
+
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'],
|
389
|
+
['Linnaeus', 'Stepanov'])
|
276
390
|
res.should == [["Dem"], ["Stepanov"]]
|
277
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
391
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
392
|
+
['L', 'Kenn'])
|
278
393
|
res.should == [['Linnaeus', 'Muller'], ['Kenn']]
|
279
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
394
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
395
|
+
['Muller', 'Linnaeus', 'Kurtz'])
|
280
396
|
res.should == [[],['Kurtz']]
|
281
397
|
end
|
282
398
|
|
@@ -288,5 +404,3 @@ describe 'Taxamatch::Base' do
|
|
288
404
|
end
|
289
405
|
|
290
406
|
end
|
291
|
-
|
292
|
-
|