taxamatch_rb 0.9.10 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +5 -2
- data/Gemfile +14 -16
- data/Gemfile.lock +18 -19
- data/LICENSE +1 -1
- data/{README.rdoc → README.md} +26 -7
- data/Rakefile +11 -9
- data/VERSION +1 -1
- data/lib/taxamatch_rb.rb +76 -43
- data/lib/taxamatch_rb/atomizer.rb +19 -10
- data/lib/taxamatch_rb/authmatch.rb +29 -16
- data/lib/taxamatch_rb/normalizer.rb +4 -4
- data/lib/taxamatch_rb/phonetizer.rb +9 -8
- data/spec/taxamatch_rb_spec.rb +223 -109
- data/taxamatch_rb.gemspec +11 -41
- metadata +11 -171
@@ -1,15 +1,19 @@
|
|
1
|
-
# Algorithms for Taxamatch::Authmatch
|
1
|
+
# Algorithms for Taxamatch::Authmatch
|
2
|
+
# are developed by Patrick Leary of uBio and EOL fame
|
2
3
|
|
3
4
|
module Taxamatch
|
4
5
|
class Authmatch
|
5
6
|
|
6
7
|
def self.authmatch(authors1, authors2, years1, years2)
|
7
|
-
unique_authors1, unique_authors2 =
|
8
|
+
unique_authors1, unique_authors2 =
|
9
|
+
remove_duplicate_authors(authors1, authors2)
|
8
10
|
year_difference = compare_years(years1, years2)
|
9
|
-
get_score(authors1, unique_authors1,
|
11
|
+
get_score(authors1, unique_authors1,
|
12
|
+
authors2, unique_authors2, year_difference)
|
10
13
|
end
|
11
|
-
|
12
|
-
def self.get_score(authors1, unique_authors1,
|
14
|
+
|
15
|
+
def self.get_score(authors1, unique_authors1,
|
16
|
+
authors2, unique_authors2, year_diff)
|
13
17
|
count_before = authors1.size + authors2.size
|
14
18
|
count_after = unique_authors1.size + unique_authors2.size
|
15
19
|
score = 0
|
@@ -18,7 +22,7 @@ module Taxamatch
|
|
18
22
|
if year_diff == 0
|
19
23
|
score = 100
|
20
24
|
elsif year_diff == 1
|
21
|
-
score = 54
|
25
|
+
score = 54
|
22
26
|
end
|
23
27
|
else
|
24
28
|
score = 94
|
@@ -35,11 +39,11 @@ module Taxamatch
|
|
35
39
|
end
|
36
40
|
else
|
37
41
|
score = ((1 - count_after.to_f/count_before.to_f) * 100).round
|
38
|
-
score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
|
42
|
+
score = 0 unless year_diff == nil || (year_diff && year_diff == 0)
|
39
43
|
end
|
40
44
|
score > 50 ? score : 0
|
41
45
|
end
|
42
|
-
|
46
|
+
|
43
47
|
def self.remove_duplicate_authors(authors1, authors2)
|
44
48
|
unique_authors1 = authors1.dup
|
45
49
|
unique_authors2 = authors2.dup
|
@@ -48,12 +52,14 @@ module Taxamatch
|
|
48
52
|
au1_match = au2_match = false
|
49
53
|
if au1 == au2
|
50
54
|
au1_match = au2_match = true
|
51
|
-
elsif au1 == au2[0...au1.size]
|
55
|
+
elsif au1 == au2[0...au1.size]
|
52
56
|
au1_match = true
|
53
57
|
elsif au1[0...au2.size] == au2
|
54
58
|
au2_match = true
|
55
59
|
end
|
56
|
-
if (au1.size >= 3 && au1_match) ||
|
60
|
+
if (au1.size >= 3 && au1_match) ||
|
61
|
+
(au2.size >= 3 && au2_match) ||
|
62
|
+
(au1_match && au2_match)
|
57
63
|
unique_authors1.delete au1
|
58
64
|
unique_authors2.delete au2
|
59
65
|
elsif au1_match
|
@@ -61,8 +67,11 @@ module Taxamatch
|
|
61
67
|
elsif au2_match
|
62
68
|
unique_authors2.delete au2
|
63
69
|
else
|
64
|
-
#TODO: masking a bug in damerau levenshtsin
|
65
|
-
|
70
|
+
# TODO: masking a bug in the Damerau-Levenshtein
|
71
|
+
# mod which appears when comparing a 1-letter string to a longer string
|
72
|
+
if au1.size > 1 &&
|
73
|
+
au2.size > 1 &&
|
74
|
+
self.fuzzy_match_authors(au1, au2)
|
66
75
|
unique_authors1.delete au1
|
67
76
|
unique_authors2.delete au2
|
68
77
|
end
|
@@ -71,18 +80,22 @@ module Taxamatch
|
|
71
80
|
end
|
72
81
|
[unique_authors1, unique_authors2]
|
73
82
|
end
|
74
|
-
|
83
|
+
|
75
84
|
def self.fuzzy_match_authors(author1, author2)
|
76
85
|
au1_length = author1.size
|
77
86
|
au2_length = author2.size
|
78
87
|
dlm = DamerauLevenshtein
|
79
|
-
|
80
|
-
|
88
|
+
#get around a bug in C code, but it really has to be fixed
|
89
|
+
ed = dlm.distance(author1, author2,1,3)
|
90
|
+
(ed <= 3 && ([au1_length, au2_length].min > ed * 2) &&
|
91
|
+
(ed < 2 || author1[0] == author2[0]))
|
81
92
|
end
|
82
93
|
|
83
94
|
def self.compare_years(years1, years2)
|
84
95
|
return 0 if years1 == [] && years2 == []
|
85
|
-
|
96
|
+
if years1.size == 1 && years2.size == 1
|
97
|
+
return (years1[0].to_i - years2[0].to_i).abs
|
98
|
+
end
|
86
99
|
nil
|
87
100
|
end
|
88
101
|
end
|
@@ -1,16 +1,16 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
module Taxamatch
|
4
|
-
|
4
|
+
|
5
5
|
module Normalizer
|
6
6
|
def self.normalize(string)
|
7
7
|
utf8_to_ascii(string.strip.upcase).gsub(/[^\x00-\x7F]/,'?')
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
def self.normalize_word(word)
|
11
11
|
self.normalize(word).gsub(/[^A-Z0-9\-]/, '').strip
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
14
|
def self.normalize_author(string)
|
15
15
|
self.normalize(string).gsub(/[^A-Z]/, ' ').gsub(/[\s]{2,}/, ' ').strip
|
16
16
|
end
|
@@ -20,7 +20,7 @@ module Taxamatch
|
|
20
20
|
year_int = nil unless year_int.between?(1757, Time.now.year + 1)
|
21
21
|
year_int
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
|
25
25
|
private
|
26
26
|
def self.utf8_to_ascii(string)
|
@@ -2,11 +2,11 @@
|
|
2
2
|
module Taxamatch
|
3
3
|
|
4
4
|
module Phonetizer
|
5
|
-
|
5
|
+
|
6
6
|
def self.phonetize(a_word, normalize_ending = false)
|
7
7
|
self.near_match(a_word, normalize_ending)
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
def self.near_match(a_word, normalize_ending = false)
|
11
11
|
a_word = a_word.strip rescue ''
|
12
12
|
return '' if a_word == ''
|
@@ -50,7 +50,7 @@ module Taxamatch
|
|
50
50
|
a_word = 'Z' + a_word[1..-1]
|
51
51
|
end
|
52
52
|
first_char = a_word.split('')[0]
|
53
|
-
rest_chars = a_word.split('')[1..-1].join('')
|
53
|
+
rest_chars = a_word.split('')[1..-1].join('')
|
54
54
|
rest_chars.gsub!('AE', 'I')
|
55
55
|
rest_chars.gsub!('IA', 'A')
|
56
56
|
rest_chars.gsub!('OE', 'I')
|
@@ -59,21 +59,22 @@ module Taxamatch
|
|
59
59
|
rest_chars.gsub!('H', '')
|
60
60
|
rest_chars.tr!('EOUYKZ', 'IAIICS')
|
61
61
|
a_word = (first_char + rest_chars).squeeze
|
62
|
-
|
62
|
+
|
63
63
|
if normalize_ending && a_word.size > 4
|
64
64
|
a_word = self.normalize_ending(a_word)
|
65
65
|
end
|
66
66
|
a_word
|
67
67
|
end
|
68
|
-
|
68
|
+
|
69
69
|
def self.normalize_ending(a_word)
|
70
|
-
# -- deal with variant endings
|
70
|
+
# -- deal with variant endings
|
71
|
+
# -is (includes -us, -ys, -es), -im (was -um), -as (-os)
|
71
72
|
# -- at the end of a string translate all to -a
|
72
73
|
a_word.gsub!(/IS$/, 'A')
|
73
74
|
a_word.gsub!(/IM$/, 'A')
|
74
75
|
a_word.gsub(/AS$/, 'A')
|
75
76
|
end
|
76
|
-
|
77
|
+
|
77
78
|
end
|
78
79
|
|
79
|
-
end
|
80
|
+
end
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -7,25 +7,81 @@ describe 'Atomizer' do
|
|
7
7
|
end
|
8
8
|
|
9
9
|
it 'should parse uninomials' do
|
10
|
-
@parser.parse('Betula').should == {:all_authors=>[], :all_years
|
11
|
-
|
10
|
+
@parser.parse('Betula').should == { :all_authors => [], :all_years => [],
|
11
|
+
:canonical_form => "Betula", :uninomial => { :string => "Betula",
|
12
|
+
:normalized => 'BETULA', :phonetized => "BITILA", :authors => [],
|
13
|
+
:years => [], :normalized_authors => [] } }
|
14
|
+
@parser.parse('Ærenea Lacordaire, 1872').should == {
|
15
|
+
:all_authors => ["LACORDAIRE"], :all_years => [1872],
|
16
|
+
:canonical_form => "Aerenea", :uninomial => { :string => "Aerenea",
|
17
|
+
:normalized => "AERENEA", :phonetized => "ERINIA",
|
18
|
+
:authors => ["Lacordaire"], :years => [1872],
|
19
|
+
:normalized_authors => ["LACORDAIRE"] } }
|
12
20
|
end
|
13
21
|
|
14
22
|
it 'should parse binomials' do
|
15
|
-
@parser.parse('Leœptura laetifica Dow, 1913').should == {
|
23
|
+
@parser.parse('Leœptura laetifica Dow, 1913').should == {
|
24
|
+
:all_authors => ["DOW"], :all_years => [1913],
|
25
|
+
:canonical_form => "Leoeptura laetifica", :genus => {
|
26
|
+
:string => "Leoeptura", :normalized => "LEOEPTURA",
|
27
|
+
:phonetized => "LIPTIRA", :authors => [], :years => [],
|
28
|
+
:normalized_authors => []}, :species => {
|
29
|
+
:string => "laetifica", :normalized => "LAETIFICA",
|
30
|
+
:phonetized => "LITIFICA", :authors => ["Dow"],
|
31
|
+
:years => [1913], :normalized_authors => ["DOW"] } }
|
16
32
|
end
|
17
33
|
|
18
34
|
it 'should parse trinomials' do
|
19
|
-
@parser.parse('Hydnellum scrobiculatum zonatum
|
35
|
+
@parser.parse('Hydnellum scrobiculatum zonatum ' +
|
36
|
+
'(Banker) D. Hall et D.E. Stuntz 1972').should == {
|
37
|
+
:all_authors => ["BANKER", "D HALL", "D E STUNTZ"], :all_years => [1972],
|
38
|
+
:canonical_form => "Hydnellum scrobiculatum zonatum", :genus=>{
|
39
|
+
:string => "Hydnellum", :normalized => "HYDNELLUM",
|
40
|
+
:phonetized => "HIDNILIM", :authors => [], :years => [],
|
41
|
+
:normalized_authors => [] }, :species => { :string => "scrobiculatum",
|
42
|
+
:normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
|
43
|
+
:authors => [], :years => [], :normalized_authors => [] },
|
44
|
+
:infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
|
45
|
+
:phonetized => "ZANATA", :authors => ["Banker", "D. Hall", "D.E. Stuntz"],
|
46
|
+
:years => [1972], :normalized_authors => ["BANKER", "D HALL",
|
47
|
+
"D E STUNTZ"] }] }
|
20
48
|
end
|
21
49
|
|
22
50
|
it 'should normalize years to integers' do
|
23
51
|
future_year = Time.now.year + 10
|
24
|
-
@parser.parse("Hydnellum scrobiculatum Kern #{future_year}
|
52
|
+
@parser.parse("Hydnellum scrobiculatum Kern #{future_year} " +
|
53
|
+
"zonatum (Banker) D. Hall et D.E. Stuntz 1972?").should == {
|
54
|
+
:all_authors => ["KERN", "BANKER", "D HALL", "D E STUNTZ"],
|
55
|
+
:all_years => [1972],
|
56
|
+
:canonical_form => "Hydnellum scrobiculatum zonatum", :genus => {
|
57
|
+
:string => "Hydnellum", :normalized => "HYDNELLUM",
|
58
|
+
:phonetized => "HIDNILIM", :authors => [], :years => [],
|
59
|
+
:normalized_authors => [] }, :species => { :string => "scrobiculatum",
|
60
|
+
:normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
|
61
|
+
:authors => ["Kern"], :years => [], :normalized_authors => ["KERN"] },
|
62
|
+
:infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
|
63
|
+
:phonetized => "ZANATA", :authors =>
|
64
|
+
["Banker", "D. Hall", "D.E. Stuntz"], :years => [1972],
|
65
|
+
:normalized_authors => ["BANKER", "D HALL", "D E STUNTZ"] }] }
|
25
66
|
end
|
26
67
|
|
27
68
|
it 'should normalize names with abbreviated genus after cf.' do
|
28
|
-
@parser.parse('Unio cf. U. alba').should == {:all_authors
|
69
|
+
@parser.parse('Unio cf. U. alba').should == { :all_authors => [],
|
70
|
+
:all_years => [], :canonical_form => "Unio",
|
71
|
+
:genus => { :string => "Unio", :normalized => "UNIO",
|
72
|
+
:phonetized => "UNIA", :authors => [], :years => [],
|
73
|
+
:normalized_authors => [] } }
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'should parse names which broke it before' do
|
77
|
+
['Parus caeruleus species complex',
|
78
|
+
'Euxoa nr. idahoensis sp. 1clay',
|
79
|
+
'Cetraria islandica ? islandica',
|
80
|
+
'Buteo borealis ? ventralis'].each do |n|
|
81
|
+
res = @parser.parse(n)
|
82
|
+
res.class.should == Hash
|
83
|
+
res.empty?.should be_false
|
84
|
+
end
|
29
85
|
end
|
30
86
|
end
|
31
87
|
|
@@ -38,12 +94,14 @@ describe 'Taxamatch::Normalizer' do
|
|
38
94
|
Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
|
39
95
|
Taxamatch::Normalizer.normalize('Fallé€n').should == 'FALLE?N'
|
40
96
|
Taxamatch::Normalizer.normalize('Fallén привет').should == 'FALLEN ??????'
|
41
|
-
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should ==
|
97
|
+
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should ==
|
98
|
+
'CHORIOZOPELLA TRAGARDHI'
|
42
99
|
Taxamatch::Normalizer.normalize('×Zygomena').should == 'xZYGOMENA'
|
43
100
|
end
|
44
101
|
|
45
102
|
it 'should normalize words' do
|
46
|
-
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should ==
|
103
|
+
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should ==
|
104
|
+
'L-3EOEPTURA'
|
47
105
|
end
|
48
106
|
end
|
49
107
|
|
@@ -53,7 +111,8 @@ describe 'Taxamatch::Base' do
|
|
53
111
|
end
|
54
112
|
|
55
113
|
it 'should get txt tests' do
|
56
|
-
|
114
|
+
test_file = File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt'
|
115
|
+
read_test_file(test_file, 4) do |y|
|
57
116
|
if y
|
58
117
|
y[2] = y[2] == 'true' ? true : false
|
59
118
|
res = @tm.taxamatch(y[0], y[1], false)
|
@@ -65,127 +124,169 @@ describe 'Taxamatch::Base' do
|
|
65
124
|
end
|
66
125
|
|
67
126
|
it 'should work with names that cannot be parsed' do
|
68
|
-
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921',
|
127
|
+
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921',
|
128
|
+
'Quadraspidiotus ostreaeformis Curtis)')
|
69
129
|
res = false
|
70
130
|
end
|
71
131
|
|
72
132
|
it 'should compare genera' do
|
73
|
-
#edit distance 1 always match
|
133
|
+
# edit distance 1 always match
|
74
134
|
g1 = make_taxamatch_hash 'Plantago'
|
75
135
|
g2 = make_taxamatch_hash 'Plantagon'
|
76
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
77
|
-
|
136
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
137
|
+
'edit_distance' => 1, 'match' => true }
|
138
|
+
# edit_distance above threshold does not match
|
78
139
|
g1 = make_taxamatch_hash 'Plantago'
|
79
140
|
g2 = make_taxamatch_hash 'This shouldnt match'
|
80
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
81
|
-
|
141
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
142
|
+
'match' => false, 'edit_distance' => 4 }
|
143
|
+
# phonetic_match matches
|
82
144
|
g1 = make_taxamatch_hash 'Plantagi'
|
83
145
|
g2 = make_taxamatch_hash 'Plantagy'
|
84
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => true,
|
85
|
-
|
86
|
-
|
146
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
|
147
|
+
'edit_distance' => 1, 'match' => true }
|
148
|
+
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
|
149
|
+
'phonetic_match' => false, 'edit_distance' => 1, 'match' => true }
|
150
|
+
# distance 1 in first letter also matches
|
87
151
|
g1 = make_taxamatch_hash 'Xantheri'
|
88
152
|
g2 = make_taxamatch_hash 'Pantheri'
|
89
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
90
|
-
|
153
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
154
|
+
'edit_distance' => 1, 'match' => true }
|
155
|
+
# phonetic match trumps everything
|
91
156
|
g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
|
92
157
|
g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
|
93
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => true,
|
94
|
-
|
95
|
-
|
158
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
|
159
|
+
'edit_distance' => 4, 'match' => true }
|
160
|
+
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
|
161
|
+
'phonetic_match' => false, 'edit_distance' => 4, 'match' => false }
|
162
|
+
# same first letter and distance 2 should match
|
96
163
|
g1 = make_taxamatch_hash 'Xaaaantherii'
|
97
164
|
g2 = make_taxamatch_hash 'Xaaaantherrr'
|
98
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
99
|
-
|
165
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
166
|
+
'match' => true, 'edit_distance' => 2 }
|
167
|
+
# First letter is the same and distance is 3 should match, no phonetic match
|
100
168
|
g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
|
101
169
|
g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
|
102
|
-
@tm.match_genera(g1, g2).should ==
|
103
|
-
|
170
|
+
@tm.match_genera(g1, g2).should ==
|
171
|
+
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 3 }
|
172
|
+
# Should not match if one of words is shorter than 2x edit
|
173
|
+
# distance and distance is 2 or 3
|
104
174
|
g1 = make_taxamatch_hash 'Xant'
|
105
175
|
g2 = make_taxamatch_hash 'Xanthe'
|
106
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
107
|
-
|
176
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
177
|
+
'match' => false, 'edit_distance' => 2 }
|
178
|
+
# Should not match if edit distance > 3 and no phonetic match
|
108
179
|
g1 = make_taxamatch_hash 'Xantheriiii'
|
109
180
|
g2 = make_taxamatch_hash 'Xantherrrrr'
|
110
|
-
@tm.match_genera(g1, g2).should == {'phonetic_match' => false,
|
181
|
+
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
182
|
+
'match' => false, 'edit_distance' => 4 }
|
111
183
|
end
|
112
184
|
|
113
185
|
it 'should compare species' do
|
114
|
-
#Exact match
|
186
|
+
# Exact match
|
115
187
|
s1 = make_taxamatch_hash 'major'
|
116
188
|
s2 = make_taxamatch_hash 'major'
|
117
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => true,
|
118
|
-
|
119
|
-
|
189
|
+
@tm.match_species(s1, s2).should == { 'phonetic_match' => true,
|
190
|
+
'match' => true, 'edit_distance' => 0 }
|
191
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {
|
192
|
+
'phonetic_match' => false, 'match' => true, 'edit_distance' => 0 }
|
193
|
+
# Phonetic match always works
|
120
194
|
s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
|
121
195
|
s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
|
122
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => true,
|
123
|
-
|
124
|
-
|
196
|
+
@tm.match_species(s1, s2).should == { 'phonetic_match' => true,
|
197
|
+
'match' => true, 'edit_distance' => 4 }
|
198
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should ==
|
199
|
+
{ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
|
200
|
+
# Phonetic match works with different endings
|
125
201
|
s1 = make_taxamatch_hash 'majorum'
|
126
202
|
s2 = make_taxamatch_hash 'majoris'
|
127
|
-
@tm.match_species(s1, s2).should == {
|
128
|
-
|
129
|
-
|
203
|
+
@tm.match_species(s1, s2).should == {
|
204
|
+
'phonetic_match' => true, 'match' => true, 'edit_distance' => 2 }
|
205
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should ==
|
206
|
+
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 2 }
|
207
|
+
# Distance 4 matches if first 3 chars are the same
|
130
208
|
s1 = make_taxamatch_hash 'majjjjorrrrr'
|
131
209
|
s2 = make_taxamatch_hash 'majjjjoraaaa'
|
132
|
-
@tm.match_species(s1, s2).should ==
|
133
|
-
|
210
|
+
@tm.match_species(s1, s2).should ==
|
211
|
+
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 4 }
|
212
|
+
# Should not match if distance is 4 and first 3 chars are not the same
|
134
213
|
s1 = make_taxamatch_hash 'majorrrrr'
|
135
214
|
s2 = make_taxamatch_hash 'marorraaa'
|
136
|
-
@tm.match_species(s1, s2).should == {
|
137
|
-
|
215
|
+
@tm.match_species(s1, s2).should == {
|
216
|
+
'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
|
217
|
+
# Distance 2 or 3 matches if first 1 char is the same
|
138
218
|
s1 = make_taxamatch_hash 'moooorrrr'
|
139
219
|
s2 = make_taxamatch_hash 'mooooraaa'
|
140
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => false,
|
141
|
-
|
220
|
+
@tm.match_species(s1, s2).should == { 'phonetic_match' => false,
|
221
|
+
'match' => true, 'edit_distance' => 3 }
|
222
|
+
# Should not match if Distance 2 or 3 and first 1 char is not the same
|
142
223
|
s1 = make_taxamatch_hash 'morrrr'
|
143
224
|
s2 = make_taxamatch_hash 'torraa'
|
144
|
-
@tm.match_species(s1, s2).should == {
|
145
|
-
|
225
|
+
@tm.match_species(s1, s2).should == {
|
226
|
+
'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
|
227
|
+
# Distance 1 will match anywhere
|
146
228
|
s1 = make_taxamatch_hash 'major'
|
147
229
|
s2 = make_taxamatch_hash 'rajor'
|
148
|
-
@tm.match_species(s1, s2).should == {
|
149
|
-
|
230
|
+
@tm.match_species(s1, s2).should == {
|
231
|
+
'phonetic_match' => false, 'match' => true, 'edit_distance' => 1 }
|
232
|
+
# Will not match if distance is 3 and length is less than twice
|
233
|
+
# of the edit distance
|
150
234
|
s1 = make_taxamatch_hash 'marrr'
|
151
235
|
s2 = make_taxamatch_hash 'maaaa'
|
152
|
-
@tm.match_species(s1, s2).should == {
|
236
|
+
@tm.match_species(s1, s2).should == {
|
237
|
+
'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
|
153
238
|
end
|
154
239
|
|
155
240
|
it 'should match matches' do
|
156
|
-
#No trobule case
|
157
|
-
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
158
|
-
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
159
|
-
@tm.match_matches(gmatch, smatch).should ==
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'
|
167
|
-
|
168
|
-
gmatch = {'match' => true, 'phonetic_match' => true,
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
gmatch
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
241
|
+
# No trouble case
|
242
|
+
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
243
|
+
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
244
|
+
@tm.match_matches(gmatch, smatch).should ==
|
245
|
+
{ 'phonetic_match' => true, 'edit_distance' => 2, 'match' => true }
|
246
|
+
# Will not match if either genus or sp. epithet dont match
|
247
|
+
gmatch = { 'match' => false,
|
248
|
+
'phonetic_match' => false, 'edit_distance' => 1 }
|
249
|
+
smatch = { 'match' => true,
|
250
|
+
'phonetic_match' => true, 'edit_distance' => 1 }
|
251
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
252
|
+
'edit_distance' => 2, 'match' => false }
|
253
|
+
gmatch = { 'match' => true, 'phonetic_match' => true,
|
254
|
+
'edit_distance' => 1 }
|
255
|
+
smatch = { 'match' => false, 'phonetic_match' => false,
|
256
|
+
'edit_distance' => 1 }
|
257
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
258
|
+
'edit_distance' => 2, 'match' => false }
|
259
|
+
# Should not match if binomial edit distance > 4
|
260
|
+
# NOTE: EVEN with full phonetic match
|
261
|
+
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 3 }
|
262
|
+
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
263
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => true,
|
264
|
+
'edit_distance' => 5, 'match' => false }
|
265
|
+
# Should not have phonetic match if one of the components
|
266
|
+
# does not match phonetically
|
267
|
+
gmatch = { 'match' => true,
|
268
|
+
'phonetic_match' => false, 'edit_distance' => 1 }
|
269
|
+
smatch = { 'match' => true,
|
270
|
+
'phonetic_match' => true, 'edit_distance' => 1 }
|
271
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
272
|
+
'edit_distance' => 2, 'match' => true }
|
273
|
+
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
274
|
+
smatch = { 'match' => true,
|
275
|
+
'phonetic_match' => false, 'edit_distance' => 1 }
|
276
|
+
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
277
|
+
'edit_distance' => 2, 'match' => true }
|
278
|
+
# edit distance should equal the sum of the edit distances
|
279
|
+
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
280
|
+
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
281
|
+
@tm.match_matches(gmatch, smatch).should == {
|
282
|
+
'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true }
|
182
283
|
end
|
183
284
|
|
184
285
|
it 'should return only boolean values' do
|
185
286
|
@tm.taxamatch("AJLJljljlj", "sls").should_not be_nil
|
186
287
|
@tm.taxamatch('Olsl','a')
|
187
288
|
end
|
188
|
-
|
289
|
+
|
189
290
|
it "should not match authors from different parts of name" do
|
190
291
|
parser = Taxamatch::Atomizer.new
|
191
292
|
t = Taxamatch::Base.new
|
@@ -199,11 +300,11 @@ describe 'Taxamatch::Base' do
|
|
199
300
|
n8 = parser.parse "Betula alba Linnaeus alba Smith"
|
200
301
|
n9 = parser.parse "Betula alba Smith alba L."
|
201
302
|
n10 = parser.parse "Betula Linn."
|
202
|
-
#if one authorship is empty, return 0
|
303
|
+
# if one authorship is empty, return 0
|
203
304
|
t.match_authors(n1, n5).should == 0
|
204
305
|
t.match_authors(n5, n1).should == 0
|
205
306
|
t.match_authors(n5, n6).should == 0
|
206
|
-
#if authorship matches on different levels ignore
|
307
|
+
# if authorship matches on different levels ignore
|
207
308
|
t.match_authors(n7, n3).should == 0
|
208
309
|
t.match_authors(n8, n3).should == -1
|
209
310
|
t.match_authors(n2, n8).should == 0
|
@@ -227,29 +328,37 @@ describe 'Taxamatch::Base' do
|
|
227
328
|
res.should == 90
|
228
329
|
res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
|
229
330
|
res.should == 0
|
230
|
-
#found all authors, same year
|
231
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
331
|
+
# found all authors, same year
|
332
|
+
res = @am.authmatch(['Linnaeus', 'Muller'],
|
333
|
+
['Muller', 'Linnaeus'], [1766], [1766])
|
232
334
|
res.should == 100
|
233
|
-
#all authors, 1 year diff
|
234
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
335
|
+
# all authors, 1 year diff
|
336
|
+
res = @am.authmatch(['Linnaeus', 'Muller'],
|
337
|
+
['Muller', 'Linnaeus'], [1767], [1766])
|
235
338
|
res.should == 54
|
236
|
-
#year is not counted in
|
237
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
339
|
+
# year is not counted in
|
340
|
+
res = @am.authmatch(['Linnaeus', 'Muller'],
|
341
|
+
['Muller', 'Linnaeus'], [1767], [])
|
238
342
|
res.should == 94
|
239
|
-
#found all authors on one side, same year
|
240
|
-
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
343
|
+
# found all authors on one side, same year
|
344
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
345
|
+
['Muller', 'Linnaeus'], [1767], [1767])
|
241
346
|
res.should == 91
|
242
|
-
#found all authors on one side, 1 year diff
|
243
|
-
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
347
|
+
# found all authors on one side, 1 year diff
|
348
|
+
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
349
|
+
['Muller', 'Linnaeus'], [1766], [1767])
|
244
350
|
res.should == 51
|
245
|
-
#found all authors on one side, year does not count
|
246
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
351
|
+
# found all authors on one side, year does not count
|
352
|
+
res = @am.authmatch(['Linnaeus', 'Muller'],
|
353
|
+
['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
|
247
354
|
res.should == 90
|
248
|
-
#found some authors
|
249
|
-
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
355
|
+
# found some authors
|
356
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
357
|
+
['Muller', 'Kurtz', 'Stepanov'], [1766], [])
|
250
358
|
res.should == 67
|
251
|
-
#if year does not match or not present no match for previous case
|
252
|
-
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
359
|
+
# if year does not match or not present no match for previous case
|
360
|
+
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
361
|
+
['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
|
253
362
|
res.should == 0
|
254
363
|
end
|
255
364
|
|
@@ -261,22 +370,29 @@ describe 'Taxamatch::Base' do
|
|
261
370
|
end
|
262
371
|
|
263
372
|
it 'should remove duplicate authors' do
|
264
|
-
#Li submatches Linnaeus and it its size 3 is big enought to remove
|
265
|
-
#Muller is identical
|
266
|
-
res = @am.remove_duplicate_authors(['Lin', 'Muller'],
|
373
|
+
# Lin submatches Linnaeus and its size 3 is big enough to remove
|
374
|
+
# Linnaeus Muller is identical
|
375
|
+
res = @am.remove_duplicate_authors(['Lin', 'Muller'],
|
376
|
+
['Linnaeus', 'Muller'])
|
267
377
|
res.should == [[], []]
|
268
|
-
#same in different order
|
269
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
378
|
+
# same in different order
|
379
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
380
|
+
['Linn', 'Muller'])
|
270
381
|
res.should == [[], []]
|
271
|
-
#auth Li submatches Linnaeus, but Li size less then 3
|
272
|
-
|
382
|
+
# auth Li submatches Linnaeus, but Li size is less than the 3
|
383
|
+
# required to remove Linnaeus
|
384
|
+
res = @am.remove_duplicate_authors(['Dem', 'Li'],
|
385
|
+
['Linnaeus', 'Stepanov'])
|
273
386
|
res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
|
274
|
-
#fuzzy match
|
275
|
-
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'],
|
387
|
+
# fuzzy match
|
388
|
+
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'],
|
389
|
+
['Linnaeus', 'Stepanov'])
|
276
390
|
res.should == [["Dem"], ["Stepanov"]]
|
277
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
391
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
392
|
+
['L', 'Kenn'])
|
278
393
|
res.should == [['Linnaeus', 'Muller'], ['Kenn']]
|
279
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
394
|
+
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
395
|
+
['Muller', 'Linnaeus', 'Kurtz'])
|
280
396
|
res.should == [[],['Kurtz']]
|
281
397
|
end
|
282
398
|
|
@@ -288,5 +404,3 @@ describe 'Taxamatch::Base' do
|
|
288
404
|
end
|
289
405
|
|
290
406
|
end
|
291
|
-
|
292
|
-
|