taxamatch_rb 0.8.7 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/taxamatch_rb.rb +33 -9
- data/lib/taxamatch_rb/authmatch.rb +1 -1
- data/spec/taxamatch_rb_spec.rb +38 -4
- data/taxamatch_rb.gemspec +2 -2
- metadata +27 -27
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.7
|
1
|
+
0.9.0
|
data/lib/taxamatch_rb.rb
CHANGED
@@ -36,7 +36,7 @@ module Taxamatch
|
|
36
36
|
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
37
37
|
result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
|
38
38
|
if result && result['match']
|
39
|
-
result['match'] = match_authors(preparsed_1, preparsed_2) ==
|
39
|
+
result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ? false : true
|
40
40
|
end
|
41
41
|
return result
|
42
42
|
end
|
@@ -63,12 +63,17 @@ module Taxamatch
|
|
63
63
|
match_hash
|
64
64
|
end
|
65
65
|
|
66
|
-
def match_genera(genus1, genus2)
|
66
|
+
def match_genera(genus1, genus2, opts = {})
|
67
67
|
genus1_length = genus1[:normalized].size
|
68
68
|
genus2_length = genus2[:normalized].size
|
69
|
+
opts = {:with_phonetic_match => true}.merge(opts)
|
69
70
|
min_length = [genus1_length, genus2_length].min
|
71
|
+
unless opts[:with_phonetic_match]
|
72
|
+
genus1[:phonetized] = "A"
|
73
|
+
genus2[:phonetized] = "B"
|
74
|
+
end
|
70
75
|
match = false
|
71
|
-
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],1,3) #TODO put block = 2
|
76
|
+
ed = @dlm.distance(genus1[:normalized], genus2[:normalized], 1, 3) #TODO put block = 2
|
72
77
|
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
|
73
78
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
74
79
|
|
@@ -76,10 +81,15 @@ module Taxamatch
|
|
76
81
|
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
77
82
|
end
|
78
83
|
|
79
|
-
def match_species(sp1, sp2)
|
84
|
+
def match_species(sp1, sp2, opts = {})
|
80
85
|
sp1_length = sp1[:normalized].size
|
81
86
|
sp2_length = sp2[:normalized].size
|
87
|
+
opts = {:with_phonetic_match => true}.merge(opts)
|
82
88
|
min_length = [sp1_length, sp2_length].min
|
89
|
+
unless opts[:with_phonetic_match]
|
90
|
+
sp1[:phonetized] = "A"
|
91
|
+
sp2[:phonetized] = "B"
|
92
|
+
end
|
83
93
|
sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
|
84
94
|
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
85
95
|
match = false
|
@@ -93,11 +103,25 @@ module Taxamatch
|
|
93
103
|
end
|
94
104
|
|
95
105
|
def match_authors(preparsed_1, preparsed_2)
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
106
|
+
p1 = { :normalized_authors => [], :years => [] }
|
107
|
+
p2 = { :normalized_authors => [], :years => [] }
|
108
|
+
if preparsed_1[:infraspecies] || preparsed_2[:infraspecies]
|
109
|
+
p1 = preparsed_1[:infraspecies].last if preparsed_1[:infraspecies]
|
110
|
+
p2 = preparsed_2[:infraspecies].last if preparsed_2[:infraspecies]
|
111
|
+
elsif preparsed_1[:species] || preparsed_2[:species]
|
112
|
+
p1 = preparsed_1[:species] if preparsed_1[:species]
|
113
|
+
p2 = preparsed_2[:species] if preparsed_2[:species]
|
114
|
+
elsif preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
115
|
+
p1 = preparsed_1[:uninomial]
|
116
|
+
p2 = preparsed_2[:uninomial]
|
117
|
+
end
|
118
|
+
au1 = p1[:normalized_authors]
|
119
|
+
au2 = p2[:normalized_authors]
|
120
|
+
yr1 = p1[:years]
|
121
|
+
yr2 = p2[:years]
|
122
|
+
return 0 if au1.empty? || au2.empty?
|
123
|
+
score = Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
|
124
|
+
score == 0 ? -1 : 1
|
101
125
|
end
|
102
126
|
|
103
127
|
def match_matches(genus_match, species_match, infraspecies_match = nil)
|
@@ -76,7 +76,7 @@ module Taxamatch
|
|
76
76
|
au1_length = author1.size
|
77
77
|
au2_length = author2.size
|
78
78
|
dlm = DamerauLevenshtein
|
79
|
-
ed = dlm.distance(author1, author2,
|
79
|
+
ed = dlm.distance(author1, author2,1,3) #get around a bug in C code, but it really has to be fixed
|
80
80
|
(ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
|
81
81
|
end
|
82
82
|
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -75,6 +75,7 @@ describe 'Taxamatch::Base' do
|
|
75
75
|
g1 = make_taxamatch_hash 'Plantagi'
|
76
76
|
g2 = make_taxamatch_hash 'Plantagy'
|
77
77
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
|
78
|
+
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
78
79
|
#distance 1 in first letter also matches
|
79
80
|
g1 = make_taxamatch_hash 'Xantheri'
|
80
81
|
g2 = make_taxamatch_hash 'Pantheri'
|
@@ -83,6 +84,7 @@ describe 'Taxamatch::Base' do
|
|
83
84
|
g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
|
84
85
|
g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
|
85
86
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
|
87
|
+
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'edit_distance' => 4, 'match' => false}
|
86
88
|
#same first letter and distance 2 should match
|
87
89
|
g1 = make_taxamatch_hash 'Xaaaantherii'
|
88
90
|
g2 = make_taxamatch_hash 'Xaaaantherrr'
|
@@ -106,14 +108,17 @@ describe 'Taxamatch::Base' do
|
|
106
108
|
s1 = make_taxamatch_hash 'major'
|
107
109
|
s2 = make_taxamatch_hash 'major'
|
108
110
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
|
111
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 0}
|
109
112
|
#Phonetic match always works
|
110
113
|
s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
|
111
114
|
s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
|
112
115
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 4}
|
116
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
113
117
|
#Phonetic match works with different endings
|
114
118
|
s1 = make_taxamatch_hash 'majorum'
|
115
119
|
s2 = make_taxamatch_hash 'majoris'
|
116
120
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
|
121
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
|
117
122
|
#Distance 4 matches if first 3 chars are the same
|
118
123
|
s1 = make_taxamatch_hash 'majjjjorrrrr'
|
119
124
|
s2 = make_taxamatch_hash 'majjjjoraaaa'
|
@@ -155,7 +160,7 @@ describe 'Taxamatch::Base' do
|
|
155
160
|
#Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
|
156
161
|
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
|
157
162
|
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
158
|
-
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
|
163
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 5, 'match' => false}
|
159
164
|
#Should not have phonetic match if one of the components does not match phonetically
|
160
165
|
gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
|
161
166
|
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
@@ -173,6 +178,36 @@ describe 'Taxamatch::Base' do
|
|
173
178
|
@tm.taxamatch("AJLJljljlj", "sls").should_not be_nil
|
174
179
|
@tm.taxamatch('Olsl','a')
|
175
180
|
end
|
181
|
+
|
182
|
+
it "should not match authors from different parts of name" do
|
183
|
+
parser = Taxamatch::Atomizer.new
|
184
|
+
t = Taxamatch::Base.new
|
185
|
+
n1 = parser.parse "Betula Linnaeus"
|
186
|
+
n2 = parser.parse "Betula alba Linnaeus"
|
187
|
+
n3 = parser.parse "Betula alba alba Linnaeus"
|
188
|
+
n4 = parser.parse "Betula alba L."
|
189
|
+
n5 = parser.parse "Betula alba"
|
190
|
+
n6 = parser.parse "Betula olba"
|
191
|
+
n7 = parser.parse "Betula alba Linnaeus alba"
|
192
|
+
n8 = parser.parse "Betula alba Linnaeus alba Smith"
|
193
|
+
n9 = parser.parse "Betula alba Smith alba L."
|
194
|
+
n10 = parser.parse "Betula Linn."
|
195
|
+
#if one authorship is empty, return 0
|
196
|
+
t.match_authors(n1, n5).should == 0
|
197
|
+
t.match_authors(n5, n1).should == 0
|
198
|
+
t.match_authors(n5, n6).should == 0
|
199
|
+
#if authorship matches on different levels ignore
|
200
|
+
t.match_authors(n7, n3).should == 0
|
201
|
+
t.match_authors(n8, n3).should == -1
|
202
|
+
t.match_authors(n2, n8).should == 0
|
203
|
+
t.match_authors(n1, n2).should == 0
|
204
|
+
# match on infraspecies level
|
205
|
+
t.match_authors(n9, n3).should == 1
|
206
|
+
# match on species level
|
207
|
+
t.match_authors(n2, n4).should == 1
|
208
|
+
# match on uninomial level
|
209
|
+
t.match_authors(n1, n10).should == 1
|
210
|
+
end
|
176
211
|
|
177
212
|
|
178
213
|
describe 'Taxamatch::Authmatch' do
|
@@ -239,9 +274,8 @@ describe 'Taxamatch::Base' do
|
|
239
274
|
end
|
240
275
|
|
241
276
|
it 'should fuzzy match authors' do
|
242
|
-
|
243
|
-
|
244
|
-
# res.should be_false
|
277
|
+
res = @am.fuzzy_match_authors('L', 'Muller')
|
278
|
+
res.should be_false
|
245
279
|
end
|
246
280
|
|
247
281
|
end
|
data/taxamatch_rb.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "taxamatch_rb"
|
8
|
-
s.version = "0.8.7"
|
8
|
+
s.version = "0.9.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Dmitry Mozzherin"]
|
12
|
-
s.date = "2012-03-
|
12
|
+
s.date = "2012-03-08"
|
13
13
|
s.description = "This gem implements algorithm for fuzzy matching scientific names developed by Tony Rees"
|
14
14
|
s.email = "dmozzherin@eol.org"
|
15
15
|
s.extra_rdoc_files = [
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: taxamatch_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.7
|
4
|
+
version: 0.9.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-03-
|
12
|
+
date: 2012-03-08 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: biodiversity19
|
16
|
-
requirement: &
|
16
|
+
requirement: &70230050526820 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.0.10
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70230050526820
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: damerau-levenshtein
|
27
|
-
requirement: &
|
27
|
+
requirement: &70230050526220 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.5.4
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70230050526220
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rake-compiler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70230050513120 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70230050513120
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rspec
|
49
|
-
requirement: &
|
49
|
+
requirement: &70230050512520 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 2.3.0
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70230050512520
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cucumber
|
60
|
-
requirement: &
|
60
|
+
requirement: &70230050511860 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70230050511860
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: bundler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70230050511260 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 1.0.0
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70230050511260
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: jeweler
|
82
|
-
requirement: &
|
82
|
+
requirement: &70230050510740 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: 1.6.0
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70230050510740
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: rcov
|
93
|
-
requirement: &
|
93
|
+
requirement: &70230050510140 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70230050510140
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: ruby-debug19
|
104
|
-
requirement: &
|
104
|
+
requirement: &70230050509540 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :development
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70230050509540
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: ruby-prof
|
115
|
-
requirement: &
|
115
|
+
requirement: &70230050508960 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :development
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70230050508960
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: shoulda
|
126
|
-
requirement: &
|
126
|
+
requirement: &70230050508360 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - ! '>='
|
@@ -131,10 +131,10 @@ dependencies:
|
|
131
131
|
version: '0'
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70230050508360
|
135
135
|
- !ruby/object:Gem::Dependency
|
136
136
|
name: mocha
|
137
|
-
requirement: &
|
137
|
+
requirement: &70230050507760 !ruby/object:Gem::Requirement
|
138
138
|
none: false
|
139
139
|
requirements:
|
140
140
|
- - ! '>='
|
@@ -142,7 +142,7 @@ dependencies:
|
|
142
142
|
version: '0'
|
143
143
|
type: :development
|
144
144
|
prerelease: false
|
145
|
-
version_requirements: *
|
145
|
+
version_requirements: *70230050507760
|
146
146
|
description: This gem implements algorithm for fuzzy matching scientific names developed
|
147
147
|
by Tony Rees
|
148
148
|
email: dmozzherin@eol.org
|
@@ -183,7 +183,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
183
183
|
version: '0'
|
184
184
|
segments:
|
185
185
|
- 0
|
186
|
-
hash:
|
186
|
+
hash: 1938855165172895621
|
187
187
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
188
|
none: false
|
189
189
|
requirements:
|