taxamatch_rb 0.8.7 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/taxamatch_rb.rb +33 -9
- data/lib/taxamatch_rb/authmatch.rb +1 -1
- data/spec/taxamatch_rb_spec.rb +38 -4
- data/taxamatch_rb.gemspec +2 -2
- metadata +27 -27
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.7
|
1
|
+
0.9.0
|
data/lib/taxamatch_rb.rb
CHANGED
@@ -36,7 +36,7 @@ module Taxamatch
|
|
36
36
|
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
37
37
|
result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
|
38
38
|
if result && result['match']
|
39
|
-
result['match'] = match_authors(preparsed_1, preparsed_2) ==
|
39
|
+
result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ? false : true
|
40
40
|
end
|
41
41
|
return result
|
42
42
|
end
|
@@ -63,12 +63,17 @@ module Taxamatch
|
|
63
63
|
match_hash
|
64
64
|
end
|
65
65
|
|
66
|
-
def match_genera(genus1, genus2)
|
66
|
+
def match_genera(genus1, genus2, opts = {})
|
67
67
|
genus1_length = genus1[:normalized].size
|
68
68
|
genus2_length = genus2[:normalized].size
|
69
|
+
opts = {:with_phonetic_match => true}.merge(opts)
|
69
70
|
min_length = [genus1_length, genus2_length].min
|
71
|
+
unless opts[:with_phonetic_match]
|
72
|
+
genus1[:phonetized] = "A"
|
73
|
+
genus2[:phonetized] = "B"
|
74
|
+
end
|
70
75
|
match = false
|
71
|
-
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],1,3) #TODO put block = 2
|
76
|
+
ed = @dlm.distance(genus1[:normalized], genus2[:normalized], 1, 3) #TODO put block = 2
|
72
77
|
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
|
73
78
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
74
79
|
|
@@ -76,10 +81,15 @@ module Taxamatch
|
|
76
81
|
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
77
82
|
end
|
78
83
|
|
79
|
-
def match_species(sp1, sp2)
|
84
|
+
def match_species(sp1, sp2, opts = {})
|
80
85
|
sp1_length = sp1[:normalized].size
|
81
86
|
sp2_length = sp2[:normalized].size
|
87
|
+
opts = {:with_phonetic_match => true}.merge(opts)
|
82
88
|
min_length = [sp1_length, sp2_length].min
|
89
|
+
unless opts[:with_phonetic_match]
|
90
|
+
sp1[:phonetized] = "A"
|
91
|
+
sp2[:phonetized] = "B"
|
92
|
+
end
|
83
93
|
sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
|
84
94
|
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
85
95
|
match = false
|
@@ -93,11 +103,25 @@ module Taxamatch
|
|
93
103
|
end
|
94
104
|
|
95
105
|
def match_authors(preparsed_1, preparsed_2)
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
106
|
+
p1 = { :normalized_authors => [], :years => [] }
|
107
|
+
p2 = { :normalized_authors => [], :years => [] }
|
108
|
+
if preparsed_1[:infraspecies] || preparsed_2[:infraspecies]
|
109
|
+
p1 = preparsed_1[:infraspecies].last if preparsed_1[:infraspecies]
|
110
|
+
p2 = preparsed_2[:infraspecies].last if preparsed_2[:infraspecies]
|
111
|
+
elsif preparsed_1[:species] || preparsed_2[:species]
|
112
|
+
p1 = preparsed_1[:species] if preparsed_1[:species]
|
113
|
+
p2 = preparsed_2[:species] if preparsed_2[:species]
|
114
|
+
elsif preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
115
|
+
p1 = preparsed_1[:uninomial]
|
116
|
+
p2 = preparsed_2[:uninomial]
|
117
|
+
end
|
118
|
+
au1 = p1[:normalized_authors]
|
119
|
+
au2 = p2[:normalized_authors]
|
120
|
+
yr1 = p1[:years]
|
121
|
+
yr2 = p2[:years]
|
122
|
+
return 0 if au1.empty? || au2.empty?
|
123
|
+
score = Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
|
124
|
+
score == 0 ? -1 : 1
|
101
125
|
end
|
102
126
|
|
103
127
|
def match_matches(genus_match, species_match, infraspecies_match = nil)
|
@@ -76,7 +76,7 @@ module Taxamatch
|
|
76
76
|
au1_length = author1.size
|
77
77
|
au2_length = author2.size
|
78
78
|
dlm = DamerauLevenshtein
|
79
|
-
ed = dlm.distance(author1, author2,
|
79
|
+
ed = dlm.distance(author1, author2,1,3) #get around a bug in C code, but it really has to be fixed
|
80
80
|
(ed <= 3 && ([au1_length, au2_length].min > ed * 2) && (ed < 2 || author1[0] == author2[0]))
|
81
81
|
end
|
82
82
|
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -75,6 +75,7 @@ describe 'Taxamatch::Base' do
|
|
75
75
|
g1 = make_taxamatch_hash 'Plantagi'
|
76
76
|
g2 = make_taxamatch_hash 'Plantagy'
|
77
77
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
|
78
|
+
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
78
79
|
#distance 1 in first letter also matches
|
79
80
|
g1 = make_taxamatch_hash 'Xantheri'
|
80
81
|
g2 = make_taxamatch_hash 'Pantheri'
|
@@ -83,6 +84,7 @@ describe 'Taxamatch::Base' do
|
|
83
84
|
g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
|
84
85
|
g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
|
85
86
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
|
87
|
+
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'edit_distance' => 4, 'match' => false}
|
86
88
|
#same first letter and distance 2 should match
|
87
89
|
g1 = make_taxamatch_hash 'Xaaaantherii'
|
88
90
|
g2 = make_taxamatch_hash 'Xaaaantherrr'
|
@@ -106,14 +108,17 @@ describe 'Taxamatch::Base' do
|
|
106
108
|
s1 = make_taxamatch_hash 'major'
|
107
109
|
s2 = make_taxamatch_hash 'major'
|
108
110
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
|
111
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 0}
|
109
112
|
#Phonetic match always works
|
110
113
|
s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
|
111
114
|
s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
|
112
115
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 4}
|
116
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
113
117
|
#Phonetic match works with different endings
|
114
118
|
s1 = make_taxamatch_hash 'majorum'
|
115
119
|
s2 = make_taxamatch_hash 'majoris'
|
116
120
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
|
121
|
+
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
|
117
122
|
#Distance 4 matches if first 3 chars are the same
|
118
123
|
s1 = make_taxamatch_hash 'majjjjorrrrr'
|
119
124
|
s2 = make_taxamatch_hash 'majjjjoraaaa'
|
@@ -155,7 +160,7 @@ describe 'Taxamatch::Base' do
|
|
155
160
|
#Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
|
156
161
|
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
|
157
162
|
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
158
|
-
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
|
163
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 5, 'match' => false}
|
159
164
|
#Should not have phonetic match if one of the components does not match phonetically
|
160
165
|
gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
|
161
166
|
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
@@ -173,6 +178,36 @@ describe 'Taxamatch::Base' do
|
|
173
178
|
@tm.taxamatch("AJLJljljlj", "sls").should_not be_nil
|
174
179
|
@tm.taxamatch('Olsl','a')
|
175
180
|
end
|
181
|
+
|
182
|
+
it "should not match authors from different parts of name" do
|
183
|
+
parser = Taxamatch::Atomizer.new
|
184
|
+
t = Taxamatch::Base.new
|
185
|
+
n1 = parser.parse "Betula Linnaeus"
|
186
|
+
n2 = parser.parse "Betula alba Linnaeus"
|
187
|
+
n3 = parser.parse "Betula alba alba Linnaeus"
|
188
|
+
n4 = parser.parse "Betula alba L."
|
189
|
+
n5 = parser.parse "Betula alba"
|
190
|
+
n6 = parser.parse "Betula olba"
|
191
|
+
n7 = parser.parse "Betula alba Linnaeus alba"
|
192
|
+
n8 = parser.parse "Betula alba Linnaeus alba Smith"
|
193
|
+
n9 = parser.parse "Betula alba Smith alba L."
|
194
|
+
n10 = parser.parse "Betula Linn."
|
195
|
+
#if one authorship is empty, return 0
|
196
|
+
t.match_authors(n1, n5).should == 0
|
197
|
+
t.match_authors(n5, n1).should == 0
|
198
|
+
t.match_authors(n5, n6).should == 0
|
199
|
+
#if authorship matches on different levels ignore
|
200
|
+
t.match_authors(n7, n3).should == 0
|
201
|
+
t.match_authors(n8, n3).should == -1
|
202
|
+
t.match_authors(n2, n8).should == 0
|
203
|
+
t.match_authors(n1, n2).should == 0
|
204
|
+
# match on infraspecies level
|
205
|
+
t.match_authors(n9, n3).should == 1
|
206
|
+
# match on species level
|
207
|
+
t.match_authors(n2, n4).should == 1
|
208
|
+
# match on uninomial level
|
209
|
+
t.match_authors(n1, n10).should == 1
|
210
|
+
end
|
176
211
|
|
177
212
|
|
178
213
|
describe 'Taxamatch::Authmatch' do
|
@@ -239,9 +274,8 @@ describe 'Taxamatch::Base' do
|
|
239
274
|
end
|
240
275
|
|
241
276
|
it 'should fuzzy match authors' do
|
242
|
-
|
243
|
-
|
244
|
-
# res.should be_false
|
277
|
+
res = @am.fuzzy_match_authors('L', 'Muller')
|
278
|
+
res.should be_false
|
245
279
|
end
|
246
280
|
|
247
281
|
end
|
data/taxamatch_rb.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "taxamatch_rb"
|
8
|
-
s.version = "0.8.7"
|
8
|
+
s.version = "0.9.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Dmitry Mozzherin"]
|
12
|
-
s.date = "2012-03-
|
12
|
+
s.date = "2012-03-08"
|
13
13
|
s.description = "This gem implements algorithm for fuzzy matching scientific names developed by Tony Rees"
|
14
14
|
s.email = "dmozzherin@eol.org"
|
15
15
|
s.extra_rdoc_files = [
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: taxamatch_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.7
|
4
|
+
version: 0.9.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-03-
|
12
|
+
date: 2012-03-08 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: biodiversity19
|
16
|
-
requirement: &
|
16
|
+
requirement: &70230050526820 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.0.10
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70230050526820
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: damerau-levenshtein
|
27
|
-
requirement: &
|
27
|
+
requirement: &70230050526220 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.5.4
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70230050526220
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rake-compiler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70230050513120 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70230050513120
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rspec
|
49
|
-
requirement: &
|
49
|
+
requirement: &70230050512520 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 2.3.0
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70230050512520
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: cucumber
|
60
|
-
requirement: &
|
60
|
+
requirement: &70230050511860 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70230050511860
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: bundler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70230050511260 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 1.0.0
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70230050511260
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: jeweler
|
82
|
-
requirement: &
|
82
|
+
requirement: &70230050510740 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: 1.6.0
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70230050510740
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: rcov
|
93
|
-
requirement: &
|
93
|
+
requirement: &70230050510140 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70230050510140
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: ruby-debug19
|
104
|
-
requirement: &
|
104
|
+
requirement: &70230050509540 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :development
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70230050509540
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: ruby-prof
|
115
|
-
requirement: &
|
115
|
+
requirement: &70230050508960 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :development
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70230050508960
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: shoulda
|
126
|
-
requirement: &
|
126
|
+
requirement: &70230050508360 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - ! '>='
|
@@ -131,10 +131,10 @@ dependencies:
|
|
131
131
|
version: '0'
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70230050508360
|
135
135
|
- !ruby/object:Gem::Dependency
|
136
136
|
name: mocha
|
137
|
-
requirement: &
|
137
|
+
requirement: &70230050507760 !ruby/object:Gem::Requirement
|
138
138
|
none: false
|
139
139
|
requirements:
|
140
140
|
- - ! '>='
|
@@ -142,7 +142,7 @@ dependencies:
|
|
142
142
|
version: '0'
|
143
143
|
type: :development
|
144
144
|
prerelease: false
|
145
|
-
version_requirements: *
|
145
|
+
version_requirements: *70230050507760
|
146
146
|
description: This gem implements algorithm for fuzzy matching scientific names developed
|
147
147
|
by Tony Rees
|
148
148
|
email: dmozzherin@eol.org
|
@@ -183,7 +183,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
183
183
|
version: '0'
|
184
184
|
segments:
|
185
185
|
- 0
|
186
|
-
hash:
|
186
|
+
hash: 1938855165172895621
|
187
187
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
188
|
none: false
|
189
189
|
requirements:
|