dimus-taxamatch_rb 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -0
- data/lib/taxamatch_rb/authmatch.rb +2 -0
- data/lib/taxamatch_rb/phonetizer.rb +6 -2
- data/lib/taxamatch_rb.rb +12 -12
- data/spec/taxamatch_rb_spec.rb +39 -39
- metadata +2 -2
data/README.rdoc
CHANGED
@@ -4,6 +4,7 @@ Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony
|
|
4
4
|
|
5
5
|
The purpose of Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out if they actually point to the same scientific name.
|
6
6
|
|
7
|
+
require 'taxamatch_rb'
|
7
8
|
tm = Taxamatch::Base.new
|
8
9
|
tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
|
9
10
|
tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
|
@@ -1,8 +1,12 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
module Taxamatch
|
3
3
|
|
4
|
-
|
5
|
-
|
4
|
+
module Phonetizer
|
5
|
+
|
6
|
+
def self.phonetize(a_word, normalize_ending = false)
|
7
|
+
self.near_match(a_word, normalize_ending)
|
8
|
+
end
|
9
|
+
|
6
10
|
def self.near_match(a_word, normalize_ending = false)
|
7
11
|
a_word = a_word.strip rescue ''
|
8
12
|
return '' if a_word == ''
|
data/lib/taxamatch_rb.rb
CHANGED
@@ -24,7 +24,7 @@ module Taxamatch
|
|
24
24
|
def taxamatch(str1, str2)
|
25
25
|
preparsed_1 = @parser.parse(str1)
|
26
26
|
preparsed_2 = @parser.parse(str2)
|
27
|
-
taxamatch_preparsed(preparsed_1, preparsed_2)[
|
27
|
+
taxamatch_preparsed(preparsed_1, preparsed_2)['match']
|
28
28
|
end
|
29
29
|
|
30
30
|
#takes two hashes of parsed scientific names, analyses them and returns back
|
@@ -33,8 +33,8 @@ module Taxamatch
|
|
33
33
|
result = nil
|
34
34
|
result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
35
35
|
result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
|
36
|
-
if result && result[
|
37
|
-
result[
|
36
|
+
if result && result['match']
|
37
|
+
result['match'] = false if match_authors(preparsed_1, preparsed_2) == 0
|
38
38
|
end
|
39
39
|
return result
|
40
40
|
end
|
@@ -49,7 +49,7 @@ module Taxamatch
|
|
49
49
|
au_match = match_authors(preparsed_1, preparsed_2)
|
50
50
|
total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size + preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
|
51
51
|
match = match_matches(gen_match, sp_match)
|
52
|
-
match.merge({
|
52
|
+
match.merge({'score' => (1- match['edit_distance']/(total_length/2))})
|
53
53
|
end
|
54
54
|
|
55
55
|
def match_genera(genus1, genus2)
|
@@ -57,10 +57,10 @@ module Taxamatch
|
|
57
57
|
genus2_length = genus2[:normalized].size
|
58
58
|
match = false
|
59
59
|
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],2,3)
|
60
|
-
return {
|
60
|
+
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
61
61
|
|
62
62
|
match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
|
63
|
-
{
|
63
|
+
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
64
64
|
end
|
65
65
|
|
66
66
|
def match_species(sp1, sp2)
|
@@ -70,10 +70,10 @@ module Taxamatch
|
|
70
70
|
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
71
71
|
match = false
|
72
72
|
ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 4, 4)
|
73
|
-
return {
|
73
|
+
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
|
74
74
|
|
75
75
|
match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
76
|
-
{
|
76
|
+
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
77
77
|
end
|
78
78
|
|
79
79
|
def match_authors(preparsed_1, preparsed_2)
|
@@ -86,10 +86,10 @@ module Taxamatch
|
|
86
86
|
|
87
87
|
def match_matches(genus_match, species_match, infraspecies_matches = [])
|
88
88
|
match = species_match
|
89
|
-
match[
|
90
|
-
match[
|
91
|
-
match[
|
92
|
-
match[
|
89
|
+
match['edit_distance'] += genus_match['edit_distance']
|
90
|
+
match['match'] = false if match['edit_distance'] > 4
|
91
|
+
match['match'] &&= genus_match['match']
|
92
|
+
match['phonetic_match'] &&= genus_match['phonetic_match']
|
93
93
|
match
|
94
94
|
end
|
95
95
|
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -70,107 +70,107 @@ describe 'Taxamatch::Base' do
|
|
70
70
|
#edit distance 1 always match
|
71
71
|
g1 = make_taxamatch_hash 'Plantago'
|
72
72
|
g2 = make_taxamatch_hash 'Plantagon'
|
73
|
-
@tm.match_genera(g1, g2).should == {
|
73
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
74
74
|
#edit_distance above threshold does not match
|
75
75
|
g1 = make_taxamatch_hash 'Plantago'
|
76
76
|
g2 = make_taxamatch_hash 'This shouldnt match'
|
77
|
-
@tm.match_genera(g1, g2).should == {
|
77
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
78
78
|
#phonetic_match matches
|
79
79
|
g1 = make_taxamatch_hash 'Plantagi'
|
80
80
|
g2 = make_taxamatch_hash 'Plantagy'
|
81
|
-
@tm.match_genera(g1, g2).should == {
|
81
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
|
82
82
|
#distance 1 in first letter also matches
|
83
83
|
g1 = make_taxamatch_hash 'Xantheri'
|
84
84
|
g2 = make_taxamatch_hash 'Pantheri'
|
85
|
-
@tm.match_genera(g1, g2).should == {
|
85
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
86
86
|
#phonetic match trumps everything
|
87
87
|
g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
|
88
88
|
g2 = make_taxamatch_hash 'Zanthery'
|
89
|
-
@tm.match_genera(g1, g2).should == {
|
89
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
|
90
90
|
#same first letter and distance 2 should match
|
91
91
|
g1 = make_taxamatch_hash 'Xantherii'
|
92
92
|
g2 = make_taxamatch_hash 'Xantherrr'
|
93
|
-
@tm.match_genera(g1, g2).should == {
|
93
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
|
94
94
|
#First letter is the same and distance is 3 should match, no phonetic match
|
95
95
|
g1 = make_taxamatch_hash 'Xantheriii'
|
96
96
|
g2 = make_taxamatch_hash 'Xantherrrr'
|
97
|
-
@tm.match_genera(g1, g2).should == {
|
97
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
98
98
|
#Should not match if one of the words is shorter than 2x edit distance and distance is 2 or 3
|
99
99
|
g1 = make_taxamatch_hash 'Xant'
|
100
100
|
g2 = make_taxamatch_hash 'Xanthe'
|
101
|
-
@tm.match_genera(g1, g2).should == {
|
101
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
|
102
102
|
#Should not match if edit distance > 3 and no phonetic match
|
103
103
|
g1 = make_taxamatch_hash 'Xantheriiii'
|
104
104
|
g2 = make_taxamatch_hash 'Xantherrrrr'
|
105
|
-
@tm.match_genera(g1, g2).should == {
|
105
|
+
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
106
106
|
end
|
107
107
|
|
108
108
|
it 'should compare species' do
|
109
109
|
#Exact match
|
110
110
|
s1 = make_taxamatch_hash 'major'
|
111
111
|
s2 = make_taxamatch_hash 'major'
|
112
|
-
@tm.match_species(s1, s2).should == {
|
112
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
|
113
113
|
#Phonetic match always works
|
114
114
|
s1 = make_taxamatch_hash 'xanteriiiiiiii'
|
115
115
|
s2 = make_taxamatch_hash 'zantereeeeeeee'
|
116
|
-
@tm.match_species(s1, s2).should == {
|
116
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
|
117
117
|
#Phonetic match works with different endings
|
118
118
|
s1 = make_taxamatch_hash 'majorum'
|
119
119
|
s2 = make_taxamatch_hash 'majoris'
|
120
|
-
@tm.match_species(s1, s2).should == {
|
120
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
|
121
121
|
#Distance 4 matches if first 3 chars are the same
|
122
122
|
s1 = make_taxamatch_hash 'majorrrrr'
|
123
123
|
s2 = make_taxamatch_hash 'majoraaaa'
|
124
|
-
@tm.match_species(s1, s2).should == {
|
124
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
|
125
125
|
#Should not match if Distance 4 matches and first 3 chars are not the same
|
126
126
|
s1 = make_taxamatch_hash 'majorrrrr'
|
127
127
|
s2 = make_taxamatch_hash 'marorraaa'
|
128
|
-
@tm.match_species(s1, s2).should == {
|
128
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
129
129
|
#Distance 2 or 3 matches if first 1 char is the same
|
130
130
|
s1 = make_taxamatch_hash 'morrrr'
|
131
131
|
s2 = make_taxamatch_hash 'moraaa'
|
132
|
-
@tm.match_species(s1, s2).should == {
|
132
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
133
133
|
#Should not match if Distance 2 or 3 and first 1 char is not the same
|
134
134
|
s1 = make_taxamatch_hash 'morrrr'
|
135
135
|
s2 = make_taxamatch_hash 'torraa'
|
136
|
-
@tm.match_species(s1, s2).should == {
|
136
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
137
137
|
#Distance 1 will match anywhere
|
138
138
|
s1 = make_taxamatch_hash 'major'
|
139
139
|
s2 = make_taxamatch_hash 'rajor'
|
140
|
-
@tm.match_species(s1, s2).should == {
|
140
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
|
141
141
|
#Will not match if distance is 3 and length is less than twice the edit distance
|
142
142
|
s1 = make_taxamatch_hash 'marrr'
|
143
143
|
s2 = make_taxamatch_hash 'maaaa'
|
144
|
-
@tm.match_species(s1, s2).should == {
|
144
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
|
145
145
|
end
|
146
146
|
|
147
147
|
it 'should match mathes' do
|
148
148
|
#No trouble case
|
149
|
-
gmatch = {
|
150
|
-
smatch = {
|
151
|
-
@tm.match_matches(gmatch, smatch).should == {
|
149
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
150
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
151
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
|
152
152
|
#Will not match if either genus or sp. epithet don't match
|
153
|
-
gmatch = {
|
154
|
-
smatch = {
|
155
|
-
@tm.match_matches(gmatch, smatch).should == {
|
156
|
-
gmatch = {
|
157
|
-
smatch = {
|
158
|
-
@tm.match_matches(gmatch, smatch).should == {
|
153
|
+
gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
154
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
155
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
156
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
157
|
+
smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
|
158
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
|
159
159
|
#Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
|
160
|
-
gmatch = {
|
161
|
-
smatch = {
|
162
|
-
@tm.match_matches(gmatch, smatch).should == {
|
160
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
|
161
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
162
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
|
163
163
|
#Should not have phonetic match if one of the components does not match phonetically
|
164
|
-
gmatch = {
|
165
|
-
smatch = {
|
166
|
-
@tm.match_matches(gmatch, smatch).should == {
|
167
|
-
gmatch = {
|
168
|
-
smatch = {
|
169
|
-
@tm.match_matches(gmatch, smatch).should == {
|
164
|
+
gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
|
165
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
166
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
|
167
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
|
168
|
+
smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
|
169
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
|
170
170
|
#edit distance should equal the sum of the edit distances
|
171
|
-
gmatch = {
|
172
|
-
smatch = {
|
173
|
-
@tm.match_matches(gmatch, smatch).should == {
|
171
|
+
gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
172
|
+
smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
|
173
|
+
@tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
|
174
174
|
end
|
175
175
|
|
176
176
|
describe 'Taxamatch::Authmatch' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dimus-taxamatch_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-08-
|
12
|
+
date: 2009-08-09 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|