dimus-taxamatch_rb 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -4,6 +4,7 @@ Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony
4
4
 
5
5
  The purpose of Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out if they actually point to the same scientific name.
6
6
 
7
+ require 'taxamatch_rb'
7
8
  tm = Taxamatch::Base.new
8
9
  tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
9
10
  tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
@@ -1,3 +1,5 @@
1
+ # Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
2
+
1
3
  module Taxamatch
2
4
  class Authmatch
3
5
 
@@ -1,8 +1,12 @@
1
1
  # encoding: UTF-8
2
2
  module Taxamatch
3
3
 
4
- class Phonetizer
5
-
4
+ module Phonetizer
5
+
6
+ def self.phonetize(a_word, normalize_ending = false)
7
+ self.near_match(a_word, normalize_ending)
8
+ end
9
+
6
10
  def self.near_match(a_word, normalize_ending = false)
7
11
  a_word = a_word.strip rescue ''
8
12
  return '' if a_word == ''
data/lib/taxamatch_rb.rb CHANGED
@@ -24,7 +24,7 @@ module Taxamatch
24
24
  def taxamatch(str1, str2)
25
25
  preparsed_1 = @parser.parse(str1)
26
26
  preparsed_2 = @parser.parse(str2)
27
- taxamatch_preparsed(preparsed_1, preparsed_2)[:match]
27
+ taxamatch_preparsed(preparsed_1, preparsed_2)['match']
28
28
  end
29
29
 
30
30
  #takes two hashes of parsed scientific names, analyses them and returns back
@@ -33,8 +33,8 @@ module Taxamatch
33
33
  result = nil
34
34
  result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
35
35
  result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
36
- if result && result[:match]
37
- result[:match] = false if match_authors(preparsed_1, preparsed_2) == 0
36
+ if result && result['match']
37
+ result['match'] = false if match_authors(preparsed_1, preparsed_2) == 0
38
38
  end
39
39
  return result
40
40
  end
@@ -49,7 +49,7 @@ module Taxamatch
49
49
  au_match = match_authors(preparsed_1, preparsed_2)
50
50
  total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size + preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
51
51
  match = match_matches(gen_match, sp_match)
52
- match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
52
+ match.merge({'score' => (1- match['edit_distance']/(total_length/2))})
53
53
  end
54
54
 
55
55
  def match_genera(genus1, genus2)
@@ -57,10 +57,10 @@ module Taxamatch
57
57
  genus2_length = genus2[:normalized].size
58
58
  match = false
59
59
  ed = @dlm.distance(genus1[:normalized], genus2[:normalized],2,3)
60
- return {:edit_distance => ed, :phonetic_match => true, :match => true} if genus1[:phonetized] == genus2[:phonetized]
60
+ return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
61
61
 
62
62
  match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
63
- {:edit_distance => ed, :match => match, :phonetic_match => false}
63
+ {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
64
64
  end
65
65
 
66
66
  def match_species(sp1, sp2)
@@ -70,10 +70,10 @@ module Taxamatch
70
70
  sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
71
71
  match = false
72
72
  ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 4, 4)
73
- return {:edit_distance => ed, :phonetic_match => true, :match => true} if sp1[:phonetized] == sp2[:phonetized]
73
+ return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
74
74
 
75
75
  match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
76
- {:edit_distance => ed, :match => match, :phonetic_match => false}
76
+ { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
77
77
  end
78
78
 
79
79
  def match_authors(preparsed_1, preparsed_2)
@@ -86,10 +86,10 @@ module Taxamatch
86
86
 
87
87
  def match_matches(genus_match, species_match, infraspecies_matches = [])
88
88
  match = species_match
89
- match[:edit_distance] += genus_match[:edit_distance]
90
- match[:match] = false if match[:edit_distance] > 4
91
- match[:match] &&= genus_match[:match]
92
- match[:phonetic_match] &&= genus_match[:phonetic_match]
89
+ match['edit_distance'] += genus_match['edit_distance']
90
+ match['match'] = false if match['edit_distance'] > 4
91
+ match['match'] &&= genus_match['match']
92
+ match['phonetic_match'] &&= genus_match['phonetic_match']
93
93
  match
94
94
  end
95
95
 
@@ -70,107 +70,107 @@ describe 'Taxamatch::Base' do
70
70
  #edit distance 1 always match
71
71
  g1 = make_taxamatch_hash 'Plantago'
72
72
  g2 = make_taxamatch_hash 'Plantagon'
73
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :edit_distance=>1, :match=>true}
73
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
74
74
  #edit_distance above threshold does not match
75
75
  g1 = make_taxamatch_hash 'Plantago'
76
76
  g2 = make_taxamatch_hash 'This shouldnt match'
77
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
77
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
78
78
  #phonetic_match matches
79
79
  g1 = make_taxamatch_hash 'Plantagi'
80
80
  g2 = make_taxamatch_hash 'Plantagy'
81
- @tm.match_genera(g1, g2).should == {:phonetic_match=>true, :edit_distance=>1, :match=>true}
81
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
82
82
  #distance 1 in first letter also matches
83
83
  g1 = make_taxamatch_hash 'Xantheri'
84
84
  g2 = make_taxamatch_hash 'Pantheri'
85
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :edit_distance=>1, :match=>true}
85
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
86
86
  #phonetic match trumps everything
87
87
  g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
88
88
  g2 = make_taxamatch_hash 'Zanthery'
89
- @tm.match_genera(g1, g2).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
89
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
90
90
  #same first letter and distance 2 should match
91
91
  g1 = make_taxamatch_hash 'Xantherii'
92
92
  g2 = make_taxamatch_hash 'Xantherrr'
93
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>2}
93
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
94
94
  #First letter is the same and distance is 3 should match, no phonetic match
95
95
  g1 = make_taxamatch_hash 'Xantheriii'
96
96
  g2 = make_taxamatch_hash 'Xantherrrr'
97
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>3}
97
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
98
98
  #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
99
99
  g1 = make_taxamatch_hash 'Xant'
100
100
  g2 = make_taxamatch_hash 'Xanthe'
101
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>2}
101
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
102
102
  #Should not match if edit distance > 3 and no phonetic match
103
103
  g1 = make_taxamatch_hash 'Xantheriiii'
104
104
  g2 = make_taxamatch_hash 'Xantherrrrr'
105
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
105
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
106
106
  end
107
107
 
108
108
  it 'should compare species' do
109
109
  #Exact match
110
110
  s1 = make_taxamatch_hash 'major'
111
111
  s2 = make_taxamatch_hash 'major'
112
- @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>0}
112
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
113
113
  #Phonetic match always works
114
114
  s1 = make_taxamatch_hash 'xanteriiiiiiii'
115
115
  s2 = make_taxamatch_hash 'zantereeeeeeee'
116
- @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>5}
116
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
117
117
  #Phonetic match works with different endings
118
118
  s1 = make_taxamatch_hash 'majorum'
119
119
  s2 = make_taxamatch_hash 'majoris'
120
- @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>2}
120
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
121
121
  #Distance 4 matches if first 3 chars are the same
122
122
  s1 = make_taxamatch_hash 'majorrrrr'
123
123
  s2 = make_taxamatch_hash 'majoraaaa'
124
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>4}
124
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
125
125
  #Should not match if Distance 4 matches and first 3 chars are not the same
126
126
  s1 = make_taxamatch_hash 'majorrrrr'
127
127
  s2 = make_taxamatch_hash 'marorraaa'
128
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
128
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
129
129
  #Distance 2 or 3 matches if first 1 char is the same
130
130
  s1 = make_taxamatch_hash 'morrrr'
131
131
  s2 = make_taxamatch_hash 'moraaa'
132
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>3}
132
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
133
133
  #Should not match if Distance 2 or 3 and first 1 char is not the same
134
134
  s1 = make_taxamatch_hash 'morrrr'
135
135
  s2 = make_taxamatch_hash 'torraa'
136
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>3}
136
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
137
137
  #Distance 1 will match anywhere
138
138
  s1 = make_taxamatch_hash 'major'
139
139
  s2 = make_taxamatch_hash 'rajor'
140
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>1}
140
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
141
141
  #Will not match if distance 3 and length is less than twice the edit distance
142
142
  s1 = make_taxamatch_hash 'marrr'
143
143
  s2 = make_taxamatch_hash 'maaaa'
144
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>3}
144
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
145
145
  end
146
146
 
147
147
  it 'should match mathes' do
148
148
  #No trouble case
149
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
150
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
151
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>2, :match=>true}
149
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
150
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
151
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
152
152
  #Will not match if either genus or sp. epithet dont match
153
- gmatch = {:match => false, :phonetic_match => false, :edit_distance => 1}
154
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
155
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>false}
156
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
157
- smatch = {:match => false, :phonetic_match => false, :edit_distance => 1}
158
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>false}
153
+ gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
154
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
155
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
156
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
157
+ smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
158
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
159
159
  #Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
160
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 3}
161
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
162
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>5, :match=>false}
160
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
161
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
162
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
163
163
  #Should not have phonetic match if one of the components does not match phonetically
164
- gmatch = {:match => true, :phonetic_match => false, :edit_distance => 1}
165
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
166
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>true}
167
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
168
- smatch = {:match => true, :phonetic_match => false, :edit_distance => 1}
169
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>true}
164
+ gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
165
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
166
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
167
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
168
+ smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
169
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
170
170
  #edit distance should equal the sum of the edit distances
171
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
172
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
173
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
171
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
172
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
173
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
174
174
  end
175
175
 
176
176
  describe 'Taxamatch::Authmatch' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dimus-taxamatch_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-08 00:00:00 -07:00
12
+ date: 2009-08-09 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency