dimus-taxamatch_rb 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -4,6 +4,7 @@ Taxamatch_Rb is a ruby implementation of Taxamatch algorithms developed by Tony
4
4
 
5
5
  The purpose of Taxamatch gem is to facilitate fuzzy comparison of two scientific name renderings to find out if they actually point to the same scientific name.
6
6
 
7
+ require 'taxamatch_rb'
7
8
  tm = Taxamatch::Base.new
8
9
  tm.taxamatch('Homo sapien', 'Homo sapiens') #returns true
9
10
  tm.taxamatch('Homo sapiens Linnaeus', 'Hommo sapens (Linn. 1758)') #returns true
@@ -1,3 +1,5 @@
1
+ # Algorithms for Taxamatch::Authmatch are developed by Patrick Leary of uBio and EOL fame
2
+
1
3
  module Taxamatch
2
4
  class Authmatch
3
5
 
@@ -1,8 +1,12 @@
1
1
  # encoding: UTF-8
2
2
  module Taxamatch
3
3
 
4
- class Phonetizer
5
-
4
+ module Phonetizer
5
+
6
+ def self.phonetize(a_word, normalize_ending = false)
7
+ self.near_match(a_word, normalize_ending)
8
+ end
9
+
6
10
  def self.near_match(a_word, normalize_ending = false)
7
11
  a_word = a_word.strip rescue ''
8
12
  return '' if a_word == ''
data/lib/taxamatch_rb.rb CHANGED
@@ -24,7 +24,7 @@ module Taxamatch
24
24
  def taxamatch(str1, str2)
25
25
  preparsed_1 = @parser.parse(str1)
26
26
  preparsed_2 = @parser.parse(str2)
27
- taxamatch_preparsed(preparsed_1, preparsed_2)[:match]
27
+ taxamatch_preparsed(preparsed_1, preparsed_2)['match']
28
28
  end
29
29
 
30
30
  #takes two hashes of parsed scientific names, analyses them and returns back
@@ -33,8 +33,8 @@ module Taxamatch
33
33
  result = nil
34
34
  result = match_uninomial(preparsed_1, preparsed_2) if preparsed_1[:uninomial] && preparsed_2[:uninomial]
35
35
  result = match_multinomial(preparsed_1, preparsed_2) if preparsed_1[:genus] && preparsed_2[:genus]
36
- if result && result[:match]
37
- result[:match] = false if match_authors(preparsed_1, preparsed_2) == 0
36
+ if result && result['match']
37
+ result['match'] = false if match_authors(preparsed_1, preparsed_2) == 0
38
38
  end
39
39
  return result
40
40
  end
@@ -49,7 +49,7 @@ module Taxamatch
49
49
  au_match = match_authors(preparsed_1, preparsed_2)
50
50
  total_length = preparsed_1[:genus][:epitheton].size + preparsed_2[:genus][:epitheton].size + preparsed_1[:species][:epitheton].size + preparsed_2[:species][:epitheton].size
51
51
  match = match_matches(gen_match, sp_match)
52
- match.merge({:score => (1- match[:edit_distance]/(total_length/2))})
52
+ match.merge({'score' => (1- match['edit_distance']/(total_length/2))})
53
53
  end
54
54
 
55
55
  def match_genera(genus1, genus2)
@@ -57,10 +57,10 @@ module Taxamatch
57
57
  genus2_length = genus2[:normalized].size
58
58
  match = false
59
59
  ed = @dlm.distance(genus1[:normalized], genus2[:normalized],2,3)
60
- return {:edit_distance => ed, :phonetic_match => true, :match => true} if genus1[:phonetized] == genus2[:phonetized]
60
+ return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
61
61
 
62
62
  match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
63
- {:edit_distance => ed, :match => match, :phonetic_match => false}
63
+ {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
64
64
  end
65
65
 
66
66
  def match_species(sp1, sp2)
@@ -70,10 +70,10 @@ module Taxamatch
70
70
  sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
71
71
  match = false
72
72
  ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 4, 4)
73
- return {:edit_distance => ed, :phonetic_match => true, :match => true} if sp1[:phonetized] == sp2[:phonetized]
73
+ return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
74
74
 
75
75
  match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
76
- {:edit_distance => ed, :match => match, :phonetic_match => false}
76
+ { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
77
77
  end
78
78
 
79
79
  def match_authors(preparsed_1, preparsed_2)
@@ -86,10 +86,10 @@ module Taxamatch
86
86
 
87
87
  def match_matches(genus_match, species_match, infraspecies_matches = [])
88
88
  match = species_match
89
- match[:edit_distance] += genus_match[:edit_distance]
90
- match[:match] = false if match[:edit_distance] > 4
91
- match[:match] &&= genus_match[:match]
92
- match[:phonetic_match] &&= genus_match[:phonetic_match]
89
+ match['edit_distance'] += genus_match['edit_distance']
90
+ match['match'] = false if match['edit_distance'] > 4
91
+ match['match'] &&= genus_match['match']
92
+ match['phonetic_match'] &&= genus_match['phonetic_match']
93
93
  match
94
94
  end
95
95
 
@@ -70,107 +70,107 @@ describe 'Taxamatch::Base' do
70
70
  #edit distance 1 always match
71
71
  g1 = make_taxamatch_hash 'Plantago'
72
72
  g2 = make_taxamatch_hash 'Plantagon'
73
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :edit_distance=>1, :match=>true}
73
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
74
74
  #edit_distance above threshold does not math
75
75
  g1 = make_taxamatch_hash 'Plantago'
76
76
  g2 = make_taxamatch_hash 'This shouldnt match'
77
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
77
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
78
78
  #phonetic_match matches
79
79
  g1 = make_taxamatch_hash 'Plantagi'
80
80
  g2 = make_taxamatch_hash 'Plantagy'
81
- @tm.match_genera(g1, g2).should == {:phonetic_match=>true, :edit_distance=>1, :match=>true}
81
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 1, 'match' => true}
82
82
  #distance 1 in first letter also matches
83
83
  g1 = make_taxamatch_hash 'Xantheri'
84
84
  g2 = make_taxamatch_hash 'Pantheri'
85
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :edit_distance=>1, :match=>true}
85
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
86
86
  #phonetic match tramps everything
87
87
  g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
88
88
  g2 = make_taxamatch_hash 'Zanthery'
89
- @tm.match_genera(g1, g2).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
89
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
90
90
  #same first letter and distance 2 should match
91
91
  g1 = make_taxamatch_hash 'Xantherii'
92
92
  g2 = make_taxamatch_hash 'Xantherrr'
93
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>2}
93
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
94
94
  #First letter is the same and distance is 3 should match, no phonetic match
95
95
  g1 = make_taxamatch_hash 'Xantheriii'
96
96
  g2 = make_taxamatch_hash 'Xantherrrr'
97
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>3}
97
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
98
98
  #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
99
99
  g1 = make_taxamatch_hash 'Xant'
100
100
  g2 = make_taxamatch_hash 'Xanthe'
101
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>2}
101
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 2}
102
102
  #Should not match if edit distance > 3 and no phonetic match
103
103
  g1 = make_taxamatch_hash 'Xantheriiii'
104
104
  g2 = make_taxamatch_hash 'Xantherrrrr'
105
- @tm.match_genera(g1, g2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
105
+ @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
106
106
  end
107
107
 
108
108
  it 'should compare species' do
109
109
  #Exact match
110
110
  s1 = make_taxamatch_hash 'major'
111
111
  s2 = make_taxamatch_hash 'major'
112
- @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>0}
112
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
113
113
  #Phonetic match always works
114
114
  s1 = make_taxamatch_hash 'xanteriiiiiiii'
115
115
  s2 = make_taxamatch_hash 'zantereeeeeeee'
116
- @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>5}
116
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
117
117
  #Phonetic match works with different endings
118
118
  s1 = make_taxamatch_hash 'majorum'
119
119
  s2 = make_taxamatch_hash 'majoris'
120
- @tm.match_species(s1, s2).should == {:phonetic_match=>true, :match=>true, :edit_distance=>2}
120
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
121
121
  #Distance 4 matches if first 3 chars are the same
122
122
  s1 = make_taxamatch_hash 'majorrrrr'
123
123
  s2 = make_taxamatch_hash 'majoraaaa'
124
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>4}
124
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
125
125
  #Should not match if Distance 4 matches and first 3 chars are not the same
126
126
  s1 = make_taxamatch_hash 'majorrrrr'
127
127
  s2 = make_taxamatch_hash 'marorraaa'
128
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>4}
128
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
129
129
  #Distance 2 or 3 matches if first 1 char is the same
130
130
  s1 = make_taxamatch_hash 'morrrr'
131
131
  s2 = make_taxamatch_hash 'moraaa'
132
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>3}
132
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
133
133
  #Should not match if Distance 2 or 3 and first 1 char is not the same
134
134
  s1 = make_taxamatch_hash 'morrrr'
135
135
  s2 = make_taxamatch_hash 'torraa'
136
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>3}
136
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
137
137
  #Distance 1 will match anywhere
138
138
  s1 = make_taxamatch_hash 'major'
139
139
  s2 = make_taxamatch_hash 'rajor'
140
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>true, :edit_distance=>1}
140
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 1}
141
141
  #Will not match if distance 3 and length is less then twice of the edit distance
142
142
  s1 = make_taxamatch_hash 'marrr'
143
143
  s2 = make_taxamatch_hash 'maaaa'
144
- @tm.match_species(s1, s2).should == {:phonetic_match=>false, :match=>false, :edit_distance=>3}
144
+ @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 3}
145
145
  end
146
146
 
147
147
  it 'should match mathes' do
148
148
  #No trobule case
149
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
150
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
151
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>2, :match=>true}
149
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
150
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
151
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match' => true, 'edit_distance' => 2, 'match' => true}
152
152
  #Will not match if either genus or sp. epithet dont match
153
- gmatch = {:match => false, :phonetic_match => false, :edit_distance => 1}
154
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
155
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>false}
156
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
157
- smatch = {:match => false, :phonetic_match => false, :edit_distance => 1}
158
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>false}
153
+ gmatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
154
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
155
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
156
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
157
+ smatch = {'match' => false, 'phonetic_match' => false, 'edit_distance' => 1}
158
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>false}
159
159
  #Should not match if binomial edit distance > 4 NOTE: EVEN with full phonetic match
160
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 3}
161
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
162
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>5, :match=>false}
160
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 3}
161
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
162
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>5, 'match'=>false}
163
163
  #Should not have phonetic match if one of the components does not match phonetically
164
- gmatch = {:match => true, :phonetic_match => false, :edit_distance => 1}
165
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
166
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>true}
167
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 1}
168
- smatch = {:match => true, :phonetic_match => false, :edit_distance => 1}
169
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>false, :edit_distance=>2, :match=>true}
164
+ gmatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
165
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
166
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
167
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 1}
168
+ smatch = {'match' => true, 'phonetic_match' => false, 'edit_distance' => 1}
169
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>false, 'edit_distance'=>2, 'match'=>true}
170
170
  #edit distance should be equal the sum of of edit distances
171
- gmatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
172
- smatch = {:match => true, :phonetic_match => true, :edit_distance => 2}
173
- @tm.match_matches(gmatch, smatch).should == {:phonetic_match=>true, :edit_distance=>4, :match=>true}
171
+ gmatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
172
+ smatch = {'match' => true, 'phonetic_match' => true, 'edit_distance' => 2}
173
+ @tm.match_matches(gmatch, smatch).should == {'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true}
174
174
  end
175
175
 
176
176
  describe 'Taxamatch::Authmatch' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dimus-taxamatch_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-08-08 00:00:00 -07:00
12
+ date: 2009-08-09 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency