taxamatch_rb 0.6.3 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/taxamatch_rb.rb CHANGED
@@ -7,7 +7,6 @@ require 'taxamatch_rb/atomizer'
7
7
  require 'taxamatch_rb/normalizer'
8
8
  require 'taxamatch_rb/phonetizer'
9
9
  require 'taxamatch_rb/authmatch'
10
- require 'ruby-debug'
11
10
 
12
11
  $KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
13
12
 
@@ -66,27 +65,29 @@ module Taxamatch
66
65
  def match_genera(genus1, genus2)
67
66
  genus1_length = genus1[:normalized].size
68
67
  genus2_length = genus2[:normalized].size
68
+ min_length = [genus1_length, genus2_length].min
69
69
  match = false
70
70
  ed = @dlm.distance(genus1[:normalized], genus2[:normalized],1,3) #TODO put block = 2
71
- return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/[genus1_length, genus2_length].min > 0.2
71
+ return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
72
72
  return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
73
73
 
74
- match = true if ed <= 3 && ([genus1_length, genus2_length].min > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
74
+ match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
75
75
  {'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
76
76
  end
77
77
 
78
78
  def match_species(sp1, sp2)
79
79
  sp1_length = sp1[:normalized].size
80
80
  sp2_length = sp2[:normalized].size
81
+ min_length = [sp1_length, sp2_length].min
81
82
  sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
82
83
  sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
83
84
  match = false
84
85
  ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4
85
- return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/[sp1_length, sp2_length].min > 0.3334
86
+ return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.3334
86
87
  #puts 's: %s, %s, %s' % [sp1[:normalized], sp2[:normalized], ed]
87
88
  return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
88
89
 
89
- match = true if ed <= 4 && ([sp1_length, sp2_length].min >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
90
+ match = true if ed <= 4 && (min_length >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
90
91
  { 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
91
92
  end
92
93
 
@@ -59,7 +59,7 @@ describe 'Taxamatch::Base' do
59
59
  if y
60
60
  y[2] = y[2] == 'true' ? true : false
61
61
  res = @tm.taxamatch(y[0], y[1], false)
62
- #puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
62
+ puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
63
63
  res['match'].should == y[2]
64
64
  res['edit_distance'].should == y[3].to_i
65
65
  end
@@ -89,16 +89,16 @@ describe 'Taxamatch::Base' do
89
89
  g2 = make_taxamatch_hash 'Pantheri'
90
90
  @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
91
91
  #phonetic match tramps everything
92
- g1 = make_taxamatch_hash 'Xantheriiiiiiiiiiiiiii'
93
- g2 = make_taxamatch_hash 'Zanthery'
92
+ g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
93
+ g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
94
94
  @tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
95
95
  #same first letter and distance 2 should match
96
- g1 = make_taxamatch_hash 'Xantherii'
97
- g2 = make_taxamatch_hash 'Xantherrr'
96
+ g1 = make_taxamatch_hash 'Xaaaantherii'
97
+ g2 = make_taxamatch_hash 'Xaaaantherrr'
98
98
  @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
99
99
  #First letter is the same and distance is 3 should match, no phonetic match
100
- g1 = make_taxamatch_hash 'Xantheriii'
101
- g2 = make_taxamatch_hash 'Xantherrrr'
100
+ g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
101
+ g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
102
102
  @tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
103
103
  #Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
104
104
  g1 = make_taxamatch_hash 'Xant'
@@ -116,24 +116,24 @@ describe 'Taxamatch::Base' do
116
116
  s2 = make_taxamatch_hash 'major'
117
117
  @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
118
118
  #Phonetic match always works
119
- s1 = make_taxamatch_hash 'xanteriiiiiiii'
120
- s2 = make_taxamatch_hash 'zantereeeeeeee'
121
- @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 5}
119
+ s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
120
+ s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
121
+ @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 4}
122
122
  #Phonetic match works with different endings
123
123
  s1 = make_taxamatch_hash 'majorum'
124
124
  s2 = make_taxamatch_hash 'majoris'
125
125
  @tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
126
126
  #Distance 4 matches if first 3 chars are the same
127
- s1 = make_taxamatch_hash 'majorrrrr'
128
- s2 = make_taxamatch_hash 'majoraaaa'
127
+ s1 = make_taxamatch_hash 'majjjjorrrrr'
128
+ s2 = make_taxamatch_hash 'majjjjoraaaa'
129
129
  @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
130
130
  #Should not match if Distance 4 matches and first 3 chars are not the same
131
131
  s1 = make_taxamatch_hash 'majorrrrr'
132
132
  s2 = make_taxamatch_hash 'marorraaa'
133
133
  @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
134
134
  #Distance 2 or 3 matches if first 1 char is the same
135
- s1 = make_taxamatch_hash 'morrrr'
136
- s2 = make_taxamatch_hash 'moraaa'
135
+ s1 = make_taxamatch_hash 'moooorrrr'
136
+ s2 = make_taxamatch_hash 'mooooraaa'
137
137
  @tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
138
138
  #Should not match if Distance 2 or 3 and first 1 char is not the same
139
139
  s1 = make_taxamatch_hash 'morrrr'
@@ -9,6 +9,7 @@ Pomatomus|Pomatomas|true|1
9
9
  Pomatomus L.|Pomatomas Linn.|true|1
10
10
  Pomatomus Ber|Pomatomas Linn|false|1
11
11
  Pomatomus L. 1753|Pomatomus Linn. 1800|false|0
12
+ Patella|Abbella|false|3
12
13
 
13
14
  ## additional authorship should match
14
15
  Puma concolor|Puma concolor L.|true|0
@@ -17,7 +18,7 @@ Puma concolor|Puma concolor L.|true|0
17
18
  Puma concolor|Puma cancolor|true|1
18
19
  #
19
20
  Pomatomus saltatrix|Pomatomus saltratix|true|2
20
- Pomatomus saltator|Pomatomus saltatrix|true|3
21
+ Pomatomus saltator|Pomatomus saltatrix|false|3 #!!!
21
22
  #
22
23
  Loligo pealeii|Loligo plei|false|3
23
24
  #
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 6
8
- - 3
9
- version: 0.6.3
8
+ - 4
9
+ version: 0.6.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Dmitry Mozzherin
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-03-30 00:00:00 -04:00
17
+ date: 2010-04-08 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency