taxamatch_rb 0.6.3 → 0.6.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/taxamatch_rb.rb +6 -5
- data/spec/taxamatch_rb_spec.rb +14 -14
- data/spec/taxamatch_test.txt +2 -1
- metadata +3 -3
data/lib/taxamatch_rb.rb
CHANGED
@@ -7,7 +7,6 @@ require 'taxamatch_rb/atomizer'
|
|
7
7
|
require 'taxamatch_rb/normalizer'
|
8
8
|
require 'taxamatch_rb/phonetizer'
|
9
9
|
require 'taxamatch_rb/authmatch'
|
10
|
-
require 'ruby-debug'
|
11
10
|
|
12
11
|
$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
|
13
12
|
|
@@ -66,27 +65,29 @@ module Taxamatch
|
|
66
65
|
def match_genera(genus1, genus2)
|
67
66
|
genus1_length = genus1[:normalized].size
|
68
67
|
genus2_length = genus2[:normalized].size
|
68
|
+
min_length = [genus1_length, genus2_length].min
|
69
69
|
match = false
|
70
70
|
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],1,3) #TODO put block = 2
|
71
|
-
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/
|
71
|
+
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
|
72
72
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
73
73
|
|
74
|
-
match = true if ed <= 3 && (
|
74
|
+
match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
|
75
75
|
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
76
76
|
end
|
77
77
|
|
78
78
|
def match_species(sp1, sp2)
|
79
79
|
sp1_length = sp1[:normalized].size
|
80
80
|
sp2_length = sp2[:normalized].size
|
81
|
+
min_length = [sp1_length, sp2_length].min
|
81
82
|
sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
|
82
83
|
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
83
84
|
match = false
|
84
85
|
ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4
|
85
|
-
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/
|
86
|
+
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.3334
|
86
87
|
#puts 's: %s, %s, %s' % [sp1[:normalized], sp2[:normalized], ed]
|
87
88
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
|
88
89
|
|
89
|
-
match = true if ed <= 4 && (
|
90
|
+
match = true if ed <= 4 && (min_length >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
90
91
|
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
91
92
|
end
|
92
93
|
|
data/spec/taxamatch_rb_spec.rb
CHANGED
@@ -59,7 +59,7 @@ describe 'Taxamatch::Base' do
|
|
59
59
|
if y
|
60
60
|
y[2] = y[2] == 'true' ? true : false
|
61
61
|
res = @tm.taxamatch(y[0], y[1], false)
|
62
|
-
|
62
|
+
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
63
63
|
res['match'].should == y[2]
|
64
64
|
res['edit_distance'].should == y[3].to_i
|
65
65
|
end
|
@@ -89,16 +89,16 @@ describe 'Taxamatch::Base' do
|
|
89
89
|
g2 = make_taxamatch_hash 'Pantheri'
|
90
90
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
91
91
|
#phonetic match trumps everything
|
92
|
-
g1 = make_taxamatch_hash '
|
93
|
-
g2 = make_taxamatch_hash '
|
92
|
+
g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
|
93
|
+
g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
|
94
94
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
|
95
95
|
#same first letter and distance 2 should match
|
96
|
-
g1 = make_taxamatch_hash '
|
97
|
-
g2 = make_taxamatch_hash '
|
96
|
+
g1 = make_taxamatch_hash 'Xaaaantherii'
|
97
|
+
g2 = make_taxamatch_hash 'Xaaaantherrr'
|
98
98
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
|
99
99
|
#First letter is the same and distance is 3 should match, no phonetic match
|
100
|
-
g1 = make_taxamatch_hash '
|
101
|
-
g2 = make_taxamatch_hash '
|
100
|
+
g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
|
101
|
+
g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
|
102
102
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
103
103
|
#Should not match if one of the words is shorter than 2x edit distance and distance is 2 or 3
|
104
104
|
g1 = make_taxamatch_hash 'Xant'
|
@@ -116,24 +116,24 @@ describe 'Taxamatch::Base' do
|
|
116
116
|
s2 = make_taxamatch_hash 'major'
|
117
117
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
|
118
118
|
#Phonetic match always works
|
119
|
-
s1 = make_taxamatch_hash '
|
120
|
-
s2 = make_taxamatch_hash '
|
121
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' =>
|
119
|
+
s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
|
120
|
+
s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
|
121
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 4}
|
122
122
|
#Phonetic match works with different endings
|
123
123
|
s1 = make_taxamatch_hash 'majorum'
|
124
124
|
s2 = make_taxamatch_hash 'majoris'
|
125
125
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
|
126
126
|
#Distance 4 matches if first 3 chars are the same
|
127
|
-
s1 = make_taxamatch_hash '
|
128
|
-
s2 = make_taxamatch_hash '
|
127
|
+
s1 = make_taxamatch_hash 'majjjjorrrrr'
|
128
|
+
s2 = make_taxamatch_hash 'majjjjoraaaa'
|
129
129
|
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
|
130
130
|
#Should not match if Distance 4 matches and first 3 chars are not the same
|
131
131
|
s1 = make_taxamatch_hash 'majorrrrr'
|
132
132
|
s2 = make_taxamatch_hash 'marorraaa'
|
133
133
|
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
134
134
|
#Distance 2 or 3 matches if first 1 char is the same
|
135
|
-
s1 = make_taxamatch_hash '
|
136
|
-
s2 = make_taxamatch_hash '
|
135
|
+
s1 = make_taxamatch_hash 'moooorrrr'
|
136
|
+
s2 = make_taxamatch_hash 'mooooraaa'
|
137
137
|
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
138
138
|
#Should not match if Distance 2 or 3 and first 1 char is not the same
|
139
139
|
s1 = make_taxamatch_hash 'morrrr'
|
data/spec/taxamatch_test.txt
CHANGED
@@ -9,6 +9,7 @@ Pomatomus|Pomatomas|true|1
|
|
9
9
|
Pomatomus L.|Pomatomas Linn.|true|1
|
10
10
|
Pomatomus Ber|Pomatomas Linn|false|1
|
11
11
|
Pomatomus L. 1753|Pomatomus Linn. 1800|false|0
|
12
|
+
Patella|Abbella|false|3
|
12
13
|
|
13
14
|
## additional authorship should match
|
14
15
|
Puma concolor|Puma concolor L.|true|0
|
@@ -17,7 +18,7 @@ Puma concolor|Puma concolor L.|true|0
|
|
17
18
|
Puma concolor|Puma cancolor|true|1
|
18
19
|
#
|
19
20
|
Pomatomus saltatrix|Pomatomus saltratix|true|2
|
20
|
-
Pomatomus saltator|Pomatomus saltatrix|
|
21
|
+
Pomatomus saltator|Pomatomus saltatrix|false|3 #!!!
|
21
22
|
#
|
22
23
|
Loligo pealeii|Loligo plei|false|3
|
23
24
|
#
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 6
|
8
|
-
-
|
9
|
-
version: 0.6.
|
8
|
+
- 4
|
9
|
+
version: 0.6.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Dmitry Mozzherin
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-04-08 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|