taxamatch_rb 0.6.3 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/taxamatch_rb.rb +6 -5
- data/spec/taxamatch_rb_spec.rb +14 -14
- data/spec/taxamatch_test.txt +2 -1
- metadata +3 -3
data/lib/taxamatch_rb.rb
CHANGED
|
@@ -7,7 +7,6 @@ require 'taxamatch_rb/atomizer'
|
|
|
7
7
|
require 'taxamatch_rb/normalizer'
|
|
8
8
|
require 'taxamatch_rb/phonetizer'
|
|
9
9
|
require 'taxamatch_rb/authmatch'
|
|
10
|
-
require 'ruby-debug'
|
|
11
10
|
|
|
12
11
|
$KCODE='u' if RUBY_VERSION.split('.')[1].to_i < 9
|
|
13
12
|
|
|
@@ -66,27 +65,29 @@ module Taxamatch
|
|
|
66
65
|
def match_genera(genus1, genus2)
|
|
67
66
|
genus1_length = genus1[:normalized].size
|
|
68
67
|
genus2_length = genus2[:normalized].size
|
|
68
|
+
min_length = [genus1_length, genus2_length].min
|
|
69
69
|
match = false
|
|
70
70
|
ed = @dlm.distance(genus1[:normalized], genus2[:normalized],1,3) #TODO put block = 2
|
|
71
|
-
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/
|
|
71
|
+
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.2
|
|
72
72
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if genus1[:phonetized] == genus2[:phonetized]
|
|
73
73
|
|
|
74
|
-
match = true if ed <= 3 && (
|
|
74
|
+
match = true if ed <= 3 && (min_length > ed * 2) && (ed < 2 || genus1[0] == genus2[0])
|
|
75
75
|
{'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
|
76
76
|
end
|
|
77
77
|
|
|
78
78
|
def match_species(sp1, sp2)
|
|
79
79
|
sp1_length = sp1[:normalized].size
|
|
80
80
|
sp2_length = sp2[:normalized].size
|
|
81
|
+
min_length = [sp1_length, sp2_length].min
|
|
81
82
|
sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
|
|
82
83
|
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
|
83
84
|
match = false
|
|
84
85
|
ed = @dlm.distance(sp1[:normalized], sp2[:normalized], 1, 4) #TODO put block 4
|
|
85
|
-
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/
|
|
86
|
+
return {'edit_distance' => ed, 'phonetic_match' => false, 'match' => false} if ed/min_length.to_f > 0.3334
|
|
86
87
|
#puts 's: %s, %s, %s' % [sp1[:normalized], sp2[:normalized], ed]
|
|
87
88
|
return {'edit_distance' => ed, 'phonetic_match' => true, 'match' => true} if sp1[:phonetized] == sp2[:phonetized]
|
|
88
89
|
|
|
89
|
-
match = true if ed <= 4 && (
|
|
90
|
+
match = true if ed <= 4 && (min_length >= ed * 2) && (ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) && (ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
|
90
91
|
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false}
|
|
91
92
|
end
|
|
92
93
|
|
data/spec/taxamatch_rb_spec.rb
CHANGED
|
@@ -59,7 +59,7 @@ describe 'Taxamatch::Base' do
|
|
|
59
59
|
if y
|
|
60
60
|
y[2] = y[2] == 'true' ? true : false
|
|
61
61
|
res = @tm.taxamatch(y[0], y[1], false)
|
|
62
|
-
|
|
62
|
+
puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
|
63
63
|
res['match'].should == y[2]
|
|
64
64
|
res['edit_distance'].should == y[3].to_i
|
|
65
65
|
end
|
|
@@ -89,16 +89,16 @@ describe 'Taxamatch::Base' do
|
|
|
89
89
|
g2 = make_taxamatch_hash 'Pantheri'
|
|
90
90
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'edit_distance' => 1, 'match' => true}
|
|
91
91
|
#phonetic match tramps everything
|
|
92
|
-
g1 = make_taxamatch_hash '
|
|
93
|
-
g2 = make_taxamatch_hash '
|
|
92
|
+
g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
|
|
93
|
+
g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
|
|
94
94
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => true, 'edit_distance' => 4, 'match' => true}
|
|
95
95
|
#same first letter and distance 2 should match
|
|
96
|
-
g1 = make_taxamatch_hash '
|
|
97
|
-
g2 = make_taxamatch_hash '
|
|
96
|
+
g1 = make_taxamatch_hash 'Xaaaantherii'
|
|
97
|
+
g2 = make_taxamatch_hash 'Xaaaantherrr'
|
|
98
98
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 2}
|
|
99
99
|
#First letter is the same and distance is 3 should match, no phonetic match
|
|
100
|
-
g1 = make_taxamatch_hash '
|
|
101
|
-
g2 = make_taxamatch_hash '
|
|
100
|
+
g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
|
|
101
|
+
g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
|
|
102
102
|
@tm.match_genera(g1, g2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
|
103
103
|
#Should not match if one of words is shorter than 2x edit distance and distance is 2 or 3
|
|
104
104
|
g1 = make_taxamatch_hash 'Xant'
|
|
@@ -116,24 +116,24 @@ describe 'Taxamatch::Base' do
|
|
|
116
116
|
s2 = make_taxamatch_hash 'major'
|
|
117
117
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 0}
|
|
118
118
|
#Phonetic match always works
|
|
119
|
-
s1 = make_taxamatch_hash '
|
|
120
|
-
s2 = make_taxamatch_hash '
|
|
121
|
-
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' =>
|
|
119
|
+
s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
|
|
120
|
+
s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
|
|
121
|
+
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 4}
|
|
122
122
|
#Phonetic match works with different endings
|
|
123
123
|
s1 = make_taxamatch_hash 'majorum'
|
|
124
124
|
s2 = make_taxamatch_hash 'majoris'
|
|
125
125
|
@tm.match_species(s1, s2).should == {'phonetic_match' => true, 'match' => true, 'edit_distance' => 2}
|
|
126
126
|
#Distance 4 matches if first 3 chars are the same
|
|
127
|
-
s1 = make_taxamatch_hash '
|
|
128
|
-
s2 = make_taxamatch_hash '
|
|
127
|
+
s1 = make_taxamatch_hash 'majjjjorrrrr'
|
|
128
|
+
s2 = make_taxamatch_hash 'majjjjoraaaa'
|
|
129
129
|
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 4}
|
|
130
130
|
#Should not match if Distance 4 matches and first 3 chars are not the same
|
|
131
131
|
s1 = make_taxamatch_hash 'majorrrrr'
|
|
132
132
|
s2 = make_taxamatch_hash 'marorraaa'
|
|
133
133
|
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => false, 'edit_distance' => 4}
|
|
134
134
|
#Distance 2 or 3 matches if first 1 char is the same
|
|
135
|
-
s1 = make_taxamatch_hash '
|
|
136
|
-
s2 = make_taxamatch_hash '
|
|
135
|
+
s1 = make_taxamatch_hash 'moooorrrr'
|
|
136
|
+
s2 = make_taxamatch_hash 'mooooraaa'
|
|
137
137
|
@tm.match_species(s1, s2).should == {'phonetic_match' => false, 'match' => true, 'edit_distance' => 3}
|
|
138
138
|
#Should not match if Distance 2 or 3 and first 1 char is not the same
|
|
139
139
|
s1 = make_taxamatch_hash 'morrrr'
|
data/spec/taxamatch_test.txt
CHANGED
|
@@ -9,6 +9,7 @@ Pomatomus|Pomatomas|true|1
|
|
|
9
9
|
Pomatomus L.|Pomatomas Linn.|true|1
|
|
10
10
|
Pomatomus Ber|Pomatomas Linn|false|1
|
|
11
11
|
Pomatomus L. 1753|Pomatomus Linn. 1800|false|0
|
|
12
|
+
Patella|Abbella|false|3
|
|
12
13
|
|
|
13
14
|
## additional authorship should match
|
|
14
15
|
Puma concolor|Puma concolor L.|true|0
|
|
@@ -17,7 +18,7 @@ Puma concolor|Puma concolor L.|true|0
|
|
|
17
18
|
Puma concolor|Puma cancolor|true|1
|
|
18
19
|
#
|
|
19
20
|
Pomatomus saltatrix|Pomatomus saltratix|true|2
|
|
20
|
-
Pomatomus saltator|Pomatomus saltatrix|
|
|
21
|
+
Pomatomus saltator|Pomatomus saltatrix|false|3 #!!!
|
|
21
22
|
#
|
|
22
23
|
Loligo pealeii|Loligo plei|false|3
|
|
23
24
|
#
|
metadata
CHANGED
|
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
|
5
5
|
segments:
|
|
6
6
|
- 0
|
|
7
7
|
- 6
|
|
8
|
-
-
|
|
9
|
-
version: 0.6.
|
|
8
|
+
- 4
|
|
9
|
+
version: 0.6.4
|
|
10
10
|
platform: ruby
|
|
11
11
|
authors:
|
|
12
12
|
- Dmitry Mozzherin
|
|
@@ -14,7 +14,7 @@ autorequire:
|
|
|
14
14
|
bindir: bin
|
|
15
15
|
cert_chain: []
|
|
16
16
|
|
|
17
|
-
date: 2010-
|
|
17
|
+
date: 2010-04-08 00:00:00 -04:00
|
|
18
18
|
default_executable:
|
|
19
19
|
dependencies:
|
|
20
20
|
- !ruby/object:Gem::Dependency
|