taxamatch_rb 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +2 -0
- data/lib/taxamatch_rb/base.rb +50 -42
- data/lib/taxamatch_rb/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 83617a85524edc8d1a9615a12b955a4a01ce8a0a
|
4
|
+
data.tar.gz: 480ac61418d818621282531ce74680bb99bdce19
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 15373025eec71b3dfe679406f3a12e74aac836536c616d0d37196b6f253760ffb91124ffb4d0c86dc7c37d406a6ed038cd459137024ac52f1ca7b10491890096
|
7
|
+
data.tar.gz: 38f1a2cd62d2271151b698529a17d1eeac07317baf0bca91523569e465c383e1d6644aca42d8ba0cc41ee6603a3c7e94c468c51c8fec32773a998b9f3328e2d4
|
data/CHANGELOG
CHANGED
data/lib/taxamatch_rb/base.rb
CHANGED
@@ -1,31 +1,37 @@
|
|
1
1
|
module Taxamatch
|
2
|
-
|
2
|
+
|
3
3
|
class Base
|
4
|
+
|
4
5
|
def initialize
|
5
6
|
@parser = Taxamatch::Atomizer.new
|
6
7
|
@dlm = DamerauLevenshtein
|
7
8
|
end
|
8
9
|
|
10
|
+
|
11
|
+
# takes two scientific names and returns true
|
12
|
+
# if names match and false if they don't
|
9
13
|
def taxamatch(str1, str2, return_boolean = true)
|
10
14
|
preparsed_1 = @parser.parse(str1)
|
11
15
|
preparsed_2 = @parser.parse(str2)
|
12
|
-
match = taxamatch_preparsed(preparsed_1, preparsed_2)
|
13
|
-
return_boolean ? (!!match && match[
|
16
|
+
match = taxamatch_preparsed(preparsed_1, preparsed_2) rescue nil
|
17
|
+
return_boolean ? (!!match && match['match']) : match
|
14
18
|
end
|
15
19
|
|
20
|
+
# takes two hashes of parsed scientific names, analyses them and
|
21
|
+
# returns the result. This function is useful when species strings are preparsed.
|
16
22
|
def taxamatch_preparsed(preparsed_1, preparsed_2)
|
17
23
|
result = nil
|
18
24
|
if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
19
25
|
result = match_uninomial(preparsed_1, preparsed_2)
|
20
|
-
|
26
|
+
end
|
27
|
+
if preparsed_1[:genus] && preparsed_2[:genus]
|
21
28
|
result = match_multinomial(preparsed_1, preparsed_2)
|
22
29
|
end
|
23
|
-
if result && result[
|
24
|
-
result[
|
30
|
+
if result && result['match']
|
31
|
+
result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ?
|
32
|
+
false : true
|
25
33
|
end
|
26
|
-
result
|
27
|
-
rescue StandardError
|
28
|
-
nil
|
34
|
+
return result
|
29
35
|
end
|
30
36
|
|
31
37
|
def match_uninomial(preparsed_1, preparsed_2)
|
@@ -47,17 +53,17 @@ module Taxamatch
|
|
47
53
|
match_hash = match_matches(gen_match, sp_match, infrasp_match)
|
48
54
|
elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) ||
|
49
55
|
(!preparsed_1[:infraspecies] && preparsed_2[:infraspecies])
|
50
|
-
match_hash = {
|
51
|
-
|
52
|
-
|
56
|
+
match_hash = { 'match' => false,
|
57
|
+
'edit_distance' => 5,
|
58
|
+
'phonetic_match' => false }
|
53
59
|
total_length += preparsed_1[:infraspecies] ?
|
54
60
|
preparsed_1[:infraspecies][0][:string].size :
|
55
61
|
preparsed_2[:infraspecies][0][:string].size
|
56
62
|
else
|
57
63
|
match_hash = match_matches(gen_match, sp_match)
|
58
64
|
end
|
59
|
-
match_hash.merge({
|
60
|
-
(1 - match_hash[
|
65
|
+
match_hash.merge({ 'score' =>
|
66
|
+
(1 - match_hash['edit_distance']/(total_length/2)) })
|
61
67
|
match_hash
|
62
68
|
end
|
63
69
|
|
@@ -67,22 +73,22 @@ module Taxamatch
|
|
67
73
|
opts = { with_phonetic_match: true }.merge(opts)
|
68
74
|
min_length = [genus1_length, genus2_length].min
|
69
75
|
unless opts[:with_phonetic_match]
|
70
|
-
genus1[:phonetized] =
|
71
|
-
genus2[:phonetized] =
|
76
|
+
genus1[:phonetized] = 'A'
|
77
|
+
genus2[:phonetized] = 'B'
|
72
78
|
end
|
73
79
|
match = false
|
74
80
|
ed = @dlm.distance(genus1[:normalized],
|
75
81
|
genus2[:normalized], 1, 3) #TODO put block = 2
|
76
|
-
return {
|
77
|
-
|
78
|
-
|
79
|
-
return {
|
80
|
-
|
81
|
-
|
82
|
+
return { 'edit_distance' => ed,
|
83
|
+
'phonetic_match' => false,
|
84
|
+
'match' => false } if ed/min_length.to_f > 0.2
|
85
|
+
return { 'edit_distance' => ed,
|
86
|
+
'phonetic_match' => true,
|
87
|
+
'match' => true } if genus1[:phonetized] == genus2[:phonetized]
|
82
88
|
|
83
89
|
match = true if ed <= 3 && (min_length > ed * 2) &&
|
84
90
|
(ed < 2 || genus1[0] == genus2[0])
|
85
|
-
{
|
91
|
+
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false }
|
86
92
|
end
|
87
93
|
|
88
94
|
def match_species(sp1, sp2, opts = {})
|
@@ -91,26 +97,26 @@ module Taxamatch
|
|
91
97
|
opts = { with_phonetic_match: true }.merge(opts)
|
92
98
|
min_length = [sp1_length, sp2_length].min
|
93
99
|
unless opts[:with_phonetic_match]
|
94
|
-
sp1[:phonetized] =
|
95
|
-
sp2[:phonetized] =
|
100
|
+
sp1[:phonetized] = 'A'
|
101
|
+
sp2[:phonetized] = 'B'
|
96
102
|
end
|
97
103
|
sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
|
98
104
|
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
99
105
|
match = false
|
100
106
|
ed = @dlm.distance(sp1[:normalized],
|
101
107
|
sp2[:normalized], 1, 4) #TODO put block 4
|
102
|
-
return {
|
103
|
-
|
104
|
-
|
105
|
-
return {
|
106
|
-
|
107
|
-
|
108
|
+
return { 'edit_distance' => ed,
|
109
|
+
'phonetic_match' => false,
|
110
|
+
'match' => false } if ed/min_length.to_f > 0.3334
|
111
|
+
return {'edit_distance' => ed,
|
112
|
+
'phonetic_match' => true,
|
113
|
+
'match' => true} if sp1[:phonetized] == sp2[:phonetized]
|
108
114
|
|
109
115
|
match = true if ed <= 4 &&
|
110
116
|
(min_length >= ed * 2) &&
|
111
117
|
(ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) &&
|
112
118
|
(ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
113
|
-
{
|
119
|
+
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false }
|
114
120
|
end
|
115
121
|
|
116
122
|
def match_authors(preparsed_1, preparsed_2)
|
@@ -130,25 +136,27 @@ module Taxamatch
|
|
130
136
|
au2 = p2[:normalized_authors]
|
131
137
|
yr1 = p1[:years]
|
132
138
|
yr2 = p2[:years]
|
133
|
-
return
|
139
|
+
return 0 if au1.empty? || au2.empty?
|
134
140
|
score = Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
|
135
|
-
score == 0 ?
|
141
|
+
score == 0 ? -1 : 1
|
136
142
|
end
|
137
143
|
|
138
144
|
def match_matches(genus_match, species_match, infraspecies_match = nil)
|
139
145
|
match = species_match
|
140
146
|
if infraspecies_match
|
141
|
-
match[
|
142
|
-
match[
|
143
|
-
match[
|
147
|
+
match['edit_distance'] += infraspecies_match['edit_distance']
|
148
|
+
match['match'] &&= infraspecies_match['match']
|
149
|
+
match['phonetic_match'] &&= infraspecies_match['phonetic_match']
|
144
150
|
end
|
145
|
-
match[
|
146
|
-
if match[
|
147
|
-
match[
|
151
|
+
match['edit_distance'] += genus_match['edit_distance']
|
152
|
+
if match['edit_distance'] > (infraspecies_match ? 6 : 4)
|
153
|
+
match['match'] = false
|
148
154
|
end
|
149
|
-
match[
|
150
|
-
match[
|
155
|
+
match['match'] &&= genus_match['match']
|
156
|
+
match['phonetic_match'] &&= genus_match['phonetic_match']
|
151
157
|
match
|
152
158
|
end
|
159
|
+
|
153
160
|
end
|
154
161
|
end
|
162
|
+
|
data/lib/taxamatch_rb/version.rb
CHANGED