taxamatch_rb 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +2 -0
- data/lib/taxamatch_rb/base.rb +50 -42
- data/lib/taxamatch_rb/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 83617a85524edc8d1a9615a12b955a4a01ce8a0a
|
4
|
+
data.tar.gz: 480ac61418d818621282531ce74680bb99bdce19
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 15373025eec71b3dfe679406f3a12e74aac836536c616d0d37196b6f253760ffb91124ffb4d0c86dc7c37d406a6ed038cd459137024ac52f1ca7b10491890096
|
7
|
+
data.tar.gz: 38f1a2cd62d2271151b698529a17d1eeac07317baf0bca91523569e465c383e1d6644aca42d8ba0cc41ee6603a3c7e94c468c51c8fec32773a998b9f3328e2d4
|
data/CHANGELOG
CHANGED
data/lib/taxamatch_rb/base.rb
CHANGED
@@ -1,31 +1,37 @@
|
|
1
1
|
module Taxamatch
|
2
|
-
|
2
|
+
|
3
3
|
class Base
|
4
|
+
|
4
5
|
def initialize
|
5
6
|
@parser = Taxamatch::Atomizer.new
|
6
7
|
@dlm = DamerauLevenshtein
|
7
8
|
end
|
8
9
|
|
10
|
+
|
11
|
+
# takes two scientific names and returns true
|
12
|
+
# if names match and false if they don't
|
9
13
|
def taxamatch(str1, str2, return_boolean = true)
|
10
14
|
preparsed_1 = @parser.parse(str1)
|
11
15
|
preparsed_2 = @parser.parse(str2)
|
12
|
-
match = taxamatch_preparsed(preparsed_1, preparsed_2)
|
13
|
-
return_boolean ? (!!match && match[
|
16
|
+
match = taxamatch_preparsed(preparsed_1, preparsed_2) rescue nil
|
17
|
+
return_boolean ? (!!match && match['match']) : match
|
14
18
|
end
|
15
19
|
|
20
|
+
# takes two hashes of parsed scientific names, analyses them and
|
21
|
+
# returns back this function is useful when species strings are preparsed.
|
16
22
|
def taxamatch_preparsed(preparsed_1, preparsed_2)
|
17
23
|
result = nil
|
18
24
|
if preparsed_1[:uninomial] && preparsed_2[:uninomial]
|
19
25
|
result = match_uninomial(preparsed_1, preparsed_2)
|
20
|
-
|
26
|
+
end
|
27
|
+
if preparsed_1[:genus] && preparsed_2[:genus]
|
21
28
|
result = match_multinomial(preparsed_1, preparsed_2)
|
22
29
|
end
|
23
|
-
if result && result[
|
24
|
-
result[
|
30
|
+
if result && result['match']
|
31
|
+
result['match'] = match_authors(preparsed_1, preparsed_2) == -1 ?
|
32
|
+
false : true
|
25
33
|
end
|
26
|
-
result
|
27
|
-
rescue StandardError
|
28
|
-
nil
|
34
|
+
return result
|
29
35
|
end
|
30
36
|
|
31
37
|
def match_uninomial(preparsed_1, preparsed_2)
|
@@ -47,17 +53,17 @@ module Taxamatch
|
|
47
53
|
match_hash = match_matches(gen_match, sp_match, infrasp_match)
|
48
54
|
elsif (preparsed_1[:infraspecies] && !preparsed_2[:infraspecies]) ||
|
49
55
|
(!preparsed_1[:infraspecies] && preparsed_2[:infraspecies])
|
50
|
-
match_hash = {
|
51
|
-
|
52
|
-
|
56
|
+
match_hash = { 'match' => false,
|
57
|
+
'edit_distance' => 5,
|
58
|
+
'phonetic_match' => false }
|
53
59
|
total_length += preparsed_1[:infraspecies] ?
|
54
60
|
preparsed_1[:infraspecies][0][:string].size :
|
55
61
|
preparsed_2[:infraspecies][0][:string].size
|
56
62
|
else
|
57
63
|
match_hash = match_matches(gen_match, sp_match)
|
58
64
|
end
|
59
|
-
match_hash.merge({
|
60
|
-
(1 - match_hash[
|
65
|
+
match_hash.merge({ 'score' =>
|
66
|
+
(1 - match_hash['edit_distance']/(total_length/2)) })
|
61
67
|
match_hash
|
62
68
|
end
|
63
69
|
|
@@ -67,22 +73,22 @@ module Taxamatch
|
|
67
73
|
opts = { with_phonetic_match: true }.merge(opts)
|
68
74
|
min_length = [genus1_length, genus2_length].min
|
69
75
|
unless opts[:with_phonetic_match]
|
70
|
-
genus1[:phonetized] =
|
71
|
-
genus2[:phonetized] =
|
76
|
+
genus1[:phonetized] = 'A'
|
77
|
+
genus2[:phonetized] = 'B'
|
72
78
|
end
|
73
79
|
match = false
|
74
80
|
ed = @dlm.distance(genus1[:normalized],
|
75
81
|
genus2[:normalized], 1, 3) #TODO put block = 2
|
76
|
-
return {
|
77
|
-
|
78
|
-
|
79
|
-
return {
|
80
|
-
|
81
|
-
|
82
|
+
return { 'edit_distance' => ed,
|
83
|
+
'phonetic_match' => false,
|
84
|
+
'match' => false } if ed/min_length.to_f > 0.2
|
85
|
+
return { 'edit_distance' => ed,
|
86
|
+
'phonetic_match' => true,
|
87
|
+
'match' => true } if genus1[:phonetized] == genus2[:phonetized]
|
82
88
|
|
83
89
|
match = true if ed <= 3 && (min_length > ed * 2) &&
|
84
90
|
(ed < 2 || genus1[0] == genus2[0])
|
85
|
-
{
|
91
|
+
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false }
|
86
92
|
end
|
87
93
|
|
88
94
|
def match_species(sp1, sp2, opts = {})
|
@@ -91,26 +97,26 @@ module Taxamatch
|
|
91
97
|
opts = { with_phonetic_match: true }.merge(opts)
|
92
98
|
min_length = [sp1_length, sp2_length].min
|
93
99
|
unless opts[:with_phonetic_match]
|
94
|
-
sp1[:phonetized] =
|
95
|
-
sp2[:phonetized] =
|
100
|
+
sp1[:phonetized] = 'A'
|
101
|
+
sp2[:phonetized] = 'B'
|
96
102
|
end
|
97
103
|
sp1[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp1[:phonetized]
|
98
104
|
sp2[:phonetized] = Taxamatch::Phonetizer.normalize_ending sp2[:phonetized]
|
99
105
|
match = false
|
100
106
|
ed = @dlm.distance(sp1[:normalized],
|
101
107
|
sp2[:normalized], 1, 4) #TODO put block 4
|
102
|
-
return {
|
103
|
-
|
104
|
-
|
105
|
-
return {
|
106
|
-
|
107
|
-
|
108
|
+
return { 'edit_distance' => ed,
|
109
|
+
'phonetic_match' => false,
|
110
|
+
'match' => false } if ed/min_length.to_f > 0.3334
|
111
|
+
return {'edit_distance' => ed,
|
112
|
+
'phonetic_match' => true,
|
113
|
+
'match' => true} if sp1[:phonetized] == sp2[:phonetized]
|
108
114
|
|
109
115
|
match = true if ed <= 4 &&
|
110
116
|
(min_length >= ed * 2) &&
|
111
117
|
(ed < 2 || sp1[:normalized][0] == sp2[:normalized][0]) &&
|
112
118
|
(ed < 4 || sp1[:normalized][0...3] == sp2[:normalized][0...3])
|
113
|
-
{
|
119
|
+
{ 'edit_distance' => ed, 'match' => match, 'phonetic_match' => false }
|
114
120
|
end
|
115
121
|
|
116
122
|
def match_authors(preparsed_1, preparsed_2)
|
@@ -130,25 +136,27 @@ module Taxamatch
|
|
130
136
|
au2 = p2[:normalized_authors]
|
131
137
|
yr1 = p1[:years]
|
132
138
|
yr2 = p2[:years]
|
133
|
-
return
|
139
|
+
return 0 if au1.empty? || au2.empty?
|
134
140
|
score = Taxamatch::Authmatch.authmatch(au1, au2, yr1, yr2)
|
135
|
-
score == 0 ?
|
141
|
+
score == 0 ? -1 : 1
|
136
142
|
end
|
137
143
|
|
138
144
|
def match_matches(genus_match, species_match, infraspecies_match = nil)
|
139
145
|
match = species_match
|
140
146
|
if infraspecies_match
|
141
|
-
match[
|
142
|
-
match[
|
143
|
-
match[
|
147
|
+
match['edit_distance'] += infraspecies_match['edit_distance']
|
148
|
+
match['match'] &&= infraspecies_match['match']
|
149
|
+
match['phonetic_match'] &&= infraspecies_match['phonetic_match']
|
144
150
|
end
|
145
|
-
match[
|
146
|
-
if match[
|
147
|
-
match[
|
151
|
+
match['edit_distance'] += genus_match['edit_distance']
|
152
|
+
if match['edit_distance'] > (infraspecies_match ? 6 : 4)
|
153
|
+
match['match'] = false
|
148
154
|
end
|
149
|
-
match[
|
150
|
-
match[
|
155
|
+
match['match'] &&= genus_match['match']
|
156
|
+
match['phonetic_match'] &&= genus_match['phonetic_match']
|
151
157
|
match
|
152
158
|
end
|
159
|
+
|
153
160
|
end
|
154
161
|
end
|
162
|
+
|
data/lib/taxamatch_rb/version.rb
CHANGED