dwc_agent 0.4.1 → 1.2.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +11 -7
- data/lib/dwc_agent/constants.rb +27 -7
- data/lib/dwc_agent/parser.rb +1 -1
- data/lib/dwc_agent/version.rb +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dcdfb4922038d07715bd064b74a516203bcce44ef824683ac9766586764bb98f
|
4
|
+
data.tar.gz: d9113f199abc420d7c608bfbead6e70f1f6192437a82e43541f77eb187752c2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 403049856dbfbc83c984b5175615718c738b46312ecced4d3b5853fb4584a080d1e868492ee476187c8cf7238481845c0f77bd181e6ceaeebc22bae8b7aef127
|
7
|
+
data.tar.gz: 01e8bc4beb84140a80763c6c0dc8ff6ee6f967e8382cbacf04a230aa3405f2394ec1a90570471e113d90264c1e202c5452428858eb5aff6bc499f4e795dc3158
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -16,13 +16,9 @@ module DwcAgent
|
|
16
16
|
# @param parsed_namae [Object] the namae object
|
17
17
|
# @return [Hash] the given, family hash
|
18
18
|
def clean(parsed_namae)
|
19
|
-
blank_name = { given: nil, family: nil }
|
19
|
+
blank_name = { given: nil, family: nil, particle: nil }
|
20
20
|
|
21
|
-
if parsed_namae.family && FAMILY_BLACKLIST.
|
22
|
-
return blank_name
|
23
|
-
end
|
24
|
-
|
25
|
-
if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
|
21
|
+
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
26
22
|
return blank_name
|
27
23
|
end
|
28
24
|
|
@@ -99,6 +95,10 @@ module DwcAgent
|
|
99
95
|
particle = nil
|
100
96
|
end
|
101
97
|
|
98
|
+
if !particle.nil? && particle.include?(".")
|
99
|
+
particle = nil
|
100
|
+
end
|
101
|
+
|
102
102
|
if !family.nil? && (family == family.upcase || family == family.downcase)
|
103
103
|
family = NameCase(family)
|
104
104
|
end
|
@@ -111,7 +111,11 @@ module DwcAgent
|
|
111
111
|
return blank_name
|
112
112
|
end
|
113
113
|
|
114
|
-
|
114
|
+
if !family.nil? && FAMILY_BLACKLIST.any?{ |s| s.casecmp(family) == 0 }
|
115
|
+
return blank_name
|
116
|
+
end
|
117
|
+
|
118
|
+
{ given: given, family: family, particle: particle }
|
115
119
|
end
|
116
120
|
|
117
121
|
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -20,7 +20,6 @@ module DwcAgent
|
|
20
20
|
\b[,;]?\s*(?i:person\s*string)\b|
|
21
21
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
22
22
|
\b[,;]?\s*(?i:colln?)[:.]?(\b|\z)|
|
23
|
-
(?i:no\s+(data|disponible))|
|
24
23
|
\b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
|
25
24
|
[,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
|
26
25
|
May|Jun|Jul|Aug|Sept?|
|
@@ -78,10 +77,11 @@ module DwcAgent
|
|
78
77
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
79
78
|
\b(?i:to\s+(sub)?spp?)\.?|
|
80
79
|
(?i:nom\.?\s+rev\.?)|
|
81
|
-
FNA|DAO|HUH|FDNMB|
|
80
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
82
81
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
83
82
|
(?i:university|museum|exhibits?)|
|
84
83
|
(?i:uqam)|
|
84
|
+
(?i:sem\s+(colec?tor|data))|
|
85
85
|
\b[,;]\s+\d+\z|
|
86
86
|
["!@?]|
|
87
87
|
[,]?\d+|
|
@@ -147,7 +147,8 @@ module DwcAgent
|
|
147
147
|
'{' => '',
|
148
148
|
'}' => '',
|
149
149
|
'@' => '',
|
150
|
-
'%' => ''
|
150
|
+
'%' => '',
|
151
|
+
'\\' => ''
|
151
152
|
}
|
152
153
|
|
153
154
|
PHRASE_SUBS = {
|
@@ -158,12 +159,23 @@ module DwcAgent
|
|
158
159
|
}
|
159
160
|
|
160
161
|
COMPLEX_SEPARATORS = %r{
|
161
|
-
^(
|
162
|
+
^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
|
162
163
|
}x
|
163
164
|
|
165
|
+
# Was used in 1.1.0 but it sunk performance so threw it back to a WIP
|
166
|
+
#
|
167
|
+
# @contracted_list_regex = Regexp.new CONTRACTED_LIST.to_s
|
168
|
+
#
|
169
|
+
# name.gsub!(@contracted_list_regex, '\1 \3 | \2 \3')
|
170
|
+
#
|
171
|
+
# CONTRACTED_LIST = %r{
|
172
|
+
# ^(\S{1,}\.?)+\s+(?i:and|&)\s+(\S{1,}\.?)+\s*(.*)$
|
173
|
+
# }x
|
174
|
+
|
164
175
|
BLACKLIST = %r{
|
165
176
|
(?i:abundant)|
|
166
177
|
(?i:adult|juvenile)|
|
178
|
+
(?i:administra(d|t)or)|
|
167
179
|
(?i:anon)|
|
168
180
|
(?i:australian?)|
|
169
181
|
(?i:average)|
|
@@ -172,9 +184,10 @@ module DwcAgent
|
|
172
184
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
173
185
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
174
186
|
(?i:carex|salix)|
|
175
|
-
(
|
187
|
+
(?i:catalog(ue)?)|
|
176
188
|
(?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
|
177
189
|
\b\s*(?i:help)\s*\b|
|
190
|
+
(?i:data\s+not\s+captured)|
|
178
191
|
(?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
|
179
192
|
(?i:desconocido)|
|
180
193
|
(?i:exc?s?icc?at(a|i))|
|
@@ -192,7 +205,9 @@ module DwcAgent
|
|
192
205
|
(?i:univ\.)|
|
193
206
|
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
194
207
|
(?i:non\s+pr(é|e)cis(é|e))|
|
195
|
-
(?i:
|
208
|
+
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
209
|
+
(?i:not?\s+(entered|stated))|
|
210
|
+
(?i:nomenclatur(e|al)\s+adjustment)|
|
196
211
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
197
212
|
(?i:recreation|culture)|
|
198
213
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
@@ -220,7 +235,12 @@ module DwcAgent
|
|
220
235
|
FAMILY_BLACKLIST = [
|
221
236
|
"der",
|
222
237
|
"van",
|
223
|
-
"von"
|
238
|
+
"von",
|
239
|
+
"the",
|
240
|
+
"of",
|
241
|
+
"curators",
|
242
|
+
"nomenclatural",
|
243
|
+
"adjustment"
|
224
244
|
]
|
225
245
|
|
226
246
|
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -20,7 +20,7 @@ module DwcAgent
|
|
20
20
|
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join('\\')].to_s
|
21
21
|
@phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
|
-
@add_separators_regex = Regexp.new %r{(
|
23
|
+
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
24
24
|
end
|
25
25
|
|
26
26
|
# Parses the passed-in string and returns a list of names.
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|