dwc_agent 3.3.0.0 → 3.3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +2 -3
- data/lib/dwc_agent/constants.rb +115 -113
- data/lib/dwc_agent/version.rb +5 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39a9ead6ab8410e39f29e6bf5b841214042fbf47a336c0e4c5dfaa4b0d11b87d
|
4
|
+
data.tar.gz: 763e722a2f28c765660573b6a0f2895e5c4c25550e8c927f5a85fa6e533f3800
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e5e35467255e9e5dfc029a02757387099d3edc4ef385e4d73785c02ca74b0ffc8be7d87a60ec34f9fd22e80d10747bf357e4951b35d415bdc099b2ec24973178
|
7
|
+
data.tar.gz: 5726c9073b61195aa4e5a9f8d5861c044c4392e42b0c2ec114013cb3881362ff7b30927847b352152fb5659dcc8ca2aaf48c728a3bd6ab0059b1126f56659fd0
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -126,9 +126,8 @@ module DwcAgent
|
|
126
126
|
end
|
127
127
|
|
128
128
|
if parsed_namae.family.nil? &&
|
129
|
-
!parsed_namae.given.nil?
|
130
|
-
|
131
|
-
parsed_namae.family = parsed_namae.given
|
129
|
+
!parsed_namae.given.nil?
|
130
|
+
parsed_namae.family = parsed_namae.given.delete_suffix(".")
|
132
131
|
parsed_namae.given = nil
|
133
132
|
end
|
134
133
|
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -10,7 +10,7 @@ module DwcAgent
|
|
10
10
|
[,]?\s*\#*\s+\d+\-(?i:[A-Z]|\d)+\-?\d*[A-Za-z]*\z|
|
11
11
|
\d*[A-Za-z]*\d*-\d*\z|
|
12
12
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
13
|
-
[,;\s]
|
13
|
+
[,;\s]+(?:et\.?\s+al|&\s+al)l?\.?|
|
14
14
|
\b[,;]?\s*(?i:etal)\.?|
|
15
15
|
\b[,;]?\s*(?i:et.al)\.?|
|
16
16
|
\b\s+(bis|ter)(\b|\z)|
|
@@ -32,7 +32,7 @@ module DwcAgent
|
|
32
32
|
^(?i:collection)\:?\s+|\s*(?i:collection)\s*$|
|
33
33
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
34
34
|
(?i:contactid)|
|
35
|
-
^(?i:dupl)[.,]
|
35
|
+
^(?i:dupl)[.,]+|
|
36
36
|
\b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
|
37
37
|
[,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
|
38
38
|
May|Jun|Jul|Aug|Sept?|
|
@@ -141,42 +141,42 @@ module DwcAgent
|
|
141
141
|
}x
|
142
142
|
|
143
143
|
SPLIT_BY = %r{
|
144
|
-
[;,]{2,}|
|
145
|
-
[–|ǀ∣|│&+\/;:]|
|
146
|
-
\s+-\s
|
147
|
-
\s+a\.\s
|
148
|
-
\b(con|e|y|i|en|et|or|per|for|und)\s*\b|
|
149
|
-
\b(?i:and|with)\s*\b|
|
150
|
-
\b(?i:annotated(\s+by)?)\s*\b|
|
151
|
-
\b(?i:coll\.)\s*\b|
|
152
|
-
\b(?i:comm\.?)\s*\b|
|
153
|
-
\b(?i:communicate?d(\s+to)?)\s*\b|
|
154
|
-
\b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
|
155
|
-
\b(?i:confirmada)(\s+por)?\s*\b|
|
156
|
-
\b(?i:checked?(\s+by)?)\s*\b|
|
157
|
-
\b(?i:det\.?(\s+by)?)\s*\b|
|
158
|
-
\b(?i:(donated)?\s*by)\s
|
159
|
-
\b(?i:dupl?[.,]?(\s+by)?|duplicate(\s+by)?)\s*\b|
|
160
|
-
\b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
|
161
|
-
\b(?i:in?dentified(\s+by)?)\s*\b|
|
162
|
-
\b(?i:in\s+coll\.?\s*\b)|
|
163
|
-
\b(?i:in\s+part(\s+by)?)\s*\b|
|
164
|
-
\b(?i:och)\s*\b|
|
165
|
-
\b(?i:prep\.?\s+(?i:by)?)\s*\b|
|
166
|
-
\b(?i:purchased?)(\s+by)?\s*\b|
|
167
|
-
\b(?i:redet\.?(\s+by?)?)\s*\b|
|
168
|
-
\b(?i:reidentified(\s+by)?)\s*\b|
|
169
|
-
\b(?i:stet)\s*\b|
|
170
|
-
\b(?i:then(\s+by)?)\s
|
171
|
-
\b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
|
172
|
-
\b(?i:via|from)\s*\b
|
144
|
+
[;,]{2,} | # Multiple semicolons or commas
|
145
|
+
[–|ǀ∣|│&+\/;:] | # Various separators
|
146
|
+
\s+-\s+ | # Dash surrounded by spaces
|
147
|
+
\s+a\.\s+ | # "a." surrounded by spaces
|
148
|
+
\b(con|e|y|i|en|et|or|per|for|und)\s*\b | # Short conjunctions or prepositions
|
149
|
+
\b(?i:and|with)\s*\b | # Case-insensitive "and", "with"
|
150
|
+
\b(?i:annotated(\s+by)?)\s*\b | # "annotated (by)"
|
151
|
+
\b(?i:coll\.)\s*\b | # "coll."
|
152
|
+
\b(?i:comm\.?)\s*\b | # "comm."
|
153
|
+
\b(?i:communicate?d(\s+to)?)\s*\b | # "communicated (to)"
|
154
|
+
\b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b | # "conf.", "confirmed (by)"
|
155
|
+
\b(?i:confirmada)(\s+por)?\s*\b | # "confirmada (por)"
|
156
|
+
\b(?i:checked?(\s+by)?)\s*\b | # "checked (by)"
|
157
|
+
\b(?i:det\.?(\s+by)?)\s*\b | # "det."
|
158
|
+
\b(?i:(donated)?\s*by)\s+ | # "donated by"
|
159
|
+
\b(?i:dupl?[.,]?(\s+by)?|duplicate(\s+by)?)\s*\b | # "dupl.", "duplicate"
|
160
|
+
\b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b | # "ex.", "examined (by)"
|
161
|
+
\b(?i:in?dentified(\s+by)?)\s*\b | # "identified (by)"
|
162
|
+
\b(?i:in\s+coll\.?\s*\b) | # "in coll."
|
163
|
+
\b(?i:in\s+part(\s+by)?)\s*\b | # "in part (by)"
|
164
|
+
\b(?i:och)\s*\b | # "och"
|
165
|
+
\b(?i:prep\.?\s+(?i:by)?)\s*\b | # "prep. by"
|
166
|
+
\b(?i:purchased?)(\s+by)?\s*\b | # "purchased (by)"
|
167
|
+
\b(?i:redet\.?(\s+by?)?)\s*\b | # "redet."
|
168
|
+
\b(?i:reidentified(\s+by)?)\s*\b | # "reidentified"
|
169
|
+
\b(?i:stet)\s*\b | # "stet"
|
170
|
+
\b(?i:then(\s+by)?)\s+ | # "then (by)"
|
171
|
+
\b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b | # "verif."
|
172
|
+
\b(?i:via|from)\s*\b # "via", "from"
|
173
173
|
}x
|
174
174
|
|
175
175
|
POST_STRIP_TIDY = %r{
|
176
|
-
^\s*[&,;.]\s
|
177
|
-
[\[\]]|
|
178
|
-
^[`'".,!?]
|
179
|
-
[`'",]
|
176
|
+
^\s*[&,;.]\s* | # Optional leading whitespace, then a single &, comma, ; or ., plus any whitespace after it
|
177
|
+
[\[\]] | # Any standalone square brackets
|
178
|
+
^[`'".,!?]+ | # Leading repeated punctuation (` ' " . , ! ?)
|
179
|
+
[`'",]+$ # Trailing repeated punctuation (` ' " ,)
|
180
180
|
}x
|
181
181
|
|
182
182
|
CHAR_SUBS = {
|
@@ -225,93 +225,95 @@ module DwcAgent
|
|
225
225
|
}
|
226
226
|
|
227
227
|
SEPARATORS = {
|
228
|
-
"^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]
|
228
|
+
"^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]+)$" => "\\2 \\3 \\1",
|
229
229
|
"^(Mrs?\\.?)\\s+&\\s+(Mrs?\\.?)\\s+(.*)$" => "\\1 \\3 | \\2 \\3",
|
230
|
-
"^([A-Z]{1}\\.\\s*[[:alpha:]]
|
231
|
-
"^(\\S{4,},\\s+(?:\\S\\.\\s*)
|
230
|
+
"^([A-Z]{1}\\.\\s*[[:alpha:]]+),\\s*?([A-Z.]+)$" => "\\1 \\2",
|
231
|
+
"^(\\S{4,},\\s+(?:\\S\\.\\s*)+)\\s+(\\S{4,},\\s+(?:\\S\.\\s*)+)$" => "\\1 | \\2",
|
232
232
|
"(\\S{1}\\.)([[:alpha:]]{2,})" => "\\1 \\2",
|
233
|
-
"^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?)
|
233
|
+
"^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?)+)$" => "\\1, \\2",
|
234
234
|
"([[:alpha:]]*),?\\s*(.*)\\s+(van|von|v\\.|v(a|o)n\\s+der?)$" => "\\3 \\1, \\2",
|
235
|
-
"^((?i:[A-Z]\\.\\s?)
|
236
|
-
"^((?i:[A-Z]\\.\\s?)
|
235
|
+
"^((?i:[A-Z]\\.\\s?)+)\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})\\s+([[:alpha:]’`'-]{2,})$" => "\\1 \\4 | \\2 \\3 \\4",
|
236
|
+
"^((?i:[A-Z]\\.\\s?)+)\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
|
237
237
|
"^([A-Z]{1,3})\\s+(?:and|&|et|e)\\s+([A-Z]{1,3})\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
|
238
|
-
"^((?i:[A-Z]\\.\\s?)
|
238
|
+
"^((?i:[A-Z]\\.\\s?)+),\\s+([A-Z.\\s]+)\\s+(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\4 | \\2 \\4 | \\3 \\4 | \\5",
|
239
239
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{2,})$" => "\\1 | \\2 | \\3",
|
240
240
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{3,})$" => "\\1 | \\2 | \\3 | \\4",
|
241
241
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{3,})$" => "\\1 | \\2 | \\3 | \\4 | \\5"
|
242
242
|
}
|
243
243
|
|
244
244
|
BLACKLIST = %r{
|
245
|
-
(?i:
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
245
|
+
(?i:
|
246
|
+
abundant |
|
247
|
+
adult | juvenile |
|
248
|
+
administra(?:d|t)or |
|
249
|
+
^anon$ |
|
250
|
+
australian? |
|
251
|
+
average |
|
252
|
+
believe | unclear | ill?egible | suggested | (dis)?agrees? | approach |
|
253
|
+
\bnone\b |
|
254
|
+
barcod |
|
255
|
+
bgwd |
|
256
|
+
(biolog|botan|zoo|ecolog|mycol|(?:in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture) |
|
257
|
+
(bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america) |
|
258
|
+
carex | salix |
|
259
|
+
catalog(?:ue)? |
|
260
|
+
conservator |
|
261
|
+
(herbarium|herbier|collection|collected|publication|specimen|species|describe|an(?:a|o)morph|isolated|recorded|inspection|define|status|lighthouse) |
|
262
|
+
\bhelp\b |
|
263
|
+
data\s+not\s+captured |
|
264
|
+
(description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect) |
|
265
|
+
desconocido |
|
266
|
+
exc(?:s?icc?at(?:a|i)) |
|
267
|
+
evidence |
|
268
|
+
exporter |
|
269
|
+
foundation |
|
270
|
+
ichthyology |
|
271
|
+
inconn?u |
|
272
|
+
(internation|gou?vern|ministry|extension|unit|district|provincial|na(?:c|t)ional|military|region|environ|natur(?:e|al)|naturelles|division|program|direction) |
|
273
|
+
label |
|
274
|
+
o\.?m\.?n\.?r\.? |
|
275
|
+
measurement |
|
276
|
+
ent(?:o|y)mology |
|
277
|
+
malacology |
|
278
|
+
geographic |
|
279
|
+
(mus(?:eum|ée)|universit(?:y|é|e|at)|college|institute?|acad(?:e|é)m|school|écol(?:e|iers?)|laboratoi?r|project|polytech|dep(?:t|artment)|research|clinic|hospital|cientifica|sanctuary|safari) |
|
280
|
+
univ\. |
|
281
|
+
\b(graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fisherm(?:a|e)n|police|taxonomist|consultant|participant(?:es)?|team|(?:é|e)quipe|memb(?:er|re)|crew|group|personnel|staff|family|captain|friends|assistant|worker|gamekeeper)\b |
|
282
|
+
non\s+pr(?:é|e)cis(?:é|e) |
|
283
|
+
no\s+consta |
|
284
|
+
no\s+(agent\s+)?(?:data|disponible)(?:\s+available)? |
|
285
|
+
not?\s+(entered|stated) |
|
286
|
+
nomenclatur(?:e|al)\s+adjustment |
|
287
|
+
not\s+available |
|
288
|
+
(ontario|qu(?:e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?) |
|
289
|
+
popa\s+observers? |
|
290
|
+
recreation | culture |
|
291
|
+
renseigné |
|
292
|
+
(shaped|dark|pale|areas|phase|spotting|interior|between|closer) |
|
293
|
+
soci(?:e|é)t(?:y|é) | cent(?:er|re) | community | history | conservation | conference | assoc | commission | consortium | council | club | exposit | alliance | protective | circle |
|
294
|
+
^class\b |
|
295
|
+
commercial | control | product |
|
296
|
+
^company\b |
|
297
|
+
sequence\s+data |
|
298
|
+
size | large | colou?r |
|
299
|
+
skeleton |
|
300
|
+
survey | assessment | station | monitor | stn\. | project | engine | (e|é)x?chang(?:e|é)s? | ex(?:c|k)urs(?:e|o|ó)n? | exped\.? | exp(?:e|i)di(?:c|t)i(?:e|o|ó)n? | experiment | explora(?:d|t) | festival | generation | inventory | marine | service |
|
301
|
+
^index\b |
|
302
|
+
submersible |
|
303
|
+
synonymy? |
|
304
|
+
systematic | perspective |
|
305
|
+
^(?:off|too|the)\b |
|
306
|
+
taxiderm(?:ies|y) |
|
307
|
+
though |
|
308
|
+
texas\s+instruments?(?:\s+for)? |
|
309
|
+
tropical |
|
310
|
+
toward | seen\s+at |
|
311
|
+
unidentified | unspecified | unk?nown? | unnamed | unread | unmistak | no agent |
|
312
|
+
urn: |
|
313
|
+
usda | ucla |
|
314
|
+
workshop | garden | farm | jardin | public |
|
315
|
+
^de$
|
316
|
+
)
|
315
317
|
}x
|
316
318
|
|
317
319
|
FAMILY_GREENLIST = [
|
@@ -402,7 +404,7 @@ module DwcAgent
|
|
402
404
|
|
403
405
|
APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
404
406
|
|
405
|
-
SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/
|
407
|
+
SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|ESQ|esq|[IVX]{2,})(\.|\b)/
|
406
408
|
|
407
409
|
PARTICLES = [
|
408
410
|
"ap",
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.3.
|
4
|
+
version: 3.3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-06-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|