dwc_agent 0.4.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +8 -8
- data/lib/dwc_agent/constants.rb +13 -7
- data/lib/dwc_agent/parser.rb +1 -1
- data/lib/dwc_agent/version.rb +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c3bcc45d8ecae4211cd0f678727d1002ed54ca998934d0f72f458762b21152f1
|
4
|
+
data.tar.gz: 6857f25e128b7a5a62e95bc36367679356ef9321b65e783b3ea04a7e0501afa5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b5489f5f4d6473de3054377c7884b37cbb1ce8982f68a4e52ea39cf68ec59945a8708a90400b0aa8c509505bd5c48cef2ab4a99e19d1661af21fd6cab349a0b2
|
7
|
+
data.tar.gz: a27efb2e8aa6c9183d473217e6d50ac309163cebf12c7d191f878f28838c006cd6132c5ccd2ec71b76e108be100492332edd93917adb2ee32417d5e9f31a87ea
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -16,13 +16,9 @@ module DwcAgent
|
|
16
16
|
# @param parsed_namae [Object] the namae object
|
17
17
|
# @return [Hash] the given, family hash
|
18
18
|
def clean(parsed_namae)
|
19
|
-
blank_name = { given: nil, family: nil }
|
19
|
+
blank_name = { given: nil, family: nil, particle: nil }
|
20
20
|
|
21
|
-
if parsed_namae.family && FAMILY_BLACKLIST.
|
22
|
-
return blank_name
|
23
|
-
end
|
24
|
-
|
25
|
-
if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
|
21
|
+
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
26
22
|
return blank_name
|
27
23
|
end
|
28
24
|
|
@@ -99,6 +95,10 @@ module DwcAgent
|
|
99
95
|
particle = nil
|
100
96
|
end
|
101
97
|
|
98
|
+
if !particle.nil? && particle.include?(".")
|
99
|
+
particle = nil
|
100
|
+
end
|
101
|
+
|
102
102
|
if !family.nil? && (family == family.upcase || family == family.downcase)
|
103
103
|
family = NameCase(family)
|
104
104
|
end
|
@@ -111,11 +111,11 @@ module DwcAgent
|
|
111
111
|
return blank_name
|
112
112
|
end
|
113
113
|
|
114
|
-
if !family.nil? && FAMILY_BLACKLIST.
|
114
|
+
if !family.nil? && FAMILY_BLACKLIST.any?{ |s| s.casecmp(family) == 0 }
|
115
115
|
return blank_name
|
116
116
|
end
|
117
117
|
|
118
|
-
{ given: given, family: family }
|
118
|
+
{ given: given, family: family, particle: particle }
|
119
119
|
end
|
120
120
|
|
121
121
|
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -20,7 +20,6 @@ module DwcAgent
|
|
20
20
|
\b[,;]?\s*(?i:person\s*string)\b|
|
21
21
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
22
22
|
\b[,;]?\s*(?i:colln?)[:.]?(\b|\z)|
|
23
|
-
(?i:no\s+(data|disponible))|
|
24
23
|
\b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
|
25
24
|
[,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
|
26
25
|
May|Jun|Jul|Aug|Sept?|
|
@@ -78,10 +77,11 @@ module DwcAgent
|
|
78
77
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
79
78
|
\b(?i:to\s+(sub)?spp?)\.?|
|
80
79
|
(?i:nom\.?\s+rev\.?)|
|
81
|
-
FNA|DAO|HUH|FDNMB|
|
80
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
82
81
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
83
82
|
(?i:university|museum|exhibits?)|
|
84
83
|
(?i:uqam)|
|
84
|
+
(?i:sem\s+(colec?tor|data))|
|
85
85
|
\b[,;]\s+\d+\z|
|
86
86
|
["!@?]|
|
87
87
|
[,]?\d+|
|
@@ -159,12 +159,13 @@ module DwcAgent
|
|
159
159
|
}
|
160
160
|
|
161
161
|
COMPLEX_SEPARATORS = %r{
|
162
|
-
^(
|
162
|
+
^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
|
163
163
|
}x
|
164
164
|
|
165
165
|
BLACKLIST = %r{
|
166
166
|
(?i:abundant)|
|
167
167
|
(?i:adult|juvenile)|
|
168
|
+
(?i:administra(d|t)or)|
|
168
169
|
(?i:anon)|
|
169
170
|
(?i:australian?)|
|
170
171
|
(?i:average)|
|
@@ -173,9 +174,10 @@ module DwcAgent
|
|
173
174
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
174
175
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
175
176
|
(?i:carex|salix)|
|
176
|
-
(
|
177
|
+
(?i:catalog(ue)?)|
|
177
178
|
(?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
|
178
179
|
\b\s*(?i:help)\s*\b|
|
180
|
+
(?i:data\s+not\s+captured)|
|
179
181
|
(?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
|
180
182
|
(?i:desconocido)|
|
181
183
|
(?i:exc?s?icc?at(a|i))|
|
@@ -193,7 +195,9 @@ module DwcAgent
|
|
193
195
|
(?i:univ\.)|
|
194
196
|
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
195
197
|
(?i:non\s+pr(é|e)cis(é|e))|
|
196
|
-
(?i:
|
198
|
+
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
199
|
+
(?i:not?\s+(entered|stated))|
|
200
|
+
(?i:nomenclatur(e|al)\s+adjustment)|
|
197
201
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
198
202
|
(?i:recreation|culture)|
|
199
203
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
@@ -223,8 +227,10 @@ module DwcAgent
|
|
223
227
|
"van",
|
224
228
|
"von",
|
225
229
|
"the",
|
226
|
-
"
|
227
|
-
"
|
230
|
+
"of",
|
231
|
+
"curators",
|
232
|
+
"nomenclatural",
|
233
|
+
"adjustment"
|
228
234
|
]
|
229
235
|
|
230
236
|
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -20,7 +20,7 @@ module DwcAgent
|
|
20
20
|
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join('\\')].to_s
|
21
21
|
@phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
|
-
@add_separators_regex = Regexp.new %r{(
|
23
|
+
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
24
24
|
end
|
25
25
|
|
26
26
|
# Parses the passed-in string and returns a list of names.
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|