dwc_agent 0.4.0 → 1.1.0
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +11 -3
- data/lib/dwc_agent/constants.rb +21 -6
- data/lib/dwc_agent/parser.rb +3 -1
- data/lib/dwc_agent/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d56e8cb49d4857e324dbdda7d6cfa31e9a7a3079c40a12659ff6077d13308613
|
4
|
+
data.tar.gz: 7652bc849261aa4179a99e64a725a0f82d0b65b4d2e2885042ad35521887e705
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 99d384a56c180c8c5db64c5cc8cf072da272b427830875783a22d67279cd070af3bd0bd2b881e4a20018bdeba774d27e8adb8ae048668c28452171017b72c70b
|
7
|
+
data.tar.gz: edc59989136745e0b603039d08420b368ef92e95a0ab17b48bcabf439472c1252ec763f4f9c7a398b735995b340f897dc0f3d85a45c2ea1c9ccec643f050e904
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -16,9 +16,9 @@ module DwcAgent
|
|
16
16
|
# @param parsed_namae [Object] the namae object
|
17
17
|
# @return [Hash] the given, family hash
|
18
18
|
def clean(parsed_namae)
|
19
|
-
blank_name = { given: nil, family: nil }
|
19
|
+
blank_name = { given: nil, family: nil, particle: nil }
|
20
20
|
|
21
|
-
if parsed_namae.family && FAMILY_BLACKLIST.
|
21
|
+
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
22
22
|
return blank_name
|
23
23
|
end
|
24
24
|
|
@@ -99,6 +99,10 @@ module DwcAgent
|
|
99
99
|
particle = nil
|
100
100
|
end
|
101
101
|
|
102
|
+
if !particle.nil? && particle.include?(".")
|
103
|
+
particle = nil
|
104
|
+
end
|
105
|
+
|
102
106
|
if !family.nil? && (family == family.upcase || family == family.downcase)
|
103
107
|
family = NameCase(family)
|
104
108
|
end
|
@@ -111,7 +115,11 @@ module DwcAgent
|
|
111
115
|
return blank_name
|
112
116
|
end
|
113
117
|
|
114
|
-
|
118
|
+
if !family.nil? && FAMILY_BLACKLIST.any?{ |s| s.casecmp(family) == 0 }
|
119
|
+
return blank_name
|
120
|
+
end
|
121
|
+
|
122
|
+
{ given: given, family: family, particle: particle }
|
115
123
|
end
|
116
124
|
|
117
125
|
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -20,7 +20,6 @@ module DwcAgent
|
|
20
20
|
\b[,;]?\s*(?i:person\s*string)\b|
|
21
21
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
22
22
|
\b[,;]?\s*(?i:colln?)[:.]?(\b|\z)|
|
23
|
-
(?i:no\s+(data|disponible))|
|
24
23
|
\b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
|
25
24
|
[,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
|
26
25
|
May|Jun|Jul|Aug|Sept?|
|
@@ -78,10 +77,11 @@ module DwcAgent
|
|
78
77
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
79
78
|
\b(?i:to\s+(sub)?spp?)\.?|
|
80
79
|
(?i:nom\.?\s+rev\.?)|
|
81
|
-
FNA|DAO|HUH|FDNMB|
|
80
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
82
81
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
83
82
|
(?i:university|museum|exhibits?)|
|
84
83
|
(?i:uqam)|
|
84
|
+
(?i:sem\s+(colec?tor|data))|
|
85
85
|
\b[,;]\s+\d+\z|
|
86
86
|
["!@?]|
|
87
87
|
[,]?\d+|
|
@@ -147,7 +147,8 @@ module DwcAgent
|
|
147
147
|
'{' => '',
|
148
148
|
'}' => '',
|
149
149
|
'@' => '',
|
150
|
-
'%' => ''
|
150
|
+
'%' => '',
|
151
|
+
'\\' => ''
|
151
152
|
}
|
152
153
|
|
153
154
|
PHRASE_SUBS = {
|
@@ -158,12 +159,17 @@ module DwcAgent
|
|
158
159
|
}
|
159
160
|
|
160
161
|
COMPLEX_SEPARATORS = %r{
|
161
|
-
^(
|
162
|
+
^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
|
163
|
+
}x
|
164
|
+
|
165
|
+
CONTRACTED_LIST = %r{
|
166
|
+
^(\S{1,}\.?)+\s+(?i:and|&)\s+(\S{1,}\.?)+\s*(.*)$
|
162
167
|
}x
|
163
168
|
|
164
169
|
BLACKLIST = %r{
|
165
170
|
(?i:abundant)|
|
166
171
|
(?i:adult|juvenile)|
|
172
|
+
(?i:administra(d|t)or)|
|
167
173
|
(?i:anon)|
|
168
174
|
(?i:australian?)|
|
169
175
|
(?i:average)|
|
@@ -172,9 +178,10 @@ module DwcAgent
|
|
172
178
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
173
179
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
174
180
|
(?i:carex|salix)|
|
175
|
-
(
|
181
|
+
(?i:catalog(ue)?)|
|
176
182
|
(?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
|
177
183
|
\b\s*(?i:help)\s*\b|
|
184
|
+
(?i:data\s+not\s+captured)|
|
178
185
|
(?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
|
179
186
|
(?i:desconocido)|
|
180
187
|
(?i:exc?s?icc?at(a|i))|
|
@@ -192,6 +199,9 @@ module DwcAgent
|
|
192
199
|
(?i:univ\.)|
|
193
200
|
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
194
201
|
(?i:non\s+pr(é|e)cis(é|e))|
|
202
|
+
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
203
|
+
(?i:not?\s+(entered|stated))|
|
204
|
+
(?i:nomenclatur(e|al)\s+adjustment)|
|
195
205
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
196
206
|
(?i:recreation|culture)|
|
197
207
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
@@ -219,7 +229,12 @@ module DwcAgent
|
|
219
229
|
FAMILY_BLACKLIST = [
|
220
230
|
"der",
|
221
231
|
"van",
|
222
|
-
"von"
|
232
|
+
"von",
|
233
|
+
"the",
|
234
|
+
"of",
|
235
|
+
"curators",
|
236
|
+
"nomenclatural",
|
237
|
+
"adjustment"
|
223
238
|
]
|
224
239
|
|
225
240
|
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -20,7 +20,8 @@ module DwcAgent
|
|
20
20
|
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join('\\')].to_s
|
21
21
|
@phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
|
-
@add_separators_regex = Regexp.new %r{(
|
23
|
+
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
24
|
+
@contracted_list_regex = Regexp.new CONTRACTED_LIST.to_s
|
24
25
|
end
|
25
26
|
|
26
27
|
# Parses the passed-in string and returns a list of names.
|
@@ -34,6 +35,7 @@ module DwcAgent
|
|
34
35
|
name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
|
35
36
|
name.gsub!(@add_separators_regex, '\1 \2')
|
36
37
|
name.gsub!(@complex_separators_regex, '\1 | \2')
|
38
|
+
name.gsub!(@contracted_list_regex, '\1 \3 | \2 \3')
|
37
39
|
name.gsub!(@residual_terminators_regex, '')
|
38
40
|
name.squeeze!(' ')
|
39
41
|
name.strip!
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|