dwc_agent 0.3.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +18 -2
- data/lib/dwc_agent/constants.rb +21 -4
- data/lib/dwc_agent/version.rb +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69d6e0f6c0f9d59801d0e53a89dd73896f0ac5a8230156879227ea6efe14bb7d
|
4
|
+
data.tar.gz: 1330921589cbcad22273c4b5cae821eba7266cc4357756498ae3b6d4f029e3f8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 735057f853b259f4d7fcb9f4c73d2f193f9a0e363d439a8bf894bbd9f878e8667cd14979c10cf138c7cc007f83cd09b7d5f9f21e0482b6303f73a40c788a957a
|
7
|
+
data.tar.gz: 8760dc0d9976543fe6ce448e2129b94e2d86d5a17f8e0b4b297f4dd23e8c4e4d73b4c7af4863e5edfa6abbbdb83181e6364e8064b0c73f185b0370c3ea1f597c
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -16,20 +16,28 @@ module DwcAgent
|
|
16
16
|
# @param parsed_namae [Object] the namae object
|
17
17
|
# @return [Hash] the given, family hash
|
18
18
|
def clean(parsed_namae)
|
19
|
-
blank_name = { given: nil, family: nil }
|
19
|
+
blank_name = { given: nil, family: nil, particle: nil }
|
20
|
+
|
21
|
+
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
22
|
+
return blank_name
|
23
|
+
end
|
20
24
|
|
21
25
|
if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
|
22
26
|
return blank_name
|
23
27
|
end
|
28
|
+
|
24
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
25
30
|
return blank_name
|
26
31
|
end
|
32
|
+
|
27
33
|
if parsed_namae.given && parsed_namae.given.length > 25
|
28
34
|
return blank_name
|
29
35
|
end
|
36
|
+
|
30
37
|
if parsed_namae.given && parsed_namae.given.count('.') >= 3 && /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
|
31
38
|
return blank_name
|
32
39
|
end
|
40
|
+
|
33
41
|
if parsed_namae.display_order =~ BLACKLIST
|
34
42
|
return blank_name
|
35
43
|
end
|
@@ -91,6 +99,10 @@ module DwcAgent
|
|
91
99
|
particle = nil
|
92
100
|
end
|
93
101
|
|
102
|
+
if !particle.nil? && particle.include?(".")
|
103
|
+
particle = nil
|
104
|
+
end
|
105
|
+
|
94
106
|
if !family.nil? && (family == family.upcase || family == family.downcase)
|
95
107
|
family = NameCase(family)
|
96
108
|
end
|
@@ -103,7 +115,11 @@ module DwcAgent
|
|
103
115
|
return blank_name
|
104
116
|
end
|
105
117
|
|
106
|
-
|
118
|
+
if !family.nil? && FAMILY_BLACKLIST.any?{ |s| s.casecmp(family) == 0 }
|
119
|
+
return blank_name
|
120
|
+
end
|
121
|
+
|
122
|
+
{ given: given, family: family, particle: particle }
|
107
123
|
end
|
108
124
|
|
109
125
|
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -20,7 +20,6 @@ module DwcAgent
|
|
20
20
|
\b[,;]?\s*(?i:person\s*string)\b|
|
21
21
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
22
22
|
\b[,;]?\s*(?i:colln?)[:.]?(\b|\z)|
|
23
|
-
(?i:no\s+(data|disponible))|
|
24
23
|
\b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
|
25
24
|
[,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
|
26
25
|
May|Jun|Jul|Aug|Sept?|
|
@@ -78,10 +77,11 @@ module DwcAgent
|
|
78
77
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
79
78
|
\b(?i:to\s+(sub)?spp?)\.?|
|
80
79
|
(?i:nom\.?\s+rev\.?)|
|
81
|
-
FNA|DAO|HUH|FDNMB|
|
80
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
82
81
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
83
82
|
(?i:university|museum|exhibits?)|
|
84
83
|
(?i:uqam)|
|
84
|
+
(?i:sem\s+(colec?tor|data))|
|
85
85
|
\b[,;]\s+\d+\z|
|
86
86
|
["!@?]|
|
87
87
|
[,]?\d+|
|
@@ -147,7 +147,8 @@ module DwcAgent
|
|
147
147
|
'{' => '',
|
148
148
|
'}' => '',
|
149
149
|
'@' => '',
|
150
|
-
'%' => ''
|
150
|
+
'%' => '',
|
151
|
+
'\\' => ''
|
151
152
|
}
|
152
153
|
|
153
154
|
PHRASE_SUBS = {
|
@@ -164,6 +165,7 @@ module DwcAgent
|
|
164
165
|
BLACKLIST = %r{
|
165
166
|
(?i:abundant)|
|
166
167
|
(?i:adult|juvenile)|
|
168
|
+
(?i:administra(d|t)or)|
|
167
169
|
(?i:anon)|
|
168
170
|
(?i:australian?)|
|
169
171
|
(?i:average)|
|
@@ -172,9 +174,10 @@ module DwcAgent
|
|
172
174
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
173
175
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
174
176
|
(?i:carex|salix)|
|
175
|
-
(
|
177
|
+
(?i:catalog(ue)?)|
|
176
178
|
(?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
|
177
179
|
\b\s*(?i:help)\s*\b|
|
180
|
+
(?i:data\s+not\s+captured)|
|
178
181
|
(?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
|
179
182
|
(?i:desconocido)|
|
180
183
|
(?i:exc?s?icc?at(a|i))|
|
@@ -192,6 +195,9 @@ module DwcAgent
|
|
192
195
|
(?i:univ\.)|
|
193
196
|
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
194
197
|
(?i:non\s+pr(é|e)cis(é|e))|
|
198
|
+
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
199
|
+
(?i:not?\s+(entered|stated))|
|
200
|
+
(?i:nomenclatur(e|al)\s+adjustment)|
|
195
201
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
196
202
|
(?i:recreation|culture)|
|
197
203
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
@@ -216,6 +222,17 @@ module DwcAgent
|
|
216
222
|
^\s*?de\s*?$
|
217
223
|
}x
|
218
224
|
|
225
|
+
FAMILY_BLACKLIST = [
|
226
|
+
"der",
|
227
|
+
"van",
|
228
|
+
"von",
|
229
|
+
"the",
|
230
|
+
"of",
|
231
|
+
"curators",
|
232
|
+
"nomenclatural",
|
233
|
+
"adjustment"
|
234
|
+
]
|
235
|
+
|
219
236
|
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
220
237
|
|
221
238
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.3
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|