dwc_agent 0.3.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +12 -0
- data/lib/dwc_agent/constants.rb +19 -5
- data/lib/dwc_agent/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4ea46a2cca2719aebba4a99251aaf02f6d2fb36f21f3e6ea28b76584fc7345a
|
4
|
+
data.tar.gz: 72cde7bbdf5c8f93923710f887f299a4618e32c5d129e8cac0bbcc1a285492fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 77c1027c302f5b853641266a833d197c1d81045ffad0ca0a2b3f4161d1fc4443fff944e6493d9c1089e710ac5aaff33bd0075698c7b70ff0ddf90d79602c8442
|
7
|
+
data.tar.gz: 9b92955bf9421e4b5f7a0c5cc5fa04265a393f04de6d7f23217cea739732b81bd8f9fb9042c02db20fe13118f2c9f3b75e1a374787b83d8e0ee1350e3e2a6c7b
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,18 +18,26 @@ module DwcAgent
|
|
18
18
|
def clean(parsed_namae)
|
19
19
|
blank_name = { given: nil, family: nil }
|
20
20
|
|
21
|
+
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
22
|
+
return blank_name
|
23
|
+
end
|
24
|
+
|
21
25
|
if parsed_namae.family && parsed_namae.family.length < 2 && parsed_namae.family.count('.') == 0
|
22
26
|
return blank_name
|
23
27
|
end
|
28
|
+
|
24
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
25
30
|
return blank_name
|
26
31
|
end
|
32
|
+
|
27
33
|
if parsed_namae.given && parsed_namae.given.length > 25
|
28
34
|
return blank_name
|
29
35
|
end
|
36
|
+
|
30
37
|
if parsed_namae.given && parsed_namae.given.count('.') >= 3 && /\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
|
31
38
|
return blank_name
|
32
39
|
end
|
40
|
+
|
33
41
|
if parsed_namae.display_order =~ BLACKLIST
|
34
42
|
return blank_name
|
35
43
|
end
|
@@ -103,6 +111,10 @@ module DwcAgent
|
|
103
111
|
return blank_name
|
104
112
|
end
|
105
113
|
|
114
|
+
if !family.nil? && FAMILY_BLACKLIST.any?{ |s| s.casecmp(family) == 0 }
|
115
|
+
return blank_name
|
116
|
+
end
|
117
|
+
|
106
118
|
{ given: given, family: family }
|
107
119
|
end
|
108
120
|
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -20,7 +20,6 @@ module DwcAgent
|
|
20
20
|
\b[,;]?\s*(?i:person\s*string)\b|
|
21
21
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
22
22
|
\b[,;]?\s*(?i:colln?)[:.]?(\b|\z)|
|
23
|
-
(?i:no\s+(data|disponible))|
|
24
23
|
\b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
|
25
24
|
[,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
|
26
25
|
May|Jun|Jul|Aug|Sept?|
|
@@ -78,7 +77,7 @@ module DwcAgent
|
|
78
77
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
79
78
|
\b(?i:to\s+(sub)?spp?)\.?|
|
80
79
|
(?i:nom\.?\s+rev\.?)|
|
81
|
-
FNA|DAO|HUH|FDNMB|
|
80
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
82
81
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
83
82
|
(?i:university|museum|exhibits?)|
|
84
83
|
(?i:uqam)|
|
@@ -147,7 +146,8 @@ module DwcAgent
|
|
147
146
|
'{' => '',
|
148
147
|
'}' => '',
|
149
148
|
'@' => '',
|
150
|
-
'%' => ''
|
149
|
+
'%' => '',
|
150
|
+
'\\' => ''
|
151
151
|
}
|
152
152
|
|
153
153
|
PHRASE_SUBS = {
|
@@ -164,6 +164,7 @@ module DwcAgent
|
|
164
164
|
BLACKLIST = %r{
|
165
165
|
(?i:abundant)|
|
166
166
|
(?i:adult|juvenile)|
|
167
|
+
(?i:administra(d|t)or)|
|
167
168
|
(?i:anon)|
|
168
169
|
(?i:australian?)|
|
169
170
|
(?i:average)|
|
@@ -172,9 +173,10 @@ module DwcAgent
|
|
172
173
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
173
174
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
174
175
|
(?i:carex|salix)|
|
175
|
-
(
|
176
|
+
(?i:catalog(ue)?)|
|
176
177
|
(?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
|
177
178
|
\b\s*(?i:help)\s*\b|
|
179
|
+
(?i:data\s+not\s+captured)|
|
178
180
|
(?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
|
179
181
|
(?i:desconocido)|
|
180
182
|
(?i:exc?s?icc?at(a|i))|
|
@@ -192,6 +194,9 @@ module DwcAgent
|
|
192
194
|
(?i:univ\.)|
|
193
195
|
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
194
196
|
(?i:non\s+pr(é|e)cis(é|e))|
|
197
|
+
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
198
|
+
(?i:not?\s+(entered|stated))|
|
199
|
+
(?i:nomenclatur(e|al)\s+adjustment)|
|
195
200
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
196
201
|
(?i:recreation|culture)|
|
197
202
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
@@ -216,6 +221,15 @@ module DwcAgent
|
|
216
221
|
^\s*?de\s*?$
|
217
222
|
}x
|
218
223
|
|
219
|
-
|
224
|
+
FAMILY_BLACKLIST = [
|
225
|
+
"der",
|
226
|
+
"van",
|
227
|
+
"von",
|
228
|
+
"the",
|
229
|
+
"of",
|
230
|
+
"curators"
|
231
|
+
]
|
232
|
+
|
233
|
+
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
220
234
|
|
221
235
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-09-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|