dwc_agent 1.4.1 → 1.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +18 -10
- data/lib/dwc_agent/constants.rb +38 -18
- data/lib/dwc_agent/parser.rb +6 -5
- data/lib/dwc_agent/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1cf058ee8ca5956a4d05ca0122d5ee9a7497b304576bd4af92fd0f0597846a08
|
4
|
+
data.tar.gz: 3ef7bafb6a3bd9b9af826f161448b259739ec791b4158e5704f6d4da98af48fe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0be4b1c78b8fbf61c1a5363ce32efa780372b394bf88e9fd5a5fc42be77eea564d865637feaca08b63712d8780c515db2408e01967519fc014a3b6f5468793c9
|
7
|
+
data.tar.gz: 37095ef2920f06e6152aa58aab26034ce5b00d15778a8ae3270c7672c7e53ba2853223f7f67bfbfc5953d06050b8f2a8c8a2a1f0c3e41ba002aa18411a57092c
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -22,6 +22,10 @@ module DwcAgent
|
|
22
22
|
return blank_name
|
23
23
|
end
|
24
24
|
|
25
|
+
if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
26
|
+
return blank_name
|
27
|
+
end
|
28
|
+
|
25
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
26
30
|
return blank_name
|
27
31
|
end
|
@@ -38,9 +42,9 @@ module DwcAgent
|
|
38
42
|
return blank_name
|
39
43
|
end
|
40
44
|
|
41
|
-
if parsed_namae.given &&
|
42
|
-
parsed_namae.family &&
|
43
|
-
parsed_namae.family.count(".") > 0 &&
|
45
|
+
if parsed_namae.given &&
|
46
|
+
parsed_namae.family &&
|
47
|
+
parsed_namae.family.count(".") > 0 &&
|
44
48
|
parsed_namae.family.length - parsed_namae.family.count(".") <= 3
|
45
49
|
given = parsed_namae.given
|
46
50
|
family = parsed_namae.family
|
@@ -48,9 +52,9 @@ module DwcAgent
|
|
48
52
|
parsed_namae.given = family
|
49
53
|
end
|
50
54
|
|
51
|
-
if parsed_namae.given &&
|
52
|
-
parsed_namae.family &&
|
53
|
-
parsed_namae.family.length <=3 &&
|
55
|
+
if parsed_namae.given &&
|
56
|
+
parsed_namae.family &&
|
57
|
+
parsed_namae.family.length <=3 &&
|
54
58
|
parsed_namae.family == parsed_namae.family.upcase &&
|
55
59
|
parsed_namae.given[-1] != "."
|
56
60
|
given = parsed_namae.given
|
@@ -59,9 +63,9 @@ module DwcAgent
|
|
59
63
|
parsed_namae.given = family
|
60
64
|
end
|
61
65
|
|
62
|
-
if parsed_namae.given &&
|
63
|
-
(parsed_namae.given == parsed_namae.given.upcase ||
|
64
|
-
parsed_namae.given == parsed_namae.given.downcase) &&
|
66
|
+
if parsed_namae.given &&
|
67
|
+
(parsed_namae.given == parsed_namae.given.upcase ||
|
68
|
+
parsed_namae.given == parsed_namae.given.downcase) &&
|
65
69
|
!parsed_namae.given.include?(".") &&
|
66
70
|
parsed_namae.given.tr(".","").length >= 4
|
67
71
|
parsed_namae.given = NameCase(parsed_namae.given)
|
@@ -115,8 +119,12 @@ module DwcAgent
|
|
115
119
|
return blank_name
|
116
120
|
end
|
117
121
|
|
122
|
+
if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
|
123
|
+
return blank_name
|
124
|
+
end
|
125
|
+
|
118
126
|
{ given: given, family: family, particle: particle }
|
119
127
|
end
|
120
128
|
|
121
129
|
end
|
122
|
-
end
|
130
|
+
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module DwcAgent
|
2
2
|
STRIP_OUT = %r{
|
3
3
|
^[\[{(]|
|
4
|
-
[\]})]
|
4
|
+
[\]})]\??$|
|
5
5
|
\s*?\d+\.\d+|
|
6
6
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
7
7
|
\b[,;]?\s*(?i:et\.?\s+al)\.?|
|
@@ -13,9 +13,8 @@ module DwcAgent
|
|
13
13
|
\b[,;]?\s*(?i:unkn?own)\b|
|
14
14
|
\b[,;]?\s*(?i:n/a)\b|
|
15
15
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
16
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|
|
16
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
|
17
17
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
18
|
-
\b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
|
19
18
|
\b[,;]?\s*(?i:string)\b|
|
20
19
|
\b[,;]?\s*(?i:person\s*string)\b|
|
21
20
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
@@ -48,10 +47,12 @@ module DwcAgent
|
|
48
47
|
\d+\s+(?i:Oct|Octob(er|re))\.?\b|
|
49
48
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
50
49
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
50
|
+
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
51
51
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
52
52
|
\b\s*(?i:maybe)\s*\b|
|
53
53
|
\b\s*(?i:prob)\.\s*\b|
|
54
54
|
\(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
55
|
+
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
55
56
|
(?i:fide)\:?\s*\b|
|
56
57
|
(?i:game\s+dept)\.?\s*\b|
|
57
58
|
(?i:see\s+notes?\s*(inside)?)|
|
@@ -66,22 +67,24 @@ module DwcAgent
|
|
66
67
|
ORCID|
|
67
68
|
MRI(\s|-)PAS|
|
68
69
|
urn\:qm\.qld\.gov\.au\:collector|
|
69
|
-
(?i:University\s+of\s+California
|
70
|
+
(?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
|
70
71
|
(?i:Field\s+Museum\s+of\s+Natural\s+History)|
|
71
72
|
(?i:American\s+Museum\s+of\s+Natural\s+History)|
|
72
73
|
(?i:The\s+Paleontological\s+Research\s+Institution)|
|
73
74
|
(?i:museum\s+victoria)|
|
75
|
+
\b\s*(?i:United\s+States|Russia)\s*\b|
|
74
76
|
(?i:revised|photograph|fruits\s+only)|
|
75
77
|
-?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
|
76
78
|
-?\s*(?i:synonym(y|ie))|
|
77
79
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
78
80
|
\b(?i:to\s+(sub)?spp?)\.?|
|
79
81
|
(?i:nom\.?\s+rev\.?)|
|
80
|
-
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
82
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|
|
81
83
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
82
84
|
(?i:university|museum|exhibits?)|
|
83
85
|
(?i:uqam)|
|
84
86
|
(?i:sem\s+(colec?tor|data))|
|
87
|
+
(?i:no\s+coll\.?(ector)?)|
|
85
88
|
\b[,;]\s+\d+\z|
|
86
89
|
["!@?]|
|
87
90
|
[,]?\d+|
|
@@ -133,8 +136,6 @@ module DwcAgent
|
|
133
136
|
'|' => ' | ',
|
134
137
|
'(' => ' ',
|
135
138
|
')' => ' ',
|
136
|
-
'[' => ' ',
|
137
|
-
']' => ' ',
|
138
139
|
'?' => '',
|
139
140
|
'!' => '',
|
140
141
|
'=' => '',
|
@@ -152,10 +153,12 @@ module DwcAgent
|
|
152
153
|
}
|
153
154
|
|
154
155
|
PHRASE_SUBS = {
|
155
|
-
'
|
156
|
-
'
|
157
|
-
'
|
158
|
-
'
|
156
|
+
'dr\.' => 'Dr. ',
|
157
|
+
'mr\.' => 'Mr. ',
|
158
|
+
'mrs\.' => 'Mrs. ',
|
159
|
+
'prof\.' => 'Prof. ',
|
160
|
+
'\, ph\.d\.' => ' Ph.D.',
|
161
|
+
'\, bro\.' => ' Bro.'
|
159
162
|
}
|
160
163
|
|
161
164
|
COMPLEX_SEPARATORS = %r{
|
@@ -193,13 +196,15 @@ module DwcAgent
|
|
193
196
|
(?i:geographic)|
|
194
197
|
(?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
|
195
198
|
(?i:univ\.)|
|
196
|
-
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
199
|
+
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
197
200
|
(?i:non\s+pr(é|e)cis(é|e))|
|
201
|
+
(?i:no\s+consta)|
|
198
202
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
199
203
|
(?i:not?\s+(entered|stated))|
|
200
204
|
(?i:nomenclatur(e|al)\s+adjustment)|
|
201
205
|
(?i:not\s+available)|
|
202
206
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
207
|
+
(?i:popa\s+observers?)|
|
203
208
|
(?i:recreation|culture)|
|
204
209
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
205
210
|
(?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
|
@@ -207,7 +212,7 @@ module DwcAgent
|
|
207
212
|
(?i:sequence\s+data)|
|
208
213
|
(?i:size|large|colou?r)\s+|
|
209
214
|
(?i:skeleton)|
|
210
|
-
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|
|
215
|
+
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
|
211
216
|
(?i:submersible)|
|
212
217
|
(?i:synonymy?)|(topo|syn|holo)type|
|
213
218
|
(?i:systematic|perspective)|
|
@@ -226,21 +231,36 @@ module DwcAgent
|
|
226
231
|
}x
|
227
232
|
|
228
233
|
FAMILY_BLACKLIST = [
|
234
|
+
"da",
|
235
|
+
"de'",
|
236
|
+
"del",
|
229
237
|
"der",
|
238
|
+
"du",
|
239
|
+
"el",
|
230
240
|
"van",
|
231
241
|
"von",
|
232
242
|
"the",
|
233
243
|
"of",
|
234
|
-
"curators",
|
235
|
-
"nomenclatural",
|
236
244
|
"adjustment",
|
237
245
|
"available",
|
246
|
+
"arachnology",
|
247
|
+
"catalogue",
|
248
|
+
"curators",
|
238
249
|
"data",
|
250
|
+
"determination",
|
251
|
+
"dissection",
|
252
|
+
"entered",
|
253
|
+
"nomenclatural",
|
239
254
|
"orig",
|
240
|
-
"
|
241
|
-
"
|
255
|
+
"registration",
|
256
|
+
"science"
|
257
|
+
]
|
258
|
+
|
259
|
+
GIVEN_BLACKLIST = [
|
260
|
+
"not any",
|
261
|
+
"has not"
|
242
262
|
]
|
243
263
|
|
244
|
-
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|
|
264
|
+
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
245
265
|
|
246
266
|
end
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -8,7 +8,7 @@ module DwcAgent
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def initialize
|
11
|
-
options = {
|
11
|
+
options = {
|
12
12
|
prefer_comma_as_separator: true,
|
13
13
|
separator: SPLIT_BY,
|
14
14
|
title: TITLE
|
@@ -17,12 +17,12 @@ module DwcAgent
|
|
17
17
|
|
18
18
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
19
19
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
20
|
-
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join
|
21
|
-
@phrase_subs_regex = Regexp.new
|
20
|
+
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
21
|
+
@phrase_subs_regex = Regexp.new((PHRASE_SUBS.keys.join('|')).to_s, Regexp::IGNORECASE)
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
23
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
24
24
|
end
|
25
|
-
|
25
|
+
|
26
26
|
# Parses the passed-in string and returns a list of names.
|
27
27
|
#
|
28
28
|
# @param names [String] the name or names to be parsed
|
@@ -30,6 +30,7 @@ module DwcAgent
|
|
30
30
|
def parse(name)
|
31
31
|
return [] if name.nil? || name == ""
|
32
32
|
name.gsub!(@strip_out_regex, ' ')
|
33
|
+
name.gsub!(/\[|\]/, '')
|
33
34
|
name.gsub!(@char_subs_regex, CHAR_SUBS)
|
34
35
|
name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
|
35
36
|
name.gsub!(@add_separators_regex, '\1 \2')
|
@@ -41,4 +42,4 @@ module DwcAgent
|
|
41
42
|
end
|
42
43
|
|
43
44
|
end
|
44
|
-
end
|
45
|
+
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.1
|
4
|
+
version: 1.4.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|
@@ -119,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
119
|
- !ruby/object:Gem::Version
|
120
120
|
version: '0'
|
121
121
|
requirements: []
|
122
|
-
rubygems_version: 3.0.
|
122
|
+
rubygems_version: 3.0.6
|
123
123
|
signing_key:
|
124
124
|
specification_version: 4
|
125
125
|
summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
|