dwc_agent 1.5.0.2 → 1.5.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +5 -5
- data/lib/dwc_agent/constants.rb +55 -18
- data/lib/dwc_agent/parser.rb +1 -1
- data/lib/dwc_agent/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a63a84b3c095994d4b5053ddb389cfc1e7e0375f51b6cbb2668742bc4381a0da
|
4
|
+
data.tar.gz: 4f4cc7668f64196458bc126e8dc81753afd92ee77ce729f241f96fa5df39315c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: da8f649cbc9d4ddf82b66e044e0f9e6a6354d5f613ecc2bf1e5cf900a37868178e3056f9c84311050738ec3b3dd7b7466df2ea25b5cebd3595821c35f634eb29
|
7
|
+
data.tar.gz: e52cec918b39c1aceb010e55ba0d5feaa7891234368fbdb56abd48c36c0945a5de6db5e89ee1371ee8240c3030caf4bae268257b0a780036df54f9521c3a6f1e
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,10 +18,6 @@ module DwcAgent
|
|
18
18
|
def clean(parsed_namae)
|
19
19
|
blank_name = { title: nil, appellation: nil, given: nil, particle: nil, family: nil, suffix: nil }
|
20
20
|
|
21
|
-
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
22
|
-
return blank_name
|
23
|
-
end
|
24
|
-
|
25
21
|
if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
26
22
|
return blank_name
|
27
23
|
end
|
@@ -30,7 +26,7 @@ module DwcAgent
|
|
30
26
|
return blank_name
|
31
27
|
end
|
32
28
|
|
33
|
-
if parsed_namae.given && parsed_namae.given.length >
|
29
|
+
if parsed_namae.given && parsed_namae.given.length > 35
|
34
30
|
return blank_name
|
35
31
|
end
|
36
32
|
|
@@ -79,6 +75,10 @@ module DwcAgent
|
|
79
75
|
parsed_namae.given = NameCase(parsed_namae.given)
|
80
76
|
end
|
81
77
|
|
78
|
+
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
79
|
+
return blank_name
|
80
|
+
end
|
81
|
+
|
82
82
|
parsed_namae.normalize_initials
|
83
83
|
|
84
84
|
family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -3,18 +3,20 @@ module DwcAgent
|
|
3
3
|
^[\[{(]|
|
4
4
|
[\]})]\??$|
|
5
5
|
(?i:acc\s?\#)|
|
6
|
+
[,;]?\s*(?i:1st|2nd|3rd|[4-9]th)|
|
6
7
|
\s*?\d+\.\d+|
|
7
8
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
8
9
|
\b[,;]?\s*(?i:et\.?\s+al|&\s+al)\.?|
|
10
|
+
\b[,;]?\s*(?i:etal)\.?|
|
9
11
|
\b\s+(bis|ter)(\b|\z)|
|
10
12
|
\bu\.\s*a\.|
|
11
|
-
\b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
|
13
|
+
\b[,;]?\s*(?i:and|&)?\s*(?i:others|party)\s*\b|
|
12
14
|
\b[,;]?\s*(?i:etc)\.?|
|
13
15
|
\b[,;]?\s*(?i:on)\b|
|
14
16
|
\b[,;]?\s*(?i:unkn?own)\b|
|
15
17
|
\b[,;]?\s*(?i:n/a)\b|
|
16
18
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
17
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed
|
19
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed?|presumably)\)?\b|
|
18
20
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
19
21
|
\b[,;]?\s*(?i:string)\b|
|
20
22
|
\b[,;]?\s*(?i:person\s*string)\b|
|
@@ -53,29 +55,32 @@ module DwcAgent
|
|
53
55
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
54
56
|
\b\s*(?i:maybe)\s*\b|
|
55
57
|
\b\s*(?i:prob)\.\s*\b|
|
56
|
-
\(?[,]?\s*?(?i:(local)?\s?
|
58
|
+
\(?[,]?\s*?(?i:(local)?\s?collectors?|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
57
59
|
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
58
60
|
(?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
|
59
61
|
(?i:fide)\:?\s*\b|
|
62
|
+
(?i:first\s+name\s+unknown)|
|
60
63
|
(?i:game\s+dept)\.?\s*\b|
|
61
64
|
(?i:see\s+notes?\s*(inside)?)|
|
62
65
|
(?i:see\s+letter\s+enclosed)|
|
63
66
|
(?i:(by)?\s+correspondance)|
|
64
|
-
(?i:pers\.?\s
|
67
|
+
(?i:pers\.?\s*comm\.?)|
|
65
68
|
(?i:crossed\s+out)|
|
66
69
|
\(?(?i:source)\(?|
|
67
70
|
(?i:according\s+to)|
|
68
71
|
(?i:lanuv)\d+|
|
72
|
+
\b\s*name\b|
|
73
|
+
\b\s*lost\b|
|
69
74
|
(?i:nswobs)|
|
70
75
|
ORCID|
|
71
76
|
MRI(\s|-)PAS|
|
72
77
|
urn\:qm\.qld\.gov\.au\:collector|
|
73
78
|
(?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
|
74
|
-
(?i:
|
75
|
-
(?i:
|
79
|
+
(?i:field\s+museum\s+of\s+natural\s+history)|
|
80
|
+
(?i:american\s+museum\s+of\s+natural\s+history)|
|
76
81
|
(?i:The\s+Paleontological\s+Research\s+Institution)|
|
77
82
|
(?i:museums?\s+victoria)|
|
78
|
-
\b\s*(?i:
|
83
|
+
\b\s*(?i:united\s+states|russia)\s*\b|
|
79
84
|
(?i:revised|photograph|fruits\s+only)|
|
80
85
|
-?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
|
81
86
|
-?\s*(?i:synonym(y|ie))|
|
@@ -83,11 +88,14 @@ module DwcAgent
|
|
83
88
|
\b(?i:to\s+(sub)?spp?)\.?|
|
84
89
|
(?i:nom\.?\s+rev\.?)|
|
85
90
|
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|ACAD|USGS|NAWQA|
|
91
|
+
\b,?\s*(?i:para|topo|syn)?(?i:type)|
|
86
92
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
87
93
|
(?i:university|museum|exhibits?)|
|
88
94
|
(?i:uqam)|
|
89
95
|
(?i:sem\s+(colec?tor|data))|
|
90
96
|
(?i:no\s+coll\.?(ector)?)|
|
97
|
+
(?i:not?)\s+(?i:name|date|details?|specific)?\s*?(?i:given|name|date|noted)|
|
98
|
+
(?i:non?)\s+(?i:specificato)|
|
91
99
|
\b[,;]\s+\d+\z|
|
92
100
|
["!@?]|
|
93
101
|
[,]?\d+|
|
@@ -113,19 +121,22 @@ module DwcAgent
|
|
113
121
|
[–|ǀ∣|│&+\/;:]|
|
114
122
|
\s+-\s+|
|
115
123
|
\s+a\.\s+|
|
116
|
-
\b(e|y|i|en|et|or|per|for)\s*\b|
|
124
|
+
\b(con|e|y|i|en|et|or|per|for)\s*\b|
|
117
125
|
\b(?i:and|with)\s*\b|
|
118
126
|
\b(?i:annotated(\s+by)?)\s*\b|
|
119
127
|
\b(?i:coll\.)\s*\b|
|
120
128
|
\b(?i:communicate?d(\s+to)?)\s*\b|
|
121
129
|
\b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
|
130
|
+
\b(?i:confirmada)(\s+por)?\s*\b|
|
122
131
|
\b(?i:checked?(\s+by)?)\s*\b|
|
123
132
|
\b(?i:det\.?(\s+by)?)\s*\b|
|
124
133
|
\b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
|
125
134
|
\b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
|
126
135
|
\b(?i:in?dentified(\s+by)?)\s*\b|
|
127
136
|
\b(?i:in\s+part(\s+by)?)\s*\b|
|
137
|
+
\b(?i:och)\s*\b|
|
128
138
|
\b(?i:prep\.?\s+(?i:by)?)\s*\b|
|
139
|
+
\b(?i:purchased?)(\s+by)?\s*\b|
|
129
140
|
\b(?i:redet\.?(\s+by?)?)\s*\b|
|
130
141
|
\b(?i:reidentified(\s+by)?)\s*\b|
|
131
142
|
\b(?i:stet)\s*\b|
|
@@ -155,16 +166,24 @@ module DwcAgent
|
|
155
166
|
'}' => '',
|
156
167
|
'@' => '',
|
157
168
|
'%' => '',
|
158
|
-
'\\' => ''
|
169
|
+
'\\' => '',
|
170
|
+
'´' => '\'',
|
171
|
+
'+' => ' | '
|
159
172
|
}
|
160
173
|
|
161
174
|
PHRASE_SUBS = {
|
162
|
-
'
|
163
|
-
'
|
164
|
-
'
|
165
|
-
'
|
166
|
-
'
|
167
|
-
'
|
175
|
+
', ph.d.' => ' Ph.D.',
|
176
|
+
', Ph.D.' => ' Ph.D.',
|
177
|
+
', bro.' => ' Bro.',
|
178
|
+
', Jr.,' => ' Jr.;',
|
179
|
+
', Jr.' => ' Jr.',
|
180
|
+
',Jr.' => ' Jr.',
|
181
|
+
', Sr.' => ' Sr.',
|
182
|
+
',Sr.' => ' Sr.',
|
183
|
+
' jr.,' => ' Jr.;',
|
184
|
+
' jr,' => ' Jr.;',
|
185
|
+
'-jr' => ' Jr.',
|
186
|
+
'-Jr' => ' Jr.'
|
168
187
|
}
|
169
188
|
|
170
189
|
COMPLEX_SEPARATORS = %r{
|
@@ -229,7 +248,7 @@ module DwcAgent
|
|
229
248
|
(?i:though)|
|
230
249
|
(?i:texas\s+instruments?)\s*?(for)?|
|
231
250
|
(?:tropical)|
|
232
|
-
(?i:toward|seen
|
251
|
+
(?i:toward|seen\s+at)|
|
233
252
|
(?i:unidentified|unspecified|unk?nown?|unnamed|unread|unmistak|no agent)|
|
234
253
|
(?i:urn\:)|
|
235
254
|
(?i:usda|ucla)|
|
@@ -238,29 +257,47 @@ module DwcAgent
|
|
238
257
|
}x
|
239
258
|
|
240
259
|
FAMILY_BLACKLIST = [
|
260
|
+
"ap",
|
241
261
|
"da",
|
262
|
+
"de",
|
242
263
|
"de'",
|
243
264
|
"del",
|
244
265
|
"der",
|
266
|
+
"di",
|
267
|
+
"do",
|
268
|
+
"dos",
|
245
269
|
"du",
|
246
270
|
"el",
|
271
|
+
"le",
|
272
|
+
"la",
|
247
273
|
"van",
|
248
274
|
"von",
|
249
275
|
"the",
|
250
276
|
"of",
|
251
277
|
"adjustment",
|
278
|
+
"annotator",
|
252
279
|
"available",
|
253
280
|
"arachnology",
|
254
281
|
"catalogue",
|
255
282
|
"curators",
|
256
283
|
"data",
|
284
|
+
"details",
|
285
|
+
"determiner",
|
257
286
|
"determination",
|
258
287
|
"dissection",
|
259
288
|
"entered",
|
289
|
+
"erased",
|
290
|
+
"indecipherable",
|
260
291
|
"nomenclatural",
|
261
292
|
"orig",
|
262
293
|
"registration",
|
263
|
-
"science"
|
294
|
+
"science",
|
295
|
+
"wg",
|
296
|
+
"wm",
|
297
|
+
"wn",
|
298
|
+
"zw",
|
299
|
+
"zz",
|
300
|
+
"z-"
|
264
301
|
]
|
265
302
|
|
266
303
|
GIVEN_BLACKLIST = [
|
@@ -268,7 +305,7 @@ module DwcAgent
|
|
268
305
|
"has not"
|
269
306
|
]
|
270
307
|
|
271
|
-
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|
|
308
|
+
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|major|capt|cmdr|lt|sgt|cpl|pvt|proff?|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
272
309
|
|
273
310
|
APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
274
311
|
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -20,7 +20,7 @@ module DwcAgent
|
|
20
20
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
21
21
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
22
22
|
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
23
|
-
@phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.join('|').to_s
|
23
|
+
@phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
|
24
24
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
25
25
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
26
26
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.0.2
|
4
|
+
version: 1.5.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|
@@ -102,7 +102,7 @@ homepage: https://github.com/bionomia/dwc_agent
|
|
102
102
|
licenses:
|
103
103
|
- MIT
|
104
104
|
metadata: {}
|
105
|
-
post_install_message:
|
105
|
+
post_install_message:
|
106
106
|
rdoc_options:
|
107
107
|
- "--encoding"
|
108
108
|
- UTF-8
|
@@ -119,8 +119,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
119
|
- !ruby/object:Gem::Version
|
120
120
|
version: '0'
|
121
121
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
123
|
-
signing_key:
|
122
|
+
rubygems_version: 3.1.2
|
123
|
+
signing_key:
|
124
124
|
specification_version: 4
|
125
125
|
summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
|
126
126
|
test_files: []
|