dwc_agent 1.4.2 → 1.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/constants.rb +37 -21
- data/lib/dwc_agent/parser.rb +6 -5
- data/lib/dwc_agent/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2975bee8cb8675fa7b6e50bc45f90d4d21855a2a22fe72999220b80af4e9d54d
|
4
|
+
data.tar.gz: 06b764667fe3235983492f2182eb2dc90d4ad768382e1f1f5eb7384144180a86
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7d026fd7ffc15101bd5f994263c1950115fee1d5ee9a5af89f46ae1bfd5aa7e761216b77dea5a4b225503ed11ae889bc8853aafab98d700532b60723cfeefec1
|
7
|
+
data.tar.gz: fd2d7986eb1ea1456800f03c6bf78b31ef33e50e743e1c316b873334fa16767e23afa2c463bf3445e23573620b86746ea61e67c890b16d6b8b2673e01f8a6a84
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
module DwcAgent
|
2
2
|
STRIP_OUT = %r{
|
3
3
|
^[\[{(]|
|
4
|
-
[\]})]
|
4
|
+
[\]})]\??$|
|
5
|
+
(?i:acc\s?\#)|
|
5
6
|
\s*?\d+\.\d+|
|
6
7
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
7
|
-
\b[,;]?\s*(?i:et\.?\s+al)\.?|
|
8
|
+
\b[,;]?\s*(?i:et\.?\s+al|&\s+al)\.?|
|
8
9
|
\b\s+(bis|ter)(\b|\z)|
|
9
10
|
\bu\.\s*a\.|
|
10
11
|
\b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
|
@@ -13,9 +14,8 @@ module DwcAgent
|
|
13
14
|
\b[,;]?\s*(?i:unkn?own)\b|
|
14
15
|
\b[,;]?\s*(?i:n/a)\b|
|
15
16
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
16
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|
|
17
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
|
17
18
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
18
|
-
\b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
|
19
19
|
\b[,;]?\s*(?i:string)\b|
|
20
20
|
\b[,;]?\s*(?i:person\s*string)\b|
|
21
21
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
@@ -48,10 +48,12 @@ module DwcAgent
|
|
48
48
|
\d+\s+(?i:Oct|Octob(er|re))\.?\b|
|
49
49
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
50
50
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
51
|
+
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
51
52
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
52
53
|
\b\s*(?i:maybe)\s*\b|
|
53
54
|
\b\s*(?i:prob)\.\s*\b|
|
54
55
|
\(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
56
|
+
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
55
57
|
(?i:fide)\:?\s*\b|
|
56
58
|
(?i:game\s+dept)\.?\s*\b|
|
57
59
|
(?i:see\s+notes?\s*(inside)?)|
|
@@ -66,22 +68,24 @@ module DwcAgent
|
|
66
68
|
ORCID|
|
67
69
|
MRI(\s|-)PAS|
|
68
70
|
urn\:qm\.qld\.gov\.au\:collector|
|
69
|
-
(?i:University\s+of\s+California
|
71
|
+
(?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
|
70
72
|
(?i:Field\s+Museum\s+of\s+Natural\s+History)|
|
71
73
|
(?i:American\s+Museum\s+of\s+Natural\s+History)|
|
72
74
|
(?i:The\s+Paleontological\s+Research\s+Institution)|
|
73
|
-
(?i:
|
75
|
+
(?i:museums?\s+victoria)|
|
76
|
+
\b\s*(?i:United\s+States|Russia)\s*\b|
|
74
77
|
(?i:revised|photograph|fruits\s+only)|
|
75
78
|
-?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
|
76
79
|
-?\s*(?i:synonym(y|ie))|
|
77
80
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
78
81
|
\b(?i:to\s+(sub)?spp?)\.?|
|
79
82
|
(?i:nom\.?\s+rev\.?)|
|
80
|
-
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
83
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|ACAD|
|
81
84
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
82
85
|
(?i:university|museum|exhibits?)|
|
83
86
|
(?i:uqam)|
|
84
87
|
(?i:sem\s+(colec?tor|data))|
|
88
|
+
(?i:no\s+coll\.?(ector)?)|
|
85
89
|
\b[,;]\s+\d+\z|
|
86
90
|
["!@?]|
|
87
91
|
[,]?\d+|
|
@@ -133,8 +137,6 @@ module DwcAgent
|
|
133
137
|
'|' => ' | ',
|
134
138
|
'(' => ' ',
|
135
139
|
')' => ' ',
|
136
|
-
'[' => ' ',
|
137
|
-
']' => ' ',
|
138
140
|
'?' => '',
|
139
141
|
'!' => '',
|
140
142
|
'=' => '',
|
@@ -152,10 +154,12 @@ module DwcAgent
|
|
152
154
|
}
|
153
155
|
|
154
156
|
PHRASE_SUBS = {
|
155
|
-
'
|
156
|
-
'
|
157
|
-
'
|
158
|
-
'
|
157
|
+
'dr\.' => 'Dr. ',
|
158
|
+
'mr\.' => 'Mr. ',
|
159
|
+
'mrs\.' => 'Mrs. ',
|
160
|
+
'prof\.' => 'Prof. ',
|
161
|
+
'\, ph\.d\.' => ' Ph.D.',
|
162
|
+
'\, bro\.' => ' Bro.'
|
159
163
|
}
|
160
164
|
|
161
165
|
COMPLEX_SEPARATORS = %r{
|
@@ -171,6 +175,7 @@ module DwcAgent
|
|
171
175
|
(?i:average)|
|
172
176
|
(?i:believe|unclear|ill?egible|none|suggested|(dis)?agrees?)|approach|
|
173
177
|
(?i:barcod)|
|
178
|
+
(?i:BgWd)|
|
174
179
|
(?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
|
175
180
|
(?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
|
176
181
|
(?i:carex|salix)|
|
@@ -193,13 +198,15 @@ module DwcAgent
|
|
193
198
|
(?i:geographic)|
|
194
199
|
(?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
|
195
200
|
(?i:univ\.)|
|
196
|
-
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
201
|
+
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
197
202
|
(?i:non\s+pr(é|e)cis(é|e))|
|
203
|
+
(?i:no\s+consta)|
|
198
204
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
199
205
|
(?i:not?\s+(entered|stated))|
|
200
206
|
(?i:nomenclatur(e|al)\s+adjustment)|
|
201
207
|
(?i:not\s+available)|
|
202
208
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
209
|
+
(?i:popa\s+observers?)|
|
203
210
|
(?i:recreation|culture)|
|
204
211
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
205
212
|
(?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
|
@@ -207,7 +214,7 @@ module DwcAgent
|
|
207
214
|
(?i:sequence\s+data)|
|
208
215
|
(?i:size|large|colou?r)\s+|
|
209
216
|
(?i:skeleton)|
|
210
|
-
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|
|
217
|
+
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
|
211
218
|
(?i:submersible)|
|
212
219
|
(?i:synonymy?)|(topo|syn|holo)type|
|
213
220
|
(?i:systematic|perspective)|
|
@@ -226,20 +233,29 @@ module DwcAgent
|
|
226
233
|
}x
|
227
234
|
|
228
235
|
FAMILY_BLACKLIST = [
|
236
|
+
"da",
|
237
|
+
"de'",
|
238
|
+
"del",
|
229
239
|
"der",
|
240
|
+
"du",
|
241
|
+
"el",
|
230
242
|
"van",
|
231
243
|
"von",
|
232
244
|
"the",
|
233
245
|
"of",
|
234
|
-
"curators",
|
235
|
-
"nomenclatural",
|
236
246
|
"adjustment",
|
237
247
|
"available",
|
248
|
+
"arachnology",
|
249
|
+
"catalogue",
|
250
|
+
"curators",
|
238
251
|
"data",
|
252
|
+
"determination",
|
253
|
+
"dissection",
|
254
|
+
"entered",
|
255
|
+
"nomenclatural",
|
239
256
|
"orig",
|
240
|
-
"
|
241
|
-
"
|
242
|
-
"entered"
|
257
|
+
"registration",
|
258
|
+
"science"
|
243
259
|
]
|
244
260
|
|
245
261
|
GIVEN_BLACKLIST = [
|
@@ -247,6 +263,6 @@ module DwcAgent
|
|
247
263
|
"has not"
|
248
264
|
]
|
249
265
|
|
250
|
-
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|
|
266
|
+
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
251
267
|
|
252
268
|
end
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -8,7 +8,7 @@ module DwcAgent
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def initialize
|
11
|
-
options = {
|
11
|
+
options = {
|
12
12
|
prefer_comma_as_separator: true,
|
13
13
|
separator: SPLIT_BY,
|
14
14
|
title: TITLE
|
@@ -17,12 +17,12 @@ module DwcAgent
|
|
17
17
|
|
18
18
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
19
19
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
20
|
-
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join
|
21
|
-
@phrase_subs_regex = Regexp.new
|
20
|
+
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
21
|
+
@phrase_subs_regex = Regexp.new((PHRASE_SUBS.keys.join('|')).to_s, Regexp::IGNORECASE)
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
23
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
24
24
|
end
|
25
|
-
|
25
|
+
|
26
26
|
# Parses the passed-in string and returns a list of names.
|
27
27
|
#
|
28
28
|
# @param names [String] the name or names to be parsed
|
@@ -30,6 +30,7 @@ module DwcAgent
|
|
30
30
|
def parse(name)
|
31
31
|
return [] if name.nil? || name == ""
|
32
32
|
name.gsub!(@strip_out_regex, ' ')
|
33
|
+
name.gsub!(/\[|\]/, '')
|
33
34
|
name.gsub!(@char_subs_regex, CHAR_SUBS)
|
34
35
|
name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
|
35
36
|
name.gsub!(@add_separators_regex, '\1 \2')
|
@@ -41,4 +42,4 @@ module DwcAgent
|
|
41
42
|
end
|
42
43
|
|
43
44
|
end
|
44
|
-
end
|
45
|
+
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.2
|
4
|
+
version: 1.4.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-03-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|