dwc_agent 1.3.1 → 1.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +18 -10
- data/lib/dwc_agent/constants.rb +40 -14
- data/lib/dwc_agent/parser.rb +6 -5
- data/lib/dwc_agent/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7b8890828a04d30f7989f3545df032be216d1bab086f1ec6576cbe93569231bd
|
4
|
+
data.tar.gz: d94e120615540e65f29a13132d46b34639e3380e2265439fd05cf5febc580ff6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2e30d42a8c2d743791c2322a51d3fe022e5ef458f68ebd30763cfc74e5ec9a1914faca1ff90cfb3364bead66cde22bccbc0ab47aef6a7d17a78c6954c8cbeff
|
7
|
+
data.tar.gz: d2de0c26c64f77fd273db67465d8772aef5480eb0c4dd1b48446dbb6a6495164bc9e2fbd133280c18ef7e975f82bf9f46dcd65e87727531fa7c514ebc09e0be7
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -22,6 +22,10 @@ module DwcAgent
|
|
22
22
|
return blank_name
|
23
23
|
end
|
24
24
|
|
25
|
+
if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
26
|
+
return blank_name
|
27
|
+
end
|
28
|
+
|
25
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
26
30
|
return blank_name
|
27
31
|
end
|
@@ -38,9 +42,9 @@ module DwcAgent
|
|
38
42
|
return blank_name
|
39
43
|
end
|
40
44
|
|
41
|
-
if parsed_namae.given &&
|
42
|
-
parsed_namae.family &&
|
43
|
-
parsed_namae.family.count(".") > 0 &&
|
45
|
+
if parsed_namae.given &&
|
46
|
+
parsed_namae.family &&
|
47
|
+
parsed_namae.family.count(".") > 0 &&
|
44
48
|
parsed_namae.family.length - parsed_namae.family.count(".") <= 3
|
45
49
|
given = parsed_namae.given
|
46
50
|
family = parsed_namae.family
|
@@ -48,9 +52,9 @@ module DwcAgent
|
|
48
52
|
parsed_namae.given = family
|
49
53
|
end
|
50
54
|
|
51
|
-
if parsed_namae.given &&
|
52
|
-
parsed_namae.family &&
|
53
|
-
parsed_namae.family.length <=3 &&
|
55
|
+
if parsed_namae.given &&
|
56
|
+
parsed_namae.family &&
|
57
|
+
parsed_namae.family.length <=3 &&
|
54
58
|
parsed_namae.family == parsed_namae.family.upcase &&
|
55
59
|
parsed_namae.given[-1] != "."
|
56
60
|
given = parsed_namae.given
|
@@ -59,9 +63,9 @@ module DwcAgent
|
|
59
63
|
parsed_namae.given = family
|
60
64
|
end
|
61
65
|
|
62
|
-
if parsed_namae.given &&
|
63
|
-
(parsed_namae.given == parsed_namae.given.upcase ||
|
64
|
-
parsed_namae.given == parsed_namae.given.downcase) &&
|
66
|
+
if parsed_namae.given &&
|
67
|
+
(parsed_namae.given == parsed_namae.given.upcase ||
|
68
|
+
parsed_namae.given == parsed_namae.given.downcase) &&
|
65
69
|
!parsed_namae.given.include?(".") &&
|
66
70
|
parsed_namae.given.tr(".","").length >= 4
|
67
71
|
parsed_namae.given = NameCase(parsed_namae.given)
|
@@ -115,8 +119,12 @@ module DwcAgent
|
|
115
119
|
return blank_name
|
116
120
|
end
|
117
121
|
|
122
|
+
if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
|
123
|
+
return blank_name
|
124
|
+
end
|
125
|
+
|
118
126
|
{ given: given, family: family, particle: particle }
|
119
127
|
end
|
120
128
|
|
121
129
|
end
|
122
|
-
end
|
130
|
+
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module DwcAgent
|
2
2
|
STRIP_OUT = %r{
|
3
3
|
^[\[{(]|
|
4
|
-
[\]})]
|
4
|
+
[\]})]\??$|
|
5
5
|
\s*?\d+\.\d+|
|
6
6
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
7
7
|
\b[,;]?\s*(?i:et\.?\s+al)\.?|
|
@@ -13,9 +13,8 @@ module DwcAgent
|
|
13
13
|
\b[,;]?\s*(?i:unkn?own)\b|
|
14
14
|
\b[,;]?\s*(?i:n/a)\b|
|
15
15
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
16
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|
|
16
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
|
17
17
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
18
|
-
\b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
|
19
18
|
\b[,;]?\s*(?i:string)\b|
|
20
19
|
\b[,;]?\s*(?i:person\s*string)\b|
|
21
20
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
@@ -48,10 +47,12 @@ module DwcAgent
|
|
48
47
|
\d+\s+(?i:Oct|Octob(er|re))\.?\b|
|
49
48
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
50
49
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
50
|
+
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
51
51
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
52
52
|
\b\s*(?i:maybe)\s*\b|
|
53
53
|
\b\s*(?i:prob)\.\s*\b|
|
54
54
|
\(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
55
|
+
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
55
56
|
(?i:fide)\:?\s*\b|
|
56
57
|
(?i:game\s+dept)\.?\s*\b|
|
57
58
|
(?i:see\s+notes?\s*(inside)?)|
|
@@ -71,17 +72,19 @@ module DwcAgent
|
|
71
72
|
(?i:American\s+Museum\s+of\s+Natural\s+History)|
|
72
73
|
(?i:The\s+Paleontological\s+Research\s+Institution)|
|
73
74
|
(?i:museum\s+victoria)|
|
75
|
+
\b\s*(?i:United\s+States|Russia)\s*\b|
|
74
76
|
(?i:revised|photograph|fruits\s+only)|
|
75
77
|
-?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
|
76
78
|
-?\s*(?i:synonym(y|ie))|
|
77
79
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
78
80
|
\b(?i:to\s+(sub)?spp?)\.?|
|
79
81
|
(?i:nom\.?\s+rev\.?)|
|
80
|
-
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
82
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|
|
81
83
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
82
84
|
(?i:university|museum|exhibits?)|
|
83
85
|
(?i:uqam)|
|
84
86
|
(?i:sem\s+(colec?tor|data))|
|
87
|
+
(?i:no\s+coll\.?(ector)?)|
|
85
88
|
\b[,;]\s+\d+\z|
|
86
89
|
["!@?]|
|
87
90
|
[,]?\d+|
|
@@ -133,8 +136,6 @@ module DwcAgent
|
|
133
136
|
'|' => ' | ',
|
134
137
|
'(' => ' ',
|
135
138
|
')' => ' ',
|
136
|
-
'[' => ' ',
|
137
|
-
']' => ' ',
|
138
139
|
'?' => '',
|
139
140
|
'!' => '',
|
140
141
|
'=' => '',
|
@@ -152,10 +153,12 @@ module DwcAgent
|
|
152
153
|
}
|
153
154
|
|
154
155
|
PHRASE_SUBS = {
|
155
|
-
'
|
156
|
-
'
|
157
|
-
'
|
158
|
-
'
|
156
|
+
'dr\.' => 'Dr. ',
|
157
|
+
'mr\.' => 'Mr. ',
|
158
|
+
'mrs\.' => 'Mrs. ',
|
159
|
+
'prof\.' => 'Prof. ',
|
160
|
+
'\, ph\.d\.' => ' Ph.D.',
|
161
|
+
'\, bro\.' => ' Bro.'
|
159
162
|
}
|
160
163
|
|
161
164
|
COMPLEX_SEPARATORS = %r{
|
@@ -193,19 +196,22 @@ module DwcAgent
|
|
193
196
|
(?i:geographic)|
|
194
197
|
(?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
|
195
198
|
(?i:univ\.)|
|
196
|
-
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
199
|
+
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
197
200
|
(?i:non\s+pr(é|e)cis(é|e))|
|
198
201
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
199
202
|
(?i:not?\s+(entered|stated))|
|
200
203
|
(?i:nomenclatur(e|al)\s+adjustment)|
|
204
|
+
(?i:not\s+available)|
|
201
205
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
206
|
+
(?i:popa\s+observers?)|
|
202
207
|
(?i:recreation|culture)|
|
203
208
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
204
209
|
(?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
|
205
210
|
(?i:commercial|company|control|product)|
|
211
|
+
(?i:sequence\s+data)|
|
206
212
|
(?i:size|large|colou?r)\s+|
|
207
213
|
(?i:skeleton)|
|
208
|
-
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|
|
214
|
+
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
|
209
215
|
(?i:submersible)|
|
210
216
|
(?i:synonymy?)|(topo|syn|holo)type|
|
211
217
|
(?i:systematic|perspective)|
|
@@ -213,6 +219,7 @@ module DwcAgent
|
|
213
219
|
\s*(?i:too)\s+|\s*(?i:the)\s+|
|
214
220
|
(?i:taxiderm(ies|y))|
|
215
221
|
(?i:though)|
|
222
|
+
(?i:texas\s+instruments?)\s*?(for)?|
|
216
223
|
(?:tropical)|
|
217
224
|
(?i:toward|seen at)|
|
218
225
|
(?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
|
@@ -223,16 +230,35 @@ module DwcAgent
|
|
223
230
|
}x
|
224
231
|
|
225
232
|
FAMILY_BLACKLIST = [
|
233
|
+
"da",
|
234
|
+
"de'",
|
235
|
+
"del",
|
226
236
|
"der",
|
237
|
+
"du",
|
238
|
+
"el",
|
227
239
|
"van",
|
228
240
|
"von",
|
229
241
|
"the",
|
230
242
|
"of",
|
243
|
+
"adjustment",
|
244
|
+
"available",
|
245
|
+
"catalogue",
|
231
246
|
"curators",
|
247
|
+
"data",
|
248
|
+
"determination",
|
249
|
+
"dissection",
|
250
|
+
"entered",
|
232
251
|
"nomenclatural",
|
233
|
-
"
|
252
|
+
"orig",
|
253
|
+
"registration",
|
254
|
+
"science"
|
234
255
|
]
|
235
256
|
|
236
|
-
|
257
|
+
GIVEN_BLACKLIST = [
|
258
|
+
"not any",
|
259
|
+
"has not"
|
260
|
+
]
|
261
|
+
|
262
|
+
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
237
263
|
|
238
264
|
end
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -8,7 +8,7 @@ module DwcAgent
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def initialize
|
11
|
-
options = {
|
11
|
+
options = {
|
12
12
|
prefer_comma_as_separator: true,
|
13
13
|
separator: SPLIT_BY,
|
14
14
|
title: TITLE
|
@@ -17,12 +17,12 @@ module DwcAgent
|
|
17
17
|
|
18
18
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
19
19
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
20
|
-
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join
|
21
|
-
@phrase_subs_regex = Regexp.new
|
20
|
+
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
21
|
+
@phrase_subs_regex = Regexp.new((PHRASE_SUBS.keys.join('|')).to_s, Regexp::IGNORECASE)
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
23
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
24
24
|
end
|
25
|
-
|
25
|
+
|
26
26
|
# Parses the passed-in string and returns a list of names.
|
27
27
|
#
|
28
28
|
# @param names [String] the name or names to be parsed
|
@@ -30,6 +30,7 @@ module DwcAgent
|
|
30
30
|
def parse(name)
|
31
31
|
return [] if name.nil? || name == ""
|
32
32
|
name.gsub!(@strip_out_regex, ' ')
|
33
|
+
name.gsub!(/\[|\]/, '')
|
33
34
|
name.gsub!(@char_subs_regex, CHAR_SUBS)
|
34
35
|
name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
|
35
36
|
name.gsub!(@add_separators_regex, '\1 \2')
|
@@ -41,4 +42,4 @@ module DwcAgent
|
|
41
42
|
end
|
42
43
|
|
43
44
|
end
|
44
|
-
end
|
45
|
+
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|