dwc_agent 1.4.3 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/constants.rb +19 -13
- data/lib/dwc_agent/parser.rb +2 -1
- data/lib/dwc_agent/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fa4fb87ea91fd1f0e67278590192a55bfc7f1e8d6f4b8dc92c1f9f5eb508e44c
|
|
4
|
+
data.tar.gz: a89b51ea705885713ef8615c67e1ea10798abfe593b5646b4de9fb8e1b478762
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d676b64441d0097bd6272e2cd694c5754c4bdaed8fd0f523ecbe28748c8ccedffd9dd1c0430f5ad25cf48c02705b8131a9ce1021c07965da0791ae5f62e36c8a
|
|
7
|
+
data.tar.gz: 00effae1b438e6d97ef8da8383aa407985876d148b5b30a51ca98d2befa0dc8ac4a8c69bb389f75dd08c147853522490a406470dd8c4aa20d976238cf2cb4d82
|
data/lib/dwc_agent/constants.rb
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module DwcAgent
|
|
2
2
|
STRIP_OUT = %r{
|
|
3
3
|
^[\[{(]|
|
|
4
|
-
[\]})]
|
|
4
|
+
[\]})]\??$|
|
|
5
5
|
\s*?\d+\.\d+|
|
|
6
6
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
|
7
7
|
\b[,;]?\s*(?i:et\.?\s+al)\.?|
|
|
@@ -13,7 +13,7 @@ module DwcAgent
|
|
|
13
13
|
\b[,;]?\s*(?i:unkn?own)\b|
|
|
14
14
|
\b[,;]?\s*(?i:n/a)\b|
|
|
15
15
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
|
16
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|
|
|
16
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
|
|
17
17
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
|
18
18
|
\b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
|
|
19
19
|
\b[,;]?\s*(?i:string)\b|
|
|
@@ -80,11 +80,12 @@ module DwcAgent
|
|
|
80
80
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
|
81
81
|
\b(?i:to\s+(sub)?spp?)\.?|
|
|
82
82
|
(?i:nom\.?\s+rev\.?)|
|
|
83
|
-
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
|
83
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|
|
|
84
84
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
|
85
85
|
(?i:university|museum|exhibits?)|
|
|
86
86
|
(?i:uqam)|
|
|
87
87
|
(?i:sem\s+(colec?tor|data))|
|
|
88
|
+
(?i:no\s+coll\.?(ector)?)|
|
|
88
89
|
\b[,;]\s+\d+\z|
|
|
89
90
|
["!@?]|
|
|
90
91
|
[,]?\d+|
|
|
@@ -136,8 +137,6 @@ module DwcAgent
|
|
|
136
137
|
'|' => ' | ',
|
|
137
138
|
'(' => ' ',
|
|
138
139
|
')' => ' ',
|
|
139
|
-
'[' => ' ',
|
|
140
|
-
']' => ' ',
|
|
141
140
|
'?' => '',
|
|
142
141
|
'!' => '',
|
|
143
142
|
'=' => '',
|
|
@@ -196,7 +195,7 @@ module DwcAgent
|
|
|
196
195
|
(?i:geographic)|
|
|
197
196
|
(?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
|
|
198
197
|
(?i:univ\.)|
|
|
199
|
-
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
|
198
|
+
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
|
200
199
|
(?i:non\s+pr(é|e)cis(é|e))|
|
|
201
200
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
|
202
201
|
(?i:not?\s+(entered|stated))|
|
|
@@ -211,7 +210,7 @@ module DwcAgent
|
|
|
211
210
|
(?i:sequence\s+data)|
|
|
212
211
|
(?i:size|large|colou?r)\s+|
|
|
213
212
|
(?i:skeleton)|
|
|
214
|
-
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|
|
|
213
|
+
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
|
|
215
214
|
(?i:submersible)|
|
|
216
215
|
(?i:synonymy?)|(topo|syn|holo)type|
|
|
217
216
|
(?i:systematic|perspective)|
|
|
@@ -230,21 +229,28 @@ module DwcAgent
|
|
|
230
229
|
}x
|
|
231
230
|
|
|
232
231
|
FAMILY_BLACKLIST = [
|
|
232
|
+
"da",
|
|
233
|
+
"de'",
|
|
234
|
+
"del",
|
|
233
235
|
"der",
|
|
236
|
+
"du",
|
|
237
|
+
"el",
|
|
234
238
|
"van",
|
|
235
239
|
"von",
|
|
236
240
|
"the",
|
|
237
241
|
"of",
|
|
238
|
-
"curators",
|
|
239
|
-
"nomenclatural",
|
|
240
242
|
"adjustment",
|
|
241
243
|
"available",
|
|
242
|
-
"data",
|
|
243
|
-
"orig",
|
|
244
|
-
"science",
|
|
245
244
|
"catalogue",
|
|
245
|
+
"curators",
|
|
246
|
+
"data",
|
|
247
|
+
"determination",
|
|
248
|
+
"dissection",
|
|
246
249
|
"entered",
|
|
247
|
-
"
|
|
250
|
+
"nomenclatural",
|
|
251
|
+
"orig",
|
|
252
|
+
"registration",
|
|
253
|
+
"science"
|
|
248
254
|
]
|
|
249
255
|
|
|
250
256
|
GIVEN_BLACKLIST = [
|
data/lib/dwc_agent/parser.rb
CHANGED
|
@@ -17,7 +17,7 @@ module DwcAgent
|
|
|
17
17
|
|
|
18
18
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
|
19
19
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
|
20
|
-
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join
|
|
20
|
+
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
|
21
21
|
@phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
|
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
|
23
23
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
|
@@ -30,6 +30,7 @@ module DwcAgent
|
|
|
30
30
|
def parse(name)
|
|
31
31
|
return [] if name.nil? || name == ""
|
|
32
32
|
name.gsub!(@strip_out_regex, ' ')
|
|
33
|
+
name.gsub!(/\[|\]/, '')
|
|
33
34
|
name.gsub!(@char_subs_regex, CHAR_SUBS)
|
|
34
35
|
name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
|
|
35
36
|
name.gsub!(@add_separators_regex, '\1 \2')
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: dwc_agent
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.4.3
|
|
4
|
+
version: 1.4.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David P. Shorthouse
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2020-01-
|
|
11
|
+
date: 2020-01-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: namae
|