dwc_agent 1.4.3 → 1.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/dwc_agent/constants.rb +19 -13
- data/lib/dwc_agent/parser.rb +2 -1
- data/lib/dwc_agent/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fa4fb87ea91fd1f0e67278590192a55bfc7f1e8d6f4b8dc92c1f9f5eb508e44c
|
4
|
+
data.tar.gz: a89b51ea705885713ef8615c67e1ea10798abfe593b5646b4de9fb8e1b478762
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d676b64441d0097bd6272e2cd694c5754c4bdaed8fd0f523ecbe28748c8ccedffd9dd1c0430f5ad25cf48c02705b8131a9ce1021c07965da0791ae5f62e36c8a
|
7
|
+
data.tar.gz: 00effae1b438e6d97ef8da8383aa407985876d148b5b30a51ca98d2befa0dc8ac4a8c69bb389f75dd08c147853522490a406470dd8c4aa20d976238cf2cb4d82
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module DwcAgent
|
2
2
|
STRIP_OUT = %r{
|
3
3
|
^[\[{(]|
|
4
|
-
[\]})]
|
4
|
+
[\]})]\??$|
|
5
5
|
\s*?\d+\.\d+|
|
6
6
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
7
7
|
\b[,;]?\s*(?i:et\.?\s+al)\.?|
|
@@ -13,7 +13,7 @@ module DwcAgent
|
|
13
13
|
\b[,;]?\s*(?i:unkn?own)\b|
|
14
14
|
\b[,;]?\s*(?i:n/a)\b|
|
15
15
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
16
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|
|
16
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
|
17
17
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
18
18
|
\b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
|
19
19
|
\b[,;]?\s*(?i:string)\b|
|
@@ -80,11 +80,12 @@ module DwcAgent
|
|
80
80
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
81
81
|
\b(?i:to\s+(sub)?spp?)\.?|
|
82
82
|
(?i:nom\.?\s+rev\.?)|
|
83
|
-
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
83
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|
|
84
84
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
85
85
|
(?i:university|museum|exhibits?)|
|
86
86
|
(?i:uqam)|
|
87
87
|
(?i:sem\s+(colec?tor|data))|
|
88
|
+
(?i:no\s+coll\.?(ector)?)|
|
88
89
|
\b[,;]\s+\d+\z|
|
89
90
|
["!@?]|
|
90
91
|
[,]?\d+|
|
@@ -136,8 +137,6 @@ module DwcAgent
|
|
136
137
|
'|' => ' | ',
|
137
138
|
'(' => ' ',
|
138
139
|
')' => ' ',
|
139
|
-
'[' => ' ',
|
140
|
-
']' => ' ',
|
141
140
|
'?' => '',
|
142
141
|
'!' => '',
|
143
142
|
'=' => '',
|
@@ -196,7 +195,7 @@ module DwcAgent
|
|
196
195
|
(?i:geographic)|
|
197
196
|
(?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
|
198
197
|
(?i:univ\.)|
|
199
|
-
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
198
|
+
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
200
199
|
(?i:non\s+pr(é|e)cis(é|e))|
|
201
200
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
202
201
|
(?i:not?\s+(entered|stated))|
|
@@ -211,7 +210,7 @@ module DwcAgent
|
|
211
210
|
(?i:sequence\s+data)|
|
212
211
|
(?i:size|large|colou?r)\s+|
|
213
212
|
(?i:skeleton)|
|
214
|
-
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|
|
213
|
+
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
|
215
214
|
(?i:submersible)|
|
216
215
|
(?i:synonymy?)|(topo|syn|holo)type|
|
217
216
|
(?i:systematic|perspective)|
|
@@ -230,21 +229,28 @@ module DwcAgent
|
|
230
229
|
}x
|
231
230
|
|
232
231
|
FAMILY_BLACKLIST = [
|
232
|
+
"da",
|
233
|
+
"de'",
|
234
|
+
"del",
|
233
235
|
"der",
|
236
|
+
"du",
|
237
|
+
"el",
|
234
238
|
"van",
|
235
239
|
"von",
|
236
240
|
"the",
|
237
241
|
"of",
|
238
|
-
"curators",
|
239
|
-
"nomenclatural",
|
240
242
|
"adjustment",
|
241
243
|
"available",
|
242
|
-
"data",
|
243
|
-
"orig",
|
244
|
-
"science",
|
245
244
|
"catalogue",
|
245
|
+
"curators",
|
246
|
+
"data",
|
247
|
+
"determination",
|
248
|
+
"dissection",
|
246
249
|
"entered",
|
247
|
-
"
|
250
|
+
"nomenclatural",
|
251
|
+
"orig",
|
252
|
+
"registration",
|
253
|
+
"science"
|
248
254
|
]
|
249
255
|
|
250
256
|
GIVEN_BLACKLIST = [
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -17,7 +17,7 @@ module DwcAgent
|
|
17
17
|
|
18
18
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
19
19
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
20
|
-
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join
|
20
|
+
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
21
21
|
@phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
23
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
@@ -30,6 +30,7 @@ module DwcAgent
|
|
30
30
|
def parse(name)
|
31
31
|
return [] if name.nil? || name == ""
|
32
32
|
name.gsub!(@strip_out_regex, ' ')
|
33
|
+
name.gsub!(/\[|\]/, '')
|
33
34
|
name.gsub!(@char_subs_regex, CHAR_SUBS)
|
34
35
|
name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
|
35
36
|
name.gsub!(@add_separators_regex, '\1 \2')
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.3
|
4
|
+
version: 1.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|