dwc_agent 1.3.0 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/dwcagent +1 -1
- data/bin/dwcagent-similarity +1 -1
- data/lib/dwc_agent/cleaner.rb +18 -10
- data/lib/dwc_agent/constants.rb +33 -8
- data/lib/dwc_agent/parser.rb +2 -1
- data/lib/dwc_agent/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fa4fb87ea91fd1f0e67278590192a55bfc7f1e8d6f4b8dc92c1f9f5eb508e44c
|
4
|
+
data.tar.gz: a89b51ea705885713ef8615c67e1ea10798abfe593b5646b4de9fb8e1b478762
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d676b64441d0097bd6272e2cd694c5754c4bdaed8fd0f523ecbe28748c8ccedffd9dd1c0430f5ad25cf48c02705b8131a9ce1021c07965da0791ae5f62e36c8a
|
7
|
+
data.tar.gz: 00effae1b438e6d97ef8da8383aa407985876d148b5b30a51ca98d2befa0dc8ac4a8c69bb389f75dd08c147853522490a406470dd8c4aa20d976238cf2cb4d82
|
data/bin/dwcagent
CHANGED
data/bin/dwcagent-similarity
CHANGED
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -22,6 +22,10 @@ module DwcAgent
|
|
22
22
|
return blank_name
|
23
23
|
end
|
24
24
|
|
25
|
+
if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
26
|
+
return blank_name
|
27
|
+
end
|
28
|
+
|
25
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
26
30
|
return blank_name
|
27
31
|
end
|
@@ -38,9 +42,9 @@ module DwcAgent
|
|
38
42
|
return blank_name
|
39
43
|
end
|
40
44
|
|
41
|
-
if parsed_namae.given &&
|
42
|
-
parsed_namae.family &&
|
43
|
-
parsed_namae.family.count(".") > 0 &&
|
45
|
+
if parsed_namae.given &&
|
46
|
+
parsed_namae.family &&
|
47
|
+
parsed_namae.family.count(".") > 0 &&
|
44
48
|
parsed_namae.family.length - parsed_namae.family.count(".") <= 3
|
45
49
|
given = parsed_namae.given
|
46
50
|
family = parsed_namae.family
|
@@ -48,9 +52,9 @@ module DwcAgent
|
|
48
52
|
parsed_namae.given = family
|
49
53
|
end
|
50
54
|
|
51
|
-
if parsed_namae.given &&
|
52
|
-
parsed_namae.family &&
|
53
|
-
parsed_namae.family.length <=3 &&
|
55
|
+
if parsed_namae.given &&
|
56
|
+
parsed_namae.family &&
|
57
|
+
parsed_namae.family.length <=3 &&
|
54
58
|
parsed_namae.family == parsed_namae.family.upcase &&
|
55
59
|
parsed_namae.given[-1] != "."
|
56
60
|
given = parsed_namae.given
|
@@ -59,9 +63,9 @@ module DwcAgent
|
|
59
63
|
parsed_namae.given = family
|
60
64
|
end
|
61
65
|
|
62
|
-
if parsed_namae.given &&
|
63
|
-
(parsed_namae.given == parsed_namae.given.upcase ||
|
64
|
-
parsed_namae.given == parsed_namae.given.downcase) &&
|
66
|
+
if parsed_namae.given &&
|
67
|
+
(parsed_namae.given == parsed_namae.given.upcase ||
|
68
|
+
parsed_namae.given == parsed_namae.given.downcase) &&
|
65
69
|
!parsed_namae.given.include?(".") &&
|
66
70
|
parsed_namae.given.tr(".","").length >= 4
|
67
71
|
parsed_namae.given = NameCase(parsed_namae.given)
|
@@ -115,8 +119,12 @@ module DwcAgent
|
|
115
119
|
return blank_name
|
116
120
|
end
|
117
121
|
|
122
|
+
if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
|
123
|
+
return blank_name
|
124
|
+
end
|
125
|
+
|
118
126
|
{ given: given, family: family, particle: particle }
|
119
127
|
end
|
120
128
|
|
121
129
|
end
|
122
|
-
end
|
130
|
+
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module DwcAgent
|
2
2
|
STRIP_OUT = %r{
|
3
3
|
^[\[{(]|
|
4
|
-
[\]})]
|
4
|
+
[\]})]\??$|
|
5
5
|
\s*?\d+\.\d+|
|
6
6
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
7
7
|
\b[,;]?\s*(?i:et\.?\s+al)\.?|
|
@@ -13,7 +13,7 @@ module DwcAgent
|
|
13
13
|
\b[,;]?\s*(?i:unkn?own)\b|
|
14
14
|
\b[,;]?\s*(?i:n/a)\b|
|
15
15
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
16
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|
|
16
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit)\)?\b|
|
17
17
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
18
18
|
\b[,;]?\s*(?i:frère|frere|père|pere|soeur|sister|bro)\.?(\b|\z)|
|
19
19
|
\b[,;]?\s*(?i:string)\b|
|
@@ -48,10 +48,12 @@ module DwcAgent
|
|
48
48
|
\d+\s+(?i:Oct|Octob(er|re))\.?\b|
|
49
49
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
50
50
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
51
|
+
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
51
52
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
52
53
|
\b\s*(?i:maybe)\s*\b|
|
53
54
|
\b\s*(?i:prob)\.\s*\b|
|
54
55
|
\(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
56
|
+
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
55
57
|
(?i:fide)\:?\s*\b|
|
56
58
|
(?i:game\s+dept)\.?\s*\b|
|
57
59
|
(?i:see\s+notes?\s*(inside)?)|
|
@@ -71,17 +73,19 @@ module DwcAgent
|
|
71
73
|
(?i:American\s+Museum\s+of\s+Natural\s+History)|
|
72
74
|
(?i:The\s+Paleontological\s+Research\s+Institution)|
|
73
75
|
(?i:museum\s+victoria)|
|
76
|
+
\b\s*(?i:United\s+States|Russia)\s*\b|
|
74
77
|
(?i:revised|photograph|fruits\s+only)|
|
75
78
|
-?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
|
76
79
|
-?\s*(?i:synonym(y|ie))|
|
77
80
|
\b\s*\(?(?i:(fe)?male)\)?\s*\b|
|
78
81
|
\b(?i:to\s+(sub)?spp?)\.?|
|
79
82
|
(?i:nom\.?\s+rev\.?)|
|
80
|
-
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|
|
83
|
+
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|
|
81
84
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
82
85
|
(?i:university|museum|exhibits?)|
|
83
86
|
(?i:uqam)|
|
84
87
|
(?i:sem\s+(colec?tor|data))|
|
88
|
+
(?i:no\s+coll\.?(ector)?)|
|
85
89
|
\b[,;]\s+\d+\z|
|
86
90
|
["!@?]|
|
87
91
|
[,]?\d+|
|
@@ -133,8 +137,6 @@ module DwcAgent
|
|
133
137
|
'|' => ' | ',
|
134
138
|
'(' => ' ',
|
135
139
|
')' => ' ',
|
136
|
-
'[' => ' ',
|
137
|
-
']' => ' ',
|
138
140
|
'?' => '',
|
139
141
|
'!' => '',
|
140
142
|
'=' => '',
|
@@ -193,19 +195,22 @@ module DwcAgent
|
|
193
195
|
(?i:geographic)|
|
194
196
|
(?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
|
195
197
|
(?i:univ\.)|
|
196
|
-
(?i:graduate|student|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
198
|
+
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fishermen|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker)|
|
197
199
|
(?i:non\s+pr(é|e)cis(é|e))|
|
198
200
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
199
201
|
(?i:not?\s+(entered|stated))|
|
200
202
|
(?i:nomenclatur(e|al)\s+adjustment)|
|
203
|
+
(?i:not\s+available)|
|
201
204
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
205
|
+
(?i:popa\s+observers?)|
|
202
206
|
(?i:recreation|culture)|
|
203
207
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
204
208
|
(?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
|
205
209
|
(?i:commercial|company|control|product)|
|
210
|
+
(?i:sequence\s+data)|
|
206
211
|
(?i:size|large|colou?r)\s+|
|
207
212
|
(?i:skeleton)|
|
208
|
-
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|
|
213
|
+
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
|
209
214
|
(?i:submersible)|
|
210
215
|
(?i:synonymy?)|(topo|syn|holo)type|
|
211
216
|
(?i:systematic|perspective)|
|
@@ -213,6 +218,7 @@ module DwcAgent
|
|
213
218
|
\s*(?i:too)\s+|\s*(?i:the)\s+|
|
214
219
|
(?i:taxiderm(ies|y))|
|
215
220
|
(?i:though)|
|
221
|
+
(?i:texas\s+instruments?)\s*?(for)?|
|
216
222
|
(?:tropical)|
|
217
223
|
(?i:toward|seen at)|
|
218
224
|
(?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
|
@@ -223,14 +229,33 @@ module DwcAgent
|
|
223
229
|
}x
|
224
230
|
|
225
231
|
FAMILY_BLACKLIST = [
|
232
|
+
"da",
|
233
|
+
"de'",
|
234
|
+
"del",
|
226
235
|
"der",
|
236
|
+
"du",
|
237
|
+
"el",
|
227
238
|
"van",
|
228
239
|
"von",
|
229
240
|
"the",
|
230
241
|
"of",
|
242
|
+
"adjustment",
|
243
|
+
"available",
|
244
|
+
"catalogue",
|
231
245
|
"curators",
|
246
|
+
"data",
|
247
|
+
"determination",
|
248
|
+
"dissection",
|
249
|
+
"entered",
|
232
250
|
"nomenclatural",
|
233
|
-
"
|
251
|
+
"orig",
|
252
|
+
"registration",
|
253
|
+
"science"
|
254
|
+
]
|
255
|
+
|
256
|
+
GIVEN_BLACKLIST = [
|
257
|
+
"not any",
|
258
|
+
"has not"
|
234
259
|
]
|
235
260
|
|
236
261
|
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -17,7 +17,7 @@ module DwcAgent
|
|
17
17
|
|
18
18
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
19
19
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
20
|
-
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join
|
20
|
+
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
21
21
|
@phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
23
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
@@ -30,6 +30,7 @@ module DwcAgent
|
|
30
30
|
def parse(name)
|
31
31
|
return [] if name.nil? || name == ""
|
32
32
|
name.gsub!(@strip_out_regex, ' ')
|
33
|
+
name.gsub!(/\[|\]/, '')
|
33
34
|
name.gsub!(@char_subs_regex, CHAR_SUBS)
|
34
35
|
name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
|
35
36
|
name.gsub!(@add_separators_regex, '\1 \2')
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|