dwc_agent 1.2.0 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/dwcagent +1 -1
- data/bin/dwcagent-similarity +1 -1
- data/lib/dwc_agent/cleaner.rb +18 -10
- data/lib/dwc_agent/constants.rb +20 -11
- data/lib/dwc_agent/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ee3ede60926c849a6db7094f63a9d03d07f3e9d13fd74428841812f8970aa681
|
4
|
+
data.tar.gz: 7075c82cd35834f8dad00d57922e46f7ab0eb4f086eb4c0ad40b02c5726ca001
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9e7b655e50ec2d744d74ad44a30b35d75e76b7e0160a35ee7e6b295a03dbc343a012a702e3614c10516bf1282617ec00a0f472e2cc1aefc9d3decc9b4494a946
|
7
|
+
data.tar.gz: a28bd967f7df5afaf5e71a01f3902c0bab33dc735bc0861ceb137bad2d81981ec9ae6768497afff267b986bf653f9ba1fb8091221720964866b3352f8d8e2aae
|
data/bin/dwcagent
CHANGED
data/bin/dwcagent-similarity
CHANGED
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -22,6 +22,10 @@ module DwcAgent
|
|
22
22
|
return blank_name
|
23
23
|
end
|
24
24
|
|
25
|
+
if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
26
|
+
return blank_name
|
27
|
+
end
|
28
|
+
|
25
29
|
if parsed_namae.family && parsed_namae.family.length == 3 && parsed_namae.family.count('.') == 1
|
26
30
|
return blank_name
|
27
31
|
end
|
@@ -38,9 +42,9 @@ module DwcAgent
|
|
38
42
|
return blank_name
|
39
43
|
end
|
40
44
|
|
41
|
-
if parsed_namae.given &&
|
42
|
-
parsed_namae.family &&
|
43
|
-
parsed_namae.family.count(".") > 0 &&
|
45
|
+
if parsed_namae.given &&
|
46
|
+
parsed_namae.family &&
|
47
|
+
parsed_namae.family.count(".") > 0 &&
|
44
48
|
parsed_namae.family.length - parsed_namae.family.count(".") <= 3
|
45
49
|
given = parsed_namae.given
|
46
50
|
family = parsed_namae.family
|
@@ -48,9 +52,9 @@ module DwcAgent
|
|
48
52
|
parsed_namae.given = family
|
49
53
|
end
|
50
54
|
|
51
|
-
if parsed_namae.given &&
|
52
|
-
parsed_namae.family &&
|
53
|
-
parsed_namae.family.length <=3 &&
|
55
|
+
if parsed_namae.given &&
|
56
|
+
parsed_namae.family &&
|
57
|
+
parsed_namae.family.length <=3 &&
|
54
58
|
parsed_namae.family == parsed_namae.family.upcase &&
|
55
59
|
parsed_namae.given[-1] != "."
|
56
60
|
given = parsed_namae.given
|
@@ -59,9 +63,9 @@ module DwcAgent
|
|
59
63
|
parsed_namae.given = family
|
60
64
|
end
|
61
65
|
|
62
|
-
if parsed_namae.given &&
|
63
|
-
(parsed_namae.given == parsed_namae.given.upcase ||
|
64
|
-
parsed_namae.given == parsed_namae.given.downcase) &&
|
66
|
+
if parsed_namae.given &&
|
67
|
+
(parsed_namae.given == parsed_namae.given.upcase ||
|
68
|
+
parsed_namae.given == parsed_namae.given.downcase) &&
|
65
69
|
!parsed_namae.given.include?(".") &&
|
66
70
|
parsed_namae.given.tr(".","").length >= 4
|
67
71
|
parsed_namae.given = NameCase(parsed_namae.given)
|
@@ -115,8 +119,12 @@ module DwcAgent
|
|
115
119
|
return blank_name
|
116
120
|
end
|
117
121
|
|
122
|
+
if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
|
123
|
+
return blank_name
|
124
|
+
end
|
125
|
+
|
118
126
|
{ given: given, family: family, particle: particle }
|
119
127
|
end
|
120
128
|
|
121
129
|
end
|
122
|
-
end
|
130
|
+
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -48,10 +48,12 @@ module DwcAgent
|
|
48
48
|
\d+\s+(?i:Oct|Octob(er|re))\.?\b|
|
49
49
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
50
50
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
51
|
+
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
51
52
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
52
53
|
\b\s*(?i:maybe)\s*\b|
|
53
54
|
\b\s*(?i:prob)\.\s*\b|
|
54
55
|
\(?[,]?\s*?(?i:(local)?\s?collector|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
56
|
+
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
55
57
|
(?i:fide)\:?\s*\b|
|
56
58
|
(?i:game\s+dept)\.?\s*\b|
|
57
59
|
(?i:see\s+notes?\s*(inside)?)|
|
@@ -71,6 +73,7 @@ module DwcAgent
|
|
71
73
|
(?i:American\s+Museum\s+of\s+Natural\s+History)|
|
72
74
|
(?i:The\s+Paleontological\s+Research\s+Institution)|
|
73
75
|
(?i:museum\s+victoria)|
|
76
|
+
\b\s*(?i:United\s+States|Russia)\s*\b|
|
74
77
|
(?i:revised|photograph|fruits\s+only)|
|
75
78
|
-?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
|
76
79
|
-?\s*(?i:synonym(y|ie))|
|
@@ -162,16 +165,6 @@ module DwcAgent
|
|
162
165
|
^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
|
163
166
|
}x
|
164
167
|
|
165
|
-
# Was used in 1.1.0 but it sunk performance so threw it back to a WIP
|
166
|
-
#
|
167
|
-
# @contracted_list_regex = Regexp.new CONTRACTED_LIST.to_s
|
168
|
-
#
|
169
|
-
# name.gsub!(@contracted_list_regex, '\1 \3 | \2 \3')
|
170
|
-
#
|
171
|
-
# CONTRACTED_LIST = %r{
|
172
|
-
# ^(\S{1,}\.?)+\s+(?i:and|&)\s+(\S{1,}\.?)+\s*(.*)$
|
173
|
-
# }x
|
174
|
-
|
175
168
|
BLACKLIST = %r{
|
176
169
|
(?i:abundant)|
|
177
170
|
(?i:adult|juvenile)|
|
@@ -208,11 +201,14 @@ module DwcAgent
|
|
208
201
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
209
202
|
(?i:not?\s+(entered|stated))|
|
210
203
|
(?i:nomenclatur(e|al)\s+adjustment)|
|
204
|
+
(?i:not\s+available)|
|
211
205
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
206
|
+
(?i:popa\s+observers?)|
|
212
207
|
(?i:recreation|culture)|
|
213
208
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
214
209
|
(?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
|
215
210
|
(?i:commercial|company|control|product)|
|
211
|
+
(?i:sequence\s+data)|
|
216
212
|
(?i:size|large|colou?r)\s+|
|
217
213
|
(?i:skeleton)|
|
218
214
|
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|exchange|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
|
@@ -223,6 +219,7 @@ module DwcAgent
|
|
223
219
|
\s*(?i:too)\s+|\s*(?i:the)\s+|
|
224
220
|
(?i:taxiderm(ies|y))|
|
225
221
|
(?i:though)|
|
222
|
+
(?i:texas\s+instruments?)\s*?(for)?|
|
226
223
|
(?:tropical)|
|
227
224
|
(?i:toward|seen at)|
|
228
225
|
(?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
|
@@ -240,7 +237,19 @@ module DwcAgent
|
|
240
237
|
"of",
|
241
238
|
"curators",
|
242
239
|
"nomenclatural",
|
243
|
-
"adjustment"
|
240
|
+
"adjustment",
|
241
|
+
"available",
|
242
|
+
"data",
|
243
|
+
"orig",
|
244
|
+
"science",
|
245
|
+
"catalogue",
|
246
|
+
"entered",
|
247
|
+
"registration"
|
248
|
+
]
|
249
|
+
|
250
|
+
GIVEN_BLACKLIST = [
|
251
|
+
"not any",
|
252
|
+
"has not"
|
244
253
|
]
|
245
254
|
|
246
255
|
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|