dwc_agent 1.1.0 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/dwcagent +1 -1
- data/bin/dwcagent-similarity +1 -1
- data/lib/dwc_agent/cleaner.rb +15 -11
- data/lib/dwc_agent/constants.rb +15 -5
- data/lib/dwc_agent/parser.rb +0 -2
- data/lib/dwc_agent/version.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 75ac9f77b1b4d5761881cafca45e66c2745eb47bb501ad9fab84cad6324b41ee
|
4
|
+
data.tar.gz: 16057726f330cb033fe7e3af0af58dd4fbb2f6f9817a56c3e86c88da1fbccf61
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f628d3bfec046b54338eb08ec18cc5352dff7560e5e87e2c949378e66bcb44df84af637ea7c8b4d559504945b5682b6f019f2bde56d31552a1c4537902b2a71e
|
7
|
+
data.tar.gz: 444b15c0d391e12d8ccd6823f8ae97688d9b20ed075323795e22acd7834add4e5068fcc2d4fc3729e36aa3bdc11f29f4a6f8af8c7fc1cfc34d1f294159212328
|
data/bin/dwcagent
CHANGED
data/bin/dwcagent-similarity
CHANGED
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -22,7 +22,7 @@ module DwcAgent
|
|
22
22
|
return blank_name
|
23
23
|
end
|
24
24
|
|
25
|
-
if parsed_namae.
|
25
|
+
if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
26
26
|
return blank_name
|
27
27
|
end
|
28
28
|
|
@@ -42,9 +42,9 @@ module DwcAgent
|
|
42
42
|
return blank_name
|
43
43
|
end
|
44
44
|
|
45
|
-
if parsed_namae.given &&
|
46
|
-
parsed_namae.family &&
|
47
|
-
parsed_namae.family.count(".") > 0 &&
|
45
|
+
if parsed_namae.given &&
|
46
|
+
parsed_namae.family &&
|
47
|
+
parsed_namae.family.count(".") > 0 &&
|
48
48
|
parsed_namae.family.length - parsed_namae.family.count(".") <= 3
|
49
49
|
given = parsed_namae.given
|
50
50
|
family = parsed_namae.family
|
@@ -52,9 +52,9 @@ module DwcAgent
|
|
52
52
|
parsed_namae.given = family
|
53
53
|
end
|
54
54
|
|
55
|
-
if parsed_namae.given &&
|
56
|
-
parsed_namae.family &&
|
57
|
-
parsed_namae.family.length <=3 &&
|
55
|
+
if parsed_namae.given &&
|
56
|
+
parsed_namae.family &&
|
57
|
+
parsed_namae.family.length <=3 &&
|
58
58
|
parsed_namae.family == parsed_namae.family.upcase &&
|
59
59
|
parsed_namae.given[-1] != "."
|
60
60
|
given = parsed_namae.given
|
@@ -63,9 +63,9 @@ module DwcAgent
|
|
63
63
|
parsed_namae.given = family
|
64
64
|
end
|
65
65
|
|
66
|
-
if parsed_namae.given &&
|
67
|
-
(parsed_namae.given == parsed_namae.given.upcase ||
|
68
|
-
parsed_namae.given == parsed_namae.given.downcase) &&
|
66
|
+
if parsed_namae.given &&
|
67
|
+
(parsed_namae.given == parsed_namae.given.upcase ||
|
68
|
+
parsed_namae.given == parsed_namae.given.downcase) &&
|
69
69
|
!parsed_namae.given.include?(".") &&
|
70
70
|
parsed_namae.given.tr(".","").length >= 4
|
71
71
|
parsed_namae.given = NameCase(parsed_namae.given)
|
@@ -119,8 +119,12 @@ module DwcAgent
|
|
119
119
|
return blank_name
|
120
120
|
end
|
121
121
|
|
122
|
+
if !given.nil? && GIVEN_BLACKLIST.any?{ |s| s.casecmp(given) == 0 }
|
123
|
+
return blank_name
|
124
|
+
end
|
125
|
+
|
122
126
|
{ given: given, family: family, particle: particle }
|
123
127
|
end
|
124
128
|
|
125
129
|
end
|
126
|
-
end
|
130
|
+
end
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -162,10 +162,6 @@ module DwcAgent
|
|
162
162
|
^(\S{4,},\s+(?:\S\.\s*){1,})\s+(\S{4,},\s+(?:\S\.\s*){1,})$
|
163
163
|
}x
|
164
164
|
|
165
|
-
CONTRACTED_LIST = %r{
|
166
|
-
^(\S{1,}\.?)+\s+(?i:and|&)\s+(\S{1,}\.?)+\s*(.*)$
|
167
|
-
}x
|
168
|
-
|
169
165
|
BLACKLIST = %r{
|
170
166
|
(?i:abundant)|
|
171
167
|
(?i:adult|juvenile)|
|
@@ -202,11 +198,13 @@ module DwcAgent
|
|
202
198
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
203
199
|
(?i:not?\s+(entered|stated))|
|
204
200
|
(?i:nomenclatur(e|al)\s+adjustment)|
|
201
|
+
(?i:not\s+available)|
|
205
202
|
(?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
|
206
203
|
(?i:recreation|culture)|
|
207
204
|
(?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
|
208
205
|
(?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|class|commission|consortium|council|club|exposit|alliance|protective|circle)|
|
209
206
|
(?i:commercial|company|control|product)|
|
207
|
+
(?i:sequence\s+data)|
|
210
208
|
(?i:size|large|colou?r)\s+|
|
211
209
|
(?i:skeleton)|
|
212
210
|
(?i:survey|assessment|station|monitor|stn\.|index|project|bureau|engine|exchange|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
|
@@ -217,6 +215,7 @@ module DwcAgent
|
|
217
215
|
\s*(?i:too)\s+|\s*(?i:the)\s+|
|
218
216
|
(?i:taxiderm(ies|y))|
|
219
217
|
(?i:though)|
|
218
|
+
(?i:texas\s+instruments?)\s*?(for)?|
|
220
219
|
(?:tropical)|
|
221
220
|
(?i:toward|seen at)|
|
222
221
|
(?i:unidentified|unspecified|unk?nown|unnamed|unread|unmistak|no agent)|
|
@@ -234,7 +233,18 @@ module DwcAgent
|
|
234
233
|
"of",
|
235
234
|
"curators",
|
236
235
|
"nomenclatural",
|
237
|
-
"adjustment"
|
236
|
+
"adjustment",
|
237
|
+
"available",
|
238
|
+
"data",
|
239
|
+
"orig",
|
240
|
+
"science",
|
241
|
+
"catalogue",
|
242
|
+
"entered"
|
243
|
+
]
|
244
|
+
|
245
|
+
GIVEN_BLACKLIST = [
|
246
|
+
"not any",
|
247
|
+
"has not"
|
238
248
|
]
|
239
249
|
|
240
250
|
TITLE = /\s*\b(sir|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d|rev|docteur|mme|abbé|ptre)\.?|frère|frere|père|pere|professor|esq\.?)(\s+|$)/i
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -21,7 +21,6 @@ module DwcAgent
|
|
21
21
|
@phrase_subs_regex = Regexp.new (PHRASE_SUBS.keys.join('|')).to_s
|
22
22
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
23
23
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
24
|
-
@contracted_list_regex = Regexp.new CONTRACTED_LIST.to_s
|
25
24
|
end
|
26
25
|
|
27
26
|
# Parses the passed-in string and returns a list of names.
|
@@ -35,7 +34,6 @@ module DwcAgent
|
|
35
34
|
name.gsub!(@phrase_subs_regex, PHRASE_SUBS)
|
36
35
|
name.gsub!(@add_separators_regex, '\1 \2')
|
37
36
|
name.gsub!(@complex_separators_regex, '\1 | \2')
|
38
|
-
name.gsub!(@contracted_list_regex, '\1 \3 | \2 \3')
|
39
37
|
name.gsub!(@residual_terminators_regex, '')
|
40
38
|
name.squeeze!(' ')
|
41
39
|
name.strip!
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-12-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|
@@ -119,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
119
|
- !ruby/object:Gem::Version
|
120
120
|
version: '0'
|
121
121
|
requirements: []
|
122
|
-
rubygems_version: 3.0.
|
122
|
+
rubygems_version: 3.0.6
|
123
123
|
signing_key:
|
124
124
|
specification_version: 4
|
125
125
|
summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
|