dwc_agent 3.0.7.0 → 3.0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/constants.rb +13 -4
- data/lib/dwc_agent/parser.rb +2 -0
- data/lib/dwc_agent/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c0d83fd4dba1ddb6b9976bb1dea4c95a5365cf31b9c1afc335e035f70317a40f
|
4
|
+
data.tar.gz: 2fc6de4a6de283d9cf8d813306bb034ca67e20e684ce6136d6f2366512633d34
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d6ca8f0a7507c8a8d09b183f59ca47d97ff30a0de3e017da155e0b1e57ae5d34df4328ae9bdbb2fd4682b0d57afac7e251baebaa305fea5bc46b2886b22e5385
|
7
|
+
data.tar.gz: b9ad4777ade9052a3cd54926173c77973f899e76b3f65f43f31b45481de746e70daa4ecdcbb0b190567c27f3496d69cb5093d35522243b284ac5d4931fbe09a0
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -161,6 +161,10 @@ module DwcAgent
|
|
161
161
|
\b(?i:via|from)\s*\b
|
162
162
|
}x
|
163
163
|
|
164
|
+
POST_STRIP_TIDY = %r{
|
165
|
+
^\s*[&,;]\s*
|
166
|
+
}x
|
167
|
+
|
164
168
|
CHAR_SUBS = {
|
165
169
|
'"' => '\'',
|
166
170
|
'|' => ' | ',
|
@@ -199,15 +203,18 @@ module DwcAgent
|
|
199
203
|
' jr.,' => ' Jr.;',
|
200
204
|
' jr,' => ' Jr.;',
|
201
205
|
'-jr' => ' Jr.',
|
202
|
-
'-Jr' => ' Jr.'
|
206
|
+
'-Jr' => ' Jr.',
|
207
|
+
'Dr.' => 'Dr. ',
|
208
|
+
'prof.' => 'Prof. '
|
203
209
|
}
|
204
210
|
|
205
211
|
SEPARATORS = {
|
212
|
+
"^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]{1,})$" => "\\2 \\3 \\1",
|
206
213
|
"^([A-Z]{1}\\.\\s*[[:alpha:]]{1,}),\\s*?([A-Z.]{1,})$" => "\\1 \\2",
|
207
214
|
"^(\\S{4,},\\s+(?:\\S\\.\\s*){1,})\\s+(\\S{4,},\\s+(?:\\S\.\\s*){1,})$" => "\\1 | \\2",
|
208
215
|
"(\\S{1}\\.)([[:alpha:]]{2,})" => "\\1 \\2",
|
209
216
|
"^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?){1,})$" => "\\1, \\2",
|
210
|
-
"([[:alpha:]]*),?\\s
|
217
|
+
"([[:alpha:]]*),?\\s*(.*)\\s+(van|von)$" => "\\3 \\1, \\2",
|
211
218
|
"^([A-Z.\\s]+)\\s+(?:and|&|et|e)\\s+([A-Z.\\s]+)\\s+([[:alpha:]]{2,})\\s+([[:alpha:]]{2,})$" => "\\1 \\4 | \\2 \\3 \\4",
|
212
219
|
"^([A-Z.\\s]+)\\s+(?:and|&|et|e)\\s+([A-Z.\\s]+)\\s+([[:alpha:]]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
|
213
220
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{2,})$" => "\\1 | \\2 | \\3",
|
@@ -219,7 +226,7 @@ module DwcAgent
|
|
219
226
|
(?i:abundant)|
|
220
227
|
(?i:adult|juvenile)|
|
221
228
|
(?i:administra(d|t)or)|
|
222
|
-
(?i:anon)
|
229
|
+
^(?i:anon)$|
|
223
230
|
(?i:australian?)|
|
224
231
|
(?i:average)|
|
225
232
|
(?i:believe|unclear|ill?egible|none|suggested|(dis)?agrees?)|approach|
|
@@ -329,6 +336,8 @@ module DwcAgent
|
|
329
336
|
"inst",
|
330
337
|
"nomenclatural",
|
331
338
|
"orig",
|
339
|
+
"prof",
|
340
|
+
"professional",
|
332
341
|
"qld",
|
333
342
|
"registration",
|
334
343
|
"science",
|
@@ -346,7 +355,7 @@ module DwcAgent
|
|
346
355
|
"has not"
|
347
356
|
]
|
348
357
|
|
349
|
-
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|proff?|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|doct(eu|o)r|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
358
|
+
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|proff?|dr|dra\.|drª|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|doct(eu|o)r|father|cantor|vicar|père|pastor|profa\.?|profª|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
350
359
|
|
351
360
|
APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
352
361
|
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -18,6 +18,7 @@ module DwcAgent
|
|
18
18
|
}
|
19
19
|
@namae = Namae::Parser.new(options)
|
20
20
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
21
|
+
@tidy_remains_regex = Regexp.new POST_STRIP_TIDY.to_s
|
21
22
|
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
22
23
|
@phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
|
23
24
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
@@ -31,6 +32,7 @@ module DwcAgent
|
|
31
32
|
def parse(name)
|
32
33
|
return [] if name.nil? || name == ""
|
33
34
|
name.gsub!(@strip_out_regex, ' ')
|
35
|
+
name.gsub!(@tidy_remains_regex, '')
|
34
36
|
name.gsub!(Regexp.union(@char_subs_regex, @phrase_subs_regex), CHAR_SUBS.merge(PHRASE_SUBS))
|
35
37
|
@separators.each{|k| name.gsub!(k[0], k[1])}
|
36
38
|
name.gsub!(@residual_terminators_regex, '')
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.7.0
|
4
|
+
version: 3.0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-09-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|