dwc_agent 3.4.2.0 → 3.4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/constants.rb +21 -18
- data/lib/dwc_agent/parser.rb +2 -2
- data/lib/dwc_agent/version.rb +2 -2
- metadata +17 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 29cbee65088adb6c5914a955a61f7fb6ff88e84fc72394cae0168edeb8652953
|
|
4
|
+
data.tar.gz: a79f84cd59930282009d7f86545f4ce2a726d1818693b0abd9648add0160ca41
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 79b6048d32a3ccd7e8fa6c6833c5df709f7a37cd2fc3f904d4075796f669cc33fd185aedb61893648aa310bc18abad854a8b5e0687023bb9de6eae27b93aded4
|
|
7
|
+
data.tar.gz: 3ddd88b8c5d5a0b0725876e3758caffa20f6fcbf1b69a55b1967e1984f67f0d117501f84cb7cc5759e038d968df069c1d4dddff1c86a8b31afe427d3ab6c73ff
|
data/lib/dwc_agent/constants.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
require "set"
|
|
2
|
+
|
|
1
3
|
module DwcAgent
|
|
2
4
|
STRIP_OUT = %r{
|
|
3
5
|
(?i:acc\s?\#)|
|
|
@@ -138,7 +140,7 @@ module DwcAgent
|
|
|
138
140
|
\s+de\s*$|
|
|
139
141
|
\.{2,}$|
|
|
140
142
|
[^[:alnum:][:blank:][:punct:][∣´|ǀ∣|│`~$^+|<>]] # Removes emojis from string
|
|
141
|
-
}x
|
|
143
|
+
}x.freeze
|
|
142
144
|
|
|
143
145
|
SPLIT_BY = %r{
|
|
144
146
|
[;,]{2,} | # Multiple semicolons or commas
|
|
@@ -170,14 +172,14 @@ module DwcAgent
|
|
|
170
172
|
\b(?i:then(\s+by)?)\s+ | # "then (by)"
|
|
171
173
|
\b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b | # "verif."
|
|
172
174
|
\b(?i:via|from)\s*\b # "via", "from"
|
|
173
|
-
}x
|
|
175
|
+
}x.freeze
|
|
174
176
|
|
|
175
177
|
POST_STRIP_TIDY = %r{
|
|
176
178
|
^\s*[&,;.]\s* | # Leading whitespace followed by any combination of &, ;, or .
|
|
177
179
|
[\[\]] | # Any standalone square brackets
|
|
178
180
|
^[`'".,!?]+ | # Leading repeated punctuation (` ' " . , ! ?)
|
|
179
181
|
[`'",]+$ # Trailing repeated punctuation (` ' ")
|
|
180
|
-
}x
|
|
182
|
+
}x.freeze
|
|
181
183
|
|
|
182
184
|
CHAR_SUBS = {
|
|
183
185
|
'"' => '\'',
|
|
@@ -219,7 +221,7 @@ module DwcAgent
|
|
|
219
221
|
'prof.' => 'Prof. ',
|
|
220
222
|
' .;' => '. ;',
|
|
221
223
|
', &' => ' &'
|
|
222
|
-
}
|
|
224
|
+
}.freeze
|
|
223
225
|
|
|
224
226
|
COMPLEX_SEPARATORS = {
|
|
225
227
|
"^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]+)$" => "\\2 \\3 \\1",
|
|
@@ -228,7 +230,8 @@ module DwcAgent
|
|
|
228
230
|
"^(\\S{4,},\\s+(?:\\S\\.\\s*)+)\\s+(\\S{4,},\\s+(?:\\S\.\\s*)+)$" => "\\1 | \\2",
|
|
229
231
|
"(\\S{1}\\.)([[:alpha:]]{2,})" => "\\1 \\2",
|
|
230
232
|
"^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?)+)$" => "\\1, \\2",
|
|
231
|
-
"([[:alpha:]]*),?\\s*(.*)\\s+(van|von|v\\.|v(a|o)n\\s+der?)$" => "\\3 \\1, \\2",
|
|
233
|
+
"^([[:alpha:]]*),?\\s*(.*)\\s+(van|von|v\\.|v(a|o)n\\s+der?)(?i:and|&|et|e|,|;)\s*([[:alpha:]]*),?\\s*(.*)\\s+(van|von|v\\.|v(a|o)n\\s+der?)$" => "\\3 \\1, \\2 | \\7 \\5, \\6",
|
|
234
|
+
"^([[:alpha:]]*),?\\s*(.*)\\s+(van|von|v\\.|v(a|o)n\\s+der?)$" => "\\3 \\1, \\2",
|
|
232
235
|
"^((?i:[A-Z]\\.\\s?)+)\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})\\s+([[:alpha:]’`'-]{2,})$" => "\\1 \\4 | \\2 \\3 \\4",
|
|
233
236
|
"^((?i:[A-Z]\\.\\s?)+)\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
|
|
234
237
|
"^([A-Z]{1,3})\\s+(?:and|&|et|e)\\s+([A-Z]{1,3})\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
|
|
@@ -236,7 +239,7 @@ module DwcAgent
|
|
|
236
239
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{2,})$" => "\\1 | \\2 | \\3",
|
|
237
240
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{3,})$" => "\\1 | \\2 | \\3 | \\4",
|
|
238
241
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{3,})$" => "\\1 | \\2 | \\3 | \\4 | \\5"
|
|
239
|
-
}
|
|
242
|
+
}.freeze
|
|
240
243
|
|
|
241
244
|
BLACKLIST = %r{
|
|
242
245
|
(?i:
|
|
@@ -311,9 +314,9 @@ module DwcAgent
|
|
|
311
314
|
workshop | garden | farm | jardin | public |
|
|
312
315
|
^de$
|
|
313
316
|
)
|
|
314
|
-
}x
|
|
317
|
+
}x.freeze
|
|
315
318
|
|
|
316
|
-
FAMILY_GREENLIST = [
|
|
319
|
+
FAMILY_GREENLIST = Set.new([
|
|
317
320
|
"Ng",
|
|
318
321
|
"Srb",
|
|
319
322
|
"Srp",
|
|
@@ -323,9 +326,9 @@ module DwcAgent
|
|
|
323
326
|
"Smrt",
|
|
324
327
|
"Krc",
|
|
325
328
|
"Krč"
|
|
326
|
-
]
|
|
329
|
+
]).freeze
|
|
327
330
|
|
|
328
|
-
FAMILY_BLACKLIST = [
|
|
331
|
+
FAMILY_BLACKLIST = Set.new([
|
|
329
332
|
"a b",
|
|
330
333
|
"a e",
|
|
331
334
|
"a g",
|
|
@@ -390,20 +393,20 @@ module DwcAgent
|
|
|
390
393
|
"zw",
|
|
391
394
|
"zz",
|
|
392
395
|
"z-"
|
|
393
|
-
]
|
|
396
|
+
]).freeze
|
|
394
397
|
|
|
395
|
-
GIVEN_BLACKLIST = [
|
|
398
|
+
GIVEN_BLACKLIST = Set.new([
|
|
396
399
|
"not any",
|
|
397
400
|
"has not"
|
|
398
|
-
]
|
|
401
|
+
]).freeze
|
|
399
402
|
|
|
400
|
-
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|cmdr|lt|sgt|cpl|pvt|proff?|dr|dra\.|drª|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|doct(eu|o)r|father|cantor|vicar|père|pastor|profa\.?|profª|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
|
403
|
+
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|cmdr|lt|sgt|cpl|pvt|proff?|dr|dra\.|drª|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|doct(eu|o)r|father|cantor|vicar|père|pastor|profa\.?|profª|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i.freeze
|
|
401
404
|
|
|
402
|
-
APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
|
405
|
+
APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i.freeze
|
|
403
406
|
|
|
404
|
-
SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|ESQ|esq|[IVX]{2,})(\.|\b)/
|
|
407
|
+
SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|ESQ|esq|[IVX]{2,})(\.|\b)/.freeze
|
|
405
408
|
|
|
406
|
-
PARTICLES = [
|
|
409
|
+
PARTICLES = Set.new([
|
|
407
410
|
"ap",
|
|
408
411
|
"da",
|
|
409
412
|
"de",
|
|
@@ -425,7 +428,7 @@ module DwcAgent
|
|
|
425
428
|
"van de",
|
|
426
429
|
"van der",
|
|
427
430
|
"von der"
|
|
428
|
-
]
|
|
431
|
+
]).freeze
|
|
429
432
|
|
|
430
433
|
VOWELS = "aeiouàáâäǎæãåāèéêëěẽēėęìíîïǐĩīıįòóôöǒœøõōùúûüǔũūűů"
|
|
431
434
|
|
data/lib/dwc_agent/parser.rb
CHANGED
|
@@ -13,7 +13,7 @@ module DwcAgent
|
|
|
13
13
|
subs_regex: Regexp.new(CHAR_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s),
|
|
14
14
|
complex_separators_regex: COMPLEX_SEPARATORS.map{|k,v| [Regexp.new(k), v] },
|
|
15
15
|
residual_terminators_regex: Regexp.new(SPLIT_BY.to_s + %r{\s*\z}.to_s)
|
|
16
|
-
}
|
|
16
|
+
}.freeze
|
|
17
17
|
|
|
18
18
|
class << self
|
|
19
19
|
attr_reader :defaults
|
|
@@ -35,7 +35,7 @@ module DwcAgent
|
|
|
35
35
|
# @param names [String] the name or names to be parsed
|
|
36
36
|
# @return [Array] the list of parsed names
|
|
37
37
|
def parse(name)
|
|
38
|
-
return [] if name.nil? || name
|
|
38
|
+
return [] if name.nil? || name.empty?
|
|
39
39
|
name.gsub!(options[:strip_out_regex], ' ')
|
|
40
40
|
name.gsub!(options[:tidy_remains_regex], '')
|
|
41
41
|
name.gsub!(options[:subs_regex], CHAR_SUBS)
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: dwc_agent
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.4.2.0
|
|
4
|
+
version: 3.4.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- David P. Shorthouse
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 2026-02-23 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: namae
|
|
@@ -80,6 +79,20 @@ dependencies:
|
|
|
80
79
|
- - "~>"
|
|
81
80
|
- !ruby/object:Gem::Version
|
|
82
81
|
version: '2'
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: benchmark-ips
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - "~>"
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '2'
|
|
89
|
+
type: :development
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - "~>"
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '2'
|
|
83
96
|
description: Parses the typically messy content in Darwin Core terms that contain
|
|
84
97
|
people names
|
|
85
98
|
email: davidpshorthouse@gmail.coms
|
|
@@ -102,7 +115,6 @@ homepage: https://github.com/bionomia/dwc_agent
|
|
|
102
115
|
licenses:
|
|
103
116
|
- MIT
|
|
104
117
|
metadata: {}
|
|
105
|
-
post_install_message:
|
|
106
118
|
rdoc_options:
|
|
107
119
|
- "--encoding"
|
|
108
120
|
- UTF-8
|
|
@@ -119,8 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
119
131
|
- !ruby/object:Gem::Version
|
|
120
132
|
version: '0'
|
|
121
133
|
requirements: []
|
|
122
|
-
rubygems_version: 3.
|
|
123
|
-
signing_key:
|
|
134
|
+
rubygems_version: 3.6.9
|
|
124
135
|
specification_version: 4
|
|
125
136
|
summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
|
|
126
137
|
test_files: []
|