dwc_agent 3.2.0.0 → 3.3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +21 -11
- data/lib/dwc_agent/constants.rb +2 -2
- data/lib/dwc_agent/parser.rb +27 -21
- data/lib/dwc_agent/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 40a0a33de602297e5e87e3059edbd0c88e0ad36bbb90f0803362698120f9e3b1
|
4
|
+
data.tar.gz: 94705d02d7cb7a3ac1647f903d15db7140ece849008ba84f6d49f1bf678abf14
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1baeb5c1367e570139fca30a1181395030b12d09b1d4f12a026ad48e28ebaecee70b20d81b96b02f2600da269f2342281c8929facc70f0b9c571cafcfb273501
|
7
|
+
data.tar.gz: bf7676d6ed221258ab6efe217a7b695cb619c54be8a8d14f16bb23a3f1bc9e0fa0a33474211e7f20e5a08b058e2ab4b57e4c5e1367be74d9719ad2502a035d77
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -2,17 +2,25 @@ module DwcAgent
|
|
2
2
|
|
3
3
|
class Cleaner
|
4
4
|
|
5
|
+
@defaults = {
|
6
|
+
blacklist: BLACKLIST,
|
7
|
+
given_blacklist: GIVEN_BLACKLIST,
|
8
|
+
family_blacklist: FAMILY_BLACKLIST,
|
9
|
+
particles: PARTICLES
|
10
|
+
}
|
11
|
+
|
5
12
|
class << self
|
13
|
+
attr_reader :defaults
|
14
|
+
|
6
15
|
def instance
|
7
16
|
Thread.current[:dwc_agent_cleaner] ||= new
|
8
17
|
end
|
9
18
|
end
|
10
19
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
@
|
15
|
-
@particles = PARTICLES
|
20
|
+
attr_reader :options
|
21
|
+
|
22
|
+
def initialize(options = {})
|
23
|
+
@options = self.class.defaults.merge(options)
|
16
24
|
end
|
17
25
|
|
18
26
|
def default
|
@@ -26,6 +34,8 @@ module DwcAgent
|
|
26
34
|
# @return Namae::Name [Object] a new Namae object
|
27
35
|
def clean(parsed_namae)
|
28
36
|
|
37
|
+
return default if !parsed_namae.instance_of?(Namae::Name)
|
38
|
+
|
29
39
|
if parsed_namae.family &&
|
30
40
|
parsed_namae.family == NameCase(parsed_namae.family) &&
|
31
41
|
parsed_namae.display_order.split.join == parsed_namae.initials
|
@@ -33,7 +43,7 @@ module DwcAgent
|
|
33
43
|
end
|
34
44
|
|
35
45
|
if parsed_namae.given &&
|
36
|
-
|
46
|
+
options[:given_blacklist].any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
37
47
|
return
|
38
48
|
end
|
39
49
|
|
@@ -53,7 +63,7 @@ module DwcAgent
|
|
53
63
|
return default
|
54
64
|
end
|
55
65
|
|
56
|
-
if parsed_namae.display_order =~
|
66
|
+
if parsed_namae.display_order =~ options[:blacklist]
|
57
67
|
return default
|
58
68
|
end
|
59
69
|
|
@@ -111,7 +121,7 @@ module DwcAgent
|
|
111
121
|
end
|
112
122
|
|
113
123
|
if parsed_namae.family &&
|
114
|
-
|
124
|
+
options[:family_blacklist].any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
115
125
|
return default
|
116
126
|
end
|
117
127
|
|
@@ -138,7 +148,7 @@ module DwcAgent
|
|
138
148
|
if !family.nil? &&
|
139
149
|
given.nil? &&
|
140
150
|
!particle.nil? &&
|
141
|
-
|
151
|
+
!options[:particles].include?(particle.downcase)
|
142
152
|
given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
|
143
153
|
particle = nil
|
144
154
|
end
|
@@ -159,11 +169,11 @@ module DwcAgent
|
|
159
169
|
return default
|
160
170
|
end
|
161
171
|
|
162
|
-
if !family.nil? &&
|
172
|
+
if !family.nil? && options[:family_blacklist].any?{ |s| s.casecmp(family) == 0 }
|
163
173
|
return default
|
164
174
|
end
|
165
175
|
|
166
|
-
if !given.nil? &&
|
176
|
+
if !given.nil? && options[:given_blacklist].any?{ |s| s.casecmp(given) == 0 }
|
167
177
|
return default
|
168
178
|
end
|
169
179
|
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -10,7 +10,7 @@ module DwcAgent
|
|
10
10
|
[,]?\s*\#*\s+\d+\-(?i:[A-Z]|\d)+\-?\d*[A-Za-z]*\z|
|
11
11
|
\d*[A-Za-z]*\d*-\d*\z|
|
12
12
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
13
|
-
|
13
|
+
[,;\s]{1,}(?:et\.?\s+al|&\s+al)l?\.?|
|
14
14
|
\b[,;]?\s*(?i:etal)\.?|
|
15
15
|
\b[,;]?\s*(?i:et.al)\.?|
|
16
16
|
\b\s+(bis|ter)(\b|\z)|
|
@@ -113,7 +113,7 @@ module DwcAgent
|
|
113
113
|
(?i:not?)\s+(?i:name|date|details?|specific)?\s*?(?i:given|name|date|noted)|
|
114
114
|
(?i:non?)\s+(?i:specificato)|
|
115
115
|
\b[,;]\s+\d+\.?\z|
|
116
|
-
[!@?]
|
116
|
+
[!@?]\s*\-?\s*|
|
117
117
|
\d{1,4}[\/.]?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii)[\/.]\d{1,4}|
|
118
118
|
[,]?\d+|
|
119
119
|
[,;]\z|
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -2,27 +2,33 @@ module DwcAgent
|
|
2
2
|
|
3
3
|
class Parser
|
4
4
|
|
5
|
+
@defaults = {
|
6
|
+
prefer_comma_as_separator: true,
|
7
|
+
separator: SPLIT_BY,
|
8
|
+
title: TITLE,
|
9
|
+
appellation: APPELLATION,
|
10
|
+
suffix: SUFFIX,
|
11
|
+
strip_out_regex: Regexp.new(STRIP_OUT.to_s),
|
12
|
+
tidy_remains_regex: Regexp.new(POST_STRIP_TIDY.to_s),
|
13
|
+
char_subs_regex: Regexp.new([CHAR_SUBS.keys.join].to_s),
|
14
|
+
phrase_subs_regex: Regexp.new(PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s),
|
15
|
+
residual_terminators_regex: Regexp.new(SPLIT_BY.to_s + %r{\s*\z}.to_s),
|
16
|
+
separators: SEPARATORS.map{|k,v| [ Regexp.new(k), v] }
|
17
|
+
}
|
18
|
+
|
5
19
|
class << self
|
20
|
+
attr_reader :defaults
|
21
|
+
|
6
22
|
def instance
|
7
23
|
Thread.current[:dwc_agent_parser] ||= new
|
8
24
|
end
|
9
25
|
end
|
10
26
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
appellation: APPELLATION,
|
17
|
-
suffix: SUFFIX
|
18
|
-
}
|
19
|
-
@namae = Namae::Parser.new(options)
|
20
|
-
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
21
|
-
@tidy_remains_regex = Regexp.new POST_STRIP_TIDY.to_s
|
22
|
-
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
23
|
-
@phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
|
24
|
-
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
25
|
-
@separators = SEPARATORS.map{|k,v| [ Regexp.new(k), v] }
|
27
|
+
attr_reader :options, :namae
|
28
|
+
|
29
|
+
def initialize(options = {})
|
30
|
+
@options = self.class.defaults.merge(options)
|
31
|
+
@namae = Namae::Parser.new(@options)
|
26
32
|
end
|
27
33
|
|
28
34
|
# Parses the passed-in string and returns a list of names.
|
@@ -31,14 +37,14 @@ module DwcAgent
|
|
31
37
|
# @return [Array] the list of parsed names
|
32
38
|
def parse(name)
|
33
39
|
return [] if name.nil? || name == ""
|
34
|
-
name.gsub!(
|
35
|
-
name.gsub!(
|
36
|
-
name.gsub!(Regexp.union(
|
37
|
-
|
38
|
-
name.gsub!(
|
40
|
+
name.gsub!(options[:strip_out_regex], ' ')
|
41
|
+
name.gsub!(options[:tidy_remains_regex], '')
|
42
|
+
name.gsub!(Regexp.union(options[:char_subs_regex], options[:phrase_subs_regex]), CHAR_SUBS.merge(PHRASE_SUBS))
|
43
|
+
options[:separators].each{|k| name.gsub!(k[0], k[1])}
|
44
|
+
name.gsub!(options[:residual_terminators_regex], '')
|
39
45
|
name.squeeze!(' ')
|
40
46
|
name.strip!
|
41
|
-
|
47
|
+
namae.parse(name)
|
42
48
|
end
|
43
49
|
|
44
50
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-11-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|