dwc_agent 3.2.1.0 → 3.3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +21 -14
- data/lib/dwc_agent/constants.rb +115 -113
- data/lib/dwc_agent/parser.rb +27 -21
- data/lib/dwc_agent/version.rb +5 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39a9ead6ab8410e39f29e6bf5b841214042fbf47a336c0e4c5dfaa4b0d11b87d
|
4
|
+
data.tar.gz: 763e722a2f28c765660573b6a0f2895e5c4c25550e8c927f5a85fa6e533f3800
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e5e35467255e9e5dfc029a02757387099d3edc4ef385e4d73785c02ca74b0ffc8be7d87a60ec34f9fd22e80d10747bf357e4951b35d415bdc099b2ec24973178
|
7
|
+
data.tar.gz: 5726c9073b61195aa4e5a9f8d5861c044c4392e42b0c2ec114013cb3881362ff7b30927847b352152fb5659dcc8ca2aaf48c728a3bd6ab0059b1126f56659fd0
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -2,17 +2,25 @@ module DwcAgent
|
|
2
2
|
|
3
3
|
class Cleaner
|
4
4
|
|
5
|
+
@defaults = {
|
6
|
+
blacklist: BLACKLIST,
|
7
|
+
given_blacklist: GIVEN_BLACKLIST,
|
8
|
+
family_blacklist: FAMILY_BLACKLIST,
|
9
|
+
particles: PARTICLES
|
10
|
+
}
|
11
|
+
|
5
12
|
class << self
|
13
|
+
attr_reader :defaults
|
14
|
+
|
6
15
|
def instance
|
7
16
|
Thread.current[:dwc_agent_cleaner] ||= new
|
8
17
|
end
|
9
18
|
end
|
10
19
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
@
|
15
|
-
@particles = PARTICLES
|
20
|
+
attr_reader :options
|
21
|
+
|
22
|
+
def initialize(options = {})
|
23
|
+
@options = self.class.defaults.merge(options)
|
16
24
|
end
|
17
25
|
|
18
26
|
def default
|
@@ -35,7 +43,7 @@ module DwcAgent
|
|
35
43
|
end
|
36
44
|
|
37
45
|
if parsed_namae.given &&
|
38
|
-
|
46
|
+
options[:given_blacklist].any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
39
47
|
return
|
40
48
|
end
|
41
49
|
|
@@ -55,7 +63,7 @@ module DwcAgent
|
|
55
63
|
return default
|
56
64
|
end
|
57
65
|
|
58
|
-
if parsed_namae.display_order =~
|
66
|
+
if parsed_namae.display_order =~ options[:blacklist]
|
59
67
|
return default
|
60
68
|
end
|
61
69
|
|
@@ -113,14 +121,13 @@ module DwcAgent
|
|
113
121
|
end
|
114
122
|
|
115
123
|
if parsed_namae.family &&
|
116
|
-
|
124
|
+
options[:family_blacklist].any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
117
125
|
return default
|
118
126
|
end
|
119
127
|
|
120
128
|
if parsed_namae.family.nil? &&
|
121
|
-
!parsed_namae.given.nil?
|
122
|
-
|
123
|
-
parsed_namae.family = parsed_namae.given
|
129
|
+
!parsed_namae.given.nil?
|
130
|
+
parsed_namae.family = parsed_namae.given.delete_suffix(".")
|
124
131
|
parsed_namae.given = nil
|
125
132
|
end
|
126
133
|
|
@@ -140,7 +147,7 @@ module DwcAgent
|
|
140
147
|
if !family.nil? &&
|
141
148
|
given.nil? &&
|
142
149
|
!particle.nil? &&
|
143
|
-
|
150
|
+
!options[:particles].include?(particle.downcase)
|
144
151
|
given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
|
145
152
|
particle = nil
|
146
153
|
end
|
@@ -161,11 +168,11 @@ module DwcAgent
|
|
161
168
|
return default
|
162
169
|
end
|
163
170
|
|
164
|
-
if !family.nil? &&
|
171
|
+
if !family.nil? && options[:family_blacklist].any?{ |s| s.casecmp(family) == 0 }
|
165
172
|
return default
|
166
173
|
end
|
167
174
|
|
168
|
-
if !given.nil? &&
|
175
|
+
if !given.nil? && options[:given_blacklist].any?{ |s| s.casecmp(given) == 0 }
|
169
176
|
return default
|
170
177
|
end
|
171
178
|
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -10,7 +10,7 @@ module DwcAgent
|
|
10
10
|
[,]?\s*\#*\s+\d+\-(?i:[A-Z]|\d)+\-?\d*[A-Za-z]*\z|
|
11
11
|
\d*[A-Za-z]*\d*-\d*\z|
|
12
12
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
13
|
-
[,;\s]
|
13
|
+
[,;\s]+(?:et\.?\s+al|&\s+al)l?\.?|
|
14
14
|
\b[,;]?\s*(?i:etal)\.?|
|
15
15
|
\b[,;]?\s*(?i:et.al)\.?|
|
16
16
|
\b\s+(bis|ter)(\b|\z)|
|
@@ -32,7 +32,7 @@ module DwcAgent
|
|
32
32
|
^(?i:collection)\:?\s+|\s*(?i:collection)\s*$|
|
33
33
|
\b[,;]?\s*(?i:colls)\.(\b|\z)|
|
34
34
|
(?i:contactid)|
|
35
|
-
^(?i:dupl)[.,]
|
35
|
+
^(?i:dupl)[.,]+|
|
36
36
|
\b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
|
37
37
|
[,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
|
38
38
|
May|Jun|Jul|Aug|Sept?|
|
@@ -141,42 +141,42 @@ module DwcAgent
|
|
141
141
|
}x
|
142
142
|
|
143
143
|
SPLIT_BY = %r{
|
144
|
-
[;,]{2,}|
|
145
|
-
[–|ǀ∣|│&+\/;:]|
|
146
|
-
\s+-\s
|
147
|
-
\s+a\.\s
|
148
|
-
\b(con|e|y|i|en|et|or|per|for|und)\s*\b|
|
149
|
-
\b(?i:and|with)\s*\b|
|
150
|
-
\b(?i:annotated(\s+by)?)\s*\b|
|
151
|
-
\b(?i:coll\.)\s*\b|
|
152
|
-
\b(?i:comm\.?)\s*\b|
|
153
|
-
\b(?i:communicate?d(\s+to)?)\s*\b|
|
154
|
-
\b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
|
155
|
-
\b(?i:confirmada)(\s+por)?\s*\b|
|
156
|
-
\b(?i:checked?(\s+by)?)\s*\b|
|
157
|
-
\b(?i:det\.?(\s+by)?)\s*\b|
|
158
|
-
\b(?i:(donated)?\s*by)\s
|
159
|
-
\b(?i:dupl?[.,]?(\s+by)?|duplicate(\s+by)?)\s*\b|
|
160
|
-
\b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
|
161
|
-
\b(?i:in?dentified(\s+by)?)\s*\b|
|
162
|
-
\b(?i:in\s+coll\.?\s*\b)|
|
163
|
-
\b(?i:in\s+part(\s+by)?)\s*\b|
|
164
|
-
\b(?i:och)\s*\b|
|
165
|
-
\b(?i:prep\.?\s+(?i:by)?)\s*\b|
|
166
|
-
\b(?i:purchased?)(\s+by)?\s*\b|
|
167
|
-
\b(?i:redet\.?(\s+by?)?)\s*\b|
|
168
|
-
\b(?i:reidentified(\s+by)?)\s*\b|
|
169
|
-
\b(?i:stet)\s*\b|
|
170
|
-
\b(?i:then(\s+by)?)\s
|
171
|
-
\b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
|
172
|
-
\b(?i:via|from)\s*\b
|
144
|
+
[;,]{2,} | # Multiple semicolons or commas
|
145
|
+
[–|ǀ∣|│&+\/;:] | # Various separators
|
146
|
+
\s+-\s+ | # Dash surrounded by spaces
|
147
|
+
\s+a\.\s+ | # "a." surrounded by spaces
|
148
|
+
\b(con|e|y|i|en|et|or|per|for|und)\s*\b | # Short conjunctions or prepositions
|
149
|
+
\b(?i:and|with)\s*\b | # Case-insensitive "and", "with"
|
150
|
+
\b(?i:annotated(\s+by)?)\s*\b | # "annotated (by)"
|
151
|
+
\b(?i:coll\.)\s*\b | # "coll."
|
152
|
+
\b(?i:comm\.?)\s*\b | # "comm."
|
153
|
+
\b(?i:communicate?d(\s+to)?)\s*\b | # "communicated (to)"
|
154
|
+
\b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b | # "conf.", "confirmed (by)"
|
155
|
+
\b(?i:confirmada)(\s+por)?\s*\b | # "confirmada (por)"
|
156
|
+
\b(?i:checked?(\s+by)?)\s*\b | # "checked (by)"
|
157
|
+
\b(?i:det\.?(\s+by)?)\s*\b | # "det."
|
158
|
+
\b(?i:(donated)?\s*by)\s+ | # "donated by"
|
159
|
+
\b(?i:dupl?[.,]?(\s+by)?|duplicate(\s+by)?)\s*\b | # "dupl.", "duplicate"
|
160
|
+
\b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b | # "ex.", "examined (by)"
|
161
|
+
\b(?i:in?dentified(\s+by)?)\s*\b | # "identified (by)"
|
162
|
+
\b(?i:in\s+coll\.?\s*\b) | # "in coll."
|
163
|
+
\b(?i:in\s+part(\s+by)?)\s*\b | # "in part (by)"
|
164
|
+
\b(?i:och)\s*\b | # "och"
|
165
|
+
\b(?i:prep\.?\s+(?i:by)?)\s*\b | # "prep. by"
|
166
|
+
\b(?i:purchased?)(\s+by)?\s*\b | # "purchased (by)"
|
167
|
+
\b(?i:redet\.?(\s+by?)?)\s*\b | # "redet."
|
168
|
+
\b(?i:reidentified(\s+by)?)\s*\b | # "reidentified"
|
169
|
+
\b(?i:stet)\s*\b | # "stet"
|
170
|
+
\b(?i:then(\s+by)?)\s+ | # "then (by)"
|
171
|
+
\b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b | # "verif."
|
172
|
+
\b(?i:via|from)\s*\b # "via", "from"
|
173
173
|
}x
|
174
174
|
|
175
175
|
POST_STRIP_TIDY = %r{
|
176
|
-
^\s*[&,;.]\s
|
177
|
-
[\[\]]|
|
178
|
-
^[`'".,!?]
|
179
|
-
[`'",]
|
176
|
+
^\s*[&,;.]\s* | # Leading whitespace followed by any combination of &, ;, or .
|
177
|
+
[\[\]] | # Any standalone square brackets
|
178
|
+
^[`'".,!?]+ | # Leading repeated punctuation (` ' " . , ! ?)
|
179
|
+
[`'",]+$ # Trailing repeated punctuation (` ' ")
|
180
180
|
}x
|
181
181
|
|
182
182
|
CHAR_SUBS = {
|
@@ -225,93 +225,95 @@ module DwcAgent
|
|
225
225
|
}
|
226
226
|
|
227
227
|
SEPARATORS = {
|
228
|
-
"^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]
|
228
|
+
"^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]+)$" => "\\2 \\3 \\1",
|
229
229
|
"^(Mrs?\\.?)\\s+&\\s+(Mrs?\\.?)\\s+(.*)$" => "\\1 \\3 | \\2 \\3",
|
230
|
-
"^([A-Z]{1}\\.\\s*[[:alpha:]]
|
231
|
-
"^(\\S{4,},\\s+(?:\\S\\.\\s*)
|
230
|
+
"^([A-Z]{1}\\.\\s*[[:alpha:]]+),\\s*?([A-Z.]+)$" => "\\1 \\2",
|
231
|
+
"^(\\S{4,},\\s+(?:\\S\\.\\s*)+)\\s+(\\S{4,},\\s+(?:\\S\.\\s*)+)$" => "\\1 | \\2",
|
232
232
|
"(\\S{1}\\.)([[:alpha:]]{2,})" => "\\1 \\2",
|
233
|
-
"^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?)
|
233
|
+
"^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?)+)$" => "\\1, \\2",
|
234
234
|
"([[:alpha:]]*),?\\s*(.*)\\s+(van|von|v\\.|v(a|o)n\\s+der?)$" => "\\3 \\1, \\2",
|
235
|
-
"^((?i:[A-Z]\\.\\s?)
|
236
|
-
"^((?i:[A-Z]\\.\\s?)
|
235
|
+
"^((?i:[A-Z]\\.\\s?)+)\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})\\s+([[:alpha:]’`'-]{2,})$" => "\\1 \\4 | \\2 \\3 \\4",
|
236
|
+
"^((?i:[A-Z]\\.\\s?)+)\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
|
237
237
|
"^([A-Z]{1,3})\\s+(?:and|&|et|e)\\s+([A-Z]{1,3})\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
|
238
|
-
"^((?i:[A-Z]\\.\\s?)
|
238
|
+
"^((?i:[A-Z]\\.\\s?)+),\\s+([A-Z.\\s]+)\\s+(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\4 | \\2 \\4 | \\3 \\4 | \\5",
|
239
239
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{2,})$" => "\\1 | \\2 | \\3",
|
240
240
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{3,})$" => "\\1 | \\2 | \\3 | \\4",
|
241
241
|
"^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{3,})$" => "\\1 | \\2 | \\3 | \\4 | \\5"
|
242
242
|
}
|
243
243
|
|
244
244
|
BLACKLIST = %r{
|
245
|
-
(?i:
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
245
|
+
(?i:
|
246
|
+
abundant |
|
247
|
+
adult | juvenile |
|
248
|
+
administra(?:d|t)or |
|
249
|
+
^anon$ |
|
250
|
+
australian? |
|
251
|
+
average |
|
252
|
+
believe | unclear | ill?egible | suggested | (dis)?agrees? | approach |
|
253
|
+
\bnone\b |
|
254
|
+
barcod |
|
255
|
+
bgwd |
|
256
|
+
(biolog|botan|zoo|ecolog|mycol|(?:in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture) |
|
257
|
+
(bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america) |
|
258
|
+
carex | salix |
|
259
|
+
catalog(?:ue)? |
|
260
|
+
conservator |
|
261
|
+
(herbarium|herbier|collection|collected|publication|specimen|species|describe|an(?:a|o)morph|isolated|recorded|inspection|define|status|lighthouse) |
|
262
|
+
\bhelp\b |
|
263
|
+
data\s+not\s+captured |
|
264
|
+
(description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect) |
|
265
|
+
desconocido |
|
266
|
+
exc(?:s?icc?at(?:a|i)) |
|
267
|
+
evidence |
|
268
|
+
exporter |
|
269
|
+
foundation |
|
270
|
+
ichthyology |
|
271
|
+
inconn?u |
|
272
|
+
(internation|gou?vern|ministry|extension|unit|district|provincial|na(?:c|t)ional|military|region|environ|natur(?:e|al)|naturelles|division|program|direction) |
|
273
|
+
label |
|
274
|
+
o\.?m\.?n\.?r\.? |
|
275
|
+
measurement |
|
276
|
+
ent(?:o|y)mology |
|
277
|
+
malacology |
|
278
|
+
geographic |
|
279
|
+
(mus(?:eum|ée)|universit(?:y|é|e|at)|college|institute?|acad(?:e|é)m|school|écol(?:e|iers?)|laboratoi?r|project|polytech|dep(?:t|artment)|research|clinic|hospital|cientifica|sanctuary|safari) |
|
280
|
+
univ\. |
|
281
|
+
\b(graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fisherm(?:a|e)n|police|taxonomist|consultant|participant(?:es)?|team|(?:é|e)quipe|memb(?:er|re)|crew|group|personnel|staff|family|captain|friends|assistant|worker|gamekeeper)\b |
|
282
|
+
non\s+pr(?:é|e)cis(?:é|e) |
|
283
|
+
no\s+consta |
|
284
|
+
no\s+(agent\s+)?(?:data|disponible)(?:\s+available)? |
|
285
|
+
not?\s+(entered|stated) |
|
286
|
+
nomenclatur(?:e|al)\s+adjustment |
|
287
|
+
not\s+available |
|
288
|
+
(ontario|qu(?:e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?) |
|
289
|
+
popa\s+observers? |
|
290
|
+
recreation | culture |
|
291
|
+
renseigné |
|
292
|
+
(shaped|dark|pale|areas|phase|spotting|interior|between|closer) |
|
293
|
+
soci(?:e|é)t(?:y|é) | cent(?:er|re) | community | history | conservation | conference | assoc | commission | consortium | council | club | exposit | alliance | protective | circle |
|
294
|
+
^class\b |
|
295
|
+
commercial | control | product |
|
296
|
+
^company\b |
|
297
|
+
sequence\s+data |
|
298
|
+
size | large | colou?r |
|
299
|
+
skeleton |
|
300
|
+
survey | assessment | station | monitor | stn\. | project | engine | (e|é)x?chang(?:e|é)s? | ex(?:c|k)urs(?:e|o|ó)n? | exped\.? | exp(?:e|i)di(?:c|t)i(?:e|o|ó)n? | experiment | explora(?:d|t) | festival | generation | inventory | marine | service |
|
301
|
+
^index\b |
|
302
|
+
submersible |
|
303
|
+
synonymy? |
|
304
|
+
systematic | perspective |
|
305
|
+
^(?:off|too|the)\b |
|
306
|
+
taxiderm(?:ies|y) |
|
307
|
+
though |
|
308
|
+
texas\s+instruments?(?:\s+for)? |
|
309
|
+
tropical |
|
310
|
+
toward | seen\s+at |
|
311
|
+
unidentified | unspecified | unk?nown? | unnamed | unread | unmistak | no agent |
|
312
|
+
urn: |
|
313
|
+
usda | ucla |
|
314
|
+
workshop | garden | farm | jardin | public |
|
315
|
+
^de$
|
316
|
+
)
|
315
317
|
}x
|
316
318
|
|
317
319
|
FAMILY_GREENLIST = [
|
@@ -402,7 +404,7 @@ module DwcAgent
|
|
402
404
|
|
403
405
|
APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
404
406
|
|
405
|
-
SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/
|
407
|
+
SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|ESQ|esq|[IVX]{2,})(\.|\b)/
|
406
408
|
|
407
409
|
PARTICLES = [
|
408
410
|
"ap",
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -2,27 +2,33 @@ module DwcAgent
|
|
2
2
|
|
3
3
|
class Parser
|
4
4
|
|
5
|
+
@defaults = {
|
6
|
+
prefer_comma_as_separator: true,
|
7
|
+
separator: SPLIT_BY,
|
8
|
+
title: TITLE,
|
9
|
+
appellation: APPELLATION,
|
10
|
+
suffix: SUFFIX,
|
11
|
+
strip_out_regex: Regexp.new(STRIP_OUT.to_s),
|
12
|
+
tidy_remains_regex: Regexp.new(POST_STRIP_TIDY.to_s),
|
13
|
+
char_subs_regex: Regexp.new([CHAR_SUBS.keys.join].to_s),
|
14
|
+
phrase_subs_regex: Regexp.new(PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s),
|
15
|
+
residual_terminators_regex: Regexp.new(SPLIT_BY.to_s + %r{\s*\z}.to_s),
|
16
|
+
separators: SEPARATORS.map{|k,v| [ Regexp.new(k), v] }
|
17
|
+
}
|
18
|
+
|
5
19
|
class << self
|
20
|
+
attr_reader :defaults
|
21
|
+
|
6
22
|
def instance
|
7
23
|
Thread.current[:dwc_agent_parser] ||= new
|
8
24
|
end
|
9
25
|
end
|
10
26
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
appellation: APPELLATION,
|
17
|
-
suffix: SUFFIX
|
18
|
-
}
|
19
|
-
@namae = Namae::Parser.new(options)
|
20
|
-
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
21
|
-
@tidy_remains_regex = Regexp.new POST_STRIP_TIDY.to_s
|
22
|
-
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
23
|
-
@phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
|
24
|
-
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
25
|
-
@separators = SEPARATORS.map{|k,v| [ Regexp.new(k), v] }
|
27
|
+
attr_reader :options, :namae
|
28
|
+
|
29
|
+
def initialize(options = {})
|
30
|
+
@options = self.class.defaults.merge(options)
|
31
|
+
@namae = Namae::Parser.new(@options)
|
26
32
|
end
|
27
33
|
|
28
34
|
# Parses the passed-in string and returns a list of names.
|
@@ -31,14 +37,14 @@ module DwcAgent
|
|
31
37
|
# @return [Array] the list of parsed names
|
32
38
|
def parse(name)
|
33
39
|
return [] if name.nil? || name == ""
|
34
|
-
name.gsub!(
|
35
|
-
name.gsub!(
|
36
|
-
name.gsub!(Regexp.union(
|
37
|
-
|
38
|
-
name.gsub!(
|
40
|
+
name.gsub!(options[:strip_out_regex], ' ')
|
41
|
+
name.gsub!(options[:tidy_remains_regex], '')
|
42
|
+
name.gsub!(Regexp.union(options[:char_subs_regex], options[:phrase_subs_regex]), CHAR_SUBS.merge(PHRASE_SUBS))
|
43
|
+
options[:separators].each{|k| name.gsub!(k[0], k[1])}
|
44
|
+
name.gsub!(options[:residual_terminators_regex], '')
|
39
45
|
name.squeeze!(' ')
|
40
46
|
name.strip!
|
41
|
-
|
47
|
+
namae.parse(name)
|
42
48
|
end
|
43
49
|
|
44
50
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
@@ -3,7 +3,7 @@ module DwcAgent
|
|
3
3
|
class Version
|
4
4
|
|
5
5
|
MAJOR = 3
|
6
|
-
MINOR =
|
6
|
+
MINOR = 3
|
7
7
|
PATCH = 1
|
8
8
|
BUILD = 0
|
9
9
|
|
@@ -11,6 +11,10 @@ module DwcAgent
|
|
11
11
|
[MAJOR, MINOR, PATCH, BUILD].compact.join('.').freeze
|
12
12
|
end
|
13
13
|
|
14
|
+
def self.date
|
15
|
+
'2025-06-18'
|
16
|
+
end
|
17
|
+
|
14
18
|
end
|
15
19
|
|
16
20
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-06-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|