dwc_agent 3.2.1.0 → 3.3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cec3414f1b65b944a67995a6a306282ceced45af8d58a63c42ecfd469d74b135
4
- data.tar.gz: 26644458e9c264b4f07c270c86bdcc89ba5654495662f08974bc551688fb3cab
3
+ metadata.gz: 39a9ead6ab8410e39f29e6bf5b841214042fbf47a336c0e4c5dfaa4b0d11b87d
4
+ data.tar.gz: 763e722a2f28c765660573b6a0f2895e5c4c25550e8c927f5a85fa6e533f3800
5
5
  SHA512:
6
- metadata.gz: 4915c877d81d0f19c4b32fb05bbf6fd708e7ebd69f27f3ae119f7896668b946612228a155879b64713af3bdd8107fbf1760e3182e99000038dfa6b1530dadd2b
7
- data.tar.gz: 6c85892b18270743222d2db185e34e531c025e12d6f15915352fe3e6e2229d4b814a127036b24cce5f89a739d07353754d74fa132c07173225060ed80e06bb41
6
+ metadata.gz: e5e35467255e9e5dfc029a02757387099d3edc4ef385e4d73785c02ca74b0ffc8be7d87a60ec34f9fd22e80d10747bf357e4951b35d415bdc099b2ec24973178
7
+ data.tar.gz: 5726c9073b61195aa4e5a9f8d5861c044c4392e42b0c2ec114013cb3881362ff7b30927847b352152fb5659dcc8ca2aaf48c728a3bd6ab0059b1126f56659fd0
@@ -2,17 +2,25 @@ module DwcAgent
2
2
 
3
3
  class Cleaner
4
4
 
5
+ @defaults = {
6
+ blacklist: BLACKLIST,
7
+ given_blacklist: GIVEN_BLACKLIST,
8
+ family_blacklist: FAMILY_BLACKLIST,
9
+ particles: PARTICLES
10
+ }
11
+
5
12
  class << self
13
+ attr_reader :defaults
14
+
6
15
  def instance
7
16
  Thread.current[:dwc_agent_cleaner] ||= new
8
17
  end
9
18
  end
10
19
 
11
- def initialize
12
- @blacklist = BLACKLIST
13
- @given_blacklist = GIVEN_BLACKLIST
14
- @family_blacklist = FAMILY_BLACKLIST
15
- @particles = PARTICLES
20
+ attr_reader :options
21
+
22
+ def initialize(options = {})
23
+ @options = self.class.defaults.merge(options)
16
24
  end
17
25
 
18
26
  def default
@@ -35,7 +43,7 @@ module DwcAgent
35
43
  end
36
44
 
37
45
  if parsed_namae.given &&
38
- @given_blacklist.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
46
+ options[:given_blacklist].any?{ |s| s.casecmp(parsed_namae.given) == 0 }
39
47
  return
40
48
  end
41
49
 
@@ -55,7 +63,7 @@ module DwcAgent
55
63
  return default
56
64
  end
57
65
 
58
- if parsed_namae.display_order =~ @blacklist
66
+ if parsed_namae.display_order =~ options[:blacklist]
59
67
  return default
60
68
  end
61
69
 
@@ -113,14 +121,13 @@ module DwcAgent
113
121
  end
114
122
 
115
123
  if parsed_namae.family &&
116
- @family_blacklist.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
124
+ options[:family_blacklist].any?{ |s| s.casecmp(parsed_namae.family) == 0 }
117
125
  return default
118
126
  end
119
127
 
120
128
  if parsed_namae.family.nil? &&
121
- !parsed_namae.given.nil? &&
122
- !parsed_namae.given.include?(".")
123
- parsed_namae.family = parsed_namae.given
129
+ !parsed_namae.given.nil?
130
+ parsed_namae.family = parsed_namae.given.delete_suffix(".")
124
131
  parsed_namae.given = nil
125
132
  end
126
133
 
@@ -140,7 +147,7 @@ module DwcAgent
140
147
  if !family.nil? &&
141
148
  given.nil? &&
142
149
  !particle.nil? &&
143
- !@particles.include?(particle.downcase)
150
+ !options[:particles].include?(particle.downcase)
144
151
  given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
145
152
  particle = nil
146
153
  end
@@ -161,11 +168,11 @@ module DwcAgent
161
168
  return default
162
169
  end
163
170
 
164
- if !family.nil? && @family_blacklist.any?{ |s| s.casecmp(family) == 0 }
171
+ if !family.nil? && options[:family_blacklist].any?{ |s| s.casecmp(family) == 0 }
165
172
  return default
166
173
  end
167
174
 
168
- if !given.nil? && @given_blacklist.any?{ |s| s.casecmp(given) == 0 }
175
+ if !given.nil? && options[:given_blacklist].any?{ |s| s.casecmp(given) == 0 }
169
176
  return default
170
177
  end
171
178
 
@@ -10,7 +10,7 @@ module DwcAgent
10
10
  [,]?\s*\#*\s+\d+\-(?i:[A-Z]|\d)+\-?\d*[A-Za-z]*\z|
11
11
  \d*[A-Za-z]*\d*-\d*\z|
12
12
  \b\d+\(?(?i:[[:alpha:]])\)?\b|
13
- [,;\s]{1,}(?:et\.?\s+al|&\s+al)l?\.?|
13
+ [,;\s]+(?:et\.?\s+al|&\s+al)l?\.?|
14
14
  \b[,;]?\s*(?i:etal)\.?|
15
15
  \b[,;]?\s*(?i:et.al)\.?|
16
16
  \b\s+(bis|ter)(\b|\z)|
@@ -32,7 +32,7 @@ module DwcAgent
32
32
  ^(?i:collection)\:?\s+|\s*(?i:collection)\s*$|
33
33
  \b[,;]?\s*(?i:colls)\.(\b|\z)|
34
34
  (?i:contactid)|
35
- ^(?i:dupl)[.,]{1,}|
35
+ ^(?i:dupl)[.,]+|
36
36
  \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
37
37
  [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
38
38
  May|Jun|Jul|Aug|Sept?|
@@ -141,42 +141,42 @@ module DwcAgent
141
141
  }x
142
142
 
143
143
  SPLIT_BY = %r{
144
- [;,]{2,}|
145
- [–|ǀ∣|│&+\/;:]|
146
- \s+-\s+|
147
- \s+a\.\s+|
148
- \b(con|e|y|i|en|et|or|per|for|und)\s*\b|
149
- \b(?i:and|with)\s*\b|
150
- \b(?i:annotated(\s+by)?)\s*\b|
151
- \b(?i:coll\.)\s*\b|
152
- \b(?i:comm\.?)\s*\b|
153
- \b(?i:communicate?d(\s+to)?)\s*\b|
154
- \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
155
- \b(?i:confirmada)(\s+por)?\s*\b|
156
- \b(?i:checked?(\s+by)?)\s*\b|
157
- \b(?i:det\.?(\s+by)?)\s*\b|
158
- \b(?i:(donated)?\s*by)\s+|
159
- \b(?i:dupl?[.,]?(\s+by)?|duplicate(\s+by)?)\s*\b|
160
- \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
161
- \b(?i:in?dentified(\s+by)?)\s*\b|
162
- \b(?i:in\s+coll\.?\s*\b)|
163
- \b(?i:in\s+part(\s+by)?)\s*\b|
164
- \b(?i:och)\s*\b|
165
- \b(?i:prep\.?\s+(?i:by)?)\s*\b|
166
- \b(?i:purchased?)(\s+by)?\s*\b|
167
- \b(?i:redet\.?(\s+by?)?)\s*\b|
168
- \b(?i:reidentified(\s+by)?)\s*\b|
169
- \b(?i:stet)\s*\b|
170
- \b(?i:then(\s+by)?)\s+|
171
- \b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b|
172
- \b(?i:via|from)\s*\b
144
+ [;,]{2,} | # Multiple semicolons or commas
145
+ [–|ǀ∣|│&+\/;:] | # Various separators
146
+ \s+-\s+ | # Dash surrounded by spaces
147
+ \s+a\.\s+ | # "a." surrounded by spaces
148
+ \b(con|e|y|i|en|et|or|per|for|und)\s*\b | # Short conjunctions or prepositions
149
+ \b(?i:and|with)\s*\b | # Case-insensitive "and", "with"
150
+ \b(?i:annotated(\s+by)?)\s*\b | # "annotated (by)"
151
+ \b(?i:coll\.)\s*\b | # "coll."
152
+ \b(?i:comm\.?)\s*\b | # "comm."
153
+ \b(?i:communicate?d(\s+to)?)\s*\b | # "communicated (to)"
154
+ \b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b | # "conf.", "confirmed (by)"
155
+ \b(?i:confirmada)(\s+por)?\s*\b | # "confirmada (por)"
156
+ \b(?i:checked?(\s+by)?)\s*\b | # "checked (by)"
157
+ \b(?i:det\.?(\s+by)?)\s*\b | # "det."
158
+ \b(?i:(donated)?\s*by)\s+ | # "donated by"
159
+ \b(?i:dupl?[.,]?(\s+by)?|duplicate(\s+by)?)\s*\b | # "dupl.", "duplicate"
160
+ \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b | # "ex.", "examined (by)"
161
+ \b(?i:in?dentified(\s+by)?)\s*\b | # "identified (by)"
162
+ \b(?i:in\s+coll\.?\s*\b) | # "in coll."
163
+ \b(?i:in\s+part(\s+by)?)\s*\b | # "in part (by)"
164
+ \b(?i:och)\s*\b | # "och"
165
+ \b(?i:prep\.?\s+(?i:by)?)\s*\b | # "prep. by"
166
+ \b(?i:purchased?)(\s+by)?\s*\b | # "purchased (by)"
167
+ \b(?i:redet\.?(\s+by?)?)\s*\b | # "redet."
168
+ \b(?i:reidentified(\s+by)?)\s*\b | # "reidentified"
169
+ \b(?i:stet)\s*\b | # "stet"
170
+ \b(?i:then(\s+by)?)\s+ | # "then (by)"
171
+ \b(?i:veri?f?\.?\:?(\s+by)?|v(e|é)rifi(e|é)d?(\s+by)?)\s*\b | # "verif."
172
+ \b(?i:via|from)\s*\b # "via", "from"
173
173
  }x
174
174
 
175
175
  POST_STRIP_TIDY = %r{
176
- ^\s*[&,;.]\s*|
177
- [\[\]]|
178
- ^[`'".,!?]{1,}|
179
- [`'",]{1,}$
176
+ ^\s*[&,;.]\s* | # Optional leading whitespace, then a single & , ; or . and any whitespace after it
177
+ [\[\]] | # Any standalone square brackets
178
+ ^[`'".,!?]+ | # Leading repeated punctuation (` ' " . , ! ?)
179
+ [`'",]+$ # Trailing repeated punctuation (` ' " ,)
180
180
  }x
181
181
 
182
182
  CHAR_SUBS = {
@@ -225,93 +225,95 @@ module DwcAgent
225
225
  }
226
226
 
227
227
  SEPARATORS = {
228
- "^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]{1,})$" => "\\2 \\3 \\1",
228
+ "^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]+)$" => "\\2 \\3 \\1",
229
229
  "^(Mrs?\\.?)\\s+&\\s+(Mrs?\\.?)\\s+(.*)$" => "\\1 \\3 | \\2 \\3",
230
- "^([A-Z]{1}\\.\\s*[[:alpha:]]{1,}),\\s*?([A-Z.]{1,})$" => "\\1 \\2",
231
- "^(\\S{4,},\\s+(?:\\S\\.\\s*){1,})\\s+(\\S{4,},\\s+(?:\\S\.\\s*){1,})$" => "\\1 | \\2",
230
+ "^([A-Z]{1}\\.\\s*[[:alpha:]]+),\\s*?([A-Z.]+)$" => "\\1 \\2",
231
+ "^(\\S{4,},\\s+(?:\\S\\.\\s*)+)\\s+(\\S{4,},\\s+(?:\\S\.\\s*)+)$" => "\\1 | \\2",
232
232
  "(\\S{1}\\.)([[:alpha:]]{2,})" => "\\1 \\2",
233
- "^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?){1,})$" => "\\1, \\2",
233
+ "^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?)+)$" => "\\1, \\2",
234
234
  "([[:alpha:]]*),?\\s*(.*)\\s+(van|von|v\\.|v(a|o)n\\s+der?)$" => "\\3 \\1, \\2",
235
- "^((?i:[A-Z]\\.\\s?){1,})\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?){1,})\\s+([[:alpha:]’`'-]{2,})\\s+([[:alpha:]’`'-]{2,})$" => "\\1 \\4 | \\2 \\3 \\4",
236
- "^((?i:[A-Z]\\.\\s?){1,})\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?){1,})\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
235
+ "^((?i:[A-Z]\\.\\s?)+)\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})\\s+([[:alpha:]’`'-]{2,})$" => "\\1 \\4 | \\2 \\3 \\4",
236
+ "^((?i:[A-Z]\\.\\s?)+)\\s?(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
237
237
  "^([A-Z]{1,3})\\s+(?:and|&|et|e)\\s+([A-Z]{1,3})\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
238
- "^((?i:[A-Z]\\.\\s?){1,}),\\s+([A-Z.\\s]+)\\s+(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?){1,})\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\4 | \\2 \\4 | \\3 \\4 | \\5",
238
+ "^((?i:[A-Z]\\.\\s?)+),\\s+([A-Z.\\s]+)\\s+(?:and|&|et|e)\\s+((?i:[A-Z]\\.\\s?)+)\\s+([[:alpha:]’`'-]{2,})(.*)$" => "\\1 \\4 | \\2 \\4 | \\3 \\4 | \\5",
239
239
  "^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{2,})$" => "\\1 | \\2 | \\3",
240
240
  "^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{3,})$" => "\\1 | \\2 | \\3 | \\4",
241
241
  "^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{3,})$" => "\\1 | \\2 | \\3 | \\4 | \\5"
242
242
  }
243
243
 
244
244
  BLACKLIST = %r{
245
- (?i:abundant)|
246
- (?i:adult|juvenile)|
247
- (?i:administra(d|t)or)|
248
- ^(?i:anon)$|
249
- (?i:australian?)|
250
- (?i:average)|
251
- (?i:believe|unclear|ill?egible|suggested|(dis)?agrees?)|approach|
252
- \b\s*(?i:none)\s*\b|
253
- (?i:barcod)|
254
- (?i:BgWd)|
255
- (?i:biolog|botan|zoo|ecolog|mycol|(in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture)|
256
- (?i:bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america)|
257
- (?i:carex|salix)|
258
- (?i:catalog(ue)?)|
259
- (?i:conservator)|
260
- (?i:herbarium|herbier|collection|collected|publication|specimen|species|describe|an(a|o)morph|isolated|recorded|inspection|define|status|lighthouse)|
261
- \b\s*(?i:help)\s*\b|
262
- (?i:data\s+not\s+captured)|
263
- (?i:description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect)|
264
- (?i:desconocido)|
265
- (?i:exc?s?icc?at(a|i))|
266
- (?i:evidence)|
267
- (?i:exporter)|
268
- (?i:foundation)|
269
- (?i:ichthyology)|
270
- (?i:inconn?u)|
271
- (?i:internation|gou?vern|ministry|extension|unit|district|provincial|na(c|t)ional|military|region|environ|natur(e|al)|naturelles|division|program|direction|national)|
272
- (?i:label)|
273
- (?i:o?\.?m\.?n\.?r\.?)|
274
- (?i:measurement)|
275
- (?i:ent(o|y)mology)|
276
- (?i:malacology)|
277
- (?i:geographic)|
278
- (?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
279
- (?i:univ\.)|
280
- \b\s*(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fisherm(a|e)n|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|personnel|staff|family|captain|friends|assistant|worker|gamekeeper)\s*\b|
281
- (?i:non\s+pr(é|e)cis(é|e))|
282
- (?i:no\s+consta)|
283
- (?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
284
- (?i:not?\s+(entered|stated))|
285
- (?i:nomenclatur(e|al)\s+adjustment)|
286
- (?i:not\s+available)|
287
- (?i:ontario|qu(e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?)|
288
- (?i:popa\s+observers?)|
289
- (?i:recreation|culture)|
290
- (?i:renseigné)|
291
- (?i:shaped|dark|pale|areas|phase|spotting|interior|between|closer)|
292
- (?i:soci(e|é)t(y|é)|cent(er|re)|community|history|conservation|conference|assoc|commission|consortium|council|club|exposit|alliance|protective|circle)|
293
- ^(?i:class)\s*\b|
294
- (?i:commercial|control|product)|
295
- ^(?i:company)\s*\b|
296
- (?i:sequence\s+data)|
297
- (?i:size|large|colou?r)\s+|
298
- (?i:skeleton)|
299
- (?i:survey|assessment|station|monitor|stn\.|project|engine|(e|é)x?chang(e|é)s?|ex(c|k)ursi(e|o|ó)n?|exped\.?|exp(e|i)di(c|t)i(e|o|ó)n?|experiment|explora(d|t)|festival|generation|inventory|marine|service)|
300
- ^(?i:index)\s*\b|
301
- (?i:submersible)|
302
- (?i:synonymy?)|
303
- (?i:systematic|perspective)|
304
- ^\s*(?i:off|too|the)\s*\b|
305
- (?i:taxiderm(ies|y))|
306
- (?i:though)|
307
- (?i:texas\s+instruments?)\s*?(for)?|
308
- (?:tropical)|
309
- (?i:toward|seen\s+at)|
310
- (?i:unidentified|unspecified|unk?nown?|unnamed|unread|unmistak|no agent)|
311
- (?i:urn\:)|
312
- (?i:usda|ucla)|
313
- (?i:workshop|garden|farm|jardin|public)|
314
- ^\s*?de\s*?$
245
+ (?i:
246
+ abundant |
247
+ adult | juvenile |
248
+ administra(?:d|t)or |
249
+ ^anon$ |
250
+ australian? |
251
+ average |
252
+ believe | unclear | ill?egible | suggested | (dis)?agrees? | approach |
253
+ \bnone\b |
254
+ barcod |
255
+ bgwd |
256
+ (biolog|botan|zoo|ecolog|mycol|(?:in)?vertebrate|fisheries|genetic|animal|mushroom|wildlife|plumage|flower|agriculture) |
257
+ (bris?tish|canadi?an?|chinese|arctic|japan|russian|north\s+america) |
258
+ carex | salix |
259
+ catalog(?:ue)? |
260
+ conservator |
261
+ (herbarium|herbier|collection|collected|publication|specimen|species|describe|an(?:a|o)morph|isolated|recorded|inspection|define|status|lighthouse) |
262
+ \bhelp\b |
263
+ data\s+not\s+captured |
264
+ (description|drawing|identification|remark|original|illustration|checklist|intermedia|measurement|indisting|series|imperfect) |
265
+ desconocido |
266
+ exc(?:s?icc?at(?:a|i)) |
267
+ evidence |
268
+ exporter |
269
+ foundation |
270
+ ichthyology |
271
+ inconn?u |
272
+ (internation|gou?vern|ministry|extension|unit|district|provincial|na(?:c|t)ional|military|region|environ|natur(?:e|al)|naturelles|division|program|direction) |
273
+ label |
274
+ o\.?m\.?n\.?r\.? |
275
+ measurement |
276
+ ent(?:o|y)mology |
277
+ malacology |
278
+ geographic |
279
+ (mus(?:eum|ée)|universit(?:y|é|e|at)|college|institute?|acad(?:e|é)m|school|écol(?:e|iers?)|laboratoi?r|project|polytech|dep(?:t|artment)|research|clinic|hospital|cientifica|sanctuary|safari) |
280
+ univ\. |
281
+ \b(graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fisherm(?:a|e)n|police|taxonomist|consultant|participant(?:es)?|team|(?:é|e)quipe|memb(?:er|re)|crew|group|personnel|staff|family|captain|friends|assistant|worker|gamekeeper)\b |
282
+ non\s+pr(?:é|e)cis(?:é|e) |
283
+ no\s+consta |
284
+ no\s+(agent\s+)?(?:data|disponible)(?:\s+available)? |
285
+ not?\s+(entered|stated) |
286
+ nomenclatur(?:e|al)\s+adjustment |
287
+ not\s+available |
288
+ (ontario|qu(?:e|é)bec|saskatchewan|new brunswick|sault|newfoundland|assurance|vancouver|u\.?s\.?s\.?r\.?) |
289
+ popa\s+observers? |
290
+ recreation | culture |
291
+ renseigné |
292
+ (shaped|dark|pale|areas|phase|spotting|interior|between|closer) |
293
+ soci(?:e|é)t(?:y|é) | cent(?:er|re) | community | history | conservation | conference | assoc | commission | consortium | council | club | exposit | alliance | protective | circle |
294
+ ^class\b |
295
+ commercial | control | product |
296
+ ^company\b |
297
+ sequence\s+data |
298
+ size | large | colou?r |
299
+ skeleton |
300
+ survey | assessment | station | monitor | stn\. | project | engine | (e|é)x?chang(?:e|é)s? | ex(?:c|k)urs(?:e|o|ó)n? | exped\.? | exp(?:e|i)di(?:c|t)i(?:e|o|ó)n? | experiment | explora(?:d|t) | festival | generation | inventory | marine | service |
301
+ ^index\b |
302
+ submersible |
303
+ synonymy? |
304
+ systematic | perspective |
305
+ ^(?:off|too|the)\b |
306
+ taxiderm(?:ies|y) |
307
+ though |
308
+ texas\s+instruments?(?:\s+for)? |
309
+ tropical |
310
+ toward | seen\s+at |
311
+ unidentified | unspecified | unk?nown? | unnamed | unread | unmistak | no agent |
312
+ urn: |
313
+ usda | ucla |
314
+ workshop | garden | farm | jardin | public |
315
+ ^de$
316
+ )
315
317
  }x
316
318
 
317
319
  FAMILY_GREENLIST = [
@@ -402,7 +404,7 @@ module DwcAgent
402
404
 
403
405
  APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
404
406
 
405
- SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/
407
+ SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|ESQ|esq|[IVX]{2,})(\.|\b)/
406
408
 
407
409
  PARTICLES = [
408
410
  "ap",
@@ -2,27 +2,33 @@ module DwcAgent
2
2
 
3
3
  class Parser
4
4
 
5
+ @defaults = {
6
+ prefer_comma_as_separator: true,
7
+ separator: SPLIT_BY,
8
+ title: TITLE,
9
+ appellation: APPELLATION,
10
+ suffix: SUFFIX,
11
+ strip_out_regex: Regexp.new(STRIP_OUT.to_s),
12
+ tidy_remains_regex: Regexp.new(POST_STRIP_TIDY.to_s),
13
+ char_subs_regex: Regexp.new([CHAR_SUBS.keys.join].to_s),
14
+ phrase_subs_regex: Regexp.new(PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s),
15
+ residual_terminators_regex: Regexp.new(SPLIT_BY.to_s + %r{\s*\z}.to_s),
16
+ separators: SEPARATORS.map{|k,v| [ Regexp.new(k), v] }
17
+ }
18
+
5
19
  class << self
20
+ attr_reader :defaults
21
+
6
22
  def instance
7
23
  Thread.current[:dwc_agent_parser] ||= new
8
24
  end
9
25
  end
10
26
 
11
- def initialize
12
- options = {
13
- prefer_comma_as_separator: true,
14
- separator: SPLIT_BY,
15
- title: TITLE,
16
- appellation: APPELLATION,
17
- suffix: SUFFIX
18
- }
19
- @namae = Namae::Parser.new(options)
20
- @strip_out_regex = Regexp.new STRIP_OUT.to_s
21
- @tidy_remains_regex = Regexp.new POST_STRIP_TIDY.to_s
22
- @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
23
- @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
24
- @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
25
- @separators = SEPARATORS.map{|k,v| [ Regexp.new(k), v] }
27
+ attr_reader :options, :namae
28
+
29
+ def initialize(options = {})
30
+ @options = self.class.defaults.merge(options)
31
+ @namae = Namae::Parser.new(@options)
26
32
  end
27
33
 
28
34
  # Parses the passed-in string and returns a list of names.
@@ -31,14 +37,14 @@ module DwcAgent
31
37
  # @return [Array] the list of parsed names
32
38
  def parse(name)
33
39
  return [] if name.nil? || name == ""
34
- name.gsub!(@strip_out_regex, ' ')
35
- name.gsub!(@tidy_remains_regex, '')
36
- name.gsub!(Regexp.union(@char_subs_regex, @phrase_subs_regex), CHAR_SUBS.merge(PHRASE_SUBS))
37
- @separators.each{|k| name.gsub!(k[0], k[1])}
38
- name.gsub!(@residual_terminators_regex, '')
40
+ name.gsub!(options[:strip_out_regex], ' ')
41
+ name.gsub!(options[:tidy_remains_regex], '')
42
+ name.gsub!(Regexp.union(options[:char_subs_regex], options[:phrase_subs_regex]), CHAR_SUBS.merge(PHRASE_SUBS))
43
+ options[:separators].each{|k| name.gsub!(k[0], k[1])}
44
+ name.gsub!(options[:residual_terminators_regex], '')
39
45
  name.squeeze!(' ')
40
46
  name.strip!
41
- @namae.parse(name)
47
+ namae.parse(name)
42
48
  end
43
49
 
44
50
  end
@@ -3,7 +3,7 @@ module DwcAgent
3
3
  class Version
4
4
 
5
5
  MAJOR = 3
6
- MINOR = 2
6
+ MINOR = 3
7
7
  PATCH = 1
8
8
  BUILD = 0
9
9
 
@@ -11,6 +11,10 @@ module DwcAgent
11
11
  [MAJOR, MINOR, PATCH, BUILD].compact.join('.').freeze
12
12
  end
13
13
 
14
+ def self.date
15
+ '2025-06-18'
16
+ end
17
+
14
18
  end
15
19
 
16
20
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.2.1.0
4
+ version: 3.3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-11-07 00:00:00.000000000 Z
11
+ date: 2025-06-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae