dwc_agent 3.0.8.0 → 3.0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '087733d2b0fb769363b4075c7f41c2f66029babdd94369754d948fd7ea79a918'
4
- data.tar.gz: 4ac264b58351da0cebf5d0e5139e61fafb0c3a2ce7230dc360c5d75498251c03
3
+ metadata.gz: bf7946475612999b141445b40ee1b93c1252d3f987b65eab8523238716ab162e
4
+ data.tar.gz: '09d2c2353fc63d86bdb327cfc8cd29e33be945d2eb4a6d147c55e31e30c80c14'
5
5
  SHA512:
6
- metadata.gz: 5df9637901a40e47c22872811af6a71a2e482042f3615bf263897945fdadff0254bd373b549f63a0a989e58dfee1d9ffc1e7189da271720badf51596188fc3ae
7
- data.tar.gz: d07083d5a2a439eabd9bb1615de318f482ce67b15bce86e32c823a52f6e0c9615373f587f1f66c9b14ebd5bb91028d18b21b95c9771724d6bf7570037981f723
6
+ metadata.gz: ce194f8823af1f1a594a43a0f37eb2e08ea51dbec107fb3c7d19dceb04e6579efe30d7a5bbaafbf964f1110debd8b4a8bb2abe53a55c9f99965da0a6cc32f8d3
7
+ data.tar.gz: 7f6b7b0fe8c23cd1e0bbef6eefa1488300867f7acbb8b039bef65f0c597011a85aad7f926652ce99993a0fa6bbebbc4eef99f73769031714e4f5e0002d5c6737
@@ -22,8 +22,9 @@ module DwcAgent
22
22
  \b[,;]?\s*(?i:importer|gift)\:?\b|
23
23
  \b[,;]?\s*(?i:string)\b|
24
24
  \b[,;]?\s*(?i:person\s*string)\b|
25
+ ^(?i:colln?)\.?\s+|\s*(?i:colln?)\.?\s*$|
26
+ ^(?i:collection)\:?\s+|\s*(?i:collection)\s*$|
25
27
  \b[,;]?\s*(?i:colls)\.(\b|\z)|
26
- \b[,;]?\s*(?i:colln?)[:.]?(\b|\z)|
27
28
  \b[,;]?\s*(?i:stet)[,!]?\s*\d*\z|
28
29
  [,;]?\s*\d+[-/\s+](?i:\d+|Jan|Feb|Mar|Apr|
29
30
  May|Jun|Jul|Aug|Sept?|
@@ -53,7 +54,7 @@ module DwcAgent
53
54
  \d+\s+(?i:Nov|Novemb(er|re))\.?\b|
54
55
  \d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
55
56
  \b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
56
- \b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Ireland|Isle of Man|Italy|Jamaica|Japan|Jersey|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State of|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Puerto Rico|Qatar|Réunion|Romania|Russian Federation|Russia|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)\b|
57
+ \b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Isle of Man|Italy|Jamaica|Japan|Jersey|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State of|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Puerto Rico|Qatar|Réunion|Romania|Russian Federation|Russia|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)\b|
57
58
  (?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
58
59
  \b\s*(?i:maybe)\s*\b|
59
60
  \b\s*(?i:prob)\.\s*\b|
@@ -149,6 +150,7 @@ module DwcAgent
149
150
  \b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
150
151
  \b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
151
152
  \b(?i:in?dentified(\s+by)?)\s*\b|
153
+ \b(?i:in\s+coll\.?\s*\b)|
152
154
  \b(?i:in\s+part(\s+by)?)\s*\b|
153
155
  \b(?i:och)\s*\b|
154
156
  \b(?i:prep\.?\s+(?i:by)?)\s*\b|
@@ -161,6 +163,10 @@ module DwcAgent
161
163
  \b(?i:via|from)\s*\b
162
164
  }x
163
165
 
166
+ POST_STRIP_TIDY = %r{
167
+ ^\s*[&,;.]\s*
168
+ }x
169
+
164
170
  CHAR_SUBS = {
165
171
  '"' => '\'',
166
172
  '|' => ' | ',
@@ -206,11 +212,12 @@ module DwcAgent
206
212
 
207
213
  SEPARATORS = {
208
214
  "^(\\S{4,}),\\s+(Mrs?\\.|MRS?\\.)\\s+([A-Za-z\\.\\s]{1,})$" => "\\2 \\3 \\1",
215
+ "^(Mrs?\\.?)\\s+&\\s+(Mrs?\\.?)\\s+(.*)$" => "\\1 \\3 | \\2 \\3",
209
216
  "^([A-Z]{1}\\.\\s*[[:alpha:]]{1,}),\\s*?([A-Z.]{1,})$" => "\\1 \\2",
210
217
  "^(\\S{4,},\\s+(?:\\S\\.\\s*){1,})\\s+(\\S{4,},\\s+(?:\\S\.\\s*){1,})$" => "\\1 | \\2",
211
218
  "(\\S{1}\\.)([[:alpha:]]{2,})" => "\\1 \\2",
212
219
  "^([[:alpha:]]{2,})(?:\\s+)((?:\\S{1}\\.\\s?){1,})$" => "\\1, \\2",
213
- "([[:alpha:]]*),?\\s+(.*)\\s+(van|von)$" => "\\3 \\1, \\2",
220
+ "([[:alpha:]]*),?\\s*(.*)\\s+(van|von)$" => "\\3 \\1, \\2",
214
221
  "^([A-Z.\\s]+)\\s+(?:and|&|et|e)\\s+([A-Z.\\s]+)\\s+([[:alpha:]]{2,})\\s+([[:alpha:]]{2,})$" => "\\1 \\4 | \\2 \\3 \\4",
215
222
  "^([A-Z.\\s]+)\\s+(?:and|&|et|e)\\s+([A-Z.\\s]+)\\s+([[:alpha:]]{2,})(.*)$" => "\\1 \\3 | \\2 \\3 | \\4",
216
223
  "^([A-Z][[:alpha:]]{2,}),\\s*?([A-Z][[:alpha:]]{2,})\\s*?(?i:and|&|et|e|,)\\s+([A-Z][[:alpha:]]{2,})$" => "\\1 | \\2 | \\3",
@@ -222,7 +229,7 @@ module DwcAgent
222
229
  (?i:abundant)|
223
230
  (?i:adult|juvenile)|
224
231
  (?i:administra(d|t)or)|
225
- (?i:anon)|
232
+ ^(?i:anon)$|
226
233
  (?i:australian?)|
227
234
  (?i:average)|
228
235
  (?i:believe|unclear|ill?egible|none|suggested|(dis)?agrees?)|approach|
@@ -18,6 +18,7 @@ module DwcAgent
18
18
  }
19
19
  @namae = Namae::Parser.new(options)
20
20
  @strip_out_regex = Regexp.new STRIP_OUT.to_s
21
+ @tidy_remains_regex = Regexp.new POST_STRIP_TIDY.to_s
21
22
  @char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
22
23
  @phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
23
24
  @residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
@@ -31,6 +32,7 @@ module DwcAgent
31
32
  def parse(name)
32
33
  return [] if name.nil? || name == ""
33
34
  name.gsub!(@strip_out_regex, ' ')
35
+ name.gsub!(@tidy_remains_regex, '')
34
36
  name.gsub!(Regexp.union(@char_subs_regex, @phrase_subs_regex), CHAR_SUBS.merge(PHRASE_SUBS))
35
37
  @separators.each{|k| name.gsub!(k[0], k[1])}
36
38
  name.gsub!(@residual_terminators_regex, '')
@@ -4,7 +4,7 @@ module DwcAgent
4
4
 
5
5
  MAJOR = 3
6
6
  MINOR = 0
7
- PATCH = 8
7
+ PATCH = 10
8
8
  BUILD = 0
9
9
 
10
10
  def self.version
@@ -13,4 +13,4 @@ module DwcAgent
13
13
 
14
14
  end
15
15
 
16
- end
16
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc_agent
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.8.0
4
+ version: 3.0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David P. Shorthouse
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-07 00:00:00.000000000 Z
11
+ date: 2023-09-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: namae