dwc_agent 1.5.0.1 → 1.5.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +5 -5
- data/lib/dwc_agent/constants.rb +55 -19
- data/lib/dwc_agent/parser.rb +1 -1
- data/lib/dwc_agent/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4b3360ae9a26c61f08d3a687e8c4e1af9e19334714746a976cb4c90e93d72a63
|
4
|
+
data.tar.gz: 4ada728f04c124ec1fbd6a9b45c1ecba80f45319f051a087c2074b2417110478
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ced6e4cd9352b267ee8662993d4a39c9aa6e35965c550fef33993bd3c7c40feb07e7ad1ac8e6c60531f843b5e3d18a5bc0caafb38de41b55f0ad706be898d81a
|
7
|
+
data.tar.gz: 4c3c76a1f3cf17114d9e23a2ec19f40772fe38b5bd1b6dd49264cf82081dac797b182ae0a2cd5d90732a0f3cd947251ea28830f7769a82d4bfe8a10ca166213e
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,10 +18,6 @@ module DwcAgent
|
|
18
18
|
def clean(parsed_namae)
|
19
19
|
blank_name = { title: nil, appellation: nil, given: nil, particle: nil, family: nil, suffix: nil }
|
20
20
|
|
21
|
-
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
22
|
-
return blank_name
|
23
|
-
end
|
24
|
-
|
25
21
|
if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
26
22
|
return blank_name
|
27
23
|
end
|
@@ -30,7 +26,7 @@ module DwcAgent
|
|
30
26
|
return blank_name
|
31
27
|
end
|
32
28
|
|
33
|
-
if parsed_namae.given && parsed_namae.given.length >
|
29
|
+
if parsed_namae.given && parsed_namae.given.length > 35
|
34
30
|
return blank_name
|
35
31
|
end
|
36
32
|
|
@@ -79,6 +75,10 @@ module DwcAgent
|
|
79
75
|
parsed_namae.given = NameCase(parsed_namae.given)
|
80
76
|
end
|
81
77
|
|
78
|
+
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
79
|
+
return blank_name
|
80
|
+
end
|
81
|
+
|
82
82
|
parsed_namae.normalize_initials
|
83
83
|
|
84
84
|
family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -6,15 +6,16 @@ module DwcAgent
|
|
6
6
|
\s*?\d+\.\d+|
|
7
7
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
8
8
|
\b[,;]?\s*(?i:et\.?\s+al|&\s+al)\.?|
|
9
|
+
\b[,;]?\s*(?i:etal)\.?|
|
9
10
|
\b\s+(bis|ter)(\b|\z)|
|
10
11
|
\bu\.\s*a\.|
|
11
|
-
\b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
|
12
|
+
\b[,;]?\s*(?i:and|&)?\s*(?i:others|party)\s*\b|
|
12
13
|
\b[,;]?\s*(?i:etc)\.?|
|
13
14
|
\b[,;]?\s*(?i:on)\b|
|
14
15
|
\b[,;]?\s*(?i:unkn?own)\b|
|
15
16
|
\b[,;]?\s*(?i:n/a)\b|
|
16
17
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
17
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed
|
18
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed?|presumably)\)?\b|
|
18
19
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
19
20
|
\b[,;]?\s*(?i:string)\b|
|
20
21
|
\b[,;]?\s*(?i:person\s*string)\b|
|
@@ -49,33 +50,36 @@ module DwcAgent
|
|
49
50
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
50
51
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
51
52
|
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
52
|
-
\b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|China|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|France|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Georgia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Ireland|Isle of Man|Israel|Italy|Jamaica|Japan|Jersey|Jordan|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Montserrat|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State of|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Portugal|Puerto Rico|Qatar|Réunion|Romania|Russian Federation|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Spain|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)|
|
53
|
+
\b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|China|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|France|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Georgia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Ireland|Isle of Man|Israel|Italy|Jamaica|Japan|Jersey|Jordan|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Montserrat|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State of|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Portugal|Puerto Rico|Qatar|Réunion|Romania|Russian Federation|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Spain|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)\b|
|
53
54
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
54
55
|
\b\s*(?i:maybe)\s*\b|
|
55
56
|
\b\s*(?i:prob)\.\s*\b|
|
56
|
-
\(?[,]?\s*?(?i:(local)?\s?
|
57
|
+
\(?[,]?\s*?(?i:(local)?\s?collectors?|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
57
58
|
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
58
59
|
(?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
|
59
60
|
(?i:fide)\:?\s*\b|
|
61
|
+
(?i:first\s+name\s+unknown)|
|
60
62
|
(?i:game\s+dept)\.?\s*\b|
|
61
63
|
(?i:see\s+notes?\s*(inside)?)|
|
62
64
|
(?i:see\s+letter\s+enclosed)|
|
63
65
|
(?i:(by)?\s+correspondance)|
|
64
|
-
(?i:pers\.?\s
|
66
|
+
(?i:pers\.?\s*comm\.?)|
|
65
67
|
(?i:crossed\s+out)|
|
66
68
|
\(?(?i:source)\(?|
|
67
69
|
(?i:according\s+to)|
|
68
70
|
(?i:lanuv)\d+|
|
71
|
+
\b\s*name\b|
|
72
|
+
\b\s*lost\b|
|
69
73
|
(?i:nswobs)|
|
70
74
|
ORCID|
|
71
75
|
MRI(\s|-)PAS|
|
72
76
|
urn\:qm\.qld\.gov\.au\:collector|
|
73
77
|
(?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
|
74
|
-
(?i:
|
75
|
-
(?i:
|
78
|
+
(?i:field\s+museum\s+of\s+natural\s+history)|
|
79
|
+
(?i:american\s+museum\s+of\s+natural\s+history)|
|
76
80
|
(?i:The\s+Paleontological\s+Research\s+Institution)|
|
77
81
|
(?i:museums?\s+victoria)|
|
78
|
-
\b\s*(?i:
|
82
|
+
\b\s*(?i:united\s+states|russia)\s*\b|
|
79
83
|
(?i:revised|photograph|fruits\s+only)|
|
80
84
|
-?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
|
81
85
|
-?\s*(?i:synonym(y|ie))|
|
@@ -83,11 +87,14 @@ module DwcAgent
|
|
83
87
|
\b(?i:to\s+(sub)?spp?)\.?|
|
84
88
|
(?i:nom\.?\s+rev\.?)|
|
85
89
|
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|ACAD|USGS|NAWQA|
|
90
|
+
\b,?\s*(?i:para|topo|syn)?(?i:type)|
|
86
91
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
87
92
|
(?i:university|museum|exhibits?)|
|
88
93
|
(?i:uqam)|
|
89
94
|
(?i:sem\s+(colec?tor|data))|
|
90
95
|
(?i:no\s+coll\.?(ector)?)|
|
96
|
+
(?i:not?)\s+(?i:name|date|details?|specific)?\s*?(?i:given|name|date|noted)|
|
97
|
+
(?i:non?)\s+(?i:specificato)|
|
91
98
|
\b[,;]\s+\d+\z|
|
92
99
|
["!@?]|
|
93
100
|
[,]?\d+|
|
@@ -113,19 +120,22 @@ module DwcAgent
|
|
113
120
|
[–|ǀ∣|│&+\/;:]|
|
114
121
|
\s+-\s+|
|
115
122
|
\s+a\.\s+|
|
116
|
-
\b(e|y|i|en|et|or|per|for)\s*\b|
|
123
|
+
\b(con|e|y|i|en|et|or|per|for)\s*\b|
|
117
124
|
\b(?i:and|with)\s*\b|
|
118
125
|
\b(?i:annotated(\s+by)?)\s*\b|
|
119
126
|
\b(?i:coll\.)\s*\b|
|
120
127
|
\b(?i:communicate?d(\s+to)?)\s*\b|
|
121
128
|
\b(?i:conf\.?(\s+by)?|confirmed(\s+by)?)\s*\b|
|
129
|
+
\b(?i:confirmada)(\s+por)?\s*\b|
|
122
130
|
\b(?i:checked?(\s+by)?)\s*\b|
|
123
131
|
\b(?i:det\.?(\s+by)?)\s*\b|
|
124
132
|
\b(?i:dupl?\.?(\s+by)?|duplicate(\s+by)?)\s*\b|
|
125
133
|
\b(?i:ex\.?(\s+by)?|examined(\s+by)?)\s*\b|
|
126
134
|
\b(?i:in?dentified(\s+by)?)\s*\b|
|
127
135
|
\b(?i:in\s+part(\s+by)?)\s*\b|
|
136
|
+
\b(?i:och)\s*\b|
|
128
137
|
\b(?i:prep\.?\s+(?i:by)?)\s*\b|
|
138
|
+
\b(?i:purchased?)(\s+by)?\s*\b|
|
129
139
|
\b(?i:redet\.?(\s+by?)?)\s*\b|
|
130
140
|
\b(?i:reidentified(\s+by)?)\s*\b|
|
131
141
|
\b(?i:stet)\s*\b|
|
@@ -155,16 +165,24 @@ module DwcAgent
|
|
155
165
|
'}' => '',
|
156
166
|
'@' => '',
|
157
167
|
'%' => '',
|
158
|
-
'\\' => ''
|
168
|
+
'\\' => '',
|
169
|
+
'´' => '\'',
|
170
|
+
'+' => ' | '
|
159
171
|
}
|
160
172
|
|
161
173
|
PHRASE_SUBS = {
|
162
|
-
'
|
163
|
-
'
|
164
|
-
'
|
165
|
-
'
|
166
|
-
'
|
167
|
-
'
|
174
|
+
', ph.d.' => ' Ph.D.',
|
175
|
+
', Ph.D.' => ' Ph.D.',
|
176
|
+
', bro.' => ' Bro.',
|
177
|
+
', Jr.,' => ' Jr.;',
|
178
|
+
', Jr.' => ' Jr.',
|
179
|
+
',Jr.' => ' Jr.',
|
180
|
+
', Sr.' => ' Sr.',
|
181
|
+
',Sr.' => ' Sr.',
|
182
|
+
' jr.,' => ' Jr.;',
|
183
|
+
' jr,' => ' Jr.;',
|
184
|
+
'-jr' => ' Jr.',
|
185
|
+
'-Jr' => ' Jr.'
|
168
186
|
}
|
169
187
|
|
170
188
|
COMPLEX_SEPARATORS = %r{
|
@@ -229,7 +247,7 @@ module DwcAgent
|
|
229
247
|
(?i:though)|
|
230
248
|
(?i:texas\s+instruments?)\s*?(for)?|
|
231
249
|
(?:tropical)|
|
232
|
-
(?i:toward|seen
|
250
|
+
(?i:toward|seen\s+at)|
|
233
251
|
(?i:unidentified|unspecified|unk?nown?|unnamed|unread|unmistak|no agent)|
|
234
252
|
(?i:urn\:)|
|
235
253
|
(?i:usda|ucla)|
|
@@ -238,29 +256,47 @@ module DwcAgent
|
|
238
256
|
}x
|
239
257
|
|
240
258
|
FAMILY_BLACKLIST = [
|
259
|
+
"ap",
|
241
260
|
"da",
|
261
|
+
"de",
|
242
262
|
"de'",
|
243
263
|
"del",
|
244
264
|
"der",
|
265
|
+
"di",
|
266
|
+
"do",
|
267
|
+
"dos",
|
245
268
|
"du",
|
246
269
|
"el",
|
270
|
+
"le",
|
271
|
+
"la",
|
247
272
|
"van",
|
248
273
|
"von",
|
249
274
|
"the",
|
250
275
|
"of",
|
251
276
|
"adjustment",
|
277
|
+
"annotator",
|
252
278
|
"available",
|
253
279
|
"arachnology",
|
254
280
|
"catalogue",
|
255
281
|
"curators",
|
256
282
|
"data",
|
283
|
+
"details",
|
284
|
+
"determiner",
|
257
285
|
"determination",
|
258
286
|
"dissection",
|
259
287
|
"entered",
|
288
|
+
"erased",
|
289
|
+
"indecipherable",
|
260
290
|
"nomenclatural",
|
261
291
|
"orig",
|
262
292
|
"registration",
|
263
|
-
"science"
|
293
|
+
"science",
|
294
|
+
"wg",
|
295
|
+
"wm",
|
296
|
+
"wn",
|
297
|
+
"zw",
|
298
|
+
"zz",
|
299
|
+
"z-"
|
264
300
|
]
|
265
301
|
|
266
302
|
GIVEN_BLACKLIST = [
|
@@ -268,7 +304,7 @@ module DwcAgent
|
|
268
304
|
"has not"
|
269
305
|
]
|
270
306
|
|
271
|
-
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|
|
307
|
+
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|major|capt|cmdr|lt|sgt|cpl|pvt|proff?|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
272
308
|
|
273
309
|
APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
274
310
|
|
data/lib/dwc_agent/parser.rb
CHANGED
@@ -20,7 +20,7 @@ module DwcAgent
|
|
20
20
|
@strip_out_regex = Regexp.new STRIP_OUT.to_s
|
21
21
|
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
|
22
22
|
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
|
23
|
-
@phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.join('|').to_s
|
23
|
+
@phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
|
24
24
|
@complex_separators_regex = Regexp.new COMPLEX_SEPARATORS.to_s
|
25
25
|
@add_separators_regex = Regexp.new %r{(\S{1}\.)([[:alpha:]]{2,})}.to_s
|
26
26
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.0.
|
4
|
+
version: 1.5.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|
@@ -102,7 +102,7 @@ homepage: https://github.com/bionomia/dwc_agent
|
|
102
102
|
licenses:
|
103
103
|
- MIT
|
104
104
|
metadata: {}
|
105
|
-
post_install_message:
|
105
|
+
post_install_message:
|
106
106
|
rdoc_options:
|
107
107
|
- "--encoding"
|
108
108
|
- UTF-8
|
@@ -119,8 +119,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
119
|
- !ruby/object:Gem::Version
|
120
120
|
version: '0'
|
121
121
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
123
|
-
signing_key:
|
122
|
+
rubygems_version: 3.1.2
|
123
|
+
signing_key:
|
124
124
|
specification_version: 4
|
125
125
|
summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
|
126
126
|
test_files: []
|