dwc_agent 1.5.0.4 → 1.5.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +5 -5
- data/lib/dwc_agent/constants.rb +47 -18
- data/lib/dwc_agent/version.rb +1 -1
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b79ba4257776b532e0e071ab4f2defb32fd2131147624b80e3a4b145ceb666e
|
4
|
+
data.tar.gz: cc5080d4b67557b5d68ca7c8de761c0fd7c063ef2f398e68aa7aff3b8bfbed5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 600569c1098627db767a701bc8f85079afa2e47930a8b707a089ef06f2597a4f97ffe8d8fd2d0fa3347bee8edb9a6d1035ac2645dc4baac1e1a02228c892675f
|
7
|
+
data.tar.gz: 94b68c1c443909b1d28396597ee7474981c51eddbba67a2dc29190b21700937bed091a645073533942360d2e2455b4140dba938f3e2f2af2f5f29fe32db337ca
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,10 +18,6 @@ module DwcAgent
|
|
18
18
|
def clean(parsed_namae)
|
19
19
|
blank_name = { title: nil, appellation: nil, given: nil, particle: nil, family: nil, suffix: nil }
|
20
20
|
|
21
|
-
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
22
|
-
return blank_name
|
23
|
-
end
|
24
|
-
|
25
21
|
if parsed_namae.given && GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
26
22
|
return blank_name
|
27
23
|
end
|
@@ -79,6 +75,10 @@ module DwcAgent
|
|
79
75
|
parsed_namae.given = NameCase(parsed_namae.given)
|
80
76
|
end
|
81
77
|
|
78
|
+
if parsed_namae.family && FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
79
|
+
return blank_name
|
80
|
+
end
|
81
|
+
|
82
82
|
parsed_namae.normalize_initials
|
83
83
|
|
84
84
|
family = parsed_namae.family.gsub(/\.\z/, '').strip rescue nil
|
@@ -114,7 +114,7 @@ module DwcAgent
|
|
114
114
|
return blank_name
|
115
115
|
end
|
116
116
|
|
117
|
-
if !family.nil? && family.match(/^[A-Z]{2}/)
|
117
|
+
if given.nil? && !family.nil? && family.match(/^[A-Z]{2}/)
|
118
118
|
return blank_name
|
119
119
|
end
|
120
120
|
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -3,18 +3,20 @@ module DwcAgent
|
|
3
3
|
^[\[{(]|
|
4
4
|
[\]})]\??$|
|
5
5
|
(?i:acc\s?\#)|
|
6
|
+
[,;]?\s*(?i:1st|2nd|3rd|[4-9]th)|
|
6
7
|
\s*?\d+\.\d+|
|
7
8
|
\b\d+\(?(?i:[[:alpha:]])\)?\b|
|
8
9
|
\b[,;]?\s*(?i:et\.?\s+al|&\s+al)\.?|
|
10
|
+
\b[,;]?\s*(?i:etal)\.?|
|
9
11
|
\b\s+(bis|ter)(\b|\z)|
|
10
12
|
\bu\.\s*a\.|
|
11
|
-
\b[,;]?\s*(?i:and|&)?\s*(?i:others)\s*\b|
|
13
|
+
\b[,;]?\s*(?i:and|&)?\s*(?i:others|party)\s*\b|
|
12
14
|
\b[,;]?\s*(?i:etc)\.?|
|
13
15
|
\b[,;]?\s*(?i:on)\b|
|
14
16
|
\b[,;]?\s*(?i:unkn?own)\b|
|
15
17
|
\b[,;]?\s*(?i:n/a)\b|
|
16
18
|
\b[,;]?\s*(?i:ann?onymous)\b|
|
17
|
-
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed
|
19
|
+
\b[,;]?\s*\(?(?i:undetermined|indeterminable|dummy|interim|accession|ill(eg|is)ible|scripsit|presumed?|presumably)\)?\b|
|
18
20
|
\b[,;]?\s*(?i:importer|gift)\:?\b|
|
19
21
|
\b[,;]?\s*(?i:string)\b|
|
20
22
|
\b[,;]?\s*(?i:person\s*string)\b|
|
@@ -48,34 +50,38 @@ module DwcAgent
|
|
48
50
|
\d+\s+(?i:Oct|Octob(er|re))\.?\b|
|
49
51
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
50
52
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
51
|
-
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|
|
52
|
-
\b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|China|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|France|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|
|
53
|
+
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
54
|
+
\b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Austria|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belgium|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|China|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Denmark|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|France|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Ireland|Isle of Man|Israel|Italy|Jamaica|Japan|Jersey|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Mali|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Montserrat|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State of|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Portugal|Puerto Rico|Qatar|Réunion|Romania|Russian Federation|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Spain|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)\b|
|
53
55
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
54
56
|
\b\s*(?i:maybe)\s*\b|
|
55
57
|
\b\s*(?i:prob)\.\s*\b|
|
56
|
-
\(?[,]?\s*?(?i:(local)?\s?
|
58
|
+
\(?[,]?\s*?(?i:(local)?\s?collectors?|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
57
59
|
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
58
60
|
(?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
|
59
61
|
(?i:fide)\:?\s*\b|
|
62
|
+
(?i:first\s+name\s+unknown)|
|
60
63
|
(?i:game\s+dept)\.?\s*\b|
|
61
64
|
(?i:see\s+notes?\s*(inside)?)|
|
62
65
|
(?i:see\s+letter\s+enclosed)|
|
63
66
|
(?i:(by)?\s+correspondance)|
|
64
67
|
(?i:pers\.?\s*comm\.?)|
|
65
68
|
(?i:crossed\s+out)|
|
69
|
+
(?i:(ohne|keine)\s+angaben)|
|
66
70
|
\(?(?i:source)\(?|
|
67
71
|
(?i:according\s+to)|
|
68
72
|
(?i:lanuv)\d+|
|
73
|
+
\b\s*name\b|
|
74
|
+
\b\s*lost\b|
|
69
75
|
(?i:nswobs)|
|
70
76
|
ORCID|
|
71
77
|
MRI(\s|-)PAS|
|
72
78
|
urn\:qm\.qld\.gov\.au\:collector|
|
73
79
|
(?i:University\s+of\s+(Southern\s+)?California(,\s+Berkeley)?)|
|
74
|
-
(?i:
|
75
|
-
(?i:
|
80
|
+
(?i:field\s+museum\s+of\s+natural\s+history)|
|
81
|
+
(?i:american\s+museum\s+of\s+natural\s+history)|
|
76
82
|
(?i:The\s+Paleontological\s+Research\s+Institution)|
|
77
83
|
(?i:museums?\s+victoria)|
|
78
|
-
\b\s*(?i:
|
84
|
+
\b\s*(?i:united\s+states|russia)\s*\b|
|
79
85
|
(?i:revised|photograph|fruits\s+only)|
|
80
86
|
-?\s*(?i:sight\s+(id|identifi?cation))\.?\s*\b|
|
81
87
|
-?\s*(?i:synonym(y|ie))|
|
@@ -83,11 +89,14 @@ module DwcAgent
|
|
83
89
|
\b(?i:to\s+(sub)?spp?)\.?|
|
84
90
|
(?i:nom\.?\s+rev\.?)|
|
85
91
|
FNA|DAO|HUH|FDNMB|MNHN|PNI|USNM|ZMUC|CSIRO|ACAD|USGS|NAWQA|
|
92
|
+
\b,?\s*(?i:para|topo|syn)?(?i:type)|
|
86
93
|
AFSC\/POLISH\s+SORTING\s+CTR\.?|
|
87
94
|
(?i:university|museum|exhibits?)|
|
88
95
|
(?i:uqam)|
|
89
96
|
(?i:sem\s+(colec?tor|data))|
|
90
97
|
(?i:no\s+coll\.?(ector)?)|
|
98
|
+
(?i:not?)\s+(?i:name|date|details?|specific)?\s*?(?i:given|name|date|noted)|
|
99
|
+
(?i:non?)\s+(?i:specificato)|
|
91
100
|
\b[,;]\s+\d+\z|
|
92
101
|
["!@?]|
|
93
102
|
[,]?\d+|
|
@@ -158,22 +167,24 @@ module DwcAgent
|
|
158
167
|
'}' => '',
|
159
168
|
'@' => '',
|
160
169
|
'%' => '',
|
161
|
-
'\\' => ''
|
170
|
+
'\\' => '',
|
171
|
+
'´' => '\'',
|
172
|
+
'+' => ' | '
|
162
173
|
}
|
163
174
|
|
164
175
|
PHRASE_SUBS = {
|
165
176
|
', ph.d.' => ' Ph.D.',
|
166
177
|
', Ph.D.' => ' Ph.D.',
|
167
178
|
', bro.' => ' Bro.',
|
168
|
-
'
|
169
|
-
' jr,' => ' Jr.;',
|
170
|
-
'-jr' => ' Jr.',
|
171
|
-
'-Jr' => ' Jr.',
|
179
|
+
', Jr.,' => ' Jr.;',
|
172
180
|
', Jr.' => ' Jr.',
|
173
181
|
',Jr.' => ' Jr.',
|
174
182
|
', Sr.' => ' Sr.',
|
175
|
-
',Sr.' => ' Sr.'
|
176
|
-
|
183
|
+
',Sr.' => ' Sr.',
|
184
|
+
' jr.,' => ' Jr.;',
|
185
|
+
' jr,' => ' Jr.;',
|
186
|
+
'-jr' => ' Jr.',
|
187
|
+
'-Jr' => ' Jr.'
|
177
188
|
}
|
178
189
|
|
179
190
|
COMPLEX_SEPARATORS = %r{
|
@@ -238,7 +249,7 @@ module DwcAgent
|
|
238
249
|
(?i:though)|
|
239
250
|
(?i:texas\s+instruments?)\s*?(for)?|
|
240
251
|
(?:tropical)|
|
241
|
-
(?i:toward|seen
|
252
|
+
(?i:toward|seen\s+at)|
|
242
253
|
(?i:unidentified|unspecified|unk?nown?|unnamed|unread|unmistak|no agent)|
|
243
254
|
(?i:urn\:)|
|
244
255
|
(?i:usda|ucla)|
|
@@ -247,29 +258,47 @@ module DwcAgent
|
|
247
258
|
}x
|
248
259
|
|
249
260
|
FAMILY_BLACKLIST = [
|
261
|
+
"ap",
|
250
262
|
"da",
|
263
|
+
"de",
|
251
264
|
"de'",
|
252
265
|
"del",
|
253
266
|
"der",
|
267
|
+
"di",
|
268
|
+
"do",
|
269
|
+
"dos",
|
254
270
|
"du",
|
255
271
|
"el",
|
272
|
+
"le",
|
273
|
+
"la",
|
256
274
|
"van",
|
257
275
|
"von",
|
258
276
|
"the",
|
259
277
|
"of",
|
260
278
|
"adjustment",
|
279
|
+
"annotator",
|
261
280
|
"available",
|
262
281
|
"arachnology",
|
263
282
|
"catalogue",
|
264
283
|
"curators",
|
265
284
|
"data",
|
285
|
+
"details",
|
286
|
+
"determiner",
|
266
287
|
"determination",
|
267
288
|
"dissection",
|
268
289
|
"entered",
|
290
|
+
"erased",
|
291
|
+
"indecipherable",
|
269
292
|
"nomenclatural",
|
270
293
|
"orig",
|
271
294
|
"registration",
|
272
|
-
"science"
|
295
|
+
"science",
|
296
|
+
"wg",
|
297
|
+
"wm",
|
298
|
+
"wn",
|
299
|
+
"zw",
|
300
|
+
"zz",
|
301
|
+
"z-"
|
273
302
|
]
|
274
303
|
|
275
304
|
GIVEN_BLACKLIST = [
|
@@ -277,7 +306,7 @@ module DwcAgent
|
|
277
306
|
"has not"
|
278
307
|
]
|
279
308
|
|
280
|
-
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|
|
309
|
+
TITLE = /\s*\b(sir|count(ess)?|colonel|(gen|adm|col|maj|major|capt|cmdr|lt|sgt|cpl|pvt|proff?|dr|md|ph\.?d|rev|mme|abbé|ptre|bro|esq)\.?|docteur|father|cantor|vicar|père|pastor|rabbi|reverend|pere|soeur|sister|professor)(\s+|$)/i
|
281
310
|
|
282
311
|
APPELLATION = /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
283
312
|
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.0.
|
4
|
+
version: 1.5.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|
@@ -102,7 +102,7 @@ homepage: https://github.com/bionomia/dwc_agent
|
|
102
102
|
licenses:
|
103
103
|
- MIT
|
104
104
|
metadata: {}
|
105
|
-
post_install_message:
|
105
|
+
post_install_message:
|
106
106
|
rdoc_options:
|
107
107
|
- "--encoding"
|
108
108
|
- UTF-8
|
@@ -112,15 +112,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
112
|
requirements:
|
113
113
|
- - "~>"
|
114
114
|
- !ruby/object:Gem::Version
|
115
|
-
version: '2.
|
115
|
+
version: '2.7'
|
116
116
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
117
|
requirements:
|
118
118
|
- - ">="
|
119
119
|
- !ruby/object:Gem::Version
|
120
120
|
version: '0'
|
121
121
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
123
|
-
signing_key:
|
122
|
+
rubygems_version: 3.1.2
|
123
|
+
signing_key:
|
124
124
|
specification_version: 4
|
125
125
|
summary: Parse Darwin Core agent terms such as recordedBy and identifiedBy
|
126
126
|
test_files: []
|