dwc_agent 3.0.0.4 → 3.0.0.8
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +40 -13
- data/lib/dwc_agent/constants.rb +31 -2
- data/lib/dwc_agent/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2ec2d89d805d35fb64401736263556d13d244407456b909996912d4eb749653
|
4
|
+
data.tar.gz: 1bb39f6a59ae242362266f8a949626cfd1289d194e150463dd2eb650b98a4171
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f005ba01365f8226966aae1e2e70ffcbbc3ffef391eeb319baad0e666c91c8edc0bef2acd1ce98dabf5d6503c063f43341c7da72e4f4b11e370537ec4592408e
|
7
|
+
data.tar.gz: bef79f42b26962c60357c3dba78d15d4b9f8189cc3c31f7034e65f78f6bc41c8b3153083354186869e9ac6b7285aa355a2b62b4c81060091a2ae99c08829ff2a
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,11 +18,14 @@ module DwcAgent
|
|
18
18
|
# @return Namae::Name [Object] a new Namae object
|
19
19
|
def clean(parsed_namae)
|
20
20
|
|
21
|
-
if parsed_namae.given &&
|
21
|
+
if parsed_namae.given &&
|
22
|
+
GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
22
23
|
return Namae::Name.new
|
23
24
|
end
|
24
25
|
|
25
|
-
if parsed_namae.family &&
|
26
|
+
if parsed_namae.family &&
|
27
|
+
parsed_namae.family.length == 3 &&
|
28
|
+
parsed_namae.family.count('.') == 1
|
26
29
|
return Namae::Name.new
|
27
30
|
end
|
28
31
|
|
@@ -30,7 +33,9 @@ module DwcAgent
|
|
30
33
|
return Namae::Name.new
|
31
34
|
end
|
32
35
|
|
33
|
-
if parsed_namae.given &&
|
36
|
+
if parsed_namae.given &&
|
37
|
+
parsed_namae.given.count('.') >= 3 &&
|
38
|
+
/\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
|
34
39
|
return Namae::Name.new
|
35
40
|
end
|
36
41
|
|
@@ -38,6 +43,13 @@ module DwcAgent
|
|
38
43
|
return Namae::Name.new
|
39
44
|
end
|
40
45
|
|
46
|
+
if parsed_namae.family &&
|
47
|
+
parsed_namae.family.count(".") == 1 &&
|
48
|
+
parsed_namae.family[-1] == "." &&
|
49
|
+
parsed_namae.family.length > 3
|
50
|
+
parsed_namae.family = parsed_namae.family.delete_suffix(".")
|
51
|
+
end
|
52
|
+
|
41
53
|
if parsed_namae.given &&
|
42
54
|
parsed_namae.family &&
|
43
55
|
parsed_namae.family.count(".") > 0 &&
|
@@ -59,6 +71,15 @@ module DwcAgent
|
|
59
71
|
parsed_namae.given = family
|
60
72
|
end
|
61
73
|
|
74
|
+
if !parsed_namae.given &&
|
75
|
+
parsed_namae.particle &&
|
76
|
+
parsed_namae.family &&
|
77
|
+
/^[A-Za-z]{3,}\s+(?:[A-Z]\.\s?){1,}$/.match(parsed_namae.family)
|
78
|
+
matched = /^(?<family>[A-Za-z]{3,})\s+(?<given>([A-Z]\.\s?){1,})$/.match(parsed_namae.family)
|
79
|
+
parsed_namae.family = matched[:family]
|
80
|
+
parsed_namae.given = matched[:given]
|
81
|
+
end
|
82
|
+
|
62
83
|
if parsed_namae.given &&
|
63
84
|
(parsed_namae.given == parsed_namae.given.upcase ||
|
64
85
|
parsed_namae.given == parsed_namae.given.downcase) &&
|
@@ -75,13 +96,21 @@ module DwcAgent
|
|
75
96
|
parsed_namae.given = NameCase(parsed_namae.given)
|
76
97
|
end
|
77
98
|
|
78
|
-
if parsed_namae.family &&
|
99
|
+
if parsed_namae.family &&
|
100
|
+
FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
79
101
|
return Namae::Name.new
|
80
102
|
end
|
81
103
|
|
104
|
+
if parsed_namae.family.nil? &&
|
105
|
+
!parsed_namae.given.nil? &&
|
106
|
+
!parsed_namae.given.include?(".")
|
107
|
+
parsed_namae.family = parsed_namae.given
|
108
|
+
parsed_namae.given = nil
|
109
|
+
end
|
110
|
+
|
82
111
|
parsed_namae.normalize_initials
|
83
112
|
|
84
|
-
family = parsed_namae.family
|
113
|
+
family = parsed_namae.family
|
85
114
|
given = parsed_namae.given.strip rescue nil
|
86
115
|
particle = parsed_namae.particle.strip rescue nil
|
87
116
|
appellation = parsed_namae.appellation.strip rescue nil
|
@@ -92,12 +121,10 @@ module DwcAgent
|
|
92
121
|
given = given.gsub(".", ". ").strip
|
93
122
|
end
|
94
123
|
|
95
|
-
if family.nil? &&
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
if !family.nil? && given.nil? && !particle.nil?
|
124
|
+
if !family.nil? &&
|
125
|
+
given.nil? &&
|
126
|
+
!particle.nil? &&
|
127
|
+
!PARTICLES.include?(particle.downcase)
|
101
128
|
given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
|
102
129
|
particle = nil
|
103
130
|
end
|
@@ -110,8 +137,8 @@ module DwcAgent
|
|
110
137
|
family = NameCase(family)
|
111
138
|
end
|
112
139
|
|
113
|
-
if !family.nil? && family.match(/[A-Z]$/)
|
114
|
-
|
140
|
+
if !family.nil? && family.match(/[A-Z]{1,3}$/)
|
141
|
+
family = NameCase(family.upcase)
|
115
142
|
end
|
116
143
|
|
117
144
|
if given.nil? && !family.nil? && family.match(/^[A-Z]{2}/)
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -52,10 +52,13 @@ module DwcAgent
|
|
52
52
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
53
53
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
54
54
|
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
55
|
-
\b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|
|
55
|
+
\b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Ireland|Isle of Man|Italy|Jamaica|Japan|Jersey|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State of|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Puerto Rico|Qatar|Réunion|Romania|Russian 
Federation|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)\b|
|
56
56
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
57
57
|
\b\s*(?i:maybe)\s*\b|
|
58
58
|
\b\s*(?i:prob)\.\s*\b|
|
59
|
+
\b\s*(?i:field\s*number)|
|
60
|
+
\b\s*?(?i:malaise|light|pitfall|pan|suction|lobster|actinic light|cdc|fisherm(a|e)n)\s*(?i:trap)\s*\b|
|
61
|
+
\|\s*(?i:collector\s*(field\s*)?number).*$|
|
59
62
|
\(?[,]?\s*?(?i:(local)?\s?collectors?|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
60
63
|
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
61
64
|
(?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
|
@@ -237,7 +240,7 @@ module DwcAgent
|
|
237
240
|
(?i:geographic)|
|
238
241
|
(?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
|
239
242
|
(?i:univ\.)|
|
240
|
-
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|
|
243
|
+
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fisherm(a|e)n|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker|gamekeeper)|
|
241
244
|
(?i:non\s+pr(é|e)cis(é|e))|
|
242
245
|
(?i:no\s+consta)|
|
243
246
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
@@ -290,16 +293,20 @@ module DwcAgent
|
|
290
293
|
"von",
|
291
294
|
"the",
|
292
295
|
"of",
|
296
|
+
"new",
|
297
|
+
"no",
|
293
298
|
"adjustment",
|
294
299
|
"annotator",
|
295
300
|
"available",
|
296
301
|
"arachnology",
|
297
302
|
"catalogue",
|
303
|
+
"comments",
|
298
304
|
"curators",
|
299
305
|
"data",
|
300
306
|
"details",
|
301
307
|
"determiner",
|
302
308
|
"determination",
|
309
|
+
"dissected",
|
303
310
|
"dissection",
|
304
311
|
"entered",
|
305
312
|
"erased",
|
@@ -335,4 +342,26 @@ module DwcAgent
|
|
335
342
|
|
336
343
|
SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/
|
337
344
|
|
345
|
+
PARTICLES = [
|
346
|
+
"ap",
|
347
|
+
"da",
|
348
|
+
"de",
|
349
|
+
"de'",
|
350
|
+
"del",
|
351
|
+
"der",
|
352
|
+
"des",
|
353
|
+
"di",
|
354
|
+
"do",
|
355
|
+
"dos",
|
356
|
+
"du",
|
357
|
+
"el",
|
358
|
+
"le",
|
359
|
+
"la",
|
360
|
+
"van",
|
361
|
+
"von",
|
362
|
+
"the",
|
363
|
+
"of",
|
364
|
+
"van der"
|
365
|
+
]
|
366
|
+
|
338
367
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.0.4
|
4
|
+
version: 3.0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|