dwc_agent 3.0.0.4 → 3.0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/dwc_agent/cleaner.rb +40 -13
- data/lib/dwc_agent/constants.rb +31 -2
- data/lib/dwc_agent/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2ec2d89d805d35fb64401736263556d13d244407456b909996912d4eb749653
|
4
|
+
data.tar.gz: 1bb39f6a59ae242362266f8a949626cfd1289d194e150463dd2eb650b98a4171
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f005ba01365f8226966aae1e2e70ffcbbc3ffef391eeb319baad0e666c91c8edc0bef2acd1ce98dabf5d6503c063f43341c7da72e4f4b11e370537ec4592408e
|
7
|
+
data.tar.gz: bef79f42b26962c60357c3dba78d15d4b9f8189cc3c31f7034e65f78f6bc41c8b3153083354186869e9ac6b7285aa355a2b62b4c81060091a2ae99c08829ff2a
|
data/lib/dwc_agent/cleaner.rb
CHANGED
@@ -18,11 +18,14 @@ module DwcAgent
|
|
18
18
|
# @return Namae::Name [Object] a new Namae object
|
19
19
|
def clean(parsed_namae)
|
20
20
|
|
21
|
-
if parsed_namae.given &&
|
21
|
+
if parsed_namae.given &&
|
22
|
+
GIVEN_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
|
22
23
|
return Namae::Name.new
|
23
24
|
end
|
24
25
|
|
25
|
-
if parsed_namae.family &&
|
26
|
+
if parsed_namae.family &&
|
27
|
+
parsed_namae.family.length == 3 &&
|
28
|
+
parsed_namae.family.count('.') == 1
|
26
29
|
return Namae::Name.new
|
27
30
|
end
|
28
31
|
|
@@ -30,7 +33,9 @@ module DwcAgent
|
|
30
33
|
return Namae::Name.new
|
31
34
|
end
|
32
35
|
|
33
|
-
if parsed_namae.given &&
|
36
|
+
if parsed_namae.given &&
|
37
|
+
parsed_namae.given.count('.') >= 3 &&
|
38
|
+
/\.\s*[a-zA-Z]{4,}\s+[a-zA-Z]{1,}\./.match(parsed_namae.given)
|
34
39
|
return Namae::Name.new
|
35
40
|
end
|
36
41
|
|
@@ -38,6 +43,13 @@ module DwcAgent
|
|
38
43
|
return Namae::Name.new
|
39
44
|
end
|
40
45
|
|
46
|
+
if parsed_namae.family &&
|
47
|
+
parsed_namae.family.count(".") == 1 &&
|
48
|
+
parsed_namae.family[-1] == "." &&
|
49
|
+
parsed_namae.family.length > 3
|
50
|
+
parsed_namae.family = parsed_namae.family.delete_suffix(".")
|
51
|
+
end
|
52
|
+
|
41
53
|
if parsed_namae.given &&
|
42
54
|
parsed_namae.family &&
|
43
55
|
parsed_namae.family.count(".") > 0 &&
|
@@ -59,6 +71,15 @@ module DwcAgent
|
|
59
71
|
parsed_namae.given = family
|
60
72
|
end
|
61
73
|
|
74
|
+
if !parsed_namae.given &&
|
75
|
+
parsed_namae.particle &&
|
76
|
+
parsed_namae.family &&
|
77
|
+
/^[A-Za-z]{3,}\s+(?:[A-Z]\.\s?){1,}$/.match(parsed_namae.family)
|
78
|
+
matched = /^(?<family>[A-Za-z]{3,})\s+(?<given>([A-Z]\.\s?){1,})$/.match(parsed_namae.family)
|
79
|
+
parsed_namae.family = matched[:family]
|
80
|
+
parsed_namae.given = matched[:given]
|
81
|
+
end
|
82
|
+
|
62
83
|
if parsed_namae.given &&
|
63
84
|
(parsed_namae.given == parsed_namae.given.upcase ||
|
64
85
|
parsed_namae.given == parsed_namae.given.downcase) &&
|
@@ -75,13 +96,21 @@ module DwcAgent
|
|
75
96
|
parsed_namae.given = NameCase(parsed_namae.given)
|
76
97
|
end
|
77
98
|
|
78
|
-
if parsed_namae.family &&
|
99
|
+
if parsed_namae.family &&
|
100
|
+
FAMILY_BLACKLIST.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
|
79
101
|
return Namae::Name.new
|
80
102
|
end
|
81
103
|
|
104
|
+
if parsed_namae.family.nil? &&
|
105
|
+
!parsed_namae.given.nil? &&
|
106
|
+
!parsed_namae.given.include?(".")
|
107
|
+
parsed_namae.family = parsed_namae.given
|
108
|
+
parsed_namae.given = nil
|
109
|
+
end
|
110
|
+
|
82
111
|
parsed_namae.normalize_initials
|
83
112
|
|
84
|
-
family = parsed_namae.family
|
113
|
+
family = parsed_namae.family
|
85
114
|
given = parsed_namae.given.strip rescue nil
|
86
115
|
particle = parsed_namae.particle.strip rescue nil
|
87
116
|
appellation = parsed_namae.appellation.strip rescue nil
|
@@ -92,12 +121,10 @@ module DwcAgent
|
|
92
121
|
given = given.gsub(".", ". ").strip
|
93
122
|
end
|
94
123
|
|
95
|
-
if family.nil? &&
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
if !family.nil? && given.nil? && !particle.nil?
|
124
|
+
if !family.nil? &&
|
125
|
+
given.nil? &&
|
126
|
+
!particle.nil? &&
|
127
|
+
!PARTICLES.include?(particle.downcase)
|
101
128
|
given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
|
102
129
|
particle = nil
|
103
130
|
end
|
@@ -110,8 +137,8 @@ module DwcAgent
|
|
110
137
|
family = NameCase(family)
|
111
138
|
end
|
112
139
|
|
113
|
-
if !family.nil? && family.match(/[A-Z]$/)
|
114
|
-
|
140
|
+
if !family.nil? && family.match(/[A-Z]{1,3}$/)
|
141
|
+
family = NameCase(family.upcase)
|
115
142
|
end
|
116
143
|
|
117
144
|
if given.nil? && !family.nil? && family.match(/^[A-Z]{2}/)
|
data/lib/dwc_agent/constants.rb
CHANGED
@@ -52,10 +52,13 @@ module DwcAgent
|
|
52
52
|
\d+\s+(?i:Nov|Novemb(er|re))\.?\b|
|
53
53
|
\d+\s+(?i:Dec|D(e|é)cemb(er|re))\.?\b|
|
54
54
|
\b[.-–,;:/]?\s*(?i:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Evergreen|Florida|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Portland|Rhode\s+Island|South\s+Carolina|South\s+Dakota|St\s+Petersburg|Tennessee|Texas|Utah|Vermont|Washington|West\s+Virginia|Wisconsin|Wyoming)\s+(?i:State)\s*\b|
|
55
|
-
\b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|
|
55
|
+
\b[.,;:/]?\s*?(?i:Afghanistan|Åland Islands|Albania|Algeria|American Samoa|Andorra|Angola|Anguilla|Antarctica|Antigua and Barbuda|Argentina|Armenia|Aruba|Australia|Azerbaijan|Bahamas|Bahrain|Bangladesh|Barbados|Belarus|Belize|Benin|Bermuda|Bhutan|Bolivia \(Plurinational State of\)|Bonaire, Sint Eustatius and Saba|Bosnia and Herzegovina|Botswana|Bouvet Island|Brazil|British Indian Ocean Territory|Brunei Darussalam|Bulgaria|Burkina Faso|Burundi|Cabo Verde|Cambodia|Cameroon|Canada|Cayman Islands|Central African Republic|Chad|Chile|Christmas Island|Cocos \(Keeling\) Islands|Colombia|Comoros|Congo|Congo \(Democratic Republic of the\)|Cook Islands|Costa Rica|Côte d'Ivoire|Croatia|Cuba|Curaçao|Cyprus|Czechia|Djibouti|Dominica|Dominican Republic|Ecuador|Egypt|El Salvador|Equatorial Guinea|Eritrea|Estonia|Ethiopia|Falkland Islands \(Malvinas\)|Faroe Islands|Fiji|Finland|French Guiana|French Polynesia|French Southern Territories|Gabon|Gambia|Germany|Ghana|Gibraltar|Greece|Greenland|Grenada|Guadeloupe|Guam|Guatemala|Guernsey|Guinea-Bissau|Guyana|Haiti|Heard Island and McDonald Islands|Holy See|Honduras|Hong Kong|Hungary|Iceland|India|Indonesia|Iran \(Islamic Republic of\)|Iraq|Ireland|Isle of Man|Italy|Jamaica|Japan|Jersey|Kazakhstan|Kenya|Kiribati|Korea \(Democratic People\'s Republic of\)|Korea \(Republic of\)|Kuwait|Kyrgyzstan|Lao People\'s Democratic Republic|Latvia|Lebanon|Lesotho|Liberia|Libya|Liechtenstein|Lithuania|Luxembourg|Macao|Macedonia (the former Yugoslav Republic of)|Madagascar|Malawi|Malaysia|Maldives|Malta|Marshall Islands|Martinique|Mauritania|Mauritius|Mayotte|Mexico|Micronesia \(Federated States of\)|Moldova \(Republic of\)|Monaco|Mongolia|Montenegro|Morocco|Mozambique|Myanmar|Namibia|Nauru|Nepal|Netherlands|New Caledonia|New Zealand|Nicaragua|Niger|Nigeria|Niue|Norfolk Island|Northern Mariana Islands|Norway|Oman|Pakistan|Palau|Palestine, State of|Panama|Papua New Guinea|Paraguay|Peru|Philippines|Pitcairn|Poland|Puerto Rico|Qatar|Réunion|Romania|Russian 
Federation|Rwanda|Saint Barthélemy|Saint Helena, Ascension and Tristan da Cunha|Saint Kitts and Nevis|Saint Lucia|Saint Martin \(French part\)|Saint Pierre and Miquelon|Saint Vincent and the Grenadines|Samoa|San Marino|Sao Tome and Principe|Saudi Arabia|Senegal|Serbia|Seychelles|Sierra Leone|Singapore|Sint Maarten \(Dutch part\)|Slovakia|Slovenia|Solomon Islands|Somalia|South Africa|South Georgia and the South Sandwich Islands|South Sudan|Sri Lanka|Sudan|Suriname|Svalbard and Jan Mayen|Swaziland|Sweden|Switzerland|Syrian Arab Republic|Taiwan|Tajikistan|Tanzania, United Republic of|Thailand|Timor-Leste|Togo|Tokelau|Tonga|Trinidad and Tobago|Tunisia|Turkey|Turkmenistan|Turks and Caicos Islands|Tuvalu|Uganda|Ukraine|United Arab Emirates|United Kingdom of Great Britain and Northern Ireland|United States of America|United States Minor Outlying Islands|Uruguay|Uzbekistan|Vanuatu|Venezuela \(Bolivarian Republic of\)|Viet Nam|Virgin Islands \(British\)|Virgin Islands \(U\.S\.\)|Wallis and Futuna|Western Sahara|Yemen|Zambia|Zimbabwe)\b|
|
56
56
|
(?i:autres?\s+de|probab|likely|possibl(e|y)|doubtful)|
|
57
57
|
\b\s*(?i:maybe)\s*\b|
|
58
58
|
\b\s*(?i:prob)\.\s*\b|
|
59
|
+
\b\s*(?i:field\s*number)|
|
60
|
+
\b\s*?(?i:malaise|light|pitfall|pan|suction|lobster|actinic light|cdc|fisherm(a|e)n)\s*(?i:trap)\s*\b|
|
61
|
+
\|\s*(?i:collector\s*(field\s*)?number).*$|
|
59
62
|
\(?[,]?\s*?(?i:(local)?\s?collectors?|data\s*recorder|netter|(oper|prepar)ator)\(?s?\)?\.?\:?|
|
60
63
|
\b[.-–,;:]?\s*(?i:department|faculty)\s*?(?i:of)?\s*?(?i:entomology|biology|zoology)|
|
61
64
|
(?i:Engº|Agrº|Fcº|Drº|Mº|Profº|Dº|Fº)|
|
@@ -237,7 +240,7 @@ module DwcAgent
|
|
237
240
|
(?i:geographic)|
|
238
241
|
(?i:mus(eum|ée)|universit(y|é|e|at)|college|institute?|acad(e|é)m|school|écol(e|iers?)|laboratoi?r|projec?t|polytech|dep(t|art?ment)|research|clinic|hospital|cientifica|sanctuary|safari)|
|
239
242
|
(?i:univ\.)|
|
240
|
-
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|
|
243
|
+
(?i:graduate|student|élèves?|éleveur|étudiants|estudi?antes?|labo\.|storekeep|supervisor|superint|rcmp|coordinator|minority|fisherm(a|e)n|police|taxonomist|consultant|participante?s?|team|(é|e)quipe|memb(er|re)|crew|group|staff|personnel|family|captain|friends|assistant|worker|gamekeeper)|
|
241
244
|
(?i:non\s+pr(é|e)cis(é|e))|
|
242
245
|
(?i:no\s+consta)|
|
243
246
|
(?i:no\s+(agent)?\s?(data|disponible)(\s+available)?)|
|
@@ -290,16 +293,20 @@ module DwcAgent
|
|
290
293
|
"von",
|
291
294
|
"the",
|
292
295
|
"of",
|
296
|
+
"new",
|
297
|
+
"no",
|
293
298
|
"adjustment",
|
294
299
|
"annotator",
|
295
300
|
"available",
|
296
301
|
"arachnology",
|
297
302
|
"catalogue",
|
303
|
+
"comments",
|
298
304
|
"curators",
|
299
305
|
"data",
|
300
306
|
"details",
|
301
307
|
"determiner",
|
302
308
|
"determination",
|
309
|
+
"dissected",
|
303
310
|
"dissection",
|
304
311
|
"entered",
|
305
312
|
"erased",
|
@@ -335,4 +342,26 @@ module DwcAgent
|
|
335
342
|
|
336
343
|
SUFFIX = /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/
|
337
344
|
|
345
|
+
PARTICLES = [
|
346
|
+
"ap",
|
347
|
+
"da",
|
348
|
+
"de",
|
349
|
+
"de'",
|
350
|
+
"del",
|
351
|
+
"der",
|
352
|
+
"des",
|
353
|
+
"di",
|
354
|
+
"do",
|
355
|
+
"dos",
|
356
|
+
"du",
|
357
|
+
"el",
|
358
|
+
"le",
|
359
|
+
"la",
|
360
|
+
"van",
|
361
|
+
"von",
|
362
|
+
"the",
|
363
|
+
"of",
|
364
|
+
"van der"
|
365
|
+
]
|
366
|
+
|
338
367
|
end
|
data/lib/dwc_agent/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc_agent
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.0.4
|
4
|
+
version: 3.0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David P. Shorthouse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: namae
|