dwca_hunter 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -1
- data/.ruby-version +1 -1
- data/Gemfile.lock +56 -27
- data/dwca_hunter.gemspec +11 -9
- data/exe/dwcahunter +0 -2
- data/lib/dwca_hunter.rb +9 -7
- data/lib/dwca_hunter/resource.rb +8 -3
- data/lib/dwca_hunter/resources/arctos.rb +42 -45
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +105 -105
- data/lib/dwca_hunter/resources/mammal_divdb.rb +76 -45
- data/lib/dwca_hunter/resources/mcz.rb +1 -1
- data/lib/dwca_hunter/resources/wikispecies.rb +65 -98
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +48 -20
@@ -3,14 +3,14 @@
|
|
3
3
|
module DwcaHunter
|
4
4
|
class ResourceIOCWorldBird < DwcaHunter::Resource
|
5
5
|
def initialize(opts = {})
|
6
|
-
@command =
|
7
|
-
@title =
|
8
|
-
@url =
|
9
|
-
@UUID =
|
6
|
+
@command = 'ioc-world-bird'
|
7
|
+
@title = 'IOC World Bird List'
|
8
|
+
@url = 'https://uofi.box.com/shared/static/znsd734a78saq87hes979p5uspgkzy93.csv'
|
9
|
+
@UUID = '6421ffec-38e3-40fb-a6d9-af27238a47a1'
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
'dwca_hunter',
|
12
|
+
'ioc-bird',
|
13
|
+
'data.csv')
|
14
14
|
@synonyms = []
|
15
15
|
@names = []
|
16
16
|
@vernaculars = []
|
@@ -21,17 +21,17 @@ module DwcaHunter
|
|
21
21
|
end
|
22
22
|
|
23
23
|
def download
|
24
|
-
puts
|
25
|
-
puts
|
26
|
-
puts
|
27
|
-
puts
|
24
|
+
puts 'Downloading cached and converted to csv version.'
|
25
|
+
puts 'CHECK FOR NEW VERSION at'
|
26
|
+
puts 'https://www.worldbirdnames.org/ioc-lists/master-list-2/'
|
27
|
+
puts 'Use libreoffice to convert to csv.'
|
28
28
|
`curl -s -L #{@url} -o #{@download_path}`
|
29
29
|
end
|
30
30
|
|
31
31
|
def unpack; end
|
32
32
|
|
33
33
|
def make_dwca
|
34
|
-
DwcaHunter.logger_write(object_id,
|
34
|
+
DwcaHunter.logger_write(object_id, 'Extracting data')
|
35
35
|
get_names
|
36
36
|
generate_dwca
|
37
37
|
end
|
@@ -45,84 +45,84 @@ module DwcaHunter
|
|
45
45
|
|
46
46
|
def collect_names
|
47
47
|
@names_index = {}
|
48
|
-
file = CSV.open(File.join(@download_dir,
|
48
|
+
file = CSV.open(File.join(@download_dir, 'data.csv'),
|
49
49
|
headers: true)
|
50
|
-
order =
|
51
|
-
family =
|
52
|
-
genus =
|
53
|
-
species =
|
50
|
+
order = ''
|
51
|
+
family = ''
|
52
|
+
genus = ''
|
53
|
+
species = ''
|
54
54
|
count = 0
|
55
55
|
file.each do |row|
|
56
|
-
order1 = row[
|
57
|
-
order = order1.capitalize if order1.to_s !=
|
56
|
+
order1 = row['Order']
|
57
|
+
order = order1.capitalize if order1.to_s != ''
|
58
58
|
|
59
|
-
family1 = row[
|
60
|
-
family = family1.capitalize if family1.to_s !=
|
59
|
+
family1 = row['Family (Scientific)']
|
60
|
+
family = family1.capitalize if family1.to_s != ''
|
61
61
|
|
62
|
-
genus1 = row[
|
63
|
-
genus = genus1.capitalize if genus1.to_s !=
|
62
|
+
genus1 = row['Genus']
|
63
|
+
genus = genus1.capitalize if genus1.to_s != ''
|
64
64
|
|
65
|
-
species1 = row[
|
66
|
-
species = species1 if species1.to_s !=
|
65
|
+
species1 = row['Species (Scientific)']
|
66
|
+
species = species1 if species1.to_s != ''
|
67
67
|
|
68
|
-
subspecies = row[
|
69
|
-
next if species.to_s ==
|
68
|
+
subspecies = row['Subspecies']
|
69
|
+
next if species.to_s == ''
|
70
70
|
|
71
71
|
count += 1
|
72
72
|
taxon_id = "gn_#{count}"
|
73
73
|
name = {
|
74
74
|
taxon_id: taxon_id,
|
75
|
-
kingdom:
|
76
|
-
phylum:
|
77
|
-
klass:
|
75
|
+
kingdom: 'Animalia',
|
76
|
+
phylum: 'Chordata',
|
77
|
+
klass: 'Aves',
|
78
78
|
order: order,
|
79
79
|
family: family,
|
80
80
|
genus: genus,
|
81
|
-
code:
|
81
|
+
code: 'ICZN'
|
82
82
|
}
|
83
|
-
if subspecies.to_s ==
|
84
|
-
auth = row[
|
85
|
-
auth = DwcaHunter.normalize_authors(auth) if auth !=
|
83
|
+
if subspecies.to_s == ''
|
84
|
+
auth = row['Authority'].to_s
|
85
|
+
auth = DwcaHunter.normalize_authors(auth) if auth != ''
|
86
86
|
name[:name_string] = clean(
|
87
|
-
"#{genus} #{species} #{auth}"
|
88
|
-
strip
|
87
|
+
"#{genus} #{species} #{auth}"
|
88
|
+
.strip
|
89
89
|
)
|
90
90
|
@names << name
|
91
|
-
vernacular = row[
|
92
|
-
if vernacular.to_s !=
|
93
|
-
vernaclar = { taxon_id: taxon_id, vern: vernacular, lang:
|
91
|
+
vernacular = row['Species (English)']
|
92
|
+
if vernacular.to_s != ''
|
93
|
+
vernaclar = { taxon_id: taxon_id, vern: vernacular, lang: 'en' }
|
94
94
|
@vernaculars << vernaclar
|
95
95
|
end
|
96
|
-
species =
|
96
|
+
species = ''
|
97
97
|
else
|
98
98
|
name[:name_string] = clean(
|
99
|
-
"#{genus} #{species} #{subspecies} #{row['Authority']}"
|
100
|
-
strip
|
99
|
+
"#{genus} #{species} #{subspecies} #{row['Authority']}"
|
100
|
+
.strip
|
101
101
|
)
|
102
102
|
@names << name
|
103
|
-
species =
|
104
|
-
subspecies =
|
103
|
+
species = ''
|
104
|
+
subspecies = ''
|
105
105
|
end
|
106
106
|
end
|
107
107
|
end
|
108
108
|
|
109
109
|
def clean(n)
|
110
|
-
n = n.gsub(/†/,
|
111
|
-
n.gsub(/\s+/,
|
110
|
+
n = n.gsub(/†/, '')
|
111
|
+
n.gsub(/\s+/, ' ')
|
112
112
|
end
|
113
113
|
|
114
114
|
def generate_dwca
|
115
115
|
DwcaHunter.logger_write(object_id,
|
116
|
-
|
117
|
-
@core = [[
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
116
|
+
'Creating DarwinCore Archive file')
|
117
|
+
@core = [['http://rs.tdwg.org/dwc/terms/taxonID',
|
118
|
+
'http://rs.tdwg.org/dwc/terms/scientificName',
|
119
|
+
'http://rs.tdwg.org/dwc/terms/kingdom',
|
120
|
+
'http://rs.tdwg.org/dwc/terms/phylum',
|
121
|
+
'http://rs.tdwg.org/dwc/terms/class',
|
122
|
+
'http://rs.tdwg.org/dwc/terms/order',
|
123
|
+
'http://rs.tdwg.org/dwc/terms/family',
|
124
|
+
'http://rs.tdwg.org/dwc/terms/genus',
|
125
|
+
'http://rs.tdwg.org/dwc/terms/nomenclaturalCode']]
|
126
126
|
@names.each do |n|
|
127
127
|
@core << [n[:taxon_id], n[:name_string],
|
128
128
|
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
@@ -130,12 +130,12 @@ module DwcaHunter
|
|
130
130
|
end
|
131
131
|
@extensions << {
|
132
132
|
data: [[
|
133
|
-
|
134
|
-
|
135
|
-
|
133
|
+
'http://rs.tdwg.org/dwc/terms/taxonID',
|
134
|
+
'http://rs.tdwg.org/dwc/terms/vernacularName',
|
135
|
+
'http://purl.org/dc/terms/language'
|
136
136
|
]],
|
137
|
-
file_name:
|
138
|
-
row_type:
|
137
|
+
file_name: 'vernacular_names.txt',
|
138
|
+
row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
|
139
139
|
}
|
140
140
|
|
141
141
|
@vernaculars.each do |v|
|
@@ -146,53 +146,53 @@ module DwcaHunter
|
|
146
146
|
id: @uuid,
|
147
147
|
title: @title,
|
148
148
|
authors: [
|
149
|
-
{ first_name:
|
150
|
-
last_name:
|
151
|
-
{ first_name:
|
152
|
-
last_name:
|
153
|
-
{ first_name:
|
154
|
-
last_name:
|
155
|
-
{ first_name:
|
156
|
-
last_name:
|
157
|
-
{ first_name:
|
158
|
-
last_name:
|
159
|
-
{ first_name:
|
160
|
-
last_name:
|
161
|
-
{ first_name:
|
162
|
-
last_name:
|
163
|
-
{ first_name:
|
164
|
-
last_name:
|
165
|
-
{ first_name:
|
166
|
-
last_name:
|
167
|
-
{ first_name:
|
168
|
-
last_name:
|
169
|
-
{ first_name:
|
170
|
-
last_name:
|
171
|
-
{ first_name:
|
172
|
-
last_name:
|
173
|
-
{ first_name:
|
174
|
-
last_name:
|
175
|
-
{ first_name:
|
176
|
-
last_name:
|
177
|
-
{ first_name:
|
178
|
-
last_name:
|
179
|
-
{ first_name:
|
180
|
-
last_name:
|
181
|
-
{ first_name:
|
182
|
-
last_name:
|
183
|
-
{ first_name:
|
184
|
-
last_name:
|
185
|
-
{ first_name:
|
186
|
-
last_name:
|
149
|
+
{ first_name: 'Per',
|
150
|
+
last_name: 'Alstrom' },
|
151
|
+
{ first_name: 'Mike',
|
152
|
+
last_name: 'Blair' },
|
153
|
+
{ first_name: 'Rauri',
|
154
|
+
last_name: 'Bowie' },
|
155
|
+
{ first_name: 'Nigel',
|
156
|
+
last_name: 'Redman' },
|
157
|
+
{ first_name: 'Jon',
|
158
|
+
last_name: 'Fjeldsa' },
|
159
|
+
{ first_name: 'Phil',
|
160
|
+
last_name: 'Gregory' },
|
161
|
+
{ first_name: 'Leo',
|
162
|
+
last_name: 'Joseph' },
|
163
|
+
{ first_name: 'Peter',
|
164
|
+
last_name: 'Kovalik' },
|
165
|
+
{ first_name: 'Adolfo',
|
166
|
+
last_name: 'Navarro-Siguenza' },
|
167
|
+
{ first_name: 'David',
|
168
|
+
last_name: 'Parkin' },
|
169
|
+
{ first_name: 'Alan',
|
170
|
+
last_name: 'Peterson' },
|
171
|
+
{ first_name: 'Douglas',
|
172
|
+
last_name: 'Pratt' },
|
173
|
+
{ first_name: 'Pam',
|
174
|
+
last_name: 'Rasmussen' },
|
175
|
+
{ first_name: 'Frank',
|
176
|
+
last_name: 'Rheindt' },
|
177
|
+
{ first_name: 'Robert',
|
178
|
+
last_name: 'Ridgely' },
|
179
|
+
{ first_name: 'Peter',
|
180
|
+
last_name: 'Ryan' },
|
181
|
+
{ first_name: 'George',
|
182
|
+
last_name: 'Sangster' },
|
183
|
+
{ first_name: 'Dick',
|
184
|
+
last_name: 'Schodde' },
|
185
|
+
{ first_name: 'Minturn',
|
186
|
+
last_name: 'Wright' }
|
187
187
|
],
|
188
188
|
metadata_providers: [
|
189
|
-
{ first_name:
|
190
|
-
last_name:
|
191
|
-
email:
|
189
|
+
{ first_name: 'Dmitry',
|
190
|
+
last_name: 'Mozzherin',
|
191
|
+
email: 'dmozzherin@gmail.com' }
|
192
192
|
],
|
193
|
-
abstract:
|
194
|
-
|
195
|
-
url:
|
193
|
+
abstract: 'The IOC World Bird List is an open access resource of ' \
|
194
|
+
'the international community of ornithologists.',
|
195
|
+
url: 'https://www.worldbirdnames.org'
|
196
196
|
}
|
197
197
|
super
|
198
198
|
end
|
@@ -5,12 +5,12 @@ module DwcaHunter
|
|
5
5
|
def initialize(opts = {})
|
6
6
|
@command = "mammal-div-db"
|
7
7
|
@title = "ASM Mammal Diversity Database"
|
8
|
-
@url = "https://mammaldiversity.org/
|
8
|
+
@url = "https://www.mammaldiversity.org/assets/data/MDD.zip"
|
9
9
|
@UUID = "94270cdd-5424-4bb1-8324-46ccc5386dc7"
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
11
|
"dwca_hunter",
|
12
12
|
"mammal-div-db",
|
13
|
-
"data.
|
13
|
+
"data.zip")
|
14
14
|
@synonyms = []
|
15
15
|
@names = []
|
16
16
|
@vernaculars = []
|
@@ -25,7 +25,9 @@ module DwcaHunter
|
|
25
25
|
`curl '#{@url}' -H 'User-Agent:' -o #{@download_path}`
|
26
26
|
end
|
27
27
|
|
28
|
-
def unpack
|
28
|
+
def unpack
|
29
|
+
unpack_zip
|
30
|
+
end
|
29
31
|
|
30
32
|
def make_dwca
|
31
33
|
DwcaHunter.logger_write(object_id, "Extracting data")
|
@@ -40,49 +42,78 @@ module DwcaHunter
|
|
40
42
|
collect_names
|
41
43
|
end
|
42
44
|
|
45
|
+
def find_csv_file
|
46
|
+
Dir.chdir(@download_dir)
|
47
|
+
Dir.entries(".").each do |f|
|
48
|
+
return f if f[-4..-1] == ".csv"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def assemble_name(row)
|
53
|
+
name = row["sciName"].gsub("_", " ")
|
54
|
+
auth = "#{row['authoritySpeciesAuthor']} #{row['aurhoritySpeciesYear']}".
|
55
|
+
strip
|
56
|
+
auth = "(#{auth})" if row["authorityParentheses"] == 1
|
57
|
+
rank = "species"
|
58
|
+
rank = "subspecies" if (name.split(" ").size > 2)
|
59
|
+
name = "#{name} #{auth}".strip
|
60
|
+
[rank, name]
|
61
|
+
end
|
62
|
+
|
63
|
+
def assemble_synonym(row)
|
64
|
+
name = row["originalNameCombination"].gsub("_", " ")
|
65
|
+
auth = "#{row['authoritySpeciesAuthor']} #{row['aurhoritySpeciesYear']}".
|
66
|
+
strip
|
67
|
+
name = "#{name} #{auth}".strip
|
68
|
+
{ taxon_id: row["id"], name_string: name, status: "synonym" }
|
69
|
+
end
|
70
|
+
|
71
|
+
def vernaculars(row)
|
72
|
+
id = row["id"]
|
73
|
+
res = []
|
74
|
+
vern = row["mainCommonName"].to_s
|
75
|
+
res << vern if vern != ""
|
76
|
+
verns = row["otherCommonNames"].to_s
|
77
|
+
if verns != ""
|
78
|
+
verns = verns.split("|")
|
79
|
+
res += verns
|
80
|
+
end
|
81
|
+
res.map do |v|
|
82
|
+
{ taxon_id: id, vern: v, lang: "en" }
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
43
86
|
def collect_names
|
44
87
|
@names_index = {}
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
taxon_id: e[:internal_id],
|
88
|
+
file = CSV.open(File.join(@download_dir, find_csv_file),
|
89
|
+
headers: true)
|
90
|
+
file.each do |row|
|
91
|
+
order = row["order"].to_s.capitalize
|
92
|
+
order = nil if order.match(/incertae/) || order.empty?
|
93
|
+
family = row["family"].to_s.capitalize
|
94
|
+
family = nil if family.match(/incertae/) || family.empty?
|
95
|
+
genus = row["genus"].to_s.capitalize
|
96
|
+
genus = nil if genus.match(/incertae/) || genus.empty?
|
97
|
+
rank, name_string = assemble_name(row)
|
98
|
+
@names << {
|
99
|
+
taxon_id: row["id"],
|
58
100
|
kingdom: "Animalia",
|
59
101
|
phylum: "Chordata",
|
60
102
|
klass: "Mammalia",
|
61
103
|
order: order,
|
62
104
|
family: family,
|
63
105
|
genus: genus,
|
64
|
-
name_string:
|
65
|
-
|
66
|
-
rank: e[:dwc][:taxonRank],
|
67
|
-
status: e[:dwc][:taxonRank],
|
106
|
+
name_string: name_string,
|
107
|
+
rank: rank,
|
68
108
|
code: "ICZN"
|
69
109
|
}
|
70
|
-
if
|
71
|
-
@
|
72
|
-
|
73
|
-
|
110
|
+
if row["originalNameCombination"].to_s != ""
|
111
|
+
@synonyms << assemble_synonym(row)
|
112
|
+
end
|
113
|
+
vernaculars(row).each do |vern|
|
114
|
+
@vernaculars << vern
|
74
115
|
end
|
75
|
-
vern = e[:dwc][:vernacularName]
|
76
|
-
next unless vern.to_s != ""
|
77
|
-
vern = decoder.decode(vern)
|
78
|
-
vernacular = {
|
79
|
-
taxon_id: e[:id],
|
80
|
-
vern: vern,
|
81
|
-
lang: "en"
|
82
|
-
}
|
83
|
-
@vernaculars << vernacular
|
84
116
|
end
|
85
|
-
puts data[:result].size
|
86
117
|
end
|
87
118
|
|
88
119
|
def generate_dwca
|
@@ -96,11 +127,12 @@ module DwcaHunter
|
|
96
127
|
"http://rs.tdwg.org/dwc/terms/order",
|
97
128
|
"http://rs.tdwg.org/dwc/terms/family",
|
98
129
|
"http://rs.tdwg.org/dwc/terms/genus",
|
130
|
+
"http://rs.tdwg.org/dwc/terms/taxonRank",
|
99
131
|
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
100
132
|
@names.each do |n|
|
101
133
|
@core << [n[:taxon_id], n[:name_string],
|
102
134
|
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
103
|
-
n[:genus], n[:code]]
|
135
|
+
n[:genus], n[:rank], n[:code]]
|
104
136
|
end
|
105
137
|
@extensions << {
|
106
138
|
data: [[
|
@@ -133,23 +165,22 @@ module DwcaHunter
|
|
133
165
|
authors: [
|
134
166
|
{ first_name: "C. J.",
|
135
167
|
last_name: "Burgin" },
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
168
|
+
{ first_name: "J. P.",
|
169
|
+
last_name: "Colella" },
|
170
|
+
{ first_name: "P. L.",
|
171
|
+
last_name: "Kahn" },
|
172
|
+
{ first_name: "N. S.",
|
173
|
+
last_name: "Upham" }
|
142
174
|
],
|
143
175
|
metadata_providers: [
|
144
176
|
{ first_name: "Dmitry",
|
145
177
|
last_name: "Mozzherin",
|
146
178
|
email: "dmozzherin@gmail.com" }
|
147
179
|
],
|
148
|
-
abstract: "Mammal Diversity Database.
|
149
|
-
"American Society of Mammalogists. Accessed
|
150
|
-
url: @url
|
180
|
+
abstract: "Mammal Diversity Database. 2021. www.mammaldiversity.org. " \
|
181
|
+
"American Society of Mammalogists. Accessed 2021-01-28.", url: @url
|
151
182
|
}
|
152
183
|
super
|
184
|
+
end
|
153
185
|
end
|
154
186
|
end
|
155
|
-
end
|