dwca_hunter 0.7.1 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -1
- data/.ruby-version +1 -1
- data/Gemfile.lock +56 -27
- data/dwca_hunter.gemspec +11 -9
- data/exe/dwcahunter +0 -2
- data/lib/dwca_hunter.rb +9 -7
- data/lib/dwca_hunter/resource.rb +8 -3
- data/lib/dwca_hunter/resources/arctos.rb +42 -45
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +105 -105
- data/lib/dwca_hunter/resources/mammal_divdb.rb +76 -45
- data/lib/dwca_hunter/resources/mcz.rb +1 -1
- data/lib/dwca_hunter/resources/wikispecies.rb +65 -98
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +48 -20
@@ -3,14 +3,14 @@
|
|
3
3
|
module DwcaHunter
|
4
4
|
class ResourceIOCWorldBird < DwcaHunter::Resource
|
5
5
|
def initialize(opts = {})
|
6
|
-
@command =
|
7
|
-
@title =
|
8
|
-
@url =
|
9
|
-
@UUID =
|
6
|
+
@command = 'ioc-world-bird'
|
7
|
+
@title = 'IOC World Bird List'
|
8
|
+
@url = 'https://uofi.box.com/shared/static/znsd734a78saq87hes979p5uspgkzy93.csv'
|
9
|
+
@UUID = '6421ffec-38e3-40fb-a6d9-af27238a47a1'
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
'dwca_hunter',
|
12
|
+
'ioc-bird',
|
13
|
+
'data.csv')
|
14
14
|
@synonyms = []
|
15
15
|
@names = []
|
16
16
|
@vernaculars = []
|
@@ -21,17 +21,17 @@ module DwcaHunter
|
|
21
21
|
end
|
22
22
|
|
23
23
|
# Prints a reminder to check the upstream site for a newer list, then
# fetches @url into @download_path with curl.
def download
  puts 'Downloading cached and converted to csv version.',
       'CHECK FOR NEW VERSION at',
       'https://www.worldbirdnames.org/ioc-lists/master-list-2/',
       'Use libreoffice to convert to csv.'
  `curl -s -L #{@url} -o #{@download_path}`
end
|
30
30
|
|
31
31
|
def unpack; end
|
32
32
|
|
33
33
|
# Entry point for building the archive: logs that extraction has started,
# then calls get_names followed by generate_dwca.
def make_dwca
  DwcaHunter.logger_write(object_id, 'Extracting data')
  get_names
  generate_dwca
end
|
@@ -45,84 +45,84 @@ module DwcaHunter
|
|
45
45
|
|
46
46
|
# Reads data.csv from @download_dir and appends one record per accepted row
# to @names, plus an English vernacular record to @vernaculars when the row
# supplies one.
#
# Order, family, genus and species values are carried forward from the last
# row that supplied them, so a row that leaves those columns blank inherits
# the most recent value. Rows seen while no species is in effect are skipped.
# Taxon ids are generated sequentially as "gn_<n>".
def collect_names
  @names_index = {}
  order = ''
  family = ''
  genus = ''
  species = ''
  count = 0
  # CSV.foreach closes the file when iteration finishes; the earlier
  # CSV.open call left the handle open.
  CSV.foreach(File.join(@download_dir, 'data.csv'), headers: true) do |row|
    order1 = row['Order']
    order = order1.capitalize unless order1.to_s.empty?

    family1 = row['Family (Scientific)']
    family = family1.capitalize unless family1.to_s.empty?

    genus1 = row['Genus']
    genus = genus1.capitalize unless genus1.to_s.empty?

    species1 = row['Species (Scientific)']
    species = species1 unless species1.to_s.empty?

    subspecies = row['Subspecies']
    next if species.to_s.empty?

    count += 1
    taxon_id = "gn_#{count}"
    name = {
      taxon_id: taxon_id,
      kingdom: 'Animalia',
      phylum: 'Chordata',
      klass: 'Aves',
      order: order,
      family: family,
      genus: genus,
      code: 'ICZN'
    }
    if subspecies.to_s.empty?
      auth = row['Authority'].to_s
      auth = DwcaHunter.normalize_authors(auth) unless auth.empty?
      name[:name_string] = clean("#{genus} #{species} #{auth}".strip)
      @names << name
      vernacular = row['Species (English)']
      unless vernacular.to_s.empty?
        @vernaculars << { taxon_id: taxon_id, vern: vernacular, lang: 'en' }
      end
    else
      # The authority is used verbatim here (not passed through
      # DwcaHunter.normalize_authors), as in the original code.
      name[:name_string] = clean(
        "#{genus} #{species} #{subspecies} #{row['Authority']}".strip
      )
      @names << name
    end
    # NOTE(review): species is cleared after every emitted row, so a following
    # row that fills only the Subspecies column is skipped by the guard above.
    # Confirm this matches the layout of the converted CSV.
    species = ''
  end
end
|
108
108
|
|
109
109
|
# Normalizes a name string: removes every dagger (†) character, collapses
# each run of whitespace into a single space, and trims both ends.
#
# The final strip matters because callers strip *before* calling this
# method; removing a leading or trailing dagger would otherwise leave a
# dangling space in the result.
#
# n - the raw name String.
#
# Returns a new String; the argument is not modified.
def clean(n)
  n.gsub('†', '').gsub(/\s+/, ' ').strip
end
|
113
113
|
|
114
114
|
def generate_dwca
|
115
115
|
DwcaHunter.logger_write(object_id,
|
116
|
-
|
117
|
-
@core = [[
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
116
|
+
'Creating DarwinCore Archive file')
|
117
|
+
@core = [['http://rs.tdwg.org/dwc/terms/taxonID',
|
118
|
+
'http://rs.tdwg.org/dwc/terms/scientificName',
|
119
|
+
'http://rs.tdwg.org/dwc/terms/kingdom',
|
120
|
+
'http://rs.tdwg.org/dwc/terms/phylum',
|
121
|
+
'http://rs.tdwg.org/dwc/terms/class',
|
122
|
+
'http://rs.tdwg.org/dwc/terms/order',
|
123
|
+
'http://rs.tdwg.org/dwc/terms/family',
|
124
|
+
'http://rs.tdwg.org/dwc/terms/genus',
|
125
|
+
'http://rs.tdwg.org/dwc/terms/nomenclaturalCode']]
|
126
126
|
@names.each do |n|
|
127
127
|
@core << [n[:taxon_id], n[:name_string],
|
128
128
|
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
@@ -130,12 +130,12 @@ module DwcaHunter
|
|
130
130
|
end
|
131
131
|
@extensions << {
|
132
132
|
data: [[
|
133
|
-
|
134
|
-
|
135
|
-
|
133
|
+
'http://rs.tdwg.org/dwc/terms/taxonID',
|
134
|
+
'http://rs.tdwg.org/dwc/terms/vernacularName',
|
135
|
+
'http://purl.org/dc/terms/language'
|
136
136
|
]],
|
137
|
-
file_name:
|
138
|
-
row_type:
|
137
|
+
file_name: 'vernacular_names.txt',
|
138
|
+
row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
|
139
139
|
}
|
140
140
|
|
141
141
|
@vernaculars.each do |v|
|
@@ -146,53 +146,53 @@ module DwcaHunter
|
|
146
146
|
id: @uuid,
|
147
147
|
title: @title,
|
148
148
|
authors: [
|
149
|
-
{ first_name:
|
150
|
-
last_name:
|
151
|
-
{ first_name:
|
152
|
-
last_name:
|
153
|
-
{ first_name:
|
154
|
-
last_name:
|
155
|
-
{ first_name:
|
156
|
-
last_name:
|
157
|
-
{ first_name:
|
158
|
-
last_name:
|
159
|
-
{ first_name:
|
160
|
-
last_name:
|
161
|
-
{ first_name:
|
162
|
-
last_name:
|
163
|
-
{ first_name:
|
164
|
-
last_name:
|
165
|
-
{ first_name:
|
166
|
-
last_name:
|
167
|
-
{ first_name:
|
168
|
-
last_name:
|
169
|
-
{ first_name:
|
170
|
-
last_name:
|
171
|
-
{ first_name:
|
172
|
-
last_name:
|
173
|
-
{ first_name:
|
174
|
-
last_name:
|
175
|
-
{ first_name:
|
176
|
-
last_name:
|
177
|
-
{ first_name:
|
178
|
-
last_name:
|
179
|
-
{ first_name:
|
180
|
-
last_name:
|
181
|
-
{ first_name:
|
182
|
-
last_name:
|
183
|
-
{ first_name:
|
184
|
-
last_name:
|
185
|
-
{ first_name:
|
186
|
-
last_name:
|
149
|
+
{ first_name: 'Per',
|
150
|
+
last_name: 'Alstrom' },
|
151
|
+
{ first_name: 'Mike',
|
152
|
+
last_name: 'Blair' },
|
153
|
+
{ first_name: 'Rauri',
|
154
|
+
last_name: 'Bowie' },
|
155
|
+
{ first_name: 'Nigel',
|
156
|
+
last_name: 'Redman' },
|
157
|
+
{ first_name: 'Jon',
|
158
|
+
last_name: 'Fjeldsa' },
|
159
|
+
{ first_name: 'Phil',
|
160
|
+
last_name: 'Gregory' },
|
161
|
+
{ first_name: 'Leo',
|
162
|
+
last_name: 'Joseph' },
|
163
|
+
{ first_name: 'Peter',
|
164
|
+
last_name: 'Kovalik' },
|
165
|
+
{ first_name: 'Adolfo',
|
166
|
+
last_name: 'Navarro-Siguenza' },
|
167
|
+
{ first_name: 'David',
|
168
|
+
last_name: 'Parkin' },
|
169
|
+
{ first_name: 'Alan',
|
170
|
+
last_name: 'Peterson' },
|
171
|
+
{ first_name: 'Douglas',
|
172
|
+
last_name: 'Pratt' },
|
173
|
+
{ first_name: 'Pam',
|
174
|
+
last_name: 'Rasmussen' },
|
175
|
+
{ first_name: 'Frank',
|
176
|
+
last_name: 'Rheindt' },
|
177
|
+
{ first_name: 'Robert',
|
178
|
+
last_name: 'Ridgely' },
|
179
|
+
{ first_name: 'Peter',
|
180
|
+
last_name: 'Ryan' },
|
181
|
+
{ first_name: 'George',
|
182
|
+
last_name: 'Sangster' },
|
183
|
+
{ first_name: 'Dick',
|
184
|
+
last_name: 'Schodde' },
|
185
|
+
{ first_name: 'Minturn',
|
186
|
+
last_name: 'Wright' }
|
187
187
|
],
|
188
188
|
metadata_providers: [
|
189
|
-
{ first_name:
|
190
|
-
last_name:
|
191
|
-
email:
|
189
|
+
{ first_name: 'Dmitry',
|
190
|
+
last_name: 'Mozzherin',
|
191
|
+
email: 'dmozzherin@gmail.com' }
|
192
192
|
],
|
193
|
-
abstract:
|
194
|
-
|
195
|
-
url:
|
193
|
+
abstract: 'The IOC World Bird List is an open access resource of ' \
|
194
|
+
'the international community of ornithologists.',
|
195
|
+
url: 'https://www.worldbirdnames.org'
|
196
196
|
}
|
197
197
|
super
|
198
198
|
end
|
@@ -5,12 +5,12 @@ module DwcaHunter
|
|
5
5
|
def initialize(opts = {})
|
6
6
|
@command = "mammal-div-db"
|
7
7
|
@title = "ASM Mammal Diversity Database"
|
8
|
-
@url = "https://mammaldiversity.org/
|
8
|
+
@url = "https://www.mammaldiversity.org/assets/data/MDD.zip"
|
9
9
|
@UUID = "94270cdd-5424-4bb1-8324-46ccc5386dc7"
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
11
|
"dwca_hunter",
|
12
12
|
"mammal-div-db",
|
13
|
-
"data.
|
13
|
+
"data.zip")
|
14
14
|
@synonyms = []
|
15
15
|
@names = []
|
16
16
|
@vernaculars = []
|
@@ -25,7 +25,9 @@ module DwcaHunter
|
|
25
25
|
`curl '#{@url}' -H 'User-Agent:' -o #{@download_path}`
|
26
26
|
end
|
27
27
|
|
28
|
-
# Unpacking step for this resource; delegates to unpack_zip, which is
# defined outside this file's visible section.
def unpack
  unpack_zip
end
|
29
31
|
|
30
32
|
def make_dwca
|
31
33
|
DwcaHunter.logger_write(object_id, "Extracting data")
|
@@ -40,49 +42,78 @@ module DwcaHunter
|
|
40
42
|
collect_names
|
41
43
|
end
|
42
44
|
|
45
|
+
# Returns the file name (not the full path) of the first entry in
# @download_dir whose name ends with ".csv". Entries are examined in sorted
# order so the choice is deterministic when several CSV files are present.
#
# Unlike the earlier version this does not change the process working
# directory, and it no longer falls through to returning the whole directory
# listing when nothing matches.
#
# Raises RuntimeError if the directory contains no ".csv" entry.
def find_csv_file
  csv_name = Dir.children(@download_dir).sort.find { |f| f.end_with?(".csv") }
  return csv_name if csv_name

  raise "No .csv file found in #{@download_dir}"
end
|
51
|
+
|
52
|
+
# Builds the display name and rank for one CSV row.
#
# row - a CSV::Row (or Hash) with "sciName", "authoritySpeciesAuthor",
#       "authoritySpeciesYear" and "authorityParentheses" entries.
#
# Underscores in "sciName" are turned into spaces. The authorship is
# "<author> <year>", wrapped in parentheses when "authorityParentheses" is 1.
# The flag is compared as a string because CSV parsing without converters
# yields strings, so the previous `== 1` comparison could never be true.
# The year column was previously looked up under a misspelled key
# ("aurhoritySpeciesYear") and was therefore always blank.
#
# Returns a two-element Array: [rank, name], where rank is "subspecies" when
# the name has more than two words and "species" otherwise.
def assemble_name(row)
  name = row["sciName"].tr("_", " ")
  auth = "#{row['authoritySpeciesAuthor']} #{row['authoritySpeciesYear']}".strip
  in_parens = row["authorityParentheses"].to_s.strip == "1"
  # Never emit an empty "()" when the row carries no authorship at all.
  auth = "(#{auth})" if in_parens && !auth.empty?
  rank = name.split.size > 2 ? "subspecies" : "species"
  [rank, "#{name} #{auth}".strip]
end
|
62
|
+
|
63
|
+
# Builds a synonym record from the row's original name combination.
#
# row - a CSV::Row (or Hash) with "id", "originalNameCombination",
#       "authoritySpeciesAuthor" and "authoritySpeciesYear" entries.
#
# Underscores in the combination become spaces and "<author> <year>" is
# appended. The year was previously read from a misspelled key
# ("aurhoritySpeciesYear") and so was always missing from the result.
#
# Returns a Hash with :taxon_id (the row's "id"), :name_string and
# :status ("synonym").
def assemble_synonym(row)
  combination = row["originalNameCombination"].tr("_", " ")
  auth = "#{row['authoritySpeciesAuthor']} #{row['authoritySpeciesYear']}".strip
  {
    taxon_id: row["id"],
    name_string: "#{combination} #{auth}".strip,
    status: "synonym"
  }
end
|
70
|
+
|
71
|
+
# Collects the English common names for one CSV row.
#
# row - a CSV::Row (or Hash) with "id", "mainCommonName" and
#       "otherCommonNames" entries; "otherCommonNames" is split on "|".
#
# Each name is trimmed; blank entries (for example from "a||b" or a trailing
# "|") and repeats are dropped so they do not become empty or duplicate
# vernacular records. The main common name, when present, comes first.
#
# Returns an Array of Hashes shaped { taxon_id:, vern:, lang: "en" }; empty
# when the row has no common names.
def vernaculars(row)
  id = row["id"]
  candidates = [row["mainCommonName"].to_s] + row["otherCommonNames"].to_s.split("|")
  candidates.map(&:strip).reject(&:empty?).uniq.map do |v|
    { taxon_id: id, vern: v, lang: "en" }
  end
end
|
85
|
+
|
43
86
|
# Reads the CSV file located by find_csv_file in @download_dir and, for each
# row, appends:
#   * a name record to @names (built with assemble_name),
#   * a synonym record to @synonyms when "originalNameCombination" is set,
#   * every record returned by vernaculars(row) to @vernaculars.
#
# Order, family and genus are capitalized; a blank value or one containing
# "incertae" is stored as nil. The match is case-insensitive because the
# value has already been capitalized, so a case-sensitive /incertae/ could
# not match a value that starts with that word ("Incertae sedis").
def collect_names
  @names_index = {}
  # Capitalizes a classification value, mapping blank/"incertae" to nil.
  normalize = lambda do |raw|
    value = raw.to_s.capitalize
    value.empty? || value.match?(/incertae/i) ? nil : value
  end

  csv_path = File.join(@download_dir, find_csv_file)
  # CSV.foreach closes the file when iteration finishes; the earlier
  # CSV.open call left the handle open.
  CSV.foreach(csv_path, headers: true) do |row|
    rank, name_string = assemble_name(row)
    @names << {
      taxon_id: row["id"],
      kingdom: "Animalia",
      phylum: "Chordata",
      klass: "Mammalia",
      order: normalize.call(row["order"]),
      family: normalize.call(row["family"]),
      genus: normalize.call(row["genus"]),
      name_string: name_string,
      rank: rank,
      code: "ICZN"
    }
    unless row["originalNameCombination"].to_s.empty?
      @synonyms << assemble_synonym(row)
    end
    @vernaculars.concat(vernaculars(row))
  end
end
|
87
118
|
|
88
119
|
def generate_dwca
|
@@ -96,11 +127,12 @@ module DwcaHunter
|
|
96
127
|
"http://rs.tdwg.org/dwc/terms/order",
|
97
128
|
"http://rs.tdwg.org/dwc/terms/family",
|
98
129
|
"http://rs.tdwg.org/dwc/terms/genus",
|
130
|
+
"http://rs.tdwg.org/dwc/terms/taxonRank",
|
99
131
|
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
100
132
|
@names.each do |n|
|
101
133
|
@core << [n[:taxon_id], n[:name_string],
|
102
134
|
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
103
|
-
n[:genus], n[:code]]
|
135
|
+
n[:genus], n[:rank], n[:code]]
|
104
136
|
end
|
105
137
|
@extensions << {
|
106
138
|
data: [[
|
@@ -133,23 +165,22 @@ module DwcaHunter
|
|
133
165
|
authors: [
|
134
166
|
{ first_name: "C. J.",
|
135
167
|
last_name: "Burgin" },
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
168
|
+
{ first_name: "J. P.",
|
169
|
+
last_name: "Colella" },
|
170
|
+
{ first_name: "P. L.",
|
171
|
+
last_name: "Kahn" },
|
172
|
+
{ first_name: "N. S.",
|
173
|
+
last_name: "Upham" }
|
142
174
|
],
|
143
175
|
metadata_providers: [
|
144
176
|
{ first_name: "Dmitry",
|
145
177
|
last_name: "Mozzherin",
|
146
178
|
email: "dmozzherin@gmail.com" }
|
147
179
|
],
|
148
|
-
abstract: "Mammal Diversity Database.
|
149
|
-
"American Society of Mammalogists. Accessed
|
150
|
-
url: @url
|
180
|
+
abstract: "Mammal Diversity Database. 2021. www.mammaldiversity.org. " \
|
181
|
+
"American Society of Mammalogists. Accessed 2021-01-28.", url: @url
|
151
182
|
}
|
152
183
|
super
|
184
|
+
end
|
153
185
|
end
|
154
186
|
end
|
155
|
-
end
|