dwca_hunter 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.byebug_history +45 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +61 -83
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +9 -9
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +39 -8
- data/lib/dwca_hunter/resource.rb +5 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +121 -145
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/eol.rb +85 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ipni.rb +111 -0
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +46 -40
- data/files/birdlife_7.csv +0 -11862
- data/files/fishbase_taxon_cache.tsv +0 -81000
- data/files/reptile_checklist_2014_12.csv +0 -15158
- data/files/species-black.txt +0 -251
@@ -1,30 +1,36 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceArctos < DwcaHunter::Resource
|
4
|
-
|
5
5
|
# Configures the Arctos resource (command name, title, source URL, UUID,
# download location) and the empty collections filled during harvesting,
# then hands +opts+ to the parent initializer.
def initialize(opts = {})
  # Resource identity.
  @command = "arctos"
  @title = "Arctos"
  @url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
  @UUID = "eea8315d-a244-4625-859a-226675622312"

  # Where the downloaded archive is stored.
  @download_path = File.join(Dir.tmpdir, "dwca_hunter", "arctos", "data.zip")

  # Flat row collections written into the archive.
  @synonyms = []
  @names = []
  @vernaculars = []
  @extensions = []

  # Lookups keyed by scientific name, filled before names are collected.
  @synonyms_hash = {}
  @vernaculars_hash = {}

  super(opts)
end
|
22
|
+
|
23
|
+
# Downloads the cached Arctos archive from @url into @download_path
# using curl (silent, following redirects).
#
# Returns whatever curl wrote to standard output (normally an empty
# string, since the payload goes to the -o target).
def download
  puts "Downloading cached version of the file. Ask Arctos to generate new."
  # Pass curl an argument vector instead of a shell command line: the URL
  # contains "?" (a shell glob character) and the temp path may contain
  # spaces, both of which are unsafe when interpolated unquoted.
  IO.popen(["curl", "-s", "-L", @url, "-o", @download_path], &:read)
end
|
21
27
|
|
22
28
|
# Unpacks the downloaded archive by delegating to unpack_tar
# (defined outside this method).
def unpack
  unpack_tar
end
|
25
31
|
|
26
32
|
# Entry point for building the archive: logs the start of extraction,
# collects the data from the downloaded files, then generates the
# Darwin Core Archive.
def make_dwca
  DwcaHunter.logger_write(object_id, "Extracting data")

  get_names
  generate_dwca
end
|
@@ -33,190 +39,160 @@ module DwcaHunter
|
|
33
39
|
|
34
40
|
# Switches into the download directory and loads all CSV data.
def get_names
  Dir.chdir(@download_dir)

  # Order matters: collect_names attaches synonyms and vernaculars to each
  # taxon through the lookup hashes, so those must be filled first.
  collect_synonyms
  collect_vernaculars
  collect_names
end
|
43
46
|
|
44
47
|
# Reads common_name.csv from @download_dir and groups the COMMON_NAME
# values by SCIENTIFIC_NAME in @vernaculars_hash
# (scientific name => array of common names).
def collect_vernaculars
  path = File.join(@download_dir, "common_name.csv")
  # The block form of CSV.open closes the file when the block exits;
  # previously the handle was left open.
  CSV.open(path, headers: true) do |csv|
    csv.each_with_index do |row, i|
      canonical = row["SCIENTIFIC_NAME"]
      (@vernaculars_hash[canonical] ||= []) << row["COMMON_NAME"]

      # Progress indicator (also fires for the first row, i == 0).
      puts "Processed %s vernaculars" % i if i % 10_000 == 0
    end
  end
end
|
67
63
|
|
68
64
|
# Reads relationships.csv from @download_dir and groups related names by
# scientific name in @synonyms_hash
# (scientific name => array of { name_string:, status: } hashes).
def collect_synonyms
  path = File.join(@download_dir, "relationships.csv")
  # The block form of CSV.open closes the file when the block exits;
  # previously the handle was left open.
  CSV.open(path, headers: true) do |csv|
    csv.each_with_index do |row, i|
      # NOTE(review): header names mix lower case (scientific_name,
      # related_name) and upper case (TAXON_RELATIONSHIP); kept as found,
      # confirm against the actual file.
      synonym = { name_string: row["related_name"],
                  status: row["TAXON_RELATIONSHIP"] }
      (@synonyms_hash[row["scientific_name"]] ||= []) << synonym

      # Progress indicator (also fires for the first row, i == 0).
      puts "Processed %s synonyms" % i if i % 10_000 == 0
    end
  end
end
|
89
80
|
|
90
81
|
# Reads classification.csv from @download_dir and appends one entry per
# row to @names, then links any vernaculars and synonyms recorded for
# the row's scientific name to the generated taxon id.
#
# Rows without a display_name are skipped. Taxon ids are "ARCT_<n>",
# where n is the 1-based row position, so skipped rows leave gaps.
def collect_names
  @names_index = {}
  path = File.join(@download_dir, "classification.csv")
  # The block form of CSV.open closes the file when the block exits;
  # previously the handle was left open.
  CSV.open(path, headers: true) do |csv|
    csv.each_with_index do |row, i|
      next unless row["display_name"]

      taxon_id = "ARCT_#{i + 1}"
      canonical = row["scientific_name"]

      @names << { taxon_id: taxon_id,
                  # display_name carries <i>...</i> markup; strip the tags.
                  name_string: row["display_name"].gsub(%r{</?i>}, ""),
                  kingdom: row["kingdom"],
                  phylum: row["phylum"],
                  klass: row["phylclass"],
                  order: row["phylorder"],
                  family: row["family"],
                  genus: row["genus"],
                  code: row["nomenclatural_code"] }

      update_vernacular(taxon_id, canonical)
      update_synonym(taxon_id, canonical)

      # Progress indicator (also fires for the first row, i == 0).
      puts "Processed %s names" % i if i % 10_000 == 0
    end
  end
end
|
135
122
|
|
136
|
-
def
|
137
|
-
|
138
|
-
row.split('","')
|
139
|
-
end
|
123
|
+
# Copies every common name recorded for +canonical+ into @vernaculars,
# tagged with +taxon_id+. Does nothing when the name has no vernaculars.
def update_vernacular(taxon_id, canonical)
  if @vernaculars_hash.key?(canonical)
    @vernaculars_hash[canonical].each do |common_name|
      @vernaculars.push({ taxon_id: taxon_id, vern: common_name })
    end
  end
end
|
157
130
|
|
131
|
+
# Copies every synonym recorded for +canonical+ into @synonyms, tagged
# with +taxon_id+. Does nothing when the name has no synonyms.
def update_synonym(taxon_id, canonical)
  if @synonyms_hash.key?(canonical)
    @synonyms_hash[canonical].each do |entry|
      @synonyms.push({ taxon_id: taxon_id,
                       name_string: entry[:name_string],
                       status: entry[:status] })
    end
  end
end
|
158
139
|
|
159
140
|
# Assembles the archive inputs from the collected data and delegates the
# actual file generation to the parent implementation via +super+:
#
# * @core       - header row of Darwin Core term URIs plus one row per name
# * @extensions - vernacular names table and synonyms table
# * @eml        - dataset metadata (id, title, authors, abstract, url)
def generate_dwca
  DwcaHunter.logger_write(object_id,
                          "Creating DarwinCore Archive file")

  @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/kingdom",
            "http://rs.tdwg.org/dwc/terms/phylum",
            "http://rs.tdwg.org/dwc/terms/class",
            "http://rs.tdwg.org/dwc/terms/order",
            "http://rs.tdwg.org/dwc/terms/family",
            "http://rs.tdwg.org/dwc/terms/genus",
            "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
  # Column order must match the header row above.
  @names.each do |n|
    @core << n.values_at(:taxon_id, :name_string, :kingdom, :phylum,
                         :klass, :order, :family, :genus, :code)
  end

  @extensions << {
    data: [[
      "http://rs.tdwg.org/dwc/terms/taxonID",
      "http://rs.tdwg.org/dwc/terms/vernacularName"
    ]],
    file_name: "vernacular_names.txt",
    row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
  }
  @vernaculars.each do |v|
    @extensions[-1][:data] << [v[:taxon_id], v[:vern]]
  end

  # NOTE(review): unlike the vernacular extension, no row_type is given
  # here; kept as found - confirm how the parent handles a missing one.
  @extensions << {
    data: [[
      "http://rs.tdwg.org/dwc/terms/taxonID",
      "http://rs.tdwg.org/dwc/terms/scientificName",
      "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
    ]],
    file_name: "synonyms.txt"
  }
  @synonyms.each do |s|
    @extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
  end

  @eml = {
    # The initializer stores the identifier in @UUID; instance variable
    # names are case-sensitive, so fall back to it when @uuid is unset.
    id: @uuid || @UUID,
    title: @title,
    authors: [
      { email: "dustymc at gmail dot com" }
    ],
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
    url: @url
  }
  super
end
|
220
197
|
end
|
221
198
|
end
|
222
|
-
|
@@ -0,0 +1,151 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Harvests the eBird/Clements Checklist of Birds of the World from a
  # cached CSV file and converts it into a Darwin Core Archive with a
  # vernacular-names extension.
  class ResourceClements < DwcaHunter::Resource
    # Configures the resource (command name, title, source URL, UUID,
    # download location) and the empty collections filled while
    # harvesting, then hands +opts+ to the parent initializer.
    def initialize(opts = {})
      @command = "clements-ebird"
      @title = "The eBird/Clements Checklist of Birds of the World"
      @url = "https://uofi.box.com/shared/static/b4n8zqa99hq9rdga27skkh3870yhujgo.csv"
      @UUID = "577c0b56-4a3c-4314-8724-14b304f601de"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "clements",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Downloads the cached CSV from @url into @download_path using curl
    # (silent, following redirects).
    #
    # Returns whatever curl wrote to standard output (normally an empty
    # string, since the payload goes to the -o target).
    def download
      puts "Downloading cached and modified version of the file."
      puts "Go to https://www.birds.cornell.edu/clementschecklist/download/ " \
           "for updates."
      # Pass curl an argument vector instead of a shell command line so
      # that neither the URL nor a temp path containing spaces is
      # re-interpreted by a shell.
      IO.popen(["curl", "-s", "-L", @url, "-o", @download_path], &:read)
    end

    # Intentionally empty: the download is stored as data.csv and is read
    # directly by collect_names.
    def unpack
    end

    # Entry point for building the archive: logs the start of extraction,
    # collects the names, then generates the Darwin Core Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches into the download directory and loads the checklist rows.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Reads data.csv from @download_dir, appending one entry per row to
    # @names and, when the row has a non-empty "English name", one entry
    # to @vernaculars. Kingdom, phylum, class and nomenclatural code are
    # constant for every row; order and family come from the row.
    def collect_names
      @names_index = {}
      path = File.join(@download_dir, "data.csv")
      # The block form of CSV.open closes the file when the block exits;
      # previously the handle was left open.
      CSV.open(path, headers: true) do |csv|
        csv.each_with_index do |row, i|
          taxon_id = "gn_#{i + 1}"
          @names << { taxon_id: taxon_id,
                      name_string: row["scientific name"],
                      kingdom: "Animalia",
                      phylum: "Chordata",
                      klass: "Aves",
                      order: row["order"],
                      family: row["family"],
                      code: "ICZN" }

          unless row["English name"].to_s.empty?
            @vernaculars << {
              taxon_id: taxon_id,
              vern: row["English name"],
              # Language code for English; the value was previously the
              # typo "end".
              lang: "en"
            }
          end

          # Progress indicator (also fires for the first row, i == 0).
          puts "Processed %s names" % i if i % 10_000 == 0
        end
      end
    end

    # Assembles @core, the vernacular-names extension and the @eml
    # metadata from the collected data, then delegates the actual file
    # generation to the parent implementation via +super+.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/kingdom",
                "http://rs.tdwg.org/dwc/terms/phylum",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      # Column order must match the header row above.
      @names.each do |n|
        @core << n.values_at(:taxon_id, :name_string, :kingdom, :phylum,
                             :klass, :order, :family, :code)
      end

      @extensions << {
        data: [[
          "http://rs.tdwg.org/dwc/terms/taxonID",
          "http://rs.tdwg.org/dwc/terms/vernacularName",
          "http://purl.org/dc/terms/language"
        ]],
        file_name: "vernacular_names.txt",
        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
      }
      @vernaculars.each do |v|
        @extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
      end

      @eml = {
        # The initializer stores the identifier in @UUID; instance
        # variable names are case-sensitive, so fall back to it when
        # @uuid is unset.
        id: @uuid || @UUID,
        title: @title,
        authors: [
          { first_name: "G. F.", last_name: "Clements" },
          { first_name: "T. S.", last_name: "Schulenberg" },
          { first_name: "M. J.", last_name: "Iliff" },
          { first_name: "S. M.", last_name: "Billerman" },
          { first_name: "T. A.", last_name: "Fredericks" },
          { first_name: "B. L.", last_name: "Sullivan" },
          { first_name: "C. L.", last_name: "Wood" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "The eBird/Clements Checklist of Birds of the World" \
                  ": v2019. Downloaded from " \
                  "https://www.birds.cornell.edu/clementschecklist/download/",
        url: @url
      }
      super
    end
  end
end
|