dwca_hunter 0.5.2 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +59 -135
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +7 -8
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +39 -8
- data/lib/dwca_hunter/resource.rb +5 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +121 -145
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/eol.rb +85 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ion.rb +98 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +31 -40
- data/files/birdlife_7.csv +0 -11862
- data/files/fishbase_taxon_cache.tsv +0 -81000
- data/files/reptile_checklist_2014_12.csv +0 -15158
- data/files/species-black.txt +0 -251
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
data/lib/dwca_hunter/resource.rb
CHANGED
@@ -7,6 +7,11 @@ module DwcaHunter
|
|
7
7
|
`unzip -qq -u #{file} > /dev/null 2>&1`
|
8
8
|
end
|
9
9
|
|
10
|
+
def self.gunzip(file, dir = nil)
|
11
|
+
Dir.chdir(dir) if dir
|
12
|
+
`gunzip #{file}`
|
13
|
+
end
|
14
|
+
|
10
15
|
def initialize(opts)
|
11
16
|
@needs_download = !(opts[:download] == false)
|
12
17
|
@needs_unpack = !(opts[:unpack] == false)
|
@@ -0,0 +1,143 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
class ResourceAOS < DwcaHunter::Resource
|
5
|
+
def initialize(opts = {})
|
6
|
+
@command = "aos-birds"
|
7
|
+
@title = "American Ornithological Society"
|
8
|
+
@url = "http://checklist.americanornithology.org/taxa.csv"
|
9
|
+
@UUID = "91d38806-8435-479f-a18d-705e5cb0767c"
|
10
|
+
@download_path = File.join(Dir.tmpdir,
|
11
|
+
"dwca_hunter",
|
12
|
+
"aos",
|
13
|
+
"data.csv")
|
14
|
+
@synonyms = []
|
15
|
+
@names = []
|
16
|
+
@vernaculars = []
|
17
|
+
@extensions = []
|
18
|
+
@synonyms_hash = {}
|
19
|
+
@vernaculars_hash = {}
|
20
|
+
super(opts)
|
21
|
+
end
|
22
|
+
|
23
|
+
def download
|
24
|
+
puts "Downloading csv from remote"
|
25
|
+
`curl -s -L #{@url} -o #{@download_path}`
|
26
|
+
end
|
27
|
+
|
28
|
+
def unpack; end
|
29
|
+
|
30
|
+
def make_dwca
|
31
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
32
|
+
get_names
|
33
|
+
generate_dwca
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def get_names
|
39
|
+
Dir.chdir(@download_dir)
|
40
|
+
collect_names
|
41
|
+
end
|
42
|
+
|
43
|
+
def collect_names
|
44
|
+
@names_index = {}
|
45
|
+
file = CSV.open(File.join(@download_dir, "data.csv"),
|
46
|
+
headers: true)
|
47
|
+
file.each_with_index do |row, _i|
|
48
|
+
taxon_id = row["id"]
|
49
|
+
name_string = row["species"]
|
50
|
+
kingdom = "Animalia"
|
51
|
+
phylum = "Chordata"
|
52
|
+
klass = "Aves"
|
53
|
+
order = row["order"]
|
54
|
+
family = row["family"]
|
55
|
+
genus = row["genus"]
|
56
|
+
code = "ICZN"
|
57
|
+
|
58
|
+
@names << {
|
59
|
+
taxon_id: taxon_id,
|
60
|
+
name_string: name_string,
|
61
|
+
kingdom: kingdom,
|
62
|
+
phylum: phylum,
|
63
|
+
klass: klass,
|
64
|
+
order: order,
|
65
|
+
family: family,
|
66
|
+
genus: genus,
|
67
|
+
code: code
|
68
|
+
}
|
69
|
+
if row["common_name"].to_s != ""
|
70
|
+
@vernaculars << {
|
71
|
+
taxon_id: taxon_id,
|
72
|
+
vern: row["common_name"],
|
73
|
+
lang: "en"
|
74
|
+
}
|
75
|
+
end
|
76
|
+
next unless row["french_name"].to_s != ""
|
77
|
+
|
78
|
+
@vernaculars << {
|
79
|
+
taxon_id: taxon_id,
|
80
|
+
vern: row["french_name"],
|
81
|
+
lang: "fr"
|
82
|
+
}
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
def generate_dwca
|
87
|
+
DwcaHunter.logger_write(object_id,
|
88
|
+
"Creating DarwinCore Archive file")
|
89
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
90
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
91
|
+
"http://rs.tdwg.org/dwc/terms/kingdom",
|
92
|
+
"http://rs.tdwg.org/dwc/terms/phylum",
|
93
|
+
"http://rs.tdwg.org/dwc/terms/class",
|
94
|
+
"http://rs.tdwg.org/dwc/terms/order",
|
95
|
+
"http://rs.tdwg.org/dwc/terms/family",
|
96
|
+
"http://rs.tdwg.org/dwc/terms/genus",
|
97
|
+
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
98
|
+
@names.each do |n|
|
99
|
+
@core << [n[:taxon_id], n[:name_string],
|
100
|
+
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
101
|
+
n[:genus], n[:code]]
|
102
|
+
end
|
103
|
+
@extensions << {
|
104
|
+
data: [[
|
105
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
106
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
107
|
+
"http://purl.org/dc/terms/language"
|
108
|
+
]],
|
109
|
+
file_name: "vernacular_names.txt",
|
110
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
|
111
|
+
}
|
112
|
+
|
113
|
+
@vernaculars.each do |v|
|
114
|
+
@extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
|
115
|
+
end
|
116
|
+
@eml = {
|
117
|
+
id: @uuid,
|
118
|
+
title: @title,
|
119
|
+
authors: [
|
120
|
+
{ first_name: "R. T.",
|
121
|
+
last_name: "Chesser" }
|
122
|
+
],
|
123
|
+
metadata_providers: [
|
124
|
+
{ first_name: "Dmitry",
|
125
|
+
last_name: "Mozzherin",
|
126
|
+
email: "dmozzherin@gmail.com" }
|
127
|
+
],
|
128
|
+
abstract: "The American Ornithological Society's (AOS) Checklist is " \
|
129
|
+
"the official source on the taxonomy of birds found in North and " \
|
130
|
+
"Middle America, including adjacent islands. This list is produced " \
|
131
|
+
"by the North American Classification and Nomenclature Committee " \
|
132
|
+
"(NACC) of the AOS.\n\n" \
|
133
|
+
"Recommended citation: Chesser, R. T., K. J. Burns, C. Cicero, " \
|
134
|
+
"J. L. Dunn, A. W. Kratter, I. J. Lovette, P. C. Rasmussen, " \
|
135
|
+
"J. V. Remsen, Jr., D. F. Stotz, and K. Winker. 2019. Check-list " \
|
136
|
+
"of North American Birds (online). American Ornithological Society. " \
|
137
|
+
"http://checklist.aou.org/taxa",
|
138
|
+
url: @url
|
139
|
+
}
|
140
|
+
super
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
@@ -1,30 +1,36 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceArctos < DwcaHunter::Resource
|
4
|
-
|
5
5
|
def initialize(opts = {})
|
6
|
-
@command =
|
7
|
-
@title =
|
8
|
-
@url =
|
9
|
-
@UUID =
|
6
|
+
@command = "arctos"
|
7
|
+
@title = "Arctos"
|
8
|
+
@url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
|
9
|
+
@UUID = "eea8315d-a244-4625-859a-226675622312"
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
"dwca_hunter",
|
12
|
+
"arctos",
|
13
|
+
"data.zip")
|
14
14
|
@synonyms = []
|
15
15
|
@names = []
|
16
16
|
@vernaculars = []
|
17
17
|
@extensions = []
|
18
|
+
@synonyms_hash = {}
|
19
|
+
@vernaculars_hash = {}
|
18
20
|
super(opts)
|
19
|
-
|
21
|
+
end
|
22
|
+
|
23
|
+
def download
|
24
|
+
puts "Downloading cached verion of the file. Ask Arctos to generate new."
|
25
|
+
`curl -s -L #{@url} -o #{@download_path}`
|
20
26
|
end
|
21
27
|
|
22
28
|
def unpack
|
23
|
-
|
29
|
+
unpack_tar
|
24
30
|
end
|
25
31
|
|
26
32
|
def make_dwca
|
27
|
-
DwcaHunter
|
33
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
28
34
|
get_names
|
29
35
|
generate_dwca
|
30
36
|
end
|
@@ -33,190 +39,160 @@ module DwcaHunter
|
|
33
39
|
|
34
40
|
def get_names
|
35
41
|
Dir.chdir(@download_dir)
|
36
|
-
Dir.entries(@download_dir).grep(/zip$/).each do |file|
|
37
|
-
self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
|
38
|
-
end
|
39
|
-
collect_names
|
40
42
|
collect_synonyms
|
41
43
|
collect_vernaculars
|
44
|
+
collect_names
|
42
45
|
end
|
43
46
|
|
44
47
|
def collect_vernaculars
|
45
|
-
file = open(File.join(@download_dir,
|
46
|
-
|
48
|
+
file = CSV.open(File.join(@download_dir, "common_name.csv"),
|
49
|
+
headers: true)
|
47
50
|
file.each_with_index do |row, i|
|
51
|
+
canonical = row["SCIENTIFIC_NAME"]
|
52
|
+
vernacular_name_string = row["COMMON_NAME"]
|
48
53
|
|
49
|
-
if
|
50
|
-
|
51
|
-
|
54
|
+
if @vernaculars_hash.key?(canonical)
|
55
|
+
@vernaculars_hash[canonical] << vernacular_name_string
|
56
|
+
else
|
57
|
+
@vernaculars_hash[canonical] = [vernacular_name_string]
|
52
58
|
end
|
53
59
|
|
54
|
-
|
55
|
-
|
56
|
-
taxon_id = row[fields[:taxon_name_id]]
|
57
|
-
vernacular_name_string = row[fields[:common_name]]
|
58
|
-
|
59
|
-
@vernaculars << {
|
60
|
-
taxon_id: taxon_id,
|
61
|
-
vernacular_name_string: vernacular_name_string
|
62
|
-
}
|
63
|
-
|
64
|
-
puts "Processed %s vernaculars" % i if i % 10000 == 0
|
60
|
+
puts "Processed %s vernaculars" % i if i % 10_000 == 0
|
65
61
|
end
|
66
62
|
end
|
67
63
|
|
68
64
|
def collect_synonyms
|
69
|
-
file = open(File.join(@download_dir,
|
70
|
-
|
65
|
+
file = CSV.open(File.join(@download_dir, "relationships.csv"),
|
66
|
+
headers: true)
|
71
67
|
file.each_with_index do |row, i|
|
72
|
-
|
73
|
-
|
74
|
-
|
68
|
+
canonical = row["scientific_name"]
|
69
|
+
if @synonyms_hash.key?(canonical)
|
70
|
+
@synonyms_hash[canonical] <<
|
71
|
+
{ name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
|
72
|
+
else
|
73
|
+
@synonyms_hash[canonical] = [
|
74
|
+
{ name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
|
75
|
+
]
|
75
76
|
end
|
76
|
-
|
77
|
-
row = split_row(row)
|
78
|
-
taxon_id = row[fields[:taxon_name_id]]
|
79
|
-
@synonyms << {
|
80
|
-
taxon_id: row[fields[:related_taxon_name_id]],
|
81
|
-
local_id: taxon_id,
|
82
|
-
name_string: @names_index[taxon_id],
|
83
|
-
#synonym_authority: row[fields[:relation_authority]],
|
84
|
-
taxonomic_status: row[fields[:taxon_relationship]],
|
85
|
-
}
|
86
|
-
puts "Processed %s synonyms" % i if i % 10000 == 0
|
77
|
+
puts "Processed %s synonyms" % i if i % 10_000 == 0
|
87
78
|
end
|
88
79
|
end
|
89
80
|
|
90
81
|
def collect_names
|
91
82
|
@names_index = {}
|
92
|
-
file = open(File.join(@download_dir,
|
93
|
-
|
83
|
+
file = CSV.open(File.join(@download_dir, "classification.csv"),
|
84
|
+
headers: true)
|
94
85
|
file.each_with_index do |row, i|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
subspecies = row[fields[:subspecies]]
|
117
|
-
code = row[fields[:nomenclatural_code]]
|
118
|
-
|
86
|
+
next unless row["display_name"]
|
87
|
+
|
88
|
+
name_string = row["display_name"].gsub(%r{</?i>}, "")
|
89
|
+
canonical = row["scientific_name"]
|
90
|
+
kingdom = row["kingdom"]
|
91
|
+
phylum = row["phylum"]
|
92
|
+
klass = row["phylclass"]
|
93
|
+
subclass = row["subclass"]
|
94
|
+
order = row["phylorder"]
|
95
|
+
suborder = row["suborder"]
|
96
|
+
superfamily = row["superfamily"]
|
97
|
+
family = row["family"]
|
98
|
+
subfamily = row["subfamily"]
|
99
|
+
tribe = row["tribe"]
|
100
|
+
genus = row["genus"]
|
101
|
+
subgenus = row["subgenus"]
|
102
|
+
species = row["species"]
|
103
|
+
subspecies = row["subspecies"]
|
104
|
+
code = row["nomenclatural_code"]
|
105
|
+
|
106
|
+
taxon_id = "ARCT_#{i + 1}"
|
119
107
|
@names << { taxon_id: taxon_id,
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
puts "Processed %s names" % i if i % 10000 == 0
|
108
|
+
name_string: name_string,
|
109
|
+
kingdom: kingdom,
|
110
|
+
phylum: phylum,
|
111
|
+
klass: klass,
|
112
|
+
order: order,
|
113
|
+
family: family,
|
114
|
+
genus: genus,
|
115
|
+
code: code }
|
116
|
+
|
117
|
+
update_vernacular(taxon_id, canonical)
|
118
|
+
update_synonym(taxon_id, canonical)
|
119
|
+
puts "Processed %s names" % i if i % 10_000 == 0
|
133
120
|
end
|
134
121
|
end
|
135
122
|
|
136
|
-
def
|
137
|
-
|
138
|
-
row.split('","')
|
139
|
-
end
|
123
|
+
def update_vernacular(taxon_id, canonical)
|
124
|
+
return unless @vernaculars_hash.key?(canonical)
|
140
125
|
|
141
|
-
|
142
|
-
|
143
|
-
encoding_options = {
|
144
|
-
:invalid => :replace,
|
145
|
-
:undef => :replace,
|
146
|
-
:replace => '',
|
147
|
-
:universal_newline => true
|
148
|
-
}
|
149
|
-
num_ary = (0...row.size).to_a
|
150
|
-
row = row.map do |f|
|
151
|
-
f = f.strip.downcase
|
152
|
-
f = f.encode ::Encoding.find('ASCII'), encoding_options
|
153
|
-
f.to_sym
|
126
|
+
@vernaculars_hash[canonical].each do |vern|
|
127
|
+
@vernaculars << { taxon_id: taxon_id, vern: vern }
|
154
128
|
end
|
155
|
-
Hash[row.zip(num_ary)]
|
156
129
|
end
|
157
130
|
|
131
|
+
def update_synonym(taxon_id, canonical)
|
132
|
+
return unless @synonyms_hash.key?(canonical)
|
133
|
+
|
134
|
+
@synonyms_hash[canonical].each do |syn|
|
135
|
+
@synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
|
136
|
+
status: syn[:status] }
|
137
|
+
end
|
138
|
+
end
|
158
139
|
|
159
140
|
def generate_dwca
|
160
|
-
DwcaHunter
|
161
|
-
|
162
|
-
@core = [[
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
|
172
|
-
]]
|
141
|
+
DwcaHunter.logger_write(object_id,
|
142
|
+
"Creating DarwinCore Archive file")
|
143
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
144
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
145
|
+
"http://rs.tdwg.org/dwc/terms/kingdom",
|
146
|
+
"http://rs.tdwg.org/dwc/terms/phylum",
|
147
|
+
"http://rs.tdwg.org/dwc/terms/class",
|
148
|
+
"http://rs.tdwg.org/dwc/terms/order",
|
149
|
+
"http://rs.tdwg.org/dwc/terms/family",
|
150
|
+
"http://rs.tdwg.org/dwc/terms/genus",
|
151
|
+
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
173
152
|
@names.each do |n|
|
174
|
-
@core << [n[:taxon_id], n[:
|
175
|
-
|
176
|
-
|
153
|
+
@core << [n[:taxon_id], n[:name_string],
|
154
|
+
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
155
|
+
n[:genus], n[:code]]
|
177
156
|
end
|
178
157
|
@extensions << {
|
179
158
|
data: [[
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
159
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
160
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName"
|
161
|
+
]],
|
162
|
+
file_name: "vernacular_names.txt",
|
163
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
|
164
|
+
}
|
184
165
|
|
185
166
|
@vernaculars.each do |v|
|
186
|
-
@extensions[-1][:data] << [v[:taxon_id], v[:
|
167
|
+
@extensions[-1][:data] << [v[:taxon_id], v[:vern]]
|
187
168
|
end
|
188
169
|
|
189
170
|
@extensions << {
|
190
171
|
data: [[
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
}
|
198
|
-
|
172
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
173
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
174
|
+
"http://rs.tdwg.org/dwc/terms/taxonomicStatus"
|
175
|
+
]],
|
176
|
+
file_name: "synonyms.txt"
|
177
|
+
}
|
199
178
|
@synonyms.each do |s|
|
200
|
-
@extensions[-1][:data] << [
|
201
|
-
s[:taxon_id], s[:local_id],
|
202
|
-
s[:name_string], s[:taxonomic_status]]
|
179
|
+
@extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
|
203
180
|
end
|
204
181
|
@eml = {
|
205
182
|
id: @uuid,
|
206
183
|
title: @title,
|
207
184
|
authors: [
|
208
|
-
{email:
|
209
|
-
|
185
|
+
{ email: "dustymc at gmail dot com" }
|
186
|
+
],
|
210
187
|
metadata_providers: [
|
211
|
-
{ first_name:
|
212
|
-
last_name:
|
213
|
-
email:
|
214
|
-
|
215
|
-
abstract:
|
188
|
+
{ first_name: "Dmitry",
|
189
|
+
last_name: "Mozzherin",
|
190
|
+
email: "dmozzherin@gmail.com" }
|
191
|
+
],
|
192
|
+
abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
|
216
193
|
url: @url
|
217
194
|
}
|
218
195
|
super
|
219
196
|
end
|
220
197
|
end
|
221
198
|
end
|
222
|
-
|