dwca_hunter 0.5.3 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +11 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +90 -84
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +13 -12
- data/exe/dwcahunter +1 -5
- data/lib/dwca_hunter.rb +33 -0
- data/lib/dwca_hunter/resource.rb +8 -3
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +115 -149
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ion.rb +98 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +186 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +166 -184
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +54 -32
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
data/lib/dwca_hunter/resource.rb
CHANGED
@@ -4,7 +4,12 @@ module DwcaHunter
|
|
4
4
|
|
5
5
|
def self.unzip(file, dir = nil)
|
6
6
|
Dir.chdir(dir) if dir
|
7
|
-
|
7
|
+
Zip::File.open(file) do |zip_file|
|
8
|
+
zip_file.each do |entry|
|
9
|
+
puts "Extracting #{entry.name}"
|
10
|
+
entry.extract
|
11
|
+
end
|
12
|
+
end
|
8
13
|
end
|
9
14
|
|
10
15
|
def self.gunzip(file, dir = nil)
|
@@ -13,8 +18,8 @@ module DwcaHunter
|
|
13
18
|
end
|
14
19
|
|
15
20
|
def initialize(opts)
|
16
|
-
@needs_download =
|
17
|
-
@needs_unpack =
|
21
|
+
@needs_download = (opts[:download] != false)
|
22
|
+
@needs_unpack = (opts[:unpack] != false)
|
18
23
|
@download_dir, @download_file = File.split(@download_path)
|
19
24
|
prepare_path if needs_download?
|
20
25
|
end
|
@@ -0,0 +1,143 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
class ResourceAOS < DwcaHunter::Resource
|
5
|
+
def initialize(opts = {})
|
6
|
+
@command = "aos-birds"
|
7
|
+
@title = "American Ornithological Society"
|
8
|
+
@url = "http://checklist.americanornithology.org/taxa.csv"
|
9
|
+
@UUID = "91d38806-8435-479f-a18d-705e5cb0767c"
|
10
|
+
@download_path = File.join(Dir.tmpdir,
|
11
|
+
"dwca_hunter",
|
12
|
+
"aos",
|
13
|
+
"data.csv")
|
14
|
+
@synonyms = []
|
15
|
+
@names = []
|
16
|
+
@vernaculars = []
|
17
|
+
@extensions = []
|
18
|
+
@synonyms_hash = {}
|
19
|
+
@vernaculars_hash = {}
|
20
|
+
super(opts)
|
21
|
+
end
|
22
|
+
|
23
|
+
def download
|
24
|
+
puts "Downloading csv from remote"
|
25
|
+
`curl -s -L #{@url} -o #{@download_path}`
|
26
|
+
end
|
27
|
+
|
28
|
+
def unpack; end
|
29
|
+
|
30
|
+
def make_dwca
|
31
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
32
|
+
get_names
|
33
|
+
generate_dwca
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def get_names
|
39
|
+
Dir.chdir(@download_dir)
|
40
|
+
collect_names
|
41
|
+
end
|
42
|
+
|
43
|
+
def collect_names
|
44
|
+
@names_index = {}
|
45
|
+
file = CSV.open(File.join(@download_dir, "data.csv"),
|
46
|
+
headers: true)
|
47
|
+
file.each_with_index do |row, _i|
|
48
|
+
taxon_id = row["id"]
|
49
|
+
name_string = row["species"]
|
50
|
+
kingdom = "Animalia"
|
51
|
+
phylum = "Chordata"
|
52
|
+
klass = "Aves"
|
53
|
+
order = row["order"]
|
54
|
+
family = row["family"]
|
55
|
+
genus = row["genus"]
|
56
|
+
code = "ICZN"
|
57
|
+
|
58
|
+
@names << {
|
59
|
+
taxon_id: taxon_id,
|
60
|
+
name_string: name_string,
|
61
|
+
kingdom: kingdom,
|
62
|
+
phylum: phylum,
|
63
|
+
klass: klass,
|
64
|
+
order: order,
|
65
|
+
family: family,
|
66
|
+
genus: genus,
|
67
|
+
code: code
|
68
|
+
}
|
69
|
+
if row["common_name"].to_s != ""
|
70
|
+
@vernaculars << {
|
71
|
+
taxon_id: taxon_id,
|
72
|
+
vern: row["common_name"],
|
73
|
+
lang: "en"
|
74
|
+
}
|
75
|
+
end
|
76
|
+
next unless row["french_name"].to_s != ""
|
77
|
+
|
78
|
+
@vernaculars << {
|
79
|
+
taxon_id: taxon_id,
|
80
|
+
vern: row["french_name"],
|
81
|
+
lang: "fr"
|
82
|
+
}
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
def generate_dwca
|
87
|
+
DwcaHunter.logger_write(object_id,
|
88
|
+
"Creating DarwinCore Archive file")
|
89
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
90
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
91
|
+
"http://rs.tdwg.org/dwc/terms/kingdom",
|
92
|
+
"http://rs.tdwg.org/dwc/terms/phylum",
|
93
|
+
"http://rs.tdwg.org/dwc/terms/class",
|
94
|
+
"http://rs.tdwg.org/dwc/terms/order",
|
95
|
+
"http://rs.tdwg.org/dwc/terms/family",
|
96
|
+
"http://rs.tdwg.org/dwc/terms/genus",
|
97
|
+
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
98
|
+
@names.each do |n|
|
99
|
+
@core << [n[:taxon_id], n[:name_string],
|
100
|
+
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
101
|
+
n[:genus], n[:code]]
|
102
|
+
end
|
103
|
+
@extensions << {
|
104
|
+
data: [[
|
105
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
106
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
107
|
+
"http://purl.org/dc/terms/language"
|
108
|
+
]],
|
109
|
+
file_name: "vernacular_names.txt",
|
110
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
|
111
|
+
}
|
112
|
+
|
113
|
+
@vernaculars.each do |v|
|
114
|
+
@extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
|
115
|
+
end
|
116
|
+
@eml = {
|
117
|
+
id: @uuid,
|
118
|
+
title: @title,
|
119
|
+
authors: [
|
120
|
+
{ first_name: "R. T.",
|
121
|
+
last_name: "Chesser" }
|
122
|
+
],
|
123
|
+
metadata_providers: [
|
124
|
+
{ first_name: "Dmitry",
|
125
|
+
last_name: "Mozzherin",
|
126
|
+
email: "dmozzherin@gmail.com" }
|
127
|
+
],
|
128
|
+
abstract: "The American Ornithological Society's (AOS) Checklist is " \
|
129
|
+
"the official source on the taxonomy of birds found in North and " \
|
130
|
+
"Middle America, including adjacent islands. This list is produced " \
|
131
|
+
"by the North American Classification and Nomenclature Committee " \
|
132
|
+
"(NACC) of the AOS.\n\n" \
|
133
|
+
"Recommended citation: Chesser, R. T., K. J. Burns, C. Cicero, " \
|
134
|
+
"J. L. Dunn, A. W. Kratter, I. J. Lovette, P. C. Rasmussen, " \
|
135
|
+
"J. V. Remsen, Jr., D. F. Stotz, and K. Winker. 2019. Check-list " \
|
136
|
+
"of North American Birds (online). American Ornithological Society. " \
|
137
|
+
"http://checklist.aou.org/taxa",
|
138
|
+
url: @url
|
139
|
+
}
|
140
|
+
super
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
@@ -1,34 +1,36 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceArctos < DwcaHunter::Resource
|
4
|
-
|
5
5
|
def initialize(opts = {})
|
6
|
-
@command =
|
7
|
-
@title =
|
8
|
-
@url =
|
9
|
-
@UUID =
|
6
|
+
@command = "arctos"
|
7
|
+
@title = "Arctos"
|
8
|
+
@url = "http://arctos.database.museum/cache/gn_merge.tgz"
|
9
|
+
@UUID = "eea8315d-a244-4625-859a-226675622312"
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
"dwca_hunter",
|
12
|
+
"arctos",
|
13
|
+
"data.tar.gz")
|
14
14
|
@synonyms = []
|
15
15
|
@names = []
|
16
16
|
@vernaculars = []
|
17
17
|
@extensions = []
|
18
|
+
@synonyms_hash = {}
|
19
|
+
@vernaculars_hash = {}
|
18
20
|
super(opts)
|
19
21
|
end
|
20
22
|
|
21
23
|
def download
|
22
|
-
puts "Downloading
|
23
|
-
|
24
|
+
puts "Downloading Arctos file."
|
25
|
+
`curl -s #{@url} -o #{@download_path}`
|
24
26
|
end
|
25
27
|
|
26
28
|
def unpack
|
27
|
-
|
29
|
+
unpack_tar
|
28
30
|
end
|
29
31
|
|
30
32
|
def make_dwca
|
31
|
-
DwcaHunter
|
33
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
32
34
|
get_names
|
33
35
|
generate_dwca
|
34
36
|
end
|
@@ -37,190 +39,154 @@ module DwcaHunter
|
|
37
39
|
|
38
40
|
def get_names
|
39
41
|
Dir.chdir(@download_dir)
|
40
|
-
Dir.entries(@download_dir).grep(/zip$/).each do |file|
|
41
|
-
self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
|
42
|
-
end
|
43
|
-
collect_names
|
44
42
|
collect_synonyms
|
45
43
|
collect_vernaculars
|
44
|
+
collect_names
|
46
45
|
end
|
47
46
|
|
48
47
|
def collect_vernaculars
|
49
|
-
file = open(File.join(@download_dir,
|
50
|
-
|
48
|
+
file = CSV.open(File.join(@download_dir, "globalnames_commonname.csv"),
|
49
|
+
headers: true)
|
51
50
|
file.each_with_index do |row, i|
|
51
|
+
canonical = row["scientific_name"]
|
52
|
+
vernacular_name_string = row["common_name"]
|
52
53
|
|
53
|
-
if
|
54
|
-
|
55
|
-
|
54
|
+
if @vernaculars_hash.key?(canonical)
|
55
|
+
@vernaculars_hash[canonical] << vernacular_name_string
|
56
|
+
else
|
57
|
+
@vernaculars_hash[canonical] = [vernacular_name_string]
|
56
58
|
end
|
57
59
|
|
58
|
-
|
59
|
-
|
60
|
-
taxon_id = row[fields[:taxon_name_id]]
|
61
|
-
vernacular_name_string = row[fields[:common_name]]
|
62
|
-
|
63
|
-
@vernaculars << {
|
64
|
-
taxon_id: taxon_id,
|
65
|
-
vernacular_name_string: vernacular_name_string
|
66
|
-
}
|
67
|
-
|
68
|
-
puts "Processed %s vernaculars" % i if i % 10000 == 0
|
60
|
+
puts "Processed #{i} vernaculars"if (i % 100_000).zero?
|
69
61
|
end
|
70
62
|
end
|
71
63
|
|
72
64
|
def collect_synonyms
|
73
|
-
file = open(File.join(@download_dir,
|
74
|
-
|
65
|
+
file = CSV.open(File.join(@download_dir, "globalnames_relationships.csv"),
|
66
|
+
headers: true)
|
75
67
|
file.each_with_index do |row, i|
|
76
|
-
|
77
|
-
|
78
|
-
|
68
|
+
canonical = row["scientific_name"]
|
69
|
+
if @synonyms_hash.key?(canonical)
|
70
|
+
@synonyms_hash[canonical] <<
|
71
|
+
{ name_string: row["related_name"], status: row["taxon_relationship"] }
|
72
|
+
else
|
73
|
+
@synonyms_hash[canonical] = [
|
74
|
+
{ name_string: row["related_name"], status: row["taxon_relationship"] }
|
75
|
+
]
|
79
76
|
end
|
80
|
-
|
81
|
-
row = split_row(row)
|
82
|
-
taxon_id = row[fields[:taxon_name_id]]
|
83
|
-
@synonyms << {
|
84
|
-
taxon_id: row[fields[:related_taxon_name_id]],
|
85
|
-
local_id: taxon_id,
|
86
|
-
name_string: @names_index[taxon_id],
|
87
|
-
#synonym_authority: row[fields[:relation_authority]],
|
88
|
-
taxonomic_status: row[fields[:taxon_relationship]],
|
89
|
-
}
|
90
|
-
puts "Processed %s synonyms" % i if i % 10000 == 0
|
77
|
+
puts "Processed #{i} synonyms" if (i % 100_000).zero?
|
91
78
|
end
|
92
79
|
end
|
93
80
|
|
94
81
|
def collect_names
|
95
82
|
@names_index = {}
|
96
|
-
file = open(File.join(@download_dir,
|
97
|
-
|
83
|
+
file = CSV.open(File.join(@download_dir, "globalnames_classification.csv"),
|
84
|
+
headers: true)
|
85
|
+
|
86
|
+
names = {}
|
98
87
|
file.each_with_index do |row, i|
|
99
|
-
if
|
100
|
-
|
101
|
-
|
88
|
+
next if row["term_type"].nil?
|
89
|
+
name = row["scientific_name"]
|
90
|
+
if names.key?(name)
|
91
|
+
names[name] = names[name].
|
92
|
+
merge({row["term_type"].to_sym => row["term"]})
|
93
|
+
else
|
94
|
+
names[name] = {row["term_type"].to_sym => row["term"]}
|
102
95
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
@names << { taxon_id: taxon_id,
|
125
|
-
local_id: taxon_id,
|
126
|
-
name_string: name_string,
|
127
|
-
kingdom: kingdom,
|
128
|
-
phylum: phylum,
|
129
|
-
klass: klass,
|
130
|
-
order: order,
|
131
|
-
family: family,
|
132
|
-
genus: genus,
|
133
|
-
code: code,
|
134
|
-
}
|
135
|
-
|
136
|
-
@names_index[taxon_id] = name_string
|
137
|
-
puts "Processed %s names" % i if i % 10000 == 0
|
96
|
+
puts "Preprocessed #{i} rows" if (i % 100_000).zero?
|
97
|
+
end
|
98
|
+
names.each_with_index do |m, i|
|
99
|
+
canonical = m[0]
|
100
|
+
v = m[1]
|
101
|
+
taxon_id = "gn_#{i + 1}"
|
102
|
+
res ={ taxon_id: taxon_id,
|
103
|
+
name_string: canonical,
|
104
|
+
kingdom: v[:kingdom],
|
105
|
+
phylum: v[:phylum],
|
106
|
+
klass: v[:class],
|
107
|
+
order: v[:order],
|
108
|
+
family: v[:family],
|
109
|
+
genus: v[:genus],
|
110
|
+
species: v[:species],
|
111
|
+
authors: v[:author_text],
|
112
|
+
code: v[:nomenclatural_code] }
|
113
|
+
@names << res
|
114
|
+
update_vernacular(taxon_id, canonical)
|
115
|
+
update_synonym(taxon_id, canonical)
|
116
|
+
puts "Processed #{i} names" if (i % 100_000).zero?
|
138
117
|
end
|
139
118
|
end
|
140
119
|
|
141
|
-
def
|
142
|
-
|
143
|
-
row.split('","')
|
144
|
-
end
|
120
|
+
def update_vernacular(taxon_id, canonical)
|
121
|
+
return unless @vernaculars_hash.key?(canonical)
|
145
122
|
|
146
|
-
|
147
|
-
|
148
|
-
encoding_options = {
|
149
|
-
:invalid => :replace,
|
150
|
-
:undef => :replace,
|
151
|
-
:replace => '',
|
152
|
-
:universal_newline => true
|
153
|
-
}
|
154
|
-
num_ary = (0...row.size).to_a
|
155
|
-
row = row.map do |f|
|
156
|
-
f = f.strip.downcase
|
157
|
-
f = f.encode ::Encoding.find('ASCII'), encoding_options
|
158
|
-
f.to_sym
|
123
|
+
@vernaculars_hash[canonical].each do |vern|
|
124
|
+
@vernaculars << { taxon_id: taxon_id, vern: vern }
|
159
125
|
end
|
160
|
-
res = Hash[row.zip(num_ary)]
|
161
|
-
require 'byebug'; byebug
|
162
|
-
puts ''
|
163
|
-
res
|
164
126
|
end
|
165
127
|
|
128
|
+
def update_synonym(taxon_id, canonical)
|
129
|
+
return unless @synonyms_hash.key?(canonical)
|
130
|
+
|
131
|
+
@synonyms_hash[canonical].each do |syn|
|
132
|
+
@synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
|
133
|
+
status: syn[:status] }
|
134
|
+
end
|
135
|
+
end
|
166
136
|
|
167
137
|
def generate_dwca
|
168
|
-
DwcaHunter
|
169
|
-
|
170
|
-
@core = [[
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
|
180
|
-
]]
|
138
|
+
DwcaHunter.logger_write(object_id,
|
139
|
+
"Creating DarwinCore Archive file")
|
140
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
141
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
142
|
+
"http://rs.tdwg.org/dwc/terms/kingdom",
|
143
|
+
"http://rs.tdwg.org/dwc/terms/phylum",
|
144
|
+
"http://rs.tdwg.org/dwc/terms/class",
|
145
|
+
"http://rs.tdwg.org/dwc/terms/order",
|
146
|
+
"http://rs.tdwg.org/dwc/terms/family",
|
147
|
+
"http://rs.tdwg.org/dwc/terms/genus",
|
148
|
+
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
181
149
|
@names.each do |n|
|
182
|
-
@core << [n[:taxon_id], n[:
|
183
|
-
|
184
|
-
|
150
|
+
@core << [n[:taxon_id], n[:name_string],
|
151
|
+
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
152
|
+
n[:genus], n[:code]]
|
185
153
|
end
|
186
154
|
@extensions << {
|
187
155
|
data: [[
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
156
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
157
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName"
|
158
|
+
]],
|
159
|
+
file_name: "vernacular_names.txt",
|
160
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
|
161
|
+
}
|
192
162
|
|
193
163
|
@vernaculars.each do |v|
|
194
|
-
@extensions[-1][:data] << [v[:taxon_id], v[:
|
164
|
+
@extensions[-1][:data] << [v[:taxon_id], v[:vern]]
|
195
165
|
end
|
196
166
|
|
197
167
|
@extensions << {
|
198
168
|
data: [[
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
}
|
206
|
-
|
169
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
170
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
171
|
+
"http://rs.tdwg.org/dwc/terms/taxonomicStatus"
|
172
|
+
]],
|
173
|
+
file_name: "synonyms.txt"
|
174
|
+
}
|
207
175
|
@synonyms.each do |s|
|
208
|
-
@extensions[-1][:data] << [
|
209
|
-
s[:taxon_id], s[:local_id],
|
210
|
-
s[:name_string], s[:taxonomic_status]]
|
176
|
+
@extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
|
211
177
|
end
|
212
178
|
@eml = {
|
213
179
|
id: @uuid,
|
214
180
|
title: @title,
|
215
181
|
authors: [
|
216
|
-
{email:
|
217
|
-
|
182
|
+
{ email: "dustymc at gmail dot com" }
|
183
|
+
],
|
218
184
|
metadata_providers: [
|
219
|
-
{ first_name:
|
220
|
-
last_name:
|
221
|
-
email:
|
222
|
-
|
223
|
-
abstract:
|
185
|
+
{ first_name: "Dmitry",
|
186
|
+
last_name: "Mozzherin",
|
187
|
+
email: "dmozzherin@gmail.com" }
|
188
|
+
],
|
189
|
+
abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
|
224
190
|
url: @url
|
225
191
|
}
|
226
192
|
super
|