dwca_hunter 0.5.2 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +59 -135
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +7 -8
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +39 -8
- data/lib/dwca_hunter/resource.rb +5 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +121 -145
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/eol.rb +85 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ion.rb +98 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +31 -40
- data/files/birdlife_7.csv +0 -11862
- data/files/fishbase_taxon_cache.tsv +0 -81000
- data/files/reptile_checklist_2014_12.csv +0 -15158
- data/files/species-black.txt +0 -251
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
data/lib/dwca_hunter/resource.rb
CHANGED
@@ -7,6 +7,11 @@ module DwcaHunter
|
|
7
7
|
`unzip -qq -u #{file} > /dev/null 2>&1`
|
8
8
|
end
|
9
9
|
|
10
|
+
def self.gunzip(file, dir = nil)
|
11
|
+
Dir.chdir(dir) if dir
|
12
|
+
`gunzip #{file}`
|
13
|
+
end
|
14
|
+
|
10
15
|
def initialize(opts)
|
11
16
|
@needs_download = !(opts[:download] == false)
|
12
17
|
@needs_unpack = !(opts[:unpack] == false)
|
@@ -0,0 +1,143 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
class ResourceAOS < DwcaHunter::Resource
|
5
|
+
def initialize(opts = {})
|
6
|
+
@command = "aos-birds"
|
7
|
+
@title = "American Ornithological Society"
|
8
|
+
@url = "http://checklist.americanornithology.org/taxa.csv"
|
9
|
+
@UUID = "91d38806-8435-479f-a18d-705e5cb0767c"
|
10
|
+
@download_path = File.join(Dir.tmpdir,
|
11
|
+
"dwca_hunter",
|
12
|
+
"aos",
|
13
|
+
"data.csv")
|
14
|
+
@synonyms = []
|
15
|
+
@names = []
|
16
|
+
@vernaculars = []
|
17
|
+
@extensions = []
|
18
|
+
@synonyms_hash = {}
|
19
|
+
@vernaculars_hash = {}
|
20
|
+
super(opts)
|
21
|
+
end
|
22
|
+
|
23
|
+
def download
|
24
|
+
puts "Downloading csv from remote"
|
25
|
+
`curl -s -L #{@url} -o #{@download_path}`
|
26
|
+
end
|
27
|
+
|
28
|
+
def unpack; end
|
29
|
+
|
30
|
+
def make_dwca
|
31
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
32
|
+
get_names
|
33
|
+
generate_dwca
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def get_names
|
39
|
+
Dir.chdir(@download_dir)
|
40
|
+
collect_names
|
41
|
+
end
|
42
|
+
|
43
|
+
def collect_names
|
44
|
+
@names_index = {}
|
45
|
+
file = CSV.open(File.join(@download_dir, "data.csv"),
|
46
|
+
headers: true)
|
47
|
+
file.each_with_index do |row, _i|
|
48
|
+
taxon_id = row["id"]
|
49
|
+
name_string = row["species"]
|
50
|
+
kingdom = "Animalia"
|
51
|
+
phylum = "Chordata"
|
52
|
+
klass = "Aves"
|
53
|
+
order = row["order"]
|
54
|
+
family = row["family"]
|
55
|
+
genus = row["genus"]
|
56
|
+
code = "ICZN"
|
57
|
+
|
58
|
+
@names << {
|
59
|
+
taxon_id: taxon_id,
|
60
|
+
name_string: name_string,
|
61
|
+
kingdom: kingdom,
|
62
|
+
phylum: phylum,
|
63
|
+
klass: klass,
|
64
|
+
order: order,
|
65
|
+
family: family,
|
66
|
+
genus: genus,
|
67
|
+
code: code
|
68
|
+
}
|
69
|
+
if row["common_name"].to_s != ""
|
70
|
+
@vernaculars << {
|
71
|
+
taxon_id: taxon_id,
|
72
|
+
vern: row["common_name"],
|
73
|
+
lang: "en"
|
74
|
+
}
|
75
|
+
end
|
76
|
+
next unless row["french_name"].to_s != ""
|
77
|
+
|
78
|
+
@vernaculars << {
|
79
|
+
taxon_id: taxon_id,
|
80
|
+
vern: row["french_name"],
|
81
|
+
lang: "fr"
|
82
|
+
}
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
def generate_dwca
|
87
|
+
DwcaHunter.logger_write(object_id,
|
88
|
+
"Creating DarwinCore Archive file")
|
89
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
90
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
91
|
+
"http://rs.tdwg.org/dwc/terms/kingdom",
|
92
|
+
"http://rs.tdwg.org/dwc/terms/phylum",
|
93
|
+
"http://rs.tdwg.org/dwc/terms/class",
|
94
|
+
"http://rs.tdwg.org/dwc/terms/order",
|
95
|
+
"http://rs.tdwg.org/dwc/terms/family",
|
96
|
+
"http://rs.tdwg.org/dwc/terms/genus",
|
97
|
+
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
98
|
+
@names.each do |n|
|
99
|
+
@core << [n[:taxon_id], n[:name_string],
|
100
|
+
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
101
|
+
n[:genus], n[:code]]
|
102
|
+
end
|
103
|
+
@extensions << {
|
104
|
+
data: [[
|
105
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
106
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
107
|
+
"http://purl.org/dc/terms/language"
|
108
|
+
]],
|
109
|
+
file_name: "vernacular_names.txt",
|
110
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
|
111
|
+
}
|
112
|
+
|
113
|
+
@vernaculars.each do |v|
|
114
|
+
@extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
|
115
|
+
end
|
116
|
+
@eml = {
|
117
|
+
id: @uuid,
|
118
|
+
title: @title,
|
119
|
+
authors: [
|
120
|
+
{ first_name: "R. T.",
|
121
|
+
last_name: "Chesser" }
|
122
|
+
],
|
123
|
+
metadata_providers: [
|
124
|
+
{ first_name: "Dmitry",
|
125
|
+
last_name: "Mozzherin",
|
126
|
+
email: "dmozzherin@gmail.com" }
|
127
|
+
],
|
128
|
+
abstract: "The American Ornithological Society's (AOS) Checklist is " \
|
129
|
+
"the official source on the taxonomy of birds found in North and " \
|
130
|
+
"Middle America, including adjacent islands. This list is produced " \
|
131
|
+
"by the North American Classification and Nomenclature Committee " \
|
132
|
+
"(NACC) of the AOS.\n\n" \
|
133
|
+
"Recommended citation: Chesser, R. T., K. J. Burns, C. Cicero, " \
|
134
|
+
"J. L. Dunn, A. W. Kratter, I. J. Lovette, P. C. Rasmussen, " \
|
135
|
+
"J. V. Remsen, Jr., D. F. Stotz, and K. Winker. 2019. Check-list " \
|
136
|
+
"of North American Birds (online). American Ornithological Society. " \
|
137
|
+
"http://checklist.aou.org/taxa",
|
138
|
+
url: @url
|
139
|
+
}
|
140
|
+
super
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
@@ -1,30 +1,36 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceArctos < DwcaHunter::Resource
|
4
|
-
|
5
5
|
def initialize(opts = {})
|
6
|
-
@command =
|
7
|
-
@title =
|
8
|
-
@url =
|
9
|
-
@UUID =
|
6
|
+
@command = "arctos"
|
7
|
+
@title = "Arctos"
|
8
|
+
@url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
|
9
|
+
@UUID = "eea8315d-a244-4625-859a-226675622312"
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
"dwca_hunter",
|
12
|
+
"arctos",
|
13
|
+
"data.zip")
|
14
14
|
@synonyms = []
|
15
15
|
@names = []
|
16
16
|
@vernaculars = []
|
17
17
|
@extensions = []
|
18
|
+
@synonyms_hash = {}
|
19
|
+
@vernaculars_hash = {}
|
18
20
|
super(opts)
|
19
|
-
|
21
|
+
end
|
22
|
+
|
23
|
+
def download
|
24
|
+
puts "Downloading cached verion of the file. Ask Arctos to generate new."
|
25
|
+
`curl -s -L #{@url} -o #{@download_path}`
|
20
26
|
end
|
21
27
|
|
22
28
|
def unpack
|
23
|
-
|
29
|
+
unpack_tar
|
24
30
|
end
|
25
31
|
|
26
32
|
def make_dwca
|
27
|
-
DwcaHunter
|
33
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
28
34
|
get_names
|
29
35
|
generate_dwca
|
30
36
|
end
|
@@ -33,190 +39,160 @@ module DwcaHunter
|
|
33
39
|
|
34
40
|
def get_names
|
35
41
|
Dir.chdir(@download_dir)
|
36
|
-
Dir.entries(@download_dir).grep(/zip$/).each do |file|
|
37
|
-
self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
|
38
|
-
end
|
39
|
-
collect_names
|
40
42
|
collect_synonyms
|
41
43
|
collect_vernaculars
|
44
|
+
collect_names
|
42
45
|
end
|
43
46
|
|
44
47
|
def collect_vernaculars
|
45
|
-
file = open(File.join(@download_dir,
|
46
|
-
|
48
|
+
file = CSV.open(File.join(@download_dir, "common_name.csv"),
|
49
|
+
headers: true)
|
47
50
|
file.each_with_index do |row, i|
|
51
|
+
canonical = row["SCIENTIFIC_NAME"]
|
52
|
+
vernacular_name_string = row["COMMON_NAME"]
|
48
53
|
|
49
|
-
if
|
50
|
-
|
51
|
-
|
54
|
+
if @vernaculars_hash.key?(canonical)
|
55
|
+
@vernaculars_hash[canonical] << vernacular_name_string
|
56
|
+
else
|
57
|
+
@vernaculars_hash[canonical] = [vernacular_name_string]
|
52
58
|
end
|
53
59
|
|
54
|
-
|
55
|
-
|
56
|
-
taxon_id = row[fields[:taxon_name_id]]
|
57
|
-
vernacular_name_string = row[fields[:common_name]]
|
58
|
-
|
59
|
-
@vernaculars << {
|
60
|
-
taxon_id: taxon_id,
|
61
|
-
vernacular_name_string: vernacular_name_string
|
62
|
-
}
|
63
|
-
|
64
|
-
puts "Processed %s vernaculars" % i if i % 10000 == 0
|
60
|
+
puts "Processed %s vernaculars" % i if i % 10_000 == 0
|
65
61
|
end
|
66
62
|
end
|
67
63
|
|
68
64
|
def collect_synonyms
|
69
|
-
file = open(File.join(@download_dir,
|
70
|
-
|
65
|
+
file = CSV.open(File.join(@download_dir, "relationships.csv"),
|
66
|
+
headers: true)
|
71
67
|
file.each_with_index do |row, i|
|
72
|
-
|
73
|
-
|
74
|
-
|
68
|
+
canonical = row["scientific_name"]
|
69
|
+
if @synonyms_hash.key?(canonical)
|
70
|
+
@synonyms_hash[canonical] <<
|
71
|
+
{ name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
|
72
|
+
else
|
73
|
+
@synonyms_hash[canonical] = [
|
74
|
+
{ name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
|
75
|
+
]
|
75
76
|
end
|
76
|
-
|
77
|
-
row = split_row(row)
|
78
|
-
taxon_id = row[fields[:taxon_name_id]]
|
79
|
-
@synonyms << {
|
80
|
-
taxon_id: row[fields[:related_taxon_name_id]],
|
81
|
-
local_id: taxon_id,
|
82
|
-
name_string: @names_index[taxon_id],
|
83
|
-
#synonym_authority: row[fields[:relation_authority]],
|
84
|
-
taxonomic_status: row[fields[:taxon_relationship]],
|
85
|
-
}
|
86
|
-
puts "Processed %s synonyms" % i if i % 10000 == 0
|
77
|
+
puts "Processed %s synonyms" % i if i % 10_000 == 0
|
87
78
|
end
|
88
79
|
end
|
89
80
|
|
90
81
|
def collect_names
|
91
82
|
@names_index = {}
|
92
|
-
file = open(File.join(@download_dir,
|
93
|
-
|
83
|
+
file = CSV.open(File.join(@download_dir, "classification.csv"),
|
84
|
+
headers: true)
|
94
85
|
file.each_with_index do |row, i|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
subspecies = row[fields[:subspecies]]
|
117
|
-
code = row[fields[:nomenclatural_code]]
|
118
|
-
|
86
|
+
next unless row["display_name"]
|
87
|
+
|
88
|
+
name_string = row["display_name"].gsub(%r{</?i>}, "")
|
89
|
+
canonical = row["scientific_name"]
|
90
|
+
kingdom = row["kingdom"]
|
91
|
+
phylum = row["phylum"]
|
92
|
+
klass = row["phylclass"]
|
93
|
+
subclass = row["subclass"]
|
94
|
+
order = row["phylorder"]
|
95
|
+
suborder = row["suborder"]
|
96
|
+
superfamily = row["superfamily"]
|
97
|
+
family = row["family"]
|
98
|
+
subfamily = row["subfamily"]
|
99
|
+
tribe = row["tribe"]
|
100
|
+
genus = row["genus"]
|
101
|
+
subgenus = row["subgenus"]
|
102
|
+
species = row["species"]
|
103
|
+
subspecies = row["subspecies"]
|
104
|
+
code = row["nomenclatural_code"]
|
105
|
+
|
106
|
+
taxon_id = "ARCT_#{i + 1}"
|
119
107
|
@names << { taxon_id: taxon_id,
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
puts "Processed %s names" % i if i % 10000 == 0
|
108
|
+
name_string: name_string,
|
109
|
+
kingdom: kingdom,
|
110
|
+
phylum: phylum,
|
111
|
+
klass: klass,
|
112
|
+
order: order,
|
113
|
+
family: family,
|
114
|
+
genus: genus,
|
115
|
+
code: code }
|
116
|
+
|
117
|
+
update_vernacular(taxon_id, canonical)
|
118
|
+
update_synonym(taxon_id, canonical)
|
119
|
+
puts "Processed %s names" % i if i % 10_000 == 0
|
133
120
|
end
|
134
121
|
end
|
135
122
|
|
136
|
-
def
|
137
|
-
|
138
|
-
row.split('","')
|
139
|
-
end
|
123
|
+
def update_vernacular(taxon_id, canonical)
|
124
|
+
return unless @vernaculars_hash.key?(canonical)
|
140
125
|
|
141
|
-
|
142
|
-
|
143
|
-
encoding_options = {
|
144
|
-
:invalid => :replace,
|
145
|
-
:undef => :replace,
|
146
|
-
:replace => '',
|
147
|
-
:universal_newline => true
|
148
|
-
}
|
149
|
-
num_ary = (0...row.size).to_a
|
150
|
-
row = row.map do |f|
|
151
|
-
f = f.strip.downcase
|
152
|
-
f = f.encode ::Encoding.find('ASCII'), encoding_options
|
153
|
-
f.to_sym
|
126
|
+
@vernaculars_hash[canonical].each do |vern|
|
127
|
+
@vernaculars << { taxon_id: taxon_id, vern: vern }
|
154
128
|
end
|
155
|
-
Hash[row.zip(num_ary)]
|
156
129
|
end
|
157
130
|
|
131
|
+
def update_synonym(taxon_id, canonical)
|
132
|
+
return unless @synonyms_hash.key?(canonical)
|
133
|
+
|
134
|
+
@synonyms_hash[canonical].each do |syn|
|
135
|
+
@synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
|
136
|
+
status: syn[:status] }
|
137
|
+
end
|
138
|
+
end
|
158
139
|
|
159
140
|
def generate_dwca
|
160
|
-
DwcaHunter
|
161
|
-
|
162
|
-
@core = [[
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
|
172
|
-
]]
|
141
|
+
DwcaHunter.logger_write(object_id,
|
142
|
+
"Creating DarwinCore Archive file")
|
143
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
144
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
145
|
+
"http://rs.tdwg.org/dwc/terms/kingdom",
|
146
|
+
"http://rs.tdwg.org/dwc/terms/phylum",
|
147
|
+
"http://rs.tdwg.org/dwc/terms/class",
|
148
|
+
"http://rs.tdwg.org/dwc/terms/order",
|
149
|
+
"http://rs.tdwg.org/dwc/terms/family",
|
150
|
+
"http://rs.tdwg.org/dwc/terms/genus",
|
151
|
+
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
173
152
|
@names.each do |n|
|
174
|
-
@core << [n[:taxon_id], n[:
|
175
|
-
|
176
|
-
|
153
|
+
@core << [n[:taxon_id], n[:name_string],
|
154
|
+
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
155
|
+
n[:genus], n[:code]]
|
177
156
|
end
|
178
157
|
@extensions << {
|
179
158
|
data: [[
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
159
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
160
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName"
|
161
|
+
]],
|
162
|
+
file_name: "vernacular_names.txt",
|
163
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
|
164
|
+
}
|
184
165
|
|
185
166
|
@vernaculars.each do |v|
|
186
|
-
@extensions[-1][:data] << [v[:taxon_id], v[:
|
167
|
+
@extensions[-1][:data] << [v[:taxon_id], v[:vern]]
|
187
168
|
end
|
188
169
|
|
189
170
|
@extensions << {
|
190
171
|
data: [[
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
}
|
198
|
-
|
172
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
173
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
174
|
+
"http://rs.tdwg.org/dwc/terms/taxonomicStatus"
|
175
|
+
]],
|
176
|
+
file_name: "synonyms.txt"
|
177
|
+
}
|
199
178
|
@synonyms.each do |s|
|
200
|
-
@extensions[-1][:data] << [
|
201
|
-
s[:taxon_id], s[:local_id],
|
202
|
-
s[:name_string], s[:taxonomic_status]]
|
179
|
+
@extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
|
203
180
|
end
|
204
181
|
@eml = {
|
205
182
|
id: @uuid,
|
206
183
|
title: @title,
|
207
184
|
authors: [
|
208
|
-
{email:
|
209
|
-
|
185
|
+
{ email: "dustymc at gmail dot com" }
|
186
|
+
],
|
210
187
|
metadata_providers: [
|
211
|
-
{ first_name:
|
212
|
-
last_name:
|
213
|
-
email:
|
214
|
-
|
215
|
-
abstract:
|
188
|
+
{ first_name: "Dmitry",
|
189
|
+
last_name: "Mozzherin",
|
190
|
+
email: "dmozzherin@gmail.com" }
|
191
|
+
],
|
192
|
+
abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
|
216
193
|
url: @url
|
217
194
|
}
|
218
195
|
super
|
219
196
|
end
|
220
197
|
end
|
221
198
|
end
|
222
|
-
|