dwca_hunter 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.byebug_history +31 -0
- data/.document +5 -0
- data/.gitignore +58 -0
- data/.rspec +3 -0
- data/.rubocop.yml +33 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +133 -0
- data/LICENSE.txt +20 -0
- data/README.md +39 -0
- data/Rakefile +11 -0
- data/dwca_hunter.gemspec +42 -0
- data/exe/dwcahunter +77 -0
- data/files/birdlife_7.csv +11862 -0
- data/files/fishbase_taxon_cache.tsv +81000 -0
- data/files/reptile_checklist_2014_12.csv +15158 -0
- data/lib/dwca_hunter/downloader.rb +60 -0
- data/lib/dwca_hunter/encoding.rb +17 -0
- data/lib/dwca_hunter/resource.rb +101 -0
- data/lib/dwca_hunter/resources/arctos.rb +222 -0
- data/lib/dwca_hunter/resources/birdlife.rb +160 -0
- data/lib/dwca_hunter/resources/fishbase.rb +99 -0
- data/lib/dwca_hunter/resources/freebase.rb +152 -0
- data/lib/dwca_hunter/resources/gnub.rb +101 -0
- data/lib/dwca_hunter/resources/itis.rb +271 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
- data/lib/dwca_hunter/resources/ncbi.rb +174 -0
- data/lib/dwca_hunter/resources/opentree.rb +121 -0
- data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
- data/lib/dwca_hunter/resources/worms.rb +176 -0
- data/lib/dwca_hunter/url.rb +33 -0
- data/lib/dwca_hunter/version.rb +7 -0
- data/lib/dwca_hunter/xml.rb +33 -0
- data/lib/dwca_hunter.rb +53 -0
- metadata +250 -0
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
  # Streams the body of a remote resource into a local file and can report
  # progress while doing so.
  class Downloader
    attr_reader :url

    # source_url - address of the resource to fetch.
    # file_path  - local path the response body is written to.
    def initialize(source_url, file_path)
      @source_url = source_url
      @file_path = file_path
      @url = Url.new(source_url)
      @download_length = 0
      @filename = nil
    end

    # Downloads the resource into the file given to the constructor.
    #
    # When a block is given it is called with the number of bytes received so
    # far, once for every 100 chunks read from the response body.
    #
    # Returns the total number of bytes written.
    # Raises RuntimeError when the url reports itself as not valid.
    def download
      raise "#{@source_url} is not accessible" unless @url.valid?

      chunks = 0
      # Block form guarantees the file is closed even if the transfer fails.
      File.open(@file_path, 'wb') do |f|
        @url.net_http.request_get(@url.path) do |r|
          r.read_body do |s|
            @download_length += s.length
            f.write s
            next unless block_given?

            chunks += 1
            yield @download_length if chunks % 100 == 0
          end
        end
      end
      # Evaluated before the ensure clause runs, so the total is what the
      # caller receives.
      @download_length
    ensure
      # Reset the counter on success and on failure so the instance can be
      # reused for another download.
      @download_length = 0
    end

    # Downloads the resource and yields a Hash with :percentage,
    # :elapsed_time (seconds) and :eta (seconds) on every progress report
    # produced by #download.
    #
    # Progress reports are skipped when the header carries no positive
    # content length, because no percentage can be computed in that case;
    # the download itself still runs to completion.
    #
    # Returns the total number of bytes written.
    def download_with_percentage
      start_time = Time.now
      download do |r|
        total = @url.header.content_length
        next unless total.to_f > 0

        percentage = r.to_f / total * 100
        elapsed_time = Time.now - start_time
        res = { percentage: percentage,
                elapsed_time: elapsed_time,
                eta: calculate_eta(percentage, elapsed_time) }
        yield res
      end
    end

    protected

    # Linear estimate of the remaining time in seconds. Estimates that are
    # not positive are reported as 1.0.
    def calculate_eta(percentage, elapsed_time)
      eta = elapsed_time / percentage * 100 - elapsed_time
      eta = 1.0 if eta <= 0
      eta
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module DwcaHunter
  # Helpers for re-encoding text files.
  module Encoding
    # Writes a UTF-8 copy of an ISO-8859-1 (Latin-1) encoded file next to the
    # original, using the original path with '.utf_8' appended. Bytes that
    # cannot be converted are replaced with '?'.
    #
    # file_path - path of the Latin-1 encoded file.
    #
    # Returns the path of the newly written file.
    def self.latin1_to_utf8(file_path)
      new_file = file_path + '.utf_8'
      puts "Creating %s" % new_file
      # Block forms close both handles even when reading or writing raises.
      File.open(file_path) do |r|
        File.open(new_file, 'w:utf-8') do |w|
          r.each do |l|
            l.encode!('UTF-8', 'ISO-8859-1', invalid: :replace, replace: '?')
            w.write l
          end
        end
      end
      new_file
    end
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
module DwcaHunter
  # Base class for the individual data-source resources. Subclasses are
  # expected to set @download_path (and the other attributes exposed below)
  # before calling this class's constructor.
  class Resource
    attr_reader :url, :uuid, :download_path, :title, :abbr, :command

    # Extracts a zip archive with the system `unzip` command.
    #
    # file - archive name, interpolated into the shell command as is.
    # dir  - optional directory to switch to first. The working directory of
    #        the process is changed and not restored.
    def self.unzip(file, dir = nil)
      Dir.chdir(dir) if dir
      `unzip -qq -u #{file} > /dev/null 2>&1`
    end

    # Decompresses a gzip file with the system `gunzip` command. Mirrors
    # .unzip so that #unpack_gzip has a method to call.
    #
    # file - file name, interpolated into the shell command as is.
    # dir  - optional directory to switch to first. The working directory of
    #        the process is changed and not restored.
    def self.gunzip(file, dir = nil)
      Dir.chdir(dir) if dir
      `gunzip #{file}`
    end

    # opts - Hash of options:
    #        :download - pass false to skip downloading (default: download)
    #        :unpack   - pass false to skip unpacking (default: unpack)
    #
    # When a download is needed the download directory is wiped and
    # recreated.
    def initialize(opts)
      @needs_download = opts[:download] != false
      @needs_unpack = opts[:unpack] != false
      @download_dir, @download_file = File.split(@download_path)
      prepare_path if needs_download?
    end

    def needs_download?
      @needs_download
    end

    def needs_unpack?
      @needs_unpack
    end

    # Fetches #url into #download_path. Plain http:// addresses go through
    # DwcaHunter::Downloader and log one message per whole percent of
    # progress; every other address is handed to `curl`.
    def download
      DwcaHunter::logger_write(self.object_id,
                               "Starting download of '%s'" % @url)
      percentage = 0
      if url.match(/^\s*http:\/\//)
        dlr = DwcaHunter::Downloader.new(url, @download_path)
        downloaded_length = dlr.download_with_percentage do |r|
          if r[:percentage].to_i != percentage
            percentage = r[:percentage].to_i
            msg = "Downloaded %.0f%% in %.0f seconds ETA is %.0f seconds" %
                  [percentage, r[:elapsed_time], r[:eta]]
            DwcaHunter::logger_write(self.object_id, msg)
          end
        end
        DwcaHunter::logger_write(self.object_id,
                                 "Download finished, Size: %s" %
                                 downloaded_length)
      else
        `curl -s #{url} > #{download_path}`
      end
    end

    private

    # Strips the string in place and returns an Integer when the stripped
    # text is a canonical integer literal, otherwise the string itself.
    def cleanup(str)
      str.strip!
      str.to_i.to_s == str ? str.to_i : str
    end

    # Removes the download directory with everything in it and recreates it
    # empty.
    def prepare_path
      FileUtils.rm_rf(@download_dir)
      FileUtils.mkdir_p(@download_dir)
    end

    def unpack_bz2
      DwcaHunter::logger_write(self.object_id,
                               'Unpacking a bz2 file, it might take a while...')
      Dir.chdir(@download_dir)
      `bunzip2 #{@download_file}`
    end

    def unpack_zip
      DwcaHunter::logger_write(self.object_id,
                               'Unpacking a zip file, it might take a while...')
      self.class.unzip(@download_file, @download_dir)
    end

    def unpack_gzip
      DwcaHunter::logger_write(self.object_id,
                               'Unpacking gzip file, it might take a while...')
      self.class.gunzip(@download_file, @download_dir)
    end

    def unpack_tar
      DwcaHunter::logger_write(self.object_id,
                               'Unpacking a tar file, it might take a while...')
      Dir.chdir(@download_dir)
      `tar zxvf #{@download_file}`
    end

    # Packs @core, @extensions and @eml (prepared by the subclass) into
    # 'dwca.tar.gz' inside the download directory.
    def generate_dwca
      gen = DarwinCore::Generator.new(File.join(@download_dir, 'dwca.tar.gz'))
      gen.add_core(@core, 'taxa.txt')
      @extensions.each do |extension|
        gen.add_extension(extension[:data],
                          extension[:file_name],
                          true,
                          extension[:row_type])
      end
      gen.add_meta_xml
      gen.add_eml_xml(@eml)
      gen.pack
      DwcaHunter::logger_write(self.object_id,
                               'DarwinCore Archive file is created')
    end
  end
end
|
@@ -0,0 +1,222 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
  # Resource that turns the Arctos taxonomy export (taxonomy.csv,
  # taxon_relations.csv and common_name.csv) into a DarwinCore Archive.
  class ResourceArctos < DwcaHunter::Resource

    def initialize(opts = {})
      @command = 'arctos'
      @title = 'Arctos'
      @url = 'http://arctos.database.museum/download/gncombined.zip'
      # Stored as @uuid so that the inherited reader and the EML id see it.
      @uuid = 'eea8315d-a244-4625-859a-226675622312'
      @download_path = File.join(Dir.tmpdir,
                                 'dwca_hunter',
                                 'arctos',
                                 'data.tar.gz')
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      super(opts)
      @gnub_dir = File.join(@download_dir, 'gnub')
    end

    def unpack
      unpack_zip
    end

    def make_dwca
      DwcaHunter::logger_write(self.object_id, 'Extracting data')
      get_names
      generate_dwca
    end

    private

    # Unzips any nested archives that have no matching csv file yet and then
    # loads names, synonyms and vernaculars. Names are loaded first because
    # the synonyms look their name strings up in @names_index.
    def get_names
      Dir.chdir(@download_dir)
      Dir.entries(@download_dir).grep(/zip$/).each do |file|
        self.class.unzip(file) unless File.exist?(file.sub(/zip$/, 'csv'))
      end
      collect_names
      collect_synonyms
      collect_vernaculars
    end

    # Iterates over the data rows of a csv file in the download directory.
    # The first line is read as the header; every following line is yielded
    # as (split row, header index Hash, line index). The file is closed when
    # the iteration finishes.
    def each_data_row(file_name)
      fields = {}
      File.foreach(File.join(@download_dir, file_name)).with_index do |line, i|
        if i == 0
          fields = get_fields(line)
          next
        end
        yield split_row(line), fields, i
      end
    end

    def collect_vernaculars
      each_data_row('common_name.csv') do |row, fields, i|
        @vernaculars << {
          taxon_id: row[fields[:taxon_name_id]],
          vernacular_name_string: row[fields[:common_name]]
        }

        puts "Processed %s vernaculars" % i if i % 10000 == 0
      end
    end

    def collect_synonyms
      each_data_row('taxon_relations.csv') do |row, fields, i|
        taxon_id = row[fields[:taxon_name_id]]
        @synonyms << {
          taxon_id: row[fields[:related_taxon_name_id]],
          local_id: taxon_id,
          name_string: @names_index[taxon_id],
          #synonym_authority: row[fields[:relation_authority]],
          taxonomic_status: row[fields[:taxon_relationship]],
        }
        puts "Processed %s synonyms" % i if i % 10000 == 0
      end
    end

    # Loads taxonomy.csv into @names and builds @names_index
    # (taxon id => name string). Rows without a display name are skipped.
    def collect_names
      @names_index = {}
      each_data_row('taxonomy.csv') do |row, fields, i|
        # Checked on the split row: the raw line is a String, and indexing
        # it with a column number only tests its length.
        display_name = row[fields[:display_name]]
        next unless display_name

        taxon_id = row[fields[:taxon_name_id]]
        name_string = display_name.gsub(/<\/?i>/, '')

        @names << { taxon_id: taxon_id,
                    local_id: taxon_id,
                    name_string: name_string,
                    kingdom: row[fields[:kingdom]],
                    phylum: row[fields[:phylum]],
                    klass: row[fields[:phylclass]],
                    order: row[fields[:phylorder]],
                    family: row[fields[:family]],
                    genus: row[fields[:genus]],
                    code: row[fields[:nomenclatural_code]],
                  }

        @names_index[taxon_id] = name_string
        puts "Processed %s names" % i if i % 10000 == 0
      end
    end

    # Splits one line whose values are all double-quoted and separated by
    # '","' into an Array of unquoted values.
    def split_row(row)
      row = row.strip.gsub(/^"/, '').gsub(/"$/, '')
      row.split('","')
    end

    # Builds a Hash mapping each header name (downcased, reduced to ASCII,
    # as a Symbol) to its column index.
    def get_fields(row)
      row = row.split(",")
      encoding_options = {
        :invalid => :replace,
        :undef => :replace,
        :replace => '',
        :universal_newline => true
      }
      num_ary = (0...row.size).to_a
      row = row.map do |f|
        f = f.strip.downcase
        # Options are splatted as keywords; a positional Hash is no longer
        # treated as keyword arguments by Ruby 3.
        f = f.encode(::Encoding.find('ASCII'), **encoding_options)
        f.to_sym
      end
      Hash[row.zip(num_ary)]
    end

    # Fills @core, @extensions and @eml from the collected data and hands
    # over to the inherited generate_dwca, which writes the archive.
    def generate_dwca
      DwcaHunter::logger_write(self.object_id,
                               'Creating DarwinCore Archive file')
      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://globalnames.org/terms/localID',
                'http://rs.tdwg.org/dwc/terms/scientificName',
                'http://rs.tdwg.org/dwc/terms/kingdom',
                'http://rs.tdwg.org/dwc/terms/phylum',
                'http://rs.tdwg.org/dwc/terms/class',
                'http://rs.tdwg.org/dwc/terms/order',
                'http://rs.tdwg.org/dwc/terms/family',
                'http://rs.tdwg.org/dwc/terms/genus',
                'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
              ]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
                  n[:genus], n[:code]]
      end
      @extensions << {
        data: [[
          'http://rs.tdwg.org/dwc/terms/taxonID',
          'http://rs.tdwg.org/dwc/terms/vernacularName']],
        file_name: 'vernacular_names.txt',
        row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }

      @vernaculars.each do |v|
        @extensions[-1][:data] << [v[:taxon_id], v[:vernacular_name_string]]
      end

      @extensions << {
        data: [[
          'http://rs.tdwg.org/dwc/terms/taxonID',
          'http://globalnames.org/terms/localID',
          'http://rs.tdwg.org/dwc/terms/scientificName',
          'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
        ]],
        file_name: 'synonyms.txt',
      }

      @synonyms.each do |s|
        @extensions[-1][:data] << [
          s[:taxon_id], s[:local_id],
          s[:name_string], s[:taxonomic_status]]
      end
      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          {email: 'dustymc at gmail dot com'}
        ],
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@gmail.com' }
        ],
        abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
        url: @url
      }
      super
    end
  end
end
|
222
|
+
|
@@ -0,0 +1,160 @@
|
|
1
|
+
module DwcaHunter
  # Resource that builds a DarwinCore Archive from the BirdLife checklist
  # csv file bundled with the gem (files/birdlife_7.csv).
  class ResourceBirdLife < DwcaHunter::Resource
    def initialize(opts = {})
      @command = "bird-life"
      @title = "BirdLife International"
      @uuid = "b1d8de7a-ab96-455f-acd8-f3fff2d7d169"
      @data = []
      @extensions = []
      @url = "http://www.birdlife.org/datazone/userfiles"\
             "/file/Species/Taxonomy/BirdLife_Checklist_Version_70.zip"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "birdlife",
                                 "fake.zip")
      @clades = {}
      super
    end

    def needs_unpack?
      false
    end

    # Nothing is fetched: organize_data reads the bundled csv file.
    def download
    end

    def make_dwca
      organize_data
      generate_dwca
    end

    private

    # Prepares @core, @extensions and @eml, feeds every collected record
    # through #process and hands over to the inherited generate_dwca.
    def generate_dwca
      DwcaHunter::logger_write(self.object_id,
                               'Creating DarwinCore Archive file')
      core_init
      extensions_init
      eml_init
      @data.each { |rec| process(rec) }
      super
    end

    # Starts the core table with its header row and the root taxon "Aves",
    # which gets id 1.
    def core_init
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://globalnames.org/terms/localID",
                "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
                "http://rs.tdwg.org/dwc/terms/taxonRank"]]
      @count = 1
      @core << [@count, nil, nil, @count, "Aves", nil, "class"]
    end

    # Adds one record to the core: its order and family (each only the first
    # time the name is seen), the taxon itself, then its synonyms and
    # vernacular names.
    def process(rec)
      parent_id = 1
      [:order, :family].each do |rank|
        name = rec[rank]
        unless @clades[name]
          @count += 1
          @clades[name] = { id: @count }
          # The clade row is written only when the clade is new; writing it
          # for every record would repeat the same taxonID in the core.
          @core << [@count, nil, parent_id, @count, name, nil, rank.to_s]
        end
        parent_id = @clades[name][:id]
      end
      @count += 1
      @core << [@count, rec[:local_id], parent_id, @count,
                rec[:scientific_name], nil, rec[:rank]]
      taxon = @core.last
      process_synonyms(rec, taxon)
      process_vernaculars(rec, taxon)
    end

    # Adds every synonym as a core row of its own that shares the parent and
    # rank of +taxon+ and points at it as the accepted name.
    def process_synonyms(rec, taxon)
      rec[:synonyms].each do |syn|
        @count += 1
        @core << [@count, nil, taxon[2], taxon[0], syn, "synonym", taxon[-1]]
      end
    end

    # Adds every vernacular name of the record to the first extension,
    # tagged with the language code "en".
    def process_vernaculars(rec, taxon)
      rec[:vernaculars].each do |v|
        @extensions[0][:data] << [taxon[0], v, "en"]
      end
    end

    def extensions_init
      @extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
                               "http://rs.tdwg.org/dwc/terms/vernacularName",
                               "http://purl.org/dc/terms/language"]],
                       file_name: "vernacular_names.txt",
                       row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
                     }
    end

    def organize_data
      DwcaHunter::logger_write(self.object_id,
                               "Organizing data")
      path = File.join(__dir__, "..",
                       "..", "files", "birdlife_7.csv")
      opts = { headers: true, header_converters: :symbol }
      collect_data(path, opts)
    end

    # Reads the csv file at +path+ into @data, one Hash per row.
    #
    # opts - Hash of CSV options. It is splatted as keywords because a
    #        positional Hash is no longer treated as keyword arguments by
    #        Ruby 3. The block form of CSV.open closes the file afterwards.
    def collect_data(path, opts)
      @data = CSV.open(path, **opts) do |csv|
        csv.each_with_object([]) do |row, data|
          order = row[:order]
          order = order.capitalize if order.match(/^[A-Z]+$/)
          family = row[:familyname]
          scientific_name = [row[:scientificname], row[:authority]].join(" ").
                            strip.gsub(/[\s]+/, " ")
          rank = row[:taxonomictreatment] == "R" ? "species" : "not recognized"
          local_id = row[:sisrecid]
          vernaculars = collect_vernaculars(row)
          synonyms = collect_synonyms(row)
          data << { order: order, family: family, rank: rank,
                    scientific_name: scientific_name, synonyms: synonyms,
                    local_id: local_id, vernaculars: vernaculars }
        end
      end
    end

    # Returns the ';'-separated synonyms of the row as an Array, empty when
    # the cell is missing.
    def collect_synonyms(row)
      synonyms = row[:synonyms]
      synonyms ? synonyms.split(";").map(&:strip) : []
    end

    # Returns the common name followed by the ';'-separated alternative
    # common names of the row.
    def collect_vernaculars(row)
      name1 = row[:commonname]
      names = name1 ? [name1] : []
      other = row[:alternativecommonnames]
      names += other.split(";").map(&:strip) if other
      names
    end

    def eml_init
      @eml = {
        id: @uuid,
        title: @title,
        authors: [],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "BirdLife is widely recognised as the world leader in bird "\
                  "conservation. Rigorous science informed by practical "\
                  "feedback from projects on the ground in important sites "\
                  "and habitats enables us to implement successful "\
                  "conservation programmes for birds and all nature.",
        url: "http://www.birdlife.org/"
      }
    end
  end
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module DwcaHunter
  # Resource for FishBase. Builds a DarwinCore Archive from the tab-separated
  # taxon cache bundled with the gem (files/fishbase_taxon_cache.tsv).
  class ResourceFishbase < DwcaHunter::Resource
    attr_reader :title, :abbr
    def initialize(opts = {})
      @command = "fishbase"
      @title = "FishBase Cache"
      @abbr = "FishBase Cache"
      @uuid = "bacd21f0-44e0-43e2-914c-70929916f257"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "fishbase",
                                 "fishbase.tsv")
      @extensions = []
      super
    end

    # Copies the bundled taxon cache to the download path instead of
    # fetching anything over the network.
    def download
      FileUtils.cp(File.join(__dir__, "..", "..", "files",
                             "fishbase_taxon_cache.tsv"), @download_path)
    end

    # The copied file is plain text, so there is nothing to unpack.
    def unpack
    end

    def make_dwca
      organize_data
      generate_dwca
    end

    private

    # Reads the tab-separated file into @data, one Hash per row. Column 4
    # holds the '|'-separated classification, whose parts are assigned to
    # the ranks listed below in that order.
    def organize_data
      ranks = %i(class order family sub_family genus species)
      DwcaHunter::logger_write(self.object_id,
                               "Organizing data")
      # The block form of CSV.open closes the file once all rows are read.
      @data = CSV.open(@download_path, col_sep: "\t") do |csv|
        csv.each_with_object([]) do |row, data|
          cl = Hash[ranks.zip(row[4].split("|"))]
          data << { taxon_id: row[0],
                    local_id: row[0],
                    scientific_name: row[1],
                    rank: row[2],
                    source: row[7]
                  }.merge(cl)
        end
      end
    end

    # Fills @core and @eml from the collected data, logging progress every
    # 10000 rows, and hands over to the inherited generate_dwca.
    def generate_dwca
      DwcaHunter::logger_write(self.object_id,
                               'Creating DarwinCore Archive file')
      core_init
      eml_init
      DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
      count = 0
      @data.each do |d|
        count += 1
        if count % 10000 == 0
          DwcaHunter::logger_write(self.object_id, "Core row #{count}")
        end
        @core << [d[:taxon_id], d[:taxon_id], d[:taxon_id],
                  d[:scientific_name], d[:rank],
                  d[:class], d[:order], d[:family], d[:genus],
                  d[:source]]
      end
      super
    end

    def eml_init
      @eml = {
        id: @uuid,
        title: @title,
        authors: [],
        metadata_providers: [
          { first_name: "Jorrit",
            last_name: "Poelen",
          }
        ],
        # Each piece ends with a space so that the joined sentence keeps its
        # word boundaries.
        abstract: "FishBase is a global species database of fish species " \
                  "(specifically finfish). It is the largest and the most " \
                  "extensively accessed online database of finfish",
        url: "http://www.fishbase.org"
      }
    end

    def core_init
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://globalnames.org/terms/localID",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonRank",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/genus",
                "http://purl.org/dc/terms/source"]]
    end
  end
end
|