dwca_hunter 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,60 @@
1
+ # encoding: utf-8
2
module DwcaHunter
  # Streams the body of an HTTP resource into a local file and can report
  # progress while doing so.
  class Downloader
    attr_reader :url

    # source_url - address of the file to fetch; it is wrapped in a Url object
    # file_path  - local path the response body is written to
    def initialize(source_url, file_path)
      @source_url = source_url
      @file_path = file_path
      @url = Url.new(source_url)
      @download_length = 0
      @filename = nil
    end

    # Downloads the source into the file given to the constructor.
    #
    # When a block is given it is called after every 100th chunk with the
    # number of bytes received so far.
    #
    # Returns the total number of bytes written.
    # Raises RuntimeError when the url reports itself as not valid.
    def download
      raise "#{@source_url} is not accessible" unless @url.valid?

      chunks = 0
      # Block form guarantees the file is closed even if the request fails.
      File.open(@file_path, 'wb') do |f|
        @url.net_http.request_get(@url.path) do |r|
          r.read_body do |s|
            # bytesize, so the count is in bytes whatever the chunk encoding
            @download_length += s.bytesize
            f.write s
            next unless block_given?

            chunks += 1
            yield @download_length if (chunks % 100).zero?
          end
        end
      end
      @download_length
    ensure
      # The counter is per-download state; reset it on success and on failure.
      @download_length = 0
    end

    # Same as #download, but the block receives a Hash with :percentage,
    # :elapsed_time (seconds) and :eta (seconds) instead of a raw byte count.
    # Without a block it behaves exactly like #download.
    #
    # Returns the total number of bytes written.
    def download_with_percentage
      return download unless block_given?

      start_time = Time.now
      total = nil
      download do |r|
        # A missing Content-Length becomes 0.0, reported as 0% progress.
        total ||= @url.header.content_length.to_f
        percentage = total > 0 ? r.to_f / total * 100 : 0.0
        elapsed_time = Time.now - start_time
        res = { percentage: percentage,
                elapsed_time: elapsed_time,
                eta: calculate_eta(percentage, elapsed_time) }
        yield res
      end
    end

    protected

    # Estimates the remaining seconds from the progress made so far.
    # Returns Float::INFINITY while no progress is known, and never less
    # than 1.0 otherwise.
    def calculate_eta(percentage, elapsed_time)
      return Float::INFINITY unless percentage > 0

      eta = elapsed_time / percentage * 100 - elapsed_time
      eta = 1.0 if eta <= 0
      eta
    end
  end
end
@@ -0,0 +1,17 @@
1
module DwcaHunter
  # Helpers for converting the character encoding of files.
  module Encoding
    # Re-encodes a Latin-1 (ISO-8859-1) file as UTF-8, line by line.
    #
    # The result is written next to the original, with '.utf_8' appended to
    # the name; the source file is left untouched.
    #
    # file_path - path of the ISO-8859-1 encoded file
    #
    # Returns the path of the newly created UTF-8 file.
    def self.latin1_to_utf8(file_path)
      new_file = file_path + '.utf_8'
      puts "Creating %s" % new_file
      # Block forms close both handles even when reading or writing fails.
      File.open(file_path) do |r|
        File.open(new_file, 'w:utf-8') do |w|
          r.each_line do |l|
            # The explicit source encoding makes the bytes be read as
            # ISO-8859-1 regardless of how the line happens to be tagged.
            w.write l.encode('UTF-8', 'ISO-8859-1',
                             invalid: :replace, replace: '?')
          end
        end
      end
      new_file
    end
  end
end
@@ -0,0 +1,101 @@
1
require 'shellwords'

module DwcaHunter
  # Base class for data sources that are downloaded, unpacked and turned into
  # a DarwinCore Archive. Subclasses are expected to assign @download_path
  # (and normally @url, @title, @uuid, @extensions) before calling super from
  # their constructor, and to fill @core, @extensions and @eml before calling
  # #generate_dwca.
  class Resource
    attr_reader :url, :uuid, :download_path, :title, :abbr, :command

    # Unpacks a zip archive with the external `unzip` tool.
    #
    # file - archive name; resolved against dir when dir is given
    # dir  - optional directory to switch to first. The working directory of
    #        the process stays changed afterwards.
    def self.unzip(file, dir = nil)
      Dir.chdir(dir) if dir
      `unzip -qq -u #{Shellwords.escape(file)} > /dev/null 2>&1`
    end

    # Unpacks a gzip file with the external `gunzip` tool. Takes the same
    # arguments as .unzip. #unpack_gzip relies on this method.
    def self.gunzip(file, dir = nil)
      Dir.chdir(dir) if dir
      `gunzip #{Shellwords.escape(file)} > /dev/null 2>&1`
    end

    # opts - Hash of options:
    #        :download - pass false to skip downloading (default: download)
    #        :unpack   - pass false to skip unpacking (default: unpack)
    #
    # When a download is needed the download directory is wiped and
    # re-created.
    def initialize(opts)
      @needs_download = opts[:download] != false
      @needs_unpack = opts[:unpack] != false
      @download_dir, @download_file = File.split(@download_path)
      prepare_path if needs_download?
    end

    def needs_download?
      @needs_download
    end

    def needs_unpack?
      @needs_unpack
    end

    # Fetches #url into #download_path. Plain http:// addresses go through
    # DwcaHunter::Downloader with progress logging; everything else is handed
    # to the external `curl` tool.
    def download
      DwcaHunter.logger_write(object_id, "Starting download of '%s'" % @url)
      percentage = 0
      if url.match(/^\s*http:\/\//)
        dlr = DwcaHunter::Downloader.new(url, @download_path)
        downloaded_length = dlr.download_with_percentage do |r|
          # log only when the whole-number percentage changes
          if r[:percentage].to_i != percentage
            percentage = r[:percentage].to_i
            msg = "Downloaded %.0f%% in %.0f seconds ETA is %.0f seconds" %
                  [percentage, r[:elapsed_time], r[:eta]]
            DwcaHunter.logger_write(object_id, msg)
          end
        end
        DwcaHunter.logger_write(object_id,
                                "Download finished, Size: %s" %
                                downloaded_length)
      else
        # Escaped so that characters such as & or ? in the address are not
        # interpreted by the shell.
        `curl -s #{Shellwords.escape(url.strip)} > #{Shellwords.escape(download_path)}`
      end
    end

    private

    # Strips surrounding whitespace and converts strings that are plain
    # integers ("42") to Integer; anything else is returned as a String.
    # The argument itself is not modified.
    def cleanup(str)
      str = str.strip
      str.to_i.to_s == str ? str.to_i : str
    end

    # Removes the download directory with all its content and creates it anew.
    def prepare_path
      FileUtils.rm_rf(@download_dir)
      FileUtils.mkdir_p(@download_dir)
    end

    def unpack_bz2
      DwcaHunter.logger_write(object_id,
                              'Unpacking a bz2 file, it might take a while...')
      Dir.chdir(@download_dir)
      `bunzip2 #{Shellwords.escape(@download_file)}`
    end

    def unpack_zip
      DwcaHunter.logger_write(object_id,
                              'Unpacking a zip file, it might take a while...')
      self.class.unzip(@download_file, @download_dir)
    end

    def unpack_gzip
      DwcaHunter.logger_write(object_id,
                              'Unpacking gzip file, it might take a while...')
      self.class.gunzip(@download_file, @download_dir)
    end

    def unpack_tar
      DwcaHunter.logger_write(object_id,
                              'Unpacking a tar file, it might take a while...')
      Dir.chdir(@download_dir)
      `tar zxvf #{Shellwords.escape(@download_file)}`
    end

    # Writes dwca.tar.gz into the download directory from @core, @extensions
    # and @eml.
    def generate_dwca
      gen = DarwinCore::Generator.new(File.join(@download_dir, 'dwca.tar.gz'))
      gen.add_core(@core, 'taxa.txt')
      @extensions.each do |extension|
        gen.add_extension(extension[:data],
                          extension[:file_name],
                          true,
                          extension[:row_type])
      end
      gen.add_meta_xml
      gen.add_eml_xml(@eml)
      gen.pack
      DwcaHunter.logger_write(object_id,
                              'DarwinCore Archive file is created')
    end
  end
end
@@ -0,0 +1,222 @@
1
+ # encoding: utf-8
2
module DwcaHunter
  # Resource for the Arctos taxonomy dump. The downloaded archive is unpacked
  # into taxonomy.csv, taxon_relations.csv and common_name.csv, which are then
  # turned into a DarwinCore Archive.
  class ResourceArctos < DwcaHunter::Resource

    def initialize(opts = {})
      @command = 'arctos'
      @title = 'Arctos'
      @url = 'http://arctos.database.museum/download/gncombined.zip'
      # Must be @uuid: that is the variable the :uuid reader and the EML use.
      @uuid = 'eea8315d-a244-4625-859a-226675622312'
      @download_path = File.join(Dir.tmpdir,
                                 'dwca_hunter',
                                 'arctos',
                                 'data.tar.gz')
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      super(opts)
      @gnub_dir = File.join(@download_dir, 'gnub')
    end

    def unpack
      unpack_zip
    end

    def make_dwca
      DwcaHunter.logger_write(object_id, 'Extracting data')
      get_names
      generate_dwca
    end

    private

    # Unpacks any nested zip files that do not have a matching csv yet and
    # loads names, synonyms and vernaculars. Names go first because the
    # synonyms are resolved through @names_index.
    def get_names
      Dir.chdir(@download_dir)
      Dir.entries(@download_dir).grep(/zip$/).each do |file|
        self.class.unzip(file) unless File.exist?(file.gsub(/zip$/, 'csv'))
      end
      collect_names
      collect_synonyms
      collect_vernaculars
    end

    # Fills @vernaculars from common_name.csv.
    def collect_vernaculars
      fields = {}
      # File.foreach closes the file once iteration is over
      File.foreach(File.join(@download_dir, 'common_name.csv')).
        each_with_index do |row, i|
        if i == 0
          fields = get_fields(row)
          next
        end

        row = split_row(row)

        @vernaculars << {
          taxon_id: row[fields[:taxon_name_id]],
          vernacular_name_string: row[fields[:common_name]]
        }

        puts "Processed %s vernaculars" % i if i % 10000 == 0
      end
    end

    # Fills @synonyms from taxon_relations.csv.
    def collect_synonyms
      fields = {}
      File.foreach(File.join(@download_dir, 'taxon_relations.csv')).
        each_with_index do |row, i|
        if i == 0
          fields = get_fields(row)
          next
        end

        row = split_row(row)
        taxon_id = row[fields[:taxon_name_id]]
        @synonyms << {
          taxon_id: row[fields[:related_taxon_name_id]],
          local_id: taxon_id,
          name_string: @names_index[taxon_id],
          #synonym_authority: row[fields[:relation_authority]],
          taxonomic_status: row[fields[:taxon_relationship]],
        }
        puts "Processed %s synonyms" % i if i % 10000 == 0
      end
    end

    # Fills @names and @names_index (taxon id => name) from taxonomy.csv.
    # Rows that have no display name are skipped.
    def collect_names
      @names_index = {}
      fields = {}
      File.foreach(File.join(@download_dir, 'taxonomy.csv')).
        each_with_index do |row, i|
        if i == 0
          fields = get_fields(row)
          next
        end

        # Split first: the presence check has to look at the parsed field,
        # not at a character position of the raw line.
        row = split_row(row)
        display_name = row[fields[:display_name]]
        next unless display_name

        taxon_id = row[fields[:taxon_name_id]]
        name_string = display_name.gsub(/<\/?i>/, '')

        @names << { taxon_id: taxon_id,
                    local_id: taxon_id,
                    name_string: name_string,
                    kingdom: row[fields[:kingdom]],
                    phylum: row[fields[:phylum]],
                    klass: row[fields[:phylclass]],
                    order: row[fields[:phylorder]],
                    family: row[fields[:family]],
                    genus: row[fields[:genus]],
                    code: row[fields[:nomenclatural_code]],
                  }

        @names_index[taxon_id] = name_string
        puts "Processed %s names" % i if i % 10000 == 0
      end
    end

    # Splits one data line of the form "a","b","c" into its values.
    def split_row(row)
      row = row.strip.gsub(/^"/, '').gsub(/"$/, '')
      row.split('","')
    end

    # Builds a Hash of field name (downcased Symbol) => column index from the
    # header line. Characters that cannot be represented in ASCII are dropped
    # from the names.
    def get_fields(row)
      row = row.split(",")
      encoding_options = {
        :invalid => :replace,
        :undef => :replace,
        :replace => '',
        :universal_newline => true
      }
      num_ary = (0...row.size).to_a
      row = row.map do |f|
        f = f.strip.downcase
        # ** keeps the options as keywords on every Ruby version
        f = f.encode ::Encoding.find('ASCII'), **encoding_options
        f.to_sym
      end
      Hash[row.zip(num_ary)]
    end

    # Assembles @core, @extensions and @eml from the collected data and lets
    # the parent class write the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              'Creating DarwinCore Archive file')
      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://globalnames.org/terms/localID',
                'http://rs.tdwg.org/dwc/terms/scientificName',
                'http://rs.tdwg.org/dwc/terms/kingdom',
                'http://rs.tdwg.org/dwc/terms/phylum',
                'http://rs.tdwg.org/dwc/terms/class',
                'http://rs.tdwg.org/dwc/terms/order',
                'http://rs.tdwg.org/dwc/terms/family',
                'http://rs.tdwg.org/dwc/terms/genus',
                'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
              ]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
                  n[:genus], n[:code]]
      end
      @extensions << {
        data: [[
          'http://rs.tdwg.org/dwc/terms/taxonID',
          'http://rs.tdwg.org/dwc/terms/vernacularName']],
        file_name: 'vernacular_names.txt',
        row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }

      @vernaculars.each do |v|
        @extensions[-1][:data] << [v[:taxon_id], v[:vernacular_name_string]]
      end

      # NOTE(review): this extension carries no :row_type, so nil is passed
      # on to the generator - confirm that this is intended.
      @extensions << {
        data: [[
          'http://rs.tdwg.org/dwc/terms/taxonID',
          'http://globalnames.org/terms/localID',
          'http://rs.tdwg.org/dwc/terms/scientificName',
          'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
        ]],
        file_name: 'synonyms.txt',
      }

      @synonyms.each do |s|
        @extensions[-1][:data] << [
          s[:taxon_id], s[:local_id],
          s[:name_string], s[:taxonomic_status]]
      end
      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          {email: 'dustymc at gmail dot com'}
        ],
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@gmail.com' }
        ],
        abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
        url: @url
      }
      super
    end
  end
end
222
+
@@ -0,0 +1,160 @@
1
module DwcaHunter
  # Resource for the BirdLife International checklist. Nothing is downloaded:
  # the data come from the birdlife_7.csv file looked up relative to this
  # source file.
  class ResourceBirdLife < DwcaHunter::Resource
    def initialize(opts = {})
      @command = "bird-life"
      @title = "BirdLife International"
      @uuid = "b1d8de7a-ab96-455f-acd8-f3fff2d7d169"
      @data = []
      @extensions = []
      @url = "http://www.birdlife.org/datazone/userfiles"\
        "/file/Species/Taxonomy/BirdLife_Checklist_Version_70.zip"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "birdlife",
                                 "fake.zip")
      # clade name => { id: taxon id } for orders and families already added
      @clades = {}
      super
    end

    def needs_unpack?
      false
    end

    # Intentionally empty: the checklist is read from a local file.
    def download
    end

    def make_dwca
      organize_data
      generate_dwca
    end

    private

    # Builds core, extensions and EML from @data and lets the parent class
    # write the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              'Creating DarwinCore Archive file')
      core_init
      extensions_init
      eml_init
      @data.each { |rec| process(rec) }
      super
    end

    # Starts the core table with its header and the root taxon (class Aves,
    # taxon id 1).
    def core_init
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://globalnames.org/terms/localID",
                "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
                "http://rs.tdwg.org/dwc/terms/taxonRank"]]
      @count = 1
      @core << [@count, nil, nil, @count, "Aves", nil, "class"]
    end

    # Adds one record to the core: its order and family (each only the first
    # time it is seen), the taxon itself, its synonyms and its vernaculars.
    def process(rec)
      parent_id = 1
      [:order, :family].each do |rank|
        clade_name = rec[rank]
        unless @clades[clade_name]
          @count += 1
          @clades[clade_name] = { id: @count }
          # The row is added together with the id, so a clade shared by many
          # records appears in the core once instead of once per record.
          @core << [@count, nil, parent_id, @count, clade_name, nil, rank.to_s]
        end
        parent_id = @clades[clade_name][:id]
      end
      @count += 1
      @core << [@count, rec[:local_id], parent_id, @count,
                rec[:scientific_name], nil, rec[:rank]]
      taxon = @core.last
      process_synonyms(rec, taxon)
      process_vernaculars(rec, taxon)
    end

    # Adds every synonym as its own core row pointing at the accepted taxon;
    # parent and rank are copied from that taxon.
    def process_synonyms(rec, taxon)
      rec[:synonyms].each do |syn|
        @count += 1
        @core << [@count, nil, taxon[2], taxon[0], syn, "synonym", taxon[-1]]
      end
    end

    # Adds the common names of the taxon to the vernacular extension; the
    # language is always recorded as "en".
    def process_vernaculars(rec, taxon)
      rec[:vernaculars].each do |name|
        @extensions[0][:data] << [taxon[0], name, "en"]
      end
    end

    def extensions_init
      @extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
                               "http://rs.tdwg.org/dwc/terms/vernacularName",
                               "http://purl.org/dc/terms/language"]],
                       file_name: "vernacular_names.txt",
                       row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
                     }
    end

    def organize_data
      DwcaHunter.logger_write(object_id,
                              "Organizing data")
      path = File.join(__dir__, "..",
                       "..", "files", "birdlife_7.csv")
      opts = { headers: true, header_converters: :symbol }
      collect_data(path, opts)
    end

    # Reads the checklist into @data, one Hash per row.
    #
    # path - location of the csv file
    # opts - Hash of CSV options with Symbol keys
    #
    # Returns @data.
    def collect_data(path, opts)
      @data = []
      # CSV.foreach closes the file itself; ** keeps the options as keywords.
      CSV.foreach(path, **opts) do |row|
        order = row[:order]
        # orders written in capitals ("PASSERIFORMES") become "Passeriformes"
        order = order.capitalize if order && order.match(/^[A-Z]+$/)
        scientific_name = [row[:scientificname], row[:authority]].join(" ").
          strip.gsub(/[\s]+/, " ")
        rank = row[:taxonomictreatment] == "R" ? "species" : "not recognized"
        @data << { order: order, family: row[:familyname], rank: rank,
                   scientific_name: scientific_name,
                   synonyms: collect_synonyms(row),
                   local_id: row[:sisrecid],
                   vernaculars: collect_vernaculars(row) }
      end
      @data
    end

    # Returns the ';'-separated synonyms of a row as an Array (empty when the
    # field is missing).
    def collect_synonyms(row)
      synonyms = row[:synonyms]
      synonyms ? synonyms.split(";").map(&:strip) : []
    end

    # Returns the common name followed by the ';'-separated alternative
    # common names of a row.
    def collect_vernaculars(row)
      name1 = row[:commonname]
      names = name1 ? [name1] : []
      other = row[:alternativecommonnames]
      names += other.split(";").map(&:strip) if other
      names
    end

    def eml_init
      @eml = {
        id: @uuid,
        title: @title,
        authors: [],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "BirdLife is widely recognised as the world leader in bird "\
                  "conservation. Rigorous science informed by practical "\
                  "feedback from projects on the ground in important sites "\
                  "and habitats enables us to implement successful "\
                  "conservation programmes for birds and all nature.",
        url: "http://www.birdlife.org/"
      }
    end
  end
end
@@ -0,0 +1,99 @@
1
module DwcaHunter
  # Resource for FishBase. The data come from the fishbase_taxon_cache.tsv
  # file looked up relative to this source file; it is copied into the
  # download location instead of being fetched over the network.
  class ResourceFishbase < DwcaHunter::Resource
    attr_reader :title, :abbr
    def initialize(opts = {})
      @command = "fishbase"
      @title = "FishBase Cache"
      @abbr = "FishBase Cache"
      @uuid = "bacd21f0-44e0-43e2-914c-70929916f257"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "fishbase",
                                 "fishbase.tsv")
      @extensions = []
      super
    end

    # Copies the taxon cache file to the download path.
    def download
      FileUtils.cp(File.join(__dir__, "..", "..", "files",
                             "fishbase_taxon_cache.tsv"), @download_path)
    end

    # Intentionally empty: the tsv file needs no unpacking.
    def unpack
    end

    def make_dwca
      organize_data
      generate_dwca
    end

    private

    # Reads the tab-separated cache into @data, one Hash per row. Column 4
    # holds the "|"-separated classification, which is spread over the rank
    # keys in order; ranks without a value stay nil.
    def organize_data
      ranks = %i(class order family sub_family genus species)
      DwcaHunter.logger_write(object_id,
                              "Organizing data")
      # snp = ScientificNameParser.new
      @data = []
      # CSV.foreach closes the file once all rows are read
      CSV.foreach(@download_path, col_sep: "\t") do |row|
        # to_s: a row without a classification must not abort the import
        cl = Hash[ranks.zip(row[4].to_s.split("|"))]
        @data << { taxon_id: row[0],
                   local_id: row[0],
                   scientific_name: row[1],
                   rank: row[2],
                   source: row[7]
                 }.merge(cl)
      end
      @data
    end

    # Builds the core table and EML from @data and lets the parent class
    # write the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              'Creating DarwinCore Archive file')
      core_init
      eml_init
      DwcaHunter.logger_write(object_id, 'Assembling Core Data')
      @data.each_with_index do |d, i|
        count = i + 1
        if count % 10000 == 0
          DwcaHunter.logger_write(object_id, "Core row #{count}")
        end
        # the taxon id also serves as local id and accepted name usage id
        @core << [d[:taxon_id], d[:taxon_id], d[:taxon_id],
                  d[:scientific_name], d[:rank],
                  d[:class], d[:order], d[:family], d[:genus],
                  d[:source]]
      end
      super
    end

    def eml_init
      @eml = {
        id: @uuid,
        title: @title,
        authors: [],
        metadata_providers: [
          { first_name: "Jorrit",
            last_name: "Poelen",
          }
        ],
        # every piece but the last ends with a space so the joined sentence
        # keeps its word breaks
        abstract: "FishBase is a global species database of fish species " \
                  "(specifically finfish). It is the largest and the most " \
                  "extensively accessed online database of finfish",
        url: "http://www.fishbase.org"
      }
    end

    def core_init
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://globalnames.org/terms/localID",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonRank",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/genus",
                "http://purl.org/dc/terms/source"]]
    end
  end
end