dwca_hunter 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
1
+ # encoding: utf-8
2
module DwcaHunter
  # Streams the resource behind a URL into a local file, optionally reporting
  # progress to a caller-supplied block.
  class Downloader
    attr_reader :url

    # @param source_url [String] address of the remote file
    # @param file_path [String] local path the payload is written to
    def initialize(source_url, file_path)
      @source_url = source_url
      @file_path = file_path
      @url = Url.new(source_url)
      @download_length = 0
      @filename = nil
    end

    # Downloads the remote file into the path given to the constructor.
    #
    # When a block is given it is called on every 100th received chunk with
    # the number of bytes written so far.
    #
    # @yieldparam received [Integer] bytes written so far
    # @return [Integer] total number of bytes written
    # @raise [RuntimeError] when the URL object reports itself as not valid
    def download
      raise "#{@source_url} is not accessible" unless @url.valid?

      chunks = 0
      # Block form guarantees the file handle is closed even when the
      # transfer raises half-way through.
      File.open(@file_path, 'wb') do |file|
        @url.net_http.request_get(@url.path) do |response|
          response.read_body do |chunk|
            @download_length += chunk.bytesize
            file.write(chunk)
            next unless block_given?

            chunks += 1
            yield @download_length if (chunks % 100).zero?
          end
        end
      end
      @download_length
    ensure
      # Reset the counter on success and on failure so that the same object
      # can be reused for another download.
      @download_length = 0
    end

    # Same as #download, but the progress callback receives a Hash with
    # :percentage, :elapsed_time (seconds) and :eta (seconds).
    #
    # Without a block it simply delegates to #download. When the header does
    # not report a positive content length the percentage is reported as 0.0
    # and the ETA as Float::INFINITY.
    #
    # @yieldparam progress [Hash] :percentage, :elapsed_time and :eta
    # @return [Integer] total number of bytes written
    def download_with_percentage
      return download unless block_given?

      start_time = Time.now
      total = nil
      download do |received|
        # Looked up lazily so that the validity check in #download still
        # runs before the header is touched.
        total ||= @url.header.content_length
        percentage = total && total > 0 ? received.to_f / total * 100 : 0.0
        elapsed_time = Time.now - start_time
        yield(percentage: percentage,
              elapsed_time: elapsed_time,
              eta: calculate_eta(percentage, elapsed_time))
      end
    end

    protected

    # Linear extrapolation of the remaining time.
    #
    # @param percentage [Float] share of the file already received (0..100)
    # @param elapsed_time [Float] seconds since the download started
    # @return [Float] estimated seconds left; never less than 1.0, and
    #   Float::INFINITY when no progress has been made yet
    def calculate_eta(percentage, elapsed_time)
      return Float::INFINITY unless percentage > 0

      eta = elapsed_time / percentage * 100 - elapsed_time
      eta <= 0 ? 1.0 : eta
    end
  end
end
@@ -0,0 +1,17 @@
1
module DwcaHunter
  # Helpers for normalising the character encoding of downloaded files.
  module Encoding
    # Writes a UTF-8 copy of a Latin-1 (ISO-8859-1) encoded text file next to
    # the original, using the original path with a '.utf_8' suffix.
    #
    # Characters that cannot be converted are replaced with '?'. The name of
    # the created file is printed to standard output.
    #
    # @param file_path [String] path of the Latin-1 encoded input file
    # @return [String] path of the newly written UTF-8 file
    def self.latin1_to_utf8(file_path)
      new_file = file_path + '.utf_8'
      puts "Creating %s" % new_file
      # File.open with blocks closes both handles even if the conversion
      # raises, and (unlike Kernel#open) never treats the path as a command.
      File.open(file_path, 'r:ISO-8859-1') do |reader|
        File.open(new_file, 'w:utf-8') do |writer|
          reader.each_line do |line|
            writer.write(line.encode('UTF-8', 'ISO-8859-1',
                                     invalid: :replace, replace: '?'))
          end
        end
      end
      new_file
    end
  end
end
@@ -0,0 +1,101 @@
1
require 'shellwords'

module DwcaHunter
  # Base class for data sources that are downloaded, unpacked and converted
  # into a DarwinCore Archive.
  #
  # Subclasses must assign @download_path (and normally @url, @title, ...)
  # BEFORE calling super from their initializer, because the download
  # directory is derived from @download_path here.
  class Resource
    attr_reader :url, :uuid, :download_path, :title, :abbr, :command

    # Extracts a zip archive with the external `unzip` tool.
    #
    # Note: when +dir+ is given the working directory of the whole process is
    # changed and is not restored afterwards.
    #
    # @param file [String] archive to extract
    # @param dir [String, nil] directory to switch to before extracting
    # @return [String] captured standard output of the command
    def self.unzip(file, dir = nil)
      Dir.chdir(dir) if dir
      `unzip -qq -u #{Shellwords.escape(file)} > /dev/null 2>&1`
    end

    # Decompresses a gzip file with the external `gunzip` tool. Counterpart
    # of .unzip that #unpack_gzip relies on.
    #
    # Note: when +dir+ is given the working directory of the whole process is
    # changed and is not restored afterwards.
    #
    # @param file [String] gzip file to decompress
    # @param dir [String, nil] directory to switch to before decompressing
    # @return [String] captured standard output of the command
    def self.gunzip(file, dir = nil)
      Dir.chdir(dir) if dir
      `gunzip -f #{Shellwords.escape(file)}`
    end

    # @param opts [Hash] :download and :unpack; each step is skipped only
    #   when the corresponding value is exactly +false+
    def initialize(opts)
      @needs_download = opts[:download] != false
      @needs_unpack = opts[:unpack] != false
      @download_dir, @download_file = File.split(@download_path)
      # Wipes and recreates the download directory.
      prepare_path if needs_download?
    end

    # @return [Boolean] whether the download step was requested
    def needs_download?
      @needs_download
    end

    # @return [Boolean] whether the unpack step was requested
    def needs_unpack?
      @needs_unpack
    end

    # Fetches #url into #download_path.
    #
    # Plain http:// URLs are streamed through DwcaHunter::Downloader and
    # progress is logged each time the whole-number percentage changes; any
    # other URL is handed to the external `curl` tool.
    def download
      DwcaHunter.logger_write(object_id, "Starting download of '%s'" % @url)
      percentage = 0
      if url.match(/^\s*http:\/\//)
        dlr = DwcaHunter::Downloader.new(url, @download_path)
        downloaded_length = dlr.download_with_percentage do |r|
          current = r[:percentage].to_i
          next if current == percentage

          percentage = current
          msg = "Downloaded %.0f%% in %.0f seconds ETA is %.0f seconds" %
                [percentage, r[:elapsed_time], r[:eta]]
          DwcaHunter.logger_write(object_id, msg)
        end
        DwcaHunter.logger_write(object_id,
                                "Download finished, Size: %s" %
                                downloaded_length)
      else
        # Both arguments are shell-escaped so that characters such as '&' or
        # spaces in the URL or path cannot alter the command.
        `curl -s #{Shellwords.escape(url.strip)} > #{Shellwords.escape(download_path)}`
      end
    end

    private

    # Strips surrounding whitespace and converts the value to an Integer when
    # it is a canonical integer literal. The argument itself is not modified.
    #
    # @param str [String]
    # @return [Integer, String]
    def cleanup(str)
      stripped = str.strip
      stripped.to_i.to_s == stripped ? stripped.to_i : stripped
    end

    # Removes the download directory with everything in it and recreates it.
    def prepare_path
      FileUtils.rm_rf(@download_dir)
      FileUtils.mkdir_p(@download_dir)
    end

    # Decompresses the downloaded file with `bunzip2`. Changes the process
    # working directory to the download directory.
    def unpack_bz2
      DwcaHunter.logger_write(object_id,
                              'Unpacking a bz2 file, it might take a while...')
      Dir.chdir(@download_dir)
      `bunzip2 #{Shellwords.escape(@download_file)}`
    end

    # Extracts the downloaded file with .unzip inside the download directory.
    def unpack_zip
      DwcaHunter.logger_write(object_id,
                              'Unpacking a zip file, it might take a while...')
      self.class.unzip(@download_file, @download_dir)
    end

    # Decompresses the downloaded file with .gunzip inside the download
    # directory.
    def unpack_gzip
      DwcaHunter.logger_write(object_id,
                              'Unpacking gzip file, it might take a while...')
      self.class.gunzip(@download_file, @download_dir)
    end

    # Extracts the downloaded gzip-compressed tarball with `tar`. Changes the
    # process working directory to the download directory.
    def unpack_tar
      DwcaHunter.logger_write(object_id,
                              'Unpacking a tar file, it might take a while...')
      Dir.chdir(@download_dir)
      `tar zxvf #{Shellwords.escape(@download_file)}`
    end

    # Packs @core, @extensions and @eml (all prepared by the subclass) into
    # 'dwca.tar.gz' inside the download directory.
    def generate_dwca
      gen = DarwinCore::Generator.new(File.join(@download_dir, 'dwca.tar.gz'))
      gen.add_core(@core, 'taxa.txt')
      @extensions.each do |extension|
        gen.add_extension(extension[:data],
                          extension[:file_name],
                          true,
                          extension[:row_type])
      end
      gen.add_meta_xml
      gen.add_eml_xml(@eml)
      gen.pack
      DwcaHunter.logger_write(object_id,
                              'DarwinCore Archive file is created')
    end
  end
end
@@ -0,0 +1,222 @@
1
+ # encoding: utf-8
2
module DwcaHunter
  # Converts the Arctos taxonomy export (taxonomy.csv, taxon_relations.csv
  # and common_name.csv) into a DarwinCore Archive.
  class ResourceArctos < DwcaHunter::Resource
    # @param opts [Hash] forwarded to DwcaHunter::Resource#initialize
    def initialize(opts = {})
      @command = 'arctos'
      @title = 'Arctos'
      @url = 'http://arctos.database.museum/download/gncombined.zip'
      # Must be @uuid (lower case): that is the variable the inherited
      # `uuid` reader and the EML metadata below actually read.
      @uuid = 'eea8315d-a244-4625-859a-226675622312'
      @download_path = File.join(Dir.tmpdir,
                                 'dwca_hunter',
                                 'arctos',
                                 'data.tar.gz')
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      super(opts)
      @gnub_dir = File.join(@download_dir, 'gnub')
    end

    # The download is a zip archive despite the name of the local file.
    def unpack
      unpack_zip
    end

    # Reads the unpacked CSV files and writes the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, 'Extracting data')
      get_names
      generate_dwca
    end

    private

    # Unzips any nested zip files that have no matching csv yet, then loads
    # names, synonyms and vernaculars (names first: synonyms look them up).
    def get_names
      Dir.chdir(@download_dir)
      Dir.entries(@download_dir).grep(/zip$/).each do |file|
        self.class.unzip(file) unless File.exist?(file.sub(/zip$/, 'csv'))
      end
      collect_names
      collect_synonyms
      collect_vernaculars
    end

    # Iterates over the data rows of a file in the download directory.
    # The first line is parsed as the header; every following line is split
    # into values and yielded together with the header index. Progress is
    # printed every 10000 lines. The file is closed when iteration ends.
    #
    # @param file_name [String] file inside the download directory
    # @param label [String] noun used in the progress message
    # @yieldparam row [Array<String>] values of the current line
    # @yieldparam fields [Hash{Symbol=>Integer}] column name => position
    def each_data_row(file_name, label)
      fields = {}
      path = File.join(@download_dir, file_name)
      File.foreach(path).with_index do |line, i|
        if i.zero?
          fields = get_fields(line)
          next
        end
        yield split_row(line), fields
        puts "Processed %s %s" % [i, label] if (i % 10_000).zero?
      end
    end

    # Fills @vernaculars from common_name.csv.
    def collect_vernaculars
      each_data_row('common_name.csv', 'vernaculars') do |row, fields|
        @vernaculars << {
          taxon_id: row[fields[:taxon_name_id]],
          vernacular_name_string: row[fields[:common_name]]
        }
      end
    end

    # Fills @synonyms from taxon_relations.csv. Requires @names_index, which
    # is built by #collect_names.
    def collect_synonyms
      each_data_row('taxon_relations.csv', 'synonyms') do |row, fields|
        taxon_id = row[fields[:taxon_name_id]]
        @synonyms << {
          taxon_id: row[fields[:related_taxon_name_id]],
          local_id: taxon_id,
          name_string: @names_index[taxon_id],
          taxonomic_status: row[fields[:taxon_relationship]]
        }
      end
    end

    # Fills @names and @names_index (taxon id => name) from taxonomy.csv.
    # Lines without a display name (including blank lines) are skipped.
    def collect_names
      @names_index = {}
      each_data_row('taxonomy.csv', 'names') do |row, fields|
        # Checked on the split values; checking the raw line would only
        # test a single character of the text.
        display_name = row[fields[:display_name]]
        next unless display_name

        taxon_id = row[fields[:taxon_name_id]]
        name_string = display_name.gsub(/<\/?i>/, '')

        @names << { taxon_id: taxon_id,
                    local_id: taxon_id,
                    name_string: name_string,
                    kingdom: row[fields[:kingdom]],
                    phylum: row[fields[:phylum]],
                    klass: row[fields[:phylclass]],
                    order: row[fields[:phylorder]],
                    family: row[fields[:family]],
                    genus: row[fields[:genus]],
                    code: row[fields[:nomenclatural_code]] }

        @names_index[taxon_id] = name_string
      end
    end

    # Splits one line of the form "a","b","c" into its values. This is a
    # plain split on '","', not a full CSV parser.
    #
    # @param row [String]
    # @return [Array<String>]
    def split_row(row)
      row.strip.sub(/\A"/, '').sub(/"\z/, '').split('","')
    end

    # Builds a column-name => position index from a comma-separated header
    # line. Names are stripped, downcased and reduced to ASCII (anything
    # that cannot be represented is dropped).
    #
    # @param row [String] header line
    # @return [Hash{Symbol=>Integer}]
    def get_fields(row)
      encoding_options = {
        invalid: :replace,
        undef: :replace,
        replace: '',
        universal_newline: true
      }
      names = row.split(',').map do |field|
        # Options are splatted so they are passed as keywords rather than
        # as a positional Hash.
        field.strip.downcase
             .encode(::Encoding.find('ASCII'), **encoding_options)
             .to_sym
      end
      Hash[names.each_with_index.to_a]
    end

    # Assembles @core, @extensions and @eml from the collected data and lets
    # the parent class write the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              'Creating DarwinCore Archive file')
      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://globalnames.org/terms/localID',
                'http://rs.tdwg.org/dwc/terms/scientificName',
                'http://rs.tdwg.org/dwc/terms/kingdom',
                'http://rs.tdwg.org/dwc/terms/phylum',
                'http://rs.tdwg.org/dwc/terms/class',
                'http://rs.tdwg.org/dwc/terms/order',
                'http://rs.tdwg.org/dwc/terms/family',
                'http://rs.tdwg.org/dwc/terms/genus',
                'http://rs.tdwg.org/dwc/terms/nomenclaturalCode']]
      @names.each do |n|
        @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
                  n[:genus], n[:code]]
      end

      vernacular_rows = [['http://rs.tdwg.org/dwc/terms/taxonID',
                          'http://rs.tdwg.org/dwc/terms/vernacularName']]
      @vernaculars.each do |v|
        vernacular_rows << [v[:taxon_id], v[:vernacular_name_string]]
      end
      @extensions << {
        data: vernacular_rows,
        file_name: 'vernacular_names.txt',
        row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
      }

      synonym_rows = [['http://rs.tdwg.org/dwc/terms/taxonID',
                       'http://globalnames.org/terms/localID',
                       'http://rs.tdwg.org/dwc/terms/scientificName',
                       'http://rs.tdwg.org/dwc/terms/taxonomicStatus']]
      @synonyms.each do |s|
        synonym_rows << [s[:taxon_id], s[:local_id],
                         s[:name_string], s[:taxonomic_status]]
      end
      # NOTE(review): no :row_type is given for this extension, so nil is
      # handed to the generator -- confirm that this is intended.
      @extensions << {
        data: synonym_rows,
        file_name: 'synonyms.txt'
      }

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { email: 'dustymc at gmail dot com' }
        ],
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@gmail.com' }
        ],
        abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
        url: @url
      }
      super
    end
  end
end
222
+
@@ -0,0 +1,160 @@
1
module DwcaHunter
  # Builds a DarwinCore Archive from the BirdLife checklist CSV that ships
  # with the gem (files/birdlife_7.csv); nothing is downloaded or unpacked.
  class ResourceBirdLife < DwcaHunter::Resource
    # @param opts [Hash] forwarded to DwcaHunter::Resource#initialize
    def initialize(opts = {})
      @command = "bird-life"
      @title = "BirdLife International"
      @uuid = "b1d8de7a-ab96-455f-acd8-f3fff2d7d169"
      @data = []
      @extensions = []
      @url = "http://www.birdlife.org/datazone/userfiles"\
             "/file/Species/Taxonomy/BirdLife_Checklist_Version_70.zip"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "birdlife",
                                 "fake.zip")
      # Maps a clade name to { id: <taxonID of its core row> }.
      @clades = {}
      super
    end

    # There is no archive to unpack for this resource.
    def needs_unpack?
      false
    end

    # Intentionally a no-op: the data is read from the bundled CSV file.
    def download
    end

    # Reads the bundled CSV and writes the DarwinCore Archive.
    def make_dwca
      organize_data
      generate_dwca
    end

    private

    # Prepares @core, @extensions and @eml, feeds every record through
    # #process and lets the parent class write the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              'Creating DarwinCore Archive file')
      core_init
      extensions_init
      eml_init
      @data.each { |rec| process(rec) }
      super
    end

    # Starts @core with the header row and a root row for the class Aves
    # (taxonID 1). @count holds the last taxonID handed out.
    def core_init
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://globalnames.org/terms/localID",
                "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
                "http://rs.tdwg.org/dwc/terms/taxonRank"]]
      @count = 1
      @core << [@count, nil, nil, @count, "Aves", nil, "class"]
    end

    # Adds one record to @core: its order and family (each only once), the
    # taxon itself, its synonyms, and its vernacular names.
    #
    # @param rec [Hash] record produced by #collect_data
    def process(rec)
      parent_id = 1
      [:order, :family].each do |rank|
        parent_id = clade_id_for(rec[rank], rank, parent_id)
      end
      @count += 1
      @core << [@count, rec[:local_id], parent_id, @count,
                rec[:scientific_name], nil, rec[:rank]]
      taxon = @core.last
      process_synonyms(rec, taxon)
      process_vernaculars(rec, taxon)
    end

    # Returns the taxonID of a clade, registering it and appending its core
    # row the first time the name is seen. Appending only on first sight
    # keeps taxonIDs in the core file unique.
    #
    # @param name [String] clade name
    # @param rank [Symbol] :order or :family
    # @param parent_id [Integer] taxonID of the parent row
    # @return [Integer] taxonID of the clade
    def clade_id_for(name, rank, parent_id)
      clade = @clades[name]
      unless clade
        @count += 1
        clade = @clades[name] = { id: @count }
        @core << [clade[:id], nil, parent_id, clade[:id], name, nil,
                  rank.to_s]
      end
      clade[:id]
    end

    # Appends one core row per synonym, pointing at the accepted taxon row.
    #
    # @param rec [Hash] record with a :synonyms Array
    # @param taxon [Array] core row of the accepted taxon
    def process_synonyms(rec, taxon)
      rec[:synonyms].each do |syn|
        @count += 1
        @core << [@count, nil, taxon[2], taxon[0], syn, "synonym", taxon[-1]]
      end
    end

    # Appends the record's vernacular names to the first extension; the
    # language is always recorded as "en".
    #
    # @param rec [Hash] record with a :vernaculars Array
    # @param taxon [Array] core row of the taxon
    def process_vernaculars(rec, taxon)
      taxon_id = taxon[0]
      rec[:vernaculars].each do |name|
        @extensions[0][:data] << [taxon_id, name, "en"]
      end
    end

    # Registers the vernacular-name extension with its header row.
    def extensions_init
      @extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
                               "http://rs.tdwg.org/dwc/terms/vernacularName",
                               "http://purl.org/dc/terms/language"]],
                       file_name: "vernacular_names.txt",
                       row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }
    end

    # Loads the bundled checklist into @data.
    def organize_data
      DwcaHunter.logger_write(object_id, "Organizing data")
      path = File.join(__dir__, "..", "..", "files", "birdlife_7.csv")
      collect_data(path, headers: true, header_converters: :symbol)
    end

    # Parses the checklist CSV into an Array of record Hashes stored in
    # @data. All-uppercase order names are capitalized.
    #
    # @param path [String] CSV file to read
    # @param opts [Hash] CSV parsing options
    def collect_data(path, opts)
      # CSV.foreach closes the file when done; options are splatted so they
      # are passed as keywords rather than as a positional Hash.
      @data = CSV.foreach(path, **opts).map do |row|
        order = row[:order]
        order = order.capitalize if order && order.match(/^[A-Z]+$/)
        scientific_name = [row[:scientificname], row[:authority]]
                          .join(" ").strip.gsub(/[\s]+/, " ")
        rank = row[:taxonomictreatment] == "R" ? "species" : "not recognized"
        { order: order, family: row[:familyname], rank: rank,
          scientific_name: scientific_name, synonyms: collect_synonyms(row),
          local_id: row[:sisrecid], vernaculars: collect_vernaculars(row) }
      end
    end

    # @param row [#[]] CSV row
    # @return [Array<String>] the ';'-separated :synonyms values, stripped
    def collect_synonyms(row)
      synonyms = row[:synonyms]
      synonyms ? synonyms.split(";").map(&:strip) : []
    end

    # @param row [#[]] CSV row
    # @return [Array<String>] the common name followed by the ';'-separated
    #   alternative common names
    def collect_vernaculars(row)
      names = row[:commonname] ? [row[:commonname]] : []
      other = row[:alternativecommonnames]
      names += other.split(";").map(&:strip) if other
      names
    end

    # Fills @eml with the archive metadata.
    def eml_init
      @eml = {
        id: @uuid,
        title: @title,
        authors: [],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "BirdLife is widely recognised as the world leader in bird "\
                  "conservation. Rigorous science informed by practical "\
                  "feedback from projects on the ground in important sites "\
                  "and habitats enables us to implement successful "\
                  "conservation programmes for birds and all nature.",
        url: "http://www.birdlife.org/"
      }
    end
  end
end
@@ -0,0 +1,99 @@
1
module DwcaHunter
  # Resource for FishBase. Builds a DarwinCore Archive from the taxon cache
  # TSV that ships with the gem (files/fishbase_taxon_cache.tsv).
  class ResourceFishbase < DwcaHunter::Resource
    attr_reader :title, :abbr

    # @param opts [Hash] forwarded to DwcaHunter::Resource#initialize
    def initialize(opts = {})
      @command = "fishbase"
      @title = "FishBase Cache"
      @abbr = "FishBase Cache"
      @uuid = "bacd21f0-44e0-43e2-914c-70929916f257"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "fishbase",
                                 "fishbase.tsv")
      @extensions = []
      super
    end

    # Copies the bundled TSV into the download path instead of fetching
    # anything over the network.
    def download
      FileUtils.cp(File.join(__dir__, "..", "..", "files",
                             "fishbase_taxon_cache.tsv"), @download_path)
    end

    # Intentionally a no-op: the TSV needs no unpacking.
    def unpack
    end

    # Reads the TSV and writes the DarwinCore Archive.
    def make_dwca
      organize_data
      generate_dwca
    end

    private

    # Parses the TSV into @data, one Hash per row. Column 4 holds the
    # '|'-separated classification, which is mapped onto the rank names
    # below in order; a missing classification yields nil for every rank.
    def organize_data
      ranks = %i(class order family sub_family genus species)
      DwcaHunter.logger_write(object_id, "Organizing data")
      # CSV.foreach closes the file once all rows have been read.
      @data = CSV.foreach(@download_path, col_sep: "\t").map do |row|
        classification = Hash[ranks.zip(row[4].to_s.split("|"))]
        { taxon_id: row[0],
          local_id: row[0],
          scientific_name: row[1],
          rank: row[2],
          source: row[7] }.merge(classification)
      end
    end

    # Builds @core and @eml from @data and lets the parent class write the
    # archive. Progress is logged every 10000 rows.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              'Creating DarwinCore Archive file')
      core_init
      eml_init
      DwcaHunter.logger_write(object_id, 'Assembling Core Data')
      @data.each.with_index(1) do |d, count|
        if (count % 10_000).zero?
          DwcaHunter.logger_write(object_id, "Core row #{count}")
        end
        @core << [d[:taxon_id], d[:taxon_id], d[:taxon_id],
                  d[:scientific_name], d[:rank],
                  d[:class], d[:order], d[:family], d[:genus],
                  d[:source]]
      end
      super
    end

    # Fills @eml with the archive metadata.
    def eml_init
      @eml = {
        id: @uuid,
        title: @title,
        authors: [],
        metadata_providers: [
          { first_name: "Jorrit",
            last_name: "Poelen" }
        ],
        # Each fragment ends with a space so the joined sentence reads
        # correctly.
        abstract: "FishBase is a global species database of fish species " \
                  "(specifically finfish). It is the largest and the most " \
                  "extensively accessed online database of finfish",
        url: "http://www.fishbase.org"
      }
    end

    # Starts @core with the header row.
    def core_init
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://globalnames.org/terms/localID",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonRank",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/genus",
                "http://purl.org/dc/terms/source"]]
    end
  end
end