dwca_hunter 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,350 @@
1
+ # encoding: utf-8
2
+ module DwcaHunter
3
+ class ResourceWikispecies < DwcaHunter::Resource
4
# Sets up the Wikispecies resource and then defers to the parent
# initializer.
#
# @param opts [Hash] resource options. +:url+ replaces the default dump
#   location; the whole hash is forwarded to +super+.
def initialize(opts = {})
  # Pages whose "name" section has no usable first line are recorded here
  # (see get_full_scientific_name). File.open is used rather than
  # Kernel#open, which treats a leading "|" as a command to spawn.
  @problems_file = File.open('problems.txt', 'w:utf-8')
  @command = "wikispecies"
  @title = 'Wikispecies'
  @url = opts[:url] || 'http://dumps.wikimedia.org/specieswiki/latest/' \
                       'specieswiki-latest-pages-articles.xml.bz2'
  @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
  @download_path = File.join(Dir.tmpdir,
                             'dwca_hunter',
                             'wikispecies',
                             'data.xml.bz2')
  @data = []        # one hash per taxon page (filled by process_species)
  @templates = {}   # template name => { parentName:, id: }
  @taxon_ids = {}   # page id => taxon id written to the core file
  @tree = {}        # nested hash of classification names
  @paths = {}       # "a|b|c" path strings already present in @tree
  @extensions = []
  @re = {
    page_start: /^\s*\<page\>\s*$/,
    page_end: /^\s*\<\/page\>\s*$/,
    template: /Template:/i,
    template_link: /\{\{([^\}]*)\}\}/,
    vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
  }
  super(opts)
end
31
+
32
# Unpacks the downloaded dump by delegating to +unpack_bz2+ (defined
# outside this class body).
def unpack
  unpack_bz2
end
35
+
36
# Runs the three build stages in order: parse the XML dump, extend every
# classification path through its parent templates, then write the archive.
def make_dwca
  enrich_data
  extend_classification
  generate_dwca
end
41
+
42
+ private
43
+
44
# Streams data.xml line by line, cuts it into <page>...</page> fragments and
# hands each parsed fragment to process_template or process_species.
#
# Changes the process working directory to @download_dir. The file is
# opened in block form so the handle is released even when parsing raises.
def enrich_data
  DwcaHunter::logger_write(object_id,
                           'Extracting data from xml file...')
  Dir.chdir(@download_dir)
  page_num = 0
  File.open('data.xml', 'r:utf-8') do |f|
    page_on = false
    page = +''
    f.each_line do |l|
      if l.match(@re[:page_start])
        page << l
        page_on = true
      elsif page_on
        page << l
        next unless l.match(@re[:page_end])

        page_on = false
        page_xml = Nokogiri::XML.parse(page)
        if template?(page_xml)
          process_template(page_xml)
        else
          process_species(page_xml)
        end
        page_num += 1
        if page_num % BATCH_SIZE == 0
          DwcaHunter::logger_write(object_id,
                                   "Traversed %s pages" % page_num)
        end
        page = +''
        # page_title/page_id memoize per page; clear them for the next one.
        @page_title = nil
        @page_id = nil
      end
    end
  end
  DwcaHunter::logger_write(object_id,
                           'Extracted total %s pages' % page_num)
end
79
+
80
# Extends every non-empty classification path in @data upwards by repeatedly
# prepending the parent recorded in @templates for the path's first element.
def extend_classification
  DwcaHunter::logger_write(object_id, 'Extending classifications')
  @data.each_with_index do |d, i|
    extend_classification_path(d) unless d[:classificationPath].empty?
    if i % BATCH_SIZE == 0 && i > 0
      DwcaHunter::logger_write(object_id,
                               "Extended %s classifications" % i)
    end
  end
end

# Walks one record's path up through @templates. When the first element has
# no template entry the finished path is registered with update_tree. If
# that point is not reached within 49 lookups (e.g. templates referencing
# each other in a cycle) the path is discarded and replaced by [].
def extend_classification_path(d)
  max_lookups = 49
  path = d[:classificationPath]
  max_lookups.times do
    parent = @templates[path.first]
    unless parent
      update_tree(path)
      return
    end
    path.unshift(parent[:parentName])
  end
  d[:classificationPath] = []
end
108
+
109
# Registers a classification path in @tree (a nested hash keyed by name) and
# remembers every prefix of it in @paths, keyed by the "|"-joined prefix.
# Prefixes that are already known are left untouched. The argument is not
# modified; the return value carries no meaning.
#
# @param path [Array<String>] names ordered from root to leaf
def update_tree(path)
  return if @paths.has_key?(path.join('|'))

  path.each_with_index do |name, depth|
    prefix_key = path[0..depth].join('|')
    next if @paths.has_key?(prefix_key)

    # Descend through the ancestors, which earlier iterations have created.
    parent_node = path[0...depth].inject(@tree) { |node, n| node[n] }
    parent_node[name] = {}
    @paths[prefix_key] = 1
  end
end
122
+
123
# Records a template page in @templates under its name (title without the
# "Template:" marker, underscores turned into spaces) together with the
# parent name taken from the first {{...}} link in the page text.
#
# The memoized page title is no longer modified in place, and an empty link
# such as "{{}}" now yields a nil parent instead of raising NoMethodError.
#
# @param x parsed page document (responds to +xpath+)
# @return [Hash, nil] the stored entry, or nil when the link is an "#if"
#   expression and the template is skipped
def process_template(x)
  name = page_title(x).gsub(@re[:template], '').strip.tr('_', ' ')
  text = x.xpath('//text').text.strip
  parent_name = nil
  if (link = text.match(@re[:template_link]))
    return if link[1].match(/\#if/)

    list = link[1].split('|')
    # "{{Taxonav|Parent}}" carries the parent as its first argument;
    # otherwise the template name itself is the parent.
    use_argument = list.size > 1 && list[0].match(/Taxonav/i)
    parent_name = use_argument ? list[1] : list[0]
    parent_name = parent_name.tr('_', ' ') if parent_name
  end
  @templates[name] = { parentName: parent_name, id: page_id(x) }
end
142
+
143
# Adds a record for a taxon page to @data and fills in its scientific name,
# vernacular names and initial classification path. Pages whose title
# contains "Wikispecies", and pages with neither a "name" nor a
# "taxonavigation" section, are ignored.
#
# @param x parsed page document
def process_species(x)
  return if page_title(x).match(/Wikispecies/i)

  items = find_species_components(x)
  return unless items

  @data << {
    taxonId: page_id(x),
    canonicalForm: page_title(x),
    scientificName: page_title(x),
    classificationPath: [],
    vernacularNames: []
  }
  get_full_scientific_name(items)
  get_vernacular_names(items)
  init_classification_path(items)
end
158
+
159
# Replaces the scientific name of the most recent @data record with the
# cleaned-up first line of the page's "name" section. When the section is
# present but has no lines, the record's canonical form is written to the
# problems file instead. Does nothing when there is no "name" section.
#
# @param items [Hash{String => Array<String>}] page sections from get_items
def get_full_scientific_name(items)
  names = items['name']
  return unless names

  record = @data[-1]
  if (name = names[0])
    record[:scientificName] = parse_name(name, record)
  else
    @problems_file.write("%s\n" % record[:canonicalForm])
  end
end
168
+
169
# Extracts vernacular names from the {{VN|lang=name|...}} template in the
# page's "vernacular names" section and stores them on the most recent
# @data record. Entries are kept only when both parts are present, the
# language code is shorter than four characters and the name is validly
# encoded. The record is left untouched when the section is missing, empty
# or contains no VN template.
#
# @param items [Hash{String => Array<String>}] page sections from get_items
def get_vernacular_names(items)
  lines = items['vernacular names']
  return unless lines && lines.size > 0

  vn = lines.join("").match(@re[:vernacular_names])
  return unless vn

  pairs = vn[1].strip.split("|").map { |item| item.split("=").map(&:strip) }
  @data[-1][:vernacularNames] =
    pairs.each_with_object([]) do |(language, name), vnames|
      next unless language && name && language.size < 4 &&
                  name.valid_encoding?

      vnames << { name: name, language: language }
    end
end
189
+
190
# Seeds the classification path of the most recent @data record with the
# first plain template link ({{Name}}, no "|" arguments) found in the
# page's "taxonavigation" section. [[...]] wiki links are ignored.
#
# The section lines are no longer edited in place, so the caller's items
# hash keeps its original text.
#
# @param items [Hash{String => Array<String>}] page sections from get_items
def init_classification_path(items)
  Array(items['taxonavigation']).each do |line|
    # ignore non-template links
    link = line.gsub(/\[\[.*\]\]/, '')[@re[:template_link], 1]
    next unless link

    link = link.strip.gsub(/Template:/, '').tr('_', ' ')
    next if link.include?('|')

    @data[-1][:classificationPath] << link
    break
  end
end
205
+
206
# Splits the page text into sections and returns them only when the page
# looks like a taxon page, i.e. it has a "name" or "taxonavigation" section.
#
# @param x parsed page document
# @return [Hash{String => Array<String>}, nil]
def find_species_components(x)
  items = get_items(x.xpath('//text').text)
  items if items.has_key?('name') || items.has_key?('taxonavigation')
end
213
+
214
# Splits wiki text into sections. A line containing "=Title=" (one or more
# "=" on each side) starts a section keyed by the stripped, lower-cased
# title; the non-empty lines that follow are collected under that key.
# Lines before the first heading are dropped.
#
# @param txt [String] raw wiki markup of one page
# @return [Hash{String => Array<String>}]
def get_items(txt)
  items = {}
  current_item = nil
  txt.split("\n").each do |l|
    heading = l.match(/[\=]+([^\=]+)[\=]+/)
    if heading
      current_item = heading[1].strip.downcase
      items[current_item] = []
    elsif current_item && !l.empty?
      items[current_item] << l
    end
  end
  items
end
229
+
230
# Text of the first <title> node of the page. The value is memoized in
# @page_title, so the argument is only consulted while that is unset.
def page_title(x)
  @page_title ||= x.xpath('//title').first.text
end
233
+
234
# Text of the first <id> node of the page. The value is memoized in
# @page_id, so the argument is only consulted while that is unset.
def page_id(x)
  @page_id ||= x.xpath('//id').first.text
end
237
+
238
# True when the page title contains "Template:" (case-insensitive).
#
# @param page_xml parsed page document
# @return [Boolean]
def template?(page_xml)
  !!page_title(page_xml).match(@re[:template])
end
241
+
242
# Turns one line of a page's "name" section into a plain scientific name
# by stripping wiki markup. The caller's string is left unmodified.
#
# @param name_string [String] raw wiki line
# @param taxa [Hash] record whose :canonicalForm replaces BASEPAGENAME
# @return [String] the cleaned name
def parse_name(name_string, taxa)
  name_string
    .gsub('BASEPAGENAME', taxa[:canonicalForm])
    .strip
    .gsub(/^\*\s*/, '')                               # list bullet
    .gsub(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')        # [[target|label]] -> label
    .gsub(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')        # {{tpl|arg}} -> last arg
    .gsub(/[']{2,}/, ' ')                             # italic/bold quotes
    .gsub(/["]{2,}/, ' ')
    .gsub(/\:\s*\d.*$/, '')                           # ": 123..." tail
    .gsub(/,\s*\[RSD\]/i, '')
    .gsub(/^\s*†\s*/, '')                             # leading dagger
    .gsub(/(:\s*)?\[http:[^\]]+\]/, '')               # external links
    .gsub(/\<nowiki\>.*$/, '')
    .gsub(/\<br\s*[\/]?\s*\>/, '')
    .gsub(/^\s*\&dagger;\s*/, '')
    .gsub(/&nbsp;/, ' ')
    .gsub(/\s+/, ' ')
    .strip
end
265
+
266
# Builds the core table (@core), the vernacular-name extension
# (@extensions) and the EML metadata (@eml) from @data, then hands over to
# the parent implementation.
def generate_dwca
  DwcaHunter::logger_write(object_id,
                           'Creating DarwinCore Archive file')
  @core = [
    ['http://rs.tdwg.org/dwc/terms/taxonID',
     'http://rs.tdwg.org/dwc/terms/scientificName',
     'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
     'http://globalnames.org/terms/canonicalForm',
     'http://rs.tdwg.org/dwc/terms/higherClassification',
     'http://purl.org/dc/terms/source']
  ]
  DwcaHunter::logger_write(object_id, 'Assembling Core Data')
  @data.each_with_index do |d, i|
    count = i + 1
    if count % BATCH_SIZE == 0
      DwcaHunter::logger_write(object_id,
                               "Traversing %s core data record" % count)
    end
    @core << wikispecies_core_row(d)
  end
  @extensions << { data: [[
    'http://rs.tdwg.org/dwc/terms/TaxonID',
    'http://rs.tdwg.org/dwc/terms/vernacularName',
    'http://purl.org/dc/terms/language'
  ]], file_name: 'vernacular_names.txt' }
  DwcaHunter::logger_write(object_id,
                           'Creating vernacular name extension for DarwinCore Archive file')
  @data.each_with_index do |d, i|
    count = i + 1
    if count % BATCH_SIZE == 0
      DwcaHunter::logger_write(object_id,
                               "Traversing %s extension data record" % count)
    end
    taxon_id = @taxon_ids[d[:taxonId]]
    next unless taxon_id

    d[:vernacularNames].each do |vn|
      @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
    end
  end
  @eml = {
    id: @uuid,
    title: @title,
    license: 'http://creativecommons.org/licenses/by-sa/3.0/',
    authors: [
      { first_name: 'Stephen',
        last_name: 'Thorpe',
        email: 'stephen_thorpe@yahoo.co.nz',
        url: 'http://species.wikimedia.org/wiki/Main_Page' }
    ],
    abstract: 'The free species directory that anyone can edit.',
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@mbl.edu' }
    ],
    url: 'http://species.wikimedia.org/wiki/Main_Page'
  }
  super
end

# Builds the core row for one @data record and records the taxon id chosen
# for it in @taxon_ids. The id of the template named by the last path
# element is used when such a template exists, otherwise the page id.
# Drops the record's own name from the end of its classification path.
def wikispecies_core_row(d)
  path = d[:classificationPath]
  own_template = path.empty? ? nil : @templates[path.last]
  taxon_id = own_template ? own_template[:id] : d[:taxonId]
  @taxon_ids[d[:taxonId]] = taxon_id
  parent_template = path.size > 1 ? @templates[path[-2]] : nil
  parent_name_usage_id = parent_template && parent_template[:id]
  # URI.encode no longer exists in Ruby 3; the default parser's escape is
  # the same routine it used to call.
  url = 'http://species.wikimedia.org/wiki/' +
        URI::DEFAULT_PARSER.escape(d[:canonicalForm].gsub(' ', '_'))
  path.pop if path[-1] == d[:canonicalForm]
  canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
  scientific_name = if d[:scientificName] == d[:canonicalForm]
                      canonical_form
                    else
                      d[:scientificName]
                    end
  [taxon_id,
   scientific_name,
   parent_name_usage_id,
   canonical_form,
   path.join('|'),
   url]
end
347
+
348
+ end
349
+ end
350
+
@@ -0,0 +1,176 @@
1
+ # encoding: utf-8
2
+ module DwcaHunter
3
+ class ResourceWoRMS < DwcaHunter::Resource
4
# Sets up the WoRMS resource and then defers to the parent initializer
# (which receives +opts+ through the bare +super+).
#
# The core header now names scientificName, parentNameUsageID and taxonRank
# under the Darwin Core namespace; they were previously given under the
# Dublin Core namespace, which defines no such terms.
#
# @param opts [Hash] resource options forwarded to the parent class
def initialize(opts = {})
  @command = 'worms'
  @title = 'WoRMS'
  @url = 'http://content60.eol.org/resources/26.tar.gz'
  @uuid = '9d27a7ad-2e6a-4597-a79b-23fb3b2f8284'
  @download_path = File.join(Dir.tmpdir,
                             'dwca_hunter',
                             'worms',
                             'data.tar.gz')
  # XML element names collected for every taxon. Positions 2..-2 form the
  # classification used by get_path.
  @fields = ['dc:identifier',
             'dc:source',
             'dwc:Kingdom',
             'dwc:Phylum',
             'dwc:Class',
             'dwc:Order',
             'dwc:Family',
             'dwc:Genus',
             'dwc:ScientificName']
  # Rank name by depth in the classification path.
  @rank = { 1 => 'kingdom',
            2 => 'phylum',
            3 => 'class',
            4 => 'order',
            5 => 'family',
            6 => 'genus',
            7 => 'species' }
  @known_paths = {}
  @data = []
  @extensions = []
  @extensions << { data: [[
    'http://rs.tdwg.org/dwc/terms/taxonId',
    'http://rs.tdwg.org/dwc/terms/scientificName'
  ]], file_name: 'synonyms.txt' }
  @re = {
    cdata: %r#\<\!\[CDATA\[(.*)\]\]\>#
  }
  @core = [[
    'http://rs.tdwg.org/dwc/terms/taxonID',
    'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
    'http://purl.org/dc/terms/source',
    'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
    'http://rs.tdwg.org/dwc/terms/scientificName',
    'http://rs.tdwg.org/dwc/terms/taxonRank'
  ]]
  super
end
48
+
49
# Unpacks the downloaded archive by delegating to +unpack_tar+ (defined
# outside this class body).
def unpack
  unpack_tar
end
52
+
53
# Runs the three build stages in order: read the taxa from the XML file,
# derive the core and synonym rows, then write the archive.
def make_dwca
  collect_data
  make_core_data
  generate_dwca
end
58
+
59
+ private
60
+
61
# Streams 26.xml from @download_dir and appends one hash per <taxon>
# element to @data. Each hash holds the first value seen for every name in
# @fields (keyed by symbol) plus a :synonyms array.
#
# The file is opened in block form so the handle is closed when reading
# ends; it was previously left open.
def collect_data
  DwcaHunter::logger_write(object_id, 'Traversing xml file...')
  xml_file = File.join(@download_dir, '26.xml')
  taxon = nil
  count = 0
  File.open(xml_file, 'r:utf-8') do |f|
    Nokogiri::XML::Reader(f).each do |node|
      if node.name == 'taxon'
        if taxon
          # Second "taxon" node of a pair: the element is complete.
          @data << taxon
          taxon = nil
          count += 1
          if count % BATCH_SIZE == 0
            DwcaHunter::logger_write(object_id,
                                     "Extracted %s taxons" % count)
          end
        else
          taxon = {}
          @fields.each { |field| taxon[field.to_sym] = nil }
          taxon[:synonyms] = []
        end
      elsif taxon
        store_taxon_node(taxon, node)
      end
    end
  end
end

# Copies one node found inside a <taxon> element into the taxon hash: a
# still-empty known field takes the node's CDATA payload (or its unescaped
# inner XML), and a <synonym> node with a CDATA payload is added to
# :synonyms. Any other node is ignored.
def store_taxon_node(taxon, node)
  item = node.name.to_sym
  if taxon.has_key?(item) && !taxon[item]
    text = node.inner_xml
    cdata = text.match(@re[:cdata])
    taxon[item] = cdata ? cdata[1] : DwcaHunter::XML.unescape(text)
  elsif node.name == 'synonym' &&
        (cdata = node.inner_xml.match(@re[:cdata]))
    taxon[:synonyms] << cdata[1]
  end
end
100
+
101
# Derives an identifier for a classification path: a version-5 UUID of the
# path string within GNA_NAMESPACE, URL-safe Base64 encoded without padding
# and prefixed with "gn:".
#
# @param path_string [String] "|"-joined classification path
# @return [String]
def get_gn_id(path_string)
  gn_uuid = UUID.create_v5(path_string, GNA_NAMESPACE)
  # padding: false drops the trailing "==" that 16 raw bytes produce.
  "gn:" + Base64.urlsafe_encode64(gn_uuid.raw_bytes, padding: false)
end
106
+
107
# Fills @core and the synonym extension from @data. Every taxon produces one
# core row with rank "species" whose parent id is derived from its full
# classification path, one extension row per synonym, and one core row for
# each level of its classification that has not been emitted before.
def make_core_data
  DwcaHunter::logger_write(object_id, 'Creating core data')
  @data.each_with_index do |taxa, i|
    if i % BATCH_SIZE == 0
      DwcaHunter::logger_write(object_id,
                               'Traversing %s species for core' % i)
    end
    path = get_path(taxa)
    @core << [taxa[:'dc:identifier'],
              get_gn_id(path.join('|')),
              taxa[:'dc:source'],
              nil,
              taxa[:'dwc:ScientificName'],
              'species']

    taxa[:synonyms].each do |synonym|
      @extensions[0][:data] << [taxa[:'dc:identifier'], synonym]
    end

    add_higher_taxa(path)
  end
end

# Emits a core row for every prefix of +path+ (longest first) that is not
# yet recorded in @known_paths. The row's id comes from the prefix, its
# parent id from the prefix minus its last element (nil at the top level),
# and its rank from the prefix length via @rank.
def add_higher_taxa(path)
  path.size.downto(1) do |depth|
    lineage = path.first(depth)
    path_string = lineage.join("|")
    next if @known_paths[path_string]

    @known_paths[path_string] = 1
    parent_id = depth == 1 ? nil : get_gn_id(lineage[0..-2].join('|'))
    @core << [get_gn_id(path_string), parent_id, nil, nil,
              lineage.last, @rank[depth]]
  end
end
140
+
141
# Classification of a taxon: its values for every entry of @fields except
# the first two and the last one, in @fields order. Missing values stay nil.
#
# @param taxa [Hash{Symbol => String, nil}] record built by collect_data
# @return [Array<String, nil>]
def get_path(taxa)
  @fields[2..-2].map { |field| taxa[field.to_sym] }
end
148
+
149
# Prepares the EML metadata (@eml) for the archive and hands over to the
# parent implementation.
def generate_dwca
  DwcaHunter::logger_write(object_id,
                           'Creating DarwinCore Archive file')
  @eml = {
    id: @uuid,
    title: @title,
    authors: [
      { email: 'info@marinespecies.org',
        url: 'http://www.marinespecies.org' }
    ],
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@gmail.com' }
    ],
    abstract: 'The aim of a World Register of Marine Species (WoRMS) ' \
              'is to provide an authoritative and comprehensive list ' \
              'of names of marine organisms, including information ' \
              'on synonymy. While highest priority goes to valid ' \
              'names, other names in use are included so that this ' \
              'register can serve as a guide to interpret taxonomic ' \
              'literature.'
  }
  super
end
174
+ end
175
+ end
176
+
@@ -0,0 +1,33 @@
1
module DwcaHunter
  # Probes a URL with an HTTP HEAD request at construction time and exposes
  # the response header.
  class Url
    # Response codes that count as a usable URL.
    VALID_CODES = %w[200 301 302].freeze

    attr_reader :net_http, :path, :header

    # @param url [String] address to probe; surrounding whitespace is ignored
    def initialize(url)
      @url = url
      @parsed_url = URI.parse(url.strip)
      @path = @parsed_url.path == '' ? '/' : @parsed_url.path
      @net_http = Net::HTTP.new(@parsed_url.host, @parsed_url.port)
      # Without this an https URL would be contacted in plain HTTP on
      # port 443 and the request would fail.
      @net_http.use_ssl = @parsed_url.scheme == 'https'
      @header = get_header
    end

    # confirm that the passed in URL is valid and responses with a proper code
    #
    # @return [Boolean]
    def valid?
      !@header.nil? && VALID_CODES.include?(@header.code)
    end

    # @return [Integer, nil] the header's content length, nil when the HEAD
    #   request produced no header
    def content_length
      header ? header.content_length : nil
    end

    private

    # Performs the HEAD request; a SocketError yields nil instead of raising.
    def get_header
      @net_http.head(@path)
    rescue SocketError
      nil
    end
  end
end
@@ -0,0 +1,7 @@
1
module DwcaHunter
  # Gem version string.
  VERSION = "0.5.0".freeze

  # @return [String] the gem version
  def self.version
    VERSION
  end
end
@@ -0,0 +1,33 @@
1
module DwcaHunter
  # Escaping helpers for the five predefined XML entities.
  module XML
    # Character => entity.
    ENTITIES = {
      '&' => '&amp;',
      '<' => '&lt;',
      '>' => '&gt;',
      "'" => '&apos;',
      '"' => '&quot;'
    }.freeze

    # Entity => character.
    CHARACTERS = ENTITIES.invert.freeze

    # Strips surrounding whitespace and replaces & < > ' " with their
    # entities.
    #
    # @param input [String]
    # @return [String] a new string; +input+ is not modified
    def self.escape(input)
      input.strip.gsub(/[&<>'"]/, ENTITIES)
    end

    # Strips surrounding whitespace and turns the five predefined entities
    # back into characters. Any other named entity (for example "&nbsp;") is
    # kept as written; it used to be deleted from the text.
    #
    # @param input [String]
    # @return [String] a new string; +input+ is not modified
    def self.unescape(input)
      input.strip.gsub(/&[a-z]+;/) { |entity| CHARACTERS.fetch(entity, entity) }
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "logger"
4
+ require "fileutils"
5
+ require "uri"
6
+ require "tmpdir"
7
+ require "net/http"
8
+ require "json"
9
+ require "dwc_archive"
10
+ require "dwca_hunter/resource"
11
+ require "rest_client"
12
+ require "base64"
13
+
14
# Load every file directly under dwca_hunter/, then every resource
# definition. The lists are sorted so the load order is the same on every
# platform and Ruby version (Dir[] results were not ordered before Ruby 3.0).
Dir[File.join(__dir__, "dwca_hunter", "*.rb")].sort.
  each { |f| require f }

Dir[File.join(__dir__, "dwca_hunter", "resources", "*.rb")].sort.
  each { |f| require f }
19
+
20
# DwcaHunter a namespace module for the project.
module DwcaHunter
  # Number of records processed between two progress log messages.
  BATCH_SIZE = 10_000

  class << self
    attr_reader :resource
    attr_writer :logger

    # @return [Logger] the configured logger; defaults to one created with
    #   Logger.new(nil)
    def logger
      @logger ||= Logger.new(nil)
    end

    # Replaces the current logger with a fresh Logger.new(nil).
    def logger_reset
      self.logger = Logger.new(nil)
    end

    # Logs "|obj_id|message|" at the given severity.
    #
    # @param obj_id [Object] identifier of the reporting object
    # @param message [String]
    # @param method [Symbol] logger method to call (:info, :warn, ...);
    #   only public logger methods can be invoked
    def logger_write(obj_id, message, method = :info)
      logger.public_send(method, "|#{obj_id}|#{message}|")
    end

    # Downloads and unpacks the resource when it reports that it needs to,
    # then builds its DarwinCore Archive.
    #
    # @param resource [DwcaHunter::Resource]
    def process(resource)
      resource.download if resource.needs_download?
      resource.unpack if resource.needs_unpack?
      resource.make_dwca
    end

    # @return [Array<Class>] every loaded subclass of Resource
    def resources
      ObjectSpace.each_object(Class).select do |c|
        c < Resource
      end
    end
  end
end