dwca_hunter 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ # encoding: utf-8
2
+ module DwcaHunter
3
+ class ResourceWikispecies < DwcaHunter::Resource
4
# Sets the Wikispecies-specific defaults (dump URL, download location,
# parsing regexes and empty accumulators) and then passes +opts+ on to the
# parent initializer.
#
# opts - Hash of options; a truthy opts[:url] replaces the default dump URL.
#
# Side effect: opens 'problems.txt' for writing in the current working
# directory and keeps the handle in @problems_file.
def initialize(opts = {})
  @problems_file = open('problems.txt', 'w:utf-8')
  @command = 'wikispecies'
  @title = 'Wikispecies'
  @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
  @url = opts[:url] ||
         'http://dumps.wikimedia.org/specieswiki/latest/' \
         'specieswiki-latest-pages-articles.xml.bz2'
  @download_path = File.join(Dir.tmpdir, 'dwca_hunter', 'wikispecies', 'data.xml.bz2')

  # Accumulators filled while the dump is traversed.
  @data = []
  @extensions = []
  @templates, @taxon_ids, @tree, @paths = {}, {}, {}, {}

  @re = {
    page_start: /^\s*\<page\>\s*$/,
    page_end: /^\s*\<\/page\>\s*$/,
    template: /Template:/i,
    template_link: /\{\{([^\}]*)\}\}/,
    vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
  }
  super(opts)
end
31
+
32
# Unpacks the downloaded dump by delegating to unpack_bz2, which is not
# defined in this class (presumably inherited from DwcaHunter::Resource).
def unpack
  unpack_bz2
end
35
+
36
# Runs the full conversion: parse the dump, extend every classification
# path up to its root, then write the archive. Returns whatever
# generate_dwca returns.
def make_dwca
  %i[enrich_data extend_classification].each { |step| send(step) }
  generate_dwca
end
41
+
42
+ private
43
+
44
# Streams data.xml from @download_dir line by line, collects the lines of
# each <page>...</page> element, parses the page with Nokogiri and hands it
# to process_template or process_species depending on template?.
#
# The per-page memo (@page_title / @page_id) is cleared after every page.
# The file is opened with the block form so the handle is closed even when
# page processing raises.
#
# Side effect: changes the process working directory to @download_dir.
def enrich_data
  DwcaHunter::logger_write(self.object_id,
                           'Extracting data from xml file...')
  Dir.chdir(@download_dir)
  page_num = 0
  File.open('data.xml', 'r:utf-8') do |f|
    page_on = false
    page = +''
    f.each do |l|
      if l.match(@re[:page_start])
        page << l
        page_on = true
      elsif page_on
        page << l
        next unless l.match(@re[:page_end])

        page_on = false
        page_xml = Nokogiri::XML.parse(page)
        if template?(page_xml)
          process_template(page_xml)
        else
          process_species(page_xml)
        end
        page_num += 1
        if page_num % BATCH_SIZE == 0
          DwcaHunter::logger_write(self.object_id,
                                   "Traversed %s pages" % page_num)
        end
        # Start a fresh buffer and drop the memoized title/id of this page.
        page = +''
        @page_title = nil
        @page_id = nil
      end
    end
  end
  DwcaHunter::logger_write(self.object_id,
                           'Extracted total %s pages' % page_num)
end
79
+
80
# Walks every record's classification path upwards through @templates,
# prepending each template's :parentName until a name with no template is
# reached; the finished path is then registered with update_tree.
#
# At most 49 lookups are made per record. A path that is still unresolved
# after that (for example a cycle of templates) is replaced by an empty
# array.
def extend_classification
  DwcaHunter::logger_write(self.object_id, 'Extending classifications')
  @data.each_with_index do |record, index|
    lineage = record[:classificationPath]
    unless lineage.empty?
      rooted = false
      49.times do
        ancestor = @templates[lineage.first]
        unless ancestor
          update_tree(lineage)
          rooted = true
          break
        end
        lineage.unshift(ancestor[:parentName])
      end
      record[:classificationPath] = [] unless rooted
    end
    if index > 0 && index % BATCH_SIZE == 0
      DwcaHunter::logger_write(self.object_id,
                               "Extended %s classifications" % index)
    end
  end
end
108
+
109
# Registers +path+ (an array of names, root first) in the nested @tree
# hash, creating an empty child hash for every prefix not seen before.
# Known prefixes are tracked in @paths under their '|'-joined form, so
# re-registering a path never resets an existing subtree.
def update_tree(path)
  segments = path.dup
  return if @paths.key?(segments.join('|'))

  (0...segments.size).each do |last|
    prefix_key = segments[0..last].join('|')
    next if @paths.key?(prefix_key)

    # Descend to the parent node of the new element, then attach it.
    parent_node = segments[0...last].inject(@tree) { |node, label| node[label] }
    parent_node[segments[last]] = {}
    @paths[prefix_key] = 1
  end
end
122
+
123
# Records a template page in @templates, keyed by the template name with
# the "Template:" marker removed and underscores turned into spaces.
#
# The parent name is taken from the first {{...}} link in the page text:
# the second '|'-separated element when the first one matches "Taxonav",
# otherwise the first element. Pages whose first link contains "#if" are
# skipped. A page without a link, or with an empty link ({{}}), is stored
# with a nil parent instead of raising.
#
# The memoized page title is not modified (gsub instead of gsub!), which
# also avoids a nil result when nothing is substituted.
def process_template(x)
  name = page_title(x).gsub(@re[:template], '').strip.gsub(/_/, ' ')
  text = x.xpath('//text').text.strip
  link = text.match(@re[:template_link])
  parent_name = nil
  if link
    return if link[1].match(/\#if/)

    list = link[1].split('|')
    parent_name = if list.size > 1 && list[0].match(/Taxonav/i)
                    list[1]
                  else
                    list[0]
                  end
  end
  parent_name = parent_name.gsub(/_/, ' ') if parent_name
  @templates[name] = { parentName: parent_name, id: page_id(x) }
end
142
+
143
# Appends a record for a non-template page to @data when the page has a
# "name" or "taxonavigation" section, then fills in its scientific name,
# vernacular names and initial classification path. Pages whose title
# matches /Wikispecies/i are ignored.
def process_species(x)
  return if page_title(x).match(/Wikispecies/i)

  items = find_species_components(x)
  return unless items

  record = {
    taxonId: page_id(x),
    canonicalForm: page_title(x),
    scientificName: page_title(x),
    classificationPath: [],
    vernacularNames: []
  }
  @data << record
  # The helpers below all operate on the record just appended (@data[-1]).
  get_full_scientific_name(items)
  get_vernacular_names(items)
  init_classification_path(items)
end
158
+
159
# Replaces the scientific name of the latest record with the cleaned-up
# first line of the page's "name" section. When the section exists but has
# no lines, the record's canonical form is written to the problems file.
def get_full_scientific_name(items)
  name_lines = items['name']
  return unless name_lines

  latest = @data[-1]
  first_line = name_lines[0]
  if first_line
    latest[:scientificName] = parse_name(first_line, latest)
  else
    @problems_file.write("%s\n" % latest[:canonicalForm])
  end
end
168
+
169
# Extracts language/name pairs from the {{VN|...}} template found in the
# page's "vernacular names" section and stores them on the latest record.
# A pair is kept only when both parts are present, the language code is
# shorter than four characters and the name has a valid encoding.
def get_vernacular_names(items)
  section = items['vernacular names']
  return unless section && section.size > 0

  found = section.join("").match(@re[:vernacular_names])
  return unless found

  vnames = []
  found[1].strip.split("|").each do |pair|
    language, name = pair.split("=").map(&:strip)
    next unless language && name
    next unless language.size < 4 && name.valid_encoding?

    vnames << { name: name, language: language }
  end
  @data[-1][:vernacularNames] = vnames
end
189
+
190
# Seeds the latest record's classification path with the first template
# link in the "taxonavigation" section that carries no '|' arguments.
# Wiki links ([[...]]) are stripped from each inspected line in place
# before it is searched.
def init_classification_path(items)
  nav_lines = items['taxonavigation']
  return unless nav_lines

  nav_lines.each do |line|
    line.gsub!(/\[\[.*\]\]/, '') # ignore non-template links
    found = line.match(@re[:template_link])
    next unless found

    candidate = found[1].strip.gsub(/Template:/, '').gsub(/_/, ' ')
    next if candidate.match(/\|/)

    @data[-1][:classificationPath] << candidate
    break
  end
end
205
+
206
# Splits the page text into sections and returns them when the page looks
# like a taxon page, i.e. it has a "name" or a "taxonavigation" section.
# Returns nil otherwise.
def find_species_components(x)
  sections = get_items(x.xpath('//text').text)
  sections if sections.key?('name') || sections.key?('taxonavigation')
end
213
+
214
# Splits wiki text into sections keyed by the lower-cased heading text.
# Each value is the array of non-empty lines that follow the heading.
# Lines before the first heading are dropped.
#
# A heading is only recognised when the '=' run starts the line (after
# optional whitespace). Without that anchor, any line containing "=x="
# (a URL query string, or a one-line {{VN|en=..|de=..}} template) would be
# taken for a heading and swallow the content of the real section.
def get_items(txt)
  items = {}
  current_item = nil
  txt.split("\n").each do |l|
    heading = l.match(/\A\s*=+([^=]+)=+/)
    if heading
      current_item = heading[1].strip.downcase
      items[current_item] = []
    elsif current_item && !l.empty?
      items[current_item] << l
    end
  end
  items
end
229
+
230
# Returns the text of the first <title> node of page +x+. The value is
# memoized in @page_title until that variable is reset to nil.
def page_title(x)
  return @page_title if @page_title

  @page_title = x.xpath('//title').first.text
end
233
+
234
# Returns the text of the first <id> node of page +x+. The value is
# memoized in @page_id until that variable is reset to nil.
def page_id(x)
  return @page_id if @page_id

  @page_id = x.xpath('//id').first.text
end
237
+
238
# True when the page title matches the template marker regex
# (@re[:template]), false otherwise.
def template?(page_xml)
  page_title(page_xml).match(@re[:template]) ? true : false
end
241
+
242
# Turns one wiki-markup line from a "name" section into a plain
# scientific name string.
#
# name_string - the raw line; it is NOT modified (all work happens on a
#               copy, so frozen strings are accepted).
# taxa        - record hash; taxa[:canonicalForm] replaces the
#               BASEPAGENAME magic word.
#
# Returns the cleaned name with collapsed whitespace.
def parse_name(name_string, taxa)
  name = name_string.gsub('BASEPAGENAME', taxa[:canonicalForm]).strip
  name.gsub!(/^\*\s*/, '')                              # leading list bullet
  name.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')       # [[target|label]] -> label
  name.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')       # {{tpl|..|last}} -> last
  name.gsub!(/[']{2,}/, ' ')                            # italic/bold quote runs
  name.gsub!(/["]{2,}/, ' ')
  name.gsub!(/\:\s*\d.*$/, '')                          # ": 123..." trailing part
  name.gsub!(/,\s*\[RSD\]/i, '')
  name.gsub!(/^\s*†\s*/, '')                            # leading dagger character
  name.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')              # external http links
  name.gsub!(/\<nowiki\>.*$/, '')
  name.gsub!(/\<br\s*[\/]?\s*\>/, '')
  name.gsub!(/^\s*\&dagger;\s*/, '')                    # leading dagger entity
  name.gsub!(/&nbsp;/, ' ')
  name.gsub!(/\s+/, ' ')
  name.strip
end
265
+
266
# Builds the archive inputs from the collected records and then calls the
# parent implementation via super:
#
# * @core       - header row plus one row per record in @data
# * @extensions - a vernacular-name extension table
# * @eml        - dataset metadata
#
# A record's taxon id is the page id of the last template in its
# classification path, falling back to the record's own page id when the
# path is empty or that template is unknown. The parent id is the id of
# the second-to-last template, or nil.
#
# The page URL is percent-encoded with URI::DEFAULT_PARSER.escape, which
# is what URI.encode delegated to before it was removed in Ruby 3.0.
def generate_dwca
  DwcaHunter::logger_write(self.object_id,
                           'Creating DarwinCore Archive file')
  @core = [
    ['http://rs.tdwg.org/dwc/terms/taxonID',
     'http://rs.tdwg.org/dwc/terms/scientificName',
     'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
     'http://globalnames.org/terms/canonicalForm',
     'http://rs.tdwg.org/dwc/terms/higherClassification',
     'http://purl.org/dc/terms/source']
  ]
  DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
  @data.each_with_index do |d, index|
    count = index + 1
    if count % BATCH_SIZE == 0
      DwcaHunter::logger_write(self.object_id,
                               "Traversing %s core data record" % count)
    end
    path = d[:classificationPath]

    own_template = path.empty? ? nil : @templates[path.last]
    taxon_id = own_template ? own_template[:id] : d[:taxonId]
    @taxon_ids[d[:taxonId]] = taxon_id

    parent_template = path.size > 1 ? @templates[path[-2]] : nil
    parent_name_usage_id = parent_template && parent_template[:id]

    url = 'http://species.wikimedia.org/wiki/' +
          URI::DEFAULT_PARSER.escape(d[:canonicalForm].gsub(' ', '_'))

    # The ids above are resolved first; only then is the taxon's own name
    # dropped from the end of its higher classification.
    path.pop if path[-1] == d[:canonicalForm]
    canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
    scientific_name = if d[:scientificName] == d[:canonicalForm]
                        canonical_form
                      else
                        d[:scientificName]
                      end
    @core << [taxon_id,
              scientific_name,
              parent_name_usage_id,
              canonical_form,
              path.join('|'),
              url]
  end

  @extensions << { data: [[
    'http://rs.tdwg.org/dwc/terms/TaxonID',
    'http://rs.tdwg.org/dwc/terms/vernacularName',
    'http://purl.org/dc/terms/language'
  ]], file_name: 'vernacular_names.txt' }
  DwcaHunter::logger_write(self.object_id,
    'Creating verncaular name extension for DarwinCore Archive file')
  @data.each_with_index do |d, index|
    count = index + 1
    if count % BATCH_SIZE == 0
      DwcaHunter::logger_write(self.object_id,
                               "Traversing %s extension data record" % count)
    end
    taxon_id = @taxon_ids[d[:taxonId]]
    next unless taxon_id

    d[:vernacularNames].each do |vn|
      @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
    end
  end

  @eml = {
    id: @uuid,
    title: @title,
    license: 'http://creativecommons.org/licenses/by-sa/3.0/',
    authors: [
      { first_name: 'Stephen',
        last_name: 'Thorpe',
        email: 'stephen_thorpe@yahoo.co.nz',
        url: 'http://species.wikimedia.org/wiki/Main_Page' }],
    abstract: 'The free species directory that anyone can edit.',
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@mbl.edu' }],
    url: 'http://species.wikimedia.org/wiki/Main_Page'
  }
  super
end
347
+
348
+ end
349
+ end
350
+
@@ -0,0 +1,176 @@
1
+ # encoding: utf-8
2
+ module DwcaHunter
3
+ class ResourceWoRMS < DwcaHunter::Resource
4
# Sets the WoRMS-specific defaults (source URL, download location, the XML
# fields to collect, the rank name for each classification depth, and the
# header rows of the core and synonym tables), then calls the parent
# initializer with the same arguments.
#
# NOTE(review): several core term URIs use the purl.org/dc/terms namespace
# for Darwin Core terms (parentNameUsageID, scientificName, taxonRank), and
# the extension header spells "taxonId". They are kept exactly as found;
# confirm against consumers before changing.
def initialize(opts = {})
  @command = 'worms'
  @title = 'WoRMS'
  @url = 'http://content60.eol.org/resources/26.tar.gz'
  @uuid = '9d27a7ad-2e6a-4597-a79b-23fb3b2f8284'
  @download_path = File.join(Dir.tmpdir, 'dwca_hunter', 'worms', 'data.tar.gz')
  @fields = %w[dc:identifier dc:source
               dwc:Kingdom dwc:Phylum dwc:Class dwc:Order dwc:Family dwc:Genus
               dwc:ScientificName]
  # Depth in the classification path (1-based) => rank name.
  @rank = {}
  %w[kingdom phylum class order family genus species].each_with_index do |rank_name, i|
    @rank[i + 1] = rank_name
  end
  @known_paths = {}
  @data = []
  synonym_header = ['http://rs.tdwg.org/dwc/terms/taxonId',
                    'http://rs.tdwg.org/dwc/terms/scientificName']
  @extensions = [{ data: [synonym_header], file_name: 'synonyms.txt' }]
  @re = {
    cdata: %r#\<\!\[CDATA\[(.*)\]\]\>#
  }
  core_header = ['http://rs.tdwg.org/dwc/terms/taxonID',
                 'http://purl.org/dc/terms/parentNameUsageID',
                 'http://purl.org/dc/terms/source',
                 'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
                 'http://purl.org/dc/terms/scientificName',
                 'http://purl.org/dc/terms/taxonRank']
  @core = [core_header]
  super
end
48
+
49
# Unpacks the downloaded archive by delegating to unpack_tar, which is not
# defined in this class (presumably inherited from DwcaHunter::Resource).
def unpack
  unpack_tar
end
52
+
53
# Runs the full conversion: read taxa from the XML file, build the core
# table, then write the archive. Returns whatever generate_dwca returns.
def make_dwca
  %i[collect_data make_core_data].each { |step| send(step) }
  generate_dwca
end
58
+
59
+ private
60
+
61
# Streams 26.xml from @download_dir through Nokogiri's pull reader and
# appends one hash per <taxon> element to @data.
#
# Each hash has a symbol key for every entry of @fields (first value seen
# wins) plus :synonyms, an array of the CDATA contents of <synonym> nodes.
# Field text is taken from a CDATA section when present, otherwise it is
# XML-unescaped.
#
# The file is opened with the block form so the handle is always closed.
def collect_data
  DwcaHunter::logger_write(self.object_id, 'Traversing xml file...')
  xml_file = File.join(@download_dir, '26.xml')
  in_taxon = false
  taxon = nil
  count = 0
  File.open(xml_file, 'r:utf-8') do |f|
    Nokogiri::XML::Reader(f).each do |node|
      if !in_taxon && node.name == 'taxon'
        # Opening <taxon>: start a fresh record.
        in_taxon = true
        taxon = {}
        @fields.each { |field| taxon[field.to_sym] = nil }
        taxon[:synonyms] = []
      elsif in_taxon && node.name == 'taxon'
        # Second 'taxon' node while inside a record: finish it.
        in_taxon = false
        @data << taxon
        taxon = nil
        count += 1
        if count % BATCH_SIZE == 0
          DwcaHunter::logger_write(self.object_id,
                                   "Extracted %s taxons" % count)
        end
      elsif in_taxon
        item = node.name.to_sym
        if taxon.has_key?(item) && !taxon[item]
          text = node.inner_xml
          if cdata = text.match(@re[:cdata])
            text = cdata[1]
          else
            text = DwcaHunter::XML.unescape(text)
          end
          taxon[item] = text
        elsif node.name == 'synonym' &&
              (cdata = node.inner_xml.match(@re[:cdata]))
          taxon[:synonyms] << cdata[1]
        end
      end
    end
  end
end
100
+
101
# Derives an identifier for a classification path: a version-5 UUID of
# +path_string+ in GNA_NAMESPACE, URL-safe Base64 encoded with the last two
# characters removed, prefixed with "gn:".
def get_gn_id(path_string)
  raw = UUID.create_v5(path_string, GNA_NAMESPACE).raw_bytes
  encoded = Base64.urlsafe_encode64(raw)
  "gn:#{encoded[0..-3]}"
end
106
+
107
# Fills @core and the synonym extension from @data.
#
# For every taxon one 'species' row is added whose parent id is the gn id
# of its full higher-classification path, plus one extension row per
# synonym. Then, from the deepest level upwards, one row is added for each
# prefix of the path that has not been emitted before (tracked in
# @known_paths); its rank comes from @rank by depth and its parent is the
# gn id of the prefix one level up (nil at depth 1).
def make_core_data
  DwcaHunter::logger_write(self.object_id, 'Creating core data')
  @data.each_with_index do |taxa, i|
    if i % BATCH_SIZE == 0
      DwcaHunter::logger_write(self.object_id,
                               'Traversing %s species for core' % i)
    end
    path = get_path(taxa)
    species_id = taxa[:'dc:identifier']
    @core << [species_id,
              get_gn_id(path.join('|')),
              taxa[:'dc:source'],
              nil,
              taxa[:'dwc:ScientificName'],
              'species']

    taxa[:synonyms].each do |synonym|
      @extensions[0][:data] << [species_id, synonym]
    end

    path.size.downto(1) do |depth|
      lineage = path.first(depth)
      lineage_key = lineage.join("|")
      next if @known_paths[lineage_key]

      @known_paths[lineage_key] = 1
      parent_id = depth == 1 ? nil : get_gn_id(lineage[0..-2].join('|'))
      id = get_gn_id(lineage_key)
      @core << [id, parent_id, nil, nil, lineage[-1], @rank[depth]]
    end
  end
end
140
+
141
# Returns the taxon's values for @fields[2..-2] (every field except the
# first two and the last), in field order, as a new array.
def get_path(taxa)
  @fields[2..-2].map { |field| taxa[field.to_sym] }
end
148
+
149
# Prepares the dataset metadata (@eml) for WoRMS and then calls the parent
# implementation via super.
def generate_dwca
  DwcaHunter::logger_write(self.object_id,
                           'Creating DarwinCore Archive file')
  abstract = [
    'The aim of a World Register of Marine Species (WoRMS) ',
    'is to provide an authoritative and comprehensive list ',
    'of names of marine organisms, including information ',
    'on synonymy. While highest priority goes to valid ',
    'names, other names in use are included so that this ',
    'register can serve as a guide to interpret taxonomic ',
    'literature.'
  ].join
  authors = [{ email: 'info@marinespecies.org',
               url: 'http://www.marinespecies.org' }]
  providers = [{ first_name: 'Dmitry',
                 last_name: 'Mozzherin',
                 email: 'dmozzherin@gmail.com' }]
  @eml = {
    id: @uuid,
    title: @title,
    authors: authors,
    metadata_providers: providers,
    abstract: abstract
  }
  super
end
174
+ end
175
+ end
176
+
@@ -0,0 +1,33 @@
1
module DwcaHunter
  # Wraps a URL together with a Net::HTTP client for its host and the
  # response to a HEAD request issued at construction time.
  class Url

    attr_reader :net_http, :path, :header

    # url - String URL; surrounding whitespace is ignored when parsing.
    #
    # Issues a HEAD request immediately. When the host cannot be reached,
    # +header+ is nil instead of an exception being raised.
    def initialize(url)
      @url = url
      @parsed_url = URI.parse(url.strip)
      @path = @parsed_url.path == '' ? '/' : @parsed_url.path
      @net_http = Net::HTTP.new(@parsed_url.host, @parsed_url.port)
      # Without this an https URL would be probed in plain text on port 443.
      @net_http.use_ssl = true if @parsed_url.scheme == 'https'
      @header = get_header
    end

    # confirm that the passed in URL is valid and responses with a proper code
    def valid?
      @header && %w[200 301 302].include?(@header.code)
    end

    # Content length reported by the HEAD response, or nil when there is no
    # response.
    def content_length
      header ? header.content_length : nil
    end

    private

    # Returns the HEAD response for @path, or nil when the request fails at
    # the network level (name resolution, refused or unreachable
    # connection, timeout, TLS handshake).
    def get_header
      @net_http.head(@path)
    rescue SocketError, SystemCallError, Timeout::Error, OpenSSL::SSL::SSLError
      nil
    end
  end
end
@@ -0,0 +1,7 @@
1
module DwcaHunter
  # Version string of the gem.
  VERSION = "0.5.0"

  class << self
    # Returns the gem version string (VERSION).
    def version
      VERSION
    end
  end
end
@@ -0,0 +1,33 @@
1
module DwcaHunter
  # Escaping helpers for the five predefined XML entities.
  module XML
    # Character => entity.
    ESCAPES = {
      '&' => '&amp;',
      '<' => '&lt;',
      '>' => '&gt;',
      "'" => '&apos;',
      '"' => '&quot;'
    }.freeze

    # Entity => character.
    UNESCAPES = ESCAPES.invert.freeze

    # Returns a stripped copy of +input+ with & < > ' " replaced by their
    # XML entities. +input+ itself is not modified.
    def self.escape(input)
      input.strip.gsub(/[&<>'"]/, ESCAPES)
    end

    # Returns a stripped copy of +input+ with the five predefined entities
    # decoded. Any other named entity (for example &nbsp;) is left in place;
    # previously it was silently deleted because the replacement block
    # returned nil for it.
    def self.unescape(input)
      input.strip.gsub(/&[a-z]+;/) { |entity| UNESCAPES.fetch(entity, entity) }
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "logger"
4
+ require "fileutils"
5
+ require "uri"
6
+ require "tmpdir"
7
+ require "net/http"
8
+ require "json"
9
+ require "dwc_archive"
10
+ require "dwca_hunter/resource"
11
+ require "rest_client"
12
+ require "base64"
13
+
14
+ Dir[File.join(__dir__, "dwca_hunter", "*.rb")].
15
+ each { |f| require f }
16
+
17
+ Dir[File.join(__dir__, "dwca_hunter", "resources", "*.rb")].
18
+ each { |f| require f }
19
+
20
# DwcaHunter is the namespace module of the project. It also holds the
# shared logger, the progress batch size and the entry point that drives a
# resource through download, unpack and archive creation.
module DwcaHunter
  # Number of processed records between two progress log messages.
  BATCH_SIZE = 10_000

  # Reader for @resource; nothing in this module assigns it.
  def self.resource
    @resource
  end

  # The current logger; defaults to a Logger that discards its output.
  def self.logger
    @logger ||= Logger.new(nil)
  end

  # Replaces the current logger.
  def self.logger=(new_logger)
    @logger = new_logger
  end

  # Swaps the current logger for a fresh discarding Logger.
  def self.logger_reset
    self.logger = Logger.new(nil)
  end

  # Sends "|obj_id|message|" to the logger using the given severity method
  # (:info unless specified).
  def self.logger_write(obj_id, message, method = :info)
    line = "|#{obj_id}|#{message}|"
    logger.send(method, line)
  end

  # Downloads and unpacks the resource when it reports that it needs to,
  # then builds its archive.
  def self.process(resource)
    resource.download if resource.needs_download?
    resource.unpack if resource.needs_unpack?
    resource.make_dwca
  end

  # All currently loaded classes that inherit from Resource.
  def self.resources
    ObjectSpace.each_object(Class).select { |klass| klass < Resource }
  end
end