dwca_hunter 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.byebug_history +31 -0
- data/.document +5 -0
- data/.gitignore +58 -0
- data/.rspec +3 -0
- data/.rubocop.yml +33 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +133 -0
- data/LICENSE.txt +20 -0
- data/README.md +39 -0
- data/Rakefile +11 -0
- data/dwca_hunter.gemspec +42 -0
- data/exe/dwcahunter +77 -0
- data/files/birdlife_7.csv +11862 -0
- data/files/fishbase_taxon_cache.tsv +81000 -0
- data/files/reptile_checklist_2014_12.csv +15158 -0
- data/lib/dwca_hunter/downloader.rb +60 -0
- data/lib/dwca_hunter/encoding.rb +17 -0
- data/lib/dwca_hunter/resource.rb +101 -0
- data/lib/dwca_hunter/resources/arctos.rb +222 -0
- data/lib/dwca_hunter/resources/birdlife.rb +160 -0
- data/lib/dwca_hunter/resources/fishbase.rb +99 -0
- data/lib/dwca_hunter/resources/freebase.rb +152 -0
- data/lib/dwca_hunter/resources/gnub.rb +101 -0
- data/lib/dwca_hunter/resources/itis.rb +271 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
- data/lib/dwca_hunter/resources/ncbi.rb +174 -0
- data/lib/dwca_hunter/resources/opentree.rb +121 -0
- data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
- data/lib/dwca_hunter/resources/worms.rb +176 -0
- data/lib/dwca_hunter/url.rb +33 -0
- data/lib/dwca_hunter/version.rb +7 -0
- data/lib/dwca_hunter/xml.rb +33 -0
- data/lib/dwca_hunter.rb +53 -0
- metadata +250 -0
@@ -0,0 +1,350 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
|
3
|
+
class ResourceWikispecies < DwcaHunter::Resource
|
4
|
+
# Sets up the Wikispecies resource: source URL, download location, the
# in-memory accumulators filled while the dump is parsed, and the regular
# expressions used to recognise pages, templates and vernacular-name blocks.
#
# @param opts [Hash] options; +:url+ overrides the default dump URL. The
#   hash is passed on unchanged to the superclass initializer.
def initialize(opts = {})
  # NOTE(review): this handle is opened in the current working directory
  # and is never closed inside the code visible here.
  @problems_file = File.open('problems.txt', 'w:utf-8')
  @command = "wikispecies"
  @title = 'Wikispecies'
  @url = opts[:url] ||
         'http://dumps.wikimedia.org/specieswiki/latest/' \
         'specieswiki-latest-pages-articles.xml.bz2'
  @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
  @download_path = File.join(Dir.tmpdir,
                             'dwca_hunter',
                             'wikispecies',
                             'data.xml.bz2')
  @data = []        # one hash per taxon page
  @templates = {}   # template name => { parentName:, id: }
  @taxon_ids = {}   # page id => id written to the core file
  @tree = {}        # nested hash of resolved classification paths
  @paths = {}       # "A|B|C" => 1 for every path already in @tree
  @extensions = []
  @re = {
    page_start: /^\s*\<page\>\s*$/,
    page_end: /^\s*\<\/page\>\s*$/,
    template: /Template:/i,
    template_link: /\{\{([^\}]*)\}\}/,
    vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
  }
  super(opts)
end
|
31
|
+
|
32
|
+
# Unpacks the downloaded archive by delegating to +unpack_bz2+.
def unpack
  unpack_bz2
end
|
35
|
+
|
36
|
+
# Runs the three build stages in order: extract page data, extend the
# classification paths, then generate the archive.
def make_dwca
  enrich_data
  extend_classification
  generate_dwca
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
# Streams data.xml line by line, collects the text between a <page> line
# and the matching </page> line, parses each collected page with Nokogiri
# and dispatches it to process_template or process_species.
#
# Changes the process working directory to @download_dir (as before).
# The file is opened in block form so the handle is closed even when
# parsing or processing raises.
def enrich_data
  DwcaHunter.logger_write(object_id, 'Extracting data from xml file...')
  Dir.chdir(@download_dir)
  page_num = 0
  File.open('data.xml', 'r:utf-8') do |f|
    page_on = false
    page = ''.dup
    f.each do |l|
      if l.match(@re[:page_start])
        page << l
        page_on = true
      elsif page_on
        page << l
        next unless l.match(@re[:page_end])

        page_on = false
        page_xml = Nokogiri::XML.parse(page)
        if template?(page_xml)
          process_template(page_xml)
        else
          process_species(page_xml)
        end
        page_num += 1
        if page_num % BATCH_SIZE == 0
          DwcaHunter.logger_write(object_id,
                                  "Traversed %s pages" % page_num)
        end
        # Reset the per-page buffer and the memoized title/id.
        page = ''.dup
        @page_title = nil
        @page_id = nil
      end
    end
  end
  DwcaHunter.logger_write(object_id,
                          'Extracted total %s pages' % page_num)
end
|
79
|
+
|
80
|
+
# Extends every non-empty classification path upwards by repeatedly
# looking up the first element in @templates and prepending its parent.
#
# A path is finished (and registered with update_tree) as soon as its
# first element has no template or the template has no parent name; the
# latter case previously pushed nil onto the path. After 49 lookups
# without finishing, the path is reset to [] (guards against cycles).
def extend_classification
  DwcaHunter.logger_write(object_id, 'Extending classifications')
  max_lookups = 49
  @data.each_with_index do |d, i|
    unless d[:classificationPath].empty?
      resolved = false
      max_lookups.times do
        parent = @templates[d[:classificationPath].first]
        parent_name = parent && parent[:parentName]
        if parent_name
          d[:classificationPath].unshift(parent_name)
        else
          update_tree(d[:classificationPath])
          resolved = true
          break
        end
      end
      d[:classificationPath] = [] unless resolved
    end
    if i % BATCH_SIZE == 0 && i > 0
      DwcaHunter.logger_write(object_id,
                              "Extended %s classifications" % i)
    end
  end
end
|
108
|
+
|
109
|
+
# Registers a classification path in @tree (nested hashes keyed by name)
# and records every prefix of it in @paths under its "|"-joined string.
# Prefixes that are already recorded are skipped; the argument is not
# modified.
#
# @param path [Array<String>] names ordered from root to leaf
def update_tree(path)
  return if @paths.key?(path.join('|'))

  path.each_index do |i|
    prefix = path[0..i]
    prefix_key = prefix.join('|')
    next if @paths.key?(prefix_key)

    *ancestors, name = prefix
    # Walk down from the root to the hash that should hold +name+.
    parent = ancestors.inject(@tree) { |node, ancestor| node[ancestor] }
    parent[name] = {}
    @paths[prefix_key] = 1
  end
end
|
122
|
+
|
123
|
+
# Records a template page in @templates as
#   name => { parentName: <first template link in the text>, id: <page id> }
#
# The name is the page title with "Template:" removed and underscores
# turned into spaces. The parent is taken from the first {{...}} in the
# page text: for a link whose first "|"-separated part matches "Taxonav"
# (and has more parts) the second part is used, otherwise the first part.
# Pages whose first link contains "#if" are skipped entirely. An empty
# link such as "{{}}" yields a nil parent instead of raising.
#
# @param x [Nokogiri::XML::Document] parsed <page> element
def process_template(x)
  # Non-destructive gsub: the title string is memoized by page_title.
  name = page_title(x).gsub(@re[:template], '').strip.tr('_', ' ')
  text = x.xpath('//text').text.strip
  link = text.match(@re[:template_link])
  parent_name = nil
  if link
    return if link[1].match(/\#if/)

    parts = link[1].split('|')
    parent_name = if parts.size > 1 && parts[0].match(/Taxonav/i)
                    parts[1]
                  else
                    parts[0]
                  end
    parent_name = parent_name.tr('_', ' ') if parent_name
  end
  @templates[name] = { parentName: parent_name, id: page_id(x) }
end
|
142
|
+
|
143
|
+
# Appends a taxon record to @data for a non-template page and fills in its
# scientific name, vernacular names and initial classification path.
# Pages whose title matches "Wikispecies" and pages for which
# find_species_components returns nil are ignored.
#
# @param x [Nokogiri::XML::Document] parsed <page> element
def process_species(x)
  return if page_title(x).match(/Wikispecies/i)

  items = find_species_components(x)
  return unless items

  @data << {
    taxonId: page_id(x),
    canonicalForm: page_title(x),
    scientificName: page_title(x),
    classificationPath: [],
    vernacularNames: []
  }
  # The helpers below all operate on the record just appended (@data[-1]).
  get_full_scientific_name(items)
  get_vernacular_names(items)
  init_classification_path(items)
end
|
158
|
+
|
159
|
+
# Sets :scientificName of the latest record in @data from the first line
# of the page's "name" section, cleaned up by parse_name. When the section
# exists but has no lines, the record's canonical form is written to
# @problems_file instead. Does nothing when there is no "name" section.
#
# @param items [Hash{String=>Array<String>}] page sections from get_items
def get_full_scientific_name(items)
  name_lines = items['name']
  return unless name_lines

  name = name_lines[0]
  if name
    @data[-1][:scientificName] = parse_name(name, @data[-1])
  else
    @problems_file.write("%s\n" % @data[-1][:canonicalForm])
  end
end
|
168
|
+
|
169
|
+
# Extracts vernacular names from the page's "vernacular names" section and
# stores them on the latest record in @data as
#   [{ name: "...", language: "..." }, ...]
#
# The section lines are joined and matched against the {{VN|...}} pattern;
# each "|"-separated entry is split into language and name at the FIRST
# "=" (so a name that itself contains "=" is kept whole). Entries are kept
# only when both parts exist, the language is shorter than 4 characters
# and the name has a valid encoding. Nothing is assigned when the section
# is missing, empty, or contains no VN template.
#
# @param items [Hash{String=>Array<String>}] page sections from get_items
def get_vernacular_names(items)
  lines = items['vernacular names']
  return if lines.nil? || lines.empty?

  vn = lines.join.match(@re[:vernacular_names])
  return unless vn

  vnames = []
  vn[1].strip.split('|').each do |entry|
    language, name = entry.split('=', 2).map(&:strip)
    next unless language && name && language.size < 4 && name.valid_encoding?

    vnames << { name: name, language: language }
  end
  @data[-1][:vernacularNames] = vnames
end
|
189
|
+
|
190
|
+
# Seeds :classificationPath of the latest record in @data with the first
# usable template link found in the page's "taxonavigation" section.
#
# For each line, [[...]] wiki links are removed first; the first {{...}}
# is then taken, stripped of "Template:" and with underscores replaced by
# spaces. Links containing "|" are skipped. Only the first accepted link
# is stored. The section lines themselves are not modified.
#
# @param items [Hash{String=>Array<String>}] page sections from get_items
def init_classification_path(items)
  lines = items['taxonavigation']
  return unless lines

  lines.each do |line|
    without_wiki_links = line.gsub(/\[\[.*\]\]/, '') # ignore non-template links
    link = without_wiki_links.match(@re[:template_link])
    next unless link

    template_name = link[1].strip.gsub(/Template:/, '').gsub(/_/, ' ')
    next if template_name.match(/\|/)

    @data[-1][:classificationPath] << template_name
    break
  end
end
|
205
|
+
|
206
|
+
# Splits the page text into sections (via get_items) and returns them only
# when the page looks like a taxon page, i.e. it has a "name" or a
# "taxonavigation" section.
#
# @param x [Nokogiri::XML::Document] parsed <page> element
# @return [Hash{String=>Array<String>}, nil] sections, or nil for
#   non-taxon pages
def find_species_components(x)
  items = get_items(x.xpath('//text').text)
  items if items.key?('name') || items.key?('taxonavigation')
end
|
213
|
+
|
214
|
+
# Splits wiki text into sections keyed by their heading.
#
# A line containing "=...=" starts a new section; its key is the text
# between the equals signs, stripped and downcased. Every following
# non-empty line is appended to that section. Lines before the first
# heading are dropped, and a repeated heading restarts its section.
#
# NOTE(review): the heading pattern is unanchored, so any line with text
# between two "=" characters is treated as a heading — confirm intended.
#
# @param txt [String] raw page text
# @return [Hash{String=>Array<String>}]
def get_items(txt)
  items = {}
  current_item = nil
  txt.split("\n").each do |l|
    heading = l.match(/[\=]+([^\=]+)[\=]+/)
    if heading
      current_item = heading[1].strip.downcase
      items[current_item] = []
    elsif current_item && !l.empty?
      items[current_item] << l
    end
  end
  items
end
|
229
|
+
|
230
|
+
# Returns the text of the first <title> node of the page, memoized in
# @page_title until that variable is reset to nil.
#
# @param x [Nokogiri::XML::Document] parsed <page> element
# @return [String]
def page_title(x)
  @page_title ||= x.xpath('//title').first.text
end
|
233
|
+
|
234
|
+
# Returns the text of the first <id> node of the page, memoized in
# @page_id until that variable is reset to nil.
#
# @param x [Nokogiri::XML::Document] parsed <page> element
# @return [String]
def page_id(x)
  @page_id ||= x.xpath('//id').first.text
end
|
237
|
+
|
238
|
+
# Tells whether the page title matches the template pattern
# (@re[:template]).
#
# @param page_xml [Nokogiri::XML::Document] parsed <page> element
# @return [Boolean]
def template?(page_xml)
  !page_title(page_xml).match(@re[:template]).nil?
end
|
241
|
+
|
242
|
+
# Turns a raw wiki "name" line into a plain scientific-name string.
#
# Works on a copy, so the caller's string is left untouched (and may be
# frozen). Steps, in order: substitute BASEPAGENAME with the taxon's
# canonical form, drop a leading "*", unwrap [[link|label]] and
# {{tpl|label}} to their label, replace runs of quotes with a space, cut
# everything from ": <digit>" onwards, remove ", [RSD]", a leading dagger,
# bracketed http links, anything from <nowiki> onwards and <br> tags, then
# collapse whitespace.
#
# @param name_string [String] first line of the page's "name" section
# @param taxa [Hash] record providing :canonicalForm
# @return [String] cleaned name
def parse_name(name_string, taxa)
  # Block form keeps backslashes in the page title literal.
  name = name_string.gsub('BASEPAGENAME') { taxa[:canonicalForm] }.strip
  name.gsub!(/^\*\s*/, '')
  name.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
  name.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
  name.gsub!(/[']{2,}/, ' ')
  name.gsub!(/["]{2,}/, ' ')
  name.gsub!(/\:\s*\d.*$/, '')
  name.gsub!(/,\s*\[RSD\]/i, '')
  name.gsub!(/^\s*†\s*/, '')
  name.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')
  name.gsub!(/\<nowiki\>.*$/, '')
  name.gsub!(/\<br\s*[\/]?\s*\>/, '')
  # Second pass: a dagger may only become leading after the removals above.
  name.gsub!(/^\s*\†\s*/, '')
  # NOTE(review): the original pattern here reads as a plain space, which
  # looks like a decoded non-breaking-space entity; both the entity and
  # the U+00A0 character are normalised — confirm against upstream.
  name.gsub!(/&nbsp;|\u00A0/, ' ')
  name.gsub!(/\s+/, ' ')
  name.strip
end
|
265
|
+
|
266
|
+
# Builds the core rows (@core), the vernacular-name extension (appended to
# @extensions) and the EML metadata (@eml), then hands over to the
# superclass implementation.
#
# Core row layout: taxon id, scientific name, parent id, canonical form,
# "|"-joined classification path, source URL. The taxon id is the id of
# the template named by the last path element when such a template
# exists, otherwise the page id; the parent id comes from the template
# named by the second-to-last element, or nil.
#
# Side effect kept from the original: when the last path element equals
# the record's canonical form it is popped from the record's own
# :classificationPath array.
def generate_dwca
  DwcaHunter.logger_write(object_id,
                          'Creating DarwinCore Archive file')
  @core = [
    ['http://rs.tdwg.org/dwc/terms/taxonID',
     'http://rs.tdwg.org/dwc/terms/scientificName',
     'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
     'http://globalnames.org/terms/canonicalForm',
     'http://rs.tdwg.org/dwc/terms/higherClassification',
     'http://purl.org/dc/terms/source']
  ]
  DwcaHunter.logger_write(object_id, 'Assembling Core Data')
  # URI.encode was removed in Ruby 3.0; this parser's #escape performs
  # the same percent-encoding.
  uri_escaper = URI::RFC2396_Parser.new
  @data.each_with_index do |d, index|
    count = index + 1
    if count % BATCH_SIZE == 0
      DwcaHunter.logger_write(object_id,
                              "Traversing %s core data record" % count)
    end
    path = d[:classificationPath]
    template = path.empty? ? nil : @templates[path.last]
    taxon_id = template ? template[:id] : d[:taxonId]
    @taxon_ids[d[:taxonId]] = taxon_id
    parent_template = path.size > 1 ? @templates[path[-2]] : nil
    parent_name_usage_id = parent_template ? parent_template[:id] : nil
    url = 'http://species.wikimedia.org/wiki/' +
          uri_escaper.escape(d[:canonicalForm].gsub(' ', '_'))
    path.pop if path[-1] == d[:canonicalForm]
    # Drop a trailing parenthesised qualifier, e.g. "Aus (genus)" -> "Aus".
    canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
    scientific_name = if d[:scientificName] == d[:canonicalForm]
                        canonical_form
                      else
                        d[:scientificName]
                      end
    @core << [taxon_id,
              scientific_name,
              parent_name_usage_id,
              canonical_form,
              path.join('|'),
              url]
  end
  @extensions << { data: [[
    'http://rs.tdwg.org/dwc/terms/TaxonID',
    'http://rs.tdwg.org/dwc/terms/vernacularName',
    'http://purl.org/dc/terms/language'
  ]], file_name: 'vernacular_names.txt' }
  DwcaHunter.logger_write(object_id,
                          'Creating vernacular name extension for DarwinCore Archive file')
  vernacular_rows = @extensions[-1][:data]
  @data.each_with_index do |d, index|
    count = index + 1
    if count % BATCH_SIZE == 0
      DwcaHunter.logger_write(object_id,
                              "Traversing %s extension data record" % count)
    end
    taxon_id = @taxon_ids[d[:taxonId]]
    next unless taxon_id

    d[:vernacularNames].each do |vn|
      vernacular_rows << [taxon_id, vn[:name], vn[:language]]
    end
  end
  @eml = {
    id: @uuid,
    title: @title,
    license: 'http://creativecommons.org/licenses/by-sa/3.0/',
    authors: [
      { first_name: 'Stephen',
        last_name: 'Thorpe',
        email: 'stephen_thorpe@yahoo.co.nz',
        url: 'http://species.wikimedia.org/wiki/Main_Page' }
    ],
    abstract: 'The free species directory that anyone can edit.',
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@mbl.edu' }
    ],
    url: 'http://species.wikimedia.org/wiki/Main_Page'
  }
  super
end
|
347
|
+
|
348
|
+
end
|
349
|
+
end
|
350
|
+
|
@@ -0,0 +1,176 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
|
3
|
+
class ResourceWoRMS < DwcaHunter::Resource
|
4
|
+
# Sets up the WoRMS resource: source URL, download location, the XML
# fields to collect for each taxon, rank names by path depth, and the
# header rows of the core file and the synonyms extension.
#
# parentNameUsageID, scientificName and taxonRank are Darwin Core terms,
# so their headers use the rs.tdwg.org/dwc/terms namespace; only "source"
# is a Dublin Core term.
#
# @param opts [Hash] options, forwarded unchanged to the superclass
def initialize(opts = {})
  @command = 'worms'
  @title = 'WoRMS'
  @url = 'http://content60.eol.org/resources/26.tar.gz'
  @uuid = '9d27a7ad-2e6a-4597-a79b-23fb3b2f8284'
  @download_path = File.join(Dir.tmpdir,
                             'dwca_hunter',
                             'worms',
                             'data.tar.gz')
  # Order matters: get_path slices the rank fields out by position.
  @fields = ['dc:identifier',
             'dc:source',
             'dwc:Kingdom',
             'dwc:Phylum',
             'dwc:Class',
             'dwc:Order',
             'dwc:Family',
             'dwc:Genus',
             'dwc:ScientificName']
  @rank = { 1 => 'kingdom',
            2 => 'phylum',
            3 => 'class',
            4 => 'order',
            5 => 'family',
            6 => 'genus',
            7 => 'species' }
  @known_paths = {}
  @data = []
  @extensions = []
  @extensions << { data: [[
    'http://rs.tdwg.org/dwc/terms/taxonId',
    'http://rs.tdwg.org/dwc/terms/scientificName'
  ]], file_name: 'synonyms.txt' }
  @re = {
    cdata: %r#\<\!\[CDATA\[(.*)\]\]\>#
  }
  @core = [[
    'http://rs.tdwg.org/dwc/terms/taxonID',
    'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
    'http://purl.org/dc/terms/source',
    'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
    'http://rs.tdwg.org/dwc/terms/scientificName',
    'http://rs.tdwg.org/dwc/terms/taxonRank'
  ]]
  super
end
|
48
|
+
|
49
|
+
# Unpacks the downloaded archive by delegating to +unpack_tar+.
def unpack
  unpack_tar
end
|
52
|
+
|
53
|
+
# Runs the three build stages in order: collect taxa from the XML file,
# build the core rows, then generate the archive.
def make_dwca
  collect_data
  make_core_data
  generate_dwca
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
# Streams 26.xml from @download_dir with Nokogiri's pull reader and
# appends one hash per <taxon> element to @data.
#
# A node named "taxon" toggles the in-taxon state: the first occurrence
# starts a fresh hash (every @fields entry set to nil, plus an empty
# :synonyms list), the next one stores it. Inside a taxon, the first
# value seen for each known field is kept — the CDATA payload when
# present, otherwise the unescaped inner XML — and CDATA payloads of
# "synonym" nodes are collected. The file is opened in block form so the
# handle is always closed.
def collect_data
  DwcaHunter.logger_write(object_id, 'Traversing xml file...')
  xml_file = File.join(@download_dir, '26.xml')
  in_taxon = false
  taxon = nil
  count = 0
  File.open(xml_file, 'r:utf-8') do |f|
    Nokogiri::XML::Reader(f).each do |node|
      if node.name == 'taxon'
        if in_taxon
          in_taxon = false
          @data << taxon
          taxon = nil
          count += 1
          if count % BATCH_SIZE == 0
            DwcaHunter.logger_write(object_id,
                                    "Extracted %s taxons" % count)
          end
        else
          in_taxon = true
          taxon = {}
          @fields.each { |field| taxon[field.to_sym] = nil }
          taxon[:synonyms] = []
        end
      elsif in_taxon
        item = node.name.to_sym
        if taxon.key?(item) && !taxon[item]
          text = node.inner_xml
          cdata = text.match(@re[:cdata])
          taxon[item] = cdata ? cdata[1] : DwcaHunter::XML.unescape(text)
        elsif node.name == 'synonym' &&
              (cdata = node.inner_xml.match(@re[:cdata]))
          taxon[:synonyms] << cdata[1]
        end
      end
    end
  end
end
|
100
|
+
|
101
|
+
# Derives an identifier for a classification path: a version-5 UUID of
# the path string in GNA_NAMESPACE, URL-safe Base64 encoded, with the
# last two characters of the encoding cut off and "gn:" prefixed.
#
# @param path_string [String] "|"-joined classification path
# @return [String] identifier starting with "gn:"
def get_gn_id(path_string)
  gn_uuid = UUID.create_v5(path_string, GNA_NAMESPACE)
  encoded = Base64.urlsafe_encode64(gn_uuid.raw_bytes)
  # presumably the two dropped characters are the "==" padding of a
  # 16-byte UUID — verify against the UUID library in use
  "gn:#{encoded[0..-3]}"
end
|
106
|
+
|
107
|
+
# Fills @core and the synonyms extension from the collected taxa.
#
# For each taxon: one "species" row whose parent id is derived from its
# full rank path, one extension row per synonym, and then one row for
# every not-yet-seen prefix of the rank path (longest first), ranked by
# the prefix length via @rank. Seen prefixes are tracked in @known_paths.
#
# NOTE(review): get_path can contain nil for ranks missing in the source;
# such entries still take part in the joined path strings and produce
# rows with a nil name — confirm whether that is intended.
def make_core_data
  DwcaHunter.logger_write(object_id, 'Creating core data')
  @data.each_with_index do |taxa, i|
    if i % BATCH_SIZE == 0
      DwcaHunter.logger_write(object_id,
                              'Traversing %s species for core' % i)
    end
    path = get_path(taxa)
    @core << [taxa[:'dc:identifier'],
              get_gn_id(path.join('|')),
              taxa[:'dc:source'],
              nil,
              taxa[:'dwc:ScientificName'],
              'species']

    taxa[:synonyms].each do |synonym|
      @extensions[0][:data] << [taxa[:'dc:identifier'], synonym]
    end

    # Walk from the full path up to the kingdom, emitting unseen nodes.
    until path.empty?
      path_string = path.join('|')
      unless @known_paths[path_string]
        @known_paths[path_string] = 1
        parent_id = path.size == 1 ? nil : get_gn_id(path[0..-2].join('|'))
        @core << [get_gn_id(path_string), parent_id, nil, nil,
                  path[-1], @rank[path.size]]
      end
      path.pop
    end
  end
end
|
140
|
+
|
141
|
+
# Returns the taxon's values for the fields between the first two and the
# last entry of @fields, in @fields order. Missing values stay nil.
#
# @param taxa [Hash{Symbol=>String,nil}] taxon hash keyed by field symbol
# @return [Array<String, nil>]
def get_path(taxa)
  @fields[2..-2].map { |field| taxa[field.to_sym] }
end
|
148
|
+
|
149
|
+
def generate_dwca
|
150
|
+
DwcaHunter::logger_write(self.object_id,
|
151
|
+
'Creating DarwinCore Archive file')
|
152
|
+
@eml = {
|
153
|
+
id: @uuid,
|
154
|
+
title: @title,
|
155
|
+
authors: [
|
156
|
+
{ email: 'info@marinespecies.org',
|
157
|
+
url: 'http://www.marinespecies.org' }
|
158
|
+
],
|
159
|
+
metadata_providers: [
|
160
|
+
{ first_name: 'Dmitry',
|
161
|
+
last_name: 'Mozzherin',
|
162
|
+
email: 'dmozzherin@gmail.com' }
|
163
|
+
],
|
164
|
+
abstract: 'The aim of a World Register of Marine Species (WoRMS) ' +
|
165
|
+
'is to provide an authoritative and comprehensive list ' +
|
166
|
+
'of names of marine organisms, including information ' +
|
167
|
+
'on synonymy. While highest priority goes to valid ' +
|
168
|
+
'names, other names in use are included so that this ' +
|
169
|
+
'register can serve as a guide to interpret taxonomic ' +
|
170
|
+
'literature.',
|
171
|
+
}
|
172
|
+
super
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module DwcaHunter
  # Wraps a URL and the response to an HTTP HEAD request against it, which
  # is issued once when the object is created.
  class Url
    attr_reader :net_http, :path, :header

    # @param url [String] URL to probe; surrounding whitespace is ignored
    def initialize(url)
      @url = url
      @parsed_url = URI.parse(url.strip)
      @path = @parsed_url.path == '' ? '/' : @parsed_url.path
      @net_http = Net::HTTP.new(@parsed_url.host, @parsed_url.port)
      # Without this an https URL would be queried in plain HTTP.
      @net_http.use_ssl = @parsed_url.scheme == 'https'
      @header = get_header
    end

    # Confirms that the HEAD request got a response and that its status
    # code is 200, 301 or 302.
    #
    # @return [Boolean]
    def valid?
      !@header.nil? && ['200', '301', '302'].include?(@header.code)
    end

    # @return [Integer, nil] Content-Length of the response, or nil when
    #   the header is absent or no response was received
    def content_length
      header ? header.content_length : nil
    end

    private

    # Issues the HEAD request. Returns nil instead of raising when the
    # host cannot be resolved, the connection fails at the socket level,
    # or the request times out.
    def get_header
      @net_http.head(@path)
    rescue SocketError, SystemCallError, Timeout::Error
      nil
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module DwcaHunter
  # Escaping helpers for the five predefined XML entities.
  module XML
    # Character => entity.
    ESCAPES = {
      '&' => '&amp;',
      '<' => '&lt;',
      '>' => '&gt;',
      "'" => '&apos;',
      '"' => '&quot;'
    }.freeze

    # Entity => character.
    UNESCAPES = ESCAPES.invert.freeze

    # Returns a stripped copy of +input+ with & < > ' " replaced by their
    # XML entities. The argument itself is not modified.
    #
    # @param input [String]
    # @return [String]
    def self.escape(input)
      input.strip.gsub(/[&<>'"]/) { |match| ESCAPES[match] }
    end

    # Returns a stripped copy of +input+ with the five predefined XML
    # entities replaced by their characters. Any other "&name;" sequence
    # is left as it is (previously it was silently deleted).
    #
    # @param input [String]
    # @return [String]
    def self.unescape(input)
      input.strip.gsub(/&[a-z]+;/) { |match| UNESCAPES.fetch(match, match) }
    end
  end
end
|
data/lib/dwca_hunter.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "logger"
|
4
|
+
require "fileutils"
|
5
|
+
require "uri"
|
6
|
+
require "tmpdir"
|
7
|
+
require "net/http"
|
8
|
+
require "json"
|
9
|
+
require "dwc_archive"
|
10
|
+
require "dwca_hunter/resource"
|
11
|
+
require "rest_client"
|
12
|
+
require "base64"
|
13
|
+
|
14
|
+
Dir[File.join(__dir__, "dwca_hunter", "*.rb")].
|
15
|
+
each { |f| require f }
|
16
|
+
|
17
|
+
Dir[File.join(__dir__, "dwca_hunter", "resources", "*.rb")].
|
18
|
+
each { |f| require f }
|
19
|
+
|
20
|
+
# DwcaHunter is the namespace module for the project. It also carries the
# shared logger and the entry point that drives a resource through
# download, unpack and archive generation.
module DwcaHunter
  # Number of records between progress log messages.
  BATCH_SIZE = 10_000

  class << self
    attr_reader :resource
    attr_writer :logger

    # @return [Logger] the configured logger; defaults to one that
    #   discards all output
    def logger
      @logger ||= Logger.new(nil)
    end

    # Replaces the logger with one that discards all output.
    def logger_reset
      self.logger = Logger.new(nil)
    end

    # Logs "|obj_id|message|" at the given level.
    #
    # @param obj_id [Object] identifier of the reporting object
    # @param message [String]
    # @param method [Symbol] logger method to call, e.g. :info or :warn
    def logger_write(obj_id, message, method = :info)
      logger.public_send(method, "|#{obj_id}|#{message}|")
    end

    # Downloads and unpacks the resource when it reports that it needs
    # to, then builds its archive.
    #
    # @param resource [#needs_download?, #download, #needs_unpack?,
    #   #unpack, #make_dwca]
    def process(resource)
      resource.download if resource.needs_download?
      resource.unpack if resource.needs_unpack?
      resource.make_dwca
    end

    # @return [Array<Class>] every loaded class that inherits from
    #   Resource
    def resources
      ObjectSpace.each_object(Class).select do |c|
        c < Resource
      end
    end
  end
end
|