dwca_hunter 0.7.1 → 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -1
- data/.ruby-version +1 -1
- data/Gemfile.lock +56 -27
- data/dwca_hunter.gemspec +11 -9
- data/exe/dwcahunter +0 -2
- data/lib/dwca_hunter.rb +9 -7
- data/lib/dwca_hunter/resource.rb +8 -3
- data/lib/dwca_hunter/resources/arctos.rb +42 -45
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +105 -105
- data/lib/dwca_hunter/resources/mammal_divdb.rb +76 -45
- data/lib/dwca_hunter/resources/mcz.rb +1 -1
- data/lib/dwca_hunter/resources/wikispecies.rb +65 -98
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +48 -20
@@ -44,7 +44,7 @@ module DwcaHunter
|
|
44
44
|
|
45
45
|
def collect_names
|
46
46
|
@names_index = {}
|
47
|
-
file = CSV.open(File.join(@download_dir, "
|
47
|
+
file = CSV.open(File.join(@download_dir, "taxonomy_export_2021Feb2.csv"),
|
48
48
|
headers: true)
|
49
49
|
file.each_with_index do |row, i|
|
50
50
|
canonical = row["SCIENTIFIC_NAME"]
|
@@ -1,10 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module DwcaHunter
|
4
|
+
# Wikispecies source
|
4
5
|
class ResourceWikispecies < DwcaHunter::Resource
|
5
|
-
def initialize(opts = {})
|
6
|
+
def initialize(opts = { download: true, unpack: true })
|
6
7
|
@wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
|
7
|
-
@problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
|
8
|
+
@problems_file = File.open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
|
8
9
|
@command = "wikispecies"
|
9
10
|
@title = "Wikispecies"
|
10
11
|
@url = "http://dumps.wikimedia.org/specieswiki/latest/" \
|
@@ -18,12 +19,13 @@ module DwcaHunter
|
|
18
19
|
@tree = {}
|
19
20
|
@paths = {}
|
20
21
|
@extensions = []
|
22
|
+
@parser = Biodiversity::Parser
|
21
23
|
@re = {
|
22
|
-
page_start: /^\s
|
23
|
-
page_end: %r{^\s
|
24
|
+
page_start: /^\s*<page>\s*$/,
|
25
|
+
page_end: %r{^\s*</page>\s*$},
|
24
26
|
template: /Template:/i,
|
25
|
-
template_link: /\{\{([
|
26
|
-
vernacular_names: /\{\{\s*VN\s*\|([
|
27
|
+
template_link: /\{\{([^}]*)\}\}/,
|
28
|
+
vernacular_names: /\{\{\s*VN\s*\|([^}]+)\}\}/i
|
27
29
|
}
|
28
30
|
super(opts)
|
29
31
|
end
|
@@ -39,7 +41,6 @@ module DwcaHunter
|
|
39
41
|
|
40
42
|
def make_dwca
|
41
43
|
enrich_data
|
42
|
-
extend_classification
|
43
44
|
generate_dwca
|
44
45
|
end
|
45
46
|
|
@@ -62,13 +63,15 @@ module DwcaHunter
|
|
62
63
|
if l.match(@re[:page_end])
|
63
64
|
page_on = false
|
64
65
|
page_xml = Nokogiri::XML.parse(page)
|
65
|
-
template?(page_xml)
|
66
|
-
process_template(page_xml)
|
66
|
+
if template?(page_xml)
|
67
|
+
process_template(page_xml)
|
68
|
+
else
|
67
69
|
process_species(page_xml)
|
70
|
+
end
|
68
71
|
page_num += 1
|
69
|
-
if page_num % BATCH_SIZE
|
72
|
+
if (page_num % BATCH_SIZE).zero?
|
70
73
|
DwcaHunter.logger_write(object_id,
|
71
|
-
"Traversed
|
74
|
+
"Traversed #{page_num} pages")
|
72
75
|
end
|
73
76
|
page = ""
|
74
77
|
@page_title = nil
|
@@ -81,51 +84,6 @@ module DwcaHunter
|
|
81
84
|
f.close
|
82
85
|
end
|
83
86
|
|
84
|
-
def extend_classification
|
85
|
-
DwcaHunter.logger_write(object_id, "Extending classifications")
|
86
|
-
@data.each_with_index do |d, i|
|
87
|
-
unless d[:classificationPath].empty?
|
88
|
-
n = 50
|
89
|
-
while n > 0
|
90
|
-
n -= 1
|
91
|
-
if n == 0
|
92
|
-
d[:classificationPath] = []
|
93
|
-
break
|
94
|
-
end
|
95
|
-
parent = @templates[d[:classificationPath].first]
|
96
|
-
if parent
|
97
|
-
d[:classificationPath].unshift(parent[:parentName])
|
98
|
-
else
|
99
|
-
update_tree(d[:classificationPath])
|
100
|
-
break
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
# d[:classificationPath] = d[:classificationPath].join("|").
|
105
|
-
# gsub("Main Page", "Life")
|
106
|
-
if i % BATCH_SIZE == 0 && i > 0
|
107
|
-
DwcaHunter.logger_write(object_id,
|
108
|
-
"Extended %s classifications" % i)
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
|
-
def update_tree(path)
|
114
|
-
path = path.dup
|
115
|
-
return if @paths.key?(path.join("|"))
|
116
|
-
|
117
|
-
(0...path.size).each do |i|
|
118
|
-
subpath = path[0..i]
|
119
|
-
subpath_string = subpath.join("|")
|
120
|
-
next if @paths.key?(subpath_string)
|
121
|
-
|
122
|
-
name = subpath.pop
|
123
|
-
tree_element = subpath.inject(@tree) { |res, n| res[n] }
|
124
|
-
tree_element[name] = {}
|
125
|
-
@paths[subpath_string] = 1
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
87
|
def process_template(x)
|
130
88
|
name = page_title(x).gsub!(@re[:template], "").strip
|
131
89
|
text = x.xpath("//text").text.strip
|
@@ -161,23 +119,28 @@ module DwcaHunter
|
|
161
119
|
}
|
162
120
|
get_full_scientific_name(items)
|
163
121
|
get_vernacular_names(items)
|
164
|
-
init_classification_path(items)
|
165
122
|
end
|
166
123
|
end
|
167
124
|
|
168
125
|
def get_full_scientific_name(items)
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
126
|
+
name_ary = items["{{int:name}}"]
|
127
|
+
|
128
|
+
if name_ary.nil? || name_ary.empty?
|
129
|
+
@problems_file.write("%s\n" % @data[-1][:canonicalForm])
|
130
|
+
return
|
131
|
+
end
|
132
|
+
|
133
|
+
name = name_ary[0]
|
134
|
+
name = parse_name(name, @data[-1])
|
135
|
+
if name != ""
|
136
|
+
@data[-1][:scientificName] = name
|
175
137
|
end
|
176
138
|
end
|
177
139
|
|
178
140
|
def get_vernacular_names(items)
|
179
|
-
|
180
|
-
|
141
|
+
vern = items["{{int:vernacular names}}"]
|
142
|
+
if vern.is_a?(Array) && vern.size.positive?
|
143
|
+
vn_string = vern.join("")
|
181
144
|
vn = vn_string.match(@re[:vernacular_names])
|
182
145
|
if vn
|
183
146
|
vn_list = vn[1].strip.split("|")
|
@@ -214,8 +177,8 @@ module DwcaHunter
|
|
214
177
|
|
215
178
|
def find_species_components(x)
|
216
179
|
items = get_items(x.xpath("//text").text)
|
217
|
-
is_taxon_item = items.key?("name")
|
218
|
-
items.key?("taxonavigation")
|
180
|
+
is_taxon_item = items.key?("{{int:name}}") &&
|
181
|
+
items.key?("{{int:taxonavigation}}")
|
219
182
|
return nil unless is_taxon_item
|
220
183
|
|
221
184
|
items
|
@@ -226,7 +189,7 @@ module DwcaHunter
|
|
226
189
|
items = {}
|
227
190
|
current_item = nil
|
228
191
|
txt.split("\n").each do |l|
|
229
|
-
item = l.match(
|
192
|
+
item = l.match(/=+([^=]+)=+/)
|
230
193
|
if item
|
231
194
|
current_item = item[1].strip.downcase
|
232
195
|
items[current_item] = []
|
@@ -255,22 +218,25 @@ module DwcaHunter
|
|
255
218
|
old_l = name_string.dup
|
256
219
|
name_string.gsub!(/^\*\s*/, "")
|
257
220
|
name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
|
258
|
-
name_string.gsub!(/\{\{([
|
259
|
-
name_string.gsub!(/
|
260
|
-
name_string.gsub!(/
|
261
|
-
name_string.gsub!(
|
221
|
+
name_string.gsub!(/\{\{([^}]+\|)?([^}]*)\}\}/, '\2')
|
222
|
+
name_string.gsub!(/'{2,}/, " ")
|
223
|
+
name_string.gsub!(/"{2,}/, " ")
|
224
|
+
name_string.gsub!(/:\s*\d.*$/, "")
|
262
225
|
name_string.gsub!(/,\s*\[RSD\]/i, "")
|
263
226
|
name_string.gsub!(/^\s*†\s*/, "")
|
264
227
|
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
|
265
228
|
# name_string = DwcaHunter::XML.unescape(name_string)
|
266
|
-
name_string.gsub!(
|
267
|
-
name_string.gsub!(%r{
|
268
|
-
name_string.gsub!(/^\s
|
229
|
+
name_string.gsub!(/<nowiki>.*$/, "")
|
230
|
+
name_string.gsub!(%r{<br\s*/?\s*>}, "")
|
231
|
+
name_string.gsub!(/^\s*†\s*/, "")
|
269
232
|
name_string.gsub!(/ /, " ")
|
270
233
|
name_string.gsub!(/\s+/, " ")
|
271
|
-
|
272
|
-
|
273
|
-
|
234
|
+
res = name_string.strip
|
235
|
+
parsed = @parser.parse(res, simple: true)
|
236
|
+
if !["1","2"].include?(parsed[:quality])
|
237
|
+
return ""
|
238
|
+
end
|
239
|
+
res
|
274
240
|
end
|
275
241
|
|
276
242
|
def generate_dwca
|
@@ -286,34 +252,35 @@ module DwcaHunter
|
|
286
252
|
count = 0
|
287
253
|
@data.map do |d|
|
288
254
|
count += 1
|
289
|
-
if count % BATCH_SIZE
|
255
|
+
if (count % BATCH_SIZE).zero?
|
290
256
|
DwcaHunter.logger_write(object_id,
|
291
257
|
"Traversing %s core data record" % count)
|
292
258
|
end
|
293
259
|
taxon_id = begin
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
260
|
+
(if d[:classificationPath].empty?
|
261
|
+
d[:taxonId]
|
262
|
+
else
|
263
|
+
@templates[d[:classificationPath].
|
264
|
+
last][:id]
|
265
|
+
end)
|
266
|
+
rescue StandardError
|
267
|
+
d[:taxonId]
|
268
|
+
end
|
301
269
|
@taxon_ids[d[:taxonId]] = taxon_id
|
302
270
|
parentNameUsageId = begin
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
end
|
309
|
-
url = "http://species.wikimedia.org/wiki/" +
|
310
|
-
URI.encode(d[:canonicalForm].gsub(" ", "_"))
|
271
|
+
(@templates[d[:classificationPath][-2]][:id] if d[:classificationPath].size > 1)
|
272
|
+
rescue StandardError
|
273
|
+
nil
|
274
|
+
end
|
275
|
+
url = "http://species.wikimedia.org/wiki/#{CGI.escape(d[:canonicalForm].gsub(' ', '_'))}"
|
311
276
|
path = d[:classificationPath]
|
312
277
|
path.pop if path[-1] == d[:canonicalForm]
|
313
278
|
canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
|
314
|
-
scientific_name = d[:scientificName] == d[:canonicalForm]
|
315
|
-
|
316
|
-
|
279
|
+
scientific_name = if d[:scientificName] == d[:canonicalForm]
|
280
|
+
canonical_form
|
281
|
+
else
|
282
|
+
d[:scientificName]
|
283
|
+
end
|
317
284
|
@core << [taxon_id,
|
318
285
|
scientific_name,
|
319
286
|
canonical_form,
|
@@ -329,7 +296,7 @@ module DwcaHunter
|
|
329
296
|
count = 0
|
330
297
|
@data.each do |d|
|
331
298
|
count += 1
|
332
|
-
if count % BATCH_SIZE
|
299
|
+
if (count % BATCH_SIZE).zero?
|
333
300
|
DwcaHunter.logger_write(object_id,
|
334
301
|
"Traversing %s extension data record" % count)
|
335
302
|
end
|
data/lib/dwca_hunter/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwca_hunter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-03-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: biodiversity
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 5.1.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 5.1.2
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: dwc-archive
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.1.
|
33
|
+
version: 1.1.3
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.1.
|
40
|
+
version: 1.1.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: gn_uuid
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,14 +86,14 @@ dependencies:
|
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '2.
|
89
|
+
version: '2.1'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '2.
|
96
|
+
version: '2.1'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: ruby-xz
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,48 +108,62 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '1.0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: rubyzip
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.3'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '2.3'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: thor
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
114
128
|
requirements:
|
115
129
|
- - "~>"
|
116
130
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
131
|
+
version: '1.1'
|
118
132
|
type: :runtime
|
119
133
|
prerelease: false
|
120
134
|
version_requirements: !ruby/object:Gem::Requirement
|
121
135
|
requirements:
|
122
136
|
- - "~>"
|
123
137
|
- !ruby/object:Gem::Version
|
124
|
-
version: '
|
138
|
+
version: '1.1'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: bundler
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
128
142
|
requirements:
|
129
143
|
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
|
-
version: '2.
|
145
|
+
version: '2.2'
|
132
146
|
type: :development
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
|
-
version: '2.
|
152
|
+
version: '2.2'
|
139
153
|
- !ruby/object:Gem::Dependency
|
140
154
|
name: byebug
|
141
155
|
requirement: !ruby/object:Gem::Requirement
|
142
156
|
requirements:
|
143
157
|
- - "~>"
|
144
158
|
- !ruby/object:Gem::Version
|
145
|
-
version: '
|
159
|
+
version: '11.1'
|
146
160
|
type: :development
|
147
161
|
prerelease: false
|
148
162
|
version_requirements: !ruby/object:Gem::Requirement
|
149
163
|
requirements:
|
150
164
|
- - "~>"
|
151
165
|
- !ruby/object:Gem::Version
|
152
|
-
version: '
|
166
|
+
version: '11.1'
|
153
167
|
- !ruby/object:Gem::Dependency
|
154
168
|
name: coveralls
|
155
169
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,28 +198,42 @@ dependencies:
|
|
184
198
|
requirements:
|
185
199
|
- - "~>"
|
186
200
|
- !ruby/object:Gem::Version
|
187
|
-
version: '3.
|
201
|
+
version: '3.10'
|
188
202
|
type: :development
|
189
203
|
prerelease: false
|
190
204
|
version_requirements: !ruby/object:Gem::Requirement
|
191
205
|
requirements:
|
192
206
|
- - "~>"
|
193
207
|
- !ruby/object:Gem::Version
|
194
|
-
version: '3.
|
208
|
+
version: '3.10'
|
195
209
|
- !ruby/object:Gem::Dependency
|
196
210
|
name: rubocop
|
197
211
|
requirement: !ruby/object:Gem::Requirement
|
198
212
|
requirements:
|
199
213
|
- - "~>"
|
200
214
|
- !ruby/object:Gem::Version
|
201
|
-
version: '
|
215
|
+
version: '1.9'
|
216
|
+
type: :development
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - "~>"
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '1.9'
|
223
|
+
- !ruby/object:Gem::Dependency
|
224
|
+
name: solargraph
|
225
|
+
requirement: !ruby/object:Gem::Requirement
|
226
|
+
requirements:
|
227
|
+
- - "~>"
|
228
|
+
- !ruby/object:Gem::Version
|
229
|
+
version: '0.40'
|
202
230
|
type: :development
|
203
231
|
prerelease: false
|
204
232
|
version_requirements: !ruby/object:Gem::Requirement
|
205
233
|
requirements:
|
206
234
|
- - "~>"
|
207
235
|
- !ruby/object:Gem::Version
|
208
|
-
version: '0.
|
236
|
+
version: '0.40'
|
209
237
|
description: Gem harvests data from a variety of formats and converts incoming data
|
210
238
|
to DwCA format.
|
211
239
|
email:
|
@@ -272,14 +300,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
272
300
|
requirements:
|
273
301
|
- - ">="
|
274
302
|
- !ruby/object:Gem::Version
|
275
|
-
version:
|
303
|
+
version: 3.0.0
|
276
304
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
277
305
|
requirements:
|
278
306
|
- - ">="
|
279
307
|
- !ruby/object:Gem::Version
|
280
308
|
version: '0'
|
281
309
|
requirements: []
|
282
|
-
rubygems_version: 3.
|
310
|
+
rubygems_version: 3.2.6
|
283
311
|
signing_key:
|
284
312
|
specification_version: 4
|
285
313
|
summary: Converts a variety of available online resources to DarwinCore Archive files.
|