dwca_hunter 0.7.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -1
- data/.ruby-version +1 -1
- data/Gemfile.lock +56 -27
- data/dwca_hunter.gemspec +11 -9
- data/exe/dwcahunter +0 -2
- data/lib/dwca_hunter.rb +9 -7
- data/lib/dwca_hunter/resource.rb +8 -3
- data/lib/dwca_hunter/resources/arctos.rb +42 -45
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +105 -105
- data/lib/dwca_hunter/resources/mammal_divdb.rb +76 -45
- data/lib/dwca_hunter/resources/mcz.rb +1 -1
- data/lib/dwca_hunter/resources/wikispecies.rb +65 -98
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +48 -20
@@ -44,7 +44,7 @@ module DwcaHunter
|
|
44
44
|
|
45
45
|
def collect_names
|
46
46
|
@names_index = {}
|
47
|
-
file = CSV.open(File.join(@download_dir, "
|
47
|
+
file = CSV.open(File.join(@download_dir, "taxonomy_export_2021Feb2.csv"),
|
48
48
|
headers: true)
|
49
49
|
file.each_with_index do |row, i|
|
50
50
|
canonical = row["SCIENTIFIC_NAME"]
|
@@ -1,10 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module DwcaHunter
|
4
|
+
# Wikispecies source
|
4
5
|
class ResourceWikispecies < DwcaHunter::Resource
|
5
|
-
def initialize(opts = {})
|
6
|
+
def initialize(opts = { download: true, unpack: true })
|
6
7
|
@wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
|
7
|
-
@problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
|
8
|
+
@problems_file = File.open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
|
8
9
|
@command = "wikispecies"
|
9
10
|
@title = "Wikispecies"
|
10
11
|
@url = "http://dumps.wikimedia.org/specieswiki/latest/" \
|
@@ -18,12 +19,13 @@ module DwcaHunter
|
|
18
19
|
@tree = {}
|
19
20
|
@paths = {}
|
20
21
|
@extensions = []
|
22
|
+
@parser = Biodiversity::Parser
|
21
23
|
@re = {
|
22
|
-
page_start: /^\s
|
23
|
-
page_end: %r{^\s
|
24
|
+
page_start: /^\s*<page>\s*$/,
|
25
|
+
page_end: %r{^\s*</page>\s*$},
|
24
26
|
template: /Template:/i,
|
25
|
-
template_link: /\{\{([
|
26
|
-
vernacular_names: /\{\{\s*VN\s*\|([
|
27
|
+
template_link: /\{\{([^}]*)\}\}/,
|
28
|
+
vernacular_names: /\{\{\s*VN\s*\|([^}]+)\}\}/i
|
27
29
|
}
|
28
30
|
super(opts)
|
29
31
|
end
|
@@ -39,7 +41,6 @@ module DwcaHunter
|
|
39
41
|
|
40
42
|
def make_dwca
|
41
43
|
enrich_data
|
42
|
-
extend_classification
|
43
44
|
generate_dwca
|
44
45
|
end
|
45
46
|
|
@@ -62,13 +63,15 @@ module DwcaHunter
|
|
62
63
|
if l.match(@re[:page_end])
|
63
64
|
page_on = false
|
64
65
|
page_xml = Nokogiri::XML.parse(page)
|
65
|
-
template?(page_xml)
|
66
|
-
process_template(page_xml)
|
66
|
+
if template?(page_xml)
|
67
|
+
process_template(page_xml)
|
68
|
+
else
|
67
69
|
process_species(page_xml)
|
70
|
+
end
|
68
71
|
page_num += 1
|
69
|
-
if page_num % BATCH_SIZE
|
72
|
+
if (page_num % BATCH_SIZE).zero?
|
70
73
|
DwcaHunter.logger_write(object_id,
|
71
|
-
"Traversed
|
74
|
+
"Traversed #{page_num} pages")
|
72
75
|
end
|
73
76
|
page = ""
|
74
77
|
@page_title = nil
|
@@ -81,51 +84,6 @@ module DwcaHunter
|
|
81
84
|
f.close
|
82
85
|
end
|
83
86
|
|
84
|
-
def extend_classification
|
85
|
-
DwcaHunter.logger_write(object_id, "Extending classifications")
|
86
|
-
@data.each_with_index do |d, i|
|
87
|
-
unless d[:classificationPath].empty?
|
88
|
-
n = 50
|
89
|
-
while n > 0
|
90
|
-
n -= 1
|
91
|
-
if n == 0
|
92
|
-
d[:classificationPath] = []
|
93
|
-
break
|
94
|
-
end
|
95
|
-
parent = @templates[d[:classificationPath].first]
|
96
|
-
if parent
|
97
|
-
d[:classificationPath].unshift(parent[:parentName])
|
98
|
-
else
|
99
|
-
update_tree(d[:classificationPath])
|
100
|
-
break
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
# d[:classificationPath] = d[:classificationPath].join("|").
|
105
|
-
# gsub("Main Page", "Life")
|
106
|
-
if i % BATCH_SIZE == 0 && i > 0
|
107
|
-
DwcaHunter.logger_write(object_id,
|
108
|
-
"Extended %s classifications" % i)
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
|
-
def update_tree(path)
|
114
|
-
path = path.dup
|
115
|
-
return if @paths.key?(path.join("|"))
|
116
|
-
|
117
|
-
(0...path.size).each do |i|
|
118
|
-
subpath = path[0..i]
|
119
|
-
subpath_string = subpath.join("|")
|
120
|
-
next if @paths.key?(subpath_string)
|
121
|
-
|
122
|
-
name = subpath.pop
|
123
|
-
tree_element = subpath.inject(@tree) { |res, n| res[n] }
|
124
|
-
tree_element[name] = {}
|
125
|
-
@paths[subpath_string] = 1
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
87
|
def process_template(x)
|
130
88
|
name = page_title(x).gsub!(@re[:template], "").strip
|
131
89
|
text = x.xpath("//text").text.strip
|
@@ -161,23 +119,28 @@ module DwcaHunter
|
|
161
119
|
}
|
162
120
|
get_full_scientific_name(items)
|
163
121
|
get_vernacular_names(items)
|
164
|
-
init_classification_path(items)
|
165
122
|
end
|
166
123
|
end
|
167
124
|
|
168
125
|
def get_full_scientific_name(items)
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
126
|
+
name_ary = items["{{int:name}}"]
|
127
|
+
|
128
|
+
if name_ary.nil? || name_ary.empty?
|
129
|
+
@problems_file.write("%s\n" % @data[-1][:canonicalForm])
|
130
|
+
return
|
131
|
+
end
|
132
|
+
|
133
|
+
name = name_ary[0]
|
134
|
+
name = parse_name(name, @data[-1])
|
135
|
+
if name != ""
|
136
|
+
@data[-1][:scientificName] = name
|
175
137
|
end
|
176
138
|
end
|
177
139
|
|
178
140
|
def get_vernacular_names(items)
|
179
|
-
|
180
|
-
|
141
|
+
vern = items["{{int:vernacular names}}"]
|
142
|
+
if vern.is_a?(Array) && vern.size.positive?
|
143
|
+
vn_string = vern.join("")
|
181
144
|
vn = vn_string.match(@re[:vernacular_names])
|
182
145
|
if vn
|
183
146
|
vn_list = vn[1].strip.split("|")
|
@@ -214,8 +177,8 @@ module DwcaHunter
|
|
214
177
|
|
215
178
|
def find_species_components(x)
|
216
179
|
items = get_items(x.xpath("//text").text)
|
217
|
-
is_taxon_item = items.key?("name")
|
218
|
-
items.key?("taxonavigation")
|
180
|
+
is_taxon_item = items.key?("{{int:name}}") &&
|
181
|
+
items.key?("{{int:taxonavigation}}")
|
219
182
|
return nil unless is_taxon_item
|
220
183
|
|
221
184
|
items
|
@@ -226,7 +189,7 @@ module DwcaHunter
|
|
226
189
|
items = {}
|
227
190
|
current_item = nil
|
228
191
|
txt.split("\n").each do |l|
|
229
|
-
item = l.match(
|
192
|
+
item = l.match(/=+([^=]+)=+/)
|
230
193
|
if item
|
231
194
|
current_item = item[1].strip.downcase
|
232
195
|
items[current_item] = []
|
@@ -255,22 +218,25 @@ module DwcaHunter
|
|
255
218
|
old_l = name_string.dup
|
256
219
|
name_string.gsub!(/^\*\s*/, "")
|
257
220
|
name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
|
258
|
-
name_string.gsub!(/\{\{([
|
259
|
-
name_string.gsub!(/
|
260
|
-
name_string.gsub!(/
|
261
|
-
name_string.gsub!(
|
221
|
+
name_string.gsub!(/\{\{([^}]+\|)?([^}]*)\}\}/, '\2')
|
222
|
+
name_string.gsub!(/'{2,}/, " ")
|
223
|
+
name_string.gsub!(/"{2,}/, " ")
|
224
|
+
name_string.gsub!(/:\s*\d.*$/, "")
|
262
225
|
name_string.gsub!(/,\s*\[RSD\]/i, "")
|
263
226
|
name_string.gsub!(/^\s*†\s*/, "")
|
264
227
|
name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
|
265
228
|
# name_string = DwcaHunter::XML.unescape(name_string)
|
266
|
-
name_string.gsub!(
|
267
|
-
name_string.gsub!(%r{
|
268
|
-
name_string.gsub!(/^\s
|
229
|
+
name_string.gsub!(/<nowiki>.*$/, "")
|
230
|
+
name_string.gsub!(%r{<br\s*/?\s*>}, "")
|
231
|
+
name_string.gsub!(/^\s*†\s*/, "")
|
269
232
|
name_string.gsub!(/ /, " ")
|
270
233
|
name_string.gsub!(/\s+/, " ")
|
271
|
-
|
272
|
-
|
273
|
-
|
234
|
+
res = name_string.strip
|
235
|
+
parsed = @parser.parse(res, simple: true)
|
236
|
+
if !["1","2"].include?(parsed[:quality])
|
237
|
+
return ""
|
238
|
+
end
|
239
|
+
res
|
274
240
|
end
|
275
241
|
|
276
242
|
def generate_dwca
|
@@ -286,34 +252,35 @@ module DwcaHunter
|
|
286
252
|
count = 0
|
287
253
|
@data.map do |d|
|
288
254
|
count += 1
|
289
|
-
if count % BATCH_SIZE
|
255
|
+
if (count % BATCH_SIZE).zero?
|
290
256
|
DwcaHunter.logger_write(object_id,
|
291
257
|
"Traversing %s core data record" % count)
|
292
258
|
end
|
293
259
|
taxon_id = begin
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
260
|
+
(if d[:classificationPath].empty?
|
261
|
+
d[:taxonId]
|
262
|
+
else
|
263
|
+
@templates[d[:classificationPath].
|
264
|
+
last][:id]
|
265
|
+
end)
|
266
|
+
rescue StandardError
|
267
|
+
d[:taxonId]
|
268
|
+
end
|
301
269
|
@taxon_ids[d[:taxonId]] = taxon_id
|
302
270
|
parentNameUsageId = begin
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
end
|
309
|
-
url = "http://species.wikimedia.org/wiki/" +
|
310
|
-
URI.encode(d[:canonicalForm].gsub(" ", "_"))
|
271
|
+
(@templates[d[:classificationPath][-2]][:id] if d[:classificationPath].size > 1)
|
272
|
+
rescue StandardError
|
273
|
+
nil
|
274
|
+
end
|
275
|
+
url = "http://species.wikimedia.org/wiki/#{CGI.escape(d[:canonicalForm].gsub(' ', '_'))}"
|
311
276
|
path = d[:classificationPath]
|
312
277
|
path.pop if path[-1] == d[:canonicalForm]
|
313
278
|
canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
|
314
|
-
scientific_name = d[:scientificName] == d[:canonicalForm]
|
315
|
-
|
316
|
-
|
279
|
+
scientific_name = if d[:scientificName] == d[:canonicalForm]
|
280
|
+
canonical_form
|
281
|
+
else
|
282
|
+
d[:scientificName]
|
283
|
+
end
|
317
284
|
@core << [taxon_id,
|
318
285
|
scientific_name,
|
319
286
|
canonical_form,
|
@@ -329,7 +296,7 @@ module DwcaHunter
|
|
329
296
|
count = 0
|
330
297
|
@data.each do |d|
|
331
298
|
count += 1
|
332
|
-
if count % BATCH_SIZE
|
299
|
+
if (count % BATCH_SIZE).zero?
|
333
300
|
DwcaHunter.logger_write(object_id,
|
334
301
|
"Traversing %s extension data record" % count)
|
335
302
|
end
|
data/lib/dwca_hunter/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwca_hunter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-03-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: biodiversity
|
@@ -16,28 +16,28 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 5.1.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: 5.1.2
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: dwc-archive
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.1.
|
33
|
+
version: 1.1.3
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.1.
|
40
|
+
version: 1.1.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: gn_uuid
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,14 +86,14 @@ dependencies:
|
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '2.
|
89
|
+
version: '2.1'
|
90
90
|
type: :runtime
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '2.
|
96
|
+
version: '2.1'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: ruby-xz
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,48 +108,62 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '1.0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: rubyzip
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.3'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '2.3'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: thor
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
114
128
|
requirements:
|
115
129
|
- - "~>"
|
116
130
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
131
|
+
version: '1.1'
|
118
132
|
type: :runtime
|
119
133
|
prerelease: false
|
120
134
|
version_requirements: !ruby/object:Gem::Requirement
|
121
135
|
requirements:
|
122
136
|
- - "~>"
|
123
137
|
- !ruby/object:Gem::Version
|
124
|
-
version: '
|
138
|
+
version: '1.1'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: bundler
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
128
142
|
requirements:
|
129
143
|
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
|
-
version: '2.
|
145
|
+
version: '2.2'
|
132
146
|
type: :development
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
150
|
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
|
-
version: '2.
|
152
|
+
version: '2.2'
|
139
153
|
- !ruby/object:Gem::Dependency
|
140
154
|
name: byebug
|
141
155
|
requirement: !ruby/object:Gem::Requirement
|
142
156
|
requirements:
|
143
157
|
- - "~>"
|
144
158
|
- !ruby/object:Gem::Version
|
145
|
-
version: '
|
159
|
+
version: '11.1'
|
146
160
|
type: :development
|
147
161
|
prerelease: false
|
148
162
|
version_requirements: !ruby/object:Gem::Requirement
|
149
163
|
requirements:
|
150
164
|
- - "~>"
|
151
165
|
- !ruby/object:Gem::Version
|
152
|
-
version: '
|
166
|
+
version: '11.1'
|
153
167
|
- !ruby/object:Gem::Dependency
|
154
168
|
name: coveralls
|
155
169
|
requirement: !ruby/object:Gem::Requirement
|
@@ -184,28 +198,42 @@ dependencies:
|
|
184
198
|
requirements:
|
185
199
|
- - "~>"
|
186
200
|
- !ruby/object:Gem::Version
|
187
|
-
version: '3.
|
201
|
+
version: '3.10'
|
188
202
|
type: :development
|
189
203
|
prerelease: false
|
190
204
|
version_requirements: !ruby/object:Gem::Requirement
|
191
205
|
requirements:
|
192
206
|
- - "~>"
|
193
207
|
- !ruby/object:Gem::Version
|
194
|
-
version: '3.
|
208
|
+
version: '3.10'
|
195
209
|
- !ruby/object:Gem::Dependency
|
196
210
|
name: rubocop
|
197
211
|
requirement: !ruby/object:Gem::Requirement
|
198
212
|
requirements:
|
199
213
|
- - "~>"
|
200
214
|
- !ruby/object:Gem::Version
|
201
|
-
version: '
|
215
|
+
version: '1.9'
|
216
|
+
type: :development
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - "~>"
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '1.9'
|
223
|
+
- !ruby/object:Gem::Dependency
|
224
|
+
name: solargraph
|
225
|
+
requirement: !ruby/object:Gem::Requirement
|
226
|
+
requirements:
|
227
|
+
- - "~>"
|
228
|
+
- !ruby/object:Gem::Version
|
229
|
+
version: '0.40'
|
202
230
|
type: :development
|
203
231
|
prerelease: false
|
204
232
|
version_requirements: !ruby/object:Gem::Requirement
|
205
233
|
requirements:
|
206
234
|
- - "~>"
|
207
235
|
- !ruby/object:Gem::Version
|
208
|
-
version: '0.
|
236
|
+
version: '0.40'
|
209
237
|
description: Gem harvests data from a variety of formats and converts incoming data
|
210
238
|
to DwCA format.
|
211
239
|
email:
|
@@ -272,14 +300,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
272
300
|
requirements:
|
273
301
|
- - ">="
|
274
302
|
- !ruby/object:Gem::Version
|
275
|
-
version:
|
303
|
+
version: 3.0.0
|
276
304
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
277
305
|
requirements:
|
278
306
|
- - ">="
|
279
307
|
- !ruby/object:Gem::Version
|
280
308
|
version: '0'
|
281
309
|
requirements: []
|
282
|
-
rubygems_version: 3.
|
310
|
+
rubygems_version: 3.2.6
|
283
311
|
signing_key:
|
284
312
|
specification_version: 4
|
285
313
|
summary: Converts a variety of available online resources to DarwinCore Archive files.
|