dwca_hunter 0.7.1 → 0.7.2

This diff shows the changes between publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -44,7 +44,7 @@ module DwcaHunter
44
44
 
45
45
  def collect_names
46
46
  @names_index = {}
47
- file = CSV.open(File.join(@download_dir, "taxonomy_export_2020May26.csv"),
47
+ file = CSV.open(File.join(@download_dir, "taxonomy_export_2021Feb2.csv"),
48
48
  headers: true)
49
49
  file.each_with_index do |row, i|
50
50
  canonical = row["SCIENTIFIC_NAME"]
@@ -1,10 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DwcaHunter
4
+ # Wikispecies source
4
5
  class ResourceWikispecies < DwcaHunter::Resource
5
- def initialize(opts = {})
6
+ def initialize(opts = { download: true, unpack: true })
6
7
  @wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
7
- @problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
8
+ @problems_file = File.open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
8
9
  @command = "wikispecies"
9
10
  @title = "Wikispecies"
10
11
  @url = "http://dumps.wikimedia.org/specieswiki/latest/" \
@@ -18,12 +19,13 @@ module DwcaHunter
18
19
  @tree = {}
19
20
  @paths = {}
20
21
  @extensions = []
22
+ @parser = Biodiversity::Parser
21
23
  @re = {
22
- page_start: /^\s*\<page\>\s*$/,
23
- page_end: %r{^\s*\</page\>\s*$},
24
+ page_start: /^\s*<page>\s*$/,
25
+ page_end: %r{^\s*</page>\s*$},
24
26
  template: /Template:/i,
25
- template_link: /\{\{([^\}]*)\}\}/,
26
- vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
27
+ template_link: /\{\{([^}]*)\}\}/,
28
+ vernacular_names: /\{\{\s*VN\s*\|([^}]+)\}\}/i
27
29
  }
28
30
  super(opts)
29
31
  end
@@ -39,7 +41,6 @@ module DwcaHunter
39
41
 
40
42
  def make_dwca
41
43
  enrich_data
42
- extend_classification
43
44
  generate_dwca
44
45
  end
45
46
 
@@ -62,13 +63,15 @@ module DwcaHunter
62
63
  if l.match(@re[:page_end])
63
64
  page_on = false
64
65
  page_xml = Nokogiri::XML.parse(page)
65
- template?(page_xml) ?
66
- process_template(page_xml) :
66
+ if template?(page_xml)
67
+ process_template(page_xml)
68
+ else
67
69
  process_species(page_xml)
70
+ end
68
71
  page_num += 1
69
- if page_num % BATCH_SIZE == 0
72
+ if (page_num % BATCH_SIZE).zero?
70
73
  DwcaHunter.logger_write(object_id,
71
- "Traversed %s pages" % page_num)
74
+ "Traversed #{page_num} pages")
72
75
  end
73
76
  page = ""
74
77
  @page_title = nil
@@ -81,51 +84,6 @@ module DwcaHunter
81
84
  f.close
82
85
  end
83
86
 
84
- def extend_classification
85
- DwcaHunter.logger_write(object_id, "Extending classifications")
86
- @data.each_with_index do |d, i|
87
- unless d[:classificationPath].empty?
88
- n = 50
89
- while n > 0
90
- n -= 1
91
- if n == 0
92
- d[:classificationPath] = []
93
- break
94
- end
95
- parent = @templates[d[:classificationPath].first]
96
- if parent
97
- d[:classificationPath].unshift(parent[:parentName])
98
- else
99
- update_tree(d[:classificationPath])
100
- break
101
- end
102
- end
103
- end
104
- # d[:classificationPath] = d[:classificationPath].join("|").
105
- # gsub("Main Page", "Life")
106
- if i % BATCH_SIZE == 0 && i > 0
107
- DwcaHunter.logger_write(object_id,
108
- "Extended %s classifications" % i)
109
- end
110
- end
111
- end
112
-
113
- def update_tree(path)
114
- path = path.dup
115
- return if @paths.key?(path.join("|"))
116
-
117
- (0...path.size).each do |i|
118
- subpath = path[0..i]
119
- subpath_string = subpath.join("|")
120
- next if @paths.key?(subpath_string)
121
-
122
- name = subpath.pop
123
- tree_element = subpath.inject(@tree) { |res, n| res[n] }
124
- tree_element[name] = {}
125
- @paths[subpath_string] = 1
126
- end
127
- end
128
-
129
87
  def process_template(x)
130
88
  name = page_title(x).gsub!(@re[:template], "").strip
131
89
  text = x.xpath("//text").text.strip
@@ -161,23 +119,28 @@ module DwcaHunter
161
119
  }
162
120
  get_full_scientific_name(items)
163
121
  get_vernacular_names(items)
164
- init_classification_path(items)
165
122
  end
166
123
  end
167
124
 
168
125
  def get_full_scientific_name(items)
169
- if items["name"]
170
- if name = items["name"][0]
171
- @data[-1][:scientificName] = parse_name(name, @data[-1])
172
- else
173
- @problems_file.write("%s\n" % @data[-1][:canonicalForm])
174
- end
126
+ name_ary = items["{{int:name}}"]
127
+
128
+ if name_ary.nil? || name_ary.empty?
129
+ @problems_file.write("%s\n" % @data[-1][:canonicalForm])
130
+ return
131
+ end
132
+
133
+ name = name_ary[0]
134
+ name = parse_name(name, @data[-1])
135
+ if name != ""
136
+ @data[-1][:scientificName] = name
175
137
  end
176
138
  end
177
139
 
178
140
  def get_vernacular_names(items)
179
- if items["vernacular names"] && !items["vernacular names"].empty?
180
- vn_string = items["vernacular names"].join("")
141
+ vern = items["{{int:vernacular names}}"]
142
+ if vern.is_a?(Array) && vern.size.positive?
143
+ vn_string = vern.join("")
181
144
  vn = vn_string.match(@re[:vernacular_names])
182
145
  if vn
183
146
  vn_list = vn[1].strip.split("|")
@@ -214,8 +177,8 @@ module DwcaHunter
214
177
 
215
178
  def find_species_components(x)
216
179
  items = get_items(x.xpath("//text").text)
217
- is_taxon_item = items.key?("name") ||
218
- items.key?("taxonavigation")
180
+ is_taxon_item = items.key?("{{int:name}}") &&
181
+ items.key?("{{int:taxonavigation}}")
219
182
  return nil unless is_taxon_item
220
183
 
221
184
  items
@@ -226,7 +189,7 @@ module DwcaHunter
226
189
  items = {}
227
190
  current_item = nil
228
191
  txt.split("\n").each do |l|
229
- item = l.match(/[\=]+([^\=]+)[\=]+/)
192
+ item = l.match(/=+([^=]+)=+/)
230
193
  if item
231
194
  current_item = item[1].strip.downcase
232
195
  items[current_item] = []
@@ -255,22 +218,25 @@ module DwcaHunter
255
218
  old_l = name_string.dup
256
219
  name_string.gsub!(/^\*\s*/, "")
257
220
  name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
258
- name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
259
- name_string.gsub!(/[']{2,}/, " ")
260
- name_string.gsub!(/["]{2,}/, " ")
261
- name_string.gsub!(/\:\s*\d.*$/, "")
221
+ name_string.gsub!(/\{\{([^}]+\|)?([^}]*)\}\}/, '\2')
222
+ name_string.gsub!(/'{2,}/, " ")
223
+ name_string.gsub!(/"{2,}/, " ")
224
+ name_string.gsub!(/:\s*\d.*$/, "")
262
225
  name_string.gsub!(/,\s*\[RSD\]/i, "")
263
226
  name_string.gsub!(/^\s*†\s*/, "")
264
227
  name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
265
228
  # name_string = DwcaHunter::XML.unescape(name_string)
266
- name_string.gsub!(/\<nowiki\>.*$/, "")
267
- name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
268
- name_string.gsub!(/^\s*\&dagger;\s*/, "")
229
+ name_string.gsub!(/<nowiki>.*$/, "")
230
+ name_string.gsub!(%r{<br\s*/?\s*>}, "")
231
+ name_string.gsub!(/^\s*&dagger;\s*/, "")
269
232
  name_string.gsub!(/&nbsp;/, " ")
270
233
  name_string.gsub!(/\s+/, " ")
271
- name_string = name_string.strip
272
- # puts "%s---%s" % [name_string, old_l]
273
- name_string
234
+ res = name_string.strip
235
+ parsed = @parser.parse(res, simple: true)
236
+ if !["1","2"].include?(parsed[:quality])
237
+ return ""
238
+ end
239
+ res
274
240
  end
275
241
 
276
242
  def generate_dwca
@@ -286,34 +252,35 @@ module DwcaHunter
286
252
  count = 0
287
253
  @data.map do |d|
288
254
  count += 1
289
- if count % BATCH_SIZE == 0
255
+ if (count % BATCH_SIZE).zero?
290
256
  DwcaHunter.logger_write(object_id,
291
257
  "Traversing %s core data record" % count)
292
258
  end
293
259
  taxon_id = begin
294
- (d[:classificationPath].empty? ?
295
- d[:taxonId] :
296
- @templates[d[:classificationPath].
297
- last][:id])
298
- rescue StandardError
299
- d[:taxonId]
300
- end
260
+ (if d[:classificationPath].empty?
261
+ d[:taxonId]
262
+ else
263
+ @templates[d[:classificationPath].
264
+ last][:id]
265
+ end)
266
+ rescue StandardError
267
+ d[:taxonId]
268
+ end
301
269
  @taxon_ids[d[:taxonId]] = taxon_id
302
270
  parentNameUsageId = begin
303
- (d[:classificationPath].size > 1 ?
304
- @templates[d[:classificationPath][-2]][:id] :
305
- nil)
306
- rescue StandardError
307
- nil
308
- end
309
- url = "http://species.wikimedia.org/wiki/" +
310
- URI.encode(d[:canonicalForm].gsub(" ", "_"))
271
+ (@templates[d[:classificationPath][-2]][:id] if d[:classificationPath].size > 1)
272
+ rescue StandardError
273
+ nil
274
+ end
275
+ url = "http://species.wikimedia.org/wiki/#{CGI.escape(d[:canonicalForm].gsub(' ', '_'))}"
311
276
  path = d[:classificationPath]
312
277
  path.pop if path[-1] == d[:canonicalForm]
313
278
  canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
314
- scientific_name = d[:scientificName] == d[:canonicalForm] ?
315
- canonical_form :
316
- d[:scientificName]
279
+ scientific_name = if d[:scientificName] == d[:canonicalForm]
280
+ canonical_form
281
+ else
282
+ d[:scientificName]
283
+ end
317
284
  @core << [taxon_id,
318
285
  scientific_name,
319
286
  canonical_form,
@@ -329,7 +296,7 @@ module DwcaHunter
329
296
  count = 0
330
297
  @data.each do |d|
331
298
  count += 1
332
- if count % BATCH_SIZE == 0
299
+ if (count % BATCH_SIZE).zero?
333
300
  DwcaHunter.logger_write(object_id,
334
301
  "Traversing %s extension data record" % count)
335
302
  end
@@ -1,5 +1,5 @@
1
1
  module DwcaHunter
2
- VERSION = "0.7.1"
2
+ VERSION = "0.7.2"
3
3
 
4
4
  def self.version
5
5
  VERSION
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwca_hunter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-01-07 00:00:00.000000000 Z
11
+ date: 2021-03-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: biodiversity
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '4'
19
+ version: 5.1.2
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '4'
26
+ version: 5.1.2
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: dwc-archive
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.1.1
33
+ version: 1.1.3
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.1.1
40
+ version: 1.1.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: gn_uuid
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -86,14 +86,14 @@ dependencies:
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '2.0'
89
+ version: '2.1'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '2.0'
96
+ version: '2.1'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: ruby-xz
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -108,48 +108,62 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '1.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rubyzip
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '2.3'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '2.3'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: thor
113
127
  requirement: !ruby/object:Gem::Requirement
114
128
  requirements:
115
129
  - - "~>"
116
130
  - !ruby/object:Gem::Version
117
- version: '0.19'
131
+ version: '1.1'
118
132
  type: :runtime
119
133
  prerelease: false
120
134
  version_requirements: !ruby/object:Gem::Requirement
121
135
  requirements:
122
136
  - - "~>"
123
137
  - !ruby/object:Gem::Version
124
- version: '0.19'
138
+ version: '1.1'
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: bundler
127
141
  requirement: !ruby/object:Gem::Requirement
128
142
  requirements:
129
143
  - - "~>"
130
144
  - !ruby/object:Gem::Version
131
- version: '2.0'
145
+ version: '2.2'
132
146
  type: :development
133
147
  prerelease: false
134
148
  version_requirements: !ruby/object:Gem::Requirement
135
149
  requirements:
136
150
  - - "~>"
137
151
  - !ruby/object:Gem::Version
138
- version: '2.0'
152
+ version: '2.2'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: byebug
141
155
  requirement: !ruby/object:Gem::Requirement
142
156
  requirements:
143
157
  - - "~>"
144
158
  - !ruby/object:Gem::Version
145
- version: '10.0'
159
+ version: '11.1'
146
160
  type: :development
147
161
  prerelease: false
148
162
  version_requirements: !ruby/object:Gem::Requirement
149
163
  requirements:
150
164
  - - "~>"
151
165
  - !ruby/object:Gem::Version
152
- version: '10.0'
166
+ version: '11.1'
153
167
  - !ruby/object:Gem::Dependency
154
168
  name: coveralls
155
169
  requirement: !ruby/object:Gem::Requirement
@@ -184,28 +198,42 @@ dependencies:
184
198
  requirements:
185
199
  - - "~>"
186
200
  - !ruby/object:Gem::Version
187
- version: '3.9'
201
+ version: '3.10'
188
202
  type: :development
189
203
  prerelease: false
190
204
  version_requirements: !ruby/object:Gem::Requirement
191
205
  requirements:
192
206
  - - "~>"
193
207
  - !ruby/object:Gem::Version
194
- version: '3.9'
208
+ version: '3.10'
195
209
  - !ruby/object:Gem::Dependency
196
210
  name: rubocop
197
211
  requirement: !ruby/object:Gem::Requirement
198
212
  requirements:
199
213
  - - "~>"
200
214
  - !ruby/object:Gem::Version
201
- version: '0.84'
215
+ version: '1.9'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '1.9'
223
+ - !ruby/object:Gem::Dependency
224
+ name: solargraph
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - "~>"
228
+ - !ruby/object:Gem::Version
229
+ version: '0.40'
202
230
  type: :development
203
231
  prerelease: false
204
232
  version_requirements: !ruby/object:Gem::Requirement
205
233
  requirements:
206
234
  - - "~>"
207
235
  - !ruby/object:Gem::Version
208
- version: '0.84'
236
+ version: '0.40'
209
237
  description: Gem harvests data from a variety of formats and converts incoming data
210
238
  to DwCA format.
211
239
  email:
@@ -272,14 +300,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
272
300
  requirements:
273
301
  - - ">="
274
302
  - !ruby/object:Gem::Version
275
- version: 2.6.6
303
+ version: 3.0.0
276
304
  required_rubygems_version: !ruby/object:Gem::Requirement
277
305
  requirements:
278
306
  - - ">="
279
307
  - !ruby/object:Gem::Version
280
308
  version: '0'
281
309
  requirements: []
282
- rubygems_version: 3.0.3
310
+ rubygems_version: 3.2.6
283
311
  signing_key:
284
312
  specification_version: 4
285
313
  summary: Converts a variety of available online resources to DarwinCore Archive files.