dwca_hunter 0.7.1 → 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -44,7 +44,7 @@ module DwcaHunter
44
44
 
45
45
  def collect_names
46
46
  @names_index = {}
47
- file = CSV.open(File.join(@download_dir, "taxonomy_export_2020May26.csv"),
47
+ file = CSV.open(File.join(@download_dir, "taxonomy_export_2021Feb2.csv"),
48
48
  headers: true)
49
49
  file.each_with_index do |row, i|
50
50
  canonical = row["SCIENTIFIC_NAME"]
@@ -1,10 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DwcaHunter
4
+ # Wikispecies source
4
5
  class ResourceWikispecies < DwcaHunter::Resource
5
- def initialize(opts = {})
6
+ def initialize(opts = { download: true, unpack: true })
6
7
  @wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
7
- @problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
8
+ @problems_file = File.open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
8
9
  @command = "wikispecies"
9
10
  @title = "Wikispecies"
10
11
  @url = "http://dumps.wikimedia.org/specieswiki/latest/" \
@@ -18,12 +19,13 @@ module DwcaHunter
18
19
  @tree = {}
19
20
  @paths = {}
20
21
  @extensions = []
22
+ @parser = Biodiversity::Parser
21
23
  @re = {
22
- page_start: /^\s*\<page\>\s*$/,
23
- page_end: %r{^\s*\</page\>\s*$},
24
+ page_start: /^\s*<page>\s*$/,
25
+ page_end: %r{^\s*</page>\s*$},
24
26
  template: /Template:/i,
25
- template_link: /\{\{([^\}]*)\}\}/,
26
- vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
27
+ template_link: /\{\{([^}]*)\}\}/,
28
+ vernacular_names: /\{\{\s*VN\s*\|([^}]+)\}\}/i
27
29
  }
28
30
  super(opts)
29
31
  end
@@ -39,7 +41,6 @@ module DwcaHunter
39
41
 
40
42
  def make_dwca
41
43
  enrich_data
42
- extend_classification
43
44
  generate_dwca
44
45
  end
45
46
 
@@ -62,13 +63,15 @@ module DwcaHunter
62
63
  if l.match(@re[:page_end])
63
64
  page_on = false
64
65
  page_xml = Nokogiri::XML.parse(page)
65
- template?(page_xml) ?
66
- process_template(page_xml) :
66
+ if template?(page_xml)
67
+ process_template(page_xml)
68
+ else
67
69
  process_species(page_xml)
70
+ end
68
71
  page_num += 1
69
- if page_num % BATCH_SIZE == 0
72
+ if (page_num % BATCH_SIZE).zero?
70
73
  DwcaHunter.logger_write(object_id,
71
- "Traversed %s pages" % page_num)
74
+ "Traversed #{page_num} pages")
72
75
  end
73
76
  page = ""
74
77
  @page_title = nil
@@ -81,51 +84,6 @@ module DwcaHunter
81
84
  f.close
82
85
  end
83
86
 
84
- def extend_classification
85
- DwcaHunter.logger_write(object_id, "Extending classifications")
86
- @data.each_with_index do |d, i|
87
- unless d[:classificationPath].empty?
88
- n = 50
89
- while n > 0
90
- n -= 1
91
- if n == 0
92
- d[:classificationPath] = []
93
- break
94
- end
95
- parent = @templates[d[:classificationPath].first]
96
- if parent
97
- d[:classificationPath].unshift(parent[:parentName])
98
- else
99
- update_tree(d[:classificationPath])
100
- break
101
- end
102
- end
103
- end
104
- # d[:classificationPath] = d[:classificationPath].join("|").
105
- # gsub("Main Page", "Life")
106
- if i % BATCH_SIZE == 0 && i > 0
107
- DwcaHunter.logger_write(object_id,
108
- "Extended %s classifications" % i)
109
- end
110
- end
111
- end
112
-
113
- def update_tree(path)
114
- path = path.dup
115
- return if @paths.key?(path.join("|"))
116
-
117
- (0...path.size).each do |i|
118
- subpath = path[0..i]
119
- subpath_string = subpath.join("|")
120
- next if @paths.key?(subpath_string)
121
-
122
- name = subpath.pop
123
- tree_element = subpath.inject(@tree) { |res, n| res[n] }
124
- tree_element[name] = {}
125
- @paths[subpath_string] = 1
126
- end
127
- end
128
-
129
87
  def process_template(x)
130
88
  name = page_title(x).gsub!(@re[:template], "").strip
131
89
  text = x.xpath("//text").text.strip
@@ -161,23 +119,28 @@ module DwcaHunter
161
119
  }
162
120
  get_full_scientific_name(items)
163
121
  get_vernacular_names(items)
164
- init_classification_path(items)
165
122
  end
166
123
  end
167
124
 
168
125
  def get_full_scientific_name(items)
169
- if items["name"]
170
- if name = items["name"][0]
171
- @data[-1][:scientificName] = parse_name(name, @data[-1])
172
- else
173
- @problems_file.write("%s\n" % @data[-1][:canonicalForm])
174
- end
126
+ name_ary = items["{{int:name}}"]
127
+
128
+ if name_ary.nil? || name_ary.empty?
129
+ @problems_file.write("%s\n" % @data[-1][:canonicalForm])
130
+ return
131
+ end
132
+
133
+ name = name_ary[0]
134
+ name = parse_name(name, @data[-1])
135
+ if name != ""
136
+ @data[-1][:scientificName] = name
175
137
  end
176
138
  end
177
139
 
178
140
  def get_vernacular_names(items)
179
- if items["vernacular names"] && !items["vernacular names"].empty?
180
- vn_string = items["vernacular names"].join("")
141
+ vern = items["{{int:vernacular names}}"]
142
+ if vern.is_a?(Array) && vern.size.positive?
143
+ vn_string = vern.join("")
181
144
  vn = vn_string.match(@re[:vernacular_names])
182
145
  if vn
183
146
  vn_list = vn[1].strip.split("|")
@@ -214,8 +177,8 @@ module DwcaHunter
214
177
 
215
178
  def find_species_components(x)
216
179
  items = get_items(x.xpath("//text").text)
217
- is_taxon_item = items.key?("name") ||
218
- items.key?("taxonavigation")
180
+ is_taxon_item = items.key?("{{int:name}}") &&
181
+ items.key?("{{int:taxonavigation}}")
219
182
  return nil unless is_taxon_item
220
183
 
221
184
  items
@@ -226,7 +189,7 @@ module DwcaHunter
226
189
  items = {}
227
190
  current_item = nil
228
191
  txt.split("\n").each do |l|
229
- item = l.match(/[\=]+([^\=]+)[\=]+/)
192
+ item = l.match(/=+([^=]+)=+/)
230
193
  if item
231
194
  current_item = item[1].strip.downcase
232
195
  items[current_item] = []
@@ -255,22 +218,25 @@ module DwcaHunter
255
218
  old_l = name_string.dup
256
219
  name_string.gsub!(/^\*\s*/, "")
257
220
  name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
258
- name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
259
- name_string.gsub!(/[']{2,}/, " ")
260
- name_string.gsub!(/["]{2,}/, " ")
261
- name_string.gsub!(/\:\s*\d.*$/, "")
221
+ name_string.gsub!(/\{\{([^}]+\|)?([^}]*)\}\}/, '\2')
222
+ name_string.gsub!(/'{2,}/, " ")
223
+ name_string.gsub!(/"{2,}/, " ")
224
+ name_string.gsub!(/:\s*\d.*$/, "")
262
225
  name_string.gsub!(/,\s*\[RSD\]/i, "")
263
226
  name_string.gsub!(/^\s*†\s*/, "")
264
227
  name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
265
228
  # name_string = DwcaHunter::XML.unescape(name_string)
266
- name_string.gsub!(/\<nowiki\>.*$/, "")
267
- name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
268
- name_string.gsub!(/^\s*\&dagger;\s*/, "")
229
+ name_string.gsub!(/<nowiki>.*$/, "")
230
+ name_string.gsub!(%r{<br\s*/?\s*>}, "")
231
+ name_string.gsub!(/^\s*&dagger;\s*/, "")
269
232
  name_string.gsub!(/&nbsp;/, " ")
270
233
  name_string.gsub!(/\s+/, " ")
271
- name_string = name_string.strip
272
- # puts "%s---%s" % [name_string, old_l]
273
- name_string
234
+ res = name_string.strip
235
+ parsed = @parser.parse(res, simple: true)
236
+ if !["1","2"].include?(parsed[:quality])
237
+ return ""
238
+ end
239
+ res
274
240
  end
275
241
 
276
242
  def generate_dwca
@@ -286,34 +252,35 @@ module DwcaHunter
286
252
  count = 0
287
253
  @data.map do |d|
288
254
  count += 1
289
- if count % BATCH_SIZE == 0
255
+ if (count % BATCH_SIZE).zero?
290
256
  DwcaHunter.logger_write(object_id,
291
257
  "Traversing %s core data record" % count)
292
258
  end
293
259
  taxon_id = begin
294
- (d[:classificationPath].empty? ?
295
- d[:taxonId] :
296
- @templates[d[:classificationPath].
297
- last][:id])
298
- rescue StandardError
299
- d[:taxonId]
300
- end
260
+ (if d[:classificationPath].empty?
261
+ d[:taxonId]
262
+ else
263
+ @templates[d[:classificationPath].
264
+ last][:id]
265
+ end)
266
+ rescue StandardError
267
+ d[:taxonId]
268
+ end
301
269
  @taxon_ids[d[:taxonId]] = taxon_id
302
270
  parentNameUsageId = begin
303
- (d[:classificationPath].size > 1 ?
304
- @templates[d[:classificationPath][-2]][:id] :
305
- nil)
306
- rescue StandardError
307
- nil
308
- end
309
- url = "http://species.wikimedia.org/wiki/" +
310
- URI.encode(d[:canonicalForm].gsub(" ", "_"))
271
+ (@templates[d[:classificationPath][-2]][:id] if d[:classificationPath].size > 1)
272
+ rescue StandardError
273
+ nil
274
+ end
275
+ url = "http://species.wikimedia.org/wiki/#{CGI.escape(d[:canonicalForm].gsub(' ', '_'))}"
311
276
  path = d[:classificationPath]
312
277
  path.pop if path[-1] == d[:canonicalForm]
313
278
  canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
314
- scientific_name = d[:scientificName] == d[:canonicalForm] ?
315
- canonical_form :
316
- d[:scientificName]
279
+ scientific_name = if d[:scientificName] == d[:canonicalForm]
280
+ canonical_form
281
+ else
282
+ d[:scientificName]
283
+ end
317
284
  @core << [taxon_id,
318
285
  scientific_name,
319
286
  canonical_form,
@@ -329,7 +296,7 @@ module DwcaHunter
329
296
  count = 0
330
297
  @data.each do |d|
331
298
  count += 1
332
- if count % BATCH_SIZE == 0
299
+ if (count % BATCH_SIZE).zero?
333
300
  DwcaHunter.logger_write(object_id,
334
301
  "Traversing %s extension data record" % count)
335
302
  end
@@ -1,5 +1,5 @@
1
1
  module DwcaHunter
2
- VERSION = "0.7.1"
2
+ VERSION = "0.7.2"
3
3
 
4
4
  def self.version
5
5
  VERSION
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwca_hunter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.7.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-01-07 00:00:00.000000000 Z
11
+ date: 2021-03-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: biodiversity
@@ -16,28 +16,28 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '4'
19
+ version: 5.1.2
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '4'
26
+ version: 5.1.2
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: dwc-archive
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.1.1
33
+ version: 1.1.3
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.1.1
40
+ version: 1.1.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: gn_uuid
43
43
  requirement: !ruby/object:Gem::Requirement
@@ -86,14 +86,14 @@ dependencies:
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '2.0'
89
+ version: '2.1'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '2.0'
96
+ version: '2.1'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: ruby-xz
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -108,48 +108,62 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '1.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rubyzip
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '2.3'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '2.3'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: thor
113
127
  requirement: !ruby/object:Gem::Requirement
114
128
  requirements:
115
129
  - - "~>"
116
130
  - !ruby/object:Gem::Version
117
- version: '0.19'
131
+ version: '1.1'
118
132
  type: :runtime
119
133
  prerelease: false
120
134
  version_requirements: !ruby/object:Gem::Requirement
121
135
  requirements:
122
136
  - - "~>"
123
137
  - !ruby/object:Gem::Version
124
- version: '0.19'
138
+ version: '1.1'
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: bundler
127
141
  requirement: !ruby/object:Gem::Requirement
128
142
  requirements:
129
143
  - - "~>"
130
144
  - !ruby/object:Gem::Version
131
- version: '2.0'
145
+ version: '2.2'
132
146
  type: :development
133
147
  prerelease: false
134
148
  version_requirements: !ruby/object:Gem::Requirement
135
149
  requirements:
136
150
  - - "~>"
137
151
  - !ruby/object:Gem::Version
138
- version: '2.0'
152
+ version: '2.2'
139
153
  - !ruby/object:Gem::Dependency
140
154
  name: byebug
141
155
  requirement: !ruby/object:Gem::Requirement
142
156
  requirements:
143
157
  - - "~>"
144
158
  - !ruby/object:Gem::Version
145
- version: '10.0'
159
+ version: '11.1'
146
160
  type: :development
147
161
  prerelease: false
148
162
  version_requirements: !ruby/object:Gem::Requirement
149
163
  requirements:
150
164
  - - "~>"
151
165
  - !ruby/object:Gem::Version
152
- version: '10.0'
166
+ version: '11.1'
153
167
  - !ruby/object:Gem::Dependency
154
168
  name: coveralls
155
169
  requirement: !ruby/object:Gem::Requirement
@@ -184,28 +198,42 @@ dependencies:
184
198
  requirements:
185
199
  - - "~>"
186
200
  - !ruby/object:Gem::Version
187
- version: '3.9'
201
+ version: '3.10'
188
202
  type: :development
189
203
  prerelease: false
190
204
  version_requirements: !ruby/object:Gem::Requirement
191
205
  requirements:
192
206
  - - "~>"
193
207
  - !ruby/object:Gem::Version
194
- version: '3.9'
208
+ version: '3.10'
195
209
  - !ruby/object:Gem::Dependency
196
210
  name: rubocop
197
211
  requirement: !ruby/object:Gem::Requirement
198
212
  requirements:
199
213
  - - "~>"
200
214
  - !ruby/object:Gem::Version
201
- version: '0.84'
215
+ version: '1.9'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '1.9'
223
+ - !ruby/object:Gem::Dependency
224
+ name: solargraph
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - "~>"
228
+ - !ruby/object:Gem::Version
229
+ version: '0.40'
202
230
  type: :development
203
231
  prerelease: false
204
232
  version_requirements: !ruby/object:Gem::Requirement
205
233
  requirements:
206
234
  - - "~>"
207
235
  - !ruby/object:Gem::Version
208
- version: '0.84'
236
+ version: '0.40'
209
237
  description: Gem harvests data from a variety of formats and converts incoming data
210
238
  to DwCA format.
211
239
  email:
@@ -272,14 +300,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
272
300
  requirements:
273
301
  - - ">="
274
302
  - !ruby/object:Gem::Version
275
- version: 2.6.6
303
+ version: 3.0.0
276
304
  required_rubygems_version: !ruby/object:Gem::Requirement
277
305
  requirements:
278
306
  - - ">="
279
307
  - !ruby/object:Gem::Version
280
308
  version: '0'
281
309
  requirements: []
282
- rubygems_version: 3.0.3
310
+ rubygems_version: 3.2.6
283
311
  signing_key:
284
312
  specification_version: 4
285
313
  summary: Converts a variety of available online resources to DarwinCore Archive files.