dwca_hunter 0.5.5 → 0.7.0

This diff shows the changes between publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registries.
@@ -0,0 +1,140 @@
+ class PaleodbHarvester
+   def initialize(download_dir)
+     @dir = File.join(download_dir, "json")
+     FileUtils.mkdir_p(@dir)
+     @in_dir = download_dir
+     @taxa_csv = CSV.open(File.join(@in_dir, "taxa.csv"), headers: true)
+     @refs_csv = CSV.open(File.join(@in_dir, "refs.csv"), headers: true)
+     @taxa_refs_csv = CSV.open(File.join(@in_dir, "taxa_refs.csv"), headers: true)
+     @occurences_csv = CSV.open(File.join(@in_dir, "occurences.csv"), headers: true)
+   end
+
+   def taxa
+     # "orig_no","taxon_no","record_type","flags","taxon_rank",
+     # "taxon_name","difference","accepted_no","accepted_rank",
+     # "accepted_name","parent_no","reference_no","is_extant","n_occs"
+     taxa = {}
+     name2id = {}
+     @taxa_csv.each do |r|
+       r = strip(r)
+       taxa[r["taxon_no"]] = { t_id: r["orig_no"], id: r["taxon_no"],
+                               rank: r["taxon_rank"], name: r["taxon_name"],
+                               auth: r["taxon_attr"],
+                               extinct: extinct(r["is_extant"]),
+                               vernacular: r["common_name"],
+                               annot: r["difference"], acc_id: r["accepted_no"],
+                               acc_rank: r["accepted_rank"],
+                               acc_name: r["accepted_name"], ecol: ecol(r),
+                               parent_id: r["parent_no"], ref: r["reference_no"],
+                               occs_num: r["n_occs"], enterer: enterer(r) }
+
+       name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
+     end
+     f = open(File.join(@dir, "taxa.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(taxa))
+     f.close
+     f = open(File.join(@dir, "name_id.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(name2id))
+     f.close
+   end
+
+   def enterer(r)
+     res = [r["enterer"], r["modifier"]].map(&:to_s)
+           .map(&:strip).uniq.select { |e| e != "" }
+     res.empty? ? "" : res.join(", ")
+   end
+
+
+   def extinct(val)
+     val == "extinct" ? 1 : 0
+   end
+
+   def ecol(row)
+     row = strip row
+     "#{row['life_habit']} #{row['diet']}"
+   end
+
+   def refs
+     # "reference_no","record_type","ref_type","author1init","author1last",
+     # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
+     # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
+     # "language","doi"
+
+     # {"id":31671,"orig":true,"author":"Hahn, C. W.",
+     # "year":1834,"title":"Die wanzenartigen Insecten.",
+     # "details":"C. H. Zeh, Nurnberg. 2: 33--120.",
+     # "distribution":"Germany","comment":"n. sp."}
+     refs = {}
+     @refs_csv.each do |r|
+       r = strip r
+       authorship, author = authors(r)
+       refs[r["reference_no"]] = { id: r["reference_no"], author: author,
+                                   authorship: authorship,
+                                   year: r["pubyr"], title: r["reftitle"],
+                                   details: details(r) }
+     end
+     f = open(File.join(@dir, "refs.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(refs))
+     f.close
+   end
+
+   def authors(row)
+     row = strip row
+     au = ["#{row['author1init']} #{row['author1last']}".strip,
+           "#{row['author2init']} #{row['author2last']}".strip,
+           "#{row['otherauthors']}".strip]
+     au = au.select { |a| !a.empty? }.map { |a| a.gsub(/[\s]{2,}/, " ").strip }
+     [au[0..1].join(", "), au.join(", ")]
+   end
+
+   def details(row)
+     row = strip row
+     ref = "#{row['pubtitle']}"
+     ref << " #{row['pubno']}" unless row['pubno'].empty?
+     ref << ": #{row['firstpage']}" unless row['firstpage'].empty?
+     ref << "--#{row['lastpage']}" unless row['lastpage'].empty?
+     ref << " (#{row["doi"]})" unless row['doi'].empty?
+     ref.gsub(/[\s]{2,}/, " ").strip
+   end
+
+   def taxa_refs
+     tr = {}
+     @taxa_refs_csv.each do |r|
+       r = strip r
+       row = { acc_id: r["accepted_no"], name: r["accepted_name"],
+               ref_id: r["reference_no"] }
+       if tr.key? r["accepted_no"]
+         tr[r["accepted_no"]] << row
+       else
+         tr[r["accepted_no"]] = [row]
+       end
+     end
+     f = open(File.join(@dir, "taxa_refs.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(tr))
+     f.close
+   end
+
+   def occurences
+     occ = {}
+     @occurences_csv.each_with_index do |r, i|
+       r = strip r
+       row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
+               state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
+       if occ.key? r["accepted_no"]
+         occ[r["accepted_no"]] << row
+       else
+         occ[r["accepted_no"]] = [row]
+       end
+     end
+     f = open(File.join(@dir, "occurences.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(occ))
+     f.close
+   end
+
+   def strip(row)
+     row.each_with_object({}) do |(k, v), h|
+       h[k] = v.nil? ? nil : v.strip
+     end
+   end
+ end
+
@@ -0,0 +1,91 @@
+ # frozen_string_literal: true
+
+ module DwcaHunter
+   class ResourceSherborn < DwcaHunter::Resource
+     def initialize(opts = {})
+       @command = "sherborn"
+       @title = "Index Animalium"
+       @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
+       @UUID = "05ad6ca2-fc37-47f4-983a-72e535420e28"
+       @download_path = File.join(Dir.tmpdir,
+                                  "dwca_hunter",
+                                  "sherborn",
+                                  "data.csv")
+       @synonyms = []
+       @names = []
+       @vernaculars = []
+       @extensions = []
+       @synonyms_hash = {}
+       @vernaculars_hash = {}
+       super(opts)
+     end
+
+     def download
+       puts "Downloading."
+       `curl -s -L #{@url} -o #{@download_path}`
+     end
+
+     def unpack; end
+
+     def make_dwca
+       DwcaHunter.logger_write(object_id, "Extracting data")
+       get_names
+       generate_dwca
+     end
+
+     private
+
+     def get_names
+       Dir.chdir(@download_dir)
+       collect_names
+     end
+
+     def collect_names
+       dupes = {}
+       @names_index = {}
+       file = CSV.open(File.join(@download_dir, "data.csv"),
+                       headers: false, col_sep: "\t")
+       file.each_with_index do |row, i|
+         next if dupes.key?(row[1])
+
+         dupes[row[1]] = true
+         taxon_id = row[0]
+         name_string = row[1]
+
+         @names << { taxon_id: taxon_id,
+                     name_string: name_string }
+         puts "Processed %s names" % i if i % 10_000 == 0
+       end
+     end
+
+     def generate_dwca
+       DwcaHunter.logger_write(object_id,
+                               "Creating DarwinCore Archive file")
+       @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                 "http://rs.tdwg.org/dwc/terms/scientificName",
+                 "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+       @names.each do |n|
+         @core << [n[:taxon_id], n[:name_string], "ICZN"]
+       end
+
+       @eml = {
+         id: @uuid,
+         title: @title,
+         authors: [
+           { first_name: "Charles Davies",
+             last_name: "Sherborn" }
+         ],
+         metadata_providers: [
+           { first_name: "Dmitry",
+             last_name: "Mozzherin",
+             email: "dmozzherin@gmail.com" }
+         ],
+         abstract: "Index Animalium is a monumental work that covers " \
+                   "400 000 zoological names registered by science " \
+                   "between 1758 and 1850",
+         url: @url
+       }
+       super
+     end
+   end
+ end
@@ -1,16 +1,17 @@
- # encoding: utf-8
+ # frozen_string_literal: true
+
  module DwcaHunter
    class ResourceWikispecies < DwcaHunter::Resource
      def initialize(opts = {})
-       @wikisp_path = File.join(Dir.tmpdir, 'dwca_hunter', 'wikispecies')
-       @problems_file = open(File.join(Dir.tmpdir, 'problems.txt'), 'w:utf-8')
+       @wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
+       @problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
        @command = "wikispecies"
-       @title = 'Wikispecies'
-       @url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
-              'specieswiki-latest-pages-articles.xml.bz2'
+       @title = "Wikispecies"
+       @url = "http://dumps.wikimedia.org/specieswiki/latest/" \
+              "specieswiki-latest-pages-articles.xml.bz2"
        @url = opts[:url] if opts[:url]
-       @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
-       @download_path = File.join(@wikisp_path, 'data.xml.bz2')
+       @uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
+       @download_path = File.join(@wikisp_path, "data.xml.bz2")
        @data = []
        @templates = {}
        @taxon_ids = {}
@@ -19,7 +20,7 @@ module DwcaHunter
        @extensions = []
        @re = {
          page_start: /^\s*\<page\>\s*$/,
-         page_end: /^\s*\<\/page\>\s*$/,
+         page_end: %r{^\s*\</page\>\s*$},
          template: /Template:/i,
          template_link: /\{\{([^\}]*)\}\}/,
          vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
@@ -27,6 +28,11 @@ module DwcaHunter
        super(opts)
      end

+     def download
+       puts "Downloading from the source"
+       `curl -L #{@url} -o #{@download_path}`
+     end
+
      def unpack
        unpack_bz2
      end
@@ -37,22 +43,22 @@ module DwcaHunter
        generate_dwca
      end

-       private
+     private

      def enrich_data
-       DwcaHunter::logger_write(self.object_id,
-                                'Extracting data from xml file...')
+       DwcaHunter.logger_write(object_id,
+                               "Extracting data from xml file...")
        Dir.chdir(@download_dir)
-       f = open('data.xml', 'r:utf-8')
+       f = open("data.xml", "r:utf-8")
        page_on = false
-       page = ''
+       page = ""
        page_num = 0
        f.each do |l|
          if l.match(@re[:page_start])
-           page << l
+           page += l
            page_on = true
          elsif page_on
-           page << l
+           page += l
            if l.match(@re[:page_end])
              page_on = false
              page_xml = Nokogiri::XML.parse(page)
@@ -61,22 +67,22 @@ module DwcaHunter
                process_species(page_xml)
              page_num += 1
              if page_num % BATCH_SIZE == 0
-               DwcaHunter::logger_write(self.object_id,
-                                        "Traversed %s pages" % page_num)
+               DwcaHunter.logger_write(object_id,
+                                       "Traversed %s pages" % page_num)
              end
-             page = ''
+             page = ""
              @page_title = nil
              @page_id = nil
            end
          end
        end
-       DwcaHunter::logger_write(self.object_id,
-                                'Extracted total %s pages' % page_num)
+       DwcaHunter.logger_write(object_id,
+                               "Extracted total %s pages" % page_num)
        f.close
      end

      def extend_classification
-       DwcaHunter::logger_write(self.object_id, 'Extending classifications')
+       DwcaHunter.logger_write(object_id, "Extending classifications")
        @data.each_with_index do |d, i|
          unless d[:classificationPath].empty?
            n = 50
@@ -98,19 +104,21 @@ module DwcaHunter
          # d[:classificationPath] = d[:classificationPath].join("|").
          # gsub("Main Page", "Life")
          if i % BATCH_SIZE == 0 && i > 0
-           DwcaHunter::logger_write(self.object_id,
-                                    "Extended %s classifications" % i)
+           DwcaHunter.logger_write(object_id,
+                                   "Extended %s classifications" % i)
          end
        end
      end

      def update_tree(path)
        path = path.dup
-       return if @paths.has_key?(path.join('|'))
+       return if @paths.key?(path.join("|"))
+
        (0...path.size).each do |i|
          subpath = path[0..i]
-         subpath_string = subpath.join('|')
-         next if @paths.has_key?(subpath_string)
+         subpath_string = subpath.join("|")
+         next if @paths.key?(subpath_string)
+
          name = subpath.pop
          tree_element = subpath.inject(@tree) { |res, n| res[n] }
          tree_element[name] = {}
@@ -119,27 +127,29 @@ module DwcaHunter
      end

      def process_template(x)
-       name = page_title(x).gsub!(@re[:template], '').strip
-       text = x.xpath('//text').text.strip
+       name = page_title(x).gsub!(@re[:template], "").strip
+       text = x.xpath("//text").text.strip
        parent_name = text.match(@re[:template_link])
        if parent_name
          return if parent_name[1].match(/\#if/)
+
          list = parent_name[1].split("|")
-         if list.size == 1
-           parent_name = list[0]
-         elsif list[0].match /Taxonav/i
-           parent_name = list[1]
-         else
-           parent_name = list[0]
-         end
+         parent_name = if list.size == 1
+                         list[0]
+                       elsif list[0].match(/Taxonav/i)
+                         list[1]
+                       else
+                         list[0]
+                       end
        end
-       name.gsub!(/_/, ' ')
-       parent_name.gsub!(/_/, ' ') if parent_name
+       name.gsub!(/_/, " ")
+       parent_name&.gsub!(/_/, " ")
        @templates[name] = { parentName: parent_name, id: page_id(x) }
      end

      def process_species(x)
        return if page_title(x).match(/Wikispecies/i)
+
        items = find_species_components(x)
        if items
          @data << {
@@ -147,7 +157,8 @@ module DwcaHunter
            canonicalForm: page_title(x),
            scientificName: page_title(x),
            classificationPath: [],
-           vernacularNames: [] }
+           vernacularNames: []
+         }
          get_full_scientific_name(items)
          get_vernacular_names(items)
          init_classification_path(items)
@@ -155,8 +166,8 @@ module DwcaHunter
      end

      def get_full_scientific_name(items)
-       if items['name']
-         if name = items['name'][0]
+       if items["name"]
+         if name = items["name"][0]
            @data[-1][:scientificName] = parse_name(name, @data[-1])
          else
            @problems_file.write("%s\n" % @data[-1][:canonicalForm])
@@ -165,19 +176,20 @@ module DwcaHunter
        end
      end
      def get_vernacular_names(items)
-       if items['vernacular names'] && items['vernacular names'].size > 0
-         vn_string = items['vernacular names'].join("")
+       if items["vernacular names"] && !items["vernacular names"].empty?
+         vn_string = items["vernacular names"].join("")
          vn = vn_string.match(@re[:vernacular_names])
          if vn
            vn_list = vn[1].strip.split("|")
            vnames = []
            vn_list.each do |item|
-             language, name = item.split("=").map { |x| x.strip }
-             if language && name && language.size < 4 && name.valid_encoding?
-               vnames << {
-                 name: name,
-                 language: language }
-             end
+             language, name = item.split("=").map(&:strip)
+             next unless language && name && language.size < 4 && name.valid_encoding?
+
+             vnames << {
+               name: name,
+               language: language
+             }
            end

            @data[-1][:vernacularNames] = vnames
@@ -186,26 +198,26 @@ module DwcaHunter
        end
      end
      def init_classification_path(items)
-       if items['taxonavigation']
-         items['taxonavigation'].each do |line|
-           line.gsub!(/\[\[.*\]\]/, '') # ignore non-template links
-           if template_link = line.match(@re[:template_link])
-             template_link = template_link[1].
-               strip.gsub(/Template:/, '').gsub(/_/, ' ')
-             if !template_link.match(/\|/)
-               @data[-1][:classificationPath] << template_link
-               break
-             end
-           end
+       # ignore non-template links
+       items["taxonavigation"]&.each do |line|
+         line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
+         next unless template_link = line.match(@re[:template_link])
+
+         template_link = template_link[1].
+                         strip.gsub(/Template:/, "").gsub(/_/, " ")
+         unless template_link.match(/\|/)
+           @data[-1][:classificationPath] << template_link
+           break
          end
        end
      end

      def find_species_components(x)
-       items = get_items(x.xpath('//text').text)
-       is_taxon_item = items.has_key?('name') ||
-                       items.has_key?('taxonavigation')
+       items = get_items(x.xpath("//text").text)
+       is_taxon_item = items.key?("name") ||
+                       items.key?("taxonavigation")
        return nil unless is_taxon_item
+
        items
      end

@@ -214,7 +226,7 @@ module DwcaHunter
        items = {}
        current_item = nil
        txt.split("\n").each do |l|
-           item = l.match(/[\=]+([^\=]+)[\=]+/)
+         item = l.match(/[\=]+([^\=]+)[\=]+/)
          if item
            current_item = item[1].strip.downcase
            items[current_item] = []
@@ -226,11 +238,11 @@ module DwcaHunter
      end

      def page_title(x)
-       @page_title ||= x.xpath('//title').first.text
+       @page_title ||= x.xpath("//title").first.text
      end

      def page_id(x)
-       @page_id ||= x.xpath('//id').first.text
+       @page_id ||= x.xpath("//id").first.text
      end

      def template?(page_xml)
@@ -238,110 +250,113 @@ module DwcaHunter
      end

      def parse_name(name_string, taxa)
-       name_string.gsub!('BASEPAGENAME', taxa[:canonicalForm])
+       name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
        name_string = name_string.strip
        old_l = name_string.dup
-       name_string.gsub! /^\*\s*/, ''
+       name_string.gsub!(/^\*\s*/, "")
        name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
        name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
-       name_string.gsub!(/[']{2,}/, ' ')
-       name_string.gsub!(/["]{2,}/, ' ')
-       name_string.gsub!(/\:\s*\d.*$/, '')
-       name_string.gsub!(/,\s*\[RSD\]/i, '')
-       name_string.gsub!(/^\s*†\s*/, '')
-       name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')
+       name_string.gsub!(/[']{2,}/, " ")
+       name_string.gsub!(/["]{2,}/, " ")
+       name_string.gsub!(/\:\s*\d.*$/, "")
+       name_string.gsub!(/,\s*\[RSD\]/i, "")
+       name_string.gsub!(/^\s*†\s*/, "")
+       name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
        # name_string = DwcaHunter::XML.unescape(name_string)
-       name_string.gsub!(/\<nowiki\>.*$/, '')
-       name_string.gsub!(/\<br\s*[\/]?\s*\>/, '')
-       name_string.gsub!(/^\s*\&dagger;\s*/, '')
-       name_string.gsub!(/&nbsp;/, ' ')
-       name_string.gsub!(/\s+/, ' ')
+       name_string.gsub!(/\<nowiki\>.*$/, "")
+       name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
+       name_string.gsub!(/^\s*\&dagger;\s*/, "")
+       name_string.gsub!(/&nbsp;/, " ")
+       name_string.gsub!(/\s+/, " ")
        name_string = name_string.strip
        # puts "%s---%s" % [name_string, old_l]
-       return name_string
+       name_string
      end

      def generate_dwca
-       DwcaHunter::logger_write(self.object_id,
-                                'Creating DarwinCore Archive file')
+       DwcaHunter.logger_write(object_id,
+                               "Creating DarwinCore Archive file")
        @core = [
-         ['http://rs.tdwg.org/dwc/terms/taxonID',
-          'http://rs.tdwg.org/dwc/terms/scientificName',
-          'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
-          'http://globalnames.org/terms/canonicalForm',
-          'http://rs.tdwg.org/dwc/terms/higherClassification',
-          'http://purl.org/dc/terms/source']
+         ["http://rs.tdwg.org/dwc/terms/taxonID",
+          "http://rs.tdwg.org/dwc/terms/scientificName",
+          "http://globalnames.org/terms/canonicalForm",
+          "http://purl.org/dc/terms/source"]
        ]
-       DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
+       DwcaHunter.logger_write(object_id, "Assembling Core Data")
        count = 0
        @data.map do |d|
          count += 1
          if count % BATCH_SIZE == 0
-           DwcaHunter::logger_write(self.object_id,
-                                    "Traversing %s core data record" % count)
+           DwcaHunter.logger_write(object_id,
+                                   "Traversing %s core data record" % count)
          end
-         taxon_id = (d[:classificationPath].empty? ?
-                     d[:taxonId] :
-                     @templates[d[:classificationPath].
-                     last][:id]) rescue d[:taxonId]
+         taxon_id = begin
+           (d[:classificationPath].empty? ?
+             d[:taxonId] :
+             @templates[d[:classificationPath].
+             last][:id])
+         rescue StandardError
+           d[:taxonId]
+         end
          @taxon_ids[d[:taxonId]] = taxon_id
-         parentNameUsageId = (d[:classificationPath].size > 1 ?
-                              @templates[d[:classificationPath][-2]][:id] :
-                              nil) rescue nil
-         url = 'http://species.wikimedia.org/wiki/' +
-               URI.encode(d[:canonicalForm].gsub(' ', '_'))
+         parentNameUsageId = begin
+           (d[:classificationPath].size > 1 ?
+             @templates[d[:classificationPath][-2]][:id] :
+             nil)
+         rescue StandardError
+           nil
+         end
+         url = "http://species.wikimedia.org/wiki/" +
+               URI.encode(d[:canonicalForm].gsub(" ", "_"))
          path = d[:classificationPath]
          path.pop if path[-1] == d[:canonicalForm]
-         canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
-         scientific_name = (d[:scientificName] == d[:canonicalForm]) ?
+         canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
+         scientific_name = d[:scientificName] == d[:canonicalForm] ?
                              canonical_form :
                              d[:scientificName]
          @core << [taxon_id,
                    scientific_name,
-                   parentNameUsageId,
                    canonical_form,
-                   path.join('|'),
                    url]
        end
        @extensions << { data: [[
-         'http://rs.tdwg.org/dwc/terms/TaxonID',
-         'http://rs.tdwg.org/dwc/terms/vernacularName',
-         'http://purl.org/dc/terms/language'
-       ]], file_name: 'vernacular_names.txt' }
-       DwcaHunter::logger_write(self.object_id,
-         'Creating verncaular name extension for DarwinCore Archive file')
+         "http://rs.tdwg.org/dwc/terms/TaxonID",
+         "http://rs.tdwg.org/dwc/terms/vernacularName",
+         "http://purl.org/dc/terms/language"
+       ]], file_name: "vernacular_names.txt" }
+       DwcaHunter.logger_write(object_id,
+         "Creating verncaular name extension for DarwinCore Archive file")
        count = 0
        @data.each do |d|
          count += 1
          if count % BATCH_SIZE == 0
-           DwcaHunter::logger_write(self.object_id,
-                                    "Traversing %s extension data record" % count)
+           DwcaHunter.logger_write(object_id,
+                                   "Traversing %s extension data record" % count)
          end
          d[:vernacularNames].each do |vn|
-           taxon_id = @taxon_ids[d[:taxonId]] ? @taxon_ids[d[:taxonId]] : nil
-           if taxon_id
-             @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
-           end
+           taxon_id = @taxon_ids[d[:taxonId]] || nil
+           @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
          end
        end
        @eml = {
          id: @uuid,
          title: @title,
-         license: 'http://creativecommons.org/licenses/by-sa/3.0/',
+         license: "http://creativecommons.org/licenses/by-sa/3.0/",
          authors: [
-           { first_name: 'Stephen',
-             last_name: 'Thorpe',
-             email: 'stephen_thorpe@yahoo.co.nz',
-             url: 'http://species.wikimedia.org/wiki/Main_Page' }],
-         abstract: 'The free species directory that anyone can edit.',
+           { first_name: "Stephen",
+             last_name: "Thorpe",
+             email: "stephen_thorpe@yahoo.co.nz",
+             url: "http://species.wikimedia.org/wiki/Main_Page" }
+         ],
+         abstract: "The free species directory that anyone can edit.",
          metadata_providers: [
-           { first_name: 'Dmitry',
-             last_name: 'Mozzherin',
-             email: 'dmozzherin@mbl.edu' }],
-         url: 'http://species.wikimedia.org/wiki/Main_Page'
+           { first_name: "Dmitry",
+             last_name: "Mozzherin",
+             email: "dmozzherin@mbl.edu" }
+         ],
+         url: "http://species.wikimedia.org/wiki/Main_Page"
        }
        super
      end
-
      end
    end