dwca_hunter 0.5.5 → 0.7.0

@@ -0,0 +1,140 @@
+ class PaleodbHarvester
+   def initialize(download_dir)
+     @dir = File.join(download_dir, "json")
+     FileUtils.mkdir_p(@dir)
+     @in_dir = download_dir
+     @taxa_csv = CSV.open(File.join(@in_dir, "taxa.csv"), headers: true)
+     @refs_csv = CSV.open(File.join(@in_dir, "refs.csv"), headers: true)
+     @taxa_refs_csv = CSV.open(File.join(@in_dir, "taxa_refs.csv"), headers: true)
+     @occurences_csv = CSV.open(File.join(@in_dir, "occurences.csv"), headers: true)
+   end
+
+   def taxa
+     # "orig_no","taxon_no","record_type","flags","taxon_rank",
+     # "taxon_name","difference","accepted_no","accepted_rank",
+     # "accepted_name","parent_no","reference_no","is_extant","n_occs"
+     taxa = {}
+     name2id = {}
+     @taxa_csv.each do |r|
+       r = strip(r)
+       taxa[r["taxon_no"]] = { t_id: r["orig_no"], id: r["taxon_no"],
+                               rank: r["taxon_rank"], name: r["taxon_name"],
+                               auth: r["taxon_attr"],
+                               extinct: extinct(r["is_extant"]),
+                               vernacular: r["common_name"],
+                               annot: r["difference"], acc_id: r["accepted_no"],
+                               acc_rank: r["accepted_rank"],
+                               acc_name: r["accepted_name"], ecol: ecol(r),
+                               parent_id: r["parent_no"], ref: r["reference_no"],
+                               occs_num: r["n_occs"], enterer: enterer(r) }
+
+       name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
+     end
+     f = open(File.join(@dir, "taxa.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(taxa))
+     f.close
+     f = open(File.join(@dir, "name_id.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(name2id))
+     f.close
+   end
+
+   def enterer(r)
+     res = [r["enterer"], r["modifier"]].map(&:to_s)
+           .map(&:strip).uniq.select { |e| e != "" }
+     res.empty? ? "" : res.join(", ")
+   end
+
+
+   def extinct(val)
+     val == "extinct" ? 1 : 0
+   end
+
+   def ecol(row)
+     row = strip row
+     "#{row['life_habit']} #{row['diet']}"
+   end
+
+   def refs
+     # "reference_no","record_type","ref_type","author1init","author1last",
+     # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
+     # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
+     # "language","doi"
+
+     # {"id":31671,"orig":true,"author":"Hahn, C. W.",
+     # "year":1834,"title":"Die wanzenartigen Insecten.",
+     # "details":"C. H. Zeh, Nurnberg. 2: 33--120.",
+     # "distribution":"Germany","comment":"n. sp."}
+     refs = {}
+     @refs_csv.each do |r|
+       r = strip r
+       authorship, author = authors(r)
+       refs[r["reference_no"]] = { id: r["reference_no"], author: author,
+                                   authorship: authorship,
+                                   year: r["pubyr"], title: r["reftitle"],
+                                   details: details(r) }
+     end
+     f = open(File.join(@dir, "refs.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(refs))
+     f.close
+   end
+
+   def authors(row)
+     row = strip row
+     au = ["#{row['author1init']} #{row['author1last']}".strip,
+           "#{row['author2init']} #{row['author2last']}".strip,
+           "#{row['otherauthors']}".strip]
+     au = au.select { |a| !a.empty? }.map { |a| a.gsub(/[\s]{2,}/, " ").strip }
+     [au[0..1].join(", "), au.join(", ")]
+   end
+
+   def details(row)
+     row = strip row
+     ref = "#{row['pubtitle']}"
+     ref << " #{row['pubno']}" unless row['pubno'].empty?
+     ref << ": #{row['firstpage']}" unless row['firstpage'].empty?
+     ref << "--#{row['lastpage']}" unless row['lastpage'].empty?
+     ref << " (#{row["doi"]})" unless row['doi'].empty?
+     ref.gsub(/[\s]{2,}/, " ").strip
+   end
+
+   def taxa_refs
+     tr = {}
+     @taxa_refs_csv.each do |r|
+       r = strip r
+       row = { acc_id: r["accepted_no"], name: r["accepted_name"],
+               ref_id: r["reference_no"] }
+       if tr.key? r["accepted_no"]
+         tr[r["accepted_no"]] << row
+       else
+         tr[r["accepted_no"]] = [row]
+       end
+     end
+     f = open(File.join(@dir, "taxa_refs.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(tr))
+     f.close
+   end
+
+   def occurences
+     occ = {}
+     @occurences_csv.each_with_index do |r, i|
+       r = strip r
+       row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
+               state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
+       if occ.key? r["accepted_no"]
+         occ[r["accepted_no"]] << row
+       else
+         occ[r["accepted_no"]] = [row]
+       end
+     end
+     f = open(File.join(@dir, "occurences.json"), "w:utf-8")
+     f.write(JSON.pretty_generate(occ))
+     f.close
+   end
+
+   def strip(row)
+     row.each_with_object({}) do |(k, v), h|
+       h[k] = v.nil? ? nil : v.strip
+     end
+   end
+ end
+
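For orientation, the new harvester above is a plain class with no entry point of its own; a rough sketch of how it might be driven (the requires, the download directory, and the driver itself are assumptions for illustration — only the class and method names come from the file above):

    # Hypothetical driver; assumes the class above is loaded and that
    # taxa.csv, refs.csv, taxa_refs.csv and occurences.csv already sit in `dir`.
    require "csv"
    require "json"
    require "fileutils"
    require "tmpdir"

    dir = File.join(Dir.tmpdir, "dwca_hunter", "paleodb") # assumed location
    harvester = PaleodbHarvester.new(dir)
    harvester.taxa       # writes json/taxa.json and json/name_id.json
    harvester.refs       # writes json/refs.json
    harvester.taxa_refs  # writes json/taxa_refs.json
    harvester.occurences # writes json/occurences.json ("occurences" follows the source spelling)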
@@ -0,0 +1,91 @@
+ # frozen_string_literal: true
+
+ module DwcaHunter
+   class ResourceSherborn < DwcaHunter::Resource
+     def initialize(opts = {})
+       @command = "sherborn"
+       @title = "Index Animalium"
+       @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
+       @uuid = "05ad6ca2-fc37-47f4-983a-72e535420e28"
+       @download_path = File.join(Dir.tmpdir,
+                                  "dwca_hunter",
+                                  "sherborn",
+                                  "data.csv")
+       @synonyms = []
+       @names = []
+       @vernaculars = []
+       @extensions = []
+       @synonyms_hash = {}
+       @vernaculars_hash = {}
+       super(opts)
+     end
+
+     def download
+       puts "Downloading."
+       `curl -s -L #{@url} -o #{@download_path}`
+     end
+
+     def unpack; end
+
+     def make_dwca
+       DwcaHunter.logger_write(object_id, "Extracting data")
+       get_names
+       generate_dwca
+     end
+
+     private
+
+     def get_names
+       Dir.chdir(@download_dir)
+       collect_names
+     end
+
+     def collect_names
+       dupes = {}
+       @names_index = {}
+       file = CSV.open(File.join(@download_dir, "data.csv"),
+                       headers: false, col_sep: "\t")
+       file.each_with_index do |row, i|
+         next if dupes.key?(row[1])
+
+         dupes[row[1]] = true
+         taxon_id = row[0]
+         name_string = row[1]
+
+         @names << { taxon_id: taxon_id,
+                     name_string: name_string }
+         puts "Processed %s names" % i if i % 10_000 == 0
+       end
+     end
+
+     def generate_dwca
+       DwcaHunter.logger_write(object_id,
+                               "Creating DarwinCore Archive file")
+       @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
+                 "http://rs.tdwg.org/dwc/terms/scientificName",
+                 "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
+       @names.each do |n|
+         @core << [n[:taxon_id], n[:name_string], "ICZN"]
+       end
+
+       @eml = {
+         id: @uuid,
+         title: @title,
+         authors: [
+           { first_name: "Charles Davies",
+             last_name: "Sherborn" }
+         ],
+         metadata_providers: [
+           { first_name: "Dmitry",
+             last_name: "Mozzherin",
+             email: "dmozzherin@gmail.com" }
+         ],
+         abstract: "Index Animalium is a monumental work that covers " \
+                   "400 000 zoological names registered by science " \
+                   "between 1758 and 1850",
+         url: @url
+       }
+       super
+     end
+   end
+ end
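The Sherborn resource exposes the same small surface as the other DwcaHunter resources; a minimal sketch of the flow its public methods imply (the direct invocation below is an assumption, not the gem's documented entry point):

    # Hypothetical usage; only the class and its download/make_dwca methods
    # are taken from the code above.
    resource = DwcaHunter::ResourceSherborn.new
    resource.download  # curls the CSV dump into Dir.tmpdir/dwca_hunter/sherborn/data.csv
    resource.make_dwca # builds a DwC-A core of taxonID, scientificName, nomenclaturalCode ("ICZN")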
@@ -1,16 +1,17 @@
- # encoding: utf-8
+ # frozen_string_literal: true
+
  module DwcaHunter
    class ResourceWikispecies < DwcaHunter::Resource
      def initialize(opts = {})
-       @wikisp_path = File.join(Dir.tmpdir, 'dwca_hunter', 'wikispecies')
-       @problems_file = open(File.join(Dir.tmpdir, 'problems.txt'), 'w:utf-8')
+       @wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
+       @problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
        @command = "wikispecies"
-       @title = 'Wikispecies'
-       @url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
-              'specieswiki-latest-pages-articles.xml.bz2'
+       @title = "Wikispecies"
+       @url = "http://dumps.wikimedia.org/specieswiki/latest/" \
+              "specieswiki-latest-pages-articles.xml.bz2"
        @url = opts[:url] if opts[:url]
-       @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
-       @download_path = File.join(@wikisp_path, 'data.xml.bz2')
+       @uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
+       @download_path = File.join(@wikisp_path, "data.xml.bz2")
        @data = []
        @templates = {}
        @taxon_ids = {}
@@ -19,7 +20,7 @@ module DwcaHunter
        @extensions = []
        @re = {
          page_start: /^\s*\<page\>\s*$/,
-         page_end: /^\s*\<\/page\>\s*$/,
+         page_end: %r{^\s*\</page\>\s*$},
          template: /Template:/i,
          template_link: /\{\{([^\}]*)\}\}/,
          vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
@@ -27,6 +28,11 @@ module DwcaHunter
        super(opts)
      end

+     def download
+       puts "Downloading from the source"
+       `curl -L #{@url} -o #{@download_path}`
+     end
+
      def unpack
        unpack_bz2
      end
@@ -37,22 +43,22 @@ module DwcaHunter
        generate_dwca
      end

-   private
+     private

      def enrich_data
-       DwcaHunter::logger_write(self.object_id,
-                                'Extracting data from xml file...')
+       DwcaHunter.logger_write(object_id,
+                               "Extracting data from xml file...")
        Dir.chdir(@download_dir)
-       f = open('data.xml', 'r:utf-8')
+       f = open("data.xml", "r:utf-8")
        page_on = false
-       page = ''
+       page = ""
        page_num = 0
        f.each do |l|
          if l.match(@re[:page_start])
-           page << l
+           page += l
            page_on = true
          elsif page_on
-           page << l
+           page += l
            if l.match(@re[:page_end])
              page_on = false
              page_xml = Nokogiri::XML.parse(page)
@@ -61,22 +67,22 @@ module DwcaHunter
              process_species(page_xml)
              page_num += 1
              if page_num % BATCH_SIZE == 0
-               DwcaHunter::logger_write(self.object_id,
-                                        "Traversed %s pages" % page_num)
+               DwcaHunter.logger_write(object_id,
+                                       "Traversed %s pages" % page_num)
              end
-             page = ''
+             page = ""
              @page_title = nil
              @page_id = nil
            end
          end
        end
-       DwcaHunter::logger_write(self.object_id,
-                                'Extracted total %s pages' % page_num)
+       DwcaHunter.logger_write(object_id,
+                               "Extracted total %s pages" % page_num)
        f.close
      end

      def extend_classification
-       DwcaHunter::logger_write(self.object_id, 'Extending classifications')
+       DwcaHunter.logger_write(object_id, "Extending classifications")
        @data.each_with_index do |d, i|
          unless d[:classificationPath].empty?
            n = 50
@@ -98,19 +104,21 @@ module DwcaHunter
        # d[:classificationPath] = d[:classificationPath].join("|").
        #   gsub("Main Page", "Life")
        if i % BATCH_SIZE == 0 && i > 0
-         DwcaHunter::logger_write(self.object_id,
-                                  "Extended %s classifications" % i)
+         DwcaHunter.logger_write(object_id,
+                                 "Extended %s classifications" % i)
        end
      end
    end

    def update_tree(path)
      path = path.dup
-     return if @paths.has_key?(path.join('|'))
+     return if @paths.key?(path.join("|"))
+
      (0...path.size).each do |i|
        subpath = path[0..i]
-       subpath_string = subpath.join('|')
-       next if @paths.has_key?(subpath_string)
+       subpath_string = subpath.join("|")
+       next if @paths.key?(subpath_string)
+
        name = subpath.pop
        tree_element = subpath.inject(@tree) { |res, n| res[n] }
        tree_element[name] = {}
@@ -119,27 +127,29 @@ module DwcaHunter
    end

    def process_template(x)
-     name = page_title(x).gsub!(@re[:template], '').strip
-     text = x.xpath('//text').text.strip
+     name = page_title(x).gsub!(@re[:template], "").strip
+     text = x.xpath("//text").text.strip
      parent_name = text.match(@re[:template_link])
      if parent_name
        return if parent_name[1].match(/\#if/)
+
        list = parent_name[1].split("|")
-       if list.size == 1
-         parent_name = list[0]
-       elsif list[0].match /Taxonav/i
-         parent_name = list[1]
-       else
-         parent_name = list[0]
-       end
+       parent_name = if list.size == 1
+                       list[0]
+                     elsif list[0].match(/Taxonav/i)
+                       list[1]
+                     else
+                       list[0]
+                     end
      end
-     name.gsub!(/_/, ' ')
-     parent_name.gsub!(/_/, ' ') if parent_name
+     name.gsub!(/_/, " ")
+     parent_name&.gsub!(/_/, " ")
      @templates[name] = { parentName: parent_name, id: page_id(x) }
    end

    def process_species(x)
      return if page_title(x).match(/Wikispecies/i)
+
      items = find_species_components(x)
      if items
        @data << {
@@ -147,7 +157,8 @@ module DwcaHunter
          canonicalForm: page_title(x),
          scientificName: page_title(x),
          classificationPath: [],
-         vernacularNames: [] }
+         vernacularNames: []
+       }
        get_full_scientific_name(items)
        get_vernacular_names(items)
        init_classification_path(items)
@@ -155,8 +166,8 @@ module DwcaHunter
    end

    def get_full_scientific_name(items)
-     if items['name']
-       if name = items['name'][0]
+     if items["name"]
+       if name = items["name"][0]
          @data[-1][:scientificName] = parse_name(name, @data[-1])
        else
          @problems_file.write("%s\n" % @data[-1][:canonicalForm])
@@ -165,19 +176,20 @@ module DwcaHunter
    end

    def get_vernacular_names(items)
-     if items['vernacular names'] && items['vernacular names'].size > 0
-       vn_string = items['vernacular names'].join("")
+     if items["vernacular names"] && !items["vernacular names"].empty?
+       vn_string = items["vernacular names"].join("")
        vn = vn_string.match(@re[:vernacular_names])
        if vn
          vn_list = vn[1].strip.split("|")
          vnames = []
          vn_list.each do |item|
-           language, name = item.split("=").map { |x| x.strip }
-           if language && name && language.size < 4 && name.valid_encoding?
-             vnames << {
-               name: name,
-               language: language }
-           end
+           language, name = item.split("=").map(&:strip)
+           next unless language && name && language.size < 4 && name.valid_encoding?
+
+           vnames << {
+             name: name,
+             language: language
+           }
          end

          @data[-1][:vernacularNames] = vnames
@@ -186,26 +198,26 @@ module DwcaHunter
    end

    def init_classification_path(items)
-     if items['taxonavigation']
-       items['taxonavigation'].each do |line|
-         line.gsub!(/\[\[.*\]\]/, '') # ignore non-template links
-         if template_link = line.match(@re[:template_link])
-           template_link = template_link[1].
-             strip.gsub(/Template:/, '').gsub(/_/, ' ')
-           if !template_link.match(/\|/)
-             @data[-1][:classificationPath] << template_link
-             break
-           end
-         end
+     # ignore non-template links
+     items["taxonavigation"]&.each do |line|
+       line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
+       next unless template_link = line.match(@re[:template_link])
+
+       template_link = template_link[1].
+                       strip.gsub(/Template:/, "").gsub(/_/, " ")
+       unless template_link.match(/\|/)
+         @data[-1][:classificationPath] << template_link
+         break
        end
      end
    end

    def find_species_components(x)
-     items = get_items(x.xpath('//text').text)
-     is_taxon_item = items.has_key?('name') ||
-                     items.has_key?('taxonavigation')
+     items = get_items(x.xpath("//text").text)
+     is_taxon_item = items.key?("name") ||
+                     items.key?("taxonavigation")
      return nil unless is_taxon_item
+
      items
    end

@@ -214,7 +226,7 @@ module DwcaHunter
      items = {}
      current_item = nil
      txt.split("\n").each do |l|
-       item = l.match(/[\=]+([^\=]+)[\=]+/)
+       item = l.match(/[\=]+([^\=]+)[\=]+/)
        if item
          current_item = item[1].strip.downcase
          items[current_item] = []
@@ -226,11 +238,11 @@ module DwcaHunter
    end

    def page_title(x)
-     @page_title ||= x.xpath('//title').first.text
+     @page_title ||= x.xpath("//title").first.text
    end

    def page_id(x)
-     @page_id ||= x.xpath('//id').first.text
+     @page_id ||= x.xpath("//id").first.text
    end

    def template?(page_xml)
@@ -238,110 +250,113 @@ module DwcaHunter
    end

    def parse_name(name_string, taxa)
-     name_string.gsub!('BASEPAGENAME', taxa[:canonicalForm])
+     name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
      name_string = name_string.strip
      old_l = name_string.dup
-     name_string.gsub! /^\*\s*/, ''
+     name_string.gsub!(/^\*\s*/, "")
      name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
      name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
-     name_string.gsub!(/[']{2,}/, ' ')
-     name_string.gsub!(/["]{2,}/, ' ')
-     name_string.gsub!(/\:\s*\d.*$/, '')
-     name_string.gsub!(/,\s*\[RSD\]/i, '')
-     name_string.gsub!(/^\s*†\s*/, '')
-     name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')
+     name_string.gsub!(/[']{2,}/, " ")
+     name_string.gsub!(/["]{2,}/, " ")
+     name_string.gsub!(/\:\s*\d.*$/, "")
+     name_string.gsub!(/,\s*\[RSD\]/i, "")
+     name_string.gsub!(/^\s*†\s*/, "")
+     name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
      # name_string = DwcaHunter::XML.unescape(name_string)
-     name_string.gsub!(/\<nowiki\>.*$/, '')
-     name_string.gsub!(/\<br\s*[\/]?\s*\>/, '')
-     name_string.gsub!(/^\s*\&dagger;\s*/, '')
-     name_string.gsub!(/&nbsp;/, ' ')
-     name_string.gsub!(/\s+/, ' ')
+     name_string.gsub!(/\<nowiki\>.*$/, "")
+     name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
+     name_string.gsub!(/^\s*\&dagger;\s*/, "")
+     name_string.gsub!(/&nbsp;/, " ")
+     name_string.gsub!(/\s+/, " ")
      name_string = name_string.strip
      # puts "%s---%s" % [name_string, old_l]
-     return name_string
+     name_string
    end

    def generate_dwca
-     DwcaHunter::logger_write(self.object_id,
-                              'Creating DarwinCore Archive file')
+     DwcaHunter.logger_write(object_id,
+                             "Creating DarwinCore Archive file")
      @core = [
-       ['http://rs.tdwg.org/dwc/terms/taxonID',
-        'http://rs.tdwg.org/dwc/terms/scientificName',
-        'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
-        'http://globalnames.org/terms/canonicalForm',
-        'http://rs.tdwg.org/dwc/terms/higherClassification',
-        'http://purl.org/dc/terms/source']
+       ["http://rs.tdwg.org/dwc/terms/taxonID",
+        "http://rs.tdwg.org/dwc/terms/scientificName",
+        "http://globalnames.org/terms/canonicalForm",
+        "http://purl.org/dc/terms/source"]
      ]
-     DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
+     DwcaHunter.logger_write(object_id, "Assembling Core Data")
      count = 0
      @data.map do |d|
        count += 1
        if count % BATCH_SIZE == 0
-         DwcaHunter::logger_write(self.object_id,
-                                  "Traversing %s core data record" % count)
+         DwcaHunter.logger_write(object_id,
+                                 "Traversing %s core data record" % count)
        end
-       taxon_id = (d[:classificationPath].empty? ?
-                   d[:taxonId] :
-                   @templates[d[:classificationPath].
-                   last][:id]) rescue d[:taxonId]
+       taxon_id = begin
+         (d[:classificationPath].empty? ?
+           d[:taxonId] :
+           @templates[d[:classificationPath].
+             last][:id])
+       rescue StandardError
+         d[:taxonId]
+       end
        @taxon_ids[d[:taxonId]] = taxon_id
-       parentNameUsageId = (d[:classificationPath].size > 1 ?
-                            @templates[d[:classificationPath][-2]][:id] :
-                            nil) rescue nil
-       url = 'http://species.wikimedia.org/wiki/' +
-             URI.encode(d[:canonicalForm].gsub(' ', '_'))
+       parentNameUsageId = begin
+         (d[:classificationPath].size > 1 ?
+           @templates[d[:classificationPath][-2]][:id] :
+           nil)
+       rescue StandardError
+         nil
+       end
+       url = "http://species.wikimedia.org/wiki/" +
+             URI.encode(d[:canonicalForm].gsub(" ", "_"))
        path = d[:classificationPath]
        path.pop if path[-1] == d[:canonicalForm]
-       canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
-       scientific_name = (d[:scientificName] == d[:canonicalForm]) ?
+       canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
+       scientific_name = d[:scientificName] == d[:canonicalForm] ?
          canonical_form :
          d[:scientificName]
        @core << [taxon_id,
                  scientific_name,
-                 parentNameUsageId,
                  canonical_form,
-                 path.join('|'),
                  url]
      end
      @extensions << { data: [[
-       'http://rs.tdwg.org/dwc/terms/TaxonID',
-       'http://rs.tdwg.org/dwc/terms/vernacularName',
-       'http://purl.org/dc/terms/language'
-     ]], file_name: 'vernacular_names.txt' }
-     DwcaHunter::logger_write(self.object_id,
-       'Creating verncaular name extension for DarwinCore Archive file')
+       "http://rs.tdwg.org/dwc/terms/TaxonID",
+       "http://rs.tdwg.org/dwc/terms/vernacularName",
+       "http://purl.org/dc/terms/language"
+     ]], file_name: "vernacular_names.txt" }
+     DwcaHunter.logger_write(object_id,
+       "Creating vernacular name extension for DarwinCore Archive file")
      count = 0
      @data.each do |d|
        count += 1
        if count % BATCH_SIZE == 0
-         DwcaHunter::logger_write(self.object_id,
-                                  "Traversing %s extension data record" % count)
+         DwcaHunter.logger_write(object_id,
+                                 "Traversing %s extension data record" % count)
        end
        d[:vernacularNames].each do |vn|
-         taxon_id = @taxon_ids[d[:taxonId]] ? @taxon_ids[d[:taxonId]] : nil
-         if taxon_id
-           @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
-         end
+         taxon_id = @taxon_ids[d[:taxonId]] || nil
+         @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
        end
      end
      @eml = {
        id: @uuid,
        title: @title,
-       license: 'http://creativecommons.org/licenses/by-sa/3.0/',
+       license: "http://creativecommons.org/licenses/by-sa/3.0/",
        authors: [
-         { first_name: 'Stephen',
-           last_name: 'Thorpe',
-           email: 'stephen_thorpe@yahoo.co.nz',
-           url: 'http://species.wikimedia.org/wiki/Main_Page' }],
-       abstract: 'The free species directory that anyone can edit.',
+         { first_name: "Stephen",
+           last_name: "Thorpe",
+           email: "stephen_thorpe@yahoo.co.nz",
+           url: "http://species.wikimedia.org/wiki/Main_Page" }
+       ],
+       abstract: "The free species directory that anyone can edit.",
        metadata_providers: [
-         { first_name: 'Dmitry',
-           last_name: 'Mozzherin',
-           email: 'dmozzherin@mbl.edu' }],
-       url: 'http://species.wikimedia.org/wiki/Main_Page'
+         { first_name: "Dmitry",
+           last_name: "Mozzherin",
+           email: "dmozzherin@mbl.edu" }
+       ],
+       url: "http://species.wikimedia.org/wiki/Main_Page"
      }
      super
    end
-
  end
end
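The vernacular-name extraction touched in the Wikispecies diff comes down to the VN template regex plus a split on "|" and "="; a standalone illustration (the wikitext string is invented, the regex and filters mirror get_vernacular_names above):

    # Hypothetical input line from a Wikispecies page.
    vn_re = /\{\{\s*VN\s*\|([^\}]+)\}\}/i
    sample = "{{VN|en=Lion|de=Löwe|fr=Lion}}"
    if (vn = sample.match(vn_re))
      vnames = vn[1].strip.split("|")
                    .map { |item| item.split("=").map(&:strip) }
                    .select { |language, name| language && name && language.size < 4 && name.valid_encoding? }
                    .map { |language, name| { name: name, language: language } }
      # vnames => [{name: "Lion", language: "en"}, {name: "Löwe", language: "de"}, {name: "Lion", language: "fr"}]
    end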