dwca_hunter 0.5.3 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
# Converts the CSV files found in a download directory (taxa.csv, refs.csv,
# taxa_refs.csv, occurences.csv) into JSON lookup files written to the
# "json" subdirectory of that directory.
class PaleodbHarvester
  # @param download_dir [String] directory that holds the CSV files; the
  #   "json" output directory is created inside it if missing.
  #
  # CSV files are opened lazily by each conversion method, so every method
  # can be called repeatedly and no file handle outlives the call.
  def initialize(download_dir)
    @in_dir = download_dir
    @dir = File.join(download_dir, "json")
    FileUtils.mkdir_p(@dir)
  end

  # Writes taxa.json (records keyed by taxon_no) and name_id.json
  # (taxon_name => ids) from taxa.csv.
  #
  # @raise [Errno::ENOENT] if taxa.csv is missing
  # @return [nil]
  def taxa
    # "orig_no","taxon_no","record_type","flags","taxon_rank",
    # "taxon_name","difference","accepted_no","accepted_rank",
    # "accepted_name","parent_no","reference_no","is_extant","n_occs"
    taxa = {}
    name2id = {}
    each_row("taxa.csv") do |r|
      taxa[r["taxon_no"]] = { t_id: r["orig_no"], id: r["taxon_no"],
                              rank: r["taxon_rank"], name: r["taxon_name"],
                              auth: r["taxon_attr"],
                              extinct: extinct(r["is_extant"]),
                              vernacular: r["common_name"],
                              annot: r["difference"], acc_id: r["accepted_no"],
                              acc_rank: r["accepted_rank"],
                              acc_name: r["accepted_name"], ecol: ecol(r),
                              parent_id: r["parent_no"], ref: r["reference_no"],
                              occs_num: r["n_occs"], enterer: enterer(r) }

      name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
    end
    write_json("taxa.json", taxa)
    write_json("name_id.json", name2id)
  end

  # Joins the distinct, non-blank "enterer" and "modifier" values of a row.
  #
  # @return [String] comma-separated names, "" when both are blank
  def enterer(r)
    [r["enterer"], r["modifier"]].map { |e| e.to_s.strip }
                                 .uniq
                                 .reject(&:empty?)
                                 .join(", ")
  end

  # @return [Integer] 1 when val is the string "extinct", otherwise 0
  def extinct(val)
    val == "extinct" ? 1 : 0
  end

  # @return [String] the row's life_habit and diet separated by one space
  def ecol(row)
    row = strip row
    "#{row['life_habit']} #{row['diet']}"
  end

  # Writes refs.json (records keyed by reference_no) from refs.csv.
  #
  # @raise [Errno::ENOENT] if refs.csv is missing
  # @return [nil]
  def refs
    # "reference_no","record_type","ref_type","author1init","author1last",
    # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
    # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
    # "language","doi"

    # {"id":31671,"orig":true,"author":"Hahn, C. W.",
    # "year":1834,"title":"Die wanzenartigen Insecten.",
    # "details":"C. H. Zeh, Nurnberg. 2: 33--120.",
    # "distribution":"Germany","comment":"n. sp."}
    refs = {}
    each_row("refs.csv") do |r|
      authorship, author = authors(r)
      refs[r["reference_no"]] = { id: r["reference_no"], author: author,
                                  authorship: authorship,
                                  year: r["pubyr"], title: r["reftitle"],
                                  details: details(r) }
    end
    write_json("refs.json", refs)
  end

  # Builds author strings from the author1*/author2*/otherauthors columns.
  #
  # @return [Array(String, String)] first: at most the first two non-blank
  #   entries; second: all non-blank entries; both joined with ", "
  def authors(row)
    row = strip row
    au = ["#{row['author1init']} #{row['author1last']}",
          "#{row['author2init']} #{row['author2last']}",
          row["otherauthors"].to_s]
    au = au.map(&:strip).reject(&:empty?)
           .map { |a| a.gsub(/\s{2,}/, " ").strip }
    [au[0..1].join(", "), au.join(", ")]
  end

  # Builds the publication details string:
  # "pubtitle pubno: firstpage--lastpage (doi)", leaving out blank parts.
  # Missing (nil) columns are treated the same as empty ones.
  #
  # @return [String]
  def details(row)
    row = strip row
    ref = String.new(row["pubtitle"].to_s)
    ref << " #{row['pubno']}" unless row["pubno"].to_s.empty?
    ref << ": #{row['firstpage']}" unless row["firstpage"].to_s.empty?
    ref << "--#{row['lastpage']}" unless row["lastpage"].to_s.empty?
    ref << " (#{row['doi']})" unless row["doi"].to_s.empty?
    ref.gsub(/\s{2,}/, " ").strip
  end

  # Writes taxa_refs.json: accepted_no => list of
  # { acc_id, name, ref_id } records from taxa_refs.csv.
  #
  # @raise [Errno::ENOENT] if taxa_refs.csv is missing
  # @return [nil]
  def taxa_refs
    tr = {}
    each_row("taxa_refs.csv") do |r|
      row = { acc_id: r["accepted_no"], name: r["accepted_name"],
              ref_id: r["reference_no"] }
      (tr[r["accepted_no"]] ||= []) << row
    end
    write_json("taxa_refs.json", tr)
  end

  # Writes occurences.json: accepted_no => list of occurrence records
  # (country, state, min/max age) from occurences.csv.
  #
  # @raise [Errno::ENOENT] if occurences.csv is missing
  # @return [nil]
  def occurences
    occ = {}
    each_row("occurences.csv") do |r|
      row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
              state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
      (occ[r["accepted_no"]] ||= []) << row
    end
    write_json("occurences.json", occ)
  end

  # Copies a row (CSV::Row or Hash) into a plain Hash with every non-nil
  # value stripped of surrounding whitespace; nil values stay nil.
  def strip(row)
    row.each_with_object({}) do |(k, v), h|
      h[k] = v&.strip
    end
  end

  private

  # Yields every data row of the named CSV file (first line is the header)
  # as a whitespace-stripped Hash. The file is closed when iteration ends.
  def each_row(file_name)
    CSV.foreach(File.join(@in_dir, file_name), headers: true) do |row|
      yield strip(row)
    end
  end

  # Pretty-prints data as JSON into the output directory. The block form of
  # File.open closes the file even if serialization raises.
  def write_json(file_name, data)
    File.open(File.join(@dir, file_name), "w:utf-8") do |f|
      f.write(JSON.pretty_generate(data))
    end
    nil
  end
end
140
+
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Resource for Sherborn's "Index Animalium": downloads a tab-separated list
  # of names and turns it into a DarwinCore Archive.
  class ResourceSherborn < DwcaHunter::Resource
    # @param opts [Hash] options passed through unchanged to the parent class
    def initialize(opts = {})
      @command = "sherborn"
      @title = "Index Animalium"
      @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
      # Must be @uuid (lower case): generate_dwca reads @uuid for the EML id.
      @uuid = "05ad6ca2-fc37-47f4-983a-72e535420e28"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "sherborn",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the source file with curl into @download_path.
    # The command is passed as an argument list, so no shell parses the URL
    # or the path.
    #
    # @return [String] curl's standard output (normally empty because of -o)
    def download
      puts "Downloading."
      IO.popen(["curl", "-s", "-L", @url, "-o", @download_path], &:read)
    end

    # Nothing to unpack: the download is a plain text file.
    def unpack; end

    # Reads the names and writes the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches to the download directory and collects the names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Reads data.csv (tab-separated, no header row) and appends one entry to
    # @names for the first occurrence of each name string (column 1), with
    # column 0 as its taxon id. Progress is printed for every 10,000th line
    # that is not a duplicate.
    def collect_names
      seen = {}
      @names_index = {}
      path = File.join(@download_dir, "data.csv")
      # CSV.foreach closes the file when iteration ends.
      CSV.foreach(path, headers: false, col_sep: "\t").with_index do |row, i|
        name_string = row[1]
        next if seen.key?(name_string)

        seen[name_string] = true
        @names << { taxon_id: row[0],
                    name_string: name_string }
        puts "Processed %s names" % i if (i % 10_000).zero?
      end
    end

    # Fills @core and @eml, then hands over to the parent implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string], "ICZN"]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Charles Davies",
            last_name: "Sherborn" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Index Animalium is a monumental work that covers " \
                  "400 000 zoological names registered by science " \
                  "between 1758 and 1850",
        url: @url
      }
      super
    end
  end
end
@@ -1,145 +1,113 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
4
+ # Wikispecies source
3
5
  class ResourceWikispecies < DwcaHunter::Resource
4
# Sets up source metadata, working paths, the accumulators filled while
# scanning the dump, and the regular expressions used for that scan.
#
# @param opts [Hash] options; :url overrides the default dump location.
#   The whole hash is forwarded to the parent class.
def initialize(opts = { download: true, unpack: true })
  tmp_root = Dir.tmpdir
  default_url = "http://dumps.wikimedia.org/specieswiki/latest/" \
                "specieswiki-latest-pages-articles.xml.bz2"

  # Identity of the resource
  @command = "wikispecies"
  @title = "Wikispecies"
  @uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
  @url = opts[:url] || default_url

  # Files and directories
  @wikisp_path = File.join(tmp_root, "dwca_hunter", "wikispecies")
  @download_path = File.join(@wikisp_path, "data.xml.bz2")
  @problems_file = File.open(File.join(tmp_root, "problems.txt"), "w:utf-8")

  # Accumulators
  @data = []
  @extensions = []
  @templates = {}
  @taxon_ids = {}
  @tree = {}
  @paths = {}

  @parser = Biodiversity::Parser
  @re = {
    page_start: /^\s*<page>\s*$/,
    page_end: %r{^\s*</page>\s*$},
    template: /Template:/i,
    template_link: /\{\{([^}]*)\}\}/,
    vernacular_names: /\{\{\s*VN\s*\|([^}]+)\}\}/i
  }
  super(opts)
end
29
32
 
33
# Downloads the dump from @url into @download_path with curl.
#
# @url may come from opts[:url], so the command is passed as an argument
# list: no shell ever parses the URL or the path, and characters such as
# "&", spaces or quotes cannot break or alter the command.
#
# @return [String] curl's standard output (normally empty because of -o)
def download
  puts "Downloading from the source"
  IO.popen(["curl", "-L", @url, "-o", @download_path], &:read)
end
37
+
30
38
  def unpack
31
39
  unpack_bz2
32
40
  end
33
41
 
34
42
  def make_dwca
35
43
  enrich_data
36
- extend_classification
37
44
  generate_dwca
38
45
  end
39
46
 
40
- private
47
+ private
41
48
 
42
49
  def enrich_data
43
- DwcaHunter::logger_write(self.object_id,
44
- 'Extracting data from xml file...')
50
+ DwcaHunter.logger_write(object_id,
51
+ "Extracting data from xml file...")
45
52
  Dir.chdir(@download_dir)
46
- f = open('data.xml', 'r:utf-8')
53
+ f = open("data.xml", "r:utf-8")
47
54
  page_on = false
48
- page = ''
55
+ page = ""
49
56
  page_num = 0
50
57
  f.each do |l|
51
58
  if l.match(@re[:page_start])
52
- page << l
59
+ page += l
53
60
  page_on = true
54
61
  elsif page_on
55
- page << l
62
+ page += l
56
63
  if l.match(@re[:page_end])
57
64
  page_on = false
58
65
  page_xml = Nokogiri::XML.parse(page)
59
- template?(page_xml) ?
60
- process_template(page_xml) :
66
+ if template?(page_xml)
67
+ process_template(page_xml)
68
+ else
61
69
  process_species(page_xml)
70
+ end
62
71
  page_num += 1
63
- if page_num % BATCH_SIZE == 0
64
- DwcaHunter::logger_write(self.object_id,
65
- "Traversed %s pages" % page_num)
72
+ if (page_num % BATCH_SIZE).zero?
73
+ DwcaHunter.logger_write(object_id,
74
+ "Traversed #{page_num} pages")
66
75
  end
67
- page = ''
76
+ page = ""
68
77
  @page_title = nil
69
78
  @page_id = nil
70
79
  end
71
80
  end
72
81
  end
73
- DwcaHunter::logger_write(self.object_id,
74
- 'Extracted total %s pages' % page_num)
82
+ DwcaHunter.logger_write(object_id,
83
+ "Extracted total %s pages" % page_num)
75
84
  f.close
76
85
  end
77
86
 
78
- def extend_classification
79
- DwcaHunter::logger_write(self.object_id, 'Extending classifications')
80
- @data.each_with_index do |d, i|
81
- unless d[:classificationPath].empty?
82
- n = 50
83
- while n > 0
84
- n -= 1
85
- if n == 0
86
- d[:classificationPath] = []
87
- break
88
- end
89
- parent = @templates[d[:classificationPath].first]
90
- if parent
91
- d[:classificationPath].unshift(parent[:parentName])
92
- else
93
- update_tree(d[:classificationPath])
94
- break
95
- end
96
- end
97
- end
98
- # d[:classificationPath] = d[:classificationPath].join("|").
99
- # gsub("Main Page", "Life")
100
- if i % BATCH_SIZE == 0 && i > 0
101
- DwcaHunter::logger_write(self.object_id,
102
- "Extended %s classifications" % i)
103
- end
104
- end
105
- end
106
-
107
- def update_tree(path)
108
- path = path.dup
109
- return if @paths.has_key?(path.join('|'))
110
- (0...path.size).each do |i|
111
- subpath = path[0..i]
112
- subpath_string = subpath.join('|')
113
- next if @paths.has_key?(subpath_string)
114
- name = subpath.pop
115
- tree_element = subpath.inject(@tree) { |res, n| res[n] }
116
- tree_element[name] = {}
117
- @paths[subpath_string] = 1
118
- end
119
- end
120
-
121
87
  def process_template(x)
122
- name = page_title(x).gsub!(@re[:template], '').strip
123
- text = x.xpath('//text').text.strip
88
+ name = page_title(x).gsub!(@re[:template], "").strip
89
+ text = x.xpath("//text").text.strip
124
90
  parent_name = text.match(@re[:template_link])
125
91
  if parent_name
126
92
  return if parent_name[1].match(/\#if/)
93
+
127
94
  list = parent_name[1].split("|")
128
- if list.size == 1
129
- parent_name = list[0]
130
- elsif list[0].match /Taxonav/i
131
- parent_name = list[1]
132
- else
133
- parent_name = list[0]
134
- end
95
+ parent_name = if list.size == 1
96
+ list[0]
97
+ elsif list[0].match(/Taxonav/i)
98
+ list[1]
99
+ else
100
+ list[0]
101
+ end
135
102
  end
136
- name.gsub!(/_/, ' ')
137
- parent_name.gsub!(/_/, ' ') if parent_name
103
+ name.gsub!(/_/, " ")
104
+ parent_name&.gsub!(/_/, " ")
138
105
  @templates[name] = { parentName: parent_name, id: page_id(x) }
139
106
  end
140
107
 
141
108
  def process_species(x)
142
109
  return if page_title(x).match(/Wikispecies/i)
110
+
143
111
  items = find_species_components(x)
144
112
  if items
145
113
  @data << {
@@ -147,37 +115,44 @@ module DwcaHunter
147
115
  canonicalForm: page_title(x),
148
116
  scientificName: page_title(x),
149
117
  classificationPath: [],
150
- vernacularNames: [] }
118
+ vernacularNames: []
119
+ }
151
120
  get_full_scientific_name(items)
152
121
  get_vernacular_names(items)
153
- init_classification_path(items)
154
122
  end
155
123
  end
156
124
 
157
125
  def get_full_scientific_name(items)
158
- if items['name']
159
- if name = items['name'][0]
160
- @data[-1][:scientificName] = parse_name(name, @data[-1])
161
- else
162
- @problems_file.write("%s\n" % @data[-1][:canonicalForm])
163
- end
126
+ name_ary = items["{{int:name}}"]
127
+
128
+ if name_ary.nil? || name_ary.empty?
129
+ @problems_file.write("%s\n" % @data[-1][:canonicalForm])
130
+ return
131
+ end
132
+
133
+ name = name_ary[0]
134
+ name = parse_name(name, @data[-1])
135
+ if name != ""
136
+ @data[-1][:scientificName] = name
164
137
  end
165
138
  end
166
139
 
167
140
  def get_vernacular_names(items)
168
- if items['vernacular names'] && items['vernacular names'].size > 0
169
- vn_string = items['vernacular names'].join("")
141
+ vern = items["{{int:vernacular names}}"]
142
+ if vern.is_a?(Array) && vern.size.positive?
143
+ vn_string = vern.join("")
170
144
  vn = vn_string.match(@re[:vernacular_names])
171
145
  if vn
172
146
  vn_list = vn[1].strip.split("|")
173
147
  vnames = []
174
148
  vn_list.each do |item|
175
- language, name = item.split("=").map { |x| x.strip }
176
- if language && name && language.size < 4 && name.valid_encoding?
177
- vnames << {
178
- name: name,
179
- language: language }
180
- end
149
+ language, name = item.split("=").map(&:strip)
150
+ next unless language && name && language.size < 4 && name.valid_encoding?
151
+
152
+ vnames << {
153
+ name: name,
154
+ language: language
155
+ }
181
156
  end
182
157
 
183
158
  @data[-1][:vernacularNames] = vnames
@@ -186,26 +161,26 @@ module DwcaHunter
186
161
  end
187
162
 
188
163
  def init_classification_path(items)
189
- if items['taxonavigation']
190
- items['taxonavigation'].each do |line|
191
- line.gsub!(/\[\[.*\]\]/, '') # ignore non-template links
192
- if template_link = line.match(@re[:template_link])
193
- template_link = template_link[1].
194
- strip.gsub(/Template:/, '').gsub(/_/, ' ')
195
- if !template_link.match(/\|/)
196
- @data[-1][:classificationPath] << template_link
197
- break
198
- end
199
- end
164
+ # ignore non-template links
165
+ items["taxonavigation"]&.each do |line|
166
+ line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
167
+ next unless template_link = line.match(@re[:template_link])
168
+
169
+ template_link = template_link[1].
170
+ strip.gsub(/Template:/, "").gsub(/_/, " ")
171
+ unless template_link.match(/\|/)
172
+ @data[-1][:classificationPath] << template_link
173
+ break
200
174
  end
201
175
  end
202
176
  end
203
177
 
204
178
  def find_species_components(x)
205
- items = get_items(x.xpath('//text').text)
206
- is_taxon_item = items.has_key?('name') ||
207
- items.has_key?('taxonavigation')
179
+ items = get_items(x.xpath("//text").text)
180
+ is_taxon_item = items.key?("{{int:name}}") &&
181
+ items.key?("{{int:taxonavigation}}")
208
182
  return nil unless is_taxon_item
183
+
209
184
  items
210
185
  end
211
186
 
@@ -214,7 +189,7 @@ module DwcaHunter
214
189
  items = {}
215
190
  current_item = nil
216
191
  txt.split("\n").each do |l|
217
- item = l.match(/[\=]+([^\=]+)[\=]+/)
192
+ item = l.match(/=+([^=]+)=+/)
218
193
  if item
219
194
  current_item = item[1].strip.downcase
220
195
  items[current_item] = []
@@ -226,11 +201,11 @@ module DwcaHunter
226
201
  end
227
202
 
228
203
  def page_title(x)
229
- @page_title ||= x.xpath('//title').first.text
204
+ @page_title ||= x.xpath("//title").first.text
230
205
  end
231
206
 
232
207
  def page_id(x)
233
- @page_id ||= x.xpath('//id').first.text
208
+ @page_id ||= x.xpath("//id").first.text
234
209
  end
235
210
 
236
211
  def template?(page_xml)
@@ -238,110 +213,117 @@ module DwcaHunter
238
213
  end
239
214
 
240
215
  def parse_name(name_string, taxa)
241
- name_string.gsub!('BASEPAGENAME', taxa[:canonicalForm])
216
+ name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
242
217
  name_string = name_string.strip
243
218
  old_l = name_string.dup
244
- name_string.gsub! /^\*\s*/, ''
219
+ name_string.gsub!(/^\*\s*/, "")
245
220
  name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
246
- name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
247
- name_string.gsub!(/[']{2,}/, ' ')
248
- name_string.gsub!(/["]{2,}/, ' ')
249
- name_string.gsub!(/\:\s*\d.*$/, '')
250
- name_string.gsub!(/,\s*\[RSD\]/i, '')
251
- name_string.gsub!(/^\s*†\s*/, '')
252
- name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')
221
+ name_string.gsub!(/\{\{([^}]+\|)?([^}]*)\}\}/, '\2')
222
+ name_string.gsub!(/'{2,}/, " ")
223
+ name_string.gsub!(/"{2,}/, " ")
224
+ name_string.gsub!(/:\s*\d.*$/, "")
225
+ name_string.gsub!(/,\s*\[RSD\]/i, "")
226
+ name_string.gsub!(/^\s*†\s*/, "")
227
+ name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
253
228
  # name_string = DwcaHunter::XML.unescape(name_string)
254
- name_string.gsub!(/\<nowiki\>.*$/, '')
255
- name_string.gsub!(/\<br\s*[\/]?\s*\>/, '')
256
- name_string.gsub!(/^\s*\&dagger;\s*/, '')
257
- name_string.gsub!(/&nbsp;/, ' ')
258
- name_string.gsub!(/\s+/, ' ')
259
- name_string = name_string.strip
260
- # puts "%s---%s" % [name_string, old_l]
261
- return name_string
229
+ name_string.gsub!(/<nowiki>.*$/, "")
230
+ name_string.gsub!(%r{<br\s*/?\s*>}, "")
231
+ name_string.gsub!(/^\s*&dagger;\s*/, "")
232
+ name_string.gsub!(/&nbsp;/, " ")
233
+ name_string.gsub!(/\s+/, " ")
234
+ res = name_string.strip
235
+ parsed = @parser.parse(res, simple: true)
236
+ if !["1","2"].include?(parsed[:quality])
237
+ return ""
238
+ end
239
+ res
262
240
  end
263
241
 
264
242
  def generate_dwca
265
- DwcaHunter::logger_write(self.object_id,
266
- 'Creating DarwinCore Archive file')
243
+ DwcaHunter.logger_write(object_id,
244
+ "Creating DarwinCore Archive file")
267
245
  @core = [
268
- ['http://rs.tdwg.org/dwc/terms/taxonID',
269
- 'http://rs.tdwg.org/dwc/terms/scientificName',
270
- 'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
271
- 'http://globalnames.org/terms/canonicalForm',
272
- 'http://rs.tdwg.org/dwc/terms/higherClassification',
273
- 'http://purl.org/dc/terms/source']
246
+ ["http://rs.tdwg.org/dwc/terms/taxonID",
247
+ "http://rs.tdwg.org/dwc/terms/scientificName",
248
+ "http://globalnames.org/terms/canonicalForm",
249
+ "http://purl.org/dc/terms/source"]
274
250
  ]
275
- DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
251
+ DwcaHunter.logger_write(object_id, "Assembling Core Data")
276
252
  count = 0
277
253
  @data.map do |d|
278
254
  count += 1
279
- if count % BATCH_SIZE == 0
280
- DwcaHunter::logger_write(self.object_id,
281
- "Traversing %s core data record" % count)
255
+ if (count % BATCH_SIZE).zero?
256
+ DwcaHunter.logger_write(object_id,
257
+ "Traversing %s core data record" % count)
258
+ end
259
+ taxon_id = begin
260
+ (if d[:classificationPath].empty?
261
+ d[:taxonId]
262
+ else
263
+ @templates[d[:classificationPath].
264
+ last][:id]
265
+ end)
266
+ rescue StandardError
267
+ d[:taxonId]
282
268
  end
283
- taxon_id = (d[:classificationPath].empty? ?
284
- d[:taxonId] :
285
- @templates[d[:classificationPath].
286
- last][:id]) rescue d[:taxonId]
287
269
  @taxon_ids[d[:taxonId]] = taxon_id
288
- parentNameUsageId = (d[:classificationPath].size > 1 ?
289
- @templates[d[:classificationPath][-2]][:id] :
290
- nil) rescue nil
291
- url = 'http://species.wikimedia.org/wiki/' +
292
- URI.encode(d[:canonicalForm].gsub(' ', '_'))
270
+ parentNameUsageId = begin
271
+ (@templates[d[:classificationPath][-2]][:id] if d[:classificationPath].size > 1)
272
+ rescue StandardError
273
+ nil
274
+ end
275
+ url = "http://species.wikimedia.org/wiki/#{CGI.escape(d[:canonicalForm].gsub(' ', '_'))}"
293
276
  path = d[:classificationPath]
294
277
  path.pop if path[-1] == d[:canonicalForm]
295
- canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
296
- scientific_name = (d[:scientificName] == d[:canonicalForm]) ?
297
- canonical_form :
298
- d[:scientificName]
278
+ canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
279
+ scientific_name = if d[:scientificName] == d[:canonicalForm]
280
+ canonical_form
281
+ else
282
+ d[:scientificName]
283
+ end
299
284
  @core << [taxon_id,
300
285
  scientific_name,
301
- parentNameUsageId,
302
286
  canonical_form,
303
- path.join('|'),
304
287
  url]
305
288
  end
306
289
  @extensions << { data: [[
307
- 'http://rs.tdwg.org/dwc/terms/TaxonID',
308
- 'http://rs.tdwg.org/dwc/terms/vernacularName',
309
- 'http://purl.org/dc/terms/language'
310
- ]], file_name: 'vernacular_names.txt' }
311
- DwcaHunter::logger_write(self.object_id,
312
- 'Creating verncaular name extension for DarwinCore Archive file')
290
+ "http://rs.tdwg.org/dwc/terms/TaxonID",
291
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
292
+ "http://purl.org/dc/terms/language"
293
+ ]], file_name: "vernacular_names.txt" }
294
+ DwcaHunter.logger_write(object_id,
295
+ "Creating verncaular name extension for DarwinCore Archive file")
313
296
  count = 0
314
297
  @data.each do |d|
315
298
  count += 1
316
- if count % BATCH_SIZE == 0
317
- DwcaHunter::logger_write(self.object_id,
318
- "Traversing %s extension data record" % count)
299
+ if (count % BATCH_SIZE).zero?
300
+ DwcaHunter.logger_write(object_id,
301
+ "Traversing %s extension data record" % count)
319
302
  end
320
303
  d[:vernacularNames].each do |vn|
321
- taxon_id = @taxon_ids[d[:taxonId]] ? @taxon_ids[d[:taxonId]] : nil
322
- if taxon_id
323
- @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
324
- end
304
+ taxon_id = @taxon_ids[d[:taxonId]] || nil
305
+ @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
325
306
  end
326
307
  end
327
308
  @eml = {
328
309
  id: @uuid,
329
310
  title: @title,
330
- license: 'http://creativecommons.org/licenses/by-sa/3.0/',
311
+ license: "http://creativecommons.org/licenses/by-sa/3.0/",
331
312
  authors: [
332
- { first_name: 'Stephen',
333
- last_name: 'Thorpe',
334
- email: 'stephen_thorpe@yahoo.co.nz',
335
- url: 'http://species.wikimedia.org/wiki/Main_Page' }],
336
- abstract: 'The free species directory that anyone can edit.',
313
+ { first_name: "Stephen",
314
+ last_name: "Thorpe",
315
+ email: "stephen_thorpe@yahoo.co.nz",
316
+ url: "http://species.wikimedia.org/wiki/Main_Page" }
317
+ ],
318
+ abstract: "The free species directory that anyone can edit.",
337
319
  metadata_providers: [
338
- { first_name: 'Dmitry',
339
- last_name: 'Mozzherin',
340
- email: 'dmozzherin@mbl.edu' }],
341
- url: 'http://species.wikimedia.org/wiki/Main_Page'
320
+ { first_name: "Dmitry",
321
+ last_name: "Mozzherin",
322
+ email: "dmozzherin@mbl.edu" }
323
+ ],
324
+ url: "http://species.wikimedia.org/wiki/Main_Page"
342
325
  }
343
326
  super
344
327
  end
345
-
346
328
  end
347
329
  end