dwca_hunter 0.5.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/.byebug_history +45 -0
  3. data/.gitignore +5 -0
  4. data/.rubocop.yml +3 -2
  5. data/.ruby-version +1 -1
  6. data/Gemfile.lock +61 -83
  7. data/LICENSE.txt +1 -1
  8. data/README.md +1 -1
  9. data/dwca_hunter.gemspec +9 -9
  10. data/exe/dwcahunter +1 -3
  11. data/lib/dwca_hunter.rb +39 -8
  12. data/lib/dwca_hunter/resource.rb +5 -0
  13. data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
  14. data/lib/dwca_hunter/resources/arctos.rb +121 -145
  15. data/lib/dwca_hunter/resources/clements.rb +151 -0
  16. data/lib/dwca_hunter/resources/eol.rb +85 -0
  17. data/lib/dwca_hunter/resources/freebase.rb +51 -49
  18. data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
  19. data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
  20. data/lib/dwca_hunter/resources/ipni.rb +111 -0
  21. data/lib/dwca_hunter/resources/itis.rb +99 -99
  22. data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
  23. data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
  24. data/lib/dwca_hunter/resources/mcz.rb +123 -0
  25. data/lib/dwca_hunter/resources/ncbi.rb +22 -23
  26. data/lib/dwca_hunter/resources/opentree.rb +5 -5
  27. data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
  28. data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
  29. data/lib/dwca_hunter/resources/sherborn.rb +91 -0
  30. data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
  31. data/lib/dwca_hunter/version.rb +1 -1
  32. metadata +46 -40
  33. data/files/birdlife_7.csv +0 -11862
  34. data/files/fishbase_taxon_cache.tsv +0 -81000
  35. data/files/reptile_checklist_2014_12.csv +0 -15158
  36. data/files/species-black.txt +0 -251
@@ -0,0 +1,140 @@
1
+ class PaleodbHarvester
2
+ def initialize(download_dir)
3
+ @dir = File.join(download_dir, "json")
4
+ FileUtils.mkdir_p(@dir)
5
+ @in_dir = download_dir
6
+ @taxa_csv = CSV.open(File.join(@in_dir, "taxa.csv"), headers: true)
7
+ @refs_csv = CSV.open(File.join(@in_dir, "refs.csv"), headers: true)
8
+ @taxa_refs_csv = CSV.open(File.join(@in_dir, "taxa_refs.csv"), headers: true)
9
+ @occurences_csv = CSV.open(File.join(@in_dir, "occurences.csv"), headers: true)
10
+ end
11
+
12
+ def taxa
13
+ # "orig_no","taxon_no","record_type","flags","taxon_rank",
14
+ # "taxon_name","difference","accepted_no","accepted_rank",
15
+ # "accepted_name","parent_no","reference_no","is_extant","n_occs"
16
+ taxa = {}
17
+ name2id = {}
18
+ @taxa_csv.each do |r|
19
+ r = strip(r)
20
+ taxa[r["taxon_no"]] = { t_id: r["orig_no"], id: r["taxon_no"],
21
+ rank: r["taxon_rank"], name: r["taxon_name"],
22
+ auth: r["taxon_attr"],
23
+ extinct: extinct(r["is_extant"]),
24
+ vernacular: r["common_name"],
25
+ annot: r["difference"], acc_id: r["accepted_no"],
26
+ acc_rank: r["accepted_rank"],
27
+ acc_name: r["accepted_name"], ecol: ecol(r),
28
+ parent_id: r["parent_no"], ref: r["reference_no"],
29
+ occs_num: r["n_occs"], enterer: enterer(r) }
30
+
31
+ name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
32
+ end
33
+ f = open(File.join(@dir, "taxa.json"), "w:utf-8")
34
+ f.write(JSON.pretty_generate(taxa))
35
+ f.close
36
+ f = open(File.join(@dir, "name_id.json"), "w:utf-8")
37
+ f.write(JSON.pretty_generate(name2id))
38
+ f.close
39
+ end
40
+
41
+ def enterer(r)
42
+ res = [r["enterer"], r["modifier"]].map(&:to_s)
43
+ .map(&:strip).uniq.select { |e| e != "" }
44
+ res.empty? ? "" : res.join(", ")
45
+ end
46
+
47
+
48
+ def extinct(val)
49
+ val == "extinct" ? 1 : 0
50
+ end
51
+
52
+ def ecol(row)
53
+ row = strip row
54
+ "#{row['life_habit']} #{row['diet']}"
55
+ end
56
+
57
+ def refs
58
+ # "reference_no","record_type","ref_type","author1init","author1last",
59
+ # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
60
+ # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
61
+ # "language","doi"
62
+
63
+ # {"id":31671,"orig":true,"author":"Hahn, C. W.",
64
+ # "year":1834,"title":"Die wanzenartigen Insecten.",
65
+ # "details":"C. H. Zeh, Nurnberg. 2: 33--120.",
66
+ # "distribution":"Germany","comment":"n. sp."}
67
+ refs = {}
68
+ @refs_csv.each do |r|
69
+ r = strip r
70
+ authorship, author = authors(r)
71
+ refs[r["reference_no"]] = { id: r["reference_no"], author: author,
72
+ authorship: authorship,
73
+ year: r["pubyr"], title: r["reftitle"],
74
+ details: details(r) }
75
+ end
76
+ f = open(File.join(@dir, "refs.json"), "w:utf-8")
77
+ f.write(JSON.pretty_generate(refs))
78
+ f.close
79
+ end
80
+
81
+ def authors(row)
82
+ row = strip row
83
+ au = ["#{row['author1init']} #{row['author1last']}".strip,
84
+ "#{row['author2init']} #{row['author2last']}".strip,
85
+ "#{row['otherauthors']}".strip]
86
+ au = au.select { |a| !a.empty? }.map { |a| a.gsub(/[\s]{2,}/, " ").strip }
87
+ [au[0..1].join(", "), au.join(", ")]
88
+ end
89
+
90
+ def details(row)
91
+ row = strip row
92
+ ref = "#{row['pubtitle']}"
93
+ ref << " #{row['pubno']}" unless row['pubno'].empty?
94
+ ref << ": #{row['firstpage']}" unless row['firstpage'].empty?
95
+ ref << "--#{row['lastpage']}" unless row['lastpage'].empty?
96
+ ref << " (#{row["doi"]})" unless row['doi'].empty?
97
+ ref.gsub(/[\s]{2,}/, " ").strip
98
+ end
99
+
100
+ def taxa_refs
101
+ tr = {}
102
+ @taxa_refs_csv.each do |r|
103
+ r = strip r
104
+ row = { acc_id: r["accepted_no"], name: r["accepted_name"],
105
+ ref_id: r["reference_no"] }
106
+ if tr.key? r["accepted_no"]
107
+ tr[r["accepted_no"]] << row
108
+ else
109
+ tr[r["accepted_no"]] = [row]
110
+ end
111
+ end
112
+ f = open(File.join(@dir, "taxa_refs.json"), "w:utf-8")
113
+ f.write(JSON.pretty_generate(tr))
114
+ f.close
115
+ end
116
+
117
+ def occurences
118
+ occ = {}
119
+ @occurences_csv.each_with_index do |r, i|
120
+ r = strip r
121
+ row = { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
122
+ state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
123
+ if occ.key? r["accepted_no"]
124
+ occ[r["accepted_no"]] << row
125
+ else
126
+ occ[r["accepted_no"]] = [row]
127
+ end
128
+ end
129
+ f = open(File.join(@dir, "occurences.json"), "w:utf-8")
130
+ f.write(JSON.pretty_generate(occ))
131
+ f.close
132
+ end
133
+
134
+ def strip(row)
135
+ row.each_with_object({}) do |(k, v), h|
136
+ h[k] = v.nil? ? nil : v.strip
137
+ end
138
+ end
139
+ end
140
+
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DwcaHunter
4
+ class ResourceSherborn < DwcaHunter::Resource
5
+ def initialize(opts = {})
6
+ @command = "sherborn"
7
+ @title = "Index Animalium"
8
+ @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
9
+ @UUID = "05ad6ca2-fc37-47f4-983a-72e535420e28"
10
+ @download_path = File.join(Dir.tmpdir,
11
+ "dwca_hunter",
12
+ "sherborn",
13
+ "data.csv")
14
+ @synonyms = []
15
+ @names = []
16
+ @vernaculars = []
17
+ @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
20
+ super(opts)
21
+ end
22
+
23
+ def download
24
+ puts "Downloading."
25
+ `curl -s -L #{@url} -o #{@download_path}`
26
+ end
27
+
28
+ def unpack; end
29
+
30
+ def make_dwca
31
+ DwcaHunter.logger_write(object_id, "Extracting data")
32
+ get_names
33
+ generate_dwca
34
+ end
35
+
36
+ private
37
+
38
+ def get_names
39
+ Dir.chdir(@download_dir)
40
+ collect_names
41
+ end
42
+
43
+ def collect_names
44
+ dupes = {}
45
+ @names_index = {}
46
+ file = CSV.open(File.join(@download_dir, "data.csv"),
47
+ headers: false, col_sep: "\t")
48
+ file.each_with_index do |row, i|
49
+ next if dupes.key?(row[1])
50
+
51
+ dupes[row[1]] = true
52
+ taxon_id = row[0]
53
+ name_string = row[1]
54
+
55
+ @names << { taxon_id: taxon_id,
56
+ name_string: name_string }
57
+ puts "Processed %s names" % i if i % 10_000 == 0
58
+ end
59
+ end
60
+
61
+ def generate_dwca
62
+ DwcaHunter.logger_write(object_id,
63
+ "Creating DarwinCore Archive file")
64
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
65
+ "http://rs.tdwg.org/dwc/terms/scientificName",
66
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
67
+ @names.each do |n|
68
+ @core << [n[:taxon_id], n[:name_string], "ICZN"]
69
+ end
70
+
71
+ @eml = {
72
+ id: @uuid,
73
+ title: @title,
74
+ authors: [
75
+ { first_name: "Charles Davies",
76
+ last_name: "Sherborn" }
77
+ ],
78
+ metadata_providers: [
79
+ { first_name: "Dmitry",
80
+ last_name: "Mozzherin",
81
+ email: "dmozzherin@gmail.com" }
82
+ ],
83
+ abstract: "Index Animalium is a monumental work that covers " \
84
+ "400 000 zoological names registered by science " \
85
+ "between 1758 and 1850",
86
+ url: @url
87
+ }
88
+ super
89
+ end
90
+ end
91
+ end
@@ -1,18 +1,17 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
3
4
  class ResourceWikispecies < DwcaHunter::Resource
4
5
  def initialize(opts = {})
5
- @problems_file = open('problems.txt', 'w:utf-8')
6
+ @wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
7
+ @problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
6
8
  @command = "wikispecies"
7
- @title = 'Wikispecies'
8
- @url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
9
- 'specieswiki-latest-pages-articles.xml.bz2'
9
+ @title = "Wikispecies"
10
+ @url = "http://dumps.wikimedia.org/specieswiki/latest/" \
11
+ "specieswiki-latest-pages-articles.xml.bz2"
10
12
  @url = opts[:url] if opts[:url]
11
- @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
12
- @download_path = File.join(Dir.tmpdir,
13
- 'dwca_hunter',
14
- 'wikispecies',
15
- 'data.xml.bz2')
13
+ @uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
14
+ @download_path = File.join(@wikisp_path, "data.xml.bz2")
16
15
  @data = []
17
16
  @templates = {}
18
17
  @taxon_ids = {}
@@ -21,7 +20,7 @@ module DwcaHunter
21
20
  @extensions = []
22
21
  @re = {
23
22
  page_start: /^\s*\<page\>\s*$/,
24
- page_end: /^\s*\<\/page\>\s*$/,
23
+ page_end: %r{^\s*\</page\>\s*$},
25
24
  template: /Template:/i,
26
25
  template_link: /\{\{([^\}]*)\}\}/,
27
26
  vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
@@ -29,6 +28,11 @@ module DwcaHunter
29
28
  super(opts)
30
29
  end
31
30
 
31
+ def download
32
+ puts "Downloading from the source"
33
+ `curl -L #{@url} -o #{@download_path}`
34
+ end
35
+
32
36
  def unpack
33
37
  unpack_bz2
34
38
  end
@@ -39,22 +43,22 @@ module DwcaHunter
39
43
  generate_dwca
40
44
  end
41
45
 
42
- private
46
+ private
43
47
 
44
48
  def enrich_data
45
- DwcaHunter::logger_write(self.object_id,
46
- 'Extracting data from xml file...')
49
+ DwcaHunter.logger_write(object_id,
50
+ "Extracting data from xml file...")
47
51
  Dir.chdir(@download_dir)
48
- f = open('data.xml', 'r:utf-8')
52
+ f = open("data.xml", "r:utf-8")
49
53
  page_on = false
50
- page = ''
54
+ page = ""
51
55
  page_num = 0
52
56
  f.each do |l|
53
57
  if l.match(@re[:page_start])
54
- page << l
58
+ page += l
55
59
  page_on = true
56
60
  elsif page_on
57
- page << l
61
+ page += l
58
62
  if l.match(@re[:page_end])
59
63
  page_on = false
60
64
  page_xml = Nokogiri::XML.parse(page)
@@ -63,22 +67,22 @@ module DwcaHunter
63
67
  process_species(page_xml)
64
68
  page_num += 1
65
69
  if page_num % BATCH_SIZE == 0
66
- DwcaHunter::logger_write(self.object_id,
67
- "Traversed %s pages" % page_num)
70
+ DwcaHunter.logger_write(object_id,
71
+ "Traversed %s pages" % page_num)
68
72
  end
69
- page = ''
73
+ page = ""
70
74
  @page_title = nil
71
75
  @page_id = nil
72
76
  end
73
77
  end
74
78
  end
75
- DwcaHunter::logger_write(self.object_id,
76
- 'Extracted total %s pages' % page_num)
79
+ DwcaHunter.logger_write(object_id,
80
+ "Extracted total %s pages" % page_num)
77
81
  f.close
78
82
  end
79
83
 
80
84
  def extend_classification
81
- DwcaHunter::logger_write(self.object_id, 'Extending classifications')
85
+ DwcaHunter.logger_write(object_id, "Extending classifications")
82
86
  @data.each_with_index do |d, i|
83
87
  unless d[:classificationPath].empty?
84
88
  n = 50
@@ -100,19 +104,21 @@ module DwcaHunter
100
104
  # d[:classificationPath] = d[:classificationPath].join("|").
101
105
  # gsub("Main Page", "Life")
102
106
  if i % BATCH_SIZE == 0 && i > 0
103
- DwcaHunter::logger_write(self.object_id,
104
- "Extended %s classifications" % i)
107
+ DwcaHunter.logger_write(object_id,
108
+ "Extended %s classifications" % i)
105
109
  end
106
110
  end
107
111
  end
108
112
 
109
113
  def update_tree(path)
110
114
  path = path.dup
111
- return if @paths.has_key?(path.join('|'))
115
+ return if @paths.key?(path.join("|"))
116
+
112
117
  (0...path.size).each do |i|
113
118
  subpath = path[0..i]
114
- subpath_string = subpath.join('|')
115
- next if @paths.has_key?(subpath_string)
119
+ subpath_string = subpath.join("|")
120
+ next if @paths.key?(subpath_string)
121
+
116
122
  name = subpath.pop
117
123
  tree_element = subpath.inject(@tree) { |res, n| res[n] }
118
124
  tree_element[name] = {}
@@ -121,27 +127,29 @@ module DwcaHunter
121
127
  end
122
128
 
123
129
  def process_template(x)
124
- name = page_title(x).gsub!(@re[:template], '').strip
125
- text = x.xpath('//text').text.strip
130
+ name = page_title(x).gsub!(@re[:template], "").strip
131
+ text = x.xpath("//text").text.strip
126
132
  parent_name = text.match(@re[:template_link])
127
133
  if parent_name
128
134
  return if parent_name[1].match(/\#if/)
135
+
129
136
  list = parent_name[1].split("|")
130
- if list.size == 1
131
- parent_name = list[0]
132
- elsif list[0].match /Taxonav/i
133
- parent_name = list[1]
134
- else
135
- parent_name = list[0]
136
- end
137
+ parent_name = if list.size == 1
138
+ list[0]
139
+ elsif list[0].match(/Taxonav/i)
140
+ list[1]
141
+ else
142
+ list[0]
143
+ end
137
144
  end
138
- name.gsub!(/_/, ' ')
139
- parent_name.gsub!(/_/, ' ') if parent_name
145
+ name.gsub!(/_/, " ")
146
+ parent_name&.gsub!(/_/, " ")
140
147
  @templates[name] = { parentName: parent_name, id: page_id(x) }
141
148
  end
142
149
 
143
150
  def process_species(x)
144
151
  return if page_title(x).match(/Wikispecies/i)
152
+
145
153
  items = find_species_components(x)
146
154
  if items
147
155
  @data << {
@@ -149,7 +157,8 @@ module DwcaHunter
149
157
  canonicalForm: page_title(x),
150
158
  scientificName: page_title(x),
151
159
  classificationPath: [],
152
- vernacularNames: [] }
160
+ vernacularNames: []
161
+ }
153
162
  get_full_scientific_name(items)
154
163
  get_vernacular_names(items)
155
164
  init_classification_path(items)
@@ -157,8 +166,8 @@ module DwcaHunter
157
166
  end
158
167
 
159
168
  def get_full_scientific_name(items)
160
- if items['name']
161
- if name = items['name'][0]
169
+ if items["name"]
170
+ if name = items["name"][0]
162
171
  @data[-1][:scientificName] = parse_name(name, @data[-1])
163
172
  else
164
173
  @problems_file.write("%s\n" % @data[-1][:canonicalForm])
@@ -167,19 +176,20 @@ module DwcaHunter
167
176
  end
168
177
 
169
178
  def get_vernacular_names(items)
170
- if items['vernacular names'] && items['vernacular names'].size > 0
171
- vn_string = items['vernacular names'].join("")
179
+ if items["vernacular names"] && !items["vernacular names"].empty?
180
+ vn_string = items["vernacular names"].join("")
172
181
  vn = vn_string.match(@re[:vernacular_names])
173
182
  if vn
174
183
  vn_list = vn[1].strip.split("|")
175
184
  vnames = []
176
185
  vn_list.each do |item|
177
- language, name = item.split("=").map { |x| x.strip }
178
- if language && name && language.size < 4 && name.valid_encoding?
179
- vnames << {
180
- name: name,
181
- language: language }
182
- end
186
+ language, name = item.split("=").map(&:strip)
187
+ next unless language && name && language.size < 4 && name.valid_encoding?
188
+
189
+ vnames << {
190
+ name: name,
191
+ language: language
192
+ }
183
193
  end
184
194
 
185
195
  @data[-1][:vernacularNames] = vnames
@@ -188,26 +198,26 @@ module DwcaHunter
188
198
  end
189
199
 
190
200
  def init_classification_path(items)
191
- if items['taxonavigation']
192
- items['taxonavigation'].each do |line|
193
- line.gsub!(/\[\[.*\]\]/, '') # ignore non-template links
194
- if template_link = line.match(@re[:template_link])
195
- template_link = template_link[1].
196
- strip.gsub(/Template:/, '').gsub(/_/, ' ')
197
- if !template_link.match(/\|/)
198
- @data[-1][:classificationPath] << template_link
199
- break
200
- end
201
- end
201
+ # ignore non-template links
202
+ items["taxonavigation"]&.each do |line|
203
+ line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
204
+ next unless template_link = line.match(@re[:template_link])
205
+
206
+ template_link = template_link[1].
207
+ strip.gsub(/Template:/, "").gsub(/_/, " ")
208
+ unless template_link.match(/\|/)
209
+ @data[-1][:classificationPath] << template_link
210
+ break
202
211
  end
203
212
  end
204
213
  end
205
214
 
206
215
  def find_species_components(x)
207
- items = get_items(x.xpath('//text').text)
208
- is_taxon_item = items.has_key?('name') ||
209
- items.has_key?('taxonavigation')
216
+ items = get_items(x.xpath("//text").text)
217
+ is_taxon_item = items.key?("name") ||
218
+ items.key?("taxonavigation")
210
219
  return nil unless is_taxon_item
220
+
211
221
  items
212
222
  end
213
223
 
@@ -216,7 +226,7 @@ module DwcaHunter
216
226
  items = {}
217
227
  current_item = nil
218
228
  txt.split("\n").each do |l|
219
- item = l.match(/[\=]+([^\=]+)[\=]+/)
229
+ item = l.match(/[\=]+([^\=]+)[\=]+/)
220
230
  if item
221
231
  current_item = item[1].strip.downcase
222
232
  items[current_item] = []
@@ -228,11 +238,11 @@ module DwcaHunter
228
238
  end
229
239
 
230
240
  def page_title(x)
231
- @page_title ||= x.xpath('//title').first.text
241
+ @page_title ||= x.xpath("//title").first.text
232
242
  end
233
243
 
234
244
  def page_id(x)
235
- @page_id ||= x.xpath('//id').first.text
245
+ @page_id ||= x.xpath("//id").first.text
236
246
  end
237
247
 
238
248
  def template?(page_xml)
@@ -240,110 +250,113 @@ module DwcaHunter
240
250
  end
241
251
 
242
252
  def parse_name(name_string, taxa)
243
- name_string.gsub!('BASEPAGENAME', taxa[:canonicalForm])
253
+ name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
244
254
  name_string = name_string.strip
245
255
  old_l = name_string.dup
246
- name_string.gsub! /^\*\s*/, ''
256
+ name_string.gsub!(/^\*\s*/, "")
247
257
  name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
248
258
  name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
249
- name_string.gsub!(/[']{2,}/, ' ')
250
- name_string.gsub!(/["]{2,}/, ' ')
251
- name_string.gsub!(/\:\s*\d.*$/, '')
252
- name_string.gsub!(/,\s*\[RSD\]/i, '')
253
- name_string.gsub!(/^\s*†\s*/, '')
254
- name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')
259
+ name_string.gsub!(/[']{2,}/, " ")
260
+ name_string.gsub!(/["]{2,}/, " ")
261
+ name_string.gsub!(/\:\s*\d.*$/, "")
262
+ name_string.gsub!(/,\s*\[RSD\]/i, "")
263
+ name_string.gsub!(/^\s*†\s*/, "")
264
+ name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
255
265
  # name_string = DwcaHunter::XML.unescape(name_string)
256
- name_string.gsub!(/\<nowiki\>.*$/, '')
257
- name_string.gsub!(/\<br\s*[\/]?\s*\>/, '')
258
- name_string.gsub!(/^\s*\&dagger;\s*/, '')
259
- name_string.gsub!(/&nbsp;/, ' ')
260
- name_string.gsub!(/\s+/, ' ')
266
+ name_string.gsub!(/\<nowiki\>.*$/, "")
267
+ name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
268
+ name_string.gsub!(/^\s*\&dagger;\s*/, "")
269
+ name_string.gsub!(/&nbsp;/, " ")
270
+ name_string.gsub!(/\s+/, " ")
261
271
  name_string = name_string.strip
262
272
  # puts "%s---%s" % [name_string, old_l]
263
- return name_string
273
+ name_string
264
274
  end
265
275
 
266
276
  def generate_dwca
267
- DwcaHunter::logger_write(self.object_id,
268
- 'Creating DarwinCore Archive file')
277
+ DwcaHunter.logger_write(object_id,
278
+ "Creating DarwinCore Archive file")
269
279
  @core = [
270
- ['http://rs.tdwg.org/dwc/terms/taxonID',
271
- 'http://rs.tdwg.org/dwc/terms/scientificName',
272
- 'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
273
- 'http://globalnames.org/terms/canonicalForm',
274
- 'http://rs.tdwg.org/dwc/terms/higherClassification',
275
- 'http://purl.org/dc/terms/source']
280
+ ["http://rs.tdwg.org/dwc/terms/taxonID",
281
+ "http://rs.tdwg.org/dwc/terms/scientificName",
282
+ "http://globalnames.org/terms/canonicalForm",
283
+ "http://purl.org/dc/terms/source"]
276
284
  ]
277
- DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
285
+ DwcaHunter.logger_write(object_id, "Assembling Core Data")
278
286
  count = 0
279
287
  @data.map do |d|
280
288
  count += 1
281
289
  if count % BATCH_SIZE == 0
282
- DwcaHunter::logger_write(self.object_id,
283
- "Traversing %s core data record" % count)
290
+ DwcaHunter.logger_write(object_id,
291
+ "Traversing %s core data record" % count)
284
292
  end
285
- taxon_id = (d[:classificationPath].empty? ?
286
- d[:taxonId] :
287
- @templates[d[:classificationPath].
288
- last][:id]) rescue d[:taxonId]
293
+ taxon_id = begin
294
+ (d[:classificationPath].empty? ?
295
+ d[:taxonId] :
296
+ @templates[d[:classificationPath].
297
+ last][:id])
298
+ rescue StandardError
299
+ d[:taxonId]
300
+ end
289
301
  @taxon_ids[d[:taxonId]] = taxon_id
290
- parentNameUsageId = (d[:classificationPath].size > 1 ?
291
- @templates[d[:classificationPath][-2]][:id] :
292
- nil) rescue nil
293
- url = 'http://species.wikimedia.org/wiki/' +
294
- URI.encode(d[:canonicalForm].gsub(' ', '_'))
302
+ parentNameUsageId = begin
303
+ (d[:classificationPath].size > 1 ?
304
+ @templates[d[:classificationPath][-2]][:id] :
305
+ nil)
306
+ rescue StandardError
307
+ nil
308
+ end
309
+ url = "http://species.wikimedia.org/wiki/" +
310
+ URI.encode(d[:canonicalForm].gsub(" ", "_"))
295
311
  path = d[:classificationPath]
296
312
  path.pop if path[-1] == d[:canonicalForm]
297
- canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
298
- scientific_name = (d[:scientificName] == d[:canonicalForm]) ?
313
+ canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
314
+ scientific_name = d[:scientificName] == d[:canonicalForm] ?
299
315
  canonical_form :
300
316
  d[:scientificName]
301
317
  @core << [taxon_id,
302
318
  scientific_name,
303
- parentNameUsageId,
304
319
  canonical_form,
305
- path.join('|'),
306
320
  url]
307
321
  end
308
322
  @extensions << { data: [[
309
- 'http://rs.tdwg.org/dwc/terms/TaxonID',
310
- 'http://rs.tdwg.org/dwc/terms/vernacularName',
311
- 'http://purl.org/dc/terms/language'
312
- ]], file_name: 'vernacular_names.txt' }
313
- DwcaHunter::logger_write(self.object_id,
314
- 'Creating verncaular name extension for DarwinCore Archive file')
323
+ "http://rs.tdwg.org/dwc/terms/TaxonID",
324
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
325
+ "http://purl.org/dc/terms/language"
326
+ ]], file_name: "vernacular_names.txt" }
327
+ DwcaHunter.logger_write(object_id,
328
+ "Creating verncaular name extension for DarwinCore Archive file")
315
329
  count = 0
316
330
  @data.each do |d|
317
331
  count += 1
318
332
  if count % BATCH_SIZE == 0
319
- DwcaHunter::logger_write(self.object_id,
320
- "Traversing %s extension data record" % count)
333
+ DwcaHunter.logger_write(object_id,
334
+ "Traversing %s extension data record" % count)
321
335
  end
322
336
  d[:vernacularNames].each do |vn|
323
- taxon_id = @taxon_ids[d[:taxonId]] ? @taxon_ids[d[:taxonId]] : nil
324
- if taxon_id
325
- @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
326
- end
337
+ taxon_id = @taxon_ids[d[:taxonId]] || nil
338
+ @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
327
339
  end
328
340
  end
329
341
  @eml = {
330
342
  id: @uuid,
331
343
  title: @title,
332
- license: 'http://creativecommons.org/licenses/by-sa/3.0/',
344
+ license: "http://creativecommons.org/licenses/by-sa/3.0/",
333
345
  authors: [
334
- { first_name: 'Stephen',
335
- last_name: 'Thorpe',
336
- email: 'stephen_thorpe@yahoo.co.nz',
337
- url: 'http://species.wikimedia.org/wiki/Main_Page' }],
338
- abstract: 'The free species directory that anyone can edit.',
346
+ { first_name: "Stephen",
347
+ last_name: "Thorpe",
348
+ email: "stephen_thorpe@yahoo.co.nz",
349
+ url: "http://species.wikimedia.org/wiki/Main_Page" }
350
+ ],
351
+ abstract: "The free species directory that anyone can edit.",
339
352
  metadata_providers: [
340
- { first_name: 'Dmitry',
341
- last_name: 'Mozzherin',
342
- email: 'dmozzherin@mbl.edu' }],
343
- url: 'http://species.wikimedia.org/wiki/Main_Page'
353
+ { first_name: "Dmitry",
354
+ last_name: "Mozzherin",
355
+ email: "dmozzherin@mbl.edu" }
356
+ ],
357
+ url: "http://species.wikimedia.org/wiki/Main_Page"
344
358
  }
345
359
  super
346
360
  end
347
-
348
361
  end
349
362
  end