dwca_hunter 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/.byebug_history +45 -0
  3. data/.gitignore +5 -0
  4. data/.rubocop.yml +3 -2
  5. data/.ruby-version +1 -1
  6. data/Gemfile.lock +61 -83
  7. data/LICENSE.txt +1 -1
  8. data/README.md +1 -1
  9. data/dwca_hunter.gemspec +9 -9
  10. data/exe/dwcahunter +1 -3
  11. data/lib/dwca_hunter.rb +39 -8
  12. data/lib/dwca_hunter/resource.rb +5 -0
  13. data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
  14. data/lib/dwca_hunter/resources/arctos.rb +121 -145
  15. data/lib/dwca_hunter/resources/clements.rb +151 -0
  16. data/lib/dwca_hunter/resources/eol.rb +85 -0
  17. data/lib/dwca_hunter/resources/freebase.rb +51 -49
  18. data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
  19. data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
  20. data/lib/dwca_hunter/resources/ipni.rb +111 -0
  21. data/lib/dwca_hunter/resources/itis.rb +99 -99
  22. data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
  23. data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
  24. data/lib/dwca_hunter/resources/mcz.rb +123 -0
  25. data/lib/dwca_hunter/resources/ncbi.rb +22 -23
  26. data/lib/dwca_hunter/resources/opentree.rb +5 -5
  27. data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
  28. data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
  29. data/lib/dwca_hunter/resources/sherborn.rb +91 -0
  30. data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
  31. data/lib/dwca_hunter/version.rb +1 -1
  32. metadata +46 -40
  33. data/files/birdlife_7.csv +0 -11862
  34. data/files/fishbase_taxon_cache.tsv +0 -81000
  35. data/files/reptile_checklist_2014_12.csv +0 -15158
  36. data/files/species-black.txt +0 -251
@@ -0,0 +1,140 @@
# Converts the CSV dumps found in a download directory (taxa.csv, refs.csv,
# taxa_refs.csv and occurences.csv, each with a header row) into
# pretty-printed JSON files written to the "json" subdirectory.
#
# Every CSV file is opened only while it is being read and every JSON file is
# closed as soon as it is written, so no file handles outlive a method call
# and each export method can be called repeatedly with the same result.
class PaleodbHarvester
  # @param download_dir [String] directory that holds the CSV inputs; the
  #   "json" output subdirectory is created inside it if missing.
  def initialize(download_dir)
    @in_dir = download_dir
    @dir = File.join(download_dir, "json")
    FileUtils.mkdir_p(@dir)
  end

  # Reads taxa.csv and writes two files:
  # * taxa.json    - taxon records keyed by "taxon_no"
  # * name_id.json - taxon_name => { id, acc_id } lookup (the last row wins
  #   when a name occurs more than once)
  #
  # Columns listed in the original source comment:
  # "orig_no","taxon_no","record_type","flags","taxon_rank",
  # "taxon_name","difference","accepted_no","accepted_rank",
  # "accepted_name","parent_no","reference_no","is_extant","n_occs"
  #
  # @return [nil]
  def taxa
    taxa = {}
    name2id = {}
    each_row("taxa.csv") do |r|
      taxa[r["taxon_no"]] = {
        t_id: r["orig_no"], id: r["taxon_no"],
        rank: r["taxon_rank"], name: r["taxon_name"],
        auth: r["taxon_attr"],
        extinct: extinct(r["is_extant"]),
        vernacular: r["common_name"],
        annot: r["difference"], acc_id: r["accepted_no"],
        acc_rank: r["accepted_rank"],
        acc_name: r["accepted_name"], ecol: ecol(r),
        parent_id: r["parent_no"], ref: r["reference_no"],
        occs_num: r["n_occs"], enterer: enterer(r)
      }

      name2id[r["taxon_name"]] = { id: r["taxon_no"], acc_id: r["accepted_no"] }
    end
    write_json("taxa.json", taxa)
    write_json("name_id.json", name2id)
  end

  # Combines the "enterer" and "modifier" columns into one comma-separated
  # string, dropping blanks and duplicates.
  #
  # @param r [#[]] row keyed by column name
  # @return [String] "" when neither column has a value
  def enterer(r)
    [r["enterer"], r["modifier"]]
      .map { |person| person.to_s.strip }
      .uniq
      .reject(&:empty?)
      .join(", ")
  end

  # @param val [String, nil] value of the "is_extant" column
  # @return [Integer] 1 only when the value is exactly "extinct", otherwise 0
  def extinct(val)
    val == "extinct" ? 1 : 0
  end

  # Joins the "life_habit" and "diet" columns with a single space. Missing
  # values are rendered as empty strings, so the result may be just " ".
  #
  # @param row [#each] row keyed by column name
  # @return [String]
  def ecol(row)
    row = strip row
    "#{row['life_habit']} #{row['diet']}"
  end

  # Reads refs.csv and writes refs.json keyed by "reference_no".
  #
  # Columns listed in the original source comment:
  # "reference_no","record_type","ref_type","author1init","author1last",
  # "author2init","author2last","otherauthors","pubyr","reftitle","pubtitle",
  # "editors","pubvol","pubno","firstpage","lastpage","publication_type",
  # "language","doi"
  #
  # @return [nil]
  def refs
    refs = {}
    each_row("refs.csv") do |r|
      authorship, author = authors(r)
      refs[r["reference_no"]] = { id: r["reference_no"], author: author,
                                  authorship: authorship,
                                  year: r["pubyr"], title: r["reftitle"],
                                  details: details(r) }
    end
    write_json("refs.json", refs)
  end

  # Builds author strings from the author columns of a reference row.
  #
  # @param row [#each] row keyed by column name
  # @return [Array(String, String)] the first two non-empty author entries
  #   joined by ", ", followed by all non-empty author entries joined by ", "
  def authors(row)
    row = strip row
    au = ["#{row['author1init']} #{row['author1last']}",
          "#{row['author2init']} #{row['author2last']}",
          row["otherauthors"].to_s]
    au = au.map(&:strip).reject(&:empty?)
           .map { |a| a.gsub(/[\s]{2,}/, " ").strip }
    [au[0..1].join(", "), au.join(", ")]
  end

  # Builds the free-text part of a citation:
  # "<pubtitle> <pubno>: <firstpage>--<lastpage> (<doi>)".
  # Components whose column is empty or missing are left out; a nil cell
  # (empty unquoted CSV field) is treated the same as an empty string.
  #
  # @param row [#each] row keyed by column name
  # @return [String]
  def details(row)
    row = strip row
    parts = [row["pubtitle"].to_s]
    parts << " #{row['pubno']}" unless blank?(row["pubno"])
    parts << ": #{row['firstpage']}" unless blank?(row["firstpage"])
    parts << "--#{row['lastpage']}" unless blank?(row["lastpage"])
    parts << " (#{row['doi']})" unless blank?(row["doi"])
    parts.join.gsub(/[\s]{2,}/, " ").strip
  end

  # Reads taxa_refs.csv and writes taxa_refs.json: reference links grouped
  # under their "accepted_no".
  #
  # @return [nil]
  def taxa_refs
    tr = group_by_accepted_no("taxa_refs.csv") do |r|
      { acc_id: r["accepted_no"], name: r["accepted_name"],
        ref_id: r["reference_no"] }
    end
    write_json("taxa_refs.json", tr)
  end

  # Reads occurences.csv and writes occurences.json: occurrence records
  # grouped under their "accepted_no". (The file and method names keep the
  # spelling used by existing callers and data files.)
  #
  # @return [nil]
  def occurences
    occ = group_by_accepted_no("occurences.csv") do |r|
      { id: r["accepted_no"], name: r["accepted_name"], country: r["cc"],
        state: r["state"], age_min: r["min_ma"], age_max: r["max_ma"] }
    end
    write_json("occurences.json", occ)
  end

  # Copies a row into a plain Hash with surrounding whitespace removed from
  # every value; nil values stay nil.
  #
  # @param row [#each] anything yielding [key, value] pairs (CSV::Row, Hash)
  # @return [Hash]
  def strip(row)
    row.each_with_object({}) do |(k, v), h|
      h[k] = v&.strip
    end
  end

  private

  # Yields every row of a headered CSV file from the input directory as a
  # whitespace-stripped Hash. The file is closed when iteration finishes.
  def each_row(file_name)
    CSV.foreach(File.join(@in_dir, file_name), headers: true) do |row|
      yield strip(row)
    end
  end

  # Groups the block's result for each row of the file under the row's
  # "accepted_no" value, preserving file order within each group.
  def group_by_accepted_no(file_name)
    groups = {}
    each_row(file_name) do |r|
      (groups[r["accepted_no"]] ||= []) << yield(r)
    end
    groups
  end

  # Writes data as pretty-printed UTF-8 JSON into the output directory.
  # The block form guarantees the file is closed even if writing fails.
  def write_json(file_name, data)
    File.open(File.join(@dir, file_name), "w:utf-8") do |f|
      f.write(JSON.pretty_generate(data))
    end
    nil
  end

  # True for nil and for the empty string.
  def blank?(value)
    value.to_s.empty?
  end
end
@@ -0,0 +1,91 @@
# frozen_string_literal: true

module DwcaHunter
  # Resource that downloads the "Index Animalium" name list as a
  # tab-separated file and turns it into a DarwinCore Archive whose core
  # holds taxonID, scientificName and a fixed "ICZN" nomenclatural code.
  class ResourceSherborn < DwcaHunter::Resource
    # @param opts [Hash] options passed through unchanged to the parent class
    def initialize(opts = {})
      @command = "sherborn"
      @title = "Index Animalium"
      @url = "https://uofi.box.com/shared/static/kj8a26a3bcrraa4kccoyz5jr5uqrqoe6.csv"
      # Must be spelled @uuid: generate_dwca reads @uuid for the EML id, so
      # assigning to a differently-cased variable left that id nil.
      @uuid = "05ad6ca2-fc37-47f4-983a-72e535420e28"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "sherborn",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the source file with curl (silent, following redirects) and
    # stores it at @download_path.
    def download
      puts "Downloading."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain text file.
    def unpack; end

    # Reads the downloaded names and builds the archive from them.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the working directory to @download_dir, then collects names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Reads data.csv (tab-separated, no header row) from @download_dir and
    # appends one { taxon_id:, name_string: } entry to @names for the first
    # occurrence of every distinct name string (column 2); later rows with
    # an already-seen name are skipped. Progress is printed for kept rows
    # whose zero-based row index is a multiple of 10,000.
    def collect_names
      dupes = {}
      @names_index = {}
      path = File.join(@download_dir, "data.csv")
      # Block form closes the file handle once all rows have been read.
      CSV.open(path, headers: false, col_sep: "\t") do |file|
        file.each_with_index do |row, i|
          taxon_id = row[0]
          name_string = row[1]
          next if dupes.key?(name_string)

          dupes[name_string] = true
          @names << { taxon_id: taxon_id,
                      name_string: name_string }
          puts format("Processed %s names", i) if (i % 10_000).zero?
        end
      end
    end

    # Fills @core (header row plus one row per collected name) and @eml
    # (archive metadata), then hands over to the parent implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string], "ICZN"]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Charles Davies",
            last_name: "Sherborn" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Index Animalium is a monumental work that covers " \
                  "400 000 zoological names registered by science " \
                  "between 1758 and 1850",
        url: @url
      }
      super
    end
  end
end
@@ -1,18 +1,17 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
3
4
  class ResourceWikispecies < DwcaHunter::Resource
4
5
  def initialize(opts = {})
5
- @problems_file = open('problems.txt', 'w:utf-8')
6
+ @wikisp_path = File.join(Dir.tmpdir, "dwca_hunter", "wikispecies")
7
+ @problems_file = open(File.join(Dir.tmpdir, "problems.txt"), "w:utf-8")
6
8
  @command = "wikispecies"
7
- @title = 'Wikispecies'
8
- @url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
9
- 'specieswiki-latest-pages-articles.xml.bz2'
9
+ @title = "Wikispecies"
10
+ @url = "http://dumps.wikimedia.org/specieswiki/latest/" \
11
+ "specieswiki-latest-pages-articles.xml.bz2"
10
12
  @url = opts[:url] if opts[:url]
11
- @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
12
- @download_path = File.join(Dir.tmpdir,
13
- 'dwca_hunter',
14
- 'wikispecies',
15
- 'data.xml.bz2')
13
+ @uuid = "68923690-0727-473c-b7c5-2ae9e601e3fd"
14
+ @download_path = File.join(@wikisp_path, "data.xml.bz2")
16
15
  @data = []
17
16
  @templates = {}
18
17
  @taxon_ids = {}
@@ -21,7 +20,7 @@ module DwcaHunter
21
20
  @extensions = []
22
21
  @re = {
23
22
  page_start: /^\s*\<page\>\s*$/,
24
- page_end: /^\s*\<\/page\>\s*$/,
23
+ page_end: %r{^\s*\</page\>\s*$},
25
24
  template: /Template:/i,
26
25
  template_link: /\{\{([^\}]*)\}\}/,
27
26
  vernacular_names: /\{\{\s*VN\s*\|([^\}]+)\}\}/i
@@ -29,6 +28,11 @@ module DwcaHunter
29
28
  super(opts)
30
29
  end
31
30
 
31
+ def download
32
+ puts "Downloading from the source"
33
+ `curl -L #{@url} -o #{@download_path}`
34
+ end
35
+
32
36
  def unpack
33
37
  unpack_bz2
34
38
  end
@@ -39,22 +43,22 @@ module DwcaHunter
39
43
  generate_dwca
40
44
  end
41
45
 
42
- private
46
+ private
43
47
 
44
48
  def enrich_data
45
- DwcaHunter::logger_write(self.object_id,
46
- 'Extracting data from xml file...')
49
+ DwcaHunter.logger_write(object_id,
50
+ "Extracting data from xml file...")
47
51
  Dir.chdir(@download_dir)
48
- f = open('data.xml', 'r:utf-8')
52
+ f = open("data.xml", "r:utf-8")
49
53
  page_on = false
50
- page = ''
54
+ page = ""
51
55
  page_num = 0
52
56
  f.each do |l|
53
57
  if l.match(@re[:page_start])
54
- page << l
58
+ page += l
55
59
  page_on = true
56
60
  elsif page_on
57
- page << l
61
+ page += l
58
62
  if l.match(@re[:page_end])
59
63
  page_on = false
60
64
  page_xml = Nokogiri::XML.parse(page)
@@ -63,22 +67,22 @@ module DwcaHunter
63
67
  process_species(page_xml)
64
68
  page_num += 1
65
69
  if page_num % BATCH_SIZE == 0
66
- DwcaHunter::logger_write(self.object_id,
67
- "Traversed %s pages" % page_num)
70
+ DwcaHunter.logger_write(object_id,
71
+ "Traversed %s pages" % page_num)
68
72
  end
69
- page = ''
73
+ page = ""
70
74
  @page_title = nil
71
75
  @page_id = nil
72
76
  end
73
77
  end
74
78
  end
75
- DwcaHunter::logger_write(self.object_id,
76
- 'Extracted total %s pages' % page_num)
79
+ DwcaHunter.logger_write(object_id,
80
+ "Extracted total %s pages" % page_num)
77
81
  f.close
78
82
  end
79
83
 
80
84
  def extend_classification
81
- DwcaHunter::logger_write(self.object_id, 'Extending classifications')
85
+ DwcaHunter.logger_write(object_id, "Extending classifications")
82
86
  @data.each_with_index do |d, i|
83
87
  unless d[:classificationPath].empty?
84
88
  n = 50
@@ -100,19 +104,21 @@ module DwcaHunter
100
104
  # d[:classificationPath] = d[:classificationPath].join("|").
101
105
  # gsub("Main Page", "Life")
102
106
  if i % BATCH_SIZE == 0 && i > 0
103
- DwcaHunter::logger_write(self.object_id,
104
- "Extended %s classifications" % i)
107
+ DwcaHunter.logger_write(object_id,
108
+ "Extended %s classifications" % i)
105
109
  end
106
110
  end
107
111
  end
108
112
 
109
113
  def update_tree(path)
110
114
  path = path.dup
111
- return if @paths.has_key?(path.join('|'))
115
+ return if @paths.key?(path.join("|"))
116
+
112
117
  (0...path.size).each do |i|
113
118
  subpath = path[0..i]
114
- subpath_string = subpath.join('|')
115
- next if @paths.has_key?(subpath_string)
119
+ subpath_string = subpath.join("|")
120
+ next if @paths.key?(subpath_string)
121
+
116
122
  name = subpath.pop
117
123
  tree_element = subpath.inject(@tree) { |res, n| res[n] }
118
124
  tree_element[name] = {}
@@ -121,27 +127,29 @@ module DwcaHunter
121
127
  end
122
128
 
123
129
  def process_template(x)
124
- name = page_title(x).gsub!(@re[:template], '').strip
125
- text = x.xpath('//text').text.strip
130
+ name = page_title(x).gsub!(@re[:template], "").strip
131
+ text = x.xpath("//text").text.strip
126
132
  parent_name = text.match(@re[:template_link])
127
133
  if parent_name
128
134
  return if parent_name[1].match(/\#if/)
135
+
129
136
  list = parent_name[1].split("|")
130
- if list.size == 1
131
- parent_name = list[0]
132
- elsif list[0].match /Taxonav/i
133
- parent_name = list[1]
134
- else
135
- parent_name = list[0]
136
- end
137
+ parent_name = if list.size == 1
138
+ list[0]
139
+ elsif list[0].match(/Taxonav/i)
140
+ list[1]
141
+ else
142
+ list[0]
143
+ end
137
144
  end
138
- name.gsub!(/_/, ' ')
139
- parent_name.gsub!(/_/, ' ') if parent_name
145
+ name.gsub!(/_/, " ")
146
+ parent_name&.gsub!(/_/, " ")
140
147
  @templates[name] = { parentName: parent_name, id: page_id(x) }
141
148
  end
142
149
 
143
150
  def process_species(x)
144
151
  return if page_title(x).match(/Wikispecies/i)
152
+
145
153
  items = find_species_components(x)
146
154
  if items
147
155
  @data << {
@@ -149,7 +157,8 @@ module DwcaHunter
149
157
  canonicalForm: page_title(x),
150
158
  scientificName: page_title(x),
151
159
  classificationPath: [],
152
- vernacularNames: [] }
160
+ vernacularNames: []
161
+ }
153
162
  get_full_scientific_name(items)
154
163
  get_vernacular_names(items)
155
164
  init_classification_path(items)
@@ -157,8 +166,8 @@ module DwcaHunter
157
166
  end
158
167
 
159
168
  def get_full_scientific_name(items)
160
- if items['name']
161
- if name = items['name'][0]
169
+ if items["name"]
170
+ if name = items["name"][0]
162
171
  @data[-1][:scientificName] = parse_name(name, @data[-1])
163
172
  else
164
173
  @problems_file.write("%s\n" % @data[-1][:canonicalForm])
@@ -167,19 +176,20 @@ module DwcaHunter
167
176
  end
168
177
 
169
178
  def get_vernacular_names(items)
170
- if items['vernacular names'] && items['vernacular names'].size > 0
171
- vn_string = items['vernacular names'].join("")
179
+ if items["vernacular names"] && !items["vernacular names"].empty?
180
+ vn_string = items["vernacular names"].join("")
172
181
  vn = vn_string.match(@re[:vernacular_names])
173
182
  if vn
174
183
  vn_list = vn[1].strip.split("|")
175
184
  vnames = []
176
185
  vn_list.each do |item|
177
- language, name = item.split("=").map { |x| x.strip }
178
- if language && name && language.size < 4 && name.valid_encoding?
179
- vnames << {
180
- name: name,
181
- language: language }
182
- end
186
+ language, name = item.split("=").map(&:strip)
187
+ next unless language && name && language.size < 4 && name.valid_encoding?
188
+
189
+ vnames << {
190
+ name: name,
191
+ language: language
192
+ }
183
193
  end
184
194
 
185
195
  @data[-1][:vernacularNames] = vnames
@@ -188,26 +198,26 @@ module DwcaHunter
188
198
  end
189
199
 
190
200
  def init_classification_path(items)
191
- if items['taxonavigation']
192
- items['taxonavigation'].each do |line|
193
- line.gsub!(/\[\[.*\]\]/, '') # ignore non-template links
194
- if template_link = line.match(@re[:template_link])
195
- template_link = template_link[1].
196
- strip.gsub(/Template:/, '').gsub(/_/, ' ')
197
- if !template_link.match(/\|/)
198
- @data[-1][:classificationPath] << template_link
199
- break
200
- end
201
- end
201
+ # ignore non-template links
202
+ items["taxonavigation"]&.each do |line|
203
+ line.gsub!(/\[\[.*\]\]/, "") # ignore non-template links
204
+ next unless template_link = line.match(@re[:template_link])
205
+
206
+ template_link = template_link[1].
207
+ strip.gsub(/Template:/, "").gsub(/_/, " ")
208
+ unless template_link.match(/\|/)
209
+ @data[-1][:classificationPath] << template_link
210
+ break
202
211
  end
203
212
  end
204
213
  end
205
214
 
206
215
  def find_species_components(x)
207
- items = get_items(x.xpath('//text').text)
208
- is_taxon_item = items.has_key?('name') ||
209
- items.has_key?('taxonavigation')
216
+ items = get_items(x.xpath("//text").text)
217
+ is_taxon_item = items.key?("name") ||
218
+ items.key?("taxonavigation")
210
219
  return nil unless is_taxon_item
220
+
211
221
  items
212
222
  end
213
223
 
@@ -216,7 +226,7 @@ module DwcaHunter
216
226
  items = {}
217
227
  current_item = nil
218
228
  txt.split("\n").each do |l|
219
- item = l.match(/[\=]+([^\=]+)[\=]+/)
229
+ item = l.match(/[\=]+([^\=]+)[\=]+/)
220
230
  if item
221
231
  current_item = item[1].strip.downcase
222
232
  items[current_item] = []
@@ -228,11 +238,11 @@ module DwcaHunter
228
238
  end
229
239
 
230
240
  def page_title(x)
231
- @page_title ||= x.xpath('//title').first.text
241
+ @page_title ||= x.xpath("//title").first.text
232
242
  end
233
243
 
234
244
  def page_id(x)
235
- @page_id ||= x.xpath('//id').first.text
245
+ @page_id ||= x.xpath("//id").first.text
236
246
  end
237
247
 
238
248
  def template?(page_xml)
@@ -240,110 +250,113 @@ module DwcaHunter
240
250
  end
241
251
 
242
252
  def parse_name(name_string, taxa)
243
- name_string.gsub!('BASEPAGENAME', taxa[:canonicalForm])
253
+ name_string.gsub!("BASEPAGENAME", taxa[:canonicalForm])
244
254
  name_string = name_string.strip
245
255
  old_l = name_string.dup
246
- name_string.gsub! /^\*\s*/, ''
256
+ name_string.gsub!(/^\*\s*/, "")
247
257
  name_string.gsub!(/\[\[([^\]]+\|)?([^\]]*)\]\]/, '\2')
248
258
  name_string.gsub!(/\{\{([^\}]+\|)?([^\}]*)\}\}/, '\2')
249
- name_string.gsub!(/[']{2,}/, ' ')
250
- name_string.gsub!(/["]{2,}/, ' ')
251
- name_string.gsub!(/\:\s*\d.*$/, '')
252
- name_string.gsub!(/,\s*\[RSD\]/i, '')
253
- name_string.gsub!(/^\s*†\s*/, '')
254
- name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, '')
259
+ name_string.gsub!(/[']{2,}/, " ")
260
+ name_string.gsub!(/["]{2,}/, " ")
261
+ name_string.gsub!(/\:\s*\d.*$/, "")
262
+ name_string.gsub!(/,\s*\[RSD\]/i, "")
263
+ name_string.gsub!(/^\s*†\s*/, "")
264
+ name_string.gsub!(/(:\s*)?\[http:[^\]]+\]/, "")
255
265
  # name_string = DwcaHunter::XML.unescape(name_string)
256
- name_string.gsub!(/\<nowiki\>.*$/, '')
257
- name_string.gsub!(/\<br\s*[\/]?\s*\>/, '')
258
- name_string.gsub!(/^\s*\&dagger;\s*/, '')
259
- name_string.gsub!(/&nbsp;/, ' ')
260
- name_string.gsub!(/\s+/, ' ')
266
+ name_string.gsub!(/\<nowiki\>.*$/, "")
267
+ name_string.gsub!(%r{\<br\s*[/]?\s*\>}, "")
268
+ name_string.gsub!(/^\s*\&dagger;\s*/, "")
269
+ name_string.gsub!(/&nbsp;/, " ")
270
+ name_string.gsub!(/\s+/, " ")
261
271
  name_string = name_string.strip
262
272
  # puts "%s---%s" % [name_string, old_l]
263
- return name_string
273
+ name_string
264
274
  end
265
275
 
266
276
  def generate_dwca
267
- DwcaHunter::logger_write(self.object_id,
268
- 'Creating DarwinCore Archive file')
277
+ DwcaHunter.logger_write(object_id,
278
+ "Creating DarwinCore Archive file")
269
279
  @core = [
270
- ['http://rs.tdwg.org/dwc/terms/taxonID',
271
- 'http://rs.tdwg.org/dwc/terms/scientificName',
272
- 'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
273
- 'http://globalnames.org/terms/canonicalForm',
274
- 'http://rs.tdwg.org/dwc/terms/higherClassification',
275
- 'http://purl.org/dc/terms/source']
280
+ ["http://rs.tdwg.org/dwc/terms/taxonID",
281
+ "http://rs.tdwg.org/dwc/terms/scientificName",
282
+ "http://globalnames.org/terms/canonicalForm",
283
+ "http://purl.org/dc/terms/source"]
276
284
  ]
277
- DwcaHunter::logger_write(self.object_id, 'Assembling Core Data')
285
+ DwcaHunter.logger_write(object_id, "Assembling Core Data")
278
286
  count = 0
279
287
  @data.map do |d|
280
288
  count += 1
281
289
  if count % BATCH_SIZE == 0
282
- DwcaHunter::logger_write(self.object_id,
283
- "Traversing %s core data record" % count)
290
+ DwcaHunter.logger_write(object_id,
291
+ "Traversing %s core data record" % count)
284
292
  end
285
- taxon_id = (d[:classificationPath].empty? ?
286
- d[:taxonId] :
287
- @templates[d[:classificationPath].
288
- last][:id]) rescue d[:taxonId]
293
+ taxon_id = begin
294
+ (d[:classificationPath].empty? ?
295
+ d[:taxonId] :
296
+ @templates[d[:classificationPath].
297
+ last][:id])
298
+ rescue StandardError
299
+ d[:taxonId]
300
+ end
289
301
  @taxon_ids[d[:taxonId]] = taxon_id
290
- parentNameUsageId = (d[:classificationPath].size > 1 ?
291
- @templates[d[:classificationPath][-2]][:id] :
292
- nil) rescue nil
293
- url = 'http://species.wikimedia.org/wiki/' +
294
- URI.encode(d[:canonicalForm].gsub(' ', '_'))
302
+ parentNameUsageId = begin
303
+ (d[:classificationPath].size > 1 ?
304
+ @templates[d[:classificationPath][-2]][:id] :
305
+ nil)
306
+ rescue StandardError
307
+ nil
308
+ end
309
+ url = "http://species.wikimedia.org/wiki/" +
310
+ URI.encode(d[:canonicalForm].gsub(" ", "_"))
295
311
  path = d[:classificationPath]
296
312
  path.pop if path[-1] == d[:canonicalForm]
297
- canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, '').strip
298
- scientific_name = (d[:scientificName] == d[:canonicalForm]) ?
313
+ canonical_form = d[:canonicalForm].gsub(/\(.*\)\s*$/, "").strip
314
+ scientific_name = d[:scientificName] == d[:canonicalForm] ?
299
315
  canonical_form :
300
316
  d[:scientificName]
301
317
  @core << [taxon_id,
302
318
  scientific_name,
303
- parentNameUsageId,
304
319
  canonical_form,
305
- path.join('|'),
306
320
  url]
307
321
  end
308
322
  @extensions << { data: [[
309
- 'http://rs.tdwg.org/dwc/terms/TaxonID',
310
- 'http://rs.tdwg.org/dwc/terms/vernacularName',
311
- 'http://purl.org/dc/terms/language'
312
- ]], file_name: 'vernacular_names.txt' }
313
- DwcaHunter::logger_write(self.object_id,
314
- 'Creating verncaular name extension for DarwinCore Archive file')
323
+ "http://rs.tdwg.org/dwc/terms/TaxonID",
324
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
325
+ "http://purl.org/dc/terms/language"
326
+ ]], file_name: "vernacular_names.txt" }
327
+ DwcaHunter.logger_write(object_id,
328
+ "Creating verncaular name extension for DarwinCore Archive file")
315
329
  count = 0
316
330
  @data.each do |d|
317
331
  count += 1
318
332
  if count % BATCH_SIZE == 0
319
- DwcaHunter::logger_write(self.object_id,
320
- "Traversing %s extension data record" % count)
333
+ DwcaHunter.logger_write(object_id,
334
+ "Traversing %s extension data record" % count)
321
335
  end
322
336
  d[:vernacularNames].each do |vn|
323
- taxon_id = @taxon_ids[d[:taxonId]] ? @taxon_ids[d[:taxonId]] : nil
324
- if taxon_id
325
- @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]]
326
- end
337
+ taxon_id = @taxon_ids[d[:taxonId]] || nil
338
+ @extensions[-1][:data] << [taxon_id, vn[:name], vn[:language]] if taxon_id
327
339
  end
328
340
  end
329
341
  @eml = {
330
342
  id: @uuid,
331
343
  title: @title,
332
- license: 'http://creativecommons.org/licenses/by-sa/3.0/',
344
+ license: "http://creativecommons.org/licenses/by-sa/3.0/",
333
345
  authors: [
334
- { first_name: 'Stephen',
335
- last_name: 'Thorpe',
336
- email: 'stephen_thorpe@yahoo.co.nz',
337
- url: 'http://species.wikimedia.org/wiki/Main_Page' }],
338
- abstract: 'The free species directory that anyone can edit.',
346
+ { first_name: "Stephen",
347
+ last_name: "Thorpe",
348
+ email: "stephen_thorpe@yahoo.co.nz",
349
+ url: "http://species.wikimedia.org/wiki/Main_Page" }
350
+ ],
351
+ abstract: "The free species directory that anyone can edit.",
339
352
  metadata_providers: [
340
- { first_name: 'Dmitry',
341
- last_name: 'Mozzherin',
342
- email: 'dmozzherin@mbl.edu' }],
343
- url: 'http://species.wikimedia.org/wiki/Main_Page'
353
+ { first_name: "Dmitry",
354
+ last_name: "Mozzherin",
355
+ email: "dmozzherin@mbl.edu" }
356
+ ],
357
+ url: "http://species.wikimedia.org/wiki/Main_Page"
344
358
  }
345
359
  super
346
360
  end
347
-
348
361
  end
349
362
  end