dwc-archive 0.9.11 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -1
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +4 -7
  7. data/CHANGELOG +4 -0
  8. data/Gemfile +3 -1
  9. data/LICENSE +1 -1
  10. data/README.md +114 -109
  11. data/Rakefile +13 -36
  12. data/dwc-archive.gemspec +23 -19
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +124 -0
  17. data/lib/dwc_archive/archive.rb +60 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
  21. data/lib/dwc_archive/expander.rb +88 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +91 -0
  24. data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
  25. data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +57 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +90 -0
  32. data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
  33. data/spec/files/generator_eml.xml +1 -1
  34. data/spec/lib/classification_normalizer_spec.rb +96 -105
  35. data/spec/lib/core_spec.rb +43 -41
  36. data/spec/lib/darwin_core_spec.rb +108 -138
  37. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  38. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  39. data/spec/lib/generator_spec.rb +73 -74
  40. data/spec/lib/gnub_taxon_spec.rb +15 -17
  41. data/spec/lib/metadata_spec.rb +50 -41
  42. data/spec/lib/taxon_normalized_spec.rb +62 -65
  43. data/spec/lib/xml_reader_spec.rb +9 -12
  44. data/spec/spec_helper.rb +54 -51
  45. metadata +101 -87
  46. data/.rvmrc +0 -1
  47. data/lib/dwc-archive.rb +0 -107
  48. data/lib/dwc-archive/archive.rb +0 -40
  49. data/lib/dwc-archive/classification_normalizer.rb +0 -427
  50. data/lib/dwc-archive/core.rb +0 -19
  51. data/lib/dwc-archive/expander.rb +0 -85
  52. data/lib/dwc-archive/generator.rb +0 -86
  53. data/lib/dwc-archive/ingester.rb +0 -101
  54. data/lib/dwc-archive/metadata.rb +0 -48
  55. data/lib/dwc-archive/version.rb +0 -3
  56. data/lib/dwc-archive/xml_reader.rb +0 -80
data/.rvmrc DELETED
@@ -1 +0,0 @@
1
- rvm use ruby-1.9.3-p392@dwc-archive --create
data/lib/dwc-archive.rb DELETED
@@ -1,107 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- recent_ruby = RUBY_VERSION >= '1.9.1'
4
- raise "IMPORTANT: dwc-archive gem requires ruby >= 1.9.1" unless recent_ruby
5
-
6
- require 'fileutils'
7
- require 'ostruct'
8
- require 'digest'
9
- require 'csv'
10
- require 'logger'
11
- require 'nokogiri'
12
- require_relative 'dwc-archive/xml_reader'
13
- require_relative 'dwc-archive/ingester'
14
- require_relative 'dwc-archive/errors'
15
- require_relative 'dwc-archive/expander'
16
- require_relative 'dwc-archive/archive'
17
- require_relative 'dwc-archive/core'
18
- require_relative 'dwc-archive/extension'
19
- require_relative 'dwc-archive/metadata'
20
- require_relative 'dwc-archive/generator'
21
- require_relative 'dwc-archive/generator_meta_xml'
22
- require_relative 'dwc-archive/generator_eml_xml'
23
- require_relative 'dwc-archive/classification_normalizer'
24
- require_relative 'dwc-archive/version'
25
-
26
- class DarwinCore
27
-
28
- DEFAULT_TMP_DIR = "/tmp"
29
-
30
- attr_reader :archive, :core, :metadata, :extensions,
31
- :classification_normalizer
32
- alias :eml :metadata
33
-
34
-
35
- def self.nil_field?(field)
36
- return true if [nil, '', '/N'].include?(field)
37
- false
38
- end
39
-
40
- def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
41
- Dir.entries(tmp_dir).each do |entry|
42
- path = File.join(tmp_dir, entry)
43
- if FileTest.directory?(path) && entry.match(/^dwc_[\d]+$/)
44
- FileUtils.rm_rf(path)
45
- end
46
- end
47
- end
48
-
49
- def self.logger
50
- @@logger ||= Logger.new(nil)
51
- end
52
-
53
- def self.logger=(logger)
54
- @@logger = logger
55
- end
56
-
57
- def self.logger_reset
58
- self.logger = Logger.new(nil)
59
- end
60
-
61
- def self.logger_write(obj_id, message, method = :info)
62
- self.logger.send(method, "|%s|%s|" % [obj_id, message])
63
- end
64
-
65
- def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
66
- @dwc_path = dwc_path
67
- @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
68
- @core = DarwinCore::Core.new(self)
69
- @metadata = DarwinCore::Metadata.new(@archive)
70
- @extensions = get_extensions
71
- end
72
-
73
- def file_name
74
- File.split(@dwc_path).last
75
- end
76
-
77
- def path
78
- File.expand_path(@dwc_path)
79
- end
80
-
81
- # generates a hash from a classification data with path to each node,
82
- # list of synonyms and vernacular names.
83
- def normalize_classification
84
- return nil unless has_parent_id?
85
- @classification_normalizer ||= DarwinCore::ClassificationNormalizer.
86
- new(self)
87
- @classification_normalizer.normalize
88
- end
89
-
90
- def has_parent_id?
91
- !!@core.fields.join('|').downcase.match(/highertaxonid|parentnameusageid/)
92
- end
93
-
94
- def checksum
95
- Digest::SHA1.hexdigest(open(@dwc_path).read)
96
- end
97
-
98
- private
99
- def get_extensions
100
- res = []
101
- root_key = @archive.meta.keys[0]
102
- ext = @archive.meta[root_key][:extension]
103
- return [] unless ext
104
- ext = [ext] if ext.class != Array
105
- ext.map { |e| DarwinCore::Extension.new(self, e) }
106
- end
107
- end
data/lib/dwc-archive/archive.rb DELETED
@@ -1,40 +0,0 @@
1
- class DarwinCore
2
- class Archive
3
- attr_reader :meta, :eml
4
-
5
- def initialize(archive_path, tmp_dir)
6
- @archive_path = archive_path
7
- @tmp_dir = tmp_dir
8
- @expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
9
- @expander.unpack
10
- if valid?
11
- @meta = DarwinCore::XmlReader.
12
- from_xml(open(File.join(@expander.path, 'meta.xml')))
13
- @eml = files.include?("eml.xml") ?
14
- DarwinCore::XmlReader.
15
- from_xml(open(File.join(@expander.path, 'eml.xml'))) : nil
16
- else
17
- clean
18
- raise InvalidArchiveError
19
- end
20
- end
21
-
22
- def valid?
23
- valid = true
24
- valid = valid && @expander.path && FileTest.exists?(@expander.path)
25
- valid = valid && files && files.include?('meta.xml')
26
- end
27
-
28
- def files
29
- @expander.files
30
- end
31
-
32
- def files_path
33
- @expander.path
34
- end
35
-
36
- def clean
37
- @expander.clean
38
- end
39
- end
40
- end
data/lib/dwc-archive/classification_normalizer.rb DELETED
@@ -1,427 +0,0 @@
1
- # encoding: utf-8
2
- require 'parsley-store'
3
-
4
- class DarwinCore
5
-
6
- class TaxonNormalized
7
- attr_accessor :id, :local_id, :global_id, :source, :parent_id,
8
- :classification_path_id, :classification_path,
9
- :linnean_classification_path, :current_name, :current_name_canonical,
10
- :synonyms, :vernacular_names, :rank, :status
11
-
12
- def initialize
13
- @id = @parent_id = @rank = @status = nil
14
- @current_name = ''
15
- @current_name_canonical = ''
16
- @source = ''
17
- @local_id = ''
18
- @global_id = ''
19
- @classification_path = []
20
- @classification_path_id = []
21
- @synonyms = []
22
- @vernacular_names = []
23
- @linnean_classification_path = []
24
- end
25
-
26
- end
27
-
28
- class GnubTaxon < TaxonNormalized
29
- attr_accessor :uuid, :uuid_path
30
-
31
- def initialize
32
- super
33
- @uuid = nil
34
- @uuid_path = []
35
- end
36
- end
37
-
38
- class SynonymNormalized < Struct.new(:id, :name, :canonical_name,
39
- :status, :source, :local_id,
40
- :global_id);end
41
- class VernacularNormalized < Struct.new(:name, :language, :locality,
42
- :country_code);end
43
-
44
- class ClassificationNormalizer
45
- attr_reader :error_names, :tree, :normalized_data
46
-
47
- def initialize(dwc_instance)
48
- @dwc = dwc_instance
49
- @core_fields = get_fields(@dwc.core)
50
- @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
51
- @normalized_data = {}
52
- @synonyms = {}
53
- @parser = ParsleyStore.new(1,2)
54
- @name_strings = {}
55
- @vernacular_name_strings = {}
56
- @error_names = []
57
- @tree = {}
58
- end
59
-
60
- def darwin_core
61
- @dwc
62
- end
63
-
64
- def add_name_string(name_string)
65
- @name_strings[name_string] = 1 unless @name_strings[name_string]
66
- end
67
-
68
- def add_vernacular_name_string(name_string)
69
- unless @vernacular_name_strings[name_string]
70
- @vernacular_name_strings[name_string] = 1
71
- end
72
- end
73
-
74
- def name_strings(opts = {})
75
- process_strings(@name_strings, opts)
76
- end
77
-
78
- def vernacular_name_strings(opts = {})
79
- process_strings(@vernacular_name_strings, opts)
80
- end
81
-
82
- def normalize(opts = {})
83
- opts = { :with_canonical_names => true,
84
- :with_extensions => true }.merge(opts)
85
- @with_canonical_names = !!opts[:with_canonical_names]
86
- DarwinCore.logger_write(@dwc.object_id,
87
- 'Started normalization of the classification')
88
- ingest_core
89
- DarwinCore.logger_write(@dwc.object_id,
90
- 'Calculating the classification parent/child paths')
91
- has_parent_id? ?
92
- calculate_classification_path :
93
- @normalized_data.keys.each { |id| @tree[id] = {} }
94
- DarwinCore.logger_write(@dwc.object_id, 'Ingesting data from extensions')
95
- if !!opts[:with_extensions]
96
- ingest_extensions
97
- end
98
- @normalized_data
99
- end
100
-
101
- private
102
-
103
- def process_strings(strings, opts)
104
- opts = { with_hash: false }.merge(opts)
105
- if !!opts[:with_hash]
106
- strings
107
- else
108
- strings.keys
109
- end
110
- end
111
-
112
- def get_canonical_name(a_scientific_name)
113
- if @with_canonical_names
114
- canonical_name = @parser.parse(a_scientific_name,
115
- :canonical_only => true)
116
- canonical_name.to_s.empty? ? a_scientific_name : canonical_name
117
- else
118
- nil
119
- end
120
- end
121
-
122
- def get_fields(element)
123
- data = element.fields.inject({}) do |res, f|
124
- field = f[:term].split('/')[-1]
125
- field = field ? field.downcase.to_sym : ''
126
- res[field] = f[:index].to_i
127
- res
128
- end
129
- data[:id] = element.id[:index] if element.id
130
- data
131
- end
132
-
133
- def status_synonym?(status)
134
- status && !!status.match(/^syn/)
135
- end
136
-
137
- def add_synonym_from_core(taxon_id, row)
138
- @synonyms[row[@core_fields[:id]]] = taxon_id
139
- taxon = @normalized_data[row[taxon_id]] ?
140
- @normalized_data[row[taxon_id]] :
141
- @normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
142
- synonym = SynonymNormalized.new(
143
- row[@core_fields[:id]],
144
- row[@core_fields[:scientificname]],
145
- row[@core_fields[:canonicalname]],
146
- @core_fields[:taxonomicstatus] ?
147
- row[@core_fields[:taxonomicstatus]] :
148
- nil,
149
- @core_fields[:source] ? row[@core_fields[:source]] : nil,
150
- @core_fields[:localid] ? row[@core_fields[:localid]] : nil,
151
- @core_fields[:globalid] ? row[@core_fields[:globalid]] : nil,
152
- )
153
- taxon.synonyms << synonym
154
- add_name_string(synonym.name)
155
- add_name_string(synonym.canonical_name)
156
- end
157
-
158
- def set_scientific_name(row, fields)
159
- row[fields[:scientificname]] = 'N/A' unless row[fields[:scientificname]]
160
- canonical_name = nil
161
- scientific_name = row[fields[:scientificname]].strip
162
- if separate_canonical_and_authorship?(row, fields)
163
- if @with_canonical_names
164
- canonical_name = row[fields[:scientificname]].strip
165
- end
166
- scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
167
- else
168
- if @with_canonical_names
169
- canonical_name = get_canonical_name(row[fields[:scientificname]])
170
- end
171
- end
172
- fields[:canonicalname] = row.size
173
- row << canonical_name
174
- row[fields[:scientificname]] = scientific_name
175
- end
176
-
177
- def separate_canonical_and_authorship?(row, fields)
178
- authorship = ''
179
- if fields[:scientificnameauthorship]
180
- authorship = row[fields[:scientificnameauthorship]].to_s.strip
181
- end
182
- !(authorship.empty? || row[fields[:scientificname]].index(authorship))
183
- end
184
-
185
- def ingest_core
186
- @normalized_data = {}
187
- has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
188
- raise DarwinCore::CoreFileError.new('Darwin Core core fields must ' +
189
- 'contain taxon id and scientific name') unless has_name_and_id
190
- @dwc.core.read do |rows|
191
- rows[1].each do |error|
192
- @error_names << { :data => error,
193
- :error => :reading_or_encoding_error }
194
- end
195
- rows[0].each do |r|
196
- set_scientific_name(r, @core_fields)
197
- #core has AcceptedNameUsageId
198
- if @core_fields[:acceptednameusageid] &&
199
- r[@core_fields[:acceptednameusageid]] &&
200
- r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
201
- add_synonym_from_core(@core_fields[:acceptednameusageid], r)
202
- elsif !@core_fields[:acceptednameusageid] &&
203
- @core_fields[:taxonomicstatus] &&
204
- status_synonym?(r[@core_fields[:taxonomicstatus]])
205
- add_synonym_from_core(parent_id, r) if has_parent_id?
206
- else
207
- unless @normalized_data[r[@core_fields[:id]]]
208
- if gnub_archive?
209
- new_taxon = DarwinCore::GnubTaxon.new
210
- else
211
- new_taxon = DarwinCore::TaxonNormalized.new
212
- end
213
- @normalized_data[r[@core_fields[:id]]] = new_taxon
214
- end
215
- taxon = @normalized_data[r[@core_fields[:id]]]
216
- if gnub_archive?
217
- taxon.uuid = r[@core_fields[:originalnameusageid]]
218
- taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
219
- split('|')
220
- end
221
- taxon.id = r[@core_fields[:id]]
222
- taxon.current_name = r[@core_fields[:scientificname]]
223
- taxon.current_name_canonical = r[@core_fields[:canonicalname]]
224
- taxon.parent_id = has_parent_id? ? r[parent_id] : nil
225
- taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
226
- if @core_fields[:taxonomicstatus]
227
- taxon.status = r[@core_fields[:taxonomicstatus]]
228
- end
229
- taxon.source = r[@core_fields[:source]] if @core_fields[:source]
230
- taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
231
- if @core_fields[:globalid]
232
- taxon.global_id = r[@core_fields[:globalid]]
233
- end
234
- taxon.linnean_classification_path =
235
- get_linnean_classification_path(r, taxon)
236
- add_name_string(taxon.current_name)
237
- has_canonical = taxon.current_name_canonical &&
238
- !taxon.current_name_canonical.empty?
239
- add_name_string(taxon.current_name_canonical) if has_canonical
240
- end
241
- end
242
- end
243
- end
244
-
245
- def has_parent_id?
246
- @has_parent_id ||= @core_fields.has_key?(:highertaxonid) ||
247
- @core_fields.has_key?(:parentnameusageid)
248
- end
249
-
250
- def parent_id
251
- parent_id_field = @core_fields[:highertaxonid] ||
252
- @core_fields[:parentnameusageid]
253
- end
254
-
255
- def calculate_classification_path
256
- @paths_num = 0
257
- @normalized_data.each do |taxon_id, taxon|
258
- next if !taxon.classification_path_id.empty?
259
- res = get_classification_path(taxon)
260
- next if res == 'error'
261
- end
262
- end
263
-
264
- def get_classification_path(taxon)
265
- return if !taxon.classification_path_id.empty?
266
- @paths_num += 1
267
- if @paths_num % 10000 == 0
268
- DarwinCore.logger_write(@dwc.object_id,
269
- "Calculated %s paths" % @paths_num)
270
- end
271
- current_node = {taxon.id => {}}
272
- if DarwinCore.nil_field?(taxon.parent_id)
273
- if @with_canonical_names
274
- taxon.classification_path << taxon.current_name_canonical
275
- end
276
- taxon.classification_path_id << taxon.id
277
- @tree.merge!(current_node)
278
- else
279
- parent_cp = parent_cpid = nil
280
- if @normalized_data[taxon.parent_id]
281
- if @with_canonical_names
282
- parent_cp = @normalized_data[taxon.parent_id].classification_path
283
- end
284
- parent_cpid = @normalized_data[taxon.parent_id].
285
- classification_path_id
286
- else
287
- current_parent = @normalized_data[@synonyms[taxon.parent_id]]
288
- if current_parent
289
- error = 'WARNING: The parent of the taxon ' +
290
- "\'#{taxon.current_name}\' is deprecated"
291
- @error_names << {:data => taxon,
292
- :error => :deprecated_parent,
293
- :current_parent => current_parent }
294
-
295
- if @with_canonical_names
296
- parent_cp = current_parent.classification_path
297
- end
298
- parent_cpid = current_parent.classification_path_id
299
- else
300
- error = 'WARNING: The parent of the taxon ' +
301
- "\'#{taxon.current_name}\' not found"
302
- @error_names << {:data => taxon,
303
- :error => :deprecated_parent, :current_parent => nil}
304
- end
305
- end
306
- return 'error' unless parent_cpid
307
- if parent_cpid.empty?
308
- res = 'error'
309
- begin
310
- res = get_classification_path(@normalized_data[taxon.parent_id])
311
- rescue SystemStackError
312
- @error_names << {:data => taxon,
313
- :error => :too_deep_hierarchy, :current_parent => nil}
314
- end
315
- return res if res == 'error'
316
- if @with_canonical_names
317
- taxon.classification_path += @normalized_data[taxon.parent_id].
318
- classification_path + [taxon.current_name_canonical]
319
- end
320
- taxon.classification_path_id += @normalized_data[taxon.parent_id].
321
- classification_path_id + [taxon.id]
322
- parent_node = @normalized_data[taxon.parent_id].
323
- classification_path_id.inject(@tree) {|node, id| node[id]}
324
- parent_node.merge!(current_node)
325
- else
326
- taxon.classification_path += parent_cp +
327
- [taxon.current_name_canonical] if @with_canonical_names
328
- taxon.classification_path_id += parent_cpid + [taxon.id]
329
- parent_node = @normalized_data[taxon.parent_id].
330
- classification_path_id.inject(@tree) {|node, id| node[id]}
331
- begin
332
- parent_node.merge!(current_node)
333
- rescue NoMethodError => e
334
- DarwinCore.logger_write(@dwc.object_id,
335
- "Error '%s' taxon %s" % [e.message, taxon.id])
336
- return 'error'
337
- end
338
- end
339
- end
340
- end
341
-
342
- def ingest_extensions
343
- @extensions.each do |e|
344
- ext, fields = *e
345
- ingest_synonyms(e) if (File.split(e[0].file_path).
346
- last.match(/synonym/i) &&
347
- fields.keys.include?(:scientificname))
348
- ingest_vernaculars(e) if fields.keys.include? :vernacularname
349
- end
350
- end
351
-
352
- def ingest_synonyms(extension)
353
- DarwinCore.logger_write(@dwc.object_id, 'Ingesting synonyms extension')
354
- ext, fields = *extension
355
- ext.read do |rows|
356
- rows[0].each do |r|
357
- set_scientific_name(r, fields)
358
- synonym = SynonymNormalized.new(
359
- nil,
360
- r[fields[:scientificname]],
361
- r[fields[:canonicalname]],
362
- fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil,
363
- fields[:source] ? r[fields[:source]] : nil,
364
- fields[:localid] ? r[fields[:localid]] : nil,
365
- fields[:globalid] ? r[fields[:globalid]] : nil,
366
- )
367
- if @normalized_data[r[fields[:id]]]
368
- @normalized_data[r[fields[:id]]].synonyms << synonym
369
- add_name_string(synonym.name)
370
- add_name_string(synonym.canonical_name)
371
- else
372
- @error_names << { :taxon => synonym,
373
- :error => :synonym_of_unknown_taxa }
374
- end
375
- end
376
- end
377
- end
378
-
379
- def ingest_vernaculars(extension)
380
- DarwinCore.logger_write(@dwc.object_id,
381
- 'Ingesting vernacular names extension')
382
- ext, fields = *extension
383
- ext.read do |rows|
384
- rows[0].each do |r|
385
-
386
- language = nil
387
- if fields[:language]
388
- language = r[fields[:language]]
389
- elsif fields[:languagecode]
390
- language = r[fields[:languagecode]]
391
- end
392
-
393
- locality = fields[:locality] ? r[fields[:locality]] : nil
394
-
395
- country_code = fields[:countrycode] ? r[fields[:countrycode]] : nil
396
-
397
- vernacular = VernacularNormalized.new(
398
- r[fields[:vernacularname]],
399
- language,
400
- locality,
401
- country_code)
402
- if @normalized_data[r[fields[:id]]]
403
- @normalized_data[r[fields[:id]]].vernacular_names << vernacular
404
- add_vernacular_name_string(vernacular.name)
405
- else
406
- @error_names << { :vernacular_name => vernacular,
407
- :error => :vernacular_of_unknown_taxa }
408
- end
409
- end
410
- end
411
- end
412
-
413
- #Collect linnean classification path only on species level
414
- def get_linnean_classification_path(row, taxon)
415
- res = []
416
- [:kingdom, :phylum, :class,
417
- :order, :family, :genus, :subgenus].each do |clade|
418
- res << [row[@core_fields[clade]], clade] if @core_fields[clade]
419
- end
420
- res
421
- end
422
-
423
- def gnub_archive?
424
- @core_fields[:originalnameusageidpath]
425
- end
426
- end
427
- end