dwc-archive 0.9.11 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +5 -5
  2. data/.rspec +2 -1
  3. data/.rubocop.yml +23 -0
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +2 -3
  6. data/CHANGELOG +2 -0
  7. data/Gemfile +3 -1
  8. data/README.md +110 -106
  9. data/Rakefile +13 -36
  10. data/dwc-archive.gemspec +24 -19
  11. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  12. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  13. data/features/support/env.rb +1 -1
  14. data/lib/dwc_archive.rb +121 -0
  15. data/lib/dwc_archive/archive.rb +59 -0
  16. data/lib/dwc_archive/classification_normalizer.rb +392 -0
  17. data/lib/dwc_archive/core.rb +25 -0
  18. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  19. data/lib/dwc_archive/expander.rb +88 -0
  20. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  21. data/lib/dwc_archive/generator.rb +90 -0
  22. data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
  23. data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
  24. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  25. data/lib/dwc_archive/ingester.rb +106 -0
  26. data/lib/dwc_archive/metadata.rb +56 -0
  27. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  28. data/lib/dwc_archive/version.rb +6 -0
  29. data/lib/dwc_archive/xml_reader.rb +89 -0
  30. data/spec/files/generator_eml.xml +1 -1
  31. data/spec/lib/classification_normalizer_spec.rb +96 -105
  32. data/spec/lib/core_spec.rb +43 -41
  33. data/spec/lib/darwin_core_spec.rb +111 -132
  34. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  35. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  36. data/spec/lib/generator_spec.rb +73 -74
  37. data/spec/lib/gnub_taxon_spec.rb +14 -16
  38. data/spec/lib/metadata_spec.rb +50 -41
  39. data/spec/lib/taxon_normalized_spec.rb +62 -65
  40. data/spec/lib/xml_reader_spec.rb +9 -12
  41. data/spec/spec_helper.rb +55 -49
  42. metadata +92 -77
  43. data/.rvmrc +0 -1
  44. data/lib/dwc-archive.rb +0 -107
  45. data/lib/dwc-archive/archive.rb +0 -40
  46. data/lib/dwc-archive/classification_normalizer.rb +0 -427
  47. data/lib/dwc-archive/core.rb +0 -19
  48. data/lib/dwc-archive/expander.rb +0 -85
  49. data/lib/dwc-archive/generator.rb +0 -86
  50. data/lib/dwc-archive/ingester.rb +0 -101
  51. data/lib/dwc-archive/metadata.rb +0 -48
  52. data/lib/dwc-archive/version.rb +0 -3
  53. data/lib/dwc-archive/xml_reader.rb +0 -80
data/.rvmrc DELETED
@@ -1 +0,0 @@
1
- rvm use ruby-1.9.3-p392@dwc-archive --create
data/lib/dwc-archive.rb DELETED
@@ -1,107 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- recent_ruby = RUBY_VERSION >= '1.9.1'
4
- raise "IMPORTANT: dwc-archive gem requires ruby >= 1.9.1" unless recent_ruby
5
-
6
- require 'fileutils'
7
- require 'ostruct'
8
- require 'digest'
9
- require 'csv'
10
- require 'logger'
11
- require 'nokogiri'
12
- require_relative 'dwc-archive/xml_reader'
13
- require_relative 'dwc-archive/ingester'
14
- require_relative 'dwc-archive/errors'
15
- require_relative 'dwc-archive/expander'
16
- require_relative 'dwc-archive/archive'
17
- require_relative 'dwc-archive/core'
18
- require_relative 'dwc-archive/extension'
19
- require_relative 'dwc-archive/metadata'
20
- require_relative 'dwc-archive/generator'
21
- require_relative 'dwc-archive/generator_meta_xml'
22
- require_relative 'dwc-archive/generator_eml_xml'
23
- require_relative 'dwc-archive/classification_normalizer'
24
- require_relative 'dwc-archive/version'
25
-
26
- class DarwinCore
27
-
28
- DEFAULT_TMP_DIR = "/tmp"
29
-
30
- attr_reader :archive, :core, :metadata, :extensions,
31
- :classification_normalizer
32
- alias :eml :metadata
33
-
34
-
35
- def self.nil_field?(field)
36
- return true if [nil, '', '/N'].include?(field)
37
- false
38
- end
39
-
40
- def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
41
- Dir.entries(tmp_dir).each do |entry|
42
- path = File.join(tmp_dir, entry)
43
- if FileTest.directory?(path) && entry.match(/^dwc_[\d]+$/)
44
- FileUtils.rm_rf(path)
45
- end
46
- end
47
- end
48
-
49
- def self.logger
50
- @@logger ||= Logger.new(nil)
51
- end
52
-
53
- def self.logger=(logger)
54
- @@logger = logger
55
- end
56
-
57
- def self.logger_reset
58
- self.logger = Logger.new(nil)
59
- end
60
-
61
- def self.logger_write(obj_id, message, method = :info)
62
- self.logger.send(method, "|%s|%s|" % [obj_id, message])
63
- end
64
-
65
- def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
66
- @dwc_path = dwc_path
67
- @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
68
- @core = DarwinCore::Core.new(self)
69
- @metadata = DarwinCore::Metadata.new(@archive)
70
- @extensions = get_extensions
71
- end
72
-
73
- def file_name
74
- File.split(@dwc_path).last
75
- end
76
-
77
- def path
78
- File.expand_path(@dwc_path)
79
- end
80
-
81
- # generates a hash from a classification data with path to each node,
82
- # list of synonyms and vernacular names.
83
- def normalize_classification
84
- return nil unless has_parent_id?
85
- @classification_normalizer ||= DarwinCore::ClassificationNormalizer.
86
- new(self)
87
- @classification_normalizer.normalize
88
- end
89
-
90
- def has_parent_id?
91
- !!@core.fields.join('|').downcase.match(/highertaxonid|parentnameusageid/)
92
- end
93
-
94
- def checksum
95
- Digest::SHA1.hexdigest(open(@dwc_path).read)
96
- end
97
-
98
- private
99
- def get_extensions
100
- res = []
101
- root_key = @archive.meta.keys[0]
102
- ext = @archive.meta[root_key][:extension]
103
- return [] unless ext
104
- ext = [ext] if ext.class != Array
105
- ext.map { |e| DarwinCore::Extension.new(self, e) }
106
- end
107
- end
data/lib/dwc-archive/archive.rb DELETED
@@ -1,40 +0,0 @@
1
- class DarwinCore
2
- class Archive
3
- attr_reader :meta, :eml
4
-
5
- def initialize(archive_path, tmp_dir)
6
- @archive_path = archive_path
7
- @tmp_dir = tmp_dir
8
- @expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
9
- @expander.unpack
10
- if valid?
11
- @meta = DarwinCore::XmlReader.
12
- from_xml(open(File.join(@expander.path, 'meta.xml')))
13
- @eml = files.include?("eml.xml") ?
14
- DarwinCore::XmlReader.
15
- from_xml(open(File.join(@expander.path, 'eml.xml'))) : nil
16
- else
17
- clean
18
- raise InvalidArchiveError
19
- end
20
- end
21
-
22
- def valid?
23
- valid = true
24
- valid = valid && @expander.path && FileTest.exists?(@expander.path)
25
- valid = valid && files && files.include?('meta.xml')
26
- end
27
-
28
- def files
29
- @expander.files
30
- end
31
-
32
- def files_path
33
- @expander.path
34
- end
35
-
36
- def clean
37
- @expander.clean
38
- end
39
- end
40
- end
data/lib/dwc-archive/classification_normalizer.rb DELETED
@@ -1,427 +0,0 @@
1
- # encoding: utf-8
2
- require 'parsley-store'
3
-
4
- class DarwinCore
5
-
6
- class TaxonNormalized
7
- attr_accessor :id, :local_id, :global_id, :source, :parent_id,
8
- :classification_path_id, :classification_path,
9
- :linnean_classification_path, :current_name, :current_name_canonical,
10
- :synonyms, :vernacular_names, :rank, :status
11
-
12
- def initialize
13
- @id = @parent_id = @rank = @status = nil
14
- @current_name = ''
15
- @current_name_canonical = ''
16
- @source = ''
17
- @local_id = ''
18
- @global_id = ''
19
- @classification_path = []
20
- @classification_path_id = []
21
- @synonyms = []
22
- @vernacular_names = []
23
- @linnean_classification_path = []
24
- end
25
-
26
- end
27
-
28
- class GnubTaxon < TaxonNormalized
29
- attr_accessor :uuid, :uuid_path
30
-
31
- def initialize
32
- super
33
- @uuid = nil
34
- @uuid_path = []
35
- end
36
- end
37
-
38
- class SynonymNormalized < Struct.new(:id, :name, :canonical_name,
39
- :status, :source, :local_id,
40
- :global_id);end
41
- class VernacularNormalized < Struct.new(:name, :language, :locality,
42
- :country_code);end
43
-
44
- class ClassificationNormalizer
45
- attr_reader :error_names, :tree, :normalized_data
46
-
47
- def initialize(dwc_instance)
48
- @dwc = dwc_instance
49
- @core_fields = get_fields(@dwc.core)
50
- @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
51
- @normalized_data = {}
52
- @synonyms = {}
53
- @parser = ParsleyStore.new(1,2)
54
- @name_strings = {}
55
- @vernacular_name_strings = {}
56
- @error_names = []
57
- @tree = {}
58
- end
59
-
60
- def darwin_core
61
- @dwc
62
- end
63
-
64
- def add_name_string(name_string)
65
- @name_strings[name_string] = 1 unless @name_strings[name_string]
66
- end
67
-
68
- def add_vernacular_name_string(name_string)
69
- unless @vernacular_name_strings[name_string]
70
- @vernacular_name_strings[name_string] = 1
71
- end
72
- end
73
-
74
- def name_strings(opts = {})
75
- process_strings(@name_strings, opts)
76
- end
77
-
78
- def vernacular_name_strings(opts = {})
79
- process_strings(@vernacular_name_strings, opts)
80
- end
81
-
82
- def normalize(opts = {})
83
- opts = { :with_canonical_names => true,
84
- :with_extensions => true }.merge(opts)
85
- @with_canonical_names = !!opts[:with_canonical_names]
86
- DarwinCore.logger_write(@dwc.object_id,
87
- 'Started normalization of the classification')
88
- ingest_core
89
- DarwinCore.logger_write(@dwc.object_id,
90
- 'Calculating the classification parent/child paths')
91
- has_parent_id? ?
92
- calculate_classification_path :
93
- @normalized_data.keys.each { |id| @tree[id] = {} }
94
- DarwinCore.logger_write(@dwc.object_id, 'Ingesting data from extensions')
95
- if !!opts[:with_extensions]
96
- ingest_extensions
97
- end
98
- @normalized_data
99
- end
100
-
101
- private
102
-
103
- def process_strings(strings, opts)
104
- opts = { with_hash: false }.merge(opts)
105
- if !!opts[:with_hash]
106
- strings
107
- else
108
- strings.keys
109
- end
110
- end
111
-
112
- def get_canonical_name(a_scientific_name)
113
- if @with_canonical_names
114
- canonical_name = @parser.parse(a_scientific_name,
115
- :canonical_only => true)
116
- canonical_name.to_s.empty? ? a_scientific_name : canonical_name
117
- else
118
- nil
119
- end
120
- end
121
-
122
- def get_fields(element)
123
- data = element.fields.inject({}) do |res, f|
124
- field = f[:term].split('/')[-1]
125
- field = field ? field.downcase.to_sym : ''
126
- res[field] = f[:index].to_i
127
- res
128
- end
129
- data[:id] = element.id[:index] if element.id
130
- data
131
- end
132
-
133
- def status_synonym?(status)
134
- status && !!status.match(/^syn/)
135
- end
136
-
137
- def add_synonym_from_core(taxon_id, row)
138
- @synonyms[row[@core_fields[:id]]] = taxon_id
139
- taxon = @normalized_data[row[taxon_id]] ?
140
- @normalized_data[row[taxon_id]] :
141
- @normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
142
- synonym = SynonymNormalized.new(
143
- row[@core_fields[:id]],
144
- row[@core_fields[:scientificname]],
145
- row[@core_fields[:canonicalname]],
146
- @core_fields[:taxonomicstatus] ?
147
- row[@core_fields[:taxonomicstatus]] :
148
- nil,
149
- @core_fields[:source] ? row[@core_fields[:source]] : nil,
150
- @core_fields[:localid] ? row[@core_fields[:localid]] : nil,
151
- @core_fields[:globalid] ? row[@core_fields[:globalid]] : nil,
152
- )
153
- taxon.synonyms << synonym
154
- add_name_string(synonym.name)
155
- add_name_string(synonym.canonical_name)
156
- end
157
-
158
- def set_scientific_name(row, fields)
159
- row[fields[:scientificname]] = 'N/A' unless row[fields[:scientificname]]
160
- canonical_name = nil
161
- scientific_name = row[fields[:scientificname]].strip
162
- if separate_canonical_and_authorship?(row, fields)
163
- if @with_canonical_names
164
- canonical_name = row[fields[:scientificname]].strip
165
- end
166
- scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
167
- else
168
- if @with_canonical_names
169
- canonical_name = get_canonical_name(row[fields[:scientificname]])
170
- end
171
- end
172
- fields[:canonicalname] = row.size
173
- row << canonical_name
174
- row[fields[:scientificname]] = scientific_name
175
- end
176
-
177
- def separate_canonical_and_authorship?(row, fields)
178
- authorship = ''
179
- if fields[:scientificnameauthorship]
180
- authorship = row[fields[:scientificnameauthorship]].to_s.strip
181
- end
182
- !(authorship.empty? || row[fields[:scientificname]].index(authorship))
183
- end
184
-
185
- def ingest_core
186
- @normalized_data = {}
187
- has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
188
- raise DarwinCore::CoreFileError.new('Darwin Core core fields must ' +
189
- 'contain taxon id and scientific name') unless has_name_and_id
190
- @dwc.core.read do |rows|
191
- rows[1].each do |error|
192
- @error_names << { :data => error,
193
- :error => :reading_or_encoding_error }
194
- end
195
- rows[0].each do |r|
196
- set_scientific_name(r, @core_fields)
197
- #core has AcceptedNameUsageId
198
- if @core_fields[:acceptednameusageid] &&
199
- r[@core_fields[:acceptednameusageid]] &&
200
- r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
201
- add_synonym_from_core(@core_fields[:acceptednameusageid], r)
202
- elsif !@core_fields[:acceptednameusageid] &&
203
- @core_fields[:taxonomicstatus] &&
204
- status_synonym?(r[@core_fields[:taxonomicstatus]])
205
- add_synonym_from_core(parent_id, r) if has_parent_id?
206
- else
207
- unless @normalized_data[r[@core_fields[:id]]]
208
- if gnub_archive?
209
- new_taxon = DarwinCore::GnubTaxon.new
210
- else
211
- new_taxon = DarwinCore::TaxonNormalized.new
212
- end
213
- @normalized_data[r[@core_fields[:id]]] = new_taxon
214
- end
215
- taxon = @normalized_data[r[@core_fields[:id]]]
216
- if gnub_archive?
217
- taxon.uuid = r[@core_fields[:originalnameusageid]]
218
- taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
219
- split('|')
220
- end
221
- taxon.id = r[@core_fields[:id]]
222
- taxon.current_name = r[@core_fields[:scientificname]]
223
- taxon.current_name_canonical = r[@core_fields[:canonicalname]]
224
- taxon.parent_id = has_parent_id? ? r[parent_id] : nil
225
- taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
226
- if @core_fields[:taxonomicstatus]
227
- taxon.status = r[@core_fields[:taxonomicstatus]]
228
- end
229
- taxon.source = r[@core_fields[:source]] if @core_fields[:source]
230
- taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
231
- if @core_fields[:globalid]
232
- taxon.global_id = r[@core_fields[:globalid]]
233
- end
234
- taxon.linnean_classification_path =
235
- get_linnean_classification_path(r, taxon)
236
- add_name_string(taxon.current_name)
237
- has_canonical = taxon.current_name_canonical &&
238
- !taxon.current_name_canonical.empty?
239
- add_name_string(taxon.current_name_canonical) if has_canonical
240
- end
241
- end
242
- end
243
- end
244
-
245
- def has_parent_id?
246
- @has_parent_id ||= @core_fields.has_key?(:highertaxonid) ||
247
- @core_fields.has_key?(:parentnameusageid)
248
- end
249
-
250
- def parent_id
251
- parent_id_field = @core_fields[:highertaxonid] ||
252
- @core_fields[:parentnameusageid]
253
- end
254
-
255
- def calculate_classification_path
256
- @paths_num = 0
257
- @normalized_data.each do |taxon_id, taxon|
258
- next if !taxon.classification_path_id.empty?
259
- res = get_classification_path(taxon)
260
- next if res == 'error'
261
- end
262
- end
263
-
264
- def get_classification_path(taxon)
265
- return if !taxon.classification_path_id.empty?
266
- @paths_num += 1
267
- if @paths_num % 10000 == 0
268
- DarwinCore.logger_write(@dwc.object_id,
269
- "Calculated %s paths" % @paths_num)
270
- end
271
- current_node = {taxon.id => {}}
272
- if DarwinCore.nil_field?(taxon.parent_id)
273
- if @with_canonical_names
274
- taxon.classification_path << taxon.current_name_canonical
275
- end
276
- taxon.classification_path_id << taxon.id
277
- @tree.merge!(current_node)
278
- else
279
- parent_cp = parent_cpid = nil
280
- if @normalized_data[taxon.parent_id]
281
- if @with_canonical_names
282
- parent_cp = @normalized_data[taxon.parent_id].classification_path
283
- end
284
- parent_cpid = @normalized_data[taxon.parent_id].
285
- classification_path_id
286
- else
287
- current_parent = @normalized_data[@synonyms[taxon.parent_id]]
288
- if current_parent
289
- error = 'WARNING: The parent of the taxon ' +
290
- "\'#{taxon.current_name}\' is deprecated"
291
- @error_names << {:data => taxon,
292
- :error => :deprecated_parent,
293
- :current_parent => current_parent }
294
-
295
- if @with_canonical_names
296
- parent_cp = current_parent.classification_path
297
- end
298
- parent_cpid = current_parent.classification_path_id
299
- else
300
- error = 'WARNING: The parent of the taxon ' +
301
- "\'#{taxon.current_name}\' not found"
302
- @error_names << {:data => taxon,
303
- :error => :deprecated_parent, :current_parent => nil}
304
- end
305
- end
306
- return 'error' unless parent_cpid
307
- if parent_cpid.empty?
308
- res = 'error'
309
- begin
310
- res = get_classification_path(@normalized_data[taxon.parent_id])
311
- rescue SystemStackError
312
- @error_names << {:data => taxon,
313
- :error => :too_deep_hierarchy, :current_parent => nil}
314
- end
315
- return res if res == 'error'
316
- if @with_canonical_names
317
- taxon.classification_path += @normalized_data[taxon.parent_id].
318
- classification_path + [taxon.current_name_canonical]
319
- end
320
- taxon.classification_path_id += @normalized_data[taxon.parent_id].
321
- classification_path_id + [taxon.id]
322
- parent_node = @normalized_data[taxon.parent_id].
323
- classification_path_id.inject(@tree) {|node, id| node[id]}
324
- parent_node.merge!(current_node)
325
- else
326
- taxon.classification_path += parent_cp +
327
- [taxon.current_name_canonical] if @with_canonical_names
328
- taxon.classification_path_id += parent_cpid + [taxon.id]
329
- parent_node = @normalized_data[taxon.parent_id].
330
- classification_path_id.inject(@tree) {|node, id| node[id]}
331
- begin
332
- parent_node.merge!(current_node)
333
- rescue NoMethodError => e
334
- DarwinCore.logger_write(@dwc.object_id,
335
- "Error '%s' taxon %s" % [e.message, taxon.id])
336
- return 'error'
337
- end
338
- end
339
- end
340
- end
341
-
342
- def ingest_extensions
343
- @extensions.each do |e|
344
- ext, fields = *e
345
- ingest_synonyms(e) if (File.split(e[0].file_path).
346
- last.match(/synonym/i) &&
347
- fields.keys.include?(:scientificname))
348
- ingest_vernaculars(e) if fields.keys.include? :vernacularname
349
- end
350
- end
351
-
352
- def ingest_synonyms(extension)
353
- DarwinCore.logger_write(@dwc.object_id, 'Ingesting synonyms extension')
354
- ext, fields = *extension
355
- ext.read do |rows|
356
- rows[0].each do |r|
357
- set_scientific_name(r, fields)
358
- synonym = SynonymNormalized.new(
359
- nil,
360
- r[fields[:scientificname]],
361
- r[fields[:canonicalname]],
362
- fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil,
363
- fields[:source] ? r[fields[:source]] : nil,
364
- fields[:localid] ? r[fields[:localid]] : nil,
365
- fields[:globalid] ? r[fields[:globalid]] : nil,
366
- )
367
- if @normalized_data[r[fields[:id]]]
368
- @normalized_data[r[fields[:id]]].synonyms << synonym
369
- add_name_string(synonym.name)
370
- add_name_string(synonym.canonical_name)
371
- else
372
- @error_names << { :taxon => synonym,
373
- :error => :synonym_of_unknown_taxa }
374
- end
375
- end
376
- end
377
- end
378
-
379
- def ingest_vernaculars(extension)
380
- DarwinCore.logger_write(@dwc.object_id,
381
- 'Ingesting vernacular names extension')
382
- ext, fields = *extension
383
- ext.read do |rows|
384
- rows[0].each do |r|
385
-
386
- language = nil
387
- if fields[:language]
388
- language = r[fields[:language]]
389
- elsif fields[:languagecode]
390
- language = r[fields[:languagecode]]
391
- end
392
-
393
- locality = fields[:locality] ? r[fields[:locality]] : nil
394
-
395
- country_code = fields[:countrycode] ? r[fields[:countrycode]] : nil
396
-
397
- vernacular = VernacularNormalized.new(
398
- r[fields[:vernacularname]],
399
- language,
400
- locality,
401
- country_code)
402
- if @normalized_data[r[fields[:id]]]
403
- @normalized_data[r[fields[:id]]].vernacular_names << vernacular
404
- add_vernacular_name_string(vernacular.name)
405
- else
406
- @error_names << { :vernacular_name => vernacular,
407
- :error => :vernacular_of_unknown_taxa }
408
- end
409
- end
410
- end
411
- end
412
-
413
- #Collect linnean classification path only on species level
414
- def get_linnean_classification_path(row, taxon)
415
- res = []
416
- [:kingdom, :phylum, :class,
417
- :order, :family, :genus, :subgenus].each do |clade|
418
- res << [row[@core_fields[clade]], clade] if @core_fields[clade]
419
- end
420
- res
421
- end
422
-
423
- def gnub_archive?
424
- @core_fields[:originalnameusageidpath]
425
- end
426
- end
427
- end