dwc-archive 0.9.11 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -1
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +4 -7
  7. data/CHANGELOG +4 -0
  8. data/Gemfile +3 -1
  9. data/LICENSE +1 -1
  10. data/README.md +114 -109
  11. data/Rakefile +13 -36
  12. data/dwc-archive.gemspec +23 -19
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +124 -0
  17. data/lib/dwc_archive/archive.rb +60 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
  21. data/lib/dwc_archive/expander.rb +88 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +91 -0
  24. data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
  25. data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +57 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +90 -0
  32. data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
  33. data/spec/files/generator_eml.xml +1 -1
  34. data/spec/lib/classification_normalizer_spec.rb +96 -105
  35. data/spec/lib/core_spec.rb +43 -41
  36. data/spec/lib/darwin_core_spec.rb +108 -138
  37. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  38. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  39. data/spec/lib/generator_spec.rb +73 -74
  40. data/spec/lib/gnub_taxon_spec.rb +15 -17
  41. data/spec/lib/metadata_spec.rb +50 -41
  42. data/spec/lib/taxon_normalized_spec.rb +62 -65
  43. data/spec/lib/xml_reader_spec.rb +9 -12
  44. data/spec/spec_helper.rb +54 -51
  45. metadata +101 -87
  46. data/.rvmrc +0 -1
  47. data/lib/dwc-archive.rb +0 -107
  48. data/lib/dwc-archive/archive.rb +0 -40
  49. data/lib/dwc-archive/classification_normalizer.rb +0 -427
  50. data/lib/dwc-archive/core.rb +0 -19
  51. data/lib/dwc-archive/expander.rb +0 -85
  52. data/lib/dwc-archive/generator.rb +0 -86
  53. data/lib/dwc-archive/ingester.rb +0 -101
  54. data/lib/dwc-archive/metadata.rb +0 -48
  55. data/lib/dwc-archive/version.rb +0 -3
  56. data/lib/dwc-archive/xml_reader.rb +0 -80
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Core part of a DarwinCore Archive, built from the :core entry found
  # under the root key of the archive's parsed meta.xml.
  class Core
    include DarwinCore::Ingester

    attr_reader :id

    # dwc - object exposing #archive; the archive supplies #files_path
    #       and #meta
    #
    # Raises DarwinCore::CoreFileError when meta has no :core entry.
    def initialize(dwc)
      @dwc = dwc
      @archive = dwc.archive
      @path = @archive.files_path
      root = @archive.meta.keys.first
      @data = @archive.meta[root][:core]
      @data || raise(DarwinCore::CoreFileError,
                     "Cannot find core in meta.xml, is meta.xml valid?")
      @id = @data[:id][:attributes]
      init_attributes
    end
  end
end
@@ -1,11 +1,21 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class DarwinCore
2
4
  class Error < RuntimeError; end
5
+
3
6
  class FileNotFoundError < Error; end
7
+
4
8
  class UnpackingError < Error; end
9
+
5
10
  class InvalidArchiveError < Error; end
11
+
6
12
  class CoreFileError < Error; end
13
+
7
14
  class ExtensionFileError < Error; end
15
+
8
16
  class GeneratorError < Error; end
17
+
9
18
  class ParentNotCurrentError < Error; end
19
+
10
20
  class EncodingError < Error; end
11
21
  end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Unpacks a compressed archive (.tar.gz or .zip) into a directory under
  # tmp_dir and finds the directory that contains meta.xml.
  class Expander
    # archive_path - path of the compressed archive to unpack
    # tmp_dir      - directory handed to DarwinCore.random_path to obtain
    #                the unpack location
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @dir_path = DarwinCore.random_path(tmp_dir)
      @unpacker = init_unpacker
    end

    # Unpacks the archive into the unpack directory.
    #
    # Returns true on success.
    # Raises DarwinCore::FileNotFoundError if the archive file is missing.
    # Raises DarwinCore::UnpackingError if the archive type is unsupported
    # or the external tar/unzip command does not succeed.
    def unpack
      clean
      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)

      # Kernel#system answers true only for a zero exit status, so no
      # separate inspection of the child status is needed.
      return true if @unpacker&.call(@dir_path, @archive_path)

      clean
      raise DarwinCore::UnpackingError
    end

    # Directory that holds meta.xml (memoized); nil when none was found.
    def path
      @path ||= files_path
    end

    # Removes the unpack directory via DarwinCore.clean.
    def clean
      DarwinCore.clean(@dir_path)
    end

    def files
      DarwinCore.files(path)
    end

    private

    # Picks the unpacking strategy from the archive file name;
    # nil when the extension is not recognised.
    def init_unpacker
      return tar_unpacker if @archive_path =~ /tar\.gz\z/i
      return zip_unpacker if @archive_path =~ /zip\z/i

      nil
    end

    def tar_unpacker
      proc do |tmp_path, archive_path|
        FileUtils.mkdir tmp_path
        run_quietly("tar", "-zxf", archive_path, "-C", tmp_path)
      end
    end

    def zip_unpacker
      proc do |tmp_path, archive_path|
        run_quietly("unzip", "-qq", "-d", tmp_path, archive_path)
      end
    end

    # Runs an external command without a shell, so paths containing quotes,
    # spaces or other metacharacters are passed through verbatim.
    # Output of the command is discarded.
    def run_quietly(*command)
      system(*command, out: File::NULL, err: File::NULL)
    end

    # Sorted directory entries without the "." and ".." pseudo-entries.
    # Names that merely end in a dot are kept.
    def path_entries(dir)
      (Dir.entries(dir) - %w[. ..]).sort
    end

    def files_path
      entries = path_entries(@dir_path)
      entries.include?("meta.xml") ? @dir_path : search_for_file_path(entries)
    end

    # First immediate subdirectory that contains meta.xml, or nil.
    def search_for_file_path(entries)
      candidates = entries.map { |e| File.join(@dir_path, e) }
      candidates.find do |dir|
        FileTest.directory?(dir) && path_entries(dir).include?("meta.xml")
      end
    end
  end
end
@@ -1,8 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class DarwinCore
4
+ # Represents extensions of DarwinCore Archive
2
5
  class Extension
3
6
  include DarwinCore::Ingester
4
7
  attr_reader :coreid
5
- alias :id :coreid
8
+ alias id coreid
6
9
 
7
10
  def initialize(dwc, data)
8
11
  @dwc = dwc
@@ -10,8 +13,7 @@ class DarwinCore
10
13
  @path = @archive.files_path
11
14
  @data = data
12
15
  @coreid = @data[:coreid][:attributes]
13
- get_attributes(DarwinCore::ExtensionFileError)
16
+ init_attributes
14
17
  end
15
-
16
18
  end
17
19
  end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Creates csv files for core and extensions, adds meta.xml and eml.xml
  # next to them, and packs the working directory into a tar.gz archive.
  class Generator
    attr_reader :eml_xml_data, :path

    # dwc_path - path of the archive written by #pack
    # tmp_dir  - directory handed to DarwinCore.random_path to obtain the
    #            working directory, which is created here
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = DarwinCore.random_path(tmp_dir)
      FileUtils.mkdir(@path)
      @meta_xml_data = { extensions: [] }
      @eml_xml_data = { id: nil, title: nil, authors: [], abstract: nil,
                        citation: nil, url: nil }
      @write = 'w:utf-8'
    end

    # Removes the working directory via DarwinCore.clean.
    def clean
      DarwinCore.clean(@path)
    end

    # Writes the core csv file and records its description for meta.xml.
    #
    # data         - array of rows; the first row is the header and every
    #                header field must start with "http://"
    # file_name    - name of the csv file inside the working directory
    # keep_headers - when true a header row (last segment of each field
    #                url) is written to the file
    #
    # Raises DarwinCore::GeneratorError when the header is missing/invalid.
    def add_core(data, file_name, keep_headers = true)
      opts = { type: 'core', data: data, file_name: file_name,
               keep_headers: keep_headers }
      prepare_csv_file(opts)
    end

    # Same as #add_core for an extension file; row_type is recorded as the
    # extension's rowType.
    def add_extension(data, file_name, keep_headers = true,
                      row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
      opts = { type: 'extension', data: data, file_name: file_name,
               keep_headers: keep_headers, row_type: row_type }
      prepare_csv_file(opts)
    end

    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    def files
      DarwinCore.files(@path)
    end

    # Packs the non-hidden entries of the working directory into the
    # tar.gz file at dwc_path (a relative dwc_path is resolved against the
    # working directory). Returns the result of Kernel#system, or false
    # when the working directory no longer exists.
    def pack
      return false unless File.directory?(@path)

      # Same selection a shell `*` would make: sorted, dotfiles excluded.
      entries = Dir.entries(@path).reject { |e| e.start_with?('.') }.sort
      # Argument-vector form: no shell, so paths need no quoting.
      system('tar', '-zcf', @dwc_path, '--', *entries, chdir: @path)
    end

    private

    # Validates the header first, then writes the rows; the caller's data
    # is left untouched and no file is created for invalid input.
    def prepare_csv_file(opts)
      header, *rows = opts[:data]
      fields = init_fields(header, opts[:type])
      rows.unshift(fields) if opts[:keep_headers]

      CSV.open(File.join(@path, opts[:file_name]), @write) do |csv|
        rows.each { |row| csv << row }
      end
      register_attributes(prepare_attributes(opts, header), opts[:type])
      nil
    end

    def register_attributes(attributes, type)
      if type == 'core'
        @meta_xml_data[:core] = attributes
      else
        @meta_xml_data[:extensions] << attributes
      end
    end

    # Description of one csv file as consumed by the meta.xml generator.
    def prepare_attributes(opts, header)
      res = { fields: header,
              ignoreHeaderLines: opts[:keep_headers] ? 1 : 0,
              location: opts[:file_name] }
      res[:rowType] = opts[:row_type] if opts[:row_type]
      res
    end

    # Converts header urls into short field names (last path segment).
    def init_fields(header, file_type)
      err = "No header in #{file_type} data, or header fields are not urls"
      raise DarwinCore::GeneratorError, err unless header.respond_to?(:map)

      header.map do |f|
        f = f.to_s.strip
        raise DarwinCore::GeneratorError, err unless f.start_with?('http://')

        f.split('/')[-1]
      end
    end
  end
end
@@ -1,41 +1,54 @@
1
1
  class DarwinCore
2
2
  class Generator
3
+ # Creates EML file with meta information about archive
3
4
  class EmlXml
5
+ SCHEMA_DATA = {
6
+ :"xml:lang" => "en",
7
+ :"xmlns:eml" => "eml://ecoinformatics.org/eml-2.1.1",
8
+ :"xmlns:md" => "eml://ecoinformatics.org/methods-2.1.1",
9
+ :"xmlns:proj" => "eml://ecoinformatics.org/project-2.1.1",
10
+ :"xmlns:d" => "eml://ecoinformatics.org/dataset-2.1.1",
11
+ :"xmlns:res" => "eml://ecoinformatics.org/resource-2.1.1",
12
+ :"xmlns:dc" => "http://purl.org/dc/terms/",
13
+ :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
14
+ :"xsi:schemaLocation" => "eml://ecoinformatics.org/eml-2.1.1 "\
15
+ "http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
16
+ }
4
17
 
5
18
  def initialize(data, path)
6
19
  @data = data
7
20
  @path = path
8
- @write = 'w:utf-8'
21
+ @write = "w:utf-8"
9
22
  end
10
23
 
11
24
  def create
12
- eml_uri = 'eml://ecoinformatics.org/eml-2.1.1' +
13
- ' http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd'
25
+ schema_data = {
26
+ packageId: "#{@data[:id]}/#{timestamp}",
27
+ system: @data[:system] || "http://globalnames.org"
28
+ }.merge(SCHEMA_DATA)
14
29
  builder = Nokogiri::XML::Builder.new do |xml|
15
- xml.eml(packageId: "%s/%s" % [@data[:id], timestamp],
16
- system: @data[:system] || 'http://globalnames.org',
17
- :'xml:lang' => 'en',
18
- :'xmlns:eml' => 'eml://ecoinformatics.org/eml-2.1.1',
19
- :'xmlns:md' => 'eml://ecoinformatics.org/methods-2.1.1',
20
- :'xmlns:proj' => 'eml://ecoinformatics.org/project-2.1.1',
21
- :'xmlns:d' => 'eml://ecoinformatics.org/dataset-2.1.1',
22
- :'xmlns:res' => 'eml://ecoinformatics.org/resource-2.1.1',
23
- :'xmlns:dc' => 'http://purl.org/dc/terms/',
24
- :'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
25
- :'xsi:schemaLocation' => 'eml_uri') do
26
- build_dataset(xml)
27
- build_additional_metadata(xml)
28
- xml.parent.namespace = xml.parent.namespace_definitions.first
30
+ xml.eml(schema_data) do
31
+ build_body(xml)
29
32
  end
30
33
  end
34
+ save_eml(builder)
35
+ end
36
+
37
+ private
38
+
39
+ def build_body(xml)
40
+ build_dataset(xml)
41
+ build_additional_metadata(xml)
42
+ xml.parent.namespace = xml.parent.namespace_definitions.first
43
+ end
44
+
45
+ def save_eml(builder)
31
46
  data = builder.to_xml
32
- f = open(File.join(@path, 'eml.xml'), @write)
47
+ f = open(File.join(@path, "eml.xml"), @write)
33
48
  f.write(data)
34
49
  f.close
35
50
  end
36
51
 
37
- private
38
-
39
52
  def build_dataset(xml)
40
53
  xml.dataset(id: @data[:id]) do
41
54
  xml.title(@data[:title])
@@ -50,30 +63,24 @@ class DarwinCore
50
63
  end
51
64
 
52
65
  def build_abstract(xml)
53
- xml.abstract() do
54
- xml.para(@data[:abstract])
55
- end
66
+ xml.abstract { xml.para(@data[:abstract]) }
56
67
  end
57
68
 
58
69
  def build_contacts(xml, contacts)
59
- contacts.each do |contact|
60
- xml.contact { xml.references(contact) }
61
- end
70
+ contacts.each { |contact| xml.contact { xml.references(contact) } }
62
71
  end
63
72
 
64
73
  def build_metadata_providers(xml)
65
- @data[:metadata_providers].each_with_index do |a, i|
66
- xml.metadataProvider do
67
- build_person(xml, a)
68
- end
74
+ @data[:metadata_providers].each do |a|
75
+ xml.metadataProvider { build_person(xml, a) }
69
76
  end if @data[:metadata_providers]
70
77
  end
71
-
78
+
72
79
  def build_authors(xml, contacts)
73
80
  @data[:authors].each_with_index do |a, i|
74
81
  creator_id = i + 1
75
82
  contacts << creator_id
76
- xml.creator(id: creator_id, scope: 'document') do
83
+ xml.creator(id: creator_id, scope: "document") do
77
84
  build_person(xml, a)
78
85
  end
79
86
  end
@@ -102,7 +109,7 @@ class DarwinCore
102
109
 
103
110
  def timestamp
104
111
  t = Time.now.getutc.to_a[0..5].reverse
105
- t[0..2].join('-') + '::' + t[-3..-1].join(':')
112
+ t[0..2] * ("-") + "::" + t[-3..-1] * (":")
106
113
  end
107
114
  end
108
115
  end
@@ -1,42 +1,45 @@
1
1
  class DarwinCore
2
2
  class Generator
3
+ # Creates DarwinCore meta file
3
4
  class MetaXml
4
5
  def initialize(data, path)
5
6
  @data = data
6
7
  @path = path
7
- @write = 'w:utf-8'
8
+ @write = "w:utf-8"
8
9
  end
9
10
 
10
11
  def create
11
- schema_uri = 'http://rs.tdwg.org/dwc/terms/xsd/archive/' +
12
- ' http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd'
12
+ schema_uri = "http://rs.tdwg.org/dwc/terms/xsd/archive/ "\
13
+ "http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
13
14
  builder = Nokogiri::XML::Builder.new do |xml|
14
- opts = { encoding: 'UTF-8',
15
- fieldsTerminatedBy: ',',
16
- fieldsEnclosedBy: '"',
17
- linesTerminatedBy: "\n",
18
- rowType: 'http://rs.tdwg.org/dwc/terms/Taxon' }
15
+ opts = { encoding: "UTF-8", fieldsTerminatedBy: ",",
16
+ fieldsEnclosedBy: '"', linesTerminatedBy: "\n",
17
+ rowType: "http://rs.tdwg.org/dwc/terms/Taxon" }
19
18
  build_archive(xml, opts, schema_uri)
20
19
  end
20
+ save_meta(builder)
21
+ end
22
+
23
+ private
24
+
25
+ def save_meta(builder)
21
26
  meta_xml_data = builder.to_xml
22
- meta_file = open(File.join(@path, 'meta.xml'), @write)
27
+ meta_file = open(File.join(@path, "meta.xml"), @write)
23
28
  meta_file.write(meta_xml_data)
24
29
  meta_file.close
25
30
  end
26
31
 
27
- private
28
-
29
32
  def build_archive(xml, opts, schema_uri)
30
- xml.archive(xmlns: 'http://rs.tdwg.org/dwc/text/',
31
- :'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
32
- :'xsi:schemaLocation' => schema_uri) do
33
+ xml.archive(xmlns: "http://rs.tdwg.org/dwc/text/",
34
+ :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
35
+ :"xsi:schemaLocation" => schema_uri) do
33
36
  build_core(xml, opts)
34
37
  build_extensions(xml, opts)
35
38
  end
36
39
  end
37
40
 
38
41
  def build_core(xml, opts)
39
- xml.core(opts.merge(ignoreHeaderLines:
42
+ xml.core(opts.merge(ignoreHeaderLines:
40
43
  @data[:core][:ignoreHeaderLines])) do
41
44
  xml.files { xml.location(@data[:core][:location]) }
42
45
  taxon_id, fields = find_taxon_id(@data[:core][:fields])
@@ -47,7 +50,7 @@ class DarwinCore
47
50
 
48
51
  def build_extensions(xml, opts)
49
52
  @data[:extensions].each do |e|
50
- xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
53
+ xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
51
54
  rowType: e[:rowType])) do
52
55
  xml.files { xml.location(e[:location]) }
53
56
  taxon_id, fields = find_taxon_id(e[:fields])
@@ -60,12 +63,10 @@ class DarwinCore
60
63
  def find_taxon_id(data)
61
64
  fields = []
62
65
  data.each_with_index { |f, i| fields << [f.strip, i] }
63
- taxon_id, fields = fields.partition { |f| f[0].match(%r|/taxonid$|i) }
64
- raise DarwinCore::GeneratorError if taxon_id.size != 1
66
+ taxon_id, fields = fields.partition { |f| f[0].match(/\/taxonid$/i) }
67
+ fail DarwinCore::GeneratorError if taxon_id.size != 1
65
68
  [taxon_id[0], fields]
66
69
  end
67
-
68
70
  end
69
71
  end
70
72
  end
71
-