dwc-archive 0.9.10 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -1
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +4 -7
  7. data/CHANGELOG +14 -8
  8. data/Gemfile +3 -1
  9. data/LICENSE +1 -1
  10. data/README.md +119 -107
  11. data/Rakefile +13 -36
  12. data/dwc-archive.gemspec +23 -19
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +124 -0
  17. data/lib/dwc_archive/archive.rb +60 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
  21. data/lib/dwc_archive/expander.rb +88 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +91 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +57 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +90 -0
  32. data/spec/files/file with characters(3).gz → data/spec/files/file with characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +96 -105
  36. data/spec/lib/core_spec.rb +43 -41
  37. data/spec/lib/darwin_core_spec.rb +108 -138
  38. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  39. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  40. data/spec/lib/generator_spec.rb +77 -69
  41. data/spec/lib/gnub_taxon_spec.rb +15 -17
  42. data/spec/lib/metadata_spec.rb +50 -41
  43. data/spec/lib/taxon_normalized_spec.rb +62 -65
  44. data/spec/lib/xml_reader_spec.rb +9 -12
  45. data/spec/spec_helper.rb +54 -51
  46. metadata +105 -88
  47. data/.rvmrc +0 -1
  48. data/] +0 -40
  49. data/lib/dwc-archive.rb +0 -107
  50. data/lib/dwc-archive/archive.rb +0 -40
  51. data/lib/dwc-archive/classification_normalizer.rb +0 -428
  52. data/lib/dwc-archive/core.rb +0 -17
  53. data/lib/dwc-archive/expander.rb +0 -84
  54. data/lib/dwc-archive/generator.rb +0 -85
  55. data/lib/dwc-archive/generator_eml_xml.rb +0 -86
  56. data/lib/dwc-archive/generator_meta_xml.rb +0 -58
  57. data/lib/dwc-archive/ingester.rb +0 -101
  58. data/lib/dwc-archive/metadata.rb +0 -48
  59. data/lib/dwc-archive/version.rb +0 -3
  60. data/lib/dwc-archive/xml_reader.rb +0 -64
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Represents core of the DarwinCore Archive
  #
  # Reads the :core section found under the first root key of the archive's
  # meta data and exposes the attributes of its :id entry through #id.
  class Core
    include DarwinCore::Ingester
    attr_reader :id

    # @param dwc [Object] object whose #archive responds to #files_path and
    #   #meta (presumably a DarwinCore instance -- confirm against callers)
    # @raise [DarwinCore::CoreFileError] when meta has no root entry or the
    #   root entry has no :core section
    def initialize(dwc)
      @dwc = dwc
      @archive = @dwc.archive
      @path = @archive.files_path
      @data = find_core_data
      @id = @data[:id][:attributes]
      # init_attributes is not defined in this class; presumably it comes
      # from DarwinCore::Ingester.
      init_attributes
    end

    private

    # Returns the :core section of meta. An empty meta (no root key) is
    # reported with the same CoreFileError as a missing :core section instead
    # of failing with NoMethodError on nil.
    def find_core_data
      meta = @archive.meta
      root = meta[meta.keys[0]]
      data = root && root[:core]
      return data if data

      raise DarwinCore::CoreFileError,
            "Cannot find core in meta.xml, is meta.xml valid?"
    end
  end
end
@@ -1,11 +1,21 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class DarwinCore
2
4
  class Error < RuntimeError; end
5
+
3
6
  class FileNotFoundError < Error; end
7
+
4
8
  class UnpackingError < Error; end
9
+
5
10
  class InvalidArchiveError < Error; end
11
+
6
12
  class CoreFileError < Error; end
13
+
7
14
  class ExtensionFileError < Error; end
15
+
8
16
  class GeneratorError < Error; end
17
+
9
18
  class ParentNotCurrentError < Error; end
19
+
10
20
  class EncodingError < Error; end
11
21
  end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Unpacks compressed archives into a temp directory
  #
  # Supports archives whose path ends in "tar.gz" or "zip" (case-insensitive).
  # The external `tar` / `unzip` programs are invoked with an argument list
  # (no shell), so paths containing spaces, quotes or backslashes are passed
  # through verbatim.
  class Expander
    # @param archive_path [String] path to the compressed archive
    # @param tmp_dir [String] directory under which a randomly named
    #   extraction directory is chosen (via DarwinCore.random_path)
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @dir_path = DarwinCore.random_path(tmp_dir)
      @unpacker = init_unpacker
    end

    # Extracts the archive into the extraction directory.
    #
    # @return [true] when the external unpacker exits with status 0
    # @raise [DarwinCore::FileNotFoundError] when the archive does not exist
    # @raise [DarwinCore::UnpackingError] when the archive type is not
    #   supported or the external program fails; the extraction directory is
    #   cleaned before raising
    def unpack
      clean
      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)

      # Kernel#system returns true only for exit status 0 (false for a
      # non-zero status, nil when the command cannot be run), so no separate
      # look at the child status is needed.
      return true if @unpacker&.call(@dir_path, @archive_path)

      clean
      raise DarwinCore::UnpackingError
    end

    # @return [String, nil] directory that contains meta.xml: the extraction
    #   directory itself or one of its immediate subdirectories; nil when
    #   neither contains it. The result is memoized once it is non-nil.
    def path
      @path ||= files_path
    end

    # Cleans the extraction directory (delegates to DarwinCore.clean).
    def clean
      DarwinCore.clean(@dir_path)
    end

    # Files found under #path (delegates to DarwinCore.files).
    def files
      DarwinCore.files(path)
    end

    private

    # Picks the unpacker from the archive file name; nil for unknown types.
    def init_unpacker
      case @archive_path
      when /tar\.gz\z/i then tar_unpacker
      when /zip\z/i then zip_unpacker
      end
    end

    # tar needs the target directory to exist before extracting into it.
    def tar_unpacker
      lambda do |tmp_path, archive_path|
        FileUtils.mkdir tmp_path
        run_quietly("tar", "-zxf", archive_path.to_s, "-C", tmp_path.to_s)
      end
    end

    def zip_unpacker
      lambda do |tmp_path, archive_path|
        run_quietly("unzip", "-qq", "-d", tmp_path.to_s, archive_path.to_s)
      end
    end

    # Runs an external command without a shell, discarding its output.
    def run_quietly(*command)
      system(*command, out: File::NULL, err: File::NULL)
    end

    # Sorted directory entries without the "." and ".." pseudo entries.
    # Only those two exact names are dropped; entries that merely end with a
    # dot are kept.
    def path_entries(dir)
      Dir.entries(dir).reject { |e| %w[. ..].include?(e) }.sort
    end

    def files_path
      entries = path_entries(@dir_path)
      entries.include?("meta.xml") ? @dir_path : search_for_file_path(entries)
    end

    # First immediate subdirectory (in sorted order) that holds a meta.xml.
    def search_for_file_path(entries)
      entries
        .map { |e| File.join(@dir_path, e) }
        .find do |candidate|
          FileTest.directory?(candidate) &&
            path_entries(candidate).include?("meta.xml")
        end
    end
  end
end
@@ -1,8 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class DarwinCore
4
+ # Represents extensions of DarwinCore Archive
2
5
  class Extension
3
6
  include DarwinCore::Ingester
4
7
  attr_reader :coreid
5
- alias :id :coreid
8
+ alias id coreid
6
9
 
7
10
  def initialize(dwc, data)
8
11
  @dwc = dwc
@@ -10,8 +13,7 @@ class DarwinCore
10
13
  @path = @archive.files_path
11
14
  @data = data
12
15
  @coreid = @data[:coreid][:attributes]
13
- get_attributes(DarwinCore::ExtensionFileError)
16
+ init_attributes
14
17
  end
15
-
16
18
  end
17
19
  end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Creates csv files for core and extensions
  #
  # Files are written into a randomly named working directory (#path) and can
  # then be bundled into a tar.gz archive with #pack.
  class Generator
    attr_reader :eml_xml_data, :path

    # @param dwc_path [String] path of the archive that #pack creates
    # @param tmp_dir [String] directory in which the working directory is
    #   created
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = DarwinCore.random_path(tmp_dir)
      FileUtils.mkdir(@path)
      @meta_xml_data = { extensions: [] }
      @eml_xml_data = { id: nil, title: nil, authors: [], abstract: nil,
                        citation: nil, url: nil }
      @write = 'w:utf-8'
    end

    # Cleans the working directory (delegates to DarwinCore.clean).
    def clean
      DarwinCore.clean(@path)
    end

    # Writes the core csv file and records its description for meta.xml.
    #
    # @param data [Array<Array>] rows; the first row is the header and must
    #   consist of http:// term urls. The array is not modified.
    # @param file_name [String] name of the csv file inside #path
    # @param keep_headers [Boolean] when true the csv starts with a row of
    #   short field names (last url segment) and ignoreHeaderLines is 1;
    #   otherwise no header row is written and ignoreHeaderLines is 0
    # @raise [DarwinCore::GeneratorError] when the header is missing or a
    #   header field is not an http:// url
    def add_core(data, file_name, keep_headers = true)
      opts = { type: 'core', data: data, file_name: file_name,
               keep_headers: keep_headers }
      prepare_csv_file(opts)
    end

    # Writes an extension csv file; same contract as #add_core, plus the
    # row type recorded for the extension.
    def add_extension(data, file_name, keep_headers = true,
                      row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
      opts = { type: 'extension', data: data, file_name: file_name,
               keep_headers: keep_headers, row_type: row_type }
      prepare_csv_file(opts)
    end

    # Creates meta.xml from the files registered so far.
    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    # Creates eml.xml from the given data and remembers the data.
    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    # Files in the working directory (delegates to DarwinCore.files).
    def files
      DarwinCore.files(@path)
    end

    # Bundles the non-hidden entries of the working directory into a gzipped
    # tar archive at dwc_path. tar is started without a shell and with the
    # working directory as its current directory, so paths with spaces or
    # quotes are safe and tar never runs in an unintended directory.
    # A relative dwc_path is resolved against the working directory and a
    # leading "~" is expanded; shell variables in the path are not expanded.
    #
    # @return [Boolean, nil] result of Kernel#system
    def pack
      entries = Dir.entries(@path).reject { |e| e.start_with?('.') }.sort
      system('tar', '-zcf', File.expand_path(@dwc_path, @path), *entries,
             chdir: @path)
    end

    private

    # Validates the header first, so that invalid data neither leaves an
    # empty csv file behind nor an unclosed file handle.
    def prepare_csv_file(opts)
      header, *rows = opts[:data]
      fields = init_fields(header, opts[:type])
      rows.unshift(fields) if opts[:keep_headers]
      register_attributes(opts[:type], prepare_attributes(opts, header))
      CSV.open(File.join(@path, opts[:file_name]), @write) do |csv|
        rows.each { |row| csv << row }
      end
      nil
    end

    def register_attributes(type, attributes)
      if type == 'core'
        @meta_xml_data[:core] = attributes
      else
        @meta_xml_data[:extensions] << attributes
      end
    end

    # Description of one csv file as stored in the meta.xml data.
    def prepare_attributes(opts, header)
      res = { fields: header,
              ignoreHeaderLines: opts[:keep_headers] ? 1 : 0,
              location: opts[:file_name] }
      res[:rowType] = opts[:row_type] if opts[:row_type]
      res
    end

    # Turns header urls into short field names (last url segment).
    def init_fields(header, file_type)
      err = "No header in #{file_type} data, or header fields are not urls"
      raise DarwinCore::GeneratorError, err if header.nil?

      header.map do |f|
        f = f.to_s.strip
        raise DarwinCore::GeneratorError, err unless f.start_with?('http://')

        f.split('/')[-1]
      end
    end
  end
end
@@ -0,0 +1,116 @@
1
class DarwinCore
  class Generator
    # Creates EML file with meta information about archive
    class EmlXml
      # Namespace and schema attributes put on the root eml element.
      SCHEMA_DATA = {
        :"xml:lang" => "en",
        :"xmlns:eml" => "eml://ecoinformatics.org/eml-2.1.1",
        :"xmlns:md" => "eml://ecoinformatics.org/methods-2.1.1",
        :"xmlns:proj" => "eml://ecoinformatics.org/project-2.1.1",
        :"xmlns:d" => "eml://ecoinformatics.org/dataset-2.1.1",
        :"xmlns:res" => "eml://ecoinformatics.org/resource-2.1.1",
        :"xmlns:dc" => "http://purl.org/dc/terms/",
        :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
        :"xsi:schemaLocation" => "eml://ecoinformatics.org/eml-2.1.1 "\
          "http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
      }.freeze

      # @param data [Hash] metadata; keys read by this class are :id, :system,
      #   :title, :license, :abstract, :citation, :logo_url, :authors and
      #   :metadata_providers (the last two are lists of person hashes with
      #   :first_name, :last_name, :organization, :position, :url, :email)
      # @param path [String] directory in which eml.xml is written
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the EML document and writes it to eml.xml inside path.
      def create
        schema_data = {
          packageId: "#{@data[:id]}/#{timestamp}",
          system: @data[:system] || "http://globalnames.org"
        }.merge(SCHEMA_DATA)
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.eml(schema_data) do
            build_body(xml)
          end
        end
        save_eml(builder)
      end

      private

      def build_body(xml)
        build_dataset(xml)
        build_additional_metadata(xml)
        # Puts the root element itself into its first declared namespace.
        xml.parent.namespace = xml.parent.namespace_definitions.first
      end

      # The block form closes the file even when writing fails.
      def save_eml(builder)
        data = builder.to_xml
        File.open(File.join(@path, "eml.xml"), @write) { |f| f.write(data) }
      end

      def build_dataset(xml)
        xml.dataset(id: @data[:id]) do
          xml.title(@data[:title])
          xml.license(@data[:license])
          contacts = []
          build_authors(xml, contacts)
          build_metadata_providers(xml)
          xml.pubDate(Time.now.to_s)
          build_abstract(xml)
          build_contacts(xml, contacts)
        end
      end

      def build_abstract(xml)
        xml.abstract { xml.para(@data[:abstract]) }
      end

      # Every creator is also listed as a contact, by reference to its id.
      def build_contacts(xml, contacts)
        contacts.each { |contact| xml.contact { xml.references(contact) } }
      end

      def build_metadata_providers(xml)
        (@data[:metadata_providers] || []).each do |provider|
          xml.metadataProvider { build_person(xml, provider) }
        end
      end

      # Adds one creator element per author; creator ids are 1-based positions
      # and are appended to contacts. Missing :authors means no creators.
      def build_authors(xml, contacts)
        (@data[:authors] || []).each_with_index do |author, i|
          creator_id = i + 1
          contacts << creator_id
          xml.creator(id: creator_id, scope: "document") do
            build_person(xml, author)
          end
        end
      end

      def build_additional_metadata(xml)
        xml.additionalMetadata do
          xml.metadata do
            xml.citation(@data[:citation])
            xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
          end
        end
      end

      def build_person(xml, data)
        xml.individualName do
          xml.givenName(data[:first_name])
          xml.surName(data[:last_name])
        end
        xml.organizationName(data[:organization]) if data[:organization]
        xml.positionName(data[:position]) if data[:position]
        xml.onlineUrl(data[:url]) if data[:url]
        xml.electronicMailAddress(data[:email])
      end

      # Current UTC time as "year-month-day::hour:min:sec"; the numbers are
      # deliberately not zero padded to keep the established packageId format.
      def timestamp
        now = Time.now.getutc
        date = [now.year, now.month, now.day].join("-")
        time = [now.hour, now.min, now.sec].join(":")
        "#{date}::#{time}"
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
class DarwinCore
  class Generator
    # Creates DarwinCore meta file
    class MetaXml
      # @param data [Hash] description of the csv files: data[:core] and each
      #   element of data[:extensions] is a hash with :fields (term urls in
      #   column order), :ignoreHeaderLines and :location; extensions also
      #   carry :rowType
      # @param path [String] directory in which meta.xml is written
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the archive description and writes it to meta.xml inside path.
      #
      # @raise [DarwinCore::GeneratorError] when core data are missing or a
      #   file does not have exactly one taxonID field
      def create
        schema_uri = "http://rs.tdwg.org/dwc/terms/xsd/archive/ "\
          "http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
        builder = Nokogiri::XML::Builder.new do |xml|
          opts = { encoding: "UTF-8", fieldsTerminatedBy: ",",
                   fieldsEnclosedBy: '"', linesTerminatedBy: "\n",
                   rowType: "http://rs.tdwg.org/dwc/terms/Taxon" }
          build_archive(xml, opts, schema_uri)
        end
        save_meta(builder)
      end

      private

      # The block form closes the file even when writing fails.
      def save_meta(builder)
        meta_xml_data = builder.to_xml
        File.open(File.join(@path, "meta.xml"), @write) do |meta_file|
          meta_file.write(meta_xml_data)
        end
      end

      def build_archive(xml, opts, schema_uri)
        xml.archive(xmlns: "http://rs.tdwg.org/dwc/text/",
                    :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
                    :"xsi:schemaLocation" => schema_uri) do
          build_core(xml, opts)
          build_extensions(xml, opts)
        end
      end

      def build_core(xml, opts)
        core = @data[:core]
        unless core
          raise DarwinCore::GeneratorError,
                "Core data are missing, cannot create meta.xml"
        end
        xml.core(opts.merge(ignoreHeaderLines: core[:ignoreHeaderLines])) do
          xml.files { xml.location(core[:location]) }
          taxon_id, fields = find_taxon_id(core[:fields])
          # id_ with a trailing underscore is how the builder is asked for an
          # element named "id".
          xml.id_(index: taxon_id[1])
          fields.each { |f| xml.field(term: f[0], index: f[1]) }
        end
      end

      def build_extensions(xml, opts)
        @data[:extensions].each do |e|
          xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
                                   rowType: e[:rowType])) do
            xml.files { xml.location(e[:location]) }
            taxon_id, fields = find_taxon_id(e[:fields])
            xml.coreid(index: taxon_id[1])
            fields.each { |f| xml.field(term: f[0], index: f[1]) }
          end
        end
      end

      # Splits the fields into the taxonID field and all the others.
      #
      # @param data [Array<String>] term urls in column order
      # @return [Array] [[taxon_id_term, index], [[term, index], ...]] with
      #   terms stripped of surrounding whitespace
      # @raise [DarwinCore::GeneratorError] unless exactly one term ends with
      #   "/taxonid" (case-insensitive)
      def find_taxon_id(data)
        indexed = data.each_with_index.map { |f, i| [f.strip, i] }
        taxon_id, fields = indexed.partition { |f| f[0].match(%r{/taxonid\z}i) }
        if taxon_id.size != 1
          raise DarwinCore::GeneratorError,
                "Expected exactly one taxonID field, found #{taxon_id.size}"
        end
        [taxon_id[0], fields]
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Covers special case of Global Names Usage Bank data:
  # a TaxonNormalized that additionally carries a uuid and a uuid path.
  class GnubTaxon < TaxonNormalized
    attr_accessor :uuid
    attr_accessor :uuid_path

    # Runs the TaxonNormalized initializer, then starts with an empty
    # uuid_path and no uuid.
    def initialize
      super
      self.uuid_path = []
      self.uuid = nil
    end
  end
end