dwc-archive 0.9.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ class DarwinCore
4
+ # Represents core of the DarwinCore Archive
5
+ class Core
6
+ include DarwinCore::Ingester
7
+ attr_reader :id
8
+
9
+ # rubocop:disable Metrics/MethodLength
10
+ def initialize(dwc)
11
+ @dwc = dwc
12
+ @archive = @dwc.archive
13
+ @path = @archive.files_path
14
+ root_key = @archive.meta.keys[0]
15
+ @data = @archive.meta[root_key][:core]
16
+ unless @data
17
+ raise DarwinCore::CoreFileError,
18
+ "Cannot find core in meta.xml, is meta.xml valid?"
19
+ end
20
+ @id = @data[:id][:attributes]
21
+ init_attributes
22
+ end
23
+ end
24
+ # rubocop:enable Metrics/MethodLength
25
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class DarwinCore
2
4
  class Error < RuntimeError; end
3
5
  class FileNotFoundError < Error; end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ class DarwinCore
4
+ # Unpacks compressed archives into a temp directory
5
+ class Expander
6
+ def initialize(archive_path, tmp_dir)
7
+ @archive_path = archive_path
8
+ @tmp_dir = tmp_dir
9
+ @dir_path = DarwinCore.random_path(tmp_dir)
10
+ @unpacker = init_unpacker
11
+ end
12
+
13
+ def unpack
14
+ clean
15
+ raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)
16
+ success = @unpacker.call(@dir_path, @archive_path) if @unpacker
17
+ if @unpacker && success && $CHILD_STATUS.exitstatus.zero?
18
+ success
19
+ else
20
+ clean
21
+ raise DarwinCore::UnpackingError
22
+ end
23
+ end
24
+
25
+ def path
26
+ @path ||= files_path
27
+ end
28
+
29
+ def clean
30
+ DarwinCore.clean(@dir_path)
31
+ end
32
+
33
+ def files
34
+ DarwinCore.files(path)
35
+ end
36
+
37
+ private
38
+
39
+ def init_unpacker
40
+ return tar_unpacker if @archive_path =~ /tar.gz$/i
41
+ return zip_unpacker if @archive_path =~ /zip$/i
42
+ nil
43
+ end
44
+
45
# Builds the unpacker used for gzip-compressed tar archives.
#
# The returned proc creates +tmp_path+ and extracts +archive_path+ into it.
# tar is started with an argument vector instead of a shell command line,
# so spaces, quotes and other shell metacharacters in either path are
# passed through untouched.
#
# @return [Proc] proc taking (tmp_path, archive_path); it returns the
#   result of Kernel#system (true on success, false/nil on failure)
def tar_unpacker
  proc do |tmp_path, archive_path|
    FileUtils.mkdir tmp_path
    system("tar", "-zxf", archive_path, "-C", tmp_path,
           out: File::NULL, err: File::NULL)
  end
end
52
+
53
# Builds the unpacker used for zip archives.
#
# unzip is started with an argument vector instead of a shell command
# line, so neither path needs shell escaping. Its output is discarded.
#
# @return [Proc] proc taking (tmp_path, archive_path); it returns the
#   result of Kernel#system (true on success, false/nil on failure)
def zip_unpacker
  proc do |tmp_path, archive_path|
    system("unzip", "-qq", "-d", tmp_path, archive_path,
           out: File::NULL, err: File::NULL)
  end
end
59
+
60
# Quotes a string so a POSIX shell reads it as one literal word.
#
# Inside single quotes the shell treats every character literally,
# backslash included, so only the single quote itself needs handling:
# close the quoted section, emit an escaped quote, reopen ('\'').
#
# @param a_str [String] raw value, e.g. a file path
# @return [String] the single-quoted, shell-safe form
def esc(a_str)
  # Block form of gsub: the replacement is used verbatim, without the
  # backslash sequences (\', \\) a replacement string would interpret.
  "'#{a_str.gsub("'") { "'\\''" }}'"
end
63
+
64
# Lists the entries of a directory without the "." and ".." links.
#
# Only those two exact names are dropped; a file whose name merely ends
# with a dot (e.g. "notes.") is kept.
#
# @param dir [String] directory to list
# @return [Array<String>] sorted entry names
def path_entries(dir)
  Dir.entries(dir).reject { |e| %w[. ..].include?(e) }.sort
end
67
+
68
+ def files_path
69
+ entries = path_entries(@dir_path)
70
+ entries.include?("meta.xml") ? @dir_path : search_for_file_path(entries)
71
+ end
72
+
73
+ def search_for_file_path(entries)
74
+ res = nil
75
+ entries.each do |e|
76
+ check_path = File.join(@dir_path, e)
77
+ next unless FileTest.directory?(check_path) &&
78
+ path_entries(check_path).include?("meta.xml")
79
+ res = check_path
80
+ break
81
+ end
82
+ res
83
+ end
84
+ end
85
+ end
@@ -1,8 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class DarwinCore
4
+ # Represents extensions of DarwinCore Archive
2
5
  class Extension
3
6
  include DarwinCore::Ingester
4
7
  attr_reader :coreid
5
- alias :id :coreid
8
+ alias id coreid
6
9
 
7
10
  def initialize(dwc, data)
8
11
  @dwc = dwc
@@ -10,8 +13,7 @@ class DarwinCore
10
13
  @path = @archive.files_path
11
14
  @data = data
12
15
  @coreid = @data[:coreid][:attributes]
13
- get_attributes(DarwinCore::ExtensionFileError)
16
+ init_attributes
14
17
  end
15
-
16
18
  end
17
19
  end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ class DarwinCore
4
+ # Creates csv files for core and extensions
5
+ class Generator
6
+ attr_reader :eml_xml_data, :path
7
+
8
# Prepares a fresh working directory and empty metadata containers.
#
# @param dwc_path [String] path of the archive written later by #pack
# @param tmp_dir [String] directory under which the working directory
#   is created
def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
  @dwc_path = dwc_path
  @path = DarwinCore.random_path(tmp_dir)
  FileUtils.mkdir(@path)
  @meta_xml_data = { extensions: [] }
  # The key is :abstract (it was misspelled :abstrac); that is the key the
  # EML writer reads when it builds the <abstract> element.
  @eml_xml_data = { id: nil, title: nil, authors: [], abstract: nil,
                    citation: nil, url: nil }
  @write = "w:utf-8"
end
17
+
18
+ def clean
19
+ DarwinCore.clean(@path)
20
+ end
21
+
22
+ def add_core(data, file_name, keep_headers = true)
23
+ opts = { type: "core", data: data, file_name: file_name,
24
+ keep_headers: keep_headers }
25
+ prepare_csv_file(opts)
26
+ end
27
+
28
+ def add_extension(data, file_name, keep_headers = true,
29
+ row_type = "http://rs.tdwg.org/dwc/terms/Taxon")
30
+ opts = { type: "extension", data: data, file_name: file_name,
31
+ keep_headers: keep_headers, row_type: row_type }
32
+ prepare_csv_file(opts)
33
+ end
34
+
35
+ def add_meta_xml
36
+ meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
37
+ meta.create
38
+ end
39
+
40
+ def add_eml_xml(data)
41
+ @eml_xml_data = data
42
+ eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
43
+ eml.create
44
+ end
45
+
46
+ def files
47
+ DarwinCore.files(@path)
48
+ end
49
+
50
# Packs the generated files into a gzip-compressed tar archive at
# @dwc_path.
#
# tar runs inside @path through the :chdir option, with every path passed
# as its own argument. No shell is involved, so special characters in the
# paths are safe, and tar can never run in the wrong directory because a
# preceding "cd" failed. As with the shell glob "*", entries whose names
# start with a dot are left out.
#
# @return [Boolean, nil] result of Kernel#system; false when the working
#   directory is missing or has nothing to pack
def pack
  return false unless File.directory?(@path)

  entries = Dir.entries(@path).reject { |e| e.start_with?(".") }.sort
  return false if entries.empty?

  # "--" ends option parsing so a file name starting with "-" is not
  # mistaken for a tar option.
  system("tar", "-zcf", @dwc_path, "--", *entries, chdir: @path)
end
54
+
55
+ private
56
+
57
# Writes one csv file (core or extension) into the working directory and
# records its description for meta.xml.
#
# The file is opened in block form so the handle is closed even when
# header validation or a row write raises.
#
# @param opts [Hash] :type ("core" or "extension"), :data (rows, header
#   first), :file_name, :keep_headers and optionally :row_type
# @return [nil]
def prepare_csv_file(opts)
  CSV.open(File.join(@path, opts[:file_name]), @write) do |csv|
    # prepare_attributes rewrites opts[:data] (header handling), so it has
    # to run before the rows are written.
    attributes = prepare_attributes(opts)
    if opts[:type] == "core"
      @meta_xml_data[:core] = attributes
    else
      @meta_xml_data[:extensions] << attributes
    end
    opts[:data].each { |row| csv << row }
  end
  nil
end
68
+
69
+ def prepare_attributes(opts)
70
+ header = opts[:data].shift
71
+ fields = init_fields(header, opts[:type])
72
+ opts[:data].unshift(fields) if opts[:keep_headers]
73
+ ignore_header_lines = opts[:keep_headers] ? 1 : 0
74
+
75
+ res = { fields: header, ignoreHeaderLines: ignore_header_lines,
76
+ location: opts[:file_name] }
77
+ res[:rowType] = opts[:row_type] if opts[:row_type]
78
+ res
79
+ end
80
+
81
# Validates the header row and reduces each term url to its last path
# segment (e.g. "http://rs.tdwg.org/dwc/terms/taxonID" -> "taxonID").
#
# @param header [Array<String>, nil] first row of the data
# @param file_type [String] "core" or "extension", used in the message
# @return [Array<String>] short field names
# @raise [DarwinCore::GeneratorError] when the header is missing or a
#   field is not an http:// url
def init_fields(header, file_type)
  err = "No header in #{file_type} data, or header fields are not urls"
  # An empty data set has no header row at all; report it with the same
  # error instead of failing with NoMethodError on nil.
  raise DarwinCore::GeneratorError, err unless header.respond_to?(:map)

  header.map do |f|
    f = f.to_s.strip
    raise DarwinCore::GeneratorError, err unless f =~ %r{\Ahttp://}

    f.split("/")[-1]
  end
end
89
+ end
90
+ end
@@ -0,0 +1,116 @@
1
+ class DarwinCore
2
+ class Generator
3
+ # Creates EML file with meta information about archive
4
+ class EmlXml
5
# Namespace and schema attributes placed on the root <eml> element.
# Frozen so the shared constant cannot be modified by accident; #create
# merges it into a new hash and never writes to it.
SCHEMA_DATA = {
  :"xml:lang" => "en",
  :"xmlns:eml" => "eml://ecoinformatics.org/eml-2.1.1",
  :"xmlns:md" => "eml://ecoinformatics.org/methods-2.1.1",
  :"xmlns:proj" => "eml://ecoinformatics.org/project-2.1.1",
  :"xmlns:d" => "eml://ecoinformatics.org/dataset-2.1.1",
  :"xmlns:res" => "eml://ecoinformatics.org/resource-2.1.1",
  :"xmlns:dc" => "http://purl.org/dc/terms/",
  :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
  :"xsi:schemaLocation" => "eml://ecoinformatics.org/eml-2.1.1 "\
    "http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
}.freeze
17
+
18
+ def initialize(data, path)
19
+ @data = data
20
+ @path = path
21
+ @write = "w:utf-8"
22
+ end
23
+
24
+ def create
25
+ schema_data = {
26
+ packageId: "#{@data[:id]}/#{timestamp}",
27
+ system: @data[:system] || "http://globalnames.org"
28
+ }.merge(SCHEMA_DATA)
29
+ builder = Nokogiri::XML::Builder.new do |xml|
30
+ xml.eml(schema_data) do
31
+ build_body(xml)
32
+ end
33
+ end
34
+ save_eml(builder)
35
+ end
36
+
37
+ private
38
+
39
+ def build_body(xml)
40
+ build_dataset(xml)
41
+ build_additional_metadata(xml)
42
+ xml.parent.namespace = xml.parent.namespace_definitions.first
43
+ end
44
+
45
# Serializes the built document and writes it to "eml.xml" in @path.
#
# File.open in block form closes the handle even when the write raises.
# Unlike Kernel#open it also never treats a path starting with "|" as a
# command to run.
#
# @param builder [#to_xml] object whose #to_xml returns the document text
# @return [nil]
def save_eml(builder)
  data = builder.to_xml
  File.open(File.join(@path, "eml.xml"), @write) { |f| f.write(data) }
  nil
end
51
+
52
+ def build_dataset(xml)
53
+ xml.dataset(id: @data[:id]) do
54
+ xml.title(@data[:title])
55
+ xml.license(@data[:license])
56
+ contacts = []
57
+ build_authors(xml, contacts)
58
+ build_metadata_providers(xml)
59
+ xml.pubDate(Time.now.to_s)
60
+ build_abstract(xml)
61
+ build_contacts(xml, contacts)
62
+ end
63
+ end
64
+
65
+ def build_abstract(xml)
66
+ xml.abstract { xml.para(@data[:abstract]) }
67
+ end
68
+
69
+ def build_contacts(xml, contacts)
70
+ contacts.each { |contact| xml.contact { xml.references(contact) } }
71
+ end
72
+
73
+ def build_metadata_providers(xml)
74
+ @data[:metadata_providers].each do |a|
75
+ xml.metadataProvider { build_person(xml, a) }
76
+ end if @data[:metadata_providers]
77
+ end
78
+
79
# Emits one <creator> element per author and records the creator ids.
#
# Ids are 1-based positions in @data[:authors]. Each id is appended to
# +contacts+ so the caller can reference the creators later. Missing
# :authors is treated as an empty list, the same way missing
# :metadata_providers is tolerated in this class.
#
# @param xml [Object] builder that receives the creator elements
# @param contacts [Array<Integer>] collector that receives the creator ids
def build_authors(xml, contacts)
  (@data[:authors] || []).each_with_index do |a, i|
    creator_id = i + 1
    contacts << creator_id
    xml.creator(id: creator_id, scope: "document") do
      build_person(xml, a)
    end
  end
end
88
+
89
+ def build_additional_metadata(xml)
90
+ xml.additionalMetadata do
91
+ xml.metadata do
92
+ xml.citation(@data[:citation])
93
+ xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
94
+ end
95
+ end
96
+ end
97
+
98
+ def build_person(xml, data)
99
+ a = data
100
+ xml.individualName do
101
+ xml.givenName(a[:first_name])
102
+ xml.surName(a[:last_name])
103
+ end
104
+ xml.organizationName(a[:organization]) if a[:organization]
105
+ xml.positionName(a[:position]) if a[:position]
106
+ xml.onlineUrl(a[:url]) if a[:url]
107
+ xml.electronicMailAddress(a[:email])
108
+ end
109
+
110
+ def timestamp
111
+ t = Time.now.getutc.to_a[0..5].reverse
112
+ t[0..2] * ("-") + "::" + t[-3..-1] * (":")
113
+ end
114
+ end
115
+ end
116
+ end
@@ -0,0 +1,72 @@
1
+ class DarwinCore
2
+ class Generator
3
+ # Creates DarwinCore meta file
4
+ class MetaXml
5
+ def initialize(data, path)
6
+ @data = data
7
+ @path = path
8
+ @write = "w:utf-8"
9
+ end
10
+
11
+ def create
12
+ schema_uri = "http://rs.tdwg.org/dwc/terms/xsd/archive/ "\
13
+ "http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
14
+ builder = Nokogiri::XML::Builder.new do |xml|
15
+ opts = { encoding: "UTF-8", fieldsTerminatedBy: ",",
16
+ fieldsEnclosedBy: '"', linesTerminatedBy: "\n",
17
+ rowType: "http://rs.tdwg.org/dwc/terms/Taxon" }
18
+ build_archive(xml, opts, schema_uri)
19
+ end
20
+ save_meta(builder)
21
+ end
22
+
23
+ private
24
+
25
# Serializes the built document and writes it to "meta.xml" in @path.
#
# File.open in block form closes the handle even when the write raises.
# Unlike Kernel#open it also never treats a path starting with "|" as a
# command to run.
#
# @param builder [#to_xml] object whose #to_xml returns the document text
# @return [nil]
def save_meta(builder)
  meta_xml_data = builder.to_xml
  File.open(File.join(@path, "meta.xml"), @write) do |meta_file|
    meta_file.write(meta_xml_data)
  end
  nil
end
31
+
32
+ def build_archive(xml, opts, schema_uri)
33
+ xml.archive(xmlns: "http://rs.tdwg.org/dwc/text/",
34
+ :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
35
+ :"xsi:schemaLocation" => schema_uri) do
36
+ build_core(xml, opts)
37
+ build_extensions(xml, opts)
38
+ end
39
+ end
40
+
41
+ def build_core(xml, opts)
42
+ xml.core(opts.merge(ignoreHeaderLines:
43
+ @data[:core][:ignoreHeaderLines])) do
44
+ xml.files { xml.location(@data[:core][:location]) }
45
+ taxon_id, fields = find_taxon_id(@data[:core][:fields])
46
+ xml.id_(index: taxon_id[1])
47
+ fields.each { |f| xml.field(term: f[0], index: f[1]) }
48
+ end
49
+ end
50
+
51
+ def build_extensions(xml, opts)
52
+ @data[:extensions].each do |e|
53
+ xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
54
+ rowType: e[:rowType])) do
55
+ xml.files { xml.location(e[:location]) }
56
+ taxon_id, fields = find_taxon_id(e[:fields])
57
+ xml.coreid(index: taxon_id[1])
58
+ fields.each { |f| xml.field(term: f[0], index: f[1]) }
59
+ end
60
+ end
61
+ end
62
+
63
# Splits header terms into the taxon id field and all other fields.
#
# Every term is paired with its column index. The single term ending in
# "/taxonid" (case-insensitive) is separated from the rest.
#
# @param data [Array<String>] header terms in column order
# @return [Array] [[taxon_id_term, index], [[term, index], ...]]
# @raise [DarwinCore::GeneratorError] unless exactly one taxon id term
#   is present
def find_taxon_id(data)
  fields = data.each_with_index.map { |f, i| [f.strip, i] }
  taxon_id, fields = fields.partition { |f| f[0] =~ %r{/taxonid$}i }
  if taxon_id.size != 1
    # Say what went wrong; a bare exception gives the caller no hint.
    fail DarwinCore::GeneratorError,
         "Expected exactly one taxonID field, found #{taxon_id.size}"
  end
  [taxon_id[0], fields]
end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ class DarwinCore
4
+ # Covers special case of Global Names Usage Bank data
5
+ class GnubTaxon < TaxonNormalized
6
+ attr_accessor :uuid, :uuid_path
7
+
8
+ def initialize
9
+ super
10
+ @uuid = nil
11
+ @uuid_path = []
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,106 @@
1
+ # encoding: utf-8
2
+ class DarwinCore
3
+ # This module abstracts information for reading csv file to be used
4
+ # in several classes which need such functionality
5
+ module Ingester
6
+ attr_reader :data, :properties, :encoding, :fields_separator, :size
7
+ attr_reader :file_path, :fields, :line_separator, :quote_character,
8
+ :ignore_headers
9
+
10
+ def size
11
+ @size ||= init_size
12
+ end
13
+
14
# Reads the csv file at @file_path and sorts its rows into accepted rows
# and rejected rows.
#
# A row is rejected when it has fewer columns than the highest declared
# field index requires; every other row is passed to process_csv_row.
# When a block is given, the rows collected so far are yielded each time
# the row counter reaches a multiple of +batch_size+, and the collectors
# start over; whatever is left is yielded once more at the end.
#
# @param batch_size [Integer] number of rows between progress reports
# @yield [Array] pair [rows, errors] for each batch
# @return [Array] pair [rows, errors]; with a block, only the last batch
def read(batch_size = 10_000)
  DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
  res = []
  errors = []
  args = define_csv_args
  min_size = @fields.map { |f| f[:index].to_i }.max + 1
  # Block form closes the file once reading ends, also on errors.
  File.open(@file_path) do |io|
    # The options are splatted as keywords; CSV.new on Ruby 3 does not
    # accept them as a positional hash.
    CSV.new(io, **args).each_with_index do |r, i|
      next if @ignore_headers && i == 0
      min_size > r.size ? errors << r : process_csv_row(res, errors, r)
      next if i == 0 || i % batch_size != 0
      DarwinCore.logger_write(@dwc.object_id,
                              format("Ingested %s records from %s",
                                     i, name))
      next unless block_given?
      yield [res, errors]
      res = []
      errors = []
    end
  end
  yield [res, errors] if block_given?
  [res, errors]
end
36
+
37
+ private
38
+
39
+ def define_csv_args
40
+ args = { col_sep: @field_separator }
41
+ @quote_character = "\b" if @quote_character.empty?
42
+ args.merge(quote_char: @quote_character)
43
+ end
44
+
45
+ def name
46
+ self.class.to_s.split("::")[-1].downcase
47
+ end
48
+
49
+ def process_csv_row(result, errors, row)
50
+ str = row.join("")
51
+ str = str.force_encoding("utf-8")
52
+ if str.encoding.name == "UTF-8" && str.valid_encoding?
53
+ result << row.map { |f| f.nil? ? nil : f.force_encoding("utf-8") }
54
+ else
55
+ errors << row
56
+ end
57
+ end
58
+
59
# Copies the csv settings from @data[:attributes] into instance state and
# then resolves the file path and the field list.
#
# Defaults used when a property is absent: no quote character, "\n" as
# line separator. Headers are ignored only when ignoreHeaderLines is 1 or
# true.
def init_attributes
  @properties = @data[:attributes]
  init_encoding
  # The module declares attr_reader :fields_separator while the csv code
  # reads @field_separator. Set both so the public reader is not nil.
  @field_separator = @fields_separator = init_field_separator
  @quote_character = @properties[:fieldsEnclosedBy] || ""
  @line_separator = @properties[:linesTerminatedBy] || "\n"
  @ignore_headers = @properties[:ignoreHeaderLines] &&
                    [1, true].include?(@properties[:ignoreHeaderLines])
  init_file_path
  init_fields
end
70
+
71
+ def init_encoding
72
+ @encoding = @properties[:encoding] || "UTF-8"
73
+ accepted_encoding = ["utf-8", "utf8", "utf-16", "utf16"].
74
+ include?(@encoding.downcase)
75
+ fail(
76
+ DarwinCore::EncodingError,
77
+ "No support for encodings other than utf-8 or utf-16 at the moment"
78
+ ) unless accepted_encoding
79
+ end
80
+
81
# Resolves the data file location and stores its full path in @file_path.
#
# The location is looked up in @data[:location], then in the element
# attributes, then under @data[:files].
#
# @raise [DarwinCore::FileNotFoundError] when no location is present
def init_file_path
  file = @data[:location] ||
         @data.dig(:attributes, :location) ||
         @data.dig(:files, :location)
  # Check before joining: File.join never returns nil, so a check made on
  # its result could not detect a missing location.
  fail DarwinCore::FileNotFoundError, "No file data" unless file
  @file_path = File.join(@path, file)
end
88
+
89
# Normalizes @data[:field] to an array and collects each field's
# attributes into @fields.
#
# A single field is wrapped in an array. A missing field yields an empty
# list, so the error below is raised instead of a NoMethodError on nil.
#
# @raise [DarwinCore::InvalidArchiveError] when no fields are declared
def init_fields
  raw = @data[:field]
  raw = [raw].compact unless raw.is_a?(Array)
  @data[:field] = raw
  @fields = raw.map { |f| f[:attributes] }
  return unless @fields.empty?

  fail DarwinCore::InvalidArchiveError, "No data fields are found"
end
95
+
96
+ def init_field_separator
97
+ res = @properties[:fieldsTerminatedBy] || ","
98
+ res = "\t" if res == "\\t"
99
+ res
100
+ end
101
+
102
# Counts the newline-terminated lines of the file at @file_path.
#
# This gives the same number "wc -l" reports (a last line without a
# trailing newline is not counted), but without spawning a shell. Paths
# containing spaces or shell metacharacters therefore work.
#
# @return [Integer] number of newline characters in the file
def init_size
  # Binary mode: only the newline bytes matter, not the text encoding.
  File.foreach(@file_path, mode: "rb").count { |line| line.end_with?("\n") }
end
105
+ end
106
+ end