dwc-archive 0.9.6 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Core data file of a DarwinCore Archive.
  #
  # The description of the core is taken from the +:core+ entry found under
  # the first key of the archive's parsed meta data. CSV reading behaviour
  # comes from DarwinCore::Ingester.
  class Core
    include DarwinCore::Ingester

    # Value of +[:id][:attributes]+ from the core description.
    attr_reader :id

    # @param dwc object exposing +archive+; the archive must respond to
    #   +files_path+ and +meta+
    # @raise [DarwinCore::CoreFileError] when the meta data has no core entry
    def initialize(dwc)
      @dwc = dwc
      @archive = @dwc.archive
      @path = @archive.files_path
      @data = @archive.meta[@archive.meta.keys[0]][:core]
      missing = "Cannot find core in meta.xml, is meta.xml valid?"
      raise DarwinCore::CoreFileError, missing unless @data

      @id = @data[:id][:attributes]
      init_attributes
    end
  end
end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class DarwinCore
2
4
  class Error < RuntimeError; end
3
5
  class FileNotFoundError < Error; end
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Unpacks compressed archives (.tar.gz, .tgz, .zip) into a temp directory.
  #
  # External +tar+ / +unzip+ binaries are invoked with an argument vector
  # (no shell), so archive and directory names containing spaces, quotes or
  # other shell metacharacters are passed through verbatim.
  class Expander
    # @param archive_path [String] path to the compressed archive
    # @param tmp_dir [String] directory under which a working directory
    #   (obtained from DarwinCore.random_path) is placed
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @dir_path = DarwinCore.random_path(tmp_dir)
      @unpacker = init_unpacker
    end

    # Removes a previous working directory and extracts the archive into it.
    #
    # @return [true] when extraction succeeded
    # @raise [DarwinCore::FileNotFoundError] when the archive does not exist
    # @raise [DarwinCore::UnpackingError] when the archive type is not
    #   supported or the external tool reported a failure
    def unpack
      clean
      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)
      # Kernel#system returns true only for a zero exit status, nil when the
      # command could not be started, so no extra status inspection is needed.
      return true if @unpacker && @unpacker.call(@dir_path, @archive_path)

      clean
      raise DarwinCore::UnpackingError
    end

    # Directory that contains meta.xml: either the working directory itself
    # or one of its immediate sub-directories; nil when neither has it.
    def path
      @path ||= files_path
    end

    # Removes the working directory.
    def clean
      DarwinCore.clean(@dir_path)
    end

    # Lists files of the unpacked archive (delegates to DarwinCore.files).
    def files
      DarwinCore.files(path)
    end

    private

    # Picks an unpacking strategy from the archive name; nil if unsupported.
    def init_unpacker
      return tar_unpacker if @archive_path =~ /(tar\.gz|tgz)\z/i
      return zip_unpacker if @archive_path =~ /zip\z/i

      nil
    end

    def tar_unpacker
      proc do |tmp_path, archive_path|
        FileUtils.mkdir tmp_path
        system("tar", "-zxf", archive_path, "-C", tmp_path,
               out: File::NULL, err: File::NULL)
      end
    end

    def zip_unpacker
      proc do |tmp_path, archive_path|
        system("unzip", "-qq", "-d", tmp_path, archive_path,
               out: File::NULL, err: File::NULL)
      end
    end

    # Sorted directory entries without the "." and ".." pseudo entries.
    # Names that merely end with a dot are kept.
    def path_entries(dir)
      Dir.entries(dir).reject { |e| %w[. ..].include?(e) }.sort
    end

    def files_path
      entries = path_entries(@dir_path)
      entries.include?("meta.xml") ? @dir_path : search_for_file_path(entries)
    end

    # First immediate sub-directory (in sorted order) holding a meta.xml.
    def search_for_file_path(entries)
      entries.map { |e| File.join(@dir_path, e) }.find do |candidate|
        FileTest.directory?(candidate) &&
          path_entries(candidate).include?("meta.xml")
      end
    end
  end
end
@@ -1,8 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class DarwinCore
4
+ # Represents extensions of DarwinCore Archive
2
5
  class Extension
3
6
  include DarwinCore::Ingester
4
7
  attr_reader :coreid
5
- alias :id :coreid
8
+ alias id coreid
6
9
 
7
10
  def initialize(dwc, data)
8
11
  @dwc = dwc
@@ -10,8 +13,7 @@ class DarwinCore
10
13
  @path = @archive.files_path
11
14
  @data = data
12
15
  @coreid = @data[:coreid][:attributes]
13
- get_attributes(DarwinCore::ExtensionFileError)
16
+ init_attributes
14
17
  end
15
-
16
18
  end
17
19
  end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Creates csv files for core and extensions, plus meta.xml / eml.xml, in a
  # working directory and packs them into a tar.gz archive.
  class Generator
    attr_reader :eml_xml_data, :path

    # @param dwc_path [String] path of the archive written by #pack
    # @param tmp_dir [String] directory under which the working directory
    #   (obtained from DarwinCore.random_path) is created
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = DarwinCore.random_path(tmp_dir)
      FileUtils.mkdir(@path)
      @meta_xml_data = { extensions: [] }
      @eml_xml_data = { id: nil, title: nil, authors: [], abstract: nil,
                        citation: nil, url: nil }
      @write = "w:utf-8"
    end

    # Removes the working directory.
    def clean
      DarwinCore.clean(@path)
    end

    # Writes the core csv file.
    #
    # @param data [Array<Array>] rows; the first row is the header and must
    #   consist of term URLs. The array is not modified.
    # @param file_name [String] name of the csv file inside the archive
    # @param keep_headers [Boolean] write a header row made of the last path
    #   segment of every term URL
    # @raise [DarwinCore::GeneratorError] when the header is missing or a
    #   header field is not an http(s) URL
    def add_core(data, file_name, keep_headers = true)
      opts = { type: "core", data: data, file_name: file_name,
               keep_headers: keep_headers }
      prepare_csv_file(opts)
    end

    # Writes an extension csv file; see #add_core for the shared parameters.
    #
    # @param row_type [String] rowType recorded for the extension
    def add_extension(data, file_name, keep_headers = true,
                      row_type = "http://rs.tdwg.org/dwc/terms/Taxon")
      opts = { type: "extension", data: data, file_name: file_name,
               keep_headers: keep_headers, row_type: row_type }
      prepare_csv_file(opts)
    end

    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    def files
      DarwinCore.files(@path)
    end

    # Packs every non-hidden entry of the working directory into a gzipped
    # tar archive at the path given to the constructor.
    #
    # tar is started without a shell, inside the working directory, so
    # neither path needs quoting and a relative archive path is still
    # resolved against the working directory.
    #
    # @return [true, false, nil] result of Kernel#system
    def pack
      entries = Dir.entries(@path).reject { |e| e.start_with?(".") }.sort
      system("tar", "-zcf", @dwc_path, "--", *entries, chdir: @path)
    end

    private

    # Validates the header, registers the file for meta.xml and writes rows.
    # Validation happens before the file is opened, so a bad header leaves
    # no empty csv file behind.
    def prepare_csv_file(opts)
      attributes, rows = prepare_attributes(opts)
      if opts[:type] == "core"
        @meta_xml_data[:core] = attributes
      else
        @meta_xml_data[:extensions] << attributes
      end
      # Block form closes the file even when a row cannot be serialized.
      CSV.open(File.join(@path, opts[:file_name]), @write) do |csv|
        rows.each { |row| csv << row }
      end
      nil
    end

    # @return [Array(Hash, Array<Array>)] meta.xml attributes of the file and
    #   the rows to write; the caller's data array is left untouched
    def prepare_attributes(opts)
      header, *records = opts[:data]
      fields = init_fields(header, opts[:type])
      rows = opts[:keep_headers] ? [fields] + records : records

      attributes = { fields: header,
                     ignoreHeaderLines: opts[:keep_headers] ? 1 : 0,
                     location: opts[:file_name] }
      attributes[:rowType] = opts[:row_type] if opts[:row_type]
      [attributes, rows]
    end

    # Turns term URLs into short csv column names (last path segment).
    def init_fields(header, file_type)
      err = "No header in #{file_type} data, or header fields are not urls"
      valid_header = header.is_a?(Array) && !header.empty?
      raise DarwinCore::GeneratorError, err unless valid_header

      header.map do |f|
        term = f.to_s.strip
        raise DarwinCore::GeneratorError, err unless term =~ %r{\Ahttps?://}

        term.split("/")[-1]
      end
    end
  end
end
@@ -0,0 +1,116 @@
1
# frozen_string_literal: true

class DarwinCore
  class Generator
    # Creates EML file with meta information about archive
    class EmlXml
      # Namespace / schema attributes placed on the root <eml> element.
      SCHEMA_DATA = {
        :"xml:lang" => "en",
        :"xmlns:eml" => "eml://ecoinformatics.org/eml-2.1.1",
        :"xmlns:md" => "eml://ecoinformatics.org/methods-2.1.1",
        :"xmlns:proj" => "eml://ecoinformatics.org/project-2.1.1",
        :"xmlns:d" => "eml://ecoinformatics.org/dataset-2.1.1",
        :"xmlns:res" => "eml://ecoinformatics.org/resource-2.1.1",
        :"xmlns:dc" => "http://purl.org/dc/terms/",
        :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
        :"xsi:schemaLocation" => "eml://ecoinformatics.org/eml-2.1.1 " \
          "http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
      }.freeze

      # @param data [Hash] values read by this class: :id, :system, :title,
      #   :license, :authors, :metadata_providers, :abstract, :citation,
      #   :logo_url
      # @param path [String] directory in which eml.xml is written
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the EML document and writes it to eml.xml under the path
      # given to the constructor.
      #
      # @return [nil]
      def create
        schema_data = {
          packageId: "#{@data[:id]}/#{timestamp}",
          system: @data[:system] || "http://globalnames.org"
        }.merge(SCHEMA_DATA)
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.eml(schema_data) { build_body(xml) }
        end
        save_eml(builder)
      end

      private

      def build_body(xml)
        build_dataset(xml)
        build_additional_metadata(xml)
        # Puts the root element into the first declared namespace.
        xml.parent.namespace = xml.parent.namespace_definitions.first
      end

      # Serializes first, then writes; the block form closes the file even
      # if the write raises.
      def save_eml(builder)
        xml_text = builder.to_xml
        File.open(File.join(@path, "eml.xml"), @write) do |f|
          f.write(xml_text)
        end
        nil
      end

      def build_dataset(xml)
        xml.dataset(id: @data[:id]) do
          xml.title(@data[:title])
          xml.license(@data[:license])
          contacts = []
          build_authors(xml, contacts)
          build_metadata_providers(xml)
          xml.pubDate(Time.now.to_s)
          build_abstract(xml)
          build_contacts(xml, contacts)
        end
      end

      def build_abstract(xml)
        xml.abstract { xml.para(@data[:abstract]) }
      end

      # Emits one <contact> per creator id collected by #build_authors.
      def build_contacts(xml, contacts)
        contacts.each { |contact| xml.contact { xml.references(contact) } }
      end

      def build_metadata_providers(xml)
        (@data[:metadata_providers] || []).each do |provider|
          xml.metadataProvider { build_person(xml, provider) }
        end
      end

      # Emits one <creator> per author, numbering them from 1, and appends
      # every id to +contacts+. A missing :authors entry yields no creators.
      def build_authors(xml, contacts)
        (@data[:authors] || []).each_with_index do |author, i|
          creator_id = i + 1
          contacts << creator_id
          xml.creator(id: creator_id, scope: "document") do
            build_person(xml, author)
          end
        end
      end

      def build_additional_metadata(xml)
        xml.additionalMetadata do
          xml.metadata do
            xml.citation(@data[:citation])
            xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
          end
        end
      end

      def build_person(xml, data)
        xml.individualName do
          xml.givenName(data[:first_name])
          xml.surName(data[:last_name])
        end
        xml.organizationName(data[:organization]) if data[:organization]
        xml.positionName(data[:position]) if data[:position]
        xml.onlineUrl(data[:url]) if data[:url]
        xml.electronicMailAddress(data[:email])
      end

      # Current UTC time as "year-month-day::hour:min:sec"; the components
      # are not zero padded.
      def timestamp
        sec, min, hour, day, month, year = Time.now.getutc.to_a
        "#{year}-#{month}-#{day}::#{hour}:#{min}:#{sec}"
      end
    end
  end
end
@@ -0,0 +1,72 @@
1
# frozen_string_literal: true

class DarwinCore
  class Generator
    # Creates DarwinCore meta file
    class MetaXml
      # @param data [Hash] +:core+ (Hash) and +:extensions+ (Array of Hash);
      #   each entry provides +:fields+, +:ignoreHeaderLines+, +:location+
      #   and, for extensions, +:rowType+
      # @param path [String] directory in which meta.xml is written
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the archive description and writes it to meta.xml.
      #
      # @return [nil]
      # @raise [DarwinCore::GeneratorError] when core data is missing or a
      #   file does not have exactly one taxonID field
      def create
        schema_uri = "http://rs.tdwg.org/dwc/terms/xsd/archive/ " \
          "http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
        builder = Nokogiri::XML::Builder.new do |xml|
          opts = { encoding: "UTF-8", fieldsTerminatedBy: ",",
                   fieldsEnclosedBy: '"', linesTerminatedBy: "\n",
                   rowType: "http://rs.tdwg.org/dwc/terms/Taxon" }
          build_archive(xml, opts, schema_uri)
        end
        save_meta(builder)
      end

      private

      # Serializes first, then writes; the block form closes the file even
      # if the write raises.
      def save_meta(builder)
        meta_xml_data = builder.to_xml
        File.open(File.join(@path, "meta.xml"), @write) do |meta_file|
          meta_file.write(meta_xml_data)
        end
        nil
      end

      def build_archive(xml, opts, schema_uri)
        xml.archive(xmlns: "http://rs.tdwg.org/dwc/text/",
                    :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
                    :"xsi:schemaLocation" => schema_uri) do
          build_core(xml, opts)
          build_extensions(xml, opts)
        end
      end

      def build_core(xml, opts)
        core = @data[:core]
        unless core
          raise DarwinCore::GeneratorError,
                "No core data, meta.xml cannot be generated"
        end

        xml.core(opts.merge(ignoreHeaderLines: core[:ignoreHeaderLines])) do
          xml.files { xml.location(core[:location]) }
          taxon_id, fields = find_taxon_id(core[:fields])
          xml.id_(index: taxon_id[1])
          fields.each { |f| xml.field(term: f[0], index: f[1]) }
        end
      end

      def build_extensions(xml, opts)
        @data[:extensions].each do |e|
          xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
                                   rowType: e[:rowType])) do
            xml.files { xml.location(e[:location]) }
            taxon_id, fields = find_taxon_id(e[:fields])
            xml.coreid(index: taxon_id[1])
            fields.each { |f| xml.field(term: f[0], index: f[1]) }
          end
        end
      end

      # Splits the terms into the single taxonID column and all others.
      #
      # @param data [Array<String>] term URLs in column order
      # @return [Array] +[[taxon_term, index], [[term, index], ...]]+
      # @raise [DarwinCore::GeneratorError] unless exactly one term ends
      #   with "/taxonid" (case-insensitive)
      def find_taxon_id(data)
        indexed = data.each_with_index.map { |f, i| [f.strip, i] }
        taxon_id, fields = indexed.partition { |f| f[0] =~ %r{/taxonid\z}i }
        unless taxon_id.size == 1
          raise DarwinCore::GeneratorError,
                "Expected exactly one taxonID field, found #{taxon_id.size}"
        end

        [taxon_id[0], fields]
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # TaxonNormalized variant for the special case of Global Names Usage Bank
  # data: adds +uuid+ and +uuid_path+ on top of the inherited state.
  class GnubTaxon < TaxonNormalized
    attr_accessor :uuid
    attr_accessor :uuid_path

    # Runs the parent initializer, then starts with no uuid and an empty
    # uuid path.
    def initialize
      super
      @uuid, @uuid_path = nil, []
    end
  end
end
@@ -0,0 +1,106 @@
1
+ # encoding: utf-8
2
class DarwinCore
  # This module abstracts information for reading csv file to be used
  # in several classes which need such functionality.
  #
  # Including classes set @dwc, @path and @data and then call the private
  # #init_attributes.
  module Ingester
    attr_reader :data, :properties, :encoding, :fields_separator,
                :file_path, :fields, :line_separator, :quote_character,
                :ignore_headers

    # Number of newline characters in the data file (what `wc -l` reports);
    # computed once and memoized.
    def size
      @size ||= init_size
    end

    # Reads the csv file.
    #
    # Rows shorter than the highest field index + 1, and rows that are not
    # valid UTF-8, are collected as errors. With a block, the accumulated
    # +[rows, errors]+ pair is yielded each time the row index reaches a
    # non-zero multiple of +batch_size+ and both accumulators are reset;
    # the remainder is yielded once more at the end.
    #
    # @param batch_size [Integer] rows between progress log entries / yields
    # @return [Array(Array, Array)] rows and errors not yet yielded
    def read(batch_size = 10_000)
      DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
      res = []
      errors = []
      min_size = @fields.map { |f| f[:index].to_i }.max + 1
      # Block form closes the file; options go to CSV.new as keywords.
      File.open(@file_path) do |io|
        CSV.new(io, **define_csv_args).each_with_index do |row, i|
          next if @ignore_headers && i.zero?

          min_size > row.size ? errors << row : process_csv_row(res, errors, row)
          next if i.zero? || i % batch_size != 0

          DarwinCore.logger_write(@dwc.object_id,
                                  format("Ingested %s records from %s",
                                         i, name))
          next unless block_given?

          yield [res, errors]
          res = []
          errors = []
        end
      end
      yield [res, errors] if block_given?
      [res, errors]
    end

    private

    def define_csv_args
      # A backspace stands in as quote character when none is configured.
      @quote_character = "\b" if @quote_character.empty?
      { col_sep: @fields_separator, quote_char: @quote_character }
    end

    # Lower-cased class name without namespace; used in log messages.
    def name
      self.class.to_s.split("::")[-1].downcase
    end

    # Files the row under +result+ when its content is valid UTF-8, under
    # +errors+ otherwise. Accepted fields are tagged as UTF-8 in place.
    def process_csv_row(result, errors, row)
      if row.join.force_encoding("utf-8").valid_encoding?
        result << row.map { |f| f.nil? ? nil : f.force_encoding("utf-8") }
      else
        errors << row
      end
    end

    def init_attributes
      @properties = @data[:attributes]
      init_encoding
      @fields_separator = init_field_separator
      @quote_character = @properties[:fieldsEnclosedBy] || ""
      @line_separator = @properties[:linesTerminatedBy] || "\n"
      # NOTE(review): only 1 / true enable header skipping; confirm that the
      # meta.xml reader never delivers the value as the string "1".
      @ignore_headers = @properties[:ignoreHeaderLines] &&
                        [1, true].include?(@properties[:ignoreHeaderLines])
      init_file_path
      init_fields
    end

    def init_encoding
      @encoding = @properties[:encoding] || "UTF-8"
      accepted = %w[utf-8 utf8 utf-16 utf16].include?(@encoding.downcase)
      return if accepted

      raise DarwinCore::EncodingError,
            "No support for encodings other than utf-8 or utf-16 at the moment"
    end

    # Resolves the data file from :location, [:attributes][:location] or
    # [:files][:location], in that order.
    def init_file_path
      file = @data[:location] ||
             (@data[:attributes] || {})[:location] ||
             (@data[:files] || {})[:location]
      raise DarwinCore::FileNotFoundError, "No file data" unless file

      @file_path = File.join(@path, file)
    end

    # Normalizes @data[:field] to an Array (a single field may arrive as a
    # bare Hash) and extracts the attributes of every field.
    def init_fields
      raw = @data[:field]
      @data[:field] = raw.is_a?(Array) ? raw : [raw].compact
      @fields = @data[:field].map { |f| f[:attributes] }
      return unless @fields.empty?

      raise DarwinCore::InvalidArchiveError, "No data fields are found"
    end

    def init_field_separator
      res = @properties[:fieldsTerminatedBy] || ","
      res = "\t" if res == "\\t"
      res
    end

    # Counts newline bytes in fixed-size chunks, without starting a shell.
    def init_size
      count = 0
      File.open(@file_path, "rb") do |io|
        while (chunk = io.read(65_536))
          count += chunk.count("\n")
        end
      end
      count
    end
  end
end