dwc-archive 0.9.10 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rspec +2 -1
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -1
- data/.travis.yml +4 -7
- data/CHANGELOG +14 -8
- data/Gemfile +3 -1
- data/LICENSE +1 -1
- data/README.md +119 -107
- data/Rakefile +13 -36
- data/dwc-archive.gemspec +23 -19
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +124 -0
- data/lib/dwc_archive/archive.rb +60 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
- data/lib/dwc_archive/expander.rb +88 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +91 -0
- data/lib/dwc_archive/generator_eml_xml.rb +116 -0
- data/lib/dwc_archive/generator_meta_xml.rb +72 -0
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +57 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +90 -0
- data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +47 -0
- data/spec/files/generator_meta.xml +19 -0
- data/spec/lib/classification_normalizer_spec.rb +96 -105
- data/spec/lib/core_spec.rb +43 -41
- data/spec/lib/darwin_core_spec.rb +108 -138
- data/spec/lib/generator_eml_xml_spec.rb +12 -11
- data/spec/lib/generator_meta_xml_spec.rb +12 -11
- data/spec/lib/generator_spec.rb +77 -69
- data/spec/lib/gnub_taxon_spec.rb +15 -17
- data/spec/lib/metadata_spec.rb +50 -41
- data/spec/lib/taxon_normalized_spec.rb +62 -65
- data/spec/lib/xml_reader_spec.rb +9 -12
- data/spec/spec_helper.rb +54 -51
- metadata +105 -88
- data/.rvmrc +0 -1
- data/] +0 -40
- data/lib/dwc-archive.rb +0 -107
- data/lib/dwc-archive/archive.rb +0 -40
- data/lib/dwc-archive/classification_normalizer.rb +0 -428
- data/lib/dwc-archive/core.rb +0 -17
- data/lib/dwc-archive/expander.rb +0 -84
- data/lib/dwc-archive/generator.rb +0 -85
- data/lib/dwc-archive/generator_eml_xml.rb +0 -86
- data/lib/dwc-archive/generator_meta_xml.rb +0 -58
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -48
- data/lib/dwc-archive/version.rb +0 -3
- data/lib/dwc-archive/xml_reader.rb +0 -64
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Represents core of the DarwinCore Archive
  class Core
    include DarwinCore::Ingester
    attr_reader :id

    # Reads the core description out of the archive's parsed meta.xml.
    #
    # dwc - object exposing #archive; the archive must respond to
    #       #files_path and #meta (a hash keyed by the meta.xml root).
    #
    # Raises DarwinCore::CoreFileError when the meta data has no :core entry.
    def initialize(dwc)
      @dwc = dwc
      @archive = @dwc.archive
      @path = @archive.files_path
      # The core description lives under the first (root) key of the meta hash.
      root_key = @archive.meta.keys.first
      @data = @archive.meta[root_key][:core]
      message = "Cannot find core in meta.xml, is meta.xml valid?"
      raise DarwinCore::CoreFileError, message unless @data

      @id = @data[:id][:attributes]
      # init_attributes is not defined in this class; presumably it comes
      # from DarwinCore::Ingester -- verify against that module.
      init_attributes
    end
  end
end
|
@@ -1,11 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class DarwinCore
  # Root of the library's error hierarchy; rescue this to catch any
  # error class declared below.
  class Error < RuntimeError
  end

  # Raised by Expander#unpack when the archive path does not exist.
  class FileNotFoundError < Error
  end

  # Raised by Expander#unpack when no unpacker fits or unpacking fails.
  class UnpackingError < Error
  end

  # Usage is not visible in this part of the code base.
  class InvalidArchiveError < Error
  end

  # Raised by Core#initialize when meta.xml has no core section.
  class CoreFileError < Error
  end

  # Usage is not visible in this part of the code base.
  class ExtensionFileError < Error
  end

  # Raised by Generator and Generator::MetaXml on unusable input data.
  class GeneratorError < Error
  end

  # Usage is not visible in this part of the code base.
  class ParentNotCurrentError < Error
  end

  # Library-specific error; inside DarwinCore this name shadows Ruby's
  # top-level ::EncodingError.
  class EncodingError < Error
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Unpacks compressed archives into a temp directory
  class Expander
    # archive_path - path of the compressed archive (tar.gz or zip).
    # tmp_dir      - directory under which a fresh unpack directory is
    #                chosen via DarwinCore.random_path.
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @dir_path = DarwinCore.random_path(tmp_dir)
      @unpacker = init_unpacker
    end

    # Unpacks the archive into the unpack directory.
    #
    # Returns true on success.
    # Raises DarwinCore::FileNotFoundError when the archive does not exist.
    # Raises DarwinCore::UnpackingError when the archive type is not
    # recognised or the external tool fails; the unpack directory is
    # cleaned before raising.
    def unpack
      clean
      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)

      # Kernel#system returns true only for a zero exit status, and nil
      # when the command could not be started, so its result is sufficient.
      success = @unpacker&.call(@dir_path, @archive_path)
      return success if success

      clean
      raise DarwinCore::UnpackingError
    end

    # Directory that holds meta.xml: the unpack directory itself or one of
    # its immediate subdirectories; nil when neither contains meta.xml.
    def path
      @path ||= files_path
    end

    # Delegates removal of the unpack directory to DarwinCore.clean.
    def clean
      DarwinCore.clean(@dir_path)
    end

    # Delegates listing of the unpacked files to DarwinCore.files.
    def files
      DarwinCore.files(path)
    end

    private

    # Picks an unpacker from the archive file name; nil when unsupported.
    def init_unpacker
      return tar_unpacker if @archive_path.match?(/tar\.gz\z/i)
      return zip_unpacker if @archive_path.match?(/zip\z/i)

      nil
    end

    def tar_unpacker
      proc do |tmp_path, archive_path|
        FileUtils.mkdir tmp_path
        run_quietly("tar", "-zxf", archive_path, "-C", tmp_path)
      end
    end

    def zip_unpacker
      proc do |tmp_path, archive_path|
        run_quietly("unzip", "-qq", "-d", tmp_path, archive_path)
      end
    end

    # Runs an external command with its output discarded. Arguments are
    # passed as an argv list, so no shell is involved and paths containing
    # quotes, spaces or backslashes need no escaping.
    def run_quietly(*command)
      system(*command, out: File::NULL, err: File::NULL)
    end

    # Sorted directory entries without the "." and ".." pseudo-entries.
    # Names that merely end in a dot (e.g. "notes.") are kept.
    def path_entries(dir)
      Dir.entries(dir).reject { |e| %w[. ..].include?(e) }.sort
    end

    def files_path
      entries = path_entries(@dir_path)
      entries.include?("meta.xml") ? @dir_path : search_for_file_path(entries)
    end

    # First immediate subdirectory (in sorted order) containing meta.xml.
    def search_for_file_path(entries)
      entries.map { |e| File.join(@dir_path, e) }.find do |candidate|
        FileTest.directory?(candidate) &&
          path_entries(candidate).include?("meta.xml")
      end
    end
  end
end
|
@@ -1,8 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class DarwinCore
|
4
|
+
# Represents extensions of DarwinCore Archive
|
2
5
|
class Extension
|
3
6
|
include DarwinCore::Ingester
|
4
7
|
attr_reader :coreid
|
5
|
-
alias
|
8
|
+
alias id coreid
|
6
9
|
|
7
10
|
def initialize(dwc, data)
|
8
11
|
@dwc = dwc
|
@@ -10,8 +13,7 @@ class DarwinCore
|
|
10
13
|
@path = @archive.files_path
|
11
14
|
@data = data
|
12
15
|
@coreid = @data[:coreid][:attributes]
|
13
|
-
|
16
|
+
init_attributes
|
14
17
|
end
|
15
|
-
|
16
18
|
end
|
17
19
|
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Creates csv files for core and extensions
  class Generator
    attr_reader :eml_xml_data, :path

    # dwc_path - path of the tar.gz archive that #pack writes.
    # tmp_dir  - directory under which a fresh working directory is
    #            created (its name comes from DarwinCore.random_path).
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = DarwinCore.random_path(tmp_dir)
      FileUtils.mkdir(@path)
      @meta_xml_data = { extensions: [] }
      @eml_xml_data = { id: nil, title: nil, authors: [], abstract: nil,
                        citation: nil, url: nil }
      @write = 'w:utf-8'
    end

    # Delegates removal of the working directory to DarwinCore.clean.
    def clean
      DarwinCore.clean(@path)
    end

    # Writes the core csv file and records its description for meta.xml.
    #
    # data         - array of rows; the first row is the header and every
    #                header field must be an http(s) URL. Not modified.
    # file_name    - name of the csv file inside the working directory.
    # keep_headers - when true a header row of short term names is written.
    #
    # Raises DarwinCore::GeneratorError on a missing or invalid header.
    def add_core(data, file_name, keep_headers = true)
      opts = { type: 'core', data: data, file_name: file_name,
               keep_headers: keep_headers }
      prepare_csv_file(opts)
    end

    # Same as #add_core for an extension file; row_type is recorded as the
    # extension's rowType.
    def add_extension(data, file_name, keep_headers = true,
                      row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
      opts = { type: 'extension', data: data, file_name: file_name,
               keep_headers: keep_headers, row_type: row_type }
      prepare_csv_file(opts)
    end

    # Creates meta.xml from the files added so far.
    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    # Stores data as the current EML description and creates eml.xml.
    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    # Delegates listing of the working directory to DarwinCore.files.
    def files
      DarwinCore.files(@path)
    end

    # Packs every non-hidden entry of the working directory into a
    # gzip-compressed tar archive at dwc_path.
    #
    # Returns true on success; false (or nil when tar cannot be started)
    # otherwise, including when there is nothing to pack.
    def pack
      return false unless File.directory?(@path)

      # Same selection a shell "*" would make: sorted, no dot-files.
      entries = Dir.entries(@path).reject { |e| e.start_with?(".") }.sort
      return false if entries.empty?

      # argv form avoids the shell, so paths with spaces or quotes are
      # safe. chdir keeps a relative dwc_path resolved against the working
      # directory, as the former "cd <path>; tar ..." command did.
      system("tar", "-zcf", @dwc_path.to_s, "--", *entries, chdir: @path.to_s)
    end

    private

    # Validates the header, registers the file description for meta.xml
    # and writes the rows. The header is validated before the file is
    # opened, so invalid input leaves no empty csv file behind.
    def prepare_csv_file(opts)
      attributes, rows = prepare_attributes(opts)
      if opts[:type] == 'core'
        @meta_xml_data[:core] = attributes
      else
        @meta_xml_data[:extensions] << attributes
      end
      # Block form closes the file even if writing a row raises.
      CSV.open(File.join(@path, opts[:file_name]), @write) do |csv|
        rows.each { |row| csv << row }
      end
      nil
    end

    # Returns [description hash for meta.xml, rows to write] without
    # touching the caller's data array.
    def prepare_attributes(opts)
      header, *rows = Array(opts[:data])
      fields = init_fields(header, opts[:type])
      rows.unshift(fields) if opts[:keep_headers]
      ignore_header_lines = opts[:keep_headers] ? 1 : 0

      res = { fields: header, ignoreHeaderLines: ignore_header_lines,
              location: opts[:file_name] }
      res[:rowType] = opts[:row_type] if opts[:row_type]
      [res, rows]
    end

    # Converts header URLs to short term names (their last path segment).
    def init_fields(header, file_type)
      err = "No header in #{file_type} data, or header fields are not urls"
      raise DarwinCore::GeneratorError, err unless header.respond_to?(:map)

      header.map do |f|
        f = f.to_s.strip
        raise DarwinCore::GeneratorError, err unless f.match?(%r{\Ahttps?://})

        f.split('/')[-1]
      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
class DarwinCore
  class Generator
    # Creates EML file with meta information about archive
    class EmlXml
      # Namespace and schema attributes placed on the root <eml> element.
      SCHEMA_DATA = {
        :"xml:lang" => "en",
        :"xmlns:eml" => "eml://ecoinformatics.org/eml-2.1.1",
        :"xmlns:md" => "eml://ecoinformatics.org/methods-2.1.1",
        :"xmlns:proj" => "eml://ecoinformatics.org/project-2.1.1",
        :"xmlns:d" => "eml://ecoinformatics.org/dataset-2.1.1",
        :"xmlns:res" => "eml://ecoinformatics.org/resource-2.1.1",
        :"xmlns:dc" => "http://purl.org/dc/terms/",
        :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
        :"xsi:schemaLocation" => "eml://ecoinformatics.org/eml-2.1.1 "\
          "http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
      }.freeze

      # data - hash read with the keys :id, :title, :license, :authors,
      #        :metadata_providers, :abstract, :citation, :logo_url, :system.
      # path - directory in which eml.xml is written.
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the EML document and writes it to eml.xml. Returns nil.
      def create
        schema_data = {
          packageId: "#{@data[:id]}/#{timestamp}",
          system: @data[:system] || "http://globalnames.org"
        }.merge(SCHEMA_DATA)
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.eml(schema_data) do
            build_body(xml)
          end
        end
        save_eml(builder)
      end

      private

      def build_body(xml)
        build_dataset(xml)
        build_additional_metadata(xml)
        # Assign the first declared namespace to the root element.
        xml.parent.namespace = xml.parent.namespace_definitions.first
      end

      # Serialises the document first, then writes it; the block form
      # closes the file even when the write fails.
      def save_eml(builder)
        data = builder.to_xml
        File.open(File.join(@path, "eml.xml"), @write) { |f| f.write(data) }
        nil
      end

      def build_dataset(xml)
        xml.dataset(id: @data[:id]) do
          xml.title(@data[:title])
          xml.license(@data[:license])
          contacts = []
          build_authors(xml, contacts)
          build_metadata_providers(xml)
          xml.pubDate(Time.now.to_s)
          build_abstract(xml)
          build_contacts(xml, contacts)
        end
      end

      def build_abstract(xml)
        xml.abstract { xml.para(@data[:abstract]) }
      end

      # Emits one <contact> per creator id collected by build_authors.
      def build_contacts(xml, contacts)
        contacts.each { |contact| xml.contact { xml.references(contact) } }
      end

      def build_metadata_providers(xml)
        (@data[:metadata_providers] || []).each do |provider|
          xml.metadataProvider { build_person(xml, provider) }
        end
      end

      # Emits one <creator> per author, numbering them from 1, and appends
      # each number to contacts. A missing :authors entry yields no
      # creators, matching how missing :metadata_providers is handled.
      def build_authors(xml, contacts)
        (@data[:authors] || []).each_with_index do |author, i|
          creator_id = i + 1
          contacts << creator_id
          xml.creator(id: creator_id, scope: "document") do
            build_person(xml, author)
          end
        end
      end

      def build_additional_metadata(xml)
        xml.additionalMetadata do
          xml.metadata do
            xml.citation(@data[:citation])
            xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
          end
        end
      end

      # data - hash read with the keys :first_name, :last_name, :email and
      #        the optional :organization, :position, :url.
      def build_person(xml, data)
        xml.individualName do
          xml.givenName(data[:first_name])
          xml.surName(data[:last_name])
        end
        xml.organizationName(data[:organization]) if data[:organization]
        xml.positionName(data[:position]) if data[:position]
        xml.onlineUrl(data[:url]) if data[:url]
        xml.electronicMailAddress(data[:email])
      end

      # Current UTC time as "year-month-day::hour:min:sec"; the numbers are
      # not zero-padded.
      def timestamp
        now = Time.now.getutc
        date = [now.year, now.month, now.day].join("-")
        time = [now.hour, now.min, now.sec].join(":")
        "#{date}::#{time}"
      end
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
class DarwinCore
  class Generator
    # Creates DarwinCore meta file
    class MetaXml
      # data - hash with :core (a hash) and :extensions (an array of
      #        hashes); each hash is read with the keys :fields, :location,
      #        :ignoreHeaderLines and, for extensions, :rowType.
      # path - directory in which meta.xml is written.
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the archive description and writes it to meta.xml.
      # Returns nil.
      #
      # Raises DarwinCore::GeneratorError when core data are missing or a
      # file does not have exactly one taxonID field.
      def create
        schema_uri = "http://rs.tdwg.org/dwc/terms/xsd/archive/ "\
          "http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
        builder = Nokogiri::XML::Builder.new do |xml|
          opts = { encoding: "UTF-8", fieldsTerminatedBy: ",",
                   fieldsEnclosedBy: '"', linesTerminatedBy: "\n",
                   rowType: "http://rs.tdwg.org/dwc/terms/Taxon" }
          build_archive(xml, opts, schema_uri)
        end
        save_meta(builder)
      end

      private

      # Serialises the document first, then writes it; the block form
      # closes the file even when the write fails.
      def save_meta(builder)
        meta_xml_data = builder.to_xml
        File.open(File.join(@path, "meta.xml"), @write) do |meta_file|
          meta_file.write(meta_xml_data)
        end
        nil
      end

      def build_archive(xml, opts, schema_uri)
        xml.archive(xmlns: "http://rs.tdwg.org/dwc/text/",
                    :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
                    :"xsi:schemaLocation" => schema_uri) do
          build_core(xml, opts)
          build_extensions(xml, opts)
        end
      end

      def build_core(xml, opts)
        core = @data[:core]
        unless core
          raise DarwinCore::GeneratorError,
                "Cannot create meta.xml: core data are missing"
        end

        xml.core(opts.merge(ignoreHeaderLines: core[:ignoreHeaderLines])) do
          xml.files { xml.location(core[:location]) }
          taxon_id, fields = find_taxon_id(core[:fields])
          # id_ is Nokogiri's spelling for an element literally named "id".
          xml.id_(index: taxon_id[1])
          fields.each { |f| xml.field(term: f[0], index: f[1]) }
        end
      end

      def build_extensions(xml, opts)
        @data[:extensions].each do |e|
          xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
                                   rowType: e[:rowType])) do
            xml.files { xml.location(e[:location]) }
            taxon_id, fields = find_taxon_id(e[:fields])
            xml.coreid(index: taxon_id[1])
            fields.each { |f| xml.field(term: f[0], index: f[1]) }
          end
        end
      end

      # Splits header terms into the taxonID column and the rest.
      #
      # data - array of term strings (surrounding whitespace is stripped).
      #
      # Returns [[taxon_id_term, index], [[term, index], ...]].
      # Raises DarwinCore::GeneratorError unless exactly one term ends in
      # "/taxonid" (case-insensitive).
      def find_taxon_id(data)
        indexed = data.each_with_index.map { |f, i| [f.strip, i] }
        taxon_id, fields = indexed.partition { |f| f[0].match?(%r{/taxonid$}i) }
        unless taxon_id.size == 1
          raise DarwinCore::GeneratorError,
                "Expected exactly one taxonID field, found #{taxon_id.size}"
        end

        [taxon_id[0], fields]
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Covers special case of Global Names Usage Bank data
  class GnubTaxon < TaxonNormalized
    attr_accessor :uuid
    attr_accessor :uuid_path

    # Runs the TaxonNormalized initializer, then starts with no uuid and
    # an empty uuid path.
    def initialize
      super()
      @uuid_path = []
      @uuid = nil
    end
  end
end
|