dwc-archive 0.9.11 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rspec +2 -1
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -1
- data/.travis.yml +4 -7
- data/CHANGELOG +4 -0
- data/Gemfile +3 -1
- data/LICENSE +1 -1
- data/README.md +114 -109
- data/Rakefile +13 -36
- data/dwc-archive.gemspec +23 -19
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +124 -0
- data/lib/dwc_archive/archive.rb +60 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
- data/lib/dwc_archive/expander.rb +88 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +91 -0
- data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
- data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +57 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +90 -0
- data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
- data/spec/files/generator_eml.xml +1 -1
- data/spec/lib/classification_normalizer_spec.rb +96 -105
- data/spec/lib/core_spec.rb +43 -41
- data/spec/lib/darwin_core_spec.rb +108 -138
- data/spec/lib/generator_eml_xml_spec.rb +12 -11
- data/spec/lib/generator_meta_xml_spec.rb +12 -11
- data/spec/lib/generator_spec.rb +73 -74
- data/spec/lib/gnub_taxon_spec.rb +15 -17
- data/spec/lib/metadata_spec.rb +50 -41
- data/spec/lib/taxon_normalized_spec.rb +62 -65
- data/spec/lib/xml_reader_spec.rb +9 -12
- data/spec/spec_helper.rb +54 -51
- metadata +101 -87
- data/.rvmrc +0 -1
- data/lib/dwc-archive.rb +0 -107
- data/lib/dwc-archive/archive.rb +0 -40
- data/lib/dwc-archive/classification_normalizer.rb +0 -427
- data/lib/dwc-archive/core.rb +0 -19
- data/lib/dwc-archive/expander.rb +0 -85
- data/lib/dwc-archive/generator.rb +0 -86
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -48
- data/lib/dwc-archive/version.rb +0 -3
- data/lib/dwc-archive/xml_reader.rb +0 -80
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
|
4
|
+
# Core part of a DarwinCore Archive, built from the :core entry found under
# the first top-level key of the archive's parsed meta data.
class Core
  include DarwinCore::Ingester

  attr_reader :id

  # rubocop:disable Metrics/MethodLength

  # dwc - object whose #archive responds to #files_path and #meta.
  #
  # Raises DarwinCore::CoreFileError when the meta data has no :core entry.
  def initialize(dwc)
    @dwc = dwc
    @archive = @dwc.archive
    @path = @archive.files_path
    top_level_key = @archive.meta.keys.first
    @data = @archive.meta[top_level_key][:core]
    error_message = "Cannot find core in meta.xml, is meta.xml valid?"
    raise DarwinCore::CoreFileError, error_message unless @data

    @id = @data[:id][:attributes]
    # init_attributes is not defined in this class; presumably it comes from
    # DarwinCore::Ingester -- confirm there.
    init_attributes
  end
end
|
24
|
+
# rubocop:enable Metrics/MethodLength
|
25
|
+
end
|
@@ -1,11 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class DarwinCore
  # Root of every error raised by this library; rescue it to catch them all.
  class Error < RuntimeError; end

  # Each specific error is a direct, body-less subclass of Error.
  FileNotFoundError = Class.new(Error)

  UnpackingError = Class.new(Error)

  InvalidArchiveError = Class.new(Error)

  CoreFileError = Class.new(Error)

  ExtensionFileError = Class.new(Error)

  GeneratorError = Class.new(Error)

  ParentNotCurrentError = Class.new(Error)

  # Lives in the DarwinCore namespace, so it is distinct from Ruby's
  # top-level ::EncodingError.
  EncodingError = Class.new(Error)
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
|
4
|
+
# Unpacks a compressed archive into a directory obtained from
# DarwinCore.random_path and locates the directory that contains meta.xml.
#
# Paths ending in "tar.gz" are extracted with `tar`, paths ending in "zip"
# with `unzip` (both case-insensitive). Any other path has no unpacker, and
# #unpack raises DarwinCore::UnpackingError for it.
class Expander
  # archive_path - path of the compressed archive.
  # tmp_dir      - directory passed to DarwinCore.random_path to obtain the
  #                path the archive is unpacked into.
  def initialize(archive_path, tmp_dir)
    @archive_path = archive_path
    @tmp_dir = tmp_dir
    @dir_path = DarwinCore.random_path(tmp_dir)
    @unpacker = init_unpacker
  end

  # Extracts the archive into the unpack directory.
  #
  # Returns true on success.
  # Raises DarwinCore::FileNotFoundError when the archive file does not exist,
  # and DarwinCore::UnpackingError when no unpacker matches the file name or
  # the external command does not succeed.
  def unpack
    clean
    raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)

    # Kernel#system returns true only for a zero exit status (false for a
    # non-zero one, nil when the command cannot be run), so its result is the
    # whole success signal; no process-status global has to be consulted.
    success = @unpacker && @unpacker.call(@dir_path, @archive_path)
    return success if success

    clean
    raise DarwinCore::UnpackingError
  end

  # Directory that holds meta.xml: the unpack directory itself or one of its
  # immediate subdirectories. nil when neither contains meta.xml (a nil
  # result is not memoized, so the lookup is repeated on the next call).
  def path
    @path ||= files_path
  end

  # Hands the unpack directory to DarwinCore.clean (defined elsewhere;
  # presumably it deletes the directory -- confirm there).
  def clean
    DarwinCore.clean(@dir_path)
  end

  # Result of DarwinCore.files for the directory returned by #path.
  def files
    DarwinCore.files(path)
  end

  private

  # Chooses the extraction routine from the archive file name, or nil when
  # the name matches neither pattern.
  def init_unpacker
    return tar_unpacker if @archive_path =~ /tar.gz$/i
    return zip_unpacker if @archive_path =~ /zip$/i

    nil
  end

  # Callable that creates the target directory and extracts a gzipped tarball
  # into it.
  def tar_unpacker
    lambda do |tmp_path, archive_path|
      FileUtils.mkdir tmp_path
      run_quietly("tar", "-zxf", archive_path, "-C", tmp_path)
    end
  end

  # Callable that extracts a zip file into the target directory (the
  # directory is not created here; it is left to unzip's -d option).
  def zip_unpacker
    lambda do |tmp_path, archive_path|
      run_quietly("unzip", "-qq", "-d", tmp_path, archive_path)
    end
  end

  # Runs an external command without going through a shell, so quotes,
  # backslashes, spaces and other shell metacharacters in either path are
  # passed to the program verbatim. Output and errors are discarded.
  def run_quietly(*command)
    system(*command, out: File::NULL, err: File::NULL)
  end

  # Sorted names inside +dir+, without the "." and ".." pseudo-entries.
  # Only those two exact names are dropped, so a real entry whose name
  # merely ends with a dot is kept.
  def path_entries(dir)
    Dir.entries(dir).reject { |entry| %w[. ..].include?(entry) }.sort
  end

  # The unpack directory when meta.xml sits directly in it, otherwise the
  # first (in sorted order) immediate subdirectory containing meta.xml.
  def files_path
    entries = path_entries(@dir_path)
    entries.include?("meta.xml") ? @dir_path : search_for_file_path(entries)
  end

  # First candidate built from +entries+ that is a directory containing
  # meta.xml, or nil when there is none.
  def search_for_file_path(entries)
    candidates = entries.map { |entry| File.join(@dir_path, entry) }
    candidates.find do |candidate|
      FileTest.directory?(candidate) &&
        path_entries(candidate).include?("meta.xml")
    end
  end
end
|
88
|
+
end
|
@@ -1,8 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class DarwinCore
|
4
|
+
# Represents extensions of DarwinCore Archive
|
2
5
|
class Extension
|
3
6
|
include DarwinCore::Ingester
|
4
7
|
attr_reader :coreid
|
5
|
-
alias
|
8
|
+
alias id coreid
|
6
9
|
|
7
10
|
def initialize(dwc, data)
|
8
11
|
@dwc = dwc
|
@@ -10,8 +13,7 @@ class DarwinCore
|
|
10
13
|
@path = @archive.files_path
|
11
14
|
@data = data
|
12
15
|
@coreid = @data[:coreid][:attributes]
|
13
|
-
|
16
|
+
init_attributes
|
14
17
|
end
|
15
|
-
|
16
18
|
end
|
17
19
|
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
|
4
|
+
# Assembles a DarwinCore Archive: writes CSV files for the core and the
# extensions into a working directory, can add meta.xml and eml.xml next to
# them, and packs the directory into a gzipped tarball.
class Generator
  attr_reader :eml_xml_data, :path

  # dwc_path - path of the tarball written by #pack.
  # tmp_dir  - directory passed to DarwinCore.random_path to obtain the
  #            working directory, which is created immediately.
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
    @dwc_path = dwc_path
    @path = DarwinCore.random_path(tmp_dir)
    FileUtils.mkdir(@path)
    @meta_xml_data = { extensions: [] }
    # The key is :abstract, the name Generator::EmlXml reads when it builds
    # the <abstract> element.
    @eml_xml_data = { id: nil, title: nil, authors: [], abstract: nil,
                      citation: nil, url: nil }
    @write = 'w:utf-8'
  end

  # Hands the working directory to DarwinCore.clean (defined elsewhere;
  # presumably it deletes the directory -- confirm there).
  def clean
    DarwinCore.clean(@path)
  end

  # Writes the core CSV file and records its description for meta.xml.
  #
  # data         - array of rows; the first row is the header and every
  #                header field must start with "http://". The array is not
  #                modified.
  # file_name    - name of the CSV file inside the working directory.
  # keep_headers - when true, a header row made of the last path segment of
  #                each header URL is written as the first line of the file.
  #
  # Raises DarwinCore::GeneratorError when the header is missing or a header
  # field is not an http:// URL; no file is created in that case.
  def add_core(data, file_name, keep_headers = true)
    opts = { type: 'core', data: data, file_name: file_name,
             keep_headers: keep_headers }
    prepare_csv_file(opts)
  end

  # Writes an extension CSV file and records its description for meta.xml.
  # Arguments and errors are as for #add_core; row_type is stored as the
  # extension's rowType.
  def add_extension(data, file_name, keep_headers = true,
                    row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
    opts = { type: 'extension', data: data, file_name: file_name,
             keep_headers: keep_headers, row_type: row_type }
    prepare_csv_file(opts)
  end

  # Creates meta.xml from the core/extension descriptions collected so far.
  def add_meta_xml
    meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
    meta.create
  end

  # Replaces the stored EML data with +data+ and creates eml.xml from it.
  def add_eml_xml(data)
    @eml_xml_data = data
    eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
    eml.create
  end

  # Result of DarwinCore.files for the working directory.
  def files
    DarwinCore.files(@path)
  end

  # Packs every non-hidden entry of the working directory into the tarball
  # at the path given to the constructor.
  #
  # tar is started directly (no shell) with the working directory as its
  # current directory, so unusual characters in either path are harmless and
  # a relative tarball path is resolved against the working directory.
  # Entries are listed explicitly, sorted and without dot-files, which is
  # what a shell would expand "*" to.
  #
  # Returns the result of Kernel#system: true on success, false on a
  # non-zero exit status, nil when tar cannot be run.
  def pack
    entries = Dir.entries(@path).reject { |e| e.start_with?('.') }.sort
    system('tar', '-zcf', @dwc_path, '--', *entries, chdir: @path)
  end

  private

  # Validates the header, registers the file description and only then
  # writes the rows, so invalid input neither creates a file nor leaves a
  # file handle open. Returns nil.
  def prepare_csv_file(opts)
    attributes, rows = prepare_attributes(opts)
    if opts[:type] == 'core'
      @meta_xml_data[:core] = attributes
    else
      @meta_xml_data[:extensions] << attributes
    end
    # The block form closes the file even when writing a row raises.
    CSV.open(File.join(@path, opts[:file_name]), @write) do |csv|
      rows.each { |row| csv << row }
    end
    nil
  end

  # Splits the data into header and rows without touching the caller's
  # array. Returns [description hash for meta.xml, rows to write].
  def prepare_attributes(opts)
    header, *rows = opts[:data]
    fields = init_fields(header, opts[:type])
    rows.unshift(fields) if opts[:keep_headers]

    attributes = { fields: header,
                   ignoreHeaderLines: opts[:keep_headers] ? 1 : 0,
                   location: opts[:file_name] }
    attributes[:rowType] = opts[:row_type] if opts[:row_type]
    [attributes, rows]
  end

  # Maps each header URL to its last path segment.
  #
  # Raises DarwinCore::GeneratorError when there is no header row or a field
  # does not start with "http://".
  def init_fields(header, file_type)
    err = "No header in #{file_type} data, or header fields are not urls"
    raise DarwinCore::GeneratorError, err unless header.respond_to?(:map)

    header.map do |field|
      field = field.to_s.strip
      raise DarwinCore::GeneratorError, err unless field =~ %r{^http://}

      field.split('/')[-1]
    end
  end
end
|
91
|
+
end
|
@@ -1,41 +1,54 @@
|
|
1
1
|
class DarwinCore
|
2
2
|
class Generator
|
3
|
+
# Creates EML file with meta information about archive
|
3
4
|
class EmlXml
|
5
|
+
SCHEMA_DATA = {
|
6
|
+
:"xml:lang" => "en",
|
7
|
+
:"xmlns:eml" => "eml://ecoinformatics.org/eml-2.1.1",
|
8
|
+
:"xmlns:md" => "eml://ecoinformatics.org/methods-2.1.1",
|
9
|
+
:"xmlns:proj" => "eml://ecoinformatics.org/project-2.1.1",
|
10
|
+
:"xmlns:d" => "eml://ecoinformatics.org/dataset-2.1.1",
|
11
|
+
:"xmlns:res" => "eml://ecoinformatics.org/resource-2.1.1",
|
12
|
+
:"xmlns:dc" => "http://purl.org/dc/terms/",
|
13
|
+
:"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
|
14
|
+
:"xsi:schemaLocation" => "eml://ecoinformatics.org/eml-2.1.1 "\
|
15
|
+
"http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
|
16
|
+
}
|
4
17
|
|
5
18
|
def initialize(data, path)
|
6
19
|
@data = data
|
7
20
|
@path = path
|
8
|
-
@write =
|
21
|
+
@write = "w:utf-8"
|
9
22
|
end
|
10
23
|
|
11
24
|
def create
|
12
|
-
|
13
|
-
|
25
|
+
schema_data = {
|
26
|
+
packageId: "#{@data[:id]}/#{timestamp}",
|
27
|
+
system: @data[:system] || "http://globalnames.org"
|
28
|
+
}.merge(SCHEMA_DATA)
|
14
29
|
builder = Nokogiri::XML::Builder.new do |xml|
|
15
|
-
xml.eml(
|
16
|
-
|
17
|
-
:'xml:lang' => 'en',
|
18
|
-
:'xmlns:eml' => 'eml://ecoinformatics.org/eml-2.1.1',
|
19
|
-
:'xmlns:md' => 'eml://ecoinformatics.org/methods-2.1.1',
|
20
|
-
:'xmlns:proj' => 'eml://ecoinformatics.org/project-2.1.1',
|
21
|
-
:'xmlns:d' => 'eml://ecoinformatics.org/dataset-2.1.1',
|
22
|
-
:'xmlns:res' => 'eml://ecoinformatics.org/resource-2.1.1',
|
23
|
-
:'xmlns:dc' => 'http://purl.org/dc/terms/',
|
24
|
-
:'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
|
25
|
-
:'xsi:schemaLocation' => 'eml_uri') do
|
26
|
-
build_dataset(xml)
|
27
|
-
build_additional_metadata(xml)
|
28
|
-
xml.parent.namespace = xml.parent.namespace_definitions.first
|
30
|
+
xml.eml(schema_data) do
|
31
|
+
build_body(xml)
|
29
32
|
end
|
30
33
|
end
|
34
|
+
save_eml(builder)
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
def build_body(xml)
|
40
|
+
build_dataset(xml)
|
41
|
+
build_additional_metadata(xml)
|
42
|
+
xml.parent.namespace = xml.parent.namespace_definitions.first
|
43
|
+
end
|
44
|
+
|
45
|
+
def save_eml(builder)
|
31
46
|
data = builder.to_xml
|
32
|
-
f = open(File.join(@path,
|
47
|
+
f = open(File.join(@path, "eml.xml"), @write)
|
33
48
|
f.write(data)
|
34
49
|
f.close
|
35
50
|
end
|
36
51
|
|
37
|
-
private
|
38
|
-
|
39
52
|
def build_dataset(xml)
|
40
53
|
xml.dataset(id: @data[:id]) do
|
41
54
|
xml.title(@data[:title])
|
@@ -50,30 +63,24 @@ class DarwinCore
|
|
50
63
|
end
|
51
64
|
|
52
65
|
def build_abstract(xml)
|
53
|
-
xml.abstract()
|
54
|
-
xml.para(@data[:abstract])
|
55
|
-
end
|
66
|
+
xml.abstract { xml.para(@data[:abstract]) }
|
56
67
|
end
|
57
68
|
|
58
69
|
def build_contacts(xml, contacts)
|
59
|
-
contacts.each
|
60
|
-
xml.contact { xml.references(contact) }
|
61
|
-
end
|
70
|
+
contacts.each { |contact| xml.contact { xml.references(contact) } }
|
62
71
|
end
|
63
72
|
|
64
73
|
def build_metadata_providers(xml)
|
65
|
-
@data[:metadata_providers].
|
66
|
-
xml.metadataProvider
|
67
|
-
build_person(xml, a)
|
68
|
-
end
|
74
|
+
@data[:metadata_providers].each do |a|
|
75
|
+
xml.metadataProvider { build_person(xml, a) }
|
69
76
|
end if @data[:metadata_providers]
|
70
77
|
end
|
71
|
-
|
78
|
+
|
72
79
|
def build_authors(xml, contacts)
|
73
80
|
@data[:authors].each_with_index do |a, i|
|
74
81
|
creator_id = i + 1
|
75
82
|
contacts << creator_id
|
76
|
-
xml.creator(id: creator_id, scope:
|
83
|
+
xml.creator(id: creator_id, scope: "document") do
|
77
84
|
build_person(xml, a)
|
78
85
|
end
|
79
86
|
end
|
@@ -102,7 +109,7 @@ class DarwinCore
|
|
102
109
|
|
103
110
|
def timestamp
|
104
111
|
t = Time.now.getutc.to_a[0..5].reverse
|
105
|
-
t[0..2]
|
112
|
+
t[0..2] * ("-") + "::" + t[-3..-1] * (":")
|
106
113
|
end
|
107
114
|
end
|
108
115
|
end
|
@@ -1,42 +1,45 @@
|
|
1
1
|
class DarwinCore
|
2
2
|
class Generator
|
3
|
+
# Creates DarwinCore meta file
|
3
4
|
class MetaXml
|
4
5
|
def initialize(data, path)
|
5
6
|
@data = data
|
6
7
|
@path = path
|
7
|
-
@write =
|
8
|
+
@write = "w:utf-8"
|
8
9
|
end
|
9
10
|
|
10
11
|
def create
|
11
|
-
schema_uri =
|
12
|
-
|
12
|
+
schema_uri = "http://rs.tdwg.org/dwc/terms/xsd/archive/ "\
|
13
|
+
"http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
|
13
14
|
builder = Nokogiri::XML::Builder.new do |xml|
|
14
|
-
opts = { encoding:
|
15
|
-
|
16
|
-
|
17
|
-
linesTerminatedBy: "\n",
|
18
|
-
rowType: 'http://rs.tdwg.org/dwc/terms/Taxon' }
|
15
|
+
opts = { encoding: "UTF-8", fieldsTerminatedBy: ",",
|
16
|
+
fieldsEnclosedBy: '"', linesTerminatedBy: "\n",
|
17
|
+
rowType: "http://rs.tdwg.org/dwc/terms/Taxon" }
|
19
18
|
build_archive(xml, opts, schema_uri)
|
20
19
|
end
|
20
|
+
save_meta(builder)
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def save_meta(builder)
|
21
26
|
meta_xml_data = builder.to_xml
|
22
|
-
meta_file = open(File.join(@path,
|
27
|
+
meta_file = open(File.join(@path, "meta.xml"), @write)
|
23
28
|
meta_file.write(meta_xml_data)
|
24
29
|
meta_file.close
|
25
30
|
end
|
26
31
|
|
27
|
-
private
|
28
|
-
|
29
32
|
def build_archive(xml, opts, schema_uri)
|
30
|
-
xml.archive(xmlns:
|
31
|
-
|
32
|
-
|
33
|
+
xml.archive(xmlns: "http://rs.tdwg.org/dwc/text/",
|
34
|
+
:"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
|
35
|
+
:"xsi:schemaLocation" => schema_uri) do
|
33
36
|
build_core(xml, opts)
|
34
37
|
build_extensions(xml, opts)
|
35
38
|
end
|
36
39
|
end
|
37
40
|
|
38
41
|
def build_core(xml, opts)
|
39
|
-
xml.core(opts.merge(ignoreHeaderLines:
|
42
|
+
xml.core(opts.merge(ignoreHeaderLines:
|
40
43
|
@data[:core][:ignoreHeaderLines])) do
|
41
44
|
xml.files { xml.location(@data[:core][:location]) }
|
42
45
|
taxon_id, fields = find_taxon_id(@data[:core][:fields])
|
@@ -47,7 +50,7 @@ class DarwinCore
|
|
47
50
|
|
48
51
|
def build_extensions(xml, opts)
|
49
52
|
@data[:extensions].each do |e|
|
50
|
-
xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
|
53
|
+
xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
|
51
54
|
rowType: e[:rowType])) do
|
52
55
|
xml.files { xml.location(e[:location]) }
|
53
56
|
taxon_id, fields = find_taxon_id(e[:fields])
|
@@ -60,12 +63,10 @@ class DarwinCore
|
|
60
63
|
def find_taxon_id(data)
|
61
64
|
fields = []
|
62
65
|
data.each_with_index { |f, i| fields << [f.strip, i] }
|
63
|
-
taxon_id, fields = fields.partition { |f| f[0].match(
|
64
|
-
|
66
|
+
taxon_id, fields = fields.partition { |f| f[0].match(/\/taxonid$/i) }
|
67
|
+
fail DarwinCore::GeneratorError if taxon_id.size != 1
|
65
68
|
[taxon_id[0], fields]
|
66
69
|
end
|
67
|
-
|
68
70
|
end
|
69
71
|
end
|
70
72
|
end
|
71
|
-
|