dwc-archive 0.9.11 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rspec +2 -1
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -1
- data/.travis.yml +4 -7
- data/CHANGELOG +4 -0
- data/Gemfile +3 -1
- data/LICENSE +1 -1
- data/README.md +114 -109
- data/Rakefile +13 -36
- data/dwc-archive.gemspec +23 -19
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +124 -0
- data/lib/dwc_archive/archive.rb +60 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
- data/lib/dwc_archive/expander.rb +88 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +91 -0
- data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
- data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +57 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +90 -0
- data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
- data/spec/files/generator_eml.xml +1 -1
- data/spec/lib/classification_normalizer_spec.rb +96 -105
- data/spec/lib/core_spec.rb +43 -41
- data/spec/lib/darwin_core_spec.rb +108 -138
- data/spec/lib/generator_eml_xml_spec.rb +12 -11
- data/spec/lib/generator_meta_xml_spec.rb +12 -11
- data/spec/lib/generator_spec.rb +73 -74
- data/spec/lib/gnub_taxon_spec.rb +15 -17
- data/spec/lib/metadata_spec.rb +50 -41
- data/spec/lib/taxon_normalized_spec.rb +62 -65
- data/spec/lib/xml_reader_spec.rb +9 -12
- data/spec/spec_helper.rb +54 -51
- metadata +101 -87
- data/.rvmrc +0 -1
- data/lib/dwc-archive.rb +0 -107
- data/lib/dwc-archive/archive.rb +0 -40
- data/lib/dwc-archive/classification_normalizer.rb +0 -427
- data/lib/dwc-archive/core.rb +0 -19
- data/lib/dwc-archive/expander.rb +0 -85
- data/lib/dwc-archive/generator.rb +0 -86
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -48
- data/lib/dwc-archive/version.rb +0 -3
- data/lib/dwc-archive/xml_reader.rb +0 -80
data/lib/dwc-archive/core.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
class DarwinCore
|
2
|
-
class Core
|
3
|
-
include DarwinCore::Ingester
|
4
|
-
attr_reader :id
|
5
|
-
|
6
|
-
def initialize(dwc)
|
7
|
-
@dwc = dwc
|
8
|
-
@archive = @dwc.archive
|
9
|
-
@path = @archive.files_path
|
10
|
-
root_key = @archive.meta.keys[0]
|
11
|
-
@data = @archive.meta[root_key][:core]
|
12
|
-
raise DarwinCore::CoreFileError.
|
13
|
-
new('Cannot find core in meta.xml, is meta.xml valid?') unless @data
|
14
|
-
@id = @data[:id][:attributes]
|
15
|
-
get_attributes(DarwinCore::CoreFileError)
|
16
|
-
end
|
17
|
-
|
18
|
-
end
|
19
|
-
end
|
data/lib/dwc-archive/expander.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
class DarwinCore
|
2
|
-
class Expander
|
3
|
-
|
4
|
-
def initialize(archive_path, tmp_dir)
|
5
|
-
@archive_path = archive_path
|
6
|
-
@tmp_dir = tmp_dir
|
7
|
-
@path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
|
8
|
-
@unpacker = get_unpacker
|
9
|
-
end
|
10
|
-
|
11
|
-
def unpack
|
12
|
-
clean
|
13
|
-
raise DarwinCore::FileNotFoundError unless File.exists?(@archive_path)
|
14
|
-
success = @unpacker.call(@path, @archive_path) if @unpacker
|
15
|
-
(@unpacker && success && $?.exitstatus == 0) ?
|
16
|
-
success :
|
17
|
-
(clean; raise DarwinCore::UnpackingError)
|
18
|
-
end
|
19
|
-
|
20
|
-
def path
|
21
|
-
@files_path ||= files_path
|
22
|
-
end
|
23
|
-
|
24
|
-
def clean
|
25
|
-
FileUtils.rm_rf(@path) if FileTest.exists?(@path)
|
26
|
-
end
|
27
|
-
|
28
|
-
def files
|
29
|
-
return nil unless path && FileTest.exists?(path)
|
30
|
-
Dir.entries(path).select {|e| e !~ /[\.]{1,2}$/}.sort
|
31
|
-
end
|
32
|
-
|
33
|
-
private
|
34
|
-
|
35
|
-
def esc(a_str)
|
36
|
-
"'" + a_str.gsub(92.chr, '\\\\\\').gsub("'", "\\\\'") + "'"
|
37
|
-
end
|
38
|
-
|
39
|
-
def get_unpacker
|
40
|
-
file_command = IO.popen("file -z " + esc(@archive_path))
|
41
|
-
file_type = file_command.read
|
42
|
-
file_command.close
|
43
|
-
|
44
|
-
if file_type.match(/tar.*gzip/i)
|
45
|
-
return proc do |tmp_path, archive_path|
|
46
|
-
FileUtils.mkdir tmp_path
|
47
|
-
path = esc(archive_path)
|
48
|
-
system("tar -zxf #{path} -C #{tmp_path} > /dev/null 2>&1")
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
if file_type.match(/Zip/)
|
53
|
-
return proc do |tmp_path, archive_path|
|
54
|
-
path = esc(archive_path)
|
55
|
-
system("unzip -qq -d #{tmp_path} #{path} > /dev/null 2>&1")
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
return nil
|
60
|
-
end
|
61
|
-
|
62
|
-
def path_entries(dir)
|
63
|
-
Dir.entries(dir).select {|e| e !~ /[\.]{1,2}$/}.sort
|
64
|
-
end
|
65
|
-
|
66
|
-
def files_path
|
67
|
-
res = nil
|
68
|
-
entries = path_entries(@path)
|
69
|
-
if entries.include?('meta.xml')
|
70
|
-
res = @path
|
71
|
-
else
|
72
|
-
entries.each do |e|
|
73
|
-
check_path = File.join(@path, e)
|
74
|
-
if FileTest.directory?(check_path)
|
75
|
-
if path_entries(check_path).include?('meta.xml')
|
76
|
-
res = check_path
|
77
|
-
break
|
78
|
-
end
|
79
|
-
end
|
80
|
-
end
|
81
|
-
end
|
82
|
-
res
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
data/lib/dwc-archive/generator.rb
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
class DarwinCore
|
2
|
-
class Generator
|
3
|
-
attr_reader :eml_xml_data
|
4
|
-
|
5
|
-
#TODO refactor -- for now copying expander methods
|
6
|
-
def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
|
7
|
-
@dwc_path = dwc_path
|
8
|
-
@path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
|
9
|
-
FileUtils.mkdir(@path)
|
10
|
-
@meta_xml_data = {:extensions => []}
|
11
|
-
@eml_xml_data = {:id => nil, :title => nil,
|
12
|
-
:authors => [], :abstract => nil, :citation => nil, :url => nil}
|
13
|
-
@write = 'w:utf-8'
|
14
|
-
end
|
15
|
-
|
16
|
-
#TODO refactor!
|
17
|
-
def clean
|
18
|
-
FileUtils.rm_rf(@path) if FileTest.exists?(@path)
|
19
|
-
end
|
20
|
-
|
21
|
-
def add_core(data, file_name, keep_headers = true)
|
22
|
-
c = CSV.open(File.join(@path,file_name), @write)
|
23
|
-
header = data.shift
|
24
|
-
fields = get_fields(header, 'core')
|
25
|
-
data.unshift(fields) if keep_headers
|
26
|
-
ignore_header_lines = keep_headers ? 1 : 0
|
27
|
-
@meta_xml_data[:core] = { fields: header,
|
28
|
-
ignoreHeaderLines: ignore_header_lines,
|
29
|
-
location:file_name }
|
30
|
-
data.each {|d| c << d}
|
31
|
-
c.close
|
32
|
-
end
|
33
|
-
|
34
|
-
def add_extension(data, file_name,
|
35
|
-
keep_headers = true,
|
36
|
-
row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
|
37
|
-
c = CSV.open(File.join(@path,file_name), @write)
|
38
|
-
header = data.shift
|
39
|
-
fields = get_fields(header, 'extension')
|
40
|
-
data.unshift(fields) if keep_headers
|
41
|
-
ignore_header_lines = keep_headers ? 1 : 0
|
42
|
-
@meta_xml_data[:extensions] << { fields: header,
|
43
|
-
ignoreHeaderLines: ignore_header_lines,
|
44
|
-
location: file_name,
|
45
|
-
rowType: row_type }
|
46
|
-
data.each { |d| c << d }
|
47
|
-
c.close
|
48
|
-
end
|
49
|
-
|
50
|
-
def add_meta_xml
|
51
|
-
meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
|
52
|
-
meta.create
|
53
|
-
end
|
54
|
-
|
55
|
-
def add_eml_xml(data)
|
56
|
-
@eml_xml_data = data
|
57
|
-
eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
|
58
|
-
eml.create
|
59
|
-
end
|
60
|
-
|
61
|
-
def path
|
62
|
-
@path
|
63
|
-
end
|
64
|
-
|
65
|
-
def files
|
66
|
-
return nil unless @path && FileTest.exists?(@path)
|
67
|
-
Dir.entries(@path).select {|e| e !~ /[\.]{1,2}$/}.sort
|
68
|
-
end
|
69
|
-
|
70
|
-
def pack
|
71
|
-
a = "cd #{@path}; tar -zcf #{@dwc_path} *"
|
72
|
-
system(a)
|
73
|
-
end
|
74
|
-
|
75
|
-
private
|
76
|
-
|
77
|
-
def get_fields(header, file_type)
|
78
|
-
header.map do |f|
|
79
|
-
f.strip!
|
80
|
-
err = "No header in %s data, or header fields are not urls" % file_type
|
81
|
-
raise DarwinCore::GeneratorError.new(err) unless f.match(/^http:\/\//)
|
82
|
-
f.split('/')[-1]
|
83
|
-
end
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
data/lib/dwc-archive/ingester.rb
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
class DarwinCore
|
3
|
-
module Ingester
|
4
|
-
attr_reader :data, :properties, :encoding, :fields_separator, :size
|
5
|
-
attr_reader :file_path, :fields, :line_separator,
|
6
|
-
:quote_character, :ignore_headers
|
7
|
-
|
8
|
-
def size
|
9
|
-
@size ||= get_size
|
10
|
-
end
|
11
|
-
|
12
|
-
def read(batch_size = 10000)
|
13
|
-
DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
|
14
|
-
res = []
|
15
|
-
errors = []
|
16
|
-
index_fix = 1
|
17
|
-
args = {:col_sep => @field_separator}
|
18
|
-
@quote_character = "\b" if @quote_character.empty?
|
19
|
-
args.merge!({:quote_char => @quote_character})
|
20
|
-
min_size = @fields.map {|f| f[:index].to_i || 0}.sort[-1] + 1
|
21
|
-
csv = CSV.new(open(@file_path), args)
|
22
|
-
csv.each_with_index do |r, i|
|
23
|
-
index_fix = 0; next if @ignore_headers && i == 0
|
24
|
-
min_size > r.size ? errors << r : process_csv_row(res, errors, r)
|
25
|
-
if (i + index_fix) % batch_size == 0
|
26
|
-
DarwinCore.logger_write(@dwc.object_id,
|
27
|
-
"Ingested %s records from %s" %
|
28
|
-
[(i + index_fix), name])
|
29
|
-
if block_given?
|
30
|
-
yield [res, errors]
|
31
|
-
res = []
|
32
|
-
errors = []
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
yield [res, errors] if block_given?
|
37
|
-
[res, errors]
|
38
|
-
end
|
39
|
-
|
40
|
-
private
|
41
|
-
def name
|
42
|
-
self.class.to_s.split('::')[-1].downcase
|
43
|
-
end
|
44
|
-
|
45
|
-
def process_csv_row(result, errors, row)
|
46
|
-
str = row.join('')
|
47
|
-
str = str.force_encoding('utf-8')
|
48
|
-
if str.encoding.name == 'UTF-8' && str.valid_encoding?
|
49
|
-
result << row.map { |f| f.nil? ? nil : f.force_encoding('utf-8') }
|
50
|
-
else
|
51
|
-
errors << row
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
def get_attributes(exception)
|
56
|
-
@properties = @data[:attributes]
|
57
|
-
@encoding = @properties[:encoding] || 'UTF-8'
|
58
|
-
err_msg = 'No support for encodings other ' +
|
59
|
-
'than utf-8 or utf-16 at the moment'
|
60
|
-
encodings = ['utf-8', 'utf8', 'utf-16', 'utf16']
|
61
|
-
unless encodings.include? @encoding.downcase
|
62
|
-
raise DarwinCore::EncodingError.new(err_msg)
|
63
|
-
end
|
64
|
-
@field_separator = get_field_separator
|
65
|
-
@quote_character = @properties[:fieldsEnclosedBy] || ''
|
66
|
-
@line_separator = @properties[:linesTerminatedBy] || '\n'
|
67
|
-
@ignore_headers = @properties[:ignoreHeaderLines] ?
|
68
|
-
[1, true].include?(@properties[:ignoreHeaderLines]) :
|
69
|
-
false
|
70
|
-
@file_path = get_file_path
|
71
|
-
raise DarwinCore::FileNotFoundError.new("No file data") unless @file_path
|
72
|
-
@fields = get_fields
|
73
|
-
if @fields.empty?
|
74
|
-
raise DarwinCore::InvalidArchiveError.new("No data fields are found")
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
|
-
def get_file_path
|
79
|
-
file = @data[:location] ||
|
80
|
-
@data[:attributes][:location] ||
|
81
|
-
@data[:files][:location]
|
82
|
-
File.join(@path, file)
|
83
|
-
end
|
84
|
-
|
85
|
-
def get_fields
|
86
|
-
@data[:field] = [data[:field]] if data[:field].class != Array
|
87
|
-
@data[:field].map {|f| f[:attributes]}
|
88
|
-
end
|
89
|
-
|
90
|
-
def get_field_separator
|
91
|
-
res = @properties[:fieldsTerminatedBy] || ','
|
92
|
-
res = "\t" if res == "\\t"
|
93
|
-
res
|
94
|
-
end
|
95
|
-
|
96
|
-
def get_size
|
97
|
-
`wc -l #{@file_path}`.match(/^\s*([\d]+)\s/)[1].to_i
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
data/lib/dwc-archive/metadata.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
class DarwinCore
|
2
|
-
class Metadata
|
3
|
-
def initialize(archive = nil)
|
4
|
-
@archive = archive
|
5
|
-
@metadata = @archive.eml
|
6
|
-
end
|
7
|
-
|
8
|
-
def data
|
9
|
-
@metadata
|
10
|
-
end
|
11
|
-
|
12
|
-
def id
|
13
|
-
@metadata[:eml][:dataset][:attributes][:id] rescue nil
|
14
|
-
end
|
15
|
-
|
16
|
-
def package_id
|
17
|
-
@metadata.data[:eml][:attributes][:packageId] rescue nil
|
18
|
-
end
|
19
|
-
|
20
|
-
def title
|
21
|
-
@metadata[:eml][:dataset][:title] rescue nil
|
22
|
-
end
|
23
|
-
|
24
|
-
def authors
|
25
|
-
return nil unless defined?(@metadata[:eml][:dataset][:creator])
|
26
|
-
@metadata[:eml][:dataset][:creator] =
|
27
|
-
[@metadata[:eml][:dataset][:creator]] unless
|
28
|
-
@metadata[:eml][:dataset][:creator].class == Array
|
29
|
-
@metadata[:eml][:dataset][:creator].map do |c|
|
30
|
-
{ first_name: c[:individualName][:givenName],
|
31
|
-
last_name: c[:individualName][:surName],
|
32
|
-
email: c[:electronicMailAddress] }
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
def abstract
|
37
|
-
@metadata[:eml][:dataset][:abstract] rescue nil
|
38
|
-
end
|
39
|
-
|
40
|
-
def citation
|
41
|
-
@metadata[:eml][:additionalMetadata][:metadata][:citation] rescue nil
|
42
|
-
end
|
43
|
-
|
44
|
-
def url
|
45
|
-
@metadata[:eml][:dataset][:distribution][:online][:url] rescue nil
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
data/lib/dwc-archive/version.rb
DELETED
data/lib/dwc-archive/xml_reader.rb
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
# USAGE: Hash.from_xml:(YOUR_XML_STRING)
|
2
|
-
# modified from
|
3
|
-
# http://stackoverflow.com/questions/1230741/
|
4
|
-
# convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
|
5
|
-
class DarwinCore
|
6
|
-
module XmlReader
|
7
|
-
class << self
|
8
|
-
|
9
|
-
def from_xml(xml_io)
|
10
|
-
result = Nokogiri::XML(xml_io)
|
11
|
-
return { result.root.name.to_sym => xml_node_to_hash(result.root)}
|
12
|
-
end
|
13
|
-
|
14
|
-
private
|
15
|
-
def xml_node_to_hash(node)
|
16
|
-
# If we are at the root of the document, start the hash
|
17
|
-
if node.element?
|
18
|
-
prepare_node_element(node)
|
19
|
-
else
|
20
|
-
return prepare(node.content.to_s)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def add_attributes(node, result_hash)
|
25
|
-
if node.attributes != {}
|
26
|
-
result_hash[:attributes] = {}
|
27
|
-
node.attributes.keys.each do |key|
|
28
|
-
result_hash[:attributes][node.attributes[key].name.to_sym] =
|
29
|
-
prepare(node.attributes[key].value)
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def prepare_node_element(node)
|
35
|
-
result_hash = {}
|
36
|
-
add_attributes(node, result_hash)
|
37
|
-
if node.children.size > 0
|
38
|
-
result_hash = add_children(node, result_hash)
|
39
|
-
end
|
40
|
-
result_hash
|
41
|
-
end
|
42
|
-
|
43
|
-
def add_children(node, result_hash)
|
44
|
-
node.children.each do |child|
|
45
|
-
result = xml_node_to_hash(child)
|
46
|
-
|
47
|
-
if child.name == "text"
|
48
|
-
text = handle_text(child, result)
|
49
|
-
return text if text
|
50
|
-
elsif result_hash[child.name.to_sym]
|
51
|
-
handle_child_node(child, result_hash, result)
|
52
|
-
else
|
53
|
-
result_hash[child.name.to_sym] = prepare(result)
|
54
|
-
end
|
55
|
-
end
|
56
|
-
result_hash
|
57
|
-
end
|
58
|
-
|
59
|
-
def handle_child_node(child, result_hash, result)
|
60
|
-
if result_hash[child.name.to_sym].is_a?(Object::Array)
|
61
|
-
result_hash[child.name.to_sym] << prepare(result)
|
62
|
-
else
|
63
|
-
result_hash[child.name.to_sym] =
|
64
|
-
[result_hash[child.name.to_sym]] << prepare(result)
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
def handle_text(child, result)
|
69
|
-
unless child.next_sibling || child.previous_sibling
|
70
|
-
prepare(result)
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
def prepare(data)
|
75
|
-
(data.class == String && data.to_i.to_s == data) ? data.to_i : data
|
76
|
-
end
|
77
|
-
|
78
|
-
end
|
79
|
-
end
|
80
|
-
end
|