dwc-archive 0.9.11 → 1.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rspec +2 -1
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -1
- data/.travis.yml +4 -7
- data/CHANGELOG +4 -0
- data/Gemfile +3 -1
- data/LICENSE +1 -1
- data/README.md +114 -109
- data/Rakefile +13 -36
- data/dwc-archive.gemspec +23 -19
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +124 -0
- data/lib/dwc_archive/archive.rb +60 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
- data/lib/dwc_archive/expander.rb +88 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +91 -0
- data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
- data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +57 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +90 -0
- data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +1 -1
- data/spec/lib/classification_normalizer_spec.rb +96 -105
- data/spec/lib/core_spec.rb +43 -41
- data/spec/lib/darwin_core_spec.rb +108 -138
- data/spec/lib/generator_eml_xml_spec.rb +12 -11
- data/spec/lib/generator_meta_xml_spec.rb +12 -11
- data/spec/lib/generator_spec.rb +73 -74
- data/spec/lib/gnub_taxon_spec.rb +15 -17
- data/spec/lib/metadata_spec.rb +50 -41
- data/spec/lib/taxon_normalized_spec.rb +62 -65
- data/spec/lib/xml_reader_spec.rb +9 -12
- data/spec/spec_helper.rb +54 -51
- metadata +101 -87
- data/.rvmrc +0 -1
- data/lib/dwc-archive.rb +0 -107
- data/lib/dwc-archive/archive.rb +0 -40
- data/lib/dwc-archive/classification_normalizer.rb +0 -427
- data/lib/dwc-archive/core.rb +0 -19
- data/lib/dwc-archive/expander.rb +0 -85
- data/lib/dwc-archive/generator.rb +0 -86
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -48
- data/lib/dwc-archive/version.rb +0 -3
- data/lib/dwc-archive/xml_reader.rb +0 -80
data/lib/dwc-archive/core.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
class DarwinCore
  # Describes the core data file of an archive, based on the :core section of
  # the parsed meta.xml. CSV reading and attribute parsing are mixed in from
  # DarwinCore::Ingester.
  class Core
    include DarwinCore::Ingester

    # Attributes hash of the core's <id> element, or nil when the core section
    # carries no <id> element.
    attr_reader :id

    # @param dwc [Object] must respond to #archive; the archive must respond
    #   to #files_path and #meta (a hash keyed by the meta.xml root element)
    # @raise [DarwinCore::CoreFileError] when meta.xml has no core section
    def initialize(dwc)
      @dwc = dwc
      @archive = @dwc.archive
      @path = @archive.files_path
      root = @archive.meta[@archive.meta.keys.first]
      # An empty meta hash is reported as a missing core, not a NoMethodError.
      @data = root && root[:core]
      unless @data
        raise DarwinCore::CoreFileError,
              'Cannot find core in meta.xml, is meta.xml valid?'
      end
      # A core without an <id> element used to crash here with NoMethodError.
      @id = (@data[:id] || {})[:attributes]
      get_attributes(DarwinCore::CoreFileError)
    end
  end
end
|
data/lib/dwc-archive/expander.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
class DarwinCore
  # Unpacks a compressed archive (gzipped tar or zip) into a scratch directory
  # under tmp_dir and locates the directory that contains meta.xml.
  class Expander
    # @param archive_path [String] path to the compressed archive
    # @param tmp_dir [String] directory under which a "dwc_<random>" scratch
    #   directory is used
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @path = File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
      @unpacker = get_unpacker
    end

    # Removes any previous scratch directory and unpacks the archive into it.
    #
    # @return [true] when the external unpacker exits successfully
    # @raise [DarwinCore::FileNotFoundError] when the archive does not exist
    # @raise [DarwinCore::UnpackingError] when the file type is unsupported or
    #   the unpacker fails; the scratch directory is removed first
    def unpack
      clean
      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)
      # Kernel#system returns true only for a zero exit status.
      success = @unpacker && @unpacker.call(@path, @archive_path)
      return success if success
      clean
      raise DarwinCore::UnpackingError
    end

    # @return [String, nil] directory holding meta.xml (the scratch directory
    #   or one of its immediate subdirectories); nil when none is found
    def path
      @files_path ||= files_path
    end

    # Deletes the scratch directory if it exists.
    def clean
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # @return [Array<String>, nil] sorted entry names of #path, or nil when
    #   nothing has been unpacked
    def files
      dir = path
      return nil unless dir && File.exist?(dir)
      path_entries(dir)
    end

    private

    # Picks an unpacking routine from the output of `file -z`.
    # Returns a callable taking (tmp_path, archive_path), or nil when the type
    # is not recognised.
    def get_unpacker
      case file_type
      when /tar.*gzip/i then tar_unpacker
      when /Zip/ then zip_unpacker
      end
    end

    # Output of `file -z` for the archive. The command is run without a shell
    # so the path needs no quoting; an unavailable `file` binary yields ''.
    def file_type
      IO.popen(['file', '-z', '--', @archive_path], &:read).to_s
    rescue SystemCallError
      ''
    end

    def tar_unpacker
      lambda do |tmp_path, archive_path|
        FileUtils.mkdir(tmp_path)
        system('tar', '-zxf', archive_path, '-C', tmp_path,
               out: File::NULL, err: File::NULL)
      end
    end

    def zip_unpacker
      lambda do |tmp_path, archive_path|
        system('unzip', '-qq', '-d', tmp_path, archive_path,
               out: File::NULL, err: File::NULL)
      end
    end

    # Sorted directory entries without the "." and ".." pseudo-entries.
    # Names that merely end in a dot are kept.
    def path_entries(dir)
      Dir.entries(dir).reject { |e| e == '.' || e == '..' }.sort
    end

    # Finds meta.xml in the scratch directory or one level below it.
    def files_path
      return nil unless File.directory?(@path)
      entries = path_entries(@path)
      return @path if entries.include?('meta.xml')
      entries.map { |e| File.join(@path, e) }.find do |candidate|
        File.directory?(candidate) &&
          path_entries(candidate).include?('meta.xml')
      end
    end
  end
end
|
data/lib/dwc-archive/generator.rb
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
class DarwinCore
  # Builds an archive: writes core/extension CSV files plus meta.xml and
  # eml.xml into a scratch directory, then packs them into a gzipped tarball.
  class Generator
    # eml_xml_data: hash handed to the EML writer.
    # path: scratch directory the archive files are written to.
    attr_reader :eml_xml_data, :path

    # TODO: refactor -- for now copying expander methods
    #
    # @param dwc_path [String] destination of the packed archive
    # @param tmp_dir [String] directory in which a "dwc_<random>" scratch
    #   directory is created
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = File.join(tmp_dir, "dwc_#{rand(10000000000)}")
      FileUtils.mkdir(@path)
      @meta_xml_data = { extensions: [] }
      @eml_xml_data = { id: nil, title: nil, authors: [], abstract: nil,
                        citation: nil, url: nil }
      @write = 'w:utf-8'
    end

    # Deletes the scratch directory if it exists.
    def clean
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # Writes the core CSV file and records its description for meta.xml.
    #
    # @param data [Array<Array>] rows; the first row holds term URIs
    # @param file_name [String] file name inside the archive
    # @param keep_headers [Boolean] write a header row of short term names
    # @raise [DarwinCore::GeneratorError] when the header row is missing or a
    #   header field is not an http(s) URL
    def add_core(data, file_name, keep_headers = true)
      @meta_xml_data[:core] =
        write_data_file(data, file_name, keep_headers, 'core')
      nil
    end

    # Writes an extension CSV file and records its description for meta.xml.
    #
    # @param data [Array<Array>] rows; the first row holds term URIs
    # @param file_name [String] file name inside the archive
    # @param keep_headers [Boolean] write a header row of short term names
    # @param row_type [String] rowType URI recorded for the extension
    # @raise [DarwinCore::GeneratorError] see #add_core
    def add_extension(data, file_name,
                      keep_headers = true,
                      row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
      attributes = write_data_file(data, file_name, keep_headers, 'extension')
      @meta_xml_data[:extensions] << attributes.merge(rowType: row_type)
      nil
    end

    # Creates meta.xml in the scratch directory from the recorded files.
    def add_meta_xml
      DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path).create
    end

    # Stores the given EML data and creates eml.xml in the scratch directory.
    def add_eml_xml(data)
      @eml_xml_data = data
      DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path).create
    end

    # @return [Array<String>, nil] sorted entry names of the scratch
    #   directory, or nil when it does not exist
    def files
      return nil unless @path && File.exist?(@path)
      Dir.entries(@path).reject { |e| %w[. ..].include?(e) }.sort
    end

    # Packs every non-hidden entry of the scratch directory into dwc_path.
    # tar runs inside the scratch directory without a shell, so paths with
    # spaces or quotes are safe; a relative dwc_path still resolves against
    # the scratch directory.
    #
    # @return [Boolean, nil] result of Kernel#system
    def pack
      members = Dir.entries(@path).reject { |e| e.start_with?('.') }.sort
      system('tar', '-zcf', @dwc_path, '--', *members, chdir: @path)
    end

    private

    # Writes rows to file_name and returns the meta.xml attributes for it.
    # The caller's data is not modified.
    def write_data_file(data, file_name, keep_headers, file_type)
      header, *rows = data
      header = Array(header).map(&:strip)
      raise DarwinCore::GeneratorError, header_error(file_type) if header.empty?
      fields = get_fields(header, file_type)
      rows.unshift(fields) if keep_headers
      # Block form closes the file even when a row cannot be written.
      CSV.open(File.join(@path, file_name), @write) do |csv|
        rows.each { |row| csv << row }
      end
      { fields: header,
        ignoreHeaderLines: keep_headers ? 1 : 0,
        location: file_name }
    end

    # Maps term URIs to their last path segment.
    def get_fields(header, file_type)
      header.map do |f|
        field = f.strip
        unless field.match(%r{\Ahttps?://})
          raise DarwinCore::GeneratorError, header_error(file_type)
        end
        field.split('/').last
      end
    end

    def header_error(file_type)
      "No header in #{file_type} data, or header fields are not urls"
    end
  end
end
|
data/lib/dwc-archive/ingester.rb
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
class DarwinCore
  # Shared behaviour for classes that describe one delimited data file of an
  # archive. The including class is expected to set @dwc, @path (directory of
  # the unpacked files) and @data (the file's section of the parsed meta.xml)
  # and then call #get_attributes.
  module Ingester
    attr_reader :data, :properties, :encoding
    attr_reader :file_path, :fields, :line_separator,
                :quote_character, :ignore_headers

    # @return [String] column separator used when reading the file
    def fields_separator
      # The value is stored as @field_separator; the generated reader for
      # :fields_separator never saw it and always returned nil.
      @field_separator
    end

    # @return [Integer] number of newline-terminated lines in the data file
    def size
      @size ||= get_size
    end

    # Reads the data file. Rows with fewer columns than the highest declared
    # field index, or with invalid UTF-8, are collected as errors.
    #
    # With a block, yields [rows, errors] every batch_size records and once
    # more at the end with the remainder. Without a block everything is
    # accumulated.
    #
    # @param batch_size [Integer] records per batch
    # @return [Array(Array, Array)] the last (or only) [rows, errors] pair
    def read(batch_size = 10000)
      DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
      res = []
      errors = []
      # Row indexes are 0-based. With a skipped header row, index i already
      # equals the number of records read; otherwise it is one short.
      index_fix = @ignore_headers ? 0 : 1
      @quote_character = "\b" if @quote_character.empty?
      args = { col_sep: @field_separator, quote_char: @quote_character }
      min_size = @fields.map { |f| f[:index].to_i }.max + 1
      File.open(@file_path) do |io|
        CSV.new(io, **args).each_with_index do |r, i|
          next if @ignore_headers && i == 0
          min_size > r.size ? errors << r : process_csv_row(res, errors, r)
          count = i + index_fix
          next unless (count % batch_size).zero?
          DarwinCore.logger_write(@dwc.object_id,
                                  "Ingested #{count} records from #{name}")
          next unless block_given?
          yield [res, errors]
          res = []
          errors = []
        end
      end
      yield [res, errors] if block_given?
      [res, errors]
    end

    private

    # Lower-cased last segment of the including class name, used in logs.
    def name
      self.class.to_s.split('::').last.downcase
    end

    # Appends the row to result when it is valid UTF-8, otherwise to errors.
    def process_csv_row(result, errors, row)
      str = row.join('').force_encoding('utf-8')
      if str.valid_encoding?
        result << row.map { |f| f.nil? ? nil : f.force_encoding('utf-8') }
      else
        errors << row
      end
    end

    # Populates reader state from @data. The exception argument is accepted
    # for interface compatibility; the errors raised below are fixed.
    #
    # @raise [DarwinCore::EncodingError] for encodings other than utf-8/utf-16
    # @raise [DarwinCore::FileNotFoundError] when no file location is declared
    # @raise [DarwinCore::InvalidArchiveError] when no fields are declared
    def get_attributes(exception)
      @properties = @data[:attributes]
      @encoding = @properties[:encoding] || 'UTF-8'
      err_msg = 'No support for encodings other ' \
                'than utf-8 or utf-16 at the moment'
      encodings = %w[utf-8 utf8 utf-16 utf16]
      unless encodings.include?(@encoding.downcase)
        raise DarwinCore::EncodingError, err_msg
      end
      @field_separator = get_field_separator
      @quote_character = @properties[:fieldsEnclosedBy] || ''
      @line_separator = @properties[:linesTerminatedBy] || '\n'
      @ignore_headers = [1, true].include?(@properties[:ignoreHeaderLines])
      @file_path = get_file_path
      raise DarwinCore::FileNotFoundError, 'No file data' unless @file_path
      @fields = get_fields
      if @fields.empty?
        raise DarwinCore::InvalidArchiveError, 'No data fields are found'
      end
    end

    # Full path of the data file, or nil when meta.xml declares no location.
    def get_file_path
      file = @data[:location] ||
             (@data[:attributes] || {})[:location] ||
             (@data[:files] || {})[:location]
      file && File.join(@path, file.to_s)
    end

    # Attribute hashes of the declared fields; a single field is wrapped in
    # an array and a missing field list becomes an empty array.
    def get_fields
      raw = @data[:field]
      @data[:field] = raw.is_a?(Array) ? raw : [raw].compact
      @data[:field].map { |f| f[:attributes] }
    end

    # Column separator from meta.xml; a literal backslash-t means a tab.
    def get_field_separator
      res = @properties[:fieldsTerminatedBy] || ','
      res = "\t" if res == '\\t'
      res
    end

    # Counts newline-terminated lines (what `wc -l` reports) without
    # spawning a shell.
    def get_size
      File.open(@file_path, 'rb') do |io|
        io.each_line("\n").count { |line| line.end_with?("\n") }
      end
    end
  end
end
|
101
|
-
|
data/lib/dwc-archive/metadata.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
class DarwinCore
  # Read-only accessors over the parsed eml.xml of an archive. Every accessor
  # returns nil when the requested element is absent.
  class Metadata
    # @param archive [Object, nil] object responding to #eml, which returns
    #   the parsed EML as nested hashes (or nil)
    def initialize(archive = nil)
      @archive = archive
      # The nil default used to crash on `nil.eml`.
      @metadata = archive && archive.eml
    end

    # @return [Hash, nil] the parsed EML
    def data
      @metadata
    end

    def id
      lookup(:eml, :dataset, :attributes, :id)
    end

    def package_id
      # Previously read `@metadata.data[...]`, which does not exist on the
      # hash, so the inline rescue made this always return nil.
      lookup(:eml, :attributes, :packageId)
    end

    def title
      lookup(:eml, :dataset, :title)
    end

    # @return [Array<Hash>, nil] one hash per creator with :first_name,
    #   :last_name and :email (each possibly nil); nil when the dataset has
    #   no creator element
    def authors
      creators = lookup(:eml, :dataset, :creator)
      return nil if creators.nil?
      creators = [creators] unless creators.is_a?(Array)
      creators.map do |creator|
        creator = {} unless creator.is_a?(Hash)
        person = creator[:individualName]
        person = {} unless person.is_a?(Hash)
        { first_name: person[:givenName],
          last_name: person[:surName],
          email: creator[:electronicMailAddress] }
      end
    end

    def abstract
      lookup(:eml, :dataset, :abstract)
    end

    def citation
      lookup(:eml, :additionalMetadata, :metadata, :citation)
    end

    def url
      lookup(:eml, :dataset, :distribution, :online, :url)
    end

    private

    # Walks the nested EML hash along keys; returns nil as soon as a level is
    # missing or is not a hash.
    def lookup(*keys)
      keys.inject(@metadata) do |node, key|
        return nil unless node.is_a?(Hash)
        node[key]
      end
    end
  end
end
|
data/lib/dwc-archive/xml_reader.rb
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
# USAGE: DarwinCore::XmlReader.from_xml(YOUR_XML_STRING_OR_IO)
# modified from
# http://stackoverflow.com/questions/1230741/
# convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
class DarwinCore
  # Converts an XML document into nested hashes keyed by symbolized element
  # names. Element attributes are stored under :attributes, repeated sibling
  # elements are collected into arrays, and strings that are exact integer
  # literals are converted to Integer.
  module XmlReader
    class << self
      # @param xml_io [String, IO] XML source handed to Nokogiri::XML
      # @return [Hash] single-entry hash: root element name => converted tree
      def from_xml(xml_io)
        root = Nokogiri::XML(xml_io).root
        { root.name.to_sym => xml_node_to_hash(root) }
      end

      private

      # Elements become hashes; any other node becomes its prepared content.
      def xml_node_to_hash(node)
        if node.element?
          prepare_node_element(node)
        else
          prepare(node.content.to_s)
        end
      end

      # Copies the node's attributes into result_hash[:attributes]; leaves
      # result_hash untouched when the node has none.
      def add_attributes(node, result_hash)
        return if node.attributes.empty?
        result_hash[:attributes] = {}
        node.attributes.each_value do |attribute|
          result_hash[:attributes][attribute.name.to_sym] =
            prepare(attribute.value)
        end
      end

      def prepare_node_element(node)
        result_hash = {}
        add_attributes(node, result_hash)
        return result_hash if node.children.empty?
        add_children(node, result_hash)
      end

      # Folds child nodes into result_hash. When the node's only child is a
      # text node, that text is returned INSTEAD of the hash, so attributes
      # of text-only elements are dropped.
      def add_children(node, result_hash)
        node.children.each do |child|
          result = xml_node_to_hash(child)
          key = child.name.to_sym

          if child.name == "text"
            text = handle_text(child, result)
            return text if text
          elsif result_hash[key]
            handle_child_node(child, result_hash, result)
          else
            result_hash[key] = prepare(result)
          end
        end
        result_hash
      end

      # Second and later occurrences of a child name turn the entry into an
      # array and append to it.
      def handle_child_node(child, result_hash, result)
        key = child.name.to_sym
        existing = result_hash[key]
        if existing.is_a?(Array)
          existing << prepare(result)
        else
          result_hash[key] = [existing, prepare(result)]
        end
      end

      # Text counts only when it is the sole child; text between elements
      # yields nil and is ignored by the caller.
      def handle_text(child, result)
        prepare(result) unless child.next_sibling || child.previous_sibling
      end

      # "12" => 12; "012", "1.5", "abc" and non-strings pass through as is.
      def prepare(data)
        data.is_a?(String) && data.to_i.to_s == data ? data.to_i : data
      end
    end
  end
end
|