dwc-archive 0.9.11 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -1
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +4 -7
  7. data/CHANGELOG +4 -0
  8. data/Gemfile +3 -1
  9. data/LICENSE +1 -1
  10. data/README.md +114 -109
  11. data/Rakefile +13 -36
  12. data/dwc-archive.gemspec +23 -19
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +124 -0
  17. data/lib/dwc_archive/archive.rb +60 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
  21. data/lib/dwc_archive/expander.rb +88 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +91 -0
  24. data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
  25. data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +57 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +90 -0
  32. data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
  33. data/spec/files/generator_eml.xml +1 -1
  34. data/spec/lib/classification_normalizer_spec.rb +96 -105
  35. data/spec/lib/core_spec.rb +43 -41
  36. data/spec/lib/darwin_core_spec.rb +108 -138
  37. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  38. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  39. data/spec/lib/generator_spec.rb +73 -74
  40. data/spec/lib/gnub_taxon_spec.rb +15 -17
  41. data/spec/lib/metadata_spec.rb +50 -41
  42. data/spec/lib/taxon_normalized_spec.rb +62 -65
  43. data/spec/lib/xml_reader_spec.rb +9 -12
  44. data/spec/spec_helper.rb +54 -51
  45. metadata +101 -87
  46. data/.rvmrc +0 -1
  47. data/lib/dwc-archive.rb +0 -107
  48. data/lib/dwc-archive/archive.rb +0 -40
  49. data/lib/dwc-archive/classification_normalizer.rb +0 -427
  50. data/lib/dwc-archive/core.rb +0 -19
  51. data/lib/dwc-archive/expander.rb +0 -85
  52. data/lib/dwc-archive/generator.rb +0 -86
  53. data/lib/dwc-archive/ingester.rb +0 -101
  54. data/lib/dwc-archive/metadata.rb +0 -48
  55. data/lib/dwc-archive/version.rb +0 -3
  56. data/lib/dwc-archive/xml_reader.rb +0 -80
@@ -1,19 +0,0 @@
1
class DarwinCore
  # Core data file of a Darwin Core Archive, as described by the :core
  # section of the parsed meta.xml. Reading of the data itself comes from
  # the DarwinCore::Ingester mixin.
  class Core
    include DarwinCore::Ingester

    # Attributes hash of the core :id entry, or nil when meta.xml has none.
    attr_reader :id

    # dwc - object that responds to #archive; the archive must respond to
    #       #files_path and #meta (the hash built from meta.xml).
    #
    # Raises DarwinCore::CoreFileError when no core section can be found.
    def initialize(dwc)
      @dwc = dwc
      @archive = @dwc.archive
      @path = @archive.files_path
      @data = find_core(@archive.meta)
      unless @data
        raise DarwinCore::CoreFileError,
              'Cannot find core in meta.xml, is meta.xml valid?'
      end
      # The id entry may be missing; leave @id as nil in that case
      # instead of failing with NoMethodError.
      id_node = @data[:id]
      @id = id_node[:attributes] if id_node.is_a?(Hash)
      get_attributes(DarwinCore::CoreFileError)
    end

    private

    # Returns the :core section found under the first key of the meta hash,
    # or nil when the hash is missing, empty or not shaped as expected.
    def find_core(meta)
      return nil unless meta.is_a?(Hash) && !meta.empty?
      root = meta[meta.keys.first]
      root.is_a?(Hash) ? root[:core] : nil
    end
  end
end
@@ -1,85 +0,0 @@
1
class DarwinCore
  # Unpacks a compressed archive (gzipped tar or zip) into a scratch
  # directory under tmp_dir and locates the directory that holds meta.xml.
  class Expander
    # archive_path - path to the compressed archive
    # tmp_dir      - existing directory in which the scratch directory
    #                is created on #unpack
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
      @unpacker = get_unpacker
    end

    # Extracts the archive into the scratch directory.
    #
    # Returns true on success.
    # Raises DarwinCore::FileNotFoundError when the archive does not exist
    # and DarwinCore::UnpackingError when the format is not recognised or
    # the extraction command fails.
    def unpack
      clean
      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)
      # Kernel#system returns true only for a zero exit status, so its
      # result alone tells whether the extraction worked.
      success = @unpacker && @unpacker.call(@path, @archive_path)
      return success if success
      clean
      raise DarwinCore::UnpackingError
    end

    # Directory containing meta.xml, or nil when nothing has been unpacked
    # yet or no meta.xml was found.
    def path
      @files_path ||= files_path
    end

    # Removes the scratch directory and forgets the memoized files path.
    def clean
      @files_path = nil
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # Sorted names of the entries next to meta.xml, or nil when there is
    # no unpacked data.
    def files
      dir = path
      return nil unless dir && File.exist?(dir)
      path_entries(dir)
    end

    private

    # Picks the extraction routine from the output of file(1).
    # Returns a proc taking (tmp_path, archive_path), or nil for an
    # unsupported or unreadable archive.
    def get_unpacker
      file_type = detect_file_type
      if file_type =~ /tar.*gzip/i
        proc do |tmp_path, archive_path|
          FileUtils.mkdir tmp_path
          run_quietly('tar', '-zxf', archive_path, '-C', tmp_path)
        end
      elsif file_type =~ /Zip/
        proc do |tmp_path, archive_path|
          run_quietly('unzip', '-qq', '-d', tmp_path, archive_path)
        end
      end
    end

    # Output of `file -z` for the archive; an empty string when the
    # command cannot be started.
    def detect_file_type
      IO.popen(['file', '-z', '--', @archive_path], &:read).to_s
    rescue SystemCallError
      ''
    end

    # Runs a command without a shell (so paths need no quoting) and
    # discards its output. Returns what Kernel#system returns.
    def run_quietly(*command)
      system(*command, out: File::NULL, err: File::NULL)
    end

    # Sorted directory entries without the '.' and '..' pseudo entries.
    def path_entries(dir)
      Dir.entries(dir).reject { |e| %w[. ..].include?(e) }.sort
    end

    # Looks for meta.xml in the scratch directory itself, then one level
    # down in its subdirectories.
    def files_path
      return nil unless File.directory?(@path)
      entries = path_entries(@path)
      return @path if entries.include?('meta.xml')
      entries.map { |e| File.join(@path, e) }.find do |candidate|
        File.directory?(candidate) &&
          path_entries(candidate).include?('meta.xml')
      end
    end
  end
end
@@ -1,86 +0,0 @@
1
class DarwinCore
  # Builds a Darwin Core Archive: writes core/extension CSV files and the
  # meta.xml / eml.xml descriptors into a scratch directory, then packs the
  # directory into a gzipped tar file.
  class Generator
    # eml_xml_data - data last given to #add_eml_xml (a blank template
    #                until then)
    # path         - scratch directory where the archive files are written
    attr_reader :eml_xml_data, :path

    # dwc_path - path of the archive produced by #pack; a relative path is
    #            resolved from inside the scratch directory
    # tmp_dir  - existing directory in which the scratch directory is made
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
      FileUtils.mkdir(@path)
      @meta_xml_data = { extensions: [] }
      @eml_xml_data = { id: nil, title: nil, authors: [],
                        abstract: nil, citation: nil, url: nil }
      @write = 'w:utf-8'
    end

    # Removes the scratch directory.
    def clean
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # Writes the core data file and records it for meta.xml.
    #
    # data         - array of rows; the first row holds the term URLs
    #                (the array and its strings are left unmodified)
    # file_name    - name of the CSV file inside the scratch directory
    # keep_headers - when true, a header row of short term names is written
    #
    # Raises DarwinCore::GeneratorError when the header row is missing or a
    # header field is not an http(s) URL.
    def add_core(data, file_name, keep_headers = true)
      header = write_csv(data, file_name, keep_headers, 'core')
      @meta_xml_data[:core] = { fields: header,
                                ignoreHeaderLines: keep_headers ? 1 : 0,
                                location: file_name }
      nil
    end

    # Writes an extension data file and records it for meta.xml.
    # Parameters are as for #add_core; row_type is stored as the rowType
    # of the extension.
    def add_extension(data, file_name,
                      keep_headers = true,
                      row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
      header = write_csv(data, file_name, keep_headers, 'extension')
      @meta_xml_data[:extensions] << { fields: header,
                                       ignoreHeaderLines: keep_headers ? 1 : 0,
                                       location: file_name,
                                       rowType: row_type }
      nil
    end

    # Creates meta.xml from the files registered so far.
    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    # Stores data as the EML metadata and creates eml.xml from it.
    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    # Sorted names of the files in the scratch directory, or nil when the
    # directory no longer exists.
    def files
      return nil unless @path && File.exist?(@path)
      Dir.entries(@path).reject { |e| %w[. ..].include?(e) }.sort
    end

    # Packs every non-hidden entry of the scratch directory into the
    # gzipped tar file at dwc_path.
    #
    # Returns true on success, false or nil otherwise (see Kernel#system).
    def pack
      entries = Dir.entries(@path).reject { |e| e.start_with?('.') }.sort
      return false if entries.empty?
      # Argument-list form: no shell is involved, so paths with spaces or
      # metacharacters are passed through untouched. tar runs inside the
      # scratch directory, as the former `cd` did.
      system('tar', '-zcf', @dwc_path, '--', *entries, chdir: @path)
    end

    private

    # Validates the header, writes the CSV file and returns the stripped
    # header row (the term URLs).
    def write_csv(data, file_name, keep_headers, file_type)
      header, *rows = data
      raise DarwinCore::GeneratorError, header_error(file_type) if header.nil?
      header = header.map { |f| f.to_s.strip }
      fields = get_fields(header, file_type)
      # Block form closes the file even when a row cannot be written.
      CSV.open(File.join(@path, file_name), @write) do |csv|
        csv << fields if keep_headers
        rows.each { |row| csv << row }
      end
      header
    end

    # Maps term URLs to their last path segment, e.g.
    # "http://rs.tdwg.org/dwc/terms/taxonID" becomes "taxonID".
    def get_fields(header, file_type)
      header.map do |f|
        field = f.strip
        unless field =~ %r{\Ahttps?://}
          raise DarwinCore::GeneratorError, header_error(file_type)
        end
        field.split('/')[-1]
      end
    end

    def header_error(file_type)
      "No header in %s data, or header fields are not urls" % file_type
    end
  end
end
@@ -1,101 +0,0 @@
1
# encoding: utf-8
class DarwinCore
  # Shared reader for the delimited data files of an archive.
  #
  # The including class is expected to set @dwc, @data (its section of the
  # parsed meta.xml) and @path (directory with the data files) and then to
  # call get_attributes before #read or #size are used.
  module Ingester
    attr_reader :data, :properties, :encoding, :field_separator
    attr_reader :file_path, :fields, :line_separator,
                :quote_character, :ignore_headers

    # Older name of #field_separator, kept for existing callers. It used to
    # read an instance variable that was never assigned and returned nil.
    def fields_separator
      @field_separator
    end

    # Number of newline-terminated lines in the data file (memoized).
    def size
      @size ||= get_size
    end

    # Reads the data file.
    #
    # batch_size - number of data rows after which progress is logged and,
    #              when a block is given, the rows collected so far are
    #              yielded and the buffers are reset
    #
    # Yields [rows, errors] for every full batch and once more at the end
    # with the remainder (possibly empty).
    # Returns [rows, errors] holding whatever was not yielded in a batch;
    # without a block that is the whole file. Rows that are too short or
    # not valid UTF-8 go to errors.
    def read(batch_size = 10000)
      DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
      res = []
      errors = []
      # Row indexes are zero based: without a header line, row i is data
      # row i + 1; with a skipped header line, row i is data row i.
      index_fix = @ignore_headers ? 0 : 1
      @quote_character = "\b" if @quote_character.empty?
      args = { col_sep: @field_separator, quote_char: @quote_character }
      min_size = @fields.map { |f| f[:index].to_i }.max + 1
      # Block form so the file handle is closed when reading ends or fails.
      File.open(@file_path) do |io|
        CSV.new(io, **args).each_with_index do |r, i|
          next if @ignore_headers && i == 0
          min_size > r.size ? errors << r : process_csv_row(res, errors, r)
          ingested = i + index_fix
          next unless ingested % batch_size == 0
          DarwinCore.logger_write(@dwc.object_id,
                                  "Ingested %s records from %s" %
                                  [ingested, name])
          next unless block_given?
          yield [res, errors]
          res = []
          errors = []
        end
      end
      yield [res, errors] if block_given?
      [res, errors]
    end

    private

    # Lower-cased class name without namespace; used in log messages.
    def name
      self.class.to_s.split('::')[-1].downcase
    end

    # Appends row to result when its content is valid UTF-8 (tagging every
    # field as UTF-8), otherwise to errors.
    def process_csv_row(result, errors, row)
      str = row.join('')
      str = str.force_encoding('utf-8')
      if str.encoding.name == 'UTF-8' && str.valid_encoding?
        result << row.map { |f| f.nil? ? nil : f.force_encoding('utf-8') }
      else
        errors << row
      end
    end

    # Populates the reader settings from @data.
    #
    # exception - accepted for interface compatibility; not used here
    #
    # Raises DarwinCore::EncodingError for an unsupported encoding,
    # DarwinCore::FileNotFoundError when no file location is declared and
    # DarwinCore::InvalidArchiveError when no fields are declared.
    def get_attributes(exception)
      @properties = @data[:attributes] || {}
      @encoding = @properties[:encoding] || 'UTF-8'
      err_msg = 'No support for encodings other ' +
                'than utf-8 or utf-16 at the moment'
      encodings = ['utf-8', 'utf8', 'utf-16', 'utf16']
      unless encodings.include? @encoding.downcase
        raise DarwinCore::EncodingError.new(err_msg)
      end
      @field_separator = get_field_separator
      @quote_character = @properties[:fieldsEnclosedBy] || ''
      # Single quotes on purpose: the default is the two characters
      # backslash + n, not a newline (compare the "\\t" handling in
      # get_field_separator).
      @line_separator = @properties[:linesTerminatedBy] || '\n'
      @ignore_headers = [1, true].include?(@properties[:ignoreHeaderLines])
      @file_path = get_file_path
      raise DarwinCore::FileNotFoundError.new("No file data") unless @file_path
      @fields = get_fields
      if @fields.empty?
        raise DarwinCore::InvalidArchiveError.new("No data fields are found")
      end
    end

    # Full path of the data file, or nil when no location is declared in
    # any of the places it may appear.
    def get_file_path
      file = @data[:location]
      file ||= location_of(@data[:attributes])
      file ||= location_of(@data[:files])
      file && File.join(@path, file.to_s)
    end

    def location_of(section)
      section.is_a?(Hash) ? section[:location] : nil
    end

    # Attribute hashes of the declared fields. A single field is wrapped
    # into an array; a missing one yields an empty array.
    def get_fields
      declared = @data[:field]
      declared = [declared] unless declared.is_a?(Array)
      @data[:field] = declared.compact
      @data[:field].map { |f| f[:attributes] }
    end

    # Declared field separator (comma by default); the escaped form "\\t"
    # is turned into a real tab character.
    def get_field_separator
      res = @properties[:fieldsTerminatedBy] || ','
      res = "\t" if res == "\\t"
      res
    end

    # Counts newline-terminated lines in Ruby, which is what `wc -l`
    # reports, without depending on a shell or on quoting of the path.
    def get_size
      File.open(@file_path, 'rb') do |f|
        f.each_line.count { |line| line.end_with?("\n") }
      end
    end
  end
end
-
@@ -1,48 +0,0 @@
1
class DarwinCore
  # Read-only view over the EML metadata of an archive (the hash returned
  # by archive.eml). Every accessor returns nil when the requested element
  # is absent.
  class Metadata
    # archive - object responding to #eml; nil is tolerated and behaves
    #           like an archive without metadata
    def initialize(archive = nil)
      @archive = archive
      @metadata = archive ? archive.eml : nil
    end

    # The raw metadata hash.
    def data
      @metadata
    end

    def id
      fetch_path(:eml, :dataset, :attributes, :id)
    end

    # packageId attribute of the eml element. (This used to call #data on
    # the metadata hash itself, which always failed and returned nil.)
    def package_id
      fetch_path(:eml, :attributes, :packageId)
    end

    def title
      fetch_path(:eml, :dataset, :title)
    end

    # Returns an array with one hash per creator, with the keys
    # :first_name, :last_name and :email (nil for missing parts), or nil
    # when the dataset lists no creator. The metadata is not modified.
    def authors
      creators = fetch_path(:eml, :dataset, :creator)
      return nil if creators.nil?
      creators = [creators] unless creators.is_a?(Array)
      creators.map do |creator|
        creator = {} unless creator.is_a?(Hash)
        # A creator may have no individualName at all.
        person = creator[:individualName]
        person = {} unless person.is_a?(Hash)
        { first_name: person[:givenName],
          last_name: person[:surName],
          email: creator[:electronicMailAddress] }
      end
    end

    def abstract
      fetch_path(:eml, :dataset, :abstract)
    end

    def citation
      fetch_path(:eml, :additionalMetadata, :metadata, :citation)
    end

    def url
      fetch_path(:eml, :dataset, :distribution, :online, :url)
    end

    private

    # Follows keys through nested hashes; returns nil as soon as a level
    # is missing or is not a hash.
    def fetch_path(*keys)
      keys.inject(@metadata) do |node, key|
        return nil unless node.is_a?(Hash)
        node[key]
      end
    end
  end
end
@@ -1,3 +0,0 @@
1
class DarwinCore
  # Gem version string; frozen so the constant cannot be modified in place.
  VERSION = "0.9.11".freeze
end
@@ -1,80 +0,0 @@
1
# USAGE: DarwinCore::XmlReader.from_xml(xml_string_or_io)
# modified from
# http://stackoverflow.com/questions/1230741/
# convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
class DarwinCore
  # Converts an XML document into nested hashes keyed by element name.
  #
  # Attributes of an element are stored under :attributes, repeated child
  # elements are collected into an array, an element whose only child is a
  # text node collapses to that text, and strings that look like integers
  # are converted to Integer.
  module XmlReader
    class << self
      # xml_io - whatever Nokogiri::XML accepts as a document source
      #
      # Returns a hash with a single key, the root element name as a symbol.
      def from_xml(xml_io)
        result = Nokogiri::XML(xml_io)
        { result.root.name.to_sym => xml_node_to_hash(result.root) }
      end

      private

      # Elements become hashes (or collapsed text); any other node becomes
      # its prepared content.
      def xml_node_to_hash(node)
        return prepare_node_element(node) if node.element?
        prepare(node.content.to_s)
      end

      # Copies the attributes of node into result_hash[:attributes];
      # leaves result_hash untouched when there are none.
      def add_attributes(node, result_hash)
        attributes = node.attributes
        return if attributes.empty?
        result_hash[:attributes] = {}
        attributes.each_value do |attribute|
          result_hash[:attributes][attribute.name.to_sym] =
            prepare(attribute.value)
        end
      end

      def prepare_node_element(node)
        result_hash = {}
        add_attributes(node, result_hash)
        if node.children.size > 0
          result_hash = add_children(node, result_hash)
        end
        result_hash
      end

      # Adds the children of node to result_hash. When the only child is
      # a text node, that text is returned instead of the hash.
      def add_children(node, result_hash)
        node.children.each do |child|
          result = xml_node_to_hash(child)

          # Ask the node for its type instead of comparing its name with
          # "text": an element that happens to be called <text> has the
          # same name but must be handled as an element.
          if child.text?
            text = handle_text(child, result)
            return text if text
          elsif result_hash[child.name.to_sym]
            handle_child_node(child, result_hash, result)
          else
            result_hash[child.name.to_sym] = prepare(result)
          end
        end
        result_hash
      end

      # Turns a repeated child into an array, or appends to that array.
      def handle_child_node(child, result_hash, result)
        key = child.name.to_sym
        if result_hash[key].is_a?(Object::Array)
          result_hash[key] << prepare(result)
        else
          result_hash[key] = [result_hash[key]] << prepare(result)
        end
      end

      # Text counts only when it is the sole child; text that sits between
      # sibling nodes yields nil and is skipped by the caller.
      def handle_text(child, result)
        prepare(result) unless child.next_sibling || child.previous_sibling
      end

      # "42" becomes 42; everything else (including "007" or "4.2") is
      # returned unchanged.
      def prepare(data)
        (data.class == String && data.to_i.to_s == data) ? data.to_i : data
      end
    end
  end
end