dwc-archive 0.9.11 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +5 -5
  2. data/.rspec +2 -1
  3. data/.rubocop.yml +23 -0
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +2 -3
  6. data/CHANGELOG +2 -0
  7. data/Gemfile +3 -1
  8. data/README.md +110 -106
  9. data/Rakefile +13 -36
  10. data/dwc-archive.gemspec +24 -19
  11. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  12. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  13. data/features/support/env.rb +1 -1
  14. data/lib/dwc_archive.rb +121 -0
  15. data/lib/dwc_archive/archive.rb +59 -0
  16. data/lib/dwc_archive/classification_normalizer.rb +392 -0
  17. data/lib/dwc_archive/core.rb +25 -0
  18. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  19. data/lib/dwc_archive/expander.rb +88 -0
  20. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  21. data/lib/dwc_archive/generator.rb +90 -0
  22. data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
  23. data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
  24. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  25. data/lib/dwc_archive/ingester.rb +106 -0
  26. data/lib/dwc_archive/metadata.rb +56 -0
  27. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  28. data/lib/dwc_archive/version.rb +6 -0
  29. data/lib/dwc_archive/xml_reader.rb +89 -0
  30. data/spec/files/generator_eml.xml +1 -1
  31. data/spec/lib/classification_normalizer_spec.rb +96 -105
  32. data/spec/lib/core_spec.rb +43 -41
  33. data/spec/lib/darwin_core_spec.rb +111 -132
  34. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  35. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  36. data/spec/lib/generator_spec.rb +73 -74
  37. data/spec/lib/gnub_taxon_spec.rb +14 -16
  38. data/spec/lib/metadata_spec.rb +50 -41
  39. data/spec/lib/taxon_normalized_spec.rb +62 -65
  40. data/spec/lib/xml_reader_spec.rb +9 -12
  41. data/spec/spec_helper.rb +55 -49
  42. metadata +92 -77
  43. data/.rvmrc +0 -1
  44. data/lib/dwc-archive.rb +0 -107
  45. data/lib/dwc-archive/archive.rb +0 -40
  46. data/lib/dwc-archive/classification_normalizer.rb +0 -427
  47. data/lib/dwc-archive/core.rb +0 -19
  48. data/lib/dwc-archive/expander.rb +0 -85
  49. data/lib/dwc-archive/generator.rb +0 -86
  50. data/lib/dwc-archive/ingester.rb +0 -101
  51. data/lib/dwc-archive/metadata.rb +0 -48
  52. data/lib/dwc-archive/version.rb +0 -3
  53. data/lib/dwc-archive/xml_reader.rb +0 -80
@@ -1,19 +0,0 @@
1
- class DarwinCore
2
- class Core
3
- include DarwinCore::Ingester
4
- attr_reader :id
5
-
6
- def initialize(dwc)
7
- @dwc = dwc
8
- @archive = @dwc.archive
9
- @path = @archive.files_path
10
- root_key = @archive.meta.keys[0]
11
- @data = @archive.meta[root_key][:core]
12
- raise DarwinCore::CoreFileError.
13
- new('Cannot find core in meta.xml, is meta.xml valid?') unless @data
14
- @id = @data[:id][:attributes]
15
- get_attributes(DarwinCore::CoreFileError)
16
- end
17
-
18
- end
19
- end
@@ -1,85 +0,0 @@
1
- class DarwinCore
2
- class Expander
3
-
4
- def initialize(archive_path, tmp_dir)
5
- @archive_path = archive_path
6
- @tmp_dir = tmp_dir
7
- @path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
8
- @unpacker = get_unpacker
9
- end
10
-
11
- def unpack
12
- clean
13
- raise DarwinCore::FileNotFoundError unless File.exists?(@archive_path)
14
- success = @unpacker.call(@path, @archive_path) if @unpacker
15
- (@unpacker && success && $?.exitstatus == 0) ?
16
- success :
17
- (clean; raise DarwinCore::UnpackingError)
18
- end
19
-
20
- def path
21
- @files_path ||= files_path
22
- end
23
-
24
- def clean
25
- FileUtils.rm_rf(@path) if FileTest.exists?(@path)
26
- end
27
-
28
- def files
29
- return nil unless path && FileTest.exists?(path)
30
- Dir.entries(path).select {|e| e !~ /[\.]{1,2}$/}.sort
31
- end
32
-
33
- private
34
-
35
- def esc(a_str)
36
- "'" + a_str.gsub(92.chr, '\\\\\\').gsub("'", "\\\\'") + "'"
37
- end
38
-
39
- def get_unpacker
40
- file_command = IO.popen("file -z " + esc(@archive_path))
41
- file_type = file_command.read
42
- file_command.close
43
-
44
- if file_type.match(/tar.*gzip/i)
45
- return proc do |tmp_path, archive_path|
46
- FileUtils.mkdir tmp_path
47
- path = esc(archive_path)
48
- system("tar -zxf #{path} -C #{tmp_path} > /dev/null 2>&1")
49
- end
50
- end
51
-
52
- if file_type.match(/Zip/)
53
- return proc do |tmp_path, archive_path|
54
- path = esc(archive_path)
55
- system("unzip -qq -d #{tmp_path} #{path} > /dev/null 2>&1")
56
- end
57
- end
58
-
59
- return nil
60
- end
61
-
62
- def path_entries(dir)
63
- Dir.entries(dir).select {|e| e !~ /[\.]{1,2}$/}.sort
64
- end
65
-
66
- def files_path
67
- res = nil
68
- entries = path_entries(@path)
69
- if entries.include?('meta.xml')
70
- res = @path
71
- else
72
- entries.each do |e|
73
- check_path = File.join(@path, e)
74
- if FileTest.directory?(check_path)
75
- if path_entries(check_path).include?('meta.xml')
76
- res = check_path
77
- break
78
- end
79
- end
80
- end
81
- end
82
- res
83
- end
84
- end
85
- end
@@ -1,86 +0,0 @@
1
- class DarwinCore
2
- class Generator
3
- attr_reader :eml_xml_data
4
-
5
- #TODO refactor -- for now copying expander methods
6
- def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
7
- @dwc_path = dwc_path
8
- @path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
9
- FileUtils.mkdir(@path)
10
- @meta_xml_data = {:extensions => []}
11
- @eml_xml_data = {:id => nil, :title => nil,
12
- :authors => [], :abstract => nil, :citation => nil, :url => nil}
13
- @write = 'w:utf-8'
14
- end
15
-
16
- #TODO refactor!
17
- def clean
18
- FileUtils.rm_rf(@path) if FileTest.exists?(@path)
19
- end
20
-
21
- def add_core(data, file_name, keep_headers = true)
22
- c = CSV.open(File.join(@path,file_name), @write)
23
- header = data.shift
24
- fields = get_fields(header, 'core')
25
- data.unshift(fields) if keep_headers
26
- ignore_header_lines = keep_headers ? 1 : 0
27
- @meta_xml_data[:core] = { fields: header,
28
- ignoreHeaderLines: ignore_header_lines,
29
- location:file_name }
30
- data.each {|d| c << d}
31
- c.close
32
- end
33
-
34
- def add_extension(data, file_name,
35
- keep_headers = true,
36
- row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
37
- c = CSV.open(File.join(@path,file_name), @write)
38
- header = data.shift
39
- fields = get_fields(header, 'extension')
40
- data.unshift(fields) if keep_headers
41
- ignore_header_lines = keep_headers ? 1 : 0
42
- @meta_xml_data[:extensions] << { fields: header,
43
- ignoreHeaderLines: ignore_header_lines,
44
- location: file_name,
45
- rowType: row_type }
46
- data.each { |d| c << d }
47
- c.close
48
- end
49
-
50
- def add_meta_xml
51
- meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
52
- meta.create
53
- end
54
-
55
- def add_eml_xml(data)
56
- @eml_xml_data = data
57
- eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
58
- eml.create
59
- end
60
-
61
- def path
62
- @path
63
- end
64
-
65
- def files
66
- return nil unless @path && FileTest.exists?(@path)
67
- Dir.entries(@path).select {|e| e !~ /[\.]{1,2}$/}.sort
68
- end
69
-
70
- def pack
71
- a = "cd #{@path}; tar -zcf #{@dwc_path} *"
72
- system(a)
73
- end
74
-
75
- private
76
-
77
- def get_fields(header, file_type)
78
- header.map do |f|
79
- f.strip!
80
- err = "No header in %s data, or header fields are not urls" % file_type
81
- raise DarwinCore::GeneratorError.new(err) unless f.match(/^http:\/\//)
82
- f.split('/')[-1]
83
- end
84
- end
85
- end
86
- end
@@ -1,101 +0,0 @@
1
- # encoding: utf-8
2
- class DarwinCore
3
- module Ingester
4
- attr_reader :data, :properties, :encoding, :fields_separator, :size
5
- attr_reader :file_path, :fields, :line_separator,
6
- :quote_character, :ignore_headers
7
-
8
- def size
9
- @size ||= get_size
10
- end
11
-
12
- def read(batch_size = 10000)
13
- DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
14
- res = []
15
- errors = []
16
- index_fix = 1
17
- args = {:col_sep => @field_separator}
18
- @quote_character = "\b" if @quote_character.empty?
19
- args.merge!({:quote_char => @quote_character})
20
- min_size = @fields.map {|f| f[:index].to_i || 0}.sort[-1] + 1
21
- csv = CSV.new(open(@file_path), args)
22
- csv.each_with_index do |r, i|
23
- index_fix = 0; next if @ignore_headers && i == 0
24
- min_size > r.size ? errors << r : process_csv_row(res, errors, r)
25
- if (i + index_fix) % batch_size == 0
26
- DarwinCore.logger_write(@dwc.object_id,
27
- "Ingested %s records from %s" %
28
- [(i + index_fix), name])
29
- if block_given?
30
- yield [res, errors]
31
- res = []
32
- errors = []
33
- end
34
- end
35
- end
36
- yield [res, errors] if block_given?
37
- [res, errors]
38
- end
39
-
40
- private
41
- def name
42
- self.class.to_s.split('::')[-1].downcase
43
- end
44
-
45
- def process_csv_row(result, errors, row)
46
- str = row.join('')
47
- str = str.force_encoding('utf-8')
48
- if str.encoding.name == 'UTF-8' && str.valid_encoding?
49
- result << row.map { |f| f.nil? ? nil : f.force_encoding('utf-8') }
50
- else
51
- errors << row
52
- end
53
- end
54
-
55
- def get_attributes(exception)
56
- @properties = @data[:attributes]
57
- @encoding = @properties[:encoding] || 'UTF-8'
58
- err_msg = 'No support for encodings other ' +
59
- 'than utf-8 or utf-16 at the moment'
60
- encodings = ['utf-8', 'utf8', 'utf-16', 'utf16']
61
- unless encodings.include? @encoding.downcase
62
- raise DarwinCore::EncodingError.new(err_msg)
63
- end
64
- @field_separator = get_field_separator
65
- @quote_character = @properties[:fieldsEnclosedBy] || ''
66
- @line_separator = @properties[:linesTerminatedBy] || '\n'
67
- @ignore_headers = @properties[:ignoreHeaderLines] ?
68
- [1, true].include?(@properties[:ignoreHeaderLines]) :
69
- false
70
- @file_path = get_file_path
71
- raise DarwinCore::FileNotFoundError.new("No file data") unless @file_path
72
- @fields = get_fields
73
- if @fields.empty?
74
- raise DarwinCore::InvalidArchiveError.new("No data fields are found")
75
- end
76
- end
77
-
78
- def get_file_path
79
- file = @data[:location] ||
80
- @data[:attributes][:location] ||
81
- @data[:files][:location]
82
- File.join(@path, file)
83
- end
84
-
85
- def get_fields
86
- @data[:field] = [data[:field]] if data[:field].class != Array
87
- @data[:field].map {|f| f[:attributes]}
88
- end
89
-
90
- def get_field_separator
91
- res = @properties[:fieldsTerminatedBy] || ','
92
- res = "\t" if res == "\\t"
93
- res
94
- end
95
-
96
- def get_size
97
- `wc -l #{@file_path}`.match(/^\s*([\d]+)\s/)[1].to_i
98
- end
99
- end
100
- end
101
-
@@ -1,48 +0,0 @@
1
- class DarwinCore
2
- class Metadata
3
- def initialize(archive = nil)
4
- @archive = archive
5
- @metadata = @archive.eml
6
- end
7
-
8
- def data
9
- @metadata
10
- end
11
-
12
- def id
13
- @metadata[:eml][:dataset][:attributes][:id] rescue nil
14
- end
15
-
16
- def package_id
17
- @metadata.data[:eml][:attributes][:packageId] rescue nil
18
- end
19
-
20
- def title
21
- @metadata[:eml][:dataset][:title] rescue nil
22
- end
23
-
24
- def authors
25
- return nil unless defined?(@metadata[:eml][:dataset][:creator])
26
- @metadata[:eml][:dataset][:creator] =
27
- [@metadata[:eml][:dataset][:creator]] unless
28
- @metadata[:eml][:dataset][:creator].class == Array
29
- @metadata[:eml][:dataset][:creator].map do |c|
30
- { first_name: c[:individualName][:givenName],
31
- last_name: c[:individualName][:surName],
32
- email: c[:electronicMailAddress] }
33
- end
34
- end
35
-
36
- def abstract
37
- @metadata[:eml][:dataset][:abstract] rescue nil
38
- end
39
-
40
- def citation
41
- @metadata[:eml][:additionalMetadata][:metadata][:citation] rescue nil
42
- end
43
-
44
- def url
45
- @metadata[:eml][:dataset][:distribution][:online][:url] rescue nil
46
- end
47
- end
48
- end
@@ -1,3 +0,0 @@
1
- class DarwinCore
2
- VERSION = "0.9.11"
3
- end
@@ -1,80 +0,0 @@
1
- # USAGE: Hash.from_xml:(YOUR_XML_STRING)
2
- # modified from
3
- # http://stackoverflow.com/questions/1230741/
4
- # convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
5
- class DarwinCore
6
- module XmlReader
7
- class << self
8
-
9
- def from_xml(xml_io)
10
- result = Nokogiri::XML(xml_io)
11
- return { result.root.name.to_sym => xml_node_to_hash(result.root)}
12
- end
13
-
14
- private
15
- def xml_node_to_hash(node)
16
- # If we are at the root of the document, start the hash
17
- if node.element?
18
- prepare_node_element(node)
19
- else
20
- return prepare(node.content.to_s)
21
- end
22
- end
23
-
24
- def add_attributes(node, result_hash)
25
- if node.attributes != {}
26
- result_hash[:attributes] = {}
27
- node.attributes.keys.each do |key|
28
- result_hash[:attributes][node.attributes[key].name.to_sym] =
29
- prepare(node.attributes[key].value)
30
- end
31
- end
32
- end
33
-
34
- def prepare_node_element(node)
35
- result_hash = {}
36
- add_attributes(node, result_hash)
37
- if node.children.size > 0
38
- result_hash = add_children(node, result_hash)
39
- end
40
- result_hash
41
- end
42
-
43
- def add_children(node, result_hash)
44
- node.children.each do |child|
45
- result = xml_node_to_hash(child)
46
-
47
- if child.name == "text"
48
- text = handle_text(child, result)
49
- return text if text
50
- elsif result_hash[child.name.to_sym]
51
- handle_child_node(child, result_hash, result)
52
- else
53
- result_hash[child.name.to_sym] = prepare(result)
54
- end
55
- end
56
- result_hash
57
- end
58
-
59
- def handle_child_node(child, result_hash, result)
60
- if result_hash[child.name.to_sym].is_a?(Object::Array)
61
- result_hash[child.name.to_sym] << prepare(result)
62
- else
63
- result_hash[child.name.to_sym] =
64
- [result_hash[child.name.to_sym]] << prepare(result)
65
- end
66
- end
67
-
68
- def handle_text(child, result)
69
- unless child.next_sibling || child.previous_sibling
70
- prepare(result)
71
- end
72
- end
73
-
74
- def prepare(data)
75
- (data.class == String && data.to_i.to_s == data) ? data.to_i : data
76
- end
77
-
78
- end
79
- end
80
- end