dwc-archive 0.9.10 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -1
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +4 -7
  7. data/CHANGELOG +14 -8
  8. data/Gemfile +3 -1
  9. data/LICENSE +1 -1
  10. data/README.md +119 -107
  11. data/Rakefile +13 -36
  12. data/dwc-archive.gemspec +23 -19
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +124 -0
  17. data/lib/dwc_archive/archive.rb +60 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
  21. data/lib/dwc_archive/expander.rb +88 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +91 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +57 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +90 -0
  32. data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +96 -105
  36. data/spec/lib/core_spec.rb +43 -41
  37. data/spec/lib/darwin_core_spec.rb +108 -138
  38. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  39. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  40. data/spec/lib/generator_spec.rb +77 -69
  41. data/spec/lib/gnub_taxon_spec.rb +15 -17
  42. data/spec/lib/metadata_spec.rb +50 -41
  43. data/spec/lib/taxon_normalized_spec.rb +62 -65
  44. data/spec/lib/xml_reader_spec.rb +9 -12
  45. data/spec/spec_helper.rb +54 -51
  46. metadata +105 -88
  47. data/.rvmrc +0 -1
  48. data/] +0 -40
  49. data/lib/dwc-archive.rb +0 -107
  50. data/lib/dwc-archive/archive.rb +0 -40
  51. data/lib/dwc-archive/classification_normalizer.rb +0 -428
  52. data/lib/dwc-archive/core.rb +0 -17
  53. data/lib/dwc-archive/expander.rb +0 -84
  54. data/lib/dwc-archive/generator.rb +0 -85
  55. data/lib/dwc-archive/generator_eml_xml.rb +0 -86
  56. data/lib/dwc-archive/generator_meta_xml.rb +0 -58
  57. data/lib/dwc-archive/ingester.rb +0 -101
  58. data/lib/dwc-archive/metadata.rb +0 -48
  59. data/lib/dwc-archive/version.rb +0 -3
  60. data/lib/dwc-archive/xml_reader.rb +0 -64
@@ -1,17 +0,0 @@
1
- class DarwinCore
2
- class Core
3
- include DarwinCore::Ingester
4
- attr_reader :id
5
- def initialize(dwc)
6
- @dwc = dwc
7
- @archive = @dwc.archive
8
- @path = @archive.files_path
9
- root_key = @archive.meta.keys[0]
10
- @data = @archive.meta[root_key][:core]
11
- raise DarwinCore::CoreFileError.
12
- new("Cannot find core in meta.xml, is meta.xml valid?") unless @data
13
- @id = @data[:id][:attributes]
14
- get_attributes(DarwinCore::CoreFileError)
15
- end
16
- end
17
- end
@@ -1,84 +0,0 @@
1
- class DarwinCore
2
- class Expander
3
- def initialize(archive_path, tmp_dir)
4
- @archive_path = archive_path
5
- @tmp_dir = tmp_dir
6
- @path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
7
- @unpacker = get_unpacker
8
- end
9
-
10
- def unpack
11
- clean
12
- raise DarwinCore::FileNotFoundError unless File.exists?(@archive_path)
13
- success = @unpacker.call(@path, @archive_path) if @unpacker
14
- (@unpacker && success && $?.exitstatus == 0) ?
15
- success :
16
- (clean; raise DarwinCore::UnpackingError)
17
- end
18
-
19
- def path
20
- @files_path ||= files_path
21
- end
22
-
23
- def clean
24
- FileUtils.rm_rf(@path) if FileTest.exists?(@path)
25
- end
26
-
27
- def files
28
- return nil unless path && FileTest.exists?(path)
29
- Dir.entries(path).select {|e| e !~ /[\.]{1,2}$/}.sort
30
- end
31
-
32
- private
33
-
34
- def esc(a_str)
35
- "'" + a_str.gsub(92.chr, '\\\\\\').gsub("'", "\\\\'") + "'"
36
- end
37
-
38
- def get_unpacker
39
- file_command = IO.popen("file -z " + esc(@archive_path))
40
- file_type = file_command.read
41
- file_command.close
42
-
43
- if file_type.match(/tar.*gzip/i)
44
- return proc do |tmp_path, archive_path|
45
- FileUtils.mkdir tmp_path
46
- path = esc(archive_path)
47
- system("tar -zxf #{path} -C #{tmp_path} > /dev/null 2>&1")
48
- end
49
- end
50
-
51
- if file_type.match(/Zip/)
52
- return proc do |tmp_path, archive_path|
53
- path = esc(archive_path)
54
- system("unzip -qq -d #{tmp_path} #{path} > /dev/null 2>&1")
55
- end
56
- end
57
-
58
- return nil
59
- end
60
-
61
- def path_entries(dir)
62
- Dir.entries(dir).select {|e| e !~ /[\.]{1,2}$/}.sort
63
- end
64
-
65
- def files_path
66
- res = nil
67
- entries = path_entries(@path)
68
- if entries.include?('meta.xml')
69
- res = @path
70
- else
71
- entries.each do |e|
72
- check_path = File.join(@path, e)
73
- if FileTest.directory?(check_path)
74
- if path_entries(check_path).include?('meta.xml')
75
- res = check_path
76
- break
77
- end
78
- end
79
- end
80
- end
81
- res
82
- end
83
- end
84
- end
@@ -1,85 +0,0 @@
1
- class DarwinCore
2
- class Generator
3
- attr_reader :eml_xml_data
4
-
5
- #TODO refactor -- for now copying expander methods
6
- def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
7
- @dwc_path = dwc_path
8
- @path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
9
- FileUtils.mkdir(@path)
10
- @meta_xml_data = {:extensions => []}
11
- @eml_xml_data = {:id => nil, :title => nil,
12
- :authors => [], :abstract => nil, :citation => nil, :url => nil}
13
- @write = 'w:utf-8'
14
- end
15
-
16
- #TODO refactor!
17
- def clean
18
- FileUtils.rm_rf(@path) if FileTest.exists?(@path)
19
- end
20
-
21
- def add_core(data, file_name, keep_headers = true)
22
- c = CSV.open(File.join(@path,file_name), @write)
23
- header = data.shift
24
- fields = header.map do |f|
25
- f.strip!
26
- err = 'No header in core data, or header fields are not urls'
27
- raise DarwinCore::GeneratorError.new(err) unless f.match(/^http:\/\//)
28
- f.split('/')[-1]
29
- end
30
- data.unshift(fields) if keep_headers
31
- ignore_header_lines = keep_headers ? 1 : 0
32
- @meta_xml_data[:core] = { fields: header,
33
- ignoreHeaderLines: ignore_header_lines,
34
- location:file_name }
35
- data.each {|d| c << d}
36
- c.close
37
- end
38
-
39
- def add_extension(data, file_name,
40
- keep_headers = true,
41
- row_type = 'http://rs.tdwg.org/dwc/terms/Taxon')
42
- c = CSV.open(File.join(@path,file_name), @write)
43
- header = data.shift
44
- fields = header.map do |f|
45
- f.strip!
46
- err = 'No header in core data, or header fields are not urls'
47
- raise DarwinCore::GeneratorError.new(err) unless f.match(/^http:\/\//)
48
- f.split('/')[-1]
49
- end
50
- data.unshift(fields) if keep_headers
51
- ignore_header_lines = keep_headers ? 1 : 0
52
- @meta_xml_data[:extensions] << { fields: header,
53
- ignoreHeaderLines: ignore_header_lines,
54
- location: file_name,
55
- rowType: row_type }
56
- data.each { |d| c << d }
57
- c.close
58
- end
59
-
60
- def add_meta_xml
61
- meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
62
- meta.create
63
- end
64
-
65
- def add_eml_xml(data)
66
- @eml_xml_data = data
67
- eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
68
- eml.create
69
- end
70
-
71
- def path
72
- @path
73
- end
74
-
75
- def files
76
- return nil unless @path && FileTest.exists?(@path)
77
- Dir.entries(@path).select {|e| e !~ /[\.]{1,2}$/}.sort
78
- end
79
-
80
- def pack
81
- a = "cd #{@path}; tar -zcf #{@dwc_path} *"
82
- system(a)
83
- end
84
- end
85
- end
@@ -1,86 +0,0 @@
1
- class DarwinCore
2
- class Generator
3
- class EmlXml
4
-
5
- def initialize(data, path)
6
- @data = data
7
- @path = path
8
- @write = 'w:utf-8'
9
- end
10
-
11
- def create
12
- eml_uri = 'eml://ecoinformatics.org/eml-2.1.1' +
13
- ' http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd'
14
- builder = Nokogiri::XML::Builder.new do |xml|
15
- xml.eml(packageId: "%s/%s" % [@data[:id], timestamp],
16
- system: @data[:system] || 'http://globalnames.org',
17
- :'xml:lang' => 'en',
18
- :'xmlns:eml' => 'eml://ecoinformatics.org/eml-2.1.1',
19
- :'xmlns:md' => 'eml://ecoinformatics.org/methods-2.1.1',
20
- :'xmlns:proj' => 'eml://ecoinformatics.org/project-2.1.1',
21
- :'xmlns:d' => 'eml://ecoinformatics.org/dataset-2.1.1',
22
- :'xmlns:res' => 'eml://ecoinformatics.org/resource-2.1.1',
23
- :'xmlns:dc' => 'http://purl.org/dc/terms/',
24
- :'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
25
- :'xsi:schemaLocation' => 'eml_uri') do
26
- xml.dataset(id: @data[:id]) do
27
- xml.title(@data[:title])
28
- xml.license(@data[:license])
29
- contacts = []
30
- @data[:authors].each_with_index do |a, i|
31
- creator_id = i + 1
32
- contacts << creator_id
33
- xml.creator(id: creator_id, scope: 'document') do
34
- xml.individualName do
35
- xml.givenName(a[:first_name])
36
- xml.surName(a[:last_name])
37
- end
38
- xml.organizationName(a[:organization]) if a[:organization]
39
- xml.positionName(a[:position]) if a[:position]
40
- xml.onlineUrl(a[:url]) if a[:url]
41
- xml.electronicMailAddress(a[:email])
42
- end
43
- end
44
- @data[:metadata_providers].each_with_index do |a, i|
45
- xml.metadataProvider do
46
- xml.individualName do
47
- xml.givenName(a[:first_name])
48
- xml.surName(a[:last_name])
49
- end
50
- xml.organizationName(a[:organization]) if a[:organization]
51
- xml.positionName(a[:position]) if a[:position]
52
- xml.onlineUrl(a[:url]) if a[:url]
53
- xml.electronicMailAddress(a[:email])
54
- end
55
- end if @data[:metadata_providers]
56
- xml.pubDate(Time.now.to_s)
57
- xml.abstract() do
58
- xml.para(@data[:abstract])
59
- end
60
- contacts.each do |contact|
61
- xml.contact { xml.references(contact) }
62
- end
63
- end
64
- xml.additionalMetadata do
65
- xml.metadata do
66
- xml.citation(@data[:citation])
67
- xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
68
- end
69
- end
70
- xml.parent.namespace = xml.parent.namespace_definitions.first
71
- end
72
- end
73
- data = builder.to_xml
74
- f = open(File.join(@path, 'eml.xml'), @write)
75
- f.write(data)
76
- f.close
77
- end
78
-
79
- private
80
- def timestamp
81
- t = Time.now.getutc.to_a[0..5].reverse
82
- t[0..2].join('-') + '::' + t[-3..-1].join(':')
83
- end
84
- end
85
- end
86
- end
@@ -1,58 +0,0 @@
1
- class DarwinCore
2
- class Generator
3
- class MetaXml
4
- def initialize(data, path)
5
- @data = data
6
- @path = path
7
- @write = 'w:utf-8'
8
- end
9
-
10
- def create
11
- schema_uri = 'http://rs.tdwg.org/dwc/terms/xsd/archive/' +
12
- ' http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd'
13
- builder = Nokogiri::XML::Builder.new do |xml|
14
- opts = { encoding: 'UTF-8',
15
- fieldsTerminatedBy: ',',
16
- fieldsEnclosedBy: '"',
17
- linesTerminatedBy: "\n",
18
- rowType: 'http://rs.tdwg.org/dwc/terms/Taxon' }
19
- xml.archive(xmlns: 'http://rs.tdwg.org/dwc/text/',
20
- :'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
21
- :'xsi:schemaLocation' => schema_uri) do
22
- xml.core(opts.merge(ignoreHeaderLines:
23
- @data[:core][:ignoreHeaderLines])) do
24
- xml.files { xml.location(@data[:core][:location]) }
25
- taxon_id, fields = find_taxon_id(@data[:core][:fields])
26
- xml.id_(index: taxon_id[1])
27
- fields.each { |f| xml.field(term: f[0], index: f[1]) }
28
- end
29
- @data[:extensions].each do |e|
30
- xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
31
- rowType: e[:rowType])) do
32
- xml.files { xml.location(e[:location]) }
33
- taxon_id, fields = find_taxon_id(e[:fields])
34
- xml.coreid(index: taxon_id[1])
35
- fields.each { |f| xml.field(term: f[0], index: f[1]) }
36
- end
37
- end
38
- end
39
- end
40
- meta_xml_data = builder.to_xml
41
- meta_file = open(File.join(@path, 'meta.xml'), @write)
42
- meta_file.write(meta_xml_data)
43
- meta_file.close
44
- end
45
-
46
- private
47
- def find_taxon_id(data)
48
- fields = []
49
- data.each_with_index { |f, i| fields << [f.strip, i] }
50
- taxon_id, fields = fields.partition { |f| f[0].match(%r|/taxonid$|i) }
51
- raise DarwinCore::GeneratorError if taxon_id.size != 1
52
- [taxon_id[0], fields]
53
- end
54
-
55
- end
56
- end
57
- end
58
-
@@ -1,101 +0,0 @@
1
- # encoding: utf-8
2
- class DarwinCore
3
- module Ingester
4
- attr_reader :data, :properties, :encoding, :fields_separator, :size
5
- attr_reader :file_path, :fields, :line_separator,
6
- :quote_character, :ignore_headers
7
-
8
- def size
9
- @size ||= get_size
10
- end
11
-
12
- def read(batch_size = 10000)
13
- DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
14
- res = []
15
- errors = []
16
- index_fix = 1
17
- args = {:col_sep => @field_separator}
18
- @quote_character = "\b" if @quote_character.empty?
19
- args.merge!({:quote_char => @quote_character})
20
- min_size = @fields.map {|f| f[:index].to_i || 0}.sort[-1] + 1
21
- csv = CSV.new(open(@file_path), args)
22
- csv.each_with_index do |r, i|
23
- index_fix = 0; next if @ignore_headers && i == 0
24
- min_size > r.size ? errors << r : process_csv_row(res, errors, r)
25
- if (i + index_fix) % batch_size == 0
26
- DarwinCore.logger_write(@dwc.object_id,
27
- "Ingested %s records from %s" %
28
- [(i + index_fix), name])
29
- if block_given?
30
- yield [res, errors]
31
- res = []
32
- errors = []
33
- end
34
- end
35
- end
36
- yield [res, errors] if block_given?
37
- [res, errors]
38
- end
39
-
40
- private
41
- def name
42
- self.class.to_s.split('::')[-1].downcase
43
- end
44
-
45
- def process_csv_row(result, errors, row)
46
- str = row.join('')
47
- str = str.force_encoding('utf-8')
48
- if str.encoding.name == 'UTF-8' && str.valid_encoding?
49
- result << row.map { |f| f.nil? ? nil : f.force_encoding('utf-8') }
50
- else
51
- errors << row
52
- end
53
- end
54
-
55
- def get_attributes(exception)
56
- @properties = @data[:attributes]
57
- @encoding = @properties[:encoding] || 'UTF-8'
58
- err_msg = 'No support for encodings other ' +
59
- 'than utf-8 or utf-16 at the moment'
60
- encodings = ['utf-8', 'utf8', 'utf-16', 'utf16']
61
- unless encodings.include? @encoding.downcase
62
- raise DarwinCore::EncodingError.new(err_msg)
63
- end
64
- @field_separator = get_field_separator
65
- @quote_character = @properties[:fieldsEnclosedBy] || ''
66
- @line_separator = @properties[:linesTerminatedBy] || '\n'
67
- @ignore_headers = @properties[:ignoreHeaderLines] ?
68
- [1, true].include?(@properties[:ignoreHeaderLines]) :
69
- false
70
- @file_path = get_file_path
71
- raise DarwinCore::FileNotFoundError.new("No file data") unless @file_path
72
- @fields = get_fields
73
- if @fields.empty?
74
- raise DarwinCore::InvalidArchiveError.new("No data fields are found")
75
- end
76
- end
77
-
78
- def get_file_path
79
- file = @data[:location] ||
80
- @data[:attributes][:location] ||
81
- @data[:files][:location]
82
- File.join(@path, file)
83
- end
84
-
85
- def get_fields
86
- @data[:field] = [data[:field]] if data[:field].class != Array
87
- @data[:field].map {|f| f[:attributes]}
88
- end
89
-
90
- def get_field_separator
91
- res = @properties[:fieldsTerminatedBy] || ','
92
- res = "\t" if res == "\\t"
93
- res
94
- end
95
-
96
- def get_size
97
- `wc -l #{@file_path}`.match(/^\s*([\d]+)\s/)[1].to_i
98
- end
99
- end
100
- end
101
-
@@ -1,48 +0,0 @@
1
- class DarwinCore
2
- class Metadata
3
- def initialize(archive = nil)
4
- @archive = archive
5
- @metadata = @archive.eml
6
- end
7
-
8
- def data
9
- @metadata
10
- end
11
-
12
- def id
13
- @metadata[:eml][:dataset][:attributes][:id] rescue nil
14
- end
15
-
16
- def package_id
17
- @metadata.data[:eml][:attributes][:packageId] rescue nil
18
- end
19
-
20
- def title
21
- @metadata[:eml][:dataset][:title] rescue nil
22
- end
23
-
24
- def authors
25
- return nil unless defined?(@metadata[:eml][:dataset][:creator])
26
- @metadata[:eml][:dataset][:creator] =
27
- [@metadata[:eml][:dataset][:creator]] unless
28
- @metadata[:eml][:dataset][:creator].class == Array
29
- @metadata[:eml][:dataset][:creator].map do |c|
30
- { first_name: c[:individualName][:givenName],
31
- last_name: c[:individualName][:surName],
32
- email: c[:electronicMailAddress] }
33
- end
34
- end
35
-
36
- def abstract
37
- @metadata[:eml][:dataset][:abstract] rescue nil
38
- end
39
-
40
- def citation
41
- @metadata[:eml][:additionalMetadata][:metadata][:citation] rescue nil
42
- end
43
-
44
- def url
45
- @metadata[:eml][:dataset][:distribution][:online][:url] rescue nil
46
- end
47
- end
48
- end