dwc-archive 0.9.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
@@ -1,17 +0,0 @@
class DarwinCore
  # Core data file of a Darwin Core Archive, configured from the +:core+
  # entry of the archive's parsed meta.xml.
  class Core
    include DarwinCore::Ingester

    # Attributes of the core's +:id+ entry, or nil when meta.xml declares none.
    attr_reader :id

    # dwc - object whose #archive responds to #files_path and #meta. The
    #       first key of the #meta hash is treated as the document root.
    #
    # Raises DarwinCore::CoreFileError when no core entry can be found.
    def initialize(dwc)
      @dwc = dwc
      @archive = @dwc.archive
      @path = @archive.files_path
      meta = @archive.meta
      root = meta[meta.keys.first]
      # An empty meta hash has no root; report it as a missing core rather
      # than failing with NoMethodError on nil.
      @data = root && root[:core]
      unless @data
        raise DarwinCore::CoreFileError,
              "Cannot find core in meta.xml, is meta.xml valid?"
      end
      # A missing id entry is tolerated (no error is raised for it), so it
      # must not crash the lookup either.
      @id = @data[:id][:attributes] if @data[:id]
      get_attributes(DarwinCore::CoreFileError)
    end
  end
end
@@ -1,80 +0,0 @@
class DarwinCore
  # Unpacks a Darwin Core Archive (gzipped tar or zip) into a scratch
  # directory and locates the directory that contains its meta.xml.
  class Expander
    # archive_path - path of the archive file to unpack.
    # tmp_dir      - directory under which a randomly named "dwc_<number>"
    #                working directory is placed.
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @path = File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
      @unpacker = get_unpacker
    end

    # Removes any previous working directory and unpacks the archive into a
    # fresh one.
    #
    # Returns true on success.
    # Raises DarwinCore::FileNotFoundError when the archive does not exist.
    # Raises DarwinCore::UnpackingError when the archive type is not
    # recognised or the unpacking command fails.
    def unpack
      clean
      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)
      # Kernel#system returns true only for a zero exit status.
      return true if @unpacker && @unpacker.call(@path, @archive_path)

      clean
      raise DarwinCore::UnpackingError
    end

    # Directory holding meta.xml: the working directory itself or one of its
    # immediate subdirectories. Returns nil when nothing suitable exists
    # (for example before #unpack has run).
    def path
      @files_path ||= files_path
    end

    # Deletes the working directory and forgets the memoised #path so that a
    # later #unpack is inspected afresh.
    def clean
      @files_path = nil
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # Sorted entry names of #path, or nil when there is no such directory.
    def files
      dir = path
      return nil unless dir && File.directory?(dir)

      path_entries(dir)
    end

    private

    # Picks the unpacking strategy from the output of the `file` utility.
    #
    # Returns a callable taking (tmp_path, archive_path) and returning the
    # result of Kernel#system, or nil for an unrecognised archive type.
    def get_unpacker
      file_type = detect_file_type

      if file_type =~ /tar.*gzip/i
        return lambda do |tmp_path, archive_path|
          FileUtils.mkdir(tmp_path)
          # Argument-array form: no shell is involved, so paths with quotes,
          # spaces or other metacharacters need no escaping.
          system("tar", "-zxf", archive_path.to_s, "-C", tmp_path,
                 out: File::NULL, err: File::NULL)
        end
      end

      if file_type =~ /Zip/
        return lambda do |tmp_path, archive_path|
          system("unzip", "-qq", "-d", tmp_path, archive_path.to_s,
                 out: File::NULL, err: File::NULL)
        end
      end

      nil
    end

    # Output of `file -z` for the archive; empty when the utility itself
    # cannot be started, which leaves the archive unrecognised.
    def detect_file_type
      IO.popen(["file", "-z", @archive_path.to_s], &:read).to_s
    rescue Errno::ENOENT
      ""
    end

    # Sorted entries of dir without the "." and ".." pseudo-entries. Only
    # those two are dropped; names that merely end in a dot are kept.
    def path_entries(dir)
      Dir.entries(dir).reject { |e| e == "." || e == ".." }.sort
    end

    # Finds the directory that contains meta.xml, looking at the working
    # directory first and then at its immediate subdirectories.
    def files_path
      return nil unless File.directory?(@path)

      entries = path_entries(@path)
      return @path if entries.include?("meta.xml")

      entries.map { |e| File.join(@path, e) }.find do |candidate|
        File.directory?(candidate) &&
          path_entries(candidate).include?("meta.xml")
      end
    end
  end
end
@@ -1,75 +0,0 @@
class DarwinCore
  # Assembles a Darwin Core Archive: writes core and extension CSV files plus
  # meta.xml / eml.xml into a scratch directory and packs them with tar.
  class Generator
    # eml_xml_data - metadata hash last passed to #add_eml_xml
    # path         - scratch directory the archive files are written to
    attr_reader :eml_xml_data, :path

    # dwc_path - file name of the archive written by #pack.
    # tmp_dir  - directory under which the scratch directory is created.
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
      FileUtils.mkdir(@path)
      @meta_xml_data = { :extensions => [] }
      @eml_xml_data = { :id => nil, :title => nil, :authors => [],
                        :abstract => nil, :citation => nil, :url => nil }
      @write = "w:utf-8"
    end

    # Removes the scratch directory.
    def clean
      FileUtils.rm_rf(@path) if File.exist?(@path)
    end

    # Writes the core data file and records it for meta.xml.
    #
    # data         - array of rows; the first row holds the field term URLs.
    #                The array and its strings are left unmodified.
    # file_name    - name of the CSV file inside the archive.
    # keep_headers - when true a header row of short term names is written.
    #
    # Returns nil.
    # Raises DarwinCore::GeneratorError when the header row is missing or a
    # header field is not an http(s) URL.
    def add_core(data, file_name, keep_headers = true)
      @meta_xml_data[:core] = write_table(data, file_name, keep_headers)
      nil
    end

    # Writes an extension data file and records it for meta.xml. Takes the
    # same arguments as #add_core plus:
    #
    # row_type - rowType URL recorded for the extension.
    #
    # Returns nil.
    # Raises DarwinCore::GeneratorError as for #add_core.
    def add_extension(data, file_name, keep_headers = true,
                      row_type = "http://rs.tdwg.org/dwc/terms/Taxon")
      table = write_table(data, file_name, keep_headers)
      @meta_xml_data[:extensions] << table.merge(:rowType => row_type)
      nil
    end

    # Writes meta.xml describing the files added so far.
    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    # Stores data as the archive metadata and writes eml.xml from it.
    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    # Sorted names of the files in the scratch directory, or nil when the
    # directory does not exist. Only "." and ".." are left out.
    def files
      return nil unless @path && File.exist?(@path)

      Dir.entries(@path).reject { |e| e == "." || e == ".." }.sort
    end

    # Packs every file of the scratch directory into a gzipped tar at the
    # path given to the constructor. tar runs with the scratch directory as
    # its working directory, so a relative archive path resolves against it.
    #
    # Returns the result of Kernel#system (true when tar exits with 0).
    def pack
      # Argument-array form: no shell parses the paths, so no quoting is
      # needed; "--" keeps entry names from being read as tar options.
      system("tar", "-zcf", @dwc_path.to_s, "--", *files.to_a, :chdir => @path)
    end

    private

    # Shared implementation of #add_core and #add_extension: validates the
    # header, writes the CSV file and returns the hash describing it.
    def write_table(data, file_name, keep_headers)
      header, *rows = data
      terms = normalize_header(header)
      # Validation happens before the file is opened, so a rejected header
      # leaves neither an open handle nor an empty file behind.
      CSV.open(File.join(@path, file_name), @write) do |csv|
        csv << terms.map { |term| term.split("/").last } if keep_headers
        rows.each { |row| csv << row }
      end
      { :fields => terms,
        :ignoreHeaderLines => keep_headers ? 1 : 0,
        :location => file_name }
    end

    # Returns stripped copies of the header fields.
    # Raises DarwinCore::GeneratorError unless every field is an http(s) URL.
    def normalize_header(header)
      message = "No header in core data, or header fields are not urls"
      raise DarwinCore::GeneratorError, message if header.nil?

      header.map do |field|
        term = field.to_s.strip
        raise DarwinCore::GeneratorError, message unless term =~ %r{\Ahttps?://}

        term
      end
    end
  end
end
@@ -1,84 +0,0 @@
class DarwinCore
  class Generator
    # Writes the eml.xml metadata document of a generated archive.
    class EmlXml
      # data - metadata hash. Keys read here: :id, :system, :title, :license,
      #        :authors, :metadata_providers, :abstract, :citation, :logo_url.
      #        Authors and metadata providers are hashes with :first_name,
      #        :last_name, :organization, :position, :url and :email.
      # path - directory in which eml.xml is created.
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the document with Nokogiri and writes it to <path>/eml.xml.
      #
      # Returns nil.
      def create
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.eml(root_attributes) do
            xml.dataset(:id => @data[:id]) do
              xml.title(@data[:title])
              xml.license(@data[:license])
              contacts = []
              # A missing :authors list is treated as empty.
              (@data[:authors] || []).each_with_index do |author, i|
                creator_id = i + 1
                contacts << creator_id
                xml.creator(:id => creator_id, :scope => "document") do
                  add_person(xml, author)
                end
              end
              (@data[:metadata_providers] || []).each do |provider|
                xml.metadataProvider { add_person(xml, provider) }
              end
              xml.pubDate(Time.now.to_s)
              xml.abstract { xml.para(@data[:abstract]) }
              # Every creator is also referenced as a contact by its id.
              contacts.each do |contact|
                xml.contact { xml.references(contact) }
              end
            end
            xml.additionalMetadata do
              xml.metadata do
                xml.citation(@data[:citation])
                xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
              end
            end
            # Assigns the first declared namespace (xmlns:eml) to the root.
            xml.parent.namespace = xml.parent.namespace_definitions.first
          end
        end
        # Block form closes the handle even when the write raises.
        File.open(File.join(@path, "eml.xml"), @write) do |file|
          file.write(builder.to_xml)
        end
        nil
      end

      private

      # Attributes of the root <eml> element.
      def root_attributes
        {
          :packageId => "#{@data[:id]}/#{timestamp}",
          :system => @data[:system] || "http://globalnames.org",
          :'xml:lang' => "en",
          :'xmlns:eml' => "eml://ecoinformatics.org/eml-2.1.1",
          :'xmlns:md' => "eml://ecoinformatics.org/methods-2.1.1",
          :'xmlns:proj' => "eml://ecoinformatics.org/project-2.1.1",
          :'xmlns:d' => "eml://ecoinformatics.org/dataset-2.1.1",
          :'xmlns:res' => "eml://ecoinformatics.org/resource-2.1.1",
          :'xmlns:dc' => "http://purl.org/dc/terms/",
          :'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
          :'xsi:schemaLocation' => "eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
        }
      end

      # Emits the child elements shared by <creator> and <metadataProvider>.
      def add_person(xml, person)
        xml.individualName do
          xml.givenName(person[:first_name])
          xml.surName(person[:last_name])
        end
        xml.organizationName(person[:organization]) if person[:organization]
        xml.positionName(person[:position]) if person[:position]
        xml.onlineUrl(person[:url]) if person[:url]
        xml.electronicMailAddress(person[:email])
      end

      # UTC time formatted as "year-month-day::hour:min:sec" without zero
      # padding; used as the changing part of packageId.
      #
      # now - the Time to format (defaults to the current time).
      def timestamp(now = Time.now)
        t = now.getutc
        "#{t.year}-#{t.month}-#{t.day}::#{t.hour}:#{t.min}:#{t.sec}"
      end
    end
  end
end
@@ -1,50 +0,0 @@
class DarwinCore
  class Generator
    # Writes the meta.xml descriptor of a generated archive.
    class MetaXml
      # Attributes of the root <archive> element.
      ARCHIVE_ATTRIBUTES = {
        :xmlns => "http://rs.tdwg.org/dwc/text/",
        "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
      }.freeze

      # Attributes shared by the <core> and <extension> elements.
      FILE_ATTRIBUTES = {
        :encoding => "UTF-8",
        :fieldsTerminatedBy => ",",
        :fieldsEnclosedBy => '"',
        :linesTerminatedBy => "\n",
        :rowType => "http://rs.tdwg.org/dwc/terms/Taxon"
      }.freeze

      # data - hash with a :core entry and an :extensions list; each entry
      #        has :fields (term URLs), :ignoreHeaderLines and :location,
      #        and extensions additionally have :rowType.
      # path - directory in which meta.xml is created.
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the descriptor with Nokogiri and writes it to <path>/meta.xml.
      #
      # Returns nil.
      # Raises DarwinCore::GeneratorError when no core data was provided or
      # when a file does not have exactly one taxonID field.
      def create
        core = @data[:core]
        unless core
          raise DarwinCore::GeneratorError,
                "Core data has to be added before meta.xml is created"
        end

        builder = Nokogiri::XML::Builder.new do |xml|
          xml.archive(ARCHIVE_ATTRIBUTES) do
            core_attrs = FILE_ATTRIBUTES.merge(
              :ignoreHeaderLines => core[:ignoreHeaderLines]
            )
            xml.core(core_attrs) do
              xml.files { xml.location(core[:location]) }
              taxon_id, fields = find_taxon_id(core[:fields])
              xml.id_(:index => taxon_id[1])
              fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
            end
            (@data[:extensions] || []).each do |ext|
              ext_attrs = FILE_ATTRIBUTES.merge(
                :ignoreHeaderLines => ext[:ignoreHeaderLines],
                :rowType => ext[:rowType]
              )
              xml.extension(ext_attrs) do
                xml.files { xml.location(ext[:location]) }
                taxon_id, fields = find_taxon_id(ext[:fields])
                xml.coreid(:index => taxon_id[1])
                fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
              end
            end
          end
        end
        # Block form closes the handle even when the write raises.
        File.open(File.join(@path, "meta.xml"), @write) do |file|
          file.write(builder.to_xml)
        end
        nil
      end

      private

      # Splits the field terms into the taxonID field and all the others.
      #
      # data - array of field term URLs, in column order.
      #
      # Returns [[taxon_id_term, index], [[term, index], ...]].
      # Raises DarwinCore::GeneratorError unless exactly one term ends in
      # "/taxonid" (case-insensitive).
      def find_taxon_id(data)
        indexed = data.each_with_index.map { |term, i| [term.strip, i] }
        taxon_ids, others = indexed.partition { |term, _| term =~ %r{/taxonid\z}i }
        if taxon_ids.size != 1
          raise DarwinCore::GeneratorError,
                "Expected exactly one taxonID field, found #{taxon_ids.size}"
        end
        [taxon_ids.first, others]
      end
    end
  end
end
@@ -1,101 +0,0 @@
# encoding: utf-8
class DarwinCore
  # Shared reading logic for the delimited data files of an archive. The
  # including class is expected to set @dwc, @path and @data (the parsed
  # meta.xml entry of the file) and then call get_attributes.
  module Ingester
    attr_reader :data, :properties, :encoding, :field_separator
    attr_reader :file_path, :fields, :line_separator,
                :quote_character, :ignore_headers

    # Historical reader name; it used to return nil because the value lives
    # in @field_separator.
    alias fields_separator field_separator

    # Number of lines in the data file (counted once, then memoised).
    def size
      @size ||= get_size
    end

    # Reads the data file as CSV.
    #
    # batch_size - number of data rows after which progress is logged and,
    #              when a block is given, the rows collected so far are
    #              yielded and the buffers are reset.
    #
    # Rows shorter than the highest declared field index, or containing
    # invalid UTF-8, are collected as errors instead of results.
    #
    # Yields [rows, errors] per batch and once more for the remainder.
    # Returns [rows, errors]: everything read when no block is given,
    # otherwise only the remainder after the last batch.
    def read(batch_size = 10000)
      DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
      res = []
      errors = []
      # An empty quote character is replaced by backspace, a character not
      # expected to occur in the data.
      @quote_character = "\b" if @quote_character.empty?
      csv_options = { :col_sep => @field_separator,
                      :quote_char => @quote_character }
      min_size = @fields.map { |f| f[:index].to_i }.max + 1
      ingested = 0
      # Block form closes the file even when a row or the caller's block
      # raises; options are passed as keywords, as CSV.new requires.
      File.open(@file_path) do |file|
        CSV.new(file, **csv_options).each_with_index do |row, i|
          next if @ignore_headers && i.zero?

          min_size > row.size ? errors << row : process_csv_row(res, errors, row)
          # Counting the data rows directly keeps batch boundaries at exact
          # multiples of batch_size whether or not a header row was skipped.
          ingested += 1
          next unless (ingested % batch_size).zero?

          DarwinCore.logger_write(@dwc.object_id,
                                  "Ingested %s records from %s" %
                                  [ingested, name])
          next unless block_given?

          yield [res, errors]
          res = []
          errors = []
        end
      end
      yield [res, errors] if block_given?
      [res, errors]
    end

    private

    # Lower-cased class name without namespaces; used in log messages.
    def name
      self.class.to_s.split("::")[-1].downcase
    end

    # Appends row, with every field tagged as UTF-8, to result when all
    # fields are valid UTF-8; otherwise appends the untouched row to errors.
    def process_csv_row(result, errors, row)
      utf8_row = row.map { |f| f.nil? ? nil : f.dup.force_encoding("utf-8") }
      if utf8_row.compact.all?(&:valid_encoding?)
        result << utf8_row
      else
        errors << row
      end
    end

    # Derives the reader settings from @data.
    #
    # exception - accepted for callers' convenience; not used here.
    #
    # Raises DarwinCore::EncodingError for encodings other than UTF-8/16.
    # Raises DarwinCore::FileNotFoundError when no file location is given.
    # Raises DarwinCore::InvalidArchiveError when no fields are declared.
    def get_attributes(exception)
      @properties = @data[:attributes] || {}
      @encoding = @properties[:encoding] || "UTF-8"
      err_msg = "No support for encodings other " \
                "than utf-8 or utf-16 at the moment"
      encodings = ["utf-8", "utf8", "utf-16", "utf16"]
      unless encodings.include? @encoding.downcase
        raise DarwinCore::EncodingError, err_msg
      end
      @field_separator = get_field_separator
      @quote_character = @properties[:fieldsEnclosedBy] || ""
      # Single quotes: the default is the two characters backslash and "n".
      @line_separator = @properties[:linesTerminatedBy] || '\n'
      @ignore_headers = if @properties[:ignoreHeaderLines]
                          [1, true].include?(@properties[:ignoreHeaderLines])
                        else
                          false
                        end
      @file_path = get_file_path
      raise DarwinCore::FileNotFoundError, "No file data" unless @file_path
      @fields = get_fields
      if @fields.empty?
        raise DarwinCore::InvalidArchiveError, "No data fields are found"
      end
    end

    # Full path of the data file, or nil when meta.xml names no location.
    def get_file_path
      file = @data[:location] ||
             @properties[:location] ||
             (@data[:files] && @data[:files][:location])
      file && File.join(@path, file)
    end

    # Attribute hashes of the declared fields. A single field entry is
    # wrapped in an array (and stored back); a missing one yields [].
    def get_fields
      raw = @data[:field]
      @data[:field] = [raw].compact unless raw.is_a?(Array)
      @data[:field].map { |f| f[:attributes] }
    end

    # Field separator from meta.xml (default ","); the escaped form "\\t"
    # is translated into a real tab.
    def get_field_separator
      res = @properties[:fieldsTerminatedBy] || ","
      res = "\t" if res == "\\t"
      res
    end

    # Counts the lines of the data file without shelling out. An
    # unterminated final line is counted as a line too.
    def get_size
      File.open(@file_path, "rb") { |file| file.each_line.count }
    end
  end
end
@@ -1,42 +0,0 @@
class DarwinCore
  # Read-only view of the EML metadata (eml.xml) of an archive, held as a
  # nested hash whose root key is :eml.
  class Metadata
    # archive - object responding to #eml; nil leaves the metadata empty.
    def initialize(archive = nil)
      @archive = archive
      @metadata = archive && archive.eml
    end

    # The raw metadata hash.
    def data
      @metadata
    end

    # Value of the dataset's id attribute, or nil when absent.
    def id
      lookup(:eml, :dataset, :attributes, :id)
    end

    # Value of the root element's packageId attribute, or nil when absent.
    def package_id
      lookup(:eml, :attributes, :packageId)
    end

    # Dataset title, or nil when absent.
    def title
      lookup(:eml, :dataset, :title)
    end

    # Dataset creators as hashes with :first_name, :last_name and :email.
    # A single creator entry is treated as a one-element list; the stored
    # metadata is not modified.
    #
    # Returns nil when the dataset has no creator entry.
    def authors
      creators = lookup(:eml, :dataset, :creator)
      return nil if creators.nil?

      creators = [creators] unless creators.is_a?(Array)
      creators.map do |creator|
        # A creator entry may lack individualName; its name fields are
        # then reported as nil.
        person = creator[:individualName] || {}
        { :first_name => person[:givenName],
          :last_name => person[:surName],
          :email => creator[:electronicMailAddress] }
      end
    end

    # Dataset abstract, or nil when absent.
    def abstract
      lookup(:eml, :dataset, :abstract)
    end

    # Citation from the additional metadata, or nil when absent.
    def citation
      lookup(:eml, :additionalMetadata, :metadata, :citation)
    end

    # Online distribution URL of the dataset, or nil when absent.
    def url
      lookup(:eml, :dataset, :distribution, :online, :url)
    end

    private

    # Follows keys through the nested metadata hash.
    #
    # Returns the value found, or nil as soon as a level is not a Hash.
    def lookup(*keys)
      keys.inject(@metadata) do |node, key|
        return nil unless node.is_a?(Hash)

        node[key]
      end
    end
  end
end