dwc-archive 0.9.6 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
@@ -1,17 +0,0 @@
1
- class DarwinCore
2
- class Core
3
- include DarwinCore::Ingester
4
- attr_reader :id
5
- def initialize(dwc)
6
- @dwc = dwc
7
- @archive = @dwc.archive
8
- @path = @archive.files_path
9
- root_key = @archive.meta.keys[0]
10
- @data = @archive.meta[root_key][:core]
11
- raise DarwinCore::CoreFileError.new("Cannot find core in meta.xml, is meta.xml valid?") unless @data
12
- @id = @data[:id][:attributes]
13
- # raise DarwinCore::CoreFileError.new("Cannot find core identifier") unless @id
14
- get_attributes(DarwinCore::CoreFileError)
15
- end
16
- end
17
- end
@@ -1,80 +0,0 @@
1
- class DarwinCore
2
- class Expander
3
- def initialize(archive_path, tmp_dir)
4
- @archive_path = archive_path
5
- @tmp_dir = tmp_dir
6
- @path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
7
- @unpacker = get_unpacker
8
- end
9
-
10
- def unpack
11
- clean
12
- raise DarwinCore::FileNotFoundError unless File.exists?(@archive_path)
13
- success = @unpacker.call(@path, @archive_path) if @unpacker
14
- (@unpacker && success && $?.exitstatus == 0) ?
15
- success :
16
- (clean; raise DarwinCore::UnpackingError)
17
- end
18
-
19
- def path
20
- @files_path ||= files_path
21
- end
22
-
23
- def clean
24
- FileUtils.rm_rf(@path) if FileTest.exists?(@path)
25
- end
26
-
27
- def files
28
- return nil unless path && FileTest.exists?(path)
29
- Dir.entries(path).select {|e| e !~ /[\.]{1,2}$/}.sort
30
- end
31
-
32
- private
33
-
34
- def esc(a_str)
35
- "'" + a_str.gsub(92.chr, '\\\\\\').gsub("'", "\\\\'") + "'"
36
- end
37
-
38
- def get_unpacker
39
- file_command = IO.popen("file -z " + esc(@archive_path))
40
- file_type = file_command.read
41
- file_command.close
42
-
43
- if file_type.match(/tar.*gzip/i)
44
- return proc do |tmp_path, archive_path|
45
- FileUtils.mkdir tmp_path
46
- system("tar -zxf #{esc(archive_path)} -C #{tmp_path} > /dev/null 2>&1")
47
- end
48
- end
49
-
50
- if file_type.match(/Zip/)
51
- return proc { |tmp_path, archive_path| system("unzip -qq -d #{tmp_path} #{esc(archive_path)} > /dev/null 2>&1") }
52
- end
53
-
54
- return nil
55
- end
56
-
57
- def path_entries(dir)
58
- Dir.entries(dir).select {|e| e !~ /[\.]{1,2}$/}.sort
59
- end
60
-
61
- def files_path
62
- res = nil
63
- entries = path_entries(@path)
64
- if entries.include?('meta.xml')
65
- res = @path
66
- else
67
- entries.each do |e|
68
- check_path = File.join(@path, e)
69
- if FileTest.directory?(check_path)
70
- if path_entries(check_path).include?('meta.xml')
71
- res = check_path
72
- break
73
- end
74
- end
75
- end
76
- end
77
- res
78
- end
79
- end
80
- end
@@ -1,75 +0,0 @@
1
- class DarwinCore
2
- class Generator
3
- attr_reader :eml_xml_data
4
-
5
- #TODO refactor -- for now copying expander methods
6
- def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
7
- @dwc_path = dwc_path
8
- @path = File.join(tmp_dir, 'dwc_' + rand(10000000000).to_s)
9
- FileUtils.mkdir(@path)
10
- @meta_xml_data = {:extensions => []}
11
- @eml_xml_data = {:id => nil, :title => nil, :authors => [], :abstract => nil, :citation => nil, :url => nil}
12
- @write = 'w:utf-8'
13
- end
14
-
15
- #TODO refactor!
16
- def clean
17
- FileUtils.rm_rf(@path) if FileTest.exists?(@path)
18
- end
19
-
20
- def add_core(data, file_name, keep_headers = true)
21
- c = CSV.open(File.join(@path,file_name), @write)
22
- header = data.shift
23
- fields = header.map do |f|
24
- f.strip!
25
- raise DarwinCore::GeneratorError.new("No header in core data, or header fields are not urls") unless f.match(/^http:\/\//)
26
- f.split("/")[-1]
27
- end
28
- data.unshift(fields) if keep_headers
29
- ignore_header_lines = keep_headers ? 1 : 0
30
- @meta_xml_data[:core] = {:fields => header, :ignoreHeaderLines => ignore_header_lines, :location => file_name}
31
- data.each {|d| c << d}
32
- c.close
33
- end
34
-
35
- def add_extension(data, file_name, keep_headers = true, row_type = "http://rs.tdwg.org/dwc/terms/Taxon")
36
- c = CSV.open(File.join(@path,file_name), @write)
37
- header = data.shift
38
- fields = header.map do |f|
39
- f.strip!
40
- raise DarwinCore::GeneratorError.new("No header in core data, or header fields are not urls") unless f.match(/^http:\/\//)
41
- f.split("/")[-1]
42
- end
43
- data.unshift(fields) if keep_headers
44
- ignore_header_lines = keep_headers ? 1 : 0
45
- @meta_xml_data[:extensions] << { :fields => header, :ignoreHeaderLines => ignore_header_lines, :location => file_name, :rowType => row_type }
46
- data.each { |d| c << d }
47
- c.close
48
- end
49
-
50
- def add_meta_xml
51
- meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
52
- meta.create
53
- end
54
-
55
- def add_eml_xml(data)
56
- @eml_xml_data = data
57
- eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
58
- eml.create
59
- end
60
-
61
- def path
62
- @path
63
- end
64
-
65
- def files
66
- return nil unless @path && FileTest.exists?(@path)
67
- Dir.entries(@path).select {|e| e !~ /[\.]{1,2}$/}.sort
68
- end
69
-
70
- def pack
71
- a = "cd #{@path}; tar -zcf #{@dwc_path} *"
72
- system(a)
73
- end
74
- end
75
- end
@@ -1,84 +0,0 @@
1
- class DarwinCore
2
- class Generator
3
- class EmlXml
4
-
5
- def initialize(data, path)
6
- @data = data
7
- @path = path
8
- @write = 'w:utf-8'
9
- end
10
-
11
- def create
12
- builder = Nokogiri::XML::Builder.new do |xml|
13
- xml.eml(:packageId => "%s/%s" % [@data[:id], timestamp],
14
- :system => @data[:system] || "http://globalnames.org",
15
- :'xml:lang' => "en",
16
- :'xmlns:eml' => "eml://ecoinformatics.org/eml-2.1.1",
17
- :'xmlns:md' => "eml://ecoinformatics.org/methods-2.1.1",
18
- :'xmlns:proj' => "eml://ecoinformatics.org/project-2.1.1",
19
- :'xmlns:d' => "eml://ecoinformatics.org/dataset-2.1.1",
20
- :'xmlns:res' => "eml://ecoinformatics.org/resource-2.1.1",
21
- :'xmlns:dc' => "http://purl.org/dc/terms/",
22
- :'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
23
- :'xsi:schemaLocation' => "eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd") do
24
- xml.dataset(:id => @data[:id]) do
25
- xml.title(@data[:title])
26
- xml.license(@data[:license])
27
- contacts = []
28
- @data[:authors].each_with_index do |a, i|
29
- creator_id = i + 1
30
- contacts << creator_id
31
- xml.creator(:id => creator_id, :scope => 'document') do
32
- xml.individualName do
33
- xml.givenName(a[:first_name])
34
- xml.surName(a[:last_name])
35
- end
36
- xml.organizationName(a[:organization]) if a[:organization]
37
- xml.positionName(a[:position]) if a[:position]
38
- xml.onlineUrl(a[:url]) if a[:url]
39
- xml.electronicMailAddress(a[:email])
40
- end
41
- end
42
- @data[:metadata_providers].each_with_index do |a, i|
43
- xml.metadataProvider do
44
- xml.individualName do
45
- xml.givenName(a[:first_name])
46
- xml.surName(a[:last_name])
47
- end
48
- xml.organizationName(a[:organization]) if a[:organization]
49
- xml.positionName(a[:position]) if a[:position]
50
- xml.onlineUrl(a[:url]) if a[:url]
51
- xml.electronicMailAddress(a[:email])
52
- end
53
- end if @data[:metadata_providers]
54
- xml.pubDate(Time.now.to_s)
55
- xml.abstract() do
56
- xml.para(@data[:abstract])
57
- end
58
- contacts.each do |contact|
59
- xml.contact { xml.references(contact) }
60
- end
61
- end
62
- xml.additionalMetadata do
63
- xml.metadata do
64
- xml.citation(@data[:citation])
65
- xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
66
- end
67
- end
68
- xml.parent.namespace = xml.parent.namespace_definitions.first
69
- end
70
- end
71
- data = builder.to_xml
72
- f = open(File.join(@path, 'eml.xml'), @write)
73
- f.write(data)
74
- f.close
75
- end
76
-
77
- private
78
- def timestamp
79
- t = Time.now.getutc.to_a[0..5].reverse
80
- t[0..2].join('-') + "::" + t[-3..-1].join(':')
81
- end
82
- end
83
- end
84
- end
@@ -1,50 +0,0 @@
1
- class DarwinCore
2
- class Generator
3
- class MetaXml
4
- def initialize(data, path)
5
- @data = data
6
- @path = path
7
- @write = 'w:utf-8'
8
- end
9
-
10
- def create
11
- builder = Nokogiri::XML::Builder.new do |xml|
12
- opts = { :encoding => "UTF-8", :fieldsTerminatedBy => ",", :fieldsEnclosedBy => '"', :linesTerminatedBy => "\n", :rowType => "http://rs.tdwg.org/dwc/terms/Taxon" }
13
- xml.archive(:xmlns => "http://rs.tdwg.org/dwc/text/",
14
- "xmlns:xsi" =>"http://www.w3.org/2001/XMLSchema-instance",
15
- "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd") do
16
- xml.core(opts.merge(:ignoreHeaderLines => @data[:core][:ignoreHeaderLines])) do
17
- xml.files { xml.location(@data[:core][:location]) }
18
- taxon_id, fields = find_taxon_id(@data[:core][:fields])
19
- xml.id_(:index => taxon_id[1])
20
- fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
21
- end
22
- @data[:extensions].each do |e|
23
- xml.extension(opts.merge(:ignoreHeaderLines => e[:ignoreHeaderLines], :rowType => e[:rowType])) do
24
- xml.files { xml.location(e[:location]) }
25
- taxon_id, fields = find_taxon_id(e[:fields])
26
- xml.coreid(:index => taxon_id[1])
27
- fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
28
- end
29
- end
30
- end
31
- end
32
- meta_xml_data = builder.to_xml
33
- meta_file = open(File.join(@path, 'meta.xml'), @write)
34
- meta_file.write(meta_xml_data)
35
- meta_file.close
36
- end
37
-
38
- private
39
- def find_taxon_id(data)
40
- fields = []
41
- data.each_with_index { |f, i| fields << [f.strip, i] }
42
- taxon_id, fields = fields.partition { |f| f[0].match(/\/taxonid$/i) }
43
- raise DarwinCore::GeneratorError if taxon_id.size != 1
44
- [taxon_id[0], fields]
45
- end
46
-
47
- end
48
- end
49
- end
50
-
@@ -1,101 +0,0 @@
1
- # encoding: utf-8
2
- class DarwinCore
3
- module Ingester
4
- attr_reader :data, :properties, :encoding, :fields_separator, :size
5
- attr_reader :file_path, :fields, :line_separator,
6
- :quote_character, :ignore_headers
7
-
8
- def size
9
- @size ||= get_size
10
- end
11
-
12
- def read(batch_size = 10000)
13
- DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
14
- res = []
15
- errors = []
16
- index_fix = 1
17
- args = {:col_sep => @field_separator}
18
- @quote_character = "\b" if @quote_character.empty?
19
- args.merge!({:quote_char => @quote_character})
20
- min_size = @fields.map {|f| f[:index].to_i || 0}.sort[-1] + 1
21
- csv = CSV.new(open(@file_path), args)
22
- csv.each_with_index do |r, i|
23
- index_fix = 0; next if @ignore_headers && i == 0
24
- min_size > r.size ? errors << r : process_csv_row(res, errors, r)
25
- if (i + index_fix) % batch_size == 0
26
- DarwinCore.logger_write(@dwc.object_id,
27
- "Ingested %s records from %s" %
28
- [(i + index_fix), name])
29
- if block_given?
30
- yield [res, errors]
31
- res = []
32
- errors = []
33
- end
34
- end
35
- end
36
- yield [res, errors] if block_given?
37
- [res, errors]
38
- end
39
-
40
- private
41
- def name
42
- self.class.to_s.split('::')[-1].downcase
43
- end
44
-
45
- def process_csv_row(result, errors, row)
46
- str = row.join('')
47
- str = str.force_encoding('utf-8')
48
- if str.encoding.name == 'UTF-8' && str.valid_encoding?
49
- result << row.map { |f| f.nil? ? nil : f.force_encoding('utf-8') }
50
- else
51
- errors << row
52
- end
53
- end
54
-
55
- def get_attributes(exception)
56
- @properties = @data[:attributes]
57
- @encoding = @properties[:encoding] || 'UTF-8'
58
- err_msg = 'No support for encodings other ' +
59
- 'than utf-8 or utf-16 at the moment'
60
- encodings = ['utf-8', 'utf8', 'utf-16', 'utf16']
61
- unless encodings.include? @encoding.downcase
62
- raise DarwinCore::EncodingError.new(err_msg)
63
- end
64
- @field_separator = get_field_separator
65
- @quote_character = @properties[:fieldsEnclosedBy] || ""
66
- @line_separator = @properties[:linesTerminatedBy] || '\n'
67
- @ignore_headers = @properties[:ignoreHeaderLines] ?
68
- [1, true].include?(@properties[:ignoreHeaderLines]) :
69
- false
70
- @file_path = get_file_path
71
- raise DarwinCore::FileNotFoundError.new("No file data") unless @file_path
72
- @fields = get_fields
73
- if @fields.empty?
74
- raise DarwinCore::InvalidArchiveError.new("No data fields are found")
75
- end
76
- end
77
-
78
- def get_file_path
79
- file = @data[:location] ||
80
- @data[:attributes][:location] ||
81
- @data[:files][:location]
82
- File.join(@path, file)
83
- end
84
-
85
- def get_fields
86
- @data[:field] = [data[:field]] if data[:field].class != Array
87
- @data[:field].map {|f| f[:attributes]}
88
- end
89
-
90
- def get_field_separator
91
- res = @properties[:fieldsTerminatedBy] || ','
92
- res = "\t" if res == "\\t"
93
- res
94
- end
95
-
96
- def get_size
97
- `wc -l #{@file_path}`.match(/^\s*([\d]+)\s/)[1].to_i
98
- end
99
- end
100
- end
101
-
@@ -1,42 +0,0 @@
1
- class DarwinCore
2
- class Metadata
3
- def initialize(archive = nil)
4
- @archive = archive
5
- @metadata = @archive.eml
6
- end
7
-
8
- def data
9
- @metadata
10
- end
11
-
12
- def id
13
- @metadata[:eml][:dataset][:attributes][:id] rescue nil
14
- end
15
-
16
- def package_id
17
- @metadata.data[:eml][:attributes][:packageId] rescue nil
18
- end
19
-
20
- def title
21
- @metadata[:eml][:dataset][:title] rescue nil
22
- end
23
-
24
- def authors
25
- return nil unless defined?(@metadata[:eml][:dataset][:creator])
26
- @metadata[:eml][:dataset][:creator] = [@metadata[:eml][:dataset][:creator]] unless @metadata[:eml][:dataset][:creator].class == Array
27
- @metadata[:eml][:dataset][:creator].map {|c| {:first_name => c[:individualName][:givenName], :last_name => c[:individualName][:surName], :email => c[:electronicMailAddress]}}
28
- end
29
-
30
- def abstract
31
- @metadata[:eml][:dataset][:abstract] rescue nil
32
- end
33
-
34
- def citation
35
- @metadata[:eml][:additionalMetadata][:metadata][:citation] rescue nil
36
- end
37
-
38
- def url
39
- @metadata[:eml][:dataset][:distribution][:online][:url] rescue nil
40
- end
41
- end
42
- end