dwc-archive 0.9.6 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -5
- data/CHANGELOG +15 -7
- data/Gemfile +3 -15
- data/LICENSE +1 -1
- data/README.md +135 -111
- data/Rakefile +13 -54
- data/dwc-archive.gemspec +37 -0
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +121 -0
- data/lib/dwc_archive/archive.rb +59 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
- data/lib/dwc_archive/expander.rb +85 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +90 -0
- data/lib/dwc_archive/generator_eml_xml.rb +116 -0
- data/lib/dwc_archive/generator_meta_xml.rb +72 -0
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +56 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +89 -0
- data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +47 -0
- data/spec/files/generator_meta.xml +19 -0
- data/spec/lib/classification_normalizer_spec.rb +214 -0
- data/spec/lib/core_spec.rb +100 -0
- data/spec/lib/darwin_core_spec.rb +249 -0
- data/spec/lib/generator_eml_xml_spec.rb +22 -0
- data/spec/lib/generator_meta_xml_spec.rb +22 -0
- data/spec/lib/generator_spec.rb +124 -0
- data/spec/lib/gnub_taxon_spec.rb +32 -0
- data/spec/lib/metadata_spec.rb +89 -0
- data/spec/lib/taxon_normalized_spec.rb +142 -0
- data/spec/lib/xml_reader_spec.rb +11 -11
- data/spec/spec_helper.rb +78 -6
- metadata +180 -92
- data/.rvmrc +0 -1
- data/Gemfile.lock +0 -155
- data/VERSION +0 -1
- data/lib/dwc-archive.rb +0 -95
- data/lib/dwc-archive/.expander.rb.swo +0 -0
- data/lib/dwc-archive/archive.rb +0 -37
- data/lib/dwc-archive/classification_normalizer.rb +0 -424
- data/lib/dwc-archive/core.rb +0 -17
- data/lib/dwc-archive/expander.rb +0 -80
- data/lib/dwc-archive/generator.rb +0 -75
- data/lib/dwc-archive/generator_eml_xml.rb +0 -84
- data/lib/dwc-archive/generator_meta_xml.rb +0 -50
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -42
- data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
- data/lib/dwc-archive/xml_reader.rb +0 -64
- data/spec/lib/dwc-archive_spec.rb +0 -250
- data/spec/spec.opts +0 -1
    
        data/lib/dwc-archive/core.rb
    DELETED
    
    | @@ -1,17 +0,0 @@ | |
| 1 | 
            -
            class DarwinCore
              # Describes the core data file of an archive, based on the :core entry
              # of the archive's parsed meta data. Row reading is provided by the
              # DarwinCore::Ingester mixin.
              class Core
                include DarwinCore::Ingester

                # Attributes of the core's id element, or nil when the core entry
                # has no :id.
                attr_reader :id

                # dwc - object responding to #archive; the archive must respond to
                #       #files_path and #meta (a hash keyed by the root element).
                #
                # Raises DarwinCore::CoreFileError when the meta data has no core entry.
                def initialize(dwc)
                  @dwc = dwc
                  @archive = @dwc.archive
                  @path = @archive.files_path
                  root = @archive.meta[@archive.meta.keys.first]
                  # An empty or malformed meta hash must surface as CoreFileError,
                  # not as NoMethodError on nil.
                  @data = root && root[:core]
                  unless @data
                    raise DarwinCore::CoreFileError,
                          "Cannot find core in meta.xml, is meta.xml valid?"
                  end
                  # A missing id is tolerated (the explicit check for it was
                  # disabled), so do not dereference a nil :id entry.
                  @id = @data[:id][:attributes] if @data[:id]
                  get_attributes(DarwinCore::CoreFileError)
                end
              end
            end
         | 
    
        data/lib/dwc-archive/expander.rb
    DELETED
    
    | @@ -1,80 +0,0 @@ | |
| 1 | 
            -
            class DarwinCore
              # Unpacks a compressed archive (gzipped tar or zip) into a randomly
              # named "dwc_*" directory under tmp_dir and locates the directory
              # that contains meta.xml.
              class Expander
                # archive_path - path of the compressed archive.
                # tmp_dir      - directory under which the scratch directory is made.
                def initialize(archive_path, tmp_dir)
                  @archive_path = archive_path
                  @tmp_dir = tmp_dir
                  @path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
                  @unpacker = get_unpacker
                end

                # Removes any previous scratch directory and unpacks the archive.
                #
                # Returns true on success.
                # Raises DarwinCore::FileNotFoundError when the archive is missing and
                # DarwinCore::UnpackingError when the format is unknown or the
                # unpacking command fails.
                def unpack
                  clean
                  raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)
                  # Kernel#system returns true only for a zero exit status.
                  return true if @unpacker && @unpacker.call(@path, @archive_path)
                  clean
                  raise DarwinCore::UnpackingError
                end

                # Directory holding meta.xml (memoized once found), or nil when it
                # cannot be found or nothing has been unpacked yet.
                def path
                  @files_path ||= files_path
                end

                # Deletes the scratch directory if it exists.
                def clean
                  FileUtils.rm_rf(@path) if File.exist?(@path)
                end

                # Sorted names of the unpacked files, or nil when there is no
                # unpacked archive directory.
                def files
                  dir = path
                  return nil unless dir && File.exist?(dir)
                  path_entries(dir)
                end

                private

                # Picks the unpacking routine from the output of `file -z`.
                # Returns a proc taking (tmp_path, archive_path), or nil when the
                # archive type is not recognised.
                def get_unpacker
                  file_type = detect_file_type
                  return tar_unpacker if file_type =~ /tar.*gzip/i
                  return zip_unpacker if file_type =~ /Zip/
                  nil
                end

                # Runs `file -z` without a shell, so the archive path needs no
                # quoting. A missing `file` binary yields an empty description,
                # which later surfaces as UnpackingError.
                def detect_file_type
                  IO.popen(['file', '-z', '--', @archive_path], &:read)
                rescue SystemCallError
                  ''
                end

                def tar_unpacker
                  proc do |tmp_path, archive_path|
                    FileUtils.mkdir tmp_path
                    system('tar', '-zxf', archive_path, '-C', tmp_path,
                           :out => File::NULL, :err => File::NULL)
                  end
                end

                def zip_unpacker
                  proc do |tmp_path, archive_path|
                    system('unzip', '-qq', '-d', tmp_path, archive_path,
                           :out => File::NULL, :err => File::NULL)
                  end
                end

                # Sorted directory entries without the "." and ".." links. Only
                # those two names are dropped; names that merely end with a dot
                # are kept.
                def path_entries(dir)
                  Dir.entries(dir).reject { |e| e == '.' || e == '..' }.sort
                end

                # Looks for meta.xml in the scratch directory itself, then in its
                # immediate subdirectories (in sorted order).
                def files_path
                  return nil unless File.directory?(@path)
                  entries = path_entries(@path)
                  return @path if entries.include?('meta.xml')
                  candidates = entries.map { |e| File.join(@path, e) }
                  candidates.find do |dir|
                    File.directory?(dir) && path_entries(dir).include?('meta.xml')
                  end
                end
              end
            end
         | 
| @@ -1,75 +0,0 @@ | |
| 1 | 
            -
            class DarwinCore
              # Assembles an archive: writes core/extension CSV files, meta.xml and
              # eml.xml into a randomly named "dwc_*" scratch directory and packs
              # that directory into a gzipped tar file.
              class Generator
                # eml_xml_data - hash most recently used for eml.xml.
                # path         - scratch directory the archive files are written to.
                attr_reader :eml_xml_data, :path

                # dwc_path - destination of the packed archive.
                # tmp_dir  - directory under which the scratch directory is created.
                def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
                  @dwc_path = dwc_path
                  @path = File.join(tmp_dir, 'dwc_' + rand(10_000_000_000).to_s)
                  FileUtils.mkdir(@path)
                  @meta_xml_data = { :extensions => [] }
                  @eml_xml_data = { :id => nil, :title => nil, :authors => [],
                                    :abstract => nil, :citation => nil, :url => nil }
                  @write = 'w:utf-8'
                end

                # Deletes the scratch directory if it exists.
                def clean
                  FileUtils.rm_rf(@path) if File.exist?(@path)
                end

                # Writes the core CSV file and records it for meta.xml.
                #
                # data         - array of rows; the first row holds term URLs.
                # file_name    - name of the CSV file inside the archive.
                # keep_headers - when true, a header row of short term names is
                #                written and ignoreHeaderLines is recorded as 1.
                #
                # Raises DarwinCore::GeneratorError when the header is missing or a
                # header field is not an http:// URL. data is not modified.
                def add_core(data, file_name, keep_headers = true)
                  @meta_xml_data[:core] =
                    write_data_file(data, file_name, keep_headers, 'core')
                  nil
                end

                # Same as add_core for an extension file; row_type is recorded as
                # the extension's rowType.
                def add_extension(data, file_name, keep_headers = true,
                                  row_type = "http://rs.tdwg.org/dwc/terms/Taxon")
                  entry = write_data_file(data, file_name, keep_headers, 'extension')
                  @meta_xml_data[:extensions] << entry.merge(:rowType => row_type)
                  nil
                end

                # Writes meta.xml from the files added so far.
                def add_meta_xml
                  meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
                  meta.create
                end

                # Writes eml.xml from data and remembers data as eml_xml_data.
                def add_eml_xml(data)
                  @eml_xml_data = data
                  eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
                  eml.create
                end

                # Sorted names of the files in the scratch directory, or nil when
                # the directory does not exist.
                def files
                  return nil unless @path && File.exist?(@path)
                  Dir.entries(@path).reject { |e| e == '.' || e == '..' }.sort
                end

                # Packs the visible (non-dot) files of the scratch directory into a
                # gzipped tar at dwc_path. tar is run directly (no shell) from inside
                # the scratch directory, so paths with spaces or shell
                # metacharacters are safe and a missing scratch directory makes
                # the call fail instead of archiving the current directory.
                #
                # Returns true on success, false or nil otherwise (Kernel#system).
                def pack
                  names = (files || []).reject { |f| f.start_with?('.') }
                  system('tar', '-zcf', @dwc_path, '--', *names, :chdir => @path)
                end

                private

                # Validates the header row, writes the CSV file and returns the
                # description hash stored for meta.xml. The header is validated
                # before the file is opened so a bad header leaves no empty file
                # behind, and the block form closes the file even on errors.
                def write_data_file(data, file_name, keep_headers, kind)
                  header, *rows = data
                  header = Array(header).map { |f| f.to_s.strip }
                  if header.empty? || header.any? { |f| f !~ %r{\Ahttp://} }
                    raise DarwinCore::GeneratorError,
                          "No header in #{kind} data, or header fields are not urls"
                  end
                  rows.unshift(header.map { |f| f.split('/').last }) if keep_headers
                  CSV.open(File.join(@path, file_name), @write) do |csv|
                    rows.each { |row| csv << row }
                  end
                  { :fields => header,
                    :ignoreHeaderLines => keep_headers ? 1 : 0,
                    :location => file_name }
                end
              end
            end
         | 
| @@ -1,84 +0,0 @@ | |
| 1 | 
            -
            class DarwinCore
              class Generator
                # Writes eml.xml (dataset metadata) into a directory using
                # Nokogiri's XML builder.
                class EmlXml
                  # data - hash read with the keys :id, :system, :title, :license,
                  #        :authors, :metadata_providers, :abstract, :citation and
                  #        :logo_url. :authors and :metadata_providers are lists of
                  #        hashes with :first_name, :last_name, :organization,
                  #        :position, :url and :email.
                  # path - directory in which eml.xml is created.
                  def initialize(data, path)
                    @data = data
                    @path = path
                    @write = 'w:utf-8'
                  end

                  # Builds the document and writes it to <path>/eml.xml.
                  # Returns nil.
                  def create
                    builder = Nokogiri::XML::Builder.new do |xml|
                      xml.eml(root_attributes) do
                        xml.dataset(:id => @data[:id]) do
                          xml.title(@data[:title])
                          xml.license(@data[:license])
                          contacts = []
                          # A missing :authors list is treated like an empty one,
                          # the same way :metadata_providers is.
                          (@data[:authors] || []).each_with_index do |a, i|
                            creator_id = i + 1
                            contacts << creator_id
                            xml.creator(:id => creator_id, :scope => 'document') do
                              add_person(xml, a)
                            end
                          end
                          (@data[:metadata_providers] || []).each do |a|
                            xml.metadataProvider { add_person(xml, a) }
                          end
                          xml.pubDate(Time.now.to_s)
                          xml.abstract { xml.para(@data[:abstract]) }
                          # Every creator is also referenced as a contact.
                          contacts.each do |contact|
                            xml.contact { xml.references(contact) }
                          end
                        end
                        xml.additionalMetadata do
                          xml.metadata do
                            xml.citation(@data[:citation])
                            xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
                          end
                        end
                        xml.parent.namespace = xml.parent.namespace_definitions.first
                      end
                    end
                    # Block form closes the file even if writing fails; File.open
                    # (unlike Kernel#open) never treats the path as a command.
                    File.open(File.join(@path, 'eml.xml'), @write) do |f|
                      f.write(builder.to_xml)
                    end
                    nil
                  end

                  private

                  # Attributes of the root <eml> element.
                  def root_attributes
                    {
                      :packageId            => "%s/%s" % [@data[:id], timestamp],
                      :system               => @data[:system] || "http://globalnames.org",
                      :'xml:lang'           => "en",
                      :'xmlns:eml'          => "eml://ecoinformatics.org/eml-2.1.1",
                      :'xmlns:md'           => "eml://ecoinformatics.org/methods-2.1.1",
                      :'xmlns:proj'         => "eml://ecoinformatics.org/project-2.1.1",
                      :'xmlns:d'            => "eml://ecoinformatics.org/dataset-2.1.1",
                      :'xmlns:res'          => "eml://ecoinformatics.org/resource-2.1.1",
                      :'xmlns:dc'           => "http://purl.org/dc/terms/",
                      :'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
                      :'xsi:schemaLocation' => "eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
                    }
                  end

                  # Emits the name and contact elements shared by creators and
                  # metadata providers.
                  def add_person(xml, person)
                    xml.individualName do
                      xml.givenName(person[:first_name])
                      xml.surName(person[:last_name])
                    end
                    xml.organizationName(person[:organization]) if person[:organization]
                    xml.positionName(person[:position]) if person[:position]
                    xml.onlineUrl(person[:url]) if person[:url]
                    xml.electronicMailAddress(person[:email])
                  end

                  # Current UTC time as "Y-M-D::H:M:S" with unpadded numbers,
                  # e.g. "2013-5-7::14:3:9"; used in the packageId.
                  def timestamp
                    Time.now.getutc.strftime('%Y-%-m-%-d::%-H:%-M:%-S')
                  end
                end
              end
            end
         | 
| @@ -1,50 +0,0 @@ | |
| 1 | 
            -
            class DarwinCore
              class Generator
                # Writes meta.xml, the description of the core and extension
                # files of an archive, using Nokogiri's XML builder.
                class MetaXml
                  # data - hash with :core and :extensions entries; each entry has
                  #        :fields (term URLs), :ignoreHeaderLines and :location,
                  #        extensions additionally :rowType.
                  # path - directory in which meta.xml is created.
                  def initialize(data, path)
                    @data = data
                    @path = path
                    @write = 'w:utf-8'
                  end

                  # Builds the document and writes it to <path>/meta.xml.
                  # Returns nil.
                  # Raises DarwinCore::GeneratorError when a file description does
                  # not contain exactly one taxonID field.
                  def create
                    builder = Nokogiri::XML::Builder.new do |xml|
                      opts = { :encoding => "UTF-8", :fieldsTerminatedBy => ",",
                               :fieldsEnclosedBy => '"', :linesTerminatedBy => "\n",
                               :rowType => "http://rs.tdwg.org/dwc/terms/Taxon" }
                      xml.archive(:xmlns => "http://rs.tdwg.org/dwc/text/",
                        "xmlns:xsi" =>"http://www.w3.org/2001/XMLSchema-instance",
                        "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd") do
                        core = @data[:core]
                        xml.core(opts.merge(:ignoreHeaderLines => core[:ignoreHeaderLines])) do
                          xml.files { xml.location(core[:location]) }
                          taxon_id, fields = find_taxon_id(core[:fields])
                          # The taxonID column is the row identifier of the core.
                          xml.id_(:index => taxon_id[1])
                          fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
                        end
                        (@data[:extensions] || []).each do |e|
                          xml.extension(opts.merge(:ignoreHeaderLines => e[:ignoreHeaderLines], :rowType => e[:rowType])) do
                            xml.files { xml.location(e[:location]) }
                            taxon_id, fields = find_taxon_id(e[:fields])
                            # In extensions the taxonID column links back to the core.
                            xml.coreid(:index => taxon_id[1])
                            fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
                          end
                        end
                      end
                    end
                    # Block form closes the file even if writing fails; File.open
                    # (unlike Kernel#open) never treats the path as a command.
                    File.open(File.join(@path, 'meta.xml'), @write) do |meta_file|
                      meta_file.write(builder.to_xml)
                    end
                    nil
                  end

                  private

                  # Splits term URLs into the taxonID field and the rest.
                  #
                  # Returns [[taxon_id_term, index], [[term, index], ...]] where
                  # index is the position of the term in data.
                  # Raises DarwinCore::GeneratorError unless exactly one term ends
                  # with "/taxonid" (case-insensitive).
                  def find_taxon_id(data)
                    indexed = data.each_with_index.map { |f, i| [f.strip, i] }
                    taxon_id, fields = indexed.partition { |f| f[0] =~ %r{/taxonid\z}i }
                    unless taxon_id.size == 1
                      raise DarwinCore::GeneratorError,
                            "Expected exactly one taxonID field, found #{taxon_id.size}"
                    end
                    [taxon_id[0], fields]
                  end
                end
              end
            end
         | 
| 50 | 
            -
             | 
    
        data/lib/dwc-archive/ingester.rb
    DELETED
    
    | @@ -1,101 +0,0 @@ | |
| 1 | 
            -
            # encoding: utf-8
         | 
| 2 | 
            -
            class DarwinCore
         | 
| 3 | 
            -
              module Ingester
         | 
| 4 | 
            -
                attr_reader :data, :properties, :encoding, :fields_separator, :size
         | 
| 5 | 
            -
                attr_reader :file_path, :fields, :line_separator, 
         | 
| 6 | 
            -
                            :quote_character, :ignore_headers
         | 
| 7 | 
            -
             | 
| 8 | 
            -
                # Number of records, computed by get_size on first use and cached
                # in @size afterwards.
                def size
                  return @size if @size
                  @size = get_size
                end
         | 
| 11 | 
            -
             | 
| 12 | 
            -
                # Reads the data file row by row.
                #
                # batch_size - number of data rows after which progress is logged
                #              and, when a block is given, the rows collected so
                #              far are yielded and the buffers are reset.
                #
                # Rows shorter than the highest declared field index, and rows
                # rejected by process_csv_row, are collected as errors.
                #
                # Yields [rows, errors] per batch and once more for the remainder.
                # Returns the last [rows, errors] pair; without a block this pair
                # holds all rows of the file.
                def read(batch_size = 10000)
                  DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
                  res = []
                  errors = []
                  # An empty quote character is replaced by "\b" (backspace),
                  # presumably because CSV needs a quote character -- TODO confirm.
                  @quote_character = "\b" if @quote_character.empty?
                  args = { :col_sep => @field_separator, :quote_char => @quote_character }
                  min_size = @fields.map { |f| f[:index].to_i }.max + 1
                  # Block form closes the file; File.open (unlike Kernel#open)
                  # never treats the path as a command.
                  File.open(@file_path) do |file|
                    # Options are passed as keywords: a positional hash is
                    # rejected by CSV.new on Ruby 3.
                    CSV.new(file, **args).each_with_index do |r, i|
                      next if @ignore_headers && i == 0
                      min_size > r.size ? errors << r : process_csv_row(res, errors, r)
                      # Data rows seen so far: the header row, when present,
                      # occupies index 0, so i already equals the row count.
                      count = @ignore_headers ? i : i + 1
                      next unless count % batch_size == 0
                      DarwinCore.logger_write(@dwc.object_id,
                                              "Ingested %s records from %s" %
                                              [count, name])
                      next unless block_given?
                      yield [res, errors]
                      res = []
                      errors = []
                    end
                  end
                  yield [res, errors] if block_given?
                  [res, errors]
                end
         | 
| 39 | 
            -
             | 
| 40 | 
            -
                private
         | 
| 41 | 
            -
                # Lower-cased last segment of the including object's class name,
                # e.g. "core" for DarwinCore::Core; used in log messages.
                def name
                  qualified = self.class.to_s
                  qualified.split('::').last.downcase
                end
         | 
| 44 | 
            -
             | 
| 45 | 
            -
                # Sorts one CSV row into result or errors.
                #
                # A row is accepted when every non-nil field is valid UTF-8; the
                # accepted copy has all fields tagged as UTF-8. Each field is
                # checked on its own, because checking the joined row would accept
                # two broken fields whose bytes happen to form a valid character
                # when concatenated. Rejected rows are appended to errors
                # untouched.
                #
                # Returns the array the row was appended to.
                def process_csv_row(result, errors, row)
                  utf8_row = row.map { |f| f.nil? ? nil : f.dup.force_encoding('utf-8') }
                  if utf8_row.compact.all?(&:valid_encoding?)
                    result << utf8_row
                  else
                    errors << row
                  end
                end
         | 
| 54 | 
            -
             | 
| 55 | 
            -
                # Populates the reader attributes from @data[:attributes].
                #
                # exception - accepted for interface compatibility; it is not used
                #             here, the errors below are raised regardless of it.
                #
                # Raises DarwinCore::EncodingError for encodings other than
                # utf-8/utf-16, DarwinCore::FileNotFoundError when no file
                # location is found and DarwinCore::InvalidArchiveError when no
                # fields are declared.
                def get_attributes(exception)
                  @properties = @data[:attributes]
                  @encoding = @properties[:encoding] || 'UTF-8'
                  unless ['utf-8', 'utf8', 'utf-16', 'utf16'].include?(@encoding.downcase)
                    raise DarwinCore::EncodingError,
                          'No support for encodings other than utf-8 or utf-16 at the moment'
                  end
                  # The public reader is named fields_separator while the rest of
                  # the module reads @field_separator, so both are kept in sync.
                  @field_separator = @fields_separator = get_field_separator
                  @quote_character = @properties[:fieldsEnclosedBy] || ""
                  # Single quotes on purpose: the default is the two-character
                  # text backslash + n, not a newline character.
                  @line_separator = @properties[:linesTerminatedBy] || '\n'
                  # Only the values 1 and true turn header skipping on.
                  @ignore_headers = [1, true].include?(@properties[:ignoreHeaderLines])
                  @file_path = get_file_path
                  raise DarwinCore::FileNotFoundError, "No file data" unless @file_path
                  @fields = get_fields
                  if @fields.empty?
                    raise DarwinCore::InvalidArchiveError, "No data fields are found"
                  end
                end
         | 
| 77 | 
            -
             | 
| 78 | 
            -
                # Resolves the data file's path inside the archive directory.
                #
                # The location is taken from the first of @data[:location],
                # @data[:attributes][:location] and @data[:files][:location] that
                # is present, and joined onto @path.
                #
                # Returns the joined path String, or nil when no location is
                # declared (previously File.join raised TypeError on nil, so the
                # caller's "no file" check could never be reached).
                def get_file_path
                  attributes = @data[:attributes]
                  files = @data[:files]
                  # Guard each container so a missing section means "not found"
                  # rather than a NoMethodError on nil.
                  file = @data[:location] ||
                         (attributes && attributes[:location]) ||
                         (files && files[:location])
                  return nil unless file

                  File.join(@path, file)
                end
         | 
| 84 | 
            -
             | 
| 85 | 
            -
                # Collects the attribute hashes of every declared field.
                #
                # @data[:field] may hold a single field or an Array of fields; it
                # is normalised to an Array and stored back into @data. A missing
                # field entry becomes an empty Array, so the caller sees "no
                # fields" instead of a NoMethodError on nil.
                #
                # Returns an Array with each field's :attributes value.
                def get_fields
                  field = @data[:field]
                  # Not Kernel#Array: that would turn a single Hash into pairs.
                  @data[:field] = field.is_a?(Array) ? field : [field].compact
                  @data[:field].map { |f| f[:attributes] }
                end
         | 
| 89 | 
            -
             | 
| 90 | 
            -
                # Field delimiter declared in @properties[:fieldsTerminatedBy],
                # defaulting to a comma. The two-character text backslash + t is
                # translated into a real tab character.
                #
                # Returns the delimiter String.
                def get_field_separator
                  separator = @properties[:fieldsTerminatedBy] || ','
                  separator == "\\t" ? "\t" : separator
                end
         | 
| 95 | 
            -
             | 
| 96 | 
            -
                # Counts the newline characters in the file at @file_path, which
                # is the figure `wc -l` reports.
                #
                # The count is done in Ruby instead of shelling out: the path was
                # previously interpolated unquoted into a shell command, so names
                # containing spaces, parentheses or shell metacharacters failed
                # or could run unintended commands.
                #
                # Returns an Integer.
                # Raises a SystemCallError (e.g. Errno::ENOENT) when the file
                #   cannot be opened.
                def get_size
                  newlines = 0
                  File.open(@file_path, 'rb') do |io|
                    # Read in fixed-size chunks so large files are not loaded whole.
                    while (chunk = io.read(65_536))
                      newlines += chunk.count("\n")
                    end
                  end
                  newlines
                end
         | 
| 99 | 
            -
              end
         | 
| 100 | 
            -
            end
         | 
| 101 | 
            -
             | 
    
        data/lib/dwc-archive/metadata.rb
    DELETED
    
    | @@ -1,42 +0,0 @@ | |
| 1 | 
            -
            class DarwinCore
              # Read-only convenience accessors over the metadata an archive
              # exposes through #eml. Every accessor returns nil when the
              # element it looks for is absent.
              class Metadata
                # archive - object responding to #eml; when nil, the metadata is
                #           nil and every accessor returns nil.
                def initialize(archive = nil)
                  @archive = archive
                  @metadata = archive && archive.eml
                end

                # Returns the metadata exactly as the archive's #eml supplied it.
                def data
                  @metadata
                end

                # Returns the :id attribute of the dataset element, or nil.
                def id
                  lookup(:eml, :dataset, :attributes, :id)
                end

                # Returns the :packageId attribute of the eml element, or nil.
                #
                # This used to go through `@metadata.data`, which is not how the
                # other accessors reach the metadata; the resulting error was
                # swallowed and the method always returned nil.
                def package_id
                  lookup(:eml, :attributes, :packageId)
                end

                # Returns the dataset title, or nil.
                def title
                  lookup(:eml, :dataset, :title)
                end

                # Lists the dataset creators.
                #
                # A single creator is wrapped in an Array, and that Array is
                # stored back into the metadata (as the previous implementation
                # did). Name parts missing from a creator come back as nil.
                #
                # Returns an Array of Hashes with :first_name, :last_name and
                # :email keys, or nil when no creator is declared.
                def authors
                  creators = lookup(:eml, :dataset, :creator)
                  return nil if creators.nil?

                  unless creators.is_a?(Array)
                    creators = [creators]
                    lookup(:eml, :dataset)[:creator] = creators
                  end

                  creators.map do |creator|
                    person = value_at(creator, :individualName)
                    {
                      :first_name => value_at(person, :givenName),
                      :last_name => value_at(person, :surName),
                      :email => value_at(creator, :electronicMailAddress)
                    }
                  end
                end

                # Returns the dataset abstract, or nil.
                def abstract
                  lookup(:eml, :dataset, :abstract)
                end

                # Returns the citation from the additional metadata, or nil.
                def citation
                  lookup(:eml, :additionalMetadata, :metadata, :citation)
                end

                # Returns the online distribution URL of the dataset, or nil.
                def url
                  lookup(:eml, :dataset, :distribution, :online, :url)
                end

                private

                # Walks +keys+ down the nested metadata hashes; returns nil as
                # soon as a level is not a Hash, instead of rescuing every error.
                def lookup(*keys)
                  keys.inject(@metadata) do |node, key|
                    break nil unless node.is_a?(Hash)
                    node[key]
                  end
                end

                # Returns node[key] when +node+ is a Hash, nil otherwise.
                def value_at(node, key)
                  node.is_a?(Hash) ? node[key] : nil
                end
              end
            end
         |