iostreams 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/LICENSE +202 -0
 - data/README.md +155 -47
 - data/lib/io_streams/file/reader.rb +7 -8
 - data/lib/io_streams/file/writer.rb +7 -8
 - data/lib/io_streams/io_streams.rb +313 -129
 - data/lib/io_streams/{delimited → line}/reader.rb +20 -30
 - data/lib/io_streams/line/writer.rb +81 -0
 - data/lib/io_streams/pgp.rb +4 -14
 - data/lib/io_streams/record/reader.rb +55 -0
 - data/lib/io_streams/record/writer.rb +63 -0
 - data/lib/io_streams/row/reader.rb +60 -0
 - data/lib/io_streams/row/writer.rb +62 -0
 - data/lib/io_streams/s3.rb +25 -0
 - data/lib/io_streams/s3/reader.rb +64 -0
 - data/lib/io_streams/s3/writer.rb +13 -0
 - data/lib/io_streams/streams.rb +1 -1
 - data/lib/io_streams/tabular.rb +163 -0
 - data/lib/io_streams/tabular/errors.rb +14 -0
 - data/lib/io_streams/tabular/header.rb +146 -0
 - data/lib/io_streams/tabular/parser/array.rb +26 -0
 - data/lib/io_streams/tabular/parser/base.rb +12 -0
 - data/lib/io_streams/tabular/parser/csv.rb +35 -0
 - data/lib/io_streams/tabular/parser/fixed.rb +88 -0
 - data/lib/io_streams/tabular/parser/hash.rb +21 -0
 - data/lib/io_streams/tabular/parser/json.rb +25 -0
 - data/lib/io_streams/tabular/parser/psv.rb +34 -0
 - data/lib/io_streams/tabular/utility/csv_row.rb +115 -0
 - data/lib/io_streams/version.rb +2 -2
 - data/lib/io_streams/xlsx/reader.rb +1 -1
 - data/lib/io_streams/zip/reader.rb +1 -1
 - data/lib/io_streams/zip/writer.rb +1 -1
 - data/lib/iostreams.rb +21 -10
 - data/test/bzip2_reader_test.rb +21 -22
 - data/test/bzip2_writer_test.rb +38 -32
 - data/test/file_reader_test.rb +19 -18
 - data/test/file_writer_test.rb +23 -22
 - data/test/files/test.json +3 -0
 - data/test/gzip_reader_test.rb +21 -22
 - data/test/gzip_writer_test.rb +35 -29
 - data/test/io_streams_test.rb +137 -61
 - data/test/line_reader_test.rb +105 -0
 - data/test/line_writer_test.rb +50 -0
 - data/test/pgp_reader_test.rb +29 -29
 - data/test/pgp_test.rb +149 -195
 - data/test/pgp_writer_test.rb +63 -62
 - data/test/record_reader_test.rb +61 -0
 - data/test/record_writer_test.rb +73 -0
 - data/test/row_reader_test.rb +34 -0
 - data/test/row_writer_test.rb +51 -0
 - data/test/tabular_test.rb +184 -0
 - data/test/xlsx_reader_test.rb +13 -17
 - data/test/zip_reader_test.rb +21 -22
 - data/test/zip_writer_test.rb +40 -36
 - metadata +41 -17
 - data/lib/io_streams/csv/reader.rb +0 -21
 - data/lib/io_streams/csv/writer.rb +0 -20
 - data/lib/io_streams/delimited/writer.rb +0 -67
 - data/test/csv_reader_test.rb +0 -34
 - data/test/csv_writer_test.rb +0 -35
 - data/test/delimited_reader_test.rb +0 -115
 - data/test/delimited_writer_test.rb +0 -44
 
| 
         @@ -0,0 +1,25 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            begin
         
     | 
| 
      
 2 
     | 
    
         
            +
              require 'aws-sdk-s3'
         
     | 
| 
      
 3 
     | 
    
         
            +
            rescue LoadError => exc
         
     | 
| 
      
 4 
     | 
    
         
            +
              raise(LoadError, "Install gem 'aws-sdk-s3' to read and write AWS S3 files: #{exc.message}")
         
     | 
| 
      
 5 
     | 
    
         
            +
            end
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            require 'uri'
         
     | 
| 
      
 8 
     | 
    
         
            +
            module IOStreams
              module S3
                # Splits an S3 location into its bucket and key.
                #
                # Accepted forms:
                #   's3://mybucket/user/abc.zip'   bucket is the host, key is the path
                #   'mybucket/user/abc.zip'        no scheme: first path segment is the bucket
                #
                # Returns [Hash] { bucket: String, key: String }
                #
                # Raises RuntimeError when the bucket or the key is missing.
                # Raises ArgumentError when a scheme other than `s3` is supplied.
                def self.parse_uri(uri)
                  uri    = URI.parse(uri)
                  scheme = uri.scheme
                  scheme = scheme.downcase unless scheme.nil?

                  bucket, key =
                    case scheme
                    when nil
                      # Filename and bucket only
                      segments = uri.path.split('/')
                      [segments.shift, segments.join('/')]
                    when 's3'
                      # The path starts with the '/' that separates it from the bucket (host)
                      [uri.host, uri.path.to_s.sub(%r{\A/}, '')]
                    else
                      raise(ArgumentError, "Invalid S3 URI scheme: #{uri.scheme.inspect}")
                    end

                  if bucket.nil? || bucket.empty? || key.empty?
                    raise "S3 URI must at the very least contain '<bucket_name>/<key>'"
                  end

                  { bucket: bucket, key: key }
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,64 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module IOStreams
              module S3
                # Reads the contents of an object stored in AWS S3.
                class Reader
                  # Read from an AWS S3 file.
                  #
                  # Parameters
                  #   uri: [String]
                  #     Optional location of the object, parsed with IOStreams::S3.parse_uri.
                  #   bucket: [String]
                  #   key: [String]
                  #     Bucket and key of the object. When a uri is also supplied these
                  #     take precedence over the values parsed from the uri.
                  #   region: [String]
                  #     When nil, Aws::S3::Resource is created without an explicit region.
                  #
                  # Yields the read end of a pipe containing the object's data and
                  # returns the result of the block.
                  def self.open(uri = nil, bucket: nil, region: nil, key: nil, &block)
                    options = uri.nil? ? {} : IOStreams::S3.parse_uri(uri)
                    options = options.merge(bucket: bucket) unless bucket.nil?
                    options = options.merge(key: key) unless key.nil?
                    s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
                    object  = s3.bucket(options[:bucket]).object(options[:key])

                    # NOTE(review): the whole object is written into the pipe before the block
                    # starts reading from it. Presumably this blocks once the object is larger
                    # than the operating system's pipe buffer - confirm, and if so fill the
                    # pipe from a separate thread.
                    IO.pipe do |read_io, write_io|
                      object.get(response_target: write_io)
                      write_io.close
                      block.call(read_io)
                    end
                  end

                  # Alternative to .open that yields an instance of this class instead of a pipe.
                  #
                  # Accepts the same uri as .open; the keyword arguments are those of #initialize.
                  #
                  # Raises ArgumentError when an IO stream is supplied instead of a uri.
                  # Raises LoadError when the `aws-sdk-s3` gem cannot be loaded.
                  def self.open2(uri = nil, **args, &block)
                    if !uri.nil? && IOStreams.reader_stream?(uri)
                      raise(ArgumentError, 'S3 can only accept a URI, not an IO stream when reading.')
                    end

                    unless defined?(Aws::S3::Resource)
                      begin
                        require 'aws-sdk-s3'
                      rescue LoadError => exc
                        raise(LoadError, "Install gem 'aws-sdk-s3' to read and write AWS S3 files: #{exc.message}")
                      end
                    end

                    options = uri.nil? ? args : IOStreams::S3.parse_uri(uri).merge(args)

                    begin
                      io = new(**options)
                      block.call(io)
                    ensure
                      io.close if io && (io.respond_to?(:closed?) && !io.closed?)
                    end
                  end

                  # Parameters
                  #   bucket: [String]
                  #   key: [String]
                  #   region: [String]
                  #     When nil, Aws::S3::Resource is created without an explicit region.
                  def initialize(region: nil, bucket:, key:)
                    s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
                    @object = s3.bucket(bucket).object(key)
                    # Bytes received from S3 that have not yet been handed to the caller.
                    @buffer = String.new
                    # Created on the first read so that the download only starts when needed.
                    @chunks = nil
                  end

                  # Returns [String] up to `length` bytes, or all remaining bytes when `length` is nil.
                  # Returns nil once all the data has been returned.
                  #
                  # When `outbuf` is supplied it receives the data that was read.
                  def read(length = nil, outbuf = nil)
                    fill_buffer(length)

                    at_end = @buffer.empty? && !(length && length.zero?)
                    data   = at_end ? nil : @buffer.slice!(0, length || @buffer.bytesize)
                    return data if outbuf.nil?

                    outbuf.replace(data.to_s)
                    data && outbuf
                  end

                  private

                  attr_reader :object

                  # Appends chunks from S3 to the buffer until it holds `length` bytes,
                  # or until the object is exhausted when `length` is nil.
                  def fill_buffer(length)
                    # The enumerator suspends the download between chunks, so that each call
                    # continues where the previous one stopped instead of fetching the
                    # object again from the beginning.
                    @chunks ||= Enumerator.new { |yielder| @object.get { |chunk| yielder << chunk } }

                    # Enumerator#next raises StopIteration at the end of the object, which ends the loop.
                    loop do
                      break if length && (@buffer.bytesize >= length)

                      @buffer << @chunks.next.b
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,13 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module IOStreams
              module S3
                # Writes an object to AWS S3.
                class Writer
                  # Write to AWS S3
                  #
                  # Parameters
                  #   uri: [String]
                  #     Optional location of the object, parsed with IOStreams::S3.parse_uri.
                  #   bucket: [String]
                  #   key: [String]
                  #     Bucket and key of the object. When a uri is also supplied these
                  #     take precedence over the values parsed from the uri.
                  #   region: [String]
                  #     When nil, Aws::S3::Resource is created without an explicit region.
                  #
                  # The block is passed on to Aws::S3::Object#upload_stream, which yields
                  # the stream to write the data to.
                  def self.open(uri = nil, bucket: nil, region: nil, key: nil, &block)
                    options = uri.nil? ? {} : IOStreams::S3.parse_uri(uri)
                    options = options.merge(bucket: bucket) unless bucket.nil?
                    options = options.merge(key: key) unless key.nil?
                    s3      = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
                    object  = s3.bucket(options[:bucket]).object(options[:key])
                    object.upload_stream(&block)
                  end
                end
              end
            end
         
     | 
    
        data/lib/io_streams/streams.rb
    CHANGED
    
    | 
         @@ -92,7 +92,7 @@ module IOStreams 
     | 
|
| 
       92 
92 
     | 
    
         
             
                #   RocketJob::Formatter::Formats.streams_for_file_name('myfile.csv')
         
     | 
| 
       93 
93 
     | 
    
         
             
                #   => [ :file ]
         
     | 
| 
       94 
94 
     | 
    
         
             
                def streams_for_file_name(file_name)
         
     | 
| 
       95 
     | 
    
         
            -
                  raise ArgumentError.new(" 
     | 
| 
      
 95 
     | 
    
         
            +
                  raise ArgumentError.new("Cannot auto-detect streams when already a stream: #{file_name.inspect}") if reader_stream?(file_name)
         
     | 
| 
       96 
96 
     | 
    
         | 
| 
       97 
97 
     | 
    
         
             
                  parts      = file_name.split('.')
         
     | 
| 
       98 
98 
     | 
    
         
             
                  extensions = []
         
     | 
| 
         @@ -0,0 +1,163 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module IOStreams
         
     | 
| 
      
 2 
     | 
    
         
            +
              # Common handling for efficiently processing tabular data such as CSV, spreadsheet or other tabular files
         
     | 
| 
      
 3 
     | 
    
         
            +
              # on a line by line basis.
         
     | 
| 
      
 4 
     | 
    
         
            +
              #
         
     | 
| 
      
 5 
     | 
    
         
            +
              # Tabular consists of a table of data where the first row is usually the header, and subsequent
         
     | 
| 
      
 6 
     | 
    
         
            +
              # rows are the data elements.
         
     | 
| 
      
 7 
     | 
    
         
            +
              #
         
     | 
| 
      
 8 
     | 
    
         
            +
              # Tabular applies the header information to every row of data when #record_parse is called.
         
     | 
| 
      
 9 
     | 
    
         
            +
              #
         
     | 
| 
      
 10 
     | 
    
         
            +
              # Example using the default CSV parser:
         
     | 
| 
      
 11 
     | 
    
         
            +
              #
         
     | 
| 
      
 12 
     | 
    
         
            +
              #   tabular = Tabular.new
         
     | 
| 
      
 13 
     | 
    
         
            +
              #   tabular.parse_header("first field,Second,thirD")
         
     | 
| 
      
 14 
     | 
    
         
            +
              #   # => ["first field", "Second", "thirD"]
         
     | 
| 
      
 15 
     | 
    
         
            +
              #
         
     | 
| 
      
 16 
     | 
    
         
            +
              #   tabular.cleanse_header!
         
     | 
| 
      
 17 
     | 
    
         
            +
              #   # => ["first_field", "second", "third"]
         
     | 
| 
      
 18 
     | 
    
         
            +
              #
         
     | 
| 
      
 19 
     | 
    
         
            +
              #   tabular.record_parse("1,2,3")
         
     | 
| 
      
 20 
     | 
    
         
            +
              #   # => {"first_field"=>"1", "second"=>"2", "third"=>"3"}
         
     | 
| 
      
 21 
     | 
    
         
            +
              #
         
     | 
| 
      
 22 
     | 
    
         
            +
              #   tabular.record_parse([1,2,3])
         
     | 
| 
      
 23 
     | 
    
         
            +
              #   # => {"first_field"=>1, "second"=>2, "third"=>3}
         
     | 
| 
      
 24 
     | 
    
         
            +
              #
         
     | 
| 
      
 25 
     | 
    
         
            +
              #   tabular.render([5,6,9])
         
     | 
| 
      
 26 
     | 
    
         
            +
              #   # => "5,6,9"
         
     | 
| 
      
 27 
     | 
    
         
            +
              #
         
     | 
| 
      
 28 
     | 
    
         
            +
              #   tabular.render({"third"=>"3", "first_field"=>"1" })
         
     | 
| 
      
 29 
     | 
    
         
            +
              #   # => "1,,3"
         
     | 
| 
      
 30 
     | 
    
         
            +
              class Tabular
         
     | 
| 
      
 31 
     | 
    
         
            +
                autoload :Errors, 'io_streams/tabular/errors'
         
     | 
| 
      
 32 
     | 
    
         
            +
                autoload :Header, 'io_streams/tabular/header'
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
                module Parser
         
     | 
| 
      
 35 
     | 
    
         
            +
                  autoload :Array, 'io_streams/tabular/parser/array'
         
     | 
| 
      
 36 
     | 
    
         
            +
                  autoload :Base, 'io_streams/tabular/parser/base'
         
     | 
| 
      
 37 
     | 
    
         
            +
                  autoload :Csv, 'io_streams/tabular/parser/csv'
         
     | 
| 
      
 38 
     | 
    
         
            +
                  autoload :Fixed, 'io_streams/tabular/parser/fixed'
         
     | 
| 
      
 39 
     | 
    
         
            +
                  autoload :Hash, 'io_streams/tabular/parser/hash'
         
     | 
| 
      
 40 
     | 
    
         
            +
                  autoload :Json, 'io_streams/tabular/parser/json'
         
     | 
| 
      
 41 
     | 
    
         
            +
                  autoload :Psv, 'io_streams/tabular/parser/psv'
         
     | 
| 
      
 42 
     | 
    
         
            +
                end
         
     | 
| 
      
 43 
     | 
    
         
            +
             
     | 
| 
      
 44 
     | 
    
         
            +
                module Utility
         
     | 
| 
      
 45 
     | 
    
         
            +
                  autoload :CSVRow, 'io_streams/tabular/utility/csv_row'
         
     | 
| 
      
 46 
     | 
    
         
            +
                end
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
                attr_reader :format, :header, :parser
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
                # Create a tabular processor for a delimited data source.
                #
                # Parameters
                #   format: [Symbol]
                #     :csv, :hash, :array, :json, :psv, :fixed
                #
                #   file_name: [String]
                #     Only used when `format` is nil, in which case the parser is
                #     selected from the extensions in the file name.
                #
                #   For all other parameters, see Tabular::Header.new
                #
                # NOTE(review): the class declares `attr_reader :format`, but @format is
                # not assigned here - confirm whether the reader is expected to return a value.
                def initialize(format: nil, file_name: nil, **args)
                  @header = Header.new(**args)

                  detect_from_file_name = file_name && format.nil?
                  parser_klass          =
                    detect_from_file_name ? self.class.parser_class_for_file_name(file_name) : self.class.parser_class(format)

                  @parser = parser_klass.new
                end
         
     | 
| 
      
 67 
     | 
    
         
            +
             
     | 
| 
      
 68 
     | 
    
         
            +
                # Returns whether a header row still has to be read:
                # truthy only when the parser needs a header and the header columns are blank.
                def requires_header?
                  parser_needs_header = parser.requires_header?
                  parser_needs_header && IOStreams.blank?(header.columns)
                end
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
                # Returns the header row/line as parsed by the parser.
                # Returns `nil` if the row/line is blank, or the parser does not require a header.
                #
                # Notes:
                # * Call `requires_header?` first to determine if the header should be parsed first.
                # * The header columns are set after parsing the row, but the header is not cleansed.
                #   Call `cleanse_header!` to cleanse it.
                def parse_header(line)
                  if !IOStreams.blank?(line) && parser.requires_header?
                    header.columns = parser.parse(line)
                  end
                end
         
     | 
| 
      
 84 
     | 
    
         
            +
             
     | 
| 
      
 85 
     | 
    
         
            +
                # Parses the line into a row and then applies the header to it.
                #
                # Returns [Hash<String,Object>] the line as a hash.
                # Returns nil if the line is blank.
                def record_parse(line)
                  row = row_parse(line)
                  row ? header.to_hash(row) : nil
                end
         
     | 
| 
      
 91 
     | 
    
         
            +
             
     | 
| 
      
 92 
     | 
    
         
            +
                # Returns [Array] the row/line as a parsed Array of values.
                # Returns nil if the row/line is blank, in which case the parser is not called.
                def row_parse(line)
                  IOStreams.blank?(line) ? nil : parser.parse(line)
                end
         
     | 
| 
      
 99 
     | 
    
         
            +
             
     | 
| 
      
 100 
     | 
    
         
            +
                # Renders the output row by passing it, along with the header, to the parser.
                # Returns nil if the row is blank.
                def render(row)
                  IOStreams.blank?(row) ? nil : parser.render(row, header)
                end
         
     | 
| 
      
 106 
     | 
    
         
            +
             
     | 
| 
      
 107 
     | 
    
         
            +
                # Cleanses the header in place.
                #
                # Returns the header columns after cleansing.
                def cleanse_header!
                  current_header = header
                  current_header.cleanse!
                  current_header.columns
                end
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
                # Register a file extension (format) and the parser class to use for it.
                #
                # Parameters
                #   extension: [Symbol|String|nil]
                #     May only contain word characters.
                #     `nil` registers the parser that is used when no format is supplied.
                #   parser: [Class]
                #     Class to instantiate for this extension.
                #
                # Returns the parser that was registered.
                #
                # Raises ArgumentError when the extension contains anything other than word characters.
                #
                # Example:
                #   register_extension(:xls, MyXlsParser)
                def self.register_extension(extension, parser)
                  # \z (not \Z) so that an extension with a trailing newline is rejected.
                  raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.nil? || extension.to_s =~ /\A\w+\z/
                  @extensions[extension.nil? ? nil : extension.to_sym] = parser
                end
         
     | 
| 
      
 122 
     | 
    
         
            +
             
     | 
| 
      
 123 
     | 
    
         
            +
                # De-Register a file extension
                #
                # Returns the parser that was registered for the extension,
                # or nil if the extension was not registered.
                #
                # Raises ArgumentError when the extension contains anything other than word characters.
                #
                # Example:
                #   deregister_extension(:xls)
                def self.deregister_extension(extension)
                  # \z (not \Z) so that an extension with a trailing newline is rejected.
                  raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.to_s =~ /\A\w+\z/
                  @extensions.delete(extension.to_sym)
                end
         
     | 
| 
      
 133 
     | 
    
         
            +
             
     | 
| 
      
 134 
     | 
    
         
            +
                private
         
     | 
| 
      
 135 
     | 
    
         
            +
             
     | 
| 
      
 136 
     | 
    
         
            +
                # A registry to hold formats for processing files during upload or download
         
     | 
| 
      
 137 
     | 
    
         
            +
                @extensions = {}
         
     | 
| 
      
 138 
     | 
    
         
            +
             
     | 
| 
      
 139 
     | 
    
         
            +
                # Returns the parser class registered for the supplied format.
                # A nil format looks up the entry registered under nil.
                #
                # Raises ArgumentError when nothing is registered for the format.
                def self.parser_class(format)
                  key   = format.to_sym unless format.nil?
                  klass = @extensions[key]
                  return klass if klass

                  raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
                end
         
     | 
| 
      
 142 
     | 
    
         
            +
             
     | 
| 
      
 143 
     | 
    
         
            +
                # Returns the parser to use with tabular for the supplied file_name.
                #
                # The file name is split on '.' and searched from the end for the first
                # part that is a registered extension. When no part is registered the
                # parser registered under nil is looked up.
                def self.parser_class_for_file_name(file_name)
                  parts   = file_name.to_s.split('.')
                  matched = parts.reverse.find { |part| @extensions.include?(part.to_sym) }
                  parser_class(matched.nil? ? nil : matched.to_sym)
                end
         
     | 
| 
      
 154 
     | 
    
         
            +
             
     | 
| 
      
 155 
     | 
    
         
            +
                register_extension(nil, IOStreams::Tabular::Parser::Csv)
         
     | 
| 
      
 156 
     | 
    
         
            +
                register_extension(:array, IOStreams::Tabular::Parser::Array)
         
     | 
| 
      
 157 
     | 
    
         
            +
                register_extension(:csv, IOStreams::Tabular::Parser::Csv)
         
     | 
| 
      
 158 
     | 
    
         
            +
                register_extension(:fixed, IOStreams::Tabular::Parser::Fixed)
         
     | 
| 
      
 159 
     | 
    
         
            +
                register_extension(:hash, IOStreams::Tabular::Parser::Hash)
         
     | 
| 
      
 160 
     | 
    
         
            +
                register_extension(:json, IOStreams::Tabular::Parser::Json)
         
     | 
| 
      
 161 
     | 
    
         
            +
                register_extension(:psv, IOStreams::Tabular::Parser::Psv)
         
     | 
| 
      
 162 
     | 
    
         
            +
              end
         
     | 
| 
      
 163 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,146 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module IOStreams
         
     | 
| 
      
 2 
     | 
    
         
            +
              class Tabular
         
     | 
| 
      
 3 
     | 
    
         
            +
                # Process files / streams that start with a header.
         
     | 
| 
      
 4 
     | 
    
         
            +
                class Header
         
     | 
| 
      
 5 
     | 
    
         
            +
                  attr_accessor :columns, :allowed_columns, :required_columns, :skip_unknown
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
                  # Header
         
     | 
| 
      
 8 
     | 
    
         
            +
                  #
         
     | 
| 
      
 9 
     | 
    
         
            +
                  # Parameters
         
     | 
| 
      
 10 
     | 
    
         
            +
                  #   columns [Array<String>]
         
     | 
| 
      
 11 
     | 
    
         
            +
                  #     Columns in this header.
         
     | 
| 
      
 12 
     | 
    
         
            +
                  #     Note:
         
     | 
| 
      
 13 
     | 
    
         
            +
                  #       It is recommended to keep all columns as strings to avoid any issues when persistence
         
     | 
| 
      
 14 
     | 
    
         
            +
                  #       with MongoDB when it converts symbol keys to strings.
         
     | 
| 
      
 15 
     | 
    
         
            +
                  #
         
     | 
| 
      
 16 
     | 
    
         
            +
                  #   allowed_columns [Array<String>]
         
     | 
| 
      
 17 
     | 
    
         
            +
                  #     List of columns to allow.
         
     | 
| 
      
 18 
     | 
    
         
            +
                  #     Default: nil ( Allow all columns )
         
     | 
| 
      
 19 
     | 
    
         
            +
                  #     Note:
         
     | 
| 
      
 20 
     | 
    
         
            +
                  #       When supplied any columns that are rejected will be returned in the cleansed columns
         
     | 
| 
      
 21 
     | 
    
         
            +
                  #       as nil so that they can be ignored during processing.
         
     | 
| 
      
 22 
     | 
    
         
            +
                  #
         
     | 
| 
      
 23 
     | 
    
         
            +
                  #   required_columns [Array<String>]
         
     | 
| 
      
 24 
     | 
    
         
            +
                  #     List of columns that must be present, otherwise an Exception is raised.
         
     | 
| 
      
 25 
     | 
    
         
            +
                  #
         
     | 
| 
      
 26 
     | 
    
         
            +
                  #   skip_unknown [true|false]
         
     | 
| 
      
 27 
     | 
    
         
            +
                  #     true:
         
     | 
| 
      
 28 
     | 
    
         
            +
                  #       Skip columns not present in the whitelist by cleansing them to nil.
         
     | 
| 
      
 29 
     | 
    
         
            +
                  #       #as_hash will skip these additional columns entirely as if they were not in the file at all.
         
     | 
| 
      
 30 
     | 
    
         
            +
                  #     false:
         
     | 
| 
      
 31 
     | 
    
         
            +
                  #       Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
         
     | 
| 
      
 32 
     | 
    
         
            +
                  def initialize(columns: nil, allowed_columns: nil, required_columns: nil, skip_unknown: true)
         
     | 
| 
      
 33 
     | 
    
         
            +
                    @columns          = columns
         
     | 
| 
      
 34 
     | 
    
         
            +
                    @required_columns = required_columns
         
     | 
| 
      
 35 
     | 
    
         
            +
                    @allowed_columns  = allowed_columns
         
     | 
| 
      
 36 
     | 
    
         
            +
                    @skip_unknown     = skip_unknown
         
     | 
| 
      
 37 
     | 
    
         
            +
                  end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
                  # Returns [Array<String>] list columns that were ignored during cleansing.
         
     | 
| 
      
 40 
     | 
    
         
            +
                  #
         
     | 
| 
      
 41 
     | 
    
         
            +
                  # Each column is cleansed as follows:
         
     | 
| 
      
 42 
     | 
    
         
            +
                  # - Leading and trailing whitespace is stripped.
         
     | 
| 
      
 43 
     | 
    
         
            +
                  # - All characters converted to lower case.
         
     | 
| 
      
 44 
     | 
    
         
            +
                  # - Spaces and '-' are converted to '_'.
         
     | 
| 
      
 45 
     | 
    
         
            +
                  # - All characters except for letters, digits, and '_' are stripped.
         
     | 
| 
      
 46 
     | 
    
         
            +
                  #
         
     | 
| 
      
 47 
     | 
    
         
            +
                  # Notes
         
     | 
| 
      
 48 
     | 
    
         
            +
                  # * Raises Tabular::InvalidHeader when there are no non-nil columns left after cleansing.
         
     | 
| 
      
 49 
     | 
    
         
            +
                  def cleanse!
         
     | 
| 
      
 50 
     | 
    
         
            +
                    return [] if columns.nil? || columns.empty?
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
                    ignored_columns = []
         
     | 
| 
      
 53 
     | 
    
         
            +
                    self.columns    = columns.collect do |column|
         
     | 
| 
      
 54 
     | 
    
         
            +
                      cleansed = cleanse_column(column)
         
     | 
| 
      
 55 
     | 
    
         
            +
                      if allowed_columns.nil? || allowed_columns.include?(cleansed)
         
     | 
| 
      
 56 
     | 
    
         
            +
                        cleansed
         
     | 
| 
      
 57 
     | 
    
         
            +
                      else
         
     | 
| 
      
 58 
     | 
    
         
            +
                        ignored_columns << column
         
     | 
| 
      
 59 
     | 
    
         
            +
                        nil
         
     | 
| 
      
 60 
     | 
    
         
            +
                      end
         
     | 
| 
      
 61 
     | 
    
         
            +
                    end
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
                    if !skip_unknown && !ignored_columns.empty?
         
     | 
| 
      
 64 
     | 
    
         
            +
                      raise(IOStreams::Tabular::Errors::InvalidHeader, "Unknown columns after cleansing: #{ignored_columns.join(',')}")
         
     | 
| 
      
 65 
     | 
    
         
            +
                    end
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
                    if ignored_columns.size == columns.size
         
     | 
| 
      
 68 
     | 
    
         
            +
                      raise(IOStreams::Tabular::Errors::InvalidHeader, "All columns are unknown after cleansing: #{ignored_columns.join(',')}")
         
     | 
| 
      
 69 
     | 
    
         
            +
                    end
         
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
      
 71 
     | 
    
         
            +
                    if required_columns
         
     | 
| 
      
 72 
     | 
    
         
            +
                      missing_columns = required_columns - columns
         
     | 
| 
      
 73 
     | 
    
         
            +
                      unless missing_columns.empty?
         
     | 
| 
      
 74 
     | 
    
         
            +
                        raise(IOStreams::Tabular::Errors::InvalidHeader, "Missing columns after cleansing: #{missing_columns.join(',')}")
         
     | 
| 
      
 75 
     | 
    
         
            +
                      end
         
     | 
| 
      
 76 
     | 
    
         
            +
                    end
         
     | 
| 
      
 77 
     | 
    
         
            +
             
     | 
| 
      
 78 
     | 
    
         
            +
                    ignored_columns
         
     | 
| 
      
 79 
     | 
    
         
            +
                  end
         
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
      
 81 
     | 
    
         
            +
                  # Marshal to Hash from Array or Hash by applying this header
         
     | 
| 
      
 82 
     | 
    
         
            +
                  #
         
     | 
| 
      
 83 
     | 
    
         
            +
                  # Parameters:
         
     | 
| 
      
 84 
     | 
    
         
            +
                  #   cleanse [true|false]
         
     | 
| 
      
 85 
     | 
    
         
            +
                  #     Whether to cleanse and narrow the supplied hash to just those columns in this header.
         
     | 
| 
      
 86 
     | 
    
         
            +
                  #     Only Applies to when the hash is already a Hash.
         
     | 
| 
      
 87 
     | 
    
         
            +
                  #     Useful to turn off narrowing when the input data is already trusted.
         
     | 
| 
      
 88 
     | 
    
         
            +
                  def to_hash(row, cleanse = true)
         
     | 
| 
      
 89 
     | 
    
         
            +
                    return if IOStreams.blank?(row)
         
     | 
| 
      
 90 
     | 
    
         
            +
             
     | 
| 
      
 91 
     | 
    
         
            +
                    case row
         
     | 
| 
      
 92 
     | 
    
         
            +
                    when Array
         
     | 
| 
      
 93 
     | 
    
         
            +
                      raise(Tabular::Errors::InvalidHeader, "Missing mandatory header when trying to convert a row into a hash") unless columns
         
     | 
| 
      
 94 
     | 
    
         
            +
                      array_to_hash(row)
         
     | 
| 
      
 95 
     | 
    
         
            +
                    when Hash
         
     | 
| 
      
 96 
     | 
    
         
            +
                      cleanse && columns ? cleanse_hash(row) : row
         
     | 
| 
      
 97 
     | 
    
         
            +
                    else
         
     | 
| 
      
 98 
     | 
    
         
            +
                      raise(Tabular::Errors::TypeMismatch, "Don't know how to convert #{row.class.name} to a Hash")
         
     | 
| 
      
 99 
     | 
    
         
            +
                    end
         
     | 
| 
      
 100 
     | 
    
         
            +
                  end
         
     | 
| 
      
 101 
     | 
    
         
            +
             
     | 
| 
      
 102 
     | 
    
         
            +
                  def to_array(row, cleanse = true)
         
     | 
| 
      
 103 
     | 
    
         
            +
                    if row.is_a?(Hash) && columns
         
     | 
| 
      
 104 
     | 
    
         
            +
                      row = cleanse_hash(row) if cleanse
         
     | 
| 
      
 105 
     | 
    
         
            +
                      row = columns.collect { |column| row[column] }
         
     | 
| 
      
 106 
     | 
    
         
            +
                    end
         
     | 
| 
      
 107 
     | 
    
         
            +
                    raise(Tabular::Errors::TypeMismatch, "Don't know how to convert #{row.class.name} to an Array without the header columns being set.") unless row.is_a?(Array)
         
     | 
| 
      
 108 
     | 
    
         
            +
                    row
         
     | 
| 
      
 109 
     | 
    
         
            +
                  end
         
     | 
| 
      
 110 
     | 
    
         
            +
             
     | 
| 
      
 111 
     | 
    
         
            +
                  private
         
     | 
| 
      
 112 
     | 
    
         
            +
             
     | 
| 
      
 113 
     | 
    
         
            +
                  def array_to_hash(row)
         
     | 
| 
      
 114 
     | 
    
         
            +
                    h = {}
         
     | 
| 
      
 115 
     | 
    
         
            +
                    columns.each_with_index { |col, i| h[col] = row[i] unless IOStreams.blank?(col) }
         
     | 
| 
      
 116 
     | 
    
         
            +
                    h
         
     | 
| 
      
 117 
     | 
    
         
            +
                  end
         
     | 
| 
      
 118 
     | 
    
         
            +
             
     | 
| 
      
 119 
     | 
    
         
            +
                  # Perform cleansing on returned Hash keys during the narrowing process.
         
     | 
| 
      
 120 
     | 
    
         
            +
                  # For example, avoids issues with case etc.
         
     | 
| 
      
 121 
     | 
    
         
            +
                  def cleanse_hash(hash)
         
     | 
| 
      
 122 
     | 
    
         
            +
                    h = {}
         
     | 
| 
      
 123 
     | 
    
         
            +
                    hash.each_pair do |key, value|
         
     | 
| 
      
 124 
     | 
    
         
            +
                      cleansed_key    =
         
     | 
| 
      
 125 
     | 
    
         
            +
                        if columns.include?(key)
         
     | 
| 
      
 126 
     | 
    
         
            +
                          key
         
     | 
| 
      
 127 
     | 
    
         
            +
                        else
         
     | 
| 
      
 128 
     | 
    
         
            +
                          key = cleanse_column(key)
         
     | 
| 
      
 129 
     | 
    
         
            +
                          key if columns.include?(key)
         
     | 
| 
      
 130 
     | 
    
         
            +
                        end
         
     | 
| 
      
 131 
     | 
    
         
            +
                      h[cleansed_key] = value if cleansed_key
         
     | 
| 
      
 132 
     | 
    
         
            +
                    end
         
     | 
| 
      
 133 
     | 
    
         
            +
                    h
         
     | 
| 
      
 134 
     | 
    
         
            +
                  end
         
     | 
| 
      
 135 
     | 
    
         
            +
             
     | 
| 
      
 136 
     | 
    
         
            +
                  def cleanse_column(name)
         
     | 
| 
      
 137 
     | 
    
         
            +
                    cleansed = name.to_s.strip.downcase
         
     | 
| 
      
 138 
     | 
    
         
            +
                    cleansed.gsub!(/\s+/, '_')
         
     | 
| 
      
 139 
     | 
    
         
            +
                    cleansed.gsub!(/-+/, '_')
         
     | 
| 
      
 140 
     | 
    
         
            +
                    cleansed.gsub!(/\W+/, '')
         
     | 
| 
      
 141 
     | 
    
         
            +
                    cleansed
         
     | 
| 
      
 142 
     | 
    
         
            +
                  end
         
     | 
| 
      
 143 
     | 
    
         
            +
             
     | 
| 
      
 144 
     | 
    
         
            +
                end
         
     | 
| 
      
 145 
     | 
    
         
            +
              end
         
     | 
| 
      
 146 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -0,0 +1,26 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'json'
         
     | 
| 
      
 2 
     | 
    
         
            +
            module IOStreams
         
     | 
| 
      
 3 
     | 
    
         
            +
              class Tabular
         
     | 
| 
      
 4 
     | 
    
         
            +
                module Parser
         
     | 
| 
      
 5 
     | 
    
         
            +
                  class Array < Base
         
     | 
| 
      
 6 
     | 
    
         
            +
                    # Returns [Array<String>] the header row.
         
     | 
| 
      
 7 
     | 
    
         
            +
                    # Returns nil if the row is blank.
         
     | 
| 
      
 8 
     | 
    
         
            +
                    def parse_header(row)
         
     | 
| 
      
 9 
     | 
    
         
            +
                      raise(Tabular::Errors::InvalidHeader, "Format is :array. Invalid input header: #{row.class.name}") unless row.is_a?(::Array)
         
     | 
| 
      
 10 
     | 
    
         
            +
                      row
         
     | 
| 
      
 11 
     | 
    
         
            +
                    end
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
                    # Returns Array
         
     | 
| 
      
 14 
     | 
    
         
            +
                    def parse(row)
         
     | 
| 
      
 15 
     | 
    
         
            +
                      raise(Tabular::Errors::TypeMismatch, "Format is :array. Invalid input: #{row.class.name}") unless row.is_a?(::Array)
         
     | 
| 
      
 16 
     | 
    
         
            +
                      row
         
     | 
| 
      
 17 
     | 
    
         
            +
                    end
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
                    def render(row, header)
         
     | 
| 
      
 20 
     | 
    
         
            +
                      header.to_array(row)
         
     | 
| 
      
 21 
     | 
    
         
            +
                    end
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
                  end
         
     | 
| 
      
 24 
     | 
    
         
            +
                end
         
     | 
| 
      
 25 
     | 
    
         
            +
              end
         
     | 
| 
      
 26 
     | 
    
         
            +
            end
         
     |