RubyGems - iostreams - Versions diffs - 0.14.0 → 0.15.0 - Mend

iostreams 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)

checksums.yaml +4 -4
data/LICENSE +202 -0
data/README.md +155 -47
data/lib/io_streams/file/reader.rb +7 -8
data/lib/io_streams/file/writer.rb +7 -8
data/lib/io_streams/io_streams.rb +313 -129
data/lib/io_streams/{delimited → line}/reader.rb +20 -30
data/lib/io_streams/line/writer.rb +81 -0
data/lib/io_streams/pgp.rb +4 -14
data/lib/io_streams/record/reader.rb +55 -0
data/lib/io_streams/record/writer.rb +63 -0
data/lib/io_streams/row/reader.rb +60 -0
data/lib/io_streams/row/writer.rb +62 -0
data/lib/io_streams/s3.rb +25 -0
data/lib/io_streams/s3/reader.rb +64 -0
data/lib/io_streams/s3/writer.rb +13 -0
data/lib/io_streams/streams.rb +1 -1
data/lib/io_streams/tabular.rb +163 -0
data/lib/io_streams/tabular/errors.rb +14 -0
data/lib/io_streams/tabular/header.rb +146 -0
data/lib/io_streams/tabular/parser/array.rb +26 -0
data/lib/io_streams/tabular/parser/base.rb +12 -0
data/lib/io_streams/tabular/parser/csv.rb +35 -0
data/lib/io_streams/tabular/parser/fixed.rb +88 -0
data/lib/io_streams/tabular/parser/hash.rb +21 -0
data/lib/io_streams/tabular/parser/json.rb +25 -0
data/lib/io_streams/tabular/parser/psv.rb +34 -0
data/lib/io_streams/tabular/utility/csv_row.rb +115 -0
data/lib/io_streams/version.rb +2 -2
data/lib/io_streams/xlsx/reader.rb +1 -1
data/lib/io_streams/zip/reader.rb +1 -1
data/lib/io_streams/zip/writer.rb +1 -1
data/lib/iostreams.rb +21 -10
data/test/bzip2_reader_test.rb +21 -22
data/test/bzip2_writer_test.rb +38 -32
data/test/file_reader_test.rb +19 -18
data/test/file_writer_test.rb +23 -22
data/test/files/test.json +3 -0
data/test/gzip_reader_test.rb +21 -22
data/test/gzip_writer_test.rb +35 -29
data/test/io_streams_test.rb +137 -61
data/test/line_reader_test.rb +105 -0
data/test/line_writer_test.rb +50 -0
data/test/pgp_reader_test.rb +29 -29
data/test/pgp_test.rb +149 -195
data/test/pgp_writer_test.rb +63 -62
data/test/record_reader_test.rb +61 -0
data/test/record_writer_test.rb +73 -0
data/test/row_reader_test.rb +34 -0
data/test/row_writer_test.rb +51 -0
data/test/tabular_test.rb +184 -0
data/test/xlsx_reader_test.rb +13 -17
data/test/zip_reader_test.rb +21 -22
data/test/zip_writer_test.rb +40 -36
metadata +41 -17
data/lib/io_streams/csv/reader.rb +0 -21
data/lib/io_streams/csv/writer.rb +0 -20
data/lib/io_streams/delimited/writer.rb +0 -67
data/test/csv_reader_test.rb +0 -34
data/test/csv_writer_test.rb +0 -35
data/test/delimited_reader_test.rb +0 -115
data/test/delimited_writer_test.rb +0 -44

data/lib/io_streams/tabular/parser/base.rb ADDED Viewed

@@ -0,0 +1,12 @@
+module IOStreams
+  class Tabular
+    module Parser
+      class Base
+        # Returns [true|false] whether a header row is required for this format.
+        def requires_header?
+          true
+        end
+      end
+    end
+  end
+end

data/lib/io_streams/tabular/parser/csv.rb ADDED Viewed

@@ -0,0 +1,35 @@
+module IOStreams
+  class Tabular
+    module Parser
+      class Csv < Base
+        attr_reader :csv_parser
+        def initialize
+          @csv_parser = Utility::CSVRow.new
+        end
+        # Returns [Array<String>] the header row.
+        # Returns nil if the row is blank.
+        def parse_header(row)
+          raise(Tabular::Errors::InvalidHeader, "Format is :csv. Invalid input header: #{row.class.name}") unless row.is_a?(String)
+          csv_parser.parse(row)
+        end
+        # Returns [Array] the parsed CSV line
+        def parse(row)
+          raise(Tabular::Errors::TypeMismatch, "Format is :csv. Invalid input: #{row.class.name}") unless row.is_a?(String)
+          csv_parser.parse(row)
+        end
+        # Return the supplied array as a single line CSV string.
+        def render(row, header)
+          array = header.to_array(row)
+          csv_parser.to_csv(array)
+        end
+      end
+    end
+  end
+end

data/lib/io_streams/tabular/parser/fixed.rb ADDED Viewed

@@ -0,0 +1,88 @@
+module IOStreams
+  class Tabular
+    module Parser
+      # Parsing and rendering fixed length data
+      class Fixed < Base
+        attr_reader :encoding, :encoding_options, :fixed_format
+        # Returns [IOStreams::Tabular::Parser]
+        #
+        # Arguments:
+        #   format: [Array<Hash>]
+        #     [
+        #       {key: 'name',    size: 23 },
+        #       {key: 'address', size: 40 },
+        #       {key: 'zip',     size: 5 }
+        #     ]
+        #
+        #   encoding: [String|Encoding]
+        #     nil: Don't perform any encoding conversion
+        #     'ASCII': ASCII Format
+        #     'UTF-8': UTF-8 Format
+        #     Etc.
+        #     Default: nil
+        #
+        #   replacement: [String]
+        #     The character to replace with when a character cannot be converted to the target encoding.
+        #     nil: Don't replace any invalid characters. Encoding::UndefinedConversionError is raised.
+        #     Default: nil
+        def initialize(format:, encoding: nil, replacement: nil)
+          @encoding         = encoding.nil? || encoding.is_a?(Encoding) ? encoding : Encoding.find(encoding)
+          @encoding_options = replacement.nil? ? {} : {invalid: :replace, undef: :replace, replace: replacement}
+          @fixed_format     = parse_format(format)
+        end
+        # Returns [String] fixed format values extracted from the supplied hash.
+        # String will be encoded to `encoding`
+        def render(row, header)
+          hash = header.to_hash(row)
+          result = encoding.nil? ? '' : ''.encode(encoding)
+          fixed_format.each do |map|
+            # A nil value is considered an empty string
+            value = hash[map.key].to_s
+            result <<
+              if encoding
+                format("%-#{map.size}.#{map.size}s".encode(encoding), value.encode(encoding, encoding_options))
+              else
+                format("%-#{map.size}.#{map.size}s", value)
+              end
+          end
+          result
+        end
+        # Returns [Hash<Symbol, String>] fixed format values extracted from the supplied line.
+        # String will be encoded to `encoding`
+        def parse(line)
+          unless line.is_a?(String)
+            raise(Tabular::Errors::TypeMismatch, "Format is :fixed. Invalid parse input: #{line.class.name}")
+          end
+          hash  = {}
+          index = 0
+          fixed_format.each do |map|
+            value         = line[index..(index + map.size - 1)]
+            index         += map.size
+            hash[map.key] = encoding.nil? ? value.strip : value.strip.encode(encoding, encoding_options)
+          end
+          hash
+        end
+        private
+        FixedFormat = Struct.new(:key, :size)
+        # Returns [Array<FixedFormat>] the format for this fixed width file.
+        # Also validates values
+        def parse_format(format)
+          format.collect do |map|
+            size = map[:size]
+            key  = map[:key]
+            raise(ArgumentError, "Missing required :key and :size in: #{map.inspect}") unless size && key
+            FixedFormat.new(key, size)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/io_streams/tabular/parser/hash.rb ADDED Viewed

@@ -0,0 +1,21 @@
+require 'json'
+module IOStreams
+  class Tabular
+    module Parser
+      class Hash < Base
+        def parse(row)
+          raise(Tabular::Errors::TypeMismatch, "Format is :hash. Invalid input: #{row.class.name}") unless row.is_a?(::Hash)
+          row
+        end
+        def render(row, header)
+          header.to_hash(row)
+        end
+        def requires_header?
+          false
+        end
+      end
+    end
+  end
+end

data/lib/io_streams/tabular/parser/json.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require 'json'
+module IOStreams
+  class Tabular
+    module Parser
+      # For parsing a single line of JSON at a time
+      class Json < Base
+        def parse(row)
+          raise(Tabular::Errors::TypeMismatch, "Format is :json. Invalid input: #{row.class.name}") unless row.is_a?(String)
+          JSON.parse(row)
+        end
+        # Return the supplied array as a single line JSON string.
+        def render(row, header)
+          hash = header.to_hash(row)
+          hash.to_json
+        end
+        def requires_header?
+          false
+        end
+      end
+    end
+  end
+end

data/lib/io_streams/tabular/parser/psv.rb ADDED Viewed

@@ -0,0 +1,34 @@
+module IOStreams
+  class Tabular
+    module Parser
+      # For parsing a single line of Pipe-separated values
+      class Psv < Base
+        # Returns [Array<String>] the header row.
+        # Returns an empty Array if the row is an empty string.
+        def parse_header(row)
+          unless row.is_a?(String)
+            raise(Tabular::Errors::InvalidHeader, "Format is :psv. Invalid input header: #{row.class.name}")
+          end
+          row.split('|')
+        end
+        # Returns [Array] the parsed PSV line
+        def parse(row)
+          raise(Tabular::Errors::TypeMismatch, "Format is :psv. Invalid input: #{row.class.name}") unless row.is_a?(String)
+          row.split('|')
+        end
+        # Return the supplied array as a single line PSV (pipe-separated) string.
+        def render(row, header)
+          array          = header.to_array(row)
+          cleansed_array = array.collect do |i|
+            i.is_a?(String) ? i.tr('|', ':') : i
+          end
+          cleansed_array.join('|')
+        end
+      end
+    end
+  end
+end

data/lib/io_streams/tabular/utility/csv_row.rb ADDED Viewed

@@ -0,0 +1,115 @@
+require 'csv'
+module IOStreams
+  class Tabular
+    module Utility
+      # For parsing a single line of CSV at a time
+      # 2 to 3 times better performance than CSV.parse_line and considerably less
+      # garbage collection required.
+      #
+      # Note:
+      #   This parser does not support line feeds embedded in quoted fields since
+      #   the file is broken apart based on line feeds during the upload process and
+      #   is then processed by each worker on a line by line basis.
+      class CSVRow < ::CSV
+        UTF8_ENCODING = Encoding.find('UTF-8').freeze
+        def initialize(encoding = UTF8_ENCODING)
+          @io = StringIO.new(''.force_encoding(encoding))
+          super(@io, row_sep: '')
+        end
+        # Parse a single line of CSV data
+        # Parameters
+        #   line [String]
+        #     A single line of CSV data without any line terminators
+        def parse(line)
+          return if IOStreams.blank?(line)
+          return if @skip_lines and @skip_lines.match line
+          in_extended_col = false
+          csv             = Array.new
+          parts           = line.split(@col_sep, -1)
+          csv << nil if parts.empty?
+          # This loop is the hot path of csv parsing. Some things may be non-dry
+          # for a reason. Make sure to benchmark when refactoring.
+          parts.each do |part|
+            if in_extended_col
+              # If we are continuing a previous column
+              if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
+                # extended column ends
+                csv.last << part[0..-2]
+                if csv.last =~ @parsers[:stray_quote]
+                  raise MalformedCSVError, "Missing or stray quote in line #{lineno + 1}"
+                end
+                csv.last.gsub!(@quote_char * 2, @quote_char)
+                in_extended_col = false
+              else
+                csv.last << part
+                csv.last << @col_sep
+              end
+            elsif part[0] == @quote_char
+              # If we are starting a new quoted column
+              if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
+                # start an extended column
+                csv << part[1..-1]
+                csv.last << @col_sep
+                in_extended_col = true
+              else
+                # regular quoted column
+                csv << part[1..-2]
+                if csv.last =~ @parsers[:stray_quote]
+                  raise MalformedCSVError, "Missing or stray quote in line #{lineno + 1}"
+                end
+                csv.last.gsub!(@quote_char * 2, @quote_char)
+              end
+            elsif part =~ @parsers[:quote_or_nl]
+              # Unquoted field with bad characters.
+              if part =~ @parsers[:nl_or_lf]
+                raise MalformedCSVError, "Unquoted fields do not allow \\r or \\n (line #{lineno + 1})."
+              else
+                raise MalformedCSVError, "Illegal quoting in line #{lineno + 1}."
+              end
+            else
+              # Regular ole unquoted field.
+              csv << (part.empty? ? nil : part)
+            end
+          end
+          # Replace tacked on @col_sep with @row_sep if we are still in an extended
+          # column.
+          csv[-1][-1] = @row_sep if in_extended_col
+          if in_extended_col
+            raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
+          end
+          @lineno     += 1
+          # save unconverted fields, if needed...
+          unconverted = csv.dup if @unconverted_fields
+          # convert fields, if needed...
+          csv         = convert_fields(csv) unless @use_headers or @converters.empty?
+          # parse out header rows and handle CSV::Row conversions...
+          csv         = parse_headers(csv) if @use_headers
+          # inject unconverted fields and accessor, if requested...
+          if @unconverted_fields and not csv.respond_to? :unconverted_fields
+            add_unconverted_fields(csv, unconverted)
+          end
+          csv
+        end
+        # Return the supplied array as a single line CSV string.
+        def render(row)
+          row.map(&@quote).join(@col_sep) + @row_sep # quote and separate
+        end
+        alias_method :to_csv, :render
+      end
+    end
+  end
+end

data/lib/io_streams/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
-module IOStreams #:nodoc
-  VERSION = '0.14.0'
+module IOStreams
+  VERSION = '0.15.0'
 end

data/lib/io_streams/xlsx/reader.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module IOStreams
           file_name = temp_file.to_path
           ::File.open(file_name, 'wb') do |file|
-            IOStreams.copy(file_name_or_io, file, buffer_size)
+            IOStreams.copy(file_name_or_io, file, buffer_size: buffer_size)
           end
         else
           file_name = file_name_or_io

data/lib/io_streams/zip/reader.rb CHANGED Viewed

@@ -33,7 +33,7 @@ module IOStreams
           # Stream zip stream into temp file
           ::File.open(file_name, 'wb') do |file|
-            IOStreams.copy(file_name_or_io, file, buffer_size)
+            IOStreams.copy(file_name_or_io, file, buffer_size: buffer_size)
           end
           read_file(file_name, &block)

data/lib/io_streams/zip/writer.rb CHANGED Viewed

@@ -45,7 +45,7 @@ module IOStreams
           write_file(temp_file.to_path, zip_file_name, &block)
           # Stream temp file into output stream
-          IOStreams.copy(temp_file, file_name_or_io, buffer_size)
+          IOStreams.copy(temp_file, file_name_or_io, buffer_size: buffer_size)
         ensure
           temp_file.delete if temp_file
         end

data/lib/iostreams.rb CHANGED Viewed

@@ -1,23 +1,23 @@
 require 'io_streams/version'
 #@formatter:off
 module IOStreams
-  module CSV
-    autoload :Reader, 'io_streams/csv/reader'
-    autoload :Writer, 'io_streams/csv/writer'
+  module Bzip2
+    autoload :Reader, 'io_streams/bzip2/reader'
+    autoload :Writer, 'io_streams/bzip2/writer'
   end
   module File
     autoload :Reader, 'io_streams/file/reader'
     autoload :Writer, 'io_streams/file/writer'
   end
-  module Bzip2
-    autoload :Reader, 'io_streams/bzip2/reader'
-    autoload :Writer, 'io_streams/bzip2/writer'
-  end
   module Gzip
     autoload :Reader, 'io_streams/gzip/reader'
     autoload :Writer, 'io_streams/gzip/writer'
   end
   autoload :Pgp,      'io_streams/pgp'
+  module S3
+    autoload :Reader, 'io_streams/s3/reader'
+    autoload :Writer, 'io_streams/s3/writer'
+  end
   module SFTP
     autoload :Reader, 'io_streams/sftp/reader'
     autoload :Writer, 'io_streams/sftp/writer'
@@ -26,12 +26,23 @@ module IOStreams
     autoload :Reader, 'io_streams/zip/reader'
     autoload :Writer, 'io_streams/zip/writer'
   end
-  module Delimited
-    autoload :Reader, 'io_streams/delimited/reader'
-    autoload :Writer, 'io_streams/delimited/writer'
+  module Line
+    autoload :Reader, 'io_streams/line/reader'
+    autoload :Writer, 'io_streams/line/writer'
+  end
+  module Record
+    autoload :Reader, 'io_streams/record/reader'
+    autoload :Writer, 'io_streams/record/writer'
+  end
+  module Row
+    autoload :Reader, 'io_streams/row/reader'
+    autoload :Writer, 'io_streams/row/writer'
   end
   module Xlsx
     autoload :Reader, 'io_streams/xlsx/reader'
   end
+  autoload :Tabular, 'io_streams/tabular'
 end
 require 'io_streams/io_streams'

data/test/bzip2_reader_test.rb CHANGED Viewed

@@ -1,33 +1,32 @@
 require_relative 'test_helper'
-# Unit Test for IOStreams::Gzip
-module Streams
-  class Bzip2ReaderTest < Minitest::Test
-    describe IOStreams::Bzip2::Reader do
-      before do
-        @file_name = File.join(File.dirname(__FILE__), 'files', 'text.txt.bz2')
-        @gzip_data = File.open(@file_name, 'rb') { |f| f.read }
-        @data      = File.read(File.join(File.dirname(__FILE__), 'files', 'text.txt'))
-      end
+class Bzip2ReaderTest < Minitest::Test
+  describe IOStreams::Bzip2::Reader do
+    let :file_name do
+      File.join(File.dirname(__FILE__), 'files', 'text.txt.bz2')
+    end
-      describe '.open' do
-        it 'file' do
-          result = IOStreams::Bzip2::Reader.open(@file_name) do |io|
-            io.read
-          end
-          assert_equal @data, result
+    let :decompressed do
+      File.read(File.join(File.dirname(__FILE__), 'files', 'text.txt'))
+    end
+    describe '.open' do
+      it 'file' do
+        result = IOStreams::Bzip2::Reader.open(file_name) do |io|
+          io.read
         end
+        assert_equal decompressed, result
+      end
-        it 'stream' do
-          result = File.open(@file_name) do |file|
-            IOStreams::Bzip2::Reader.open(file) do |io|
-              io.read
-            end
+      it 'stream' do
+        result = File.open(file_name) do |file|
+          IOStreams::Bzip2::Reader.open(file) do |io|
+            io.read
           end
-          assert_equal @data, result
         end
+        assert_equal decompressed, result
       end
     end
   end
 end