RubyGems - iostreams - Versions diffs - 1.5.1 → 1.6.0 - Mend

iostreams 1.5.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +4 -4
data/README.md +13 -1
data/lib/io_streams/builder.rb +19 -5
data/lib/io_streams/errors.rb +12 -0
data/lib/io_streams/io_streams.rb +0 -2
data/lib/io_streams/line/reader.rb +23 -11
data/lib/io_streams/path.rb +1 -1
data/lib/io_streams/paths/s3.rb +5 -2
data/lib/io_streams/stream.rb +60 -5
data/lib/io_streams/tabular.rb +23 -23
data/lib/io_streams/tabular/parser/csv.rb +4 -2
data/lib/io_streams/tabular/utility/csv_row.rb +1 -4
data/lib/io_streams/version.rb +1 -1
data/test/builder_test.rb +29 -0
data/test/deprecated_test.rb +2 -0
data/test/files/test.psv +4 -0
data/test/files/unclosed_quote_large_test.csv +1658 -0
data/test/files/unclosed_quote_test2.csv +3 -0
data/test/line_reader_test.rb +30 -4
data/test/stream_test.rb +174 -8
metadata +47 -42

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c665e5c262a98de9ceccf1cf93f5bc391370d0b674f966d9e266b731a31d3b7f
-  data.tar.gz: 1ab8c125e49abc178ce4c1e94f36dfb9219011ece6c8a9bd65b1c5a5d2f14604
+  metadata.gz: e4fb750e5c3779000fac8f21803b84a5977f111f481b7d4e11a117149cd0d9e6
+  data.tar.gz: '09cb2dc7ff67afd44d3ebcfefa29f6a1840d1b3dd871c33bc1a06914cd8bee2d'
 SHA512:
-  metadata.gz: 63bec4c3602cd4ab699bcf73abe3d97b7b808c6559c378e67be99c1af1ab7ada84dd0753c91dd2fe7be0924690a1deb30b44b87f56cd4638c4954a6bfcd38796
-  data.tar.gz: 2b1138c5389747892a33b5213a42b0fe4ececabc421c824db03d5218d436725a95d0306c7b0a1cc2a4b0a8eb4e1b4192152f6d60f11892fb2980c418c7be1f80
+  metadata.gz: 0cf2db14e03b9e81e0e39119b35f293408a9d4b6bf3365abc724d95d7376abf7af73720b8fd8d000b70b2eb7abbae20055753e845a77dec2d8d7297e5b6693ba
+  data.tar.gz: d9fa2194965ef99a99e1ecd0655e84066e6021236d1494336c40693b103d3c7ecb7e5a87e8f179a38c990ad7670d6bd4901b6a7cd8b89757c00a3179d657a1cc

data/README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 # IOStreams
-[![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Build Status](https://travis-ci.org/rocketjob/iostreams.svg?branch=master)](https://travis-ci.org/rocketjob/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
+[![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
 IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
 or storage mechanism transparent to the application.
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
 Next, checkout the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
+## Upgrading to v1.6
+The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
+the following line to your code:
+~~~ruby
+IOStreams.include(IOStreams::Deprecated)
+~~~
+It is important to migrate any use of the old deprecated APIs to the new API, since they will be removed in a future
+release.
 ## Versioning
 This project adheres to [Semantic Versioning](http://semver.org/).

data/lib/io_streams/builder.rb CHANGED Viewed

@@ -1,13 +1,15 @@
 module IOStreams
   # Build the streams that need to be applied to a path during reading or writing.
   class Builder
-    attr_accessor :file_name
+    attr_accessor :file_name, :format_options
     attr_reader :streams, :options
     def initialize(file_name = nil)
-      @file_name = file_name
-      @streams   = nil
-      @options   = nil
+      @file_name     = file_name
+      @streams       = nil
+      @options       = nil
+      @format        = nil
+      @format_option = nil
     end
     # Supply an option that is only applied once the file name extensions have been parsed.
@@ -88,11 +90,23 @@ module IOStreams
       built_streams.freeze
     end
+    # Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
+    # Returns [nil] if no format is set, or if it cannot be determined from the file_name
+    def format
+      @format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
+    end
+    def format=(format)
+      raise(ArgumentError, "Invalid format: #{format.inspect}") unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
+      @format = format
+    end
     private
     def class_for_stream(type, stream)
       ext = IOStreams.extensions[stream.nil? ? nil : stream.to_sym] ||
-            raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
+        raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
       ext.send("#{type}_class") || raise(ArgumentError, "No #{type} registered for Stream type: #{stream.inspect}")
     end

data/lib/io_streams/errors.rb CHANGED Viewed

@@ -9,6 +9,9 @@ module IOStreams
     class MissingHeader < Error
     end
+    class UnknownFormat < Error
+    end
     class TypeMismatch < Error
     end
@@ -26,6 +29,15 @@ module IOStreams
     class ValueTooLong < Error
     end
+    class MalformedDataError < RuntimeError
+      attr_reader :line_number
+      def initialize(message, line_number)
+        @line_number = line_number
+        super("#{message} on line #{line_number}.")
+      end
+    end
     class InvalidLayout < Error
     end
   end

data/lib/io_streams/io_streams.rb CHANGED Viewed

@@ -13,8 +13,6 @@ require "uri"
 #   .zip.enc  [ :zip, :enc ]
 #   .gz.enc   [ :gz,  :enc ]
 module IOStreams
-  include Deprecated
   # Returns [Path] instance for the supplied complete path with optional scheme.
   #
   # Example:

data/lib/io_streams/line/reader.rb CHANGED Viewed

@@ -38,12 +38,12 @@ module IOStreams
       #     Size of blocks to read from the input stream at a time.
       #     Default: 65536 ( 64K )
       #
-      # TODO:
-      # - Handle embedded line feeds when reading csv files.
-      # - Skip Comment lines. RegExp?
-      # - Skip "empty" / "blank" lines. RegExp?
-      # - Extract header line(s) / first non-comment, non-blank line
-      # - Embedded newline support, RegExp? or Proc?
+      #   embedded_within: [String]
+      #     Supports CSV files where a line may contain an embedded newline.
+      #     For CSV files set `embedded_within: '"'`
+      #
+      # Note:
+      # * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
       def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
         super(input_stream)
@@ -86,17 +86,29 @@ module IOStreams
         line_count
       end
-      # Reads each line per the @delimeter. It will account for embedded lines provided they are within double quotes.
-      # The embedded_within argument is set in IOStreams::LineReader
+      # Reads each line per the `delimiter`.
+      # Accounts for lines that contain the `delimiter` when the `delimiter` is within the `embedded_within` delimiter.
+      # For Example, CSV files can contain newlines embedded within double quotes.
       def readline
         line = _readline
         if line && @embedded_within
           initial_line_number = @line_number
           while line.count(@embedded_within).odd?
-            raise "Unclosed quoted field on line #{initial_line_number}" if eof? || line.length > @buffer_size * 10
+            if eof? || line.length > @buffer_size * 10
+              raise(Errors::MalformedDataError.new(
+                "Unbalanced delimited field, delimiter: #{@embedded_within}",
+                initial_line_number
+              ))
+            end
             line << @delimiter
-            line << _readline
+            next_line = _readline
+            if next_line.nil?
+              raise(Errors::MalformedDataError.new(
+                "Unbalanced delimited field, delimiter: #{@embedded_within}",
+                initial_line_number
+              ))
+            end
+            line << next_line
           end
         end
         line

data/lib/io_streams/path.rb CHANGED Viewed

@@ -153,7 +153,7 @@ module IOStreams
     # Returns [true|false] whether the file is compressed based on its file extensions.
     def compressed?
       # TODO: Look at streams?
-      !(path =~ /\.(zip|gz|gzip|xls.|)\z/i).nil?
+      !(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
     end
     # Returns [true|false] whether the file is encrypted based on its file extensions.

data/lib/io_streams/paths/s3.rb CHANGED Viewed

@@ -5,6 +5,9 @@ module IOStreams
     class S3 < IOStreams::Path
       attr_reader :bucket_name, :client, :options
+      # Largest file size supported by the S3 copy object api.
+      S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
       # Arguments:
       #
       # url: [String]
@@ -188,7 +191,7 @@ module IOStreams
       # Make S3 perform direct copies within S3 itself.
       def copy_to(target_path, convert: true)
-        return super(target_path) if convert
+        return super(target_path) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
         target = IOStreams.new(target_path)
         return super(target) unless target.is_a?(self.class)
@@ -203,7 +206,7 @@ module IOStreams
         return super(source_path) if convert
         source = IOStreams.new(source_path)
-        return super(source) unless source.is_a?(self.class)
+        return super(source) if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
         source_name = ::File.join(source.bucket_name, source.path)
         client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))

data/lib/io_streams/stream.rb CHANGED Viewed

@@ -191,11 +191,41 @@ module IOStreams
       end
     end
-    # Set/get the original file_name
+    # Set the original file_name
     def file_name=(file_name)
       builder.file_name = file_name
     end
+    # Set/get the tabular format
+    def format(format = :none)
+      if format == :none
+        builder.format
+      else
+        builder.format = format
+        self
+      end
+    end
+    # Set the tabular format
+    def format=(format)
+      builder.format = format
+    end
+    # Set/get the tabular format options
+    def format_options(format_options = :none)
+      if format_options == :none
+        builder.format_options
+      else
+        builder.format_options = format_options
+        self
+      end
+    end
+    # Set the tabular format_options
+    def format_options=(format_options)
+      builder.format_options = format_options
+    end
     # Returns [String] the last component of this path.
     # Returns `nil` if no `file_name` was set.
     #
@@ -293,14 +323,26 @@ module IOStreams
     # Iterate over a file / stream returning each line as an array, one at a time.
     def row_reader(delimiter: nil, embedded_within: nil, **args)
       line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
-        yield IOStreams::Row::Reader.new(io, original_file_name: builder.file_name, **args)
+        yield IOStreams::Row::Reader.new(
+          io,
+          original_file_name: builder.file_name,
+          format:             builder.format,
+          format_options:     builder.format_options,
+          **args
+        )
       end
     end
     # Iterate over a file / stream returning each line as a hash, one at a time.
     def record_reader(delimiter: nil, embedded_within: nil, **args)
       line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
-        yield IOStreams::Record::Reader.new(io, original_file_name: builder.file_name, **args)
+        yield IOStreams::Record::Reader.new(
+          io,
+          original_file_name: builder.file_name,
+          format:             builder.format,
+          format_options:     builder.format_options,
+          **args
+        )
       end
     end
@@ -320,7 +362,14 @@ module IOStreams
       return block.call(io_stream) if io_stream&.is_a?(IOStreams::Row::Writer)
       line_writer(delimiter: delimiter) do |io|
-        IOStreams::Row::Writer.stream(io, original_file_name: builder.file_name, **args, &block)
+        IOStreams::Row::Writer.stream(
+          io,
+          original_file_name: builder.file_name,
+          format: builder.format,
+          format_options: builder.format_options,
+          **args,
+          &block
+        )
       end
     end
@@ -328,7 +377,13 @@ module IOStreams
       return block.call(io_stream) if io_stream&.is_a?(IOStreams::Record::Writer)
       line_writer(delimiter: delimiter) do |io|
-        IOStreams::Record::Writer.stream(io, original_file_name: builder.file_name, **args, &block)
+        IOStreams::Record::Writer.stream(
+          io,
+          original_file_name: builder.file_name,
+          format: builder.format,
+          format_options: builder.format_options,
+          **args,
+          &block)
       end
     end
   end

data/lib/io_streams/tabular.rb CHANGED Viewed

@@ -52,7 +52,7 @@ module IOStreams
     #   format: [Symbol]
     #     :csv, :hash, :array, :json, :psv, :fixed
     #
-    #   file_name: [String]
+    #   file_name: [IOStreams::Path | String]
     #     When `:format` is not supplied the file name can be used to infer the required format.
     #     Optional. Default: nil
     #
@@ -81,14 +81,19 @@ module IOStreams
     #       #as_hash will skip these additional columns entirely as if they were not in the file at all.
     #     false:
     #       Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
-    def initialize(format: nil, file_name: nil, format_options: nil, **args)
+    #
+    #   default_format: [Symbol]
+    #     When the format is not supplied, and the format cannot be inferred from the supplied file name
+    #     then this default format will be used.
+    #     Default: :csv
+    #     Set to nil to force it to raise an exception when the format is undefined.
+    def initialize(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args)
       @header = Header.new(**args)
-      klass   =
-        if file_name && format.nil?
-          self.class.parser_class_for_file_name(file_name)
-        else
-          self.class.parser_class(format)
-        end
+      @format = file_name && format.nil? ? self.class.format_from_file_name(file_name) : format
+      @format ||= default_format
+      raise(UnknownFormat, "The format cannot be inferred from the file name: #{file_name}") unless @format
+      klass   = self.class.parser_class(@format)
       @parser = format_options ? klass.new(**format_options) : klass.new
     end
@@ -162,9 +167,9 @@ module IOStreams
     # Example:
     #   register_format(:csv, IOStreams::Tabular::Parser::Csv)
     def self.register_format(format, parser)
-      raise(ArgumentError, "Invalid format #{format.inspect}") unless format.nil? || format.to_s =~ /\A\w+\Z/
+      raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/
-      @formats[format.nil? ? nil : format.to_sym] = parser
+      @formats[format.to_sym] = parser
     end
     # De-Register a file format
@@ -187,23 +192,18 @@ module IOStreams
     # A registry to hold formats for processing files during upload or download
     @formats = {}
-    def self.parser_class(format)
-      @formats[format.nil? ? nil : format.to_sym] || raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
+    # Returns the registered format that will be used for the supplied file name.
+    def self.format_from_file_name(file_name)
+      file_name.to_s.split(".").reverse_each { |ext| return ext.to_sym if @formats.include?(ext.to_sym) }
+      nil
     end
-    # Returns the parser to use with tabular for the supplied file_name
-    def self.parser_class_for_file_name(file_name)
-      format = nil
-      file_name.to_s.split(".").reverse_each do |ext|
-        if @formats.include?(ext.to_sym)
-          format = ext.to_sym
-          break
-        end
-      end
-      parser_class(format)
+    # Returns the parser class for the registered format.
+    def self.parser_class(format)
+      @formats[format.nil? ? nil : format.to_sym] ||
+        raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
     end
-    register_format(nil, IOStreams::Tabular::Parser::Csv)
     register_format(:array, IOStreams::Tabular::Parser::Array)
     register_format(:csv, IOStreams::Tabular::Parser::Csv)
     register_format(:fixed, IOStreams::Tabular::Parser::Fixed)

data/lib/io_streams/tabular/parser/csv.rb CHANGED Viewed

@@ -5,8 +5,10 @@ module IOStreams
       class Csv < Base
         attr_reader :csv_parser
-        def initialize
-          @csv_parser = Utility::CSVRow.new unless RUBY_VERSION.to_f >= 2.6
+        unless RUBY_VERSION.to_f >= 2.6
+          def initialize
+            @csv_parser = Utility::CSVRow.new
+          end
         end
         # Returns [Array<String>] the header row.

data/lib/io_streams/tabular/utility/csv_row.rb CHANGED Viewed

@@ -6,10 +6,7 @@ module IOStreams
       # 2 to 3 times better performance than CSV.parse_line and considerably less
       # garbage collection required.
       #
-      # Note:
-      #   This parser does not support line feeds embedded in quoted fields since
-      #   the file is broken apart based on line feeds during the upload process and
-      #   is then processed by each worker on a line by line basis.
+      # Note: Only used prior to Ruby 2.6
       class CSVRow < ::CSV
         UTF8_ENCODING = Encoding.find("UTF-8").freeze

data/lib/io_streams/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module IOStreams
-  VERSION = "1.5.1".freeze
+  VERSION = "1.6.0".freeze
 end

data/test/builder_test.rb CHANGED Viewed

@@ -41,6 +41,35 @@ class BuilderTest < Minitest::Test
       end
     end
+    describe "#format" do
+      it "detects the format from the file name" do
+        streams = IOStreams::Builder.new("abc.json")
+        assert_equal :json, streams.format
+      end
+      it "is nil if the file name has no meaningful format" do
+        assert_nil streams.format
+      end
+      it "returns set format with no file_name" do
+        streams        = IOStreams::Builder.new
+        streams.format = :csv
+        assert_equal :csv, streams.format
+      end
+      it "returns set format with file_name" do
+        streams        = IOStreams::Builder.new("abc.json")
+        streams.format = :csv
+        assert_equal :csv, streams.format
+      end
+      it "validates bad format" do
+        assert_raises ArgumentError do
+          streams.format = :blah
+        end
+      end
+    end
     describe "#stream" do
       it "adds one stream" do
         streams.stream(:pgp, passphrase: "unlock-me")