iostreams 1.5.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c665e5c262a98de9ceccf1cf93f5bc391370d0b674f966d9e266b731a31d3b7f
4
- data.tar.gz: 1ab8c125e49abc178ce4c1e94f36dfb9219011ece6c8a9bd65b1c5a5d2f14604
3
+ metadata.gz: e4fb750e5c3779000fac8f21803b84a5977f111f481b7d4e11a117149cd0d9e6
4
+ data.tar.gz: '09cb2dc7ff67afd44d3ebcfefa29f6a1840d1b3dd871c33bc1a06914cd8bee2d'
5
5
  SHA512:
6
- metadata.gz: 63bec4c3602cd4ab699bcf73abe3d97b7b808c6559c378e67be99c1af1ab7ada84dd0753c91dd2fe7be0924690a1deb30b44b87f56cd4638c4954a6bfcd38796
7
- data.tar.gz: 2b1138c5389747892a33b5213a42b0fe4ececabc421c824db03d5218d436725a95d0306c7b0a1cc2a4b0a8eb4e1b4192152f6d60f11892fb2980c418c7be1f80
6
+ metadata.gz: 0cf2db14e03b9e81e0e39119b35f293408a9d4b6bf3365abc724d95d7376abf7af73720b8fd8d000b70b2eb7abbae20055753e845a77dec2d8d7297e5b6693ba
7
+ data.tar.gz: d9fa2194965ef99a99e1ecd0655e84066e6021236d1494336c40693b103d3c7ecb7e5a87e8f179a38c990ad7670d6bd4901b6a7cd8b89757c00a3179d657a1cc
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
1
  # IOStreams
2
- [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Build Status](https://travis-ci.org/rocketjob/iostreams.svg?branch=master)](https://travis-ci.org/rocketjob/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
2
+ [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
3
3
 
4
4
  IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
5
5
  or storage mechanism transparent to the application.
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
14
14
 
15
15
  Next, check out the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
16
16
 
17
+ ## Upgrading to v1.6
18
+
19
+ The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
20
+ the following line to your code:
21
+
22
+ ~~~ruby
23
+ IOStreams.include(IOStreams::Deprecated)
24
+ ~~~
25
+
26
+ It is important to move any of the old deprecated APIs over to the new API, since they will be removed in a future
27
+ release.
28
+
17
29
  ## Versioning
18
30
 
19
31
  This project adheres to [Semantic Versioning](http://semver.org/).
@@ -1,13 +1,15 @@
1
1
  module IOStreams
2
2
  # Build the streams that need to be applied to a path during reading or writing.
3
3
  class Builder
4
- attr_accessor :file_name
4
+ attr_accessor :file_name, :format_options
5
5
  attr_reader :streams, :options
6
6
 
7
7
  def initialize(file_name = nil)
8
- @file_name = file_name
9
- @streams = nil
10
- @options = nil
8
+ @file_name = file_name
9
+ @streams = nil
10
+ @options = nil
11
+ @format = nil
12
+ @format_option = nil
11
13
  end
12
14
 
13
15
  # Supply an option that is only applied once the file name extensions have been parsed.
@@ -88,11 +90,23 @@ module IOStreams
88
90
  built_streams.freeze
89
91
  end
90
92
 
93
+ # Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
94
+ # Returns [nil] if no format is set, or if it cannot be determined from the file_name
95
+ def format
96
+ @format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
97
+ end
98
+
99
+ def format=(format)
100
+ raise(ArgumentError, "Invalid format: #{format.inspect}") unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
101
+
102
+ @format = format
103
+ end
104
+
91
105
  private
92
106
 
93
107
  def class_for_stream(type, stream)
94
108
  ext = IOStreams.extensions[stream.nil? ? nil : stream.to_sym] ||
95
- raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
109
+ raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
96
110
  ext.send("#{type}_class") || raise(ArgumentError, "No #{type} registered for Stream type: #{stream.inspect}")
97
111
  end
98
112
 
@@ -9,6 +9,9 @@ module IOStreams
9
9
  class MissingHeader < Error
10
10
  end
11
11
 
12
+ class UnknownFormat < Error
13
+ end
14
+
12
15
  class TypeMismatch < Error
13
16
  end
14
17
 
@@ -26,6 +29,15 @@ module IOStreams
26
29
  class ValueTooLong < Error
27
30
  end
28
31
 
32
+ class MalformedDataError < RuntimeError
33
+ attr_reader :line_number
34
+
35
+ def initialize(message, line_number)
36
+ @line_number = line_number
37
+ super("#{message} on line #{line_number}.")
38
+ end
39
+ end
40
+
29
41
  class InvalidLayout < Error
30
42
  end
31
43
  end
@@ -13,8 +13,6 @@ require "uri"
13
13
  # .zip.enc [ :zip, :enc ]
14
14
  # .gz.enc [ :gz, :enc ]
15
15
  module IOStreams
16
- include Deprecated
17
-
18
16
  # Returns [Path] instance for the supplied complete path with optional scheme.
19
17
  #
20
18
  # Example:
@@ -38,12 +38,12 @@ module IOStreams
38
38
  # Size of blocks to read from the input stream at a time.
39
39
  # Default: 65536 ( 64K )
40
40
  #
41
- # TODO:
42
- # - Handle embedded line feeds when reading csv files.
43
- # - Skip Comment lines. RegExp?
44
- # - Skip "empty" / "blank" lines. RegExp?
45
- # - Extract header line(s) / first non-comment, non-blank line
46
- # - Embedded newline support, RegExp? or Proc?
41
+ # embedded_within: [String]
42
+ # Supports CSV files where a line may contain an embedded newline.
43
+ # For CSV files set `embedded_within: '"'`
44
+ #
45
+ # Note:
46
+ # * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
47
47
  def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
48
48
  super(input_stream)
49
49
 
@@ -86,17 +86,29 @@ module IOStreams
86
86
  line_count
87
87
  end
88
88
 
89
- # Reads each line per the @delimeter. It will account for embedded lines provided they are within double quotes.
90
- # The embedded_within argument is set in IOStreams::LineReader
89
+ # Reads each line per the `delimiter`.
90
+ # Accounts for lines that contain the `delimiter` when that `delimiter` is enclosed within the `embedded_within` character.
91
+ # For Example, CSV files can contain newlines embedded within double quotes.
91
92
  def readline
92
93
  line = _readline
93
94
  if line && @embedded_within
94
95
  initial_line_number = @line_number
95
96
  while line.count(@embedded_within).odd?
96
- raise "Unclosed quoted field on line #{initial_line_number}" if eof? || line.length > @buffer_size * 10
97
-
97
+ if eof? || line.length > @buffer_size * 10
98
+ raise(Errors::MalformedDataError.new(
99
+ "Unbalanced delimited field, delimiter: #{@embedded_within}",
100
+ initial_line_number
101
+ ))
102
+ end
98
103
  line << @delimiter
99
- line << _readline
104
+ next_line = _readline
105
+ if next_line.nil?
106
+ raise(Errors::MalformedDataError.new(
107
+ "Unbalanced delimited field, delimiter: #{@embedded_within}",
108
+ initial_line_number
109
+ ))
110
+ end
111
+ line << next_line
100
112
  end
101
113
  end
102
114
  line
@@ -153,7 +153,7 @@ module IOStreams
153
153
  # Returns [true|false] whether the file is compressed based on its file extensions.
154
154
  def compressed?
155
155
  # TODO: Look at streams?
156
- !(path =~ /\.(zip|gz|gzip|xls.|)\z/i).nil?
156
+ !(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
157
157
  end
158
158
 
159
159
  # Returns [true|false] whether the file is encrypted based on its file extensions.
@@ -5,6 +5,9 @@ module IOStreams
5
5
  class S3 < IOStreams::Path
6
6
  attr_reader :bucket_name, :client, :options
7
7
 
8
+ # Largest file size supported by the S3 copy object api.
9
+ S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
10
+
8
11
  # Arguments:
9
12
  #
10
13
  # url: [String]
@@ -188,7 +191,7 @@ module IOStreams
188
191
 
189
192
  # Make S3 perform direct copies within S3 itself.
190
193
  def copy_to(target_path, convert: true)
191
- return super(target_path) if convert
194
+ return super(target_path) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
192
195
 
193
196
  target = IOStreams.new(target_path)
194
197
  return super(target) unless target.is_a?(self.class)
@@ -203,7 +206,7 @@ module IOStreams
203
206
  return super(source_path) if convert
204
207
 
205
208
  source = IOStreams.new(source_path)
206
- return super(source) unless source.is_a?(self.class)
209
+ return super(source) if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
207
210
 
208
211
  source_name = ::File.join(source.bucket_name, source.path)
209
212
  client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))
@@ -191,11 +191,41 @@ module IOStreams
191
191
  end
192
192
  end
193
193
 
194
- # Set/get the original file_name
194
+ # Set the original file_name
195
195
  def file_name=(file_name)
196
196
  builder.file_name = file_name
197
197
  end
198
198
 
199
+ # Set/get the tabular format
200
+ def format(format = :none)
201
+ if format == :none
202
+ builder.format
203
+ else
204
+ builder.format = format
205
+ self
206
+ end
207
+ end
208
+
209
+ # Set the tabular format
210
+ def format=(format)
211
+ builder.format = format
212
+ end
213
+
214
+ # Set/get the tabular format options
215
+ def format_options(format_options = :none)
216
+ if format_options == :none
217
+ builder.format_options
218
+ else
219
+ builder.format_options = format_options
220
+ self
221
+ end
222
+ end
223
+
224
+ # Set the tabular format_options
225
+ def format_options=(format_options)
226
+ builder.format_options = format_options
227
+ end
228
+
199
229
  # Returns [String] the last component of this path.
200
230
  # Returns `nil` if no `file_name` was set.
201
231
  #
@@ -293,14 +323,26 @@ module IOStreams
293
323
  # Iterate over a file / stream returning each line as an array, one at a time.
294
324
  def row_reader(delimiter: nil, embedded_within: nil, **args)
295
325
  line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
296
- yield IOStreams::Row::Reader.new(io, original_file_name: builder.file_name, **args)
326
+ yield IOStreams::Row::Reader.new(
327
+ io,
328
+ original_file_name: builder.file_name,
329
+ format: builder.format,
330
+ format_options: builder.format_options,
331
+ **args
332
+ )
297
333
  end
298
334
  end
299
335
 
300
336
  # Iterate over a file / stream returning each line as a hash, one at a time.
301
337
  def record_reader(delimiter: nil, embedded_within: nil, **args)
302
338
  line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
303
- yield IOStreams::Record::Reader.new(io, original_file_name: builder.file_name, **args)
339
+ yield IOStreams::Record::Reader.new(
340
+ io,
341
+ original_file_name: builder.file_name,
342
+ format: builder.format,
343
+ format_options: builder.format_options,
344
+ **args
345
+ )
304
346
  end
305
347
  end
306
348
 
@@ -320,7 +362,14 @@ module IOStreams
320
362
  return block.call(io_stream) if io_stream&.is_a?(IOStreams::Row::Writer)
321
363
 
322
364
  line_writer(delimiter: delimiter) do |io|
323
- IOStreams::Row::Writer.stream(io, original_file_name: builder.file_name, **args, &block)
365
+ IOStreams::Row::Writer.stream(
366
+ io,
367
+ original_file_name: builder.file_name,
368
+ format: builder.format,
369
+ format_options: builder.format_options,
370
+ **args,
371
+ &block
372
+ )
324
373
  end
325
374
  end
326
375
 
@@ -328,7 +377,13 @@ module IOStreams
328
377
  return block.call(io_stream) if io_stream&.is_a?(IOStreams::Record::Writer)
329
378
 
330
379
  line_writer(delimiter: delimiter) do |io|
331
- IOStreams::Record::Writer.stream(io, original_file_name: builder.file_name, **args, &block)
380
+ IOStreams::Record::Writer.stream(
381
+ io,
382
+ original_file_name: builder.file_name,
383
+ format: builder.format,
384
+ format_options: builder.format_options,
385
+ **args,
386
+ &block)
332
387
  end
333
388
  end
334
389
  end
@@ -52,7 +52,7 @@ module IOStreams
52
52
  # format: [Symbol]
53
53
  # :csv, :hash, :array, :json, :psv, :fixed
54
54
  #
55
- # file_name: [String]
55
+ # file_name: [IOStreams::Path | String]
56
56
  # When `:format` is not supplied the file name can be used to infer the required format.
57
57
  # Optional. Default: nil
58
58
  #
@@ -81,14 +81,19 @@ module IOStreams
81
81
  # #as_hash will skip these additional columns entirely as if they were not in the file at all.
82
82
  # false:
83
83
  # Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
84
- def initialize(format: nil, file_name: nil, format_options: nil, **args)
84
+ #
85
+ # default_format: [Symbol]
86
+ # When the format is not supplied, and the format cannot be inferred from the supplied file name
87
+ # then this default format will be used.
88
+ # Default: :csv
89
+ # Set to nil to force it to raise an exception when the format is undefined.
90
+ def initialize(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args)
85
91
  @header = Header.new(**args)
86
- klass =
87
- if file_name && format.nil?
88
- self.class.parser_class_for_file_name(file_name)
89
- else
90
- self.class.parser_class(format)
91
- end
92
+ @format = file_name && format.nil? ? self.class.format_from_file_name(file_name) : format
93
+ @format ||= default_format
94
+ raise(UnknownFormat, "The format cannot be inferred from the file name: #{file_name}") unless @format
95
+
96
+ klass = self.class.parser_class(@format)
92
97
  @parser = format_options ? klass.new(**format_options) : klass.new
93
98
  end
94
99
 
@@ -162,9 +167,9 @@ module IOStreams
162
167
  # Example:
163
168
  # register_format(:csv, IOStreams::Tabular::Parser::Csv)
164
169
  def self.register_format(format, parser)
165
- raise(ArgumentError, "Invalid format #{format.inspect}") unless format.nil? || format.to_s =~ /\A\w+\Z/
170
+ raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/
166
171
 
167
- @formats[format.nil? ? nil : format.to_sym] = parser
172
+ @formats[format.to_sym] = parser
168
173
  end
169
174
 
170
175
  # De-Register a file format
@@ -187,23 +192,18 @@ module IOStreams
187
192
  # A registry to hold formats for processing files during upload or download
188
193
  @formats = {}
189
194
 
190
- def self.parser_class(format)
191
- @formats[format.nil? ? nil : format.to_sym] || raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
195
+ # Returns the registered format that will be used for the supplied file name.
196
+ def self.format_from_file_name(file_name)
197
+ file_name.to_s.split(".").reverse_each { |ext| return ext.to_sym if @formats.include?(ext.to_sym) }
198
+ nil
192
199
  end
193
200
 
194
- # Returns the parser to use with tabular for the supplied file_name
195
- def self.parser_class_for_file_name(file_name)
196
- format = nil
197
- file_name.to_s.split(".").reverse_each do |ext|
198
- if @formats.include?(ext.to_sym)
199
- format = ext.to_sym
200
- break
201
- end
202
- end
203
- parser_class(format)
201
+ # Returns the parser class for the registered format.
202
+ def self.parser_class(format)
203
+ @formats[format.nil? ? nil : format.to_sym] ||
204
+ raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
204
205
  end
205
206
 
206
- register_format(nil, IOStreams::Tabular::Parser::Csv)
207
207
  register_format(:array, IOStreams::Tabular::Parser::Array)
208
208
  register_format(:csv, IOStreams::Tabular::Parser::Csv)
209
209
  register_format(:fixed, IOStreams::Tabular::Parser::Fixed)
@@ -5,8 +5,10 @@ module IOStreams
5
5
  class Csv < Base
6
6
  attr_reader :csv_parser
7
7
 
8
- def initialize
9
- @csv_parser = Utility::CSVRow.new unless RUBY_VERSION.to_f >= 2.6
8
+ unless RUBY_VERSION.to_f >= 2.6
9
+ def initialize
10
+ @csv_parser = Utility::CSVRow.new
11
+ end
10
12
  end
11
13
 
12
14
  # Returns [Array<String>] the header row.
@@ -6,10 +6,7 @@ module IOStreams
6
6
  # 2 to 3 times better performance than CSV.parse_line and considerably less
7
7
  # garbage collection required.
8
8
  #
9
- # Note:
10
- # This parser does not support line feeds embedded in quoted fields since
11
- # the file is broken apart based on line feeds during the upload process and
12
- # is then processed by each worker on a line by line basis.
9
+ # Note: Only used prior to Ruby 2.6
13
10
  class CSVRow < ::CSV
14
11
  UTF8_ENCODING = Encoding.find("UTF-8").freeze
15
12
 
@@ -1,3 +1,3 @@
1
1
  module IOStreams
2
- VERSION = "1.5.1".freeze
2
+ VERSION = "1.6.0".freeze
3
3
  end
data/test/builder_test.rb CHANGED
@@ -41,6 +41,35 @@ class BuilderTest < Minitest::Test
41
41
  end
42
42
  end
43
43
 
44
+ describe "#format" do
45
+ it "detects the format from the file name" do
46
+ streams = IOStreams::Builder.new("abc.json")
47
+ assert_equal :json, streams.format
48
+ end
49
+
50
+ it "is nil if the file name has no meaningful format" do
51
+ assert_nil streams.format
52
+ end
53
+
54
+ it "returns set format with no file_name" do
55
+ streams = IOStreams::Builder.new
56
+ streams.format = :csv
57
+ assert_equal :csv, streams.format
58
+ end
59
+
60
+ it "returns set format with file_name" do
61
+ streams = IOStreams::Builder.new("abc.json")
62
+ streams.format = :csv
63
+ assert_equal :csv, streams.format
64
+ end
65
+
66
+ it "validates bad format" do
67
+ assert_raises ArgumentError do
68
+ streams.format = :blah
69
+ end
70
+ end
71
+ end
72
+
44
73
  describe "#stream" do
45
74
  it "adds one stream" do
46
75
  streams.stream(:pgp, passphrase: "unlock-me")