iostreams 1.5.1 → 1.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c665e5c262a98de9ceccf1cf93f5bc391370d0b674f966d9e266b731a31d3b7f
4
- data.tar.gz: 1ab8c125e49abc178ce4c1e94f36dfb9219011ece6c8a9bd65b1c5a5d2f14604
3
+ metadata.gz: e4fb750e5c3779000fac8f21803b84a5977f111f481b7d4e11a117149cd0d9e6
4
+ data.tar.gz: '09cb2dc7ff67afd44d3ebcfefa29f6a1840d1b3dd871c33bc1a06914cd8bee2d'
5
5
  SHA512:
6
- metadata.gz: 63bec4c3602cd4ab699bcf73abe3d97b7b808c6559c378e67be99c1af1ab7ada84dd0753c91dd2fe7be0924690a1deb30b44b87f56cd4638c4954a6bfcd38796
7
- data.tar.gz: 2b1138c5389747892a33b5213a42b0fe4ececabc421c824db03d5218d436725a95d0306c7b0a1cc2a4b0a8eb4e1b4192152f6d60f11892fb2980c418c7be1f80
6
+ metadata.gz: 0cf2db14e03b9e81e0e39119b35f293408a9d4b6bf3365abc724d95d7376abf7af73720b8fd8d000b70b2eb7abbae20055753e845a77dec2d8d7297e5b6693ba
7
+ data.tar.gz: d9fa2194965ef99a99e1ecd0655e84066e6021236d1494336c40693b103d3c7ecb7e5a87e8f179a38c990ad7670d6bd4901b6a7cd8b89757c00a3179d657a1cc
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
1
  # IOStreams
2
- [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Build Status](https://travis-ci.org/rocketjob/iostreams.svg?branch=master)](https://travis-ci.org/rocketjob/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
2
+ [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
3
3
 
4
4
  IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
5
5
  or storage mechanism transparent to the application.
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
14
14
 
15
15
  Next, check out the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
16
16
 
17
+ ## Upgrading to v1.6
18
+
19
+ The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
20
+ the following line to your code:
21
+
22
+ ~~~ruby
23
+ IOStreams.include(IOStreams::Deprecated)
24
+ ~~~
25
+
26
+ It is important to migrate any use of the old deprecated APIs over to the new API, since they will be removed in a future
27
+ release.
28
+
17
29
  ## Versioning
18
30
 
19
31
  This project adheres to [Semantic Versioning](http://semver.org/).
@@ -1,13 +1,15 @@
1
1
  module IOStreams
2
2
  # Build the streams that need to be applied to a path during reading or writing.
3
3
  class Builder
4
- attr_accessor :file_name
4
+ attr_accessor :file_name, :format_options
5
5
  attr_reader :streams, :options
6
6
 
7
7
  def initialize(file_name = nil)
8
- @file_name = file_name
9
- @streams = nil
10
- @options = nil
8
+ @file_name = file_name
9
+ @streams = nil
10
+ @options = nil
11
+ @format = nil
12
+ @format_option = nil
11
13
  end
12
14
 
13
15
  # Supply an option that is only applied once the file name extensions have been parsed.
@@ -88,11 +90,23 @@ module IOStreams
88
90
  built_streams.freeze
89
91
  end
90
92
 
93
+ # Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
94
+ # Returns [nil] if no format is set, or if it cannot be determined from the file_name
95
+ def format
96
+ @format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
97
+ end
98
+
99
+ def format=(format)
100
+ raise(ArgumentError, "Invalid format: #{format.inspect}") unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
101
+
102
+ @format = format
103
+ end
104
+
91
105
  private
92
106
 
93
107
  def class_for_stream(type, stream)
94
108
  ext = IOStreams.extensions[stream.nil? ? nil : stream.to_sym] ||
95
- raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
109
+ raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
96
110
  ext.send("#{type}_class") || raise(ArgumentError, "No #{type} registered for Stream type: #{stream.inspect}")
97
111
  end
98
112
 
@@ -9,6 +9,9 @@ module IOStreams
9
9
  class MissingHeader < Error
10
10
  end
11
11
 
12
+ class UnknownFormat < Error
13
+ end
14
+
12
15
  class TypeMismatch < Error
13
16
  end
14
17
 
@@ -26,6 +29,15 @@ module IOStreams
26
29
  class ValueTooLong < Error
27
30
  end
28
31
 
32
+ class MalformedDataError < RuntimeError
33
+ attr_reader :line_number
34
+
35
+ def initialize(message, line_number)
36
+ @line_number = line_number
37
+ super("#{message} on line #{line_number}.")
38
+ end
39
+ end
40
+
29
41
  class InvalidLayout < Error
30
42
  end
31
43
  end
@@ -13,8 +13,6 @@ require "uri"
13
13
  # .zip.enc [ :zip, :enc ]
14
14
  # .gz.enc [ :gz, :enc ]
15
15
  module IOStreams
16
- include Deprecated
17
-
18
16
  # Returns [Path] instance for the supplied complete path with optional scheme.
19
17
  #
20
18
  # Example:
@@ -38,12 +38,12 @@ module IOStreams
38
38
  # Size of blocks to read from the input stream at a time.
39
39
  # Default: 65536 ( 64K )
40
40
  #
41
- # TODO:
42
- # - Handle embedded line feeds when reading csv files.
43
- # - Skip Comment lines. RegExp?
44
- # - Skip "empty" / "blank" lines. RegExp?
45
- # - Extract header line(s) / first non-comment, non-blank line
46
- # - Embedded newline support, RegExp? or Proc?
41
+ # embedded_within: [String]
42
+ # Supports CSV files where a line may contain an embedded newline.
43
+ # For CSV files set `embedded_within: '"'`
44
+ #
45
+ # Note:
46
+ # * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
47
47
  def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
48
48
  super(input_stream)
49
49
 
@@ -86,17 +86,29 @@ module IOStreams
86
86
  line_count
87
87
  end
88
88
 
89
- # Reads each line per the @delimeter. It will account for embedded lines provided they are within double quotes.
90
- # The embedded_within argument is set in IOStreams::LineReader
89
+ # Reads each line per the `delimiter`.
90
+ # Accounts for lines that contain the `delimiter` when that `delimiter` is enclosed within the `embedded_within` delimiter.
91
+ # For Example, CSV files can contain newlines embedded within double quotes.
91
92
  def readline
92
93
  line = _readline
93
94
  if line && @embedded_within
94
95
  initial_line_number = @line_number
95
96
  while line.count(@embedded_within).odd?
96
- raise "Unclosed quoted field on line #{initial_line_number}" if eof? || line.length > @buffer_size * 10
97
-
97
+ if eof? || line.length > @buffer_size * 10
98
+ raise(Errors::MalformedDataError.new(
99
+ "Unbalanced delimited field, delimiter: #{@embedded_within}",
100
+ initial_line_number
101
+ ))
102
+ end
98
103
  line << @delimiter
99
- line << _readline
104
+ next_line = _readline
105
+ if next_line.nil?
106
+ raise(Errors::MalformedDataError.new(
107
+ "Unbalanced delimited field, delimiter: #{@embedded_within}",
108
+ initial_line_number
109
+ ))
110
+ end
111
+ line << next_line
100
112
  end
101
113
  end
102
114
  line
@@ -153,7 +153,7 @@ module IOStreams
153
153
  # Returns [true|false] whether the file is compressed based on its file extensions.
154
154
  def compressed?
155
155
  # TODO: Look at streams?
156
- !(path =~ /\.(zip|gz|gzip|xls.|)\z/i).nil?
156
+ !(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
157
157
  end
158
158
 
159
159
  # Returns [true|false] whether the file is encrypted based on its file extensions.
@@ -5,6 +5,9 @@ module IOStreams
5
5
  class S3 < IOStreams::Path
6
6
  attr_reader :bucket_name, :client, :options
7
7
 
8
+ # Largest file size supported by the S3 copy object api.
9
+ S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
10
+
8
11
  # Arguments:
9
12
  #
10
13
  # url: [String]
@@ -188,7 +191,7 @@ module IOStreams
188
191
 
189
192
  # Make S3 perform direct copies within S3 itself.
190
193
  def copy_to(target_path, convert: true)
191
- return super(target_path) if convert
194
+ return super(target_path) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
192
195
 
193
196
  target = IOStreams.new(target_path)
194
197
  return super(target) unless target.is_a?(self.class)
@@ -203,7 +206,7 @@ module IOStreams
203
206
  return super(source_path) if convert
204
207
 
205
208
  source = IOStreams.new(source_path)
206
- return super(source) unless source.is_a?(self.class)
209
+ return super(source) if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
207
210
 
208
211
  source_name = ::File.join(source.bucket_name, source.path)
209
212
  client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))
@@ -191,11 +191,41 @@ module IOStreams
191
191
  end
192
192
  end
193
193
 
194
- # Set/get the original file_name
194
+ # Set the original file_name
195
195
  def file_name=(file_name)
196
196
  builder.file_name = file_name
197
197
  end
198
198
 
199
+ # Set/get the tabular format
200
+ def format(format = :none)
201
+ if format == :none
202
+ builder.format
203
+ else
204
+ builder.format = format
205
+ self
206
+ end
207
+ end
208
+
209
+ # Set the tabular format
210
+ def format=(format)
211
+ builder.format = format
212
+ end
213
+
214
+ # Set/get the tabular format options
215
+ def format_options(format_options = :none)
216
+ if format_options == :none
217
+ builder.format_options
218
+ else
219
+ builder.format_options = format_options
220
+ self
221
+ end
222
+ end
223
+
224
+ # Set the tabular format_options
225
+ def format_options=(format_options)
226
+ builder.format_options = format_options
227
+ end
228
+
199
229
  # Returns [String] the last component of this path.
200
230
  # Returns `nil` if no `file_name` was set.
201
231
  #
@@ -293,14 +323,26 @@ module IOStreams
293
323
  # Iterate over a file / stream returning each line as an array, one at a time.
294
324
  def row_reader(delimiter: nil, embedded_within: nil, **args)
295
325
  line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
296
- yield IOStreams::Row::Reader.new(io, original_file_name: builder.file_name, **args)
326
+ yield IOStreams::Row::Reader.new(
327
+ io,
328
+ original_file_name: builder.file_name,
329
+ format: builder.format,
330
+ format_options: builder.format_options,
331
+ **args
332
+ )
297
333
  end
298
334
  end
299
335
 
300
336
  # Iterate over a file / stream returning each line as a hash, one at a time.
301
337
  def record_reader(delimiter: nil, embedded_within: nil, **args)
302
338
  line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
303
- yield IOStreams::Record::Reader.new(io, original_file_name: builder.file_name, **args)
339
+ yield IOStreams::Record::Reader.new(
340
+ io,
341
+ original_file_name: builder.file_name,
342
+ format: builder.format,
343
+ format_options: builder.format_options,
344
+ **args
345
+ )
304
346
  end
305
347
  end
306
348
 
@@ -320,7 +362,14 @@ module IOStreams
320
362
  return block.call(io_stream) if io_stream&.is_a?(IOStreams::Row::Writer)
321
363
 
322
364
  line_writer(delimiter: delimiter) do |io|
323
- IOStreams::Row::Writer.stream(io, original_file_name: builder.file_name, **args, &block)
365
+ IOStreams::Row::Writer.stream(
366
+ io,
367
+ original_file_name: builder.file_name,
368
+ format: builder.format,
369
+ format_options: builder.format_options,
370
+ **args,
371
+ &block
372
+ )
324
373
  end
325
374
  end
326
375
 
@@ -328,7 +377,13 @@ module IOStreams
328
377
  return block.call(io_stream) if io_stream&.is_a?(IOStreams::Record::Writer)
329
378
 
330
379
  line_writer(delimiter: delimiter) do |io|
331
- IOStreams::Record::Writer.stream(io, original_file_name: builder.file_name, **args, &block)
380
+ IOStreams::Record::Writer.stream(
381
+ io,
382
+ original_file_name: builder.file_name,
383
+ format: builder.format,
384
+ format_options: builder.format_options,
385
+ **args,
386
+ &block)
332
387
  end
333
388
  end
334
389
  end
@@ -52,7 +52,7 @@ module IOStreams
52
52
  # format: [Symbol]
53
53
  # :csv, :hash, :array, :json, :psv, :fixed
54
54
  #
55
- # file_name: [String]
55
+ # file_name: [IOStreams::Path | String]
56
56
  # When `:format` is not supplied the file name can be used to infer the required format.
57
57
  # Optional. Default: nil
58
58
  #
@@ -81,14 +81,19 @@ module IOStreams
81
81
  # #as_hash will skip these additional columns entirely as if they were not in the file at all.
82
82
  # false:
83
83
  # Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
84
- def initialize(format: nil, file_name: nil, format_options: nil, **args)
84
+ #
85
+ # default_format: [Symbol]
86
+ # When the format is not supplied, and the format cannot be inferred from the supplied file name
87
+ # then this default format will be used.
88
+ # Default: :csv
89
+ # Set to nil to force it to raise an exception when the format is undefined.
90
+ def initialize(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args)
85
91
  @header = Header.new(**args)
86
- klass =
87
- if file_name && format.nil?
88
- self.class.parser_class_for_file_name(file_name)
89
- else
90
- self.class.parser_class(format)
91
- end
92
+ @format = file_name && format.nil? ? self.class.format_from_file_name(file_name) : format
93
+ @format ||= default_format
94
+ raise(UnknownFormat, "The format cannot be inferred from the file name: #{file_name}") unless @format
95
+
96
+ klass = self.class.parser_class(@format)
92
97
  @parser = format_options ? klass.new(**format_options) : klass.new
93
98
  end
94
99
 
@@ -162,9 +167,9 @@ module IOStreams
162
167
  # Example:
163
168
  # register_format(:csv, IOStreams::Tabular::Parser::Csv)
164
169
  def self.register_format(format, parser)
165
- raise(ArgumentError, "Invalid format #{format.inspect}") unless format.nil? || format.to_s =~ /\A\w+\Z/
170
+ raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/
166
171
 
167
- @formats[format.nil? ? nil : format.to_sym] = parser
172
+ @formats[format.to_sym] = parser
168
173
  end
169
174
 
170
175
  # De-Register a file format
@@ -187,23 +192,18 @@ module IOStreams
187
192
  # A registry to hold formats for processing files during upload or download
188
193
  @formats = {}
189
194
 
190
- def self.parser_class(format)
191
- @formats[format.nil? ? nil : format.to_sym] || raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
195
+ # Returns the registered format that will be used for the supplied file name.
196
+ def self.format_from_file_name(file_name)
197
+ file_name.to_s.split(".").reverse_each { |ext| return ext.to_sym if @formats.include?(ext.to_sym) }
198
+ nil
192
199
  end
193
200
 
194
- # Returns the parser to use with tabular for the supplied file_name
195
- def self.parser_class_for_file_name(file_name)
196
- format = nil
197
- file_name.to_s.split(".").reverse_each do |ext|
198
- if @formats.include?(ext.to_sym)
199
- format = ext.to_sym
200
- break
201
- end
202
- end
203
- parser_class(format)
201
+ # Returns the parser class for the registered format.
202
+ def self.parser_class(format)
203
+ @formats[format.nil? ? nil : format.to_sym] ||
204
+ raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
204
205
  end
205
206
 
206
- register_format(nil, IOStreams::Tabular::Parser::Csv)
207
207
  register_format(:array, IOStreams::Tabular::Parser::Array)
208
208
  register_format(:csv, IOStreams::Tabular::Parser::Csv)
209
209
  register_format(:fixed, IOStreams::Tabular::Parser::Fixed)
@@ -5,8 +5,10 @@ module IOStreams
5
5
  class Csv < Base
6
6
  attr_reader :csv_parser
7
7
 
8
- def initialize
9
- @csv_parser = Utility::CSVRow.new unless RUBY_VERSION.to_f >= 2.6
8
+ unless RUBY_VERSION.to_f >= 2.6
9
+ def initialize
10
+ @csv_parser = Utility::CSVRow.new
11
+ end
10
12
  end
11
13
 
12
14
  # Returns [Array<String>] the header row.
@@ -6,10 +6,7 @@ module IOStreams
6
6
  # 2 to 3 times better performance than CSV.parse_line and considerably less
7
7
  # garbage collection required.
8
8
  #
9
- # Note:
10
- # This parser does not support line feeds embedded in quoted fields since
11
- # the file is broken apart based on line feeds during the upload process and
12
- # is then processed by each worker on a line by line basis.
9
+ # Note: Only used prior to Ruby 2.6
13
10
  class CSVRow < ::CSV
14
11
  UTF8_ENCODING = Encoding.find("UTF-8").freeze
15
12
 
@@ -1,3 +1,3 @@
1
1
  module IOStreams
2
- VERSION = "1.5.1".freeze
2
+ VERSION = "1.6.0".freeze
3
3
  end
data/test/builder_test.rb CHANGED
@@ -41,6 +41,35 @@ class BuilderTest < Minitest::Test
41
41
  end
42
42
  end
43
43
 
44
+ describe "#format" do
45
+ it "detects the format from the file name" do
46
+ streams = IOStreams::Builder.new("abc.json")
47
+ assert_equal :json, streams.format
48
+ end
49
+
50
+ it "is nil if the file name has no meaningful format" do
51
+ assert_nil streams.format
52
+ end
53
+
54
+ it "returns set format with no file_name" do
55
+ streams = IOStreams::Builder.new
56
+ streams.format = :csv
57
+ assert_equal :csv, streams.format
58
+ end
59
+
60
+ it "returns set format with file_name" do
61
+ streams = IOStreams::Builder.new("abc.json")
62
+ streams.format = :csv
63
+ assert_equal :csv, streams.format
64
+ end
65
+
66
+ it "validates bad format" do
67
+ assert_raises ArgumentError do
68
+ streams.format = :blah
69
+ end
70
+ end
71
+ end
72
+
44
73
  describe "#stream" do
45
74
  it "adds one stream" do
46
75
  streams.stream(:pgp, passphrase: "unlock-me")