iostreams 1.5.1 → 1.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +13 -1
- data/lib/io_streams/builder.rb +19 -5
- data/lib/io_streams/errors.rb +12 -0
- data/lib/io_streams/io_streams.rb +0 -2
- data/lib/io_streams/line/reader.rb +23 -11
- data/lib/io_streams/path.rb +1 -1
- data/lib/io_streams/paths/s3.rb +5 -2
- data/lib/io_streams/stream.rb +60 -5
- data/lib/io_streams/tabular.rb +23 -23
- data/lib/io_streams/tabular/parser/csv.rb +4 -2
- data/lib/io_streams/tabular/utility/csv_row.rb +1 -4
- data/lib/io_streams/version.rb +1 -1
- data/test/builder_test.rb +29 -0
- data/test/deprecated_test.rb +2 -0
- data/test/files/test.psv +4 -0
- data/test/files/unclosed_quote_large_test.csv +1658 -0
- data/test/files/unclosed_quote_test2.csv +3 -0
- data/test/line_reader_test.rb +30 -4
- data/test/stream_test.rb +174 -8
- metadata +47 -42
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e4fb750e5c3779000fac8f21803b84a5977f111f481b7d4e11a117149cd0d9e6
|
4
|
+
data.tar.gz: '09cb2dc7ff67afd44d3ebcfefa29f6a1840d1b3dd871c33bc1a06914cd8bee2d'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0cf2db14e03b9e81e0e39119b35f293408a9d4b6bf3365abc724d95d7376abf7af73720b8fd8d000b70b2eb7abbae20055753e845a77dec2d8d7297e5b6693ba
|
7
|
+
data.tar.gz: d9fa2194965ef99a99e1ecd0655e84066e6021236d1494336c40693b103d3c7ecb7e5a87e8f179a38c990ad7670d6bd4901b6a7cd8b89757c00a3179d657a1cc
|
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# IOStreams
|
2
|
-
[![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![
|
2
|
+
[![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
|
3
3
|
|
4
4
|
IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
|
5
5
|
or storage mechanism transparent to the application.
|
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
|
|
14
14
|
|
15
15
|
Next, check out the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
|
16
16
|
|
17
|
+
## Upgrading to v1.6
|
18
|
+
|
19
|
+
The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
|
20
|
+
the following line to your code:
|
21
|
+
|
22
|
+
~~~ruby
|
23
|
+
IOStreams.include(IOStreams::Deprecated)
|
24
|
+
~~~
|
25
|
+
|
26
|
+
It is important to migrate any use of the old deprecated APIs to the new API, since they will be removed in a future
|
27
|
+
release.
|
28
|
+
|
17
29
|
## Versioning
|
18
30
|
|
19
31
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
data/lib/io_streams/builder.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
module IOStreams
|
2
2
|
# Build the streams that need to be applied to a path during reading or writing.
|
3
3
|
class Builder
|
4
|
-
attr_accessor :file_name
|
4
|
+
attr_accessor :file_name, :format_options
|
5
5
|
attr_reader :streams, :options
|
6
6
|
|
7
7
|
def initialize(file_name = nil)
|
8
|
-
@file_name
|
9
|
-
@streams
|
10
|
-
@options
|
8
|
+
@file_name = file_name
|
9
|
+
@streams = nil
|
10
|
+
@options = nil
|
11
|
+
@format = nil
|
12
|
+
@format_option = nil
|
11
13
|
end
|
12
14
|
|
13
15
|
# Supply an option that is only applied once the file name extensions have been parsed.
|
@@ -88,11 +90,23 @@ module IOStreams
|
|
88
90
|
built_streams.freeze
|
89
91
|
end
|
90
92
|
|
93
|
+
# Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
|
94
|
+
# Returns [nil] if no format is set, or if it cannot be determined from the file_name
|
95
|
+
def format
|
96
|
+
@format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
|
97
|
+
end
|
98
|
+
|
99
|
+
def format=(format)
|
100
|
+
raise(ArgumentError, "Invalid format: #{format.inspect}") unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
|
101
|
+
|
102
|
+
@format = format
|
103
|
+
end
|
104
|
+
|
91
105
|
private
|
92
106
|
|
93
107
|
def class_for_stream(type, stream)
|
94
108
|
ext = IOStreams.extensions[stream.nil? ? nil : stream.to_sym] ||
|
95
|
-
|
109
|
+
raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
|
96
110
|
ext.send("#{type}_class") || raise(ArgumentError, "No #{type} registered for Stream type: #{stream.inspect}")
|
97
111
|
end
|
98
112
|
|
data/lib/io_streams/errors.rb
CHANGED
@@ -9,6 +9,9 @@ module IOStreams
|
|
9
9
|
class MissingHeader < Error
|
10
10
|
end
|
11
11
|
|
12
|
+
class UnknownFormat < Error
|
13
|
+
end
|
14
|
+
|
12
15
|
class TypeMismatch < Error
|
13
16
|
end
|
14
17
|
|
@@ -26,6 +29,15 @@ module IOStreams
|
|
26
29
|
class ValueTooLong < Error
|
27
30
|
end
|
28
31
|
|
32
|
+
class MalformedDataError < RuntimeError
|
33
|
+
attr_reader :line_number
|
34
|
+
|
35
|
+
def initialize(message, line_number)
|
36
|
+
@line_number = line_number
|
37
|
+
super("#{message} on line #{line_number}.")
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
29
41
|
class InvalidLayout < Error
|
30
42
|
end
|
31
43
|
end
|
@@ -38,12 +38,12 @@ module IOStreams
|
|
38
38
|
# Size of blocks to read from the input stream at a time.
|
39
39
|
# Default: 65536 ( 64K )
|
40
40
|
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
41
|
+
# embedded_within: [String]
|
42
|
+
# Supports CSV files where a line may contain an embedded newline.
|
43
|
+
# For CSV files set `embedded_within: '"'`
|
44
|
+
#
|
45
|
+
# Note:
|
46
|
+
# * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
|
47
47
|
def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
|
48
48
|
super(input_stream)
|
49
49
|
|
@@ -86,17 +86,29 @@ module IOStreams
|
|
86
86
|
line_count
|
87
87
|
end
|
88
88
|
|
89
|
-
# Reads each line per the
|
90
|
-
#
|
89
|
+
# Reads each line per the `delimiter`.
|
90
|
+
# Accounts for lines that contain the `delimiter` when that `delimiter` occurs within an `embedded_within` quoted section.
|
91
|
+
# For Example, CSV files can contain newlines embedded within double quotes.
|
91
92
|
def readline
|
92
93
|
line = _readline
|
93
94
|
if line && @embedded_within
|
94
95
|
initial_line_number = @line_number
|
95
96
|
while line.count(@embedded_within).odd?
|
96
|
-
|
97
|
-
|
97
|
+
if eof? || line.length > @buffer_size * 10
|
98
|
+
raise(Errors::MalformedDataError.new(
|
99
|
+
"Unbalanced delimited field, delimiter: #{@embedded_within}",
|
100
|
+
initial_line_number
|
101
|
+
))
|
102
|
+
end
|
98
103
|
line << @delimiter
|
99
|
-
|
104
|
+
next_line = _readline
|
105
|
+
if next_line.nil?
|
106
|
+
raise(Errors::MalformedDataError.new(
|
107
|
+
"Unbalanced delimited field, delimiter: #{@embedded_within}",
|
108
|
+
initial_line_number
|
109
|
+
))
|
110
|
+
end
|
111
|
+
line << next_line
|
100
112
|
end
|
101
113
|
end
|
102
114
|
line
|
data/lib/io_streams/path.rb
CHANGED
@@ -153,7 +153,7 @@ module IOStreams
|
|
153
153
|
# Returns [true|false] whether the file is compressed based on its file extensions.
|
154
154
|
def compressed?
|
155
155
|
# TODO: Look at streams?
|
156
|
-
!(path =~ /\.(zip|gz|gzip|
|
156
|
+
!(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
|
157
157
|
end
|
158
158
|
|
159
159
|
# Returns [true|false] whether the file is encrypted based on its file extensions.
|
data/lib/io_streams/paths/s3.rb
CHANGED
@@ -5,6 +5,9 @@ module IOStreams
|
|
5
5
|
class S3 < IOStreams::Path
|
6
6
|
attr_reader :bucket_name, :client, :options
|
7
7
|
|
8
|
+
# Largest file size supported by the S3 copy object api.
|
9
|
+
S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
|
10
|
+
|
8
11
|
# Arguments:
|
9
12
|
#
|
10
13
|
# url: [String]
|
@@ -188,7 +191,7 @@ module IOStreams
|
|
188
191
|
|
189
192
|
# Make S3 perform direct copies within S3 itself.
|
190
193
|
def copy_to(target_path, convert: true)
|
191
|
-
return super(target_path) if convert
|
194
|
+
return super(target_path) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
|
192
195
|
|
193
196
|
target = IOStreams.new(target_path)
|
194
197
|
return super(target) unless target.is_a?(self.class)
|
@@ -203,7 +206,7 @@ module IOStreams
|
|
203
206
|
return super(source_path) if convert
|
204
207
|
|
205
208
|
source = IOStreams.new(source_path)
|
206
|
-
return super(source)
|
209
|
+
return super(source) if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
|
207
210
|
|
208
211
|
source_name = ::File.join(source.bucket_name, source.path)
|
209
212
|
client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))
|
data/lib/io_streams/stream.rb
CHANGED
@@ -191,11 +191,41 @@ module IOStreams
|
|
191
191
|
end
|
192
192
|
end
|
193
193
|
|
194
|
-
# Set
|
194
|
+
# Set the original file_name
|
195
195
|
def file_name=(file_name)
|
196
196
|
builder.file_name = file_name
|
197
197
|
end
|
198
198
|
|
199
|
+
# Set/get the tabular format
|
200
|
+
def format(format = :none)
|
201
|
+
if format == :none
|
202
|
+
builder.format
|
203
|
+
else
|
204
|
+
builder.format = format
|
205
|
+
self
|
206
|
+
end
|
207
|
+
end
|
208
|
+
|
209
|
+
# Set the tabular format
|
210
|
+
def format=(format)
|
211
|
+
builder.format = format
|
212
|
+
end
|
213
|
+
|
214
|
+
# Set/get the tabular format options
|
215
|
+
def format_options(format_options = :none)
|
216
|
+
if format_options == :none
|
217
|
+
builder.format_options
|
218
|
+
else
|
219
|
+
builder.format_options = format_options
|
220
|
+
self
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
# Set the tabular format_options
|
225
|
+
def format_options=(format_options)
|
226
|
+
builder.format_options = format_options
|
227
|
+
end
|
228
|
+
|
199
229
|
# Returns [String] the last component of this path.
|
200
230
|
# Returns `nil` if no `file_name` was set.
|
201
231
|
#
|
@@ -293,14 +323,26 @@ module IOStreams
|
|
293
323
|
# Iterate over a file / stream returning each line as an array, one at a time.
|
294
324
|
def row_reader(delimiter: nil, embedded_within: nil, **args)
|
295
325
|
line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
|
296
|
-
yield IOStreams::Row::Reader.new(
|
326
|
+
yield IOStreams::Row::Reader.new(
|
327
|
+
io,
|
328
|
+
original_file_name: builder.file_name,
|
329
|
+
format: builder.format,
|
330
|
+
format_options: builder.format_options,
|
331
|
+
**args
|
332
|
+
)
|
297
333
|
end
|
298
334
|
end
|
299
335
|
|
300
336
|
# Iterate over a file / stream returning each line as a hash, one at a time.
|
301
337
|
def record_reader(delimiter: nil, embedded_within: nil, **args)
|
302
338
|
line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
|
303
|
-
yield IOStreams::Record::Reader.new(
|
339
|
+
yield IOStreams::Record::Reader.new(
|
340
|
+
io,
|
341
|
+
original_file_name: builder.file_name,
|
342
|
+
format: builder.format,
|
343
|
+
format_options: builder.format_options,
|
344
|
+
**args
|
345
|
+
)
|
304
346
|
end
|
305
347
|
end
|
306
348
|
|
@@ -320,7 +362,14 @@ module IOStreams
|
|
320
362
|
return block.call(io_stream) if io_stream&.is_a?(IOStreams::Row::Writer)
|
321
363
|
|
322
364
|
line_writer(delimiter: delimiter) do |io|
|
323
|
-
IOStreams::Row::Writer.stream(
|
365
|
+
IOStreams::Row::Writer.stream(
|
366
|
+
io,
|
367
|
+
original_file_name: builder.file_name,
|
368
|
+
format: builder.format,
|
369
|
+
format_options: builder.format_options,
|
370
|
+
**args,
|
371
|
+
&block
|
372
|
+
)
|
324
373
|
end
|
325
374
|
end
|
326
375
|
|
@@ -328,7 +377,13 @@ module IOStreams
|
|
328
377
|
return block.call(io_stream) if io_stream&.is_a?(IOStreams::Record::Writer)
|
329
378
|
|
330
379
|
line_writer(delimiter: delimiter) do |io|
|
331
|
-
IOStreams::Record::Writer.stream(
|
380
|
+
IOStreams::Record::Writer.stream(
|
381
|
+
io,
|
382
|
+
original_file_name: builder.file_name,
|
383
|
+
format: builder.format,
|
384
|
+
format_options: builder.format_options,
|
385
|
+
**args,
|
386
|
+
&block)
|
332
387
|
end
|
333
388
|
end
|
334
389
|
end
|
data/lib/io_streams/tabular.rb
CHANGED
@@ -52,7 +52,7 @@ module IOStreams
|
|
52
52
|
# format: [Symbol]
|
53
53
|
# :csv, :hash, :array, :json, :psv, :fixed
|
54
54
|
#
|
55
|
-
# file_name: [String]
|
55
|
+
# file_name: [IOStreams::Path | String]
|
56
56
|
# When `:format` is not supplied the file name can be used to infer the required format.
|
57
57
|
# Optional. Default: nil
|
58
58
|
#
|
@@ -81,14 +81,19 @@ module IOStreams
|
|
81
81
|
# #as_hash will skip these additional columns entirely as if they were not in the file at all.
|
82
82
|
# false:
|
83
83
|
# Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
|
84
|
-
|
84
|
+
#
|
85
|
+
# default_format: [Symbol]
|
86
|
+
# When the format is not supplied, and the format cannot be inferred from the supplied file name
|
87
|
+
# then this default format will be used.
|
88
|
+
# Default: :csv
|
89
|
+
# Set to nil to force it to raise an exception when the format is undefined.
|
90
|
+
def initialize(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args)
|
85
91
|
@header = Header.new(**args)
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
end
|
92
|
+
@format = file_name && format.nil? ? self.class.format_from_file_name(file_name) : format
|
93
|
+
@format ||= default_format
|
94
|
+
raise(UnknownFormat, "The format cannot be inferred from the file name: #{file_name}") unless @format
|
95
|
+
|
96
|
+
klass = self.class.parser_class(@format)
|
92
97
|
@parser = format_options ? klass.new(**format_options) : klass.new
|
93
98
|
end
|
94
99
|
|
@@ -162,9 +167,9 @@ module IOStreams
|
|
162
167
|
# Example:
|
163
168
|
# register_format(:csv, IOStreams::Tabular::Parser::Csv)
|
164
169
|
def self.register_format(format, parser)
|
165
|
-
raise(ArgumentError, "Invalid format #{format.inspect}") unless format.
|
170
|
+
raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/
|
166
171
|
|
167
|
-
@formats[format.
|
172
|
+
@formats[format.to_sym] = parser
|
168
173
|
end
|
169
174
|
|
170
175
|
# De-Register a file format
|
@@ -187,23 +192,18 @@ module IOStreams
|
|
187
192
|
# A registry to hold formats for processing files during upload or download
|
188
193
|
@formats = {}
|
189
194
|
|
190
|
-
|
191
|
-
|
195
|
+
# Returns the registered format that will be used for the supplied file name.
|
196
|
+
def self.format_from_file_name(file_name)
|
197
|
+
file_name.to_s.split(".").reverse_each { |ext| return ext.to_sym if @formats.include?(ext.to_sym) }
|
198
|
+
nil
|
192
199
|
end
|
193
200
|
|
194
|
-
# Returns the parser
|
195
|
-
def self.
|
196
|
-
format
|
197
|
-
|
198
|
-
if @formats.include?(ext.to_sym)
|
199
|
-
format = ext.to_sym
|
200
|
-
break
|
201
|
-
end
|
202
|
-
end
|
203
|
-
parser_class(format)
|
201
|
+
# Returns the parser class for the registered format.
|
202
|
+
def self.parser_class(format)
|
203
|
+
@formats[format.nil? ? nil : format.to_sym] ||
|
204
|
+
raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
|
204
205
|
end
|
205
206
|
|
206
|
-
register_format(nil, IOStreams::Tabular::Parser::Csv)
|
207
207
|
register_format(:array, IOStreams::Tabular::Parser::Array)
|
208
208
|
register_format(:csv, IOStreams::Tabular::Parser::Csv)
|
209
209
|
register_format(:fixed, IOStreams::Tabular::Parser::Fixed)
|
@@ -5,8 +5,10 @@ module IOStreams
|
|
5
5
|
class Csv < Base
|
6
6
|
attr_reader :csv_parser
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
unless RUBY_VERSION.to_f >= 2.6
|
9
|
+
def initialize
|
10
|
+
@csv_parser = Utility::CSVRow.new
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
14
|
# Returns [Array<String>] the header row.
|
@@ -6,10 +6,7 @@ module IOStreams
|
|
6
6
|
# 2 to 3 times better performance than CSV.parse_line and considerably less
|
7
7
|
# garbage collection required.
|
8
8
|
#
|
9
|
-
# Note:
|
10
|
-
# This parser does not support line feeds embedded in quoted fields since
|
11
|
-
# the file is broken apart based on line feeds during the upload process and
|
12
|
-
# is then processed by each worker on a line by line basis.
|
9
|
+
# Note: Only used prior to Ruby 2.6
|
13
10
|
class CSVRow < ::CSV
|
14
11
|
UTF8_ENCODING = Encoding.find("UTF-8").freeze
|
15
12
|
|
data/lib/io_streams/version.rb
CHANGED
data/test/builder_test.rb
CHANGED
@@ -41,6 +41,35 @@ class BuilderTest < Minitest::Test
|
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
44
|
+
describe "#format" do
|
45
|
+
it "detects the format from the file name" do
|
46
|
+
streams = IOStreams::Builder.new("abc.json")
|
47
|
+
assert_equal :json, streams.format
|
48
|
+
end
|
49
|
+
|
50
|
+
it "is nil if the file name has no meaningful format" do
|
51
|
+
assert_nil streams.format
|
52
|
+
end
|
53
|
+
|
54
|
+
it "returns set format with no file_name" do
|
55
|
+
streams = IOStreams::Builder.new
|
56
|
+
streams.format = :csv
|
57
|
+
assert_equal :csv, streams.format
|
58
|
+
end
|
59
|
+
|
60
|
+
it "returns set format with file_name" do
|
61
|
+
streams = IOStreams::Builder.new("abc.json")
|
62
|
+
streams.format = :csv
|
63
|
+
assert_equal :csv, streams.format
|
64
|
+
end
|
65
|
+
|
66
|
+
it "validates bad format" do
|
67
|
+
assert_raises ArgumentError do
|
68
|
+
streams.format = :blah
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
44
73
|
describe "#stream" do
|
45
74
|
it "adds one stream" do
|
46
75
|
streams.stream(:pgp, passphrase: "unlock-me")
|