iostreams 1.5.1 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -1
- data/lib/io_streams/builder.rb +19 -5
- data/lib/io_streams/errors.rb +12 -0
- data/lib/io_streams/io_streams.rb +0 -2
- data/lib/io_streams/line/reader.rb +23 -11
- data/lib/io_streams/path.rb +1 -1
- data/lib/io_streams/paths/s3.rb +5 -2
- data/lib/io_streams/stream.rb +60 -5
- data/lib/io_streams/tabular.rb +23 -23
- data/lib/io_streams/tabular/parser/csv.rb +4 -2
- data/lib/io_streams/tabular/utility/csv_row.rb +1 -4
- data/lib/io_streams/version.rb +1 -1
- data/test/builder_test.rb +29 -0
- data/test/deprecated_test.rb +2 -0
- data/test/files/test.psv +4 -0
- data/test/files/unclosed_quote_large_test.csv +1658 -0
- data/test/files/unclosed_quote_test2.csv +3 -0
- data/test/line_reader_test.rb +30 -4
- data/test/stream_test.rb +174 -8
- metadata +47 -42
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e4fb750e5c3779000fac8f21803b84a5977f111f481b7d4e11a117149cd0d9e6
|
4
|
+
data.tar.gz: '09cb2dc7ff67afd44d3ebcfefa29f6a1840d1b3dd871c33bc1a06914cd8bee2d'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0cf2db14e03b9e81e0e39119b35f293408a9d4b6bf3365abc724d95d7376abf7af73720b8fd8d000b70b2eb7abbae20055753e845a77dec2d8d7297e5b6693ba
|
7
|
+
data.tar.gz: d9fa2194965ef99a99e1ecd0655e84066e6021236d1494336c40693b103d3c7ecb7e5a87e8f179a38c990ad7670d6bd4901b6a7cd8b89757c00a3179d657a1cc
|
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# IOStreams
|
2
|
-
[](https://rubygems.org/gems/iostreams) [](https://rubygems.org/gems/iostreams) [](https://rubygems.org/gems/iostreams) [](http://opensource.org/licenses/Apache-2.0)  [-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
|
3
3
|
|
4
4
|
IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
|
5
5
|
or storage mechanism transparent to the application.
|
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
|
|
14
14
|
|
15
15
|
Next, checkout the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
|
16
16
|
|
17
|
+
## Upgrading to v1.6
|
18
|
+
|
19
|
+
The old, deprecated APIs are no longer loaded by default as of v1.6. To restore support for the deprecated API, add
|
20
|
+
the following line to your code:
|
21
|
+
|
22
|
+
~~~ruby
|
23
|
+
IOStreams.include(IOStreams::Deprecated)
|
24
|
+
~~~
|
25
|
+
|
26
|
+
It is important to migrate any use of the old deprecated APIs to the new API, since they will be removed in a future
|
27
|
+
release.
|
28
|
+
|
17
29
|
## Versioning
|
18
30
|
|
19
31
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
data/lib/io_streams/builder.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
module IOStreams
|
2
2
|
# Build the streams that need to be applied to a path during reading or writing.
|
3
3
|
class Builder
|
4
|
-
attr_accessor :file_name
|
4
|
+
attr_accessor :file_name, :format_options
|
5
5
|
attr_reader :streams, :options
|
6
6
|
|
7
7
|
def initialize(file_name = nil)
|
8
|
-
@file_name
|
9
|
-
@streams
|
10
|
-
@options
|
8
|
+
@file_name = file_name
|
9
|
+
@streams = nil
|
10
|
+
@options = nil
|
11
|
+
@format = nil
|
12
|
+
@format_option = nil
|
11
13
|
end
|
12
14
|
|
13
15
|
# Supply an option that is only applied once the file name extensions have been parsed.
|
@@ -88,11 +90,23 @@ module IOStreams
|
|
88
90
|
built_streams.freeze
|
89
91
|
end
|
90
92
|
|
93
|
+
# Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
|
94
|
+
# Returns [nil] if no format is set, or if it cannot be determined from the file_name
|
95
|
+
def format
|
96
|
+
@format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
|
97
|
+
end
|
98
|
+
|
99
|
+
def format=(format)
|
100
|
+
raise(ArgumentError, "Invalid format: #{format.inspect}") unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
|
101
|
+
|
102
|
+
@format = format
|
103
|
+
end
|
104
|
+
|
91
105
|
private
|
92
106
|
|
93
107
|
def class_for_stream(type, stream)
|
94
108
|
ext = IOStreams.extensions[stream.nil? ? nil : stream.to_sym] ||
|
95
|
-
|
109
|
+
raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
|
96
110
|
ext.send("#{type}_class") || raise(ArgumentError, "No #{type} registered for Stream type: #{stream.inspect}")
|
97
111
|
end
|
98
112
|
|
data/lib/io_streams/errors.rb
CHANGED
@@ -9,6 +9,9 @@ module IOStreams
|
|
9
9
|
class MissingHeader < Error
|
10
10
|
end
|
11
11
|
|
12
|
+
class UnknownFormat < Error
|
13
|
+
end
|
14
|
+
|
12
15
|
class TypeMismatch < Error
|
13
16
|
end
|
14
17
|
|
@@ -26,6 +29,15 @@ module IOStreams
|
|
26
29
|
class ValueTooLong < Error
|
27
30
|
end
|
28
31
|
|
32
|
+
class MalformedDataError < RuntimeError
|
33
|
+
attr_reader :line_number
|
34
|
+
|
35
|
+
def initialize(message, line_number)
|
36
|
+
@line_number = line_number
|
37
|
+
super("#{message} on line #{line_number}.")
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
29
41
|
class InvalidLayout < Error
|
30
42
|
end
|
31
43
|
end
|
@@ -38,12 +38,12 @@ module IOStreams
|
|
38
38
|
# Size of blocks to read from the input stream at a time.
|
39
39
|
# Default: 65536 ( 64K )
|
40
40
|
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
41
|
+
# embedded_within: [String]
|
42
|
+
# Supports CSV files where a line may contain an embedded newline.
|
43
|
+
# For CSV files set `embedded_within: '"'`
|
44
|
+
#
|
45
|
+
# Note:
|
46
|
+
# * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
|
47
47
|
def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
|
48
48
|
super(input_stream)
|
49
49
|
|
@@ -86,17 +86,29 @@ module IOStreams
|
|
86
86
|
line_count
|
87
87
|
end
|
88
88
|
|
89
|
-
# Reads each line per the
|
90
|
-
#
|
89
|
+
# Reads each line per the `delimiter`.
|
90
|
+
# Accounts for lines that contain the `delimiter` when that `delimiter` occurs within an `embedded_within` quoted section.
|
91
|
+
# For example, CSV files can contain newlines embedded within double quotes.
|
91
92
|
def readline
|
92
93
|
line = _readline
|
93
94
|
if line && @embedded_within
|
94
95
|
initial_line_number = @line_number
|
95
96
|
while line.count(@embedded_within).odd?
|
96
|
-
|
97
|
-
|
97
|
+
if eof? || line.length > @buffer_size * 10
|
98
|
+
raise(Errors::MalformedDataError.new(
|
99
|
+
"Unbalanced delimited field, delimiter: #{@embedded_within}",
|
100
|
+
initial_line_number
|
101
|
+
))
|
102
|
+
end
|
98
103
|
line << @delimiter
|
99
|
-
|
104
|
+
next_line = _readline
|
105
|
+
if next_line.nil?
|
106
|
+
raise(Errors::MalformedDataError.new(
|
107
|
+
"Unbalanced delimited field, delimiter: #{@embedded_within}",
|
108
|
+
initial_line_number
|
109
|
+
))
|
110
|
+
end
|
111
|
+
line << next_line
|
100
112
|
end
|
101
113
|
end
|
102
114
|
line
|
data/lib/io_streams/path.rb
CHANGED
@@ -153,7 +153,7 @@ module IOStreams
|
|
153
153
|
# Returns [true|false] whether the file is compressed based on its file extensions.
|
154
154
|
def compressed?
|
155
155
|
# TODO: Look at streams?
|
156
|
-
!(path =~ /\.(zip|gz|gzip|
|
156
|
+
!(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
|
157
157
|
end
|
158
158
|
|
159
159
|
# Returns [true|false] whether the file is encrypted based on its file extensions.
|
data/lib/io_streams/paths/s3.rb
CHANGED
@@ -5,6 +5,9 @@ module IOStreams
|
|
5
5
|
class S3 < IOStreams::Path
|
6
6
|
attr_reader :bucket_name, :client, :options
|
7
7
|
|
8
|
+
# Largest file size supported by the S3 copy object api.
|
9
|
+
S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
|
10
|
+
|
8
11
|
# Arguments:
|
9
12
|
#
|
10
13
|
# url: [String]
|
@@ -188,7 +191,7 @@ module IOStreams
|
|
188
191
|
|
189
192
|
# Make S3 perform direct copies within S3 itself.
|
190
193
|
def copy_to(target_path, convert: true)
|
191
|
-
return super(target_path) if convert
|
194
|
+
return super(target_path) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
|
192
195
|
|
193
196
|
target = IOStreams.new(target_path)
|
194
197
|
return super(target) unless target.is_a?(self.class)
|
@@ -203,7 +206,7 @@ module IOStreams
|
|
203
206
|
return super(source_path) if convert
|
204
207
|
|
205
208
|
source = IOStreams.new(source_path)
|
206
|
-
return super(source)
|
209
|
+
return super(source) if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
|
207
210
|
|
208
211
|
source_name = ::File.join(source.bucket_name, source.path)
|
209
212
|
client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))
|
data/lib/io_streams/stream.rb
CHANGED
@@ -191,11 +191,41 @@ module IOStreams
|
|
191
191
|
end
|
192
192
|
end
|
193
193
|
|
194
|
-
# Set
|
194
|
+
# Set the original file_name
|
195
195
|
def file_name=(file_name)
|
196
196
|
builder.file_name = file_name
|
197
197
|
end
|
198
198
|
|
199
|
+
# Set/get the tabular format
|
200
|
+
def format(format = :none)
|
201
|
+
if format == :none
|
202
|
+
builder.format
|
203
|
+
else
|
204
|
+
builder.format = format
|
205
|
+
self
|
206
|
+
end
|
207
|
+
end
|
208
|
+
|
209
|
+
# Set the tabular format
|
210
|
+
def format=(format)
|
211
|
+
builder.format = format
|
212
|
+
end
|
213
|
+
|
214
|
+
# Set/get the tabular format options
|
215
|
+
def format_options(format_options = :none)
|
216
|
+
if format_options == :none
|
217
|
+
builder.format_options
|
218
|
+
else
|
219
|
+
builder.format_options = format_options
|
220
|
+
self
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
# Set the tabular format_options
|
225
|
+
def format_options=(format_options)
|
226
|
+
builder.format_options = format_options
|
227
|
+
end
|
228
|
+
|
199
229
|
# Returns [String] the last component of this path.
|
200
230
|
# Returns `nil` if no `file_name` was set.
|
201
231
|
#
|
@@ -293,14 +323,26 @@ module IOStreams
|
|
293
323
|
# Iterate over a file / stream returning each line as an array, one at a time.
|
294
324
|
def row_reader(delimiter: nil, embedded_within: nil, **args)
|
295
325
|
line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
|
296
|
-
yield IOStreams::Row::Reader.new(
|
326
|
+
yield IOStreams::Row::Reader.new(
|
327
|
+
io,
|
328
|
+
original_file_name: builder.file_name,
|
329
|
+
format: builder.format,
|
330
|
+
format_options: builder.format_options,
|
331
|
+
**args
|
332
|
+
)
|
297
333
|
end
|
298
334
|
end
|
299
335
|
|
300
336
|
# Iterate over a file / stream returning each line as a hash, one at a time.
|
301
337
|
def record_reader(delimiter: nil, embedded_within: nil, **args)
|
302
338
|
line_reader(delimiter: delimiter, embedded_within: embedded_within) do |io|
|
303
|
-
yield IOStreams::Record::Reader.new(
|
339
|
+
yield IOStreams::Record::Reader.new(
|
340
|
+
io,
|
341
|
+
original_file_name: builder.file_name,
|
342
|
+
format: builder.format,
|
343
|
+
format_options: builder.format_options,
|
344
|
+
**args
|
345
|
+
)
|
304
346
|
end
|
305
347
|
end
|
306
348
|
|
@@ -320,7 +362,14 @@ module IOStreams
|
|
320
362
|
return block.call(io_stream) if io_stream&.is_a?(IOStreams::Row::Writer)
|
321
363
|
|
322
364
|
line_writer(delimiter: delimiter) do |io|
|
323
|
-
IOStreams::Row::Writer.stream(
|
365
|
+
IOStreams::Row::Writer.stream(
|
366
|
+
io,
|
367
|
+
original_file_name: builder.file_name,
|
368
|
+
format: builder.format,
|
369
|
+
format_options: builder.format_options,
|
370
|
+
**args,
|
371
|
+
&block
|
372
|
+
)
|
324
373
|
end
|
325
374
|
end
|
326
375
|
|
@@ -328,7 +377,13 @@ module IOStreams
|
|
328
377
|
return block.call(io_stream) if io_stream&.is_a?(IOStreams::Record::Writer)
|
329
378
|
|
330
379
|
line_writer(delimiter: delimiter) do |io|
|
331
|
-
IOStreams::Record::Writer.stream(
|
380
|
+
IOStreams::Record::Writer.stream(
|
381
|
+
io,
|
382
|
+
original_file_name: builder.file_name,
|
383
|
+
format: builder.format,
|
384
|
+
format_options: builder.format_options,
|
385
|
+
**args,
|
386
|
+
&block)
|
332
387
|
end
|
333
388
|
end
|
334
389
|
end
|
data/lib/io_streams/tabular.rb
CHANGED
@@ -52,7 +52,7 @@ module IOStreams
|
|
52
52
|
# format: [Symbol]
|
53
53
|
# :csv, :hash, :array, :json, :psv, :fixed
|
54
54
|
#
|
55
|
-
# file_name: [String]
|
55
|
+
# file_name: [IOStreams::Path | String]
|
56
56
|
# When `:format` is not supplied the file name can be used to infer the required format.
|
57
57
|
# Optional. Default: nil
|
58
58
|
#
|
@@ -81,14 +81,19 @@ module IOStreams
|
|
81
81
|
# #as_hash will skip these additional columns entirely as if they were not in the file at all.
|
82
82
|
# false:
|
83
83
|
# Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
|
84
|
-
|
84
|
+
#
|
85
|
+
# default_format: [Symbol]
|
86
|
+
# When the format is not supplied, and the format cannot be inferred from the supplied file name
|
87
|
+
# then this default format will be used.
|
88
|
+
# Default: :csv
|
89
|
+
# Set to nil to force it to raise an exception when the format is undefined.
|
90
|
+
def initialize(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args)
|
85
91
|
@header = Header.new(**args)
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
end
|
92
|
+
@format = file_name && format.nil? ? self.class.format_from_file_name(file_name) : format
|
93
|
+
@format ||= default_format
|
94
|
+
raise(UnknownFormat, "The format cannot be inferred from the file name: #{file_name}") unless @format
|
95
|
+
|
96
|
+
klass = self.class.parser_class(@format)
|
92
97
|
@parser = format_options ? klass.new(**format_options) : klass.new
|
93
98
|
end
|
94
99
|
|
@@ -162,9 +167,9 @@ module IOStreams
|
|
162
167
|
# Example:
|
163
168
|
# register_format(:csv, IOStreams::Tabular::Parser::Csv)
|
164
169
|
def self.register_format(format, parser)
|
165
|
-
raise(ArgumentError, "Invalid format #{format.inspect}") unless format.
|
170
|
+
raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/
|
166
171
|
|
167
|
-
@formats[format.
|
172
|
+
@formats[format.to_sym] = parser
|
168
173
|
end
|
169
174
|
|
170
175
|
# De-Register a file format
|
@@ -187,23 +192,18 @@ module IOStreams
|
|
187
192
|
# A registry to hold formats for processing files during upload or download
|
188
193
|
@formats = {}
|
189
194
|
|
190
|
-
|
191
|
-
|
195
|
+
# Returns the registered format that will be used for the supplied file name.
|
196
|
+
def self.format_from_file_name(file_name)
|
197
|
+
file_name.to_s.split(".").reverse_each { |ext| return ext.to_sym if @formats.include?(ext.to_sym) }
|
198
|
+
nil
|
192
199
|
end
|
193
200
|
|
194
|
-
# Returns the parser
|
195
|
-
def self.
|
196
|
-
format
|
197
|
-
|
198
|
-
if @formats.include?(ext.to_sym)
|
199
|
-
format = ext.to_sym
|
200
|
-
break
|
201
|
-
end
|
202
|
-
end
|
203
|
-
parser_class(format)
|
201
|
+
# Returns the parser class for the registered format.
|
202
|
+
def self.parser_class(format)
|
203
|
+
@formats[format.nil? ? nil : format.to_sym] ||
|
204
|
+
raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
|
204
205
|
end
|
205
206
|
|
206
|
-
register_format(nil, IOStreams::Tabular::Parser::Csv)
|
207
207
|
register_format(:array, IOStreams::Tabular::Parser::Array)
|
208
208
|
register_format(:csv, IOStreams::Tabular::Parser::Csv)
|
209
209
|
register_format(:fixed, IOStreams::Tabular::Parser::Fixed)
|
@@ -5,8 +5,10 @@ module IOStreams
|
|
5
5
|
class Csv < Base
|
6
6
|
attr_reader :csv_parser
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
unless RUBY_VERSION.to_f >= 2.6
|
9
|
+
def initialize
|
10
|
+
@csv_parser = Utility::CSVRow.new
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
14
|
# Returns [Array<String>] the header row.
|
@@ -6,10 +6,7 @@ module IOStreams
|
|
6
6
|
# 2 to 3 times better performance than CSV.parse_line and considerably less
|
7
7
|
# garbage collection required.
|
8
8
|
#
|
9
|
-
# Note:
|
10
|
-
# This parser does not support line feeds embedded in quoted fields since
|
11
|
-
# the file is broken apart based on line feeds during the upload process and
|
12
|
-
# is then processed by each worker on a line by line basis.
|
9
|
+
# Note: Only used prior to Ruby 2.6
|
13
10
|
class CSVRow < ::CSV
|
14
11
|
UTF8_ENCODING = Encoding.find("UTF-8").freeze
|
15
12
|
|
data/lib/io_streams/version.rb
CHANGED
data/test/builder_test.rb
CHANGED
@@ -41,6 +41,35 @@ class BuilderTest < Minitest::Test
|
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
44
|
+
describe "#format" do
|
45
|
+
it "detects the format from the file name" do
|
46
|
+
streams = IOStreams::Builder.new("abc.json")
|
47
|
+
assert_equal :json, streams.format
|
48
|
+
end
|
49
|
+
|
50
|
+
it "is nil if the file name has no meaningful format" do
|
51
|
+
assert_nil streams.format
|
52
|
+
end
|
53
|
+
|
54
|
+
it "returns set format with no file_name" do
|
55
|
+
streams = IOStreams::Builder.new
|
56
|
+
streams.format = :csv
|
57
|
+
assert_equal :csv, streams.format
|
58
|
+
end
|
59
|
+
|
60
|
+
it "returns set format with file_name" do
|
61
|
+
streams = IOStreams::Builder.new("abc.json")
|
62
|
+
streams.format = :csv
|
63
|
+
assert_equal :csv, streams.format
|
64
|
+
end
|
65
|
+
|
66
|
+
it "validates bad format" do
|
67
|
+
assert_raises ArgumentError do
|
68
|
+
streams.format = :blah
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
44
73
|
describe "#stream" do
|
45
74
|
it "adds one stream" do
|
46
75
|
streams.stream(:pgp, passphrase: "unlock-me")
|