iostreams 0.16.2 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/io_streams/io_streams.rb +102 -26
- data/lib/io_streams/line/reader.rb +40 -20
- data/lib/io_streams/record/reader.rb +3 -3
- data/lib/io_streams/s3.rb +8 -10
- data/lib/io_streams/s3/reader.rb +11 -49
- data/lib/io_streams/s3/writer.rb +5 -3
- data/lib/io_streams/streams.rb +3 -3
- data/lib/io_streams/version.rb +1 -1
- data/lib/iostreams.rb +1 -4
- data/test/files/embedded_lines_test.csv +7 -0
- data/test/files/unclosed_quote_test.csv +4 -0
- data/test/io_streams_test.rb +10 -0
- data/test/line_reader_test.rb +53 -4
- data/test/s3_reader_test.rb +41 -0
- data/test/s3_writer_test.rb +41 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: afb047a3682e7cbb9d953c303ebf2f525ff9975248ac550b6eded486ff73a72b
|
4
|
+
data.tar.gz: b14b13f5c23663b1b173d4ee767a93ea74b381f4c89ae5b44e4610bfbe4fabe4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00a5fa08bf88bbcc9cc3c1418f90d2dd55c3da0b129cd472ef571e27f0dec653214f2f1780069801e8fd87c676c70e097fd79e86085fcbd63e3def501f779ec5
|
7
|
+
data.tar.gz: e193d7dacbd65625c9159fb61d13d3d0b005bd40e373047e2ece791c91c5c1798c881a398c0246fda8c424bd875dead9be3c5d3cc00765b0b2977d3decf3a08d
|
@@ -70,6 +70,15 @@ module IOStreams
|
|
70
70
|
end
|
71
71
|
|
72
72
|
# Iterate over a file / stream returning one line at a time.
|
73
|
+
# Line breaks embedded within double quotes will not be treated as line delimiters if
|
74
|
+
# 1. The file name contains .csv
|
75
|
+
# 2. Or the embedded_within argument is set
|
76
|
+
#
|
77
|
+
# Example: Supply custom options
|
78
|
+
# IOStreams.each_line(file_name, embedded_within: '"') do |line|
|
79
|
+
# puts line
|
80
|
+
# end
|
81
|
+
#
|
73
82
|
def self.each_line(file_name_or_io, encoding: nil, encode_cleaner: nil, encode_replace: nil, **args, &block)
|
74
83
|
line_reader(file_name_or_io, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace, **args) do |line_stream|
|
75
84
|
line_stream.each(&block)
|
@@ -77,6 +86,15 @@ module IOStreams
|
|
77
86
|
end
|
78
87
|
|
79
88
|
# Iterate over a file / stream returning one row at a time.
|
89
|
+
# Line breaks embedded within double quotes will not be treated as line delimiters if
|
90
|
+
# 1. The file name contains .csv
|
91
|
+
# 2. Or the embedded_within argument is set
|
92
|
+
#
|
93
|
+
# Example: Supply custom options
|
94
|
+
# IOStreams.each_row(file_name, embedded_within: '"') do |line|
|
95
|
+
# puts line
|
96
|
+
# end
|
97
|
+
#
|
80
98
|
def self.each_row(file_name_or_io, encoding: nil, encode_cleaner: nil, encode_replace: nil, **args, &block)
|
81
99
|
row_reader(file_name_or_io, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace, **args) do |row_stream|
|
82
100
|
row_stream.each(&block)
|
@@ -90,6 +108,15 @@ module IOStreams
|
|
90
108
|
# Each record / line is returned one at a time so that very large files
|
91
109
|
# can be read without having to load the entire file into memory.
|
92
110
|
#
|
111
|
+
# Line breaks embedded within double quotes will not be treated as line delimiters if
|
112
|
+
# 1. The file name contains .csv
|
113
|
+
# 2. Or the embedded_within argument is set
|
114
|
+
#
|
115
|
+
# Example: Supply custom options
|
116
|
+
# IOStreams.each_record(file_name, embedded_within: '"') do |line|
|
117
|
+
# puts line
|
118
|
+
# end
|
119
|
+
#
|
93
120
|
# Example:
|
94
121
|
# file_name = 'customer_data.csv.pgp'
|
95
122
|
# IOStreams.each_record(file_name) do |hash|
|
@@ -291,20 +318,21 @@ module IOStreams
|
|
291
318
|
# its extension(s)
|
292
319
|
#
|
293
320
|
# Example Zip file:
|
294
|
-
#
|
321
|
+
# IOStreams.streams_for_file_name('myfile.zip')
|
295
322
|
# => [ :zip ]
|
296
323
|
#
|
297
324
|
# Example Encrypted Gzip file:
|
298
|
-
#
|
325
|
+
# IOStreams.streams_for_file_name('myfile.csv.gz.enc')
|
299
326
|
# => [ :gz, :enc ]
|
300
327
|
#
|
301
328
|
# Example plain text / binary file:
|
302
|
-
#
|
303
|
-
# => [
|
329
|
+
# IOStreams.streams_for_file_name('myfile.csv')
|
330
|
+
# => []
|
304
331
|
def self.streams_for_file_name(file_name)
|
305
332
|
raise ArgumentError.new('File name cannot be nil') if file_name.nil?
|
306
333
|
raise ArgumentError.new("File name must be a string: #{file_name.inspect}, class: #{file_name.class}") unless file_name.is_a?(String)
|
307
|
-
|
334
|
+
|
335
|
+
parts = ::File.basename(file_name).split('.')
|
308
336
|
extensions = []
|
309
337
|
while extension = parts.pop
|
310
338
|
sym = extension.downcase.to_sym
|
@@ -314,12 +342,31 @@ module IOStreams
|
|
314
342
|
extensions
|
315
343
|
end
|
316
344
|
|
345
|
+
# Extract the URI scheme (for example :s3) from the file name, or nil if no scheme was supplied
|
346
|
+
def self.scheme_for_file_name(file_name)
|
347
|
+
raise ArgumentError.new('File name cannot be nil') if file_name.nil?
|
348
|
+
raise ArgumentError.new("File name must be a string: #{file_name.inspect}, class: #{file_name.class}") unless file_name.is_a?(String)
|
349
|
+
|
350
|
+
if matches = file_name.match(/\A(\w+):\/\//)
|
351
|
+
matches[1].downcase.to_sym
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|
317
355
|
# Iterate over a file / stream returning each record/line one at a time.
|
318
|
-
|
356
|
+
# When embedded_within is not supplied, it defaults to a double quote if the file name (or the file_name argument) contains .csv.
|
357
|
+
def self.line_reader(file_name_or_io, streams: nil, file_name: nil, encoding: nil, encode_cleaner: nil, encode_replace: nil, embedded_within: nil, **args, &block)
|
358
|
+
|
319
359
|
return yield(file_name_or_io) if file_name_or_io.is_a?(IOStreams::Line::Reader) || file_name_or_io.is_a?(Array)
|
320
360
|
|
361
|
+
# TODO: needs to be improved
|
362
|
+
if embedded_within.nil? && file_name_or_io.is_a?(String)
|
363
|
+
embedded_within = '"' if file_name_or_io.include?('.csv')
|
364
|
+
elsif embedded_within.nil? && file_name
|
365
|
+
embedded_within = '"' if file_name.include?('.csv')
|
366
|
+
end
|
367
|
+
|
321
368
|
reader(file_name_or_io, streams: streams, file_name: file_name, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace) do |io|
|
322
|
-
IOStreams::Line::Reader.open(io, **args, &block)
|
369
|
+
IOStreams::Line::Reader.open(io, embedded_within: embedded_within, **args, &block)
|
323
370
|
end
|
324
371
|
end
|
325
372
|
|
@@ -331,6 +378,7 @@ module IOStreams
|
|
331
378
|
encoding: nil,
|
332
379
|
encode_cleaner: nil,
|
333
380
|
encode_replace: nil,
|
381
|
+
embedded_within: nil,
|
334
382
|
**args,
|
335
383
|
&block)
|
336
384
|
|
@@ -338,12 +386,13 @@ module IOStreams
|
|
338
386
|
|
339
387
|
line_reader(
|
340
388
|
file_name_or_io,
|
341
|
-
streams:
|
342
|
-
delimiter:
|
343
|
-
file_name:
|
344
|
-
encoding:
|
345
|
-
encode_cleaner:
|
346
|
-
encode_replace:
|
389
|
+
streams: streams,
|
390
|
+
delimiter: delimiter,
|
391
|
+
file_name: file_name,
|
392
|
+
encoding: encoding,
|
393
|
+
encode_cleaner: encode_cleaner,
|
394
|
+
encode_replace: encode_replace,
|
395
|
+
embedded_within: embedded_within
|
347
396
|
) do |io|
|
348
397
|
file_name = file_name_or_io if file_name.nil? && file_name_or_io.is_a?(String)
|
349
398
|
IOStreams::Row::Reader.open(io, file_name: file_name, **args, &block)
|
@@ -358,21 +407,23 @@ module IOStreams
|
|
358
407
|
encoding: nil,
|
359
408
|
encode_cleaner: nil,
|
360
409
|
encode_replace: nil,
|
410
|
+
embedded_within: nil,
|
361
411
|
**args,
|
362
412
|
&block)
|
363
413
|
|
364
414
|
return yield(file_name_or_io) if file_name_or_io.is_a?(IOStreams::Record::Reader)
|
365
415
|
|
366
|
-
line_reader(
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
416
|
+
line_reader(file_name_or_io,
|
417
|
+
streams: streams,
|
418
|
+
delimiter: delimiter,
|
419
|
+
file_name: file_name,
|
420
|
+
encoding: encoding,
|
421
|
+
encode_cleaner: encode_cleaner,
|
422
|
+
encode_replace: encode_replace,
|
423
|
+
embedded_within: embedded_within
|
374
424
|
) do |io|
|
375
425
|
|
426
|
+
|
376
427
|
file_name = file_name_or_io if file_name.nil? && file_name_or_io.is_a?(String)
|
377
428
|
IOStreams::Record::Reader.open(io, file_name: file_name, **args, &block)
|
378
429
|
end
|
@@ -401,6 +452,16 @@ module IOStreams
|
|
401
452
|
@extensions.delete(extension.to_sym)
|
402
453
|
end
|
403
454
|
|
455
|
+
# Register a URI scheme and the reader and writer streaming classes
|
456
|
+
#
|
457
|
+
# Example:
|
458
|
+
# # IOStreams::S3::Reader and IOStreams::S3::Writer must implement .open
|
459
|
+
# register_scheme(:s3, IOStreams::S3::Reader, IOStreams::S3::Writer)
|
460
|
+
def self.register_scheme(scheme, reader_class, writer_class)
|
461
|
+
raise(ArgumentError, "Invalid scheme #{scheme.inspect}") unless scheme.nil? || scheme.to_s =~ /\A\w+\Z/
|
462
|
+
@schemes[scheme.nil? ? nil : scheme.to_sym] = Extension.new(reader_class, writer_class)
|
463
|
+
end
|
464
|
+
|
404
465
|
# Helper method: Returns [true|false] if a value is blank?
|
405
466
|
def self.blank?(value)
|
406
467
|
if value.nil?
|
@@ -416,6 +477,7 @@ module IOStreams
|
|
416
477
|
|
417
478
|
# A registry to hold formats for processing files during upload or download
|
418
479
|
@extensions = {}
|
480
|
+
@schemes = {}
|
419
481
|
|
420
482
|
# Struct to hold the Stream and options if any
|
421
483
|
StreamStruct = Struct.new(:klass, :options)
|
@@ -438,8 +500,10 @@ module IOStreams
|
|
438
500
|
if streams.nil?
|
439
501
|
streams = file_name_or_io.is_a?(String) ? streams_for_file_name(file_name_or_io) : [nil]
|
440
502
|
end
|
503
|
+
scheme = scheme_for_file_name(file_name_or_io) if file_name_or_io.is_a?(String)
|
441
504
|
|
442
505
|
stream_structs = streams_for(type, streams)
|
506
|
+
stream_structs << stream_struct_for_scheme(type, scheme) if stream_structs.empty? || scheme
|
443
507
|
|
444
508
|
# Add encoding stream if any of its options are present
|
445
509
|
if encoding || encode_cleaner || encode_replace
|
@@ -466,7 +530,6 @@ module IOStreams
|
|
466
530
|
if params.is_a?(Symbol)
|
467
531
|
[stream_struct_for_stream(type, params)]
|
468
532
|
elsif params.is_a?(Array)
|
469
|
-
return [stream_struct_for_stream(type, nil)] if params.empty?
|
470
533
|
a = []
|
471
534
|
params.each do |stream|
|
472
535
|
if stream.is_a?(Hash)
|
@@ -491,8 +554,14 @@ module IOStreams
|
|
491
554
|
StreamStruct.new(klass, options)
|
492
555
|
end
|
493
556
|
|
557
|
+
def self.stream_struct_for_scheme(type, scheme, options = {})
|
558
|
+
ext = @schemes[scheme.nil? ? nil : scheme.to_sym] || raise(ArgumentError, "Unknown Scheme type: #{scheme.inspect}")
|
559
|
+
klass = ext.send("#{type}_class")
|
560
|
+
StreamStruct.new(klass, options)
|
561
|
+
end
|
562
|
+
|
494
563
|
# Default reader/writer when no other streams need to be applied.
|
495
|
-
register_extension(nil, IOStreams::File::Reader, IOStreams::File::Writer)
|
564
|
+
# register_extension(nil, IOStreams::File::Reader, IOStreams::File::Writer)
|
496
565
|
|
497
566
|
# Register File extensions
|
498
567
|
register_extension(:bz2, IOStreams::Bzip2::Reader, IOStreams::Bzip2::Writer)
|
@@ -510,10 +579,17 @@ module IOStreams
|
|
510
579
|
register_extension(:enc, SymmetricEncryption::Reader, SymmetricEncryption::Writer)
|
511
580
|
end
|
512
581
|
|
513
|
-
#
|
514
|
-
#
|
582
|
+
# Support URI schemes
|
583
|
+
#
|
584
|
+
# Examples:
|
585
|
+
# path/file_name
|
586
|
+
# http://hostname/path/file_name
|
587
|
+
# https://hostname/path/file_name
|
588
|
+
# sftp://hostname/path/file_name
|
589
|
+
# s3://bucket/key
|
590
|
+
register_scheme(nil, IOStreams::File::Reader, IOStreams::File::Writer)
|
515
591
|
# register_scheme(:http, IOStreams::HTTP::Reader, IOStreams::HTTP::Writer)
|
516
592
|
# register_scheme(:https, IOStreams::HTTPS::Reader, IOStreams::HTTPS::Writer)
|
517
593
|
# register_scheme(:sftp, IOStreams::SFTP::Reader, IOStreams::SFTP::Writer)
|
518
|
-
|
594
|
+
register_scheme(:s3, IOStreams::S3::Reader, IOStreams::S3::Writer)
|
519
595
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module IOStreams
|
2
2
|
module Line
|
3
3
|
class Reader
|
4
|
-
attr_reader :delimiter, :buffer_size, :
|
4
|
+
attr_reader :delimiter, :buffer_size, :line_number
|
5
5
|
|
6
6
|
# Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read.
|
7
7
|
MAX_BLOCKS_MULTIPLIER = 100
|
@@ -20,7 +20,7 @@ module IOStreams
|
|
20
20
|
# Create a delimited stream reader from the supplied input stream.
|
21
21
|
#
|
22
22
|
# Lines returned will be in the encoding of the input stream.
|
23
|
-
# To change the encoding of
|
23
|
+
# To change the encoding of returned lines, use IOStreams::Encode::Reader.
|
24
24
|
#
|
25
25
|
# Parameters
|
26
26
|
# input_stream
|
@@ -45,14 +45,15 @@ module IOStreams
|
|
45
45
|
# - Skip "empty" / "blank" lines. RegExp?
|
46
46
|
# - Extract header line(s) / first non-comment, non-blank line
|
47
47
|
# - Embedded newline support, RegExp? or Proc?
|
48
|
-
def initialize(input_stream, delimiter: nil, buffer_size: 65_536)
|
49
|
-
@
|
50
|
-
@
|
48
|
+
def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil)
|
49
|
+
@embedded_within = embedded_within
|
50
|
+
@input_stream = input_stream
|
51
|
+
@buffer_size = buffer_size
|
51
52
|
|
52
53
|
# More efficient read buffering only supported when the input stream `#read` method supports it.
|
53
54
|
@use_read_cache_buffer = !@input_stream.method(:read).arity.between?(0, 1)
|
54
55
|
|
55
|
-
@
|
56
|
+
@line_number = 0
|
56
57
|
@eof = false
|
57
58
|
@read_cache_buffer = nil
|
58
59
|
@buffer = nil
|
@@ -73,14 +74,40 @@ module IOStreams
|
|
73
74
|
# Note:
|
74
75
|
# * The line delimiter is _not_ returned.
|
75
76
|
def each
|
77
|
+
line_count = 0
|
76
78
|
until eof?
|
77
79
|
line = readline
|
78
|
-
|
80
|
+
unless line.nil?
|
81
|
+
yield(line)
|
82
|
+
line_count += 1
|
83
|
+
end
|
79
84
|
end
|
80
85
|
line_count
|
81
86
|
end
|
82
87
|
|
88
|
+
# Reads each line per the @delimiter. It will account for embedded lines provided they are within the embedded_within character (for example double quotes).
|
89
|
+
# The embedded_within argument is supplied by IOStreams.line_reader, or passed directly when creating the IOStreams::Line::Reader
|
83
90
|
def readline
|
91
|
+
line = _readline
|
92
|
+
if line && @embedded_within
|
93
|
+
initial_line_number = @line_number
|
94
|
+
while line.count(@embedded_within).odd?
|
95
|
+
raise "Unclosed quoted field on line #{initial_line_number}" if eof? || line.length > @buffer_size * 10
|
96
|
+
line << @delimiter
|
97
|
+
line << _readline
|
98
|
+
end
|
99
|
+
end
|
100
|
+
line
|
101
|
+
end
|
102
|
+
|
103
|
+
# Returns whether the end of file has been reached for this stream
|
104
|
+
def eof?
|
105
|
+
@eof && (@buffer.nil? || @buffer.empty?)
|
106
|
+
end
|
107
|
+
|
108
|
+
private
|
109
|
+
|
110
|
+
def _readline
|
84
111
|
return if eof?
|
85
112
|
|
86
113
|
# Keep reading until it finds the delimiter
|
@@ -89,29 +116,22 @@ module IOStreams
|
|
89
116
|
|
90
117
|
# Delimiter found?
|
91
118
|
if index
|
92
|
-
data
|
93
|
-
@buffer
|
94
|
-
@
|
119
|
+
data = @buffer.slice(0, index)
|
120
|
+
@buffer = @buffer.slice(index + @delimiter_size, @buffer.size)
|
121
|
+
@line_number += 1
|
95
122
|
elsif @eof && @buffer.empty?
|
96
123
|
data = nil
|
97
124
|
@buffer = nil
|
98
125
|
else
|
99
126
|
# Last line without delimiter
|
100
|
-
data
|
101
|
-
@buffer
|
102
|
-
@
|
127
|
+
data = @buffer
|
128
|
+
@buffer = nil
|
129
|
+
@line_number += 1
|
103
130
|
end
|
104
131
|
|
105
132
|
data
|
106
133
|
end
|
107
134
|
|
108
|
-
# Returns whether the end of file has been reached for this stream
|
109
|
-
def eof?
|
110
|
-
@eof && (@buffer.nil? || @buffer.empty?)
|
111
|
-
end
|
112
|
-
|
113
|
-
private
|
114
|
-
|
115
135
|
# Returns [Integer] the number of characters read into the internal buffer
|
116
136
|
# Returns 0 on EOF
|
117
137
|
def read_block
|
@@ -32,14 +32,14 @@ module IOStreams
|
|
32
32
|
# :csv, :hash, :array, :json, :psv, :fixed
|
33
33
|
#
|
34
34
|
# For all other parameters, see Tabular::Header.new
|
35
|
-
def initialize(
|
35
|
+
def initialize(line_reader, cleanse_header: true, **args)
|
36
36
|
@tabular = IOStreams::Tabular.new(**args)
|
37
|
-
@
|
37
|
+
@line_reader = line_reader
|
38
38
|
@cleanse_header = cleanse_header
|
39
39
|
end
|
40
40
|
|
41
41
|
def each
|
42
|
-
@
|
42
|
+
@line_reader.each do |line|
|
43
43
|
if @tabular.header?
|
44
44
|
@tabular.parse_header(line)
|
45
45
|
@tabular.cleanse_header! if @cleanse_header
|
data/lib/io_streams/s3.rb
CHANGED
@@ -7,19 +7,17 @@ end
|
|
7
7
|
require 'uri'
|
8
8
|
module IOStreams
|
9
9
|
module S3
|
10
|
+
autoload :Reader, 'io_streams/s3/reader'
|
11
|
+
autoload :Writer, 'io_streams/s3/writer'
|
12
|
+
|
10
13
|
# Sample URI: s3://mybucket/user/abc.zip
|
11
14
|
def self.parse_uri(uri)
|
12
|
-
# 's3://mybucket/user/abc.zip'
|
13
15
|
uri = URI.parse(uri)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
bucket: segments.shift,
|
20
|
-
key: segments.join('/')
|
21
|
-
}
|
22
|
-
end
|
16
|
+
raise "Invalid URI. Required Format: 's3://<bucket_name>/<key>'" unless uri.scheme == 's3'
|
17
|
+
{
|
18
|
+
bucket: uri.host,
|
19
|
+
key: uri.path.sub(/\A\//, '')
|
20
|
+
}
|
23
21
|
end
|
24
22
|
end
|
25
23
|
end
|
data/lib/io_streams/s3/reader.rb
CHANGED
@@ -2,63 +2,25 @@ module IOStreams
|
|
2
2
|
module S3
|
3
3
|
class Reader
|
4
4
|
# Read from an AWS S3 file
|
5
|
-
def self.open(uri
|
6
|
-
|
5
|
+
def self.open(uri, region: nil, **args, &block)
|
6
|
+
raise(ArgumentError, 'file_name must be a URI string') unless uri.is_a?(String)
|
7
|
+
|
7
8
|
s3 = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
|
9
|
+
options = IOStreams::S3.parse_uri(uri)
|
8
10
|
object = s3.bucket(options[:bucket]).object(options[:key])
|
9
11
|
|
10
|
-
IO.pipe do |read_io, write_io|
|
11
|
-
object.get(response_target: write_io)
|
12
|
-
write_io.close
|
13
|
-
block.call(read_io)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.open2(uri = nil, **args, &block)
|
18
|
-
if !uri.nil? && IOStreams.reader_stream?(uri)
|
19
|
-
raise(ArgumentError, 'S3 can only accept a URI, not an IO stream when reading.')
|
20
|
-
end
|
21
|
-
|
22
|
-
unless defined?(Aws::S3::Resource)
|
23
|
-
begin
|
24
|
-
require 'aws-sdk-s3'
|
25
|
-
rescue LoadError => exc
|
26
|
-
raise(LoadError, "Install gem 'aws-sdk-s3' to read and write AWS S3 files: #{exc.message}")
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
options = uri.nil? ? args : parse_uri(uri).merge(args)
|
31
|
-
|
32
12
|
begin
|
33
|
-
|
34
|
-
|
35
|
-
ensure
|
36
|
-
io.close if io && (io.respond_to?(:closed?) && !io.closed?)
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
def initialize(region: nil, bucket:, key:)
|
41
|
-
s3 = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
|
42
|
-
@object = s3.bucket(bucket).object(key)
|
43
|
-
@buffer = []
|
44
|
-
end
|
13
|
+
# Since S3 download only supports a push stream, write it to a tempfile first.
|
14
|
+
temp_file = Tempfile.new('rocket_job')
|
45
15
|
|
46
|
-
|
47
|
-
|
48
|
-
return @buffer.slice!(0, length) if length && (length <= @buffer.length)
|
16
|
+
args[:response_target] = temp_file.to_path
|
17
|
+
object.get(args)
|
49
18
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
return @buffer.slice!(0, length) if length && (length <= @buffer.length)
|
19
|
+
block.call(temp_file)
|
20
|
+
ensure
|
21
|
+
temp_file.delete if temp_file
|
54
22
|
end
|
55
|
-
@buffer if @buffer.size > 0
|
56
23
|
end
|
57
|
-
|
58
|
-
private
|
59
|
-
|
60
|
-
attr_reader :object
|
61
|
-
|
62
24
|
end
|
63
25
|
end
|
64
26
|
end
|
data/lib/io_streams/s3/writer.rb
CHANGED
@@ -2,11 +2,13 @@ module IOStreams
|
|
2
2
|
module S3
|
3
3
|
class Writer
|
4
4
|
# Write to AWS S3
|
5
|
-
def self.open(uri
|
6
|
-
|
5
|
+
def self.open(uri, region: nil, **args, &block)
|
6
|
+
raise(ArgumentError, 'file_name must be a URI string') unless uri.is_a?(String)
|
7
|
+
|
8
|
+
options = IOStreams::S3.parse_uri(uri)
|
7
9
|
s3 = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
|
8
10
|
object = s3.bucket(options[:bucket]).object(options[:key])
|
9
|
-
object.upload_stream(
|
11
|
+
object.upload_stream(args, &block)
|
10
12
|
end
|
11
13
|
end
|
12
14
|
end
|
data/lib/io_streams/streams.rb
CHANGED
@@ -81,15 +81,15 @@ module IOStreams
|
|
81
81
|
# .gz.enc [ :gz, :enc ]
|
82
82
|
#
|
83
83
|
# Example Zip file:
|
84
|
-
#
|
84
|
+
# IOStreams.streams_for_file_name('myfile.zip')
|
85
85
|
# => [ :zip ]
|
86
86
|
#
|
87
87
|
# Example Encrypted Gzip file:
|
88
|
-
#
|
88
|
+
# IOStreams.streams_for_file_name('myfile.csv.gz.enc')
|
89
89
|
# => [ :gz, :enc ]
|
90
90
|
#
|
91
91
|
# Example plain text / binary file:
|
92
|
-
#
|
92
|
+
# IOStreams.streams_for_file_name('myfile.csv')
|
93
93
|
# => [ :file ]
|
94
94
|
def streams_for_file_name(file_name)
|
95
95
|
raise ArgumentError.new("Cannot auto-detect streams when already a stream: #{file_name.inspect}") if reader_stream?(file_name)
|
data/lib/io_streams/version.rb
CHANGED
data/lib/iostreams.rb
CHANGED
@@ -16,10 +16,7 @@ module IOStreams
|
|
16
16
|
autoload :Writer, 'io_streams/gzip/writer'
|
17
17
|
end
|
18
18
|
autoload :Pgp, 'io_streams/pgp'
|
19
|
-
|
20
|
-
autoload :Reader, 'io_streams/s3/reader'
|
21
|
-
autoload :Writer, 'io_streams/s3/writer'
|
22
|
-
end
|
19
|
+
autoload :S3, 'io_streams/s3'
|
23
20
|
module SFTP
|
24
21
|
autoload :Reader, 'io_streams/sftp/reader'
|
25
22
|
autoload :Writer, 'io_streams/sftp/writer'
|
data/test/io_streams_test.rb
CHANGED
@@ -95,6 +95,16 @@ class IOStreamsTest < Minitest::Test
|
|
95
95
|
end
|
96
96
|
end
|
97
97
|
|
98
|
+
describe '.scheme_for_file_name' do
|
99
|
+
it 'default' do
|
100
|
+
assert_nil IOStreams.scheme_for_file_name('a.xyz')
|
101
|
+
end
|
102
|
+
|
103
|
+
it 's3' do
|
104
|
+
assert_equal :s3, IOStreams.scheme_for_file_name('s3://a.xyz')
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
98
108
|
describe '.each_line' do
|
99
109
|
it 'returns a line at a time' do
|
100
110
|
lines = []
|
data/test/line_reader_test.rb
CHANGED
@@ -6,6 +6,14 @@ class LineReaderTest < Minitest::Test
|
|
6
6
|
File.join(File.dirname(__FILE__), 'files', 'text.txt')
|
7
7
|
end
|
8
8
|
|
9
|
+
let :csv_file do
|
10
|
+
File.join(File.dirname(__FILE__), 'files', 'embedded_lines_test.csv')
|
11
|
+
end
|
12
|
+
|
13
|
+
let :unclosed_quote_file do
|
14
|
+
File.join(File.dirname(__FILE__), 'files', 'unclosed_quote_test.csv')
|
15
|
+
end
|
16
|
+
|
9
17
|
let :data do
|
10
18
|
data = []
|
11
19
|
File.open(file_name, 'rt') do |file|
|
@@ -16,6 +24,47 @@ class LineReaderTest < Minitest::Test
|
|
16
24
|
data
|
17
25
|
end
|
18
26
|
|
27
|
+
# Test file has embedded new lines in rows 2, 3 and 4
|
28
|
+
#
|
29
|
+
# name, description, zip
|
30
|
+
# "\nJack","Firstname is Jack","234567"
|
31
|
+
# "John","Firstname\n is John","234568"
|
32
|
+
# "Zack","Firstname is Zack","234568\n"
|
33
|
+
#
|
34
|
+
describe 'embedded_within_quotes' do
|
35
|
+
describe 'csv file' do
|
36
|
+
|
37
|
+
it 'fails to keep embedded lines if flag is not set' do
|
38
|
+
lines = []
|
39
|
+
IOStreams::Line::Reader.open(csv_file) do |io|
|
40
|
+
io.each do |line|
|
41
|
+
lines << line
|
42
|
+
end
|
43
|
+
end
|
44
|
+
assert_equal 7, lines.count
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'keeps embedded lines if flag is set' do
|
48
|
+
lines = []
|
49
|
+
IOStreams::Line::Reader.open(csv_file, embedded_within: '"') do |io|
|
50
|
+
io.each do |line|
|
51
|
+
lines << line
|
52
|
+
end
|
53
|
+
end
|
54
|
+
assert_equal 4, lines.count
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'raises error for unclosed quote' do
|
58
|
+
assert_raises(RuntimeError) do
|
59
|
+
IOStreams::Line::Reader.open(unclosed_quote_file, embedded_within: '"') do |io|
|
60
|
+
io.each do |line|
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
19
68
|
describe '#each' do
|
20
69
|
it 'each_line file' do
|
21
70
|
lines = []
|
@@ -41,7 +90,7 @@ class LineReaderTest < Minitest::Test
|
|
41
90
|
it "autodetect delimiter: #{delimiter.inspect}" do
|
42
91
|
lines = []
|
43
92
|
stream = StringIO.new(data.join(delimiter))
|
44
|
-
count
|
93
|
+
count = IOStreams::Line::Reader.open(stream, buffer_size: 15) do |io|
|
45
94
|
io.each { |line| lines << line }
|
46
95
|
end
|
47
96
|
assert_equal data, lines
|
@@ -51,7 +100,7 @@ class LineReaderTest < Minitest::Test
|
|
51
100
|
it "single read autodetect delimiter: #{delimiter.inspect}" do
|
52
101
|
lines = []
|
53
102
|
stream = StringIO.new(data.join(delimiter))
|
54
|
-
count
|
103
|
+
count = IOStreams::Line::Reader.open(stream) do |io|
|
55
104
|
io.each { |line| lines << line }
|
56
105
|
end
|
57
106
|
assert_equal data, lines
|
@@ -63,7 +112,7 @@ class LineReaderTest < Minitest::Test
|
|
63
112
|
it "reads delimited #{delimiter.inspect}" do
|
64
113
|
lines = []
|
65
114
|
stream = StringIO.new(data.join(delimiter))
|
66
|
-
count
|
115
|
+
count = IOStreams::Line::Reader.open(stream, buffer_size: 15, delimiter: delimiter) do |io|
|
67
116
|
io.each { |line| lines << line }
|
68
117
|
end
|
69
118
|
assert_equal data, lines
|
@@ -75,7 +124,7 @@ class LineReaderTest < Minitest::Test
|
|
75
124
|
delimiter = "\x01"
|
76
125
|
lines = []
|
77
126
|
stream = StringIO.new(data.join(delimiter).encode('ASCII-8BIT'))
|
78
|
-
count
|
127
|
+
count = IOStreams::Line::Reader.open(stream, buffer_size: 15, delimiter: delimiter) do |io|
|
79
128
|
io.each { |line| lines << line }
|
80
129
|
end
|
81
130
|
assert_equal data, lines
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class S3ReaderTest < Minitest::Test
|
4
|
+
describe IOStreams::File::Reader do
|
5
|
+
let :file_name do
|
6
|
+
File.join(File.dirname(__FILE__), 'files', 'text.txt')
|
7
|
+
end
|
8
|
+
|
9
|
+
let :raw do
|
10
|
+
File.read(file_name)
|
11
|
+
end
|
12
|
+
|
13
|
+
let :uri do
|
14
|
+
"s3://#{ENV['S3_BUCKET_NAME']}/s3_test/test.txt"
|
15
|
+
end
|
16
|
+
|
17
|
+
let :upload_s3_file do
|
18
|
+
IOStreams::S3::Writer.open(uri) { |io| io << raw }
|
19
|
+
end
|
20
|
+
|
21
|
+
describe '.open' do
|
22
|
+
it 'reads' do
|
23
|
+
unless ENV['S3_BUCKET_NAME']
|
24
|
+
skip "Supply 'S3_BUCKET_NAME' environment variable with S3 bucket name to test with"
|
25
|
+
end
|
26
|
+
|
27
|
+
upload_s3_file
|
28
|
+
result = IOStreams::S3::Reader.open(uri) { |io| io.read }
|
29
|
+
assert_equal raw, result
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'does not support streams' do
|
33
|
+
io_string = StringIO.new('data')
|
34
|
+
assert_raises ArgumentError do
|
35
|
+
IOStreams::S3::Reader.open(io_string) { |io| io.read }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class FileWriterTest < Minitest::Test
|
4
|
+
describe IOStreams::File::Writer do
|
5
|
+
let :file_name do
|
6
|
+
File.join(File.dirname(__FILE__), 'files', 'text.txt')
|
7
|
+
end
|
8
|
+
|
9
|
+
let :raw do
|
10
|
+
File.read(file_name)
|
11
|
+
end
|
12
|
+
|
13
|
+
let :uri do
|
14
|
+
"s3://#{ENV['S3_BUCKET_NAME']}/s3_test/test.txt"
|
15
|
+
end
|
16
|
+
|
17
|
+
let :upload_s3_file do
|
18
|
+
IOStreams::S3::Writer.open(uri) { |io| io << raw }
|
19
|
+
end
|
20
|
+
|
21
|
+
describe '.open' do
|
22
|
+
it 'writes' do
|
23
|
+
unless ENV['S3_BUCKET_NAME']
|
24
|
+
skip "Supply 'S3_BUCKET_NAME' environment variable with S3 bucket name to test with"
|
25
|
+
end
|
26
|
+
|
27
|
+
IOStreams::S3::Writer.open(uri) { |io| io.write(raw) }
|
28
|
+
result = IOStreams::S3::Reader.open(uri) { |io| io.read }
|
29
|
+
assert_equal raw, result
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'does not support streams' do
|
33
|
+
io_string = StringIO.new
|
34
|
+
assert_raises ArgumentError do
|
35
|
+
IOStreams::S3::Writer.open(io_string) { |io| io.write(raw) }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iostreams
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Reid Morrison
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-04-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: concurrent-ruby
|
@@ -80,6 +80,7 @@ files:
|
|
80
80
|
- test/encode_writer_test.rb
|
81
81
|
- test/file_reader_test.rb
|
82
82
|
- test/file_writer_test.rb
|
83
|
+
- test/files/embedded_lines_test.csv
|
83
84
|
- test/files/spreadsheet.xlsx
|
84
85
|
- test/files/test.csv
|
85
86
|
- test/files/test.json
|
@@ -88,6 +89,7 @@ files:
|
|
88
89
|
- test/files/text.txt.gz
|
89
90
|
- test/files/text.txt.gz.zip
|
90
91
|
- test/files/text.zip
|
92
|
+
- test/files/unclosed_quote_test.csv
|
91
93
|
- test/gzip_reader_test.rb
|
92
94
|
- test/gzip_writer_test.rb
|
93
95
|
- test/io_streams_test.rb
|
@@ -100,6 +102,8 @@ files:
|
|
100
102
|
- test/record_writer_test.rb
|
101
103
|
- test/row_reader_test.rb
|
102
104
|
- test/row_writer_test.rb
|
105
|
+
- test/s3_reader_test.rb
|
106
|
+
- test/s3_writer_test.rb
|
103
107
|
- test/tabular_test.rb
|
104
108
|
- test/test_helper.rb
|
105
109
|
- test/xlsx_reader_test.rb
|
@@ -139,6 +143,7 @@ test_files:
|
|
139
143
|
- test/gzip_writer_test.rb
|
140
144
|
- test/file_reader_test.rb
|
141
145
|
- test/record_reader_test.rb
|
146
|
+
- test/s3_writer_test.rb
|
142
147
|
- test/pgp_writer_test.rb
|
143
148
|
- test/line_writer_test.rb
|
144
149
|
- test/row_reader_test.rb
|
@@ -146,8 +151,10 @@ test_files:
|
|
146
151
|
- test/zip_writer_test.rb
|
147
152
|
- test/files/text.zip
|
148
153
|
- test/files/spreadsheet.xlsx
|
154
|
+
- test/files/embedded_lines_test.csv
|
149
155
|
- test/files/test.csv
|
150
156
|
- test/files/test.json
|
157
|
+
- test/files/unclosed_quote_test.csv
|
151
158
|
- test/files/text.txt.bz2
|
152
159
|
- test/files/text.txt.gz.zip
|
153
160
|
- test/files/text.txt.gz
|
@@ -160,3 +167,4 @@ test_files:
|
|
160
167
|
- test/pgp_test.rb
|
161
168
|
- test/io_streams_test.rb
|
162
169
|
- test/record_writer_test.rb
|
170
|
+
- test/s3_reader_test.rb
|