iostreams 0.16.2 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/io_streams/io_streams.rb +102 -26
- data/lib/io_streams/line/reader.rb +40 -20
- data/lib/io_streams/record/reader.rb +3 -3
- data/lib/io_streams/s3.rb +8 -10
- data/lib/io_streams/s3/reader.rb +11 -49
- data/lib/io_streams/s3/writer.rb +5 -3
- data/lib/io_streams/streams.rb +3 -3
- data/lib/io_streams/version.rb +1 -1
- data/lib/iostreams.rb +1 -4
- data/test/files/embedded_lines_test.csv +7 -0
- data/test/files/unclosed_quote_test.csv +4 -0
- data/test/io_streams_test.rb +10 -0
- data/test/line_reader_test.rb +53 -4
- data/test/s3_reader_test.rb +41 -0
- data/test/s3_writer_test.rb +41 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: afb047a3682e7cbb9d953c303ebf2f525ff9975248ac550b6eded486ff73a72b
|
4
|
+
data.tar.gz: b14b13f5c23663b1b173d4ee767a93ea74b381f4c89ae5b44e4610bfbe4fabe4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00a5fa08bf88bbcc9cc3c1418f90d2dd55c3da0b129cd472ef571e27f0dec653214f2f1780069801e8fd87c676c70e097fd79e86085fcbd63e3def501f779ec5
|
7
|
+
data.tar.gz: e193d7dacbd65625c9159fb61d13d3d0b005bd40e373047e2ece791c91c5c1798c881a398c0246fda8c424bd875dead9be3c5d3cc00765b0b2977d3decf3a08d
|
@@ -70,6 +70,15 @@ module IOStreams
|
|
70
70
|
end
|
71
71
|
|
72
72
|
# Iterate over a file / stream returning one line at a time.
|
73
|
+
# Newlines embedded within double quotes will not be treated as line delimiters if
|
74
|
+
# 1. The file name contains .csv
|
75
|
+
# 2. Or the embedded_within argument is set
|
76
|
+
#
|
77
|
+
# Example: Supply custom options
|
78
|
+
# IOStreams.each_line(file_name, embedded_within: '"') do |line|
|
79
|
+
# puts line
|
80
|
+
# end
|
81
|
+
#
|
73
82
|
def self.each_line(file_name_or_io, encoding: nil, encode_cleaner: nil, encode_replace: nil, **args, &block)
|
74
83
|
line_reader(file_name_or_io, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace, **args) do |line_stream|
|
75
84
|
line_stream.each(&block)
|
@@ -77,6 +86,15 @@ module IOStreams
|
|
77
86
|
end
|
78
87
|
|
79
88
|
# Iterate over a file / stream returning one line at a time.
|
89
|
+
# Newlines embedded within double quotes will not be treated as line delimiters if
|
90
|
+
# 1. The file name contains .csv
|
91
|
+
# 2. Or the embedded_within argument is set
|
92
|
+
#
|
93
|
+
# Example: Supply custom options
|
94
|
+
# IOStreams.each_row(file_name, embedded_within: '"') do |line|
|
95
|
+
# puts line
|
96
|
+
# end
|
97
|
+
#
|
80
98
|
def self.each_row(file_name_or_io, encoding: nil, encode_cleaner: nil, encode_replace: nil, **args, &block)
|
81
99
|
row_reader(file_name_or_io, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace, **args) do |row_stream|
|
82
100
|
row_stream.each(&block)
|
@@ -90,6 +108,15 @@ module IOStreams
|
|
90
108
|
# Each record / line is returned one at a time so that very large files
|
91
109
|
# can be read without having to load the entire file into memory.
|
92
110
|
#
|
111
|
+
# Newlines embedded within double quotes will not be treated as line delimiters if
|
112
|
+
# 1. The file name contains .csv
|
113
|
+
# 2. Or the embedded_within argument is set
|
114
|
+
#
|
115
|
+
# Example: Supply custom options
|
116
|
+
# IOStreams.each_record(file_name, embedded_within: '"') do |line|
|
117
|
+
# puts line
|
118
|
+
# end
|
119
|
+
#
|
93
120
|
# Example:
|
94
121
|
# file_name = 'customer_data.csv.pgp'
|
95
122
|
# IOStreams.each_record(file_name) do |hash|
|
@@ -291,20 +318,21 @@ module IOStreams
|
|
291
318
|
# its extension(s)
|
292
319
|
#
|
293
320
|
# Example Zip file:
|
294
|
-
#
|
321
|
+
# IOStreams.streams_for_file_name('myfile.zip')
|
295
322
|
# => [ :zip ]
|
296
323
|
#
|
297
324
|
# Example Encrypted Gzip file:
|
298
|
-
#
|
325
|
+
# IOStreams.streams_for_file_name('myfile.csv.gz.enc')
|
299
326
|
# => [ :gz, :enc ]
|
300
327
|
#
|
301
328
|
# Example plain text / binary file:
|
302
|
-
#
|
303
|
-
# => [
|
329
|
+
# IOStreams.streams_for_file_name('myfile.csv')
|
330
|
+
# => []
|
304
331
|
def self.streams_for_file_name(file_name)
|
305
332
|
raise ArgumentError.new('File name cannot be nil') if file_name.nil?
|
306
333
|
raise ArgumentError.new("File name must be a string: #{file_name.inspect}, class: #{file_name.class}") unless file_name.is_a?(String)
|
307
|
-
|
334
|
+
|
335
|
+
parts = ::File.basename(file_name).split('.')
|
308
336
|
extensions = []
|
309
337
|
while extension = parts.pop
|
310
338
|
sym = extension.downcase.to_sym
|
@@ -314,12 +342,31 @@ module IOStreams
|
|
314
342
|
extensions
|
315
343
|
end
|
316
344
|
|
345
|
+
# Extract the URI scheme as a symbol (e.g. :s3 from 's3://bucket/key') if one was supplied, otherwise nil.
|
346
|
+
def self.scheme_for_file_name(file_name)
|
347
|
+
raise ArgumentError.new('File name cannot be nil') if file_name.nil?
|
348
|
+
raise ArgumentError.new("File name must be a string: #{file_name.inspect}, class: #{file_name.class}") unless file_name.is_a?(String)
|
349
|
+
|
350
|
+
if matches = file_name.match(/\A(\w+):\/\//)
|
351
|
+
matches[1].downcase.to_sym
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|
317
355
|
# Iterate over a file / stream returning each record/line one at a time.
|
318
|
-
|
356
|
+
# When embedded_within is not supplied, it defaults to '"' if the file name (or the file_name argument) contains .csv.
|
357
|
+
def self.line_reader(file_name_or_io, streams: nil, file_name: nil, encoding: nil, encode_cleaner: nil, encode_replace: nil, embedded_within: nil, **args, &block)
|
358
|
+
|
319
359
|
return yield(file_name_or_io) if file_name_or_io.is_a?(IOStreams::Line::Reader) || file_name_or_io.is_a?(Array)
|
320
360
|
|
361
|
+
# TODO: needs to be improved
|
362
|
+
if embedded_within.nil? && file_name_or_io.is_a?(String)
|
363
|
+
embedded_within = '"' if file_name_or_io.include?('.csv')
|
364
|
+
elsif embedded_within.nil? && file_name
|
365
|
+
embedded_within = '"' if file_name.include?('.csv')
|
366
|
+
end
|
367
|
+
|
321
368
|
reader(file_name_or_io, streams: streams, file_name: file_name, encoding: encoding, encode_cleaner: encode_cleaner, encode_replace: encode_replace) do |io|
|
322
|
-
IOStreams::Line::Reader.open(io, **args, &block)
|
369
|
+
IOStreams::Line::Reader.open(io, embedded_within: embedded_within, **args, &block)
|
323
370
|
end
|
324
371
|
end
|
325
372
|
|
@@ -331,6 +378,7 @@ module IOStreams
|
|
331
378
|
encoding: nil,
|
332
379
|
encode_cleaner: nil,
|
333
380
|
encode_replace: nil,
|
381
|
+
embedded_within: nil,
|
334
382
|
**args,
|
335
383
|
&block)
|
336
384
|
|
@@ -338,12 +386,13 @@ module IOStreams
|
|
338
386
|
|
339
387
|
line_reader(
|
340
388
|
file_name_or_io,
|
341
|
-
streams:
|
342
|
-
delimiter:
|
343
|
-
file_name:
|
344
|
-
encoding:
|
345
|
-
encode_cleaner:
|
346
|
-
encode_replace:
|
389
|
+
streams: streams,
|
390
|
+
delimiter: delimiter,
|
391
|
+
file_name: file_name,
|
392
|
+
encoding: encoding,
|
393
|
+
encode_cleaner: encode_cleaner,
|
394
|
+
encode_replace: encode_replace,
|
395
|
+
embedded_within: embedded_within
|
347
396
|
) do |io|
|
348
397
|
file_name = file_name_or_io if file_name.nil? && file_name_or_io.is_a?(String)
|
349
398
|
IOStreams::Row::Reader.open(io, file_name: file_name, **args, &block)
|
@@ -358,21 +407,23 @@ module IOStreams
|
|
358
407
|
encoding: nil,
|
359
408
|
encode_cleaner: nil,
|
360
409
|
encode_replace: nil,
|
410
|
+
embedded_within: nil,
|
361
411
|
**args,
|
362
412
|
&block)
|
363
413
|
|
364
414
|
return yield(file_name_or_io) if file_name_or_io.is_a?(IOStreams::Record::Reader)
|
365
415
|
|
366
|
-
line_reader(
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
416
|
+
line_reader(file_name_or_io,
|
417
|
+
streams: streams,
|
418
|
+
delimiter: delimiter,
|
419
|
+
file_name: file_name,
|
420
|
+
encoding: encoding,
|
421
|
+
encode_cleaner: encode_cleaner,
|
422
|
+
encode_replace: encode_replace,
|
423
|
+
embedded_within: embedded_within
|
374
424
|
) do |io|
|
375
425
|
|
426
|
+
|
376
427
|
file_name = file_name_or_io if file_name.nil? && file_name_or_io.is_a?(String)
|
377
428
|
IOStreams::Record::Reader.open(io, file_name: file_name, **args, &block)
|
378
429
|
end
|
@@ -401,6 +452,16 @@ module IOStreams
|
|
401
452
|
@extensions.delete(extension.to_sym)
|
402
453
|
end
|
403
454
|
|
455
|
+
# Register a URI scheme and the reader and writer streaming classes
|
456
|
+
#
|
457
|
+
# Example:
|
458
|
+
# # IOStreams::S3::Reader and IOStreams::S3::Writer must implement .open
|
459
|
+
# register_scheme(:s3, IOStreams::S3::Reader, IOStreams::S3::Writer)
|
460
|
+
def self.register_scheme(scheme, reader_class, writer_class)
|
461
|
+
raise(ArgumentError, "Invalid scheme #{scheme.inspect}") unless scheme.nil? || scheme.to_s =~ /\A\w+\Z/
|
462
|
+
@schemes[scheme.nil? ? nil : scheme.to_sym] = Extension.new(reader_class, writer_class)
|
463
|
+
end
|
464
|
+
|
404
465
|
# Helper method: Returns [true|false] if a value is blank?
|
405
466
|
def self.blank?(value)
|
406
467
|
if value.nil?
|
@@ -416,6 +477,7 @@ module IOStreams
|
|
416
477
|
|
417
478
|
# A registry to hold formats for processing files during upload or download
|
418
479
|
@extensions = {}
|
480
|
+
@schemes = {}
|
419
481
|
|
420
482
|
# Struct to hold the Stream and options if any
|
421
483
|
StreamStruct = Struct.new(:klass, :options)
|
@@ -438,8 +500,10 @@ module IOStreams
|
|
438
500
|
if streams.nil?
|
439
501
|
streams = file_name_or_io.is_a?(String) ? streams_for_file_name(file_name_or_io) : [nil]
|
440
502
|
end
|
503
|
+
scheme = scheme_for_file_name(file_name_or_io) if file_name_or_io.is_a?(String)
|
441
504
|
|
442
505
|
stream_structs = streams_for(type, streams)
|
506
|
+
stream_structs << stream_struct_for_scheme(type, scheme) if stream_structs.empty? || scheme
|
443
507
|
|
444
508
|
# Add encoding stream if any of its options are present
|
445
509
|
if encoding || encode_cleaner || encode_replace
|
@@ -466,7 +530,6 @@ module IOStreams
|
|
466
530
|
if params.is_a?(Symbol)
|
467
531
|
[stream_struct_for_stream(type, params)]
|
468
532
|
elsif params.is_a?(Array)
|
469
|
-
return [stream_struct_for_stream(type, nil)] if params.empty?
|
470
533
|
a = []
|
471
534
|
params.each do |stream|
|
472
535
|
if stream.is_a?(Hash)
|
@@ -491,8 +554,14 @@ module IOStreams
|
|
491
554
|
StreamStruct.new(klass, options)
|
492
555
|
end
|
493
556
|
|
557
|
+
def self.stream_struct_for_scheme(type, scheme, options = {})
|
558
|
+
ext = @schemes[scheme.nil? ? nil : scheme.to_sym] || raise(ArgumentError, "Unknown Scheme type: #{scheme.inspect}")
|
559
|
+
klass = ext.send("#{type}_class")
|
560
|
+
StreamStruct.new(klass, options)
|
561
|
+
end
|
562
|
+
|
494
563
|
# Default reader/writer when no other streams need to be applied.
|
495
|
-
register_extension(nil, IOStreams::File::Reader, IOStreams::File::Writer)
|
564
|
+
# register_extension(nil, IOStreams::File::Reader, IOStreams::File::Writer)
|
496
565
|
|
497
566
|
# Register File extensions
|
498
567
|
register_extension(:bz2, IOStreams::Bzip2::Reader, IOStreams::Bzip2::Writer)
|
@@ -510,10 +579,17 @@ module IOStreams
|
|
510
579
|
register_extension(:enc, SymmetricEncryption::Reader, SymmetricEncryption::Writer)
|
511
580
|
end
|
512
581
|
|
513
|
-
#
|
514
|
-
#
|
582
|
+
# Support URI schemes
|
583
|
+
#
|
584
|
+
# Examples:
|
585
|
+
# path/file_name
|
586
|
+
# http://hostname/path/file_name
|
587
|
+
# https://hostname/path/file_name
|
588
|
+
# sftp://hostname/path/file_name
|
589
|
+
# s3://bucket/key
|
590
|
+
register_scheme(nil, IOStreams::File::Reader, IOStreams::File::Writer)
|
515
591
|
# register_scheme(:http, IOStreams::HTTP::Reader, IOStreams::HTTP::Writer)
|
516
592
|
# register_scheme(:https, IOStreams::HTTPS::Reader, IOStreams::HTTPS::Writer)
|
517
593
|
# register_scheme(:sftp, IOStreams::SFTP::Reader, IOStreams::SFTP::Writer)
|
518
|
-
|
594
|
+
register_scheme(:s3, IOStreams::S3::Reader, IOStreams::S3::Writer)
|
519
595
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module IOStreams
|
2
2
|
module Line
|
3
3
|
class Reader
|
4
|
-
attr_reader :delimiter, :buffer_size, :
|
4
|
+
attr_reader :delimiter, :buffer_size, :line_number
|
5
5
|
|
6
6
|
# Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read.
|
7
7
|
MAX_BLOCKS_MULTIPLIER = 100
|
@@ -20,7 +20,7 @@ module IOStreams
|
|
20
20
|
# Create a delimited stream reader from the supplied input stream.
|
21
21
|
#
|
22
22
|
# Lines returned will be in the encoding of the input stream.
|
23
|
-
# To change the encoding of
|
23
|
+
# To change the encoding of returned lines, use IOStreams::Encode::Reader.
|
24
24
|
#
|
25
25
|
# Parameters
|
26
26
|
# input_stream
|
@@ -45,14 +45,15 @@ module IOStreams
|
|
45
45
|
# - Skip "empty" / "blank" lines. RegExp?
|
46
46
|
# - Extract header line(s) / first non-comment, non-blank line
|
47
47
|
# - Embedded newline support, RegExp? or Proc?
|
48
|
-
def initialize(input_stream, delimiter: nil, buffer_size: 65_536)
|
49
|
-
@
|
50
|
-
@
|
48
|
+
def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil)
|
49
|
+
@embedded_within = embedded_within
|
50
|
+
@input_stream = input_stream
|
51
|
+
@buffer_size = buffer_size
|
51
52
|
|
52
53
|
# More efficient read buffering only supported when the input stream `#read` method supports it.
|
53
54
|
@use_read_cache_buffer = !@input_stream.method(:read).arity.between?(0, 1)
|
54
55
|
|
55
|
-
@
|
56
|
+
@line_number = 0
|
56
57
|
@eof = false
|
57
58
|
@read_cache_buffer = nil
|
58
59
|
@buffer = nil
|
@@ -73,14 +74,40 @@ module IOStreams
|
|
73
74
|
# Note:
|
74
75
|
# * The line delimiter is _not_ returned.
|
75
76
|
def each
|
77
|
+
line_count = 0
|
76
78
|
until eof?
|
77
79
|
line = readline
|
78
|
-
|
80
|
+
unless line.nil?
|
81
|
+
yield(line)
|
82
|
+
line_count += 1
|
83
|
+
end
|
79
84
|
end
|
80
85
|
line_count
|
81
86
|
end
|
82
87
|
|
88
|
+
# Reads each line per the @delimiter. It will account for embedded lines provided they are within double quotes.
|
89
|
+
# The embedded_within argument is set in IOStreams.line_reader
|
83
90
|
def readline
|
91
|
+
line = _readline
|
92
|
+
if line && @embedded_within
|
93
|
+
initial_line_number = @line_number
|
94
|
+
while line.count(@embedded_within).odd?
|
95
|
+
raise "Unclosed quoted field on line #{initial_line_number}" if eof? || line.length > @buffer_size * 10
|
96
|
+
line << @delimiter
|
97
|
+
line << _readline
|
98
|
+
end
|
99
|
+
end
|
100
|
+
line
|
101
|
+
end
|
102
|
+
|
103
|
+
# Returns whether the end of file has been reached for this stream
|
104
|
+
def eof?
|
105
|
+
@eof && (@buffer.nil? || @buffer.empty?)
|
106
|
+
end
|
107
|
+
|
108
|
+
private
|
109
|
+
|
110
|
+
def _readline
|
84
111
|
return if eof?
|
85
112
|
|
86
113
|
# Keep reading until it finds the delimiter
|
@@ -89,29 +116,22 @@ module IOStreams
|
|
89
116
|
|
90
117
|
# Delimiter found?
|
91
118
|
if index
|
92
|
-
data
|
93
|
-
@buffer
|
94
|
-
@
|
119
|
+
data = @buffer.slice(0, index)
|
120
|
+
@buffer = @buffer.slice(index + @delimiter_size, @buffer.size)
|
121
|
+
@line_number += 1
|
95
122
|
elsif @eof && @buffer.empty?
|
96
123
|
data = nil
|
97
124
|
@buffer = nil
|
98
125
|
else
|
99
126
|
# Last line without delimiter
|
100
|
-
data
|
101
|
-
@buffer
|
102
|
-
@
|
127
|
+
data = @buffer
|
128
|
+
@buffer = nil
|
129
|
+
@line_number += 1
|
103
130
|
end
|
104
131
|
|
105
132
|
data
|
106
133
|
end
|
107
134
|
|
108
|
-
# Returns whether the end of file has been reached for this stream
|
109
|
-
def eof?
|
110
|
-
@eof && (@buffer.nil? || @buffer.empty?)
|
111
|
-
end
|
112
|
-
|
113
|
-
private
|
114
|
-
|
115
135
|
# Returns [Integer] the number of characters read into the internal buffer
|
116
136
|
# Returns 0 on EOF
|
117
137
|
def read_block
|
@@ -32,14 +32,14 @@ module IOStreams
|
|
32
32
|
# :csv, :hash, :array, :json, :psv, :fixed
|
33
33
|
#
|
34
34
|
# For all other parameters, see Tabular::Header.new
|
35
|
-
def initialize(
|
35
|
+
def initialize(line_reader, cleanse_header: true, **args)
|
36
36
|
@tabular = IOStreams::Tabular.new(**args)
|
37
|
-
@
|
37
|
+
@line_reader = line_reader
|
38
38
|
@cleanse_header = cleanse_header
|
39
39
|
end
|
40
40
|
|
41
41
|
def each
|
42
|
-
@
|
42
|
+
@line_reader.each do |line|
|
43
43
|
if @tabular.header?
|
44
44
|
@tabular.parse_header(line)
|
45
45
|
@tabular.cleanse_header! if @cleanse_header
|
data/lib/io_streams/s3.rb
CHANGED
@@ -7,19 +7,17 @@ end
|
|
7
7
|
require 'uri'
|
8
8
|
module IOStreams
|
9
9
|
module S3
|
10
|
+
autoload :Reader, 'io_streams/s3/reader'
|
11
|
+
autoload :Writer, 'io_streams/s3/writer'
|
12
|
+
|
10
13
|
# Sample URI: s3://mybucket/user/abc.zip
|
11
14
|
def self.parse_uri(uri)
|
12
|
-
# 's3://mybucket/user/abc.zip'
|
13
15
|
uri = URI.parse(uri)
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
bucket: segments.shift,
|
20
|
-
key: segments.join('/')
|
21
|
-
}
|
22
|
-
end
|
16
|
+
raise "Invalid URI. Required Format: 's3://<bucket_name>/<key>'" unless uri.scheme == 's3'
|
17
|
+
{
|
18
|
+
bucket: uri.host,
|
19
|
+
key: uri.path.sub(/\A\//, '')
|
20
|
+
}
|
23
21
|
end
|
24
22
|
end
|
25
23
|
end
|
data/lib/io_streams/s3/reader.rb
CHANGED
@@ -2,63 +2,25 @@ module IOStreams
|
|
2
2
|
module S3
|
3
3
|
class Reader
|
4
4
|
# Read from an AWS S3 file
|
5
|
-
def self.open(uri
|
6
|
-
|
5
|
+
def self.open(uri, region: nil, **args, &block)
|
6
|
+
raise(ArgumentError, 'file_name must be a URI string') unless uri.is_a?(String)
|
7
|
+
|
7
8
|
s3 = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
|
9
|
+
options = IOStreams::S3.parse_uri(uri)
|
8
10
|
object = s3.bucket(options[:bucket]).object(options[:key])
|
9
11
|
|
10
|
-
IO.pipe do |read_io, write_io|
|
11
|
-
object.get(response_target: write_io)
|
12
|
-
write_io.close
|
13
|
-
block.call(read_io)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.open2(uri = nil, **args, &block)
|
18
|
-
if !uri.nil? && IOStreams.reader_stream?(uri)
|
19
|
-
raise(ArgumentError, 'S3 can only accept a URI, not an IO stream when reading.')
|
20
|
-
end
|
21
|
-
|
22
|
-
unless defined?(Aws::S3::Resource)
|
23
|
-
begin
|
24
|
-
require 'aws-sdk-s3'
|
25
|
-
rescue LoadError => exc
|
26
|
-
raise(LoadError, "Install gem 'aws-sdk-s3' to read and write AWS S3 files: #{exc.message}")
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
options = uri.nil? ? args : parse_uri(uri).merge(args)
|
31
|
-
|
32
12
|
begin
|
33
|
-
|
34
|
-
|
35
|
-
ensure
|
36
|
-
io.close if io && (io.respond_to?(:closed?) && !io.closed?)
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
def initialize(region: nil, bucket:, key:)
|
41
|
-
s3 = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
|
42
|
-
@object = s3.bucket(bucket).object(key)
|
43
|
-
@buffer = []
|
44
|
-
end
|
13
|
+
# Since S3 download only supports a push stream, write it to a tempfile first.
|
14
|
+
temp_file = Tempfile.new('rocket_job')
|
45
15
|
|
46
|
-
|
47
|
-
|
48
|
-
return @buffer.slice!(0, length) if length && (length <= @buffer.length)
|
16
|
+
args[:response_target] = temp_file.to_path
|
17
|
+
object.get(args)
|
49
18
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
return @buffer.slice!(0, length) if length && (length <= @buffer.length)
|
19
|
+
block.call(temp_file)
|
20
|
+
ensure
|
21
|
+
temp_file.delete if temp_file
|
54
22
|
end
|
55
|
-
@buffer if @buffer.size > 0
|
56
23
|
end
|
57
|
-
|
58
|
-
private
|
59
|
-
|
60
|
-
attr_reader :object
|
61
|
-
|
62
24
|
end
|
63
25
|
end
|
64
26
|
end
|
data/lib/io_streams/s3/writer.rb
CHANGED
@@ -2,11 +2,13 @@ module IOStreams
|
|
2
2
|
module S3
|
3
3
|
class Writer
|
4
4
|
# Write to AWS S3
|
5
|
-
def self.open(uri
|
6
|
-
|
5
|
+
def self.open(uri, region: nil, **args, &block)
|
6
|
+
raise(ArgumentError, 'file_name must be a URI string') unless uri.is_a?(String)
|
7
|
+
|
8
|
+
options = IOStreams::S3.parse_uri(uri)
|
7
9
|
s3 = region.nil? ? Aws::S3::Resource.new : Aws::S3::Resource.new(region: region)
|
8
10
|
object = s3.bucket(options[:bucket]).object(options[:key])
|
9
|
-
object.upload_stream(
|
11
|
+
object.upload_stream(args, &block)
|
10
12
|
end
|
11
13
|
end
|
12
14
|
end
|
data/lib/io_streams/streams.rb
CHANGED
@@ -81,15 +81,15 @@ module IOStreams
|
|
81
81
|
# .gz.enc [ :gz, :enc ]
|
82
82
|
#
|
83
83
|
# Example Zip file:
|
84
|
-
#
|
84
|
+
# IOStreams.streams_for_file_name('myfile.zip')
|
85
85
|
# => [ :zip ]
|
86
86
|
#
|
87
87
|
# Example Encrypted Gzip file:
|
88
|
-
#
|
88
|
+
# IOStreams.streams_for_file_name('myfile.csv.gz.enc')
|
89
89
|
# => [ :gz, :enc ]
|
90
90
|
#
|
91
91
|
# Example plain text / binary file:
|
92
|
-
#
|
92
|
+
# IOStreams.streams_for_file_name('myfile.csv')
|
93
93
|
# => [ :file ]
|
94
94
|
def streams_for_file_name(file_name)
|
95
95
|
raise ArgumentError.new("Cannot auto-detect streams when already a stream: #{file_name.inspect}") if reader_stream?(file_name)
|
data/lib/io_streams/version.rb
CHANGED
data/lib/iostreams.rb
CHANGED
@@ -16,10 +16,7 @@ module IOStreams
|
|
16
16
|
autoload :Writer, 'io_streams/gzip/writer'
|
17
17
|
end
|
18
18
|
autoload :Pgp, 'io_streams/pgp'
|
19
|
-
|
20
|
-
autoload :Reader, 'io_streams/s3/reader'
|
21
|
-
autoload :Writer, 'io_streams/s3/writer'
|
22
|
-
end
|
19
|
+
autoload :S3, 'io_streams/s3'
|
23
20
|
module SFTP
|
24
21
|
autoload :Reader, 'io_streams/sftp/reader'
|
25
22
|
autoload :Writer, 'io_streams/sftp/writer'
|
data/test/io_streams_test.rb
CHANGED
@@ -95,6 +95,16 @@ class IOStreamsTest < Minitest::Test
|
|
95
95
|
end
|
96
96
|
end
|
97
97
|
|
98
|
+
describe '.scheme_for_file_name' do
|
99
|
+
it 'default' do
|
100
|
+
assert_nil IOStreams.scheme_for_file_name('a.xyz')
|
101
|
+
end
|
102
|
+
|
103
|
+
it 's3' do
|
104
|
+
assert_equal :s3, IOStreams.scheme_for_file_name('s3://a.xyz')
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
98
108
|
describe '.each_line' do
|
99
109
|
it 'returns a line at a time' do
|
100
110
|
lines = []
|
data/test/line_reader_test.rb
CHANGED
@@ -6,6 +6,14 @@ class LineReaderTest < Minitest::Test
|
|
6
6
|
File.join(File.dirname(__FILE__), 'files', 'text.txt')
|
7
7
|
end
|
8
8
|
|
9
|
+
let :csv_file do
|
10
|
+
File.join(File.dirname(__FILE__), 'files', 'embedded_lines_test.csv')
|
11
|
+
end
|
12
|
+
|
13
|
+
let :unclosed_quote_file do
|
14
|
+
File.join(File.dirname(__FILE__), 'files', 'unclosed_quote_test.csv')
|
15
|
+
end
|
16
|
+
|
9
17
|
let :data do
|
10
18
|
data = []
|
11
19
|
File.open(file_name, 'rt') do |file|
|
@@ -16,6 +24,47 @@ class LineReaderTest < Minitest::Test
|
|
16
24
|
data
|
17
25
|
end
|
18
26
|
|
27
|
+
# Test file has embedded new lines in row 2, 3 and 4
|
28
|
+
#
|
29
|
+
# name, description, zip
|
30
|
+
# "\nJack","Firstname is Jack","234567"
|
31
|
+
# "John","Firstname\n is John","234568"
|
32
|
+
# "Zack","Firstname is Zack","234568\n"
|
33
|
+
#
|
34
|
+
describe 'embedded_within_quotes' do
|
35
|
+
describe 'csv file' do
|
36
|
+
|
37
|
+
it 'fails to keep embedded lines if flag is not set' do
|
38
|
+
lines = []
|
39
|
+
IOStreams::Line::Reader.open(csv_file) do |io|
|
40
|
+
io.each do |line|
|
41
|
+
lines << line
|
42
|
+
end
|
43
|
+
end
|
44
|
+
assert_equal 7, lines.count
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'keeps embedded lines if flag is set' do
|
48
|
+
lines = []
|
49
|
+
IOStreams::Line::Reader.open(csv_file, embedded_within: '"') do |io|
|
50
|
+
io.each do |line|
|
51
|
+
lines << line
|
52
|
+
end
|
53
|
+
end
|
54
|
+
assert_equal 4, lines.count
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'raises error for unclosed quote' do
|
58
|
+
assert_raises(RuntimeError) do
|
59
|
+
IOStreams::Line::Reader.open(unclosed_quote_file, embedded_within: '"') do |io|
|
60
|
+
io.each do |line|
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
19
68
|
describe '#each' do
|
20
69
|
it 'each_line file' do
|
21
70
|
lines = []
|
@@ -41,7 +90,7 @@ class LineReaderTest < Minitest::Test
|
|
41
90
|
it "autodetect delimiter: #{delimiter.inspect}" do
|
42
91
|
lines = []
|
43
92
|
stream = StringIO.new(data.join(delimiter))
|
44
|
-
count
|
93
|
+
count = IOStreams::Line::Reader.open(stream, buffer_size: 15) do |io|
|
45
94
|
io.each { |line| lines << line }
|
46
95
|
end
|
47
96
|
assert_equal data, lines
|
@@ -51,7 +100,7 @@ class LineReaderTest < Minitest::Test
|
|
51
100
|
it "single read autodetect delimiter: #{delimiter.inspect}" do
|
52
101
|
lines = []
|
53
102
|
stream = StringIO.new(data.join(delimiter))
|
54
|
-
count
|
103
|
+
count = IOStreams::Line::Reader.open(stream) do |io|
|
55
104
|
io.each { |line| lines << line }
|
56
105
|
end
|
57
106
|
assert_equal data, lines
|
@@ -63,7 +112,7 @@ class LineReaderTest < Minitest::Test
|
|
63
112
|
it "reads delimited #{delimiter.inspect}" do
|
64
113
|
lines = []
|
65
114
|
stream = StringIO.new(data.join(delimiter))
|
66
|
-
count
|
115
|
+
count = IOStreams::Line::Reader.open(stream, buffer_size: 15, delimiter: delimiter) do |io|
|
67
116
|
io.each { |line| lines << line }
|
68
117
|
end
|
69
118
|
assert_equal data, lines
|
@@ -75,7 +124,7 @@ class LineReaderTest < Minitest::Test
|
|
75
124
|
delimiter = "\x01"
|
76
125
|
lines = []
|
77
126
|
stream = StringIO.new(data.join(delimiter).encode('ASCII-8BIT'))
|
78
|
-
count
|
127
|
+
count = IOStreams::Line::Reader.open(stream, buffer_size: 15, delimiter: delimiter) do |io|
|
79
128
|
io.each { |line| lines << line }
|
80
129
|
end
|
81
130
|
assert_equal data, lines
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class S3ReaderTest < Minitest::Test
|
4
|
+
describe IOStreams::File::Reader do
|
5
|
+
let :file_name do
|
6
|
+
File.join(File.dirname(__FILE__), 'files', 'text.txt')
|
7
|
+
end
|
8
|
+
|
9
|
+
let :raw do
|
10
|
+
File.read(file_name)
|
11
|
+
end
|
12
|
+
|
13
|
+
let :uri do
|
14
|
+
"s3://#{ENV['S3_BUCKET_NAME']}/s3_test/test.txt"
|
15
|
+
end
|
16
|
+
|
17
|
+
let :upload_s3_file do
|
18
|
+
IOStreams::S3::Writer.open(uri) { |io| io << raw }
|
19
|
+
end
|
20
|
+
|
21
|
+
describe '.open' do
|
22
|
+
it 'reads' do
|
23
|
+
unless ENV['S3_BUCKET_NAME']
|
24
|
+
skip "Supply 'S3_BUCKET_NAME' environment variable with S3 bucket name to test with"
|
25
|
+
end
|
26
|
+
|
27
|
+
upload_s3_file
|
28
|
+
result = IOStreams::S3::Reader.open(uri) { |io| io.read }
|
29
|
+
assert_equal raw, result
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'does not support streams' do
|
33
|
+
io_string = StringIO.new('data')
|
34
|
+
assert_raises ArgumentError do
|
35
|
+
IOStreams::S3::Reader.open(io_string) { |io| io.read }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
|
3
|
+
class FileWriterTest < Minitest::Test
|
4
|
+
describe IOStreams::File::Writer do
|
5
|
+
let :file_name do
|
6
|
+
File.join(File.dirname(__FILE__), 'files', 'text.txt')
|
7
|
+
end
|
8
|
+
|
9
|
+
let :raw do
|
10
|
+
File.read(file_name)
|
11
|
+
end
|
12
|
+
|
13
|
+
let :uri do
|
14
|
+
"s3://#{ENV['S3_BUCKET_NAME']}/s3_test/test.txt"
|
15
|
+
end
|
16
|
+
|
17
|
+
let :upload_s3_file do
|
18
|
+
IOStreams::S3::Writer.open(uri) { |io| io << raw }
|
19
|
+
end
|
20
|
+
|
21
|
+
describe '.open' do
|
22
|
+
it 'writes' do
|
23
|
+
unless ENV['S3_BUCKET_NAME']
|
24
|
+
skip "Supply 'S3_BUCKET_NAME' environment variable with S3 bucket name to test with"
|
25
|
+
end
|
26
|
+
|
27
|
+
IOStreams::S3::Writer.open(uri) { |io| io.write(raw) }
|
28
|
+
result = IOStreams::S3::Reader.open(uri) { |io| io.read }
|
29
|
+
assert_equal raw, result
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'does not support streams' do
|
33
|
+
io_string = StringIO.new
|
34
|
+
assert_raises ArgumentError do
|
35
|
+
IOStreams::S3::Writer.open(io_string) { |io| io.write(raw) }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iostreams
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Reid Morrison
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-04-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: concurrent-ruby
|
@@ -80,6 +80,7 @@ files:
|
|
80
80
|
- test/encode_writer_test.rb
|
81
81
|
- test/file_reader_test.rb
|
82
82
|
- test/file_writer_test.rb
|
83
|
+
- test/files/embedded_lines_test.csv
|
83
84
|
- test/files/spreadsheet.xlsx
|
84
85
|
- test/files/test.csv
|
85
86
|
- test/files/test.json
|
@@ -88,6 +89,7 @@ files:
|
|
88
89
|
- test/files/text.txt.gz
|
89
90
|
- test/files/text.txt.gz.zip
|
90
91
|
- test/files/text.zip
|
92
|
+
- test/files/unclosed_quote_test.csv
|
91
93
|
- test/gzip_reader_test.rb
|
92
94
|
- test/gzip_writer_test.rb
|
93
95
|
- test/io_streams_test.rb
|
@@ -100,6 +102,8 @@ files:
|
|
100
102
|
- test/record_writer_test.rb
|
101
103
|
- test/row_reader_test.rb
|
102
104
|
- test/row_writer_test.rb
|
105
|
+
- test/s3_reader_test.rb
|
106
|
+
- test/s3_writer_test.rb
|
103
107
|
- test/tabular_test.rb
|
104
108
|
- test/test_helper.rb
|
105
109
|
- test/xlsx_reader_test.rb
|
@@ -139,6 +143,7 @@ test_files:
|
|
139
143
|
- test/gzip_writer_test.rb
|
140
144
|
- test/file_reader_test.rb
|
141
145
|
- test/record_reader_test.rb
|
146
|
+
- test/s3_writer_test.rb
|
142
147
|
- test/pgp_writer_test.rb
|
143
148
|
- test/line_writer_test.rb
|
144
149
|
- test/row_reader_test.rb
|
@@ -146,8 +151,10 @@ test_files:
|
|
146
151
|
- test/zip_writer_test.rb
|
147
152
|
- test/files/text.zip
|
148
153
|
- test/files/spreadsheet.xlsx
|
154
|
+
- test/files/embedded_lines_test.csv
|
149
155
|
- test/files/test.csv
|
150
156
|
- test/files/test.json
|
157
|
+
- test/files/unclosed_quote_test.csv
|
151
158
|
- test/files/text.txt.bz2
|
152
159
|
- test/files/text.txt.gz.zip
|
153
160
|
- test/files/text.txt.gz
|
@@ -160,3 +167,4 @@ test_files:
|
|
160
167
|
- test/pgp_test.rb
|
161
168
|
- test/io_streams_test.rb
|
162
169
|
- test/record_writer_test.rb
|
170
|
+
- test/s3_reader_test.rb
|