iostreams 1.5.0 → 1.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +13 -1
- data/lib/io_streams/builder.rb +20 -4
- data/lib/io_streams/errors.rb +12 -0
- data/lib/io_streams/io_streams.rb +0 -2
- data/lib/io_streams/line/reader.rb +23 -11
- data/lib/io_streams/path.rb +1 -1
- data/lib/io_streams/paths/s3.rb +25 -14
- data/lib/io_streams/paths/sftp.rb +93 -59
- data/lib/io_streams/pgp.rb +17 -17
- data/lib/io_streams/pgp/writer.rb +1 -2
- data/lib/io_streams/stream.rb +75 -10
- data/lib/io_streams/tabular.rb +23 -23
- data/lib/io_streams/tabular/parser/csv.rb +4 -2
- data/lib/io_streams/tabular/parser/fixed.rb +1 -1
- data/lib/io_streams/tabular/utility/csv_row.rb +1 -4
- data/lib/io_streams/utils.rb +3 -5
- data/lib/io_streams/version.rb +1 -1
- data/lib/iostreams.rb +8 -0
- data/test/builder_test.rb +29 -0
- data/test/deprecated_test.rb +2 -0
- data/test/files/test.psv +4 -0
- data/test/files/unclosed_quote_large_test.csv +1658 -0
- data/test/files/unclosed_quote_test2.csv +3 -0
- data/test/line_reader_test.rb +30 -4
- data/test/paths/file_test.rb +6 -8
- data/test/paths/sftp_test.rb +7 -1
- data/test/stream_test.rb +169 -3
- data/test/test_helper.rb +0 -3
- metadata +48 -43
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 849ceda63eb30f95762a7c985cd215d424e62afd68ab20776e8d16c188dd6aed
|
4
|
+
data.tar.gz: 8e26af86c40bb673ce36855a7fb30d1c4b401edc3eac0b27a71b9760cfe865dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 99318c4c64e0133df57b84429b1c2f9caa064abb1405ace5d55208e41b6bf8bb8fa83a75db8ae46d53753f10d566bab53971d95871ed4011bab4571d31bebe8a
|
7
|
+
data.tar.gz: bfba3a033c753e3fe05f798177b8f3c7ee8f566eabaf9223984fa902b288cb3515154f621a4e97f75b6c5bc31da88f284390c2d82c5015d7682ecf08c2a671d3
|
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# IOStreams
|
2
|
-
[![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![
|
2
|
+
[![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg)
|
3
3
|
|
4
4
|
IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
|
5
5
|
or storage mechanism transparent to the application.
|
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
|
|
14
14
|
|
15
15
|
Next, check out the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
|
16
16
|
|
17
|
+
## Upgrading to v1.6
|
18
|
+
|
19
|
+
The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
|
20
|
+
the following line to your code:
|
21
|
+
|
22
|
+
~~~ruby
|
23
|
+
IOStreams.include(IOStreams::Deprecated)
|
24
|
+
~~~
|
25
|
+
|
26
|
+
It is important to move any use of the old deprecated APIs over to the new API, since they will be removed in a future
|
27
|
+
release.
|
28
|
+
|
17
29
|
## Versioning
|
18
30
|
|
19
31
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
data/lib/io_streams/builder.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
module IOStreams
|
2
2
|
# Build the streams that need to be applied to a path during reading or writing.
|
3
3
|
class Builder
|
4
|
-
attr_accessor :file_name
|
4
|
+
attr_accessor :file_name, :format_options
|
5
5
|
attr_reader :streams, :options
|
6
6
|
|
7
7
|
def initialize(file_name = nil)
|
8
|
-
@file_name
|
9
|
-
@streams
|
10
|
-
@options
|
8
|
+
@file_name = file_name
|
9
|
+
@streams = nil
|
10
|
+
@options = nil
|
11
|
+
@format = nil
|
12
|
+
@format_option = nil
|
11
13
|
end
|
12
14
|
|
13
15
|
# Supply an option that is only applied once the file name extensions have been parsed.
|
@@ -88,6 +90,20 @@ module IOStreams
|
|
88
90
|
built_streams.freeze
|
89
91
|
end
|
90
92
|
|
93
|
+
# Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
|
94
|
+
# Returns [nil] if no format is set, or if it cannot be determined from the file_name
|
95
|
+
def format
|
96
|
+
@format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
|
97
|
+
end
|
98
|
+
|
99
|
+
def format=(format)
|
100
|
+
unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
|
101
|
+
raise(ArgumentError, "Invalid format: #{format.inspect}")
|
102
|
+
end
|
103
|
+
|
104
|
+
@format = format
|
105
|
+
end
|
106
|
+
|
91
107
|
private
|
92
108
|
|
93
109
|
def class_for_stream(type, stream)
|
data/lib/io_streams/errors.rb
CHANGED
@@ -9,6 +9,9 @@ module IOStreams
|
|
9
9
|
class MissingHeader < Error
|
10
10
|
end
|
11
11
|
|
12
|
+
class UnknownFormat < Error
|
13
|
+
end
|
14
|
+
|
12
15
|
class TypeMismatch < Error
|
13
16
|
end
|
14
17
|
|
@@ -26,6 +29,15 @@ module IOStreams
|
|
26
29
|
class ValueTooLong < Error
|
27
30
|
end
|
28
31
|
|
32
|
+
class MalformedDataError < RuntimeError
|
33
|
+
attr_reader :line_number
|
34
|
+
|
35
|
+
def initialize(message, line_number)
|
36
|
+
@line_number = line_number
|
37
|
+
super("#{message} on line #{line_number}.")
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
29
41
|
class InvalidLayout < Error
|
30
42
|
end
|
31
43
|
end
|
@@ -38,12 +38,12 @@ module IOStreams
|
|
38
38
|
# Size of blocks to read from the input stream at a time.
|
39
39
|
# Default: 65536 ( 64K )
|
40
40
|
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
41
|
+
# embedded_within: [String]
|
42
|
+
# Supports CSV files where a line may contain an embedded newline.
|
43
|
+
# For CSV files set `embedded_within: '"'`
|
44
|
+
#
|
45
|
+
# Note:
|
46
|
+
# * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
|
47
47
|
def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
|
48
48
|
super(input_stream)
|
49
49
|
|
@@ -86,17 +86,29 @@ module IOStreams
|
|
86
86
|
line_count
|
87
87
|
end
|
88
88
|
|
89
|
-
# Reads each line per the
|
90
|
-
#
|
89
|
+
# Reads each line per the `delimiter`.
|
90
|
+
# Accounts for lines that contain the `delimiter` when the `delimiter` is within the `embedded_within` delimiter.
|
91
|
+
# For Example, CSV files can contain newlines embedded within double quotes.
|
91
92
|
def readline
|
92
93
|
line = _readline
|
93
94
|
if line && @embedded_within
|
94
95
|
initial_line_number = @line_number
|
95
96
|
while line.count(@embedded_within).odd?
|
96
|
-
|
97
|
-
|
97
|
+
if eof? || line.length > @buffer_size * 10
|
98
|
+
raise(Errors::MalformedDataError.new(
|
99
|
+
"Unbalanced delimited field, delimiter: #{@embedded_within}",
|
100
|
+
initial_line_number
|
101
|
+
))
|
102
|
+
end
|
98
103
|
line << @delimiter
|
99
|
-
|
104
|
+
next_line = _readline
|
105
|
+
if next_line.nil?
|
106
|
+
raise(Errors::MalformedDataError.new(
|
107
|
+
"Unbalanced delimited field, delimiter: #{@embedded_within}",
|
108
|
+
initial_line_number
|
109
|
+
))
|
110
|
+
end
|
111
|
+
line << next_line
|
100
112
|
end
|
101
113
|
end
|
102
114
|
line
|
data/lib/io_streams/path.rb
CHANGED
@@ -153,7 +153,7 @@ module IOStreams
|
|
153
153
|
# Returns [true|false] whether the file is compressed based on its file extensions.
|
154
154
|
def compressed?
|
155
155
|
# TODO: Look at streams?
|
156
|
-
!(path =~ /\.(zip|gz|gzip|
|
156
|
+
!(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
|
157
157
|
end
|
158
158
|
|
159
159
|
# Returns [true|false] whether the file is encrypted based on its file extensions.
|
data/lib/io_streams/paths/s3.rb
CHANGED
@@ -3,7 +3,10 @@ require "uri"
|
|
3
3
|
module IOStreams
|
4
4
|
module Paths
|
5
5
|
class S3 < IOStreams::Path
|
6
|
-
attr_reader :bucket_name, :
|
6
|
+
attr_reader :bucket_name, :options
|
7
|
+
|
8
|
+
# Largest file size supported by the S3 copy object api.
|
9
|
+
S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
|
7
10
|
|
8
11
|
# Arguments:
|
9
12
|
#
|
@@ -138,16 +141,17 @@ module IOStreams
|
|
138
141
|
|
139
142
|
@bucket_name = uri.hostname
|
140
143
|
key = uri.path.sub(%r{\A/}, "")
|
141
|
-
|
142
|
-
|
143
|
-
client
|
144
|
-
@client = ::Aws::S3::Client.new(client)
|
144
|
+
|
145
|
+
if client && !client.is_a?(Hash)
|
146
|
+
@client = client
|
145
147
|
else
|
146
|
-
@
|
148
|
+
@client_options = client.is_a?(Hash) ? client.dup : {}
|
149
|
+
@client_options[:access_key_id] = access_key_id if access_key_id
|
150
|
+
@client_options[:secret_access_key] = secret_access_key if secret_access_key
|
147
151
|
end
|
148
|
-
@options = args
|
149
152
|
|
150
|
-
@options
|
153
|
+
@options = args
|
154
|
+
@options.merge!(uri.query.transform_keys(&:to_sym)) if uri.query
|
151
155
|
|
152
156
|
super(key)
|
153
157
|
end
|
@@ -187,11 +191,11 @@ module IOStreams
|
|
187
191
|
end
|
188
192
|
|
189
193
|
# Make S3 perform direct copies within S3 itself.
|
190
|
-
def copy_to(target_path, convert: true)
|
191
|
-
return super(target_path) if convert
|
194
|
+
def copy_to(target_path, convert: true, **args)
|
195
|
+
return super(target_path, convert: convert, **args) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
|
192
196
|
|
193
197
|
target = IOStreams.new(target_path)
|
194
|
-
return super(target) unless target.is_a?(self.class)
|
198
|
+
return super(target, convert: convert, **args) unless target.is_a?(self.class)
|
195
199
|
|
196
200
|
source_name = ::File.join(bucket_name, path)
|
197
201
|
client.copy_object(options.merge(bucket: target.bucket_name, key: target.path, copy_source: source_name))
|
@@ -199,11 +203,13 @@ module IOStreams
|
|
199
203
|
end
|
200
204
|
|
201
205
|
# Make S3 perform direct copies within S3 itself.
|
202
|
-
def copy_from(source_path, convert: true)
|
203
|
-
return super(source_path) if convert
|
206
|
+
def copy_from(source_path, convert: true, **args)
|
207
|
+
return super(source_path, convert: true, **args) if convert
|
204
208
|
|
205
209
|
source = IOStreams.new(source_path)
|
206
|
-
|
210
|
+
if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
|
211
|
+
return super(source, convert: convert, **args)
|
212
|
+
end
|
207
213
|
|
208
214
|
source_name = ::File.join(source.bucket_name, source.path)
|
209
215
|
client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))
|
@@ -309,6 +315,11 @@ module IOStreams
|
|
309
315
|
def partial_files_visible?
|
310
316
|
false
|
311
317
|
end
|
318
|
+
|
319
|
+
# Lazy load S3 client since it takes two seconds to create itself!
|
320
|
+
def client
|
321
|
+
@client ||= ::Aws::S3::Client.new(@client_options)
|
322
|
+
end
|
312
323
|
end
|
313
324
|
end
|
314
325
|
end
|
@@ -26,12 +26,13 @@ module IOStreams
|
|
26
26
|
include SemanticLogger::Loggable if defined?(SemanticLogger)
|
27
27
|
|
28
28
|
class << self
|
29
|
-
attr_accessor :sshpass_bin, :sftp_bin, :sshpass_wait_seconds
|
29
|
+
attr_accessor :sshpass_bin, :sftp_bin, :sshpass_wait_seconds, :before_password_wait_seconds
|
30
30
|
end
|
31
31
|
|
32
|
-
@sftp_bin
|
33
|
-
@sshpass_bin
|
34
|
-
@
|
32
|
+
@sftp_bin = "sftp"
|
33
|
+
@sshpass_bin = "sshpass"
|
34
|
+
@before_password_wait_seconds = 2
|
35
|
+
@sshpass_wait_seconds = 5
|
35
36
|
|
36
37
|
attr_reader :hostname, :username, :ssh_options, :url, :port
|
37
38
|
|
@@ -46,9 +47,23 @@ module IOStreams
|
|
46
47
|
# password: [String]
|
47
48
|
# Password for the user.
|
48
49
|
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
50
|
+
# ssh_options: [Hash]
|
51
|
+
# - IdentityKey [String]
|
52
|
+
# The identity key that this client should use to talk to this host.
|
53
|
+
# Under the covers this value is written to a file and then the file name is passed as `IdentityFile`
|
54
|
+
# - HostKey [String]
|
55
|
+
# The expected SSH Host key that is presented by the remote host.
|
56
|
+
# Instead of storing the host key in the `known_hosts` file, it can be supplied explicitly
|
57
|
+
# using this option.
|
58
|
+
# Under the covers this value is written to a file and then the file name is passed as `UserKnownHostsFile`
|
59
|
+
# Notes:
|
60
|
+
# - It must contain the entire line that would be stored in `known_hosts`,
|
61
|
+
# including the hostname, ip address, key type and key value. This value is written as-is into a
|
62
|
+
# "known_hosts" like file and then passed into sftp using the `UserKnownHostsFile` option.
|
63
|
+
# - The easiest way to generate the required value is to use `ssh-keyscan` and then supply that value in this field.
|
64
|
+
# For example: `ssh-keyscan hostname`
|
65
|
+
# - Any other options supported by ssh_config.
|
66
|
+
# `man ssh_config` to see all available options.
|
52
67
|
#
|
53
68
|
# Examples:
|
54
69
|
#
|
@@ -167,33 +182,36 @@ module IOStreams
|
|
167
182
|
def sftp_download(remote_file_name, local_file_name)
|
168
183
|
with_sftp_args do |args|
|
169
184
|
Open3.popen2e(*args) do |writer, reader, waith_thr|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
end
|
184
|
-
|
185
|
-
out
|
186
|
-
rescue Errno::EPIPE
|
187
|
-
out = begin
|
188
|
-
reader.read.chomp
|
189
|
-
rescue StandardError
|
190
|
-
nil
|
191
|
-
end
|
185
|
+
# Give time for remote sftp server to get ready to accept the password.
|
186
|
+
sleep self.class.before_password_wait_seconds
|
187
|
+
|
188
|
+
writer.puts password
|
189
|
+
|
190
|
+
# Give time for password to be processed and stdin to be passed to sftp process.
|
191
|
+
sleep self.class.sshpass_wait_seconds
|
192
|
+
|
193
|
+
writer.puts "get #{remote_file_name} #{local_file_name}"
|
194
|
+
writer.puts "bye"
|
195
|
+
writer.close
|
196
|
+
out = reader.read.chomp
|
197
|
+
unless waith_thr.value.success?
|
192
198
|
raise(
|
193
199
|
Errors::CommunicationsFailure,
|
194
200
|
"Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
195
201
|
)
|
196
202
|
end
|
203
|
+
|
204
|
+
out
|
205
|
+
rescue Errno::EPIPE
|
206
|
+
out = begin
|
207
|
+
reader.read.chomp
|
208
|
+
rescue StandardError
|
209
|
+
nil
|
210
|
+
end
|
211
|
+
raise(
|
212
|
+
Errors::CommunicationsFailure,
|
213
|
+
"Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
214
|
+
)
|
197
215
|
end
|
198
216
|
end
|
199
217
|
end
|
@@ -201,48 +219,64 @@ module IOStreams
|
|
201
219
|
def sftp_upload(local_file_name, remote_file_name)
|
202
220
|
with_sftp_args do |args|
|
203
221
|
Open3.popen2e(*args) do |writer, reader, waith_thr|
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
unless waith_thr.value.success?
|
213
|
-
raise(
|
214
|
-
Errors::CommunicationsFailure,
|
215
|
-
"Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
216
|
-
)
|
217
|
-
end
|
218
|
-
|
219
|
-
out
|
220
|
-
rescue Errno::EPIPE
|
221
|
-
out = begin
|
222
|
-
reader.read.chomp
|
223
|
-
rescue StandardError
|
224
|
-
nil
|
225
|
-
end
|
222
|
+
writer.puts(password) if password
|
223
|
+
# Give time for password to be processed and stdin to be passed to sftp process.
|
224
|
+
sleep self.class.sshpass_wait_seconds
|
225
|
+
writer.puts "put #{local_file_name.inspect} #{remote_file_name.inspect}"
|
226
|
+
writer.puts "bye"
|
227
|
+
writer.close
|
228
|
+
out = reader.read.chomp
|
229
|
+
unless waith_thr.value.success?
|
226
230
|
raise(
|
227
231
|
Errors::CommunicationsFailure,
|
228
232
|
"Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
229
233
|
)
|
230
234
|
end
|
235
|
+
|
236
|
+
out
|
237
|
+
rescue Errno::EPIPE
|
238
|
+
out = begin
|
239
|
+
reader.read.chomp
|
240
|
+
rescue StandardError
|
241
|
+
nil
|
242
|
+
end
|
243
|
+
raise(
|
244
|
+
Errors::CommunicationsFailure,
|
245
|
+
"Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
246
|
+
)
|
231
247
|
end
|
232
248
|
end
|
233
249
|
end
|
234
250
|
|
235
251
|
def with_sftp_args
|
236
|
-
return yield sftp_args(ssh_options)
|
252
|
+
return yield sftp_args(ssh_options) if !ssh_options.key?("IdentityKey") && !ssh_options.key?("HostKey")
|
253
|
+
|
254
|
+
with_identity_key(ssh_options.dup) do |options|
|
255
|
+
with_host_key(options) do |options2|
|
256
|
+
yield sftp_args(options2)
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
260
|
+
|
261
|
+
def with_identity_key(options)
|
262
|
+
return yield options unless ssh_options.key?("IdentityKey")
|
263
|
+
|
264
|
+
with_temp_file(options, "IdentityFile", options.delete("IdentityKey")) { yield options }
|
265
|
+
end
|
266
|
+
|
267
|
+
def with_host_key(options)
|
268
|
+
return yield options unless ssh_options.key?("HostKey")
|
269
|
+
|
270
|
+
with_temp_file(options, "UserKnownHostsFile", options.delete("HostKey")) { yield options }
|
271
|
+
end
|
237
272
|
|
273
|
+
def with_temp_file(options, option, value)
|
238
274
|
Utils.temp_file_name("iostreams-sftp-args", "key") do |file_name|
|
239
|
-
options = ssh_options.dup
|
240
|
-
key = options.delete("IdentityKey")
|
241
275
|
# sftp requires that private key is only readable by the current user
|
242
|
-
::File.open(file_name, "wb", 0o600) { |io| io.write(
|
276
|
+
::File.open(file_name, "wb", 0o600) { |io| io.write(value) }
|
243
277
|
|
244
|
-
options[
|
245
|
-
yield
|
278
|
+
options[option] = file_name
|
279
|
+
yield options
|
246
280
|
end
|
247
281
|
end
|
248
282
|
|
@@ -272,8 +306,8 @@ module IOStreams
|
|
272
306
|
|
273
307
|
def build_ssh_options
|
274
308
|
options = ssh_options.dup
|
275
|
-
options[:logger]
|
276
|
-
options[:port]
|
309
|
+
options[:logger] ||= logger if defined?(SemanticLogger)
|
310
|
+
options[:port] ||= port
|
277
311
|
options[:max_pkt_size] ||= 65_536
|
278
312
|
options[:password] ||= @password
|
279
313
|
options
|