iostreams 1.5.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +13 -1
- data/lib/io_streams/builder.rb +20 -4
- data/lib/io_streams/errors.rb +12 -0
- data/lib/io_streams/io_streams.rb +0 -2
- data/lib/io_streams/line/reader.rb +23 -11
- data/lib/io_streams/path.rb +1 -1
- data/lib/io_streams/paths/s3.rb +25 -14
- data/lib/io_streams/paths/sftp.rb +93 -59
- data/lib/io_streams/pgp.rb +17 -17
- data/lib/io_streams/pgp/writer.rb +1 -2
- data/lib/io_streams/stream.rb +75 -10
- data/lib/io_streams/tabular.rb +23 -23
- data/lib/io_streams/tabular/parser/csv.rb +4 -2
- data/lib/io_streams/tabular/parser/fixed.rb +1 -1
- data/lib/io_streams/tabular/utility/csv_row.rb +1 -4
- data/lib/io_streams/utils.rb +3 -5
- data/lib/io_streams/version.rb +1 -1
- data/lib/iostreams.rb +8 -0
- data/test/builder_test.rb +29 -0
- data/test/deprecated_test.rb +2 -0
- data/test/files/test.psv +4 -0
- data/test/files/unclosed_quote_large_test.csv +1658 -0
- data/test/files/unclosed_quote_test2.csv +3 -0
- data/test/line_reader_test.rb +30 -4
- data/test/paths/file_test.rb +6 -8
- data/test/paths/sftp_test.rb +7 -1
- data/test/stream_test.rb +169 -3
- data/test/test_helper.rb +0 -3
- metadata +48 -43
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 849ceda63eb30f95762a7c985cd215d424e62afd68ab20776e8d16c188dd6aed
|
4
|
+
data.tar.gz: 8e26af86c40bb673ce36855a7fb30d1c4b401edc3eac0b27a71b9760cfe865dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 99318c4c64e0133df57b84429b1c2f9caa064abb1405ace5d55208e41b6bf8bb8fa83a75db8ae46d53753f10d566bab53971d95871ed4011bab4571d31bebe8a
|
7
|
+
data.tar.gz: bfba3a033c753e3fe05f798177b8f3c7ee8f566eabaf9223984fa902b288cb3515154f621a4e97f75b6c5bc31da88f284390c2d82c5015d7682ecf08c2a671d3
|
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# IOStreams
|
2
|
-
[](https://rubygems.org/gems/iostreams) [](https://rubygems.org/gems/iostreams) [](https://rubygems.org/gems/iostreams) [](http://opensource.org/licenses/Apache-2.0) 
|
3
3
|
|
4
4
|
IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
|
5
5
|
or storage mechanism transparent to the application.
|
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
|
|
14
14
|
|
15
15
|
Next, check out the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
|
16
16
|
|
17
|
+
## Upgrading to v1.6
|
18
|
+
|
19
|
+
The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
|
20
|
+
the following line to your code:
|
21
|
+
|
22
|
+
~~~ruby
|
23
|
+
IOStreams.include(IOStreams::Deprecated)
|
24
|
+
~~~
|
25
|
+
|
26
|
+
It is important to move any of the old deprecated APIs over to the new API, since they will be removed in a future
|
27
|
+
release.
|
28
|
+
|
17
29
|
## Versioning
|
18
30
|
|
19
31
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
data/lib/io_streams/builder.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
module IOStreams
|
2
2
|
# Build the streams that need to be applied to a path during reading or writing.
|
3
3
|
class Builder
|
4
|
-
attr_accessor :file_name
|
4
|
+
attr_accessor :file_name, :format_options
|
5
5
|
attr_reader :streams, :options
|
6
6
|
|
7
7
|
def initialize(file_name = nil)
|
8
|
-
@file_name
|
9
|
-
@streams
|
10
|
-
@options
|
8
|
+
@file_name = file_name
|
9
|
+
@streams = nil
|
10
|
+
@options = nil
|
11
|
+
@format = nil
|
12
|
+
@format_option = nil
|
11
13
|
end
|
12
14
|
|
13
15
|
# Supply an option that is only applied once the file name extensions have been parsed.
|
@@ -88,6 +90,20 @@ module IOStreams
|
|
88
90
|
built_streams.freeze
|
89
91
|
end
|
90
92
|
|
93
|
+
# Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
|
94
|
+
# Returns [nil] if no format is set, or if it cannot be determined from the file_name
|
95
|
+
def format
|
96
|
+
@format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
|
97
|
+
end
|
98
|
+
|
99
|
+
def format=(format)
|
100
|
+
unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
|
101
|
+
raise(ArgumentError, "Invalid format: #{format.inspect}")
|
102
|
+
end
|
103
|
+
|
104
|
+
@format = format
|
105
|
+
end
|
106
|
+
|
91
107
|
private
|
92
108
|
|
93
109
|
def class_for_stream(type, stream)
|
data/lib/io_streams/errors.rb
CHANGED
@@ -9,6 +9,9 @@ module IOStreams
|
|
9
9
|
class MissingHeader < Error
|
10
10
|
end
|
11
11
|
|
12
|
+
class UnknownFormat < Error
|
13
|
+
end
|
14
|
+
|
12
15
|
class TypeMismatch < Error
|
13
16
|
end
|
14
17
|
|
@@ -26,6 +29,15 @@ module IOStreams
|
|
26
29
|
class ValueTooLong < Error
|
27
30
|
end
|
28
31
|
|
32
|
+
class MalformedDataError < RuntimeError
|
33
|
+
attr_reader :line_number
|
34
|
+
|
35
|
+
def initialize(message, line_number)
|
36
|
+
@line_number = line_number
|
37
|
+
super("#{message} on line #{line_number}.")
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
29
41
|
class InvalidLayout < Error
|
30
42
|
end
|
31
43
|
end
|
@@ -38,12 +38,12 @@ module IOStreams
|
|
38
38
|
# Size of blocks to read from the input stream at a time.
|
39
39
|
# Default: 65536 ( 64K )
|
40
40
|
#
|
41
|
-
#
|
42
|
-
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
#
|
46
|
-
#
|
41
|
+
# embedded_within: [String]
|
42
|
+
# Supports CSV files where a line may contain an embedded newline.
|
43
|
+
# For CSV files set `embedded_within: '"'`
|
44
|
+
#
|
45
|
+
# Note:
|
46
|
+
# * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
|
47
47
|
def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
|
48
48
|
super(input_stream)
|
49
49
|
|
@@ -86,17 +86,29 @@ module IOStreams
|
|
86
86
|
line_count
|
87
87
|
end
|
88
88
|
|
89
|
-
# Reads each line per the
|
90
|
-
#
|
89
|
+
# Reads each line per the `delimiter`.
|
90
|
+
# Accounts for lines that contain the `delimiter` when the `delimiter` is within the `embedded_within` delimiter.
|
91
|
+
# For Example, CSV files can contain newlines embedded within double quotes.
|
91
92
|
def readline
|
92
93
|
line = _readline
|
93
94
|
if line && @embedded_within
|
94
95
|
initial_line_number = @line_number
|
95
96
|
while line.count(@embedded_within).odd?
|
96
|
-
|
97
|
-
|
97
|
+
if eof? || line.length > @buffer_size * 10
|
98
|
+
raise(Errors::MalformedDataError.new(
|
99
|
+
"Unbalanced delimited field, delimiter: #{@embedded_within}",
|
100
|
+
initial_line_number
|
101
|
+
))
|
102
|
+
end
|
98
103
|
line << @delimiter
|
99
|
-
|
104
|
+
next_line = _readline
|
105
|
+
if next_line.nil?
|
106
|
+
raise(Errors::MalformedDataError.new(
|
107
|
+
"Unbalanced delimited field, delimiter: #{@embedded_within}",
|
108
|
+
initial_line_number
|
109
|
+
))
|
110
|
+
end
|
111
|
+
line << next_line
|
100
112
|
end
|
101
113
|
end
|
102
114
|
line
|
data/lib/io_streams/path.rb
CHANGED
@@ -153,7 +153,7 @@ module IOStreams
|
|
153
153
|
# Returns [true|false] whether the file is compressed based on its file extensions.
|
154
154
|
def compressed?
|
155
155
|
# TODO: Look at streams?
|
156
|
-
!(path =~ /\.(zip|gz|gzip|
|
156
|
+
!(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
|
157
157
|
end
|
158
158
|
|
159
159
|
# Returns [true|false] whether the file is encrypted based on its file extensions.
|
data/lib/io_streams/paths/s3.rb
CHANGED
@@ -3,7 +3,10 @@ require "uri"
|
|
3
3
|
module IOStreams
|
4
4
|
module Paths
|
5
5
|
class S3 < IOStreams::Path
|
6
|
-
attr_reader :bucket_name, :
|
6
|
+
attr_reader :bucket_name, :options
|
7
|
+
|
8
|
+
# Largest file size supported by the S3 copy object api.
|
9
|
+
S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
|
7
10
|
|
8
11
|
# Arguments:
|
9
12
|
#
|
@@ -138,16 +141,17 @@ module IOStreams
|
|
138
141
|
|
139
142
|
@bucket_name = uri.hostname
|
140
143
|
key = uri.path.sub(%r{\A/}, "")
|
141
|
-
|
142
|
-
|
143
|
-
client
|
144
|
-
@client = ::Aws::S3::Client.new(client)
|
144
|
+
|
145
|
+
if client && !client.is_a?(Hash)
|
146
|
+
@client = client
|
145
147
|
else
|
146
|
-
@
|
148
|
+
@client_options = client.is_a?(Hash) ? client.dup : {}
|
149
|
+
@client_options[:access_key_id] = access_key_id if access_key_id
|
150
|
+
@client_options[:secret_access_key] = secret_access_key if secret_access_key
|
147
151
|
end
|
148
|
-
@options = args
|
149
152
|
|
150
|
-
@options
|
153
|
+
@options = args
|
154
|
+
@options.merge!(uri.query.transform_keys(&:to_sym)) if uri.query
|
151
155
|
|
152
156
|
super(key)
|
153
157
|
end
|
@@ -187,11 +191,11 @@ module IOStreams
|
|
187
191
|
end
|
188
192
|
|
189
193
|
# Make S3 perform direct copies within S3 itself.
|
190
|
-
def copy_to(target_path, convert: true)
|
191
|
-
return super(target_path) if convert
|
194
|
+
def copy_to(target_path, convert: true, **args)
|
195
|
+
return super(target_path, convert: convert, **args) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
|
192
196
|
|
193
197
|
target = IOStreams.new(target_path)
|
194
|
-
return super(target) unless target.is_a?(self.class)
|
198
|
+
return super(target, convert: convert, **args) unless target.is_a?(self.class)
|
195
199
|
|
196
200
|
source_name = ::File.join(bucket_name, path)
|
197
201
|
client.copy_object(options.merge(bucket: target.bucket_name, key: target.path, copy_source: source_name))
|
@@ -199,11 +203,13 @@ module IOStreams
|
|
199
203
|
end
|
200
204
|
|
201
205
|
# Make S3 perform direct copies within S3 itself.
|
202
|
-
def copy_from(source_path, convert: true)
|
203
|
-
return super(source_path) if convert
|
206
|
+
def copy_from(source_path, convert: true, **args)
|
207
|
+
return super(source_path, convert: true, **args) if convert
|
204
208
|
|
205
209
|
source = IOStreams.new(source_path)
|
206
|
-
|
210
|
+
if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
|
211
|
+
return super(source, convert: convert, **args)
|
212
|
+
end
|
207
213
|
|
208
214
|
source_name = ::File.join(source.bucket_name, source.path)
|
209
215
|
client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))
|
@@ -309,6 +315,11 @@ module IOStreams
|
|
309
315
|
def partial_files_visible?
|
310
316
|
false
|
311
317
|
end
|
318
|
+
|
319
|
+
# Lazy load S3 client since it takes two seconds to create itself!
|
320
|
+
def client
|
321
|
+
@client ||= ::Aws::S3::Client.new(@client_options)
|
322
|
+
end
|
312
323
|
end
|
313
324
|
end
|
314
325
|
end
|
@@ -26,12 +26,13 @@ module IOStreams
|
|
26
26
|
include SemanticLogger::Loggable if defined?(SemanticLogger)
|
27
27
|
|
28
28
|
class << self
|
29
|
-
attr_accessor :sshpass_bin, :sftp_bin, :sshpass_wait_seconds
|
29
|
+
attr_accessor :sshpass_bin, :sftp_bin, :sshpass_wait_seconds, :before_password_wait_seconds
|
30
30
|
end
|
31
31
|
|
32
|
-
@sftp_bin
|
33
|
-
@sshpass_bin
|
34
|
-
@
|
32
|
+
@sftp_bin = "sftp"
|
33
|
+
@sshpass_bin = "sshpass"
|
34
|
+
@before_password_wait_seconds = 2
|
35
|
+
@sshpass_wait_seconds = 5
|
35
36
|
|
36
37
|
attr_reader :hostname, :username, :ssh_options, :url, :port
|
37
38
|
|
@@ -46,9 +47,23 @@ module IOStreams
|
|
46
47
|
# password: [String]
|
47
48
|
# Password for the user.
|
48
49
|
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
50
|
+
# ssh_options: [Hash]
|
51
|
+
# - IdentityKey [String]
|
52
|
+
# The identity key that this client should use to talk to this host.
|
53
|
+
# Under the covers this value is written to a file and then the file name is passed as `IdentityFile`
|
54
|
+
# - HostKey [String]
|
55
|
+
# The expected SSH Host key that is presented by the remote host.
|
56
|
+
# Instead of storing the host key in the `known_hosts` file, it can be supplied explicitly
|
57
|
+
# using this option.
|
58
|
+
# Under the covers this value is written to a file and then the file name is passed as `UserKnownHostsFile`
|
59
|
+
# Notes:
|
60
|
+
# - It must contain the entire line that would be stored in `known_hosts`,
|
61
|
+
# including the hostname, ip address, key type and key value. This value is written as-is into a
|
62
|
+
# "known_hosts" like file and then passed into sftp using the `UserKnownHostsFile` option.
|
63
|
+
# - The easiest way to generate the required value is to use `ssh-keyscan` and then supply that value in this field.
|
64
|
+
# For example: `ssh-keyscan hostname`
|
65
|
+
# - Any other options supported by ssh_config.
|
66
|
+
# `man ssh_config` to see all available options.
|
52
67
|
#
|
53
68
|
# Examples:
|
54
69
|
#
|
@@ -167,33 +182,36 @@ module IOStreams
|
|
167
182
|
def sftp_download(remote_file_name, local_file_name)
|
168
183
|
with_sftp_args do |args|
|
169
184
|
Open3.popen2e(*args) do |writer, reader, waith_thr|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
end
|
184
|
-
|
185
|
-
out
|
186
|
-
rescue Errno::EPIPE
|
187
|
-
out = begin
|
188
|
-
reader.read.chomp
|
189
|
-
rescue StandardError
|
190
|
-
nil
|
191
|
-
end
|
185
|
+
# Give time for remote sftp server to get ready to accept the password.
|
186
|
+
sleep self.class.before_password_wait_seconds
|
187
|
+
|
188
|
+
writer.puts password
|
189
|
+
|
190
|
+
# Give time for password to be processed and stdin to be passed to sftp process.
|
191
|
+
sleep self.class.sshpass_wait_seconds
|
192
|
+
|
193
|
+
writer.puts "get #{remote_file_name} #{local_file_name}"
|
194
|
+
writer.puts "bye"
|
195
|
+
writer.close
|
196
|
+
out = reader.read.chomp
|
197
|
+
unless waith_thr.value.success?
|
192
198
|
raise(
|
193
199
|
Errors::CommunicationsFailure,
|
194
200
|
"Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
195
201
|
)
|
196
202
|
end
|
203
|
+
|
204
|
+
out
|
205
|
+
rescue Errno::EPIPE
|
206
|
+
out = begin
|
207
|
+
reader.read.chomp
|
208
|
+
rescue StandardError
|
209
|
+
nil
|
210
|
+
end
|
211
|
+
raise(
|
212
|
+
Errors::CommunicationsFailure,
|
213
|
+
"Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
214
|
+
)
|
197
215
|
end
|
198
216
|
end
|
199
217
|
end
|
@@ -201,48 +219,64 @@ module IOStreams
|
|
201
219
|
def sftp_upload(local_file_name, remote_file_name)
|
202
220
|
with_sftp_args do |args|
|
203
221
|
Open3.popen2e(*args) do |writer, reader, waith_thr|
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
unless waith_thr.value.success?
|
213
|
-
raise(
|
214
|
-
Errors::CommunicationsFailure,
|
215
|
-
"Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
216
|
-
)
|
217
|
-
end
|
218
|
-
|
219
|
-
out
|
220
|
-
rescue Errno::EPIPE
|
221
|
-
out = begin
|
222
|
-
reader.read.chomp
|
223
|
-
rescue StandardError
|
224
|
-
nil
|
225
|
-
end
|
222
|
+
writer.puts(password) if password
|
223
|
+
# Give time for password to be processed and stdin to be passed to sftp process.
|
224
|
+
sleep self.class.sshpass_wait_seconds
|
225
|
+
writer.puts "put #{local_file_name.inspect} #{remote_file_name.inspect}"
|
226
|
+
writer.puts "bye"
|
227
|
+
writer.close
|
228
|
+
out = reader.read.chomp
|
229
|
+
unless waith_thr.value.success?
|
226
230
|
raise(
|
227
231
|
Errors::CommunicationsFailure,
|
228
232
|
"Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
229
233
|
)
|
230
234
|
end
|
235
|
+
|
236
|
+
out
|
237
|
+
rescue Errno::EPIPE
|
238
|
+
out = begin
|
239
|
+
reader.read.chomp
|
240
|
+
rescue StandardError
|
241
|
+
nil
|
242
|
+
end
|
243
|
+
raise(
|
244
|
+
Errors::CommunicationsFailure,
|
245
|
+
"Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
|
246
|
+
)
|
231
247
|
end
|
232
248
|
end
|
233
249
|
end
|
234
250
|
|
235
251
|
def with_sftp_args
|
236
|
-
return yield sftp_args(ssh_options)
|
252
|
+
return yield sftp_args(ssh_options) if !ssh_options.key?("IdentityKey") && !ssh_options.key?("HostKey")
|
253
|
+
|
254
|
+
with_identity_key(ssh_options.dup) do |options|
|
255
|
+
with_host_key(options) do |options2|
|
256
|
+
yield sftp_args(options2)
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
260
|
+
|
261
|
+
def with_identity_key(options)
|
262
|
+
return yield options unless ssh_options.key?("IdentityKey")
|
263
|
+
|
264
|
+
with_temp_file(options, "IdentityFile", options.delete("IdentityKey")) { yield options }
|
265
|
+
end
|
266
|
+
|
267
|
+
def with_host_key(options)
|
268
|
+
return yield options unless ssh_options.key?("HostKey")
|
269
|
+
|
270
|
+
with_temp_file(options, "UserKnownHostsFile", options.delete("HostKey")) { yield options }
|
271
|
+
end
|
237
272
|
|
273
|
+
def with_temp_file(options, option, value)
|
238
274
|
Utils.temp_file_name("iostreams-sftp-args", "key") do |file_name|
|
239
|
-
options = ssh_options.dup
|
240
|
-
key = options.delete("IdentityKey")
|
241
275
|
# sftp requires that private key is only readable by the current user
|
242
|
-
::File.open(file_name, "wb", 0o600) { |io| io.write(
|
276
|
+
::File.open(file_name, "wb", 0o600) { |io| io.write(value) }
|
243
277
|
|
244
|
-
options[
|
245
|
-
yield
|
278
|
+
options[option] = file_name
|
279
|
+
yield options
|
246
280
|
end
|
247
281
|
end
|
248
282
|
|
@@ -272,8 +306,8 @@ module IOStreams
|
|
272
306
|
|
273
307
|
def build_ssh_options
|
274
308
|
options = ssh_options.dup
|
275
|
-
options[:logger]
|
276
|
-
options[:port]
|
309
|
+
options[:logger] ||= logger if defined?(SemanticLogger)
|
310
|
+
options[:port] ||= port
|
277
311
|
options[:max_pkt_size] ||= 65_536
|
278
312
|
options[:password] ||= @password
|
279
313
|
options
|