iostreams 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bef30882ae2eaebcfbc240858a6854f44cc54208465868e55fd6296addfc8015
4
- data.tar.gz: e9984f367727934b96495a646aa271009681f9972568a0b51e9046edf451e321
3
+ metadata.gz: 849ceda63eb30f95762a7c985cd215d424e62afd68ab20776e8d16c188dd6aed
4
+ data.tar.gz: 8e26af86c40bb673ce36855a7fb30d1c4b401edc3eac0b27a71b9760cfe865dd
5
5
  SHA512:
6
- metadata.gz: 4c30cb2085ce36904551bcfce2d0ec1d4544d79af891bf27660c35daa4081e1f2edaf193332ca7f15ed8324f4aaddef991a906b896d502b1574c53531f27b4d5
7
- data.tar.gz: 6066bb59b519568f99121d97ec0ba2637fab650889cc898a788f71d1dedcbce7167fa435206656d3ce607bb141eee5507501825afebbbd7e37e9c2c27c915f0f
6
+ metadata.gz: 99318c4c64e0133df57b84429b1c2f9caa064abb1405ace5d55208e41b6bf8bb8fa83a75db8ae46d53753f10d566bab53971d95871ed4011bab4571d31bebe8a
7
+ data.tar.gz: bfba3a033c753e3fe05f798177b8f3c7ee8f566eabaf9223984fa902b288cb3515154f621a4e97f75b6c5bc31da88f284390c2d82c5015d7682ecf08c2a671d3
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
1
  # IOStreams
2
- [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Build Status](https://travis-ci.org/rocketjob/iostreams.svg?branch=master)](https://travis-ci.org/rocketjob/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
2
+ [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg)
3
3
 
4
4
  IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
5
5
  or storage mechanism transparent to the application.
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
14
14
 
15
15
  Next, check out the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
16
16
 
17
+ ## Upgrading to v1.6
18
+
19
+ The old, deprecated APIs are no longer loaded by default as of v1.6. To add back the deprecated API support, add
20
+ the following line to your code:
21
+
22
+ ~~~ruby
23
+ IOStreams.include(IOStreams::Deprecated)
24
+ ~~~
25
+
26
+ It is important to move any usage of the old deprecated APIs over to the new API, since they will be removed in a future
27
+ release.
28
+
17
29
  ## Versioning
18
30
 
19
31
  This project adheres to [Semantic Versioning](http://semver.org/).
@@ -1,13 +1,15 @@
1
1
  module IOStreams
2
2
  # Build the streams that need to be applied to a path during reading or writing.
3
3
  class Builder
4
- attr_accessor :file_name
4
+ attr_accessor :file_name, :format_options
5
5
  attr_reader :streams, :options
6
6
 
7
7
  def initialize(file_name = nil)
8
- @file_name = file_name
9
- @streams = nil
10
- @options = nil
8
+ @file_name = file_name
9
+ @streams = nil
10
+ @options = nil
11
+ @format = nil
12
+ @format_option = nil
11
13
  end
12
14
 
13
15
  # Supply an option that is only applied once the file name extensions have been parsed.
@@ -88,6 +90,20 @@ module IOStreams
88
90
  built_streams.freeze
89
91
  end
90
92
 
93
+ # Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
94
+ # Returns [nil] if no format is set, or if it cannot be determined from the file_name
95
+ def format
96
+ @format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
97
+ end
98
+
99
+ def format=(format)
100
+ unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
101
+ raise(ArgumentError, "Invalid format: #{format.inspect}")
102
+ end
103
+
104
+ @format = format
105
+ end
106
+
91
107
  private
92
108
 
93
109
  def class_for_stream(type, stream)
@@ -9,6 +9,9 @@ module IOStreams
9
9
  class MissingHeader < Error
10
10
  end
11
11
 
12
+ class UnknownFormat < Error
13
+ end
14
+
12
15
  class TypeMismatch < Error
13
16
  end
14
17
 
@@ -26,6 +29,15 @@ module IOStreams
26
29
  class ValueTooLong < Error
27
30
  end
28
31
 
32
+ class MalformedDataError < RuntimeError
33
+ attr_reader :line_number
34
+
35
+ def initialize(message, line_number)
36
+ @line_number = line_number
37
+ super("#{message} on line #{line_number}.")
38
+ end
39
+ end
40
+
29
41
  class InvalidLayout < Error
30
42
  end
31
43
  end
@@ -13,8 +13,6 @@ require "uri"
13
13
  # .zip.enc [ :zip, :enc ]
14
14
  # .gz.enc [ :gz, :enc ]
15
15
  module IOStreams
16
- include Deprecated
17
-
18
16
  # Returns [Path] instance for the supplied complete path with optional scheme.
19
17
  #
20
18
  # Example:
@@ -38,12 +38,12 @@ module IOStreams
38
38
  # Size of blocks to read from the input stream at a time.
39
39
  # Default: 65536 ( 64K )
40
40
  #
41
- # TODO:
42
- # - Handle embedded line feeds when reading csv files.
43
- # - Skip Comment lines. RegExp?
44
- # - Skip "empty" / "blank" lines. RegExp?
45
- # - Extract header line(s) / first non-comment, non-blank line
46
- # - Embedded newline support, RegExp? or Proc?
41
+ # embedded_within: [String]
42
+ # Supports CSV files where a line may contain an embedded newline.
43
+ # For CSV files set `embedded_within: '"'`
44
+ #
45
+ # Note:
46
+ # * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
47
47
  def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
48
48
  super(input_stream)
49
49
 
@@ -86,17 +86,29 @@ module IOStreams
86
86
  line_count
87
87
  end
88
88
 
89
- # Reads each line per the @delimeter. It will account for embedded lines provided they are within double quotes.
90
- # The embedded_within argument is set in IOStreams::LineReader
89
+ # Reads each line per the `delimiter`.
90
+ # Accounts for lines that contain the `delimiter` when the `delimiter` appears between a pair of `embedded_within` characters.
91
+ # For Example, CSV files can contain newlines embedded within double quotes.
91
92
  def readline
92
93
  line = _readline
93
94
  if line && @embedded_within
94
95
  initial_line_number = @line_number
95
96
  while line.count(@embedded_within).odd?
96
- raise "Unclosed quoted field on line #{initial_line_number}" if eof? || line.length > @buffer_size * 10
97
-
97
+ if eof? || line.length > @buffer_size * 10
98
+ raise(Errors::MalformedDataError.new(
99
+ "Unbalanced delimited field, delimiter: #{@embedded_within}",
100
+ initial_line_number
101
+ ))
102
+ end
98
103
  line << @delimiter
99
- line << _readline
104
+ next_line = _readline
105
+ if next_line.nil?
106
+ raise(Errors::MalformedDataError.new(
107
+ "Unbalanced delimited field, delimiter: #{@embedded_within}",
108
+ initial_line_number
109
+ ))
110
+ end
111
+ line << next_line
100
112
  end
101
113
  end
102
114
  line
@@ -153,7 +153,7 @@ module IOStreams
153
153
  # Returns [true|false] whether the file is compressed based on its file extensions.
154
154
  def compressed?
155
155
  # TODO: Look at streams?
156
- !(path =~ /\.(zip|gz|gzip|xls.|)\z/i).nil?
156
+ !(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
157
157
  end
158
158
 
159
159
  # Returns [true|false] whether the file is encrypted based on its file extensions.
@@ -3,7 +3,10 @@ require "uri"
3
3
  module IOStreams
4
4
  module Paths
5
5
  class S3 < IOStreams::Path
6
- attr_reader :bucket_name, :client, :options
6
+ attr_reader :bucket_name, :options
7
+
8
+ # Largest file size supported by the S3 copy object api.
9
+ S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
7
10
 
8
11
  # Arguments:
9
12
  #
@@ -138,16 +141,17 @@ module IOStreams
138
141
 
139
142
  @bucket_name = uri.hostname
140
143
  key = uri.path.sub(%r{\A/}, "")
141
- if client.is_a?(Hash)
142
- client[:access_key_id] = access_key_id if access_key_id
143
- client[:secret_access_key] = secret_access_key if secret_access_key
144
- @client = ::Aws::S3::Client.new(client)
144
+
145
+ if client && !client.is_a?(Hash)
146
+ @client = client
145
147
  else
146
- @client = client || ::Aws::S3::Client.new(access_key_id: access_key_id, secret_access_key: secret_access_key)
148
+ @client_options = client.is_a?(Hash) ? client.dup : {}
149
+ @client_options[:access_key_id] = access_key_id if access_key_id
150
+ @client_options[:secret_access_key] = secret_access_key if secret_access_key
147
151
  end
148
- @options = args
149
152
 
150
- @options.merge(uri.query) if uri.query
153
+ @options = args
154
+ @options.merge!(uri.query.transform_keys(&:to_sym)) if uri.query
151
155
 
152
156
  super(key)
153
157
  end
@@ -187,11 +191,11 @@ module IOStreams
187
191
  end
188
192
 
189
193
  # Make S3 perform direct copies within S3 itself.
190
- def copy_to(target_path, convert: true)
191
- return super(target_path) if convert
194
+ def copy_to(target_path, convert: true, **args)
195
+ return super(target_path, convert: convert, **args) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
192
196
 
193
197
  target = IOStreams.new(target_path)
194
- return super(target) unless target.is_a?(self.class)
198
+ return super(target, convert: convert, **args) unless target.is_a?(self.class)
195
199
 
196
200
  source_name = ::File.join(bucket_name, path)
197
201
  client.copy_object(options.merge(bucket: target.bucket_name, key: target.path, copy_source: source_name))
@@ -199,11 +203,13 @@ module IOStreams
199
203
  end
200
204
 
201
205
  # Make S3 perform direct copies within S3 itself.
202
- def copy_from(source_path, convert: true)
203
- return super(source_path) if convert
206
+ def copy_from(source_path, convert: true, **args)
207
+ return super(source_path, convert: true, **args) if convert
204
208
 
205
209
  source = IOStreams.new(source_path)
206
- return super(source, **args) unless source.is_a?(self.class)
210
+ if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
211
+ return super(source, convert: convert, **args)
212
+ end
207
213
 
208
214
  source_name = ::File.join(source.bucket_name, source.path)
209
215
  client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))
@@ -309,6 +315,11 @@ module IOStreams
309
315
  def partial_files_visible?
310
316
  false
311
317
  end
318
+
319
+ # Lazy load S3 client since it takes two seconds to create itself!
320
+ def client
321
+ @client ||= ::Aws::S3::Client.new(@client_options)
322
+ end
312
323
  end
313
324
  end
314
325
  end
@@ -26,12 +26,13 @@ module IOStreams
26
26
  include SemanticLogger::Loggable if defined?(SemanticLogger)
27
27
 
28
28
  class << self
29
- attr_accessor :sshpass_bin, :sftp_bin, :sshpass_wait_seconds
29
+ attr_accessor :sshpass_bin, :sftp_bin, :sshpass_wait_seconds, :before_password_wait_seconds
30
30
  end
31
31
 
32
- @sftp_bin = "sftp"
33
- @sshpass_bin = "sshpass"
34
- @sshpass_wait_seconds = 5
32
+ @sftp_bin = "sftp"
33
+ @sshpass_bin = "sshpass"
34
+ @before_password_wait_seconds = 2
35
+ @sshpass_wait_seconds = 5
35
36
 
36
37
  attr_reader :hostname, :username, :ssh_options, :url, :port
37
38
 
@@ -46,9 +47,23 @@ module IOStreams
46
47
  # password: [String]
47
48
  # Password for the user.
48
49
  #
49
- # **ssh_options
50
- # Any other options supported by ssh_config.
51
- # `man ssh_config` to see all available options.
50
+ # ssh_options: [Hash]
51
+ # - IdentityKey [String]
52
+ # The identity key that this client should use to talk to this host.
53
+ # Under the covers this value is written to a file and then the file name is passed as `IdentityFile`
54
+ # - HostKey [String]
55
+ # The expected SSH Host key that is presented by the remote host.
56
+ # Instead of storing the host key in the `known_hosts` file, it can be supplied explicitly
57
+ # using this option.
58
+ # Under the covers this value is written to a file and then the file name is passed as `UserKnownHostsFile`
59
+ # Notes:
60
+ # - It must contain the entire line that would be stored in `known_hosts`,
61
+ # including the hostname, ip address, key type and key value. This value is written as-is into a
62
+ # "known_hosts" like file and then passed into sftp using the `UserKnownHostsFile` option.
63
+ # - The easiest way to generate the required value is to use `ssh-keyscan` and then supply that value in this field.
64
+ # For example: `ssh-keyscan hostname`
65
+ # - Any other options supported by ssh_config.
66
+ # `man ssh_config` to see all available options.
52
67
  #
53
68
  # Examples:
54
69
  #
@@ -167,33 +182,36 @@ module IOStreams
167
182
  def sftp_download(remote_file_name, local_file_name)
168
183
  with_sftp_args do |args|
169
184
  Open3.popen2e(*args) do |writer, reader, waith_thr|
170
- begin
171
- writer.puts password
172
- # Give time for password to be processed and stdin to be passed to sftp process.
173
- sleep self.class.sshpass_wait_seconds
174
- writer.puts "get #{remote_file_name} #{local_file_name}"
175
- writer.puts "bye"
176
- writer.close
177
- out = reader.read.chomp
178
- unless waith_thr.value.success?
179
- raise(
180
- Errors::CommunicationsFailure,
181
- "Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
182
- )
183
- end
184
-
185
- out
186
- rescue Errno::EPIPE
187
- out = begin
188
- reader.read.chomp
189
- rescue StandardError
190
- nil
191
- end
185
+ # Give time for remote sftp server to get ready to accept the password.
186
+ sleep self.class.before_password_wait_seconds
187
+
188
+ writer.puts password
189
+
190
+ # Give time for password to be processed and stdin to be passed to sftp process.
191
+ sleep self.class.sshpass_wait_seconds
192
+
193
+ writer.puts "get #{remote_file_name} #{local_file_name}"
194
+ writer.puts "bye"
195
+ writer.close
196
+ out = reader.read.chomp
197
+ unless waith_thr.value.success?
192
198
  raise(
193
199
  Errors::CommunicationsFailure,
194
200
  "Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
195
201
  )
196
202
  end
203
+
204
+ out
205
+ rescue Errno::EPIPE
206
+ out = begin
207
+ reader.read.chomp
208
+ rescue StandardError
209
+ nil
210
+ end
211
+ raise(
212
+ Errors::CommunicationsFailure,
213
+ "Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
214
+ )
197
215
  end
198
216
  end
199
217
  end
@@ -201,48 +219,64 @@ module IOStreams
201
219
  def sftp_upload(local_file_name, remote_file_name)
202
220
  with_sftp_args do |args|
203
221
  Open3.popen2e(*args) do |writer, reader, waith_thr|
204
- begin
205
- writer.puts(password) if password
206
- # Give time for password to be processed and stdin to be passed to sftp process.
207
- sleep self.class.sshpass_wait_seconds
208
- writer.puts "put #{local_file_name.inspect} #{remote_file_name.inspect}"
209
- writer.puts "bye"
210
- writer.close
211
- out = reader.read.chomp
212
- unless waith_thr.value.success?
213
- raise(
214
- Errors::CommunicationsFailure,
215
- "Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
216
- )
217
- end
218
-
219
- out
220
- rescue Errno::EPIPE
221
- out = begin
222
- reader.read.chomp
223
- rescue StandardError
224
- nil
225
- end
222
+ writer.puts(password) if password
223
+ # Give time for password to be processed and stdin to be passed to sftp process.
224
+ sleep self.class.sshpass_wait_seconds
225
+ writer.puts "put #{local_file_name.inspect} #{remote_file_name.inspect}"
226
+ writer.puts "bye"
227
+ writer.close
228
+ out = reader.read.chomp
229
+ unless waith_thr.value.success?
226
230
  raise(
227
231
  Errors::CommunicationsFailure,
228
232
  "Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
229
233
  )
230
234
  end
235
+
236
+ out
237
+ rescue Errno::EPIPE
238
+ out = begin
239
+ reader.read.chomp
240
+ rescue StandardError
241
+ nil
242
+ end
243
+ raise(
244
+ Errors::CommunicationsFailure,
245
+ "Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
246
+ )
231
247
  end
232
248
  end
233
249
  end
234
250
 
235
251
  def with_sftp_args
236
- return yield sftp_args(ssh_options) unless ssh_options.key?("IdentityKey")
252
+ return yield sftp_args(ssh_options) if !ssh_options.key?("IdentityKey") && !ssh_options.key?("HostKey")
253
+
254
+ with_identity_key(ssh_options.dup) do |options|
255
+ with_host_key(options) do |options2|
256
+ yield sftp_args(options2)
257
+ end
258
+ end
259
+ end
260
+
261
+ def with_identity_key(options)
262
+ return yield options unless ssh_options.key?("IdentityKey")
263
+
264
+ with_temp_file(options, "IdentityFile", options.delete("IdentityKey")) { yield options }
265
+ end
266
+
267
+ def with_host_key(options)
268
+ return yield options unless ssh_options.key?("HostKey")
269
+
270
+ with_temp_file(options, "UserKnownHostsFile", options.delete("HostKey")) { yield options }
271
+ end
237
272
 
273
+ def with_temp_file(options, option, value)
238
274
  Utils.temp_file_name("iostreams-sftp-args", "key") do |file_name|
239
- options = ssh_options.dup
240
- key = options.delete("IdentityKey")
241
275
  # sftp requires that private key is only readable by the current user
242
- ::File.open(file_name, "wb", 0o600) { |io| io.write(key) }
276
+ ::File.open(file_name, "wb", 0o600) { |io| io.write(value) }
243
277
 
244
- options["IdentityFile"] = file_name
245
- yield sftp_args(options)
278
+ options[option] = file_name
279
+ yield options
246
280
  end
247
281
  end
248
282
 
@@ -272,8 +306,8 @@ module IOStreams
272
306
 
273
307
  def build_ssh_options
274
308
  options = ssh_options.dup
275
- options[:logger] ||= logger if defined?(SemanticLogger)
276
- options[:port] ||= port
309
+ options[:logger] ||= logger if defined?(SemanticLogger)
310
+ options[:port] ||= port
277
311
  options[:max_pkt_size] ||= 65_536
278
312
  options[:password] ||= @password
279
313
  options