iostreams 1.5.0 → 1.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bef30882ae2eaebcfbc240858a6854f44cc54208465868e55fd6296addfc8015
4
- data.tar.gz: e9984f367727934b96495a646aa271009681f9972568a0b51e9046edf451e321
3
+ metadata.gz: 849ceda63eb30f95762a7c985cd215d424e62afd68ab20776e8d16c188dd6aed
4
+ data.tar.gz: 8e26af86c40bb673ce36855a7fb30d1c4b401edc3eac0b27a71b9760cfe865dd
5
5
  SHA512:
6
- metadata.gz: 4c30cb2085ce36904551bcfce2d0ec1d4544d79af891bf27660c35daa4081e1f2edaf193332ca7f15ed8324f4aaddef991a906b896d502b1574c53531f27b4d5
7
- data.tar.gz: 6066bb59b519568f99121d97ec0ba2637fab650889cc898a788f71d1dedcbce7167fa435206656d3ce607bb141eee5507501825afebbbd7e37e9c2c27c915f0f
6
+ metadata.gz: 99318c4c64e0133df57b84429b1c2f9caa064abb1405ace5d55208e41b6bf8bb8fa83a75db8ae46d53753f10d566bab53971d95871ed4011bab4571d31bebe8a
7
+ data.tar.gz: bfba3a033c753e3fe05f798177b8f3c7ee8f566eabaf9223984fa902b288cb3515154f621a4e97f75b6c5bc31da88f284390c2d82c5015d7682ecf08c2a671d3
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
1
  # IOStreams
2
- [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Build Status](https://travis-ci.org/rocketjob/iostreams.svg?branch=master)](https://travis-ci.org/rocketjob/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg) [![Gitter chat](https://img.shields.io/badge/IRC%20(gitter)-Support-brightgreen.svg)](https://gitter.im/rocketjob/support)
2
+ [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg)
3
3
 
4
4
  IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
5
5
  or storage mechanism transparent to the application.
@@ -14,6 +14,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
14
14
 
15
15
  Next, checkout the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
16
16
 
17
+ ## Upgrading to v1.6
18
+
19
+ The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
20
+ the following line to your code:
21
+
22
+ ~~~ruby
23
+ IOStreams.include(IOStreams::Deprecated)
24
+ ~~~
25
+
26
+ It is important to migrate any use of the old deprecated APIs to the new API, since they will be removed in a future
27
+ release.
28
+
17
29
  ## Versioning
18
30
 
19
31
  This project adheres to [Semantic Versioning](http://semver.org/).
@@ -1,13 +1,15 @@
1
1
  module IOStreams
2
2
  # Build the streams that need to be applied to a path during reading or writing.
3
3
  class Builder
4
- attr_accessor :file_name
4
+ attr_accessor :file_name, :format_options
5
5
  attr_reader :streams, :options
6
6
 
7
7
  def initialize(file_name = nil)
8
- @file_name = file_name
9
- @streams = nil
10
- @options = nil
8
+ @file_name = file_name
9
+ @streams = nil
10
+ @options = nil
11
+ @format = nil
12
+ @format_option = nil
11
13
  end
12
14
 
13
15
  # Supply an option that is only applied once the file name extensions have been parsed.
@@ -88,6 +90,20 @@ module IOStreams
88
90
  built_streams.freeze
89
91
  end
90
92
 
93
+ # Returns the tabular format if set, otherwise tries to autodetect the format if the file_name has been set
94
+ # Returns [nil] if no format is set, or if it cannot be determined from the file_name
95
+ def format
96
+ @format ||= file_name ? Tabular.format_from_file_name(file_name) : nil
97
+ end
98
+
99
+ def format=(format)
100
+ unless format.nil? || IOStreams::Tabular.registered_formats.include?(format)
101
+ raise(ArgumentError, "Invalid format: #{format.inspect}")
102
+ end
103
+
104
+ @format = format
105
+ end
106
+
91
107
  private
92
108
 
93
109
  def class_for_stream(type, stream)
@@ -9,6 +9,9 @@ module IOStreams
9
9
  class MissingHeader < Error
10
10
  end
11
11
 
12
+ class UnknownFormat < Error
13
+ end
14
+
12
15
  class TypeMismatch < Error
13
16
  end
14
17
 
@@ -26,6 +29,15 @@ module IOStreams
26
29
  class ValueTooLong < Error
27
30
  end
28
31
 
32
+ class MalformedDataError < RuntimeError
33
+ attr_reader :line_number
34
+
35
+ def initialize(message, line_number)
36
+ @line_number = line_number
37
+ super("#{message} on line #{line_number}.")
38
+ end
39
+ end
40
+
29
41
  class InvalidLayout < Error
30
42
  end
31
43
  end
@@ -13,8 +13,6 @@ require "uri"
13
13
  # .zip.enc [ :zip, :enc ]
14
14
  # .gz.enc [ :gz, :enc ]
15
15
  module IOStreams
16
- include Deprecated
17
-
18
16
  # Returns [Path] instance for the supplied complete path with optional scheme.
19
17
  #
20
18
  # Example:
@@ -38,12 +38,12 @@ module IOStreams
38
38
  # Size of blocks to read from the input stream at a time.
39
39
  # Default: 65536 ( 64K )
40
40
  #
41
- # TODO:
42
- # - Handle embedded line feeds when reading csv files.
43
- # - Skip Comment lines. RegExp?
44
- # - Skip "empty" / "blank" lines. RegExp?
45
- # - Extract header line(s) / first non-comment, non-blank line
46
- # - Embedded newline support, RegExp? or Proc?
41
+ # embedded_within: [String]
42
+ # Supports CSV files where a line may contain an embedded newline.
43
+ # For CSV files set `embedded_within: '"'`
44
+ #
45
+ # Note:
46
+ # * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
47
47
  def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
48
48
  super(input_stream)
49
49
 
@@ -86,17 +86,29 @@ module IOStreams
86
86
  line_count
87
87
  end
88
88
 
89
- # Reads each line per the @delimeter. It will account for embedded lines provided they are within double quotes.
90
- # The embedded_within argument is set in IOStreams::LineReader
89
+ # Reads each line per the `delimiter`.
90
+ # Accounts for lines that contain the `delimiter` when it is enclosed within the `embedded_within` character.
91
+ # For Example, CSV files can contain newlines embedded within double quotes.
91
92
  def readline
92
93
  line = _readline
93
94
  if line && @embedded_within
94
95
  initial_line_number = @line_number
95
96
  while line.count(@embedded_within).odd?
96
- raise "Unclosed quoted field on line #{initial_line_number}" if eof? || line.length > @buffer_size * 10
97
-
97
+ if eof? || line.length > @buffer_size * 10
98
+ raise(Errors::MalformedDataError.new(
99
+ "Unbalanced delimited field, delimiter: #{@embedded_within}",
100
+ initial_line_number
101
+ ))
102
+ end
98
103
  line << @delimiter
99
- line << _readline
104
+ next_line = _readline
105
+ if next_line.nil?
106
+ raise(Errors::MalformedDataError.new(
107
+ "Unbalanced delimited field, delimiter: #{@embedded_within}",
108
+ initial_line_number
109
+ ))
110
+ end
111
+ line << next_line
100
112
  end
101
113
  end
102
114
  line
@@ -153,7 +153,7 @@ module IOStreams
153
153
  # Returns [true|false] whether the file is compressed based on its file extensions.
154
154
  def compressed?
155
155
  # TODO: Look at streams?
156
- !(path =~ /\.(zip|gz|gzip|xls.|)\z/i).nil?
156
+ !(path =~ /\.(zip|gz|gzip|xlsx|xlsm|bz2)\z/i).nil?
157
157
  end
158
158
 
159
159
  # Returns [true|false] whether the file is encrypted based on its file extensions.
@@ -3,7 +3,10 @@ require "uri"
3
3
  module IOStreams
4
4
  module Paths
5
5
  class S3 < IOStreams::Path
6
- attr_reader :bucket_name, :client, :options
6
+ attr_reader :bucket_name, :options
7
+
8
+ # Largest file size supported by the S3 copy object api.
9
+ S3_COPY_OBJECT_SIZE_LIMIT = 5 * 1024 * 1024 * 1024
7
10
 
8
11
  # Arguments:
9
12
  #
@@ -138,16 +141,17 @@ module IOStreams
138
141
 
139
142
  @bucket_name = uri.hostname
140
143
  key = uri.path.sub(%r{\A/}, "")
141
- if client.is_a?(Hash)
142
- client[:access_key_id] = access_key_id if access_key_id
143
- client[:secret_access_key] = secret_access_key if secret_access_key
144
- @client = ::Aws::S3::Client.new(client)
144
+
145
+ if client && !client.is_a?(Hash)
146
+ @client = client
145
147
  else
146
- @client = client || ::Aws::S3::Client.new(access_key_id: access_key_id, secret_access_key: secret_access_key)
148
+ @client_options = client.is_a?(Hash) ? client.dup : {}
149
+ @client_options[:access_key_id] = access_key_id if access_key_id
150
+ @client_options[:secret_access_key] = secret_access_key if secret_access_key
147
151
  end
148
- @options = args
149
152
 
150
- @options.merge(uri.query) if uri.query
153
+ @options = args
154
+ @options.merge!(uri.query.transform_keys(&:to_sym)) if uri.query
151
155
 
152
156
  super(key)
153
157
  end
@@ -187,11 +191,11 @@ module IOStreams
187
191
  end
188
192
 
189
193
  # Make S3 perform direct copies within S3 itself.
190
- def copy_to(target_path, convert: true)
191
- return super(target_path) if convert
194
+ def copy_to(target_path, convert: true, **args)
195
+ return super(target_path, convert: convert, **args) if convert || (size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
192
196
 
193
197
  target = IOStreams.new(target_path)
194
- return super(target) unless target.is_a?(self.class)
198
+ return super(target, convert: convert, **args) unless target.is_a?(self.class)
195
199
 
196
200
  source_name = ::File.join(bucket_name, path)
197
201
  client.copy_object(options.merge(bucket: target.bucket_name, key: target.path, copy_source: source_name))
@@ -199,11 +203,13 @@ module IOStreams
199
203
  end
200
204
 
201
205
  # Make S3 perform direct copies within S3 itself.
202
- def copy_from(source_path, convert: true)
203
- return super(source_path) if convert
206
+ def copy_from(source_path, convert: true, **args)
207
+ return super(source_path, convert: true, **args) if convert
204
208
 
205
209
  source = IOStreams.new(source_path)
206
- return super(source, **args) unless source.is_a?(self.class)
210
+ if !source.is_a?(self.class) || (source.size.to_i >= S3_COPY_OBJECT_SIZE_LIMIT)
211
+ return super(source, convert: convert, **args)
212
+ end
207
213
 
208
214
  source_name = ::File.join(source.bucket_name, source.path)
209
215
  client.copy_object(options.merge(bucket: bucket_name, key: path, copy_source: source_name))
@@ -309,6 +315,11 @@ module IOStreams
309
315
  def partial_files_visible?
310
316
  false
311
317
  end
318
+
319
+ # Lazy load S3 client since it takes two seconds to create itself!
320
+ def client
321
+ @client ||= ::Aws::S3::Client.new(@client_options)
322
+ end
312
323
  end
313
324
  end
314
325
  end
@@ -26,12 +26,13 @@ module IOStreams
26
26
  include SemanticLogger::Loggable if defined?(SemanticLogger)
27
27
 
28
28
  class << self
29
- attr_accessor :sshpass_bin, :sftp_bin, :sshpass_wait_seconds
29
+ attr_accessor :sshpass_bin, :sftp_bin, :sshpass_wait_seconds, :before_password_wait_seconds
30
30
  end
31
31
 
32
- @sftp_bin = "sftp"
33
- @sshpass_bin = "sshpass"
34
- @sshpass_wait_seconds = 5
32
+ @sftp_bin = "sftp"
33
+ @sshpass_bin = "sshpass"
34
+ @before_password_wait_seconds = 2
35
+ @sshpass_wait_seconds = 5
35
36
 
36
37
  attr_reader :hostname, :username, :ssh_options, :url, :port
37
38
 
@@ -46,9 +47,23 @@ module IOStreams
46
47
  # password: [String]
47
48
  # Password for the user.
48
49
  #
49
- # **ssh_options
50
- # Any other options supported by ssh_config.
51
- # `man ssh_config` to see all available options.
50
+ # ssh_options: [Hash]
51
+ # - IdentityKey [String]
52
+ # The identity key that this client should use to talk to this host.
53
+ # Under the covers this value is written to a file and then the file name is passed as `IdentityFile`
54
+ # - HostKey [String]
55
+ # The expected SSH Host key that is presented by the remote host.
56
+ # Instead of storing the host key in the `known_hosts` file, it can be supplied explicitly
57
+ # using this option.
58
+ # Under the covers this value is written to a file and then the file name is passed as `UserKnownHostsFile`
59
+ # Notes:
60
+ # - It must contain the entire line that would be stored in `known_hosts`,
61
+ # including the hostname, ip address, key type and key value. This value is written as-is into a
62
+ # "known_hosts" like file and then passed into sftp using the `UserKnownHostsFile` option.
63
+ # - The easiest way to generate the required value is to run `ssh-keyscan` and then supply its output in this field.
64
+ # For example: `ssh-keyscan hostname`
65
+ # - Any other options supported by ssh_config.
66
+ # `man ssh_config` to see all available options.
52
67
  #
53
68
  # Examples:
54
69
  #
@@ -167,33 +182,36 @@ module IOStreams
167
182
  def sftp_download(remote_file_name, local_file_name)
168
183
  with_sftp_args do |args|
169
184
  Open3.popen2e(*args) do |writer, reader, waith_thr|
170
- begin
171
- writer.puts password
172
- # Give time for password to be processed and stdin to be passed to sftp process.
173
- sleep self.class.sshpass_wait_seconds
174
- writer.puts "get #{remote_file_name} #{local_file_name}"
175
- writer.puts "bye"
176
- writer.close
177
- out = reader.read.chomp
178
- unless waith_thr.value.success?
179
- raise(
180
- Errors::CommunicationsFailure,
181
- "Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
182
- )
183
- end
184
-
185
- out
186
- rescue Errno::EPIPE
187
- out = begin
188
- reader.read.chomp
189
- rescue StandardError
190
- nil
191
- end
185
+ # Give time for remote sftp server to get ready to accept the password.
186
+ sleep self.class.before_password_wait_seconds
187
+
188
+ writer.puts password
189
+
190
+ # Give time for password to be processed and stdin to be passed to sftp process.
191
+ sleep self.class.sshpass_wait_seconds
192
+
193
+ writer.puts "get #{remote_file_name} #{local_file_name}"
194
+ writer.puts "bye"
195
+ writer.close
196
+ out = reader.read.chomp
197
+ unless waith_thr.value.success?
192
198
  raise(
193
199
  Errors::CommunicationsFailure,
194
200
  "Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
195
201
  )
196
202
  end
203
+
204
+ out
205
+ rescue Errno::EPIPE
206
+ out = begin
207
+ reader.read.chomp
208
+ rescue StandardError
209
+ nil
210
+ end
211
+ raise(
212
+ Errors::CommunicationsFailure,
213
+ "Download failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
214
+ )
197
215
  end
198
216
  end
199
217
  end
@@ -201,48 +219,64 @@ module IOStreams
201
219
  def sftp_upload(local_file_name, remote_file_name)
202
220
  with_sftp_args do |args|
203
221
  Open3.popen2e(*args) do |writer, reader, waith_thr|
204
- begin
205
- writer.puts(password) if password
206
- # Give time for password to be processed and stdin to be passed to sftp process.
207
- sleep self.class.sshpass_wait_seconds
208
- writer.puts "put #{local_file_name.inspect} #{remote_file_name.inspect}"
209
- writer.puts "bye"
210
- writer.close
211
- out = reader.read.chomp
212
- unless waith_thr.value.success?
213
- raise(
214
- Errors::CommunicationsFailure,
215
- "Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
216
- )
217
- end
218
-
219
- out
220
- rescue Errno::EPIPE
221
- out = begin
222
- reader.read.chomp
223
- rescue StandardError
224
- nil
225
- end
222
+ writer.puts(password) if password
223
+ # Give time for password to be processed and stdin to be passed to sftp process.
224
+ sleep self.class.sshpass_wait_seconds
225
+ writer.puts "put #{local_file_name.inspect} #{remote_file_name.inspect}"
226
+ writer.puts "bye"
227
+ writer.close
228
+ out = reader.read.chomp
229
+ unless waith_thr.value.success?
226
230
  raise(
227
231
  Errors::CommunicationsFailure,
228
232
  "Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
229
233
  )
230
234
  end
235
+
236
+ out
237
+ rescue Errno::EPIPE
238
+ out = begin
239
+ reader.read.chomp
240
+ rescue StandardError
241
+ nil
242
+ end
243
+ raise(
244
+ Errors::CommunicationsFailure,
245
+ "Upload failed calling #{self.class.sftp_bin} via #{self.class.sshpass_bin}: #{out}"
246
+ )
231
247
  end
232
248
  end
233
249
  end
234
250
 
235
251
  def with_sftp_args
236
- return yield sftp_args(ssh_options) unless ssh_options.key?("IdentityKey")
252
+ return yield sftp_args(ssh_options) if !ssh_options.key?("IdentityKey") && !ssh_options.key?("HostKey")
253
+
254
+ with_identity_key(ssh_options.dup) do |options|
255
+ with_host_key(options) do |options2|
256
+ yield sftp_args(options2)
257
+ end
258
+ end
259
+ end
260
+
261
+ def with_identity_key(options)
262
+ return yield options unless ssh_options.key?("IdentityKey")
263
+
264
+ with_temp_file(options, "IdentityFile", options.delete("IdentityKey")) { yield options }
265
+ end
266
+
267
+ def with_host_key(options)
268
+ return yield options unless ssh_options.key?("HostKey")
269
+
270
+ with_temp_file(options, "UserKnownHostsFile", options.delete("HostKey")) { yield options }
271
+ end
237
272
 
273
+ def with_temp_file(options, option, value)
238
274
  Utils.temp_file_name("iostreams-sftp-args", "key") do |file_name|
239
- options = ssh_options.dup
240
- key = options.delete("IdentityKey")
241
275
  # sftp requires that private key is only readable by the current user
242
- ::File.open(file_name, "wb", 0o600) { |io| io.write(key) }
276
+ ::File.open(file_name, "wb", 0o600) { |io| io.write(value) }
243
277
 
244
- options["IdentityFile"] = file_name
245
- yield sftp_args(options)
278
+ options[option] = file_name
279
+ yield options
246
280
  end
247
281
  end
248
282
 
@@ -272,8 +306,8 @@ module IOStreams
272
306
 
273
307
  def build_ssh_options
274
308
  options = ssh_options.dup
275
- options[:logger] ||= logger if defined?(SemanticLogger)
276
- options[:port] ||= port
309
+ options[:logger] ||= logger if defined?(SemanticLogger)
310
+ options[:port] ||= port
277
311
  options[:max_pkt_size] ||= 65_536
278
312
  options[:password] ||= @password
279
313
  options