iostreams 1.10.3 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. checksums.yaml +4 -4
  2. data/README.md +20 -2
  3. data/Rakefile +7 -0
  4. data/lib/io_streams/builder.rb +10 -10
  5. data/lib/io_streams/bzip2/writer.rb +1 -1
  6. data/lib/io_streams/encode/reader.rb +2 -2
  7. data/lib/io_streams/encode/writer.rb +5 -5
  8. data/lib/io_streams/gzip/reader.rb +1 -1
  9. data/lib/io_streams/gzip/writer.rb +1 -1
  10. data/lib/io_streams/io_streams.rb +47 -21
  11. data/lib/io_streams/line/reader.rb +2 -2
  12. data/lib/io_streams/line/writer.rb +1 -1
  13. data/lib/io_streams/path.rb +2 -2
  14. data/lib/io_streams/paths/file.rb +25 -11
  15. data/lib/io_streams/paths/http.rb +80 -7
  16. data/lib/io_streams/paths/matcher.rb +3 -3
  17. data/lib/io_streams/paths/s3.rb +22 -3
  18. data/lib/io_streams/paths/sftp.rb +9 -10
  19. data/lib/io_streams/pgp/reader.rb +25 -7
  20. data/lib/io_streams/pgp/writer.rb +95 -29
  21. data/lib/io_streams/pgp.rb +289 -87
  22. data/lib/io_streams/reader.rb +4 -4
  23. data/lib/io_streams/record/reader.rb +3 -4
  24. data/lib/io_streams/record/writer.rb +3 -4
  25. data/lib/io_streams/row/reader.rb +1 -1
  26. data/lib/io_streams/row/writer.rb +1 -1
  27. data/lib/io_streams/stream.rb +36 -30
  28. data/lib/io_streams/symmetric_encryption/reader.rb +2 -2
  29. data/lib/io_streams/symmetric_encryption/writer.rb +4 -4
  30. data/lib/io_streams/tabular/header.rb +18 -6
  31. data/lib/io_streams/tabular/parser/array.rb +0 -10
  32. data/lib/io_streams/tabular/parser/csv.rb +6 -38
  33. data/lib/io_streams/tabular/parser/fixed.rb +5 -5
  34. data/lib/io_streams/tabular/parser/psv.rb +0 -12
  35. data/lib/io_streams/tabular.rb +5 -10
  36. data/lib/io_streams/utils.rb +6 -8
  37. data/lib/io_streams/version.rb +1 -1
  38. data/lib/io_streams/writer.rb +6 -6
  39. data/lib/io_streams/xlsx/reader.rb +1 -1
  40. data/lib/io_streams/zip/writer.rb +22 -10
  41. data/lib/iostreams.rb +0 -1
  42. metadata +28 -113
  43. data/lib/io_streams/deprecated.rb +0 -216
  44. data/lib/io_streams/tabular/utility/csv_row.rb +0 -105
  45. data/test/builder_test.rb +0 -311
  46. data/test/bzip2_reader_test.rb +0 -27
  47. data/test/bzip2_writer_test.rb +0 -56
  48. data/test/deprecated_test.rb +0 -121
  49. data/test/encode_reader_test.rb +0 -51
  50. data/test/encode_writer_test.rb +0 -90
  51. data/test/files/embedded_lines_test.csv +0 -7
  52. data/test/files/multiple_files.zip +0 -0
  53. data/test/files/spreadsheet.xlsx +0 -0
  54. data/test/files/test.csv +0 -4
  55. data/test/files/test.json +0 -3
  56. data/test/files/test.psv +0 -4
  57. data/test/files/text file.txt +0 -3
  58. data/test/files/text.txt +0 -3
  59. data/test/files/text.txt.bz2 +0 -0
  60. data/test/files/text.txt.gz +0 -0
  61. data/test/files/text.txt.gz.zip +0 -0
  62. data/test/files/text.zip +0 -0
  63. data/test/files/text.zip.gz +0 -0
  64. data/test/files/unclosed_quote_large_test.csv +0 -1658
  65. data/test/files/unclosed_quote_test.csv +0 -4
  66. data/test/files/unclosed_quote_test2.csv +0 -3
  67. data/test/files/utf16_test.csv +0 -0
  68. data/test/gzip_reader_test.rb +0 -27
  69. data/test/gzip_writer_test.rb +0 -52
  70. data/test/io_streams_test.rb +0 -132
  71. data/test/line_reader_test.rb +0 -325
  72. data/test/line_writer_test.rb +0 -59
  73. data/test/minimal_file_reader.rb +0 -25
  74. data/test/path_test.rb +0 -55
  75. data/test/paths/file_test.rb +0 -202
  76. data/test/paths/http_test.rb +0 -34
  77. data/test/paths/matcher_test.rb +0 -120
  78. data/test/paths/s3_test.rb +0 -220
  79. data/test/paths/sftp_test.rb +0 -106
  80. data/test/pgp_reader_test.rb +0 -46
  81. data/test/pgp_test.rb +0 -254
  82. data/test/pgp_writer_test.rb +0 -130
  83. data/test/record_reader_test.rb +0 -60
  84. data/test/record_writer_test.rb +0 -82
  85. data/test/row_reader_test.rb +0 -35
  86. data/test/row_writer_test.rb +0 -56
  87. data/test/stream_test.rb +0 -574
  88. data/test/tabular_test.rb +0 -338
  89. data/test/test_helper.rb +0 -40
  90. data/test/utils_test.rb +0 -20
  91. data/test/xlsx_reader_test.rb +0 -37
  92. data/test/zip_reader_test.rb +0 -53
  93. data/test/zip_writer_test.rb +0 -48
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 30ace4685022c88754654b2b48d1c86185e47841f1d3c6a3916bc348098b178d
4
- data.tar.gz: 040c669d7ad4521941a3752dc405390c4f7dda98a583cc0f5d923091e12bd452
3
+ metadata.gz: 90f2ff2c49b4d9fcf8105dcf586f237cea428be7a2d4ce2ed6989134a8acba55
4
+ data.tar.gz: 3984b551f0be2b77fbd1c361a73b0e9e23fa41f639854bff2e338cfad3c02a90
5
5
  SHA512:
6
- metadata.gz: 90f2635d2e443fe4c4f3992d8f92a725118b2f52c0d4861ee1d778be8f145d93c2f0663904f8cf699b33106a154d13980634881a0039c7e0ad6389d137d69760
7
- data.tar.gz: 970da1fd8e3b6ea7e1b36dad476c7f842413c2b8b50dc2939802b1962c00347bcdfa6a135245550c9cb639b694e9ca2c16439bde1afe7677701f3867cdc277e4
6
+ metadata.gz: 20d29687d9f136bd6799962dc7d31de0a6c872edfad7eb57fd8254362c5ab969fa59b87ac62ae8eec4674707eb59321b2f66bd01c71325420b344e40b33a3191
7
+ data.tar.gz: 60f46999480b15f0620d8f0e3204f8ec6776f7da57329e33c43ed70cc6a641f4a8c9a4afdb1c5e945f9925b4a24fd8bda908f9abd23e061bcf21c47e879e60b7
data/README.md CHANGED
@@ -1,8 +1,9 @@
1
1
  # IOStreams
2
2
  [![Gem Version](https://img.shields.io/gem/v/iostreams.svg)](https://rubygems.org/gems/iostreams) [![Downloads](https://img.shields.io/gem/dt/iostreams.svg)](https://rubygems.org/gems/iostreams) [![License](https://img.shields.io/badge/license-Apache%202.0-brightgreen.svg)](http://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/status-Production%20Ready-blue.svg)
3
3
 
4
- IOStreams is an incredibly powerful streaming library that makes changes to file formats, compression, encryption,
5
- or storage mechanism transparent to the application.
4
+ IOStreams is a streaming library for Ruby that makes compression, encryption, file format, and storage
5
+ location transparent to your code. Read and write files of any size, one block at a time, whether they
6
+ are gzip, zip, or PGP encrypted, and whether they live on local disk, AWS S3, SFTP, or are fetched over HTTP.
6
7
 
7
8
  ## Project Status
8
9
 
@@ -14,6 +15,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
14
15
 
15
16
  Next, check out the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
16
17
 
18
+ See the [CHANGELOG](CHANGELOG.md) for the release history and notable changes.
19
+
20
+ ## Upgrading to v2.0
21
+
22
+ v2.0 is a major release with breaking changes. See the [CHANGELOG](CHANGELOG.md) for the full list. The changes most likely to affect you:
23
+
24
+ - **Ruby 3.2 or later is now required.** Older Ruby versions are no longer supported.
25
+ - **Writing Zip files now requires the `zip_kit` gem.** The retired `zip_tricks` gem has been replaced by its successor, `zip_kit`. If your application writes Zip files, replace `gem "zip_tricks"` with `gem "zip_kit"` in your Gemfile. Reading Zip files is unaffected. The IOStreams API itself is unchanged.
26
+ - **The deprecated pre-v1.6 API has been removed.** The `IOStreams::Deprecated` mix-in described below no longer exists. Any code still using those old APIs must move to the current `IOStreams.path` / `IOStreams.stream` API.
27
+ - **The deprecated PGP writer `compression:` option has been removed.** Use `compress:` instead (available since v1.11.0).
28
+ - **`IOStreams::Pgp.logger` and `IOStreams::Pgp.logger=` have been removed.** Logging is now configured centrally for the whole library via `IOStreams.logger` / `IOStreams.logger=`. Replace `IOStreams::Pgp.logger = my_logger` with `IOStreams.logger = my_logger`. [Semantic Logger](https://logger.rocketjob.io) is detected automatically when loaded.
29
+
17
30
  ## Upgrading to v1.6
18
31
 
19
32
  The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
@@ -30,6 +43,11 @@ release.
30
43
 
31
44
  This project adheres to [Semantic Versioning](http://semver.org/).
32
45
 
46
+ ## Contributing
47
+
48
+ Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on documentation
49
+ updates, code changes, the project architecture, and the code of conduct.
50
+
33
51
  ## Author
34
52
 
35
53
  [Reid Morrison](https://github.com/reidmorrison)
data/Rakefile CHANGED
@@ -1,10 +1,12 @@
1
1
  require "rake/testtask"
2
2
  require_relative "lib/io_streams/version"
3
3
 
4
+ desc "Build the iostreams gem"
4
5
  task :gem do
5
6
  system "gem build iostreams.gemspec"
6
7
  end
7
8
 
9
+ desc "Build and publish the iostreams gem, then tag and push the release"
8
10
  task publish: :gem do
9
11
  system "git tag -a v#{IOStreams::VERSION} -m 'Tagging #{IOStreams::VERSION}'"
10
12
  system "git push --tags"
@@ -12,6 +14,11 @@ task publish: :gem do
12
14
  system "rm iostreams-#{IOStreams::VERSION}.gem"
13
15
  end
14
16
 
17
+ desc "Start an IRB console with the gem loaded"
18
+ task :console do
19
+ exec "irb -I lib -r iostreams"
20
+ end
21
+
15
22
  Rake::TestTask.new(:test) do |t|
16
23
  t.pattern = "test/**/*_test.rb"
17
24
  t.verbose = true
@@ -1,5 +1,5 @@
1
1
  module IOStreams
2
- # Build the streams that need to be applied to a path druing reading or writing.
2
+ # Build the streams that need to be applied to a path during reading or writing.
3
3
  class Builder
4
4
  attr_accessor :file_name, :format_options
5
5
  attr_reader :streams, :options
@@ -50,13 +50,13 @@ module IOStreams
50
50
  self
51
51
  end
52
52
 
53
- def option_or_stream(stream, **options)
53
+ def option_or_stream(stream, **)
54
54
  if streams
55
- stream(stream, **options)
55
+ stream(stream, **)
56
56
  elsif file_name
57
- option(stream, **options)
57
+ option(stream, **)
58
58
  else
59
- stream(stream, **options)
59
+ stream(stream, **)
60
60
  end
61
61
  end
62
62
 
@@ -67,12 +67,12 @@ module IOStreams
67
67
  options[stream] if options
68
68
  end
69
69
 
70
- def reader(io_stream, &block)
71
- execute(:reader, pipeline, io_stream, &block)
70
+ def reader(io_stream, &)
71
+ execute(:reader, pipeline, io_stream, &)
72
72
  end
73
73
 
74
- def writer(io_stream, &block)
75
- execute(:writer, pipeline, io_stream, &block)
74
+ def writer(io_stream, &)
75
+ execute(:writer, pipeline, io_stream, &)
76
76
  end
77
77
 
78
78
  # Returns [Hash<Symbol:Hash>] the pipeline of streams
@@ -120,7 +120,7 @@ module IOStreams
120
120
  end
121
121
 
122
122
  def class_for_stream(type, stream)
123
- ext = IOStreams.extensions[stream.nil? ? nil : stream.to_sym] ||
123
+ ext = IOStreams.extensions[stream&.to_sym] ||
124
124
  raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
125
125
  ext.send("#{type}_class") || raise(ArgumentError, "No #{type} registered for Stream type: #{stream.inspect}")
126
126
  end
@@ -2,7 +2,7 @@ module IOStreams
2
2
  module Bzip2
3
3
  class Writer < IOStreams::Writer
4
4
  # Write to a stream, compressing with Bzip2
5
- def self.stream(input_stream, original_file_name: nil, **args)
5
+ def self.stream(input_stream, **args)
6
6
  Utils.load_soft_dependency("bzip2-ffi", "Bzip2", "bzip2/ffi") unless defined?(::Bzip2::FFI)
7
7
 
8
8
  begin
@@ -3,7 +3,7 @@ module IOStreams
3
3
  class Reader < IOStreams::Reader
4
4
  attr_reader :encoding, :cleaner
5
5
 
6
- NOT_PRINTABLE = Regexp.compile(/[^[:print:]|\r|\n]/).freeze
6
+ NOT_PRINTABLE = /[^[:print:]|\r\n]/
7
7
  # Builtin strip options to apply after encoding the read data.
8
8
  CLEANSE_RULES = {
9
9
  # Strips all non printable characters
@@ -13,7 +13,7 @@ module IOStreams
13
13
  }.freeze
14
14
 
15
15
  # Read a line at a time from a file or stream
16
- def self.stream(input_stream, original_file_name: nil, **args)
16
+ def self.stream(input_stream, **args)
17
17
  yield new(input_stream, **args)
18
18
  end
19
19
 
@@ -4,7 +4,7 @@ module IOStreams
4
4
  attr_reader :encoding, :cleaner
5
5
 
6
6
  # Write a line at a time to a file or stream
7
- def self.stream(input_stream, original_file_name: nil, **args)
7
+ def self.stream(input_stream, **args)
8
8
  yield new(input_stream, **args)
9
9
  end
10
10
 
@@ -46,7 +46,7 @@ module IOStreams
46
46
  # Write a line to the output stream
47
47
  #
48
48
  # Example:
49
- # IOStreams.writer('a.txt', encoding: 'UTF-8') do |stream|
49
+ # IOStreams.path('a.txt').option(:encode, encoding: 'UTF-8').writer do |stream|
50
50
  # stream << 'first line' << 'second line'
51
51
  # end
52
52
  def <<(record)
@@ -54,13 +54,13 @@ module IOStreams
54
54
  self
55
55
  end
56
56
 
57
- # Write a line to the output stream followed by the delimiter.
57
+ # Encode data and write it to the output stream.
58
58
  # Returns [Integer] the number of bytes written.
59
59
  #
60
60
  # Example:
61
- # IOStreams.writer('a.txt', encoding: 'UTF-8') do |stream|
61
+ # IOStreams.path('a.txt').option(:encode, encoding: 'UTF-8').writer do |stream|
62
62
  # count = stream.write('first line')
63
- # puts "Wrote #{count} bytes to the output file, including the delimiter"
63
+ # puts "Wrote #{count} bytes to the output file"
64
64
  # end
65
65
  def write(data)
66
66
  return 0 if data.nil?
@@ -2,7 +2,7 @@ module IOStreams
2
2
  module Gzip
3
3
  class Reader < IOStreams::Reader
4
4
  # Read from a gzip stream, decompressing the contents as it is read
5
- def self.stream(input_stream, original_file_name: nil)
5
+ def self.stream(input_stream)
6
6
  io = ::Zlib::GzipReader.new(input_stream)
7
7
  yield io
8
8
  ensure
@@ -2,7 +2,7 @@ module IOStreams
2
2
  module Gzip
3
3
  class Writer < IOStreams::Writer
4
4
  # Write to a stream, compressing with GZip
5
- def self.stream(input_stream, original_file_name: nil, &block)
5
+ def self.stream(input_stream, &block)
6
6
  io = ::Zlib::GzipWriter.new(input_stream)
7
7
  block.call(io)
8
8
  ensure
@@ -23,7 +23,7 @@ module IOStreams
23
23
  # # => "/usr/local/sample"
24
24
  #
25
25
  # IOStreams.path("s3://mybucket/path/file.xls")
26
- # # => #<IOStreams::S3::Path:0x00007fec66e3a288, @path="s3://mybucket/path/file.xls">
26
+ # # => #<IOStreams::Paths::S3:0x00007fec66e3a288 @path="s3://mybucket/path/file.xls">
27
27
  #
28
28
  # IOStreams.path("s3://mybucket/path/file.xls").to_s
29
29
  # # => "s3://mybucket/path/file.xls"
@@ -36,10 +36,9 @@ module IOStreams
36
36
  #
37
37
  # For Files
38
38
  # IOStreams.path('blah.zip').option(:encode, encoding: 'BINARY').each(:line) { |line| puts line }
39
- # IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:line).first
40
- # IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:hash).last
41
- # IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:hash).size
42
- # IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').reader.size
39
+ # IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:line) { |line| puts line }
40
+ # IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:hash) { |hash| p hash }
41
+ # IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').read
43
42
  # IOStreams.path('blah.csv.zip').each(:line) { |line| puts line }
44
43
  # IOStreams.path('blah.zip').option(:pgp, passphrase: 'receiver_passphrase').read
45
44
  # IOStreams.path('blah.zip').stream(:zip).stream(:pgp, passphrase: 'receiver_passphrase').read
@@ -75,14 +74,25 @@ module IOStreams
75
74
 
76
75
  # Join the supplied path elements to a root path.
77
76
  #
77
+ # Roots allow paths to reference a particular root directory, so that all path names
78
+ # are appended to that root. Use `IOStreams.join` instead of `IOStreams.path` so that
79
+ # the exact same code can run in production and development, yet use completely
80
+ # different data sources in each. For example, in production the root can point to
81
+ # an S3 bucket, while in development it points to the local file system.
82
+ #
83
+ # Roots are configured via an initializer at startup. Multiple roots can be set up,
84
+ # for example one for input files, another for output files, another for reports, etc.
85
+ # The `:default` root is used whenever a root is not supplied when calling `IOStreams.join`.
86
+ #
78
87
  # Example:
79
88
  # IOStreams.add_root(:default, "tmp/export")
89
+ # IOStreams.add_root(:ftp, "tmp/ftp")
80
90
  #
81
91
  # IOStreams.join('file.xls')
82
- # # => #<IOStreams::Paths::File:0x00007fec70391bd8 @path="tmp/export/sample">
92
+ # # => #<IOStreams::Paths::File:0x00007fec70391bd8 @path="tmp/export/file.xls">
83
93
  #
84
94
  # IOStreams.join('file.xls').to_s
85
- # # => "tmp/export/sample"
95
+ # # => "tmp/export/file.xls"
86
96
  #
87
97
  # IOStreams.join('sample', 'file.xls', root: :ftp)
88
98
  # # => #<IOStreams::Paths::File:0x00007fec6ee329b8 @path="tmp/ftp/sample/file.xls">
@@ -108,7 +118,7 @@ module IOStreams
108
118
  # Optional extension to add to the tempfile.
109
119
  #
110
120
  # Example:
111
- # IOStreams.temp_file
121
+ # IOStreams.temp_file("export", ".csv") { |path| path.write("Hello World") }
112
122
  def self.temp_file(basename, extension = "")
113
123
  Utils.temp_file_name(basename, extension) { |file_name| yield(Paths::File.new(file_name).stream(:none)) }
114
124
  end
@@ -193,9 +203,9 @@ module IOStreams
193
203
  # "\a" "a" true # escaped ordinary remains ordinary
194
204
  # "[\?]" "?" true # can escape inside bracket expression
195
205
  #
196
- # "*" ".profile" false # wildcard doesn't match leading
197
- # "*" ".profile" true # period by default.
198
- # ".*" ".profile" true {hidden: true}
206
+ # "*" ".profile" false # wildcard doesn't match leading period by default
207
+ # "*" ".profile" true # unless hidden is enabled {hidden: true}
208
+ # ".*" ".profile" true # leading period is explicit
199
209
  #
200
210
  # "**/*.rb" "main.rb" false
201
211
  # "**/*.rb" "./main.rb" false
@@ -221,10 +231,10 @@ module IOStreams
221
231
  end
222
232
 
223
233
  # Add a named root path
224
- def self.add_root(root, *elements)
234
+ def self.add_root(root, *elements, **args)
225
235
  raise(ArgumentError, "Invalid characters in root name #{root.inspect}") unless root.to_s =~ /\A\w+\Z/
226
236
 
227
- @root_paths[root.to_sym] = path(*elements)
237
+ @root_paths[root.to_sym] = path(*elements, **args)
228
238
  end
229
239
 
230
240
  def self.roots
@@ -234,7 +244,7 @@ module IOStreams
234
244
  # Set the temporary path to use when creating local temp files.
235
245
  def self.temp_dir=(temp_dir)
236
246
  temp_dir = File.expand_path(temp_dir)
237
- FileUtils.mkdir_p(temp_dir) unless ::File.exist?(temp_dir)
247
+ FileUtils.mkdir_p(temp_dir)
238
248
 
239
249
  @temp_dir = temp_dir
240
250
  end
@@ -249,6 +259,23 @@ module IOStreams
249
259
 
250
260
  @temp_dir = nil
251
261
 
262
+ # Returns [Logger] the logger used by IOStreams for debug logging.
263
+ #
264
+ # When SemanticLogger is loaded a SemanticLogger instance is used by default,
265
+ # otherwise no logging is performed unless a logger is assigned via #logger=.
266
+ def self.logger
267
+ @logger
268
+ end
269
+
270
+ # Replace the logger used by IOStreams.
271
+ #
272
+ # Set to nil to disable logging.
273
+ def self.logger=(logger)
274
+ @logger = logger
275
+ end
276
+
277
+ @logger = (SemanticLogger[IOStreams] if defined?(SemanticLogger::Logger))
278
+
252
279
  # Register a file extension and the reader and writer streaming classes
253
280
  #
254
281
  # Example:
@@ -257,7 +284,7 @@ module IOStreams
257
284
  def self.register_extension(extension, reader_class, writer_class)
258
285
  raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.nil? || extension.to_s =~ /\A\w+\Z/
259
286
 
260
- @extensions[extension.nil? ? nil : extension.to_sym] = Extension.new(reader_class, writer_class)
287
+ @extensions[extension&.to_sym] = Extension.new(reader_class, writer_class)
261
288
  end
262
289
 
263
290
  # De-Register a file extension
@@ -265,7 +292,7 @@ module IOStreams
265
292
  # Returns [Symbol] the extension removed, or nil if the extension was not registered
266
293
  #
267
294
  # Example:
268
- # register_extension(:xls)
295
+ # deregister_extension(:xls)
269
296
  def self.deregister_extension(extension)
270
297
  raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.to_s =~ /\A\w+\Z/
271
298
 
@@ -277,15 +304,14 @@ module IOStreams
277
304
  @extensions.dup
278
305
  end
279
306
 
280
- # Register a file extension and the reader and writer streaming classes
307
+ # Register a URI scheme and the path class that handles it
281
308
  #
282
309
  # Example:
283
- # # MyXls::Reader and MyXls::Writer must implement .open
284
- # register_scheme(:xls, MyXls::Reader, MyXls::Writer)
310
+ # register_scheme(:gcs, MyGoogleCloudStoragePath)
285
311
  def self.register_scheme(scheme, klass)
286
312
  raise(ArgumentError, "Invalid scheme #{scheme.inspect}") unless scheme.nil? || scheme.to_s =~ /\A\w+\Z/
287
313
 
288
- @schemes[scheme.nil? ? nil : scheme.to_sym] = klass
314
+ @schemes[scheme&.to_sym] = klass
289
315
  end
290
316
 
291
317
  def self.schemes
@@ -293,7 +319,7 @@ module IOStreams
293
319
  end
294
320
 
295
321
  def self.scheme(scheme_name)
296
- @schemes[scheme_name.nil? ? nil : scheme_name.to_sym] || raise(ArgumentError, "Unknown Scheme type: #{scheme_name.inspect}")
322
+ @schemes[scheme_name&.to_sym] || raise(ArgumentError, "Unknown Scheme type: #{scheme_name.inspect}")
297
323
  end
298
324
 
299
325
  Extension = Struct.new(:reader_class, :writer_class)
@@ -6,7 +6,7 @@ module IOStreams
6
6
  # Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read.
7
7
  MAX_BLOCKS_MULTIPLIER = 100
8
8
 
9
- LINEFEED_REGEXP = Regexp.compile(/\r\n|\n|\r/).freeze
9
+ LINEFEED_REGEXP = /\r\n|\n|\r/
10
10
 
11
11
  # Read a line at a time from a stream
12
12
  def self.stream(input_stream, **args)
@@ -44,7 +44,7 @@ module IOStreams
44
44
  #
45
45
  # Note:
46
46
  # * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
47
- def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil, original_file_name: nil)
47
+ def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil)
48
48
  super(input_stream)
49
49
 
50
50
  @embedded_within = embedded_within
@@ -24,7 +24,7 @@ module IOStreams
24
24
  # Add the specified delimiter after every record when writing it
25
25
  # to the output stream
26
26
  # Default: OS Specific. Linux: "\n"
27
- def initialize(output_stream, delimiter: $/, original_file_name: nil)
27
+ def initialize(output_stream, delimiter: $/)
28
28
  super(output_stream)
29
29
  @delimiter = delimiter
30
30
  end
@@ -41,7 +41,7 @@ module IOStreams
41
41
  # Runs the pattern from the current path, returning the complete path for located files.
42
42
  #
43
43
  # See IOStreams::Paths::File.each for arguments.
44
- def each_child(pattern = "*", **args, &block)
44
+ def each_child(pattern = "*", **args, &)
45
45
  raise NotImplementedError
46
46
  end
47
47
 
@@ -84,7 +84,7 @@ module IOStreams
84
84
  # Cleanup an incomplete write to the target "file" if the copy fails.
85
85
  # rubocop:disable Lint/SuppressedException
86
86
  def copy_from(source, **args)
87
- super(source, **args)
87
+ super
88
88
  rescue StandardError => e
89
89
  begin
90
90
  delete
@@ -15,16 +15,16 @@ module IOStreams
15
15
  # Examples:
16
16
  #
17
17
  # # Case Insensitive file name lookup:
18
- # IOStreams.path("ruby").glob("r*.md") { |name| puts name }
18
+ # IOStreams.path("ruby").each_child("r*.md") { |path| puts path }
19
19
  #
20
20
  # # Case Sensitive file name lookup:
21
- # IOStreams.path("ruby").each("R*.md", case_sensitive: true) { |name| puts name }
21
+ # IOStreams.path("ruby").each_child("R*.md", case_sensitive: true) { |path| puts path }
22
22
  #
23
23
  # # Also return the names of directories found during the search:
24
- # IOStreams.path("ruby").each("R*.md", directories: true) { |name| puts name }
24
+ # IOStreams.path("ruby").each_child("R*.md", directories: true) { |path| puts path }
25
25
  #
26
26
  # # Case Insensitive recursive file name lookup:
27
- # IOStreams.path("ruby").glob("**/*.md") { |name| puts name }
27
+ # IOStreams.path("ruby").each_child("**/*.md") { |path| puts path }
28
28
  #
29
29
  # Parameters:
30
30
  # pattern [String]
@@ -77,9 +77,9 @@ module IOStreams
77
77
  # "\a" "a" true # escaped ordinary remains ordinary
78
78
  # "[\?]" "?" true # can escape inside bracket expression
79
79
  #
80
- # "*" ".profile" false # wildcard doesn't match leading
81
- # "*" ".profile" true # period by default.
82
- # ".*" ".profile" true {hidden: true}
80
+ # "*" ".profile" false # wildcard doesn't match leading period by default
81
+ # "*" ".profile" true # unless hidden is enabled {hidden: true}
82
+ # ".*" ".profile" true # leading period is explicit
83
83
  #
84
84
  # "**/*.rb" "main.rb" false
85
85
  # "**/*.rb" "./main.rb" false
@@ -99,7 +99,21 @@ module IOStreams
99
99
  flags |= ::File::FNM_DOTMATCH if hidden
100
100
 
101
101
  # Dir.each_child("testdir") {|x| puts "Got #{x}" }
102
- Dir.glob(::File.join(path, pattern), flags) do |full_path|
102
+ full_pattern = ::File.join(path, pattern)
103
+
104
+ results = Dir.glob(full_pattern, flags)
105
+
106
+ # On some platforms or Ruby versions, FNM_CASEFOLD may not work properly
107
+ # with complex patterns. If case-insensitive matching returns no results
108
+ # but we expected some, try a more robust approach.
109
+ if results.empty? && !case_sensitive && pattern.match?(/[A-Z]/)
110
+ # Try converting the pattern to lowercase and re-matching
111
+ lowercase_pattern = pattern.downcase
112
+ lowercase_full_pattern = ::File.join(path, lowercase_pattern)
113
+ results = Dir.glob(lowercase_full_pattern, flags)
114
+ end
115
+
116
+ results.each do |full_path|
103
117
  next if !directories && ::File.directory?(full_path)
104
118
 
105
119
  yield(self.class.new(full_path))
@@ -122,12 +136,12 @@ module IOStreams
122
136
 
123
137
  def mkpath
124
138
  dir = ::File.dirname(path)
125
- FileUtils.mkdir_p(dir) unless ::File.exist?(dir)
139
+ FileUtils.mkdir_p(dir)
126
140
  self
127
141
  end
128
142
 
129
143
  def mkdir
130
- FileUtils.mkdir_p(path) unless ::File.exist?(path)
144
+ FileUtils.mkdir_p(path)
131
145
  self
132
146
  end
133
147
 
@@ -175,7 +189,7 @@ module IOStreams
175
189
  begin
176
190
  ::File.open(path, "wb") { |io| builder.writer(io, &block) }
177
191
  rescue StandardError => e
178
- ::File.unlink(path) if ::File.exist?(path)
192
+ ::FileUtils.rm_f(path)
179
193
  raise(e)
180
194
  end
181
195
  end
@@ -26,7 +26,32 @@ module IOStreams
26
26
  #
27
27
  # http_redirect_count: [Integer]
28
28
  # Maximum number of http redirects to follow.
29
- def initialize(url, username: nil, password: nil, http_redirect_count: 10, parameters: nil)
29
+ # Set to 0 to disable following redirects entirely.
30
+ # Default: 10
31
+ #
32
+ # allow_hosts: [String | Array<String>]
33
+ # Optional allow-list of host names that may be contacted, applied to the
34
+ # supplied url and to every redirect that is followed.
35
+ # When supplied, a request to any other host raises CommunicationsFailure.
36
+ # Use this to limit Server Side Request Forgery (SSRF) exposure when the url
37
+ # can be influenced by untrusted input.
38
+ # Default: nil (any host is allowed).
39
+ #
40
+ # maximum_file_size: [Integer]
41
+ # Optional maximum number of bytes to download.
42
+ # When the response body exceeds this size the download is aborted with a
43
+ # CommunicationsFailure, protecting against unbounded (denial of service) responses.
44
+ # Default: nil (no limit).
45
+ #
46
+ # Security notes:
47
+ # - Redirect targets are supplied by the remote server. Validating only the url that is
48
+ # passed in is therefore not sufficient to prevent SSRF: use `allow_hosts` (or disable
49
+ # redirects with `http_redirect_count: 0`) when the url is not fully trusted.
50
+ # - Basic authentication credentials are only sent to the original host. They are not
51
+ # resent when a redirect points at a different scheme, host, or port, so that a
52
+ # redirect cannot leak the credentials to another server.
53
+ def initialize(url, username: nil, password: nil, http_redirect_count: 10, parameters: nil,
54
+ allow_hosts: nil, maximum_file_size: nil)
30
55
  uri = URI.parse(url)
31
56
  unless %w[http https].include?(uri.scheme)
32
57
  raise(
@@ -38,6 +63,8 @@ module IOStreams
38
63
  @username = username || uri.user
39
64
  @password = password || uri.password
40
65
  @http_redirect_count = http_redirect_count
66
+ @allow_hosts = allow_hosts.nil? ? nil : Array(allow_hosts)
67
+ @maximum_file_size = maximum_file_size
41
68
  @url = parameters ? "#{url}?#{URI.encode_www_form(parameters)}" : url
42
69
  super(uri.path)
43
70
  end
@@ -53,6 +80,8 @@ module IOStreams
53
80
 
54
81
  private
55
82
 
83
+ attr_reader :allow_hosts, :maximum_file_size
84
+
56
85
  # Read a file using an http get.
57
86
  #
58
87
  # For example:
@@ -63,18 +92,20 @@ module IOStreams
63
92
  #
64
93
  # Notes:
65
94
  # * Since Net::HTTP download only supports a push stream, the data is streamed into a tempfile first.
66
- def stream_reader(&block)
67
- handle_redirects(url, http_redirect_count, &block)
95
+ def stream_reader(&)
96
+ handle_redirects(url, http_redirect_count, &)
68
97
  end
69
98
 
70
99
  def handle_redirects(uri, http_redirect_count, &block)
71
100
  uri = URI.parse(uri) unless uri.is_a?(URI)
72
101
  result = nil
73
- raise(IOStreams::Errors::CommunicationsFailure, "Too many redirects") if http_redirect_count < 1
102
+
103
+ validate_uri!(uri)
74
104
 
75
105
  Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
76
106
  request = Net::HTTP::Get.new(uri)
77
- request.basic_auth(username, password) if username
107
+ # Only send credentials to the original host to avoid leaking them via a redirect.
108
+ request.basic_auth(username, password) if username && same_origin?(uri)
78
109
 
79
110
  http.request(request) do |response|
80
111
  raise(IOStreams::Errors::CommunicationsFailure, "Invalid URL: #{uri}") if response.is_a?(Net::HTTPNotFound)
@@ -83,7 +114,13 @@ module IOStreams
83
114
  end
84
115
 
85
116
  if response.is_a?(Net::HTTPRedirection)
86
- new_uri = response["location"]
117
+ raise(IOStreams::Errors::CommunicationsFailure, "Too many redirects") if http_redirect_count < 1
118
+
119
+ location = response["location"]
120
+ raise(IOStreams::Errors::CommunicationsFailure, "Redirect missing location header: #{uri}") unless location
121
+
122
+ # Resolve relative redirects against the current uri.
123
+ new_uri = uri.merge(location)
87
124
  return handle_redirects(new_uri, http_redirect_count - 1, &block)
88
125
  end
89
126
 
@@ -93,7 +130,7 @@ module IOStreams
93
130
 
94
131
  # Since Net::HTTP download only supports a push stream, write it to a tempfile first.
95
132
  Utils.temp_file_name("iostreams_http") do |file_name|
96
- ::File.open(file_name, "wb") { |io| response.read_body { |chunk| io.write(chunk) } }
133
+ download_to_file(response, file_name)
97
134
  # Return a read stream
98
135
  result = ::File.open(file_name, "rb") { |io| builder.reader(io, &block) }
99
136
  end
@@ -101,6 +138,42 @@ module IOStreams
101
138
  end
102
139
  result
103
140
  end
141
+
142
+ # Validate that the host may be contacted, and that the scheme is still http(s)
143
+ # after following a redirect.
144
+ def validate_uri!(uri)
145
+ unless %w[http https].include?(uri.scheme)
146
+ raise(IOStreams::Errors::CommunicationsFailure, "Invalid redirect, only http and https are supported: #{uri}")
147
+ end
148
+ return if allow_hosts.nil? || allow_hosts.include?(uri.hostname)
149
+
150
+ raise(IOStreams::Errors::CommunicationsFailure, "Host not in the allowed list of hosts: #{uri.hostname}")
151
+ end
152
+
153
+ def same_origin?(uri)
154
+ original = original_uri
155
+ uri.scheme == original.scheme && uri.hostname == original.hostname && uri.port == original.port
156
+ end
157
+
158
+ def original_uri
159
+ @original_uri ||= URI.parse(url)
160
+ end
161
+
162
+ def download_to_file(response, file_name)
163
+ size = 0
164
+ ::File.open(file_name, "wb") do |io|
165
+ response.read_body do |chunk|
166
+ size += chunk.bytesize
167
+ if maximum_file_size && (size > maximum_file_size)
168
+ raise(
169
+ IOStreams::Errors::CommunicationsFailure,
170
+ "Exceeded maximum allowed download size of #{maximum_file_size} bytes"
171
+ )
172
+ end
173
+ io.write(chunk)
174
+ end
175
+ end
176
+ end
104
177
  end
105
178
  end
106
179
  end