iostreams 1.10.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -2
- data/Rakefile +7 -0
- data/lib/io_streams/builder.rb +10 -10
- data/lib/io_streams/bzip2/writer.rb +1 -1
- data/lib/io_streams/encode/reader.rb +2 -2
- data/lib/io_streams/encode/writer.rb +5 -5
- data/lib/io_streams/gzip/reader.rb +1 -1
- data/lib/io_streams/gzip/writer.rb +1 -1
- data/lib/io_streams/io_streams.rb +47 -21
- data/lib/io_streams/line/reader.rb +2 -2
- data/lib/io_streams/line/writer.rb +1 -1
- data/lib/io_streams/path.rb +2 -2
- data/lib/io_streams/paths/file.rb +25 -11
- data/lib/io_streams/paths/http.rb +80 -7
- data/lib/io_streams/paths/matcher.rb +3 -3
- data/lib/io_streams/paths/s3.rb +22 -3
- data/lib/io_streams/paths/sftp.rb +9 -10
- data/lib/io_streams/pgp/reader.rb +25 -7
- data/lib/io_streams/pgp/writer.rb +95 -29
- data/lib/io_streams/pgp.rb +289 -87
- data/lib/io_streams/reader.rb +4 -4
- data/lib/io_streams/record/reader.rb +3 -4
- data/lib/io_streams/record/writer.rb +3 -4
- data/lib/io_streams/row/reader.rb +1 -1
- data/lib/io_streams/row/writer.rb +1 -1
- data/lib/io_streams/stream.rb +36 -30
- data/lib/io_streams/symmetric_encryption/reader.rb +2 -2
- data/lib/io_streams/symmetric_encryption/writer.rb +4 -4
- data/lib/io_streams/tabular/header.rb +18 -6
- data/lib/io_streams/tabular/parser/array.rb +0 -10
- data/lib/io_streams/tabular/parser/csv.rb +6 -38
- data/lib/io_streams/tabular/parser/fixed.rb +5 -5
- data/lib/io_streams/tabular/parser/psv.rb +0 -12
- data/lib/io_streams/tabular.rb +5 -10
- data/lib/io_streams/utils.rb +6 -8
- data/lib/io_streams/version.rb +1 -1
- data/lib/io_streams/writer.rb +6 -6
- data/lib/io_streams/xlsx/reader.rb +1 -1
- data/lib/io_streams/zip/writer.rb +22 -10
- data/lib/iostreams.rb +0 -1
- metadata +28 -113
- data/lib/io_streams/deprecated.rb +0 -216
- data/lib/io_streams/tabular/utility/csv_row.rb +0 -105
- data/test/builder_test.rb +0 -311
- data/test/bzip2_reader_test.rb +0 -27
- data/test/bzip2_writer_test.rb +0 -56
- data/test/deprecated_test.rb +0 -121
- data/test/encode_reader_test.rb +0 -51
- data/test/encode_writer_test.rb +0 -90
- data/test/files/embedded_lines_test.csv +0 -7
- data/test/files/multiple_files.zip +0 -0
- data/test/files/spreadsheet.xlsx +0 -0
- data/test/files/test.csv +0 -4
- data/test/files/test.json +0 -3
- data/test/files/test.psv +0 -4
- data/test/files/text file.txt +0 -3
- data/test/files/text.txt +0 -3
- data/test/files/text.txt.bz2 +0 -0
- data/test/files/text.txt.gz +0 -0
- data/test/files/text.txt.gz.zip +0 -0
- data/test/files/text.zip +0 -0
- data/test/files/text.zip.gz +0 -0
- data/test/files/unclosed_quote_large_test.csv +0 -1658
- data/test/files/unclosed_quote_test.csv +0 -4
- data/test/files/unclosed_quote_test2.csv +0 -3
- data/test/files/utf16_test.csv +0 -0
- data/test/gzip_reader_test.rb +0 -27
- data/test/gzip_writer_test.rb +0 -52
- data/test/io_streams_test.rb +0 -132
- data/test/line_reader_test.rb +0 -325
- data/test/line_writer_test.rb +0 -59
- data/test/minimal_file_reader.rb +0 -25
- data/test/path_test.rb +0 -55
- data/test/paths/file_test.rb +0 -202
- data/test/paths/http_test.rb +0 -34
- data/test/paths/matcher_test.rb +0 -120
- data/test/paths/s3_test.rb +0 -220
- data/test/paths/sftp_test.rb +0 -106
- data/test/pgp_reader_test.rb +0 -46
- data/test/pgp_test.rb +0 -254
- data/test/pgp_writer_test.rb +0 -130
- data/test/record_reader_test.rb +0 -60
- data/test/record_writer_test.rb +0 -82
- data/test/row_reader_test.rb +0 -35
- data/test/row_writer_test.rb +0 -56
- data/test/stream_test.rb +0 -574
- data/test/tabular_test.rb +0 -338
- data/test/test_helper.rb +0 -40
- data/test/utils_test.rb +0 -20
- data/test/xlsx_reader_test.rb +0 -37
- data/test/zip_reader_test.rb +0 -53
- data/test/zip_writer_test.rb +0 -48
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 90f2ff2c49b4d9fcf8105dcf586f237cea428be7a2d4ce2ed6989134a8acba55
|
|
4
|
+
data.tar.gz: 3984b551f0be2b77fbd1c361a73b0e9e23fa41f639854bff2e338cfad3c02a90
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 20d29687d9f136bd6799962dc7d31de0a6c872edfad7eb57fd8254362c5ab969fa59b87ac62ae8eec4674707eb59321b2f66bd01c71325420b344e40b33a3191
|
|
7
|
+
data.tar.gz: 60f46999480b15f0620d8f0e3204f8ec6776f7da57329e33c43ed70cc6a641f4a8c9a4afdb1c5e945f9925b4a24fd8bda908f9abd23e061bcf21c47e879e60b7
|
data/README.md
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# IOStreams
|
|
2
2
|
[](https://rubygems.org/gems/iostreams) [](https://rubygems.org/gems/iostreams) [](http://opensource.org/licenses/Apache-2.0) 
|
|
3
3
|
|
|
4
|
-
IOStreams is
|
|
5
|
-
|
|
4
|
+
IOStreams is a streaming library for Ruby that makes compression, encryption, file format, and storage
|
|
5
|
+
location transparent to your code. Read and write files of any size, one block at a time, whether they
|
|
6
|
+
are gzip, zip, or PGP encrypted, and whether they live on local disk, AWS S3, SFTP, or are fetched over HTTP.
|
|
6
7
|
|
|
7
8
|
## Project Status
|
|
8
9
|
|
|
@@ -14,6 +15,18 @@ Start with the [IOStreams tutorial](https://iostreams.rocketjob.io/tutorial) to
|
|
|
14
15
|
|
|
15
16
|
Next, check out the remaining [IOStreams documentation](https://iostreams.rocketjob.io/)
|
|
16
17
|
|
|
18
|
+
See the [CHANGELOG](CHANGELOG.md) for the release history and notable changes.
|
|
19
|
+
|
|
20
|
+
## Upgrading to v2.0
|
|
21
|
+
|
|
22
|
+
v2.0 is a major release with breaking changes. See the [CHANGELOG](CHANGELOG.md) for the full list. The changes most likely to affect you:
|
|
23
|
+
|
|
24
|
+
- **Ruby 3.2 or later is now required.** Older Ruby versions are no longer supported.
|
|
25
|
+
- **Writing Zip files now requires the `zip_kit` gem.** The retired `zip_tricks` gem has been replaced by its successor, `zip_kit`. If your application writes Zip files, replace `gem "zip_tricks"` with `gem "zip_kit"` in your Gemfile. Reading Zip files is unaffected. The IOStreams API itself is unchanged.
|
|
26
|
+
- **The deprecated pre-v1.6 API has been removed.** The `IOStreams::Deprecated` mix-in described below no longer exists. Any code still using those old APIs must move to the current `IOStreams.path` / `IOStreams.stream` API.
|
|
27
|
+
- **The deprecated PGP writer `compression:` option has been removed.** Use `compress:` instead (available since v1.11.0).
|
|
28
|
+
- **`IOStreams::Pgp.logger` and `IOStreams::Pgp.logger=` have been removed.** Logging is now configured centrally for the whole library via `IOStreams.logger` / `IOStreams.logger=`. Replace `IOStreams::Pgp.logger = my_logger` with `IOStreams.logger = my_logger`. [Semantic Logger](https://logger.rocketjob.io) is detected automatically when loaded.
|
|
29
|
+
|
|
17
30
|
## Upgrading to v1.6
|
|
18
31
|
|
|
19
32
|
The old, deprecated APIs are no longer loaded by default with v1.6. To add back the deprecated API support, add
|
|
@@ -30,6 +43,11 @@ release.
|
|
|
30
43
|
|
|
31
44
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
|
32
45
|
|
|
46
|
+
## Contributing
|
|
47
|
+
|
|
48
|
+
Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on documentation
|
|
49
|
+
updates, code changes, the project architecture, and the code of conduct.
|
|
50
|
+
|
|
33
51
|
## Author
|
|
34
52
|
|
|
35
53
|
[Reid Morrison](https://github.com/reidmorrison)
|
data/Rakefile
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
require "rake/testtask"
|
|
2
2
|
require_relative "lib/io_streams/version"
|
|
3
3
|
|
|
4
|
+
desc "Build the iostreams gem"
|
|
4
5
|
task :gem do
|
|
5
6
|
system "gem build iostreams.gemspec"
|
|
6
7
|
end
|
|
7
8
|
|
|
9
|
+
desc "Build and publish the iostreams gem, then tag and push the release"
|
|
8
10
|
task publish: :gem do
|
|
9
11
|
system "git tag -a v#{IOStreams::VERSION} -m 'Tagging #{IOStreams::VERSION}'"
|
|
10
12
|
system "git push --tags"
|
|
@@ -12,6 +14,11 @@ task publish: :gem do
|
|
|
12
14
|
system "rm iostreams-#{IOStreams::VERSION}.gem"
|
|
13
15
|
end
|
|
14
16
|
|
|
17
|
+
desc "Start an IRB console with the gem loaded"
|
|
18
|
+
task :console do
|
|
19
|
+
exec "irb -I lib -r iostreams"
|
|
20
|
+
end
|
|
21
|
+
|
|
15
22
|
Rake::TestTask.new(:test) do |t|
|
|
16
23
|
t.pattern = "test/**/*_test.rb"
|
|
17
24
|
t.verbose = true
|
data/lib/io_streams/builder.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module IOStreams
|
|
2
|
-
# Build the streams that need to be applied to a path
|
|
2
|
+
# Build the streams that need to be applied to a path during reading or writing.
|
|
3
3
|
class Builder
|
|
4
4
|
attr_accessor :file_name, :format_options
|
|
5
5
|
attr_reader :streams, :options
|
|
@@ -50,13 +50,13 @@ module IOStreams
|
|
|
50
50
|
self
|
|
51
51
|
end
|
|
52
52
|
|
|
53
|
-
def option_or_stream(stream, **
|
|
53
|
+
def option_or_stream(stream, **)
|
|
54
54
|
if streams
|
|
55
|
-
stream(stream, **
|
|
55
|
+
stream(stream, **)
|
|
56
56
|
elsif file_name
|
|
57
|
-
option(stream, **
|
|
57
|
+
option(stream, **)
|
|
58
58
|
else
|
|
59
|
-
stream(stream, **
|
|
59
|
+
stream(stream, **)
|
|
60
60
|
end
|
|
61
61
|
end
|
|
62
62
|
|
|
@@ -67,12 +67,12 @@ module IOStreams
|
|
|
67
67
|
options[stream] if options
|
|
68
68
|
end
|
|
69
69
|
|
|
70
|
-
def reader(io_stream, &
|
|
71
|
-
execute(:reader, pipeline, io_stream, &
|
|
70
|
+
def reader(io_stream, &)
|
|
71
|
+
execute(:reader, pipeline, io_stream, &)
|
|
72
72
|
end
|
|
73
73
|
|
|
74
|
-
def writer(io_stream, &
|
|
75
|
-
execute(:writer, pipeline, io_stream, &
|
|
74
|
+
def writer(io_stream, &)
|
|
75
|
+
execute(:writer, pipeline, io_stream, &)
|
|
76
76
|
end
|
|
77
77
|
|
|
78
78
|
# Returns [Hash<Symbol:Hash>] the pipeline of streams
|
|
@@ -120,7 +120,7 @@ module IOStreams
|
|
|
120
120
|
end
|
|
121
121
|
|
|
122
122
|
def class_for_stream(type, stream)
|
|
123
|
-
ext = IOStreams.extensions[stream
|
|
123
|
+
ext = IOStreams.extensions[stream&.to_sym] ||
|
|
124
124
|
raise(ArgumentError, "Unknown Stream type: #{stream.inspect}")
|
|
125
125
|
ext.send("#{type}_class") || raise(ArgumentError, "No #{type} registered for Stream type: #{stream.inspect}")
|
|
126
126
|
end
|
|
@@ -2,7 +2,7 @@ module IOStreams
|
|
|
2
2
|
module Bzip2
|
|
3
3
|
class Writer < IOStreams::Writer
|
|
4
4
|
# Write to a stream, compressing with Bzip2
|
|
5
|
-
def self.stream(input_stream,
|
|
5
|
+
def self.stream(input_stream, **args)
|
|
6
6
|
Utils.load_soft_dependency("bzip2-ffi", "Bzip2", "bzip2/ffi") unless defined?(::Bzip2::FFI)
|
|
7
7
|
|
|
8
8
|
begin
|
|
@@ -3,7 +3,7 @@ module IOStreams
|
|
|
3
3
|
class Reader < IOStreams::Reader
|
|
4
4
|
attr_reader :encoding, :cleaner
|
|
5
5
|
|
|
6
|
-
NOT_PRINTABLE =
|
|
6
|
+
NOT_PRINTABLE = /[^[:print:]|\r\n]/
|
|
7
7
|
# Builtin strip options to apply after encoding the read data.
|
|
8
8
|
CLEANSE_RULES = {
|
|
9
9
|
# Strips all non printable characters
|
|
@@ -13,7 +13,7 @@ module IOStreams
|
|
|
13
13
|
}.freeze
|
|
14
14
|
|
|
15
15
|
# Read a line at a time from a file or stream
|
|
16
|
-
def self.stream(input_stream,
|
|
16
|
+
def self.stream(input_stream, **args)
|
|
17
17
|
yield new(input_stream, **args)
|
|
18
18
|
end
|
|
19
19
|
|
|
@@ -4,7 +4,7 @@ module IOStreams
|
|
|
4
4
|
attr_reader :encoding, :cleaner
|
|
5
5
|
|
|
6
6
|
# Write a line at a time to a file or stream
|
|
7
|
-
def self.stream(input_stream,
|
|
7
|
+
def self.stream(input_stream, **args)
|
|
8
8
|
yield new(input_stream, **args)
|
|
9
9
|
end
|
|
10
10
|
|
|
@@ -46,7 +46,7 @@ module IOStreams
|
|
|
46
46
|
# Write a line to the output stream
|
|
47
47
|
#
|
|
48
48
|
# Example:
|
|
49
|
-
# IOStreams.
|
|
49
|
+
# IOStreams.path('a.txt').option(:encode, encoding: 'UTF-8').writer do |stream|
|
|
50
50
|
# stream << 'first line' << 'second line'
|
|
51
51
|
# end
|
|
52
52
|
def <<(record)
|
|
@@ -54,13 +54,13 @@ module IOStreams
|
|
|
54
54
|
self
|
|
55
55
|
end
|
|
56
56
|
|
|
57
|
-
#
|
|
57
|
+
# Encode data and write it to the output stream.
|
|
58
58
|
# Returns [Integer] the number of bytes written.
|
|
59
59
|
#
|
|
60
60
|
# Example:
|
|
61
|
-
# IOStreams.
|
|
61
|
+
# IOStreams.path('a.txt').option(:encode, encoding: 'UTF-8').writer do |stream|
|
|
62
62
|
# count = stream.write('first line')
|
|
63
|
-
# puts "Wrote #{count} bytes to the output file
|
|
63
|
+
# puts "Wrote #{count} bytes to the output file"
|
|
64
64
|
# end
|
|
65
65
|
def write(data)
|
|
66
66
|
return 0 if data.nil?
|
|
@@ -2,7 +2,7 @@ module IOStreams
|
|
|
2
2
|
module Gzip
|
|
3
3
|
class Reader < IOStreams::Reader
|
|
4
4
|
# Read from a gzip stream, decompressing the contents as it is read
|
|
5
|
-
def self.stream(input_stream
|
|
5
|
+
def self.stream(input_stream)
|
|
6
6
|
io = ::Zlib::GzipReader.new(input_stream)
|
|
7
7
|
yield io
|
|
8
8
|
ensure
|
|
@@ -2,7 +2,7 @@ module IOStreams
|
|
|
2
2
|
module Gzip
|
|
3
3
|
class Writer < IOStreams::Writer
|
|
4
4
|
# Write to a stream, compressing with GZip
|
|
5
|
-
def self.stream(input_stream,
|
|
5
|
+
def self.stream(input_stream, &block)
|
|
6
6
|
io = ::Zlib::GzipWriter.new(input_stream)
|
|
7
7
|
block.call(io)
|
|
8
8
|
ensure
|
|
@@ -23,7 +23,7 @@ module IOStreams
|
|
|
23
23
|
# # => "/usr/local/sample"
|
|
24
24
|
#
|
|
25
25
|
# IOStreams.path("s3://mybucket/path/file.xls")
|
|
26
|
-
# # => #<IOStreams::S3
|
|
26
|
+
# # => #<IOStreams::Paths::S3:0x00007fec66e3a288 @path="s3://mybucket/path/file.xls">
|
|
27
27
|
#
|
|
28
28
|
# IOStreams.path("s3://mybucket/path/file.xls").to_s
|
|
29
29
|
# # => "s3://mybucket/path/file.xls"
|
|
@@ -36,10 +36,9 @@ module IOStreams
|
|
|
36
36
|
#
|
|
37
37
|
# For Files
|
|
38
38
|
# IOStreams.path('blah.zip').option(:encode, encoding: 'BINARY').each(:line) { |line| puts line }
|
|
39
|
-
# IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:line)
|
|
40
|
-
# IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:hash)
|
|
41
|
-
# IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').
|
|
42
|
-
# IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').reader.size
|
|
39
|
+
# IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:line) { |line| puts line }
|
|
40
|
+
# IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').each(:hash) { |hash| p hash }
|
|
41
|
+
# IOStreams.path('blah.zip').option(:encode, encoding: 'UTF-8').read
|
|
43
42
|
# IOStreams.path('blah.csv.zip').each(:line) { |line| puts line }
|
|
44
43
|
# IOStreams.path('blah.zip').option(:pgp, passphrase: 'receiver_passphrase').read
|
|
45
44
|
# IOStreams.path('blah.zip').stream(:zip).stream(:pgp, passphrase: 'receiver_passphrase').read
|
|
@@ -75,14 +74,25 @@ module IOStreams
|
|
|
75
74
|
|
|
76
75
|
# Join the supplied path elements to a root path.
|
|
77
76
|
#
|
|
77
|
+
# Roots allow paths to reference a particular root directory, so that all path names
|
|
78
|
+
# are appended to that root. Use `IOStreams.join` instead of `IOStreams.path` so that
|
|
79
|
+
# the exact same code can run in production and development, yet use completely
|
|
80
|
+
# different data sources in each. For example, in production the root can point to
|
|
81
|
+
# an S3 bucket, while in development it points to the local file system.
|
|
82
|
+
#
|
|
83
|
+
# Roots are configured via an initializer at startup. Multiple roots can be set up,
|
|
84
|
+
# for example one for input files, another for output files, another for reports, etc.
|
|
85
|
+
# The `:default` root is used whenever a root is not supplied when calling `IOStreams.join`.
|
|
86
|
+
#
|
|
78
87
|
# Example:
|
|
79
88
|
# IOStreams.add_root(:default, "tmp/export")
|
|
89
|
+
# IOStreams.add_root(:ftp, "tmp/ftp")
|
|
80
90
|
#
|
|
81
91
|
# IOStreams.join('file.xls')
|
|
82
|
-
# # => #<IOStreams::Paths::File:0x00007fec70391bd8 @path="tmp/export/
|
|
92
|
+
# # => #<IOStreams::Paths::File:0x00007fec70391bd8 @path="tmp/export/file.xls">
|
|
83
93
|
#
|
|
84
94
|
# IOStreams.join('file.xls').to_s
|
|
85
|
-
# # => "tmp/export/
|
|
95
|
+
# # => "tmp/export/file.xls"
|
|
86
96
|
#
|
|
87
97
|
# IOStreams.join('sample', 'file.xls', root: :ftp)
|
|
88
98
|
# # => #<IOStreams::Paths::File:0x00007fec6ee329b8 @path="tmp/ftp/sample/file.xls">
|
|
@@ -108,7 +118,7 @@ module IOStreams
|
|
|
108
118
|
# Optional extension to add to the tempfile.
|
|
109
119
|
#
|
|
110
120
|
# Example:
|
|
111
|
-
# IOStreams.temp_file
|
|
121
|
+
# IOStreams.temp_file("export", ".csv") { |path| path.write("Hello World") }
|
|
112
122
|
def self.temp_file(basename, extension = "")
|
|
113
123
|
Utils.temp_file_name(basename, extension) { |file_name| yield(Paths::File.new(file_name).stream(:none)) }
|
|
114
124
|
end
|
|
@@ -193,9 +203,9 @@ module IOStreams
|
|
|
193
203
|
# "\a" "a" true # escaped ordinary remains ordinary
|
|
194
204
|
# "[\?]" "?" true # can escape inside bracket expression
|
|
195
205
|
#
|
|
196
|
-
# "*" ".profile" false # wildcard doesn't match leading
|
|
197
|
-
# "*" ".profile" true #
|
|
198
|
-
# ".*" ".profile" true
|
|
206
|
+
# "*" ".profile" false # wildcard doesn't match leading period by default
|
|
207
|
+
# "*" ".profile" true # unless hidden is enabled {hidden: true}
|
|
208
|
+
# ".*" ".profile" true # leading period is explicit
|
|
199
209
|
#
|
|
200
210
|
# "**/*.rb" "main.rb" false
|
|
201
211
|
# "**/*.rb" "./main.rb" false
|
|
@@ -221,10 +231,10 @@ module IOStreams
|
|
|
221
231
|
end
|
|
222
232
|
|
|
223
233
|
# Add a named root path
|
|
224
|
-
def self.add_root(root, *elements)
|
|
234
|
+
def self.add_root(root, *elements, **args)
|
|
225
235
|
raise(ArgumentError, "Invalid characters in root name #{root.inspect}") unless root.to_s =~ /\A\w+\Z/
|
|
226
236
|
|
|
227
|
-
@root_paths[root.to_sym] = path(*elements)
|
|
237
|
+
@root_paths[root.to_sym] = path(*elements, **args)
|
|
228
238
|
end
|
|
229
239
|
|
|
230
240
|
def self.roots
|
|
@@ -234,7 +244,7 @@ module IOStreams
|
|
|
234
244
|
# Set the temporary path to use when creating local temp files.
|
|
235
245
|
def self.temp_dir=(temp_dir)
|
|
236
246
|
temp_dir = File.expand_path(temp_dir)
|
|
237
|
-
FileUtils.mkdir_p(temp_dir)
|
|
247
|
+
FileUtils.mkdir_p(temp_dir)
|
|
238
248
|
|
|
239
249
|
@temp_dir = temp_dir
|
|
240
250
|
end
|
|
@@ -249,6 +259,23 @@ module IOStreams
|
|
|
249
259
|
|
|
250
260
|
@temp_dir = nil
|
|
251
261
|
|
|
262
|
+
# Returns [Logger] the logger used by IOStreams for debug logging.
|
|
263
|
+
#
|
|
264
|
+
# When SemanticLogger is loaded a SemanticLogger instance is used by default,
|
|
265
|
+
# otherwise no logging is performed unless a logger is assigned via #logger=.
|
|
266
|
+
def self.logger
|
|
267
|
+
@logger
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
# Replace the logger used by IOStreams.
|
|
271
|
+
#
|
|
272
|
+
# Set to nil to disable logging.
|
|
273
|
+
def self.logger=(logger)
|
|
274
|
+
@logger = logger
|
|
275
|
+
end
|
|
276
|
+
|
|
277
|
+
@logger = (SemanticLogger[IOStreams] if defined?(SemanticLogger::Logger))
|
|
278
|
+
|
|
252
279
|
# Register a file extension and the reader and writer streaming classes
|
|
253
280
|
#
|
|
254
281
|
# Example:
|
|
@@ -257,7 +284,7 @@ module IOStreams
|
|
|
257
284
|
def self.register_extension(extension, reader_class, writer_class)
|
|
258
285
|
raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.nil? || extension.to_s =~ /\A\w+\Z/
|
|
259
286
|
|
|
260
|
-
@extensions[extension
|
|
287
|
+
@extensions[extension&.to_sym] = Extension.new(reader_class, writer_class)
|
|
261
288
|
end
|
|
262
289
|
|
|
263
290
|
# De-Register a file extension
|
|
@@ -265,7 +292,7 @@ module IOStreams
|
|
|
265
292
|
# Returns [Symbol] the extension removed, or nil if the extension was not registered
|
|
266
293
|
#
|
|
267
294
|
# Example:
|
|
268
|
-
#
|
|
295
|
+
# deregister_extension(:xls)
|
|
269
296
|
def self.deregister_extension(extension)
|
|
270
297
|
raise(ArgumentError, "Invalid extension #{extension.inspect}") unless extension.to_s =~ /\A\w+\Z/
|
|
271
298
|
|
|
@@ -277,15 +304,14 @@ module IOStreams
|
|
|
277
304
|
@extensions.dup
|
|
278
305
|
end
|
|
279
306
|
|
|
280
|
-
# Register a
|
|
307
|
+
# Register a URI scheme and the path class that handles it
|
|
281
308
|
#
|
|
282
309
|
# Example:
|
|
283
|
-
#
|
|
284
|
-
# register_scheme(:xls, MyXls::Reader, MyXls::Writer)
|
|
310
|
+
# register_scheme(:gcs, MyGoogleCloudStoragePath)
|
|
285
311
|
def self.register_scheme(scheme, klass)
|
|
286
312
|
raise(ArgumentError, "Invalid scheme #{scheme.inspect}") unless scheme.nil? || scheme.to_s =~ /\A\w+\Z/
|
|
287
313
|
|
|
288
|
-
@schemes[scheme
|
|
314
|
+
@schemes[scheme&.to_sym] = klass
|
|
289
315
|
end
|
|
290
316
|
|
|
291
317
|
def self.schemes
|
|
@@ -293,7 +319,7 @@ module IOStreams
|
|
|
293
319
|
end
|
|
294
320
|
|
|
295
321
|
def self.scheme(scheme_name)
|
|
296
|
-
@schemes[scheme_name
|
|
322
|
+
@schemes[scheme_name&.to_sym] || raise(ArgumentError, "Unknown Scheme type: #{scheme_name.inspect}")
|
|
297
323
|
end
|
|
298
324
|
|
|
299
325
|
Extension = Struct.new(:reader_class, :writer_class)
|
|
@@ -6,7 +6,7 @@ module IOStreams
|
|
|
6
6
|
# Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read.
|
|
7
7
|
MAX_BLOCKS_MULTIPLIER = 100
|
|
8
8
|
|
|
9
|
-
LINEFEED_REGEXP =
|
|
9
|
+
LINEFEED_REGEXP = /\r\n|\n|\r/
|
|
10
10
|
|
|
11
11
|
# Read a line at a time from a stream
|
|
12
12
|
def self.stream(input_stream, **args)
|
|
@@ -44,7 +44,7 @@ module IOStreams
|
|
|
44
44
|
#
|
|
45
45
|
# Note:
|
|
46
46
|
# * When using a line reader and the file_name ends with ".csv" then embedded_within is automatically set to `"`
|
|
47
|
-
def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil
|
|
47
|
+
def initialize(input_stream, delimiter: nil, buffer_size: 65_536, embedded_within: nil)
|
|
48
48
|
super(input_stream)
|
|
49
49
|
|
|
50
50
|
@embedded_within = embedded_within
|
|
@@ -24,7 +24,7 @@ module IOStreams
|
|
|
24
24
|
# Add the specified delimiter after every record when writing it
|
|
25
25
|
# to the output stream
|
|
26
26
|
# Default: OS Specific. Linux: "\n"
|
|
27
|
-
def initialize(output_stream, delimiter:
|
|
27
|
+
def initialize(output_stream, delimiter: $/)
|
|
28
28
|
super(output_stream)
|
|
29
29
|
@delimiter = delimiter
|
|
30
30
|
end
|
data/lib/io_streams/path.rb
CHANGED
|
@@ -41,7 +41,7 @@ module IOStreams
|
|
|
41
41
|
# Runs the pattern from the current path, returning the complete path for located files.
|
|
42
42
|
#
|
|
43
43
|
# See IOStreams::Paths::File.each for arguments.
|
|
44
|
-
def each_child(pattern = "*", **args, &
|
|
44
|
+
def each_child(pattern = "*", **args, &)
|
|
45
45
|
raise NotImplementedError
|
|
46
46
|
end
|
|
47
47
|
|
|
@@ -84,7 +84,7 @@ module IOStreams
|
|
|
84
84
|
# Cleanup an incomplete write to the target "file" if the copy fails.
|
|
85
85
|
# rubocop:disable Lint/SuppressedException
|
|
86
86
|
def copy_from(source, **args)
|
|
87
|
-
super
|
|
87
|
+
super
|
|
88
88
|
rescue StandardError => e
|
|
89
89
|
begin
|
|
90
90
|
delete
|
|
@@ -15,16 +15,16 @@ module IOStreams
|
|
|
15
15
|
# Examples:
|
|
16
16
|
#
|
|
17
17
|
# # Case Insensitive file name lookup:
|
|
18
|
-
# IOStreams.path("ruby").
|
|
18
|
+
# IOStreams.path("ruby").each_child("r*.md") { |path| puts path }
|
|
19
19
|
#
|
|
20
20
|
# # Case Sensitive file name lookup:
|
|
21
|
-
# IOStreams.path("ruby").
|
|
21
|
+
# IOStreams.path("ruby").each_child("R*.md", case_sensitive: true) { |path| puts path }
|
|
22
22
|
#
|
|
23
23
|
# # Also return the names of directories found during the search:
|
|
24
|
-
# IOStreams.path("ruby").
|
|
24
|
+
# IOStreams.path("ruby").each_child("R*.md", directories: true) { |path| puts path }
|
|
25
25
|
#
|
|
26
26
|
# # Case Insensitive recursive file name lookup:
|
|
27
|
-
# IOStreams.path("ruby").
|
|
27
|
+
# IOStreams.path("ruby").each_child("**/*.md") { |path| puts path }
|
|
28
28
|
#
|
|
29
29
|
# Parameters:
|
|
30
30
|
# pattern [String]
|
|
@@ -77,9 +77,9 @@ module IOStreams
|
|
|
77
77
|
# "\a" "a" true # escaped ordinary remains ordinary
|
|
78
78
|
# "[\?]" "?" true # can escape inside bracket expression
|
|
79
79
|
#
|
|
80
|
-
# "*" ".profile" false # wildcard doesn't match leading
|
|
81
|
-
# "*" ".profile" true #
|
|
82
|
-
# ".*" ".profile" true
|
|
80
|
+
# "*" ".profile" false # wildcard doesn't match leading period by default
|
|
81
|
+
# "*" ".profile" true # unless hidden is enabled {hidden: true}
|
|
82
|
+
# ".*" ".profile" true # leading period is explicit
|
|
83
83
|
#
|
|
84
84
|
# "**/*.rb" "main.rb" false
|
|
85
85
|
# "**/*.rb" "./main.rb" false
|
|
@@ -99,7 +99,21 @@ module IOStreams
|
|
|
99
99
|
flags |= ::File::FNM_DOTMATCH if hidden
|
|
100
100
|
|
|
101
101
|
# Dir.each_child("testdir") {|x| puts "Got #{x}" }
|
|
102
|
-
|
|
102
|
+
full_pattern = ::File.join(path, pattern)
|
|
103
|
+
|
|
104
|
+
results = Dir.glob(full_pattern, flags)
|
|
105
|
+
|
|
106
|
+
# On some platforms or Ruby versions, FNM_CASEFOLD may not work properly
|
|
107
|
+
# with complex patterns. If case-insensitive matching returns no results
|
|
108
|
+
# but we expected some, try a more robust approach.
|
|
109
|
+
if results.empty? && !case_sensitive && pattern.match?(/[A-Z]/)
|
|
110
|
+
# Try converting the pattern to lowercase and re-matching
|
|
111
|
+
lowercase_pattern = pattern.downcase
|
|
112
|
+
lowercase_full_pattern = ::File.join(path, lowercase_pattern)
|
|
113
|
+
results = Dir.glob(lowercase_full_pattern, flags)
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
results.each do |full_path|
|
|
103
117
|
next if !directories && ::File.directory?(full_path)
|
|
104
118
|
|
|
105
119
|
yield(self.class.new(full_path))
|
|
@@ -122,12 +136,12 @@ module IOStreams
|
|
|
122
136
|
|
|
123
137
|
def mkpath
|
|
124
138
|
dir = ::File.dirname(path)
|
|
125
|
-
FileUtils.mkdir_p(dir)
|
|
139
|
+
FileUtils.mkdir_p(dir)
|
|
126
140
|
self
|
|
127
141
|
end
|
|
128
142
|
|
|
129
143
|
def mkdir
|
|
130
|
-
FileUtils.mkdir_p(path)
|
|
144
|
+
FileUtils.mkdir_p(path)
|
|
131
145
|
self
|
|
132
146
|
end
|
|
133
147
|
|
|
@@ -175,7 +189,7 @@ module IOStreams
|
|
|
175
189
|
begin
|
|
176
190
|
::File.open(path, "wb") { |io| builder.writer(io, &block) }
|
|
177
191
|
rescue StandardError => e
|
|
178
|
-
::
|
|
192
|
+
::FileUtils.rm_f(path)
|
|
179
193
|
raise(e)
|
|
180
194
|
end
|
|
181
195
|
end
|
|
@@ -26,7 +26,32 @@ module IOStreams
|
|
|
26
26
|
#
|
|
27
27
|
# http_redirect_count: [Integer]
|
|
28
28
|
# Maximum number of http redirects to follow.
|
|
29
|
-
|
|
29
|
+
# Set to 0 to disable following redirects entirely.
|
|
30
|
+
# Default: 10
|
|
31
|
+
#
|
|
32
|
+
# allow_hosts: [String | Array<String>]
|
|
33
|
+
# Optional allow-list of host names that may be contacted, applied to the
|
|
34
|
+
# supplied url and to every redirect that is followed.
|
|
35
|
+
# When supplied, a request to any other host raises CommunicationsFailure.
|
|
36
|
+
# Use this to limit Server Side Request Forgery (SSRF) exposure when the url
|
|
37
|
+
# can be influenced by untrusted input.
|
|
38
|
+
# Default: nil (any host is allowed).
|
|
39
|
+
#
|
|
40
|
+
# maximum_file_size: [Integer]
|
|
41
|
+
# Optional maximum number of bytes to download.
|
|
42
|
+
# When the response body exceeds this size the download is aborted with a
|
|
43
|
+
# CommunicationsFailure, protecting against unbounded (denial of service) responses.
|
|
44
|
+
# Default: nil (no limit).
|
|
45
|
+
#
|
|
46
|
+
# Security notes:
|
|
47
|
+
# - Redirect targets are supplied by the remote server. Validating only the url that is
|
|
48
|
+
# passed in is therefore not sufficient to prevent SSRF: use `allow_hosts` (or disable
|
|
49
|
+
# redirects with `http_redirect_count: 0`) when the url is not fully trusted.
|
|
50
|
+
# - Basic authentication credentials are only sent to the original host. They are not
|
|
51
|
+
# resent when a redirect points at a different scheme, host, or port, so that a
|
|
52
|
+
# redirect cannot leak the credentials to another server.
|
|
53
|
+
def initialize(url, username: nil, password: nil, http_redirect_count: 10, parameters: nil,
|
|
54
|
+
allow_hosts: nil, maximum_file_size: nil)
|
|
30
55
|
uri = URI.parse(url)
|
|
31
56
|
unless %w[http https].include?(uri.scheme)
|
|
32
57
|
raise(
|
|
@@ -38,6 +63,8 @@ module IOStreams
|
|
|
38
63
|
@username = username || uri.user
|
|
39
64
|
@password = password || uri.password
|
|
40
65
|
@http_redirect_count = http_redirect_count
|
|
66
|
+
@allow_hosts = allow_hosts.nil? ? nil : Array(allow_hosts)
|
|
67
|
+
@maximum_file_size = maximum_file_size
|
|
41
68
|
@url = parameters ? "#{url}?#{URI.encode_www_form(parameters)}" : url
|
|
42
69
|
super(uri.path)
|
|
43
70
|
end
|
|
@@ -53,6 +80,8 @@ module IOStreams
|
|
|
53
80
|
|
|
54
81
|
private
|
|
55
82
|
|
|
83
|
+
attr_reader :allow_hosts, :maximum_file_size
|
|
84
|
+
|
|
56
85
|
# Read a file using an http get.
|
|
57
86
|
#
|
|
58
87
|
# For example:
|
|
@@ -63,18 +92,20 @@ module IOStreams
|
|
|
63
92
|
#
|
|
64
93
|
# Notes:
|
|
65
94
|
# * Since Net::HTTP download only supports a push stream, the data is streamed into a tempfile first.
|
|
66
|
-
def stream_reader(&
|
|
67
|
-
handle_redirects(url, http_redirect_count, &
|
|
95
|
+
def stream_reader(&)
|
|
96
|
+
handle_redirects(url, http_redirect_count, &)
|
|
68
97
|
end
|
|
69
98
|
|
|
70
99
|
def handle_redirects(uri, http_redirect_count, &block)
|
|
71
100
|
uri = URI.parse(uri) unless uri.is_a?(URI)
|
|
72
101
|
result = nil
|
|
73
|
-
|
|
102
|
+
|
|
103
|
+
validate_uri!(uri)
|
|
74
104
|
|
|
75
105
|
Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
|
|
76
106
|
request = Net::HTTP::Get.new(uri)
|
|
77
|
-
|
|
107
|
+
# Only send credentials to the original host to avoid leaking them via a redirect.
|
|
108
|
+
request.basic_auth(username, password) if username && same_origin?(uri)
|
|
78
109
|
|
|
79
110
|
http.request(request) do |response|
|
|
80
111
|
raise(IOStreams::Errors::CommunicationsFailure, "Invalid URL: #{uri}") if response.is_a?(Net::HTTPNotFound)
|
|
@@ -83,7 +114,13 @@ module IOStreams
|
|
|
83
114
|
end
|
|
84
115
|
|
|
85
116
|
if response.is_a?(Net::HTTPRedirection)
|
|
86
|
-
|
|
117
|
+
raise(IOStreams::Errors::CommunicationsFailure, "Too many redirects") if http_redirect_count < 1
|
|
118
|
+
|
|
119
|
+
location = response["location"]
|
|
120
|
+
raise(IOStreams::Errors::CommunicationsFailure, "Redirect missing location header: #{uri}") unless location
|
|
121
|
+
|
|
122
|
+
# Resolve relative redirects against the current uri.
|
|
123
|
+
new_uri = uri.merge(location)
|
|
87
124
|
return handle_redirects(new_uri, http_redirect_count - 1, &block)
|
|
88
125
|
end
|
|
89
126
|
|
|
@@ -93,7 +130,7 @@ module IOStreams
|
|
|
93
130
|
|
|
94
131
|
# Since Net::HTTP download only supports a push stream, write it to a tempfile first.
|
|
95
132
|
Utils.temp_file_name("iostreams_http") do |file_name|
|
|
96
|
-
|
|
133
|
+
download_to_file(response, file_name)
|
|
97
134
|
# Return a read stream
|
|
98
135
|
result = ::File.open(file_name, "rb") { |io| builder.reader(io, &block) }
|
|
99
136
|
end
|
|
@@ -101,6 +138,42 @@ module IOStreams
|
|
|
101
138
|
end
|
|
102
139
|
result
|
|
103
140
|
end
|
|
141
|
+
|
|
142
|
+
# Validate that the host may be contacted, and that the scheme is still http(s)
|
|
143
|
+
# after following a redirect.
|
|
144
|
+
def validate_uri!(uri)
|
|
145
|
+
unless %w[http https].include?(uri.scheme)
|
|
146
|
+
raise(IOStreams::Errors::CommunicationsFailure, "Invalid redirect, only http and https are supported: #{uri}")
|
|
147
|
+
end
|
|
148
|
+
return if allow_hosts.nil? || allow_hosts.include?(uri.hostname)
|
|
149
|
+
|
|
150
|
+
raise(IOStreams::Errors::CommunicationsFailure, "Host not in the allowed list of hosts: #{uri.hostname}")
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def same_origin?(uri)
|
|
154
|
+
original = original_uri
|
|
155
|
+
uri.scheme == original.scheme && uri.hostname == original.hostname && uri.port == original.port
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
def original_uri
|
|
159
|
+
@original_uri ||= URI.parse(url)
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
def download_to_file(response, file_name)
|
|
163
|
+
size = 0
|
|
164
|
+
::File.open(file_name, "wb") do |io|
|
|
165
|
+
response.read_body do |chunk|
|
|
166
|
+
size += chunk.bytesize
|
|
167
|
+
if maximum_file_size && (size > maximum_file_size)
|
|
168
|
+
raise(
|
|
169
|
+
IOStreams::Errors::CommunicationsFailure,
|
|
170
|
+
"Exceeded maximum allowed download size of #{maximum_file_size} bytes"
|
|
171
|
+
)
|
|
172
|
+
end
|
|
173
|
+
io.write(chunk)
|
|
174
|
+
end
|
|
175
|
+
end
|
|
176
|
+
end
|
|
104
177
|
end
|
|
105
178
|
end
|
|
106
179
|
end
|