iostreams 1.2.1 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +19 -4
- data/lib/io_streams/builder.rb +27 -10
- data/lib/io_streams/bzip2/reader.rb +3 -3
- data/lib/io_streams/bzip2/writer.rb +3 -3
- data/lib/io_streams/deprecated.rb +1 -1
- data/lib/io_streams/encode/reader.rb +1 -3
- data/lib/io_streams/encode/writer.rb +1 -1
- data/lib/io_streams/errors.rb +22 -0
- data/lib/io_streams/io_streams.rb +1 -5
- data/lib/io_streams/line/reader.rb +28 -16
- data/lib/io_streams/path.rb +3 -1
- data/lib/io_streams/paths/file.rb +4 -4
- data/lib/io_streams/paths/http.rb +6 -3
- data/lib/io_streams/paths/s3.rb +30 -8
- data/lib/io_streams/paths/sftp.rb +34 -13
- data/lib/io_streams/pgp.rb +84 -71
- data/lib/io_streams/stream.rb +78 -12
- data/lib/io_streams/tabular.rb +28 -27
- data/lib/io_streams/tabular/header.rb +14 -12
- data/lib/io_streams/tabular/parser/csv.rb +4 -2
- data/lib/io_streams/tabular/parser/fixed.rb +166 -26
- data/lib/io_streams/tabular/utility/csv_row.rb +1 -4
- data/lib/io_streams/utils.rb +4 -4
- data/lib/io_streams/version.rb +1 -1
- data/lib/io_streams/zip/reader.rb +1 -1
- data/test/builder_test.rb +29 -0
- data/test/bzip2_writer_test.rb +6 -4
- data/test/deprecated_test.rb +2 -0
- data/test/files/test.psv +4 -0
- data/test/files/unclosed_quote_large_test.csv +1658 -0
- data/test/files/unclosed_quote_test2.csv +3 -0
- data/test/io_streams_test.rb +2 -2
- data/test/line_reader_test.rb +30 -4
- data/test/paths/file_test.rb +1 -1
- data/test/paths/s3_test.rb +3 -3
- data/test/paths/sftp_test.rb +4 -4
- data/test/pgp_test.rb +54 -4
- data/test/pgp_writer_test.rb +3 -3
- data/test/stream_test.rb +174 -8
- data/test/tabular_test.rb +100 -40
- data/test/test_helper.rb +1 -1
- metadata +47 -42
data/lib/io_streams/tabular.rb
CHANGED
@@ -52,7 +52,7 @@ module IOStreams
|
|
52
52
|
# format: [Symbol]
|
53
53
|
# :csv, :hash, :array, :json, :psv, :fixed
|
54
54
|
#
|
55
|
-
# file_name: [String]
|
55
|
+
# file_name: [IOStreams::Path | String]
|
56
56
|
# When `:format` is not supplied the file name can be used to infer the required format.
|
57
57
|
# Optional. Default: nil
|
58
58
|
#
|
@@ -81,15 +81,20 @@ module IOStreams
|
|
81
81
|
# #as_hash will skip these additional columns entirely as if they were not in the file at all.
|
82
82
|
# false:
|
83
83
|
# Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
|
84
|
-
|
84
|
+
#
|
85
|
+
# default_format: [Symbol]
|
86
|
+
# When the format is not supplied, and the format cannot be inferred from the supplied file name
|
87
|
+
# then this default format will be used.
|
88
|
+
# Default: :csv
|
89
|
+
# Set to nil to force it to raise an exception when the format is undefined.
|
90
|
+
def initialize(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args)
|
85
91
|
@header = Header.new(**args)
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
@parser = format_options ? klass.new(format_options) : klass.new
|
92
|
+
@format = file_name && format.nil? ? self.class.format_from_file_name(file_name) : format
|
93
|
+
@format ||= default_format
|
94
|
+
raise(UnknownFormat, "The format cannot be inferred from the file name: #{file_name}") unless @format
|
95
|
+
|
96
|
+
klass = self.class.parser_class(@format)
|
97
|
+
@parser = format_options ? klass.new(**format_options) : klass.new
|
93
98
|
end
|
94
99
|
|
95
100
|
# Returns [true|false] whether a header is still required in order to parse or render the current format.
|
@@ -142,7 +147,10 @@ module IOStreams
|
|
142
147
|
return unless requires_header?
|
143
148
|
|
144
149
|
if IOStreams::Utils.blank?(header.columns)
|
145
|
-
raise(
|
150
|
+
raise(
|
151
|
+
Errors::MissingHeader,
|
152
|
+
"Header columns must be set before attempting to render a header for format: #{format.inspect}"
|
153
|
+
)
|
146
154
|
end
|
147
155
|
|
148
156
|
parser.render(header.columns, header)
|
@@ -159,9 +167,9 @@ module IOStreams
|
|
159
167
|
# Example:
|
160
168
|
# register_format(:csv, IOStreams::Tabular::Parser::Csv)
|
161
169
|
def self.register_format(format, parser)
|
162
|
-
raise(ArgumentError, "Invalid format #{format.inspect}") unless format.
|
170
|
+
raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/
|
163
171
|
|
164
|
-
@formats[format.
|
172
|
+
@formats[format.to_sym] = parser
|
165
173
|
end
|
166
174
|
|
167
175
|
# De-Register a file format
|
@@ -181,28 +189,21 @@ module IOStreams
|
|
181
189
|
@formats.keys
|
182
190
|
end
|
183
191
|
|
184
|
-
private
|
185
|
-
|
186
192
|
# A registry to hold formats for processing files during upload or download
|
187
193
|
@formats = {}
|
188
194
|
|
189
|
-
|
190
|
-
|
195
|
+
# Returns the registered format that will be used for the supplied file name.
|
196
|
+
def self.format_from_file_name(file_name)
|
197
|
+
file_name.to_s.split(".").reverse_each { |ext| return ext.to_sym if @formats.include?(ext.to_sym) }
|
198
|
+
nil
|
191
199
|
end
|
192
200
|
|
193
|
-
# Returns the parser
|
194
|
-
def self.
|
195
|
-
format
|
196
|
-
|
197
|
-
if @formats.include?(ext.to_sym)
|
198
|
-
format = ext.to_sym
|
199
|
-
break
|
200
|
-
end
|
201
|
-
end
|
202
|
-
parser_class(format)
|
201
|
+
# Returns the parser class for the registered format.
|
202
|
+
def self.parser_class(format)
|
203
|
+
@formats[format.nil? ? nil : format.to_sym] ||
|
204
|
+
raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
|
203
205
|
end
|
204
206
|
|
205
|
-
register_format(nil, IOStreams::Tabular::Parser::Csv)
|
206
207
|
register_format(:array, IOStreams::Tabular::Parser::Array)
|
207
208
|
register_format(:csv, IOStreams::Tabular::Parser::Csv)
|
208
209
|
register_format(:fixed, IOStreams::Tabular::Parser::Fixed)
|
@@ -109,7 +109,10 @@ module IOStreams
|
|
109
109
|
end
|
110
110
|
|
111
111
|
unless row.is_a?(Array)
|
112
|
-
raise(
|
112
|
+
raise(
|
113
|
+
IOStreams::Errors::TypeMismatch,
|
114
|
+
"Don't know how to convert #{row.class.name} to an Array without the header columns being set."
|
115
|
+
)
|
113
116
|
end
|
114
117
|
|
115
118
|
row
|
@@ -126,18 +129,17 @@ module IOStreams
|
|
126
129
|
# Perform cleansing on returned Hash keys during the narrowing process.
|
127
130
|
# For example, avoids issues with case etc.
|
128
131
|
def cleanse_hash(hash)
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
132
|
+
unmatched = columns - hash.keys
|
133
|
+
unless unmatched.empty?
|
134
|
+
hash = hash.dup
|
135
|
+
unmatched.each { |name| hash[cleanse_column(name)] = hash.delete(name) }
|
136
|
+
end
|
137
|
+
# Hash#slice as of Ruby 2.5
|
138
|
+
if hash.respond_to?(:slice)
|
139
|
+
hash.slice(*columns)
|
140
|
+
else
|
141
|
+
columns.each_with_object({}) { |column, new_hash| new_hash[column] = hash[column] }
|
139
142
|
end
|
140
|
-
h
|
141
143
|
end
|
142
144
|
|
143
145
|
def cleanse_column(name)
|
@@ -5,8 +5,10 @@ module IOStreams
|
|
5
5
|
class Csv < Base
|
6
6
|
attr_reader :csv_parser
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
unless RUBY_VERSION.to_f >= 2.6
|
9
|
+
def initialize
|
10
|
+
@csv_parser = Utility::CSVRow.new
|
11
|
+
end
|
10
12
|
end
|
11
13
|
|
12
14
|
# Returns [Array<String>] the header row.
|
@@ -3,31 +3,77 @@ module IOStreams
|
|
3
3
|
module Parser
|
4
4
|
# Parsing and rendering fixed length data
|
5
5
|
class Fixed < Base
|
6
|
-
attr_reader :
|
6
|
+
attr_reader :layout, :truncate
|
7
7
|
|
8
8
|
# Returns [IOStreams::Tabular::Parser]
|
9
9
|
#
|
10
10
|
# Parameters:
|
11
11
|
# layout: [Array<Hash>]
|
12
12
|
# [
|
13
|
-
# {
|
14
|
-
# {
|
15
|
-
# {
|
13
|
+
# {size: 23, key: "name"},
|
14
|
+
# {size: 40, key: "address"},
|
15
|
+
# {size: 2},
|
16
|
+
# {size: 5, key: "zip"},
|
17
|
+
# {size: 8, key: "age", type: :integer},
|
18
|
+
# {size: 10, key: "weight", type: :float, decimals: 2}
|
16
19
|
# ]
|
17
|
-
|
18
|
-
|
20
|
+
#
|
21
|
+
# Notes:
|
22
|
+
# * Leave out the name of the key to ignore that column during parsing,
|
23
|
+
# and to space fill when rendering. For example as a filler.
|
24
|
+
#
|
25
|
+
# Types:
|
26
|
+
# :string
|
27
|
+
# This is the default type.
|
28
|
+
# Applies space padding and the value is left justified.
|
29
|
+
# Returns value as a String
|
30
|
+
# :integer
|
31
|
+
# Applies zero padding to the left.
|
32
|
+
# Returns value as an Integer
|
33
|
+
# Raises Errors::ValueTooLong when the supplied value cannot be rendered in `size` characters.
|
34
|
+
# :float
|
35
|
+
# Applies zero padding to the left.
|
36
|
+
# Returns value as a float.
|
37
|
+
# The :size is the total size of this field including the `.` and the decimals.
|
38
|
+
# Number of :decimals
|
39
|
+
# Raises Errors::ValueTooLong when the supplied value cannot be rendered in `size` characters.
|
40
|
+
#
|
41
|
+
# In some circumstances the length of the last column is variable.
|
42
|
+
# layout: [Array<Hash>]
|
43
|
+
# [
|
44
|
+
# {size: 23, key: "name"},
|
45
|
+
# {size: :remainder, key: "rest"}
|
46
|
+
# ]
|
47
|
+
# By setting a size of `:remainder` it will take the rest of the line as the value for that column.
|
48
|
+
#
|
49
|
+
# A size of `:remainder` and no `:key` will discard the remainder of the line without validating the length.
|
50
|
+
# layout: [Array<Hash>]
|
51
|
+
# [
|
52
|
+
# {size: 23, key: "name"},
|
53
|
+
# {size: :remainder}
|
54
|
+
# ]
|
55
|
+
#
|
56
|
+
def initialize(layout:, truncate: true)
|
57
|
+
@layout = Layout.new(layout)
|
58
|
+
@truncate = truncate
|
59
|
+
end
|
60
|
+
|
61
|
+
# The required line length for every fixed length line
|
62
|
+
def line_length
|
63
|
+
layout.length
|
19
64
|
end
|
20
65
|
|
21
66
|
# Returns [String] fixed layout values extracted from the supplied hash.
|
22
|
-
#
|
67
|
+
#
|
68
|
+
# Notes:
|
69
|
+
# * A nil value is considered an empty string
|
70
|
+
# * When a supplied value exceeds the column size it is truncated.
|
23
71
|
def render(row, header)
|
24
72
|
hash = header.to_hash(row)
|
25
73
|
|
26
74
|
result = ""
|
27
|
-
|
28
|
-
|
29
|
-
value = hash[map.key].to_s
|
30
|
-
result << format("%-#{map.size}.#{map.size}s", value)
|
75
|
+
layout.columns.each do |column|
|
76
|
+
result << column.render(hash[column.key], truncate)
|
31
77
|
end
|
32
78
|
result
|
33
79
|
end
|
@@ -36,32 +82,126 @@ module IOStreams
|
|
36
82
|
# String will be encoded to `encoding`
|
37
83
|
def parse(line)
|
38
84
|
unless line.is_a?(String)
|
39
|
-
raise(
|
85
|
+
raise(Errors::TypeMismatch, "Line must be a String when format is :fixed. Actual: #{line.class.name}")
|
86
|
+
end
|
87
|
+
|
88
|
+
if layout.length.positive? && (line.length != layout.length)
|
89
|
+
raise(Errors::InvalidLineLength, "Expected line length: #{layout.length}, actual line length: #{line.length}")
|
40
90
|
end
|
41
91
|
|
42
92
|
hash = {}
|
43
93
|
index = 0
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
94
|
+
layout.columns.each do |column|
|
95
|
+
if column.size == -1
|
96
|
+
hash[column.key] = column.parse(line[index..-1]) if column.key
|
97
|
+
break
|
98
|
+
end
|
99
|
+
|
100
|
+
# Ignore "columns" that have no keys. E.g. Fillers
|
101
|
+
hash[column.key] = column.parse(line[index, column.size]) if column.key
|
102
|
+
index += column.size
|
48
103
|
end
|
49
104
|
hash
|
50
105
|
end
|
51
106
|
|
52
|
-
|
107
|
+
# The header is required as an argument and cannot be supplied in the file itself.
|
108
|
+
def requires_header?
|
109
|
+
false
|
110
|
+
end
|
111
|
+
|
112
|
+
class Layout
|
113
|
+
attr_reader :columns, :length
|
114
|
+
|
115
|
+
# Returns [Array<FixedLayout>] the layout for this fixed width file.
|
116
|
+
# Also validates values
|
117
|
+
def initialize(layout)
|
118
|
+
@length = 0
|
119
|
+
@columns = parse_layout(layout)
|
120
|
+
end
|
121
|
+
|
122
|
+
private
|
123
|
+
|
124
|
+
def parse_layout(layout)
|
125
|
+
@length = 0
|
126
|
+
layout.collect do |hash|
|
127
|
+
raise(Errors::InvalidLayout, "Missing required :size in: #{hash.inspect}") unless hash.key?(:size)
|
128
|
+
|
129
|
+
column = Column.new(**hash)
|
130
|
+
if column.size == -1
|
131
|
+
if @length == -1
|
132
|
+
raise(Errors::InvalidLayout, "Only the last :size can be '-1' or :remainder in: #{hash.inspect}")
|
133
|
+
end
|
134
|
+
|
135
|
+
@length = -1
|
136
|
+
else
|
137
|
+
@length += column.size
|
138
|
+
end
|
139
|
+
column
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
class Column
|
145
|
+
TYPES = %i[string integer float].freeze
|
146
|
+
|
147
|
+
attr_reader :key, :size, :type, :decimals
|
148
|
+
|
149
|
+
def initialize(key: nil, size:, type: :string, decimals: 2)
|
150
|
+
@key = key
|
151
|
+
@size = size == :remainder ? -1 : size.to_i
|
152
|
+
@type = type.to_sym
|
153
|
+
@decimals = decimals
|
154
|
+
|
155
|
+
unless @size.positive? || (@size == -1)
|
156
|
+
raise(Errors::InvalidLayout, "Size #{size.inspect} must be positive or :remainder")
|
157
|
+
end
|
158
|
+
raise(Errors::InvalidLayout, "Unknown type: #{type.inspect}") unless TYPES.include?(type)
|
159
|
+
end
|
160
|
+
|
161
|
+
def parse(value)
|
162
|
+
return if value.nil?
|
163
|
+
|
164
|
+
stripped_value = value.to_s.strip
|
165
|
+
|
166
|
+
case type
|
167
|
+
when :string
|
168
|
+
stripped_value
|
169
|
+
when :integer
|
170
|
+
stripped_value.length.zero? ? nil : value.to_i
|
171
|
+
when :float
|
172
|
+
stripped_value.length.zero? ? nil : value.to_f
|
173
|
+
else
|
174
|
+
raise(Errors::InvalidLayout, "Unsupported type: #{type.inspect}")
|
175
|
+
end
|
176
|
+
end
|
177
|
+
|
178
|
+
def render(value, truncate)
|
179
|
+
formatted =
|
180
|
+
case type
|
181
|
+
when :string
|
182
|
+
value = value.to_s
|
183
|
+
return value if size == -1
|
184
|
+
|
185
|
+
format(truncate ? "%-#{size}.#{size}s" : "%-#{size}s", value)
|
186
|
+
when :integer
|
187
|
+
return value.to_i.to_s if size == -1
|
188
|
+
|
189
|
+
truncate = false
|
190
|
+
format("%0#{size}d", value.to_i)
|
191
|
+
when :float
|
192
|
+
return value.to_f.to_s if size == -1
|
53
193
|
|
54
|
-
|
194
|
+
truncate = false
|
195
|
+
format("%0#{size}.#{decimals}f", value.to_f)
|
196
|
+
else
|
197
|
+
raise(Errors::InvalidLayout, "Unsupported type: #{type.inspect}")
|
198
|
+
end
|
55
199
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
layout.collect do |map|
|
60
|
-
size = map[:size]
|
61
|
-
key = map[:key]
|
62
|
-
raise(ArgumentError, "Missing required :key and :size in: #{map.inspect}") unless size && key
|
200
|
+
if !truncate && formatted.length > size
|
201
|
+
raise(Errors::ValueTooLong, "Value: #{value} is too large to fit into column:#{key} of size:#{size}")
|
202
|
+
end
|
63
203
|
|
64
|
-
|
204
|
+
formatted
|
65
205
|
end
|
66
206
|
end
|
67
207
|
end
|
@@ -6,10 +6,7 @@ module IOStreams
|
|
6
6
|
# 2 to 3 times better performance than CSV.parse_line and considerably less
|
7
7
|
# garbage collection required.
|
8
8
|
#
|
9
|
-
# Note:
|
10
|
-
# This parser does not support line feeds embedded in quoted fields since
|
11
|
-
# the file is broken apart based on line feeds during the upload process and
|
12
|
-
# is then processed by each worker on a line by line basis.
|
9
|
+
# Note: Only used prior to Ruby 2.6
|
13
10
|
class CSVRow < ::CSV
|
14
11
|
UTF8_ENCODING = Encoding.find("UTF-8").freeze
|
15
12
|
|
data/lib/io_streams/utils.rb
CHANGED
@@ -49,10 +49,10 @@ module IOStreams
|
|
49
49
|
@user = uri.user
|
50
50
|
@password = uri.password
|
51
51
|
@port = uri.port
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
52
|
+
return unless uri.query
|
53
|
+
|
54
|
+
@query = {}
|
55
|
+
::URI.decode_www_form(uri.query).each { |key, value| @query[key] = value }
|
56
56
|
end
|
57
57
|
end
|
58
58
|
end
|
data/lib/io_streams/version.rb
CHANGED
data/test/builder_test.rb
CHANGED
@@ -41,6 +41,35 @@ class BuilderTest < Minitest::Test
|
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
44
|
+
describe "#format" do
|
45
|
+
it "detects the format from the file name" do
|
46
|
+
streams = IOStreams::Builder.new("abc.json")
|
47
|
+
assert_equal :json, streams.format
|
48
|
+
end
|
49
|
+
|
50
|
+
it "is nil if the file name has no meaningful format" do
|
51
|
+
assert_nil streams.format
|
52
|
+
end
|
53
|
+
|
54
|
+
it "returns set format with no file_name" do
|
55
|
+
streams = IOStreams::Builder.new
|
56
|
+
streams.format = :csv
|
57
|
+
assert_equal :csv, streams.format
|
58
|
+
end
|
59
|
+
|
60
|
+
it "returns set format with file_name" do
|
61
|
+
streams = IOStreams::Builder.new("abc.json")
|
62
|
+
streams.format = :csv
|
63
|
+
assert_equal :csv, streams.format
|
64
|
+
end
|
65
|
+
|
66
|
+
it "validates bad format" do
|
67
|
+
assert_raises ArgumentError do
|
68
|
+
streams.format = :blah
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
44
73
|
describe "#stream" do
|
45
74
|
it "adds one stream" do
|
46
75
|
streams.stream(:pgp, passphrase: "unlock-me")
|