iostreams 1.2.1 → 1.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/README.md +19 -4
  3. data/lib/io_streams/builder.rb +27 -10
  4. data/lib/io_streams/bzip2/reader.rb +3 -3
  5. data/lib/io_streams/bzip2/writer.rb +3 -3
  6. data/lib/io_streams/deprecated.rb +1 -1
  7. data/lib/io_streams/encode/reader.rb +1 -3
  8. data/lib/io_streams/encode/writer.rb +1 -1
  9. data/lib/io_streams/errors.rb +22 -0
  10. data/lib/io_streams/io_streams.rb +1 -5
  11. data/lib/io_streams/line/reader.rb +28 -16
  12. data/lib/io_streams/path.rb +3 -1
  13. data/lib/io_streams/paths/file.rb +4 -4
  14. data/lib/io_streams/paths/http.rb +6 -3
  15. data/lib/io_streams/paths/s3.rb +30 -8
  16. data/lib/io_streams/paths/sftp.rb +34 -13
  17. data/lib/io_streams/pgp.rb +84 -71
  18. data/lib/io_streams/stream.rb +78 -12
  19. data/lib/io_streams/tabular.rb +28 -27
  20. data/lib/io_streams/tabular/header.rb +14 -12
  21. data/lib/io_streams/tabular/parser/csv.rb +4 -2
  22. data/lib/io_streams/tabular/parser/fixed.rb +166 -26
  23. data/lib/io_streams/tabular/utility/csv_row.rb +1 -4
  24. data/lib/io_streams/utils.rb +4 -4
  25. data/lib/io_streams/version.rb +1 -1
  26. data/lib/io_streams/zip/reader.rb +1 -1
  27. data/test/builder_test.rb +29 -0
  28. data/test/bzip2_writer_test.rb +6 -4
  29. data/test/deprecated_test.rb +2 -0
  30. data/test/files/test.psv +4 -0
  31. data/test/files/unclosed_quote_large_test.csv +1658 -0
  32. data/test/files/unclosed_quote_test2.csv +3 -0
  33. data/test/io_streams_test.rb +2 -2
  34. data/test/line_reader_test.rb +30 -4
  35. data/test/paths/file_test.rb +1 -1
  36. data/test/paths/s3_test.rb +3 -3
  37. data/test/paths/sftp_test.rb +4 -4
  38. data/test/pgp_test.rb +54 -4
  39. data/test/pgp_writer_test.rb +3 -3
  40. data/test/stream_test.rb +174 -8
  41. data/test/tabular_test.rb +100 -40
  42. data/test/test_helper.rb +1 -1
  43. metadata +47 -42
@@ -52,7 +52,7 @@ module IOStreams
52
52
  # format: [Symbol]
53
53
  # :csv, :hash, :array, :json, :psv, :fixed
54
54
  #
55
- # file_name: [String]
55
+ # file_name: [IOStreams::Path | String]
56
56
  # When `:format` is not supplied the file name can be used to infer the required format.
57
57
  # Optional. Default: nil
58
58
  #
@@ -81,15 +81,20 @@ module IOStreams
81
81
  # #as_hash will skip these additional columns entirely as if they were not in the file at all.
82
82
  # false:
83
83
  # Raises Tabular::InvalidHeader when a column is supplied that is not in the whitelist.
84
- def initialize(format: nil, file_name: nil, format_options: nil, **args)
84
+ #
85
+ # default_format: [Symbol]
86
+ # When the format is not supplied, and the format cannot be inferred from the supplied file name
87
+ # then this default format will be used.
88
+ # Default: :csv
89
+ # Set to nil to force it to raise an exception when the format is undefined.
90
+ def initialize(format: nil, file_name: nil, format_options: nil, default_format: :csv, **args)
85
91
  @header = Header.new(**args)
86
- klass =
87
- if file_name && format.nil?
88
- self.class.parser_class_for_file_name(file_name)
89
- else
90
- self.class.parser_class(format)
91
- end
92
- @parser = format_options ? klass.new(format_options) : klass.new
92
+ @format = file_name && format.nil? ? self.class.format_from_file_name(file_name) : format
93
+ @format ||= default_format
94
+ raise(UnknownFormat, "The format cannot be inferred from the file name: #{file_name}") unless @format
95
+
96
+ klass = self.class.parser_class(@format)
97
+ @parser = format_options ? klass.new(**format_options) : klass.new
93
98
  end
94
99
 
95
100
  # Returns [true|false] whether a header is still required in order to parse or render the current format.
@@ -142,7 +147,10 @@ module IOStreams
142
147
  return unless requires_header?
143
148
 
144
149
  if IOStreams::Utils.blank?(header.columns)
145
- raise(Errors::MissingHeader, "Header columns must be set before attempting to render a header for format: #{format.inspect}")
150
+ raise(
151
+ Errors::MissingHeader,
152
+ "Header columns must be set before attempting to render a header for format: #{format.inspect}"
153
+ )
146
154
  end
147
155
 
148
156
  parser.render(header.columns, header)
@@ -159,9 +167,9 @@ module IOStreams
159
167
  # Example:
160
168
  # register_format(:csv, IOStreams::Tabular::Parser::Csv)
161
169
  def self.register_format(format, parser)
162
- raise(ArgumentError, "Invalid format #{format.inspect}") unless format.nil? || format.to_s =~ /\A\w+\Z/
170
+ raise(ArgumentError, "Invalid format #{format.inspect}") unless format.to_s =~ /\A\w+\Z/
163
171
 
164
- @formats[format.nil? ? nil : format.to_sym] = parser
172
+ @formats[format.to_sym] = parser
165
173
  end
166
174
 
167
175
  # De-Register a file format
@@ -181,28 +189,21 @@ module IOStreams
181
189
  @formats.keys
182
190
  end
183
191
 
184
- private
185
-
186
192
  # A registry to hold formats for processing files during upload or download
187
193
  @formats = {}
188
194
 
189
- def self.parser_class(format)
190
- @formats[format.nil? ? nil : format.to_sym] || raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
195
+ # Returns the registered format that will be used for the supplied file name.
196
+ def self.format_from_file_name(file_name)
197
+ file_name.to_s.split(".").reverse_each { |ext| return ext.to_sym if @formats.include?(ext.to_sym) }
198
+ nil
191
199
  end
192
200
 
193
- # Returns the parser to use with tabular for the supplied file_name
194
- def self.parser_class_for_file_name(file_name)
195
- format = nil
196
- file_name.to_s.split(".").reverse_each do |ext|
197
- if @formats.include?(ext.to_sym)
198
- format = ext.to_sym
199
- break
200
- end
201
- end
202
- parser_class(format)
201
+ # Returns the parser class for the registered format.
202
+ def self.parser_class(format)
203
+ @formats[format.nil? ? nil : format.to_sym] ||
204
+ raise(ArgumentError, "Unknown Tabular Format: #{format.inspect}")
203
205
  end
204
206
 
205
- register_format(nil, IOStreams::Tabular::Parser::Csv)
206
207
  register_format(:array, IOStreams::Tabular::Parser::Array)
207
208
  register_format(:csv, IOStreams::Tabular::Parser::Csv)
208
209
  register_format(:fixed, IOStreams::Tabular::Parser::Fixed)
@@ -109,7 +109,10 @@ module IOStreams
109
109
  end
110
110
 
111
111
  unless row.is_a?(Array)
112
- raise(IOStreams::Errors::TypeMismatch, "Don't know how to convert #{row.class.name} to an Array without the header columns being set.")
112
+ raise(
113
+ IOStreams::Errors::TypeMismatch,
114
+ "Don't know how to convert #{row.class.name} to an Array without the header columns being set."
115
+ )
113
116
  end
114
117
 
115
118
  row
@@ -126,18 +129,17 @@ module IOStreams
126
129
  # Perform cleansing on returned Hash keys during the narrowing process.
127
130
  # For example, avoids issues with case etc.
128
131
  def cleanse_hash(hash)
129
- h = {}
130
- hash.each_pair do |key, value|
131
- cleansed_key =
132
- if columns.include?(key)
133
- key
134
- else
135
- key = cleanse_column(key)
136
- key if columns.include?(key)
137
- end
138
- h[cleansed_key] = value if cleansed_key
132
+ unmatched = columns - hash.keys
133
+ unless unmatched.empty?
134
+ hash = hash.dup
135
+ unmatched.each { |name| hash[cleanse_column(name)] = hash.delete(name) }
136
+ end
137
+ # Hash#slice as of Ruby 2.5
138
+ if hash.respond_to?(:slice)
139
+ hash.slice(*columns)
140
+ else
141
+ columns.each_with_object({}) { |column, new_hash| new_hash[column] = hash[column] }
139
142
  end
140
- h
141
143
  end
142
144
 
143
145
  def cleanse_column(name)
@@ -5,8 +5,10 @@ module IOStreams
5
5
  class Csv < Base
6
6
  attr_reader :csv_parser
7
7
 
8
- def initialize
9
- @csv_parser = Utility::CSVRow.new unless RUBY_VERSION.to_f >= 2.6
8
+ unless RUBY_VERSION.to_f >= 2.6
9
+ def initialize
10
+ @csv_parser = Utility::CSVRow.new
11
+ end
10
12
  end
11
13
 
12
14
  # Returns [Array<String>] the header row.
@@ -3,31 +3,77 @@ module IOStreams
3
3
  module Parser
4
4
  # Parsing and rendering fixed length data
5
5
  class Fixed < Base
6
- attr_reader :fixed_layout
6
+ attr_reader :layout, :truncate
7
7
 
8
8
  # Returns [IOStreams::Tabular::Parser]
9
9
  #
10
10
  # Parameters:
11
11
  # layout: [Array<Hash>]
12
12
  # [
13
- # {key: 'name', size: 23 },
14
- # {key: 'address', size: 40 },
15
- # {key: 'zip', size: 5 }
13
+ # {size: 23, key: "name"},
14
+ # {size: 40, key: "address"},
15
+ # {size: 2},
16
+ # {size: 5, key: "zip"},
17
+ # {size: 8, key: "age", type: :integer},
18
+ # {size: 10, key: "weight", type: :float, decimals: 2}
16
19
  # ]
17
- def initialize(layout:)
18
- @fixed_layout = parse_layout(layout)
20
+ #
21
+ # Notes:
22
+ # * Leave out the name of the key to ignore that column during parsing,
23
+ # and to space fill when rendering. For example as a filler.
24
+ #
25
+ # Types:
26
+ # :string
27
+ # This is the default type.
28
+ # Applies space padding and the value is left justified.
29
+ # Returns value as a String
30
+ # :integer
31
+ # Applies zero padding to the left.
32
+ # Returns value as an Integer
33
+ # Raises Errors::ValueTooLong when the supplied value cannot be rendered in `size` characters.
34
+ # :float
35
+ # Applies zero padding to the left.
36
+ # Returns value as a float.
37
+ # The :size is the total size of this field including the `.` and the decimals.
38
+ # Number of :decimals
39
+ # Raises Errors::ValueTooLong when the supplied value cannot be rendered in `size` characters.
40
+ #
41
+ # In some circumstances the length of the last column is variable.
42
+ # layout: [Array<Hash>]
43
+ # [
44
+ # {size: 23, key: "name"},
45
+ # {size: :remainder, key: "rest"}
46
+ # ]
47
+ # By setting a size of `:remainder` it will take the rest of the line as the value for that column.
48
+ #
49
+ # A size of `:remainder` and no `:key` will discard the remainder of the line without validating the length.
50
+ # layout: [Array<Hash>]
51
+ # [
52
+ # {size: 23, key: "name"},
53
+ # {size: :remainder}
54
+ # ]
55
+ #
56
+ def initialize(layout:, truncate: true)
57
+ @layout = Layout.new(layout)
58
+ @truncate = truncate
59
+ end
60
+
61
+ # The required line length for every fixed length line
62
+ def line_length
63
+ layout.length
19
64
  end
20
65
 
21
66
  # Returns [String] fixed layout values extracted from the supplied hash.
22
- # String will be encoded to `encoding`
67
+ #
68
+ # Notes:
69
+ # * A nil value is considered an empty string
70
+ # * When a supplied value exceeds the column size it is truncated.
23
71
  def render(row, header)
24
72
  hash = header.to_hash(row)
25
73
 
26
74
  result = ""
27
- fixed_layout.each do |map|
28
- # A nil value is considered an empty string
29
- value = hash[map.key].to_s
30
- result << format("%-#{map.size}.#{map.size}s", value)
75
+ layout.columns.each do |column|
76
+ result << column.render(hash[column.key], truncate)
31
77
  end
32
78
  result
33
79
  end
@@ -36,32 +82,126 @@ module IOStreams
36
82
  # String will be encoded to `encoding`
37
83
  def parse(line)
38
84
  unless line.is_a?(String)
39
- raise(IOStreams::Errors::TypeMismatch, "Format is :fixed. Invalid parse input: #{line.class.name}")
85
+ raise(Errors::TypeMismatch, "Line must be a String when format is :fixed. Actual: #{line.class.name}")
86
+ end
87
+
88
+ if layout.length.positive? && (line.length != layout.length)
89
+ raise(Errors::InvalidLineLength, "Expected line length: #{layout.length}, actual line length: #{line.length}")
40
90
  end
41
91
 
42
92
  hash = {}
43
93
  index = 0
44
- fixed_layout.each do |map|
45
- value = line[index..(index + map.size - 1)]
46
- index += map.size
47
- hash[map.key] = value.to_s.strip
94
+ layout.columns.each do |column|
95
+ if column.size == -1
96
+ hash[column.key] = column.parse(line[index..-1]) if column.key
97
+ break
98
+ end
99
+
100
+ # Ignore "columns" that have no keys. E.g. Fillers
101
+ hash[column.key] = column.parse(line[index, column.size]) if column.key
102
+ index += column.size
48
103
  end
49
104
  hash
50
105
  end
51
106
 
52
- private
107
+ # The header is required as an argument and cannot be supplied in the file itself.
108
+ def requires_header?
109
+ false
110
+ end
111
+
112
+ class Layout
113
+ attr_reader :columns, :length
114
+
115
+ # Returns [Array<FixedLayout>] the layout for this fixed width file.
116
+ # Also validates values
117
+ def initialize(layout)
118
+ @length = 0
119
+ @columns = parse_layout(layout)
120
+ end
121
+
122
+ private
123
+
124
+ def parse_layout(layout)
125
+ @length = 0
126
+ layout.collect do |hash|
127
+ raise(Errors::InvalidLayout, "Missing required :size in: #{hash.inspect}") unless hash.key?(:size)
128
+
129
+ column = Column.new(**hash)
130
+ if column.size == -1
131
+ if @length == -1
132
+ raise(Errors::InvalidLayout, "Only the last :size can be '-1' or :remainder in: #{hash.inspect}")
133
+ end
134
+
135
+ @length = -1
136
+ else
137
+ @length += column.size
138
+ end
139
+ column
140
+ end
141
+ end
142
+ end
143
+
144
+ class Column
145
+ TYPES = %i[string integer float].freeze
146
+
147
+ attr_reader :key, :size, :type, :decimals
148
+
149
+ def initialize(key: nil, size:, type: :string, decimals: 2)
150
+ @key = key
151
+ @size = size == :remainder ? -1 : size.to_i
152
+ @type = type.to_sym
153
+ @decimals = decimals
154
+
155
+ unless @size.positive? || (@size == -1)
156
+ raise(Errors::InvalidLayout, "Size #{size.inspect} must be positive or :remainder")
157
+ end
158
+ raise(Errors::InvalidLayout, "Unknown type: #{type.inspect}") unless TYPES.include?(type)
159
+ end
160
+
161
+ def parse(value)
162
+ return if value.nil?
163
+
164
+ stripped_value = value.to_s.strip
165
+
166
+ case type
167
+ when :string
168
+ stripped_value
169
+ when :integer
170
+ stripped_value.length.zero? ? nil : value.to_i
171
+ when :float
172
+ stripped_value.length.zero? ? nil : value.to_f
173
+ else
174
+ raise(Errors::InvalidLayout, "Unsupported type: #{type.inspect}")
175
+ end
176
+ end
177
+
178
+ def render(value, truncate)
179
+ formatted =
180
+ case type
181
+ when :string
182
+ value = value.to_s
183
+ return value if size == -1
184
+
185
+ format(truncate ? "%-#{size}.#{size}s" : "%-#{size}s", value)
186
+ when :integer
187
+ return value.to_i.to_s if size == -1
188
+
189
+ truncate = false
190
+ format("%0#{size}d", value.to_i)
191
+ when :float
192
+ return value.to_f.to_s if size == -1
53
193
 
54
- FixedLayout = Struct.new(:key, :size)
194
+ truncate = false
195
+ format("%0#{size}.#{decimals}f", value.to_f)
196
+ else
197
+ raise(Errors::InvalidLayout, "Unsupported type: #{type.inspect}")
198
+ end
55
199
 
56
- # Returns [Array<FixedLayout>] the layout for this fixed width file.
57
- # Also validates values
58
- def parse_layout(layout)
59
- layout.collect do |map|
60
- size = map[:size]
61
- key = map[:key]
62
- raise(ArgumentError, "Missing required :key and :size in: #{map.inspect}") unless size && key
200
+ if !truncate && formatted.length > size
201
+ raise(Errors::ValueTooLong, "Value: #{value} is too large to fit into column:#{key} of size:#{size}")
202
+ end
63
203
 
64
- FixedLayout.new(key, size)
204
+ formatted
65
205
  end
66
206
  end
67
207
  end
@@ -6,10 +6,7 @@ module IOStreams
6
6
  # 2 to 3 times better performance than CSV.parse_line and considerably less
7
7
  # garbage collection required.
8
8
  #
9
- # Note:
10
- # This parser does not support line feeds embedded in quoted fields since
11
- # the file is broken apart based on line feeds during the upload process and
12
- # is then processed by each worker on a line by line basis.
9
+ # Note: Only used prior to Ruby 2.6
13
10
  class CSVRow < ::CSV
14
11
  UTF8_ENCODING = Encoding.find("UTF-8").freeze
15
12
 
@@ -49,10 +49,10 @@ module IOStreams
49
49
  @user = uri.user
50
50
  @password = uri.password
51
51
  @port = uri.port
52
- if uri.query
53
- @query = {}
54
- ::URI.decode_www_form(uri.query).each { |key, value| @query[key] = value }
55
- end
52
+ return unless uri.query
53
+
54
+ @query = {}
55
+ ::URI.decode_www_form(uri.query).each { |key, value| @query[key] = value }
56
56
  end
57
57
  end
58
58
  end
@@ -1,3 +1,3 @@
1
1
  module IOStreams
2
- VERSION = "1.2.1".freeze
2
+ VERSION = "1.6.2".freeze
3
3
  end
@@ -38,7 +38,7 @@ module IOStreams
38
38
  return true
39
39
  end
40
40
 
41
- while entry = zin.get_next_entry
41
+ while (entry = zin.get_next_entry)
42
42
  return true if entry.name == entry_file_name
43
43
  end
44
44
  false
data/test/builder_test.rb CHANGED
@@ -41,6 +41,35 @@ class BuilderTest < Minitest::Test
41
41
  end
42
42
  end
43
43
 
44
+ describe "#format" do
45
+ it "detects the format from the file name" do
46
+ streams = IOStreams::Builder.new("abc.json")
47
+ assert_equal :json, streams.format
48
+ end
49
+
50
+ it "is nil if the file name has no meaningful format" do
51
+ assert_nil streams.format
52
+ end
53
+
54
+ it "returns set format with no file_name" do
55
+ streams = IOStreams::Builder.new
56
+ streams.format = :csv
57
+ assert_equal :csv, streams.format
58
+ end
59
+
60
+ it "returns set format with file_name" do
61
+ streams = IOStreams::Builder.new("abc.json")
62
+ streams.format = :csv
63
+ assert_equal :csv, streams.format
64
+ end
65
+
66
+ it "validates bad format" do
67
+ assert_raises ArgumentError do
68
+ streams.format = :blah
69
+ end
70
+ end
71
+ end
72
+
44
73
  describe "#stream" do
45
74
  it "adds one stream" do
46
75
  streams.stream(:pgp, passphrase: "unlock-me")