iostreams 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/lib/io_streams/bzip2/reader.rb +1 -1
  3. data/lib/io_streams/bzip2/writer.rb +1 -1
  4. data/lib/io_streams/encode/reader.rb +102 -0
  5. data/lib/io_streams/encode/writer.rb +78 -0
  6. data/lib/io_streams/errors.rb +19 -0
  7. data/lib/io_streams/file/reader.rb +1 -1
  8. data/lib/io_streams/file/writer.rb +1 -3
  9. data/lib/io_streams/gzip/reader.rb +1 -1
  10. data/lib/io_streams/gzip/writer.rb +1 -1
  11. data/lib/io_streams/io_streams.rb +57 -38
  12. data/lib/io_streams/line/reader.rb +125 -69
  13. data/lib/io_streams/line/writer.rb +11 -35
  14. data/lib/io_streams/pgp.rb +1 -1
  15. data/lib/io_streams/record/reader.rb +12 -14
  16. data/lib/io_streams/record/writer.rb +12 -14
  17. data/lib/io_streams/row/reader.rb +15 -16
  18. data/lib/io_streams/row/writer.rb +14 -12
  19. data/lib/io_streams/tabular.rb +50 -30
  20. data/lib/io_streams/tabular/header.rb +6 -6
  21. data/lib/io_streams/tabular/parser/array.rb +2 -2
  22. data/lib/io_streams/tabular/parser/csv.rb +6 -2
  23. data/lib/io_streams/tabular/parser/fixed.rb +18 -37
  24. data/lib/io_streams/tabular/parser/hash.rb +1 -1
  25. data/lib/io_streams/tabular/parser/json.rb +3 -1
  26. data/lib/io_streams/tabular/parser/psv.rb +6 -2
  27. data/lib/io_streams/version.rb +1 -1
  28. data/lib/io_streams/xlsx/reader.rb +22 -32
  29. data/lib/iostreams.rb +6 -0
  30. data/test/encode_reader_test.rb +54 -0
  31. data/test/encode_writer_test.rb +82 -0
  32. data/test/io_streams_test.rb +0 -65
  33. data/test/line_reader_test.rb +180 -37
  34. data/test/tabular_test.rb +79 -3
  35. data/test/test_helper.rb +1 -1
  36. data/test/xlsx_reader_test.rb +7 -10
  37. metadata +10 -4
  38. data/lib/io_streams/tabular/errors.rb +0 -14
@@ -1,7 +1,12 @@
1
1
  module IOStreams
2
2
  module Line
3
3
  class Reader
4
- attr_reader :delimiter, :buffer_size, :encoding, :strip_non_printable
4
+ attr_reader :delimiter, :buffer_size, :line_count
5
+
6
+ # Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read.
7
+ MAX_BLOCKS_MULTIPLIER = 100
8
+
9
+ LINEFEED_REGEXP = Regexp.compile(/\r\n|\n|\r/).freeze
5
10
 
6
11
  # Read a line at a time from a file or stream
7
12
  def self.open(file_name_or_io, **args)
@@ -12,11 +17,10 @@ module IOStreams
12
17
  end
13
18
  end
14
19
 
15
- # Create a delimited UTF8 stream reader from the supplied input streams
20
+ # Create a delimited stream reader from the supplied input stream.
16
21
  #
17
- # The input stream should be binary with no text conversions performed
18
- # since `strip_non_printable` will be applied to the binary stream before
19
- # converting to UTF-8
22
+ # Lines returned will be in the encoding of the input stream.
23
+ # To change the encoding of returned lines, use IOStreams::Encode::Reader.
20
24
  #
21
25
  # Parameters
22
26
  # input_stream
@@ -24,102 +28,154 @@ module IOStreams
24
28
  #
25
29
  # delimiter: [String]
26
30
  # Line / Record delimiter to use to break the stream up into records
27
- # Any string to break the stream up by
28
- # The records when saved will not include this delimiter
31
+ # Any string to break the stream up by.
32
+ # This delimiter is removed from each line when `#each` or `#readline` is called.
29
33
  # Default: nil
30
34
  # Automatically detect line endings and break up by line
31
35
  # Searches for the first "\r\n", "\n" or "\r" and then uses that as the
32
- # delimiter for all subsequent records
36
+ # delimiter for all subsequent records.
33
37
  #
34
38
  # buffer_size: [Integer]
35
- # Maximum size of the buffer into which to read the stream into for
36
- # processing.
37
- # Must be large enough to hold the entire first line and its delimiter(s)
39
+ # Size of blocks to read from the input stream at a time.
38
40
  # Default: 65536 ( 64K )
39
41
  #
40
- # strip_non_printable: [true|false]
41
- # Strip all non-printable characters read from the file
42
- # Default: false
43
- #
44
- # encoding:
45
- # Force encoding to this encoding for all data being read
46
- # Default: UTF8_ENCODING
47
- # Set to nil to disable encoding
48
- #
49
42
  # TODO:
43
+ # - Handle embedded line feeds when reading csv files.
50
44
  # - Skip Comment lines. RegExp?
51
45
  # - Skip "empty" / "blank" lines. RegExp?
52
46
  # - Extract header line(s) / first non-comment, non-blank line
53
47
  # - Embedded newline support, RegExp? or Proc?
54
- def initialize(input_stream, delimiter: nil, buffer_size: 65536, encoding: UTF8_ENCODING, strip_non_printable: false)
55
- @input_stream = input_stream
56
- @delimiter = delimiter
57
- @buffer_size = buffer_size
58
- @encoding = encoding
59
- @strip_non_printable = strip_non_printable
60
-
61
- @delimiter.encode(UTF8_ENCODING) if @delimiter && @encoding
62
- @buffer = ''
48
+ def initialize(input_stream, delimiter: nil, buffer_size: 65_536)
49
+ @input_stream = input_stream
50
+ @buffer_size = buffer_size
51
+
52
+ # More efficient read buffering only supported when the input stream `#read` method supports it.
53
+ @use_read_cache_buffer = !@input_stream.method(:read).arity.between?(0, 1)
54
+
55
+ @line_count = 0
56
+ @eof = false
57
+ @read_cache_buffer = nil
58
+ @buffer = nil
59
+
60
+ read_block
61
+ # Auto-detect windows/linux line endings if not supplied. \n or \r\n
62
+ @delimiter = delimiter || auto_detect_line_endings
63
+
64
+ unless eof?
65
+ # Change the delimiters encoding to match that of the input stream
66
+ @delimiter = @delimiter.encode(@buffer.encoding)
67
+ @delimiter_size = @delimiter.size
68
+ end
63
69
  end
64
70
 
65
71
  # Iterate over every line in the file/stream passing each line to supplied block in turn.
66
72
  # Returns [Integer] the number of lines read from the file/stream.
67
- def each(&block)
68
- partial = nil
69
- loop do
70
- if read_chunk == 0
71
- block.call(partial) if partial
72
- return
73
- end
73
+ # Note:
74
+ # * The line delimiter is _not_ returned.
75
+ def each
76
+ yield(readline) until eof?
77
+ line_count
78
+ end
74
79
 
75
- self.delimiter ||= detect_delimiter
76
- end_index ||= (delimiter.size + 1) * -1
80
+ def readline
81
+ return if eof?
77
82
 
78
- @buffer.each_line(delimiter) do |line|
79
- if line.end_with?(delimiter)
80
- # Strip off delimiter
81
- block.call(line[0..end_index])
82
- partial = nil
83
- else
84
- partial = line
85
- end
86
- end
87
- @buffer = partial.nil? ? '' : partial
83
+ # Keep reading until it finds the delimiter
84
+ while (index = @buffer.index(@delimiter)).nil? && read_block
88
85
  end
86
+
87
+ # Delimiter found?
88
+ if index
89
+ data = @buffer.slice(0, index)
90
+ @buffer = @buffer.slice(index + @delimiter_size, @buffer.size)
91
+ elsif @eof && @buffer.empty?
92
+ data = nil
93
+ @buffer = nil
94
+ else
95
+ # Last line without delimiter
96
+ data = @buffer
97
+ @buffer = nil
98
+ end
99
+
100
+ @line_count += 1
101
+ data
102
+ end
103
+
104
+ # Returns whether the end of file has been reached for this stream
105
+ def eof?
106
+ @eof && (@buffer.nil? || @buffer.empty?)
89
107
  end
90
108
 
91
109
  private
92
110
 
93
- attr_reader :buffer
94
- attr_writer :delimiter
111
+ # Reads the next block from the input stream and appends it to the internal buffer.
112
+ # Returns [true|false] false when already at EOF or no more data could be read, otherwise true.
113
+ def read_block
114
+ return false if @eof
95
115
 
96
- NOT_PRINTABLE = Regexp.compile(/[^[:print:]|\r|\n]/)
116
+ block =
117
+ if @read_cache_buffer
118
+ begin
119
+ @input_stream.read(@buffer_size, @read_cache_buffer)
120
+ rescue ArgumentError
121
+ # Handle arity of -1 when just 0..1
122
+ @read_cache_buffer = nil
123
+ @input_stream.read(@buffer_size)
124
+ end
125
+ else
126
+ @input_stream.read(@buffer_size)
127
+ end
97
128
 
98
- # Returns [Integer] the number of bytes read into the internal buffer
99
- # Returns 0 on EOF
100
- def read_chunk
101
- # TODO: read into existing buffer
102
- chunk = @input_stream.read(@buffer_size)
103
129
  # EOF reached?
104
- return 0 unless chunk
130
+ if block.nil?
131
+ @eof = true
132
+ return false
133
+ elsif block.size < @buffer_size
134
+ @eof = true
135
+ end
105
136
 
106
- # Strip out non-printable characters before converting to UTF-8
107
- chunk.gsub!(NOT_PRINTABLE, '') if @strip_non_printable
137
+ if @buffer
138
+ @buffer << block
139
+ else
140
+ # Take on the encoding from the input stream
141
+ @buffer = block.dup
142
+ # Take on the encoding from the first block that was read.
143
+ @read_cache_buffer = ''.encode(block.encoding) if @use_read_cache_buffer
144
+ end
108
145
 
109
- @buffer << (@encoding ? chunk.force_encoding(@encoding) : chunk)
110
- chunk.size
146
+ if @buffer.size > MAX_BLOCKS_MULTIPLIER * @buffer_size
147
+ raise(
148
+ Errors::DelimiterNotFound,
149
+ "Delimiter: #{@delimiter.inspect} not found after reading #{@buffer.size} bytes."
150
+ )
151
+ end
152
+
153
+ true
111
154
  end
112
155
 
113
- # Auto detect text line delimiter
114
- def detect_delimiter
115
- if @buffer =~ /\r\n|\n\r|\n|\r/
116
- $&
117
- elsif @buffer.size <= @buffer_size
118
- # Handle one line files that are smaller than the buffer size
119
- "\n"
156
+ # Auto-detect windows/linux line endings: \n, \r or \r\n
157
+ def auto_detect_line_endings
158
+ return "\n" if @buffer.nil? && !read_block
159
+
160
+ # Could be "\r\n" broken in half by the block size
161
+ read_block if @buffer[-1] == "\r"
162
+
163
+ # Delimiter takes on the encoding from @buffer
164
+ delimiter = @buffer.slice(LINEFEED_REGEXP)
165
+ return delimiter if delimiter
166
+
167
+ while read_block
168
+ # Could be "\r\n" broken in half by the block size
169
+ read_block if @buffer[-1] == "\r"
170
+
171
+ # Delimiter takes on the encoding from @buffer
172
+ delimiter = @buffer.slice(LINEFEED_REGEXP)
173
+ return delimiter if delimiter
120
174
  end
121
- end
122
175
 
176
+ # One line files with no delimiter
177
+ "\n"
178
+ end
123
179
  end
124
180
  end
125
181
  end
@@ -1,7 +1,7 @@
1
1
  module IOStreams
2
2
  module Line
3
3
  class Writer
4
- attr_reader :delimiter, :encoding, :strip_non_printable
4
+ attr_reader :delimiter
5
5
 
6
6
  # Write a line at a time to a file or stream
7
7
  def self.open(file_name_or_io, **args)
@@ -12,13 +12,10 @@ module IOStreams
12
12
  end
13
13
  end
14
14
 
15
- NOT_PRINTABLE = Regexp.compile(/[^[:print:]]/)
16
-
17
- # A delimited stream writer that will write to the supplied output stream
15
+ # A delimited stream writer that will write to the supplied output stream.
18
16
  #
19
- # The output stream should be binary with no text conversions performed
20
- # since `strip_non_printable` will be applied to the binary stream before
21
- # converting to UTF-8
17
+ # The output stream will have the encoding of data written to it.
18
+ # To change the output encoding, use IOStreams::Encode::Writer.
22
19
  #
23
20
  # Parameters
24
21
  # output_stream
@@ -28,22 +25,9 @@ module IOStreams
28
25
  # Add the specified delimiter after every record when writing it
29
26
  # to the output stream
30
27
  # Default: OS Specific. Linux: "\n"
31
- #
32
- # encoding:
33
- # Encode data before writing to the output stream.
34
- # Default: UTF8_ENCODING
35
- # Set to nil to disable encoding
36
- #
37
- # strip_non_printable: [true|false]
38
- # Strip all non-printable characters before writing to the file / stream.
39
- # Default: false
40
- #
41
- # TODO: Support replacement character for invalid characters
42
- def initialize(output_stream, delimiter: $/, encoding: UTF8_ENCODING, strip_non_printable: false)
43
- @output_stream = output_stream
44
- @delimiter = delimiter.encode(encoding) if delimiter && encoding
45
- @encoding = encoding
46
- @strip_non_printable = strip_non_printable
28
+ def initialize(output_stream, delimiter: $/)
29
+ @output_stream = output_stream
30
+ @delimiter = delimiter
47
31
  end
48
32
 
49
33
  # Write a line to the output stream
@@ -52,8 +36,8 @@ module IOStreams
52
36
  # IOStreams.line_writer('a.txt') do |stream|
53
37
  # stream << 'first line' << 'second line'
54
38
  # end
55
- def <<(record)
56
- write(record)
39
+ def <<(data)
40
+ write(data)
57
41
  self
58
42
  end
59
43
 
@@ -65,17 +49,9 @@ module IOStreams
65
49
  # count = stream.write('first line')
66
50
  # puts "Wrote #{count} bytes to the output file, including the delimiter"
67
51
  # end
68
- def write(record)
69
- chunk = record.to_s
70
- chunk.gsub!(NOT_PRINTABLE, '') if strip_non_printable
71
- count = output_stream.write((encoding ? chunk.encode(encoding) : chunk))
72
- count += output_stream.write(delimiter) if delimiter
73
- count
52
+ def write(data)
53
+ @output_stream.write(data.to_s + delimiter)
74
54
  end
75
-
76
- private
77
-
78
- attr_reader :output_stream
79
55
  end
80
56
  end
81
57
  end
@@ -44,7 +44,7 @@ module IOStreams
44
44
  #
45
45
  # # Generate encrypted file for a specific recipient and sign it with senders credentials
46
46
  # data = %w(this is some data that should be encrypted using pgp)
47
- # IOStreams.writer('secure.gpg', pgp: {recipient: 'receiver@example.org'}) do |output|
47
+ # IOStreams.writer('secure.gpg', streams: {pgp: {recipient: 'receiver@example.org'}}) do |output|
48
48
  # data.each { |word| output.puts(word) }
49
49
  # end
50
50
  #
@@ -5,13 +5,15 @@ module IOStreams
5
5
  include Enumerable
6
6
 
7
7
  # Read a record as a Hash at a time from a file or stream.
8
- def self.open(file_name_or_io, delimiter: nil, buffer_size: 65536, encoding: UTF8_ENCODING, strip_non_printable: false, **args)
8
+ def self.open(file_name_or_io, delimiter: nil, buffer_size: 65536, encoding: nil, encode_cleaner: nil, encode_replace: nil, **args)
9
9
  if file_name_or_io.is_a?(String)
10
10
  IOStreams.line_reader(file_name_or_io,
11
- delimiter: delimiter,
12
- buffer_size: buffer_size,
13
- encoding: encoding,
14
- strip_non_printable: strip_non_printable) do |io|
11
+ delimiter: delimiter,
12
+ buffer_size: buffer_size,
13
+ encoding: encoding,
14
+ encode_cleaner: encode_cleaner,
15
+ encode_replace: encode_replace
16
+ ) do |io|
15
17
  yield new(io, file_name: file_name_or_io, **args)
16
18
  end
17
19
  else
@@ -37,19 +39,15 @@ module IOStreams
37
39
  end
38
40
 
39
41
  def each
40
- delimited.each do |line|
41
- if tabular.requires_header?
42
- tabular.parse_header(line)
43
- tabular.cleanse_header! if cleanse_header
42
+ @delimited.each do |line|
43
+ if @tabular.header?
44
+ @tabular.parse_header(line)
45
+ @tabular.cleanse_header! if @cleanse_header
44
46
  else
45
- yield tabular.record_parse(line)
47
+ yield @tabular.record_parse(line)
46
48
  end
47
49
  end
48
50
  end
49
-
50
- private
51
-
52
- attr_reader :tabular, :delimited, :cleanse_header
53
51
  end
54
52
  end
55
53
  end
@@ -11,12 +11,14 @@ module IOStreams
11
11
  #
12
12
  class Writer
13
13
  # Write a record as a Hash at a time to a file or stream.
14
- def self.open(file_name_or_io, delimiter: $/, encoding: UTF8_ENCODING, strip_non_printable: false, **args)
14
+ def self.open(file_name_or_io, delimiter: $/, encoding: nil, encode_cleaner: nil, encode_replace: nil, **args)
15
15
  if file_name_or_io.is_a?(String)
16
16
  IOStreams.line_writer(file_name_or_io,
17
- delimiter: delimiter,
18
- encoding: encoding,
19
- strip_non_printable: strip_non_printable) do |io|
17
+ delimiter: delimiter,
18
+ encoding: encoding,
19
+ encode_cleaner: encode_cleaner,
20
+ encode_replace: encode_replace
21
+ ) do |io|
20
22
  yield new(io, file_name: file_name_or_io, **args)
21
23
  end
22
24
  else
@@ -42,22 +44,18 @@ module IOStreams
42
44
  @delimited = delimited
43
45
 
44
46
  # Render header line when `columns` is supplied.
45
- delimited << @tabular.render(columns) if columns && @tabular.requires_header?
47
+ @delimited << @tabular.render_header if columns && @tabular.requires_header?
46
48
  end
47
49
 
48
50
  def <<(hash)
49
51
  raise(ArgumentError, 'Must supply a Hash') unless hash.is_a?(Hash)
50
- if tabular.requires_header?
51
- columns = hash.keys
52
- tabular.header.columns = columns
53
- delimited << tabular.render(columns)
52
+ if @tabular.header?
53
+ # Extract header from the keys from the first row when not supplied above.
54
+ @tabular.header.columns = hash.keys
55
+ @delimited << @tabular.render_header
54
56
  end
55
- delimited << tabular.render(hash)
57
+ @delimited << @tabular.render(hash)
56
58
  end
57
-
58
- private
59
-
60
- attr_reader :tabular, :delimited, :cleanse_header
61
59
  end
62
60
  end
63
61
  end
@@ -6,17 +6,20 @@ module IOStreams
6
6
  def self.open(file_name_or_io,
7
7
  delimiter: nil,
8
8
  buffer_size: 65_536,
9
- encoding: UTF8_ENCODING,
10
- strip_non_printable: false,
11
9
  file_name: nil,
10
+ encoding: nil,
11
+ encode_cleaner: nil,
12
+ encode_replace: nil,
12
13
  **args)
13
14
  if file_name_or_io.is_a?(String)
14
15
  IOStreams.line_reader(file_name_or_io,
15
- delimiter: delimiter,
16
- buffer_size: buffer_size,
17
- encoding: encoding,
18
- file_name: file_name,
19
- strip_non_printable: strip_non_printable) do |io|
16
+ delimiter: delimiter,
17
+ buffer_size: buffer_size,
18
+ file_name: file_name,
19
+ encoding: encoding,
20
+ encode_cleaner: encode_cleaner,
21
+ encode_replace: encode_replace
22
+ ) do |io|
20
23
  yield new(io, file_name: file_name, **args)
21
24
  end
22
25
  else
@@ -41,20 +44,16 @@ module IOStreams
41
44
  end
42
45
 
43
46
  def each
44
- delimited.each do |line|
45
- if tabular.requires_header?
46
- columns = tabular.parse_header(line)
47
- tabular.cleanse_header! if cleanse_header
47
+ @delimited.each do |line|
48
+ if @tabular.header?
49
+ columns = @tabular.parse_header(line)
50
+ @tabular.cleanse_header! if @cleanse_header
48
51
  yield columns
49
52
  else
50
- yield tabular.row_parse(line)
53
+ yield @tabular.row_parse(line)
51
54
  end
52
55
  end
53
56
  end
54
-
55
- private
56
-
57
- attr_reader :tabular, :delimited, :cleanse_header
58
57
  end
59
58
  end
60
59
  end