iostreams 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/io_streams/bzip2/reader.rb +1 -1
- data/lib/io_streams/bzip2/writer.rb +1 -1
- data/lib/io_streams/encode/reader.rb +102 -0
- data/lib/io_streams/encode/writer.rb +78 -0
- data/lib/io_streams/errors.rb +19 -0
- data/lib/io_streams/file/reader.rb +1 -1
- data/lib/io_streams/file/writer.rb +1 -3
- data/lib/io_streams/gzip/reader.rb +1 -1
- data/lib/io_streams/gzip/writer.rb +1 -1
- data/lib/io_streams/io_streams.rb +57 -38
- data/lib/io_streams/line/reader.rb +125 -69
- data/lib/io_streams/line/writer.rb +11 -35
- data/lib/io_streams/pgp.rb +1 -1
- data/lib/io_streams/record/reader.rb +12 -14
- data/lib/io_streams/record/writer.rb +12 -14
- data/lib/io_streams/row/reader.rb +15 -16
- data/lib/io_streams/row/writer.rb +14 -12
- data/lib/io_streams/tabular.rb +50 -30
- data/lib/io_streams/tabular/header.rb +6 -6
- data/lib/io_streams/tabular/parser/array.rb +2 -2
- data/lib/io_streams/tabular/parser/csv.rb +6 -2
- data/lib/io_streams/tabular/parser/fixed.rb +18 -37
- data/lib/io_streams/tabular/parser/hash.rb +1 -1
- data/lib/io_streams/tabular/parser/json.rb +3 -1
- data/lib/io_streams/tabular/parser/psv.rb +6 -2
- data/lib/io_streams/version.rb +1 -1
- data/lib/io_streams/xlsx/reader.rb +22 -32
- data/lib/iostreams.rb +6 -0
- data/test/encode_reader_test.rb +54 -0
- data/test/encode_writer_test.rb +82 -0
- data/test/io_streams_test.rb +0 -65
- data/test/line_reader_test.rb +180 -37
- data/test/tabular_test.rb +79 -3
- data/test/test_helper.rb +1 -1
- data/test/xlsx_reader_test.rb +7 -10
- metadata +10 -4
- data/lib/io_streams/tabular/errors.rb +0 -14
@@ -1,7 +1,12 @@
|
|
1
1
|
module IOStreams
|
2
2
|
module Line
|
3
3
|
class Reader
|
4
|
-
attr_reader :delimiter, :buffer_size, :
|
4
|
+
attr_reader :delimiter, :buffer_size, :line_count
|
5
|
+
|
6
|
+
# Prevent denial of service when a delimiter is not found before this number * `buffer_size` characters are read.
|
7
|
+
MAX_BLOCKS_MULTIPLIER = 100
|
8
|
+
|
9
|
+
LINEFEED_REGEXP = Regexp.compile(/\r\n|\n|\r/).freeze
|
5
10
|
|
6
11
|
# Read a line at a time from a file or stream
|
7
12
|
def self.open(file_name_or_io, **args)
|
@@ -12,11 +17,10 @@ module IOStreams
|
|
12
17
|
end
|
13
18
|
end
|
14
19
|
|
15
|
-
# Create a delimited
|
20
|
+
# Create a delimited stream reader from the supplied input stream.
|
16
21
|
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
# converting to UTF-8
|
22
|
+
# Lines returned will be in the encoding of the input stream.
|
23
|
+
# To change the encoding of returned lines, use IOStreams::Encode::Reader.
|
20
24
|
#
|
21
25
|
# Parameters
|
22
26
|
# input_stream
|
@@ -24,102 +28,154 @@ module IOStreams
|
|
24
28
|
#
|
25
29
|
# delimiter: [String]
|
26
30
|
# Line / Record delimiter to use to break the stream up into records
|
27
|
-
# Any string to break the stream up by
|
28
|
-
#
|
31
|
+
# Any string to break the stream up by.
|
32
|
+
# This delimiter is removed from each line when `#each` or `#readline` is called.
|
29
33
|
# Default: nil
|
30
34
|
# Automatically detect line endings and break up by line
|
31
35
|
# Searches for the first "\r\n", "\n" or "\r" and then uses that as the
|
32
|
-
# delimiter for all subsequent records
|
36
|
+
# delimiter for all subsequent records.
|
33
37
|
#
|
34
38
|
# buffer_size: [Integer]
|
35
|
-
#
|
36
|
-
# processing.
|
37
|
-
# Must be large enough to hold the entire first line and its delimiter(s)
|
39
|
+
# Size of blocks to read from the input stream at a time.
|
38
40
|
# Default: 65536 ( 64K )
|
39
41
|
#
|
40
|
-
# strip_non_printable: [true|false]
|
41
|
-
# Strip all non-printable characters read from the file
|
42
|
-
# Default: false
|
43
|
-
#
|
44
|
-
# encoding:
|
45
|
-
# Force encoding to this encoding for all data being read
|
46
|
-
# Default: UTF8_ENCODING
|
47
|
-
# Set to nil to disable encoding
|
48
|
-
#
|
49
42
|
# TODO:
|
43
|
+
# - Handle embedded line feeds when reading csv files.
|
50
44
|
# - Skip Comment lines. RegExp?
|
51
45
|
# - Skip "empty" / "blank" lines. RegExp?
|
52
46
|
# - Extract header line(s) / first non-comment, non-blank line
|
53
47
|
# - Embedded newline support, RegExp? or Proc?
|
54
|
-
def initialize(input_stream, delimiter: nil, buffer_size:
|
55
|
-
@input_stream
|
56
|
-
@
|
57
|
-
|
58
|
-
|
59
|
-
@
|
60
|
-
|
61
|
-
@
|
62
|
-
@
|
48
|
+
def initialize(input_stream, delimiter: nil, buffer_size: 65_536)
|
49
|
+
@input_stream = input_stream
|
50
|
+
@buffer_size = buffer_size
|
51
|
+
|
52
|
+
# More efficient read buffering only supported when the input stream `#read` method supports it.
|
53
|
+
@use_read_cache_buffer = !@input_stream.method(:read).arity.between?(0, 1)
|
54
|
+
|
55
|
+
@line_count = 0
|
56
|
+
@eof = false
|
57
|
+
@read_cache_buffer = nil
|
58
|
+
@buffer = nil
|
59
|
+
|
60
|
+
read_block
|
61
|
+
# Auto-detect windows/linux line endings if not supplied. \n or \r\n
|
62
|
+
@delimiter = delimiter || auto_detect_line_endings
|
63
|
+
|
64
|
+
unless eof?
|
65
|
+
# Change the delimiters encoding to match that of the input stream
|
66
|
+
@delimiter = @delimiter.encode(@buffer.encoding)
|
67
|
+
@delimiter_size = @delimiter.size
|
68
|
+
end
|
63
69
|
end
|
64
70
|
|
65
71
|
# Iterate over every line in the file/stream passing each line to supplied block in turn.
|
66
72
|
# Returns [Integer] the number of lines read from the file/stream.
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
end
|
73
|
+
# Note:
|
74
|
+
# * The line delimiter is _not_ returned.
|
75
|
+
def each
|
76
|
+
yield(readline) until eof?
|
77
|
+
line_count
|
78
|
+
end
|
74
79
|
|
75
|
-
|
76
|
-
|
80
|
+
def readline
|
81
|
+
return if eof?
|
77
82
|
|
78
|
-
|
79
|
-
|
80
|
-
# Strip off delimiter
|
81
|
-
block.call(line[0..end_index])
|
82
|
-
partial = nil
|
83
|
-
else
|
84
|
-
partial = line
|
85
|
-
end
|
86
|
-
end
|
87
|
-
@buffer = partial.nil? ? '' : partial
|
83
|
+
# Keep reading until it finds the delimiter
|
84
|
+
while (index = @buffer.index(@delimiter)).nil? && read_block
|
88
85
|
end
|
86
|
+
|
87
|
+
# Delimiter found?
|
88
|
+
if index
|
89
|
+
data = @buffer.slice(0, index)
|
90
|
+
@buffer = @buffer.slice(index + @delimiter_size, @buffer.size)
|
91
|
+
elsif @eof && @buffer.empty?
|
92
|
+
data = nil
|
93
|
+
@buffer = nil
|
94
|
+
else
|
95
|
+
# Last line without delimiter
|
96
|
+
data = @buffer
|
97
|
+
@buffer = nil
|
98
|
+
end
|
99
|
+
|
100
|
+
@line_count += 1
|
101
|
+
data
|
102
|
+
end
|
103
|
+
|
104
|
+
# Returns whether the end of file has been reached for this stream
|
105
|
+
def eof?
|
106
|
+
@eof && (@buffer.nil? || @buffer.empty?)
|
89
107
|
end
|
90
108
|
|
91
109
|
private
|
92
110
|
|
93
|
-
|
94
|
-
|
111
|
+
# Returns [true|false] whether a block was read and appended to the internal buffer
|
112
|
+
# Returns false on EOF
|
113
|
+
def read_block
|
114
|
+
return false if @eof
|
95
115
|
|
96
|
-
|
116
|
+
block =
|
117
|
+
if @read_cache_buffer
|
118
|
+
begin
|
119
|
+
@input_stream.read(@buffer_size, @read_cache_buffer)
|
120
|
+
rescue ArgumentError
|
121
|
+
# Handle arity of -1 when just 0..1
|
122
|
+
@read_cache_buffer = nil
|
123
|
+
@input_stream.read(@buffer_size)
|
124
|
+
end
|
125
|
+
else
|
126
|
+
@input_stream.read(@buffer_size)
|
127
|
+
end
|
97
128
|
|
98
|
-
# Returns [Integer] the number of bytes read into the internal buffer
|
99
|
-
# Returns 0 on EOF
|
100
|
-
def read_chunk
|
101
|
-
# TODO: read into existing buffer
|
102
|
-
chunk = @input_stream.read(@buffer_size)
|
103
129
|
# EOF reached?
|
104
|
-
|
130
|
+
if block.nil?
|
131
|
+
@eof = true
|
132
|
+
return false
|
133
|
+
elsif block.size < @buffer_size
|
134
|
+
@eof = true
|
135
|
+
end
|
105
136
|
|
106
|
-
|
107
|
-
|
137
|
+
if @buffer
|
138
|
+
@buffer << block
|
139
|
+
else
|
140
|
+
# Take on the encoding from the input stream
|
141
|
+
@buffer = block.dup
|
142
|
+
# Take on the encoding from the first block that was read.
|
143
|
+
@read_cache_buffer = ''.encode(block.encoding) if @use_read_cache_buffer
|
144
|
+
end
|
108
145
|
|
109
|
-
@buffer
|
110
|
-
|
146
|
+
if @buffer.size > MAX_BLOCKS_MULTIPLIER * @buffer_size
|
147
|
+
raise(
|
148
|
+
Errors::DelimiterNotFound,
|
149
|
+
"Delimiter: #{@delimiter.inspect} not found after reading #{@buffer.size} bytes."
|
150
|
+
)
|
151
|
+
end
|
152
|
+
|
153
|
+
true
|
111
154
|
end
|
112
155
|
|
113
|
-
# Auto
|
114
|
-
def
|
115
|
-
if @buffer
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
156
|
+
# Auto-detect windows/linux line endings: \n, \r or \r\n
|
157
|
+
def auto_detect_line_endings
|
158
|
+
return "\n" if @buffer.nil? && !read_block
|
159
|
+
|
160
|
+
# Could be "\r\n" broken in half by the block size
|
161
|
+
read_block if @buffer[-1] == "\r"
|
162
|
+
|
163
|
+
# Delimiter takes on the encoding from @buffer
|
164
|
+
delimiter = @buffer.slice(LINEFEED_REGEXP)
|
165
|
+
return delimiter if delimiter
|
166
|
+
|
167
|
+
while read_block
|
168
|
+
# Could be "\r\n" broken in half by the block size
|
169
|
+
read_block if @buffer[-1] == "\r"
|
170
|
+
|
171
|
+
# Delimiter takes on the encoding from @buffer
|
172
|
+
delimiter = @buffer.slice(LINEFEED_REGEXP)
|
173
|
+
return delimiter if delimiter
|
120
174
|
end
|
121
|
-
end
|
122
175
|
|
176
|
+
# One line files with no delimiter
|
177
|
+
"\n"
|
178
|
+
end
|
123
179
|
end
|
124
180
|
end
|
125
181
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module IOStreams
|
2
2
|
module Line
|
3
3
|
class Writer
|
4
|
-
attr_reader :delimiter
|
4
|
+
attr_reader :delimiter
|
5
5
|
|
6
6
|
# Write a line at a time to a file or stream
|
7
7
|
def self.open(file_name_or_io, **args)
|
@@ -12,13 +12,10 @@ module IOStreams
|
|
12
12
|
end
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
# A delimited stream writer that will write to the supplied output stream
|
15
|
+
# A delimited stream writer that will write to the supplied output stream.
|
18
16
|
#
|
19
|
-
# The output stream
|
20
|
-
#
|
21
|
-
# converting to UTF-8
|
17
|
+
# The output stream will have the encoding of data written to it.
|
18
|
+
# To change the output encoding, use IOStreams::Encode::Writer.
|
22
19
|
#
|
23
20
|
# Parameters
|
24
21
|
# output_stream
|
@@ -28,22 +25,9 @@ module IOStreams
|
|
28
25
|
# Add the specified delimiter after every record when writing it
|
29
26
|
# to the output stream
|
30
27
|
# Default: OS Specific. Linux: "\n"
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
# Default: UTF8_ENCODING
|
35
|
-
# Set to nil to disable encoding
|
36
|
-
#
|
37
|
-
# strip_non_printable: [true|false]
|
38
|
-
# Strip all non-printable characters before writing to the file / stream.
|
39
|
-
# Default: false
|
40
|
-
#
|
41
|
-
# TODO: Support replacement character for invalid characters
|
42
|
-
def initialize(output_stream, delimiter: $/, encoding: UTF8_ENCODING, strip_non_printable: false)
|
43
|
-
@output_stream = output_stream
|
44
|
-
@delimiter = delimiter.encode(encoding) if delimiter && encoding
|
45
|
-
@encoding = encoding
|
46
|
-
@strip_non_printable = strip_non_printable
|
28
|
+
def initialize(output_stream, delimiter: $/)
|
29
|
+
@output_stream = output_stream
|
30
|
+
@delimiter = delimiter
|
47
31
|
end
|
48
32
|
|
49
33
|
# Write a line to the output stream
|
@@ -52,8 +36,8 @@ module IOStreams
|
|
52
36
|
# IOStreams.line_writer('a.txt') do |stream|
|
53
37
|
# stream << 'first line' << 'second line'
|
54
38
|
# end
|
55
|
-
def <<(
|
56
|
-
write(
|
39
|
+
def <<(data)
|
40
|
+
write(data)
|
57
41
|
self
|
58
42
|
end
|
59
43
|
|
@@ -65,17 +49,9 @@ module IOStreams
|
|
65
49
|
# count = stream.write('first line')
|
66
50
|
# puts "Wrote #{count} bytes to the output file, including the delimiter"
|
67
51
|
# end
|
68
|
-
def write(
|
69
|
-
|
70
|
-
chunk.gsub!(NOT_PRINTABLE, '') if strip_non_printable
|
71
|
-
count = output_stream.write((encoding ? chunk.encode(encoding) : chunk))
|
72
|
-
count += output_stream.write(delimiter) if delimiter
|
73
|
-
count
|
52
|
+
def write(data)
|
53
|
+
@output_stream.write(data.to_s + delimiter)
|
74
54
|
end
|
75
|
-
|
76
|
-
private
|
77
|
-
|
78
|
-
attr_reader :output_stream
|
79
55
|
end
|
80
56
|
end
|
81
57
|
end
|
data/lib/io_streams/pgp.rb
CHANGED
@@ -44,7 +44,7 @@ module IOStreams
|
|
44
44
|
#
|
45
45
|
# # Generate encrypted file for a specific recipient and sign it with senders credentials
|
46
46
|
# data = %w(this is some data that should be encrypted using pgp)
|
47
|
-
# IOStreams.writer('secure.gpg', pgp: {recipient: 'receiver@example.org'}) do |output|
|
47
|
+
# IOStreams.writer('secure.gpg', streams: {pgp: {recipient: 'receiver@example.org'}}) do |output|
|
48
48
|
# data.each { |word| output.puts(word) }
|
49
49
|
# end
|
50
50
|
#
|
@@ -5,13 +5,15 @@ module IOStreams
|
|
5
5
|
include Enumerable
|
6
6
|
|
7
7
|
# Read a record as a Hash at a time from a file or stream.
|
8
|
-
def self.open(file_name_or_io, delimiter: nil, buffer_size: 65536, encoding:
|
8
|
+
def self.open(file_name_or_io, delimiter: nil, buffer_size: 65536, encoding: nil, encode_cleaner: nil, encode_replace: nil, **args)
|
9
9
|
if file_name_or_io.is_a?(String)
|
10
10
|
IOStreams.line_reader(file_name_or_io,
|
11
|
-
delimiter:
|
12
|
-
buffer_size:
|
13
|
-
encoding:
|
14
|
-
|
11
|
+
delimiter: delimiter,
|
12
|
+
buffer_size: buffer_size,
|
13
|
+
encoding: encoding,
|
14
|
+
encode_cleaner: encode_cleaner,
|
15
|
+
encode_replace: encode_replace
|
16
|
+
) do |io|
|
15
17
|
yield new(io, file_name: file_name_or_io, **args)
|
16
18
|
end
|
17
19
|
else
|
@@ -37,19 +39,15 @@ module IOStreams
|
|
37
39
|
end
|
38
40
|
|
39
41
|
def each
|
40
|
-
delimited.each do |line|
|
41
|
-
if tabular.
|
42
|
-
tabular.parse_header(line)
|
43
|
-
tabular.cleanse_header! if cleanse_header
|
42
|
+
@delimited.each do |line|
|
43
|
+
if @tabular.header?
|
44
|
+
@tabular.parse_header(line)
|
45
|
+
@tabular.cleanse_header! if @cleanse_header
|
44
46
|
else
|
45
|
-
yield tabular.record_parse(line)
|
47
|
+
yield @tabular.record_parse(line)
|
46
48
|
end
|
47
49
|
end
|
48
50
|
end
|
49
|
-
|
50
|
-
private
|
51
|
-
|
52
|
-
attr_reader :tabular, :delimited, :cleanse_header
|
53
51
|
end
|
54
52
|
end
|
55
53
|
end
|
@@ -11,12 +11,14 @@ module IOStreams
|
|
11
11
|
#
|
12
12
|
class Writer
|
13
13
|
# Write a record as a Hash at a time to a file or stream.
|
14
|
-
def self.open(file_name_or_io, delimiter: $/, encoding:
|
14
|
+
def self.open(file_name_or_io, delimiter: $/, encoding: nil, encode_cleaner: nil, encode_replace: nil, **args)
|
15
15
|
if file_name_or_io.is_a?(String)
|
16
16
|
IOStreams.line_writer(file_name_or_io,
|
17
|
-
delimiter:
|
18
|
-
encoding:
|
19
|
-
|
17
|
+
delimiter: delimiter,
|
18
|
+
encoding: encoding,
|
19
|
+
encode_cleaner: encode_cleaner,
|
20
|
+
encode_replace: encode_replace
|
21
|
+
) do |io|
|
20
22
|
yield new(io, file_name: file_name_or_io, **args)
|
21
23
|
end
|
22
24
|
else
|
@@ -42,22 +44,18 @@ module IOStreams
|
|
42
44
|
@delimited = delimited
|
43
45
|
|
44
46
|
# Render header line when `columns` is supplied.
|
45
|
-
delimited << @tabular.
|
47
|
+
@delimited << @tabular.render_header if columns && @tabular.requires_header?
|
46
48
|
end
|
47
49
|
|
48
50
|
def <<(hash)
|
49
51
|
raise(ArgumentError, 'Must supply a Hash') unless hash.is_a?(Hash)
|
50
|
-
if tabular.
|
51
|
-
|
52
|
-
tabular.header.columns =
|
53
|
-
delimited << tabular.
|
52
|
+
if @tabular.header?
|
53
|
+
# Extract header from the keys from the first row when not supplied above.
|
54
|
+
@tabular.header.columns = hash.keys
|
55
|
+
@delimited << @tabular.render_header
|
54
56
|
end
|
55
|
-
delimited << tabular.render(hash)
|
57
|
+
@delimited << @tabular.render(hash)
|
56
58
|
end
|
57
|
-
|
58
|
-
private
|
59
|
-
|
60
|
-
attr_reader :tabular, :delimited, :cleanse_header
|
61
59
|
end
|
62
60
|
end
|
63
61
|
end
|
@@ -6,17 +6,20 @@ module IOStreams
|
|
6
6
|
def self.open(file_name_or_io,
|
7
7
|
delimiter: nil,
|
8
8
|
buffer_size: 65_536,
|
9
|
-
encoding: UTF8_ENCODING,
|
10
|
-
strip_non_printable: false,
|
11
9
|
file_name: nil,
|
10
|
+
encoding: nil,
|
11
|
+
encode_cleaner: nil,
|
12
|
+
encode_replace: nil,
|
12
13
|
**args)
|
13
14
|
if file_name_or_io.is_a?(String)
|
14
15
|
IOStreams.line_reader(file_name_or_io,
|
15
|
-
delimiter:
|
16
|
-
buffer_size:
|
17
|
-
|
18
|
-
|
19
|
-
|
16
|
+
delimiter: delimiter,
|
17
|
+
buffer_size: buffer_size,
|
18
|
+
file_name: file_name,
|
19
|
+
encoding: encoding,
|
20
|
+
encode_cleaner: encode_cleaner,
|
21
|
+
encode_replace: encode_replace
|
22
|
+
) do |io|
|
20
23
|
yield new(io, file_name: file_name, **args)
|
21
24
|
end
|
22
25
|
else
|
@@ -41,20 +44,16 @@ module IOStreams
|
|
41
44
|
end
|
42
45
|
|
43
46
|
def each
|
44
|
-
delimited.each do |line|
|
45
|
-
if tabular.
|
46
|
-
columns = tabular.parse_header(line)
|
47
|
-
tabular.cleanse_header! if cleanse_header
|
47
|
+
@delimited.each do |line|
|
48
|
+
if @tabular.header?
|
49
|
+
columns = @tabular.parse_header(line)
|
50
|
+
@tabular.cleanse_header! if @cleanse_header
|
48
51
|
yield columns
|
49
52
|
else
|
50
|
-
yield tabular.row_parse(line)
|
53
|
+
yield @tabular.row_parse(line)
|
51
54
|
end
|
52
55
|
end
|
53
56
|
end
|
54
|
-
|
55
|
-
private
|
56
|
-
|
57
|
-
attr_reader :tabular, :delimited, :cleanse_header
|
58
57
|
end
|
59
58
|
end
|
60
59
|
end
|