filter_io 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -1
- data/Gemfile +2 -6
- data/README.markdown +37 -31
- data/Rakefile +4 -52
- data/filter_io.gemspec +26 -0
- data/lib/filter_io/version.rb +3 -0
- data/lib/filter_io.rb +108 -104
- data/spec/filter_io_spec.rb +801 -0
- data/spec/spec_helper.rb +6 -0
- metadata +114 -56
- data/Gemfile.lock +0 -24
- data/test/filter_io_test.rb +0 -777
- data/test/test_helper.rb +0 -6
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/README.markdown
CHANGED
@@ -11,37 +11,41 @@
|
|
11
11
|
|
12
12
|
You can install the gem by running:
|
13
13
|
|
14
|
-
|
14
|
+
``` sh
|
15
|
+
gem install filter_io
|
16
|
+
```
|
15
17
|
|
16
18
|
### Example Usage
|
17
19
|
|
18
20
|
#### A Simple Example: ROT-13
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
-
|
22
|
+
``` ruby
|
23
|
+
io = FilterIO.new io do |data|
|
24
|
+
data.tr "A-Za-z", "N-ZA-Mn-za-m"
|
25
|
+
end
|
26
|
+
```
|
23
27
|
|
24
28
|
#### A Useful Example: Line Ending Normalisation
|
25
29
|
|
26
30
|
A common usage of `filter_io` is to normalise line endings before parsing CSV data:
|
27
31
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
32
|
+
``` ruby
|
33
|
+
# open source stream
|
34
|
+
File.open(filename) do |io|
|
35
|
+
# apply filter to stream
|
36
|
+
io = FilterIO.new(io) do |data, state|
|
37
|
+
# grab another chunk if the last character is a delimiter
|
38
|
+
raise FilterIO::NeedMoreData if data =~ /[\r\n]\z/ && !state.eof?
|
39
|
+
# normalise line endings to LF
|
40
|
+
data.gsub /\r\n|\r|\n/, "\n"
|
41
|
+
end
|
42
|
+
|
43
|
+
# process resulting stream normally
|
44
|
+
FasterCSV.parse(io) do |row|
|
45
|
+
pp row
|
46
|
+
end
|
47
|
+
end
|
48
|
+
```
|
45
49
|
|
46
50
|
### Reference
|
47
51
|
|
@@ -66,16 +70,18 @@ If your block is unable to process the whole chunk of data immediately, it can r
|
|
66
70
|
|
67
71
|
Here's an example which processes whole lines and prepends the line length to the beginning of each line.
|
68
72
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
``` ruby
|
74
|
+
io = FilterIO.new io do |data, state|
|
75
|
+
output = ''
|
76
|
+
# grab complete lines until we hit EOF
|
77
|
+
while data =~ /(.*)\n/ || (state.eof? && data =~ /(.+)/)
|
78
|
+
output << "#{$1.size} #{$1}\n"
|
79
|
+
data = $'
|
80
|
+
end
|
81
|
+
# `output` contains the processed lines, `data` contains any left over partial line
|
82
|
+
[output, data]
|
83
|
+
end
|
84
|
+
```
|
79
85
|
|
80
86
|
#### Block Size
|
81
87
|
|
data/Rakefile
CHANGED
@@ -1,54 +1,6 @@
|
|
1
|
-
require 'rubygems'
|
2
1
|
require 'rake'
|
3
|
-
require '
|
2
|
+
require 'bundler/gem_tasks'
|
3
|
+
require 'rspec/core/rake_task'
|
4
4
|
|
5
|
-
|
6
|
-
task :default => :
|
7
|
-
|
8
|
-
desc 'Test the filter_io plugin.'
|
9
|
-
Rake::TestTask.new(:test) do |t|
|
10
|
-
t.libs << 'lib'
|
11
|
-
t.libs << 'test'
|
12
|
-
t.pattern = 'test/**/*_test.rb'
|
13
|
-
t.verbose = true
|
14
|
-
end
|
15
|
-
|
16
|
-
task :test => :check_dependencies
|
17
|
-
|
18
|
-
begin
|
19
|
-
require 'jeweler'
|
20
|
-
Jeweler::Tasks.new do |gem|
|
21
|
-
gem.name = "filter_io"
|
22
|
-
gem.summary = "Filter IO streams with a block. Ruby's FilterInputStream."
|
23
|
-
gem.email = "jason@jasoncodes.com"
|
24
|
-
gem.homepage = "http://github.com/jasoncodes/filter_io"
|
25
|
-
gem.authors = ["Jason Weathered"]
|
26
|
-
gem.has_rdoc = false
|
27
|
-
gem.add_dependency 'activesupport'
|
28
|
-
end
|
29
|
-
Jeweler::GemcutterTasks.new
|
30
|
-
rescue LoadError
|
31
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
32
|
-
end
|
33
|
-
|
34
|
-
begin
|
35
|
-
require 'rcov/rcovtask'
|
36
|
-
Rcov::RcovTask.new do |t|
|
37
|
-
t.libs << "test"
|
38
|
-
t.rcov_opts = [
|
39
|
-
"--exclude '^(?!lib)'"
|
40
|
-
]
|
41
|
-
t.test_files = FileList[
|
42
|
-
'test/**/*_test.rb'
|
43
|
-
]
|
44
|
-
t.output_dir = 'coverage'
|
45
|
-
t.verbose = true
|
46
|
-
end
|
47
|
-
task :rcov do
|
48
|
-
system "open coverage/index.html"
|
49
|
-
end
|
50
|
-
rescue LoadError
|
51
|
-
task :rcov do
|
52
|
-
raise "You must install the 'rcov' gem"
|
53
|
-
end
|
54
|
-
end
|
5
|
+
RSpec::Core::RakeTask.new(:spec)
|
6
|
+
task :default => :spec
|
data/filter_io.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'filter_io/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = %q{filter_io}
|
8
|
+
spec.version = FilterIO::VERSION
|
9
|
+
spec.authors = ['Jason Weathered']
|
10
|
+
spec.email = ['jason@jasoncodes.com']
|
11
|
+
spec.summary = %q{Filter IO streams with a block. Ruby's FilterInputStream.}
|
12
|
+
spec.homepage = 'http://github.com/jasoncodes/filter_io'
|
13
|
+
spec.license = 'MIT'
|
14
|
+
|
15
|
+
spec.files = `git ls-files`.split($/)
|
16
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ['lib']
|
19
|
+
|
20
|
+
spec.add_dependency 'activesupport', '>= 2.3.9'
|
21
|
+
|
22
|
+
spec.add_development_dependency 'bundler', '~> 1.3'
|
23
|
+
spec.add_development_dependency 'rake'
|
24
|
+
spec.add_development_dependency 'simplecov'
|
25
|
+
spec.add_development_dependency 'rspec', '~> 2.13'
|
26
|
+
end
|
data/lib/filter_io.rb
CHANGED
@@ -4,12 +4,11 @@ require 'active_support/core_ext/array'
|
|
4
4
|
require 'active_support/core_ext/hash'
|
5
5
|
|
6
6
|
class FilterIO
|
7
|
-
|
8
7
|
DEFAULT_BLOCK_SIZE = 1024
|
9
|
-
|
8
|
+
|
10
9
|
class NeedMoreData < Exception
|
11
10
|
end
|
12
|
-
|
11
|
+
|
13
12
|
class BlockState
|
14
13
|
attr_reader :bof, :eof
|
15
14
|
def initialize(bof, eof)
|
@@ -19,7 +18,7 @@ class FilterIO
|
|
19
18
|
alias_method :bof?, :bof
|
20
19
|
alias_method :eof?, :eof
|
21
20
|
end
|
22
|
-
|
21
|
+
|
23
22
|
def initialize(io, options = nil, &block)
|
24
23
|
@io = io
|
25
24
|
@options = options || {}
|
@@ -29,75 +28,99 @@ class FilterIO
|
|
29
28
|
@buffer_raw = empty_string_raw
|
30
29
|
@options.assert_valid_keys :block_size
|
31
30
|
end
|
32
|
-
|
31
|
+
|
33
32
|
def pos
|
34
33
|
@pos
|
35
34
|
end
|
36
|
-
|
35
|
+
|
37
36
|
def bof?
|
38
37
|
@pos == 0
|
39
38
|
end
|
40
|
-
|
39
|
+
|
41
40
|
def eof?
|
42
41
|
@buffer.empty? && source_eof?
|
43
42
|
end
|
44
|
-
|
43
|
+
|
45
44
|
def source_eof?
|
46
45
|
@buffer_raw.empty? && @io.eof?
|
47
46
|
end
|
48
|
-
|
47
|
+
|
49
48
|
def close
|
50
49
|
@io.close
|
51
50
|
end
|
52
|
-
|
51
|
+
|
53
52
|
def closed?
|
54
53
|
@io.closed?
|
55
54
|
end
|
56
|
-
|
57
|
-
def
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
55
|
+
|
56
|
+
def default_encoding
|
57
|
+
unless @default_encoding
|
58
|
+
c = @io.getc
|
59
|
+
@io.ungetc c
|
60
|
+
@default_encoding = c.encoding
|
61
|
+
end
|
62
|
+
@default_encoding
|
63
|
+
end
|
64
|
+
|
65
|
+
def internal_encoding
|
66
|
+
if @io.respond_to?(:internal_encoding)
|
67
|
+
@io.internal_encoding
|
66
68
|
else
|
67
|
-
|
69
|
+
default_encoding
|
68
70
|
end
|
69
71
|
end
|
70
|
-
|
72
|
+
|
73
|
+
def external_encoding
|
74
|
+
if @io.respond_to?(:external_encoding)
|
75
|
+
@io.external_encoding
|
76
|
+
else
|
77
|
+
default_encoding
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
def readchar
|
82
|
+
raise EOFError, 'end of file reached' if eof?
|
83
|
+
data = empty_string_raw
|
84
|
+
begin
|
85
|
+
byte = read(1)
|
86
|
+
if internal_encoding || external_encoding
|
87
|
+
byte.force_encoding internal_encoding || external_encoding
|
88
|
+
end
|
89
|
+
data << byte
|
90
|
+
end until data.valid_encoding? or source_eof?
|
91
|
+
data.encode! internal_encoding if internal_encoding
|
92
|
+
data
|
93
|
+
end
|
94
|
+
|
71
95
|
def getc
|
72
96
|
readchar
|
73
97
|
rescue EOFError
|
74
98
|
nil
|
75
99
|
end
|
76
|
-
|
100
|
+
|
77
101
|
def read(length = nil)
|
78
|
-
|
79
102
|
raise ArgumentError if length && length < 0
|
80
103
|
return '' if length == 0
|
81
|
-
|
104
|
+
|
82
105
|
# fill the buffer up to the fill level (or whole input if length is nil)
|
83
|
-
while !source_eof? && (length.nil? || length >
|
106
|
+
while !source_eof? && (length.nil? || length > @buffer.bytesize)
|
84
107
|
buffer_data @options[:block_size] || length
|
85
108
|
end
|
86
|
-
|
109
|
+
|
87
110
|
# we now have all the data in the buffer that we need (or can get if EOF)
|
88
111
|
case
|
89
|
-
when
|
112
|
+
when @buffer.bytesize > 0
|
90
113
|
# limit length to the buffer size if we were asked for it all or have ran out (EOF)
|
91
|
-
read_length = if length.nil? or length >
|
92
|
-
|
114
|
+
read_length = if length.nil? or length > @buffer.bytesize
|
115
|
+
@buffer.bytesize
|
93
116
|
else
|
94
117
|
length
|
95
118
|
end
|
96
119
|
data = pop_bytes read_length
|
97
|
-
@pos += bytesize
|
98
|
-
if length.nil?
|
99
|
-
data.force_encoding
|
100
|
-
data.encode!
|
120
|
+
@pos += data.bytesize
|
121
|
+
if length.nil?
|
122
|
+
data.force_encoding external_encoding if external_encoding
|
123
|
+
data.encode! internal_encoding if internal_encoding
|
101
124
|
end
|
102
125
|
data
|
103
126
|
when source_eof?
|
@@ -106,15 +129,13 @@ class FilterIO
|
|
106
129
|
else
|
107
130
|
raise IOError, 'Read error'
|
108
131
|
end
|
109
|
-
|
110
132
|
end
|
111
|
-
|
133
|
+
|
112
134
|
def rewind
|
113
135
|
seek 0, IO::SEEK_SET
|
114
136
|
end
|
115
|
-
|
137
|
+
|
116
138
|
def seek(offset, whence = IO::SEEK_SET)
|
117
|
-
|
118
139
|
new_pos = case whence
|
119
140
|
when IO::SEEK_SET
|
120
141
|
offset
|
@@ -125,7 +146,7 @@ class FilterIO
|
|
125
146
|
else
|
126
147
|
raise Errno::EINVAL
|
127
148
|
end
|
128
|
-
|
149
|
+
|
129
150
|
case new_pos
|
130
151
|
when pos
|
131
152
|
# noop
|
@@ -137,26 +158,25 @@ class FilterIO
|
|
137
158
|
else
|
138
159
|
raise Errno::EINVAL, 'Random seek not supported'
|
139
160
|
end
|
140
|
-
|
161
|
+
|
141
162
|
0
|
142
163
|
end
|
143
|
-
|
164
|
+
|
144
165
|
def ungetc(char)
|
145
|
-
char = char.chr
|
146
|
-
@pos -= bytesize
|
166
|
+
char = char.chr
|
167
|
+
@pos -= char.bytesize
|
147
168
|
@pos = 0 if @pos < 0
|
148
169
|
@buffer = char + @buffer
|
149
170
|
end
|
150
|
-
|
171
|
+
|
151
172
|
def gets(sep_string = $/)
|
152
|
-
|
153
173
|
return nil if eof?
|
154
174
|
return read if sep_string.nil?
|
155
|
-
|
175
|
+
|
156
176
|
paragraph_mode = sep_string == ''
|
157
177
|
sep_string = "\n\n" if paragraph_mode
|
158
178
|
sep_string = sep_string.to_s unless sep_string.is_a? String
|
159
|
-
|
179
|
+
|
160
180
|
if paragraph_mode
|
161
181
|
# consume any leading newlines
|
162
182
|
char = getc
|
@@ -167,12 +187,12 @@ class FilterIO
|
|
167
187
|
return nil # nothing left except newlines, bail out
|
168
188
|
end
|
169
189
|
end
|
170
|
-
|
190
|
+
|
171
191
|
# fill the buffer until it contains the separator sequence
|
172
192
|
until source_eof? or @buffer.index(sep_string)
|
173
193
|
buffer_data @options[:block_size]
|
174
194
|
end
|
175
|
-
|
195
|
+
|
176
196
|
# calculate how much of the buffer to return
|
177
197
|
length = if idx = @buffer.index(sep_string)
|
178
198
|
# we found the separator, include it in our output
|
@@ -181,18 +201,18 @@ class FilterIO
|
|
181
201
|
# no separator found (must be EOF). return everything we've got
|
182
202
|
length = @buffer.size
|
183
203
|
end
|
184
|
-
|
204
|
+
|
185
205
|
# increment the position and return the buffer fragment
|
186
206
|
data = @buffer.slice!(0, length)
|
187
|
-
@pos += bytesize
|
188
|
-
|
207
|
+
@pos += data.bytesize
|
208
|
+
|
189
209
|
data
|
190
210
|
end
|
191
|
-
|
211
|
+
|
192
212
|
def readline(sep_string = $/)
|
193
213
|
gets(sep_string) or raise EOFError, 'end of file reached'
|
194
214
|
end
|
195
|
-
|
215
|
+
|
196
216
|
def each_line(sep_string = $/)
|
197
217
|
unless block_given?
|
198
218
|
klass = defined?(Enumerator) ? Enumerator : Enumerable::Enumerator
|
@@ -205,64 +225,55 @@ class FilterIO
|
|
205
225
|
end
|
206
226
|
alias :each :each_line
|
207
227
|
alias :lines :each_line
|
208
|
-
|
228
|
+
|
209
229
|
def readlines(sep_string = $/)
|
210
230
|
lines = []
|
211
231
|
each_line(sep_string) { |line| lines << line }
|
212
232
|
lines
|
213
233
|
end
|
214
|
-
|
234
|
+
|
215
235
|
protected
|
216
|
-
|
236
|
+
|
217
237
|
def empty_string
|
218
238
|
str = String.new
|
219
|
-
if
|
220
|
-
str.force_encoding
|
239
|
+
if internal_encoding || external_encoding
|
240
|
+
str.force_encoding internal_encoding || external_encoding
|
221
241
|
end
|
222
242
|
str
|
223
243
|
end
|
224
|
-
|
244
|
+
|
225
245
|
def empty_string_raw
|
226
246
|
str = String.new
|
227
|
-
if
|
228
|
-
str.force_encoding
|
247
|
+
if external_encoding
|
248
|
+
str.force_encoding external_encoding
|
229
249
|
end
|
230
250
|
str
|
231
251
|
end
|
232
|
-
|
233
|
-
def bytesize(str)
|
234
|
-
str.respond_to?(:bytesize) ? str.bytesize : str.size
|
235
|
-
end
|
236
|
-
|
252
|
+
|
237
253
|
def pop_bytes(count)
|
238
254
|
data = begin
|
239
|
-
|
240
|
-
|
241
|
-
end
|
255
|
+
org_encoding = @buffer.encoding
|
256
|
+
@buffer.force_encoding 'ASCII-8BIT'
|
242
257
|
@buffer.slice!(0, count)
|
243
258
|
ensure
|
244
|
-
|
245
|
-
@buffer.force_encoding @io.internal_encoding || @io.external_encoding
|
246
|
-
end
|
259
|
+
@buffer.force_encoding org_encoding
|
247
260
|
end
|
248
261
|
data
|
249
262
|
end
|
250
|
-
|
263
|
+
|
251
264
|
def buffer_data(block_size = nil)
|
252
|
-
|
253
265
|
block_size ||= DEFAULT_BLOCK_SIZE
|
254
|
-
|
266
|
+
|
255
267
|
data = unless @buffer_raw.empty?
|
256
|
-
@buffer_raw.slice! 0,
|
268
|
+
@buffer_raw.slice! 0, @buffer_raw.bytesize
|
257
269
|
else
|
258
270
|
@io.read(block_size) or return
|
259
271
|
end
|
260
|
-
|
261
|
-
initial_data_size = bytesize
|
272
|
+
|
273
|
+
initial_data_size = data.bytesize
|
262
274
|
begin
|
263
|
-
|
264
275
|
data = process_data data, initial_data_size
|
265
|
-
|
276
|
+
|
266
277
|
# if no processed data was returned and there is unprocessed data...
|
267
278
|
if data.is_a?(Array) && data.size == 2 && data[0].size == 0 && data[1].size > 0
|
268
279
|
# restore the unprocessed data into the temporary buffer
|
@@ -270,52 +281,45 @@ class FilterIO
|
|
270
281
|
# and add some more data to the buffer
|
271
282
|
raise NeedMoreData
|
272
283
|
end
|
273
|
-
|
274
284
|
rescue NeedMoreData => e
|
275
285
|
raise EOFError, 'end of file reached' if @io.eof?
|
276
286
|
data << @io.read(block_size)
|
277
287
|
retry
|
278
288
|
end
|
279
|
-
|
289
|
+
|
280
290
|
data = [data] unless data.is_a? Array
|
281
291
|
raise 'Block must have 1 or 2 values' unless data.size <= 2
|
282
|
-
if @buffer.
|
292
|
+
if @buffer.encoding != data[0].encoding
|
283
293
|
if [@buffer, data[0]].any? { |x| x.encoding.to_s == 'ASCII-8BIT' }
|
284
294
|
data[0] = data[0].dup.force_encoding @buffer.encoding
|
285
295
|
end
|
286
296
|
end
|
287
297
|
@buffer << data[0]
|
288
298
|
if data[1]
|
289
|
-
if
|
290
|
-
data[1].convert!
|
299
|
+
if internal_encoding
|
300
|
+
data[1].convert! external_encoding
|
291
301
|
end
|
292
302
|
@buffer_raw = data[1]
|
293
303
|
end
|
294
|
-
|
295
304
|
end
|
296
|
-
|
305
|
+
|
297
306
|
def process_data(data, initial_data_size)
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
data.force_encoding org_encoding
|
305
|
-
raise NeedMoreData
|
306
|
-
end
|
307
|
-
data.encode! @io.internal_encoding if @io.internal_encoding
|
307
|
+
org_encoding = data.encoding
|
308
|
+
data.force_encoding external_encoding if external_encoding
|
309
|
+
additional_data_size = data.bytesize - initial_data_size
|
310
|
+
unless data.valid_encoding? or source_eof? or additional_data_size >= 4
|
311
|
+
data.force_encoding org_encoding
|
312
|
+
raise NeedMoreData
|
308
313
|
end
|
309
|
-
|
314
|
+
data.encode! internal_encoding if internal_encoding
|
315
|
+
|
310
316
|
if data && @block
|
311
|
-
|
312
|
-
args
|
313
|
-
args = args.first(@block.arity > 0 ? @block.arity : 1)
|
317
|
+
args = [data.dup]
|
318
|
+
args << BlockState.new(@io.pos == data.length, source_eof?) if @block.arity > 1
|
314
319
|
data = @block.call(*args)
|
315
320
|
raise IOError, 'Block returned nil' if data.nil?
|
316
321
|
end
|
317
|
-
|
322
|
+
|
318
323
|
data
|
319
324
|
end
|
320
|
-
|
321
325
|
end
|