filter_io 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -1
- data/Gemfile +2 -6
- data/README.markdown +37 -31
- data/Rakefile +4 -52
- data/filter_io.gemspec +26 -0
- data/lib/filter_io/version.rb +3 -0
- data/lib/filter_io.rb +108 -104
- data/spec/filter_io_spec.rb +801 -0
- data/spec/spec_helper.rb +6 -0
- metadata +114 -56
- data/Gemfile.lock +0 -24
- data/test/filter_io_test.rb +0 -777
- data/test/test_helper.rb +0 -6
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/README.markdown
CHANGED
@@ -11,37 +11,41 @@
|
|
11
11
|
|
12
12
|
You can install the gem by running:
|
13
13
|
|
14
|
-
|
14
|
+
``` sh
|
15
|
+
gem install filter_io
|
16
|
+
```
|
15
17
|
|
16
18
|
### Example Usage
|
17
19
|
|
18
20
|
#### A Simple Example: ROT-13
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
-
|
22
|
+
``` ruby
|
23
|
+
io = FilterIO.new io do |data|
|
24
|
+
data.tr "A-Za-z", "N-ZA-Mn-za-m"
|
25
|
+
end
|
26
|
+
```
|
23
27
|
|
24
28
|
#### A Useful Example: Line Ending Normalisation
|
25
29
|
|
26
30
|
A common usage of `filter_io` is to normalise line endings before parsing CSV data:
|
27
31
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
32
|
+
``` ruby
|
33
|
+
# open source stream
|
34
|
+
File.open(filename) do |io|
|
35
|
+
# apply filter to stream
|
36
|
+
io = FilterIO.new(io) do |data, state|
|
37
|
+
# grab another chunk if the last character is a delimiter
|
38
|
+
raise FilterIO::NeedMoreData if data =~ /[\r\n]\z/ && !state.eof?
|
39
|
+
# normalise line endings to LF
|
40
|
+
data.gsub /\r\n|\r|\n/, "\n"
|
41
|
+
end
|
42
|
+
|
43
|
+
# process resulting stream normally
|
44
|
+
FasterCSV.parse(io) do |row|
|
45
|
+
pp row
|
46
|
+
end
|
47
|
+
end
|
48
|
+
```
|
45
49
|
|
46
50
|
### Reference
|
47
51
|
|
@@ -66,16 +70,18 @@ If your block is unable to process the whole chunk of data immediately, it can r
|
|
66
70
|
|
67
71
|
Here's an example which processes whole lines and prepends the line length to the beginning of each line.
|
68
72
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
``` ruby
|
74
|
+
io = FilterIO.new io do |data, state|
|
75
|
+
output = ''
|
76
|
+
# grab complete lines until we hit EOF
|
77
|
+
while data =~ /(.*)\n/ || (state.eof? && data =~ /(.+)/)
|
78
|
+
output << "#{$1.size} #{$1}\n"
|
79
|
+
data = $'
|
80
|
+
end
|
81
|
+
# `output` contains the processed lines, `data` contains any left over partial line
|
82
|
+
[output, data]
|
83
|
+
end
|
84
|
+
```
|
79
85
|
|
80
86
|
#### Block Size
|
81
87
|
|
data/Rakefile
CHANGED
@@ -1,54 +1,6 @@
|
|
1
|
-
require 'rubygems'
|
2
1
|
require 'rake'
|
3
|
-
require '
|
2
|
+
require 'bundler/gem_tasks'
|
3
|
+
require 'rspec/core/rake_task'
|
4
4
|
|
5
|
-
|
6
|
-
task :default => :
|
7
|
-
|
8
|
-
desc 'Test the filter_io plugin.'
|
9
|
-
Rake::TestTask.new(:test) do |t|
|
10
|
-
t.libs << 'lib'
|
11
|
-
t.libs << 'test'
|
12
|
-
t.pattern = 'test/**/*_test.rb'
|
13
|
-
t.verbose = true
|
14
|
-
end
|
15
|
-
|
16
|
-
task :test => :check_dependencies
|
17
|
-
|
18
|
-
begin
|
19
|
-
require 'jeweler'
|
20
|
-
Jeweler::Tasks.new do |gem|
|
21
|
-
gem.name = "filter_io"
|
22
|
-
gem.summary = "Filter IO streams with a block. Ruby's FilterInputStream."
|
23
|
-
gem.email = "jason@jasoncodes.com"
|
24
|
-
gem.homepage = "http://github.com/jasoncodes/filter_io"
|
25
|
-
gem.authors = ["Jason Weathered"]
|
26
|
-
gem.has_rdoc = false
|
27
|
-
gem.add_dependency 'activesupport'
|
28
|
-
end
|
29
|
-
Jeweler::GemcutterTasks.new
|
30
|
-
rescue LoadError
|
31
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
32
|
-
end
|
33
|
-
|
34
|
-
begin
|
35
|
-
require 'rcov/rcovtask'
|
36
|
-
Rcov::RcovTask.new do |t|
|
37
|
-
t.libs << "test"
|
38
|
-
t.rcov_opts = [
|
39
|
-
"--exclude '^(?!lib)'"
|
40
|
-
]
|
41
|
-
t.test_files = FileList[
|
42
|
-
'test/**/*_test.rb'
|
43
|
-
]
|
44
|
-
t.output_dir = 'coverage'
|
45
|
-
t.verbose = true
|
46
|
-
end
|
47
|
-
task :rcov do
|
48
|
-
system "open coverage/index.html"
|
49
|
-
end
|
50
|
-
rescue LoadError
|
51
|
-
task :rcov do
|
52
|
-
raise "You must install the 'rcov' gem"
|
53
|
-
end
|
54
|
-
end
|
5
|
+
RSpec::Core::RakeTask.new(:spec)
|
6
|
+
task :default => :spec
|
data/filter_io.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'filter_io/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = %q{filter_io}
|
8
|
+
spec.version = FilterIO::VERSION
|
9
|
+
spec.authors = ['Jason Weathered']
|
10
|
+
spec.email = ['jason@jasoncodes.com']
|
11
|
+
spec.summary = %q{Filter IO streams with a block. Ruby's FilterInputStream.}
|
12
|
+
spec.homepage = 'http://github.com/jasoncodes/filter_io'
|
13
|
+
spec.license = 'MIT'
|
14
|
+
|
15
|
+
spec.files = `git ls-files`.split($/)
|
16
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
17
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
|
+
spec.require_paths = ['lib']
|
19
|
+
|
20
|
+
spec.add_dependency 'activesupport', '>= 2.3.9'
|
21
|
+
|
22
|
+
spec.add_development_dependency 'bundler', '~> 1.3'
|
23
|
+
spec.add_development_dependency 'rake'
|
24
|
+
spec.add_development_dependency 'simplecov'
|
25
|
+
spec.add_development_dependency 'rspec', '~> 2.13'
|
26
|
+
end
|
data/lib/filter_io.rb
CHANGED
@@ -4,12 +4,11 @@ require 'active_support/core_ext/array'
|
|
4
4
|
require 'active_support/core_ext/hash'
|
5
5
|
|
6
6
|
class FilterIO
|
7
|
-
|
8
7
|
DEFAULT_BLOCK_SIZE = 1024
|
9
|
-
|
8
|
+
|
10
9
|
class NeedMoreData < Exception
|
11
10
|
end
|
12
|
-
|
11
|
+
|
13
12
|
class BlockState
|
14
13
|
attr_reader :bof, :eof
|
15
14
|
def initialize(bof, eof)
|
@@ -19,7 +18,7 @@ class FilterIO
|
|
19
18
|
alias_method :bof?, :bof
|
20
19
|
alias_method :eof?, :eof
|
21
20
|
end
|
22
|
-
|
21
|
+
|
23
22
|
def initialize(io, options = nil, &block)
|
24
23
|
@io = io
|
25
24
|
@options = options || {}
|
@@ -29,75 +28,99 @@ class FilterIO
|
|
29
28
|
@buffer_raw = empty_string_raw
|
30
29
|
@options.assert_valid_keys :block_size
|
31
30
|
end
|
32
|
-
|
31
|
+
|
33
32
|
def pos
|
34
33
|
@pos
|
35
34
|
end
|
36
|
-
|
35
|
+
|
37
36
|
def bof?
|
38
37
|
@pos == 0
|
39
38
|
end
|
40
|
-
|
39
|
+
|
41
40
|
def eof?
|
42
41
|
@buffer.empty? && source_eof?
|
43
42
|
end
|
44
|
-
|
43
|
+
|
45
44
|
def source_eof?
|
46
45
|
@buffer_raw.empty? && @io.eof?
|
47
46
|
end
|
48
|
-
|
47
|
+
|
49
48
|
def close
|
50
49
|
@io.close
|
51
50
|
end
|
52
|
-
|
51
|
+
|
53
52
|
def closed?
|
54
53
|
@io.closed?
|
55
54
|
end
|
56
|
-
|
57
|
-
def
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
55
|
+
|
56
|
+
def default_encoding
|
57
|
+
unless @default_encoding
|
58
|
+
c = @io.getc
|
59
|
+
@io.ungetc c
|
60
|
+
@default_encoding = c.encoding
|
61
|
+
end
|
62
|
+
@default_encoding
|
63
|
+
end
|
64
|
+
|
65
|
+
def internal_encoding
|
66
|
+
if @io.respond_to?(:internal_encoding)
|
67
|
+
@io.internal_encoding
|
66
68
|
else
|
67
|
-
|
69
|
+
default_encoding
|
68
70
|
end
|
69
71
|
end
|
70
|
-
|
72
|
+
|
73
|
+
def external_encoding
|
74
|
+
if @io.respond_to?(:external_encoding)
|
75
|
+
@io.external_encoding
|
76
|
+
else
|
77
|
+
default_encoding
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
def readchar
|
82
|
+
raise EOFError, 'end of file reached' if eof?
|
83
|
+
data = empty_string_raw
|
84
|
+
begin
|
85
|
+
byte = read(1)
|
86
|
+
if internal_encoding || external_encoding
|
87
|
+
byte.force_encoding internal_encoding || external_encoding
|
88
|
+
end
|
89
|
+
data << byte
|
90
|
+
end until data.valid_encoding? or source_eof?
|
91
|
+
data.encode! internal_encoding if internal_encoding
|
92
|
+
data
|
93
|
+
end
|
94
|
+
|
71
95
|
def getc
|
72
96
|
readchar
|
73
97
|
rescue EOFError
|
74
98
|
nil
|
75
99
|
end
|
76
|
-
|
100
|
+
|
77
101
|
def read(length = nil)
|
78
|
-
|
79
102
|
raise ArgumentError if length && length < 0
|
80
103
|
return '' if length == 0
|
81
|
-
|
104
|
+
|
82
105
|
# fill the buffer up to the fill level (or whole input if length is nil)
|
83
|
-
while !source_eof? && (length.nil? || length >
|
106
|
+
while !source_eof? && (length.nil? || length > @buffer.bytesize)
|
84
107
|
buffer_data @options[:block_size] || length
|
85
108
|
end
|
86
|
-
|
109
|
+
|
87
110
|
# we now have all the data in the buffer that we need (or can get if EOF)
|
88
111
|
case
|
89
|
-
when
|
112
|
+
when @buffer.bytesize > 0
|
90
113
|
# limit length to the buffer size if we were asked for it all or have run out (EOF)
|
91
|
-
read_length = if length.nil? or length >
|
92
|
-
|
114
|
+
read_length = if length.nil? or length > @buffer.bytesize
|
115
|
+
@buffer.bytesize
|
93
116
|
else
|
94
117
|
length
|
95
118
|
end
|
96
119
|
data = pop_bytes read_length
|
97
|
-
@pos += bytesize
|
98
|
-
if length.nil?
|
99
|
-
data.force_encoding
|
100
|
-
data.encode!
|
120
|
+
@pos += data.bytesize
|
121
|
+
if length.nil?
|
122
|
+
data.force_encoding external_encoding if external_encoding
|
123
|
+
data.encode! internal_encoding if internal_encoding
|
101
124
|
end
|
102
125
|
data
|
103
126
|
when source_eof?
|
@@ -106,15 +129,13 @@ class FilterIO
|
|
106
129
|
else
|
107
130
|
raise IOError, 'Read error'
|
108
131
|
end
|
109
|
-
|
110
132
|
end
|
111
|
-
|
133
|
+
|
112
134
|
def rewind
|
113
135
|
seek 0, IO::SEEK_SET
|
114
136
|
end
|
115
|
-
|
137
|
+
|
116
138
|
def seek(offset, whence = IO::SEEK_SET)
|
117
|
-
|
118
139
|
new_pos = case whence
|
119
140
|
when IO::SEEK_SET
|
120
141
|
offset
|
@@ -125,7 +146,7 @@ class FilterIO
|
|
125
146
|
else
|
126
147
|
raise Errno::EINVAL
|
127
148
|
end
|
128
|
-
|
149
|
+
|
129
150
|
case new_pos
|
130
151
|
when pos
|
131
152
|
# noop
|
@@ -137,26 +158,25 @@ class FilterIO
|
|
137
158
|
else
|
138
159
|
raise Errno::EINVAL, 'Random seek not supported'
|
139
160
|
end
|
140
|
-
|
161
|
+
|
141
162
|
0
|
142
163
|
end
|
143
|
-
|
164
|
+
|
144
165
|
def ungetc(char)
|
145
|
-
char = char.chr
|
146
|
-
@pos -= bytesize
|
166
|
+
char = char.chr
|
167
|
+
@pos -= char.bytesize
|
147
168
|
@pos = 0 if @pos < 0
|
148
169
|
@buffer = char + @buffer
|
149
170
|
end
|
150
|
-
|
171
|
+
|
151
172
|
def gets(sep_string = $/)
|
152
|
-
|
153
173
|
return nil if eof?
|
154
174
|
return read if sep_string.nil?
|
155
|
-
|
175
|
+
|
156
176
|
paragraph_mode = sep_string == ''
|
157
177
|
sep_string = "\n\n" if paragraph_mode
|
158
178
|
sep_string = sep_string.to_s unless sep_string.is_a? String
|
159
|
-
|
179
|
+
|
160
180
|
if paragraph_mode
|
161
181
|
# consume any leading newlines
|
162
182
|
char = getc
|
@@ -167,12 +187,12 @@ class FilterIO
|
|
167
187
|
return nil # nothing left except newlines, bail out
|
168
188
|
end
|
169
189
|
end
|
170
|
-
|
190
|
+
|
171
191
|
# fill the buffer until it contains the separator sequence
|
172
192
|
until source_eof? or @buffer.index(sep_string)
|
173
193
|
buffer_data @options[:block_size]
|
174
194
|
end
|
175
|
-
|
195
|
+
|
176
196
|
# calculate how much of the buffer to return
|
177
197
|
length = if idx = @buffer.index(sep_string)
|
178
198
|
# we found the separator, include it in our output
|
@@ -181,18 +201,18 @@ class FilterIO
|
|
181
201
|
# no separator found (must be EOF). return everything we've got
|
182
202
|
length = @buffer.size
|
183
203
|
end
|
184
|
-
|
204
|
+
|
185
205
|
# increment the position and return the buffer fragment
|
186
206
|
data = @buffer.slice!(0, length)
|
187
|
-
@pos += bytesize
|
188
|
-
|
207
|
+
@pos += data.bytesize
|
208
|
+
|
189
209
|
data
|
190
210
|
end
|
191
|
-
|
211
|
+
|
192
212
|
def readline(sep_string = $/)
|
193
213
|
gets(sep_string) or raise EOFError, 'end of file reached'
|
194
214
|
end
|
195
|
-
|
215
|
+
|
196
216
|
def each_line(sep_string = $/)
|
197
217
|
unless block_given?
|
198
218
|
klass = defined?(Enumerator) ? Enumerator : Enumerable::Enumerator
|
@@ -205,64 +225,55 @@ class FilterIO
|
|
205
225
|
end
|
206
226
|
alias :each :each_line
|
207
227
|
alias :lines :each_line
|
208
|
-
|
228
|
+
|
209
229
|
def readlines(sep_string = $/)
|
210
230
|
lines = []
|
211
231
|
each_line(sep_string) { |line| lines << line }
|
212
232
|
lines
|
213
233
|
end
|
214
|
-
|
234
|
+
|
215
235
|
protected
|
216
|
-
|
236
|
+
|
217
237
|
def empty_string
|
218
238
|
str = String.new
|
219
|
-
if
|
220
|
-
str.force_encoding
|
239
|
+
if internal_encoding || external_encoding
|
240
|
+
str.force_encoding internal_encoding || external_encoding
|
221
241
|
end
|
222
242
|
str
|
223
243
|
end
|
224
|
-
|
244
|
+
|
225
245
|
def empty_string_raw
|
226
246
|
str = String.new
|
227
|
-
if
|
228
|
-
str.force_encoding
|
247
|
+
if external_encoding
|
248
|
+
str.force_encoding external_encoding
|
229
249
|
end
|
230
250
|
str
|
231
251
|
end
|
232
|
-
|
233
|
-
def bytesize(str)
|
234
|
-
str.respond_to?(:bytesize) ? str.bytesize : str.size
|
235
|
-
end
|
236
|
-
|
252
|
+
|
237
253
|
def pop_bytes(count)
|
238
254
|
data = begin
|
239
|
-
|
240
|
-
|
241
|
-
end
|
255
|
+
org_encoding = @buffer.encoding
|
256
|
+
@buffer.force_encoding 'ASCII-8BIT'
|
242
257
|
@buffer.slice!(0, count)
|
243
258
|
ensure
|
244
|
-
|
245
|
-
@buffer.force_encoding @io.internal_encoding || @io.external_encoding
|
246
|
-
end
|
259
|
+
@buffer.force_encoding org_encoding
|
247
260
|
end
|
248
261
|
data
|
249
262
|
end
|
250
|
-
|
263
|
+
|
251
264
|
def buffer_data(block_size = nil)
|
252
|
-
|
253
265
|
block_size ||= DEFAULT_BLOCK_SIZE
|
254
|
-
|
266
|
+
|
255
267
|
data = unless @buffer_raw.empty?
|
256
|
-
@buffer_raw.slice! 0,
|
268
|
+
@buffer_raw.slice! 0, @buffer_raw.bytesize
|
257
269
|
else
|
258
270
|
@io.read(block_size) or return
|
259
271
|
end
|
260
|
-
|
261
|
-
initial_data_size = bytesize
|
272
|
+
|
273
|
+
initial_data_size = data.bytesize
|
262
274
|
begin
|
263
|
-
|
264
275
|
data = process_data data, initial_data_size
|
265
|
-
|
276
|
+
|
266
277
|
# if no processed data was returned and there is unprocessed data...
|
267
278
|
if data.is_a?(Array) && data.size == 2 && data[0].size == 0 && data[1].size > 0
|
268
279
|
# restore the unprocessed data into the temporary buffer
|
@@ -270,52 +281,45 @@ class FilterIO
|
|
270
281
|
# and add some more data to the buffer
|
271
282
|
raise NeedMoreData
|
272
283
|
end
|
273
|
-
|
274
284
|
rescue NeedMoreData => e
|
275
285
|
raise EOFError, 'end of file reached' if @io.eof?
|
276
286
|
data << @io.read(block_size)
|
277
287
|
retry
|
278
288
|
end
|
279
|
-
|
289
|
+
|
280
290
|
data = [data] unless data.is_a? Array
|
281
291
|
raise 'Block must have 1 or 2 values' unless data.size <= 2
|
282
|
-
if @buffer.
|
292
|
+
if @buffer.encoding != data[0].encoding
|
283
293
|
if [@buffer, data[0]].any? { |x| x.encoding.to_s == 'ASCII-8BIT' }
|
284
294
|
data[0] = data[0].dup.force_encoding @buffer.encoding
|
285
295
|
end
|
286
296
|
end
|
287
297
|
@buffer << data[0]
|
288
298
|
if data[1]
|
289
|
-
if
|
290
|
-
data[1].convert!
|
299
|
+
if internal_encoding
|
300
|
+
data[1].convert! external_encoding
|
291
301
|
end
|
292
302
|
@buffer_raw = data[1]
|
293
303
|
end
|
294
|
-
|
295
304
|
end
|
296
|
-
|
305
|
+
|
297
306
|
def process_data(data, initial_data_size)
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
data.force_encoding org_encoding
|
305
|
-
raise NeedMoreData
|
306
|
-
end
|
307
|
-
data.encode! @io.internal_encoding if @io.internal_encoding
|
307
|
+
org_encoding = data.encoding
|
308
|
+
data.force_encoding external_encoding if external_encoding
|
309
|
+
additional_data_size = data.bytesize - initial_data_size
|
310
|
+
unless data.valid_encoding? or source_eof? or additional_data_size >= 4
|
311
|
+
data.force_encoding org_encoding
|
312
|
+
raise NeedMoreData
|
308
313
|
end
|
309
|
-
|
314
|
+
data.encode! internal_encoding if internal_encoding
|
315
|
+
|
310
316
|
if data && @block
|
311
|
-
|
312
|
-
args
|
313
|
-
args = args.first(@block.arity > 0 ? @block.arity : 1)
|
317
|
+
args = [data.dup]
|
318
|
+
args << BlockState.new(@io.pos == data.length, source_eof?) if @block.arity > 1
|
314
319
|
data = @block.call(*args)
|
315
320
|
raise IOError, 'Block returned nil' if data.nil?
|
316
321
|
end
|
317
|
-
|
322
|
+
|
318
323
|
data
|
319
324
|
end
|
320
|
-
|
321
325
|
end
|