file_processor 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ .DS_Store
2
+ *.gem
3
+ *.rbc
4
+ .bundle
5
+ .config
6
+ .yardoc
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,4 @@
1
+ --color
2
+ --format documentation
3
+ --drb
4
+ --debugger
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm 1.9.3@file_processor --create
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
source 'https://rubygems.org'

# Runtime dependencies are declared in file_processor.gemspec
gemspec

gem 'rake'

# Tools only needed while developing the gem.
group :development do
  gem 'debugger'
end

# Tools only needed to run the test suite.
group :test do
  gem 'rspec', '~> 2.14.0.rc1'
  gem 'simplecov'
  gem 'json', '~> 1.7.7'
end
17
+
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ file_processor (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ columnize (0.3.6)
10
+ debugger (1.6.0)
11
+ columnize (>= 0.3.1)
12
+ debugger-linecache (~> 1.2.0)
13
+ debugger-ruby_core_source (~> 1.2.1)
14
+ debugger-linecache (1.2.0)
15
+ debugger-ruby_core_source (1.2.2)
16
+ diff-lcs (1.2.4)
17
+ json (1.7.7)
18
+ multi_json (1.7.4)
19
+ rake (10.0.4)
20
+ rspec (2.14.0.rc1)
21
+ rspec-core (= 2.14.0.rc1)
22
+ rspec-expectations (= 2.14.0.rc1)
23
+ rspec-mocks (= 2.14.0.rc1)
24
+ rspec-core (2.14.0.rc1)
25
+ rspec-expectations (2.14.0.rc1)
26
+ diff-lcs (>= 1.1.3, < 2.0)
27
+ rspec-mocks (2.14.0.rc1)
28
+ simplecov (0.7.1)
29
+ multi_json (~> 1.0)
30
+ simplecov-html (~> 0.7.1)
31
+ simplecov-html (0.7.1)
32
+
33
+ PLATFORMS
34
+ ruby
35
+
36
+ DEPENDENCIES
37
+ debugger
38
+ file_processor!
39
+ json (~> 1.7.7)
40
+ rake
41
+ rspec (~> 2.14.0.rc1)
42
+ simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Vicente Mundim
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # FileProcessor
2
+
3
+ [![Build Status](https://travis-ci.org/dtmconsultoria/file_processor.png)](https://travis-ci.org/dtmconsultoria/file_processor)
4
+
5
+ A more powerful CSV file processor
6
+
7
+ ## Installation
8
+
9
+ FileProcessor uses the new CSV library introduced in Ruby 1.9.3, thus it is only compatible with this Ruby version.
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'file_processor'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install file_processor
22
+
23
+ ## Usage
24
+
25
+ Use it as you would use Ruby's CSV:
26
+
27
+ FileProcessor::CSV.open(filename, options) do |csv|
28
+ csv.each do |row|
29
+ # process row here
30
+ end
31
+ end # automatically closes the file
32
+
33
+ FileProcessor::CSV is just a wrapper around Ruby's CSV, so you can manipulate it as you would manipulate Ruby's CSV.
34
+
35
+ You can also use `FileProcessor::CSV#process_range` to process a range in the file:
36
+
37
+ FileProcessor::CSV.open(filename, options) do |csv|
38
+ csv.process_range(offset: 2000, limit: 1000) do |row, index|
39
+ # yields 1000 rows starting from line 2000 (i.e., from line 2000 to line 2999)
40
+ end
41
+ end # automatically closes the file
42
+
43
+ Here are the added features:
44
+
45
+ * Auto-detect encoding of UTF-8 and ISO-8859-1 (Latin1) files.
46
+ * Auto-detect the column separator (`col_sep` option) when not given.
47
+ * Skip lines without data when `skip_blanks` is `true`, which is turned on by default. This means that `count` will not take these lines into account; they are also skipped when iterating through lines.
48
+ * Detects if a file is gzipped, and decompresses it for you automatically.
49
+
50
+ ## Contributing
51
+
52
+ 1. Fork it
53
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
54
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
55
+ 4. Push to the branch (`git push origin my-new-feature`)
56
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
# Gem packaging tasks (build, install, release) come from Bundler.
require "bundler/gem_tasks"
require 'rspec/core/rake_task'

# Defines the `spec` task that runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` runs the specs.
task default: :spec
@@ -0,0 +1,18 @@
1
# -*- encoding: utf-8 -*-
# Make lib/ loadable so the version constant can be read without installing the gem.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'file_processor/version'

Gem::Specification.new do |gem|
  gem.name          = "file_processor"
  gem.version       = FileProcessor::VERSION
  gem.authors       = ["Vicente Mundim"]
  gem.email         = ["vicente.mundim@gmail.com"]
  gem.description   = %q{A more powerful CSV file processor}
  gem.summary       = %q{A more powerful CSV file processor}
  # The project ships an MIT LICENSE.txt; declare it so it shows up in the gem metadata.
  gem.license       = "MIT"

  # The README states the gem targets the CSV library as shipped with Ruby 1.9.3.
  gem.required_ruby_version = ">= 1.9.3"

  # Package every file tracked by git; executables and test files are derived from that list.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,186 @@
1
module FileProcessor
  # A wrapper around Ruby's CSV that copies the source into a temporary file,
  # optionally decompressing it, and detects the encoding (UTF-8 or
  # ISO-8859-1) and the column separator (';' or ',') when they are not given.
  #
  # Every method not defined here is delegated to the wrapped ::CSV instance.
  class CSV < SimpleDelegator
    include Enumerable

    # Opens a file and yields it, ensuring that it is properly closed.
    #
    # Without a block the new instance is returned and the caller is
    # responsible for closing it.
    #
    # @param [ Array ] args Arguments forwarded to {#initialize}
    #
    # @return [ Object ] the block's result, or the instance when no block is given
    def self.open(*args)
      instance = new(*args)
      return instance unless block_given?

      begin
        yield instance
      ensure
        instance.close if instance
      end
    end

    attr_accessor :detected_encoding

    # @param [ String ] filename Anything Kernel.open accepts
    # @param [ Hash ] options Options for ::CSV.new, plus the ones below
    #
    # @option options [ true, false, nil ] :gzipped Forces (true) or disables (false)
    #   gzip handling; when nil the compression is detected
    # @option options [ Hash ] :open_options Options passed to Kernel.open
    def initialize(filename, options={})
      # Work on a copy: the keys consumed below must not disappear from the
      # hash the caller handed in.
      options = (options || {}).dup

      @gzipped = options.delete(:gzipped)

      load(filename, options.delete(:open_options))

      @options = default_options.merge(options)

      @options[:encoding] ||= detect_encoding
      @detected_encoding  ||= Encoding.find(@options[:encoding])

      # When the encoding was given, detection did not reopen the tempfile.
      tempfile.reopen(detected_mode) if tempfile.closed?

      @options[:col_sep] ||= detect_column_separator

      super(::CSV.new(tempfile, @options))
    end

    # Counts the number of rows in the file, even if it has already been read
    #
    # @return [ Integer ] the number of rows in the file
    def total_count(&block)
      rewind
      count(&block)
    ensure
      rewind
    end

    #
    # Yields each row of the data source in turn, skipping blanks and rows with no data.
    #
    # Support for Enumerable.
    #
    # The data source must be open for reading.
    #
    # @return [ Enumerator, nil ] an enumerator when called without a block
    def each(&block)
      return to_enum unless block

      while row = shift
        yield row unless skip_blanks? && row_with_no_data?(row)
      end
    end

    # Process a range of lines in the CSV file.
    #
    # @example Process 1000 lines starting from the line 2000
    #   csv.process_range(offset: 2000, limit: 1000) do |row, index|
    #     # process range here
    #   end
    #
    # @param [ Hash ] options A hash with offset and/or limit
    #
    # @option options [ Integer ] :offset The offset from which the process should start
    # @option options [ Integer ] :limit  The number of rows to process
    #
    # @return [ Enumerable ] CSV's enumerable, or an Enumerator when no block is given
    def process_range(options={}, &block)
      # Mirror #each: without a block hand back an enumerator instead of
      # failing on the first yield.
      return to_enum(:process_range, options) unless block

      options ||= {}

      offset = options[:offset] || 0
      limit  = options[:limit]  || -1 # a negative limit means "no limit"

      begin
        rewind
        each_with_index do |row, index|
          next if index < offset
          break if limit >= 0 && index >= offset + limit

          yield row, index
        end
      ensure
        # Always leave the file at its start so it can be processed again.
        rewind
      end
    end

    # Returns true when the file is gzipped, false otherwise
    def gzipped?
      @gzipped
    end

    private

    # Compression is only detected when the caller did not say anything about it.
    def detect_compression?
      @gzipped.nil?
    end

    # Returns true when every column of the row is nil or empty.
    def row_with_no_data?(row)
      row = row.fields if row.respond_to?(:fields)
      row.all? { |column| column.nil? || column.empty? }
    end

    # Copies the (possibly decompressed) source into the tempfile, line by line.
    #
    # Encoding.default_internal is cleared while copying and restored afterwards.
    # Cleanup only touches what was actually set up, so a failure to open the
    # source surfaces as the original error.
    def load(filename, open_options)
      raw_io    = ::Kernel.open(filename, 'rb', open_options || {})
      loaded_io = decompress(raw_io)
      loaded_io.rewind

      @original_default_internal = Encoding.default_internal
      Encoding.default_internal = nil
      default_internal_overridden = true

      loaded_io.each do |line|
        tempfile.write(line)
      end
    ensure
      if raw_io
        tempfile.close
        # Closing the gzip reader also closes the io it wraps.
        open_io = loaded_io || raw_io
        open_io.close unless open_io.closed?
      end

      Encoding.default_internal = @original_default_internal if default_internal_overridden
    end

    # Wraps the io in a gzip reader when the file is (or may be) gzipped.
    #
    # The reader wraps the given io directly, so no second handle is opened.
    # When the content turns out not to be gzipped the original io is
    # returned; the caller rewinds it before reading.
    def decompress(loaded_io)
      unless detect_compression? || gzipped?
        @gzipped = false
        return loaded_io
      end

      decompressed_io = Zlib::GzipReader.new(loaded_io)
      decompressed_io.getc # attempt to read from a compressed io
      @gzipped = true
      decompressed_io
    rescue Zlib::Error
      # not a compressed io, just returning the loaded io instead
      @gzipped = false
      loaded_io
    end

    # We open the file and try to read each line of it, if there is an
    # invalid byte sequence, an ArgumentError exception will be thrown.
    #
    # We then assume that the file is in ISO-8859-1 encoding, and transcode
    # it to UTF-8. Though its ugly, this was the only way to detect whether
    # a file was using one of these encodings.
    def detect_encoding
      tempfile.reopen('r:utf-8')
      tempfile.each(&:split) # raises ArgumentError when it has non-ascii characters that are not in UTF-8

      @detected_encoding = Encoding.find('utf-8')
    rescue ArgumentError
      tempfile.reopen('r:iso-8859-1:utf-8')
      @detected_encoding = Encoding.find('iso-8859-1')
    ensure
      tempfile.rewind
    end

    def detected_utf_8?
      detected_encoding == Encoding.find('utf-8')
    end

    # File mode used to reopen the tempfile according to the detected encoding.
    def detected_mode
      detected_utf_8? ? 'r:utf-8' : 'r:iso-8859-1:utf-8'
    end

    # Uses ';' when the first line has more than one ';'-separated column,
    # falling back to ',' otherwise (including when the file is empty).
    def detect_column_separator
      first_line = tempfile.gets.to_s # gets returns nil on an empty file
      @col_sep = first_line.split(';').size > 1 ? ';' : ','
    ensure
      tempfile.rewind
    end

    def default_options
      {
        skip_blanks: true
      }
    end

    # Lazily created temporary file holding the loaded contents.
    def tempfile
      @tempfile ||= FileProcessor::Tempfile.new
    end
  end
end
@@ -0,0 +1,20 @@
1
module FileProcessor
  # A ::Tempfile that can be reopened in a different mode (e.g. with another
  # encoding) while keeping the same path.
  #
  # NOTE(review): #path and #reopen read and write @tmpname, @opts, @data and
  # @tmpfile, which appear to be ::Tempfile internals -- confirm they exist
  # in the Ruby version this gem targets.
  class Tempfile < ::Tempfile
    # @param [ String ] basename Prefix of the generated file name
    # @param [ Array ] args Remaining arguments forwarded to ::Tempfile
    def initialize(basename='file-processor', *args)
      super(basename, *args)
    end

    # @return [ String ] the stored temporary file name
    def path
      @tmpname
    end

    # Closes the current file (when still open) and opens the same path
    # again using the given mode, making the new file the delegated object.
    #
    # @param [ String ] mode A File.open mode, such as 'r:utf-8'
    def reopen(mode)
      close unless closed?
      @mode = mode

      reopened = File.open(path, mode, @opts)
      @tmpfile = reopened
      @data[1] = reopened
      __setobj__(reopened)
    end
  end
end
@@ -0,0 +1,3 @@
1
module FileProcessor
  # Version of the gem, read by the gemspec. Frozen so it cannot be mutated
  # at runtime.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,12 @@
1
+ require "file_processor/version"
2
+ require "delegate"
3
+ require "csv"
4
+ require "zlib"
5
+ require "open-uri"
6
+ require "tempfile"
7
+
8
+ module FileProcessor
9
+ end
10
+
11
+ require "file_processor/temp_file"
12
+ require "file_processor/csv"
@@ -0,0 +1,449 @@
1
+ require "spec_helper"
2
+
3
+ describe FileProcessor::CSV do
4
+ let(:filename) { fixture('base.csv') }
5
+ let(:options) { {} }
6
+
7
+ subject(:processor) { FileProcessor::CSV.new(filename, options) }
8
+
9
+ it "delegates to a CSV instance" do
10
+ processor.__getobj__.should be_a(::CSV)
11
+ end
12
+
13
+ describe "#col_sep" do
14
+ context "when it is not given" do
15
+ context "and the first line of the file has more than one header column separated with a semi-colon" do
16
+ it "detects it properly" do
17
+ processor.col_sep.should eq(';')
18
+ end
19
+ end
20
+
21
+ context "and the first line of the file has more than one header column separated with a comman" do
22
+ let(:filename) { fixture('base-with-comma-separated-header.csv') }
23
+
24
+ it "detects it properly" do
25
+ processor.col_sep.should eq(',')
26
+ end
27
+ end
28
+
29
+ context "and an unknown column separator is used" do
30
+ let(:filename) { fixture('base-with-unknown-column-separator.csv') }
31
+
32
+ it "does not detects it, falling back to the default one" do
33
+ processor.col_sep.should eq(',')
34
+ end
35
+ end
36
+
37
+ context "and the file has non-ascii characters in its first line" do
38
+ context "in UTF-8" do
39
+ let(:filename) { fixture('base-non-ascii-characters-in-header-utf-8.csv') }
40
+
41
+ it "detects it properly" do
42
+ processor.col_sep.should eq(';')
43
+ end
44
+ end
45
+
46
+ context "in ISO-8859-1" do
47
+ let(:filename) { fixture('base-non-ascii-characters-in-header-iso-8859-1.csv') }
48
+
49
+ it "detects it properly" do
50
+ processor.col_sep.should eq(';')
51
+ end
52
+ end
53
+ end
54
+ end
55
+
56
+ context "when it is given" do
57
+ let(:options) { { col_sep: '|' } }
58
+
59
+ it "uses the given col_sep" do
60
+ processor.col_sep.should eq('|')
61
+ end
62
+ end
63
+ end
64
+
65
+ describe "#count" do
66
+ it "returns the number of rows in the CSV" do
67
+ processor.count.should eq(5)
68
+ end
69
+
70
+ context "when the file has new line characters in a field, but it is properly quoted" do
71
+ let(:filename) { fixture('base-new-line-in-field.csv') }
72
+
73
+ it "returns the correct number of rows in the CSV" do
74
+ processor.count.should eq(3)
75
+ end
76
+ end
77
+
78
+ context "when the file has blank lines" do
79
+ let(:filename) { fixture('base-with-blank-lines.csv') }
80
+
81
+ it "skips them by default" do
82
+ processor.count.should eq(5)
83
+ end
84
+ end
85
+
86
+ context "when the file has lines with no data" do
87
+ let(:filename) { fixture('base-with-lines-with-no-data.csv') }
88
+
89
+ it "does not count them" do
90
+ processor.count.should eq(5)
91
+ end
92
+
93
+ context "but skip_blanks is false" do
94
+ let(:options) { { skip_blanks: false } }
95
+
96
+ it "does counts them" do
97
+ processor.count.should eq(7)
98
+ end
99
+ end
100
+
101
+ context "and { headers: true } is passed" do
102
+ let(:options) { { headers: true } }
103
+
104
+ it "does not count these lines, as well as the header" do
105
+ processor.count.should eq(4)
106
+ end
107
+ end
108
+ end
109
+
110
+ context "when a block is passed" do
111
+ let(:filename) { fixture('base-with-lines-with-no-data.csv') }
112
+
113
+ it "returns the number of lines for which the block evaluates to true, properly handling lines with no data" do
114
+ processor.count { |row| !row.first.nil? }.should eq(3)
115
+ end
116
+ end
117
+ end
118
+
119
+ describe "#total_count" do
120
+ it "works as count, but returns all rows, even when called multiple times, since it rewinds the io file" do
121
+ processor.total_count
122
+ processor.total_count.should eq(5)
123
+ end
124
+ end
125
+
126
+ describe "#each" do
127
+ it "returns an enumerator when called without a block" do
128
+ processor.each.should be_a(Enumerator)
129
+ end
130
+
131
+ context "when the file has lines with no data" do
132
+ let(:filename) { fixture('base-with-lines-with-no-data.csv') }
133
+
134
+ it "does not yields these lines" do
135
+ expect { |block|
136
+ processor.each(&block)
137
+ }.to yield_control.exactly(5).times
138
+ end
139
+
140
+ context "but skip_blanks is false" do
141
+ let(:options) { { skip_blanks: false } }
142
+
143
+ it "yields these lines" do
144
+ expect { |block|
145
+ processor.each(&block)
146
+ }.to yield_control.exactly(7).times
147
+ end
148
+ end
149
+
150
+ context "and { headers: true } is passed" do
151
+ let(:options) { { headers: true } }
152
+
153
+ it "does not yields these lines, as well as the header" do
154
+ expect { |block|
155
+ processor.each(&block)
156
+ }.to yield_control.exactly(4).times # header do not count here
157
+ end
158
+ end
159
+ end
160
+ end
161
+
162
+ describe "encoding" do
163
+ it "can iterate through all of its contents without raising an error" do
164
+ expect {
165
+ processor.each {}
166
+ }.to_not raise_error
167
+ end
168
+
169
+ context "when the encoding is given" do
170
+ let(:filename) { fixture('base-utf-8.csv') }
171
+ let(:options) { { encoding: 'utf-8' } }
172
+
173
+ its(:detected_encoding) { should eq(Encoding.find(options[:encoding])) }
174
+
175
+ it "opens the file properly" do
176
+ expect {
177
+ processor
178
+ }.to_not raise_error
179
+ end
180
+
181
+ context "and the file is ISO-8859-1" do
182
+ let(:filename) { fixture('base-iso-8859-1.csv') }
183
+
184
+ it "uses it to open the file, raising an error" do
185
+ expect {
186
+ processor
187
+ }.to raise_error
188
+ end
189
+
190
+ context "but the given encoding is ISO-8859-1" do
191
+ let(:options) { { encoding: 'ISO-8859-1' } }
192
+
193
+ its(:detected_encoding) { should eq(Encoding.find(options[:encoding])) }
194
+
195
+ it "opens the file properly" do
196
+ expect {
197
+ processor
198
+ }.to_not raise_error
199
+ end
200
+ end
201
+ end
202
+ end
203
+
204
+ context "when the file is in US-ASCII" do
205
+ its(:detected_encoding) { should eq(Encoding.find('utf-8')) }
206
+
207
+ it "reads it with utf-8" do
208
+ processor.encoding.should eq(Encoding.find('utf-8'))
209
+ end
210
+ end
211
+
212
+ context "when the file can be read in utf-8" do
213
+ let(:filename) { fixture('base-utf-8.csv') }
214
+
215
+ its(:detected_encoding) { should eq(Encoding.find('utf-8')) }
216
+
217
+ it "properly detects it" do
218
+ processor.encoding.should eq(Encoding.find('utf-8'))
219
+ end
220
+
221
+ it "can iterate through all of its contents without raising an error" do
222
+ expect {
223
+ processor.each {}
224
+ }.to_not raise_error
225
+ end
226
+ end
227
+
228
+ context "when the file cannot be read in utf-8" do
229
+ context "but it can be read in iso-8859-1" do
230
+ let(:filename) { fixture('base-iso-8859-1.csv') }
231
+
232
+ its(:detected_encoding) { should eq(Encoding.find('iso-8859-1')) }
233
+
234
+ it "properly detects it, transcoding it to utf-8" do
235
+ processor.encoding.should eq(Encoding.find('utf-8'))
236
+ end
237
+
238
+ it "can iterate through all of its contents without raising an error" do
239
+ expect {
240
+ processor.each {}
241
+ }.to_not raise_error
242
+ end
243
+
244
+ context "and no look-ahead is used" do
245
+ let(:options) { { row_sep: "\n" } }
246
+
247
+ its(:detected_encoding) { should eq(Encoding.find('iso-8859-1')) }
248
+
249
+ it "properly detects it, transcoding it to utf-8" do
250
+ processor.encoding.should eq(Encoding.find('utf-8'))
251
+ end
252
+
253
+ it "can iterate through all of its contents without raising an error" do
254
+ expect {
255
+ processor.each {}
256
+ }.to_not raise_error
257
+ end
258
+ end
259
+ end
260
+ end
261
+ end
262
+
263
+ describe "gzip support" do
264
+ let(:filename) { fixture('base.csv.gz') }
265
+
266
+ it "detects that the file is gzipped and decompress it" do
267
+ processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
268
+ end
269
+
270
+ it { should be_gzipped }
271
+
272
+ context "when the file is in ISO-8859-1 encoding" do
273
+ let(:filename) { fixture('base-iso-8859-1.csv.gz') }
274
+
275
+ it "detects that the file is gzipped and decompress it" do
276
+ processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
277
+ end
278
+
279
+ it { should be_gzipped }
280
+ end
281
+
282
+ context "when { gzipped: false } options is passed" do
283
+ let(:options) { { gzipped: false } }
284
+
285
+ context "and the file is not gzipped" do
286
+ let(:filename) { fixture('base.csv') }
287
+
288
+ it { should_not be_gzipped }
289
+
290
+ it "does not raise an error" do
291
+ expect {
292
+ processor.shift
293
+ }.to_not raise_error
294
+ end
295
+ end
296
+
297
+ context "and the file is gzipped" do
298
+ it "does not attempt to detect it, reading data as it were UTF-8" do
299
+ processor.shift.should_not eq(['A', 'B', 'C'])
300
+ end
301
+ end
302
+ end
303
+
304
+ context "when { gzipped: true } option is passed" do
305
+ let(:options) { { gzipped: true } }
306
+
307
+ context "and the file is not gzipped" do
308
+ let(:filename) { fixture('base.csv') }
309
+
310
+ it { should_not be_gzipped }
311
+
312
+ it "does not raise an error" do
313
+ expect {
314
+ processor.shift
315
+ }.to_not raise_error
316
+ end
317
+ end
318
+
319
+ context "and the file is gzipped" do
320
+ it "properly assumes that the file is gzipped and decompress it" do
321
+ processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
322
+ end
323
+ end
324
+ end
325
+ end
326
+
327
+ describe "#process_range" do
328
+ it "yields every line of the file by default" do
329
+ expect { |block|
330
+ processor.process_range(&block)
331
+ }.to yield_control.exactly(5).times
332
+ end
333
+
334
+ it "yields the row and its index" do
335
+ expect { |block|
336
+ processor.process_range(&block)
337
+ }.to yield_successive_args(
338
+ [["A", "B", "C"], 0],
339
+ [["a1", "b1", "c1"], 1],
340
+ [["a2", "b2", "c2"], 2],
341
+ [["a3", "b3", "c3"], 3],
342
+ [["a4", "b4", "c4"], 4]
343
+ )
344
+ end
345
+
346
+ it "rewinds the file, so it can be called multiple times" do
347
+ processor.process_range {}
348
+
349
+ expect { |block|
350
+ processor.process_range(&block)
351
+ }.to yield_successive_args(
352
+ [["A", "B", "C"], 0],
353
+ [["a1", "b1", "c1"], 1],
354
+ [["a2", "b2", "c2"], 2],
355
+ [["a3", "b3", "c3"], 3],
356
+ [["a4", "b4", "c4"], 4]
357
+ )
358
+ end
359
+
360
+ context "when an offset is given" do
361
+ let(:offset) { 2 }
362
+
363
+ it "starts from this offset" do
364
+ expect { |block|
365
+ processor.process_range(offset: offset, &block)
366
+ }.to yield_successive_args(
367
+ [["a2", "b2", "c2"], 2],
368
+ [["a3", "b3", "c3"], 3],
369
+ [["a4", "b4", "c4"], 4]
370
+ )
371
+ end
372
+
373
+ context "and it is equal to the number of lines of the file" do
374
+ let(:offset) { processor.count }
375
+
376
+ it "does not yield" do
377
+ expect { |block|
378
+ processor.process_range(offset: offset, &block)
379
+ }.to_not yield_control
380
+ end
381
+ end
382
+
383
+ context "and it is greater than to the number of lines of the file" do
384
+ let(:offset) { processor.count + 1 }
385
+
386
+ it "does not yield" do
387
+ expect { |block|
388
+ processor.process_range(offset: offset, &block)
389
+ }.to_not yield_control
390
+ end
391
+ end
392
+ end
393
+
394
+ context "when a limit is given" do
395
+ let(:limit) { 2 }
396
+
397
+ it "yields only the number of rows given" do
398
+ expect { |block|
399
+ processor.process_range(limit: limit, &block)
400
+ }.to yield_successive_args(
401
+ [["A", "B", "C"], 0],
402
+ [["a1", "b1", "c1"], 1]
403
+ )
404
+ end
405
+
406
+ context "with zero" do
407
+ let(:limit) { 0 }
408
+
409
+ it "does not yield" do
410
+ expect { |block|
411
+ processor.process_range(limit: limit, &block)
412
+ }.to_not yield_control
413
+ end
414
+ end
415
+
416
+ context "with an offset" do
417
+ let(:offset) { 2 }
418
+
419
+ it "yields only the number of rows given, from the given offset" do
420
+ expect { |block|
421
+ processor.process_range(offset: offset, limit: limit, &block)
422
+ }.to yield_successive_args(
423
+ [["a2", "b2", "c2"], 2],
424
+ [["a3", "b3", "c3"], 3]
425
+ )
426
+ end
427
+ end
428
+ end
429
+ end
430
+
431
# .open is exercised against a double so no file is touched: only the
# create / yield / close protocol is verified here.
describe ".open" do
  subject(:processor) { double(FileProcessor::CSV, close: true) }
  before { FileProcessor::CSV.stub(:new).with(filename, options).and_return(processor) }

  context "without a block" do
    it "creates a new instance and returns it" do
      FileProcessor::CSV.open(filename, options).should eq(processor)
    end

    it "leaves the instance open" do
      processor.should_not_receive(:close)
      FileProcessor::CSV.open(filename, options)
    end
  end

  context "with a block" do
    it "creates a new instance and yields it" do
      expect { |block|
        FileProcessor::CSV.open(filename, options, &block)
      }.to yield_with_args(processor)
    end

    it "closes the instance after the block returns" do
      processor.should_receive(:close)
      FileProcessor::CSV.open(filename, options) {}
    end

    it "closes the instance even when the block raises" do
      processor.should_receive(:close)

      expect {
        FileProcessor::CSV.open(filename, options) { raise "boom" }
      }.to raise_error("boom")
    end
  end
end
449
+ end
@@ -0,0 +1,51 @@
1
require "spec_helper"

# Specs for the reopenable temporary file used by FileProcessor::CSV.
describe FileProcessor::Tempfile do
  subject(:temp_file) { FileProcessor::Tempfile.new }
  let(:generated_path) { File.join(Dir.tmpdir, 'some-path') }

  it "creates the file" do
    # File.exist? is used instead of the deprecated File.exists? alias.
    File.exist?(temp_file.path).should be_true
  end

  it "opens file ready to be written" do
    expect {
      temp_file << "some content"
    }.to_not raise_error
  end

  describe "#path" do
    it "is generated using 'file-processor' basename" do
      temp_file.path.start_with?(File.join(Dir.tmpdir, 'file-processor')).should be_true
    end
  end

  describe "#reopen" do
    # Captured eagerly so it refers to the file that was open before #reopen runs.
    let!(:old_file) { temp_file.__getobj__ }

    it "closes the old file" do
      old_file.should_receive(:close)
      temp_file.reopen('r')
    end

    it "updates the delegated object" do
      temp_file.reopen('r')
      temp_file.__getobj__.should_not eq(old_file)
      temp_file.__getobj__.should be_a(File)
    end

    it "reopens the path with the given mode" do
      temp_file.stub(:path).and_return(generated_path)
      # 384 is 0600 in decimal
      File.should_receive(:open).with(generated_path, 'r:utf-8', 384)
      temp_file.reopen('r:utf-8')
    end

    context "when the old file is already closed" do
      it "does not closes the old file" do
        old_file.close
        old_file.should_not_receive(:close)
        temp_file.reopen('r')
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
require "bundler/setup"

# Coverage is started before the library is loaded.
require 'simplecov'
SimpleCov.start

Bundler.require :default

# Load every support file (helpers, shared configuration) under spec/support.
root = File.expand_path('../..', __FILE__)
support_files = Dir.glob(File.join(root, "spec", "support", "**", "*.rb"))
support_files.each do |support_file|
  require support_file
end

RSpec.configure do |config|
  config.treat_symbols_as_metadata_keys_with_true_values = true
  # Examples tagged with :focus run alone; when none is tagged, everything runs.
  config.run_all_when_everything_filtered = true
  config.filter_run :focus
end
@@ -0,0 +1,2 @@
1
+ A;B;C
2
+ �;b;c
@@ -0,0 +1,9 @@
1
+ A;B;C
2
+ "This is
3
+ a field
4
+
5
+ with new lines";This one has no new line;"But this one has
6
+ also
7
+
8
+ some new lines"
9
+ Now we have a new row;With data;And more data
@@ -0,0 +1,2 @@
1
+ A;B;C
2
+ á;b;c
@@ -0,0 +1,7 @@
1
+ A;B;C
2
+ a1;b1;c1
3
+ a2;b2;c2
4
+
5
+ a3;b3;c3
6
+
7
+ a4;b4;c4
@@ -0,0 +1,7 @@
1
+ A;B;C
2
+ a1;;
3
+ ;;
4
+ ;;c2
5
+ ;b3;
6
+ ;;
7
+ a4;b4;c4
@@ -0,0 +1,5 @@
1
+ A;B;C
2
+ a1;b1;c1
3
+ a2;b2;c2
4
+ a3;b3;c3
5
+ a4;b4;c4
Binary file
@@ -0,0 +1,13 @@
1
# Helpers for locating fixture files from within the specs.
module FixturesSupport
  # @return [ String ] the directory three levels above this file
  def root_path
    File.expand_path('../../..', __FILE__)
  end

  # @param [ String ] filename Name of a file under spec/support/fixtures
  #
  # @return [ String ] the full path of the given fixture
  def fixture(filename)
    fixtures_dir = File.join(root_path, "/spec/support/fixtures")
    File.join(fixtures_dir, filename)
  end
end
10
+
11
# Make the fixture helpers available in every example group.
RSpec.configure { |config| config.include FixturesSupport }
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: file_processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Vicente Mundim
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-14 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: A more powerful CSV file processor
15
+ email:
16
+ - vicente.mundim@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - .rspec
23
+ - .rvmrc
24
+ - .travis.yml
25
+ - Gemfile
26
+ - Gemfile.lock
27
+ - LICENSE.txt
28
+ - README.md
29
+ - Rakefile
30
+ - file_processor.gemspec
31
+ - lib/file_processor.rb
32
+ - lib/file_processor/csv.rb
33
+ - lib/file_processor/temp_file.rb
34
+ - lib/file_processor/version.rb
35
+ - spec/file_processor/csv_spec.rb
36
+ - spec/file_processor/temp_file_spec.rb
37
+ - spec/spec_helper.rb
38
+ - spec/support/fixtures.rb
39
+ - spec/support/fixtures/base-iso-8859-1.csv
40
+ - spec/support/fixtures/base-iso-8859-1.csv.gz
41
+ - spec/support/fixtures/base-new-line-in-field.csv
42
+ - spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv
43
+ - spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv
44
+ - spec/support/fixtures/base-utf-8.csv
45
+ - spec/support/fixtures/base-with-blank-lines.csv
46
+ - spec/support/fixtures/base-with-comma-separated-header.csv
47
+ - spec/support/fixtures/base-with-lines-with-no-data.csv
48
+ - spec/support/fixtures/base-with-unknown-column-separator.csv
49
+ - spec/support/fixtures/base.csv
50
+ - spec/support/fixtures/base.csv.gz
51
+ homepage:
52
+ licenses: []
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ segments:
64
+ - 0
65
+ hash: -785318963478006114
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ segments:
73
+ - 0
74
+ hash: -785318963478006114
75
+ requirements: []
76
+ rubyforge_project:
77
+ rubygems_version: 1.8.25
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: A more powerful CSV file processor
81
+ test_files:
82
+ - spec/file_processor/csv_spec.rb
83
+ - spec/file_processor/temp_file_spec.rb
84
+ - spec/spec_helper.rb
85
+ - spec/support/fixtures.rb
86
+ - spec/support/fixtures/base-iso-8859-1.csv
87
+ - spec/support/fixtures/base-iso-8859-1.csv.gz
88
+ - spec/support/fixtures/base-new-line-in-field.csv
89
+ - spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv
90
+ - spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv
91
+ - spec/support/fixtures/base-utf-8.csv
92
+ - spec/support/fixtures/base-with-blank-lines.csv
93
+ - spec/support/fixtures/base-with-comma-separated-header.csv
94
+ - spec/support/fixtures/base-with-lines-with-no-data.csv
95
+ - spec/support/fixtures/base-with-unknown-column-separator.csv
96
+ - spec/support/fixtures/base.csv
97
+ - spec/support/fixtures/base.csv.gz