file_processor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ .DS_Store
2
+ *.gem
3
+ *.rbc
4
+ .bundle
5
+ .config
6
+ .yardoc
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,4 @@
1
+ --color
2
+ --format documentation
3
+ --drb
4
+ --debugger
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm 1.9.3@file_processor --create
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in file_processor.gemspec
4
+ gemspec
5
+
6
+ gem 'rake'
7
+
8
+ group(:development) do
9
+ gem 'debugger'
10
+ end
11
+
12
+ group(:test) do
13
+ gem 'rspec', "~> 2.14.0.rc1"
14
+ gem 'simplecov'
15
+ gem 'json', '~> 1.7.7'
16
+ end
17
+
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ file_processor (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ columnize (0.3.6)
10
+ debugger (1.6.0)
11
+ columnize (>= 0.3.1)
12
+ debugger-linecache (~> 1.2.0)
13
+ debugger-ruby_core_source (~> 1.2.1)
14
+ debugger-linecache (1.2.0)
15
+ debugger-ruby_core_source (1.2.2)
16
+ diff-lcs (1.2.4)
17
+ json (1.7.7)
18
+ multi_json (1.7.4)
19
+ rake (10.0.4)
20
+ rspec (2.14.0.rc1)
21
+ rspec-core (= 2.14.0.rc1)
22
+ rspec-expectations (= 2.14.0.rc1)
23
+ rspec-mocks (= 2.14.0.rc1)
24
+ rspec-core (2.14.0.rc1)
25
+ rspec-expectations (2.14.0.rc1)
26
+ diff-lcs (>= 1.1.3, < 2.0)
27
+ rspec-mocks (2.14.0.rc1)
28
+ simplecov (0.7.1)
29
+ multi_json (~> 1.0)
30
+ simplecov-html (~> 0.7.1)
31
+ simplecov-html (0.7.1)
32
+
33
+ PLATFORMS
34
+ ruby
35
+
36
+ DEPENDENCIES
37
+ debugger
38
+ file_processor!
39
+ json (~> 1.7.7)
40
+ rake
41
+ rspec (~> 2.14.0.rc1)
42
+ simplecov
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Vicente Mundim
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # FileProcessor
2
+
3
+ [![Build Status](https://travis-ci.org/dtmconsultoria/file_processor.png)](https://travis-ci.org/dtmconsultoria/file_processor)
4
+
5
+ A more powerful CSV file processor
6
+
7
+ ## Installation
8
+
9
+ FileProcessor uses the new CSV library introduced in Ruby 1.9.3, thus it is only compatible with this Ruby version.
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'file_processor'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install file_processor
22
+
23
+ ## Usage
24
+
25
+ Use it as you would use Ruby's CSV:
26
+
27
+ FileProcessor::CSV.open(filename, options) do |csv|
28
+ csv.each do |row|
29
+ # process row here
30
+ end
31
+ end # automatically closes the file
32
+
33
+ FileProcessor::CSV is just a wrapper around Ruby's CSV, so you can manipulate it as you would manipulate Ruby's CSV.
34
+
35
+ You can also use `FileProcessor::CSV#process_range` to process a range in the file:
36
+
37
+ FileProcessor::CSV.open(filename, options) do |csv|
38
+ csv.process_range(offset: 2000, limit: 1000) do |row, index|
39
+ # yields 1000 rows starting from line 2000 (i.e., from line 2000 to line 2999)
40
+ end
41
+ end # automatically closes the file
42
+
43
+ Here are the added features:
44
+
45
+ * Auto-detect encoding of UTF-8 and ISO-8859-1 (Latin1) files.
46
+ * Auto-detect the column separator (`col_sep` option) when not given.
47
+ * Skip lines without data when `skip_blanks` is `true`, which is turned on by default. This means that count will not take these lines into account. Also skips them when iterating through lines.
48
+ * Detects if a file is gzipped, and decompresses it for you automatically.
49
+
50
+ ## Contributing
51
+
52
+ 1. Fork it
53
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
54
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
55
+ 4. Push to the branch (`git push origin my-new-feature`)
56
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core/rake_task'
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task :default => :spec
@@ -0,0 +1,18 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for file_processor.
#
# The version is read from lib/file_processor/version.rb, so lib/ is put on
# the load path before requiring it.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'file_processor/version'

Gem::Specification.new do |gem|
  gem.name          = "file_processor"
  gem.version       = FileProcessor::VERSION
  gem.authors       = ["Vicente Mundim"]
  gem.email         = ["vicente.mundim@gmail.com"]
  gem.description   = %q{A more powerful CSV file processor}
  gem.summary       = %q{A more powerful CSV file processor}

  # The package ships an MIT LICENSE.txt; declare it so the published
  # metadata does not end up with an empty license list.
  gem.license       = "MIT"

  # The README states that the library relies on Ruby 1.9.3, and the code
  # uses Ruby 1.9 hash literal syntax.
  gem.required_ruby_version = ">= 1.9.3"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,186 @@
1
module FileProcessor
  # Wrapper around Ruby's CSV that copies the source into a temporary file,
  # optionally decompressing it, detects its encoding (UTF-8 or ISO-8859-1)
  # and column separator, and then delegates to a ::CSV instance built on
  # that temporary file.
  class CSV < SimpleDelegator
    include Enumerable

    # Opens a file and yields it, ensuring that it is properly closed.
    #
    # @param [ Array ] args Arguments forwarded to the constructor
    #
    # @return [ Object ] the new instance when no block is given, otherwise
    #   the value returned by the block
    def self.open(*args)
      instance = new(*args)
      return instance unless block_given?

      begin
        yield instance
      ensure
        instance.close if instance
      end
    end

    attr_accessor :detected_encoding

    # @param [ String ] filename Anything accepted by Kernel.open
    # @param [ Hash ] options Options for ::CSV, plus the ones below
    #
    # @option options [ true, false, nil ] :gzipped Forces (true) or disables
    #   (false) decompression; when nil compression is detected
    # @option options [ Hash ] :open_options Options passed to Kernel.open
    def initialize(filename, options={})
      # Work on a copy: the deletes below must not strip keys from the
      # caller's hash.
      options = (options || {}).dup

      @gzipped = options.delete(:gzipped)

      load(filename, options.delete(:open_options))

      @options = default_options.merge(options)

      @options[:encoding] ||= detect_encoding
      @detected_encoding  ||= Encoding.find(@options[:encoding])

      # detect_encoding leaves the tempfile open; when the encoding was given
      # it is still closed from load and has to be reopened here.
      tempfile.reopen(detected_mode) if tempfile.closed?

      @options[:col_sep] ||= detect_column_separator

      super(::CSV.new(tempfile, @options))
    end

    # Counts the number of rows in the file, even if it has already been read
    #
    # @return [ Integer ] the number of rows in the file
    def total_count(&block)
      rewind
      count(&block)
    ensure
      rewind
    end

    #
    # Yields each row of the data source in turn, skipping blanks and rows with no data.
    #
    # Support for Enumerable.
    #
    # The data source must be open for reading.
    #
    def each
      return to_enum unless block_given?

      while (row = shift)
        yield row unless skip_blanks? && row_with_no_data?(row)
      end
    end

    # Process a range of lines in the CSV file.
    #
    # @example Process 1000 lines starting from the line 2000
    #   csv.process_range(offset: 2000, limit: 1000) do |row, index|
    #     # process range here
    #   end
    #
    # @param [ Hash ] options A hash with offset and/or limit
    #
    # @option options [ Integer ] :offset The offset from which the process should start
    # @option options [ Integer ] :limit The number of rows to process
    #
    # @return [ Enumerable ] CSV's enumerable, or an Enumerator over
    #   [row, index] pairs when no block is given
    def process_range(options={})
      options ||= {}

      # Mirror #each: without a block hand back an enumerator instead of
      # failing on the first yield.
      return to_enum(:process_range, options) unless block_given?

      offset = options[:offset] || 0
      limit  = options[:limit]  || -1 # a negative limit means "no limit"

      begin
        rewind
        each_with_index do |row, index|
          next if index < offset
          break if limit >= 0 && index >= offset + limit

          yield row, index
        end
      ensure
        rewind
      end
    end

    # Returns true when the file is gzipped, false otherwise
    def gzipped?
      @gzipped
    end

    private

    # Compression is auto-detected only when :gzipped was not given.
    def detect_compression?
      @gzipped.nil?
    end

    # A row has no data when every field is nil or empty. Fields that do not
    # respond to empty? are always treated as data.
    def row_with_no_data?(row)
      row = row.fields if row.respond_to?(:fields)
      row.all? do |column|
        column.nil? || (column.respond_to?(:empty?) && column.empty?)
      end
    end

    # Copies the (possibly decompressed) source into the tempfile, line by
    # line, and closes both ends afterwards.
    def load(filename, open_options)
      # Captured before anything can fail, so the ensure clause never
      # restores an unset value over the global setting.
      @original_default_internal = Encoding.default_internal

      # NOTE(review): Kernel.open runs a command when the name starts with
      # "|" - make sure filename never comes from untrusted input.
      raw_io    = ::Kernel.open(filename, 'rb', open_options || {})
      loaded_io = decompress(raw_io)
      loaded_io.rewind

      Encoding.default_internal = nil

      loaded_io.each do |line|
        tempfile.write(line)
      end
    ensure
      tempfile.close
      loaded_io.close if loaded_io
      # Only still open here when decompress failed unexpectedly.
      raw_io.close if raw_io && !raw_io.closed?
      Encoding.default_internal = @original_default_internal
    end

    # Wraps the io in a gzip reader when compression is forced or has to be
    # detected, falling back to the io itself when it is not gzipped.
    def decompress(loaded_io)
      unless detect_compression? || gzipped?
        @gzipped = false
        return loaded_io
      end

      # GzipReader.new reads from the io that is already open, so no second
      # handle is opened and ios without a path are supported too.
      Zlib::GzipReader.new(loaded_io).tap do |decompressed_io|
        decompressed_io.getc # attempt to read from a compressed io
        @gzipped = true
      end
    rescue Zlib::Error
      # not a compressed io, just returning the loaded io instead
      # (the caller rewinds it before reading)
      @gzipped = false
      loaded_io
    end

    # We open the file and try to read each line of it, if there is an
    # invalid byte sequence, an ArgumentError exception will be thrown.
    #
    # We then assume that the file is in ISO-8859-1 encoding, and transcode
    # it to UTF-8. Though its ugly, this was the only way to detect whether
    # a file was using one of these encodings.
    def detect_encoding
      tempfile.reopen('r:utf-8')
      tempfile.each(&:split) # raises ArgumentError when it has non-ascii characters that are not in UTF-8

      @detected_encoding = Encoding.find('utf-8')
    rescue ArgumentError
      tempfile.reopen('r:iso-8859-1:utf-8')
      @detected_encoding = Encoding.find('iso-8859-1')
    ensure
      tempfile.rewind
    end

    def detected_utf_8?
      detected_encoding == Encoding.find('utf-8')
    end

    # File mode used to reopen the tempfile with the detected encoding.
    def detected_mode
      detected_utf_8? ? 'r:utf-8' : 'r:iso-8859-1:utf-8'
    end

    # Picks ';' when the first line contains one, ',' otherwise. An empty
    # file has no first line (gets returns nil) and falls back to ','.
    def detect_column_separator
      @col_sep = tempfile.gets.to_s.split(';').size > 1 ? ';' : ','
    ensure
      tempfile.rewind
    end

    def default_options
      {
        skip_blanks: true
      }
    end

    def tempfile
      @tempfile ||= FileProcessor::Tempfile.new
    end
  end
end
@@ -0,0 +1,20 @@
1
module FileProcessor
  # Tempfile that defaults its basename to 'file-processor' and can be
  # reopened in a different mode after it was written and closed.
  class Tempfile < ::Tempfile
    # @param [ String ] basename Prefix used for the generated file name
    # @param [ Array ] args Remaining arguments, passed on to ::Tempfile
    def initialize(basename='file-processor', *args)
      super
    end

    # Returns the stored temporary file name.
    # NOTE(review): reads the @tmpname ivar owned by ::Tempfile - confirm the
    # targeted Ruby version still sets it.
    def path
      @tmpname
    end

    # Closes the current file handle (when still open) and opens the same
    # path again with the given mode, making the new handle the delegated
    # object.
    #
    # @param [ String ] mode File mode, e.g. 'r:utf-8'
    def reopen(mode)
      close unless closed?
      @mode = mode

      reopened = File.open(path, mode, @opts)
      @tmpfile = reopened
      @data[1] = reopened
      __setobj__(reopened)
    end
  end
end
@@ -0,0 +1,3 @@
1
module FileProcessor
  # Release number of the gem; the gemspec reads it from here.
  VERSION = '0.1.0'
end
@@ -0,0 +1,12 @@
1
+ require "file_processor/version"
2
+ require "delegate"
3
+ require "csv"
4
+ require "zlib"
5
+ require "open-uri"
6
+ require "tempfile"
7
+
8
+ module FileProcessor
9
+ end
10
+
11
+ require "file_processor/temp_file"
12
+ require "file_processor/csv"
@@ -0,0 +1,449 @@
1
+ require "spec_helper"
2
+
3
+ describe FileProcessor::CSV do
4
+ let(:filename) { fixture('base.csv') }
5
+ let(:options) { {} }
6
+
7
+ subject(:processor) { FileProcessor::CSV.new(filename, options) }
8
+
9
+ it "delegates to a CSV instance" do
10
+ processor.__getobj__.should be_a(::CSV)
11
+ end
12
+
13
+ describe "#col_sep" do
14
+ context "when it is not given" do
15
+ context "and the first line of the file has more than one header column separated with a semi-colon" do
16
+ it "detects it properly" do
17
+ processor.col_sep.should eq(';')
18
+ end
19
+ end
20
+
21
+ context "and the first line of the file has more than one header column separated with a comman" do
22
+ let(:filename) { fixture('base-with-comma-separated-header.csv') }
23
+
24
+ it "detects it properly" do
25
+ processor.col_sep.should eq(',')
26
+ end
27
+ end
28
+
29
+ context "and an unknown column separator is used" do
30
+ let(:filename) { fixture('base-with-unknown-column-separator.csv') }
31
+
32
+ it "does not detects it, falling back to the default one" do
33
+ processor.col_sep.should eq(',')
34
+ end
35
+ end
36
+
37
+ context "and the file has non-ascii characters in its first line" do
38
+ context "in UTF-8" do
39
+ let(:filename) { fixture('base-non-ascii-characters-in-header-utf-8.csv') }
40
+
41
+ it "detects it properly" do
42
+ processor.col_sep.should eq(';')
43
+ end
44
+ end
45
+
46
+ context "in ISO-8859-1" do
47
+ let(:filename) { fixture('base-non-ascii-characters-in-header-iso-8859-1.csv') }
48
+
49
+ it "detects it properly" do
50
+ processor.col_sep.should eq(';')
51
+ end
52
+ end
53
+ end
54
+ end
55
+
56
+ context "when it is given" do
57
+ let(:options) { { col_sep: '|' } }
58
+
59
+ it "uses the given col_sep" do
60
+ processor.col_sep.should eq('|')
61
+ end
62
+ end
63
+ end
64
+
65
+ describe "#count" do
66
+ it "returns the number of rows in the CSV" do
67
+ processor.count.should eq(5)
68
+ end
69
+
70
+ context "when the file has new line characters in a field, but it is properly quoted" do
71
+ let(:filename) { fixture('base-new-line-in-field.csv') }
72
+
73
+ it "returns the correct number of rows in the CSV" do
74
+ processor.count.should eq(3)
75
+ end
76
+ end
77
+
78
+ context "when the file has blank lines" do
79
+ let(:filename) { fixture('base-with-blank-lines.csv') }
80
+
81
+ it "skips them by default" do
82
+ processor.count.should eq(5)
83
+ end
84
+ end
85
+
86
+ context "when the file has lines with no data" do
87
+ let(:filename) { fixture('base-with-lines-with-no-data.csv') }
88
+
89
+ it "does not count them" do
90
+ processor.count.should eq(5)
91
+ end
92
+
93
+ context "but skip_blanks is false" do
94
+ let(:options) { { skip_blanks: false } }
95
+
96
+ it "does counts them" do
97
+ processor.count.should eq(7)
98
+ end
99
+ end
100
+
101
+ context "and { headers: true } is passed" do
102
+ let(:options) { { headers: true } }
103
+
104
+ it "does not count these lines, as well as the header" do
105
+ processor.count.should eq(4)
106
+ end
107
+ end
108
+ end
109
+
110
+ context "when a block is passed" do
111
+ let(:filename) { fixture('base-with-lines-with-no-data.csv') }
112
+
113
+ it "returns the number of lines for which the block evaluates to true, properly handling lines with no data" do
114
+ processor.count { |row| !row.first.nil? }.should eq(3)
115
+ end
116
+ end
117
+ end
118
+
119
+ describe "#total_count" do
120
+ it "works as count, but returns all rows, even when called multiple times, since it rewinds the io file" do
121
+ processor.total_count
122
+ processor.total_count.should eq(5)
123
+ end
124
+ end
125
+
126
+ describe "#each" do
127
+ it "returns an enumerator when called without a block" do
128
+ processor.each.should be_a(Enumerator)
129
+ end
130
+
131
+ context "when the file has lines with no data" do
132
+ let(:filename) { fixture('base-with-lines-with-no-data.csv') }
133
+
134
+ it "does not yields these lines" do
135
+ expect { |block|
136
+ processor.each(&block)
137
+ }.to yield_control.exactly(5).times
138
+ end
139
+
140
+ context "but skip_blanks is false" do
141
+ let(:options) { { skip_blanks: false } }
142
+
143
+ it "yields these lines" do
144
+ expect { |block|
145
+ processor.each(&block)
146
+ }.to yield_control.exactly(7).times
147
+ end
148
+ end
149
+
150
+ context "and { headers: true } is passed" do
151
+ let(:options) { { headers: true } }
152
+
153
+ it "does not yields these lines, as well as the header" do
154
+ expect { |block|
155
+ processor.each(&block)
156
+ }.to yield_control.exactly(4).times # header do not count here
157
+ end
158
+ end
159
+ end
160
+ end
161
+
162
+ describe "encoding" do
163
+ it "can iterate through all of its contents without raising an error" do
164
+ expect {
165
+ processor.each {}
166
+ }.to_not raise_error
167
+ end
168
+
169
+ context "when the encoding is given" do
170
+ let(:filename) { fixture('base-utf-8.csv') }
171
+ let(:options) { { encoding: 'utf-8' } }
172
+
173
+ its(:detected_encoding) { should eq(Encoding.find(options[:encoding])) }
174
+
175
+ it "opens the file properly" do
176
+ expect {
177
+ processor
178
+ }.to_not raise_error
179
+ end
180
+
181
+ context "and the file is ISO-8859-1" do
182
+ let(:filename) { fixture('base-iso-8859-1.csv') }
183
+
184
+ it "uses it to open the file, raising an error" do
185
+ expect {
186
+ processor
187
+ }.to raise_error
188
+ end
189
+
190
+ context "but the given encoding is ISO-8859-1" do
191
+ let(:options) { { encoding: 'ISO-8859-1' } }
192
+
193
+ its(:detected_encoding) { should eq(Encoding.find(options[:encoding])) }
194
+
195
+ it "opens the file properly" do
196
+ expect {
197
+ processor
198
+ }.to_not raise_error
199
+ end
200
+ end
201
+ end
202
+ end
203
+
204
+ context "when the file is in US-ASCII" do
205
+ its(:detected_encoding) { should eq(Encoding.find('utf-8')) }
206
+
207
+ it "reads it with utf-8" do
208
+ processor.encoding.should eq(Encoding.find('utf-8'))
209
+ end
210
+ end
211
+
212
+ context "when the file can be read in utf-8" do
213
+ let(:filename) { fixture('base-utf-8.csv') }
214
+
215
+ its(:detected_encoding) { should eq(Encoding.find('utf-8')) }
216
+
217
+ it "properly detects it" do
218
+ processor.encoding.should eq(Encoding.find('utf-8'))
219
+ end
220
+
221
+ it "can iterate through all of its contents without raising an error" do
222
+ expect {
223
+ processor.each {}
224
+ }.to_not raise_error
225
+ end
226
+ end
227
+
228
+ context "when the file cannot be read in utf-8" do
229
+ context "but it can be read in iso-8859-1" do
230
+ let(:filename) { fixture('base-iso-8859-1.csv') }
231
+
232
+ its(:detected_encoding) { should eq(Encoding.find('iso-8859-1')) }
233
+
234
+ it "properly detects it, transcoding it to utf-8" do
235
+ processor.encoding.should eq(Encoding.find('utf-8'))
236
+ end
237
+
238
+ it "can iterate through all of its contents without raising an error" do
239
+ expect {
240
+ processor.each {}
241
+ }.to_not raise_error
242
+ end
243
+
244
+ context "and no look-ahead is used" do
245
+ let(:options) { { row_sep: "\n" } }
246
+
247
+ its(:detected_encoding) { should eq(Encoding.find('iso-8859-1')) }
248
+
249
+ it "properly detects it, transcoding it to utf-8" do
250
+ processor.encoding.should eq(Encoding.find('utf-8'))
251
+ end
252
+
253
+ it "can iterate through all of its contents without raising an error" do
254
+ expect {
255
+ processor.each {}
256
+ }.to_not raise_error
257
+ end
258
+ end
259
+ end
260
+ end
261
+ end
262
+
263
+ describe "gzip support" do
264
+ let(:filename) { fixture('base.csv.gz') }
265
+
266
+ it "detects that the file is gzipped and decompress it" do
267
+ processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
268
+ end
269
+
270
+ it { should be_gzipped }
271
+
272
+ context "when the file is in ISO-8859-1 encoding" do
273
+ let(:filename) { fixture('base-iso-8859-1.csv.gz') }
274
+
275
+ it "detects that the file is gzipped and decompress it" do
276
+ processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
277
+ end
278
+
279
+ it { should be_gzipped }
280
+ end
281
+
282
+ context "when { gzipped: false } options is passed" do
283
+ let(:options) { { gzipped: false } }
284
+
285
+ context "and the file is not gzipped" do
286
+ let(:filename) { fixture('base.csv') }
287
+
288
+ it { should_not be_gzipped }
289
+
290
+ it "does not raise an error" do
291
+ expect {
292
+ processor.shift
293
+ }.to_not raise_error
294
+ end
295
+ end
296
+
297
+ context "and the file is gzipped" do
298
+ it "does not attempt to detect it, reading data as it were UTF-8" do
299
+ processor.shift.should_not eq(['A', 'B', 'C'])
300
+ end
301
+ end
302
+ end
303
+
304
+ context "when { gzipped: true } option is passed" do
305
+ let(:options) { { gzipped: true } }
306
+
307
+ context "and the file is not gzipped" do
308
+ let(:filename) { fixture('base.csv') }
309
+
310
+ it { should_not be_gzipped }
311
+
312
+ it "does not raise an error" do
313
+ expect {
314
+ processor.shift
315
+ }.to_not raise_error
316
+ end
317
+ end
318
+
319
+ context "and the file is gzipped" do
320
+ it "properly assumes that the file is gzipped and decompress it" do
321
+ processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
322
+ end
323
+ end
324
+ end
325
+ end
326
+
327
+ describe "#process_range" do
328
+ it "yields every line of the file by default" do
329
+ expect { |block|
330
+ processor.process_range(&block)
331
+ }.to yield_control.exactly(5).times
332
+ end
333
+
334
+ it "yields the row and its index" do
335
+ expect { |block|
336
+ processor.process_range(&block)
337
+ }.to yield_successive_args(
338
+ [["A", "B", "C"], 0],
339
+ [["a1", "b1", "c1"], 1],
340
+ [["a2", "b2", "c2"], 2],
341
+ [["a3", "b3", "c3"], 3],
342
+ [["a4", "b4", "c4"], 4]
343
+ )
344
+ end
345
+
346
+ it "rewinds the file, so it can be called multiple times" do
347
+ processor.process_range {}
348
+
349
+ expect { |block|
350
+ processor.process_range(&block)
351
+ }.to yield_successive_args(
352
+ [["A", "B", "C"], 0],
353
+ [["a1", "b1", "c1"], 1],
354
+ [["a2", "b2", "c2"], 2],
355
+ [["a3", "b3", "c3"], 3],
356
+ [["a4", "b4", "c4"], 4]
357
+ )
358
+ end
359
+
360
+ context "when an offset is given" do
361
+ let(:offset) { 2 }
362
+
363
+ it "starts from this offset" do
364
+ expect { |block|
365
+ processor.process_range(offset: offset, &block)
366
+ }.to yield_successive_args(
367
+ [["a2", "b2", "c2"], 2],
368
+ [["a3", "b3", "c3"], 3],
369
+ [["a4", "b4", "c4"], 4]
370
+ )
371
+ end
372
+
373
+ context "and it is equal to the number of lines of the file" do
374
+ let(:offset) { processor.count }
375
+
376
+ it "does not yield" do
377
+ expect { |block|
378
+ processor.process_range(offset: offset, &block)
379
+ }.to_not yield_control
380
+ end
381
+ end
382
+
383
+ context "and it is greater than to the number of lines of the file" do
384
+ let(:offset) { processor.count + 1 }
385
+
386
+ it "does not yield" do
387
+ expect { |block|
388
+ processor.process_range(offset: offset, &block)
389
+ }.to_not yield_control
390
+ end
391
+ end
392
+ end
393
+
394
+ context "when a limit is given" do
395
+ let(:limit) { 2 }
396
+
397
+ it "yields only the number of rows given" do
398
+ expect { |block|
399
+ processor.process_range(limit: limit, &block)
400
+ }.to yield_successive_args(
401
+ [["A", "B", "C"], 0],
402
+ [["a1", "b1", "c1"], 1]
403
+ )
404
+ end
405
+
406
+ context "with zero" do
407
+ let(:limit) { 0 }
408
+
409
+ it "does not yield" do
410
+ expect { |block|
411
+ processor.process_range(limit: limit, &block)
412
+ }.to_not yield_control
413
+ end
414
+ end
415
+
416
+ context "with an offset" do
417
+ let(:offset) { 2 }
418
+
419
+ it "yields only the number of rows given, from the given offset" do
420
+ expect { |block|
421
+ processor.process_range(offset: offset, limit: limit, &block)
422
+ }.to yield_successive_args(
423
+ [["a2", "b2", "c2"], 2],
424
+ [["a3", "b3", "c3"], 3]
425
+ )
426
+ end
427
+ end
428
+ end
429
+ end
430
+
431
+ describe ".open" do
432
+ subject(:processor) { double(FileProcessor::CSV, close: true) }
433
+ before { FileProcessor::CSV.stub(:new).with(filename, options).and_return(processor) }
434
+
435
+ context "without a block" do
436
+ it "creates a new instance and returns it" do
437
+ FileProcessor::CSV.open(filename, options).should eq(processor)
438
+ end
439
+ end
440
+
441
+ context "with a block" do
442
+ it "creates a new instance and returns it" do
443
+ expect { |block|
444
+ FileProcessor::CSV.open(filename, options, &block)
445
+ }.to yield_with_args(processor)
446
+ end
447
+ end
448
+ end
449
+ end
@@ -0,0 +1,51 @@
1
+ require "spec_helper"
2
+
3
+ describe FileProcessor::Tempfile do
4
+ subject(:temp_file) { FileProcessor::Tempfile.new }
5
+ let(:generated_path) { File.join(Dir.tmpdir, 'some-path') }
6
+
7
+ it "creates the file" do
8
+ File.exists?(temp_file.path).should be_true
9
+ end
10
+
11
+ it "opens file ready to be written" do
12
+ expect {
13
+ temp_file << "some content"
14
+ }.to_not raise_error
15
+ end
16
+
17
+ describe "#path" do
18
+ it "is generated using 'file-processor' basename" do
19
+ temp_file.path.start_with?(File.join(Dir.tmpdir, 'file-processor')).should be_true
20
+ end
21
+ end
22
+
23
+ describe "#reopen" do
24
+ let!(:old_file) { temp_file.__getobj__ }
25
+
26
+ it "closes the old file" do
27
+ old_file.should_receive(:close)
28
+ temp_file.reopen('r')
29
+ end
30
+
31
+ it "updates the delegated object" do
32
+ temp_file.reopen('r')
33
+ temp_file.__getobj__.should_not eq(old_file)
34
+ temp_file.__getobj__.should be_a(File)
35
+ end
36
+
37
+ it "reopens the path with the given mode" do
38
+ temp_file.stub(:path).and_return(generated_path)
39
+ File.should_receive(:open).with(generated_path, 'r:utf-8', 384)
40
+ temp_file.reopen('r:utf-8')
41
+ end
42
+
43
+ context "when the old file is already closed" do
44
+ it "does not closes the old file" do
45
+ old_file.close
46
+ old_file.should_not_receive(:close)
47
+ temp_file.reopen('r')
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,16 @@
1
+ require "bundler/setup"
2
+
3
+ require 'simplecov'
4
+ SimpleCov.start
5
+
6
+ Bundler.require :default
7
+
8
+ root = File.expand_path('../..', __FILE__)
9
+
10
+ Dir[File.join(root, "spec/support/**/*.rb")].each { |f| require f }
11
+
12
+ RSpec.configure do |config|
13
+ config.treat_symbols_as_metadata_keys_with_true_values = true
14
+ config.run_all_when_everything_filtered = true
15
+ config.filter_run :focus
16
+ end
@@ -0,0 +1,2 @@
1
+ A;B;C
2
+ �;b;c
@@ -0,0 +1,9 @@
1
+ A;B;C
2
+ "This is
3
+ a field
4
+
5
+ with new lines";This one has no new line;"But this one has
6
+ also
7
+
8
+ some new lines"
9
+ Now we have a new row;With data;And more data
@@ -0,0 +1,2 @@
1
+ A;B;C
2
+ á;b;c
@@ -0,0 +1,7 @@
1
+ A;B;C
2
+ a1;b1;c1
3
+ a2;b2;c2
4
+
5
+ a3;b3;c3
6
+
7
+ a4;b4;c4
@@ -0,0 +1,7 @@
1
+ A;B;C
2
+ a1;;
3
+ ;;
4
+ ;;c2
5
+ ;b3;
6
+ ;;
7
+ a4;b4;c4
@@ -0,0 +1,5 @@
1
+ A;B;C
2
+ a1;b1;c1
3
+ a2;b2;c2
4
+ a3;b3;c3
5
+ a4;b4;c4
Binary file
@@ -0,0 +1,13 @@
1
# Spec helpers for locating fixture files.
module FixturesSupport
  # Builds the path of a fixture file under spec/support/fixtures.
  #
  # @param [ String ] filename Name of the fixture file
  #
  # @return [ String ] the path joined onto the project root
  def fixture(filename)
    segments = [root_path, "/spec/support/fixtures", filename]
    File.join(*segments)
  end

  # Project root: three levels up from this file's own path.
  def root_path
    three_levels_up = File.join('..', '..', '..')
    File.expand_path(three_levels_up, __FILE__)
  end
end
10
+
11
+ RSpec.configure do |config|
12
+ config.include FixturesSupport
13
+ end
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: file_processor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Vicente Mundim
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-14 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: A more powerful CSV file processor
15
+ email:
16
+ - vicente.mundim@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - .rspec
23
+ - .rvmrc
24
+ - .travis.yml
25
+ - Gemfile
26
+ - Gemfile.lock
27
+ - LICENSE.txt
28
+ - README.md
29
+ - Rakefile
30
+ - file_processor.gemspec
31
+ - lib/file_processor.rb
32
+ - lib/file_processor/csv.rb
33
+ - lib/file_processor/temp_file.rb
34
+ - lib/file_processor/version.rb
35
+ - spec/file_processor/csv_spec.rb
36
+ - spec/file_processor/temp_file_spec.rb
37
+ - spec/spec_helper.rb
38
+ - spec/support/fixtures.rb
39
+ - spec/support/fixtures/base-iso-8859-1.csv
40
+ - spec/support/fixtures/base-iso-8859-1.csv.gz
41
+ - spec/support/fixtures/base-new-line-in-field.csv
42
+ - spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv
43
+ - spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv
44
+ - spec/support/fixtures/base-utf-8.csv
45
+ - spec/support/fixtures/base-with-blank-lines.csv
46
+ - spec/support/fixtures/base-with-comma-separated-header.csv
47
+ - spec/support/fixtures/base-with-lines-with-no-data.csv
48
+ - spec/support/fixtures/base-with-unknown-column-separator.csv
49
+ - spec/support/fixtures/base.csv
50
+ - spec/support/fixtures/base.csv.gz
51
+ homepage:
52
+ licenses: []
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ segments:
64
+ - 0
65
+ hash: -785318963478006114
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ segments:
73
+ - 0
74
+ hash: -785318963478006114
75
+ requirements: []
76
+ rubyforge_project:
77
+ rubygems_version: 1.8.25
78
+ signing_key:
79
+ specification_version: 3
80
+ summary: A more powerful CSV file processor
81
+ test_files:
82
+ - spec/file_processor/csv_spec.rb
83
+ - spec/file_processor/temp_file_spec.rb
84
+ - spec/spec_helper.rb
85
+ - spec/support/fixtures.rb
86
+ - spec/support/fixtures/base-iso-8859-1.csv
87
+ - spec/support/fixtures/base-iso-8859-1.csv.gz
88
+ - spec/support/fixtures/base-new-line-in-field.csv
89
+ - spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv
90
+ - spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv
91
+ - spec/support/fixtures/base-utf-8.csv
92
+ - spec/support/fixtures/base-with-blank-lines.csv
93
+ - spec/support/fixtures/base-with-comma-separated-header.csv
94
+ - spec/support/fixtures/base-with-lines-with-no-data.csv
95
+ - spec/support/fixtures/base-with-unknown-column-separator.csv
96
+ - spec/support/fixtures/base.csv
97
+ - spec/support/fixtures/base.csv.gz