file_processor 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rspec +4 -0
- data/.rvmrc +1 -0
- data/.travis.yml +3 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +56 -0
- data/Rakefile +5 -0
- data/file_processor.gemspec +18 -0
- data/lib/file_processor/csv.rb +186 -0
- data/lib/file_processor/temp_file.rb +20 -0
- data/lib/file_processor/version.rb +3 -0
- data/lib/file_processor.rb +12 -0
- data/spec/file_processor/csv_spec.rb +449 -0
- data/spec/file_processor/temp_file_spec.rb +51 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/support/fixtures/base-iso-8859-1.csv +2 -0
- data/spec/support/fixtures/base-iso-8859-1.csv.gz +0 -0
- data/spec/support/fixtures/base-new-line-in-field.csv +9 -0
- data/spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv +2 -0
- data/spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv +2 -0
- data/spec/support/fixtures/base-utf-8.csv +2 -0
- data/spec/support/fixtures/base-with-blank-lines.csv +7 -0
- data/spec/support/fixtures/base-with-comma-separated-header.csv +2 -0
- data/spec/support/fixtures/base-with-lines-with-no-data.csv +7 -0
- data/spec/support/fixtures/base-with-unknown-column-separator.csv +2 -0
- data/spec/support/fixtures/base.csv +5 -0
- data/spec/support/fixtures/base.csv.gz +0 -0
- data/spec/support/fixtures.rb +13 -0
- metadata +97 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm 1.9.3@file_processor --create
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in file_processor.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
gem 'rake'
|
7
|
+
|
8
|
+
group(:development) do
|
9
|
+
gem 'debugger'
|
10
|
+
end
|
11
|
+
|
12
|
+
group(:test) do
|
13
|
+
gem 'rspec', "~> 2.14.0.rc1"
|
14
|
+
gem 'simplecov'
|
15
|
+
gem 'json', '~> 1.7.7'
|
16
|
+
end
|
17
|
+
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
file_processor (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
columnize (0.3.6)
|
10
|
+
debugger (1.6.0)
|
11
|
+
columnize (>= 0.3.1)
|
12
|
+
debugger-linecache (~> 1.2.0)
|
13
|
+
debugger-ruby_core_source (~> 1.2.1)
|
14
|
+
debugger-linecache (1.2.0)
|
15
|
+
debugger-ruby_core_source (1.2.2)
|
16
|
+
diff-lcs (1.2.4)
|
17
|
+
json (1.7.7)
|
18
|
+
multi_json (1.7.4)
|
19
|
+
rake (10.0.4)
|
20
|
+
rspec (2.14.0.rc1)
|
21
|
+
rspec-core (= 2.14.0.rc1)
|
22
|
+
rspec-expectations (= 2.14.0.rc1)
|
23
|
+
rspec-mocks (= 2.14.0.rc1)
|
24
|
+
rspec-core (2.14.0.rc1)
|
25
|
+
rspec-expectations (2.14.0.rc1)
|
26
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
27
|
+
rspec-mocks (2.14.0.rc1)
|
28
|
+
simplecov (0.7.1)
|
29
|
+
multi_json (~> 1.0)
|
30
|
+
simplecov-html (~> 0.7.1)
|
31
|
+
simplecov-html (0.7.1)
|
32
|
+
|
33
|
+
PLATFORMS
|
34
|
+
ruby
|
35
|
+
|
36
|
+
DEPENDENCIES
|
37
|
+
debugger
|
38
|
+
file_processor!
|
39
|
+
json (~> 1.7.7)
|
40
|
+
rake
|
41
|
+
rspec (~> 2.14.0.rc1)
|
42
|
+
simplecov
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Vicente Mundim
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# FileProcessor
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/dtmconsultoria/file_processor.png)](https://travis-ci.org/dtmconsultoria/file_processor)
|
4
|
+
|
5
|
+
A more powerful CSV file processor
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
FileProcessor uses the new CSV library introduced in Ruby 1.9.3, thus it is only compatible with this Ruby version.
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
gem 'file_processor'
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install file_processor
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Use it as you would use Ruby's CSV:
|
26
|
+
|
27
|
+
FileProcessor::CSV.open(filename, options) do |csv|
|
28
|
+
csv.each do |row|
|
29
|
+
# process row here
|
30
|
+
end
|
31
|
+
end # automatically closes the file
|
32
|
+
|
33
|
+
FileProcessor::CSV is just a wrapper around Ruby's CSV, so you can manipulate it as you would manipulate Ruby's CSV.
|
34
|
+
|
35
|
+
You can also use `FileProcessor::CSV#process_range` to process a range in the file:
|
36
|
+
|
37
|
+
FileProcessor::CSV.open(filename, options) do |csv|
|
38
|
+
csv.process_range(offset: 2000, limit: 1000) do |row, index|
|
39
|
+
# yields 1000 rows starting from line 2000 (i.e., from line 2000 to line 2999)
|
40
|
+
end
|
41
|
+
end # automatically closes the file
|
42
|
+
|
43
|
+
Here are the added features:
|
44
|
+
|
45
|
+
* Auto-detect encoding of UTF-8 and ISO-8859-1 (Latin1) files.
|
46
|
+
* Auto-detect the column separator (`col_sep` option) when not given.
|
47
|
+
* Skip lines without data when `skip_blanks` is `true`, which is turned on by default. This means that `count` will not take these lines into account. They are also skipped when iterating through lines.
|
48
|
+
* Detects whether a file is gzipped and decompresses it for you automatically.
|
49
|
+
|
50
|
+
## Contributing
|
51
|
+
|
52
|
+
1. Fork it
|
53
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
54
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
55
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
56
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'file_processor/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "file_processor"
|
8
|
+
gem.version = FileProcessor::VERSION
|
9
|
+
gem.authors = ["Vicente Mundim"]
|
10
|
+
gem.email = ["vicente.mundim@gmail.com"]
|
11
|
+
gem.description = %q{A more powerful CSV file processor}
|
12
|
+
gem.summary = %q{A more powerful CSV file processor}
|
13
|
+
|
14
|
+
gem.files = `git ls-files`.split($/)
|
15
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
16
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
|
+
gem.require_paths = ["lib"]
|
18
|
+
end
|
@@ -0,0 +1,186 @@
|
|
1
|
+
module FileProcessor
|
2
|
+
class CSV < SimpleDelegator
|
3
|
+
include Enumerable
|
4
|
+
|
5
|
+
# Opens a file and yields it, ensuring that it is properly closed.
|
6
|
+
def self.open(*args)
|
7
|
+
instance = new(*args)
|
8
|
+
|
9
|
+
if block_given?
|
10
|
+
begin
|
11
|
+
yield instance
|
12
|
+
ensure
|
13
|
+
instance.close if instance
|
14
|
+
end
|
15
|
+
else
|
16
|
+
instance
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
attr_accessor :detected_encoding
|
21
|
+
|
22
|
+
def initialize(filename, options={})
|
23
|
+
@gzipped = options.delete(:gzipped)
|
24
|
+
|
25
|
+
load(filename, options.delete(:open_options))
|
26
|
+
|
27
|
+
@options = default_options.merge(options)
|
28
|
+
|
29
|
+
@options[:encoding] ||= detect_encoding
|
30
|
+
@detected_encoding ||= Encoding.find(@options[:encoding])
|
31
|
+
|
32
|
+
tempfile.reopen(detected_mode) if tempfile.closed?
|
33
|
+
|
34
|
+
@options[:col_sep] ||= detect_column_separator
|
35
|
+
|
36
|
+
super(::CSV.new(tempfile, @options))
|
37
|
+
end
|
38
|
+
|
39
|
+
# Counts the number of rows in the file, even if it has already been read
|
40
|
+
#
|
41
|
+
# @return [ Integer ] the number of rows in the file
|
42
|
+
def total_count(&block)
|
43
|
+
rewind
|
44
|
+
count(&block)
|
45
|
+
ensure
|
46
|
+
rewind
|
47
|
+
end
|
48
|
+
|
49
|
+
#
|
50
|
+
# Yields each row of the data source in turn, skipping blanks and rows with no data.
|
51
|
+
#
|
52
|
+
# Support for Enumerable.
|
53
|
+
#
|
54
|
+
# The data source must be open for reading.
|
55
|
+
#
|
56
|
+
def each
|
57
|
+
if block_given?
|
58
|
+
while row = shift
|
59
|
+
yield row unless skip_blanks? && row_with_no_data?(row)
|
60
|
+
end
|
61
|
+
else
|
62
|
+
to_enum
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# Process a range of lines in the CSV file.
|
67
|
+
#
|
68
|
+
# @example Process 1000 lines starting from the line 2000
|
69
|
+
# csv.process_range(offset: 2000, limit: 1000) do |row, index|
|
70
|
+
# # process range here
|
71
|
+
# end
|
72
|
+
#
|
73
|
+
# @param [ Hash ] options A hash with offset and/or limit
|
74
|
+
#
|
75
|
+
# @option options [ Integer ] :offset The offset from which the process should start
|
76
|
+
# @option options [ Integer ] :limit The number of rows to process
|
77
|
+
#
|
78
|
+
# @return [ Enumerable ] CSV's enumerable
|
79
|
+
def process_range(options={})
|
80
|
+
options ||= {}
|
81
|
+
|
82
|
+
offset = options[:offset] || 0
|
83
|
+
limit = options[:limit] || -1
|
84
|
+
|
85
|
+
rewind
|
86
|
+
each_with_index do |row, index|
|
87
|
+
next if index < offset
|
88
|
+
break if limit >= 0 && index >= offset + limit
|
89
|
+
|
90
|
+
yield row, index
|
91
|
+
end
|
92
|
+
ensure
|
93
|
+
rewind
|
94
|
+
end
|
95
|
+
|
96
|
+
# Returns true when the file is gzipped, false otherwise
|
97
|
+
def gzipped?
|
98
|
+
@gzipped
|
99
|
+
end
|
100
|
+
|
101
|
+
private
|
102
|
+
|
103
|
+
def detect_compression?
|
104
|
+
@gzipped.nil?
|
105
|
+
end
|
106
|
+
|
107
|
+
def row_with_no_data?(row)
|
108
|
+
row = row.fields if row.respond_to?(:fields)
|
109
|
+
row.all? { |column| column.nil? || column.empty? }
|
110
|
+
end
|
111
|
+
|
112
|
+
def load(filename, open_options)
|
113
|
+
loaded_io = decompress(::Kernel.open(filename, 'rb', open_options || {}))
|
114
|
+
loaded_io.rewind
|
115
|
+
|
116
|
+
@original_default_internal = Encoding.default_internal
|
117
|
+
Encoding.default_internal = nil
|
118
|
+
|
119
|
+
loaded_io.each do |line|
|
120
|
+
tempfile.write(line)
|
121
|
+
end
|
122
|
+
ensure
|
123
|
+
tempfile.close
|
124
|
+
loaded_io.close
|
125
|
+
Encoding.default_internal = @original_default_internal
|
126
|
+
end
|
127
|
+
|
128
|
+
def decompress(loaded_io)
|
129
|
+
if detect_compression? || gzipped?
|
130
|
+
Zlib::GzipReader.open(loaded_io).tap do |decompressed_io|
|
131
|
+
decompressed_io.getc # attempt to read from a compressed io
|
132
|
+
@gzipped = true
|
133
|
+
end
|
134
|
+
else
|
135
|
+
@gzipped = false
|
136
|
+
loaded_io
|
137
|
+
end
|
138
|
+
rescue Zlib::Error
|
139
|
+
# not a compressed io, just returning the loaded io instead
|
140
|
+
@gzipped = false
|
141
|
+
loaded_io
|
142
|
+
end
|
143
|
+
|
144
|
+
# We open the file and try to read each line of it, if there is an
|
145
|
+
# invalid byte sequence, an ArgumentError exception will be thrown.
|
146
|
+
#
|
147
|
+
# We then assume that the file is in ISO-8859-1 encoding, and transcode
|
148
|
+
# it to UTF-8. Though it's ugly, this was the only way to detect whether
|
149
|
+
# a file was using one of these encodings.
|
150
|
+
def detect_encoding
|
151
|
+
tempfile.reopen('r:utf-8')
|
152
|
+
tempfile.each(&:split) # raises ArgumentError when it has non-ascii characters that are not in UTF-8
|
153
|
+
|
154
|
+
@detected_encoding = Encoding.find('utf-8')
|
155
|
+
rescue ArgumentError
|
156
|
+
tempfile.reopen('r:iso-8859-1:utf-8')
|
157
|
+
@detected_encoding = Encoding.find('iso-8859-1')
|
158
|
+
ensure
|
159
|
+
tempfile.rewind
|
160
|
+
end
|
161
|
+
|
162
|
+
def detected_utf_8?
|
163
|
+
detected_encoding == Encoding.find('utf-8')
|
164
|
+
end
|
165
|
+
|
166
|
+
def detected_mode
|
167
|
+
detected_utf_8? ? 'r:utf-8' : 'r:iso-8859-1:utf-8'
|
168
|
+
end
|
169
|
+
|
170
|
+
def detect_column_separator
|
171
|
+
@col_sep = tempfile.gets.split(';').size > 1 ? ';' : ','
|
172
|
+
ensure
|
173
|
+
tempfile.rewind
|
174
|
+
end
|
175
|
+
|
176
|
+
def default_options
|
177
|
+
{
|
178
|
+
skip_blanks: true
|
179
|
+
}
|
180
|
+
end
|
181
|
+
|
182
|
+
def tempfile
|
183
|
+
@tempfile ||= FileProcessor::Tempfile.new
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module FileProcessor
|
2
|
+
class Tempfile < ::Tempfile
|
3
|
+
def initialize(basename='file-processor', *args)
|
4
|
+
super(basename, *args)
|
5
|
+
end
|
6
|
+
|
7
|
+
def path
|
8
|
+
@tmpname
|
9
|
+
end
|
10
|
+
|
11
|
+
def reopen(mode)
|
12
|
+
close unless closed?
|
13
|
+
@mode = mode
|
14
|
+
|
15
|
+
@tmpfile = File.open(path, mode, @opts)
|
16
|
+
@data[1] = @tmpfile
|
17
|
+
__setobj__(@tmpfile)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,449 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe FileProcessor::CSV do
|
4
|
+
let(:filename) { fixture('base.csv') }
|
5
|
+
let(:options) { {} }
|
6
|
+
|
7
|
+
subject(:processor) { FileProcessor::CSV.new(filename, options) }
|
8
|
+
|
9
|
+
it "delegates to a CSV instance" do
|
10
|
+
processor.__getobj__.should be_a(::CSV)
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "#col_sep" do
|
14
|
+
context "when it is not given" do
|
15
|
+
context "and the first line of the file has more than one header column separated with a semi-colon" do
|
16
|
+
it "detects it properly" do
|
17
|
+
processor.col_sep.should eq(';')
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
context "and the first line of the file has more than one header column separated with a comman" do
|
22
|
+
let(:filename) { fixture('base-with-comma-separated-header.csv') }
|
23
|
+
|
24
|
+
it "detects it properly" do
|
25
|
+
processor.col_sep.should eq(',')
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
context "and an unknown column separator is used" do
|
30
|
+
let(:filename) { fixture('base-with-unknown-column-separator.csv') }
|
31
|
+
|
32
|
+
it "does not detects it, falling back to the default one" do
|
33
|
+
processor.col_sep.should eq(',')
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
context "and the file has non-ascii characters in its first line" do
|
38
|
+
context "in UTF-8" do
|
39
|
+
let(:filename) { fixture('base-non-ascii-characters-in-header-utf-8.csv') }
|
40
|
+
|
41
|
+
it "detects it properly" do
|
42
|
+
processor.col_sep.should eq(';')
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
context "in ISO-8859-1" do
|
47
|
+
let(:filename) { fixture('base-non-ascii-characters-in-header-iso-8859-1.csv') }
|
48
|
+
|
49
|
+
it "detects it properly" do
|
50
|
+
processor.col_sep.should eq(';')
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
context "when it is given" do
|
57
|
+
let(:options) { { col_sep: '|' } }
|
58
|
+
|
59
|
+
it "uses the given col_sep" do
|
60
|
+
processor.col_sep.should eq('|')
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
describe "#count" do
|
66
|
+
it "returns the number of rows in the CSV" do
|
67
|
+
processor.count.should eq(5)
|
68
|
+
end
|
69
|
+
|
70
|
+
context "when the file has new line characters in a field, but it is properly quoted" do
|
71
|
+
let(:filename) { fixture('base-new-line-in-field.csv') }
|
72
|
+
|
73
|
+
it "returns the correct number of rows in the CSV" do
|
74
|
+
processor.count.should eq(3)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
context "when the file has blank lines" do
|
79
|
+
let(:filename) { fixture('base-with-blank-lines.csv') }
|
80
|
+
|
81
|
+
it "skips them by default" do
|
82
|
+
processor.count.should eq(5)
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
context "when the file has lines with no data" do
|
87
|
+
let(:filename) { fixture('base-with-lines-with-no-data.csv') }
|
88
|
+
|
89
|
+
it "does not count them" do
|
90
|
+
processor.count.should eq(5)
|
91
|
+
end
|
92
|
+
|
93
|
+
context "but skip_blanks is false" do
|
94
|
+
let(:options) { { skip_blanks: false } }
|
95
|
+
|
96
|
+
it "does counts them" do
|
97
|
+
processor.count.should eq(7)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
context "and { headers: true } is passed" do
|
102
|
+
let(:options) { { headers: true } }
|
103
|
+
|
104
|
+
it "does not count these lines, as well as the header" do
|
105
|
+
processor.count.should eq(4)
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
context "when a block is passed" do
|
111
|
+
let(:filename) { fixture('base-with-lines-with-no-data.csv') }
|
112
|
+
|
113
|
+
it "returns the number of lines for which the block evaluates to true, properly handling lines with no data" do
|
114
|
+
processor.count { |row| !row.first.nil? }.should eq(3)
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
describe "#total_count" do
|
120
|
+
it "works as count, but returns all rows, even when called multiple times, since it rewinds the io file" do
|
121
|
+
processor.total_count
|
122
|
+
processor.total_count.should eq(5)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
describe "#each" do
|
127
|
+
it "returns an enumerator when called without a block" do
|
128
|
+
processor.each.should be_a(Enumerator)
|
129
|
+
end
|
130
|
+
|
131
|
+
context "when the file has lines with no data" do
|
132
|
+
let(:filename) { fixture('base-with-lines-with-no-data.csv') }
|
133
|
+
|
134
|
+
it "does not yields these lines" do
|
135
|
+
expect { |block|
|
136
|
+
processor.each(&block)
|
137
|
+
}.to yield_control.exactly(5).times
|
138
|
+
end
|
139
|
+
|
140
|
+
context "but skip_blanks is false" do
|
141
|
+
let(:options) { { skip_blanks: false } }
|
142
|
+
|
143
|
+
it "yields these lines" do
|
144
|
+
expect { |block|
|
145
|
+
processor.each(&block)
|
146
|
+
}.to yield_control.exactly(7).times
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
context "and { headers: true } is passed" do
|
151
|
+
let(:options) { { headers: true } }
|
152
|
+
|
153
|
+
it "does not yields these lines, as well as the header" do
|
154
|
+
expect { |block|
|
155
|
+
processor.each(&block)
|
156
|
+
}.to yield_control.exactly(4).times # header do not count here
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
describe "encoding" do
|
163
|
+
it "can iterate through all of its contents without raising an error" do
|
164
|
+
expect {
|
165
|
+
processor.each {}
|
166
|
+
}.to_not raise_error
|
167
|
+
end
|
168
|
+
|
169
|
+
context "when the encoding is given" do
|
170
|
+
let(:filename) { fixture('base-utf-8.csv') }
|
171
|
+
let(:options) { { encoding: 'utf-8' } }
|
172
|
+
|
173
|
+
its(:detected_encoding) { should eq(Encoding.find(options[:encoding])) }
|
174
|
+
|
175
|
+
it "opens the file properly" do
|
176
|
+
expect {
|
177
|
+
processor
|
178
|
+
}.to_not raise_error
|
179
|
+
end
|
180
|
+
|
181
|
+
context "and the file is ISO-8859-1" do
|
182
|
+
let(:filename) { fixture('base-iso-8859-1.csv') }
|
183
|
+
|
184
|
+
it "uses it to open the file, raising an error" do
|
185
|
+
expect {
|
186
|
+
processor
|
187
|
+
}.to raise_error
|
188
|
+
end
|
189
|
+
|
190
|
+
context "but the given encoding is ISO-8859-1" do
|
191
|
+
let(:options) { { encoding: 'ISO-8859-1' } }
|
192
|
+
|
193
|
+
its(:detected_encoding) { should eq(Encoding.find(options[:encoding])) }
|
194
|
+
|
195
|
+
it "opens the file properly" do
|
196
|
+
expect {
|
197
|
+
processor
|
198
|
+
}.to_not raise_error
|
199
|
+
end
|
200
|
+
end
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
context "when the file is in US-ASCII" do
|
205
|
+
its(:detected_encoding) { should eq(Encoding.find('utf-8')) }
|
206
|
+
|
207
|
+
it "reads it with utf-8" do
|
208
|
+
processor.encoding.should eq(Encoding.find('utf-8'))
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
context "when the file can be read in utf-8" do
|
213
|
+
let(:filename) { fixture('base-utf-8.csv') }
|
214
|
+
|
215
|
+
its(:detected_encoding) { should eq(Encoding.find('utf-8')) }
|
216
|
+
|
217
|
+
it "properly detects it" do
|
218
|
+
processor.encoding.should eq(Encoding.find('utf-8'))
|
219
|
+
end
|
220
|
+
|
221
|
+
it "can iterate through all of its contents without raising an error" do
|
222
|
+
expect {
|
223
|
+
processor.each {}
|
224
|
+
}.to_not raise_error
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
context "when the file cannot be read in utf-8" do
|
229
|
+
context "but it can be read in iso-8859-1" do
|
230
|
+
let(:filename) { fixture('base-iso-8859-1.csv') }
|
231
|
+
|
232
|
+
its(:detected_encoding) { should eq(Encoding.find('iso-8859-1')) }
|
233
|
+
|
234
|
+
it "properly detects it, transcoding it to utf-8" do
|
235
|
+
processor.encoding.should eq(Encoding.find('utf-8'))
|
236
|
+
end
|
237
|
+
|
238
|
+
it "can iterate through all of its contents without raising an error" do
|
239
|
+
expect {
|
240
|
+
processor.each {}
|
241
|
+
}.to_not raise_error
|
242
|
+
end
|
243
|
+
|
244
|
+
context "and no look-ahead is used" do
|
245
|
+
let(:options) { { row_sep: "\n" } }
|
246
|
+
|
247
|
+
its(:detected_encoding) { should eq(Encoding.find('iso-8859-1')) }
|
248
|
+
|
249
|
+
it "properly detects it, transcoding it to utf-8" do
|
250
|
+
processor.encoding.should eq(Encoding.find('utf-8'))
|
251
|
+
end
|
252
|
+
|
253
|
+
it "can iterate through all of its contents without raising an error" do
|
254
|
+
expect {
|
255
|
+
processor.each {}
|
256
|
+
}.to_not raise_error
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
describe "gzip support" do
|
264
|
+
let(:filename) { fixture('base.csv.gz') }
|
265
|
+
|
266
|
+
it "detects that the file is gzipped and decompress it" do
|
267
|
+
processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
|
268
|
+
end
|
269
|
+
|
270
|
+
it { should be_gzipped }
|
271
|
+
|
272
|
+
context "when the file is in ISO-8859-1 encoding" do
|
273
|
+
let(:filename) { fixture('base-iso-8859-1.csv.gz') }
|
274
|
+
|
275
|
+
it "detects that the file is gzipped and decompress it" do
|
276
|
+
processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
|
277
|
+
end
|
278
|
+
|
279
|
+
it { should be_gzipped }
|
280
|
+
end
|
281
|
+
|
282
|
+
context "when { gzipped: false } options is passed" do
|
283
|
+
let(:options) { { gzipped: false } }
|
284
|
+
|
285
|
+
context "and the file is not gzipped" do
|
286
|
+
let(:filename) { fixture('base.csv') }
|
287
|
+
|
288
|
+
it { should_not be_gzipped }
|
289
|
+
|
290
|
+
it "does not raise an error" do
|
291
|
+
expect {
|
292
|
+
processor.shift
|
293
|
+
}.to_not raise_error
|
294
|
+
end
|
295
|
+
end
|
296
|
+
|
297
|
+
context "and the file is gzipped" do
|
298
|
+
it "does not attempt to detect it, reading data as it were UTF-8" do
|
299
|
+
processor.shift.should_not eq(['A', 'B', 'C'])
|
300
|
+
end
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
304
|
+
context "when { gzipped: true } option is passed" do
|
305
|
+
let(:options) { { gzipped: true } }
|
306
|
+
|
307
|
+
context "and the file is not gzipped" do
|
308
|
+
let(:filename) { fixture('base.csv') }
|
309
|
+
|
310
|
+
it { should_not be_gzipped }
|
311
|
+
|
312
|
+
it "does not raise an error" do
|
313
|
+
expect {
|
314
|
+
processor.shift
|
315
|
+
}.to_not raise_error
|
316
|
+
end
|
317
|
+
end
|
318
|
+
|
319
|
+
context "and the file is gzipped" do
|
320
|
+
it "properly assumes that the file is gzipped and decompress it" do
|
321
|
+
processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
|
322
|
+
end
|
323
|
+
end
|
324
|
+
end
|
325
|
+
end
|
326
|
+
|
327
|
+
describe "#process_range" do
|
328
|
+
it "yields every line of the file by default" do
|
329
|
+
expect { |block|
|
330
|
+
processor.process_range(&block)
|
331
|
+
}.to yield_control.exactly(5).times
|
332
|
+
end
|
333
|
+
|
334
|
+
it "yields the row and its index" do
|
335
|
+
expect { |block|
|
336
|
+
processor.process_range(&block)
|
337
|
+
}.to yield_successive_args(
|
338
|
+
[["A", "B", "C"], 0],
|
339
|
+
[["a1", "b1", "c1"], 1],
|
340
|
+
[["a2", "b2", "c2"], 2],
|
341
|
+
[["a3", "b3", "c3"], 3],
|
342
|
+
[["a4", "b4", "c4"], 4]
|
343
|
+
)
|
344
|
+
end
|
345
|
+
|
346
|
+
it "rewinds the file, so it can be called multiple times" do
|
347
|
+
processor.process_range {}
|
348
|
+
|
349
|
+
expect { |block|
|
350
|
+
processor.process_range(&block)
|
351
|
+
}.to yield_successive_args(
|
352
|
+
[["A", "B", "C"], 0],
|
353
|
+
[["a1", "b1", "c1"], 1],
|
354
|
+
[["a2", "b2", "c2"], 2],
|
355
|
+
[["a3", "b3", "c3"], 3],
|
356
|
+
[["a4", "b4", "c4"], 4]
|
357
|
+
)
|
358
|
+
end
|
359
|
+
|
360
|
+
context "when an offset is given" do
|
361
|
+
let(:offset) { 2 }
|
362
|
+
|
363
|
+
it "starts from this offset" do
|
364
|
+
expect { |block|
|
365
|
+
processor.process_range(offset: offset, &block)
|
366
|
+
}.to yield_successive_args(
|
367
|
+
[["a2", "b2", "c2"], 2],
|
368
|
+
[["a3", "b3", "c3"], 3],
|
369
|
+
[["a4", "b4", "c4"], 4]
|
370
|
+
)
|
371
|
+
end
|
372
|
+
|
373
|
+
context "and it is equal to the number of lines of the file" do
|
374
|
+
let(:offset) { processor.count }
|
375
|
+
|
376
|
+
it "does not yield" do
|
377
|
+
expect { |block|
|
378
|
+
processor.process_range(offset: offset, &block)
|
379
|
+
}.to_not yield_control
|
380
|
+
end
|
381
|
+
end
|
382
|
+
|
383
|
+
context "and it is greater than to the number of lines of the file" do
|
384
|
+
let(:offset) { processor.count + 1 }
|
385
|
+
|
386
|
+
it "does not yield" do
|
387
|
+
expect { |block|
|
388
|
+
processor.process_range(offset: offset, &block)
|
389
|
+
}.to_not yield_control
|
390
|
+
end
|
391
|
+
end
|
392
|
+
end
|
393
|
+
|
394
|
+
context "when a limit is given" do
|
395
|
+
let(:limit) { 2 }
|
396
|
+
|
397
|
+
it "yields only the number of rows given" do
|
398
|
+
expect { |block|
|
399
|
+
processor.process_range(limit: limit, &block)
|
400
|
+
}.to yield_successive_args(
|
401
|
+
[["A", "B", "C"], 0],
|
402
|
+
[["a1", "b1", "c1"], 1]
|
403
|
+
)
|
404
|
+
end
|
405
|
+
|
406
|
+
context "with zero" do
|
407
|
+
let(:limit) { 0 }
|
408
|
+
|
409
|
+
it "does not yield" do
|
410
|
+
expect { |block|
|
411
|
+
processor.process_range(limit: limit, &block)
|
412
|
+
}.to_not yield_control
|
413
|
+
end
|
414
|
+
end
|
415
|
+
|
416
|
+
context "with an offset" do
|
417
|
+
let(:offset) { 2 }
|
418
|
+
|
419
|
+
it "yields only the number of rows given, from the given offset" do
|
420
|
+
expect { |block|
|
421
|
+
processor.process_range(offset: offset, limit: limit, &block)
|
422
|
+
}.to yield_successive_args(
|
423
|
+
[["a2", "b2", "c2"], 2],
|
424
|
+
[["a3", "b3", "c3"], 3]
|
425
|
+
)
|
426
|
+
end
|
427
|
+
end
|
428
|
+
end
|
429
|
+
end
|
430
|
+
|
431
|
+
describe ".open" do
|
432
|
+
subject(:processor) { double(FileProcessor::CSV, close: true) }
|
433
|
+
before { FileProcessor::CSV.stub(:new).with(filename, options).and_return(processor) }
|
434
|
+
|
435
|
+
context "without a block" do
|
436
|
+
it "creates a new instance and returns it" do
|
437
|
+
FileProcessor::CSV.open(filename, options).should eq(processor)
|
438
|
+
end
|
439
|
+
end
|
440
|
+
|
441
|
+
context "with a block" do
|
442
|
+
it "creates a new instance and returns it" do
|
443
|
+
expect { |block|
|
444
|
+
FileProcessor::CSV.open(filename, options, &block)
|
445
|
+
}.to yield_with_args(processor)
|
446
|
+
end
|
447
|
+
end
|
448
|
+
end
|
449
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe FileProcessor::Tempfile do
|
4
|
+
subject(:temp_file) { FileProcessor::Tempfile.new }
|
5
|
+
let(:generated_path) { File.join(Dir.tmpdir, 'some-path') }
|
6
|
+
|
7
|
+
it "creates the file" do
|
8
|
+
File.exists?(temp_file.path).should be_true
|
9
|
+
end
|
10
|
+
|
11
|
+
it "opens file ready to be written" do
|
12
|
+
expect {
|
13
|
+
temp_file << "some content"
|
14
|
+
}.to_not raise_error
|
15
|
+
end
|
16
|
+
|
17
|
+
describe "#path" do
|
18
|
+
it "is generated using 'file-processor' basename" do
|
19
|
+
temp_file.path.start_with?(File.join(Dir.tmpdir, 'file-processor')).should be_true
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "#reopen" do
|
24
|
+
let!(:old_file) { temp_file.__getobj__ }
|
25
|
+
|
26
|
+
it "closes the old file" do
|
27
|
+
old_file.should_receive(:close)
|
28
|
+
temp_file.reopen('r')
|
29
|
+
end
|
30
|
+
|
31
|
+
it "updates the delegated object" do
|
32
|
+
temp_file.reopen('r')
|
33
|
+
temp_file.__getobj__.should_not eq(old_file)
|
34
|
+
temp_file.__getobj__.should be_a(File)
|
35
|
+
end
|
36
|
+
|
37
|
+
it "reopens the path with the given mode" do
|
38
|
+
temp_file.stub(:path).and_return(generated_path)
|
39
|
+
File.should_receive(:open).with(generated_path, 'r:utf-8', 384)
|
40
|
+
temp_file.reopen('r:utf-8')
|
41
|
+
end
|
42
|
+
|
43
|
+
context "when the old file is already closed" do
|
44
|
+
it "does not closes the old file" do
|
45
|
+
old_file.close
|
46
|
+
old_file.should_not_receive(:close)
|
47
|
+
temp_file.reopen('r')
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require "bundler/setup"
|
2
|
+
|
3
|
+
require 'simplecov'
|
4
|
+
SimpleCov.start
|
5
|
+
|
6
|
+
Bundler.require :default
|
7
|
+
|
8
|
+
root = File.expand_path('../..', __FILE__)
|
9
|
+
|
10
|
+
Dir[File.join(root, "spec/support/**/*.rb")].each { |f| require f }
|
11
|
+
|
12
|
+
RSpec.configure do |config|
|
13
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
14
|
+
config.run_all_when_everything_filtered = true
|
15
|
+
config.filter_run :focus
|
16
|
+
end
|
Binary file
|
Binary file
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module FixturesSupport
|
2
|
+
def fixture(filename)
|
3
|
+
File.join(root_path, "/spec/support/fixtures", filename)
|
4
|
+
end
|
5
|
+
|
6
|
+
def root_path
|
7
|
+
File.expand_path('../../..', __FILE__)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
RSpec.configure do |config|
|
12
|
+
config.include FixturesSupport
|
13
|
+
end
|
metadata
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: file_processor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Vicente Mundim
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-06-14 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: A more powerful CSV file processor
|
15
|
+
email:
|
16
|
+
- vicente.mundim@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- .gitignore
|
22
|
+
- .rspec
|
23
|
+
- .rvmrc
|
24
|
+
- .travis.yml
|
25
|
+
- Gemfile
|
26
|
+
- Gemfile.lock
|
27
|
+
- LICENSE.txt
|
28
|
+
- README.md
|
29
|
+
- Rakefile
|
30
|
+
- file_processor.gemspec
|
31
|
+
- lib/file_processor.rb
|
32
|
+
- lib/file_processor/csv.rb
|
33
|
+
- lib/file_processor/temp_file.rb
|
34
|
+
- lib/file_processor/version.rb
|
35
|
+
- spec/file_processor/csv_spec.rb
|
36
|
+
- spec/file_processor/temp_file_spec.rb
|
37
|
+
- spec/spec_helper.rb
|
38
|
+
- spec/support/fixtures.rb
|
39
|
+
- spec/support/fixtures/base-iso-8859-1.csv
|
40
|
+
- spec/support/fixtures/base-iso-8859-1.csv.gz
|
41
|
+
- spec/support/fixtures/base-new-line-in-field.csv
|
42
|
+
- spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv
|
43
|
+
- spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv
|
44
|
+
- spec/support/fixtures/base-utf-8.csv
|
45
|
+
- spec/support/fixtures/base-with-blank-lines.csv
|
46
|
+
- spec/support/fixtures/base-with-comma-separated-header.csv
|
47
|
+
- spec/support/fixtures/base-with-lines-with-no-data.csv
|
48
|
+
- spec/support/fixtures/base-with-unknown-column-separator.csv
|
49
|
+
- spec/support/fixtures/base.csv
|
50
|
+
- spec/support/fixtures/base.csv.gz
|
51
|
+
homepage:
|
52
|
+
licenses: []
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
segments:
|
64
|
+
- 0
|
65
|
+
hash: -785318963478006114
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
hash: -785318963478006114
|
75
|
+
requirements: []
|
76
|
+
rubyforge_project:
|
77
|
+
rubygems_version: 1.8.25
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: A more powerful CSV file processor
|
81
|
+
test_files:
|
82
|
+
- spec/file_processor/csv_spec.rb
|
83
|
+
- spec/file_processor/temp_file_spec.rb
|
84
|
+
- spec/spec_helper.rb
|
85
|
+
- spec/support/fixtures.rb
|
86
|
+
- spec/support/fixtures/base-iso-8859-1.csv
|
87
|
+
- spec/support/fixtures/base-iso-8859-1.csv.gz
|
88
|
+
- spec/support/fixtures/base-new-line-in-field.csv
|
89
|
+
- spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv
|
90
|
+
- spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv
|
91
|
+
- spec/support/fixtures/base-utf-8.csv
|
92
|
+
- spec/support/fixtures/base-with-blank-lines.csv
|
93
|
+
- spec/support/fixtures/base-with-comma-separated-header.csv
|
94
|
+
- spec/support/fixtures/base-with-lines-with-no-data.csv
|
95
|
+
- spec/support/fixtures/base-with-unknown-column-separator.csv
|
96
|
+
- spec/support/fixtures/base.csv
|
97
|
+
- spec/support/fixtures/base.csv.gz
|