file_processor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.rspec +4 -0
- data/.rvmrc +1 -0
- data/.travis.yml +3 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +42 -0
- data/LICENSE.txt +22 -0
- data/README.md +56 -0
- data/Rakefile +5 -0
- data/file_processor.gemspec +18 -0
- data/lib/file_processor/csv.rb +186 -0
- data/lib/file_processor/temp_file.rb +20 -0
- data/lib/file_processor/version.rb +3 -0
- data/lib/file_processor.rb +12 -0
- data/spec/file_processor/csv_spec.rb +449 -0
- data/spec/file_processor/temp_file_spec.rb +51 -0
- data/spec/spec_helper.rb +16 -0
- data/spec/support/fixtures/base-iso-8859-1.csv +2 -0
- data/spec/support/fixtures/base-iso-8859-1.csv.gz +0 -0
- data/spec/support/fixtures/base-new-line-in-field.csv +9 -0
- data/spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv +2 -0
- data/spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv +2 -0
- data/spec/support/fixtures/base-utf-8.csv +2 -0
- data/spec/support/fixtures/base-with-blank-lines.csv +7 -0
- data/spec/support/fixtures/base-with-comma-separated-header.csv +2 -0
- data/spec/support/fixtures/base-with-lines-with-no-data.csv +7 -0
- data/spec/support/fixtures/base-with-unknown-column-separator.csv +2 -0
- data/spec/support/fixtures/base.csv +5 -0
- data/spec/support/fixtures/base.csv.gz +0 -0
- data/spec/support/fixtures.rb +13 -0
- metadata +97 -0
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm 1.9.3@file_processor --create
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in file_processor.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
gem 'rake'
|
7
|
+
|
8
|
+
group(:development) do
|
9
|
+
gem 'debugger'
|
10
|
+
end
|
11
|
+
|
12
|
+
group(:test) do
|
13
|
+
gem 'rspec', "~> 2.14.0.rc1"
|
14
|
+
gem 'simplecov'
|
15
|
+
gem 'json', '~> 1.7.7'
|
16
|
+
end
|
17
|
+
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
file_processor (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
columnize (0.3.6)
|
10
|
+
debugger (1.6.0)
|
11
|
+
columnize (>= 0.3.1)
|
12
|
+
debugger-linecache (~> 1.2.0)
|
13
|
+
debugger-ruby_core_source (~> 1.2.1)
|
14
|
+
debugger-linecache (1.2.0)
|
15
|
+
debugger-ruby_core_source (1.2.2)
|
16
|
+
diff-lcs (1.2.4)
|
17
|
+
json (1.7.7)
|
18
|
+
multi_json (1.7.4)
|
19
|
+
rake (10.0.4)
|
20
|
+
rspec (2.14.0.rc1)
|
21
|
+
rspec-core (= 2.14.0.rc1)
|
22
|
+
rspec-expectations (= 2.14.0.rc1)
|
23
|
+
rspec-mocks (= 2.14.0.rc1)
|
24
|
+
rspec-core (2.14.0.rc1)
|
25
|
+
rspec-expectations (2.14.0.rc1)
|
26
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
27
|
+
rspec-mocks (2.14.0.rc1)
|
28
|
+
simplecov (0.7.1)
|
29
|
+
multi_json (~> 1.0)
|
30
|
+
simplecov-html (~> 0.7.1)
|
31
|
+
simplecov-html (0.7.1)
|
32
|
+
|
33
|
+
PLATFORMS
|
34
|
+
ruby
|
35
|
+
|
36
|
+
DEPENDENCIES
|
37
|
+
debugger
|
38
|
+
file_processor!
|
39
|
+
json (~> 1.7.7)
|
40
|
+
rake
|
41
|
+
rspec (~> 2.14.0.rc1)
|
42
|
+
simplecov
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Vicente Mundim
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# FileProcessor
|
2
|
+
|
3
|
+
[](https://travis-ci.org/dtmconsultoria/file_processor)
|
4
|
+
|
5
|
+
A more powerful CSV file processor
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
FileProcessor uses the new CSV library introduced in Ruby 1.9.3, thus it is only compatible with this Ruby version.
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
gem 'file_processor'
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install file_processor
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Use it as you would use Ruby's CSV:
|
26
|
+
|
27
|
+
FileProcessor::CSV.open(filename, options) do |csv|
|
28
|
+
csv.each do |row|
|
29
|
+
# process row here
|
30
|
+
end
|
31
|
+
end # automatically closes the file
|
32
|
+
|
33
|
+
FileProcessor::CSV is just a wrapper around Ruby's CSV, so you can manipulate it as you would manipulate Ruby's CSV.
|
34
|
+
|
35
|
+
You can also use `FileProcessor::CSV#process_range` to process a range in the file:
|
36
|
+
|
37
|
+
FileProcessor::CSV.open(filename, options) do |csv|
|
38
|
+
csv.process_range(offset: 2000, limit: 1000) do |row, index|
|
39
|
+
# yields 1000 rows starting from line 2000 (i.e., from line 2000 to line 2999)
|
40
|
+
end
|
41
|
+
end # automatically closes the file
|
42
|
+
|
43
|
+
Here are the added features:
|
44
|
+
|
45
|
+
* Auto-detect encoding of UTF-8 and ISO-8859-1 (Latin1) files.
|
46
|
+
* Auto-detect the column separator (`col_sep` option) when not given.
|
47
|
+
* Skips lines without data when `skip_blanks` is `true`, which is the default. This means that `count` will not take these lines into account, and they are also skipped when iterating through lines.
|
48
|
+
* Detects whether a file is gzipped and decompresses it for you automatically.
|
49
|
+
|
50
|
+
## Contributing
|
51
|
+
|
52
|
+
1. Fork it
|
53
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
54
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
55
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
56
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'file_processor/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "file_processor"
|
8
|
+
gem.version = FileProcessor::VERSION
|
9
|
+
gem.authors = ["Vicente Mundim"]
|
10
|
+
gem.email = ["vicente.mundim@gmail.com"]
|
11
|
+
gem.description = %q{A more powerful CSV file processor}
|
12
|
+
gem.summary = %q{A more powerful CSV file processor}
|
13
|
+
|
14
|
+
gem.files = `git ls-files`.split($/)
|
15
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
16
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
|
+
gem.require_paths = ["lib"]
|
18
|
+
end
|
@@ -0,0 +1,186 @@
|
|
1
|
+
module FileProcessor
|
2
|
+
class CSV < SimpleDelegator
|
3
|
+
include Enumerable
|
4
|
+
|
5
|
+
# Opens a file and yields it, ensuring that it is properly closed.
|
6
|
+
def self.open(*args)
|
7
|
+
instance = new(*args)
|
8
|
+
|
9
|
+
if block_given?
|
10
|
+
begin
|
11
|
+
yield instance
|
12
|
+
ensure
|
13
|
+
instance.close if instance
|
14
|
+
end
|
15
|
+
else
|
16
|
+
instance
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
attr_accessor :detected_encoding
|
21
|
+
|
22
|
+
def initialize(filename, options={})
|
23
|
+
@gzipped = options.delete(:gzipped)
|
24
|
+
|
25
|
+
load(filename, options.delete(:open_options))
|
26
|
+
|
27
|
+
@options = default_options.merge(options)
|
28
|
+
|
29
|
+
@options[:encoding] ||= detect_encoding
|
30
|
+
@detected_encoding ||= Encoding.find(@options[:encoding])
|
31
|
+
|
32
|
+
tempfile.reopen(detected_mode) if tempfile.closed?
|
33
|
+
|
34
|
+
@options[:col_sep] ||= detect_column_separator
|
35
|
+
|
36
|
+
super(::CSV.new(tempfile, @options))
|
37
|
+
end
|
38
|
+
|
39
|
+
# Counts the number of rows in the file, even if it has already been read
|
40
|
+
#
|
41
|
+
# @return [ Integer ] the number of rows in the file
|
42
|
+
def total_count(&block)
|
43
|
+
rewind
|
44
|
+
count(&block)
|
45
|
+
ensure
|
46
|
+
rewind
|
47
|
+
end
|
48
|
+
|
49
|
+
#
|
50
|
+
# Yields each row of the data source in turn, skipping blanks and rows with no data.
|
51
|
+
#
|
52
|
+
# Support for Enumerable.
|
53
|
+
#
|
54
|
+
# The data source must be open for reading.
|
55
|
+
#
|
56
|
+
def each
|
57
|
+
if block_given?
|
58
|
+
while row = shift
|
59
|
+
yield row unless skip_blanks? && row_with_no_data?(row)
|
60
|
+
end
|
61
|
+
else
|
62
|
+
to_enum
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
# Process a range of lines in the CSV file.
|
67
|
+
#
|
68
|
+
# @example Process 1000 lines starting from the line 2000
|
69
|
+
# csv.process_range(offset: 2000, limit: 1000) do |row, index|
|
70
|
+
# # process range here
|
71
|
+
# end
|
72
|
+
#
|
73
|
+
# @param [ Hash ] options A hash with offset and/or limit
|
74
|
+
#
|
75
|
+
# @option options [ Integer ] :offset The offset from which the process should start
|
76
|
+
# @option options [ Integer ] :limit The number of rows to process
|
77
|
+
#
|
78
|
+
# @return [ Enumerable ] CSV's enumerable
|
79
|
+
# Process a range of rows in the CSV file.
#
# The file is rewound before and after the iteration, so the method can be
# called repeatedly. Indexes are zero-based and count the rows produced by
# +each_with_index+.
#
# When no block is given an Enumerator over the same [row, index] pairs is
# returned instead of failing with LocalJumpError on the first row.
#
# @example Process 1000 rows starting from row 2000
#   csv.process_range(offset: 2000, limit: 1000) do |row, index|
#     # process range here
#   end
#
# @param [ Hash ] options A hash with offset and/or limit (nil is accepted)
#
# @option options [ Integer ] :offset The index of the first row to yield (default 0)
# @option options [ Integer ] :limit The number of rows to yield (default: no limit)
#
# @return [ Enumerator ] when called without a block
def process_range(options={})
  options ||= {}
  return enum_for(:process_range, options) unless block_given?

  offset = options[:offset] || 0
  limit  = options[:limit]  || -1 # a negative limit means "no limit"

  rewind
  each_with_index do |row, index|
    next if index < offset
    break if limit >= 0 && index >= offset + limit

    yield row, index
  end
ensure
  # leave the file ready for the next reader, even when the block raised
  rewind
end
|
95
|
+
|
96
|
+
# Returns true when the file is gzipped, false otherwise
|
97
|
+
def gzipped?
|
98
|
+
@gzipped
|
99
|
+
end
|
100
|
+
|
101
|
+
private
|
102
|
+
|
103
|
+
def detect_compression?
|
104
|
+
@gzipped.nil?
|
105
|
+
end
|
106
|
+
|
107
|
+
# Tells whether a row carries no data at all.
#
# Accepts either a plain Array of fields or any object responding to
# +fields+ (the rows are unwrapped through that method when available).
# A field is blank when it is nil, or when it responds to +empty?+ and is
# empty. Fields that cannot be empty (for instance numbers produced by CSV
# converters) count as data instead of raising NoMethodError.
#
# @param [ Array, #fields ] row the row to inspect
#
# @return [ true, false ] true when every field is blank
def row_with_no_data?(row)
  fields = row.respond_to?(:fields) ? row.fields : row

  fields.all? do |column|
    column.nil? || (column.respond_to?(:empty?) && column.empty?)
  end
end
|
111
|
+
|
112
|
+
# Copies the contents of +filename+ into the internal tempfile, passing the
# opened io through +decompress+ first.
#
# Every io opened here is closed before returning, and
# Encoding.default_internal is restored to the value it had on entry, even
# when opening or reading fails. The original failure (e.g. Errno::ENOENT)
# is propagated untouched.
#
# NOTE(review): Kernel.open treats a name starting with "|" as a command to
# run, so +filename+ must not come from untrusted input.
#
# @param [ String ] filename the file to load
# @param [ Hash, nil ] open_options extra options forwarded to Kernel.open
def load(filename, open_options)
  # Captured before anything can fail, so the ensure clause never restores
  # a stale or missing value.
  @original_default_internal = Encoding.default_internal
  source_io = loaded_io = nil

  # Only forward the options when there are some: an empty positional Hash
  # adds nothing and is rejected by rubies that separate keyword arguments.
  open_args = [filename, 'rb']
  open_args << open_options unless open_options.nil? || open_options == {}

  source_io = ::Kernel.open(*open_args)
  loaded_io = decompress(source_io)
  loaded_io.rewind

  # presumably disabled so the lines are copied without being transcoded
  Encoding.default_internal = nil

  loaded_io.each do |line|
    tempfile.write(line)
  end
ensure
  tempfile.close
  # decompress may hand back a different io than the one opened here;
  # close whichever of them is still open.
  [loaded_io, source_io].compact.uniq.each do |io|
    io.close unless io.closed?
  end
  Encoding.default_internal = @original_default_internal
end
|
127
|
+
|
128
|
+
# Wraps +loaded_io+ in a gzip reader when the data is (or may be) gzipped.
#
# Compression is probed when no explicit +gzipped+ option was given, or
# assumed when the option was true. The probe reads one character through
# the gzip reader; any Zlib::Error means the data is not gzipped and the
# original io is returned as is. @gzipped is updated with the outcome.
#
# The reader wraps the given io itself (Zlib::GzipReader.new) rather than
# reopening it by path, so closing the reader also closes +loaded_io+ and
# ios without a path (e.g. StringIO) are supported.
#
# @param [ IO ] loaded_io the io opened for the source file
#
# @return [ Zlib::GzipReader, IO ] a gzip reader over the io, or the io itself
def decompress(loaded_io)
  if detect_compression? || gzipped?
    Zlib::GzipReader.new(loaded_io).tap do |decompressed_io|
      decompressed_io.getc # attempt to read from a compressed io
      @gzipped = true
    end
  else
    @gzipped = false
    loaded_io
  end
rescue Zlib::Error
  # not a compressed io, just returning the loaded io instead
  @gzipped = false
  loaded_io
end
|
143
|
+
|
144
|
+
# We open the file and try to read each line of it, if there is an
|
145
|
+
# invalid byte sequence, an ArgumentError exception will be thrown.
|
146
|
+
#
|
147
|
+
# We then assume that the file is in ISO-8859-1 encoding, and transcode
|
148
|
+
# it to UTF-8. Though it's ugly, this was the only way to detect whether
|
149
|
+
# a file was using one of these encodings.
|
150
|
+
def detect_encoding
|
151
|
+
tempfile.reopen('r:utf-8')
|
152
|
+
tempfile.each(&:split) # raises ArgumentError when it has non-ascii characters that are not in UTF-8
|
153
|
+
|
154
|
+
@detected_encoding = Encoding.find('utf-8')
|
155
|
+
rescue ArgumentError
|
156
|
+
tempfile.reopen('r:iso-8859-1:utf-8')
|
157
|
+
@detected_encoding = Encoding.find('iso-8859-1')
|
158
|
+
ensure
|
159
|
+
tempfile.rewind
|
160
|
+
end
|
161
|
+
|
162
|
+
def detected_utf_8?
|
163
|
+
detected_encoding == Encoding.find('utf-8')
|
164
|
+
end
|
165
|
+
|
166
|
+
def detected_mode
|
167
|
+
detected_utf_8? ? 'r:utf-8' : 'r:iso-8859-1:utf-8'
|
168
|
+
end
|
169
|
+
|
170
|
+
# Guesses the column separator from the first line of the tempfile.
#
# A semicolon is chosen when splitting the first line on ';' yields more
# than one piece; otherwise the comma is used. An empty file has no first
# line and falls back to the comma as well instead of raising
# NoMethodError. The tempfile is always rewound afterwards.
#
# @return [ String ] the detected separator, ';' or ','
def detect_column_separator
  first_line = tempfile.gets.to_s # gets returns nil on an empty file
  @col_sep = first_line.split(';').size > 1 ? ';' : ','
ensure
  tempfile.rewind
end
|
175
|
+
|
176
|
+
def default_options
|
177
|
+
{
|
178
|
+
skip_blanks: true
|
179
|
+
}
|
180
|
+
end
|
181
|
+
|
182
|
+
def tempfile
|
183
|
+
@tempfile ||= FileProcessor::Tempfile.new
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module FileProcessor
|
2
|
+
class Tempfile < ::Tempfile
|
3
|
+
def initialize(basename='file-processor', *args)
|
4
|
+
super(basename, *args)
|
5
|
+
end
|
6
|
+
|
7
|
+
def path
|
8
|
+
@tmpname
|
9
|
+
end
|
10
|
+
|
11
|
+
def reopen(mode)
|
12
|
+
close unless closed?
|
13
|
+
@mode = mode
|
14
|
+
|
15
|
+
@tmpfile = File.open(path, mode, @opts)
|
16
|
+
@data[1] = @tmpfile
|
17
|
+
__setobj__(@tmpfile)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,449 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe FileProcessor::CSV do
|
4
|
+
let(:filename) { fixture('base.csv') }
|
5
|
+
let(:options) { {} }
|
6
|
+
|
7
|
+
subject(:processor) { FileProcessor::CSV.new(filename, options) }
|
8
|
+
|
9
|
+
it "delegates to a CSV instance" do
|
10
|
+
processor.__getobj__.should be_a(::CSV)
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "#col_sep" do
|
14
|
+
context "when it is not given" do
|
15
|
+
context "and the first line of the file has more than one header column separated with a semi-colon" do
|
16
|
+
it "detects it properly" do
|
17
|
+
processor.col_sep.should eq(';')
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
context "and the first line of the file has more than one header column separated with a comman" do
|
22
|
+
let(:filename) { fixture('base-with-comma-separated-header.csv') }
|
23
|
+
|
24
|
+
it "detects it properly" do
|
25
|
+
processor.col_sep.should eq(',')
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
context "and an unknown column separator is used" do
|
30
|
+
let(:filename) { fixture('base-with-unknown-column-separator.csv') }
|
31
|
+
|
32
|
+
it "does not detects it, falling back to the default one" do
|
33
|
+
processor.col_sep.should eq(',')
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
context "and the file has non-ascii characters in its first line" do
|
38
|
+
context "in UTF-8" do
|
39
|
+
let(:filename) { fixture('base-non-ascii-characters-in-header-utf-8.csv') }
|
40
|
+
|
41
|
+
it "detects it properly" do
|
42
|
+
processor.col_sep.should eq(';')
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
context "in ISO-8859-1" do
|
47
|
+
let(:filename) { fixture('base-non-ascii-characters-in-header-iso-8859-1.csv') }
|
48
|
+
|
49
|
+
it "detects it properly" do
|
50
|
+
processor.col_sep.should eq(';')
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
context "when it is given" do
|
57
|
+
let(:options) { { col_sep: '|' } }
|
58
|
+
|
59
|
+
it "uses the given col_sep" do
|
60
|
+
processor.col_sep.should eq('|')
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
describe "#count" do
|
66
|
+
it "returns the number of rows in the CSV" do
|
67
|
+
processor.count.should eq(5)
|
68
|
+
end
|
69
|
+
|
70
|
+
context "when the file has new line characters in a field, but it is properly quoted" do
|
71
|
+
let(:filename) { fixture('base-new-line-in-field.csv') }
|
72
|
+
|
73
|
+
it "returns the correct number of rows in the CSV" do
|
74
|
+
processor.count.should eq(3)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
context "when the file has blank lines" do
|
79
|
+
let(:filename) { fixture('base-with-blank-lines.csv') }
|
80
|
+
|
81
|
+
it "skips them by default" do
|
82
|
+
processor.count.should eq(5)
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
context "when the file has lines with no data" do
|
87
|
+
let(:filename) { fixture('base-with-lines-with-no-data.csv') }
|
88
|
+
|
89
|
+
it "does not count them" do
|
90
|
+
processor.count.should eq(5)
|
91
|
+
end
|
92
|
+
|
93
|
+
context "but skip_blanks is false" do
|
94
|
+
let(:options) { { skip_blanks: false } }
|
95
|
+
|
96
|
+
it "does counts them" do
|
97
|
+
processor.count.should eq(7)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
context "and { headers: true } is passed" do
|
102
|
+
let(:options) { { headers: true } }
|
103
|
+
|
104
|
+
it "does not count these lines, as well as the header" do
|
105
|
+
processor.count.should eq(4)
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
context "when a block is passed" do
|
111
|
+
let(:filename) { fixture('base-with-lines-with-no-data.csv') }
|
112
|
+
|
113
|
+
it "returns the number of lines for which the block evaluates to true, properly handling lines with no data" do
|
114
|
+
processor.count { |row| !row.first.nil? }.should eq(3)
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
describe "#total_count" do
|
120
|
+
it "works as count, but returns all rows, even when called multiple times, since it rewinds the io file" do
|
121
|
+
processor.total_count
|
122
|
+
processor.total_count.should eq(5)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
describe "#each" do
|
127
|
+
it "returns an enumerator when called without a block" do
|
128
|
+
processor.each.should be_a(Enumerator)
|
129
|
+
end
|
130
|
+
|
131
|
+
context "when the file has lines with no data" do
|
132
|
+
let(:filename) { fixture('base-with-lines-with-no-data.csv') }
|
133
|
+
|
134
|
+
it "does not yields these lines" do
|
135
|
+
expect { |block|
|
136
|
+
processor.each(&block)
|
137
|
+
}.to yield_control.exactly(5).times
|
138
|
+
end
|
139
|
+
|
140
|
+
context "but skip_blanks is false" do
|
141
|
+
let(:options) { { skip_blanks: false } }
|
142
|
+
|
143
|
+
it "yields these lines" do
|
144
|
+
expect { |block|
|
145
|
+
processor.each(&block)
|
146
|
+
}.to yield_control.exactly(7).times
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
context "and { headers: true } is passed" do
|
151
|
+
let(:options) { { headers: true } }
|
152
|
+
|
153
|
+
it "does not yields these lines, as well as the header" do
|
154
|
+
expect { |block|
|
155
|
+
processor.each(&block)
|
156
|
+
}.to yield_control.exactly(4).times # header do not count here
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
describe "encoding" do
|
163
|
+
it "can iterate through all of its contents without raising an error" do
|
164
|
+
expect {
|
165
|
+
processor.each {}
|
166
|
+
}.to_not raise_error
|
167
|
+
end
|
168
|
+
|
169
|
+
context "when the encoding is given" do
|
170
|
+
let(:filename) { fixture('base-utf-8.csv') }
|
171
|
+
let(:options) { { encoding: 'utf-8' } }
|
172
|
+
|
173
|
+
its(:detected_encoding) { should eq(Encoding.find(options[:encoding])) }
|
174
|
+
|
175
|
+
it "opens the file properly" do
|
176
|
+
expect {
|
177
|
+
processor
|
178
|
+
}.to_not raise_error
|
179
|
+
end
|
180
|
+
|
181
|
+
context "and the file is ISO-8859-1" do
|
182
|
+
let(:filename) { fixture('base-iso-8859-1.csv') }
|
183
|
+
|
184
|
+
it "uses it to open the file, raising an error" do
|
185
|
+
expect {
|
186
|
+
processor
|
187
|
+
}.to raise_error
|
188
|
+
end
|
189
|
+
|
190
|
+
context "but the given encoding is ISO-8859-1" do
|
191
|
+
let(:options) { { encoding: 'ISO-8859-1' } }
|
192
|
+
|
193
|
+
its(:detected_encoding) { should eq(Encoding.find(options[:encoding])) }
|
194
|
+
|
195
|
+
it "opens the file properly" do
|
196
|
+
expect {
|
197
|
+
processor
|
198
|
+
}.to_not raise_error
|
199
|
+
end
|
200
|
+
end
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
context "when the file is in US-ASCII" do
|
205
|
+
its(:detected_encoding) { should eq(Encoding.find('utf-8')) }
|
206
|
+
|
207
|
+
it "reads it with utf-8" do
|
208
|
+
processor.encoding.should eq(Encoding.find('utf-8'))
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
context "when the file can be read in utf-8" do
|
213
|
+
let(:filename) { fixture('base-utf-8.csv') }
|
214
|
+
|
215
|
+
its(:detected_encoding) { should eq(Encoding.find('utf-8')) }
|
216
|
+
|
217
|
+
it "properly detects it" do
|
218
|
+
processor.encoding.should eq(Encoding.find('utf-8'))
|
219
|
+
end
|
220
|
+
|
221
|
+
it "can iterate through all of its contents without raising an error" do
|
222
|
+
expect {
|
223
|
+
processor.each {}
|
224
|
+
}.to_not raise_error
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
228
|
+
context "when the file cannot be read in utf-8" do
|
229
|
+
context "but it can be read in iso-8859-1" do
|
230
|
+
let(:filename) { fixture('base-iso-8859-1.csv') }
|
231
|
+
|
232
|
+
its(:detected_encoding) { should eq(Encoding.find('iso-8859-1')) }
|
233
|
+
|
234
|
+
it "properly detects it, transcoding it to utf-8" do
|
235
|
+
processor.encoding.should eq(Encoding.find('utf-8'))
|
236
|
+
end
|
237
|
+
|
238
|
+
it "can iterate through all of its contents without raising an error" do
|
239
|
+
expect {
|
240
|
+
processor.each {}
|
241
|
+
}.to_not raise_error
|
242
|
+
end
|
243
|
+
|
244
|
+
context "and no look-ahead is used" do
|
245
|
+
let(:options) { { row_sep: "\n" } }
|
246
|
+
|
247
|
+
its(:detected_encoding) { should eq(Encoding.find('iso-8859-1')) }
|
248
|
+
|
249
|
+
it "properly detects it, transcoding it to utf-8" do
|
250
|
+
processor.encoding.should eq(Encoding.find('utf-8'))
|
251
|
+
end
|
252
|
+
|
253
|
+
it "can iterate through all of its contents without raising an error" do
|
254
|
+
expect {
|
255
|
+
processor.each {}
|
256
|
+
}.to_not raise_error
|
257
|
+
end
|
258
|
+
end
|
259
|
+
end
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
describe "gzip support" do
|
264
|
+
let(:filename) { fixture('base.csv.gz') }
|
265
|
+
|
266
|
+
it "detects that the file is gzipped and decompress it" do
|
267
|
+
processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
|
268
|
+
end
|
269
|
+
|
270
|
+
it { should be_gzipped }
|
271
|
+
|
272
|
+
context "when the file is in ISO-8859-1 encoding" do
|
273
|
+
let(:filename) { fixture('base-iso-8859-1.csv.gz') }
|
274
|
+
|
275
|
+
it "detects that the file is gzipped and decompress it" do
|
276
|
+
processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
|
277
|
+
end
|
278
|
+
|
279
|
+
it { should be_gzipped }
|
280
|
+
end
|
281
|
+
|
282
|
+
context "when { gzipped: false } options is passed" do
|
283
|
+
let(:options) { { gzipped: false } }
|
284
|
+
|
285
|
+
context "and the file is not gzipped" do
|
286
|
+
let(:filename) { fixture('base.csv') }
|
287
|
+
|
288
|
+
it { should_not be_gzipped }
|
289
|
+
|
290
|
+
it "does not raise an error" do
|
291
|
+
expect {
|
292
|
+
processor.shift
|
293
|
+
}.to_not raise_error
|
294
|
+
end
|
295
|
+
end
|
296
|
+
|
297
|
+
context "and the file is gzipped" do
|
298
|
+
it "does not attempt to detect it, reading data as it were UTF-8" do
|
299
|
+
processor.shift.should_not eq(['A', 'B', 'C'])
|
300
|
+
end
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
304
|
+
context "when { gzipped: true } option is passed" do
|
305
|
+
let(:options) { { gzipped: true } }
|
306
|
+
|
307
|
+
context "and the file is not gzipped" do
|
308
|
+
let(:filename) { fixture('base.csv') }
|
309
|
+
|
310
|
+
it { should_not be_gzipped }
|
311
|
+
|
312
|
+
it "does not raise an error" do
|
313
|
+
expect {
|
314
|
+
processor.shift
|
315
|
+
}.to_not raise_error
|
316
|
+
end
|
317
|
+
end
|
318
|
+
|
319
|
+
context "and the file is gzipped" do
|
320
|
+
it "properly assumes that the file is gzipped and decompress it" do
|
321
|
+
processor.shift.should eq(['A', 'B', 'C']) # first line decompressed
|
322
|
+
end
|
323
|
+
end
|
324
|
+
end
|
325
|
+
end
|
326
|
+
|
327
|
+
describe "#process_range" do
|
328
|
+
it "yields every line of the file by default" do
|
329
|
+
expect { |block|
|
330
|
+
processor.process_range(&block)
|
331
|
+
}.to yield_control.exactly(5).times
|
332
|
+
end
|
333
|
+
|
334
|
+
it "yields the row and its index" do
|
335
|
+
expect { |block|
|
336
|
+
processor.process_range(&block)
|
337
|
+
}.to yield_successive_args(
|
338
|
+
[["A", "B", "C"], 0],
|
339
|
+
[["a1", "b1", "c1"], 1],
|
340
|
+
[["a2", "b2", "c2"], 2],
|
341
|
+
[["a3", "b3", "c3"], 3],
|
342
|
+
[["a4", "b4", "c4"], 4]
|
343
|
+
)
|
344
|
+
end
|
345
|
+
|
346
|
+
it "rewinds the file, so it can be called multiple times" do
|
347
|
+
processor.process_range {}
|
348
|
+
|
349
|
+
expect { |block|
|
350
|
+
processor.process_range(&block)
|
351
|
+
}.to yield_successive_args(
|
352
|
+
[["A", "B", "C"], 0],
|
353
|
+
[["a1", "b1", "c1"], 1],
|
354
|
+
[["a2", "b2", "c2"], 2],
|
355
|
+
[["a3", "b3", "c3"], 3],
|
356
|
+
[["a4", "b4", "c4"], 4]
|
357
|
+
)
|
358
|
+
end
|
359
|
+
|
360
|
+
context "when an offset is given" do
|
361
|
+
let(:offset) { 2 }
|
362
|
+
|
363
|
+
it "starts from this offset" do
|
364
|
+
expect { |block|
|
365
|
+
processor.process_range(offset: offset, &block)
|
366
|
+
}.to yield_successive_args(
|
367
|
+
[["a2", "b2", "c2"], 2],
|
368
|
+
[["a3", "b3", "c3"], 3],
|
369
|
+
[["a4", "b4", "c4"], 4]
|
370
|
+
)
|
371
|
+
end
|
372
|
+
|
373
|
+
context "and it is equal to the number of lines of the file" do
|
374
|
+
let(:offset) { processor.count }
|
375
|
+
|
376
|
+
it "does not yield" do
|
377
|
+
expect { |block|
|
378
|
+
processor.process_range(offset: offset, &block)
|
379
|
+
}.to_not yield_control
|
380
|
+
end
|
381
|
+
end
|
382
|
+
|
383
|
+
context "and it is greater than to the number of lines of the file" do
|
384
|
+
let(:offset) { processor.count + 1 }
|
385
|
+
|
386
|
+
it "does not yield" do
|
387
|
+
expect { |block|
|
388
|
+
processor.process_range(offset: offset, &block)
|
389
|
+
}.to_not yield_control
|
390
|
+
end
|
391
|
+
end
|
392
|
+
end
|
393
|
+
|
394
|
+
context "when a limit is given" do
|
395
|
+
let(:limit) { 2 }
|
396
|
+
|
397
|
+
it "yields only the number of rows given" do
|
398
|
+
expect { |block|
|
399
|
+
processor.process_range(limit: limit, &block)
|
400
|
+
}.to yield_successive_args(
|
401
|
+
[["A", "B", "C"], 0],
|
402
|
+
[["a1", "b1", "c1"], 1]
|
403
|
+
)
|
404
|
+
end
|
405
|
+
|
406
|
+
context "with zero" do
|
407
|
+
let(:limit) { 0 }
|
408
|
+
|
409
|
+
it "does not yield" do
|
410
|
+
expect { |block|
|
411
|
+
processor.process_range(limit: limit, &block)
|
412
|
+
}.to_not yield_control
|
413
|
+
end
|
414
|
+
end
|
415
|
+
|
416
|
+
context "with an offset" do
|
417
|
+
let(:offset) { 2 }
|
418
|
+
|
419
|
+
it "yields only the number of rows given, from the given offset" do
|
420
|
+
expect { |block|
|
421
|
+
processor.process_range(offset: offset, limit: limit, &block)
|
422
|
+
}.to yield_successive_args(
|
423
|
+
[["a2", "b2", "c2"], 2],
|
424
|
+
[["a3", "b3", "c3"], 3]
|
425
|
+
)
|
426
|
+
end
|
427
|
+
end
|
428
|
+
end
|
429
|
+
end
|
430
|
+
|
431
|
+
describe ".open" do
|
432
|
+
subject(:processor) { double(FileProcessor::CSV, close: true) }
|
433
|
+
before { FileProcessor::CSV.stub(:new).with(filename, options).and_return(processor) }
|
434
|
+
|
435
|
+
context "without a block" do
|
436
|
+
it "creates a new instance and returns it" do
|
437
|
+
FileProcessor::CSV.open(filename, options).should eq(processor)
|
438
|
+
end
|
439
|
+
end
|
440
|
+
|
441
|
+
context "with a block" do
|
442
|
+
it "creates a new instance and returns it" do
|
443
|
+
expect { |block|
|
444
|
+
FileProcessor::CSV.open(filename, options, &block)
|
445
|
+
}.to yield_with_args(processor)
|
446
|
+
end
|
447
|
+
end
|
448
|
+
end
|
449
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe FileProcessor::Tempfile do
|
4
|
+
subject(:temp_file) { FileProcessor::Tempfile.new }
|
5
|
+
let(:generated_path) { File.join(Dir.tmpdir, 'some-path') }
|
6
|
+
|
7
|
+
it "creates the file" do
|
8
|
+
File.exists?(temp_file.path).should be_true
|
9
|
+
end
|
10
|
+
|
11
|
+
it "opens file ready to be written" do
|
12
|
+
expect {
|
13
|
+
temp_file << "some content"
|
14
|
+
}.to_not raise_error
|
15
|
+
end
|
16
|
+
|
17
|
+
describe "#path" do
|
18
|
+
it "is generated using 'file-processor' basename" do
|
19
|
+
temp_file.path.start_with?(File.join(Dir.tmpdir, 'file-processor')).should be_true
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "#reopen" do
|
24
|
+
let!(:old_file) { temp_file.__getobj__ }
|
25
|
+
|
26
|
+
it "closes the old file" do
|
27
|
+
old_file.should_receive(:close)
|
28
|
+
temp_file.reopen('r')
|
29
|
+
end
|
30
|
+
|
31
|
+
it "updates the delegated object" do
|
32
|
+
temp_file.reopen('r')
|
33
|
+
temp_file.__getobj__.should_not eq(old_file)
|
34
|
+
temp_file.__getobj__.should be_a(File)
|
35
|
+
end
|
36
|
+
|
37
|
+
it "reopens the path with the given mode" do
|
38
|
+
temp_file.stub(:path).and_return(generated_path)
|
39
|
+
File.should_receive(:open).with(generated_path, 'r:utf-8', 384)
|
40
|
+
temp_file.reopen('r:utf-8')
|
41
|
+
end
|
42
|
+
|
43
|
+
context "when the old file is already closed" do
|
44
|
+
it "does not closes the old file" do
|
45
|
+
old_file.close
|
46
|
+
old_file.should_not_receive(:close)
|
47
|
+
temp_file.reopen('r')
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require "bundler/setup"
|
2
|
+
|
3
|
+
require 'simplecov'
|
4
|
+
SimpleCov.start
|
5
|
+
|
6
|
+
Bundler.require :default
|
7
|
+
|
8
|
+
root = File.expand_path('../..', __FILE__)
|
9
|
+
|
10
|
+
Dir[File.join(root, "spec/support/**/*.rb")].each { |f| require f }
|
11
|
+
|
12
|
+
RSpec.configure do |config|
|
13
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
14
|
+
config.run_all_when_everything_filtered = true
|
15
|
+
config.filter_run :focus
|
16
|
+
end
|
Binary file
|
Binary file
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module FixturesSupport
|
2
|
+
def fixture(filename)
|
3
|
+
File.join(root_path, "/spec/support/fixtures", filename)
|
4
|
+
end
|
5
|
+
|
6
|
+
def root_path
|
7
|
+
File.expand_path('../../..', __FILE__)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
RSpec.configure do |config|
|
12
|
+
config.include FixturesSupport
|
13
|
+
end
|
metadata
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: file_processor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Vicente Mundim
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-06-14 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: A more powerful CSV file processor
|
15
|
+
email:
|
16
|
+
- vicente.mundim@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- .gitignore
|
22
|
+
- .rspec
|
23
|
+
- .rvmrc
|
24
|
+
- .travis.yml
|
25
|
+
- Gemfile
|
26
|
+
- Gemfile.lock
|
27
|
+
- LICENSE.txt
|
28
|
+
- README.md
|
29
|
+
- Rakefile
|
30
|
+
- file_processor.gemspec
|
31
|
+
- lib/file_processor.rb
|
32
|
+
- lib/file_processor/csv.rb
|
33
|
+
- lib/file_processor/temp_file.rb
|
34
|
+
- lib/file_processor/version.rb
|
35
|
+
- spec/file_processor/csv_spec.rb
|
36
|
+
- spec/file_processor/temp_file_spec.rb
|
37
|
+
- spec/spec_helper.rb
|
38
|
+
- spec/support/fixtures.rb
|
39
|
+
- spec/support/fixtures/base-iso-8859-1.csv
|
40
|
+
- spec/support/fixtures/base-iso-8859-1.csv.gz
|
41
|
+
- spec/support/fixtures/base-new-line-in-field.csv
|
42
|
+
- spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv
|
43
|
+
- spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv
|
44
|
+
- spec/support/fixtures/base-utf-8.csv
|
45
|
+
- spec/support/fixtures/base-with-blank-lines.csv
|
46
|
+
- spec/support/fixtures/base-with-comma-separated-header.csv
|
47
|
+
- spec/support/fixtures/base-with-lines-with-no-data.csv
|
48
|
+
- spec/support/fixtures/base-with-unknown-column-separator.csv
|
49
|
+
- spec/support/fixtures/base.csv
|
50
|
+
- spec/support/fixtures/base.csv.gz
|
51
|
+
homepage:
|
52
|
+
licenses: []
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
segments:
|
64
|
+
- 0
|
65
|
+
hash: -785318963478006114
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
hash: -785318963478006114
|
75
|
+
requirements: []
|
76
|
+
rubyforge_project:
|
77
|
+
rubygems_version: 1.8.25
|
78
|
+
signing_key:
|
79
|
+
specification_version: 3
|
80
|
+
summary: A more powerful CSV file processor
|
81
|
+
test_files:
|
82
|
+
- spec/file_processor/csv_spec.rb
|
83
|
+
- spec/file_processor/temp_file_spec.rb
|
84
|
+
- spec/spec_helper.rb
|
85
|
+
- spec/support/fixtures.rb
|
86
|
+
- spec/support/fixtures/base-iso-8859-1.csv
|
87
|
+
- spec/support/fixtures/base-iso-8859-1.csv.gz
|
88
|
+
- spec/support/fixtures/base-new-line-in-field.csv
|
89
|
+
- spec/support/fixtures/base-non-ascii-characters-in-header-iso-8859-1.csv
|
90
|
+
- spec/support/fixtures/base-non-ascii-characters-in-header-utf-8.csv
|
91
|
+
- spec/support/fixtures/base-utf-8.csv
|
92
|
+
- spec/support/fixtures/base-with-blank-lines.csv
|
93
|
+
- spec/support/fixtures/base-with-comma-separated-header.csv
|
94
|
+
- spec/support/fixtures/base-with-lines-with-no-data.csv
|
95
|
+
- spec/support/fixtures/base-with-unknown-column-separator.csv
|
96
|
+
- spec/support/fixtures/base.csv
|
97
|
+
- spec/support/fixtures/base.csv.gz
|