nebulous 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +34 -0
- data/.rspec +1 -0
- data/.rubocop.yml +265 -0
- data/.travis.yml +5 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/LICENSE +22 -0
- data/README.md +29 -0
- data/lib/nebulous.rb +15 -0
- data/lib/nebulous/chunk.rb +14 -0
- data/lib/nebulous/delimiter_detector.rb +68 -0
- data/lib/nebulous/parser.rb +129 -0
- data/lib/nebulous/row.rb +47 -0
- data/lib/nebulous/version.rb +3 -0
- data/nebulous.gemspec +27 -0
- data/spec/nebulous/chunk_spec.rb +6 -0
- data/spec/nebulous/delimiter_detector_spec.rb +161 -0
- data/spec/nebulous/parser_spec.rb +287 -0
- data/spec/nebulous/row_spec.rb +6 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/support/assets/clrf-batches.csv +1001 -0
- data/spec/support/assets/cr-comma-delimited.csv +1 -0
- data/spec/support/assets/cr-lf-comma-delimited.csv +1 -0
- data/spec/support/assets/crlf-comma-delimited.csv +21 -0
- data/spec/support/assets/crlf-dolla-delimited.csv +21 -0
- data/spec/support/assets/crlf-pipe-delimited.csv +21 -0
- data/spec/support/assets/crlf-semicolon-delimited.csv +21 -0
- data/spec/support/assets/crlf-tab-delimited.tsv +21 -0
- data/spec/support/assets/no-headers.csv +20 -0
- metadata +171 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
module Nebulous
  # Guesses the column separator and the line separator of a delimited text
  # file.
  #
  # The column separator is chosen by counting each candidate delimiter on the
  # first line of the file. The line separator is chosen by matching the
  # output of the system `file` command against a list of patterns.
  #
  # Recognised options (all optional):
  #   :line_delimiters   - list of [pattern, separator] pairs
  #   :column_delimiters - list of candidate column separators
  #   :encoding          - encoding name used when reading the first line
  class DelimiterDetector
    # Pairs of [pattern tested against the `file` output, separator to use].
    # The first matching pair wins; the first pair is also the fallback.
    # NOTE(review): the patterns presumably target phrases such as
    # "with CRLF line terminators" printed by `file` - confirm on the target
    # platform. CRLF maps to "\n", which the specs in this project expect.
    LINE_DELIMITERS = [
      [/CRLF/, "\n"],
      [/CR, LF/, "\r"],
      [/CR(?!,)/, "\r"]
    ].freeze

    # Candidate column separators, tried in this order.
    COLUMN_DELIMITERS = [',', ';', "\t", '|'].freeze

    attr_reader :path

    # @param path [String] path of the file to inspect
    # @param args [Array] an optional trailing options hash (see class docs)
    # @raise [ArgumentError] when no file exists at +path+
    def initialize(path, *args)
      @path = path
      @options = args.extract_options!

      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
      raise ArgumentError, "no such file: #{@path}" unless File.exist?(@path)
    end

    # @return [Hash] the detected separators as { col_sep: ..., row_sep: ... }
    def detect
      { col_sep: detect_column_delimiter,
        row_sep: detect_line_delimiter }
    end

    # Counts how often each candidate splits the first line and returns the
    # candidate with the highest count.
    #
    # @return [String] the winning column delimiter
    def detect_column_delimiter
      first_line = readline

      column_delimiters.each_with_index do |candidate, index|
        counts[index] = first_line.split(candidate).length - 1
      end

      # Comparing [count, index] pairs means ties go to the candidate that
      # appears later in the list.
      winner = counts.each_with_index.max[1]
      column_delimiters[winner]
    end

    # Runs `file` on the path and returns the separator of the first pattern
    # matching its output, or the first configured separator when none match.
    #
    # @return [String] the detected line delimiter
    def detect_line_delimiter
      res = Cocaine::CommandLine.new('file', ':path').run(path: path).chomp

      matches = line_delimiters.map do |pattern, separator|
        separator if res =~ pattern
      end.compact

      matches.first || line_delimiters[0][1]
    end

    private

    def line_delimiters
      @options.fetch(:line_delimiters, LINE_DELIMITERS)
    end

    def column_delimiters
      @options.fetch(:column_delimiters, COLUMN_DELIMITERS)
    end

    def encoding
      @options.fetch(:encoding, Encoding::UTF_8.to_s)
    end

    # One counter per candidate column delimiter, initialised to zero.
    def counts
      @counts ||= column_delimiters.map { 0 }
    end

    # Reads the first line of the file (terminator included) and transcodes it
    # to the configured encoding.
    def readline
      File.open(path, &:readline).encode(encoding, invalid: :replace)
    end
  end
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
module Nebulous
  # Reads a delimited text file line by line and turns every line into a row,
  # optionally keyed by the header line, optionally yielded in chunks.
  #
  # Collaborators defined elsewhere in the project: Chunk (used through `<<`,
  # `full?`, `map` and `to_a`), Row and DelimiterDetector.
  class Parser
    # Defaults merged with the options given to #initialize. A nil col_sep or
    # row_sep is filled in from DelimiterDetector.
    DEFAULT_OPTIONS = {
      col_sep: nil,
      row_sep: nil,
      quote_char: '"',
      comment_exp: /^#/,
      chunk: false,
      headers: true,
      mapping: nil,
      limit: false,
      remove_empty_values: true,
      encoding: Encoding::UTF_8.to_s
    }.freeze

    attr_reader :file
    attr_reader :options

    # @param file [String, #readline] a path to open, or an already opened
    #   IO-like object
    # @param args [Array] an optional trailing options hash overriding
    #   DEFAULT_OPTIONS
    def initialize(file, *args)
      opts = args.extract_options!

      # Options must be set first: read_input needs the encoding.
      @options = OpenStruct.new DEFAULT_OPTIONS.merge(opts)
      @file = read_input(file)

      merge_delimiters
    end

    # Parses the file from its current position. When the :chunk option is set
    # and a block is given, full chunks (and the last partial one) are yielded
    # to the block.
    #
    # The parser state is reset and the file rewound afterwards, even when
    # parsing raises.
    #
    # @return [Array] the rows still held in the current chunk
    def process(&block)
      @index = 0
      read_headers
      iterate(&block)
    ensure
      reset
      file.rewind
    end

    # @return [Hash] separators detected for the file, memoized
    def delimiters
      @delimiters ||= DelimiterDetector.new(file.path).detect
    end

    private

    def reset
      @index = 0
      @headers = nil
      @chunk = nil
    end

    def chunk
      @chunk ||= Chunk.new chunk_options
    end

    # Consumes the first line as headers when the :headers option is truthy.
    def read_headers
      @headers ||= Row.headers(readline, options) if options[:headers]
    end

    def iterate(&block)
      until file.eof?
        break if limit?
        chunk << replace_keys(parse_row.merge(@headers))
        yield_chunk(chunk, &block) if block_given? && options.chunk
      end

      # @chunk is nil when nothing was read or the last chunk was just
      # yielded; nil.to_a gives an empty array in that case.
      @chunk.to_a
    end

    # Advances the count of rows read so far.
    def sequence
      @index += 1
    end

    def limit?
      options.limit && options.limit == @index
    end

    def parse_row
      sequence
      Row.parse(read_complete_line, options)
    end

    # Yields the chunk once it is full or the file is exhausted, then starts
    # a fresh chunk.
    def yield_chunk(chunk, &_block)
      if chunk.full? || file.eof?
        yield chunk.map(&:to_a)
        @chunk = nil
      end
    end

    def read_input(input)
      input.respond_to?(:readline) ? input : File.open(input, "r:#{encoding}")
    end

    # Reads one logical line. While the number of quote characters is odd the
    # next physical line is appended, so a quoted field may span lines.
    #
    # Reading stops at end of file: an unbalanced quote on the last line is
    # returned as is instead of raising EOFError.
    #
    # NOTE(review): readline chomps every physical line, so the line break
    # inside a multi-line quoted field is not preserved - confirm whether
    # that is intended.
    def read_complete_line
      ln = readline
      ln += readline while ln.count(options.quote_char).odd? && !file.eof?
      ln
    end

    # Reads up to the next row separator, transcodes and strips the trailing
    # line break.
    def readline
      file.readline(line_terminator).encode(encoding, invalid: :replace).chomp
    end

    def encoding
      options.encoding
    end

    # Fills in separators that were not given explicitly. Detection only runs
    # when at least one of them is missing.
    def merge_delimiters
      options.row_sep ||= delimiters[:row_sep]
      options.col_sep ||= delimiters[:col_sep]
    end

    def line_terminator
      options.row_sep
    end

    def chunk_options
      {}.tap do |attrs|
        attrs[:size] = options.chunk.to_i if options.chunk
      end
    end

    # Renames row keys according to the :mapping option and drops every key
    # that the mapping does not mention. Without a mapping the row is
    # returned untouched.
    def replace_keys(row)
      return row unless options.mapping

      row.map do |key, value|
        [options.mapping[key], value] if options.mapping.key?(key)
      end.compact.to_h
    end
  end
end
|
data/lib/nebulous/row.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
module Nebulous
  # A parsed line of a delimited file: an Array of field values with helpers
  # for parsing, numeric conversion and pairing values with header keys.
  class Row < Array
    # Parses a header line into a hash whose keys and values are both the
    # normalised header symbols.
    #
    # Cells are stringified first, so blank or numeric-looking header cells
    # do not break the normalisation (String#parameterize and
    # String#underscore come from ActiveSupport).
    #
    # @param str [String] the raw header line
    # @param opts [#col_sep, #comment_exp, #to_h] parser options
    # @return [Hash{Symbol => Symbol}]
    def self.headers(str, opts)
      headers = parse(str, opts).map do |name|
        name.to_s.parameterize.underscore.to_sym
      end
      headers.zip(headers).to_h
    end

    # Parses one line into a Row of stripped values, with numeric-looking
    # values converted to numbers. The given string is not modified.
    #
    # Lines the CSV library rejects as malformed are split by a fallback that
    # cuts on every column separator followed by balanced double quotes.
    #
    # @param str [String] the raw line
    # @param opts [#col_sep, #comment_exp, #to_h] parser options
    # @return [Row] empty fields are nil; an empty line gives an empty Row
    def self.parse(str, opts)
      # Work on a copy so the caller's string (possibly frozen) is untouched.
      line = str.gsub(opts.comment_exp, '').chomp

      begin
        args = opts.to_h.slice(:col_sep, :row_sep, :quote_char)
        # Options must be passed as keywords on Ruby 3 and later.
        data = CSV.parse_line(line, **args)
      rescue CSV::MalformedCSVError
        # Escape the separator: '|' and '$' are regexp metacharacters.
        sep = Regexp.escape(opts.col_sep.to_s)
        exp = /(#{sep})(?=(?:[^"]|"[^"]*")*$)/
        data = line.gsub(exp, "\0").split(/\0/)
      end

      # CSV returns nil for an empty line and nil for every empty field.
      data = Array(data).map { |value| value && value.strip }
      new(data).to_numeric
    end

    # Converts values that consist entirely of a decimal number to Float and
    # values that consist entirely of an integer to Integer. Anything else
    # (other strings, nil, values that are already numbers) is kept as is.
    #
    # @return [Row] a new row; the receiver is not modified
    def to_numeric
      arr = map do |val|
        case val
        when /\A[+-]?\d+\.\d+\z/ then val.to_f
        when /\A[+-]?\d+\z/ then val.to_i
        else val
        end
      end

      self.class.new(arr)
    end

    # Pairs the values of +keys+ with the values of this row.
    #
    # @param keys [Hash, nil] header hash as built by Row.headers
    # @return [Hash, Row] a hash keyed by the header values, or the row
    #   itself when +keys+ is nil or false
    def merge(keys)
      return self unless keys
      keys.values.zip(self).to_h
    end
  end
end
|
data/nebulous.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# Make lib/ loadable so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'nebulous/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name     = 'nebulous'
  gem.version  = Nebulous::VERSION
  gem.authors  = ['Zach Graves']
  gem.email    = ['zagraves@gmail.com']
  gem.homepage = 'https://github.com/zachgraves/nebulous'
  gem.license  = 'MIT'

  # The description simply repeats the summary.
  gem.summary     = 'Read CSV files with substantially less murderous rage!'
  gem.description = gem.summary

  # Package every file tracked by git; executables and tests are picked out
  # of that list by directory.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(spec)/})
  gem.require_paths = ['lib']

  # Runtime dependencies
  gem.add_dependency 'cocaine', '~> 0.5'
  gem.add_dependency 'activesupport'

  # Development dependencies
  gem.add_development_dependency 'bundler', '~> 1.6'
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'byebug'
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require 'spec_helper'

# Exercises Nebulous::DelimiterDetector against the CSV fixtures under
# spec/support/assets. Repetitive cases are generated from small tables of
# [context description, fixture path, expected value(s)].
describe Nebulous::DelimiterDetector do
  context 'around detecting csv delimiters' do
    subject { Nebulous::DelimiterDetector }

    let(:path) { './spec/support/assets/crlf-comma-delimited.csv' }
    let(:detector) { subject.new(path) }

    # Shared by both "custom delimiters" contexts below.
    custom_path = './spec/support/assets/crlf-dolla-delimited.csv'
    custom_options = { column_delimiters: ["\n", '$', "\t"] }

    context '#initialize' do
      it 'can be initialized' do
        expect(detector).to be_instance_of subject
      end

      it 'assigns specified file path' do
        expect(detector.path).to eq path
      end
    end

    context '#detect' do
      [
        ['with CRLF and comma delimiters',
         './spec/support/assets/crlf-comma-delimited.csv', ",", "\n"],
        ['with CRLF and tab delimiters',
         './spec/support/assets/crlf-tab-delimited.tsv', "\t", "\n"],
        ['with CR and comma delimiters',
         './spec/support/assets/cr-comma-delimited.csv', ",", "\r"],
        ['with semicolon delimiters',
         './spec/support/assets/crlf-semicolon-delimited.csv', ";", "\n"],
        ['with pipe delimiters',
         './spec/support/assets/crlf-pipe-delimited.csv', "|", "\n"]
      ].each do |description, asset, col_sep, row_sep|
        context description do
          let(:path) { asset }

          it 'detects expected delimiters' do
            expect(detector.detect).to eq(
              { col_sep: col_sep, row_sep: row_sep }
            )
          end
        end
      end

      context 'with custom delimiters' do
        let(:options) { custom_options }
        let(:path) { custom_path }
        let(:detector) { subject.new(path, options) }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(
            { col_sep: "$", row_sep: "\n" }
          )
        end
      end
    end

    context '#detect_column_delimiter' do
      [
        ['with comma delimiters',
         './spec/support/assets/crlf-comma-delimited.csv', ','],
        ['with tab delimiters',
         './spec/support/assets/crlf-tab-delimited.tsv', "\t"],
        ['with semicolon delimiters',
         './spec/support/assets/crlf-semicolon-delimited.csv', ';'],
        ['with pipe delimiters',
         './spec/support/assets/crlf-pipe-delimited.csv', '|']
      ].each do |description, asset, expected|
        context description do
          let(:path) { asset }

          it 'detects expected delimiters' do
            expect(detector.detect_column_delimiter).to eq expected
          end
        end
      end

      context 'with custom delimiters' do
        let(:options) { custom_options }
        let(:path) { custom_path }
        let(:detector) { subject.new(path, options) }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq '$'
        end
      end
    end

    context '#detect_line_delimiter' do
      [
        ['with CRLF terminators',
         './spec/support/assets/crlf-comma-delimited.csv', "\n"],
        ['with CR terminators',
         './spec/support/assets/cr-comma-delimited.csv', "\r"],
        ['with CR, LF terminators',
         './spec/support/assets/cr-lf-comma-delimited.csv', "\r"]
      ].each do |description, asset, expected|
        context description do
          let(:path) { asset }

          it 'detects expected delimiters' do
            expect(detector.detect_line_delimiter).to eq expected
          end
        end
      end
    end

    # The remaining examples reach into private helpers via #send.
    context '#encoding' do
      it 'defaults to UTF-8' do
        expect(detector.send(:encoding)).to eq 'UTF-8'
      end
    end

    context '#counts' do
      it 'returns an array initialized at 0 for each column delimiter' do
        expect(detector.send(:counts)).to eq [0,0,0,0]
      end
    end

    context '#readline' do
      it 'returns first line from provided file' do
        first_line = detector.send(:readline)
        expect(first_line).to eq "First name,Last name,From,Access,Qty\n"
      end
    end
  end
end
|