nebulous 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +34 -0
- data/.rspec +1 -0
- data/.rubocop.yml +265 -0
- data/.travis.yml +5 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/LICENSE +22 -0
- data/README.md +29 -0
- data/lib/nebulous.rb +15 -0
- data/lib/nebulous/chunk.rb +14 -0
- data/lib/nebulous/delimiter_detector.rb +68 -0
- data/lib/nebulous/parser.rb +129 -0
- data/lib/nebulous/row.rb +47 -0
- data/lib/nebulous/version.rb +3 -0
- data/nebulous.gemspec +27 -0
- data/spec/nebulous/chunk_spec.rb +6 -0
- data/spec/nebulous/delimiter_detector_spec.rb +161 -0
- data/spec/nebulous/parser_spec.rb +287 -0
- data/spec/nebulous/row_spec.rb +6 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/support/assets/clrf-batches.csv +1001 -0
- data/spec/support/assets/cr-comma-delimited.csv +1 -0
- data/spec/support/assets/cr-lf-comma-delimited.csv +1 -0
- data/spec/support/assets/crlf-comma-delimited.csv +21 -0
- data/spec/support/assets/crlf-dolla-delimited.csv +21 -0
- data/spec/support/assets/crlf-pipe-delimited.csv +21 -0
- data/spec/support/assets/crlf-semicolon-delimited.csv +21 -0
- data/spec/support/assets/crlf-tab-delimited.tsv +21 -0
- data/spec/support/assets/no-headers.csv +20 -0
- metadata +171 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
module Nebulous
  # Guesses the column and line delimiters of a delimited text file.
  #
  # The column delimiter is picked by counting how many times each candidate
  # splits the first line of the file. The line delimiter is derived from the
  # output of the system `file` command.
  class DelimiterDetector
    # Pairs of [pattern matched against the `file` command output, row
    # separator to use]. Pairs are tried in order and the first match wins.
    LINE_DELIMITERS = [
      [/CRLF/, "\n"],
      [/CR, LF/, "\r"],
      [/CR(?!,)/, "\r"]
    ].freeze

    # Column separators tried when no :column_delimiters option is given.
    COLUMN_DELIMITERS = [',', ';', "\t", '|'].freeze

    attr_reader :path

    # @param path [String] path of the file to inspect
    # @param args [Array] an optional trailing options Hash; the keys read by
    #   this class are :line_delimiters, :column_delimiters and :encoding
    # @raise [ArgumentError] when nothing exists at +path+
    def initialize(path, *args)
      @path = path
      @options = args.extract_options!

      # File.exist? is used because File.exists? was deprecated and has been
      # removed from current Ruby versions.
      raise ArgumentError, "file not found: #{@path}" unless File.exist?(@path)
    end

    # @return [Hash] the detected separators under :col_sep and :row_sep
    def detect
      { col_sep: detect_column_delimiter,
        row_sep: detect_line_delimiter }
    end

    # Counts the splits each candidate produces on the first line and returns
    # the candidate with the highest count.
    #
    # @return [String] the winning column delimiter
    def detect_column_delimiter
      first_line = readline

      column_delimiters.each_with_index do |delimiter, index|
        counts[index] = first_line.split(delimiter).length - 1
      end

      # Comparing [count, index] pairs means a tie on count is resolved in
      # favour of the candidate listed later.
      winner = counts.each_with_index.max[1]
      column_delimiters[winner]
    end

    # Runs the `file` command on the path (through Cocaine) and maps its
    # description onto a row separator.
    #
    # @return [String] the separator of the first matching pair, or the
    #   separator of the first configured pair when nothing matches
    def detect_line_delimiter
      res = Cocaine::CommandLine.new('file', ':path').run(path: path).chomp

      matching = line_delimiters.select { |pattern, _separator| res =~ pattern }
      separators = matching.map { |_pattern, separator| separator }.compact

      separators.first || line_delimiters[0][1]
    end

    private

    # @return [Array] the [pattern, separator] pairs in effect
    def line_delimiters
      @options.fetch(:line_delimiters, LINE_DELIMITERS)
    end

    # @return [Array<String>] the candidate column separators in effect
    def column_delimiters
      @options.fetch(:column_delimiters, COLUMN_DELIMITERS)
    end

    # @return [String] name of the encoding the first line is converted to
    def encoding
      @options.fetch(:encoding, Encoding::UTF_8.to_s)
    end

    # Per-candidate split counts, memoized and initialised to zero.
    #
    # @return [Array<Integer>]
    def counts
      @counts ||= column_delimiters.map { 0 }
    end

    # Reads the first line of the file (the block form of File.open closes the
    # handle) and converts it to the configured encoding.
    #
    # @return [String]
    def readline
      File.open(path, &:readline).encode(encoding, invalid: :replace)
    end
  end
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
module Nebulous
  # Reads a delimited text file line by line, turns every line into a row
  # (keyed by the header line when headers are enabled) and collects the rows
  # into a chunk.
  class Parser
    # Defaults merged with the options passed to #initialize.
    #
    # col_sep / row_sep - separators; detected from the file when nil
    # quote_char        - character used to decide whether a record spans
    #                     several physical lines
    # comment_exp       - forwarded to Row as part of the options
    # chunk             - chunk size, or false to disable chunked yielding
    # headers           - whether the first line holds the column names
    # mapping           - Hash used to rename (and filter) row keys
    # limit             - stop after this many rows, or false for no limit
    # remove_empty_values - not referenced in this class
    # encoding          - encoding the input is opened with / converted to
    DEFAULT_OPTIONS = {
      col_sep: nil,
      row_sep: nil,
      quote_char: '"',
      comment_exp: /^#/,
      chunk: false,
      headers: true,
      mapping: nil,
      limit: false,
      remove_empty_values: true,
      encoding: Encoding::UTF_8.to_s
    }.freeze

    attr_reader :file
    attr_reader :options

    # @param file [String, #readline] a path to open, or an object that
    #   already responds to #readline
    # @param args [Array] an optional trailing options Hash, merged over
    #   DEFAULT_OPTIONS
    def initialize(file, *args)
      opts = args.extract_options!

      @options = OpenStruct.new DEFAULT_OPTIONS.merge(opts)
      @file = read_input(file)

      merge_delimiters
    end

    # Parses the input from its current position. Parser state is reset and
    # the input rewound afterwards, even when parsing raises.
    #
    # @yield [Array] every completed chunk, only when the :chunk option is set
    # @return [Array] the rows held by the current chunk when iteration stops
    def process(&block)
      @index = 0
      read_headers
      iterate(&block)
    ensure
      reset
      file.rewind
    end

    # @return [Hash] separators detected from the file at file.path (memoized)
    def delimiters
      @delimiters ||= DelimiterDetector.new(file.path).detect
    end

    private

    # Clears the per-run state.
    def reset
      @index = 0
      @headers = nil
      @chunk = nil
    end

    # @return [Chunk] the chunk currently being filled (created on demand)
    def chunk
      @chunk ||= Chunk.new chunk_options
    end

    # Consumes the first line as headers when the :headers option is enabled.
    def read_headers
      @headers ||= Row.headers(readline, options) if options.headers
    end

    # Reads rows until end of input or until the row limit is reached.
    def iterate(&block)
      until file.eof?
        break if limit?

        chunk << replace_keys(parse_row.merge(@headers))
        yield_chunk(chunk, &block) if block_given? && options.chunk
      end

      # @chunk is nil when no row was read or the last chunk was just yielded;
      # nil.to_a then gives an empty Array.
      @chunk.to_a
    end

    # Advances the count of rows read so far.
    def sequence
      @index += 1
    end

    # @return [Boolean] true once the configured number of rows has been read
    def limit?
      options.limit && options.limit == @index
    end

    # Reads the next record and parses it into a Row.
    def parse_row
      sequence
      Row.parse(read_complete_line, options)
    end

    # Yields the chunk's rows once it is full or the input is exhausted, then
    # drops the chunk so that a fresh one is started.
    def yield_chunk(chunk, &_block)
      return unless chunk.full? || file.eof?

      yield chunk.map(&:to_a)
      @chunk = nil
    end

    # Uses +input+ directly when it can be read from, otherwise opens it as a
    # path in the configured encoding.
    #
    # NOTE(review): a File opened here is never closed by this class.
    def read_input(input)
      input.respond_to?(:readline) ? input : File.open(input, "r:#{encoding}")
    end

    # Reads one logical record. An odd number of quote characters means a
    # quoted field is still open, so following physical lines are appended
    # until the quotes balance.
    #
    # Reading stops at end of input so that an unterminated quote yields the
    # remaining text instead of raising EOFError.
    #
    # NOTE(review): #readline strips the line ending of every physical line,
    # so the line break inside a multi-line quoted field is not preserved.
    def read_complete_line
      record = readline
      record += readline while record.count(options.quote_char).odd? && !file.eof?
      record
    end

    # Reads one physical line, converts it to the configured encoding and
    # strips its line ending.
    def readline
      file.readline(line_terminator).encode(encoding, invalid: :replace).chomp
    end

    def encoding
      options.encoding
    end

    # Fills in whichever separator was not supplied explicitly; detection only
    # runs when at least one of them is missing.
    def merge_delimiters
      options.row_sep ||= delimiters[:row_sep]
      options.col_sep ||= delimiters[:col_sep]
    end

    def line_terminator
      options.row_sep
    end

    # @return [Hash] arguments for Chunk.new; :size is set only when chunking
    def chunk_options
      options.chunk ? { size: options.chunk.to_i } : {}
    end

    # Renames row keys through the :mapping option; keys without a mapping
    # entry are dropped. Rows are returned untouched when no mapping is set.
    def replace_keys(row)
      return row unless options.mapping

      row.map do |key, value|
        [options.mapping[key], value] if options.mapping.key?(key)
      end.compact.to_h
    end
  end
end
|
data/lib/nebulous/row.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
module Nebulous
  # One parsed line of a delimited file: an Array of field values with helpers
  # for numeric conversion and for pairing the values with header keys.
  class Row < Array
    # Whole-string patterns (\A..\z) so that a multi-line value whose last
    # line happens to be a number is not mistaken for a number.
    FLOAT_PATTERN = /\A[+-]?\d+\.\d+\z/
    INTEGER_PATTERN = /\A[+-]?\d+\z/

    # Parses a header line into a Hash that maps each normalised header
    # symbol to itself.
    #
    # @param str [String] the raw header line
    # @param opts [#col_sep, #quote_char, #comment_exp, #to_h] parser options
    # @return [Hash{Symbol => Symbol}]
    def self.headers(str, opts)
      # parse converts numeric-looking fields to numbers, so go back through
      # to_s before applying the String-only inflections.
      names = parse(str, opts).map do |value|
        value.to_s.parameterize.underscore.to_sym
      end

      names.zip(names).to_h
    end

    # Parses one line into a Row. The line is first handed to CSV; when CSV
    # rejects it as malformed, it is split on every column separator that is
    # followed by balanced double quotes.
    #
    # The input string is not modified.
    #
    # @param str [String] the raw line
    # @param opts [#col_sep, #quote_char, #comment_exp, #to_h] parser options
    # @return [Row] stripped values, with numeric strings converted; empty
    #   fields reported by CSV stay nil
    def self.parse(str, opts)
      line = str.gsub(opts.comment_exp, '').chomp

      begin
        args = opts.to_h.slice(:col_sep, :row_sep, :quote_char)
        # CSV takes its options as keywords; a positional Hash is rejected on
        # Ruby 3.
        data = CSV.parse_line(line, **args)
      rescue CSV::MalformedCSVError
        # Escape the separator so that characters such as "|" or "$" are
        # matched literally.
        separator = Regexp.escape(opts.col_sep)
        data = line.split(/#{separator}(?=(?:[^"]|"[^"]*")*$)/)
      end

      # CSV returns nil for an empty line and nil for each empty field.
      values = (data || []).map { |value| value && value.strip }
      new(values).to_numeric
    end

    # @return [Row] a copy in which strings that look like integers or
    #   decimals are replaced by Integer / Float values
    def to_numeric
      converted = map do |val|
        case val
        when FLOAT_PATTERN then val.to_f
        when INTEGER_PATTERN then val.to_i
        else val
        end
      end

      self.class.new(converted)
    end

    # Pairs the values of +keys+ with this row's values, position by position.
    #
    # @param keys [Hash, nil] header Hash as produced by .headers
    # @return [Hash, Row] the keyed row, or self when +keys+ is nil
    def merge(keys)
      return self unless keys

      keys.values.zip(self).to_h
    end
  end
end
|
data/nebulous.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# Gem specification for nebulous.
#
# Put lib/ on the load path so that the version constant can be required.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'nebulous/version'

Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name = 'nebulous'
  gem.version = Nebulous::VERSION
  gem.authors = ['Zach Graves']
  gem.email = ['zagraves@gmail.com']
  gem.summary = 'Read CSV files with substantially less murderous rage!'
  gem.description = gem.summary
  gem.homepage = 'https://github.com/zachgraves/nebulous'
  gem.license = 'MIT'

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by path prefix.
  gem.files = `git ls-files -z`.split("\x0")
  gem.executables = gem.files.grep(%r{^bin/}) { |file| File.basename(file) }
  gem.test_files = gem.files.grep(%r{^(spec)/})
  gem.require_paths = ['lib']

  # Runtime dependencies.
  gem.add_dependency 'cocaine', '~> 0.5'
  gem.add_dependency 'activesupport'

  # Development-only dependencies.
  gem.add_development_dependency 'bundler', '~> 1.6'
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'byebug'
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Nebulous::DelimiterDetector do
  # Relative path of a fixture file under spec/support/assets.
  def asset(name)
    "./spec/support/assets/#{name}"
  end

  context 'around detecting csv delimiters' do
    subject { described_class }

    # Default fixture: comma separated; contexts below override :path.
    let(:path) { asset('crlf-comma-delimited.csv') }
    let(:detector) { subject.new(path) }

    context '#initialize' do
      it 'can be initialized' do
        expect(detector).to be_an_instance_of(subject)
      end

      it 'assigns specified file path' do
        expect(detector.path).to eq(path)
      end
    end

    context '#detect' do
      context 'with CRLF and comma delimiters' do
        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: ',', row_sep: "\n")
        end
      end

      context 'with CRLF and tab delimiters' do
        let(:path) { asset('crlf-tab-delimited.tsv') }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: "\t", row_sep: "\n")
        end
      end

      context 'with CR and comma delimiters' do
        let(:path) { asset('cr-comma-delimited.csv') }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: ',', row_sep: "\r")
        end
      end

      context 'with semicolon delimiters' do
        let(:path) { asset('crlf-semicolon-delimited.csv') }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: ';', row_sep: "\n")
        end
      end

      context 'with pipe delimiters' do
        let(:path) { asset('crlf-pipe-delimited.csv') }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: '|', row_sep: "\n")
        end
      end

      context 'with custom delimiters' do
        # The candidate list is replaced through the options Hash.
        let(:options) { { column_delimiters: ["\n", '$', "\t"] } }
        let(:path) { asset('crlf-dolla-delimited.csv') }
        let(:detector) { subject.new(path, options) }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: '$', row_sep: "\n")
        end
      end
    end

    context '#detect_column_delimiter' do
      context 'with comma delimiters' do
        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq(',')
        end
      end

      context 'with tab delimiters' do
        let(:path) { asset('crlf-tab-delimited.tsv') }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq("\t")
        end
      end

      context 'with semicolon delimiters' do
        let(:path) { asset('crlf-semicolon-delimited.csv') }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq(';')
        end
      end

      context 'with pipe delimiters' do
        let(:path) { asset('crlf-pipe-delimited.csv') }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq('|')
        end
      end

      context 'with custom delimiters' do
        let(:options) { { column_delimiters: ["\n", '$', "\t"] } }
        let(:path) { asset('crlf-dolla-delimited.csv') }
        let(:detector) { subject.new(path, options) }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq('$')
        end
      end
    end

    context '#detect_line_delimiter' do
      context 'with CRLF terminators' do
        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq("\n")
        end
      end

      context 'with CR terminators' do
        let(:path) { asset('cr-comma-delimited.csv') }

        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq("\r")
        end
      end

      context 'with CR, LF terminators' do
        let(:path) { asset('cr-lf-comma-delimited.csv') }

        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq("\r")
        end
      end
    end

    # The remaining examples reach private helpers through #send.
    context '#encoding' do
      it 'defaults to UTF-8' do
        expect(detector.send(:encoding)).to eq('UTF-8')
      end
    end

    context '#counts' do
      it 'returns an array initialized at 0 for each column delimiter' do
        expect(detector.send(:counts)).to eq([0, 0, 0, 0])
      end
    end

    context '#readline' do
      it 'returns first line from provided file' do
        first_line = detector.send(:readline)
        expect(first_line).to eq("First name,Last name,From,Access,Qty\n")
      end
    end
  end
end
|