nebulous 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
module Nebulous
  # An Array of rows that optionally knows how many rows make it "full".
  #
  # A trailing options Hash passed to the constructor is removed from the
  # argument list and kept in #options; the remaining arguments are handed
  # to Array#initialize unchanged.
  class Chunk < Array
    # @return [Hash] options given at construction (e.g. +:size+)
    attr_reader :options

    # @param args [Array] Array#initialize arguments, optionally followed
    #   by an options Hash
    def initialize(*args)
      @options = args.extract_options!
      # Bare +super+ forwards the current +args+, i.e. without the options.
      super
    end

    # Tells whether the chunk has reached its configured size.
    #
    # @return [Boolean] true once the chunk holds at least +options[:size]+
    #   elements; false when no numeric +:size+ was configured
    def full?
      limit = options[:size]
      # >= rather than ==: a chunk that was pushed past its limit must
      # still report itself as full.
      limit.is_a?(Numeric) && size >= limit
    end
  end
end
@@ -0,0 +1,68 @@
module Nebulous
  # Guesses the column and line delimiters of a delimited text file.
  #
  # The column delimiter is chosen by counting candidates on the first line
  # of the file; the line delimiter is chosen by matching the output of the
  # +file+ command against a list of patterns.
  class DelimiterDetector
    # Pairs of [pattern matched against the +file+ output, row separator].
    LINE_DELIMITERS = [
      [/CRLF/, "\n"],
      [/CR, LF/, "\r"],
      [/CR(?!,)/, "\r"]
    ].freeze

    # Column separator candidates, in lookup order.
    COLUMN_DELIMITERS = [',', ';', "\t", '|'].freeze

    # @return [String] path of the file being inspected
    attr_reader :path

    # @param path [String] path to an existing file
    # @param args [Array] optional trailing Hash with +:line_delimiters+,
    #   +:column_delimiters+ and/or +:encoding+ overrides
    # @raise [ArgumentError] when +path+ does not exist
    def initialize(path, *args)
      @path = path
      @options = args.extract_options!

      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
      raise ArgumentError, "no such file: #{@path}" unless File.exist?(@path)
    end

    # @return [Hash] detected separators under +:col_sep+ and +:row_sep+
    def detect
      { col_sep: detect_column_delimiter,
        row_sep: detect_line_delimiter }
    end

    # Picks the candidate that splits the first line into the most pieces.
    #
    # @return [String] the winning column delimiter
    def detect_column_delimiter
      line = readline

      column_delimiters.each_with_index do |delimiter, index|
        counts[index] = line.split(delimiter).length - 1
      end

      # [count, index] pairs compare element-wise, so on equal counts the
      # candidate listed later wins.
      winner = counts.each_with_index.max[1]
      column_delimiters[winner]
    end

    # Runs +file+ on the path and returns the separator of the first
    # pattern that matches its output, or the first configured separator
    # when nothing matches.
    #
    # @return [String] the row separator
    def detect_line_delimiter
      description = Cocaine::CommandLine.new('file', ':path').run(path: path).chomp

      matches = line_delimiters.map do |pattern, separator|
        separator if description =~ pattern
      end.compact

      matches.first || line_delimiters[0][1]
    end

    private

    def line_delimiters
      @options.fetch(:line_delimiters, LINE_DELIMITERS)
    end

    def column_delimiters
      @options.fetch(:column_delimiters, COLUMN_DELIMITERS)
    end

    def encoding
      @options.fetch(:encoding, Encoding::UTF_8.to_s)
    end

    # One counter per column delimiter candidate, all starting at zero.
    def counts
      @counts ||= column_delimiters.map { 0 }
    end

    # First line of the file, transcoded to the configured encoding.
    def readline
      File.open(path, &:readline).encode(encoding, invalid: :replace)
    end
  end
end
@@ -0,0 +1,129 @@
module Nebulous
  # Reads a delimited text file line by line and turns every line into a
  # Row, optionally keyed by the header line and delivered in chunks.
  class Parser
    # Defaults merged underneath the options given to #initialize.
    # +col_sep+ / +row_sep+ left at nil are filled in by DelimiterDetector.
    # +remove_empty_values+ is not read anywhere in this class.
    DEFAULT_OPTIONS = {
      col_sep: nil,
      row_sep: nil,
      quote_char: '"',
      comment_exp: /^#/,
      chunk: false,
      headers: true,
      mapping: nil,
      limit: false,
      remove_empty_values: true,
      encoding: Encoding::UTF_8.to_s
    }.freeze

    # @return [#readline] the input being parsed
    attr_reader :file
    # @return [OpenStruct] effective options
    attr_reader :options

    # @param file [String, #readline] a path to open, or an object that
    #   already responds to +readline+ (used as is)
    # @param args [Array] optional trailing options Hash, see DEFAULT_OPTIONS
    def initialize(file, *args)
      opts = args.extract_options!

      @options = OpenStruct.new DEFAULT_OPTIONS.merge(opts)
      @file = read_input(file)

      merge_delimiters
    end

    # Parses the input from its current position.
    #
    # With a block and the +chunk+ option set, full chunks (and the chunk
    # in progress when the input ends) are yielded to the block.
    #
    # @return [Array] the rows of the chunk that was still being collected
    #   when parsing stopped
    def process(&block)
      @index = 0
      read_headers
      iterate(&block)
    ensure
      # Leave the parser reusable whatever happened above.
      reset
      file.rewind
    end

    # @return [Hash] separators detected from the file at +file.path+
    def delimiters
      @delimiters ||= DelimiterDetector.new(file.path).detect
    end

    private

    def reset
      @index = 0
      @headers = nil
      @chunk = nil
    end

    def chunk
      @chunk ||= Chunk.new chunk_options
    end

    def read_headers
      @headers ||= Row.headers(readline, options) if options[:headers]
    end

    # Collects rows until the input or the row limit is exhausted.
    # When the limit stops the loop mid-chunk, the rows collected so far
    # are returned rather than yielded.
    def iterate(&block)
      until file.eof?
        break if limit?
        chunk << replace_keys(parse_row.merge(@headers))
        yield_chunk(chunk, &block) if block_given? && options.chunk
      end

      # @chunk is nil when nothing was collected or the last chunk was
      # just yielded; nil.to_a is [].
      @chunk.to_a
    end

    def sequence
      @index += 1
    end

    def limit?
      options.limit && options.limit == @index
    end

    def parse_row
      sequence
      Row.parse(read_complete_line, options)
    end

    def yield_chunk(chunk, &_block)
      return unless chunk.full? || file.eof?

      yield chunk.map(&:to_a)
      @chunk = nil
    end

    def read_input(input)
      input.respond_to?(:readline) ? input : File.open(input, "r:#{encoding}")
    end

    # Reads one logical record. A physical line with an odd number of quote
    # characters ends inside a quoted field, so following lines are appended
    # until the quotes balance.
    def read_complete_line
      ln = readline
      # Stop at end of input instead of raising EOFError on a quote that is
      # never closed.
      while ln.count(options.quote_char).odd? && !file.eof?
        # #readline strips the terminator; put the row separator back so the
        # line break inside the quoted field is not lost.
        ln = "#{ln}#{line_terminator}#{readline}"
      end
      ln
    end

    def readline
      file.readline(line_terminator).encode(encoding, invalid: :replace).chomp
    end

    def encoding
      options.encoding
    end

    # Fills in separators that were not given explicitly.
    def merge_delimiters
      options.row_sep ||= delimiters[:row_sep]
      options.col_sep ||= delimiters[:col_sep]
    end

    def line_terminator
      options.row_sep
    end

    def chunk_options
      options.chunk ? { size: options.chunk.to_i } : {}
    end

    # Renames keys according to +options.mapping+, dropping unmapped keys.
    def replace_keys(row)
      return row unless options.mapping

      row.map do |key, value|
        [options.mapping[key], value] if options.mapping.key?(key)
      end.compact.to_h
    end
  end
end
@@ -0,0 +1,47 @@
module Nebulous
  # One parsed line of delimited input, held as an Array of field values.
  class Row < Array
    # Whole-string patterns (\A/\z, not ^/$) so a multi-line value is never
    # mistaken for a number because one of its lines looks numeric.
    FLOAT_EXP = /\A[+-]?\d+\.\d+\z/
    INTEGER_EXP = /\A[+-]?\d+\z/

    # Parses a header line into a Hash that maps every normalised header
    # symbol to itself.
    #
    # @param str [String] raw header line
    # @param opts [#comment_exp, #col_sep, #to_h] parser options
    # @return [Hash{Symbol=>Symbol}]
    def self.headers(str, opts)
      names = parse(str, opts).map do |name|
        # to_s first: a header such as "2014" comes back from parse as an
        # Integer, which has no parameterize.
        name.to_s.parameterize.underscore.to_sym
      end
      names.zip(names).to_h
    end

    # Splits one line into fields and converts numeric-looking fields.
    # The given string is not modified.
    #
    # @param str [String] raw line
    # @param opts [#comment_exp, #col_sep, #to_h] parser options; +to_h+
    #   supplies +:col_sep+, +:row_sep+ and +:quote_char+ for CSV
    # @return [Row] stripped fields; empty fields become ''
    def self.parse(str, opts)
      line = str.gsub(opts.comment_exp, '').chomp

      begin
        args = opts.to_h.slice(:col_sep, :row_sep, :quote_char)
        # CSV takes its options as keywords; an empty line parses to nil.
        data = CSV.parse_line(line, **args) || []
      rescue CSV::MalformedCSVError
        # Fallback: split on separators followed only by balanced
        # double-quoted runs. The separator is escaped so "|" or "$" are
        # taken literally.
        separator = Regexp.escape(opts.col_sep.to_s)
        data = line.split(/#{separator}(?=(?:[^"]|"[^"]*")*\z)/)
      end

      # CSV returns nil for empty fields; to_s keeps strip from failing.
      new(data.map { |value| value.to_s.strip }).to_numeric
    end

    # @return [Row] copy with decimal strings converted to Float and
    #   integer strings to Integer; other values untouched
    def to_numeric
      converted = map do |val|
        case val
        when FLOAT_EXP then val.to_f
        when INTEGER_EXP then val.to_i
        else val
        end
      end

      self.class.new(converted)
    end

    # Pairs the values of +keys+ with this row's fields.
    #
    # @param keys [Hash, nil] header Hash as built by .headers
    # @return [Hash, Row] a Hash keyed by +keys.values+, or self when
    #   +keys+ is nil
    def merge(keys)
      return self unless keys

      keys.values.zip(self).to_h
    end
  end
end
@@ -0,0 +1,3 @@
module Nebulous
  # Release number of the gem; the gemspec reads it as Nebulous::VERSION.
  VERSION = '0.0.2'.freeze
end
@@ -0,0 +1,27 @@
# Put ./lib on the load path so the version file can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'nebulous/version'

Gem::Specification.new do |s|
  # Identity
  s.name        = 'nebulous'
  s.version     = Nebulous::VERSION
  s.authors     = ['Zach Graves']
  s.email       = ['zagraves@gmail.com']
  s.summary     = 'Read CSV files with substantially less murderous rage!'
  s.description = s.summary
  s.homepage    = 'https://github.com/zachgraves/nebulous'
  s.license     = 'MIT'

  # Contents: everything tracked by git, split on the NUL separators
  # that `git ls-files -z` emits.
  s.files         = `git ls-files -z`.split("\x0")
  s.executables   = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
  s.test_files    = s.files.grep(%r{^(spec)/})
  s.require_paths = ['lib']

  # Runtime dependencies
  s.add_dependency 'cocaine', '~> 0.5'
  s.add_dependency 'activesupport'

  # Development dependencies
  s.add_development_dependency 'bundler', '~> 1.6'
  s.add_development_dependency 'rake'
  s.add_development_dependency 'rspec'
  s.add_development_dependency 'byebug'
end
@@ -0,0 +1,6 @@
require 'spec_helper'

describe Nebulous::Chunk do
  context 'around batches of csv data' do
    subject { Nebulous::Chunk }

    context '#initialize' do
      it 'keeps the trailing options hash out of the contents' do
        chunk = subject.new(size: 2)
        expect(chunk.options).to eq(size: 2)
        expect(chunk.to_a).to eq []
      end

      it 'defaults to empty options' do
        expect(subject.new.options).to eq({})
      end
    end

    context '#full?' do
      it 'is false when no size was given' do
        chunk = subject.new
        chunk << 1
        expect(chunk.full?).to eq false
      end

      it 'is false below the configured size' do
        chunk = subject.new(size: 2)
        chunk << 1
        expect(chunk.full?).to eq false
      end

      it 'is true once the configured size is reached' do
        chunk = subject.new(size: 2)
        chunk << 1 << 2
        expect(chunk.full?).to eq true
      end
    end
  end
end
@@ -0,0 +1,161 @@
require 'spec_helper'

describe Nebulous::DelimiterDetector do
  context 'around detecting csv delimiters' do
    subject { Nebulous::DelimiterDetector }
    let(:path) { './spec/support/assets/crlf-comma-delimited.csv' }
    let(:detector) { subject.new(path) }

    context '#initialize' do
      it 'can be initialized' do
        expect(detector).to be_instance_of subject
      end

      it 'assigns specified file path' do
        expect(detector.path).to eq path
      end
    end

    context '#detect' do
      # [context description, fixture, expected col_sep, expected row_sep]
      [
        ['with CRLF and comma delimiters',
         './spec/support/assets/crlf-comma-delimited.csv', ',', "\n"],
        ['with CRLF and tab delimiters',
         './spec/support/assets/crlf-tab-delimited.tsv', "\t", "\n"],
        ['with CR and comma delimiters',
         './spec/support/assets/cr-comma-delimited.csv', ',', "\r"],
        ['with semicolon delimiters',
         './spec/support/assets/crlf-semicolon-delimited.csv', ';', "\n"],
        ['with pipe delimiters',
         './spec/support/assets/crlf-pipe-delimited.csv', '|', "\n"]
      ].each do |description, fixture, col_sep, row_sep|
        context description do
          let(:path) { fixture }

          it 'detects expected delimiters' do
            expect(detector.detect).to eq(col_sep: col_sep, row_sep: row_sep)
          end
        end
      end

      context 'with custom delimiters' do
        let(:detector) { subject.new(path, options) }
        let(:path) { './spec/support/assets/crlf-dolla-delimited.csv' }
        let(:options) { { column_delimiters: ["\n", '$', "\t"] } }

        it 'detects expected delimiters' do
          expect(detector.detect).to eq(col_sep: '$', row_sep: "\n")
        end
      end
    end

    context '#detect_column_delimiter' do
      # [context description, fixture, expected column delimiter]
      [
        ['with comma delimiters',
         './spec/support/assets/crlf-comma-delimited.csv', ','],
        ['with tab delimiters',
         './spec/support/assets/crlf-tab-delimited.tsv', "\t"],
        ['with semicolon delimiters',
         './spec/support/assets/crlf-semicolon-delimited.csv', ';'],
        ['with pipe delimiters',
         './spec/support/assets/crlf-pipe-delimited.csv', '|']
      ].each do |description, fixture, expected|
        context description do
          let(:path) { fixture }

          it 'detects expected delimiters' do
            expect(detector.detect_column_delimiter).to eq expected
          end
        end
      end

      context 'with custom delimiters' do
        let(:detector) { subject.new(path, options) }
        let(:path) { './spec/support/assets/crlf-dolla-delimited.csv' }
        let(:options) { { column_delimiters: ["\n", '$', "\t"] } }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq '$'
        end
      end
    end

    context '#detect_line_delimiter' do
      # [context description, fixture, expected line delimiter]
      [
        ['with CRLF terminators',
         './spec/support/assets/crlf-comma-delimited.csv', "\n"],
        ['with CR terminators',
         './spec/support/assets/cr-comma-delimited.csv', "\r"],
        ['with CR, LF terminators',
         './spec/support/assets/cr-lf-comma-delimited.csv', "\r"]
      ].each do |description, fixture, expected|
        context description do
          let(:path) { fixture }

          it 'detects expected delimiters' do
            expect(detector.detect_line_delimiter).to eq expected
          end
        end
      end
    end

    # The remaining examples reach private helpers through #send.
    context '#encoding' do
      it 'defaults to UTF-8' do
        expect(detector.send(:encoding)).to eq 'UTF-8'
      end
    end

    context '#counts' do
      it 'returns an array initialized at 0 for each column delimiter' do
        expect(detector.send(:counts)).to eq [0, 0, 0, 0]
      end
    end

    context '#readline' do
      it 'returns first line from provided file' do
        first_line = detector.send(:readline)
        expect(first_line).to eq "First name,Last name,From,Access,Qty\n"
      end
    end
  end
end