nebulous 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
module Nebulous
  # An Array of rows that remembers the options it was built with, so it can
  # report when it has reached its configured batch size.
  class Chunk < Array
    # @return [Hash] the trailing options hash given to the constructor
    #   (empty when none was given); +:size+ is the only key read here.
    attr_reader :options

    # @param args [Array] arguments for Array#initialize, optionally followed
    #   by an options hash
    def initialize(*args)
      # extract_options! (an ActiveSupport core extension) pops the trailing
      # hash off +args+, so only the remaining values reach Array#initialize.
      @options = args.extract_options!
      super(*args)
    end

    # @return [Boolean] true when a +:size+ option was given and the chunk
    #   currently holds exactly that many elements
    def full?
      options.key?(:size) && options[:size] == size
    end
  end
end
@@ -0,0 +1,68 @@
module Nebulous
  # Guesses the column and line separators of a delimited text file.
  #
  # The column separator is chosen by counting candidates in the first line of
  # the file; the line separator is chosen by matching the output of the
  # +file+ command against LINE_DELIMITERS.
  class DelimiterDetector
    # [pattern, separator] pairs; each pattern is matched against the output
    # of the +file+ command and the first matching pair supplies the separator.
    LINE_DELIMITERS = [
      [/CRLF/, "\n"],
      [/CR, LF/, "\r"],
      [/CR(?!,)/, "\r"]
    ]

    # Candidate column separators, tried in this order.
    COLUMN_DELIMITERS = [',', ';', "\t", '|']

    # @return [String] path of the file being inspected
    attr_reader :path

    # @param path [String] path to an existing file
    # @param args [Array] optional trailing options hash; the keys read by
    #   this class are :line_delimiters, :column_delimiters and :encoding
    # @raise [ArgumentError] when +path+ does not exist
    def initialize(path, *args)
      @path = path
      @options = args.extract_options!

      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
      raise ArgumentError, "file not found: #{@path}" unless File.exist?(@path)
    end

    # @return [Hash] +{ col_sep: ..., row_sep: ... }+ with the detected values
    def detect
      {
        col_sep: detect_column_delimiter,
        row_sep: detect_line_delimiter
      }
    end

    # Counts how often each candidate splits the first line of the file and
    # returns the most frequent one.
    #
    # @return [String] the winning column delimiter
    def detect_column_delimiter
      first_line = readline

      column_delimiters.each_with_index do |delimiter, index|
        counts[index] = first_line.split(delimiter).length - 1
      end

      # max compares [count, index] pairs, so equal counts resolve to the
      # candidate listed last.
      best_index = counts.each_with_index.max[1]
      column_delimiters[best_index]
    end

    # Runs the +file+ command on the path and maps its description to a
    # separator; falls back to the first configured separator when no pattern
    # matches.
    #
    # @return [String] the detected line delimiter
    def detect_line_delimiter
      description = Cocaine::CommandLine.new('file', ':path').run(path: path).chomp

      matches = line_delimiters.map do |pattern, separator|
        separator if description =~ pattern
      end.compact

      matches.first || line_delimiters[0][1]
    end

    private

    # Line-delimiter table, overridable through the :line_delimiters option.
    def line_delimiters
      @options.fetch(:line_delimiters, LINE_DELIMITERS)
    end

    # Column-delimiter candidates, overridable through :column_delimiters.
    def column_delimiters
      @options.fetch(:column_delimiters, COLUMN_DELIMITERS)
    end

    # Target encoding name for the sampled line; defaults to UTF-8.
    def encoding
      @options.fetch(:encoding, Encoding::UTF_8.to_s)
    end

    # Per-candidate occurrence counters, one zero per column delimiter.
    def counts
      @counts ||= column_delimiters.map { 0 }
    end

    # First line of the file (terminator included), transcoded to +encoding+.
    def readline
      File.open(path, &:readline).encode(encoding, invalid: :replace)
    end
  end
end
@@ -0,0 +1,129 @@
module Nebulous
  # Reads a delimited text file line by line, converts each line into a row
  # (keyed by header names when headers are enabled) and either collects the
  # rows or hands them to a block in fixed-size batches.
  class Parser
    # Defaults merged underneath the caller's options. +col_sep+ and
    # +row_sep+ left as nil are filled in from DelimiterDetector.
    # NOTE(review): :remove_empty_values is not read anywhere in this class.
    DEFAULT_OPTIONS = {
      col_sep: nil,
      row_sep: nil,
      quote_char: '"',
      comment_exp: /^#/,
      chunk: false,
      headers: true,
      mapping: nil,
      limit: false,
      remove_empty_values: true,
      encoding: Encoding::UTF_8.to_s
    }

    # @return [#readline] the opened input
    attr_reader :file
    # @return [OpenStruct] effective options (defaults plus overrides)
    attr_reader :options

    # @param file [String, #readline] a path to open, or an object that
    #   already responds to +readline+ (used as-is)
    # @param args [Array] optional trailing options hash overriding
    #   DEFAULT_OPTIONS
    def initialize(file, *args)
      overrides = args.extract_options!

      @options = OpenStruct.new(DEFAULT_OPTIONS.merge(overrides))
      @file = read_input(file)

      merge_delimiters
    end

    # Parses the input from its current position.
    #
    # When the :chunk option is set and a block is given, each completed
    # batch is yielded to the block. Parser state is always reset and the
    # input rewound afterwards, even when parsing raises.
    #
    # @return [Array] the rows still held in the current chunk
    def process(&block)
      @index = 0
      read_headers
      iterate(&block)
    ensure
      reset
      file.rewind
    end

    # @return [Hash] detected +:col_sep+ / +:row_sep+, memoized
    def delimiters
      @delimiters ||= DelimiterDetector.new(file.path).detect
    end

    private

    # Clears per-run state so the parser can be processed again.
    def reset
      @index = 0
      @headers = nil
      @chunk = nil
    end

    # Current batch, created lazily with the configured size.
    def chunk
      @chunk ||= Chunk.new chunk_options
    end

    # Consumes the first line as headers when the :headers option is truthy.
    def read_headers
      @headers ||= Row.headers(readline, options) if options[:headers]
    end

    # Reads rows until end of input or until the row limit is reached.
    def iterate(&block)
      until file.eof?
        break if limit?
        chunk << replace_keys(parse_row.merge(@headers))
        yield_chunk(chunk, &block) if block_given? && options.chunk
      end

      # @chunk may be nil (nothing read, or last batch already yielded);
      # nil.to_a is an empty array.
      @chunk.to_a
    end

    # Advances the count of rows read so far.
    def sequence
      @index += 1
    end

    # True once the number of rows read equals the :limit option.
    def limit?
      options.limit && options.limit == @index
    end

    # Reads and parses the next logical row.
    def parse_row
      sequence
      Row.parse(read_complete_line, options)
    end

    # Yields the batch once it is full or the input is exhausted, then starts
    # a fresh batch.
    def yield_chunk(chunk, &_block)
      return unless chunk.full? || file.eof?

      yield chunk.map(&:to_a)
      @chunk = nil
    end

    # Uses +input+ directly when it can be read line by line, otherwise opens
    # it as a path in the configured encoding.
    def read_input(input)
      input.respond_to?(:readline) ? input : File.open(input, "r:#{encoding}")
    end

    # Reads one logical line, appending following physical lines while the
    # number of quote characters is odd (a quoted field spans lines).
    #
    # Stops at end of input: an unbalanced quote on the last line used to run
    # past the end and raise EOFError; the partial line is now returned so
    # Row.parse can deal with it.
    def read_complete_line
      ln = readline
      while ln.count(options.quote_char).odd? && !file.eof?
        ln += readline
      end
      ln
    end

    # Next physical line, transcoded to the configured encoding, without its
    # line terminator.
    def readline
      file.readline(line_terminator).encode(encoding, invalid: :replace).chomp
    end

    def encoding
      options.encoding
    end

    # Fills in separators the caller did not specify; detection only runs
    # when at least one of them is missing.
    def merge_delimiters
      options.row_sep ||= delimiters[:row_sep]
      options.col_sep ||= delimiters[:col_sep]
    end

    def line_terminator
      options.row_sep
    end

    # Options for a new Chunk: +:size+ is present only when chunking is on.
    def chunk_options
      Hash.new.tap do |attrs|
        attrs[:size] = options.chunk.to_i if options.chunk
      end
    end

    # Renames row keys according to the :mapping option; keys without a
    # mapping entry are dropped. Rows pass through untouched when no mapping
    # is configured.
    def replace_keys(row)
      return row unless options.mapping

      row.map do |key, value|
        [options.mapping[key], value] if options.mapping.has_key?(key)
      end.compact.to_h
    end
  end
end
@@ -0,0 +1,47 @@
module Nebulous
  # One parsed line of delimited text, held as an Array of cell values.
  class Row < Array
    # Parses a header line into a hash whose keys and values are both the
    # normalized header symbols.
    #
    # @param str [String] raw header line
    # @param opts [OpenStruct] parser options (see .parse)
    # @return [Hash{Symbol => Symbol}]
    def self.headers(str, opts)
      # parse runs to_numeric, so a header such as "2014" arrives as an
      # Integer (and an empty cell as nil); to_s keeps those from crashing.
      names = parse(str, opts).map do |cell|
        cell.to_s.parameterize.underscore.to_sym
      end
      names.zip(names).to_h
    end

    # Splits a line into cells, strips surrounding whitespace and converts
    # numeric-looking cells to numbers.
    #
    # @param str [String] raw line; it is not modified
    # @param opts [OpenStruct] must respond to +comment_exp+, +col_sep+ and
    #   +to_h+ (for :col_sep, :row_sep and :quote_char)
    # @return [Row]
    def self.parse(str, opts)
      # Work on a copy so the caller's string is left alone.
      line = str.gsub(opts.comment_exp, '').chomp

      begin
        csv_options = opts.to_h.slice(:col_sep, :row_sep, :quote_char)
        # Options must be passed as keywords: a positional hash is rejected
        # by CSV.parse_line on Ruby 3.
        data = CSV.parse_line(line, **csv_options)
      rescue CSV::MalformedCSVError
        # Lenient fallback: split on separators that are followed by balanced
        # quotes only. The separator is escaped so that "|" and other regex
        # metacharacters are matched literally.
        separator = Regexp.escape(opts.col_sep.to_s)
        splitter = /(?:#{separator})(?=(?:[^"]|"[^"]*")*$)/
        data = line.gsub(splitter, "\0").split(/\0/)
      end

      # CSV returns nil for an empty line and nil cells for empty fields.
      cells = Array(data).map { |cell| cell.respond_to?(:strip) ? cell.strip : cell }
      new(cells).to_numeric
    end

    # @return [Row] a copy in which strings that are entirely a decimal
    #   number become Floats and strings that are entirely an integer become
    #   Integers; every other value is kept as-is
    def to_numeric
      converted = map do |val|
        case val
        when /\A[+-]?\d+\.\d+\z/
          val.to_f
        when /\A[+-]?\d+\z/
          val.to_i
        else
          val
        end
      end

      self.class.new(converted)
    end

    # Pairs this row's cells with the given header values.
    #
    # @param keys [Hash, nil] header hash as produced by .headers
    # @return [Hash, Row] a hash keyed by +keys.values+, or self when +keys+
    #   is nil
    def merge(keys)
      return self unless keys
      keys.values.zip(self).to_h
    end
  end
end
@@ -0,0 +1,3 @@
# Top-level namespace of the nebulous gem.
module Nebulous
  # Version string of this release; frozen so it cannot be altered at runtime.
  VERSION = '0.0.2'.freeze
end
@@ -0,0 +1,27 @@
# Put lib/ on the load path so the version file can be required straight
# from the source tree.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'nebulous/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name          = 'nebulous'
  gem.version       = Nebulous::VERSION
  gem.authors       = ['Zach Graves']
  gem.email         = ['zagraves@gmail.com']
  gem.summary       = 'Read CSV files with substantially less murderous rage!'
  gem.description   = gem.summary
  gem.homepage      = 'https://github.com/zachgraves/nebulous'
  gem.license       = 'MIT'

  # Contents: everything tracked by git; executables and tests are derived
  # from that list by directory prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(spec)/})
  gem.require_paths = ['lib']

  # Runtime dependencies
  gem.add_dependency 'cocaine', '~> 0.5'
  gem.add_dependency 'activesupport'

  # Development-only dependencies
  gem.add_development_dependency 'bundler', '~> 1.6'
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'byebug'
end
@@ -0,0 +1,6 @@
require 'spec_helper'

# Placeholder spec for Nebulous::Chunk: the context below declares no
# examples yet.
describe Nebulous::Chunk do
  context 'around batches of csv data' do
  end
end
@@ -0,0 +1,161 @@
require 'spec_helper'

# Exercises Nebulous::DelimiterDetector against the fixture files under
# spec/support/assets. `subject` is the class itself; `detector` is an
# instance built for whichever `path` the enclosing context defines.
describe Nebulous::DelimiterDetector do
  context 'around detecting csv delimiters' do
    subject { Nebulous::DelimiterDetector }

    let(:path) { './spec/support/assets/crlf-comma-delimited.csv' }
    let(:detector) { subject.new(path) }

    context '#initialize' do
      it 'can be initialized' do
        expect(detector).to be_instance_of subject
      end

      it 'assigns specified file path' do
        expect(detector.path).to eq path
      end
    end

    # Combined detection of both separators.
    context '#detect' do
      context 'with CRLF and comma delimiters' do
        it 'detects expected delimiters' do
          expected = { col_sep: ",", row_sep: "\n" }
          expect(detector.detect).to eq(expected)
        end
      end

      context 'with CRLF and tab delimiters' do
        let(:path) { './spec/support/assets/crlf-tab-delimited.tsv' }

        it 'detects expected delimiters' do
          expected = { col_sep: "\t", row_sep: "\n" }
          expect(detector.detect).to eq(expected)
        end
      end

      context 'with CR and comma delimiters' do
        let(:path) { './spec/support/assets/cr-comma-delimited.csv' }

        it 'detects expected delimiters' do
          expected = { col_sep: ",", row_sep: "\r" }
          expect(detector.detect).to eq(expected)
        end
      end

      context 'with semicolon delimiters' do
        let(:path) { './spec/support/assets/crlf-semicolon-delimited.csv' }

        it 'detects expected delimiters' do
          expected = { col_sep: ";", row_sep: "\n" }
          expect(detector.detect).to eq(expected)
        end
      end

      context 'with pipe delimiters' do
        let(:path) { './spec/support/assets/crlf-pipe-delimited.csv' }

        it 'detects expected delimiters' do
          expected = { col_sep: "|", row_sep: "\n" }
          expect(detector.detect).to eq(expected)
        end
      end

      context 'with custom delimiters' do
        let(:path) { './spec/support/assets/crlf-dolla-delimited.csv' }
        let(:options) do
          { column_delimiters: ["\n", '$', "\t"] }
        end
        let(:detector) { subject.new(path, options) }

        it 'detects expected delimiters' do
          expected = { col_sep: "$", row_sep: "\n" }
          expect(detector.detect).to eq(expected)
        end
      end
    end

    # Column separator only.
    context '#detect_column_delimiter' do
      context 'with comma delimiters' do
        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq ','
        end
      end

      context 'with tab delimiters' do
        let(:path) { './spec/support/assets/crlf-tab-delimited.tsv' }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq "\t"
        end
      end

      context 'with semicolon delimiters' do
        let(:path) { './spec/support/assets/crlf-semicolon-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq ';'
        end
      end

      context 'with pipe delimiters' do
        let(:path) { './spec/support/assets/crlf-pipe-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq '|'
        end
      end

      context 'with custom delimiters' do
        let(:path) { './spec/support/assets/crlf-dolla-delimited.csv' }
        let(:options) do
          { column_delimiters: ["\n", '$', "\t"] }
        end
        let(:detector) { subject.new(path, options) }

        it 'detects expected delimiters' do
          expect(detector.detect_column_delimiter).to eq '$'
        end
      end
    end

    # Line separator only.
    context '#detect_line_delimiter' do
      context 'with CRLF terminators' do
        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq "\n"
        end
      end

      context 'with CR terminators' do
        let(:path) { './spec/support/assets/cr-comma-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq "\r"
        end
      end

      context 'with CR, LF terminators' do
        let(:path) { './spec/support/assets/cr-lf-comma-delimited.csv' }

        it 'detects expected delimiters' do
          expect(detector.detect_line_delimiter).to eq "\r"
        end
      end
    end

    # Private helpers, reached through send.
    context '#encoding' do
      it 'defaults to UTF-8' do
        expect(detector.send(:encoding)).to eq 'UTF-8'
      end
    end

    context '#counts' do
      it 'returns an array initialized at 0 for each column delimiter' do
        expect(detector.send(:counts)).to eq [0,0,0,0]
      end
    end

    context '#readline' do
      it 'returns first line from provided file' do
        first_line = detector.send(:readline)
        expect(first_line).to eq "First name,Last name,From,Access,Qty\n"
      end
    end
  end
end