nebulous 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/lib/nebulous.rb +6 -2
- data/lib/nebulous/input.rb +4 -0
- data/lib/nebulous/input/delimiters.rb +16 -0
- data/lib/nebulous/input/parsing.rb +56 -0
- data/lib/nebulous/input/reader.rb +29 -0
- data/lib/nebulous/parser.rb +4 -85
- data/lib/nebulous/row.rb +6 -5
- data/lib/nebulous/version.rb +1 -1
- data/spec/nebulous/chunk_spec.rb +25 -1
- data/spec/nebulous/row_spec.rb +65 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 15734d97663b72356a0c85a85a07e5bddc191a0f
|
4
|
+
data.tar.gz: a2fdc1e48d2835bc8a6e509733cbbb5a09ed9bdd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d98c33418a5c0c497027d57491e8e3698357bd35762b9c11efe67e8d97dce7458693b4129d52516d155e498ff3023b825c6c04e224e679e5f681fa17886dfd3f
|
7
|
+
data.tar.gz: 43845b83b364c806fa803affe6b5c8981af2286a68e0ca157d321c63328b0c27ddfb4e7d4f451ba82e5bb5dff7ba617513282c1ef8aa1258f9b106c2bd885862
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
data/lib/nebulous.rb
CHANGED
@@ -3,10 +3,14 @@ require 'ostruct'
|
|
3
3
|
require 'cocaine'
|
4
4
|
require 'active_support/all'
|
5
5
|
require 'nebulous/version'
|
6
|
-
require 'nebulous/
|
6
|
+
require 'nebulous/delimiter_detector'
|
7
7
|
require 'nebulous/row'
|
8
8
|
require 'nebulous/chunk'
|
9
|
-
require 'nebulous/
|
9
|
+
require 'nebulous/input'
|
10
|
+
require 'nebulous/input/reader'
|
11
|
+
require 'nebulous/input/parsing'
|
12
|
+
require 'nebulous/input/delimiters'
|
13
|
+
require 'nebulous/parser'
|
10
14
|
|
11
15
|
module Nebulous
|
12
16
|
def self.process(file, *args, &block)
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Nebulous
|
2
|
+
module Input
|
3
|
+
module Delimiters
|
4
|
+
def delimiters
|
5
|
+
@delimiters ||= Nebulous::DelimiterDetector.new(file.path).detect
|
6
|
+
end
|
7
|
+
|
8
|
+
private
|
9
|
+
|
10
|
+
def merge_delimiters
|
11
|
+
options.row_sep ||= delimiters[:row_sep]
|
12
|
+
options.col_sep ||= delimiters[:col_sep]
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module Nebulous
|
2
|
+
module Input
|
3
|
+
module Parsing
|
4
|
+
def parse_row
|
5
|
+
sequence
|
6
|
+
Row.parse(read_complete_line, options).to_numeric.merge(@headers)
|
7
|
+
end
|
8
|
+
|
9
|
+
def read_headers
|
10
|
+
@headers ||= Row.headers(readline, options) if options[:headers]
|
11
|
+
end
|
12
|
+
|
13
|
+
def chunk
|
14
|
+
@chunk ||= Chunk.new chunk_options
|
15
|
+
end
|
16
|
+
|
17
|
+
def sequence
|
18
|
+
@index += 1
|
19
|
+
end
|
20
|
+
|
21
|
+
def limit?
|
22
|
+
options.limit && options.limit == @index
|
23
|
+
end
|
24
|
+
|
25
|
+
def yield_chunk(chunk, &_block)
|
26
|
+
if chunk.full? || file.eof?
|
27
|
+
yield chunk.map(&:to_a)
|
28
|
+
@chunk = nil
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def iterate(&block)
|
33
|
+
while !file.eof?
|
34
|
+
break if limit?
|
35
|
+
chunk << replace_keys(parse_row)
|
36
|
+
yield_chunk(chunk, &block) if block_given? && options.chunk
|
37
|
+
end
|
38
|
+
|
39
|
+
@chunk.to_a
|
40
|
+
end
|
41
|
+
|
42
|
+
def replace_keys(row)
|
43
|
+
return row unless options.mapping
|
44
|
+
row.map do |key, value|
|
45
|
+
[options.mapping[key], value] if options.mapping.has_key?(key)
|
46
|
+
end.compact.to_h
|
47
|
+
end
|
48
|
+
|
49
|
+
def chunk_options
|
50
|
+
Hash.new.tap do |attrs|
|
51
|
+
attrs[:size] = options.chunk.to_i if options.chunk
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Nebulous
|
2
|
+
module Input
|
3
|
+
module Reader
|
4
|
+
def read_input(input)
|
5
|
+
input.respond_to?(:readline) ? input : File.open(input, "r:#{encoding}")
|
6
|
+
end
|
7
|
+
|
8
|
+
def read_complete_line
|
9
|
+
ln = readline
|
10
|
+
while ln.count(options.quote_char) % 2 == 1
|
11
|
+
ln += readline
|
12
|
+
end
|
13
|
+
ln
|
14
|
+
end
|
15
|
+
|
16
|
+
def readline
|
17
|
+
file.readline(line_terminator).encode(encoding, invalid: :replace).chomp
|
18
|
+
end
|
19
|
+
|
20
|
+
def line_terminator
|
21
|
+
options.row_sep
|
22
|
+
end
|
23
|
+
|
24
|
+
def encoding
|
25
|
+
options.encoding
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/nebulous/parser.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
module Nebulous
|
2
2
|
class Parser
|
3
|
+
include Nebulous::Input::Reader
|
4
|
+
include Nebulous::Input::Parsing
|
5
|
+
include Nebulous::Input::Delimiters
|
6
|
+
|
3
7
|
DEFAULT_OPTIONS = {
|
4
8
|
col_sep: nil,
|
5
9
|
row_sep: nil,
|
@@ -9,7 +13,6 @@ module Nebulous
|
|
9
13
|
headers: true,
|
10
14
|
mapping: nil,
|
11
15
|
limit: false,
|
12
|
-
remove_empty_values: true,
|
13
16
|
encoding: Encoding::UTF_8.to_s
|
14
17
|
}
|
15
18
|
|
@@ -34,10 +37,6 @@ module Nebulous
|
|
34
37
|
file.rewind
|
35
38
|
end
|
36
39
|
|
37
|
-
def delimiters
|
38
|
-
@delimiters ||= DelimiterDetector.new(file.path).detect
|
39
|
-
end
|
40
|
-
|
41
40
|
private
|
42
41
|
|
43
42
|
def reset
|
@@ -45,85 +44,5 @@ module Nebulous
|
|
45
44
|
@headers = nil
|
46
45
|
@chunk = nil
|
47
46
|
end
|
48
|
-
|
49
|
-
def chunk
|
50
|
-
@chunk ||= Chunk.new chunk_options
|
51
|
-
end
|
52
|
-
|
53
|
-
def read_headers
|
54
|
-
@headers ||= Row.headers(readline, options) if options[:headers]
|
55
|
-
end
|
56
|
-
|
57
|
-
def iterate(&block)
|
58
|
-
while !file.eof?
|
59
|
-
break if limit?
|
60
|
-
chunk << replace_keys(parse_row.merge(@headers))
|
61
|
-
yield_chunk(chunk, &block) if block_given? && options.chunk
|
62
|
-
end
|
63
|
-
|
64
|
-
@chunk.to_a
|
65
|
-
end
|
66
|
-
|
67
|
-
def sequence
|
68
|
-
@index += 1
|
69
|
-
end
|
70
|
-
|
71
|
-
def limit?
|
72
|
-
options.limit && options.limit == @index
|
73
|
-
end
|
74
|
-
|
75
|
-
def parse_row
|
76
|
-
sequence
|
77
|
-
Row.parse(read_complete_line, options)
|
78
|
-
end
|
79
|
-
|
80
|
-
def yield_chunk(chunk, &_block)
|
81
|
-
if chunk.full? || file.eof?
|
82
|
-
yield chunk.map(&:to_a)
|
83
|
-
@chunk = nil
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
def read_input(input)
|
88
|
-
input.respond_to?(:readline) ? input : File.open(input, "r:#{encoding}")
|
89
|
-
end
|
90
|
-
|
91
|
-
def read_complete_line
|
92
|
-
ln = readline
|
93
|
-
while ln.count(options.quote_char) % 2 == 1
|
94
|
-
ln += readline
|
95
|
-
end
|
96
|
-
ln
|
97
|
-
end
|
98
|
-
|
99
|
-
def readline
|
100
|
-
file.readline(line_terminator).encode(encoding, invalid: :replace).chomp
|
101
|
-
end
|
102
|
-
|
103
|
-
def encoding
|
104
|
-
options.encoding
|
105
|
-
end
|
106
|
-
|
107
|
-
def merge_delimiters
|
108
|
-
options.row_sep ||= delimiters[:row_sep]
|
109
|
-
options.col_sep ||= delimiters[:col_sep]
|
110
|
-
end
|
111
|
-
|
112
|
-
def line_terminator
|
113
|
-
options.row_sep
|
114
|
-
end
|
115
|
-
|
116
|
-
def chunk_options
|
117
|
-
Hash.new.tap do |attrs|
|
118
|
-
attrs[:size] = options.chunk.to_i if options.chunk
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
def replace_keys(row)
|
123
|
-
return row unless options.mapping
|
124
|
-
row.map do |key, value|
|
125
|
-
[options.mapping[key], value] if options.mapping.has_key?(key)
|
126
|
-
end.compact.to_h
|
127
|
-
end
|
128
47
|
end
|
129
48
|
end
|
data/lib/nebulous/row.rb
CHANGED
@@ -9,26 +9,27 @@ module Nebulous
|
|
9
9
|
end
|
10
10
|
|
11
11
|
def self.parse(str, opts)
|
12
|
-
|
12
|
+
opts = opts.to_h
|
13
|
+
str.gsub!(opts[:comment_exp], '')
|
13
14
|
str.chomp!
|
14
15
|
|
15
16
|
begin
|
16
|
-
args = opts.
|
17
|
+
args = opts.slice(:col_sep, :row_sep, :quote_char)
|
17
18
|
data = CSV.parse_line str, args
|
18
19
|
rescue CSV::MalformedCSVError
|
19
|
-
exp = /(#{opts
|
20
|
+
exp = /(#{opts[:col_sep]})(?=(?:[^"]|"[^"]*")*$)/
|
20
21
|
data = str.gsub(exp, "\0").split(/\0/)
|
21
22
|
end
|
22
23
|
|
23
24
|
data.map!(&:strip)
|
24
|
-
new(data)
|
25
|
+
new(data)
|
25
26
|
end
|
26
27
|
|
27
28
|
def to_numeric
|
28
29
|
arr = map do |val|
|
29
30
|
case val
|
30
31
|
when /^[+-]?\d+\.\d+$/
|
31
|
-
val.
|
32
|
+
val.to_f
|
32
33
|
when /^[+-]?\d+$/
|
33
34
|
val.to_i
|
34
35
|
else
|
data/lib/nebulous/version.rb
CHANGED
data/spec/nebulous/chunk_spec.rb
CHANGED
@@ -1,6 +1,30 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Nebulous::Chunk do
|
4
|
-
context 'around
|
4
|
+
context 'around chunk of csv data' do
|
5
|
+
subject { Nebulous::Chunk }
|
6
|
+
|
7
|
+
context '#full?' do
|
8
|
+
let(:chunk) { subject.new size: size }
|
9
|
+
|
10
|
+
before do
|
11
|
+
chunk << ['row']
|
12
|
+
chunk << ['row']
|
13
|
+
end
|
14
|
+
|
15
|
+
context 'when not full' do
|
16
|
+
let(:size) { 3 }
|
17
|
+
it 'returns expected value' do
|
18
|
+
expect(chunk.full?).to be_falsy
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
context 'when full' do
|
23
|
+
let(:size) { 2 }
|
24
|
+
it 'returns expected value' do
|
25
|
+
expect(chunk.full?).to be_truthy
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
5
29
|
end
|
6
30
|
end
|
data/spec/nebulous/row_spec.rb
CHANGED
@@ -2,5 +2,70 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Nebulous::Row do
|
4
4
|
context 'around reading csv rows' do
|
5
|
+
subject { Nebulous::Row }
|
6
|
+
|
7
|
+
let(:col_sep) { ',' }
|
8
|
+
let(:row_sep) { "\n" }
|
9
|
+
let(:options) do
|
10
|
+
{ col_sep: col_sep, row_sep: row_sep, quote_char: '"', comment_exp: /^#/ }
|
11
|
+
end
|
12
|
+
|
13
|
+
context '::headers' do
|
14
|
+
it 'parses and normalizes a csv string as headers' do
|
15
|
+
headers = subject.headers("First name, last-name, guests", options)
|
16
|
+
expect(headers).to eq(
|
17
|
+
{first_name: :first_name, last_name: :last_name, guests: :guests}
|
18
|
+
)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
context '::parse' do
|
23
|
+
context 'with valid csv' do
|
24
|
+
it 'returns expected parsed result' do
|
25
|
+
row = subject.parse "raw denim, Austin,selvage,artisan", options
|
26
|
+
expect(row).to eq ["raw denim", "Austin", "selvage", "artisan"]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
context 'with valid tsv' do
|
31
|
+
let(:col_sep) { "\t" }
|
32
|
+
it 'returns expected parsed result' do
|
33
|
+
row = subject.parse "raw denim\tAustin\t selvage\tartisan", options
|
34
|
+
expect(row).to eq ["raw denim", "Austin", "selvage", "artisan"]
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
context 'with malformed csv' do
|
39
|
+
it 'returns expected parsed result' do
|
40
|
+
row = subject.parse 'raw denim, Austin "TX, US", artisan', options
|
41
|
+
expect(row).to eq ["raw denim", "Austin \"TX, US\"", "artisan"]
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
context 'with malformed tsv' do
|
46
|
+
let(:col_sep) { "\t" }
|
47
|
+
it 'returns expected parsed result' do
|
48
|
+
row = subject.parse "raw denim\t Austin \"TX, US\"\t artisan", options
|
49
|
+
expect(row).to eq ["raw denim", "Austin \"TX, US\"", "artisan"]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
context '#to_numeric' do
|
55
|
+
it 'converts numeric values to ints/floats' do
|
56
|
+
row = subject.new ["1", "two", "3", "4.5"]
|
57
|
+
expect(row.to_numeric).to eq [1, "two", 3, 4.5]
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
context '#merge' do
|
62
|
+
it 'zips a row with provided headers' do
|
63
|
+
headers = subject.headers "first name, last name", options
|
64
|
+
row = subject.new ["bob", "barker"]
|
65
|
+
expect(row.merge(headers)).to eq(
|
66
|
+
{ first_name: "bob", last_name: "barker" }
|
67
|
+
)
|
68
|
+
end
|
69
|
+
end
|
5
70
|
end
|
6
71
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nebulous
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Zach Graves
|
@@ -112,6 +112,10 @@ files:
|
|
112
112
|
- lib/nebulous.rb
|
113
113
|
- lib/nebulous/chunk.rb
|
114
114
|
- lib/nebulous/delimiter_detector.rb
|
115
|
+
- lib/nebulous/input.rb
|
116
|
+
- lib/nebulous/input/delimiters.rb
|
117
|
+
- lib/nebulous/input/parsing.rb
|
118
|
+
- lib/nebulous/input/reader.rb
|
115
119
|
- lib/nebulous/parser.rb
|
116
120
|
- lib/nebulous/row.rb
|
117
121
|
- lib/nebulous/version.rb
|