data_kit 0.0.11 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data_kit.gemspec +0 -1
- data/lib/data_kit.rb +1 -4
- data/lib/data_kit/csv/converter.rb +16 -4
- data/lib/data_kit/csv/field_analyzer.rb +9 -4
- data/lib/data_kit/csv/parser.rb +11 -20
- data/lib/data_kit/csv/schema_analyzer.rb +9 -3
- data/lib/data_kit/version.rb +1 -1
- metadata +2 -17
- data/lib/data_kit/patches/rcsv.rb +0 -124
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30b22c22f9efa81ca1011684ed418acfd1fce0e1
|
4
|
+
data.tar.gz: 0875d12a8b029661945098eebe35965420a5dbb8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6520bb22fbce4620c012a1757b84582d68e577ccb2d8f730017f4afa9ac8190f9813a056582e220c012a5c0f0ec206fd4abc40af36810b23125758a2cfaa3180
|
7
|
+
data.tar.gz: 9518d32e467cce05341449ce4409ccad339e680f38bf8b65ad357d8f96a2311110d7eb3d5f2c826af56bbae5fed0baea16674837bbab8656be548926fab7a040
|
data/data_kit.gemspec
CHANGED
data/lib/data_kit.rb
CHANGED
data/lib/data_kit/csv/converter.rb
CHANGED
@@ -16,11 +16,15 @@ module DataKit
|
|
16
16
|
|
17
17
|
def execute
|
18
18
|
::CSV.open(output_path, 'wb') do |writer|
|
19
|
-
|
19
|
+
first = true
|
20
|
+
converted = []
|
20
21
|
csv.each_row do |row|
|
21
|
-
|
22
|
-
|
22
|
+
if first
|
23
|
+
first = false
|
24
|
+
writer << csv.headers
|
23
25
|
end
|
26
|
+
|
27
|
+
writer << convert_row(csv.headers, row)
|
24
28
|
end
|
25
29
|
end
|
26
30
|
end
|
@@ -39,7 +43,15 @@ module DataKit
|
|
39
43
|
|
40
44
|
private
|
41
45
|
|
42
|
-
def
|
46
|
+
def convert_row(headers, row)
|
47
|
+
converted = []
|
48
|
+
headers.each_with_index do |field_name, index|
|
49
|
+
converted << convert_value(row[index], field_types[field_name])
|
50
|
+
end
|
51
|
+
converted
|
52
|
+
end
|
53
|
+
|
54
|
+
def convert_value(value, type)
|
43
55
|
if value.nil? || type == :string || type == :empty
|
44
56
|
return value.to_s
|
45
57
|
else
|
data/lib/data_kit/csv/field_analyzer.rb
CHANGED
@@ -14,16 +14,21 @@ module DataKit
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def execute
|
17
|
+
first = true
|
18
|
+
analysis = nil
|
17
19
|
random = Random.new
|
18
20
|
|
19
|
-
field_name = csv.headers[field_pos]
|
20
|
-
analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
|
21
|
-
|
22
21
|
csv.each_row do |row|
|
22
|
+
if first
|
23
|
+
first = false
|
24
|
+
field_name = csv.headers[field_pos]
|
25
|
+
analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
|
26
|
+
end
|
27
|
+
|
23
28
|
analysis.increment_total
|
24
29
|
if random.rand <= sampling_rate
|
25
30
|
analysis.increment_sample
|
26
|
-
analysis.insert(row[
|
31
|
+
analysis.insert(row[field_pos])
|
27
32
|
end
|
28
33
|
end
|
29
34
|
|
data/lib/data_kit/csv/parser.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'csv'
|
2
2
|
|
3
3
|
module DataKit
|
4
4
|
module CSV
|
@@ -9,29 +9,25 @@ module DataKit
|
|
9
9
|
|
10
10
|
def initialize(path)
|
11
11
|
@path = path
|
12
|
-
|
13
|
-
set_handle
|
14
|
-
set_headers
|
12
|
+
set_handle
|
15
13
|
end
|
16
14
|
|
17
15
|
def each_row(&block)
|
16
|
+
first = true
|
18
17
|
handle.rewind
|
19
|
-
|
20
|
-
|
18
|
+
|
19
|
+
::CSV.parse(handle, converters: nil) do |row|
|
20
|
+
if first == true
|
21
|
+
first = false
|
22
|
+
@headers = row
|
23
|
+
else
|
24
|
+
yield row
|
25
|
+
end
|
21
26
|
end
|
22
27
|
end
|
23
28
|
|
24
29
|
private
|
25
30
|
|
26
|
-
def columns
|
27
|
-
index = -1
|
28
|
-
@columns ||= headers.inject({}) do |result, field_name|
|
29
|
-
index += 1
|
30
|
-
result[index] = { :alias => field_name }
|
31
|
-
result
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
31
|
def set_handle
|
36
32
|
if path.is_a?(IO)
|
37
33
|
@handle = path
|
@@ -44,11 +40,6 @@ module DataKit
|
|
44
40
|
{:invalid => :replace, :undef => :replace, :replace => ''}
|
45
41
|
)
|
46
42
|
end
|
47
|
-
|
48
|
-
def set_headers
|
49
|
-
handle.rewind
|
50
|
-
Rcsv.parse(handle, :header => :none) { |row| @headers = row; break }
|
51
|
-
end
|
52
43
|
end
|
53
44
|
end
|
54
45
|
end
|
data/lib/data_kit/csv/schema_analyzer.rb
CHANGED
@@ -19,15 +19,21 @@ module DataKit
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def execute
|
22
|
+
first = true
|
23
|
+
analysis = nil
|
22
24
|
random = Random.new
|
23
|
-
analysis = SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
|
24
25
|
|
25
26
|
csv.each_row do |row|
|
27
|
+
if first
|
28
|
+
first = false
|
29
|
+
analysis = SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
|
30
|
+
end
|
31
|
+
|
26
32
|
analysis.increment_total
|
27
33
|
if random.rand <= sampling_rate
|
28
34
|
analysis.increment_sample
|
29
|
-
row.
|
30
|
-
analysis.insert(
|
35
|
+
row.each_with_index do |value, index|
|
36
|
+
analysis.insert(csv.headers[index].to_s, value)
|
31
37
|
end
|
32
38
|
end
|
33
39
|
end
|
data/lib/data_kit/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_kit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mode Analytics
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: rcsv
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - '>='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - '>='
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: timeliness
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -137,7 +123,6 @@ files:
|
|
137
123
|
- lib/data_kit/csv/schema_analyzer.rb
|
138
124
|
- lib/data_kit/dataset/field.rb
|
139
125
|
- lib/data_kit/dataset/schema.rb
|
140
|
-
- lib/data_kit/patches/rcsv.rb
|
141
126
|
- lib/data_kit/version.rb
|
142
127
|
- spec/converters/boolean_spec.rb
|
143
128
|
- spec/converters/date_time_spec.rb
|
data/lib/data_kit/patches/rcsv.rb
DELETED
@@ -1,124 +0,0 @@
|
|
1
|
-
require "rcsv/rcsv"
|
2
|
-
require "rcsv/version"
|
3
|
-
|
4
|
-
require "stringio"
|
5
|
-
|
6
|
-
#
|
7
|
-
# This is a temporary monkey patch to Rcsv.parse
|
8
|
-
# to silence warnings in Ruby 2 about #lines being deprecated
|
9
|
-
#
|
10
|
-
|
11
|
-
class Rcsv
|
12
|
-
def self.parse(csv_data, options = {}, &block)
|
13
|
-
options[:header] ||= :use
|
14
|
-
raw_options = {}
|
15
|
-
|
16
|
-
raw_options[:col_sep] = options[:column_separator] && options[:column_separator][0] || ','
|
17
|
-
raw_options[:offset_rows] = options[:offset_rows] || 0
|
18
|
-
raw_options[:nostrict] = options[:nostrict]
|
19
|
-
raw_options[:parse_empty_fields_as] = options[:parse_empty_fields_as]
|
20
|
-
raw_options[:buffer_size] = options[:buffer_size] || 1024 * 1024 # 1 MiB
|
21
|
-
|
22
|
-
if csv_data.is_a?(String)
|
23
|
-
csv_data = StringIO.new(csv_data)
|
24
|
-
elsif !(csv_data.respond_to?(:lines) && csv_data.respond_to?(:read))
|
25
|
-
inspected_csv_data = csv_data.inspect
|
26
|
-
raise ParseError.new("Supplied CSV object #{inspected_csv_data[0..127]}#{inspected_csv_data.size > 128 ? '...' : ''} is neither String nor looks like IO object.")
|
27
|
-
end
|
28
|
-
|
29
|
-
if csv_data.respond_to?(:external_encoding)
|
30
|
-
raw_options[:output_encoding] = csv_data.external_encoding.to_s
|
31
|
-
end
|
32
|
-
|
33
|
-
initial_position = csv_data.pos
|
34
|
-
|
35
|
-
first_line = csv_data.each_line.first
|
36
|
-
field_count = first_line.split(raw_options[:col_sep]).length
|
37
|
-
|
38
|
-
case options[:header]
|
39
|
-
when :use
|
40
|
-
header = self.raw_parse(StringIO.new(first_line), raw_options).first
|
41
|
-
raw_options[:offset_rows] += 1
|
42
|
-
when :skip
|
43
|
-
header = (0..field_count).to_a
|
44
|
-
raw_options[:offset_rows] += 1
|
45
|
-
when :none
|
46
|
-
header = (0..field_count).to_a
|
47
|
-
end
|
48
|
-
|
49
|
-
raw_options[:row_as_hash] = options[:row_as_hash] # Setting after header parsing
|
50
|
-
|
51
|
-
if options[:columns]
|
52
|
-
only_rows = []
|
53
|
-
except_rows = []
|
54
|
-
row_defaults = []
|
55
|
-
column_names = []
|
56
|
-
row_conversions = ''
|
57
|
-
|
58
|
-
header.each do |column_header|
|
59
|
-
column_options = options[:columns][column_header]
|
60
|
-
if column_options
|
61
|
-
if (options[:row_as_hash])
|
62
|
-
column_names << (column_options[:alias] || column_header)
|
63
|
-
end
|
64
|
-
|
65
|
-
row_defaults << column_options[:default] || nil
|
66
|
-
|
67
|
-
only_rows << case column_options[:match]
|
68
|
-
when Array
|
69
|
-
column_options[:match]
|
70
|
-
when nil
|
71
|
-
nil
|
72
|
-
else
|
73
|
-
[column_options[:match]]
|
74
|
-
end
|
75
|
-
|
76
|
-
except_rows << case column_options[:not_match]
|
77
|
-
when Array
|
78
|
-
column_options[:not_match]
|
79
|
-
when nil
|
80
|
-
nil
|
81
|
-
else
|
82
|
-
[column_options[:not_match]]
|
83
|
-
end
|
84
|
-
|
85
|
-
row_conversions << case column_options[:type]
|
86
|
-
when :int
|
87
|
-
'i'
|
88
|
-
when :float
|
89
|
-
'f'
|
90
|
-
when :string
|
91
|
-
's'
|
92
|
-
when :bool
|
93
|
-
'b'
|
94
|
-
when nil
|
95
|
-
's' # strings by default
|
96
|
-
else
|
97
|
-
fail "Unknown column type #{column_options[:type].inspect}."
|
98
|
-
end
|
99
|
-
elsif options[:only_listed_columns]
|
100
|
-
column_names << nil
|
101
|
-
row_defaults << nil
|
102
|
-
only_rows << nil
|
103
|
-
except_rows << nil
|
104
|
-
row_conversions << ' '
|
105
|
-
else
|
106
|
-
column_names << column_header
|
107
|
-
row_defaults << nil
|
108
|
-
only_rows << nil
|
109
|
-
except_rows << nil
|
110
|
-
row_conversions << 's'
|
111
|
-
end
|
112
|
-
end
|
113
|
-
|
114
|
-
raw_options[:column_names] = column_names if options[:row_as_hash]
|
115
|
-
raw_options[:only_rows] = only_rows unless only_rows.compact.empty?
|
116
|
-
raw_options[:except_rows] = except_rows unless except_rows.compact.empty?
|
117
|
-
raw_options[:row_defaults] = row_defaults unless row_defaults.compact.empty?
|
118
|
-
raw_options[:row_conversions] = row_conversions
|
119
|
-
end
|
120
|
-
|
121
|
-
csv_data.pos = initial_position
|
122
|
-
return self.raw_parse(csv_data, raw_options, &block)
|
123
|
-
end
|
124
|
-
end
|