data_kit 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data_kit.gemspec +0 -1
- data/lib/data_kit.rb +1 -4
- data/lib/data_kit/csv/converter.rb +16 -4
- data/lib/data_kit/csv/field_analyzer.rb +9 -4
- data/lib/data_kit/csv/parser.rb +11 -20
- data/lib/data_kit/csv/schema_analyzer.rb +9 -3
- data/lib/data_kit/version.rb +1 -1
- metadata +2 -17
- data/lib/data_kit/patches/rcsv.rb +0 -124
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30b22c22f9efa81ca1011684ed418acfd1fce0e1
|
4
|
+
data.tar.gz: 0875d12a8b029661945098eebe35965420a5dbb8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6520bb22fbce4620c012a1757b84582d68e577ccb2d8f730017f4afa9ac8190f9813a056582e220c012a5c0f0ec206fd4abc40af36810b23125758a2cfaa3180
|
7
|
+
data.tar.gz: 9518d32e467cce05341449ce4409ccad339e680f38bf8b65ad357d8f96a2311110d7eb3d5f2c826af56bbae5fed0baea16674837bbab8656be548926fab7a040
|
data/data_kit.gemspec
CHANGED
data/lib/data_kit.rb
CHANGED
data/lib/data_kit/csv/converter.rb
CHANGED
@@ -16,11 +16,15 @@ module DataKit
|
|
16
16
|
|
17
17
|
def execute
|
18
18
|
::CSV.open(output_path, 'wb') do |writer|
|
19
|
-
|
19
|
+
first = true
|
20
|
+
converted = []
|
20
21
|
csv.each_row do |row|
|
21
|
-
|
22
|
-
|
22
|
+
if first
|
23
|
+
first = false
|
24
|
+
writer << csv.headers
|
23
25
|
end
|
26
|
+
|
27
|
+
writer << convert_row(csv.headers, row)
|
24
28
|
end
|
25
29
|
end
|
26
30
|
end
|
@@ -39,7 +43,15 @@ module DataKit
|
|
39
43
|
|
40
44
|
private
|
41
45
|
|
42
|
-
def
|
46
|
+
def convert_row(headers, row)
|
47
|
+
converted = []
|
48
|
+
headers.each_with_index do |field_name, index|
|
49
|
+
converted << convert_value(row[index], field_types[field_name])
|
50
|
+
end
|
51
|
+
converted
|
52
|
+
end
|
53
|
+
|
54
|
+
def convert_value(value, type)
|
43
55
|
if value.nil? || type == :string || type == :empty
|
44
56
|
return value.to_s
|
45
57
|
else
|
data/lib/data_kit/csv/field_analyzer.rb
CHANGED
@@ -14,16 +14,21 @@ module DataKit
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def execute
|
17
|
+
first = true
|
18
|
+
analysis = nil
|
17
19
|
random = Random.new
|
18
20
|
|
19
|
-
field_name = csv.headers[field_pos]
|
20
|
-
analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
|
21
|
-
|
22
21
|
csv.each_row do |row|
|
22
|
+
if first
|
23
|
+
first = false
|
24
|
+
field_name = csv.headers[field_pos]
|
25
|
+
analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
|
26
|
+
end
|
27
|
+
|
23
28
|
analysis.increment_total
|
24
29
|
if random.rand <= sampling_rate
|
25
30
|
analysis.increment_sample
|
26
|
-
analysis.insert(row[
|
31
|
+
analysis.insert(row[field_pos])
|
27
32
|
end
|
28
33
|
end
|
29
34
|
|
data/lib/data_kit/csv/parser.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require 'csv'
|
2
2
|
|
3
3
|
module DataKit
|
4
4
|
module CSV
|
@@ -9,29 +9,25 @@ module DataKit
|
|
9
9
|
|
10
10
|
def initialize(path)
|
11
11
|
@path = path
|
12
|
-
|
13
|
-
set_handle
|
14
|
-
set_headers
|
12
|
+
set_handle
|
15
13
|
end
|
16
14
|
|
17
15
|
def each_row(&block)
|
16
|
+
first = true
|
18
17
|
handle.rewind
|
19
|
-
|
20
|
-
|
18
|
+
|
19
|
+
::CSV.parse(handle, converters: nil) do |row|
|
20
|
+
if first == true
|
21
|
+
first = false
|
22
|
+
@headers = row
|
23
|
+
else
|
24
|
+
yield row
|
25
|
+
end
|
21
26
|
end
|
22
27
|
end
|
23
28
|
|
24
29
|
private
|
25
30
|
|
26
|
-
def columns
|
27
|
-
index = -1
|
28
|
-
@columns ||= headers.inject({}) do |result, field_name|
|
29
|
-
index += 1
|
30
|
-
result[index] = { :alias => field_name }
|
31
|
-
result
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
31
|
def set_handle
|
36
32
|
if path.is_a?(IO)
|
37
33
|
@handle = path
|
@@ -44,11 +40,6 @@ module DataKit
|
|
44
40
|
{:invalid => :replace, :undef => :replace, :replace => ''}
|
45
41
|
)
|
46
42
|
end
|
47
|
-
|
48
|
-
def set_headers
|
49
|
-
handle.rewind
|
50
|
-
Rcsv.parse(handle, :header => :none) { |row| @headers = row; break }
|
51
|
-
end
|
52
43
|
end
|
53
44
|
end
|
54
45
|
end
|
data/lib/data_kit/csv/schema_analyzer.rb
CHANGED
@@ -19,15 +19,21 @@ module DataKit
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def execute
|
22
|
+
first = true
|
23
|
+
analysis = nil
|
22
24
|
random = Random.new
|
23
|
-
analysis = SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
|
24
25
|
|
25
26
|
csv.each_row do |row|
|
27
|
+
if first
|
28
|
+
first = false
|
29
|
+
analysis = SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
|
30
|
+
end
|
31
|
+
|
26
32
|
analysis.increment_total
|
27
33
|
if random.rand <= sampling_rate
|
28
34
|
analysis.increment_sample
|
29
|
-
row.
|
30
|
-
analysis.insert(
|
35
|
+
row.each_with_index do |value, index|
|
36
|
+
analysis.insert(csv.headers[index].to_s, value)
|
31
37
|
end
|
32
38
|
end
|
33
39
|
end
|
data/lib/data_kit/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_kit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mode Analytics
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: rcsv
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - '>='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - '>='
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: timeliness
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -137,7 +123,6 @@ files:
|
|
137
123
|
- lib/data_kit/csv/schema_analyzer.rb
|
138
124
|
- lib/data_kit/dataset/field.rb
|
139
125
|
- lib/data_kit/dataset/schema.rb
|
140
|
-
- lib/data_kit/patches/rcsv.rb
|
141
126
|
- lib/data_kit/version.rb
|
142
127
|
- spec/converters/boolean_spec.rb
|
143
128
|
- spec/converters/date_time_spec.rb
|
data/lib/data_kit/patches/rcsv.rb
DELETED
@@ -1,124 +0,0 @@
|
|
1
|
-
require "rcsv/rcsv"
|
2
|
-
require "rcsv/version"
|
3
|
-
|
4
|
-
require "stringio"
|
5
|
-
|
6
|
-
#
|
7
|
-
# This is a temporary monkey patch to Rcsv.parse
|
8
|
-
# to silence warnings in Ruby 2 about #lines being deprecated
|
9
|
-
#
|
10
|
-
|
11
|
-
class Rcsv
|
12
|
-
def self.parse(csv_data, options = {}, &block)
|
13
|
-
options[:header] ||= :use
|
14
|
-
raw_options = {}
|
15
|
-
|
16
|
-
raw_options[:col_sep] = options[:column_separator] && options[:column_separator][0] || ','
|
17
|
-
raw_options[:offset_rows] = options[:offset_rows] || 0
|
18
|
-
raw_options[:nostrict] = options[:nostrict]
|
19
|
-
raw_options[:parse_empty_fields_as] = options[:parse_empty_fields_as]
|
20
|
-
raw_options[:buffer_size] = options[:buffer_size] || 1024 * 1024 # 1 MiB
|
21
|
-
|
22
|
-
if csv_data.is_a?(String)
|
23
|
-
csv_data = StringIO.new(csv_data)
|
24
|
-
elsif !(csv_data.respond_to?(:lines) && csv_data.respond_to?(:read))
|
25
|
-
inspected_csv_data = csv_data.inspect
|
26
|
-
raise ParseError.new("Supplied CSV object #{inspected_csv_data[0..127]}#{inspected_csv_data.size > 128 ? '...' : ''} is neither String nor looks like IO object.")
|
27
|
-
end
|
28
|
-
|
29
|
-
if csv_data.respond_to?(:external_encoding)
|
30
|
-
raw_options[:output_encoding] = csv_data.external_encoding.to_s
|
31
|
-
end
|
32
|
-
|
33
|
-
initial_position = csv_data.pos
|
34
|
-
|
35
|
-
first_line = csv_data.each_line.first
|
36
|
-
field_count = first_line.split(raw_options[:col_sep]).length
|
37
|
-
|
38
|
-
case options[:header]
|
39
|
-
when :use
|
40
|
-
header = self.raw_parse(StringIO.new(first_line), raw_options).first
|
41
|
-
raw_options[:offset_rows] += 1
|
42
|
-
when :skip
|
43
|
-
header = (0..field_count).to_a
|
44
|
-
raw_options[:offset_rows] += 1
|
45
|
-
when :none
|
46
|
-
header = (0..field_count).to_a
|
47
|
-
end
|
48
|
-
|
49
|
-
raw_options[:row_as_hash] = options[:row_as_hash] # Setting after header parsing
|
50
|
-
|
51
|
-
if options[:columns]
|
52
|
-
only_rows = []
|
53
|
-
except_rows = []
|
54
|
-
row_defaults = []
|
55
|
-
column_names = []
|
56
|
-
row_conversions = ''
|
57
|
-
|
58
|
-
header.each do |column_header|
|
59
|
-
column_options = options[:columns][column_header]
|
60
|
-
if column_options
|
61
|
-
if (options[:row_as_hash])
|
62
|
-
column_names << (column_options[:alias] || column_header)
|
63
|
-
end
|
64
|
-
|
65
|
-
row_defaults << column_options[:default] || nil
|
66
|
-
|
67
|
-
only_rows << case column_options[:match]
|
68
|
-
when Array
|
69
|
-
column_options[:match]
|
70
|
-
when nil
|
71
|
-
nil
|
72
|
-
else
|
73
|
-
[column_options[:match]]
|
74
|
-
end
|
75
|
-
|
76
|
-
except_rows << case column_options[:not_match]
|
77
|
-
when Array
|
78
|
-
column_options[:not_match]
|
79
|
-
when nil
|
80
|
-
nil
|
81
|
-
else
|
82
|
-
[column_options[:not_match]]
|
83
|
-
end
|
84
|
-
|
85
|
-
row_conversions << case column_options[:type]
|
86
|
-
when :int
|
87
|
-
'i'
|
88
|
-
when :float
|
89
|
-
'f'
|
90
|
-
when :string
|
91
|
-
's'
|
92
|
-
when :bool
|
93
|
-
'b'
|
94
|
-
when nil
|
95
|
-
's' # strings by default
|
96
|
-
else
|
97
|
-
fail "Unknown column type #{column_options[:type].inspect}."
|
98
|
-
end
|
99
|
-
elsif options[:only_listed_columns]
|
100
|
-
column_names << nil
|
101
|
-
row_defaults << nil
|
102
|
-
only_rows << nil
|
103
|
-
except_rows << nil
|
104
|
-
row_conversions << ' '
|
105
|
-
else
|
106
|
-
column_names << column_header
|
107
|
-
row_defaults << nil
|
108
|
-
only_rows << nil
|
109
|
-
except_rows << nil
|
110
|
-
row_conversions << 's'
|
111
|
-
end
|
112
|
-
end
|
113
|
-
|
114
|
-
raw_options[:column_names] = column_names if options[:row_as_hash]
|
115
|
-
raw_options[:only_rows] = only_rows unless only_rows.compact.empty?
|
116
|
-
raw_options[:except_rows] = except_rows unless except_rows.compact.empty?
|
117
|
-
raw_options[:row_defaults] = row_defaults unless row_defaults.compact.empty?
|
118
|
-
raw_options[:row_conversions] = row_conversions
|
119
|
-
end
|
120
|
-
|
121
|
-
csv_data.pos = initial_position
|
122
|
-
return self.raw_parse(csv_data, raw_options, &block)
|
123
|
-
end
|
124
|
-
end
|