data_kit 0.0.2 → 0.0.3
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/lib/data_kit/csv/field_analysis.rb +80 -0
- data/lib/data_kit/csv/field_analyzer.rb +40 -0
- data/lib/data_kit/csv/{analysis.rb → schema_analysis.rb} +1 -1
- data/lib/data_kit/csv/{analyzer.rb → schema_analyzer.rb} +9 -9
- data/lib/data_kit/version.rb +1 -1
- data/lib/data_kit.rb +9 -3
- data/spec/csv/converter_spec.rb +2 -2
- data/spec/csv/field_analysis_spec.rb +71 -0
- data/spec/csv/field_analyzer_spec.rb +45 -0
- data/spec/csv/{analysis_spec.rb → schema_analysis_spec.rb} +6 -6
- data/spec/csv/{analyzer_spec.rb → schema_analyzer_spec.rb} +9 -11
- metadata +14 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eed0243b61997177b645f36dd755c121dd36c177
|
4
|
+
data.tar.gz: a9631c21ab9bf41820e8f4d28bec5bcf7a0c43b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b8468fc7bda9be6701312139e1d2cd3ad5933c0d1ddd0d38f4e9e373ba1dd27b176e1a1928c4a1790cd8d8e1873fdbecae9adc86266478f438595d0f8d32f1a0
|
7
|
+
data.tar.gz: 4f5dcdab6b2a42d05ba8a71bf38391b8109ba4adb6a46e5077f73dbfad796a9fd5706b5d07343a22e56aa438982bac5b27e889a964e4444b7327e3958a0340ed
|
data/.travis.yml
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -0,0 +1,80 @@
|
|
1
|
+
module DataKit
|
2
|
+
module CSV
|
3
|
+
class FieldAnalysis
|
4
|
+
attr_reader :field_name
|
5
|
+
attr_reader :match_type
|
6
|
+
|
7
|
+
attr_reader :types # {10 => :string, ...}
|
8
|
+
attr_reader :values # {10 => "2010-13-01"}
|
9
|
+
attr_reader :row_count
|
10
|
+
attr_reader :sample_count
|
11
|
+
|
12
|
+
def initialize(field_name, options = {})
|
13
|
+
@field_name = field_name
|
14
|
+
|
15
|
+
@types, @values = {}, {}
|
16
|
+
@row_count, @sample_count = 0, 0
|
17
|
+
|
18
|
+
@match_type = options[:match_type] || :any
|
19
|
+
|
20
|
+
Dataset::Field::Types.each do |type|
|
21
|
+
@types[type] = []
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def increment_total
|
26
|
+
@row_count += 1
|
27
|
+
end
|
28
|
+
|
29
|
+
def increment_sample
|
30
|
+
@sample_count += 1
|
31
|
+
end
|
32
|
+
|
33
|
+
def insert(value)
|
34
|
+
value_type = Dataset::Field.type?(value)
|
35
|
+
|
36
|
+
if match_type.nil? || match_type == :any
|
37
|
+
insert_value_with_type(value, value_type)
|
38
|
+
elsif value_type == match_type
|
39
|
+
insert_value_with_type(value, value_type)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def type?
|
44
|
+
if has_single_type?
|
45
|
+
type_list.first
|
46
|
+
elsif has_only_numeric_types?
|
47
|
+
:number
|
48
|
+
else
|
49
|
+
:string
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def value_at(row_num)
|
54
|
+
@values[row_num]
|
55
|
+
end
|
56
|
+
|
57
|
+
def type_count(type)
|
58
|
+
types[type].length
|
59
|
+
end
|
60
|
+
|
61
|
+
def type_list
|
62
|
+
types.keys.select{ |type| @types[type].length > 0 }
|
63
|
+
end
|
64
|
+
|
65
|
+
def has_single_type?
|
66
|
+
type_list.length == 1
|
67
|
+
end
|
68
|
+
|
69
|
+
def has_only_numeric_types?
|
70
|
+
(type_list - [:integer, :number, :null]).length == 0
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
def insert_value_with_type(value, type)
|
75
|
+
@values[row_count] = value
|
76
|
+
@types[type] << row_count
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module DataKit
|
2
|
+
module CSV
|
3
|
+
class FieldAnalyzer
|
4
|
+
attr_accessor :csv
|
5
|
+
attr_accessor :field_pos
|
6
|
+
attr_accessor :match_type
|
7
|
+
attr_accessor :sampling_rate
|
8
|
+
|
9
|
+
def initialize(csv, field_pos, options = {})
|
10
|
+
@csv = csv
|
11
|
+
@field_pos = field_pos
|
12
|
+
@match_type = options[:match_type] || :any
|
13
|
+
@sampling_rate = options[:sampling_rate] || 0.1
|
14
|
+
end
|
15
|
+
|
16
|
+
def execute
|
17
|
+
random = Random.new
|
18
|
+
|
19
|
+
field_name = csv.headers[field_pos]
|
20
|
+
analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
|
21
|
+
|
22
|
+
csv.each_row do |row|
|
23
|
+
analysis.increment_total
|
24
|
+
if random.rand <= sampling_rate
|
25
|
+
analysis.increment_sample
|
26
|
+
analysis.insert(row[field_name])
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
analysis
|
31
|
+
end
|
32
|
+
|
33
|
+
class << self
|
34
|
+
def analyze(csv, field_pos, options = {})
|
35
|
+
new(csv, field_pos, options).execute
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -1,23 +1,23 @@
|
|
1
1
|
module DataKit
|
2
2
|
module CSV
|
3
|
-
class
|
3
|
+
class SchemaAnalyzer
|
4
4
|
attr_accessor :csv
|
5
5
|
attr_accessor :keys
|
6
|
-
attr_accessor :
|
6
|
+
attr_accessor :sampling_rate
|
7
7
|
|
8
8
|
def initialize(csv, options = {})
|
9
9
|
@csv = csv
|
10
10
|
@keys = options[:keys] || []
|
11
|
-
@
|
11
|
+
@sampling_rate = options[:sampling_rate] || 0.1
|
12
12
|
end
|
13
13
|
|
14
14
|
def execute
|
15
15
|
random = Random.new
|
16
|
-
analysis =
|
16
|
+
analysis = SchemaAnalysis.new(csv.headers)
|
17
17
|
|
18
18
|
csv.each_row do |row|
|
19
19
|
analysis.increment_total
|
20
|
-
if random.rand <=
|
20
|
+
if random.rand <= sampling_rate
|
21
21
|
analysis.increment_sample
|
22
22
|
row.keys.each do |field_name|
|
23
23
|
analysis.insert(field_name.to_s, row[field_name])
|
@@ -32,18 +32,18 @@ module DataKit
|
|
32
32
|
def analyze(csv, options = {})
|
33
33
|
analyzer = new(csv,
|
34
34
|
:keys => options[:keys],
|
35
|
-
:
|
35
|
+
:sampling_rate => options[:sampling_rate]
|
36
36
|
)
|
37
37
|
|
38
38
|
analyzer.execute
|
39
39
|
end
|
40
40
|
|
41
|
-
def
|
41
|
+
def sampling_rate(file_size)
|
42
42
|
if file_size < (1024 * 1024)
|
43
|
-
|
43
|
+
sampling_rate = 1.0
|
44
44
|
else
|
45
45
|
scale_factor = 500
|
46
|
-
|
46
|
+
sampling_rate = (scale_factor / Math.sqrt(file_size)).round(4)
|
47
47
|
end
|
48
48
|
end
|
49
49
|
end
|
data/lib/data_kit/version.rb
CHANGED
data/lib/data_kit.rb
CHANGED
@@ -1,9 +1,15 @@
|
|
1
1
|
require "data_kit/version"
|
2
2
|
|
3
|
-
#
|
3
|
+
# Parsers
|
4
4
|
require 'data_kit/csv/parser'
|
5
|
-
|
6
|
-
|
5
|
+
|
6
|
+
# Analyzers
|
7
|
+
require 'data_kit/csv/field_analyzer'
|
8
|
+
require 'data_kit/csv/field_analysis'
|
9
|
+
require 'data_kit/csv/schema_analyzer'
|
10
|
+
require 'data_kit/csv/schema_analysis'
|
11
|
+
|
12
|
+
# Converters
|
7
13
|
require 'data_kit/csv/converter'
|
8
14
|
|
9
15
|
# Data Conversion
|
data/spec/csv/converter_spec.rb
CHANGED
@@ -14,7 +14,7 @@ describe DataKit::CSV::Converter do
|
|
14
14
|
}
|
15
15
|
|
16
16
|
it "should initialize and execute" do
|
17
|
-
analysis = DataKit::CSV::
|
17
|
+
analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
|
18
18
|
converter = DataKit::CSV::Converter.new(csv, analysis, target)
|
19
19
|
|
20
20
|
converter.execute
|
@@ -25,7 +25,7 @@ describe DataKit::CSV::Converter do
|
|
25
25
|
end
|
26
26
|
|
27
27
|
it "should convert using the convience method" do
|
28
|
-
analysis = DataKit::CSV::
|
28
|
+
analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
|
29
29
|
converter = DataKit::CSV::Converter.convert(csv, analysis, target)
|
30
30
|
|
31
31
|
row_count = 0
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DataKit::CSV::FieldAnalysis do
|
4
|
+
it "should increment totals and samples" do
|
5
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
6
|
+
|
7
|
+
analysis.increment_total
|
8
|
+
analysis.increment_total
|
9
|
+
|
10
|
+
analysis.increment_sample
|
11
|
+
|
12
|
+
analysis.row_count.should == 2
|
13
|
+
analysis.sample_count.should == 1
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should insert a row for analysis" do
|
17
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
18
|
+
|
19
|
+
analysis.insert '1.0'
|
20
|
+
analysis.insert '2.0'
|
21
|
+
|
22
|
+
analysis.type?.should == :number
|
23
|
+
analysis.has_single_type?.should == true
|
24
|
+
analysis.has_only_numeric_types?.should == true
|
25
|
+
analysis.type_count(:number).should == 2
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should return the value for a specific row" do
|
29
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
30
|
+
|
31
|
+
analysis.increment_total
|
32
|
+
analysis.insert '1.0'
|
33
|
+
|
34
|
+
analysis.increment_total
|
35
|
+
analysis.insert '2.0'
|
36
|
+
|
37
|
+
analysis.value_at(1).should == '1.0'
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should infer a string type if there non-numeric mixed types" do
|
41
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
42
|
+
|
43
|
+
analysis.insert '1.0'
|
44
|
+
analysis.insert '2.0'
|
45
|
+
analysis.insert '2.0'
|
46
|
+
analysis.insert 'str2'
|
47
|
+
|
48
|
+
analysis.type?.should == :string
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should infer a number type if there are mixed numeric types" do
|
52
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
53
|
+
|
54
|
+
analysis.insert '1.0'
|
55
|
+
analysis.insert '20'
|
56
|
+
analysis.insert nil
|
57
|
+
|
58
|
+
analysis.type?.should == :number
|
59
|
+
end
|
60
|
+
|
61
|
+
it "should filter analysis to a specific type" do
|
62
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1', :match_type => :number)
|
63
|
+
|
64
|
+
analysis.insert '1.0'
|
65
|
+
analysis.insert '20'
|
66
|
+
analysis.insert nil
|
67
|
+
analysis.insert 'str2'
|
68
|
+
|
69
|
+
analysis.type?.should == :number
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DataKit::CSV::FieldAnalyzer do
|
4
|
+
let(:path) {
|
5
|
+
data_path('standard.csv')
|
6
|
+
}
|
7
|
+
|
8
|
+
let(:csv) {
|
9
|
+
DataKit::CSV::Parser.new(path)
|
10
|
+
}
|
11
|
+
|
12
|
+
let(:iocsv) {
|
13
|
+
DataKit::CSV::Parser.new(File.open(path))
|
14
|
+
}
|
15
|
+
|
16
|
+
it "should initialize" do
|
17
|
+
analyzer = DataKit::CSV::FieldAnalyzer.new(csv, 1)
|
18
|
+
|
19
|
+
analyzer.csv.should == csv
|
20
|
+
analyzer.field_pos.should == 1
|
21
|
+
analyzer.sampling_rate.should == 0.1
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should initialize schema with an IO" do
|
25
|
+
analyzer = DataKit::CSV::FieldAnalyzer.new(iocsv, 1)
|
26
|
+
|
27
|
+
analyzer.csv.should == iocsv
|
28
|
+
analyzer.field_pos.should == 1
|
29
|
+
analyzer.sampling_rate.should == 0.1
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should execute an analysis" do
|
33
|
+
analysis = DataKit::CSV::FieldAnalyzer.new(csv, 8, :sampling_rate => 0.5).execute
|
34
|
+
|
35
|
+
analysis.type?.should == :datetime # activated_at
|
36
|
+
|
37
|
+
analysis.row_count.should == 10
|
38
|
+
analysis.sample_count.should be < 10
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should analyze using the static convenience method" do
|
42
|
+
analysis = DataKit::CSV::FieldAnalyzer.analyze(csv, 8, :sampling_rate => 0.5)
|
43
|
+
analysis.type?.should == :datetime # activated_at
|
44
|
+
end
|
45
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe DataKit::CSV::
|
3
|
+
describe DataKit::CSV::SchemaAnalysis do
|
4
4
|
it "should insert a row for analysis" do
|
5
|
-
analysis = DataKit::CSV::
|
5
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
|
6
6
|
|
7
7
|
analysis.insert 'field1', '1.0'
|
8
8
|
analysis.insert 'field1', '2.0'
|
@@ -11,7 +11,7 @@ describe DataKit::CSV::Analysis do
|
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should know the number of rows with a particular type" do
|
14
|
-
analysis = DataKit::CSV::
|
14
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
|
15
15
|
|
16
16
|
analysis.insert 'field1', '1.0'
|
17
17
|
analysis.insert 'field1', '2.0'
|
@@ -20,7 +20,7 @@ describe DataKit::CSV::Analysis do
|
|
20
20
|
end
|
21
21
|
|
22
22
|
it "should determine the type of a field" do
|
23
|
-
analysis = DataKit::CSV::
|
23
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
|
24
24
|
|
25
25
|
analysis.insert 'field1', '1.0'
|
26
26
|
analysis.insert 'field1', '2.0'
|
@@ -32,7 +32,7 @@ describe DataKit::CSV::Analysis do
|
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should infer a string type if there non-numeric mixed types" do
|
35
|
-
analysis = DataKit::CSV::
|
35
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
|
36
36
|
|
37
37
|
analysis.insert 'field1', '1.0'
|
38
38
|
analysis.insert 'field1', '2.0'
|
@@ -44,7 +44,7 @@ describe DataKit::CSV::Analysis do
|
|
44
44
|
end
|
45
45
|
|
46
46
|
it "should infer a number type if there are mixed numeric types" do
|
47
|
-
analysis = DataKit::CSV::
|
47
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
|
48
48
|
|
49
49
|
analysis.insert 'field1', '1.0'
|
50
50
|
analysis.insert 'field1', '20'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe DataKit::CSV::
|
3
|
+
describe DataKit::CSV::SchemaAnalyzer do
|
4
4
|
let(:path) {
|
5
5
|
data_path('standard.csv')
|
6
6
|
}
|
@@ -14,25 +14,23 @@ describe DataKit::CSV::Analyzer do
|
|
14
14
|
}
|
15
15
|
|
16
16
|
it "should initialize" do
|
17
|
-
analyzer = DataKit::CSV::
|
17
|
+
analyzer = DataKit::CSV::SchemaAnalyzer.new(csv)
|
18
18
|
|
19
19
|
analyzer.csv.should == csv
|
20
20
|
analyzer.keys.should == []
|
21
|
-
analyzer.
|
21
|
+
analyzer.sampling_rate.should == 0.1
|
22
22
|
end
|
23
23
|
|
24
24
|
it "should initialize schema with an IO" do
|
25
|
-
analyzer = DataKit::CSV::
|
25
|
+
analyzer = DataKit::CSV::SchemaAnalyzer.new(iocsv)
|
26
26
|
|
27
27
|
analyzer.csv.should == iocsv
|
28
28
|
analyzer.keys.should == []
|
29
|
-
analyzer.
|
29
|
+
analyzer.sampling_rate.should == 0.1
|
30
30
|
end
|
31
31
|
|
32
32
|
it "should execute an analysis" do
|
33
|
-
analysis = DataKit::CSV::
|
34
|
-
|
35
|
-
puts analysis.inspect
|
33
|
+
analysis = DataKit::CSV::SchemaAnalyzer.new(csv, :sampling_rate => 0.5).execute
|
36
34
|
|
37
35
|
analysis.type?('id').should == :integer
|
38
36
|
analysis.type?('first_name').should == :string
|
@@ -49,8 +47,8 @@ describe DataKit::CSV::Analyzer do
|
|
49
47
|
analysis.sample_count.should be < 10
|
50
48
|
end
|
51
49
|
|
52
|
-
it "should calculate a
|
53
|
-
DataKit::CSV::
|
54
|
-
DataKit::CSV::
|
50
|
+
it "should calculate a sampling_rate" do
|
51
|
+
DataKit::CSV::SchemaAnalyzer.sampling_rate(1024).should == 1
|
52
|
+
DataKit::CSV::SchemaAnalyzer.sampling_rate(2048 * 2048).should be < 1
|
55
53
|
end
|
56
54
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_kit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mode Analytics
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-12-
|
11
|
+
date: 2013-12-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rcsv
|
@@ -130,10 +130,12 @@ files:
|
|
130
130
|
- lib/data_kit/converters/date_time.rb
|
131
131
|
- lib/data_kit/converters/integer.rb
|
132
132
|
- lib/data_kit/converters/number.rb
|
133
|
-
- lib/data_kit/csv/analysis.rb
|
134
|
-
- lib/data_kit/csv/analyzer.rb
|
135
133
|
- lib/data_kit/csv/converter.rb
|
134
|
+
- lib/data_kit/csv/field_analysis.rb
|
135
|
+
- lib/data_kit/csv/field_analyzer.rb
|
136
136
|
- lib/data_kit/csv/parser.rb
|
137
|
+
- lib/data_kit/csv/schema_analysis.rb
|
138
|
+
- lib/data_kit/csv/schema_analyzer.rb
|
137
139
|
- lib/data_kit/dataset/field.rb
|
138
140
|
- lib/data_kit/dataset/schema.rb
|
139
141
|
- lib/data_kit/patches/rcsv.rb
|
@@ -142,10 +144,12 @@ files:
|
|
142
144
|
- spec/converters/date_time_spec.rb
|
143
145
|
- spec/converters/integer_spec.rb
|
144
146
|
- spec/converters/number_spec.rb
|
145
|
-
- spec/csv/analysis_spec.rb
|
146
|
-
- spec/csv/analyzer_spec.rb
|
147
147
|
- spec/csv/converter_spec.rb
|
148
|
+
- spec/csv/field_analysis_spec.rb
|
149
|
+
- spec/csv/field_analyzer_spec.rb
|
148
150
|
- spec/csv/parser_spec.rb
|
151
|
+
- spec/csv/schema_analysis_spec.rb
|
152
|
+
- spec/csv/schema_analyzer_spec.rb
|
149
153
|
- spec/dataset/field_spec.rb
|
150
154
|
- spec/dataset/schema_spec.rb
|
151
155
|
- spec/fixtures/carriage_returns.csv
|
@@ -181,10 +185,12 @@ test_files:
|
|
181
185
|
- spec/converters/date_time_spec.rb
|
182
186
|
- spec/converters/integer_spec.rb
|
183
187
|
- spec/converters/number_spec.rb
|
184
|
-
- spec/csv/analysis_spec.rb
|
185
|
-
- spec/csv/analyzer_spec.rb
|
186
188
|
- spec/csv/converter_spec.rb
|
189
|
+
- spec/csv/field_analysis_spec.rb
|
190
|
+
- spec/csv/field_analyzer_spec.rb
|
187
191
|
- spec/csv/parser_spec.rb
|
192
|
+
- spec/csv/schema_analysis_spec.rb
|
193
|
+
- spec/csv/schema_analyzer_spec.rb
|
188
194
|
- spec/dataset/field_spec.rb
|
189
195
|
- spec/dataset/schema_spec.rb
|
190
196
|
- spec/fixtures/carriage_returns.csv
|