data_kit 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/lib/data_kit/csv/field_analysis.rb +80 -0
- data/lib/data_kit/csv/field_analyzer.rb +40 -0
- data/lib/data_kit/csv/{analysis.rb → schema_analysis.rb} +1 -1
- data/lib/data_kit/csv/{analyzer.rb → schema_analyzer.rb} +9 -9
- data/lib/data_kit/version.rb +1 -1
- data/lib/data_kit.rb +9 -3
- data/spec/csv/converter_spec.rb +2 -2
- data/spec/csv/field_analysis_spec.rb +71 -0
- data/spec/csv/field_analyzer_spec.rb +45 -0
- data/spec/csv/{analysis_spec.rb → schema_analysis_spec.rb} +6 -6
- data/spec/csv/{analyzer_spec.rb → schema_analyzer_spec.rb} +9 -11
- metadata +14 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eed0243b61997177b645f36dd755c121dd36c177
|
4
|
+
data.tar.gz: a9631c21ab9bf41820e8f4d28bec5bcf7a0c43b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b8468fc7bda9be6701312139e1d2cd3ad5933c0d1ddd0d38f4e9e373ba1dd27b176e1a1928c4a1790cd8d8e1873fdbecae9adc86266478f438595d0f8d32f1a0
|
7
|
+
data.tar.gz: 4f5dcdab6b2a42d05ba8a71bf38391b8109ba4adb6a46e5077f73dbfad796a9fd5706b5d07343a22e56aa438982bac5b27e889a964e4444b7327e3958a0340ed
|
data/.travis.yml
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -0,0 +1,80 @@
|
|
1
|
+
module DataKit
|
2
|
+
module CSV
|
3
|
+
class FieldAnalysis
|
4
|
+
attr_reader :field_name
|
5
|
+
attr_reader :match_type
|
6
|
+
|
7
|
+
attr_reader :types # {10 => :string, ...}
|
8
|
+
attr_reader :values # {10 => "2010-13-01"}
|
9
|
+
attr_reader :row_count
|
10
|
+
attr_reader :sample_count
|
11
|
+
|
12
|
+
def initialize(field_name, options = {})
|
13
|
+
@field_name = field_name
|
14
|
+
|
15
|
+
@types, @values = {}, {}
|
16
|
+
@row_count, @sample_count = 0, 0
|
17
|
+
|
18
|
+
@match_type = options[:match_type] || :any
|
19
|
+
|
20
|
+
Dataset::Field::Types.each do |type|
|
21
|
+
@types[type] = []
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def increment_total
|
26
|
+
@row_count += 1
|
27
|
+
end
|
28
|
+
|
29
|
+
def increment_sample
|
30
|
+
@sample_count += 1
|
31
|
+
end
|
32
|
+
|
33
|
+
def insert(value)
|
34
|
+
value_type = Dataset::Field.type?(value)
|
35
|
+
|
36
|
+
if match_type.nil? || match_type == :any
|
37
|
+
insert_value_with_type(value, value_type)
|
38
|
+
elsif value_type == match_type
|
39
|
+
insert_value_with_type(value, value_type)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def type?
|
44
|
+
if has_single_type?
|
45
|
+
type_list.first
|
46
|
+
elsif has_only_numeric_types?
|
47
|
+
:number
|
48
|
+
else
|
49
|
+
:string
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def value_at(row_num)
|
54
|
+
@values[row_num]
|
55
|
+
end
|
56
|
+
|
57
|
+
def type_count(type)
|
58
|
+
types[type].length
|
59
|
+
end
|
60
|
+
|
61
|
+
def type_list
|
62
|
+
types.keys.select{ |type| @types[type].length > 0 }
|
63
|
+
end
|
64
|
+
|
65
|
+
def has_single_type?
|
66
|
+
type_list.length == 1
|
67
|
+
end
|
68
|
+
|
69
|
+
def has_only_numeric_types?
|
70
|
+
(type_list - [:integer, :number, :null]).length == 0
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
def insert_value_with_type(value, type)
|
75
|
+
@values[row_count] = value
|
76
|
+
@types[type] << row_count
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module DataKit
|
2
|
+
module CSV
|
3
|
+
class FieldAnalyzer
|
4
|
+
attr_accessor :csv
|
5
|
+
attr_accessor :field_pos
|
6
|
+
attr_accessor :match_type
|
7
|
+
attr_accessor :sampling_rate
|
8
|
+
|
9
|
+
def initialize(csv, field_pos, options = {})
|
10
|
+
@csv = csv
|
11
|
+
@field_pos = field_pos
|
12
|
+
@match_type = options[:match_type] || :any
|
13
|
+
@sampling_rate = options[:sampling_rate] || 0.1
|
14
|
+
end
|
15
|
+
|
16
|
+
def execute
|
17
|
+
random = Random.new
|
18
|
+
|
19
|
+
field_name = csv.headers[field_pos]
|
20
|
+
analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
|
21
|
+
|
22
|
+
csv.each_row do |row|
|
23
|
+
analysis.increment_total
|
24
|
+
if random.rand <= sampling_rate
|
25
|
+
analysis.increment_sample
|
26
|
+
analysis.insert(row[field_name])
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
analysis
|
31
|
+
end
|
32
|
+
|
33
|
+
class << self
|
34
|
+
def analyze(csv, field_pos, options = {})
|
35
|
+
new(csv, field_pos, options).execute
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -1,23 +1,23 @@
|
|
1
1
|
module DataKit
|
2
2
|
module CSV
|
3
|
-
class
|
3
|
+
class SchemaAnalyzer
|
4
4
|
attr_accessor :csv
|
5
5
|
attr_accessor :keys
|
6
|
-
attr_accessor :
|
6
|
+
attr_accessor :sampling_rate
|
7
7
|
|
8
8
|
def initialize(csv, options = {})
|
9
9
|
@csv = csv
|
10
10
|
@keys = options[:keys] || []
|
11
|
-
@
|
11
|
+
@sampling_rate = options[:sampling_rate] || 0.1
|
12
12
|
end
|
13
13
|
|
14
14
|
def execute
|
15
15
|
random = Random.new
|
16
|
-
analysis =
|
16
|
+
analysis = SchemaAnalysis.new(csv.headers)
|
17
17
|
|
18
18
|
csv.each_row do |row|
|
19
19
|
analysis.increment_total
|
20
|
-
if random.rand <=
|
20
|
+
if random.rand <= sampling_rate
|
21
21
|
analysis.increment_sample
|
22
22
|
row.keys.each do |field_name|
|
23
23
|
analysis.insert(field_name.to_s, row[field_name])
|
@@ -32,18 +32,18 @@ module DataKit
|
|
32
32
|
def analyze(csv, options = {})
|
33
33
|
analyzer = new(csv,
|
34
34
|
:keys => options[:keys],
|
35
|
-
:
|
35
|
+
:sampling_rate => options[:sampling_rate]
|
36
36
|
)
|
37
37
|
|
38
38
|
analyzer.execute
|
39
39
|
end
|
40
40
|
|
41
|
-
def
|
41
|
+
def sampling_rate(file_size)
|
42
42
|
if file_size < (1024 * 1024)
|
43
|
-
|
43
|
+
sampling_rate = 1.0
|
44
44
|
else
|
45
45
|
scale_factor = 500
|
46
|
-
|
46
|
+
sampling_rate = (scale_factor / Math.sqrt(file_size)).round(4)
|
47
47
|
end
|
48
48
|
end
|
49
49
|
end
|
data/lib/data_kit/version.rb
CHANGED
data/lib/data_kit.rb
CHANGED
@@ -1,9 +1,15 @@
|
|
1
1
|
require "data_kit/version"
|
2
2
|
|
3
|
-
#
|
3
|
+
# Parsers
|
4
4
|
require 'data_kit/csv/parser'
|
5
|
-
|
6
|
-
|
5
|
+
|
6
|
+
# Analyzers
|
7
|
+
require 'data_kit/csv/field_analyzer'
|
8
|
+
require 'data_kit/csv/field_analysis'
|
9
|
+
require 'data_kit/csv/schema_analyzer'
|
10
|
+
require 'data_kit/csv/schema_analysis'
|
11
|
+
|
12
|
+
# Converters
|
7
13
|
require 'data_kit/csv/converter'
|
8
14
|
|
9
15
|
# Data Conversion
|
data/spec/csv/converter_spec.rb
CHANGED
@@ -14,7 +14,7 @@ describe DataKit::CSV::Converter do
|
|
14
14
|
}
|
15
15
|
|
16
16
|
it "should initialize and execute" do
|
17
|
-
analysis = DataKit::CSV::
|
17
|
+
analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
|
18
18
|
converter = DataKit::CSV::Converter.new(csv, analysis, target)
|
19
19
|
|
20
20
|
converter.execute
|
@@ -25,7 +25,7 @@ describe DataKit::CSV::Converter do
|
|
25
25
|
end
|
26
26
|
|
27
27
|
it "should convert using the convience method" do
|
28
|
-
analysis = DataKit::CSV::
|
28
|
+
analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
|
29
29
|
converter = DataKit::CSV::Converter.convert(csv, analysis, target)
|
30
30
|
|
31
31
|
row_count = 0
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DataKit::CSV::FieldAnalysis do
|
4
|
+
it "should increment totals and samples" do
|
5
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
6
|
+
|
7
|
+
analysis.increment_total
|
8
|
+
analysis.increment_total
|
9
|
+
|
10
|
+
analysis.increment_sample
|
11
|
+
|
12
|
+
analysis.row_count.should == 2
|
13
|
+
analysis.sample_count.should == 1
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should insert a row for analysis" do
|
17
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
18
|
+
|
19
|
+
analysis.insert '1.0'
|
20
|
+
analysis.insert '2.0'
|
21
|
+
|
22
|
+
analysis.type?.should == :number
|
23
|
+
analysis.has_single_type?.should == true
|
24
|
+
analysis.has_only_numeric_types?.should == true
|
25
|
+
analysis.type_count(:number).should == 2
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should return the value for a specific row" do
|
29
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
30
|
+
|
31
|
+
analysis.increment_total
|
32
|
+
analysis.insert '1.0'
|
33
|
+
|
34
|
+
analysis.increment_total
|
35
|
+
analysis.insert '2.0'
|
36
|
+
|
37
|
+
analysis.value_at(1).should == '1.0'
|
38
|
+
end
|
39
|
+
|
40
|
+
it "should infer a string type if there non-numeric mixed types" do
|
41
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
42
|
+
|
43
|
+
analysis.insert '1.0'
|
44
|
+
analysis.insert '2.0'
|
45
|
+
analysis.insert '2.0'
|
46
|
+
analysis.insert 'str2'
|
47
|
+
|
48
|
+
analysis.type?.should == :string
|
49
|
+
end
|
50
|
+
|
51
|
+
it "should infer a number type if there are mixed numeric types" do
|
52
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1')
|
53
|
+
|
54
|
+
analysis.insert '1.0'
|
55
|
+
analysis.insert '20'
|
56
|
+
analysis.insert nil
|
57
|
+
|
58
|
+
analysis.type?.should == :number
|
59
|
+
end
|
60
|
+
|
61
|
+
it "should filter analysis to a specific type" do
|
62
|
+
analysis = DataKit::CSV::FieldAnalysis.new('field1', :match_type => :number)
|
63
|
+
|
64
|
+
analysis.insert '1.0'
|
65
|
+
analysis.insert '20'
|
66
|
+
analysis.insert nil
|
67
|
+
analysis.insert 'str2'
|
68
|
+
|
69
|
+
analysis.type?.should == :number
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe DataKit::CSV::FieldAnalyzer do
|
4
|
+
let(:path) {
|
5
|
+
data_path('standard.csv')
|
6
|
+
}
|
7
|
+
|
8
|
+
let(:csv) {
|
9
|
+
DataKit::CSV::Parser.new(path)
|
10
|
+
}
|
11
|
+
|
12
|
+
let(:iocsv) {
|
13
|
+
DataKit::CSV::Parser.new(File.open(path))
|
14
|
+
}
|
15
|
+
|
16
|
+
it "should initialize" do
|
17
|
+
analyzer = DataKit::CSV::FieldAnalyzer.new(csv, 1)
|
18
|
+
|
19
|
+
analyzer.csv.should == csv
|
20
|
+
analyzer.field_pos.should == 1
|
21
|
+
analyzer.sampling_rate.should == 0.1
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should initialize schema with an IO" do
|
25
|
+
analyzer = DataKit::CSV::FieldAnalyzer.new(iocsv, 1)
|
26
|
+
|
27
|
+
analyzer.csv.should == iocsv
|
28
|
+
analyzer.field_pos.should == 1
|
29
|
+
analyzer.sampling_rate.should == 0.1
|
30
|
+
end
|
31
|
+
|
32
|
+
it "should execute an analysis" do
|
33
|
+
analysis = DataKit::CSV::FieldAnalyzer.new(csv, 8, :sampling_rate => 0.5).execute
|
34
|
+
|
35
|
+
analysis.type?.should == :datetime # activated_at
|
36
|
+
|
37
|
+
analysis.row_count.should == 10
|
38
|
+
analysis.sample_count.should be < 10
|
39
|
+
end
|
40
|
+
|
41
|
+
it "should analyze using the static convenience method" do
|
42
|
+
analysis = DataKit::CSV::FieldAnalyzer.analyze(csv, 8, :sampling_rate => 0.5)
|
43
|
+
analysis.type?.should == :datetime # activated_at
|
44
|
+
end
|
45
|
+
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe DataKit::CSV::
|
3
|
+
describe DataKit::CSV::SchemaAnalysis do
|
4
4
|
it "should insert a row for analysis" do
|
5
|
-
analysis = DataKit::CSV::
|
5
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
|
6
6
|
|
7
7
|
analysis.insert 'field1', '1.0'
|
8
8
|
analysis.insert 'field1', '2.0'
|
@@ -11,7 +11,7 @@ describe DataKit::CSV::Analysis do
|
|
11
11
|
end
|
12
12
|
|
13
13
|
it "should know the number of rows with a particular type" do
|
14
|
-
analysis = DataKit::CSV::
|
14
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
|
15
15
|
|
16
16
|
analysis.insert 'field1', '1.0'
|
17
17
|
analysis.insert 'field1', '2.0'
|
@@ -20,7 +20,7 @@ describe DataKit::CSV::Analysis do
|
|
20
20
|
end
|
21
21
|
|
22
22
|
it "should determine the type of a field" do
|
23
|
-
analysis = DataKit::CSV::
|
23
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
|
24
24
|
|
25
25
|
analysis.insert 'field1', '1.0'
|
26
26
|
analysis.insert 'field1', '2.0'
|
@@ -32,7 +32,7 @@ describe DataKit::CSV::Analysis do
|
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should infer a string type if there non-numeric mixed types" do
|
35
|
-
analysis = DataKit::CSV::
|
35
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
|
36
36
|
|
37
37
|
analysis.insert 'field1', '1.0'
|
38
38
|
analysis.insert 'field1', '2.0'
|
@@ -44,7 +44,7 @@ describe DataKit::CSV::Analysis do
|
|
44
44
|
end
|
45
45
|
|
46
46
|
it "should infer a number type if there are mixed numeric types" do
|
47
|
-
analysis = DataKit::CSV::
|
47
|
+
analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
|
48
48
|
|
49
49
|
analysis.insert 'field1', '1.0'
|
50
50
|
analysis.insert 'field1', '20'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe DataKit::CSV::
|
3
|
+
describe DataKit::CSV::SchemaAnalyzer do
|
4
4
|
let(:path) {
|
5
5
|
data_path('standard.csv')
|
6
6
|
}
|
@@ -14,25 +14,23 @@ describe DataKit::CSV::Analyzer do
|
|
14
14
|
}
|
15
15
|
|
16
16
|
it "should initialize" do
|
17
|
-
analyzer = DataKit::CSV::
|
17
|
+
analyzer = DataKit::CSV::SchemaAnalyzer.new(csv)
|
18
18
|
|
19
19
|
analyzer.csv.should == csv
|
20
20
|
analyzer.keys.should == []
|
21
|
-
analyzer.
|
21
|
+
analyzer.sampling_rate.should == 0.1
|
22
22
|
end
|
23
23
|
|
24
24
|
it "should initialize schema with an IO" do
|
25
|
-
analyzer = DataKit::CSV::
|
25
|
+
analyzer = DataKit::CSV::SchemaAnalyzer.new(iocsv)
|
26
26
|
|
27
27
|
analyzer.csv.should == iocsv
|
28
28
|
analyzer.keys.should == []
|
29
|
-
analyzer.
|
29
|
+
analyzer.sampling_rate.should == 0.1
|
30
30
|
end
|
31
31
|
|
32
32
|
it "should execute an analysis" do
|
33
|
-
analysis = DataKit::CSV::
|
34
|
-
|
35
|
-
puts analysis.inspect
|
33
|
+
analysis = DataKit::CSV::SchemaAnalyzer.new(csv, :sampling_rate => 0.5).execute
|
36
34
|
|
37
35
|
analysis.type?('id').should == :integer
|
38
36
|
analysis.type?('first_name').should == :string
|
@@ -49,8 +47,8 @@ describe DataKit::CSV::Analyzer do
|
|
49
47
|
analysis.sample_count.should be < 10
|
50
48
|
end
|
51
49
|
|
52
|
-
it "should calculate a
|
53
|
-
DataKit::CSV::
|
54
|
-
DataKit::CSV::
|
50
|
+
it "should calculate a sampling_rate" do
|
51
|
+
DataKit::CSV::SchemaAnalyzer.sampling_rate(1024).should == 1
|
52
|
+
DataKit::CSV::SchemaAnalyzer.sampling_rate(2048 * 2048).should be < 1
|
55
53
|
end
|
56
54
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_kit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mode Analytics
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-12-
|
11
|
+
date: 2013-12-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rcsv
|
@@ -130,10 +130,12 @@ files:
|
|
130
130
|
- lib/data_kit/converters/date_time.rb
|
131
131
|
- lib/data_kit/converters/integer.rb
|
132
132
|
- lib/data_kit/converters/number.rb
|
133
|
-
- lib/data_kit/csv/analysis.rb
|
134
|
-
- lib/data_kit/csv/analyzer.rb
|
135
133
|
- lib/data_kit/csv/converter.rb
|
134
|
+
- lib/data_kit/csv/field_analysis.rb
|
135
|
+
- lib/data_kit/csv/field_analyzer.rb
|
136
136
|
- lib/data_kit/csv/parser.rb
|
137
|
+
- lib/data_kit/csv/schema_analysis.rb
|
138
|
+
- lib/data_kit/csv/schema_analyzer.rb
|
137
139
|
- lib/data_kit/dataset/field.rb
|
138
140
|
- lib/data_kit/dataset/schema.rb
|
139
141
|
- lib/data_kit/patches/rcsv.rb
|
@@ -142,10 +144,12 @@ files:
|
|
142
144
|
- spec/converters/date_time_spec.rb
|
143
145
|
- spec/converters/integer_spec.rb
|
144
146
|
- spec/converters/number_spec.rb
|
145
|
-
- spec/csv/analysis_spec.rb
|
146
|
-
- spec/csv/analyzer_spec.rb
|
147
147
|
- spec/csv/converter_spec.rb
|
148
|
+
- spec/csv/field_analysis_spec.rb
|
149
|
+
- spec/csv/field_analyzer_spec.rb
|
148
150
|
- spec/csv/parser_spec.rb
|
151
|
+
- spec/csv/schema_analysis_spec.rb
|
152
|
+
- spec/csv/schema_analyzer_spec.rb
|
149
153
|
- spec/dataset/field_spec.rb
|
150
154
|
- spec/dataset/schema_spec.rb
|
151
155
|
- spec/fixtures/carriage_returns.csv
|
@@ -181,10 +185,12 @@ test_files:
|
|
181
185
|
- spec/converters/date_time_spec.rb
|
182
186
|
- spec/converters/integer_spec.rb
|
183
187
|
- spec/converters/number_spec.rb
|
184
|
-
- spec/csv/analysis_spec.rb
|
185
|
-
- spec/csv/analyzer_spec.rb
|
186
188
|
- spec/csv/converter_spec.rb
|
189
|
+
- spec/csv/field_analysis_spec.rb
|
190
|
+
- spec/csv/field_analyzer_spec.rb
|
187
191
|
- spec/csv/parser_spec.rb
|
192
|
+
- spec/csv/schema_analysis_spec.rb
|
193
|
+
- spec/csv/schema_analyzer_spec.rb
|
188
194
|
- spec/dataset/field_spec.rb
|
189
195
|
- spec/dataset/schema_spec.rb
|
190
196
|
- spec/fixtures/carriage_returns.csv
|