data_kit 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6cb35e25f3fbf1a5444fbd581b8ae9225c038653
4
- data.tar.gz: 8933a43e911a6e8c36c92d7fcb9ea004875c0cb6
3
+ metadata.gz: eed0243b61997177b645f36dd755c121dd36c177
4
+ data.tar.gz: a9631c21ab9bf41820e8f4d28bec5bcf7a0c43b5
5
5
  SHA512:
6
- metadata.gz: abf4c32f7bc1c7d001d2acd90dfd23c652382f652bb1066205df76642855c77bb94abbeea6cd1c27cd39a7784f846f58f3cdd7be08e87c2edc9541cdacf3edbf
7
- data.tar.gz: f248370d7f60840a9a82229409bf4d8316e3d5b40340bd6cf7e6cbe0dc40f226ec05813107f9a6bdee8607c81a4253fe7381525a11719904d43858c9cd54cf0c
6
+ metadata.gz: b8468fc7bda9be6701312139e1d2cd3ad5933c0d1ddd0d38f4e9e373ba1dd27b176e1a1928c4a1790cd8d8e1873fdbecae9adc86266478f438595d0f8d32f1a0
7
+ data.tar.gz: 4f5dcdab6b2a42d05ba8a71bf38391b8109ba4adb6a46e5077f73dbfad796a9fd5706b5d07343a22e56aa438982bac5b27e889a964e4444b7327e3958a0340ed
data/.travis.yml CHANGED
@@ -4,4 +4,4 @@ rvm:
4
4
  - 2.0.0
5
5
 
6
6
  script:
7
- - bundle exec rake
7
+ - CODECLIMATE_REPO_TOKEN=448979f6b7459a140f4fb67a3e7865b00d0468928df6937f6494df2ea93e425f bundle exec rake
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- data_kit (0.0.1)
4
+ data_kit (0.0.2)
5
5
  rcsv
6
6
  timeliness
7
7
 
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  Data Kit
2
2
  ==========
3
-
3
+
4
4
  Library for ingesting, analyzing and cleaning normalizing datasets
5
5
 
6
6
  ## Installation
@@ -0,0 +1,80 @@
1
+ module DataKit
2
+ module CSV
3
+ class FieldAnalysis
4
+ attr_reader :field_name
5
+ attr_reader :match_type
6
+
7
+ attr_reader :types # {10 => :string, ...}
8
+ attr_reader :values # {10 => "2010-13-01"}
9
+ attr_reader :row_count
10
+ attr_reader :sample_count
11
+
12
+ def initialize(field_name, options = {})
13
+ @field_name = field_name
14
+
15
+ @types, @values = {}, {}
16
+ @row_count, @sample_count = 0, 0
17
+
18
+ @match_type = options[:match_type] || :any
19
+
20
+ Dataset::Field::Types.each do |type|
21
+ @types[type] = []
22
+ end
23
+ end
24
+
25
+ def increment_total
26
+ @row_count += 1
27
+ end
28
+
29
+ def increment_sample
30
+ @sample_count += 1
31
+ end
32
+
33
+ def insert(value)
34
+ value_type = Dataset::Field.type?(value)
35
+
36
+ if match_type.nil? || match_type == :any
37
+ insert_value_with_type(value, value_type)
38
+ elsif value_type == match_type
39
+ insert_value_with_type(value, value_type)
40
+ end
41
+ end
42
+
43
+ def type?
44
+ if has_single_type?
45
+ type_list.first
46
+ elsif has_only_numeric_types?
47
+ :number
48
+ else
49
+ :string
50
+ end
51
+ end
52
+
53
+ def value_at(row_num)
54
+ @values[row_num]
55
+ end
56
+
57
+ def type_count(type)
58
+ types[type].length
59
+ end
60
+
61
+ def type_list
62
+ types.keys.select{ |type| @types[type].length > 0 }
63
+ end
64
+
65
+ def has_single_type?
66
+ type_list.length == 1
67
+ end
68
+
69
+ def has_only_numeric_types?
70
+ (type_list - [:integer, :number, :null]).length == 0
71
+ end
72
+
73
+ private
74
+ def insert_value_with_type(value, type)
75
+ @values[row_count] = value
76
+ @types[type] << row_count
77
+ end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,40 @@
1
+ module DataKit
2
+ module CSV
3
+ class FieldAnalyzer
4
+ attr_accessor :csv
5
+ attr_accessor :field_pos
6
+ attr_accessor :match_type
7
+ attr_accessor :sampling_rate
8
+
9
+ def initialize(csv, field_pos, options = {})
10
+ @csv = csv
11
+ @field_pos = field_pos
12
+ @match_type = options[:match_type] || :any
13
+ @sampling_rate = options[:sampling_rate] || 0.1
14
+ end
15
+
16
+ def execute
17
+ random = Random.new
18
+
19
+ field_name = csv.headers[field_pos]
20
+ analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
21
+
22
+ csv.each_row do |row|
23
+ analysis.increment_total
24
+ if random.rand <= sampling_rate
25
+ analysis.increment_sample
26
+ analysis.insert(row[field_name])
27
+ end
28
+ end
29
+
30
+ analysis
31
+ end
32
+
33
+ class << self
34
+ def analyze(csv, field_pos, options = {})
35
+ new(csv, field_pos, options).execute
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
@@ -1,6 +1,6 @@
1
1
  module DataKit
2
2
  module CSV
3
- class Analysis
3
+ class SchemaAnalysis
4
4
  attr_reader :fields
5
5
  attr_reader :types
6
6
  attr_reader :row_count
@@ -1,23 +1,23 @@
1
1
  module DataKit
2
2
  module CSV
3
- class Analyzer
3
+ class SchemaAnalyzer
4
4
  attr_accessor :csv
5
5
  attr_accessor :keys
6
- attr_accessor :sample_rate
6
+ attr_accessor :sampling_rate
7
7
 
8
8
  def initialize(csv, options = {})
9
9
  @csv = csv
10
10
  @keys = options[:keys] || []
11
- @sample_rate = options[:sample_rate] || 0.1
11
+ @sampling_rate = options[:sampling_rate] || 0.1
12
12
  end
13
13
 
14
14
  def execute
15
15
  random = Random.new
16
- analysis = Analysis.new(csv.headers)
16
+ analysis = SchemaAnalysis.new(csv.headers)
17
17
 
18
18
  csv.each_row do |row|
19
19
  analysis.increment_total
20
- if random.rand <= sample_rate
20
+ if random.rand <= sampling_rate
21
21
  analysis.increment_sample
22
22
  row.keys.each do |field_name|
23
23
  analysis.insert(field_name.to_s, row[field_name])
@@ -32,18 +32,18 @@ module DataKit
32
32
  def analyze(csv, options = {})
33
33
  analyzer = new(csv,
34
34
  :keys => options[:keys],
35
- :sample_rate => options[:sample_rate]
35
+ :sampling_rate => options[:sampling_rate]
36
36
  )
37
37
 
38
38
  analyzer.execute
39
39
  end
40
40
 
41
- def sample_rate(file_size)
41
+ def sampling_rate(file_size)
42
42
  if file_size < (1024 * 1024)
43
- sample_rate = 1.0
43
+ sampling_rate = 1.0
44
44
  else
45
45
  scale_factor = 500
46
- sample_rate = (scale_factor / Math.sqrt(file_size)).round(4)
46
+ sampling_rate = (scale_factor / Math.sqrt(file_size)).round(4)
47
47
  end
48
48
  end
49
49
  end
@@ -1,3 +1,3 @@
1
1
  module DataKit
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/data_kit.rb CHANGED
@@ -1,9 +1,15 @@
1
1
  require "data_kit/version"
2
2
 
3
- # Data I/O
3
+ # Parsers
4
4
  require 'data_kit/csv/parser'
5
- require 'data_kit/csv/analyzer'
6
- require 'data_kit/csv/analysis'
5
+
6
+ # Analyzers
7
+ require 'data_kit/csv/field_analyzer'
8
+ require 'data_kit/csv/field_analysis'
9
+ require 'data_kit/csv/schema_analyzer'
10
+ require 'data_kit/csv/schema_analysis'
11
+
12
+ # Converters
7
13
  require 'data_kit/csv/converter'
8
14
 
9
15
  # Data Conversion
@@ -14,7 +14,7 @@ describe DataKit::CSV::Converter do
14
14
  }
15
15
 
16
16
  it "should initialize and execute" do
17
- analysis = DataKit::CSV::Analyzer.analyze(csv, :sample_rate => 1)
17
+ analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
18
18
  converter = DataKit::CSV::Converter.new(csv, analysis, target)
19
19
 
20
20
  converter.execute
@@ -25,7 +25,7 @@ describe DataKit::CSV::Converter do
25
25
  end
26
26
 
27
27
  it "should convert using the convience method" do
28
- analysis = DataKit::CSV::Analyzer.analyze(csv, :sample_rate => 1)
28
+ analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
29
29
  converter = DataKit::CSV::Converter.convert(csv, analysis, target)
30
30
 
31
31
  row_count = 0
@@ -0,0 +1,71 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::CSV::FieldAnalysis do
4
+ it "should increment totals and samples" do
5
+ analysis = DataKit::CSV::FieldAnalysis.new('field1')
6
+
7
+ analysis.increment_total
8
+ analysis.increment_total
9
+
10
+ analysis.increment_sample
11
+
12
+ analysis.row_count.should == 2
13
+ analysis.sample_count.should == 1
14
+ end
15
+
16
+ it "should insert a row for analysis" do
17
+ analysis = DataKit::CSV::FieldAnalysis.new('field1')
18
+
19
+ analysis.insert '1.0'
20
+ analysis.insert '2.0'
21
+
22
+ analysis.type?.should == :number
23
+ analysis.has_single_type?.should == true
24
+ analysis.has_only_numeric_types?.should == true
25
+ analysis.type_count(:number).should == 2
26
+ end
27
+
28
+ it "should return the value for a specific row" do
29
+ analysis = DataKit::CSV::FieldAnalysis.new('field1')
30
+
31
+ analysis.increment_total
32
+ analysis.insert '1.0'
33
+
34
+ analysis.increment_total
35
+ analysis.insert '2.0'
36
+
37
+ analysis.value_at(1).should == '1.0'
38
+ end
39
+
40
+ it "should infer a string type if there non-numeric mixed types" do
41
+ analysis = DataKit::CSV::FieldAnalysis.new('field1')
42
+
43
+ analysis.insert '1.0'
44
+ analysis.insert '2.0'
45
+ analysis.insert '2.0'
46
+ analysis.insert 'str2'
47
+
48
+ analysis.type?.should == :string
49
+ end
50
+
51
+ it "should infer a number type if there are mixed numeric types" do
52
+ analysis = DataKit::CSV::FieldAnalysis.new('field1')
53
+
54
+ analysis.insert '1.0'
55
+ analysis.insert '20'
56
+ analysis.insert nil
57
+
58
+ analysis.type?.should == :number
59
+ end
60
+
61
+ it "should filter analysis to a specific type" do
62
+ analysis = DataKit::CSV::FieldAnalysis.new('field1', :match_type => :number)
63
+
64
+ analysis.insert '1.0'
65
+ analysis.insert '20'
66
+ analysis.insert nil
67
+ analysis.insert 'str2'
68
+
69
+ analysis.type?.should == :number
70
+ end
71
+ end
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
+ describe DataKit::CSV::FieldAnalyzer do
4
+ let(:path) {
5
+ data_path('standard.csv')
6
+ }
7
+
8
+ let(:csv) {
9
+ DataKit::CSV::Parser.new(path)
10
+ }
11
+
12
+ let(:iocsv) {
13
+ DataKit::CSV::Parser.new(File.open(path))
14
+ }
15
+
16
+ it "should initialize" do
17
+ analyzer = DataKit::CSV::FieldAnalyzer.new(csv, 1)
18
+
19
+ analyzer.csv.should == csv
20
+ analyzer.field_pos.should == 1
21
+ analyzer.sampling_rate.should == 0.1
22
+ end
23
+
24
+ it "should initialize schema with an IO" do
25
+ analyzer = DataKit::CSV::FieldAnalyzer.new(iocsv, 1)
26
+
27
+ analyzer.csv.should == iocsv
28
+ analyzer.field_pos.should == 1
29
+ analyzer.sampling_rate.should == 0.1
30
+ end
31
+
32
+ it "should execute an analysis" do
33
+ analysis = DataKit::CSV::FieldAnalyzer.new(csv, 8, :sampling_rate => 0.5).execute
34
+
35
+ analysis.type?.should == :datetime # activated_at
36
+
37
+ analysis.row_count.should == 10
38
+ analysis.sample_count.should be < 10
39
+ end
40
+
41
+ it "should analyze using the static convenience method" do
42
+ analysis = DataKit::CSV::FieldAnalyzer.analyze(csv, 8, :sampling_rate => 0.5)
43
+ analysis.type?.should == :datetime # activated_at
44
+ end
45
+ end
@@ -1,8 +1,8 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe DataKit::CSV::Analysis do
3
+ describe DataKit::CSV::SchemaAnalysis do
4
4
  it "should insert a row for analysis" do
5
- analysis = DataKit::CSV::Analysis.new(['field1'])
5
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
6
6
 
7
7
  analysis.insert 'field1', '1.0'
8
8
  analysis.insert 'field1', '2.0'
@@ -11,7 +11,7 @@ describe DataKit::CSV::Analysis do
11
11
  end
12
12
 
13
13
  it "should know the number of rows with a particular type" do
14
- analysis = DataKit::CSV::Analysis.new(['field1'])
14
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
15
15
 
16
16
  analysis.insert 'field1', '1.0'
17
17
  analysis.insert 'field1', '2.0'
@@ -20,7 +20,7 @@ describe DataKit::CSV::Analysis do
20
20
  end
21
21
 
22
22
  it "should determine the type of a field" do
23
- analysis = DataKit::CSV::Analysis.new(['field1', 'field2'])
23
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
24
24
 
25
25
  analysis.insert 'field1', '1.0'
26
26
  analysis.insert 'field1', '2.0'
@@ -32,7 +32,7 @@ describe DataKit::CSV::Analysis do
32
32
  end
33
33
 
34
34
  it "should infer a string type if there non-numeric mixed types" do
35
- analysis = DataKit::CSV::Analysis.new(['field1', 'field2'])
35
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
36
36
 
37
37
  analysis.insert 'field1', '1.0'
38
38
  analysis.insert 'field1', '2.0'
@@ -44,7 +44,7 @@ describe DataKit::CSV::Analysis do
44
44
  end
45
45
 
46
46
  it "should infer a number type if there are mixed numeric types" do
47
- analysis = DataKit::CSV::Analysis.new(['field1'])
47
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
48
48
 
49
49
  analysis.insert 'field1', '1.0'
50
50
  analysis.insert 'field1', '20'
@@ -1,6 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe DataKit::CSV::Analyzer do
3
+ describe DataKit::CSV::SchemaAnalyzer do
4
4
  let(:path) {
5
5
  data_path('standard.csv')
6
6
  }
@@ -14,25 +14,23 @@ describe DataKit::CSV::Analyzer do
14
14
  }
15
15
 
16
16
  it "should initialize" do
17
- analyzer = DataKit::CSV::Analyzer.new(csv)
17
+ analyzer = DataKit::CSV::SchemaAnalyzer.new(csv)
18
18
 
19
19
  analyzer.csv.should == csv
20
20
  analyzer.keys.should == []
21
- analyzer.sample_rate.should == 0.1
21
+ analyzer.sampling_rate.should == 0.1
22
22
  end
23
23
 
24
24
  it "should initialize schema with an IO" do
25
- analyzer = DataKit::CSV::Analyzer.new(iocsv)
25
+ analyzer = DataKit::CSV::SchemaAnalyzer.new(iocsv)
26
26
 
27
27
  analyzer.csv.should == iocsv
28
28
  analyzer.keys.should == []
29
- analyzer.sample_rate.should == 0.1
29
+ analyzer.sampling_rate.should == 0.1
30
30
  end
31
31
 
32
32
  it "should execute an analysis" do
33
- analysis = DataKit::CSV::Analyzer.new(csv, :sample_rate => 0.5).execute
34
-
35
- puts analysis.inspect
33
+ analysis = DataKit::CSV::SchemaAnalyzer.new(csv, :sampling_rate => 0.5).execute
36
34
 
37
35
  analysis.type?('id').should == :integer
38
36
  analysis.type?('first_name').should == :string
@@ -49,8 +47,8 @@ describe DataKit::CSV::Analyzer do
49
47
  analysis.sample_count.should be < 10
50
48
  end
51
49
 
52
- it "should calculate a sample_rate" do
53
- DataKit::CSV::Analyzer.sample_rate(1024).should == 1
54
- DataKit::CSV::Analyzer.sample_rate(2048 * 2048).should be < 1
50
+ it "should calculate a sampling_rate" do
51
+ DataKit::CSV::SchemaAnalyzer.sampling_rate(1024).should == 1
52
+ DataKit::CSV::SchemaAnalyzer.sampling_rate(2048 * 2048).should be < 1
55
53
  end
56
54
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mode Analytics
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-13 00:00:00.000000000 Z
11
+ date: 2013-12-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rcsv
@@ -130,10 +130,12 @@ files:
130
130
  - lib/data_kit/converters/date_time.rb
131
131
  - lib/data_kit/converters/integer.rb
132
132
  - lib/data_kit/converters/number.rb
133
- - lib/data_kit/csv/analysis.rb
134
- - lib/data_kit/csv/analyzer.rb
135
133
  - lib/data_kit/csv/converter.rb
134
+ - lib/data_kit/csv/field_analysis.rb
135
+ - lib/data_kit/csv/field_analyzer.rb
136
136
  - lib/data_kit/csv/parser.rb
137
+ - lib/data_kit/csv/schema_analysis.rb
138
+ - lib/data_kit/csv/schema_analyzer.rb
137
139
  - lib/data_kit/dataset/field.rb
138
140
  - lib/data_kit/dataset/schema.rb
139
141
  - lib/data_kit/patches/rcsv.rb
@@ -142,10 +144,12 @@ files:
142
144
  - spec/converters/date_time_spec.rb
143
145
  - spec/converters/integer_spec.rb
144
146
  - spec/converters/number_spec.rb
145
- - spec/csv/analysis_spec.rb
146
- - spec/csv/analyzer_spec.rb
147
147
  - spec/csv/converter_spec.rb
148
+ - spec/csv/field_analysis_spec.rb
149
+ - spec/csv/field_analyzer_spec.rb
148
150
  - spec/csv/parser_spec.rb
151
+ - spec/csv/schema_analysis_spec.rb
152
+ - spec/csv/schema_analyzer_spec.rb
149
153
  - spec/dataset/field_spec.rb
150
154
  - spec/dataset/schema_spec.rb
151
155
  - spec/fixtures/carriage_returns.csv
@@ -181,10 +185,12 @@ test_files:
181
185
  - spec/converters/date_time_spec.rb
182
186
  - spec/converters/integer_spec.rb
183
187
  - spec/converters/number_spec.rb
184
- - spec/csv/analysis_spec.rb
185
- - spec/csv/analyzer_spec.rb
186
188
  - spec/csv/converter_spec.rb
189
+ - spec/csv/field_analysis_spec.rb
190
+ - spec/csv/field_analyzer_spec.rb
187
191
  - spec/csv/parser_spec.rb
192
+ - spec/csv/schema_analysis_spec.rb
193
+ - spec/csv/schema_analyzer_spec.rb
188
194
  - spec/dataset/field_spec.rb
189
195
  - spec/dataset/schema_spec.rb
190
196
  - spec/fixtures/carriage_returns.csv