data_kit 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6cb35e25f3fbf1a5444fbd581b8ae9225c038653
4
- data.tar.gz: 8933a43e911a6e8c36c92d7fcb9ea004875c0cb6
3
+ metadata.gz: eed0243b61997177b645f36dd755c121dd36c177
4
+ data.tar.gz: a9631c21ab9bf41820e8f4d28bec5bcf7a0c43b5
5
5
  SHA512:
6
- metadata.gz: abf4c32f7bc1c7d001d2acd90dfd23c652382f652bb1066205df76642855c77bb94abbeea6cd1c27cd39a7784f846f58f3cdd7be08e87c2edc9541cdacf3edbf
7
- data.tar.gz: f248370d7f60840a9a82229409bf4d8316e3d5b40340bd6cf7e6cbe0dc40f226ec05813107f9a6bdee8607c81a4253fe7381525a11719904d43858c9cd54cf0c
6
+ metadata.gz: b8468fc7bda9be6701312139e1d2cd3ad5933c0d1ddd0d38f4e9e373ba1dd27b176e1a1928c4a1790cd8d8e1873fdbecae9adc86266478f438595d0f8d32f1a0
7
+ data.tar.gz: 4f5dcdab6b2a42d05ba8a71bf38391b8109ba4adb6a46e5077f73dbfad796a9fd5706b5d07343a22e56aa438982bac5b27e889a964e4444b7327e3958a0340ed
data/.travis.yml CHANGED
@@ -4,4 +4,4 @@ rvm:
4
4
  - 2.0.0
5
5
 
6
6
  script:
7
- - bundle exec rake
7
+ - CODECLIMATE_REPO_TOKEN=448979f6b7459a140f4fb67a3e7865b00d0468928df6937f6494df2ea93e425f bundle exec rake
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- data_kit (0.0.1)
4
+ data_kit (0.0.2)
5
5
  rcsv
6
6
  timeliness
7
7
 
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  Data Kit
2
2
  ==========
3
-
3
+
4
4
  Library for ingesting, analyzing, cleaning and normalizing datasets
5
5
 
6
6
  ## Installation
@@ -0,0 +1,80 @@
1
module DataKit
  module CSV
    # Collects sampled values for a single CSV field and infers the field's
    # type from the value types seen in those samples.
    #
    # Samples are keyed by the current value of #row_count, so callers are
    # expected to call #increment_total for a row before calling #insert with
    # the value read from that row.
    class FieldAnalysis
      # Types that still count as "numeric" when they appear together.
      NUMERIC_TYPES = [:integer, :number, :null].freeze

      attr_reader :field_name
      attr_reader :match_type

      attr_reader :types        # {:string => [3, 10], ...} row numbers per detected type
      attr_reader :values       # {10 => "2010-13-01"} sampled value per row number
      attr_reader :row_count
      attr_reader :sample_count

      # @param field_name [Object] name of the field being analysed
      # @param options [Hash] :match_type restricts which detected type is
      #   recorded; defaults to :any (record every value)
      def initialize(field_name, options = {})
        @field_name = field_name

        @types, @values = {}, {}
        @row_count, @sample_count = 0, 0

        @match_type = options[:match_type] || :any

        # Pre-register every known type so #type_count works before any insert.
        Dataset::Field::Types.each do |type|
          @types[type] = []
        end
      end

      # Counts one more row read from the source.
      def increment_total
        @row_count += 1
      end

      # Counts one more row chosen for sampling.
      def increment_sample
        @sample_count += 1
      end

      # Detects the type of +value+ and records it against the current row,
      # unless a specific match_type is set and the detected type differs.
      def insert(value)
        value_type = Dataset::Field.type?(value)
        return unless accepts_type?(value_type)

        insert_value_with_type(value, value_type)
      end

      # Inferred type of the field:
      # * the only observed type, when exactly one type was seen;
      # * :number when every observed type is integer/number/null (this also
      #   holds when nothing has been recorded yet, as the type list is empty);
      # * :string otherwise.
      def type?
        if has_single_type?
          type_list.first
        elsif has_only_numeric_types?
          :number
        else
          :string
        end
      end

      # Sampled value recorded for +row_num+, or nil if none was recorded.
      def value_at(row_num)
        @values[row_num]
      end

      # Number of recorded samples of +type+; 0 for a type never registered.
      def type_count(type)
        types.fetch(type, []).length
      end

      # Types that have at least one recorded sample.
      def type_list
        types.reject { |_type, rows| rows.empty? }.keys
      end

      def has_single_type?
        type_list.length == 1
      end

      def has_only_numeric_types?
        (type_list - NUMERIC_TYPES).empty?
      end

      private

      # True when +type+ passes the match_type filter (nil and :any accept all).
      def accepts_type?(type)
        match_type.nil? || match_type == :any || type == match_type
      end

      def insert_value_with_type(value, type)
        @values[row_count] = value
        # A detected type outside Dataset::Field::Types gets its own bucket
        # instead of failing on a missing key.
        (@types[type] ||= []) << row_count
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
module DataKit
  module CSV
    # Walks every row of a CSV, samples a fraction of them at random and
    # feeds the sampled values of one field into a FieldAnalysis.
    class FieldAnalyzer
      attr_accessor :csv
      attr_accessor :field_pos
      attr_accessor :match_type
      attr_accessor :sampling_rate

      # @param csv [#headers, #each_row] source of header names and rows
      # @param field_pos [Integer] index into csv.headers of the field to analyse
      # @param options [Hash]
      #   :match_type    type filter handed to FieldAnalysis (default :any)
      #   :sampling_rate fraction of rows to sample (default 0.1)
      #   :random        optional object responding to #rand (returning a
      #                  float in [0, 1)); lets callers make sampling
      #                  deterministic. A fresh Random is used when omitted.
      def initialize(csv, field_pos, options = {})
        @csv = csv
        @field_pos = field_pos
        @match_type = options[:match_type] || :any
        @sampling_rate = options[:sampling_rate] || 0.1
        @random = options[:random]
      end

      # Runs the analysis and returns the populated FieldAnalysis.
      def execute
        random = @random || Random.new

        field_name = csv.headers[field_pos]
        analysis = FieldAnalysis.new(field_name, :match_type => match_type)

        csv.each_row do |row|
          analysis.increment_total

          # rand is in [0, 1), so a strict comparison samples with probability
          # exactly sampling_rate: never at 0, always at 1.
          next unless random.rand < sampling_rate

          analysis.increment_sample
          analysis.insert(row[field_name])
        end

        analysis
      end

      class << self
        # Convenience wrapper: build an analyzer and execute it in one call.
        def analyze(csv, field_pos, options = {})
          new(csv, field_pos, options).execute
        end
      end
    end
  end
end
@@ -1,6 +1,6 @@
1
1
  module DataKit
2
2
  module CSV
3
- class Analysis
3
+ class SchemaAnalysis
4
4
  attr_reader :fields
5
5
  attr_reader :types
6
6
  attr_reader :row_count
@@ -1,23 +1,23 @@
1
1
  module DataKit
2
2
  module CSV
3
- class Analyzer
3
+ class SchemaAnalyzer
4
4
  attr_accessor :csv
5
5
  attr_accessor :keys
6
- attr_accessor :sample_rate
6
+ attr_accessor :sampling_rate
7
7
 
8
8
  def initialize(csv, options = {})
9
9
  @csv = csv
10
10
  @keys = options[:keys] || []
11
- @sample_rate = options[:sample_rate] || 0.1
11
+ @sampling_rate = options[:sampling_rate] || 0.1
12
12
  end
13
13
 
14
14
  def execute
15
15
  random = Random.new
16
- analysis = Analysis.new(csv.headers)
16
+ analysis = SchemaAnalysis.new(csv.headers)
17
17
 
18
18
  csv.each_row do |row|
19
19
  analysis.increment_total
20
- if random.rand <= sample_rate
20
+ if random.rand <= sampling_rate
21
21
  analysis.increment_sample
22
22
  row.keys.each do |field_name|
23
23
  analysis.insert(field_name.to_s, row[field_name])
@@ -32,18 +32,18 @@ module DataKit
32
32
  def analyze(csv, options = {})
33
33
  analyzer = new(csv,
34
34
  :keys => options[:keys],
35
- :sample_rate => options[:sample_rate]
35
+ :sampling_rate => options[:sampling_rate]
36
36
  )
37
37
 
38
38
  analyzer.execute
39
39
  end
40
40
 
41
- def sample_rate(file_size)
41
+ def sampling_rate(file_size)
42
42
  if file_size < (1024 * 1024)
43
- sample_rate = 1.0
43
+ sampling_rate = 1.0
44
44
  else
45
45
  scale_factor = 500
46
- sample_rate = (scale_factor / Math.sqrt(file_size)).round(4)
46
+ sampling_rate = (scale_factor / Math.sqrt(file_size)).round(4)
47
47
  end
48
48
  end
49
49
  end
@@ -1,3 +1,3 @@
1
1
  module DataKit
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/data_kit.rb CHANGED
@@ -1,9 +1,15 @@
1
1
  require "data_kit/version"
2
2
 
3
- # Data I/O
3
+ # Parsers
4
4
  require 'data_kit/csv/parser'
5
- require 'data_kit/csv/analyzer'
6
- require 'data_kit/csv/analysis'
5
+
6
+ # Analyzers
7
+ require 'data_kit/csv/field_analyzer'
8
+ require 'data_kit/csv/field_analysis'
9
+ require 'data_kit/csv/schema_analyzer'
10
+ require 'data_kit/csv/schema_analysis'
11
+
12
+ # Converters
7
13
  require 'data_kit/csv/converter'
8
14
 
9
15
  # Data Conversion
@@ -14,7 +14,7 @@ describe DataKit::CSV::Converter do
14
14
  }
15
15
 
16
16
  it "should initialize and execute" do
17
- analysis = DataKit::CSV::Analyzer.analyze(csv, :sample_rate => 1)
17
+ analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
18
18
  converter = DataKit::CSV::Converter.new(csv, analysis, target)
19
19
 
20
20
  converter.execute
@@ -25,7 +25,7 @@ describe DataKit::CSV::Converter do
25
25
  end
26
26
 
27
27
  it "should convert using the convience method" do
28
- analysis = DataKit::CSV::Analyzer.analyze(csv, :sample_rate => 1)
28
+ analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
29
29
  converter = DataKit::CSV::Converter.convert(csv, analysis, target)
30
30
 
31
31
  row_count = 0
@@ -0,0 +1,71 @@
1
+ require 'spec_helper'
2
+
3
# Specs for FieldAnalysis: row/sample counters, value lookup by row number,
# type inference from inserted values, and the :match_type filter.
describe DataKit::CSV::FieldAnalysis do
  let(:analysis) { described_class.new('field1') }

  it "should increment totals and samples" do
    2.times { analysis.increment_total }
    analysis.increment_sample

    analysis.row_count.should == 2
    analysis.sample_count.should == 1
  end

  it "should insert a row for analysis" do
    %w[1.0 2.0].each { |value| analysis.insert value }

    analysis.type?.should == :number
    analysis.has_single_type?.should == true
    analysis.has_only_numeric_types?.should == true
    analysis.type_count(:number).should == 2
  end

  it "should return the value for a specific row" do
    # Each value is stored under the row counter at the time of insertion.
    %w[1.0 2.0].each do |value|
      analysis.increment_total
      analysis.insert value
    end

    analysis.value_at(1).should == '1.0'
  end

  it "should infer a string type if there non-numeric mixed types" do
    %w[1.0 2.0 2.0 str2].each { |value| analysis.insert value }

    analysis.type?.should == :string
  end

  it "should infer a number type if there are mixed numeric types" do
    ['1.0', '20', nil].each { |value| analysis.insert value }

    analysis.type?.should == :number
  end

  it "should filter analysis to a specific type" do
    filtered = described_class.new('field1', :match_type => :number)

    ['1.0', '20', nil, 'str2'].each { |value| filtered.insert value }

    filtered.type?.should == :number
  end
end
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
# Specs for FieldAnalyzer: construction defaults (from a path-backed and an
# IO-backed parser) and running an analysis over the standard.csv fixture.
describe DataKit::CSV::FieldAnalyzer do
  let(:path) do
    data_path('standard.csv')
  end

  let(:csv) do
    DataKit::CSV::Parser.new(path)
  end

  let(:iocsv) do
    DataKit::CSV::Parser.new(File.open(path))
  end

  it "should initialize" do
    analyzer = described_class.new(csv, 1)

    analyzer.csv.should == csv
    analyzer.field_pos.should == 1
    analyzer.sampling_rate.should == 0.1
  end

  it "should initialize schema with an IO" do
    analyzer = described_class.new(iocsv, 1)

    analyzer.csv.should == iocsv
    analyzer.field_pos.should == 1
    analyzer.sampling_rate.should == 0.1
  end

  it "should execute an analysis" do
    analyzer = described_class.new(csv, 8, :sampling_rate => 0.5)
    analysis = analyzer.execute

    analysis.type?.should == :datetime # activated_at

    analysis.row_count.should == 10
    analysis.sample_count.should be < 10
  end

  it "should analyze using the static convenience method" do
    analysis = described_class.analyze(csv, 8, :sampling_rate => 0.5)

    analysis.type?.should == :datetime # activated_at
  end
end
@@ -1,8 +1,8 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe DataKit::CSV::Analysis do
3
+ describe DataKit::CSV::SchemaAnalysis do
4
4
  it "should insert a row for analysis" do
5
- analysis = DataKit::CSV::Analysis.new(['field1'])
5
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
6
6
 
7
7
  analysis.insert 'field1', '1.0'
8
8
  analysis.insert 'field1', '2.0'
@@ -11,7 +11,7 @@ describe DataKit::CSV::Analysis do
11
11
  end
12
12
 
13
13
  it "should know the number of rows with a particular type" do
14
- analysis = DataKit::CSV::Analysis.new(['field1'])
14
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
15
15
 
16
16
  analysis.insert 'field1', '1.0'
17
17
  analysis.insert 'field1', '2.0'
@@ -20,7 +20,7 @@ describe DataKit::CSV::Analysis do
20
20
  end
21
21
 
22
22
  it "should determine the type of a field" do
23
- analysis = DataKit::CSV::Analysis.new(['field1', 'field2'])
23
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
24
24
 
25
25
  analysis.insert 'field1', '1.0'
26
26
  analysis.insert 'field1', '2.0'
@@ -32,7 +32,7 @@ describe DataKit::CSV::Analysis do
32
32
  end
33
33
 
34
34
  it "should infer a string type if there non-numeric mixed types" do
35
- analysis = DataKit::CSV::Analysis.new(['field1', 'field2'])
35
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
36
36
 
37
37
  analysis.insert 'field1', '1.0'
38
38
  analysis.insert 'field1', '2.0'
@@ -44,7 +44,7 @@ describe DataKit::CSV::Analysis do
44
44
  end
45
45
 
46
46
  it "should infer a number type if there are mixed numeric types" do
47
- analysis = DataKit::CSV::Analysis.new(['field1'])
47
+ analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
48
48
 
49
49
  analysis.insert 'field1', '1.0'
50
50
  analysis.insert 'field1', '20'
@@ -1,6 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
- describe DataKit::CSV::Analyzer do
3
+ describe DataKit::CSV::SchemaAnalyzer do
4
4
  let(:path) {
5
5
  data_path('standard.csv')
6
6
  }
@@ -14,25 +14,23 @@ describe DataKit::CSV::Analyzer do
14
14
  }
15
15
 
16
16
  it "should initialize" do
17
- analyzer = DataKit::CSV::Analyzer.new(csv)
17
+ analyzer = DataKit::CSV::SchemaAnalyzer.new(csv)
18
18
 
19
19
  analyzer.csv.should == csv
20
20
  analyzer.keys.should == []
21
- analyzer.sample_rate.should == 0.1
21
+ analyzer.sampling_rate.should == 0.1
22
22
  end
23
23
 
24
24
  it "should initialize schema with an IO" do
25
- analyzer = DataKit::CSV::Analyzer.new(iocsv)
25
+ analyzer = DataKit::CSV::SchemaAnalyzer.new(iocsv)
26
26
 
27
27
  analyzer.csv.should == iocsv
28
28
  analyzer.keys.should == []
29
- analyzer.sample_rate.should == 0.1
29
+ analyzer.sampling_rate.should == 0.1
30
30
  end
31
31
 
32
32
  it "should execute an analysis" do
33
- analysis = DataKit::CSV::Analyzer.new(csv, :sample_rate => 0.5).execute
34
-
35
- puts analysis.inspect
33
+ analysis = DataKit::CSV::SchemaAnalyzer.new(csv, :sampling_rate => 0.5).execute
36
34
 
37
35
  analysis.type?('id').should == :integer
38
36
  analysis.type?('first_name').should == :string
@@ -49,8 +47,8 @@ describe DataKit::CSV::Analyzer do
49
47
  analysis.sample_count.should be < 10
50
48
  end
51
49
 
52
- it "should calculate a sample_rate" do
53
- DataKit::CSV::Analyzer.sample_rate(1024).should == 1
54
- DataKit::CSV::Analyzer.sample_rate(2048 * 2048).should be < 1
50
+ it "should calculate a sampling_rate" do
51
+ DataKit::CSV::SchemaAnalyzer.sampling_rate(1024).should == 1
52
+ DataKit::CSV::SchemaAnalyzer.sampling_rate(2048 * 2048).should be < 1
55
53
  end
56
54
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mode Analytics
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-13 00:00:00.000000000 Z
11
+ date: 2013-12-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rcsv
@@ -130,10 +130,12 @@ files:
130
130
  - lib/data_kit/converters/date_time.rb
131
131
  - lib/data_kit/converters/integer.rb
132
132
  - lib/data_kit/converters/number.rb
133
- - lib/data_kit/csv/analysis.rb
134
- - lib/data_kit/csv/analyzer.rb
135
133
  - lib/data_kit/csv/converter.rb
134
+ - lib/data_kit/csv/field_analysis.rb
135
+ - lib/data_kit/csv/field_analyzer.rb
136
136
  - lib/data_kit/csv/parser.rb
137
+ - lib/data_kit/csv/schema_analysis.rb
138
+ - lib/data_kit/csv/schema_analyzer.rb
137
139
  - lib/data_kit/dataset/field.rb
138
140
  - lib/data_kit/dataset/schema.rb
139
141
  - lib/data_kit/patches/rcsv.rb
@@ -142,10 +144,12 @@ files:
142
144
  - spec/converters/date_time_spec.rb
143
145
  - spec/converters/integer_spec.rb
144
146
  - spec/converters/number_spec.rb
145
- - spec/csv/analysis_spec.rb
146
- - spec/csv/analyzer_spec.rb
147
147
  - spec/csv/converter_spec.rb
148
+ - spec/csv/field_analysis_spec.rb
149
+ - spec/csv/field_analyzer_spec.rb
148
150
  - spec/csv/parser_spec.rb
151
+ - spec/csv/schema_analysis_spec.rb
152
+ - spec/csv/schema_analyzer_spec.rb
149
153
  - spec/dataset/field_spec.rb
150
154
  - spec/dataset/schema_spec.rb
151
155
  - spec/fixtures/carriage_returns.csv
@@ -181,10 +185,12 @@ test_files:
181
185
  - spec/converters/date_time_spec.rb
182
186
  - spec/converters/integer_spec.rb
183
187
  - spec/converters/number_spec.rb
184
- - spec/csv/analysis_spec.rb
185
- - spec/csv/analyzer_spec.rb
186
188
  - spec/csv/converter_spec.rb
189
+ - spec/csv/field_analysis_spec.rb
190
+ - spec/csv/field_analyzer_spec.rb
187
191
  - spec/csv/parser_spec.rb
192
+ - spec/csv/schema_analysis_spec.rb
193
+ - spec/csv/schema_analyzer_spec.rb
188
194
  - spec/dataset/field_spec.rb
189
195
  - spec/dataset/schema_spec.rb
190
196
  - spec/fixtures/carriage_returns.csv