RubyGems - data_kit - Versions diffs - 0.0.2 → 0.0.3 - Mend

data_kit 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/.travis.yml +1 -1
data/Gemfile.lock +1 -1
data/README.md +1 -1
data/lib/data_kit/csv/field_analysis.rb +80 -0
data/lib/data_kit/csv/field_analyzer.rb +40 -0
data/lib/data_kit/csv/{analysis.rb → schema_analysis.rb} +1 -1
data/lib/data_kit/csv/{analyzer.rb → schema_analyzer.rb} +9 -9
data/lib/data_kit/version.rb +1 -1
data/lib/data_kit.rb +9 -3
data/spec/csv/converter_spec.rb +2 -2
data/spec/csv/field_analysis_spec.rb +71 -0
data/spec/csv/field_analyzer_spec.rb +45 -0
data/spec/csv/{analysis_spec.rb → schema_analysis_spec.rb} +6 -6
data/spec/csv/{analyzer_spec.rb → schema_analyzer_spec.rb} +9 -11
metadata +14 -8

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6cb35e25f3fbf1a5444fbd581b8ae9225c038653
-  data.tar.gz: 8933a43e911a6e8c36c92d7fcb9ea004875c0cb6
+  metadata.gz: eed0243b61997177b645f36dd755c121dd36c177
+  data.tar.gz: a9631c21ab9bf41820e8f4d28bec5bcf7a0c43b5
 SHA512:
-  metadata.gz: abf4c32f7bc1c7d001d2acd90dfd23c652382f652bb1066205df76642855c77bb94abbeea6cd1c27cd39a7784f846f58f3cdd7be08e87c2edc9541cdacf3edbf
-  data.tar.gz: f248370d7f60840a9a82229409bf4d8316e3d5b40340bd6cf7e6cbe0dc40f226ec05813107f9a6bdee8607c81a4253fe7381525a11719904d43858c9cd54cf0c
+  metadata.gz: b8468fc7bda9be6701312139e1d2cd3ad5933c0d1ddd0d38f4e9e373ba1dd27b176e1a1928c4a1790cd8d8e1873fdbecae9adc86266478f438595d0f8d32f1a0
+  data.tar.gz: 4f5dcdab6b2a42d05ba8a71bf38391b8109ba4adb6a46e5077f73dbfad796a9fd5706b5d07343a22e56aa438982bac5b27e889a964e4444b7327e3958a0340ed

data/.travis.yml CHANGED Viewed

@@ -4,4 +4,4 @@ rvm:
   - 2.0.0
 script:
-  - bundle exec rake
+  - CODECLIMATE_REPO_TOKEN=448979f6b7459a140f4fb67a3e7865b00d0468928df6937f6494df2ea93e425f bundle exec rake

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    data_kit (0.0.1)
+    data_kit (0.0.2)
       rcsv
       timeliness

data/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 Data Kit
 ==========
 Library for ingesting, analyzing, cleaning, and normalizing datasets
 ## Installation

data/lib/data_kit/csv/field_analysis.rb ADDED Viewed

@@ -0,0 +1,80 @@
+module DataKit
+  module CSV
+    class FieldAnalysis
+      attr_reader :field_name
+      attr_reader :match_type
+      attr_reader :types # {10 => :string, ...}
+      attr_reader :values # {10 => "2010-13-01"}
+      attr_reader :row_count
+      attr_reader :sample_count
+      def initialize(field_name, options = {})
+        @field_name = field_name
+        @types, @values = {}, {}
+        @row_count, @sample_count = 0, 0
+        @match_type = options[:match_type] || :any
+        Dataset::Field::Types.each do |type|
+          @types[type] = []
+        end
+      end
+      def increment_total
+        @row_count += 1
+      end
+      def increment_sample
+        @sample_count += 1
+      end
+      def insert(value)
+        value_type = Dataset::Field.type?(value)
+        if match_type.nil? || match_type == :any
+          insert_value_with_type(value, value_type)
+        elsif value_type == match_type
+          insert_value_with_type(value, value_type)
+        end
+      end
+      def type?
+        if has_single_type?
+          type_list.first
+        elsif has_only_numeric_types?
+          :number
+        else
+          :string
+        end
+      end
+      def value_at(row_num)
+        @values[row_num]
+      end
+      def type_count(type)
+        types[type].length
+      end
+      def type_list
+        types.keys.select{ |type| @types[type].length > 0 }
+      end
+      def has_single_type?
+        type_list.length == 1
+      end
+      def has_only_numeric_types?
+        (type_list - [:integer, :number, :null]).length == 0
+      end
+    private
+      def insert_value_with_type(value, type)
+        @values[row_count] = value
+        @types[type] << row_count
+      end
+    end
+  end
+end

data/lib/data_kit/csv/field_analyzer.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module DataKit
+  module CSV
+    class FieldAnalyzer
+      attr_accessor :csv
+      attr_accessor :field_pos
+      attr_accessor :match_type
+      attr_accessor :sampling_rate
+      def initialize(csv, field_pos, options = {})
+        @csv = csv
+        @field_pos = field_pos
+        @match_type = options[:match_type] || :any
+        @sampling_rate = options[:sampling_rate] || 0.1
+      end
+      def execute
+        random = Random.new
+        field_name = csv.headers[field_pos]
+        analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
+        csv.each_row do |row|
+          analysis.increment_total
+          if random.rand <= sampling_rate
+            analysis.increment_sample
+            analysis.insert(row[field_name])
+          end
+        end
+        analysis
+      end
+      class << self
+        def analyze(csv, field_pos, options = {})
+          new(csv, field_pos, options).execute
+        end
+      end
+    end
+  end
+end

data/lib/data_kit/csv/{analysis.rb → schema_analysis.rb} RENAMED Viewed

@@ -1,6 +1,6 @@
 module DataKit
   module CSV
-    class Analysis
+    class SchemaAnalysis
       attr_reader :fields
       attr_reader :types
       attr_reader :row_count

data/lib/data_kit/csv/{analyzer.rb → schema_analyzer.rb} RENAMED Viewed

@@ -1,23 +1,23 @@
 module DataKit
   module CSV
-    class Analyzer
+    class SchemaAnalyzer
       attr_accessor :csv
       attr_accessor :keys
-      attr_accessor :sample_rate
+      attr_accessor :sampling_rate
       def initialize(csv, options = {})
         @csv = csv
         @keys = options[:keys] || []
-        @sample_rate = options[:sample_rate] || 0.1
+        @sampling_rate = options[:sampling_rate] || 0.1
       end
       def execute
         random = Random.new
-        analysis = Analysis.new(csv.headers)
+        analysis = SchemaAnalysis.new(csv.headers)
         csv.each_row do |row|
           analysis.increment_total
-          if random.rand <= sample_rate
+          if random.rand <= sampling_rate
             analysis.increment_sample
             row.keys.each do |field_name|
               analysis.insert(field_name.to_s, row[field_name])
@@ -32,18 +32,18 @@ module DataKit
         def analyze(csv, options = {})
           analyzer = new(csv,
             :keys => options[:keys],
-            :sample_rate => options[:sample_rate]
+            :sampling_rate => options[:sampling_rate]
           )
           analyzer.execute
         end
-        def sample_rate(file_size)
+        def sampling_rate(file_size)
           if file_size < (1024 * 1024)
-            sample_rate = 1.0
+            sampling_rate = 1.0
           else
             scale_factor = 500
-            sample_rate = (scale_factor / Math.sqrt(file_size)).round(4)
+            sampling_rate = (scale_factor / Math.sqrt(file_size)).round(4)
           end
         end
       end

data/lib/data_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module DataKit
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end

data/lib/data_kit.rb CHANGED Viewed

@@ -1,9 +1,15 @@
 require "data_kit/version"
-# Data I/O
+# Parsers
 require 'data_kit/csv/parser'
-require 'data_kit/csv/analyzer'
-require 'data_kit/csv/analysis'
+# Analyzers
+require 'data_kit/csv/field_analyzer'
+require 'data_kit/csv/field_analysis'
+require 'data_kit/csv/schema_analyzer'
+require 'data_kit/csv/schema_analysis'
+# Converters
 require 'data_kit/csv/converter'
 # Data Conversion

data/spec/csv/converter_spec.rb CHANGED Viewed

@@ -14,7 +14,7 @@ describe DataKit::CSV::Converter do
   }
   it "should initialize and execute" do
-    analysis = DataKit::CSV::Analyzer.analyze(csv, :sample_rate => 1)
+    analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
     converter = DataKit::CSV::Converter.new(csv, analysis, target)
     converter.execute
@@ -25,7 +25,7 @@ describe DataKit::CSV::Converter do
   end
   it "should convert using the convience method" do
-    analysis = DataKit::CSV::Analyzer.analyze(csv, :sample_rate => 1)
+    analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
     converter = DataKit::CSV::Converter.convert(csv, analysis, target)
     row_count = 0

data/spec/csv/field_analysis_spec.rb ADDED Viewed

@@ -0,0 +1,71 @@
+require 'spec_helper'
+describe DataKit::CSV::FieldAnalysis do
+  it "should increment totals and samples" do
+    analysis = DataKit::CSV::FieldAnalysis.new('field1')
+    analysis.increment_total
+    analysis.increment_total
+    analysis.increment_sample
+    analysis.row_count.should == 2
+    analysis.sample_count.should == 1
+  end
+  it "should insert a row for analysis" do
+    analysis = DataKit::CSV::FieldAnalysis.new('field1')
+    analysis.insert '1.0'
+    analysis.insert '2.0'
+    analysis.type?.should == :number
+    analysis.has_single_type?.should == true
+    analysis.has_only_numeric_types?.should == true
+    analysis.type_count(:number).should == 2
+  end
+  it "should return the value for a specific row" do
+    analysis = DataKit::CSV::FieldAnalysis.new('field1')
+    analysis.increment_total
+    analysis.insert '1.0'
+    analysis.increment_total
+    analysis.insert '2.0'
+    analysis.value_at(1).should == '1.0'
+  end
+  it "should infer a string type if there non-numeric mixed types" do
+    analysis = DataKit::CSV::FieldAnalysis.new('field1')
+    analysis.insert '1.0'
+    analysis.insert '2.0'
+    analysis.insert '2.0'
+    analysis.insert 'str2'
+    analysis.type?.should == :string
+  end
+  it "should infer a number type if there are mixed numeric types" do
+    analysis = DataKit::CSV::FieldAnalysis.new('field1')
+    analysis.insert '1.0'
+    analysis.insert '20'
+    analysis.insert nil
+    analysis.type?.should == :number
+  end
+  it "should filter analysis to a specific type" do
+    analysis = DataKit::CSV::FieldAnalysis.new('field1', :match_type => :number)
+    analysis.insert '1.0'
+    analysis.insert '20'
+    analysis.insert nil
+    analysis.insert 'str2'
+    analysis.type?.should == :number
+  end
+end

data/spec/csv/field_analyzer_spec.rb ADDED Viewed

@@ -0,0 +1,45 @@
+require 'spec_helper'
+describe DataKit::CSV::FieldAnalyzer do
+  let(:path) {
+    data_path('standard.csv')
+  }
+  let(:csv) {
+    DataKit::CSV::Parser.new(path)
+  }
+  let(:iocsv) {
+    DataKit::CSV::Parser.new(File.open(path))
+  }
+  it "should initialize" do
+    analyzer = DataKit::CSV::FieldAnalyzer.new(csv, 1)
+    analyzer.csv.should == csv
+    analyzer.field_pos.should == 1
+    analyzer.sampling_rate.should == 0.1
+  end
+  it "should initialize schema with an IO" do
+    analyzer = DataKit::CSV::FieldAnalyzer.new(iocsv, 1)
+    analyzer.csv.should == iocsv
+    analyzer.field_pos.should == 1
+    analyzer.sampling_rate.should == 0.1
+  end
+  it "should execute an analysis" do
+    analysis = DataKit::CSV::FieldAnalyzer.new(csv, 8, :sampling_rate => 0.5).execute
+    analysis.type?.should == :datetime # activated_at
+    analysis.row_count.should == 10
+    analysis.sample_count.should be < 10
+  end
+  it "should analyze using the static convenience method" do
+    analysis = DataKit::CSV::FieldAnalyzer.analyze(csv, 8, :sampling_rate => 0.5)
+    analysis.type?.should == :datetime # activated_at
+  end
+end

data/spec/csv/{analysis_spec.rb → schema_analysis_spec.rb} RENAMED Viewed

@@ -1,8 +1,8 @@
 require 'spec_helper'
-describe DataKit::CSV::Analysis do
+describe DataKit::CSV::SchemaAnalysis do
   it "should insert a row for analysis" do
-    analysis = DataKit::CSV::Analysis.new(['field1'])
+    analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
     analysis.insert 'field1', '1.0'
     analysis.insert 'field1', '2.0'
@@ -11,7 +11,7 @@ describe DataKit::CSV::Analysis do
   end
   it "should know the number of rows with a particular type" do
-    analysis = DataKit::CSV::Analysis.new(['field1'])
+    analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
     analysis.insert 'field1', '1.0'
     analysis.insert 'field1', '2.0'
@@ -20,7 +20,7 @@ describe DataKit::CSV::Analysis do
   end
   it "should determine the type of a field" do
-    analysis = DataKit::CSV::Analysis.new(['field1', 'field2'])
+    analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
     analysis.insert 'field1', '1.0'
     analysis.insert 'field1', '2.0'
@@ -32,7 +32,7 @@ describe DataKit::CSV::Analysis do
   end
   it "should infer a string type if there non-numeric mixed types" do
-    analysis = DataKit::CSV::Analysis.new(['field1', 'field2'])
+    analysis = DataKit::CSV::SchemaAnalysis.new(['field1', 'field2'])
     analysis.insert 'field1', '1.0'
     analysis.insert 'field1', '2.0'
@@ -44,7 +44,7 @@ describe DataKit::CSV::Analysis do
   end
   it "should infer a number type if there are mixed numeric types" do
-    analysis = DataKit::CSV::Analysis.new(['field1'])
+    analysis = DataKit::CSV::SchemaAnalysis.new(['field1'])
     analysis.insert 'field1', '1.0'
     analysis.insert 'field1', '20'

data/spec/csv/{analyzer_spec.rb → schema_analyzer_spec.rb} RENAMED Viewed

@@ -1,6 +1,6 @@
 require 'spec_helper'
-describe DataKit::CSV::Analyzer do
+describe DataKit::CSV::SchemaAnalyzer do
   let(:path) {
     data_path('standard.csv')
   }
@@ -14,25 +14,23 @@ describe DataKit::CSV::Analyzer do
   }
   it "should initialize" do
-    analyzer = DataKit::CSV::Analyzer.new(csv)
+    analyzer = DataKit::CSV::SchemaAnalyzer.new(csv)
     analyzer.csv.should == csv
     analyzer.keys.should == []
-    analyzer.sample_rate.should == 0.1
+    analyzer.sampling_rate.should == 0.1
   end
   it "should initialize schema with an IO" do
-    analyzer = DataKit::CSV::Analyzer.new(iocsv)
+    analyzer = DataKit::CSV::SchemaAnalyzer.new(iocsv)
     analyzer.csv.should == iocsv
     analyzer.keys.should == []
-    analyzer.sample_rate.should == 0.1
+    analyzer.sampling_rate.should == 0.1
   end
   it "should execute an analysis" do
-    analysis = DataKit::CSV::Analyzer.new(csv, :sample_rate => 0.5).execute
-    puts analysis.inspect
+    analysis = DataKit::CSV::SchemaAnalyzer.new(csv, :sampling_rate => 0.5).execute
     analysis.type?('id').should == :integer
     analysis.type?('first_name').should == :string
@@ -49,8 +47,8 @@ describe DataKit::CSV::Analyzer do
     analysis.sample_count.should be < 10
   end
-  it "should calculate a sample_rate" do
-    DataKit::CSV::Analyzer.sample_rate(1024).should == 1
-    DataKit::CSV::Analyzer.sample_rate(2048 * 2048).should be < 1
+  it "should calculate a sampling_rate" do
+    DataKit::CSV::SchemaAnalyzer.sampling_rate(1024).should == 1
+    DataKit::CSV::SchemaAnalyzer.sampling_rate(2048 * 2048).should be < 1
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: data_kit
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Mode Analytics
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-12-13 00:00:00.000000000 Z
+date: 2013-12-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rcsv
@@ -130,10 +130,12 @@ files:
 - lib/data_kit/converters/date_time.rb
 - lib/data_kit/converters/integer.rb
 - lib/data_kit/converters/number.rb
-- lib/data_kit/csv/analysis.rb
-- lib/data_kit/csv/analyzer.rb
 - lib/data_kit/csv/converter.rb
+- lib/data_kit/csv/field_analysis.rb
+- lib/data_kit/csv/field_analyzer.rb
 - lib/data_kit/csv/parser.rb
+- lib/data_kit/csv/schema_analysis.rb
+- lib/data_kit/csv/schema_analyzer.rb
 - lib/data_kit/dataset/field.rb
 - lib/data_kit/dataset/schema.rb
 - lib/data_kit/patches/rcsv.rb
@@ -142,10 +144,12 @@ files:
 - spec/converters/date_time_spec.rb
 - spec/converters/integer_spec.rb
 - spec/converters/number_spec.rb
-- spec/csv/analysis_spec.rb
-- spec/csv/analyzer_spec.rb
 - spec/csv/converter_spec.rb
+- spec/csv/field_analysis_spec.rb
+- spec/csv/field_analyzer_spec.rb
 - spec/csv/parser_spec.rb
+- spec/csv/schema_analysis_spec.rb
+- spec/csv/schema_analyzer_spec.rb
 - spec/dataset/field_spec.rb
 - spec/dataset/schema_spec.rb
 - spec/fixtures/carriage_returns.csv
@@ -181,10 +185,12 @@ test_files:
 - spec/converters/date_time_spec.rb
 - spec/converters/integer_spec.rb
 - spec/converters/number_spec.rb
-- spec/csv/analysis_spec.rb
-- spec/csv/analyzer_spec.rb
 - spec/csv/converter_spec.rb
+- spec/csv/field_analysis_spec.rb
+- spec/csv/field_analyzer_spec.rb
 - spec/csv/parser_spec.rb
+- spec/csv/schema_analysis_spec.rb
+- spec/csv/schema_analyzer_spec.rb
 - spec/dataset/field_spec.rb
 - spec/dataset/schema_spec.rb
 - spec/fixtures/carriage_returns.csv