data_hut 0.0.4 → 0.0.5

data/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
  # Changelog
 
+ ## 0.0.5
+
+ * added rdoc
+
+ * added tests; 100% code coverage.
+
  ## 0.0.4
 
  * added the capability to mark records in the datahut as processed so that transform passes can ignore previously processed data and only process new data... good for cycles where you pull regular updates and then process them.
data/README.md CHANGED
@@ -11,9 +11,7 @@ DataHut has basic features for small one-off analytics like parsing error logs a
 
  Add this line to your application's Gemfile:
 
- *NOTE* I haven't released this gem yet, so you'll need to ref git:
-
- gem 'data_hut', :git => "git://github.com/coldnebo/data_hut.git"
+ gem 'data_hut'
 
  And then execute:
 
data/data_hut.gemspec CHANGED
@@ -20,4 +20,8 @@ Gem::Specification.new do |gem|
 
  gem.add_development_dependency 'mocha'
  gem.add_development_dependency 'pry'
+ gem.add_development_dependency 'yard'
+ gem.add_development_dependency 'redcarpet'
+ gem.add_development_dependency 'simplecov'
+
  end
data/lib/data_hut.rb CHANGED
@@ -2,9 +2,15 @@ require "data_hut/version"
  require "data_hut/data_warehouse"
 
 
+
  module DataHut
- # Your code goes here...
 
+ # convenience method to create or open an existing connection to a DataHut data store.
+ #
+ # @param name [String] name of the DataHut. This will also be the name of the sqlite3
+ # file written to the current working directory (e.g. './<name>.db')
+ # @return [DataHut::DataWarehouse] instance
+ # @see DataHut::DataWarehouse#connect
  def self.connect(name)
  DataWarehouse.connect(name)
  end
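The `DataHut.connect` convenience method above just delegates to `DataWarehouse.connect`, which hides `new` behind a factory method. A minimal pure-Ruby sketch of that delegation pattern (hypothetical `MiniHut` names, no Sequel or sqlite backing):

```ruby
# Sketch of the module-level delegation used in data_hut.rb:
# MiniHut.connect(name) forwards to MiniHut::Warehouse.connect(name),
# and Warehouse hides `new` behind the factory method.
module MiniHut
  class Warehouse
    private_class_method :new

    attr_reader :db_file

    # factory method; the only public way to obtain an instance
    def self.connect(name)
      new(name)
    end

    def initialize(name)
      @db_file = "#{name}.db"  # same './<name>.db' naming convention as DataHut
    end
  end

  # module-level convenience method, like DataHut.connect
  def self.connect(name)
    Warehouse.connect(name)
  end
end

hut = MiniHut.connect("scratch")
puts hut.db_file  # => "scratch.db"
```

Keeping `new` private means every caller goes through `connect`, so the constructor's setup (here, the db file naming) cannot be bypassed.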
data/lib/data_hut/data_warehouse.rb CHANGED
@@ -3,29 +3,86 @@ require 'ostruct'
  require 'logger'
 
  module DataHut
+
+ # The DataHut::DataWarehouse comprehensively manages all the heavy lifting of creating a data system for your analytics.
+ # So during *extract* and *transform* phases you don't have to worry about the schema or the data types you'll be using...
+ # just start scraping and playing with the data extraction; DataHut will take care of introspecting your final data records
+ # and creating or altering the DataHut schema for you, auto-magically.
+ #
+ # @example
+ # require 'data_hut'
+ # require 'pry' # not necessary, but very useful
+ #
+ # dh = DataHut.connect("scratch")
+ # data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') },
+ # {name: "phil", age: 31},
+ # {name: "fred", age: 44, login: DateTime.parse('2013-02-07')}]
+ #
+ # # extract your data by iterating over your data format (from whatever source) and map it to a record model...
+ # dh.extract(data) do |r, d|
+ # r.name = d[:name]
+ # r.age = d[:age]
+ # # you can do anything you need to within the extract block to ensure data quality if you want:
+ # d[:login] = DateTime.new unless d.has_key?(:login)
+ # r.last_active = d[:login]
+ # print 'v'
+ # end
+ #
+ # # transform your data by adding fields to it
+ # dh.transform do |r|
+ # r.eligible = r.age < 30
+ # print '*'
+ # end
+ #
+ # # mark all the records as processed to avoid re-transforming them.
+ # dh.transform_complete
+ # ds = dh.dataset
+ # binding.pry # play with ds.
+ # [1] pry(main)> ds.avg(:age)
+ # => 34.0
+ # [2] pry(main)> ineligible = ds.where(eligible: false)
+ # [3] pry(main)> ineligible.avg(:age)
+ # => 37.5
  class DataWarehouse
  private_class_method :new
 
+ # creates or opens an existing connection to a DataHut data store.
+ #
+ # @param name [String] name of the DataHut. This will also be the name of the sqlite3 file written
+ # to the current working directory (e.g. './<name>.db')
+ # @return [DataHut::DataWarehouse] instance
  def self.connect(name)
  new(name)
  end
 
- def initialize(name)
- @db_file = "#{name}.db"
- @db = Sequel.sqlite(@db_file)
- #@db.logger = ::Logger.new(STDOUT)
- unless @db.table_exists?(:data_warehouse)
- @db.create_table(:data_warehouse) do
- primary_key :dw_id
- column :dw_processed, TrueClass, :null => false, :default => false
- end
- end
- end
-
+ # access the DataHut dataset. See {http://sequel.rubyforge.org/rdoc/classes/Sequel/Dataset.html Sequel::Dataset}
+ # for available operations on the dataset.
+ #
+ # @return [Sequel::Model] instance bound to the data warehouse. Use this handle to query and analyze the datahut.
  def dataset
  Class.new(Sequel::Model(@db[:data_warehouse]))
  end
 
+ # used to extract data from whatever source you wish. As long as the data forms an enumerable collection,
+ # you can pass it to extract along with a block that specifies how you wish the DataHut *record* to be
+ # mapped from the source *element* of the collection.
+ #
+ # @example Extracting fields from a hash and assigning them to fields on a record
+ # data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') }]
+ # dh.extract(data) do |r, d|
+ # r.name = d[:name]
+ # r.age = d[:age]
+ # end
+ #
+ # @param data [Enumerable]
+ # @yield [record, element] lets you control the mapping of data elements to record fields
+ # @yieldparam record an OpenStruct that allows you to create fields dynamically on the record as needed.
+ # These fields will automatically be added to the schema behind the DataHut using the ruby data type you assigned to the record.
+ # *NOTE* that you must use DateTime or Time objects, as Date objects are not supported.
+ # See {http://sequel.rubyforge.org/rdoc/files/doc/schema_modification_rdoc.html Sequel Schema Modification Methods} for
+ # more information about supported ruby data types you can use.
+ # @yieldparam element an element from your data.
+ # @raise [ArgumentError] if you don't provide a block
  def extract(data)
  raise(ArgumentError, "a block is required for extract.", caller) unless block_given?
 
@@ -36,7 +93,25 @@ module DataHut
  end
  end
 
- # transform all (could also be limited to not processed)
+ # used to transform data already extracted into a DataHut. You can also use *transform* to create new synthetic data fields
+ # from existing fields. You may create as many transform blocks (i.e. 'passes') as you like.
+ #
+ # @example Defining 'eligibility' based on arbitrary age criteria.
+ # dh.transform do |r|
+ # r.eligible = r.age < 30 # using extracted data to create a synthetic boolean field
+ # end
+ #
+ # @param forced if set to 'true', this transform will iterate over records already marked processed. This can be useful for
+ # layers of transforms that deal with analytics where the analytical model may need to rapidly change as you explore the data.
+ # See the second transform in {file/README.md#A_More_Ambitious_Example___}.
+ # @yield [record] lets you modify the DataHut record
+ # @yieldparam record an OpenStruct that fronts the DataHut record. You may access existing fields on this record or create new
+ # fields to store synthetic data from a transform pass.
+ # These fields will automatically be added to the schema behind the DataHut using the ruby data type you assigned to the record.
+ # *NOTE* that you must use DateTime or Time objects, as Date objects are not supported.
+ # See {http://sequel.rubyforge.org/rdoc/files/doc/schema_modification_rdoc.html Sequel Schema Modification Methods} for
+ # more information about supported ruby data types you can use.
+ # @raise [ArgumentError] if you don't provide a block
  def transform(forced=false)
  raise(ArgumentError, "a block is required for transform.", caller) unless block_given?
 
@@ -62,12 +137,33 @@ module DataHut
  end
  end
 
+ # marks all the records in the DataHut as 'processed'. Useful as the last command in a sequence of extract and transform passes.
+ #
+ # @example a simple log analysis system (pseudocode)
+ # rake update
+ # extract apache logs (only adds new logs since last update)
+ # transform logs into types of response (error, ok, met_SLA (service level agreement), etc.) (only transforms unprocessed (new) logs)
+ # transform_complete (marks the update complete)
+ # dh.dataset is used to visualize graphs with d3.js
+ # end
  def transform_complete
  @db[:data_warehouse].update(:dw_processed => true)
  end
 
  private
 
+ def initialize(name)
+ @db_file = "#{name}.db"
+ @db = Sequel.sqlite(@db_file)
+ #@db.logger = ::Logger.new(STDOUT)
+ unless @db.table_exists?(:data_warehouse)
+ @db.create_table(:data_warehouse) do
+ primary_key :dw_id
+ column :dw_processed, TrueClass, :null => false, :default => false
+ end
+ end
+ end
+
  def store(r)
  adapt_schema(r)
  h = r.marshal_dump
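The `transform`/`transform_complete` pair documented above implements incremental processing via the `dw_processed` flag: a normal pass skips processed rows, a forced pass revisits everything. A minimal in-memory sketch of that bookkeeping (plain hashes standing in for the Sequel-backed table; the `MiniWarehouse` name is illustrative):

```ruby
# In-memory sketch of DataHut's dw_processed bookkeeping: transform only
# visits unprocessed records (unless forced); transform_complete marks all done.
class MiniWarehouse
  attr_reader :rows

  def initialize
    @rows = []
  end

  def insert(attrs)
    @rows << attrs.merge(dw_processed: false)
  end

  def transform(forced = false)
    raise ArgumentError, "a block is required for transform." unless block_given?
    @rows.each do |row|
      next if row[:dw_processed] && !forced  # skip already-processed records
      yield row
    end
  end

  def transform_complete
    @rows.each { |row| row[:dw_processed] = true }
  end
end

wh = MiniWarehouse.new
wh.insert(name: "barney", age: 27)
wh.insert(name: "fred",   age: 44)

wh.transform { |r| r[:eligible] = r[:age] < 30 }
wh.transform_complete

# after transform_complete, a non-forced pass touches nothing...
touched = 0
wh.transform { |_r| touched += 1 }          # touched stays 0

# ...but a forced pass revisits every record.
forced_touched = 0
wh.transform(true) { |_r| forced_touched += 1 }  # forced_touched == 2
```

This is the cycle the changelog entry for 0.0.4 describes: pull new data, transform only what is new, then mark the batch complete.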
data/lib/data_hut/version.rb CHANGED
@@ -1,3 +1,3 @@
  module DataHut
- VERSION = "0.0.4"
+ VERSION = "0.0.5"
  end
@@ -3,7 +3,7 @@ require_relative File.join(*%w[.. test_helper])
 
  describe DataHut do
  def teardown
- FileUtils.rm("foo.db", force: true, verbose: true)
+ FileUtils.rm("foo.db", force: true)
  end
 
  describe "gem loading" do
@@ -14,7 +14,7 @@ describe DataHut do
 
  describe "connect" do
  it "should create a database if none exists" do
- FileUtils.rm("foo.db", force: true, verbose: true)
+ FileUtils.rm("foo.db", force: true)
  dh = DataHut.connect("foo")
  assert File.exists?("foo.db")
  end
@@ -27,26 +27,113 @@ describe DataHut do
  data = [{name: "barney", age: 27},
  {name: "phil", age: 31},
  {name: "fred", age: 44}]
-
- # ignore dups!!
- data2 = [{name: "barney", age: 27},
- {name: "phil", age: 31},{name: "phil", age: 31},
- {name: "fred", age: 44}]
 
- # the idea of the extract phase is that you control exactly how an element of your data 'd' is
- # extracted into a transactional record 'r' in the data warehouse.
- dh.extract(data2) do |r, d|
+ dh.extract(data) do |r, d|
  r.name = d[:name]
  r.age = d[:age]
  end
 
+ assert_equal 3, dh.dataset.count
+
  dh.dataset.each_with_index do |r,i|
- assert r.name == data[i][:name]
+ assert_equal data[i][:name], r.name
  assert_kind_of(data[i][:name].class, r.name)
- assert r.age == data[i][:age]
+ assert_equal data[i][:age], r.age
  assert_kind_of(data[i][:age].class, r.age)
  end
  end
+
+ it "should prevent duplicates from being extracted" do
+ dh = DataHut.connect("foo")
+
+ data = [{name: "barney", age: 27},
+ {name: "barney", age: 27},
+ {name: "phil", age: 31},
+ {name: "phil", age: 31},
+ {name: "fred", age: 44}]
+
+ dh.extract(data) do |r, d|
+ r.name = d[:name]
+ r.age = d[:age]
+ end
+
+ assert_equal 3, dh.dataset.count
+ end
+
+ it "should add new records on subsequent extracts" do
+ dh = DataHut.connect("foo")
+
+ # first data pull
+ data = [{name: "barney", age: 27},
+ {name: "phil", age: 31},
+ {name: "fred", age: 44}]
+
+ dh.extract(data) do |r, d|
+ r.name = d[:name]
+ r.age = d[:age]
+ end
+
+ assert_equal 3, dh.dataset.count
+
+ # later on, a second data pull is run with new data...
+ data = [{name: "lisa", age: 27},
+ {name: "mary", age: 19},
+ {name: "jane", age: 33}]
+
+ dh.extract(data) do |r, d|
+ r.name = d[:name]
+ r.age = d[:age]
+ end
+
+ assert_equal 6, dh.dataset.count
+ end
+ end
+
+ describe "transform" do
+ def setup
+ @dh = DataHut.connect("foo")
+
+ data = [{name: "barney", age: 27},
+ {name: "phil", age: 31},
+ {name: "fred", age: 44},
+ {name: "lisa", age: 27},
+ {name: "mary", age: 19},
+ {name: "jane", age: 15}]
+
+ @dh.extract(data) do |r, d|
+ r.name = d[:name]
+ r.age = d[:age]
+ end
+ end
+
+ it "should support transforming existing data" do
+ @dh.transform do |r|
+ r.eligible = r.age > 18 && r.age < 35
+ end
+
+ assert_equal 27.166666666666668, @dh.dataset.avg(:age)
+ sorted_by_name = @dh.dataset.order(:name)
+ eligible = sorted_by_name.where(eligible:true)
+ ineligible = sorted_by_name.where(eligible:false)
+ assert_equal 4, eligible.count
+ assert_equal 2, ineligible.count
+
+ assert_equal ["barney", "lisa", "mary", "phil"], eligible.collect{|d| d.name}
+ assert_equal ["fred", "jane"], ineligible.collect{|d| d.name}
+ end
+
+ it "should support ignoring processed records" do
+ @dh.transform_complete
+
+ called = false
+ @dh.transform do |r|
+ r.eligible = r.age > 18 && r.age < 35
+ called = true
+ end
+
+ refute called
+ end
+
  end
 
  end
data/test/test_helper.rb CHANGED
@@ -1,3 +1,6 @@
+ require 'simplecov'
+ SimpleCov.start
+
  require 'minitest/autorun'
  require 'mocha/setup'
 
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: data_hut
  version: !ruby/object:Gem::Version
- version: 0.0.4
+ version: 0.0.5
  prerelease:
  platform: ruby
  authors:
@@ -75,6 +75,54 @@ dependencies:
  - - ! '>='
  - !ruby/object:Gem::Version
  version: '0'
+ - !ruby/object:Gem::Dependency
+ name: yard
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: redcarpet
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: simplecov
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
  description: A small, portable data warehouse for Ruby for analytics on anything!
  email:
  - larry.kyrala@gmail.com
@@ -125,3 +173,4 @@ test_files:
  - test/spec/basic_test.rb
  - test/test_helper.rb
  - test/unit/data_warehouse_test.rb
+ has_rdoc: