data_hut 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +6 -0
- data/README.md +1 -3
- data/data_hut.gemspec +4 -0
- data/lib/data_hut.rb +7 -1
- data/lib/data_hut/data_warehouse.rb +109 -13
- data/lib/data_hut/version.rb +1 -1
- data/test/spec/basic_test.rb +99 -12
- data/test/test_helper.rb +3 -0
- metadata +50 -1
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## 0.0.5
|
4
|
+
|
5
|
+
* added rdoc
|
6
|
+
|
7
|
+
* added tests; 100% code coverage.
|
8
|
+
|
3
9
|
## 0.0.4
|
4
10
|
|
5
11
|
* added the capability to mark records in the datahut as processed so that transform passes can ignore previously processed data and only process new data... good for cycles where you pull regular updates and then process them.
|
data/README.md
CHANGED
@@ -11,9 +11,7 @@ DataHut has basic features for small one-off analytics like parsing error logs a
|
|
11
11
|
|
12
12
|
Add this line to your application's Gemfile:
|
13
13
|
|
14
|
-
|
15
|
-
|
16
|
-
gem 'data_hut', :git => "git://github.com/coldnebo/data_hut.git"
|
14
|
+
gem 'data_hut'
|
17
15
|
|
18
16
|
And then execute:
|
19
17
|
|
data/data_hut.gemspec
CHANGED
data/lib/data_hut.rb
CHANGED
@@ -2,9 +2,15 @@ require "data_hut/version"
|
|
2
2
|
require "data_hut/data_warehouse"
|
3
3
|
|
4
4
|
|
5
|
+
|
5
6
|
module DataHut
|
6
|
-
# Your code goes here...
|
7
7
|
|
8
|
+
# convenience method to create or open an existing connection to a DataHut data store.
|
9
|
+
#
|
10
|
+
# @param name [String] name of the DataHut. This will also be the name of the sqlite3
|
11
|
+
# file written to the current working directory (e.g. './<name>.db')
|
12
|
+
# @return [DataHut::DataWarehouse] instance
|
13
|
+
# @see DataHut::DataWarehouse#connect
|
8
14
|
def self.connect(name)
|
9
15
|
DataWarehouse.connect(name)
|
10
16
|
end
|
@@ -3,29 +3,86 @@ require 'ostruct'
|
|
3
3
|
require 'logger'
|
4
4
|
|
5
5
|
module DataHut
|
6
|
+
|
7
|
+
# The DataHut::DataWarehouse comprehensively manages all the heavy lifting of creating a data system for your analytics.
|
8
|
+
# So during *extract* and *transform* phases you don't have to worry about the schema or the data types you'll be using...
|
9
|
+
# just start scraping and playing with the data extraction, DataHut will take care of introspecting your final data records
|
10
|
+
# and creating or altering the DataHut schema for you, auto-magically.
|
11
|
+
#
|
12
|
+
# @example
|
13
|
+
# require 'data_hut'
|
14
|
+
# require 'pry' # not necessary, but very useful
|
15
|
+
#
|
16
|
+
# dh = DataHut.connect("scratch")
|
17
|
+
# data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') },
|
18
|
+
# {name: "phil", age: 31},
|
19
|
+
# {name: "fred", age: 44, login: DateTime.parse('2013-02-07')}]
|
20
|
+
#
|
21
|
+
# # extract your data by iterating over your data format (from whatever source) and map it to a record model...
|
22
|
+
# dh.extract(data) do |r, d|
|
23
|
+
# r.name = d[:name]
|
24
|
+
# r.age = d[:age]
|
25
|
+
# # you can do anything you need to within the extract block to ensure data quality if you want:
|
26
|
+
# d[:login] = DateTime.new unless d.has_key?(:login)
|
27
|
+
# r.last_active = d[:login]
|
28
|
+
# print 'v'
|
29
|
+
# end
|
30
|
+
#
|
31
|
+
# # transform your data by adding fields to it
|
32
|
+
# dh.transform do |r|
|
33
|
+
# r.eligible = r.age < 30
|
34
|
+
# print '*'
|
35
|
+
# end
|
36
|
+
#
|
37
|
+
# # mark all the records as processed to avoid re-transforming them.
|
38
|
+
# dh.transform_complete
|
39
|
+
# ds = dh.dataset
|
40
|
+
# binding.pry # play with ds.
|
41
|
+
# [1] pry(main)> ds.avg(:age)
|
42
|
+
# => 34.0
|
43
|
+
# [2] pry(main)> ineligible = ds.where(eligible: false)
|
44
|
+
# [3] pry(main)> ineligible.avg(:age)
|
45
|
+
# => 37.5
|
6
46
|
class DataWarehouse
|
7
47
|
private_class_method :new
|
8
48
|
|
49
|
+
# creates or opens an existing connection to a DataHut data store.
|
50
|
+
#
|
51
|
+
# @param name [String] name of the DataHut. This will also be the name of the sqlite3 file written
|
52
|
+
# to the current working directory (e.g. './<name>.db')
|
53
|
+
# @return [DataHut::DataWarehouse] instance
|
9
54
|
def self.connect(name)
|
10
55
|
new(name)
|
11
56
|
end
|
12
57
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
unless @db.table_exists?(:data_warehouse)
|
18
|
-
@db.create_table(:data_warehouse) do
|
19
|
-
primary_key :dw_id
|
20
|
-
column :dw_processed, TrueClass, :null => false, :default => false
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
58
|
+
# access the DataHut dataset. See {http://sequel.rubyforge.org/rdoc/classes/Sequel/Dataset.html Sequel::Dataset}
|
59
|
+
# for available operations on the dataset.
|
60
|
+
#
|
61
|
+
# @return [Sequel::Model] instance bound to the data warehouse. Use this handle to query and analyze the datahut.
|
25
62
|
def dataset
|
26
63
|
Class.new(Sequel::Model(@db[:data_warehouse]))
|
27
64
|
end
|
28
65
|
|
66
|
+
# used to extract data from whatever source you wish. As long as the data forms an enumerable collection,
|
67
|
+
# you can pass it to extract along with a block that specifies how you wish the DataHut *record* to be
|
68
|
+
# mapped from the source *element* of the collection.
|
69
|
+
#
|
70
|
+
# @example Extracting fields from a hash and assigning it to a field on a record
|
71
|
+
# data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') }]
|
72
|
+
# dh.extract(data) do |r, d|
|
73
|
+
# r.name = d[:name]
|
74
|
+
# r.age = d[:age]
|
75
|
+
# end
|
76
|
+
#
|
77
|
+
# @param data [Enumerable]
|
78
|
+
# @yield [record, element] lets you control the mapping of data elements to record fields
|
79
|
+
# @yieldparam record an OpenStruct that allows you to create fields dynamically on the record as needed.
|
80
|
+
# These fields will automatically be added to the schema behind the DataHut using the ruby data type you assigned to the record.
|
81
|
+
# *NOTE* that you must use DateTime or Time objects as Date objects are not supported.
|
82
|
+
# See {http://sequel.rubyforge.org/rdoc/files/doc/schema_modification_rdoc.html Sequel Schema Modification Methods} for
|
83
|
+
# more information about supported ruby data types you can use.
|
84
|
+
# @yieldparam element an element from your data.
|
85
|
+
# @raise [ArgumentError] if you don't provide a block
|
29
86
|
def extract(data)
|
30
87
|
raise(ArgumentError, "a block is required for extract.", caller) unless block_given?
|
31
88
|
|
@@ -36,7 +93,25 @@ module DataHut
|
|
36
93
|
end
|
37
94
|
end
|
38
95
|
|
39
|
-
# transform
|
96
|
+
# used to transform data already extracted into a DataHut. You can also use *transform* to create new synthetic data fields
|
97
|
+
# from existing fields. You may create as many transform blocks (i.e. 'passes') as you like.
|
98
|
+
#
|
99
|
+
# @example Defining 'eligibility' based on arbitrary age criteria.
|
100
|
+
# dh.transform do |r|
|
101
|
+
# r.eligible = r.age < 30 # using extracted data to create a synthetic boolean field
|
102
|
+
# end
|
103
|
+
#
|
104
|
+
# @param forced if set to 'true', this transform will iterate over records already marked processed. This can be useful for
|
105
|
+
# layers of transforms that deal with analytics where the analytical model may need to rapidly change as you explore the data.
|
106
|
+
# See the second transform in {file/README.md#A_More_Ambitious_Example___}.
|
107
|
+
# @yield [record] lets you modify the DataHut record
|
108
|
+
# @yieldparam record an OpenStruct that fronts the DataHut record. You may access existing fields on this record or create new
|
109
|
+
# fields to store synthetic data from a transform pass.
|
110
|
+
# These fields will automatically be added to the schema behind the DataHut using the ruby data type you assigned to the record.
|
111
|
+
# *NOTE* that you must use DateTime or Time objects as Date objects are not supported.
|
112
|
+
# See {http://sequel.rubyforge.org/rdoc/files/doc/schema_modification_rdoc.html Sequel Schema Modification Methods} for
|
113
|
+
# more information about supported ruby data types you can use.
|
114
|
+
# @raise [ArgumentError] if you don't provide a block
|
40
115
|
def transform(forced=false)
|
41
116
|
raise(ArgumentError, "a block is required for transform.", caller) unless block_given?
|
42
117
|
|
@@ -62,12 +137,33 @@ module DataHut
|
|
62
137
|
end
|
63
138
|
end
|
64
139
|
|
140
|
+
# marks all the records in the DataHut as 'processed'. Useful as the last command in a sequence of extract and transform passes.
|
141
|
+
#
|
142
|
+
# @example a simple log analysis system (pseudocode)
|
143
|
+
# rake update
|
144
|
+
# extract apache logs (only adds new logs since last update)
|
145
|
+
# transform logs into types of response (error, ok, met_SLA (service level agreement, etc.)) (only transforms unprocessed (new) logs)
|
146
|
+
# transform_complete (marks the update complete)
|
147
|
+
# dh.dataset is used to visualize graphs with d3.js
|
148
|
+
# end
|
65
149
|
def transform_complete
|
66
150
|
@db[:data_warehouse].update(:dw_processed => true)
|
67
151
|
end
|
68
152
|
|
69
153
|
private
|
70
154
|
|
155
|
+
def initialize(name)
|
156
|
+
@db_file = "#{name}.db"
|
157
|
+
@db = Sequel.sqlite(@db_file)
|
158
|
+
#@db.logger = ::Logger.new(STDOUT)
|
159
|
+
unless @db.table_exists?(:data_warehouse)
|
160
|
+
@db.create_table(:data_warehouse) do
|
161
|
+
primary_key :dw_id
|
162
|
+
column :dw_processed, TrueClass, :null => false, :default => false
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
71
167
|
def store(r)
|
72
168
|
adapt_schema(r)
|
73
169
|
h = r.marshal_dump
|
data/lib/data_hut/version.rb
CHANGED
data/test/spec/basic_test.rb
CHANGED
@@ -3,7 +3,7 @@ require_relative File.join(*%w[.. test_helper])
|
|
3
3
|
|
4
4
|
describe DataHut do
|
5
5
|
def teardown
|
6
|
-
FileUtils.rm("foo.db", force: true
|
6
|
+
FileUtils.rm("foo.db", force: true)
|
7
7
|
end
|
8
8
|
|
9
9
|
describe "gem loading" do
|
@@ -14,7 +14,7 @@ describe DataHut do
|
|
14
14
|
|
15
15
|
describe "connect" do
|
16
16
|
it "should create a database if none exists" do
|
17
|
-
FileUtils.rm("foo.db", force: true
|
17
|
+
FileUtils.rm("foo.db", force: true)
|
18
18
|
dh = DataHut.connect("foo")
|
19
19
|
assert File.exists?("foo.db")
|
20
20
|
end
|
@@ -27,26 +27,113 @@ describe DataHut do
|
|
27
27
|
data = [{name: "barney", age: 27},
|
28
28
|
{name: "phil", age: 31},
|
29
29
|
{name: "fred", age: 44}]
|
30
|
-
|
31
|
-
# ignore dups!!
|
32
|
-
data2 = [{name: "barney", age: 27},
|
33
|
-
{name: "phil", age: 31},{name: "phil", age: 31},
|
34
|
-
{name: "fred", age: 44}]
|
35
30
|
|
36
|
-
|
37
|
-
# extracted into a transactional record 'r' in the data warehouse.
|
38
|
-
dh.extract(data2) do |r, d|
|
31
|
+
dh.extract(data) do |r, d|
|
39
32
|
r.name = d[:name]
|
40
33
|
r.age = d[:age]
|
41
34
|
end
|
42
35
|
|
36
|
+
assert_equal 3, dh.dataset.count
|
37
|
+
|
43
38
|
dh.dataset.each_with_index do |r,i|
|
44
|
-
|
39
|
+
assert_equal data[i][:name], r.name
|
45
40
|
assert_kind_of(data[i][:name].class, r.name)
|
46
|
-
|
41
|
+
assert_equal data[i][:age], r.age
|
47
42
|
assert_kind_of(data[i][:age].class, r.age)
|
48
43
|
end
|
49
44
|
end
|
45
|
+
|
46
|
+
it "should prevent duplicates from being extracted" do
|
47
|
+
dh = DataHut.connect("foo")
|
48
|
+
|
49
|
+
data = [{name: "barney", age: 27},
|
50
|
+
{name: "barney", age: 27},
|
51
|
+
{name: "phil", age: 31},
|
52
|
+
{name: "phil", age: 31},
|
53
|
+
{name: "fred", age: 44}]
|
54
|
+
|
55
|
+
dh.extract(data) do |r, d|
|
56
|
+
r.name = d[:name]
|
57
|
+
r.age = d[:age]
|
58
|
+
end
|
59
|
+
|
60
|
+
assert_equal 3, dh.dataset.count
|
61
|
+
end
|
62
|
+
|
63
|
+
it "should add new records on subsequent extracts" do
|
64
|
+
dh = DataHut.connect("foo")
|
65
|
+
|
66
|
+
# first data pull
|
67
|
+
data = [{name: "barney", age: 27},
|
68
|
+
{name: "phil", age: 31},
|
69
|
+
{name: "fred", age: 44}]
|
70
|
+
|
71
|
+
dh.extract(data) do |r, d|
|
72
|
+
r.name = d[:name]
|
73
|
+
r.age = d[:age]
|
74
|
+
end
|
75
|
+
|
76
|
+
assert_equal 3, dh.dataset.count
|
77
|
+
|
78
|
+
# later on, a second data pull is run with new data...
|
79
|
+
data = [{name: "lisa", age: 27},
|
80
|
+
{name: "mary", age: 19},
|
81
|
+
{name: "jane", age: 33}]
|
82
|
+
|
83
|
+
dh.extract(data) do |r, d|
|
84
|
+
r.name = d[:name]
|
85
|
+
r.age = d[:age]
|
86
|
+
end
|
87
|
+
|
88
|
+
assert_equal 6, dh.dataset.count
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
describe "transform" do
|
93
|
+
def setup
|
94
|
+
@dh = DataHut.connect("foo")
|
95
|
+
|
96
|
+
data = [{name: "barney", age: 27},
|
97
|
+
{name: "phil", age: 31},
|
98
|
+
{name: "fred", age: 44},
|
99
|
+
{name: "lisa", age: 27},
|
100
|
+
{name: "mary", age: 19},
|
101
|
+
{name: "jane", age: 15}]
|
102
|
+
|
103
|
+
@dh.extract(data) do |r, d|
|
104
|
+
r.name = d[:name]
|
105
|
+
r.age = d[:age]
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
it "should support transforming existing data" do
|
110
|
+
@dh.transform do |r|
|
111
|
+
r.eligible = r.age > 18 && r.age < 35
|
112
|
+
end
|
113
|
+
|
114
|
+
assert_equal 27.166666666666668, @dh.dataset.avg(:age)
|
115
|
+
sorted_by_name = @dh.dataset.order(:name)
|
116
|
+
eligible = sorted_by_name.where(eligible:true)
|
117
|
+
ineligible = sorted_by_name.where(eligible:false)
|
118
|
+
assert_equal 4, eligible.count
|
119
|
+
assert_equal 2, ineligible.count
|
120
|
+
|
121
|
+
assert_equal ["barney", "lisa", "mary", "phil"], eligible.collect{|d| d.name}
|
122
|
+
assert_equal ["fred", "jane"], ineligible.collect{|d| d.name}
|
123
|
+
end
|
124
|
+
|
125
|
+
it "should support ignoring processed records" do
|
126
|
+
@dh.transform_complete
|
127
|
+
|
128
|
+
called = false
|
129
|
+
@dh.transform do |r|
|
130
|
+
r.eligible = r.age > 18 && r.age < 35
|
131
|
+
called = true
|
132
|
+
end
|
133
|
+
|
134
|
+
refute called
|
135
|
+
end
|
136
|
+
|
50
137
|
end
|
51
138
|
|
52
139
|
end
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_hut
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -75,6 +75,54 @@ dependencies:
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: yard
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: redcarpet
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: simplecov
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
78
126
|
description: A small, portable data warehouse for Ruby for analytics on anything!
|
79
127
|
email:
|
80
128
|
- larry.kyrala@gmail.com
|
@@ -125,3 +173,4 @@ test_files:
|
|
125
173
|
- test/spec/basic_test.rb
|
126
174
|
- test/test_helper.rb
|
127
175
|
- test/unit/data_warehouse_test.rb
|
176
|
+
has_rdoc:
|