data_hut 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/.rvmrc +1 -0
- data/CHANGELOG.md +24 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +244 -0
- data/Rakefile +15 -0
- data/data_hut.gemspec +23 -0
- data/lib/data_hut.rb +12 -0
- data/lib/data_hut/data_warehouse.rb +94 -0
- data/lib/data_hut/version.rb +3 -0
- data/scratch +164 -0
- data/test/spec/basic_test.rb +53 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/data_warehouse_test.rb +16 -0
- metadata +127 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm 1.9.3@data_hut --create
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
## 0.0.4
|
4
|
+
|
5
|
+
* added the capability to mark records in the datahut as processed so that transform passes can ignore previously processed data and only process new data... good for cycles where you pull regular updates and then process them.
|
6
|
+
|
7
|
+
* added capability to force the transform to write in spite of processed; good for situations where you are playing with the structure of the transform and want to regenerate the data.
|
8
|
+
|
9
|
+
|
10
|
+
## 0.0.3
|
11
|
+
|
12
|
+
* fixed an update issue found in transforms where data was written successfully, but Sequel::Model couldn't read it immediately after.
|
13
|
+
|
14
|
+
|
15
|
+
## 0.0.2
|
16
|
+
|
17
|
+
* fixed problem with multiple instances of the datahut returning only a single dataset instance.
|
18
|
+
|
19
|
+
* added more interesting example to motivate edge cases.
|
20
|
+
|
21
|
+
|
22
|
+
## 0.0.1
|
23
|
+
|
24
|
+
* initial checkin. basic functionality
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Larry Kyrala
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,244 @@
|
|
1
|
+
# DataHut
|
2
|
+
|
3
|
+
A small, portable data warehouse for Ruby for analytics on anything!
|
4
|
+
|
5
|
+
DataHut has basic features for small one-off analytics like parsing error logs and such. Like its bigger cousin (the Data Warehouse) it has support for *extract*, *transform* and *load* processes (ETL). Unlike its bigger cousin it is simple to set up and use for simple projects.
|
6
|
+
|
7
|
+
*Extract* your data from anywhere, *transform* it however you like and *analyze* it for insights!
|
8
|
+
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Add this line to your application's Gemfile:
|
13
|
+
|
14
|
+
*NOTE* I haven't released this gem yet, so you'll need to ref git:
|
15
|
+
|
16
|
+
gem 'data_hut', :git => "git://github.com/coldnebo/data_hut.git"
|
17
|
+
|
18
|
+
And then execute:
|
19
|
+
|
20
|
+
$ bundle
|
21
|
+
|
22
|
+
Or install it yourself as:
|
23
|
+
|
24
|
+
$ gem install data_hut
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
Setting up a datahut is easy...
|
29
|
+
|
30
|
+
require 'data_hut'
|
31
|
+
require 'pry'
|
32
|
+
|
33
|
+
dh = DataHut.connect("scratch")
|
34
|
+
|
35
|
+
data = [{name: "barney", age: 27},
|
36
|
+
{name: "phil", age: 31},
|
37
|
+
{name: "fred", age: 44}]
|
38
|
+
|
39
|
+
# extract your data by iterating over your data format (from whatever source) and map it to a record model...
|
40
|
+
dh.extract(data) do |r, d|
|
41
|
+
r.name = d[:name]
|
42
|
+
r.age = d[:age]
|
43
|
+
end
|
44
|
+
|
45
|
+
# transform your data by adding fields to it
|
46
|
+
dh.transform do |r|
|
47
|
+
r.eligible = r.age < 30
|
48
|
+
end
|
49
|
+
|
50
|
+
# operate on your dataset by using chained queries
|
51
|
+
ds = dh.dataset
|
52
|
+
|
53
|
+
binding.pry
|
54
|
+
|
55
|
+
The datahut *dataset* is a Sequel::Model backed by the data warehouse you just created.
|
56
|
+
|
57
|
+
And here are the kinds of powerful things you can do:
|
58
|
+
|
59
|
+
[2] pry(main)> ds.where(eligible: false).count
|
60
|
+
=> 2
|
61
|
+
[3] pry(main)> ds.avg(:age)
|
62
|
+
=> 34.0
|
63
|
+
[4] pry(main)> ds.max(:age)
|
64
|
+
=> 44
|
65
|
+
[5] pry(main)> ds.min(:age)
|
66
|
+
=> 27
|
67
|
+
|
68
|
+
But wait, you can name these collections:
|
69
|
+
|
70
|
+
[6] pry(main)> ineligible = ds.where(eligible: false)
|
71
|
+
=> #<Sequel::SQLite::Dataset: "SELECT * FROM `data_warehouse` WHERE (`eligible` = 'f')">
|
72
|
+
|
73
|
+
[26] pry(main)> ineligible.avg(:age)
|
74
|
+
=> 37.5
|
75
|
+
[24] pry(main)> ineligible.order(Sequel.desc(:age)).all
|
76
|
+
=> [#< @values={:dw_id=>3, :name=>"fred", :age=>44, :eligible=>false}>,
|
77
|
+
#< @values={:dw_id=>2, :name=>"phil", :age=>31, :eligible=>false}>]
|
78
|
+
|
79
|
+
The results are Sequel::Model objects, so you can treat them as such:
|
80
|
+
|
81
|
+
[32] pry(main)> record = ineligible.order(Sequel.desc(:age)).first
|
82
|
+
=> #< @values={:dw_id=>3, :name=>"fred", :age=>44, :eligible=>false}>
|
83
|
+
[33] pry(main)> record.name
|
84
|
+
=> "fred"
|
85
|
+
[34] pry(main)> record.age
|
86
|
+
=> 44
|
87
|
+
|
88
|
+
|
89
|
+
Read more about the [Sequel gem](http://sequel.rubyforge.org/rdoc/files/README_rdoc.html) to determine what operations you can perform on a datahut dataset.
|
90
|
+
|
91
|
+
## A More Ambitious Example...
|
92
|
+
|
93
|
+
Taking a popular game like League of Legends and hand-rolling some simple analysis of the champions...
|
94
|
+
|
95
|
+
require 'nokogiri'
|
96
|
+
require 'open-uri'
|
97
|
+
require 'pry'
|
98
|
+
|
99
|
+
root = 'http://na.leagueoflegends.com'
|
100
|
+
|
101
|
+
# load the data once... (manually delete it to refresh)
|
102
|
+
unless File.exists?("lolstats.db")
|
103
|
+
dh = DataHut.connect("lolstats")
|
104
|
+
|
105
|
+
champions_page = Nokogiri::HTML(open("#{root}/champions"))
|
106
|
+
|
107
|
+
urls = champions_page.css('table.champion_item td.description span a').collect{|e| e.attribute('href').value}
|
108
|
+
|
109
|
+
# keep the powers for later since they are on different pages.
|
110
|
+
powers = {}
|
111
|
+
champions_page.css('table.champion_item').each do |c|
|
112
|
+
name = c.css('td.description span.highlight a').text
|
113
|
+
attack = c.css('td.graphing td.filled_attack').count
|
114
|
+
health = c.css('td.graphing td.filled_health').count
|
115
|
+
spells = c.css('td.graphing td.filled_spells').count
|
116
|
+
difficulty = c.css('td.graphing td.filled_difficulty').count
|
117
|
+
powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
|
118
|
+
end
|
119
|
+
|
120
|
+
puts "loading champion data"
|
121
|
+
dh.extract(urls) do |r, url|
|
122
|
+
champion_page = Nokogiri::HTML(open("#{root}#{url}"))
|
123
|
+
r.name = champion_page.css('div.page_header_text').text
|
124
|
+
|
125
|
+
st = champion_page.css('table.stats_table')
|
126
|
+
names = st.css('td.stats_name').collect{|e| e.text.strip}
|
127
|
+
values = st.css('td.stats_value').collect{|e| e.text.strip}
|
128
|
+
modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
|
129
|
+
|
130
|
+
(0..names.count-1).collect do |i|
|
131
|
+
stat = (names[i].downcase.gsub(/ /,'_') << "=").to_sym
|
132
|
+
r.send(stat, values[i].to_f)
|
133
|
+
stat_per_level = (names[i].downcase.gsub(/ /,'_') << "_per_level=").to_sym
|
134
|
+
per_level_value = modifiers[i].match(/\+([\d\.]+)/)[1].to_f rescue 0
|
135
|
+
r.send(stat_per_level, per_level_value)
|
136
|
+
end
|
137
|
+
|
138
|
+
# add the powers for this champion...
|
139
|
+
power = powers[r.name]
|
140
|
+
r.attack_power = power[:attack_power]
|
141
|
+
r.defense_power = power[:defense_power]
|
142
|
+
r.ability_power = power[:ability_power]
|
143
|
+
r.difficulty = power[:difficulty]
|
144
|
+
|
145
|
+
print "."
|
146
|
+
end
|
147
|
+
puts "done."
|
148
|
+
end
|
149
|
+
|
150
|
+
dh = DataHut.connect("lolstats")
|
151
|
+
|
152
|
+
puts "first transform"
|
153
|
+
dh.transform do |r|
|
154
|
+
r.total_damage = r.damage + (r.damage_per_level * 18.0)
|
155
|
+
r.total_health = r.health + (r.health_per_level * 18.0)
|
156
|
+
r.total_mana = r.mana + (r.mana_per_level * 18.0)
|
157
|
+
r.total_move_speed = r.move_speed + (r.move_speed_per_level * 18.0)
|
158
|
+
r.total_armor = r.armor + (r.armor_per_level * 18.0)
|
159
|
+
r.total_spell_block = r.spell_block + (r.spell_block_per_level * 18.0)
|
160
|
+
r.total_health_regen = r.health_regen + (r.health_regen_per_level * 18.0)
|
161
|
+
r.total_mana_regen = r.mana_regen + (r.mana_regen_per_level * 18.0)
|
162
|
+
print '.'
|
163
|
+
end
|
164
|
+
|
165
|
+
puts "second transform"
|
166
|
+
# there's no need to do transforms all in one batch either... you can layer them...
|
167
|
+
dh.transform(true) do |r|
|
168
|
+
# each index combines the dimensions above into a single score (a simple product of the factors)
|
169
|
+
r.nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power)
|
170
|
+
r.easy_nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power) * (1.0/r.difficulty)
|
171
|
+
r.tenacious_index = r.total_armor * r.total_health * r.total_spell_block * r.total_health_regen * (r.defense_power)
|
172
|
+
r.support_index = r.total_mana * r.total_armor * r.total_spell_block * r.total_health * r.total_health_regen * r.total_mana_regen * (r.ability_power * r.defense_power)
|
173
|
+
print '.'
|
174
|
+
end
|
175
|
+
|
176
|
+
# use once at the end to mark records processed.
|
177
|
+
dh.transform_complete
|
178
|
+
puts "transforms complete"
|
179
|
+
|
180
|
+
ds = dh.dataset
|
181
|
+
|
182
|
+
binding.pry
|
183
|
+
|
184
|
+
|
185
|
+
Now that we have some data, let's play...
|
186
|
+
|
187
|
+
* who has the most base damage?
|
188
|
+
|
189
|
+
[14] pry(main)> ds.order(Sequel.desc(:damage)).limit(5).collect{|c| {c.name => c.damage}}
|
190
|
+
=> [{"Taric"=>58.0},
|
191
|
+
{"Maokai"=>58.0},
|
192
|
+
{"Warwick"=>56.76},
|
193
|
+
{"Singed"=>56.65},
|
194
|
+
{"Poppy"=>56.3}]
|
195
|
+
|
196
|
+
* but wait a minute... what about at level 18? Fortunately, we've transformed our data to add some extra fields for this...
|
197
|
+
|
198
|
+
[3] pry(main)> ds.order(Sequel.desc(:total_damage)).limit(5).collect{|c| {c.name => c.total_damage}}
|
199
|
+
=> [{"Skarner"=>129.70000000000002},
|
200
|
+
{"Cho'Gath"=>129.70000000000002},
|
201
|
+
{"Kassadin"=>122.5},
|
202
|
+
{"Taric"=>121.0},
|
203
|
+
{"Alistar"=>120.19}]
|
204
|
+
|
205
|
+
* how about using some of the indexes we defined above... like the 'nuke_index' (notice that the assumptions on what makes a good
|
206
|
+
nuke are subjective, but that's the fun of it; we can model our assumptions and see how the data changes in response.)
|
207
|
+
|
208
|
+
[5] pry(main)> ds.order(Sequel.desc(:nuke_index)).limit(5).collect{|c| {c.name => [c.total_damage, c.total_move_speed, c.total_mana, c.ability_power]}}
|
209
|
+
=> [{"Karthus"=>[100.7, 335.0, 1368.0, 10]},
|
210
|
+
{"Morgana"=>[114.58, 335.0, 1320.0, 9]},
|
211
|
+
{"Ryze"=>[106.0, 335.0, 1240.0, 10]},
|
212
|
+
{"Karma"=>[109.4, 335.0, 1320.0, 9]},
|
213
|
+
{"Lux"=>[109.4, 340.0, 1150.0, 10]}]
|
214
|
+
|
215
|
+
I must have hit close to the mark, because personally I hate each of these champions when I go up against them! ;)
|
216
|
+
|
217
|
+
* and (now I risk becoming addicted to datahut myself), here are some further guesses with an easy_nuke index:
|
218
|
+
|
219
|
+
[2] pry(main)> ds.order(Sequel.desc(:easy_nuke_index)).limit(5).collect{|c| c.name}
|
220
|
+
=> ["Sona", "Ryze", "Nasus", "Soraka", "Heimerdinger"]
|
221
|
+
|
222
|
+
* makes sense, but is still fascinating... what about my crack at a support_index?
|
223
|
+
|
224
|
+
[3] pry(main)> ds.order(Sequel.desc(:support_index)).limit(5).collect{|c| c.name}
|
225
|
+
=> ["Sion", "Diana", "Nunu", "Nautilus", "Amumu"]
|
226
|
+
|
227
|
+
You get the idea now! *Extract* your data from anywhere, *transform* it however you like and *analyze* it for insights!
|
228
|
+
|
229
|
+
Have fun!
|
230
|
+
|
231
|
+
|
232
|
+
## TODOS
|
233
|
+
|
234
|
+
* fill out tests
|
235
|
+
* further optimizations
|
236
|
+
* time-based series and binning helpers (by week/day/hour/5-min/etc).
|
237
|
+
|
238
|
+
## Contributing
|
239
|
+
|
240
|
+
1. Fork it
|
241
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
242
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
243
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
244
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require 'rake/testtask'
|
4
|
+
|
5
|
+
Rake::TestTask.new do |t|
|
6
|
+
t.libs << 'lib'
|
7
|
+
t.test_files = FileList['test/**/*_test.rb']
|
8
|
+
t.verbose = true
|
9
|
+
end
|
10
|
+
task :default => :test
|
11
|
+
|
12
|
+
desc "clean up"
|
13
|
+
task :clean do
|
14
|
+
FileUtils.rm(FileList["*.db"], force: true, verbose: true)
|
15
|
+
end
|
data/data_hut.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/data_hut/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.authors = ["Larry Kyrala"]
|
6
|
+
gem.email = ["larry.kyrala@gmail.com"]
|
7
|
+
gem.description = %q{A small, portable data warehouse for Ruby for analytics on anything!}
|
8
|
+
gem.summary = %q{Like a data warehouse, but smaller.}
|
9
|
+
gem.homepage = "https://github.com/coldnebo/data_hut"
|
10
|
+
|
11
|
+
gem.files = `git ls-files`.split($\)
|
12
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
|
+
gem.name = "data_hut"
|
15
|
+
gem.require_paths = ["lib"]
|
16
|
+
gem.version = DataHut::VERSION
|
17
|
+
|
18
|
+
gem.add_dependency 'sequel'
|
19
|
+
gem.add_dependency 'sqlite3'
|
20
|
+
|
21
|
+
gem.add_development_dependency 'mocha'
|
22
|
+
gem.add_development_dependency 'pry'
|
23
|
+
end
|
data/lib/data_hut.rb
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
require 'sequel'
|
2
|
+
require 'ostruct'
|
3
|
+
require 'logger'
|
4
|
+
|
5
|
+
module DataHut
|
6
|
+
class DataWarehouse
|
7
|
+
private_class_method :new
|
8
|
+
|
9
|
+
def self.connect(name)
|
10
|
+
new(name)
|
11
|
+
end
|
12
|
+
|
13
|
+
def initialize(name)
|
14
|
+
@db_file = "#{name}.db"
|
15
|
+
@db = Sequel.sqlite(@db_file)
|
16
|
+
#@db.logger = ::Logger.new(STDOUT)
|
17
|
+
unless @db.table_exists?(:data_warehouse)
|
18
|
+
@db.create_table(:data_warehouse) do
|
19
|
+
primary_key :dw_id
|
20
|
+
column :dw_processed, TrueClass, :null => false, :default => false
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def dataset
|
26
|
+
Class.new(Sequel::Model(@db[:data_warehouse]))
|
27
|
+
end
|
28
|
+
|
29
|
+
def extract(data)
|
30
|
+
raise(ArgumentError, "a block is required for extract.", caller) unless block_given?
|
31
|
+
|
32
|
+
data.each do |d|
|
33
|
+
r = OpenStruct.new
|
34
|
+
yield r, d
|
35
|
+
store(r)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# transform records not yet marked processed (or every record when forced is true)
|
40
|
+
def transform(forced=false)
|
41
|
+
raise(ArgumentError, "a block is required for transform.", caller) unless block_given?
|
42
|
+
|
43
|
+
# now process all the records with the updated schema...
|
44
|
+
@db[:data_warehouse].each do |h|
|
45
|
+
# check for processed if not forced
|
46
|
+
unless forced
|
47
|
+
next if h[:dw_processed] == true
|
48
|
+
end
|
49
|
+
# then get rid of the internal id and processed flags
|
50
|
+
dw_id = h.delete(:dw_id)
|
51
|
+
h.delete(:dw_processed)
|
52
|
+
# copy record fields to an openstruct
|
53
|
+
r = OpenStruct.new(h)
|
54
|
+
# and let the transformer modify it...
|
55
|
+
yield r
|
56
|
+
# now add any new transformation fields to the schema...
|
57
|
+
adapt_schema(r)
|
58
|
+
# get the update hash from the openstruct
|
59
|
+
h = r.marshal_dump
|
60
|
+
# and use it to update the record
|
61
|
+
@db[:data_warehouse].where(dw_id: dw_id).update(h)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def transform_complete
|
66
|
+
@db[:data_warehouse].update(:dw_processed => true)
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
def store(r)
|
72
|
+
adapt_schema(r)
|
73
|
+
h = r.marshal_dump
|
74
|
+
# don't insert dups
|
75
|
+
unless @db[:data_warehouse].where(h).count > 0
|
76
|
+
@db[:data_warehouse].insert(h)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
def adapt_schema(r)
|
81
|
+
h = r.marshal_dump
|
82
|
+
h.keys.each do |key|
|
83
|
+
type = h[key].class
|
84
|
+
unless @db[:data_warehouse].columns.include?(key)
|
85
|
+
@db.alter_table(:data_warehouse) do
|
86
|
+
add_column key, type
|
87
|
+
add_index key
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
end
|
94
|
+
end
|
data/scratch
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# allows this script to behave as if the gem were installed.
|
3
|
+
lp = File.expand_path(File.join(*%w[lib]), File.dirname(__FILE__))
|
4
|
+
unless $LOAD_PATH.include?(lp)
|
5
|
+
$LOAD_PATH.unshift(lp)
|
6
|
+
end
|
7
|
+
|
8
|
+
# ------------------ manual example of client usage for docs, etc. -----------
|
9
|
+
|
10
|
+
require 'data_hut'
|
11
|
+
require 'pry'
|
12
|
+
|
13
|
+
dh = DataHut.connect("scratch")
|
14
|
+
|
15
|
+
data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') },
|
16
|
+
{name: "phil", age: 31},
|
17
|
+
{name: "fred", age: 44, login: DateTime.parse('2013-02-07')}]
|
18
|
+
|
19
|
+
# extract your data by iterating over your data format (from whatever source) and map it to a record model...
|
20
|
+
dh.extract(data) do |r, d|
|
21
|
+
r.name = d[:name]
|
22
|
+
r.age = d[:age]
|
23
|
+
# data quality step:
|
24
|
+
d[:login] = DateTime.new unless d.has_key?(:login)
|
25
|
+
r.last_active = d[:login]
|
26
|
+
print 'v'
|
27
|
+
end
|
28
|
+
|
29
|
+
# transform your data by adding fields to it
|
30
|
+
dh.transform do |r|
|
31
|
+
r.eligible = r.age < 30
|
32
|
+
print '*'
|
33
|
+
end
|
34
|
+
|
35
|
+
dh.transform_complete
|
36
|
+
|
37
|
+
# later... you have more data...
|
38
|
+
data = [{name: "sarah", age: 24, login: DateTime.parse('2011-04-01') },
|
39
|
+
{name: "robin", age: 45},
|
40
|
+
{name: "jane", age: 19, login: DateTime.parse('2012-10-14')}]
|
41
|
+
|
42
|
+
# add it too...
|
43
|
+
dh.extract(data) do |r, d|
|
44
|
+
r.name = d[:name]
|
45
|
+
r.age = d[:age]
|
46
|
+
# data quality step:
|
47
|
+
d[:login] = DateTime.new unless d.has_key?(:login)
|
48
|
+
r.last_active = d[:login]
|
49
|
+
print 'v'
|
50
|
+
end
|
51
|
+
|
52
|
+
# and only transform the new records automatically
|
53
|
+
dh.transform do |r|
|
54
|
+
r.eligible = r.age < 30
|
55
|
+
print '*'
|
56
|
+
end
|
57
|
+
|
58
|
+
dh.transform_complete
|
59
|
+
|
60
|
+
|
61
|
+
# operate on your dataset by using chained queries
|
62
|
+
ds = dh.dataset
|
63
|
+
|
64
|
+
#binding.pry
|
65
|
+
|
66
|
+
|
67
|
+
# clean up scratch demo
|
68
|
+
FileUtils.rm("scratch.db")
|
69
|
+
puts "done."
|
70
|
+
|
71
|
+
#exit
|
72
|
+
|
73
|
+
# ------------------------------ a more ambitious example -------------------------
|
74
|
+
|
75
|
+
require 'nokogiri'
|
76
|
+
require 'open-uri'
|
77
|
+
require 'pry'
|
78
|
+
|
79
|
+
root = 'http://na.leagueoflegends.com'
|
80
|
+
|
81
|
+
# load the data once... (manually delete it to refresh)
|
82
|
+
unless File.exists?("lolstats.db")
|
83
|
+
dh = DataHut.connect("lolstats")
|
84
|
+
|
85
|
+
champions_page = Nokogiri::HTML(open("#{root}/champions"))
|
86
|
+
|
87
|
+
urls = champions_page.css('table.champion_item td.description span a').collect{|e| e.attribute('href').value}
|
88
|
+
|
89
|
+
# keep the powers for later since they are on different pages.
|
90
|
+
powers = {}
|
91
|
+
champions_page.css('table.champion_item').each do |c|
|
92
|
+
name = c.css('td.description span.highlight a').text
|
93
|
+
attack = c.css('td.graphing td.filled_attack').count
|
94
|
+
health = c.css('td.graphing td.filled_health').count
|
95
|
+
spells = c.css('td.graphing td.filled_spells').count
|
96
|
+
difficulty = c.css('td.graphing td.filled_difficulty').count
|
97
|
+
powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
|
98
|
+
end
|
99
|
+
|
100
|
+
puts "loading champion data"
|
101
|
+
dh.extract(urls) do |r, url|
|
102
|
+
champion_page = Nokogiri::HTML(open("#{root}#{url}"))
|
103
|
+
r.name = champion_page.css('div.page_header_text').text
|
104
|
+
|
105
|
+
st = champion_page.css('table.stats_table')
|
106
|
+
names = st.css('td.stats_name').collect{|e| e.text.strip}
|
107
|
+
values = st.css('td.stats_value').collect{|e| e.text.strip}
|
108
|
+
modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
|
109
|
+
|
110
|
+
(0..names.count-1).collect do |i|
|
111
|
+
stat = (names[i].downcase.gsub(/ /,'_') << "=").to_sym
|
112
|
+
r.send(stat, values[i].to_f)
|
113
|
+
stat_per_level = (names[i].downcase.gsub(/ /,'_') << "_per_level=").to_sym
|
114
|
+
per_level_value = modifiers[i].match(/\+([\d\.]+)/)[1].to_f rescue 0
|
115
|
+
r.send(stat_per_level, per_level_value)
|
116
|
+
end
|
117
|
+
|
118
|
+
# add the powers for this champion...
|
119
|
+
power = powers[r.name]
|
120
|
+
r.attack_power = power[:attack_power]
|
121
|
+
r.defense_power = power[:defense_power]
|
122
|
+
r.ability_power = power[:ability_power]
|
123
|
+
r.difficulty = power[:difficulty]
|
124
|
+
|
125
|
+
print "."
|
126
|
+
end
|
127
|
+
puts "done."
|
128
|
+
end
|
129
|
+
|
130
|
+
dh = DataHut.connect("lolstats")
|
131
|
+
|
132
|
+
puts "first transform"
|
133
|
+
dh.transform do |r|
|
134
|
+
r.total_damage = r.damage + (r.damage_per_level * 18.0)
|
135
|
+
r.total_health = r.health + (r.health_per_level * 18.0)
|
136
|
+
r.total_mana = r.mana + (r.mana_per_level * 18.0)
|
137
|
+
r.total_move_speed = r.move_speed + (r.move_speed_per_level * 18.0)
|
138
|
+
r.total_armor = r.armor + (r.armor_per_level * 18.0)
|
139
|
+
r.total_spell_block = r.spell_block + (r.spell_block_per_level * 18.0)
|
140
|
+
r.total_health_regen = r.health_regen + (r.health_regen_per_level * 18.0)
|
141
|
+
r.total_mana_regen = r.mana_regen + (r.mana_regen_per_level * 18.0)
|
142
|
+
print '.'
|
143
|
+
end
|
144
|
+
|
145
|
+
puts "second transform"
|
146
|
+
# there's no need to do transforms all in one batch either... you can layer them...
|
147
|
+
dh.transform(true) do |r|
|
148
|
+
# each index combines the dimensions above into a single score (a simple product of the factors)
|
149
|
+
r.nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power)
|
150
|
+
r.easy_nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power) * (1.0/r.difficulty)
|
151
|
+
r.tenacious_index = r.total_armor * r.total_health * r.total_spell_block * r.total_health_regen * (r.defense_power)
|
152
|
+
r.support_index = r.total_mana * r.total_armor * r.total_spell_block * r.total_health * r.total_health_regen * r.total_mana_regen * (r.ability_power * r.defense_power)
|
153
|
+
print '.'
|
154
|
+
end
|
155
|
+
|
156
|
+
# use once at the end to mark records processed.
|
157
|
+
dh.transform_complete
|
158
|
+
puts "transforms complete"
|
159
|
+
|
160
|
+
ds = dh.dataset
|
161
|
+
|
162
|
+
binding.pry
|
163
|
+
|
164
|
+
puts "done."
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require_relative File.join(*%w[.. test_helper])
|
2
|
+
|
3
|
+
|
4
|
+
describe DataHut do
|
5
|
+
def teardown
|
6
|
+
FileUtils.rm("foo.db", force: true, verbose: true)
|
7
|
+
end
|
8
|
+
|
9
|
+
describe "gem loading" do
|
10
|
+
it "must be defined" do
|
11
|
+
DataHut::VERSION.wont_be_nil
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "connect" do
|
16
|
+
it "should create a database if none exists" do
|
17
|
+
FileUtils.rm("foo.db", force: true, verbose: true)
|
18
|
+
dh = DataHut.connect("foo")
|
19
|
+
assert File.exists?("foo.db")
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe "extract" do
|
24
|
+
it "should support extracting data" do
|
25
|
+
dh = DataHut.connect("foo")
|
26
|
+
|
27
|
+
data = [{name: "barney", age: 27},
|
28
|
+
{name: "phil", age: 31},
|
29
|
+
{name: "fred", age: 44}]
|
30
|
+
|
31
|
+
# ignore dups!!
|
32
|
+
data2 = [{name: "barney", age: 27},
|
33
|
+
{name: "phil", age: 31},{name: "phil", age: 31},
|
34
|
+
{name: "fred", age: 44}]
|
35
|
+
|
36
|
+
# the idea of the extract phase is that you control exactly how an element of your data 'd' is
|
37
|
+
# extracted into a transactional record 'r' in the data warehouse.
|
38
|
+
dh.extract(data2) do |r, d|
|
39
|
+
r.name = d[:name]
|
40
|
+
r.age = d[:age]
|
41
|
+
end
|
42
|
+
|
43
|
+
dh.dataset.each_with_index do |r,i|
|
44
|
+
assert r.name == data[i][:name]
|
45
|
+
assert_kind_of(data[i][:name].class, r.name)
|
46
|
+
assert r.age == data[i][:age]
|
47
|
+
assert_kind_of(data[i][:age].class, r.age)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require_relative File.join(*%w[.. test_helper])
|
2
|
+
|
3
|
+
|
4
|
+
class DataWarehouseTest < MiniTest::Unit::TestCase
|
5
|
+
def setup
|
6
|
+
end
|
7
|
+
|
8
|
+
def test_cannot_instaniate
|
9
|
+
assert_raises(NoMethodError) do
|
10
|
+
dw = DataHut::DataWarehouse.new
|
11
|
+
end
|
12
|
+
|
13
|
+
# assert_equal "OHAI!", @meme.i_can_has_cheezburger?
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
metadata
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: data_hut
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Larry Kyrala
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-02-09 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: sequel
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: sqlite3
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: mocha
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: pry
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
description: A small, portable data warehouse for Ruby for analytics on anything!
|
79
|
+
email:
|
80
|
+
- larry.kyrala@gmail.com
|
81
|
+
executables: []
|
82
|
+
extensions: []
|
83
|
+
extra_rdoc_files: []
|
84
|
+
files:
|
85
|
+
- .gitignore
|
86
|
+
- .rvmrc
|
87
|
+
- CHANGELOG.md
|
88
|
+
- Gemfile
|
89
|
+
- LICENSE
|
90
|
+
- README.md
|
91
|
+
- Rakefile
|
92
|
+
- data_hut.gemspec
|
93
|
+
- lib/data_hut.rb
|
94
|
+
- lib/data_hut/data_warehouse.rb
|
95
|
+
- lib/data_hut/version.rb
|
96
|
+
- scratch
|
97
|
+
- test/spec/basic_test.rb
|
98
|
+
- test/test_helper.rb
|
99
|
+
- test/unit/data_warehouse_test.rb
|
100
|
+
homepage: https://github.com/coldnebo/data_hut
|
101
|
+
licenses: []
|
102
|
+
post_install_message:
|
103
|
+
rdoc_options: []
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ! '>='
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
requirements: []
|
119
|
+
rubyforge_project:
|
120
|
+
rubygems_version: 1.8.24
|
121
|
+
signing_key:
|
122
|
+
specification_version: 3
|
123
|
+
summary: Like a data warehouse, but smaller.
|
124
|
+
test_files:
|
125
|
+
- test/spec/basic_test.rb
|
126
|
+
- test/test_helper.rb
|
127
|
+
- test/unit/data_warehouse_test.rb
|