data_hut 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.0.6
4
+
5
+ * externalized the Sequel database logger so that it can be set by DataHut clients. See DataHut::DataWarehouse#logger=
6
+
7
+ * added type checking on extract and transform to ensure safe operation with the underlying Sequel sqlite3 database.
8
+
3
9
  ## 0.0.5
4
10
 
5
11
  * added rdoc
data/README.md CHANGED
@@ -229,7 +229,6 @@ Have fun!
229
229
 
230
230
  ## TODOS
231
231
 
232
- * fill out tests
233
232
  * further optimizations
234
233
  * time-based series and binning helpers (by week/day/hour/5-min/etc).
235
234
 
data/Rakefile CHANGED
@@ -11,5 +11,5 @@ task :default => :test
11
11
 
12
12
  desc "clean up"
13
13
  task :clean do
14
- FileUtils.rm(FileList["*.db"], force: true, verbose: true)
14
+ FileUtils.rm(FileList["samples/**/*.db"], force: true, verbose: true)
15
15
  end
@@ -78,7 +78,6 @@ module DataHut
78
78
  # @yield [record, element] lets you control the mapping of data elements to record fields
79
79
  # @yieldparam record an OpenStruct that allows you to create fields dynamically on the record as needed.
80
80
  # These fields will automatically be added to the schema behind the DataHut using the ruby data type you assigned to the record.
81
- # *NOTE* that you must use DateTime or Time objects as Date objects are not supported.
82
81
  # See {http://sequel.rubyforge.org/rdoc/files/doc/schema_modification_rdoc.html Sequel Schema Modification Methods} for
83
82
  # more information about supported ruby data types you can use.
84
83
  # @yieldparam element an element from your data.
@@ -108,7 +107,6 @@ module DataHut
108
107
  # @yieldparam record an OpenStruct that fronts the DataHut record. You may access existing fields on this record or create new
109
108
  # fields to store synthetic data from a transform pass.
110
109
  # These fields will automatically be added to the schema behind the DataHut using the ruby data type you assigned to the record.
111
- # *NOTE* that you must use DateTime or Time objects as Date objects are not supported.
112
110
  # See {http://sequel.rubyforge.org/rdoc/files/doc/schema_modification_rdoc.html Sequel Schema Modification Methods} for
113
111
  # more information about supported ruby data types you can use.
114
112
  # @raise [ArgumentError] if you don't provide a block
@@ -150,12 +148,25 @@ module DataHut
150
148
  @db[:data_warehouse].update(:dw_processed => true)
151
149
  end
152
150
 
151
+ # attach a Logger to the underlying Sequel database so that you can debug or monitor database actions.
152
+ # See {http://sequel.rubyforge.org/rdoc/classes/Sequel/Database.html#method-i-logger-3D Sequel::Database#logger=}.
153
+ #
154
+ # @example
155
+ # dh.logger = Logger.new(STDOUT)
156
+ #
157
+ # @param logger [Logger] a logger for the underlying Sequel actions.
158
+ # @raise [ArgumentError] if passed a logger that is not a kind of {http://www.ruby-doc.org/stdlib-1.9.3/libdoc/logger/rdoc/Logger.html Logger}.
159
+ def logger=(logger)
160
+ raise(ArgumentError, "logger must be a type of Logger.") unless logger.kind_of?(Logger)
161
+ @db.logger = logger
162
+ end
163
+
153
164
  private
154
165
 
155
166
  def initialize(name)
156
167
  @db_file = "#{name}.db"
157
168
  @db = Sequel.sqlite(@db_file)
158
- #@db.logger = ::Logger.new(STDOUT)
169
+
159
170
  unless @db.table_exists?(:data_warehouse)
160
171
  @db.create_table(:data_warehouse) do
161
172
  primary_key :dw_id
@@ -177,6 +188,9 @@ module DataHut
177
188
  h = r.marshal_dump
178
189
  h.keys.each do |key|
179
190
  type = h[key].class
191
+ unless Sequel::Schema::CreateTableGenerator::GENERIC_TYPES.include?(type)
192
+ raise(ArgumentError, "DataHut: Ruby type '#{type}' not supported by Sequel. Must be one of the supported types: #{Sequel::Schema::CreateTableGenerator::GENERIC_TYPES.inspect}", caller)
193
+ end
180
194
  unless @db[:data_warehouse].columns.include?(key)
181
195
  @db.alter_table(:data_warehouse) do
182
196
  add_column key, type
@@ -1,3 +1,3 @@
1
1
  module DataHut
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -0,0 +1,55 @@
1
+
2
+ # run from the samples dir with:
3
+ # $ ruby basic.rb
4
+
5
+ require_relative 'sample_helper.rb'
6
+
7
+ require 'data_hut'
8
+ require 'pry'
9
+
10
+ dh = DataHut.connect("sample")
11
+
12
+ data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') },
13
+ {name: "phil", age: 31},
14
+ {name: "fred", age: 44, login: DateTime.parse('2013-02-07')},
15
+ {name: "sarah", age: 24, login: DateTime.parse('2011-04-01')},
16
+ {name: "robin", age: 45},
17
+ {name: "jane", age: 19, login: DateTime.parse('2012-10-14')}]
18
+
19
+ # extract your data by iterating over your data format (from whatever source) and map it to a record model...
20
+ puts "extracting data"
21
+ dh.extract(data) do |r, d|
22
+ r.name = d[:name]
23
+ r.age = d[:age]
24
+ # data quality step:
25
+ d[:login] = DateTime.new unless d.has_key?(:login)
26
+ r.last_active = d[:login]
27
+ print '.'
28
+ end
29
+
30
+ # transform your data by adding fields to it (only new, unprocessed records are transformed)
31
+ puts "\ntransforming data"
32
+ dh.transform do |r|
33
+ r.eligible = r.age < 30
34
+ print '*'
35
+ end
36
+
37
+ dh.transform_complete
38
+ puts "\ndone."
39
+
40
+ # operate on your dataset by using chained queries
41
+ ds = dh.dataset
42
+
43
+ ds.each{|d| puts d.inspect}
44
+
45
+ puts "Average age: #{ds.avg(:age)}"
46
+
47
+ puts "Eligible:"
48
+ eligible = ds.where(eligible:true)
49
+ eligible.each{|d| puts d.inspect}
50
+
51
+ binding.pry
52
+
53
+ # clean up scratch demo
54
+ FileUtils.rm("sample.db")
55
+ puts "done."
@@ -0,0 +1,97 @@
1
+
2
+ # run from the samples dir with:
3
+ # $ ruby league_of_legends.rb
4
+
5
+ require_relative 'sample_helper.rb'
6
+
7
+ require 'data_hut'
8
+ require 'nokogiri'
9
+ require 'open-uri'
10
+ require 'pry'
11
+
12
+ root = 'http://na.leagueoflegends.com'
13
+
14
+ # load the data once... (manually delete it to refresh)
15
+ unless File.exists?("lolstats.db")
16
+ dh = DataHut.connect("lolstats")
17
+
18
+ champions_page = Nokogiri::HTML(open("#{root}/champions"))
19
+
20
+ urls = champions_page.css('table.champion_item td.description span a').collect{|e| e.attribute('href').value}
21
+
22
+ # keep the powers for later since they are on different pages.
23
+ powers = {}
24
+ champions_page.css('table.champion_item').each do |c|
25
+ name = c.css('td.description span.highlight a').text
26
+ attack = c.css('td.graphing td.filled_attack').count
27
+ health = c.css('td.graphing td.filled_health').count
28
+ spells = c.css('td.graphing td.filled_spells').count
29
+ difficulty = c.css('td.graphing td.filled_difficulty').count
30
+ powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
31
+ end
32
+
33
+ puts "loading champion data"
34
+ dh.extract(urls) do |r, url|
35
+ champion_page = Nokogiri::HTML(open("#{root}#{url}"))
36
+ r.name = champion_page.css('div.page_header_text').text
37
+
38
+ st = champion_page.css('table.stats_table')
39
+ names = st.css('td.stats_name').collect{|e| e.text.strip}
40
+ values = st.css('td.stats_value').collect{|e| e.text.strip}
41
+ modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
42
+
43
+ (0..names.count-1).collect do |i|
44
+ stat = (names[i].downcase.gsub(/ /,'_') << "=").to_sym
45
+ r.send(stat, values[i].to_f)
46
+ stat_per_level = (names[i].downcase.gsub(/ /,'_') << "_per_level=").to_sym
47
+ per_level_value = modifiers[i].match(/\+([\d\.]+)/)[1].to_f rescue 0
48
+ r.send(stat_per_level, per_level_value)
49
+ end
50
+
51
+ # add the powers for this champion...
52
+ power = powers[r.name]
53
+ r.attack_power = power[:attack_power]
54
+ r.defense_power = power[:defense_power]
55
+ r.ability_power = power[:ability_power]
56
+ r.difficulty = power[:difficulty]
57
+
58
+ print "."
59
+ end
60
+ puts "done."
61
+ end
62
+
63
+ dh = DataHut.connect("lolstats")
64
+
65
+ puts "first transform"
66
+ dh.transform do |r|
67
+ r.total_damage = r.damage + (r.damage_per_level * 18.0)
68
+ r.total_health = r.health + (r.health_per_level * 18.0)
69
+ r.total_mana = r.mana + (r.mana_per_level * 18.0)
70
+ r.total_move_speed = r.move_speed + (r.move_speed_per_level * 18.0)
71
+ r.total_armor = r.armor + (r.armor_per_level * 18.0)
72
+ r.total_spell_block = r.spell_block + (r.spell_block_per_level * 18.0)
73
+ r.total_health_regen = r.health_regen + (r.health_regen_per_level * 18.0)
74
+ r.total_mana_regen = r.mana_regen + (r.mana_regen_per_level * 18.0)
75
+ print '.'
76
+ end
77
+
78
+ puts "second transform"
79
+ # there's no need to do transforms all in one batch either... you can layer them...
80
+ dh.transform(true) do |r|
81
+ # this index combines the tank dimensions above for best combination (simple Euclidean metric)
82
+ r.nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power)
83
+ r.easy_nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power) * (1.0/r.difficulty)
84
+ r.tenacious_index = r.total_armor * r.total_health * r.total_spell_block * r.total_health_regen * (r.defense_power)
85
+ r.support_index = r.total_mana * r.total_armor * r.total_spell_block * r.total_health * r.total_health_regen * r.total_mana_regen * (r.ability_power * r.defense_power)
86
+ print '.'
87
+ end
88
+
89
+ # use once at the end to mark records processed.
90
+ dh.transform_complete
91
+ puts "transforms complete"
92
+
93
+ ds = dh.dataset
94
+
95
+ binding.pry
96
+
97
+ puts "done."
@@ -0,0 +1,7 @@
1
+ # sample helper takes care of loading the gem from source without requiring it to be rebuilt and installed.
2
+ # this is useful in allowing the samples in this directory to evolve the behavior of the actual gem.
3
+
4
+ lp = File.expand_path(File.join(*%w[.. lib]), File.dirname(__FILE__))
5
+ unless $LOAD_PATH.include?(lp)
6
+ $LOAD_PATH.unshift(lp)
7
+ end
@@ -136,5 +136,43 @@ describe DataHut do
136
136
 
137
137
  end
138
138
 
139
+
140
+ describe "nice usage" do
141
+
142
+ class Foo
143
+ end
144
+
145
+ it "should provide logging services to see or debug underlying Sequel" do
146
+ dh = DataHut.connect("foo")
147
+
148
+ dh.logger = ::Logger.new(STDOUT)
149
+
150
+ assert_raises(ArgumentError) do
151
+ dh.logger = Foo.new
152
+ end
153
+
154
+ end
155
+
156
+ it "should handle type errors" do
157
+ dh = DataHut.connect("foo")
158
+
159
+ data = [{name: "fred", birthday: '1978-02-11'}]
160
+
161
+ # how about dates?
162
+ dh.extract(data) do |r, d|
163
+ r.name = d[:name]
164
+ r.birthday = Date.parse(d[:birthday])
165
+ end
166
+
167
+ # ok, but what about a custom type... that's guaranteed to fail!
168
+ assert_raises(ArgumentError) do
169
+ dh.transform do |r|
170
+ r.my_foo = Foo.new
171
+ end
172
+ end
173
+ end
174
+
175
+ end
176
+
139
177
  end
140
178
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_hut
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-09 00:00:00.000000000 Z
12
+ date: 2013-02-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: sequel
@@ -141,7 +141,9 @@ files:
141
141
  - lib/data_hut.rb
142
142
  - lib/data_hut/data_warehouse.rb
143
143
  - lib/data_hut/version.rb
144
- - scratch
144
+ - samples/basic.rb
145
+ - samples/league_of_legends.rb
146
+ - samples/sample_helper.rb
145
147
  - test/spec/basic_test.rb
146
148
  - test/test_helper.rb
147
149
  - test/unit/data_warehouse_test.rb
data/scratch DELETED
@@ -1,164 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # allows this script to behave as if the gem were installed.
3
- lp = File.expand_path(File.join(*%w[lib]), File.dirname(__FILE__))
4
- unless $LOAD_PATH.include?(lp)
5
- $LOAD_PATH.unshift(lp)
6
- end
7
-
8
- # ------------------ manual example of client usage for docs, etc. -----------
9
-
10
- require 'data_hut'
11
- require 'pry'
12
-
13
- dh = DataHut.connect("scratch")
14
-
15
- data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') },
16
- {name: "phil", age: 31},
17
- {name: "fred", age: 44, login: DateTime.parse('2013-02-07')}]
18
-
19
- # extract your data by iterating over your data format (from whatever source) and map it to a record model...
20
- dh.extract(data) do |r, d|
21
- r.name = d[:name]
22
- r.age = d[:age]
23
- # data quality step:
24
- d[:login] = DateTime.new unless d.has_key?(:login)
25
- r.last_active = d[:login]
26
- print 'v'
27
- end
28
-
29
- # transform your data by adding fields to it
30
- dh.transform do |r|
31
- r.eligible = r.age < 30
32
- print '*'
33
- end
34
-
35
- dh.transform_complete
36
-
37
- # later... you have more data...
38
- data = [{name: "sarah", age: 24, login: DateTime.parse('2011-04-01') },
39
- {name: "robin", age: 45},
40
- {name: "jane", age: 19, login: DateTime.parse('2012-10-14')}]
41
-
42
- # add it too...
43
- dh.extract(data) do |r, d|
44
- r.name = d[:name]
45
- r.age = d[:age]
46
- # data quality step:
47
- d[:login] = DateTime.new unless d.has_key?(:login)
48
- r.last_active = d[:login]
49
- print 'v'
50
- end
51
-
52
- # and only transform the new records automatically
53
- dh.transform do |r|
54
- r.eligible = r.age < 30
55
- print '*'
56
- end
57
-
58
- dh.transform_complete
59
-
60
-
61
- # operate on your dataset by using chained queries
62
- ds = dh.dataset
63
-
64
- #binding.pry
65
-
66
-
67
- # clean up scratch demo
68
- FileUtils.rm("scratch.db")
69
- puts "done."
70
-
71
- #exit
72
-
73
- # ------------------------------ a more ambitious example -------------------------
74
-
75
- require 'nokogiri'
76
- require 'open-uri'
77
- require 'pry'
78
-
79
- root = 'http://na.leagueoflegends.com'
80
-
81
- # load the data once... (manually delete it to refresh)
82
- unless File.exists?("lolstats.db")
83
- dh = DataHut.connect("lolstats")
84
-
85
- champions_page = Nokogiri::HTML(open("#{root}/champions"))
86
-
87
- urls = champions_page.css('table.champion_item td.description span a').collect{|e| e.attribute('href').value}
88
-
89
- # keep the powers for later since they are on different pages.
90
- powers = {}
91
- champions_page.css('table.champion_item').each do |c|
92
- name = c.css('td.description span.highlight a').text
93
- attack = c.css('td.graphing td.filled_attack').count
94
- health = c.css('td.graphing td.filled_health').count
95
- spells = c.css('td.graphing td.filled_spells').count
96
- difficulty = c.css('td.graphing td.filled_difficulty').count
97
- powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
98
- end
99
-
100
- puts "loading champion data"
101
- dh.extract(urls) do |r, url|
102
- champion_page = Nokogiri::HTML(open("#{root}#{url}"))
103
- r.name = champion_page.css('div.page_header_text').text
104
-
105
- st = champion_page.css('table.stats_table')
106
- names = st.css('td.stats_name').collect{|e| e.text.strip}
107
- values = st.css('td.stats_value').collect{|e| e.text.strip}
108
- modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
109
-
110
- (0..names.count-1).collect do |i|
111
- stat = (names[i].downcase.gsub(/ /,'_') << "=").to_sym
112
- r.send(stat, values[i].to_f)
113
- stat_per_level = (names[i].downcase.gsub(/ /,'_') << "_per_level=").to_sym
114
- per_level_value = modifiers[i].match(/\+([\d\.]+)/)[1].to_f rescue 0
115
- r.send(stat_per_level, per_level_value)
116
- end
117
-
118
- # add the powers for this champion...
119
- power = powers[r.name]
120
- r.attack_power = power[:attack_power]
121
- r.defense_power = power[:defense_power]
122
- r.ability_power = power[:ability_power]
123
- r.difficulty = power[:difficulty]
124
-
125
- print "."
126
- end
127
- puts "done."
128
- end
129
-
130
- dh = DataHut.connect("lolstats")
131
-
132
- puts "first transform"
133
- dh.transform do |r|
134
- r.total_damage = r.damage + (r.damage_per_level * 18.0)
135
- r.total_health = r.health + (r.health_per_level * 18.0)
136
- r.total_mana = r.mana + (r.mana_per_level * 18.0)
137
- r.total_move_speed = r.move_speed + (r.move_speed_per_level * 18.0)
138
- r.total_armor = r.armor + (r.armor_per_level * 18.0)
139
- r.total_spell_block = r.spell_block + (r.spell_block_per_level * 18.0)
140
- r.total_health_regen = r.health_regen + (r.health_regen_per_level * 18.0)
141
- r.total_mana_regen = r.mana_regen + (r.mana_regen_per_level * 18.0)
142
- print '.'
143
- end
144
-
145
- puts "second transform"
146
- # there's no need to do transforms all in one batch either... you can layer them...
147
- dh.transform(true) do |r|
148
- # this index combines the tank dimensions above for best combination (simple Euclidean metric)
149
- r.nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power)
150
- r.easy_nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power) * (1.0/r.difficulty)
151
- r.tenacious_index = r.total_armor * r.total_health * r.total_spell_block * r.total_health_regen * (r.defense_power)
152
- r.support_index = r.total_mana * r.total_armor * r.total_spell_block * r.total_health * r.total_health_regen * r.total_mana_regen * (r.ability_power * r.defense_power)
153
- print '.'
154
- end
155
-
156
- # use once at the end to mark records processed.
157
- dh.transform_complete
158
- puts "transforms complete"
159
-
160
- ds = dh.dataset
161
-
162
- binding.pry
163
-
164
- puts "done."