data_hut 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.0.6
4
+
5
+ * externalized the Sequel database logger so that it can be set by DataHut clients. See DataHut::DataWarehouse#logger=
6
+
7
+ * added type checking on extract and transform to ensure safe operation with the underlying Sequel sqlite3 database.
8
+
3
9
  ## 0.0.5
4
10
 
5
11
  * added rdoc
data/README.md CHANGED
@@ -229,7 +229,6 @@ Have fun!
229
229
 
230
230
  ## TODOS
231
231
 
232
- * fill out tests
233
232
  * further optimizations
234
233
  * time-based series and binning helpers (by week/day/hour/5-min/etc).
235
234
 
data/Rakefile CHANGED
@@ -11,5 +11,5 @@ task :default => :test
11
11
 
12
12
  desc "clean up"
13
13
  task :clean do
14
- FileUtils.rm(FileList["*.db"], force: true, verbose: true)
14
+ FileUtils.rm(FileList["samples/**/*.db"], force: true, verbose: true)
15
15
  end
@@ -78,7 +78,6 @@ module DataHut
78
78
  # @yield [record, element] lets you control the mapping of data elements to record fields
79
79
  # @yieldparam record an OpenStruct that allows you to create fields dynamically on the record as needed.
80
80
  # These fields will automatically be added to the schema behind the DataHut using the ruby data type you assigned to the record.
81
- # *NOTE* that you must use DateTime or Time objects as Date objects are not supported.
82
81
  # See {http://sequel.rubyforge.org/rdoc/files/doc/schema_modification_rdoc.html Sequel Schema Modification Methods} for
83
82
  # more information about supported ruby data types you can use.
84
83
  # @yieldparam element an element from your data.
@@ -108,7 +107,6 @@ module DataHut
108
107
  # @yieldparam record an OpenStruct that fronts the DataHut record. You may access existing fields on this record or create new
109
108
  # fields to store synthetic data from a transform pass.
110
109
  # These fields will automatically be added to the schema behind the DataHut using the ruby data type you assigned to the record.
111
- # *NOTE* that you must use DateTime or Time objects as Date objects are not supported.
112
110
  # See {http://sequel.rubyforge.org/rdoc/files/doc/schema_modification_rdoc.html Sequel Schema Modification Methods} for
113
111
  # more information about supported ruby data types you can use.
114
112
  # @raise [ArgumentError] if you don't provide a block
@@ -150,12 +148,25 @@ module DataHut
150
148
  @db[:data_warehouse].update(:dw_processed => true)
151
149
  end
152
150
 
151
+ # attach a Logger to the underlying Sequel database so that you can debug or monitor database actions.
152
+ # See {http://sequel.rubyforge.org/rdoc/classes/Sequel/Database.html#method-i-logger-3D Sequel::Database#logger=}.
153
+ #
154
+ # @example
155
+ # dh.logger = Logger.new(STDOUT)
156
+ #
157
+ # @param logger [Logger] a logger for the underlying Sequel actions.
158
+ # @raise [ArgumentError] if passed a logger that is not a kind of {http://www.ruby-doc.org/stdlib-1.9.3/libdoc/logger/rdoc/Logger.html Logger}.
159
+ def logger=(logger)
160
+ raise(ArgumentError, "logger must be a type of Logger.") unless logger.kind_of?(Logger)
161
+ @db.logger = logger
162
+ end
163
+
153
164
  private
154
165
 
155
166
  def initialize(name)
156
167
  @db_file = "#{name}.db"
157
168
  @db = Sequel.sqlite(@db_file)
158
- #@db.logger = ::Logger.new(STDOUT)
169
+
159
170
  unless @db.table_exists?(:data_warehouse)
160
171
  @db.create_table(:data_warehouse) do
161
172
  primary_key :dw_id
@@ -177,6 +188,9 @@ module DataHut
177
188
  h = r.marshal_dump
178
189
  h.keys.each do |key|
179
190
  type = h[key].class
191
+ unless Sequel::Schema::CreateTableGenerator::GENERIC_TYPES.include?(type)
192
+ raise(ArgumentError, "DataHut: Ruby type '#{type}' not supported by Sequel. Must be one of the supported types: #{Sequel::Schema::CreateTableGenerator::GENERIC_TYPES.inspect}", caller)
193
+ end
180
194
  unless @db[:data_warehouse].columns.include?(key)
181
195
  @db.alter_table(:data_warehouse) do
182
196
  add_column key, type
@@ -1,3 +1,3 @@
1
1
  module DataHut
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -0,0 +1,55 @@
1
+
2
+ # run from the samples dir with:
3
+ # $ ruby basic.rb
4
+
5
+ require_relative 'sample_helper.rb'
6
+
7
+ require 'data_hut'
8
+ require 'pry'
9
+
10
+ dh = DataHut.connect("sample")
11
+
12
+ data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') },
13
+ {name: "phil", age: 31},
14
+ {name: "fred", age: 44, login: DateTime.parse('2013-02-07')},
15
+ {name: "sarah", age: 24, login: DateTime.parse('2011-04-01')},
16
+ {name: "robin", age: 45},
17
+ {name: "jane", age: 19, login: DateTime.parse('2012-10-14')}]
18
+
19
+ # extract your data by iterating over your data format (from whatever source) and map it to a record model...
20
+ puts "extracting data"
21
+ dh.extract(data) do |r, d|
22
+ r.name = d[:name]
23
+ r.age = d[:age]
24
+ # data quality step:
25
+ d[:login] = DateTime.new unless d.has_key?(:login)
26
+ r.last_active = d[:login]
27
+ print '.'
28
+ end
29
+
30
+ # transform your data by adding fields to it; only the new records are transformed automatically
31
+ puts "\ntransforming data"
32
+ dh.transform do |r|
33
+ r.eligible = r.age < 30
34
+ print '*'
35
+ end
36
+
37
+ dh.transform_complete
38
+ puts "\ndone."
39
+
40
+ # operate on your dataset by using chained queries
41
+ ds = dh.dataset
42
+
43
+ ds.each{|d| puts d.inspect}
44
+
45
+ puts "Average age: #{ds.avg(:age)}"
46
+
47
+ puts "Eligible:"
48
+ eligible = ds.where(eligible:true)
49
+ eligible.each{|d| puts d.inspect}
50
+
51
+ binding.pry
52
+
53
+ # clean up scratch demo
54
+ FileUtils.rm("sample.db")
55
+ puts "done."
@@ -0,0 +1,97 @@
1
+
2
+ # run from the samples dir with:
3
+ # $ ruby league_of_legends.rb
4
+
5
+ require_relative 'sample_helper.rb'
6
+
7
+ require 'data_hut'
8
+ require 'nokogiri'
9
+ require 'open-uri'
10
+ require 'pry'
11
+
12
+ root = 'http://na.leagueoflegends.com'
13
+
14
+ # load the data once... (manually delete it to refresh)
15
+ unless File.exists?("lolstats.db")
16
+ dh = DataHut.connect("lolstats")
17
+
18
+ champions_page = Nokogiri::HTML(open("#{root}/champions"))
19
+
20
+ urls = champions_page.css('table.champion_item td.description span a').collect{|e| e.attribute('href').value}
21
+
22
+ # keep the powers for later since they are on different pages.
23
+ powers = {}
24
+ champions_page.css('table.champion_item').each do |c|
25
+ name = c.css('td.description span.highlight a').text
26
+ attack = c.css('td.graphing td.filled_attack').count
27
+ health = c.css('td.graphing td.filled_health').count
28
+ spells = c.css('td.graphing td.filled_spells').count
29
+ difficulty = c.css('td.graphing td.filled_difficulty').count
30
+ powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
31
+ end
32
+
33
+ puts "loading champion data"
34
+ dh.extract(urls) do |r, url|
35
+ champion_page = Nokogiri::HTML(open("#{root}#{url}"))
36
+ r.name = champion_page.css('div.page_header_text').text
37
+
38
+ st = champion_page.css('table.stats_table')
39
+ names = st.css('td.stats_name').collect{|e| e.text.strip}
40
+ values = st.css('td.stats_value').collect{|e| e.text.strip}
41
+ modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
42
+
43
+ (0..names.count-1).collect do |i|
44
+ stat = (names[i].downcase.gsub(/ /,'_') << "=").to_sym
45
+ r.send(stat, values[i].to_f)
46
+ stat_per_level = (names[i].downcase.gsub(/ /,'_') << "_per_level=").to_sym
47
+ per_level_value = modifiers[i].match(/\+([\d\.]+)/)[1].to_f rescue 0
48
+ r.send(stat_per_level, per_level_value)
49
+ end
50
+
51
+ # add the powers for this champion...
52
+ power = powers[r.name]
53
+ r.attack_power = power[:attack_power]
54
+ r.defense_power = power[:defense_power]
55
+ r.ability_power = power[:ability_power]
56
+ r.difficulty = power[:difficulty]
57
+
58
+ print "."
59
+ end
60
+ puts "done."
61
+ end
62
+
63
+ dh = DataHut.connect("lolstats")
64
+
65
+ puts "first transform"
66
+ dh.transform do |r|
67
+ r.total_damage = r.damage + (r.damage_per_level * 18.0)
68
+ r.total_health = r.health + (r.health_per_level * 18.0)
69
+ r.total_mana = r.mana + (r.mana_per_level * 18.0)
70
+ r.total_move_speed = r.move_speed + (r.move_speed_per_level * 18.0)
71
+ r.total_armor = r.armor + (r.armor_per_level * 18.0)
72
+ r.total_spell_block = r.spell_block + (r.spell_block_per_level * 18.0)
73
+ r.total_health_regen = r.health_regen + (r.health_regen_per_level * 18.0)
74
+ r.total_mana_regen = r.mana_regen + (r.mana_regen_per_level * 18.0)
75
+ print '.'
76
+ end
77
+
78
+ puts "second transform"
79
+ # there's no need to do transforms all in one batch either... you can layer them...
80
+ dh.transform(true) do |r|
81
+ # this index combines the tank dimensions above for best combination (simple Euclidean metric)
82
+ r.nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power)
83
+ r.easy_nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power) * (1.0/r.difficulty)
84
+ r.tenacious_index = r.total_armor * r.total_health * r.total_spell_block * r.total_health_regen * (r.defense_power)
85
+ r.support_index = r.total_mana * r.total_armor * r.total_spell_block * r.total_health * r.total_health_regen * r.total_mana_regen * (r.ability_power * r.defense_power)
86
+ print '.'
87
+ end
88
+
89
+ # use once at the end to mark records processed.
90
+ dh.transform_complete
91
+ puts "transforms complete"
92
+
93
+ ds = dh.dataset
94
+
95
+ binding.pry
96
+
97
+ puts "done."
@@ -0,0 +1,7 @@
1
+ # sample helper takes care of loading the gem from source without requiring it to be rebuilt and installed.
2
+ # this is useful in allowing the samples in this directory to evolve the behavior of the actual gem.
3
+
4
+ lp = File.expand_path(File.join(*%w[.. lib]), File.dirname(__FILE__))
5
+ unless $LOAD_PATH.include?(lp)
6
+ $LOAD_PATH.unshift(lp)
7
+ end
@@ -136,5 +136,43 @@ describe DataHut do
136
136
 
137
137
  end
138
138
 
139
+
140
+ describe "nice usage" do
141
+
142
+ class Foo
143
+ end
144
+
145
+ it "should provide logging services to see or debug underlying Sequel" do
146
+ dh = DataHut.connect("foo")
147
+
148
+ dh.logger = ::Logger.new(STDOUT)
149
+
150
+ assert_raises(ArgumentError) do
151
+ dh.logger = Foo.new
152
+ end
153
+
154
+ end
155
+
156
+ it "should handle type errors" do
157
+ dh = DataHut.connect("foo")
158
+
159
+ data = [{name: "fred", birthday: '1978-02-11'}]
160
+
161
+ # how about dates?
162
+ dh.extract(data) do |r, d|
163
+ r.name = d[:name]
164
+ r.birthday = Date.parse(d[:birthday])
165
+ end
166
+
167
+ # ok, but what about a custom type... that's guaranteed to fail!
168
+ assert_raises(ArgumentError) do
169
+ dh.transform do |r|
170
+ r.my_foo = Foo.new
171
+ end
172
+ end
173
+ end
174
+
175
+ end
176
+
139
177
  end
140
178
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_hut
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-09 00:00:00.000000000 Z
12
+ date: 2013-02-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: sequel
@@ -141,7 +141,9 @@ files:
141
141
  - lib/data_hut.rb
142
142
  - lib/data_hut/data_warehouse.rb
143
143
  - lib/data_hut/version.rb
144
- - scratch
144
+ - samples/basic.rb
145
+ - samples/league_of_legends.rb
146
+ - samples/sample_helper.rb
145
147
  - test/spec/basic_test.rb
146
148
  - test/test_helper.rb
147
149
  - test/unit/data_warehouse_test.rb
data/scratch DELETED
@@ -1,164 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # allows this script to behave as if the gem were installed.
3
- lp = File.expand_path(File.join(*%w[lib]), File.dirname(__FILE__))
4
- unless $LOAD_PATH.include?(lp)
5
- $LOAD_PATH.unshift(lp)
6
- end
7
-
8
- # ------------------ manual example of client usage for docs, etc. -----------
9
-
10
- require 'data_hut'
11
- require 'pry'
12
-
13
- dh = DataHut.connect("scratch")
14
-
15
- data = [{name: "barney", age: 27, login: DateTime.parse('2008-05-03') },
16
- {name: "phil", age: 31},
17
- {name: "fred", age: 44, login: DateTime.parse('2013-02-07')}]
18
-
19
- # extract your data by iterating over your data format (from whatever source) and map it to a record model...
20
- dh.extract(data) do |r, d|
21
- r.name = d[:name]
22
- r.age = d[:age]
23
- # data quality step:
24
- d[:login] = DateTime.new unless d.has_key?(:login)
25
- r.last_active = d[:login]
26
- print 'v'
27
- end
28
-
29
- # transform your data by adding fields to it
30
- dh.transform do |r|
31
- r.eligible = r.age < 30
32
- print '*'
33
- end
34
-
35
- dh.transform_complete
36
-
37
- # later... you have more data...
38
- data = [{name: "sarah", age: 24, login: DateTime.parse('2011-04-01') },
39
- {name: "robin", age: 45},
40
- {name: "jane", age: 19, login: DateTime.parse('2012-10-14')}]
41
-
42
- # add it too...
43
- dh.extract(data) do |r, d|
44
- r.name = d[:name]
45
- r.age = d[:age]
46
- # data quality step:
47
- d[:login] = DateTime.new unless d.has_key?(:login)
48
- r.last_active = d[:login]
49
- print 'v'
50
- end
51
-
52
- # and only transform the new records automatically
53
- dh.transform do |r|
54
- r.eligible = r.age < 30
55
- print '*'
56
- end
57
-
58
- dh.transform_complete
59
-
60
-
61
- # operate on your dataset by using chained queries
62
- ds = dh.dataset
63
-
64
- #binding.pry
65
-
66
-
67
- # clean up scratch demo
68
- FileUtils.rm("scratch.db")
69
- puts "done."
70
-
71
- #exit
72
-
73
- # ------------------------------ a more ambitious example -------------------------
74
-
75
- require 'nokogiri'
76
- require 'open-uri'
77
- require 'pry'
78
-
79
- root = 'http://na.leagueoflegends.com'
80
-
81
- # load the data once... (manually delete it to refresh)
82
- unless File.exists?("lolstats.db")
83
- dh = DataHut.connect("lolstats")
84
-
85
- champions_page = Nokogiri::HTML(open("#{root}/champions"))
86
-
87
- urls = champions_page.css('table.champion_item td.description span a').collect{|e| e.attribute('href').value}
88
-
89
- # keep the powers for later since they are on different pages.
90
- powers = {}
91
- champions_page.css('table.champion_item').each do |c|
92
- name = c.css('td.description span.highlight a').text
93
- attack = c.css('td.graphing td.filled_attack').count
94
- health = c.css('td.graphing td.filled_health').count
95
- spells = c.css('td.graphing td.filled_spells').count
96
- difficulty = c.css('td.graphing td.filled_difficulty').count
97
- powers.store(name, {attack_power: attack, defense_power: health, ability_power: spells, difficulty: difficulty})
98
- end
99
-
100
- puts "loading champion data"
101
- dh.extract(urls) do |r, url|
102
- champion_page = Nokogiri::HTML(open("#{root}#{url}"))
103
- r.name = champion_page.css('div.page_header_text').text
104
-
105
- st = champion_page.css('table.stats_table')
106
- names = st.css('td.stats_name').collect{|e| e.text.strip}
107
- values = st.css('td.stats_value').collect{|e| e.text.strip}
108
- modifiers = st.css('td.stats_modifier').collect{|e| e.text.strip}
109
-
110
- (0..names.count-1).collect do |i|
111
- stat = (names[i].downcase.gsub(/ /,'_') << "=").to_sym
112
- r.send(stat, values[i].to_f)
113
- stat_per_level = (names[i].downcase.gsub(/ /,'_') << "_per_level=").to_sym
114
- per_level_value = modifiers[i].match(/\+([\d\.]+)/)[1].to_f rescue 0
115
- r.send(stat_per_level, per_level_value)
116
- end
117
-
118
- # add the powers for this champion...
119
- power = powers[r.name]
120
- r.attack_power = power[:attack_power]
121
- r.defense_power = power[:defense_power]
122
- r.ability_power = power[:ability_power]
123
- r.difficulty = power[:difficulty]
124
-
125
- print "."
126
- end
127
- puts "done."
128
- end
129
-
130
- dh = DataHut.connect("lolstats")
131
-
132
- puts "first transform"
133
- dh.transform do |r|
134
- r.total_damage = r.damage + (r.damage_per_level * 18.0)
135
- r.total_health = r.health + (r.health_per_level * 18.0)
136
- r.total_mana = r.mana + (r.mana_per_level * 18.0)
137
- r.total_move_speed = r.move_speed + (r.move_speed_per_level * 18.0)
138
- r.total_armor = r.armor + (r.armor_per_level * 18.0)
139
- r.total_spell_block = r.spell_block + (r.spell_block_per_level * 18.0)
140
- r.total_health_regen = r.health_regen + (r.health_regen_per_level * 18.0)
141
- r.total_mana_regen = r.mana_regen + (r.mana_regen_per_level * 18.0)
142
- print '.'
143
- end
144
-
145
- puts "second transform"
146
- # there's no need to do transforms all in one batch either... you can layer them...
147
- dh.transform(true) do |r|
148
- # this index combines the tank dimensions above for best combination (simple Euclidean metric)
149
- r.nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power)
150
- r.easy_nuke_index = r.total_damage * r.total_move_speed * r.total_mana * (r.ability_power) * (1.0/r.difficulty)
151
- r.tenacious_index = r.total_armor * r.total_health * r.total_spell_block * r.total_health_regen * (r.defense_power)
152
- r.support_index = r.total_mana * r.total_armor * r.total_spell_block * r.total_health * r.total_health_regen * r.total_mana_regen * (r.ability_power * r.defense_power)
153
- print '.'
154
- end
155
-
156
- # use once at the end to mark records processed.
157
- dh.transform_complete
158
- puts "transforms complete"
159
-
160
- ds = dh.dataset
161
-
162
- binding.pry
163
-
164
- puts "done."