activewarehouse 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/README +41 -0
  2. data/Rakefile +121 -0
  3. data/TODO +4 -0
  4. data/db/migrations/001_create_table_reports.rb +28 -0
  5. data/doc/agg_queries.txt +26 -0
  6. data/doc/agg_queries_results.txt +150 -0
  7. data/doc/queries.txt +35 -0
  8. data/generators/cube/USAGE +1 -0
  9. data/generators/cube/cube_generator.rb +28 -0
  10. data/generators/cube/templates/model.rb +3 -0
  11. data/generators/cube/templates/unit_test.rb +8 -0
  12. data/generators/dimension/USAGE +1 -0
  13. data/generators/dimension/dimension_generator.rb +46 -0
  14. data/generators/dimension/templates/fixture.yml +5 -0
  15. data/generators/dimension/templates/migration.rb +20 -0
  16. data/generators/dimension/templates/model.rb +3 -0
  17. data/generators/dimension/templates/unit_test.rb +10 -0
  18. data/generators/fact/USAGE +1 -0
  19. data/generators/fact/fact_generator.rb +46 -0
  20. data/generators/fact/templates/fixture.yml +5 -0
  21. data/generators/fact/templates/migration.rb +11 -0
  22. data/generators/fact/templates/model.rb +3 -0
  23. data/generators/fact/templates/unit_test.rb +10 -0
  24. data/install.rb +5 -0
  25. data/lib/active_warehouse.rb +65 -0
  26. data/lib/active_warehouse/builder.rb +2 -0
  27. data/lib/active_warehouse/builder/date_dimension_builder.rb +65 -0
  28. data/lib/active_warehouse/builder/random_data_builder.rb +13 -0
  29. data/lib/active_warehouse/core_ext.rb +1 -0
  30. data/lib/active_warehouse/core_ext/time.rb +5 -0
  31. data/lib/active_warehouse/core_ext/time/calculations.rb +40 -0
  32. data/lib/active_warehouse/migrations.rb +65 -0
  33. data/lib/active_warehouse/model.rb +5 -0
  34. data/lib/active_warehouse/model/aggregate.rb +244 -0
  35. data/lib/active_warehouse/model/cube.rb +273 -0
  36. data/lib/active_warehouse/model/dimension.rb +3 -0
  37. data/lib/active_warehouse/model/dimension/bridge.rb +32 -0
  38. data/lib/active_warehouse/model/dimension/dimension.rb +152 -0
  39. data/lib/active_warehouse/model/dimension/hierarchical_dimension.rb +35 -0
  40. data/lib/active_warehouse/model/fact.rb +96 -0
  41. data/lib/active_warehouse/model/report.rb +3 -0
  42. data/lib/active_warehouse/model/report/abstract_report.rb +121 -0
  43. data/lib/active_warehouse/model/report/chart_report.rb +9 -0
  44. data/lib/active_warehouse/model/report/table_report.rb +23 -0
  45. data/lib/active_warehouse/version.rb +9 -0
  46. data/lib/active_warehouse/view.rb +2 -0
  47. data/lib/active_warehouse/view/report_helper.rb +213 -0
  48. data/tasks/active_warehouse_tasks.rake +50 -0
  49. metadata +144 -0
@@ -0,0 +1,5 @@
1
+ require 'active_warehouse/model/aggregate'
2
+ require 'active_warehouse/model/fact'
3
+ require 'active_warehouse/model/dimension'
4
+ require 'active_warehouse/model/cube'
5
+ require 'active_warehouse/model/report'
@@ -0,0 +1,244 @@
1
+ module ActiveWarehouse
2
+ # An aggregate within a cube used to store calculated values
3
+ # Each aggregate will contain values for a dimension pair, down each of the dimension hierarchies
4
+ class Aggregate < ActiveRecord::Base
5
+ class << self
6
+ attr_accessor :name, :cube, :dimension1, :dimension2, :dimension1_hierarchy_name, :dimension2_hierarchy_name
7
+
8
+ # Get the table name for the aggregate
9
+ def table_name
10
+ name = self.name.demodulize.underscore
11
+ set_table_name(name)
12
+ name
13
+ end
14
+
15
+ # Returns the aggregate ID
16
+ def aggregate_id
17
+ table_name =~ /(\d+)$/
18
+ $1.to_i
19
+ end
20
+
21
+ # Returns the AggregateMetaData instance associated with this aggregate
22
+ def meta_data
23
+ AggregateMetaData.find(aggregate_id)
24
+ end
25
+
26
+ # Return true if the aggregate needs to be rebuilt
27
+ def needs_rebuild?(last_build=nil)
28
+ return true if meta_data.populated_at.nil?
29
+ return true if last_build && (meta_data.populated_at < last_build)
30
+ return false
31
+ end
32
+
33
+ # Return a key for the aggregate
34
+ def key(dimension1, dimension1_hierarchy, dimension2, dimension2_hierarchy)
35
+ AggregateKey.new(dimension1, dimension1_hierarchy, dimension2, dimension2_hierarchy)
36
+ end
37
+
38
+ # Create the aggregate table if required. Set force option to true to force creation of the table
39
+ # if it already exists
40
+ def create_storage_table(force=false)
41
+ connection.drop_table(table_name) if force and table_exists?
42
+ if !table_exists?
43
+ connection.create_table(table_name, :id => false) do |t|
44
+ t.column :dimension1_path, :string
45
+ t.column :dimension1_stage, :integer
46
+ t.column :dimension2_path, :string
47
+ t.column :dimension2_stage, :integer
48
+ cube.fact_class.aggregate_fields.each do |field|
49
+ #options = cube.fact_class.aggregate_field_options[field]
50
+ col = cube.fact_class.columns_hash[field.to_s]
51
+ t.column field, col.type if col
52
+ end
53
+ end
54
+ connection.add_index(table_name, :dimension1_path)
55
+ connection.add_index(table_name, :dimension1_stage)
56
+ connection.add_index(table_name, :dimension2_path)
57
+ connection.add_index(table_name, :dimension2_stage)
58
+ end
59
+ end
60
+
61
# Populate the aggregate table.
#
# Creates the storage table if it does not exist, removes all existing rows,
# then for every combination of dimension1 / dimension2 hierarchy levels runs
# the query produced by build_query and stores one aggregate record per
# result row. Each record holds the ':'-joined dimension values for both
# dimensions, the level index of each dimension, and the aggregated fields.
#
# The meta data record's populated_at is written exactly once, after all
# levels have been stored, so an aggregate is only marked as populated when
# the whole run succeeded (including runs that produced no rows).
def populate
  # create the storage table if necessary
  create_storage_table

  # clear out the current data
  #connection.execute("TRUNCATE TABLE #{table_name}") #TODO: make this generic to support all databases
  delete_all

  fact_class = cube.fact_class
  dim1 = Dimension.class_name(dimension1).constantize
  dim2 = Dimension.class_name(dimension2).constantize

  # meta_data performs a find on every call; look it up once for the whole run
  md = meta_data

  dim1_stage_path = []
  dim1.hierarchy(md.dimension1_hierarchy.to_sym).each_with_index do |dim1_stage_name, dim1_stage_level|
    dim1_stage_path << dim1_stage_name
    dim2_stage_path = []
    dim2.hierarchy(md.dimension2_hierarchy.to_sym).each_with_index do |dim2_stage_name, dim2_stage_level|
      dim2_stage_path << dim2_stage_name

      stmt, fields = build_query(fact_class, dim1, dim1_stage_path, dim2, dim2_stage_path)

      # Result columns are positional: dimension1 levels first, then
      # dimension2 levels, then one column per aggregated field.
      dim1_width = dim1_stage_path.length
      dim2_width = dim2_stage_path.length
      field_offset = dim1_width + dim2_width

      fact_class.connection.execute(stmt).each do |row|
        dim1_value = (0...dim1_width).collect { |index| row[index] }
        dim2_value = (0...dim2_width).collect { |index| row[dim1_width + index] }

        agg_instance = new
        agg_instance.dimension1_path = dim1_value.join(':')
        agg_instance.dimension1_stage = dim1_stage_level
        agg_instance.dimension2_path = dim2_value.join(':')
        agg_instance.dimension2_stage = dim2_stage_level
        fields.each_with_index do |field, index|
          # do the average here
          agg_instance.send("#{field}=".to_sym, row[field_offset + index])
        end
        agg_instance.save!
      end
    end
  end

  md.update_attribute(:populated_at, Time.now)
end
111
+
112
# Build the aggregation query for the given dimensions and stage paths.
#
# fact_class      - object responding to table_name, aggregate_fields and
#                   aggregate_field_options
# dim1, dim2      - objects responding to table_name, foreign_key and sym
# dim1_stage_path - names of the dimension1 columns to select and group by
# dim2_stage_path - names of the dimension2 columns to select and group by
#
# Returns a two element array [stmt, fields]: the SQL statement, and the
# names of the aggregated fields in the same order as their columns appear
# in the select list (after the dimension1 and dimension2 columns).
#
# A field whose :type is a Hash declaring :average for both dimensions gets
# no select column, so it is also left out of +fields+ to keep the returned
# names aligned with the result columns.
#
# Raises RuntimeError for an aggregate type that is neither :sum nor a Hash.
def build_query(fact_class, dim1, dim1_stage_path, dim2, dim2_stage_path)
  dim1_group = dim1_stage_path.collect { |p| "d1.#{p}" }.join(", ")
  dim2_group = dim2_stage_path.collect { |p| "d2.#{p}" }.join(", ")
  group = "#{dim1_group}, #{dim2_group}"

  joins = "join #{dim1.table_name} d1 on f.#{dim1.foreign_key} = d1.id" +
          " join #{dim2.table_name} d2 on f.#{dim2.foreign_key} = d2.id"

  # Build the 'select' part of the query
  fields = []
  fact_select = [group]
  fact_class.aggregate_fields.each do |field_name|
    options = fact_class.aggregate_field_options[field_name]

    options[:type] ||= :sum
    case options[:type]
    when :sum
      # plain sum, handled below
    when Hash
      # I believe this is a special case, but I'm not sure how yet. If both dimensions are defined
      # averages then perhaps that value cannot be calculated at all. TODO: research
      next if options[:type][dim1.sym] == :average && options[:type][dim2.sym] == :average
    else
      raise "Unsupported aggregate type: #{options[:type]}"
    end

    # Record the field only together with its select column
    fields << field_name
    fact_select << " sum(f.#{field_name}) as #{field_name}"
  end

  # put the SQL statement together
  stmt = "select #{fact_select.join(',')} from #{fact_class.table_name} f #{joins} group by #{group}"

  return stmt, fields
end
155
+
156
+ end
157
+
158
+ public
159
+ # Clone and reset at the same time
160
+ def clone_and_reset
161
+ o = clone
162
+ o.reset
163
+ o
164
+ end
165
+
166
+ def non_data_fields
167
+ ['dimension1_path','dimension1_stage','dimension2_path','dimension2_stage']
168
+ end
169
+
170
+ def data_fields
171
+ fields = []
172
+ self.class.columns.each do |column|
173
+ unless non_data_fields.include?(column.name)
174
+ fields << column.name
175
+ end
176
+ end
177
+ fields
178
+ end
179
+
180
+ # Reset the aggregate
181
+ def reset
182
+ self.class.columns.each do |column|
183
+ unless non_data_fields.include?(column.name)
184
+ value = column.number? ? 0 : 'None'
185
+ send("#{column.name}=".to_sym, value)
186
+ end
187
+ end
188
+ end
189
+ end
190
+
191
+ # ActiveRecord object which stores meta data about the aggregate
192
+ class AggregateMetaData < ActiveRecord::Base
193
+ # Build the underlying table. Set force to true to force the build of the table
194
+ def self.build_table(force=false)
195
+ connection.drop_table(table_name) if force and table_exists?
196
+ if !table_exists?
197
+ connection.create_table(table_name) do |t|
198
+ t.column :cube_name, :string
199
+ t.column :dimension1, :string
200
+ t.column :dimension1_hierarchy, :string
201
+ t.column :dimension2, :string
202
+ t.column :dimension2_hierarchy, :string
203
+ t.column :created_at, :datetime
204
+ t.column :populated_at, :datetime
205
+ end
206
+ connection.add_index table_name, :cube_name
207
+ end
208
+ end
209
+ def key
210
+ Aggregate.key(dimension1, dimension1_hierarchy, dimension2, dimension2_hierarchy)
211
+ end
212
+ end
213
+
214
# Key for aggregate caching.
#
# Identifies an aggregate by two dimensions and their hierarchies. Two keys
# are equal when they name the same dimension/hierarchy pairs in either
# order, and equal keys produce the same hash, so a key can be used for Hash
# lookups regardless of which dimension was given first.
class AggregateKey
  attr_reader :dimension1, :dimension1_hierarchy, :dimension2, :dimension2_hierarchy

  def initialize(dimension1, dimension1_hierarchy, dimension2, dimension2_hierarchy)
    @dimension1 = dimension1
    @dimension1_hierarchy = dimension1_hierarchy
    @dimension2 = dimension2
    @dimension2_hierarchy = dimension2_hierarchy
  end

  # True if +o+ is an AggregateKey whose String representation matches this
  # key's String representation or its reversed representation.
  def ==(o)
    return false unless o.instance_of?(self.class)
    other = o.to_s
    other == to_s || other == to_rs
  end

  # Hash uses eql? to compare keys; without this, lookups fall back to
  # object identity and a freshly built key never matches a stored one.
  alias_method :eql?, :==

  # Order-independent hash, so that keys which are == (including reversed
  # ones) land in the same bucket.
  def hash
    [to_s, to_rs].sort.hash
  end

  def to_s
    "#{@dimension1}.#{@dimension1_hierarchy}.#{@dimension2}.#{@dimension2_hierarchy}"
  end

  # Return the "reversed" version of this key's String representation
  def to_rs
    "#{@dimension2}.#{@dimension2_hierarchy}.#{@dimension1}.#{@dimension1_hierarchy}"
  end
end
242
+ end
243
+
244
+ ActiveWarehouse::AggregateMetaData.build_table
@@ -0,0 +1,273 @@
1
+ module ActiveWarehouse
2
+ # A Cube represents a collection of dimensions operating on a fact. The Cube provides a front-end for getting at the
3
+ # underlying data. The Cube manages the creation and population of all underlying aggregates.
4
+ class Cube
5
+ class << self
6
+ # Callback which is invoked when subclasses are created
7
+ def inherited(subclass)
8
+ subclasses << subclass
9
+ end
10
+
11
+ # Get a list of all known subclasses
12
+ def subclasses
13
+ @subclasses ||= []
14
+ end
15
+
16
+ # Defines the dimensions that this cube pivots on.
17
+ def pivots_on(*dimension_list)
18
+ # TODO: Validate if the fact is set
19
+ dimension_list.each do |dimension|
20
+ dimensions << dimension
21
+ end
22
+ end
23
+
24
+ # Defines the fact that this cube reports on
25
+ def reports_on(fact)
26
+ # TODO: Validate if one or more dimension is set
27
+ @fact = fact
28
+ end
29
+
30
+ # Rebuild all aggregate classes. Set :force => true to force the rebuild of aggregate classes.
31
+ def rebuild(options={})
32
+ logger.debug "Rebuilding aggregates for cube #{name}"
33
+ options[:force] ||= false
34
+ build_aggregate_classes(options)
35
+ end
36
+
37
+ # Populate all aggregates. Set :force => true to force the population of the aggregate class.
38
+ def populate(options={})
39
+ options[:force] ||= false
40
+ aggregates.each do |agg_id, agg_clazz|
41
+ if agg_clazz.needs_rebuild? || options[:force]
42
+ logger.debug "Populating aggregate class #{agg_clazz.name}"
43
+ agg_clazz.populate
44
+ end
45
+ end
46
+ end
47
+
48
+ # Get the fact that this cube reports on
49
+ def fact
50
+ @fact
51
+ end
52
+
53
+ # Get the dimensions that this cube pivots on
54
+ def dimensions
55
+ @dimensions ||= []
56
+ end
57
+
58
+ # Get the aggregate classes for this dimension
59
+ def aggregates
60
+ rebuild if @aggregates.nil?
61
+ @aggregates
62
+ end
63
+
64
+ # Get the class name for the specified cube name
65
+ # Example: Regional Sales will become RegionalSalesCube
66
+ def class_name(name)
67
+ cube_name = name.to_s
68
+ cube_name = "#{cube_name}_cube" unless cube_name =~ /_cube$/
69
+ cube_name.classify
70
+ end
71
+
72
+ # Get the aggregated fact class name
73
+ def fact_class_name
74
+ Fact.class_name(fact)
75
+ end
76
+
77
+ # Get the aggregated fact class instance
78
+ def fact_class
79
+ fact_class_name.constantize
80
+ end
81
+
82
+ # Get a list of dimension class instances
83
+ def dimension_classes
84
+ dimensions.collect {|dimension| Dimension.class_name(dimension).constantize}
85
+ end
86
+
87
+ def logger
88
+ @logger ||= Logger.new('cube.log')
89
+ end
90
+
91
+ def last_modified
92
+ lm = Fact.class_for_name(fact).last_modified
93
+ dimensions.each do |dimension|
94
+ dim = Dimension.class_for_name(dimension)
95
+ lm = dim.last_modified if dim.last_modified > lm
96
+ end
97
+ lm
98
+ end
99
+
100
+ protected
101
+ def build_aggregate_classes(options={})
102
+ @aggregates = {}
103
+ existing_dimension_pairs = []
104
+ logger.debug "Building aggregate classes"
105
+ dimensions.each do |column_dimension|
106
+ dimensions.each do |row_dimension|
107
+ next if column_dimension == row_dimension
108
+ next if existing_dimension_pairs.include? [column_dimension,row_dimension]
109
+ next if existing_dimension_pairs.include? [row_dimension,column_dimension]
110
+
111
+ existing_dimension_pairs << [column_dimension,row_dimension]
112
+ col_dim_class = Dimension.class_for_name(column_dimension)
113
+ col_dim_class.hierarchy_levels.each_key do |column_hierarchy_name|
114
+ row_dim_class = Dimension.class_for_name(row_dimension)
115
+ row_dim_class.hierarchy_levels.each_key do |row_hierarchy_name|
116
+ # Construct the aggregate meta data instance
117
+ meta_data_attributes = {
118
+ :cube_name => self.name,
119
+ :dimension1 => column_dimension.to_s,
120
+ :dimension1_hierarchy => column_hierarchy_name.to_s,
121
+ :dimension2 => row_dimension.to_s,
122
+ :dimension2_hierarchy => row_hierarchy_name.to_s
123
+ }
124
+ conditions = []
125
+ condition_args = []
126
+ meta_data_attributes.each do |key, value|
127
+ conditions << "#{key} = ?"
128
+ condition_args << value
129
+ end
130
+ conditions = [conditions.join(' and ')] + condition_args
131
+ meta_data = AggregateMetaData.find(:first, :conditions => conditions)
132
+ unless meta_data
133
+ meta_data = AggregateMetaData.create(meta_data_attributes)
134
+ end
135
+
136
+ # Construct the aggregate class instance
137
+ aggregate_class = Class.new(ActiveWarehouse::Aggregate)
138
+ aggregate_class.name = "Agg#{meta_data.id}"
139
+ logger.debug "Constructed aggregate #{aggregate_class.name}"
140
+ aggregate_class.cube = self
141
+ aggregate_class.dimension1 = column_dimension
142
+ aggregate_class.dimension1_hierarchy_name = column_hierarchy_name
143
+ aggregate_class.dimension2 = row_dimension
144
+ aggregate_class.dimension2_hierarchy_name = row_hierarchy_name
145
+
146
+ # Create the underlying aggregate storage table
147
+ # TODO: fix the bug of data not being found when a storage table rebuild occurs
148
+ force_storage_table_rebuild = options[:force] || aggregate_class.needs_rebuild?(last_modified)
149
+ logger.debug "Force storage table rebuild? #{force_storage_table_rebuild}"
150
+ aggregate_class.create_storage_table(force_storage_table_rebuild)
151
+
152
+ # Keep a reference to the aggregate class instance
153
+ @aggregates[meta_data.id] = aggregate_class
154
+ end
155
+ end
156
+ end
157
+ end
158
+ end
159
+
160
+ end
161
+
162
+ public
163
+ def aggregate_map(column_dimension, column_hierarchy, row_dimension, row_hierarchy, cstage=0, rstage=0)
164
+ # Fill known cells
165
+ agg_map = AggregateMap.new
166
+ agg_records = aggregate_records(column_dimension, column_hierarchy, row_dimension, row_hierarchy)
167
+ agg_records.each do |agg_record|
168
+ # agg_record is an instance of Aggregate
169
+ # collect the aggregate record data fields into an array
170
+ data_array = agg_record.data_fields.collect{ |data_field_name| agg_record.send(data_field_name.to_sym) }
171
+
172
+ # convert to an average where necessary
173
+ # TODO: implement
174
+
175
+ # add calculated fields to the data array
176
+ self.class.fact_class.calculated_fields.each do |calculated_field|
177
+ options = self.class.fact_class.calculated_field_options[calculated_field]
178
+ data_array << options[:block].call(agg_record)
179
+ end
180
+
181
+ # add the data array to the aggregate map
182
+ agg_map.add_data(agg_record.dimension2_path, agg_record.dimension1_path, data_array)
183
+ end
184
+ agg_map
185
+ end
186
+
187
+ protected
188
# Return all of the Aggregate records for the specified dimensions and hierarchies.
#
# Results are cached per instance under the AggregateKey for the requested
# dimensions. On a cache miss the matching AggregateMetaData row is looked
# up for this cube, first with the dimensions in the order given and then
# reversed. If no row exists, the cube's aggregate classes are rebuilt and
# both orderings are tried again.
#
# Raises RuntimeError if no meta data or no aggregate class can be found
# even after a rebuild.
#
# NOTE(review): when only the reversed meta data row exists, the returned
# records carry dimension1/dimension2 in the stored order rather than the
# requested order -- confirm that callers account for this.
def aggregate_records(column_dimension, column_hierarchy, row_dimension, row_hierarchy)
  k = Aggregate.key(column_dimension, column_hierarchy, row_dimension, row_hierarchy)
  if aggregates[k].nil?
    self.class.logger.debug("Aggregate #{k} not found in cache")

    sql = 'cube_name = ? and dimension1 = ? and dimension1_hierarchy = ? and dimension2 = ? and dimension2_hierarchy = ?'
    column_args = [column_dimension.to_s, column_hierarchy.to_s]
    row_args = [row_dimension.to_s, row_hierarchy.to_s]
    conditions = [sql, self.class.name] + column_args + row_args
    conditions_reversed = [sql, self.class.name] + row_args + column_args

    # Look the meta data up in the requested order, then in reversed order
    find_meta_data = lambda do
      AggregateMetaData.find(:first, :conditions => conditions) ||
        AggregateMetaData.find(:first, :conditions => conditions_reversed)
    end

    aggregate_meta_data = find_meta_data.call
    if aggregate_meta_data.nil?
      self.class.rebuild
      # The rebuild stores each dimension pair in one order only, so the
      # reversed ordering has to be retried here as well.
      aggregate_meta_data = find_meta_data.call
      raise "Cannot find aggregate meta data for key #{k}" if aggregate_meta_data.nil?
    end

    aggregate_class = self.class.aggregates[aggregate_meta_data.id]
    if aggregate_class.nil?
      self.class.rebuild
      aggregate_class = self.class.aggregates[aggregate_meta_data.id]
      raise "Cannot find aggregate for id #{aggregate_meta_data.id}" if aggregate_class.nil?
    end
    #puts "Loading aggregate #{aggregate_meta_data.id}"
    aggregates[k] = aggregate_class.find(:all)
  end
  aggregates[k]
end
225
+
226
+ # Get a hash of all aggregate data
227
+ def aggregates
228
+ @aggregates ||= {}
229
+ end
230
+ end
231
+
232
# In-memory map of aggregate values.
#
# Stores one data array per (row path, column path) pair. +length+ is the
# size of the first data array added and is nil until data has been added.
class AggregateMap
  attr_reader :length

  # Initialize the aggregate map
  def initialize
    @m = {}
  end

  # Return true if the aggregate map includes the specified row path
  def has_row_path?(row_path)
    @m.has_key?(row_path)
  end

  # Get the value for the specified row path, column path and field index.
  # Returns 0 when the row, the column or the field is missing.
  def value(row_path, col_path, field_index)
    cell = cell_for(row_path, col_path)
    return 0 if cell.nil?
    cell[field_index] || 0
  end

  # Get an array of the values for the specified row path and column path.
  # For a missing row or column an array of +length+ zeros is returned; this
  # is an empty array while no data has been added yet (length is still nil).
  def values(row_path, col_path)
    cell = cell_for(row_path, col_path)
    return Array.new(length.to_i, 0) if cell.nil?
    cell
  end

  # Add an array of data for the given row and column path
  def add_data(row_path, col_path, data_array)
    @length ||= data_array.length
    @m[row_path] ||= {}
    @m[row_path][col_path] = data_array
  end

  private

  # Return the stored data array for the row/column pair, or nil if either
  # the row or the column is unknown.
  def cell_for(row_path, col_path)
    row = @m[row_path]
    row.nil? ? nil : row[col_path]
  end
end
273
+ end