cubicle 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cubicle}
8
- s.version = "0.1.0"
8
+ s.version = "0.1.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Nathan Stults"]
12
- s.date = %q{2010-03-13}
12
+ s.date = %q{2010-03-14}
13
13
  s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
14
14
  s.email = %q{hereiam@sonic.net}
15
15
  s.extra_rdoc_files = [
@@ -1,389 +1,390 @@
1
- require "rubygems"
2
- require "active_support"
3
- require "mongo"
4
-
5
- dir = File.dirname(__FILE__)
6
- ["mongo_environment",
7
- "member",
8
- "member_list",
9
- "measure",
10
- "calculated_measure",
11
- "dimension",
12
- "ratio",
13
- "query",
14
- "data_level",
15
- "data",
16
- "aggregation",
17
- "date_time",
18
- "support"].each {|lib|require File.join(dir,'cubicle',lib)}
19
-
20
- require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
21
-
22
- module Cubicle
23
-
24
- def self.register_cubicle_directory(directory_path, recursive=true)
25
- searcher = "#{recursive ? "*" : "**/*"}.rb"
26
- Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
27
- end
28
-
29
- def self.mongo
30
- @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
31
- end
32
-
33
- def self.logger
34
- Cubicle.mongo.logger
35
- end
36
-
37
- def database
38
- Cubicle.mongo.database
39
- end
40
-
41
- def collection
42
- database[target_collection_name]
43
- end
44
-
45
- def transient?
46
- @transient ||= false
47
- end
48
-
49
- def transient!
50
- @transient = true
51
- end
52
-
53
- def expire!
54
- collection.drop
55
- expire_aggregations!
56
- end
57
-
58
- def aggregations
59
- return (@aggregations ||= [])
60
- end
61
-
62
- #DSL
63
- def source_collection_name(collection_name = nil)
64
- return @source_collection = collection_name if collection_name
65
- @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
66
- end
67
- alias source_collection_name= source_collection_name
68
-
69
- def target_collection_name(collection_name = nil)
70
- return nil if transient?
71
- return @target_name = collection_name if collection_name
72
- @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
73
- end
74
- alias target_collection_name= target_collection_name
75
-
76
- def dimension(*args)
77
- dimensions << Cubicle::Dimension.new(*args)
78
- dimensions[-1]
79
- end
80
-
81
- def dimension_names
82
- return @dimensions.map{|dim|dim.name.to_s}
83
- end
84
-
85
- def dimensions(*args)
86
- return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
87
- args = args[0] if args.length == 1 && args[0].is_a?(Array)
88
- args.each {|dim| dimension dim }
89
- @dimensions
90
- end
91
-
92
- def measure(*args)
93
- measures << Measure.new(*args)
94
- measures[-1]
95
- end
96
-
97
- def measures(*args)
98
- return (@measures ||= Cubicle::MemberList.new) if args.length < 1
99
- args = args[0] if args.length == 1 && args[0].is_a?(Array)
100
- args.each {|m| measure m}
101
- @measures
102
- end
103
-
104
- def count(*args)
105
- options = args.extract_options!
106
- options[:aggregation_method] = :count
107
- measure(*(args << options))
108
- end
109
-
110
- def average(*args)
111
- options = args.extract_options!
112
- options[:aggregation_method] = :average
113
- measure(*(args << options))
114
- #Averaged fields need a count of non-null values to properly calculate the average
115
- args[0] = "#{args[0]}_count".to_sym
116
- count *args
117
- end
118
- alias avg average
119
-
120
- def sum(*args)
121
- options = args.extract_options!
122
- options[:aggregation_method] = :sum
123
- measure(*(args << options))
124
- end
125
-
126
- def ratio(member_name, numerator, denominator)
127
- measures << Ratio.new(member_name, numerator, denominator)
128
- end
129
-
130
- def aggregation(*member_list)
131
- member_list = member_list[0] if member_list[0].is_a?(Array)
132
- aggregations << member_list
133
- end
134
-
135
- def time_dimension(*args)
136
- return (@time_dimension ||= nil) unless args.length > 0
137
- @time_dimension = dimension(*args)
138
- end
139
- alias time_dimension= time_dimension
140
- alias date time_dimension
141
- alias time time_dimension
142
-
143
- def find_member(member_name)
144
- @dimensions[member_name] ||
145
- @measures[member_name]
146
- end
147
-
148
- def query(*args,&block)
149
- options = args.extract_options!
150
- query = Cubicle::Query.new(self)
151
- query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
152
- query.select(*args) if args.length > 0
153
- if block_given?
154
- block.arity == 1 ? (yield query) : (query.instance_eval(&block))
155
- end
156
- query.select_all unless query.selected?
157
- return query if options[:defer]
158
- results = execute_query(query,options)
159
- #If the 'by' clause was used in the the query,
160
- #we'll hierarchize by the members indicated,
161
- #as the next step would otherwise almost certainly
162
- #need to be a call to hierarchize anyway.
163
- query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
164
- end
165
-
166
- def execute_query(query,options={})
167
- count = 0
168
-
169
- find_options = {
170
- :limit=>query.limit || 0,
171
- :skip=>query.offset || 0
172
- }
173
-
174
- find_options[:sort] = prepare_order_by(query)
175
- filter = {}
176
- if query == self || query.transient?
177
- aggregation = aggregate(query,options)
178
- else
179
- process_if_required
180
- aggregation = aggregation_for(query)
181
- #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
182
- #otherwise, a second map reduce is required to reduce the data set one last time
183
- if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
184
- filter = prepare_filter(query,options[:where] || {})
185
- else
186
- aggregation = aggregate(query,:source_collection=>collection.name)
187
- end
188
- end
189
- count = aggregation.count
190
- #noinspection RubyArgCount
191
- data = aggregation.find(filter,find_options).to_a
192
- #noinspection RubyArgCount
193
- aggregation.drop if aggregation.name =~ /^tmp.mr.*/
194
- Cubicle::Data.new(query, data, count)
195
- end
196
-
197
- def process(options={})
198
- Cubicle.logger.info "Processing #{self.name} @ #{Time.now}"
199
- start = Time.now
200
- expire!
201
- aggregate(self,options)
202
- #Sort desc by length of array, so that larget
203
- #aggregations are processed first, hopefully increasing efficiency
204
- #of the processing step
205
- aggregations.sort!{|a,b|b.length<=>a.length}
206
- aggregations.each do |member_list|
207
- agg_start = Time.now
208
- aggregation_for(query(:defer=>true){select member_list})
209
- Cubicle.logger.info "#{self.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
210
- end
211
- duration = Time.now - start
212
- Cubicle.logger.info "#{self.name} processed @ #{Time.now}in #{duration} seconds."
213
- end
214
-
215
- protected
216
-
217
- def aggregation_collection_names
218
- database.collection_names.select {|col_name|col_name=~/#{target_collection_name}_aggregation_(.*)/}
219
- end
220
-
221
- def expire_aggregations!
222
- aggregation_collection_names.each{|agg_col|database[agg_col].drop}
223
- end
224
-
225
- def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
226
- #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
227
- #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
228
- existing = existing_aggregations.map do |agg_col_name|
229
- agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
230
- end
231
-
232
- #This will select all the aggregations that contain ALL of the desired dimension names
233
- #we are sorting by length because the aggregation with the least number of members
234
- #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
235
- #this will not always be true, and situations may exist where it is rarely true, however the alternative
236
- #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
237
- #but until there is some reason to believe the aggregation caching process needs be highly performant,
238
- #this should do for now.
239
- candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
240
-
241
- #If no suitable aggregation exists to base this one off of,
242
- #we'll just use the base cubes aggregation collection
243
- return target_collection_name if candidates.blank?
244
- "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
245
-
246
- end
247
-
248
- def aggregation_for(query)
249
- return collection if query.all_dimensions?
250
-
251
- aggregation_query = query.clone
252
- #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
253
- filter = (query.where if query.respond_to?(:where))
254
- filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/^\$.*/} unless filter.blank?
255
-
256
- dimension_names = aggregation_query.dimension_names.sort
257
- agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"
258
-
259
- unless database.collection_names.include?(agg_col_name)
260
- source_col_name = find_best_source_collection(dimension_names)
261
- exec_query = query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
262
- aggregate(exec_query, :target_collection=>agg_col_name)
263
- end
264
-
265
- database[agg_col_name]
266
- end
267
-
268
- def ensure_indexes(collection_name,dimension_names)
269
- #an index for each dimension
270
- dimension_names.each {|dim|database[collection_name].create_index([dim,Mongo::ASCENDING])}
271
- #and a composite
272
- database[collection_name].create_index(dimension_names)
273
- end
274
-
275
- def aggregate(query,options={})
276
- map, reduce = generate_map_function(query), generate_reduce_function
277
- options[:finalize] = generate_finalize_function(query)
278
- options["query"] = prepare_filter(query,options[:where] || {})
279
-
280
- query.source_collection_name ||= source_collection_name
281
-
282
- target_collection = options.delete(:target_collection)
283
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
284
-
285
- options[:out] = target_collection unless target_collection.blank? || query.transient?
286
-
287
- #This is defensive - some tests run without ever initializing any collections
288
- return [] unless database.collection_names.include?(query.source_collection_name)
289
-
290
- result = database[query.source_collection_name].map_reduce(map,reduce,options)
291
-
292
- ensure_indexes(target_collection,query.dimension_names) if target_collection
293
-
294
- result
295
- end
296
-
297
- def prepare_filter(query,filter={})
298
- filter.merge!(query.where) if query.respond_to?(:where) && query.where
299
- filter.stringify_keys!
300
- transient = (query.transient? || query == self)
301
- filter.keys.each do |key|
302
- next if key=~/^\$.*/
303
- prefix = nil
304
- prefix = "_id" if (member = self.dimensions[key])
305
- prefix = "value" if (member = self.measures[key]) unless member
306
-
307
- raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
308
-
309
- filter_value = filter.delete(key)
310
- if transient
311
- if (member.expression_type == :javascript)
312
- filter_name = "$where"
313
- filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
314
- filter_value = "(#{member.expression})==#{filter_value}"
315
- else
316
- filter_name = member.expression
317
- end
318
- else
319
- filter_name = "#{prefix}.#{member.name}"
320
- end
321
- filter[filter_name] = filter_value
322
- end
323
- filter
324
- end
325
-
326
- def prepare_order_by(query)
327
- order_by = []
328
- query.order_by.each do |order|
329
- prefix = "_id" if (member = self.dimensions[order[0]])
330
- prefix = "value" if (member = self.measures[order[0]]) unless member
331
- raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
332
- order_by << ["#{prefix}.#{order[0]}",order[1]]
333
- end
334
- order_by
335
- end
336
-
337
- def process_if_required
338
- return if database.collection_names.include?(target_collection_name)
339
- process
340
- end
341
-
342
-
343
- def generate_keys_string(query)
344
- "{#{query.dimensions.map{|dim|dim.to_js_keys}.flatten.join(", ")}}"
345
- end
346
-
347
- def generate_values_string(query = self)
348
- "{#{query.measures.map{|measure|measure.to_js_keys}.flatten.join(", ")}}"
349
- end
350
-
351
- def generate_map_function(query = self)
352
- <<MAP
353
- function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
354
- MAP
355
- end
356
-
357
- def generate_reduce_function()
358
- <<REDUCE
359
- function(key,values){
360
- var output = {};
361
- values.forEach(function(doc){
362
- for(var key in doc){
363
- if (doc[key] != null){
364
- output[key] = output[key] || 0;
365
- output[key] += doc[key];
366
- }
367
- }
368
- });
369
- return output;
370
- }
371
- REDUCE
372
- end
373
-
374
- def generate_finalize_function(query = self)
375
- <<FINALIZE
376
- function(key,value)
377
- {
378
-
379
- #{ (query.measures.select{|m|m.aggregation_method == :average}).map do |m|
380
- "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
381
- end.join("\n")}
382
- #{ (query.measures.select{|m|m.aggregation_method == :calculation}).map do|m|
383
- "value.#{m.name}=#{m.expression};";
384
- end.join("\n")}
385
- return value;
386
- }
387
- FINALIZE
388
- end
1
+ require "rubygems"
2
+ require "active_support"
3
+ require "mongo"
4
+ require "logger"
5
+
6
+ dir = File.dirname(__FILE__)
7
+ ["mongo_environment",
8
+ "member",
9
+ "member_list",
10
+ "measure",
11
+ "calculated_measure",
12
+ "dimension",
13
+ "ratio",
14
+ "query",
15
+ "data_level",
16
+ "data",
17
+ "aggregation",
18
+ "date_time",
19
+ "support"].each {|lib|require File.join(dir,'cubicle',lib)}
20
+
21
+ require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
22
+
23
+ module Cubicle
24
+
25
# Requires every Ruby file found under +directory_path+ so the cubicle
# definitions they contain get loaded.
#
# directory_path - directory to scan for *.rb files.
# recursive      - when true (the default) sub-directories are scanned as
#                  well; when false only the directory itself is scanned.
#
# Returns the sorted Array of paths that were passed to require.
def self.register_cubicle_directory(directory_path, recursive=true)
  # The two patterns used to be swapped, so the default (recursive) call only
  # ever looked at the top level of the directory.
  pattern = recursive ? File.join("**", "*.rb") : "*.rb"
  # Dir[] does not promise an ordering on every platform; sort so that the
  # load order is reproducible.
  Dir[File.join(directory_path, pattern)].sort.each { |cubicle| require cubicle }
end
29
+
30
# Returns the object used to reach MongoDB, memoized in @mongo:
# ::MongoMapper when MongoMapper::Document is defined, otherwise
# MongoEnvironment.
def self.mongo
  return @mongo if @mongo
  @mongo = if defined?(::MongoMapper::Document)
    ::MongoMapper
  else
    MongoEnvironment
  end
end
33
+
34
# Returns the logger configured on Cubicle.mongo, or a file logger writing
# to "cubicle.log" when none is configured.
#
# The fallback logger is created once and reused; building a new Logger on
# every call would open the log file again for each message.
def self.logger
  Cubicle.mongo.logger || (@fallback_logger ||= Logger.new("cubicle.log"))
end
37
+
38
# Returns the database object exposed by Cubicle.mongo.
def database
  backend = Cubicle.mongo
  backend.database
end
41
+
42
# Returns the entry of +database+ stored under +target_collection_name+.
def collection
  db = database
  db[target_collection_name]
end
45
+
46
# Tells whether transient! has been called; @transient defaults to false.
def transient?
  @transient = false unless @transient
  @transient
end
49
+
50
# Sets the @transient flag that transient? reports. Returns true.
def transient!
  @transient = true
end
53
+
54
# Drops the target collection and then every aggregation collection.
# Returns whatever expire_aggregations! returns.
def expire!
  base = collection
  base.drop
  expire_aggregations!
end
58
+
59
# Returns the (lazily created) Array of member lists registered through
# aggregation.
def aggregations
  @aggregations ||= []
end
62
+
63
+ #DSL
64
# Reads or sets the name of the collection the cubicle is built from.
#
# collection_name - when given, stored and returned.
#
# Without an argument returns the stored name; the first such call derives
# it from +name+ by chomping "Cubicle" and "Cube", then underscoring and
# pluralizing.
def source_collection_name(collection_name = nil)
  if collection_name
    @source_collection = collection_name
  else
    @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
  end
end
alias source_collection_name= source_collection_name
69
+
70
# Reads or sets the name of the collection the cubicle writes to.
#
# collection_name - when given, stored and returned.
#
# Always returns nil while transient? is true. Without an argument returns
# the stored name, deriving "<base>_cubicle" on first use, where <base> is
# source_collection_name when +name+ is blank and the underscored,
# pluralized +name+ otherwise.
def target_collection_name(collection_name = nil)
  return nil if transient?
  if collection_name
    @target_name = collection_name
  else
    @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
  end
end
alias target_collection_name= target_collection_name
76
+
77
# Builds a Cubicle::Dimension from +args+, appends it to dimensions and
# returns the last entry of that list.
def dimension(*args)
  list = dimensions
  list << Cubicle::Dimension.new(*args)
  list[-1]
end
81
+
82
# Returns the names of all declared dimensions as Strings.
#
# Goes through the dimensions accessor rather than @dimensions so that a
# cubicle without any declared dimension yields an empty list instead of
# failing on nil.
def dimension_names
  dimensions.map { |dim| dim.name.to_s }
end
85
+
86
# Without arguments returns the (lazily created) Cubicle::MemberList of
# dimensions. With arguments declares one dimension per argument - a single
# Array argument is treated as the argument list - and returns @dimensions.
def dimensions(*args)
  return (@dimensions ||= Cubicle::MemberList.new) if args.empty?
  entries = (args.length == 1 && args[0].is_a?(Array)) ? args[0] : args
  entries.each { |entry| dimension(entry) }
  @dimensions
end
92
+
93
# Builds a Measure from +args+, appends it to measures and returns the last
# entry of that list.
def measure(*args)
  list = measures
  list << Measure.new(*args)
  list[-1]
end
97
+
98
# Without arguments returns the (lazily created) Cubicle::MemberList of
# measures. With arguments declares one measure per argument - a single
# Array argument is treated as the argument list - and returns @measures.
def measures(*args)
  return (@measures ||= Cubicle::MemberList.new) if args.empty?
  entries = (args.length == 1 && args[0].is_a?(Array)) ? args[0] : args
  entries.each { |entry| measure(entry) }
  @measures
end
104
+
105
# Declares a measure whose :aggregation_method option is forced to :count.
# Accepts the same arguments as measure and returns its result.
def count(*args)
  options = args.extract_options!
  options[:aggregation_method] = :count
  args.push(options)
  measure(*args)
end
110
+
111
# Declares an :average measure plus the companion "<name>_count" measure
# that the finalize step divides by.
#
# Accepts the same arguments as measure. Returns the result of the count
# declaration (the last measure added).
def average(*args)
  options = args.extract_options!
  options[:aggregation_method] = :average
  measure(*(args + [options]))
  # Averaged fields need a count of non-null values to properly calculate
  # the average.
  count_args = args.dup
  count_args[0] = "#{args[0]}_count".to_sym
  # count overwrites :aggregation_method in the options it receives, so it
  # gets its own copy; sharing the Hash would let that write reach the
  # options already handed to the average measure.
  count(*(count_args << options.dup))
end
alias avg average
120
+
121
# Declares a measure whose :aggregation_method option is forced to :sum.
# Accepts the same arguments as measure and returns its result.
def sum(*args)
  options = args.extract_options!
  options[:aggregation_method] = :sum
  args.push(options)
  measure(*args)
end
126
+
127
# Appends a Ratio built from the three arguments to measures and returns
# the result of that append.
def ratio(member_name, numerator, denominator)
  list = measures
  list << Ratio.new(member_name, numerator, denominator)
end
130
+
131
# Registers a list of members to pre-aggregate. The members may be passed
# individually or as a single Array. Returns the aggregations list.
def aggregation(*member_list)
  members = member_list[0].is_a?(Array) ? member_list[0] : member_list
  aggregations << members
end
135
+
136
# With arguments declares a dimension from them and remembers it as the
# time dimension; without arguments returns the remembered dimension (nil
# when none was declared). Also available as time_dimension=, date and time.
def time_dimension(*args)
  if args.length > 0
    @time_dimension = dimension(*args)
  else
    @time_dimension ||= nil
  end
end
alias time_dimension= time_dimension
alias date time_dimension
alias time time_dimension
143
+
144
# Looks +member_name+ up among the dimensions first and the measures
# second. Returns the first non-nil hit, or the measures lookup result
# when there is none.
#
# Uses the dimensions/measures accessors rather than the instance variables
# so the lookup also works before either list has been initialised.
def find_member(member_name)
  dimensions[member_name] || measures[member_name]
end
148
+
149
# Builds a Cubicle::Query against this cubicle and, unless deferred,
# executes it.
#
# args  - member names to select, optionally followed by an options Hash.
#         :source_collection is removed from the options and assigned to
#         the query; :defer makes the method return the unexecuted query;
#         the remaining options are passed on to execute_query.
# block - optional; yielded the query when it takes exactly one argument,
#         otherwise evaluated in the context of the query.
#
# Returns the query when :defer is set, otherwise the execute_query result
# (hierarchized when the query answers a non-empty by).
def query(*args,&block)
  options = args.extract_options!
  built = Cubicle::Query.new(self)
  built.source_collection_name = options.delete(:source_collection) if options[:source_collection]
  built.select(*args) if args.length > 0
  if block_given?
    if block.arity == 1
      yield built
    else
      built.instance_eval(&block)
    end
  end
  built.select_all unless built.selected?
  return built if options[:defer]

  results = execute_query(built,options)
  # If the 'by' clause was used in the query, hierarchize by the members
  # indicated, as the next step would otherwise almost certainly need to be
  # a call to hierarchize anyway.
  if built.respond_to?(:by) && built.by.length > 0
    results.hierarchize(*built.by)
  else
    results
  end
end
166
+
167
# Runs +query+ and wraps the rows in a Cubicle::Data.
#
# query   - the query to run (or the cubicle itself).
# options - passed to aggregate for transient queries; options[:where] is
#           used as the base filter when a cached aggregation is read.
#
# Transient queries (and the cubicle itself) are aggregated directly. Other
# queries read from the collection returned by aggregation_for; when that
# collection holds members the query did not ask for, a further aggregate
# call reduces it first. The count passed to Cubicle::Data is that of the
# whole collection read from, taken before filter, limit and skip apply.
def execute_query(query,options={})
  find_options = {
    :limit=>query.limit || 0,
    :skip=>query.offset || 0,
    :sort=>prepare_order_by(query)
  }
  filter = {}

  if query == self || query.transient?
    aggregation = aggregate(query,options)
  else
    process_if_required
    aggregation = aggregation_for(query)
    # Members present in the aggregation (taken from the part of its name
    # after the last "_") but not requested by the query.
    unrequested = (aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]
    if unrequested.blank?
      # Exact match: a simple find is enough.
      filter = prepare_filter(query,options[:where] || {})
    else
      # Otherwise a second map reduce is required to reduce the data set
      # one last time.
      aggregation = aggregate(query,:source_collection=>collection.name)
    end
  end

  count = aggregation.count
  #noinspection RubyArgCount
  data = aggregation.find(filter,find_options).to_a
  #noinspection RubyArgCount
  aggregation.drop if aggregation.name =~ /^tmp.mr.*/
  Cubicle::Data.new(query, data, count)
end
197
+
198
# Rebuilds the cubicle: expires the existing collections, aggregates the
# cubicle itself and then materialises every registered aggregation,
# logging timings through Cubicle.logger.
#
# options - passed through to aggregate.
def process(options={})
  Cubicle.logger.info "Processing #{name} @ #{Time.now}"
  start = Time.now
  expire!
  aggregate(self,options)
  # Sort desc by length of array, so that larger aggregations are processed
  # first, hopefully increasing efficiency of the processing step.
  aggregations.sort!{|a,b|b.length<=>a.length}
  aggregations.each do |member_list|
    agg_start = Time.now
    aggregation_for(query(:defer=>true){select member_list})
    Cubicle.logger.info "#{name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
  end
  duration = Time.now - start
  # The timestamp and "in" used to run together ("...+0000in 0.1 seconds.").
  Cubicle.logger.info "#{name} processed @ #{Time.now} in #{duration} seconds."
end
215
+
216
+ protected
217
+
218
# Returns the names of the collections in +database+ that hold this
# cubicle's aggregations, i.e. those starting with
# "<target_collection_name>_aggregation_".
#
# The match is a literal prefix test. The former unanchored regular
# expression also matched other cubicles whose target name merely ends with
# this one (and treated "." in the name as a wildcard), which made
# expire_aggregations! drop collections it does not own.
def aggregation_collection_names
  prefix = "#{target_collection_name}_aggregation_"
  database.collection_names.select { |col_name| col_name.start_with?(prefix) }
end
221
+
222
# Drops every collection listed by aggregation_collection_names and
# returns that list.
def expire_aggregations!
  stale = aggregation_collection_names
  stale.each do |agg_col|
    database[agg_col].drop
  end
end
225
+
226
# Picks the collection a new aggregation over +dimension_names+ should be
# computed from.
#
# dimension_names       - Array of dimension names the aggregation needs.
# existing_aggregations - names of the aggregation collections to choose
#                         from (defaults to aggregation_collection_names).
#
# Aggregation collections are named
# "<target_collection_name>_aggregation_dim1.dim2...dimN". Among those that
# contain ALL requested dimensions the one with the fewest members is
# chosen, on the assumption that it holds the fewest rows; counting rows of
# each candidate would be more precise but costlier.
#
# Returns the chosen collection name, or target_collection_name when no
# existing aggregation is suitable.
def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
  prefix = "#{target_collection_name}_aggregation_"

  # One Array of member names per existing aggregation.
  member_sets = existing_aggregations.map do |agg_col_name|
    agg_col_name.gsub(prefix,"").split(".")
  end

  covering = member_sets.select { |members| (dimension_names - members).empty? }
  covering = covering.sort { |a,b| a.length <=> b.length }

  # If no suitable aggregation exists to base this one off of, use the base
  # cube's collection.
  return target_collection_name if covering.empty?
  "#{prefix}#{covering[0].join('.')}"
end
248
+
249
# Returns the collection +query+ should read from, creating the matching
# aggregation collection first when it does not exist yet.
#
# A query over all dimensions reads the base collection. Otherwise the
# aggregation is keyed by the sorted dimension names of the query plus any
# member it filters on ("$"-prefixed filter keys excepted).
def aggregation_for(query)
  return collection if query.all_dimensions?

  aggregation_query = query.clone
  # If the query needs to filter on a field, it had better be in the
  # aggregation... unless it is a $where-style filter.
  filter = query.respond_to?(:where) ? query.where : nil
  unless filter.blank?
    filter.keys.each do |filter_key|
      aggregation_query.select(filter_key) unless filter_key=~/^\$.*/
    end
  end

  dimension_names = aggregation_query.dimension_names.sort
  agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"

  already_built = database.collection_names.include?(agg_col_name)
  if !already_built
    source_col_name = find_best_source_collection(dimension_names)
    exec_query = query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
    aggregate(exec_query, :target_collection=>agg_col_name)
  end

  database[agg_col_name]
end
268
+
269
# Creates one ascending index per entry of +dimension_names+ on the
# collection called +collection_name+, followed by one index over all of
# them together.
def ensure_indexes(collection_name,dimension_names)
  # An index for each dimension...
  dimension_names.each do |dim|
    single_field = [dim,Mongo::ASCENDING]
    database[collection_name].create_index(single_field)
  end
  # ...and a composite.
  database[collection_name].create_index(dimension_names)
end
275
+
276
# Runs a map/reduce for +query+ against its source collection.
#
# query   - the query (or the cubicle itself) to aggregate.
# options - passed to map_reduce after :finalize, "query" and, where
#           applicable, :out have been set on it; :target_collection is
#           removed and used as the output collection name, and
#           options[:where] seeds the filter.
#
# Returns the map_reduce result, or an empty Array when the source
# collection does not exist.
def aggregate(query,options={})
  map = generate_map_function(query)
  reduce = generate_reduce_function
  options[:finalize] = generate_finalize_function(query)
  options["query"] = prepare_filter(query,options[:where] || {})

  query.source_collection_name ||= source_collection_name

  target_collection = options.delete(:target_collection)
  if query.respond_to?(:target_collection_name)
    target_collection ||= query.target_collection_name
  end

  persist = !(target_collection.blank? || query.transient?)
  options[:out] = target_collection if persist

  # This is defensive - some tests run without ever initializing any
  # collections.
  source_exists = database.collection_names.include?(query.source_collection_name)
  return [] unless source_exists

  result = database[query.source_collection_name].map_reduce(map,reduce,options)

  ensure_indexes(target_collection,query.dimension_names) if target_collection

  result
end
297
+
298
# Translates a filter expressed in member names into one expressed in the
# field names of the collection being queried.
#
# query  - the query (or the cubicle itself); its where clause, when it
#          has one, is merged over +filter+.
# filter - Hash of member name => value. Keys starting with "$" are passed
#          through untouched. The Hash itself is left unmodified.
#
# For transient queries (and the cubicle itself) a member maps to its
# expression, or to a "$where" comparison when the expression is
# JavaScript; otherwise it maps to "_id.<name>" for dimensions and
# "value.<name>" for measures.
#
# Returns a new Hash with String keys.
# Raises RuntimeError when a key is neither a dimension nor a measure.
def prepare_filter(query,filter={})
  # Work on a private copy. Rewriting the caller's hash in place meant a
  # :where hash reused for a second query arrived with already translated
  # keys and was rejected as "not a member".
  where = (query.respond_to?(:where) && query.where) || {}
  filter = filter.merge(where).stringify_keys
  transient = (query.transient? || query == self)
  filter.keys.each do |key|
    next if key=~/^\$.*/
    prefix = nil
    member = dimensions[key]
    if member
      prefix = "_id"
    else
      member = measures[key]
      prefix = "value" if member
    end

    raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member

    filter_value = filter.delete(key)
    if transient
      if (member.expression_type == :javascript)
        filter_name = "$where"
        # Quote textual values so they form a valid JavaScript literal.
        filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
        filter_value = "(#{member.expression})==#{filter_value}"
      else
        filter_name = member.expression
      end
    else
      filter_name = "#{prefix}.#{member.name}"
    end
    filter[filter_name] = filter_value
  end
  filter
end
326
+
327
# Translates the order_by clause of +query+ into sort pairs for the
# aggregation collection: dimensions become "_id.<name>" and measures
# "value.<name>".
#
# Returns an Array of [field, direction] pairs.
# Raises RuntimeError naming the field when it is neither a dimension nor
# a measure. (The message used to interpolate an undefined variable, so
# the failure surfaced as a NameError instead.)
def prepare_order_by(query)
  order_by = []
  query.order_by.each do |order|
    field = order[0]
    member = dimensions[field]
    prefix = "_id" if member
    unless member
      member = measures[field]
      prefix = "value" if member
    end
    raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{field}" unless member
    order_by << ["#{prefix}.#{field}",order[1]]
  end
  order_by
end
337
+
338
# Calls process unless the database already lists the target collection.
def process_if_required
  process unless database.collection_names.include?(target_collection_name)
end
342
+
343
+
344
# Returns a "{...}" string listing, comma separated, the to_js_keys of
# every dimension of +query+.
def generate_keys_string(query)
  js_keys = query.dimensions.map { |dim| dim.to_js_keys }.flatten
  "{" + js_keys.join(", ") + "}"
end
347
+
348
# Returns a "{...}" string listing, comma separated, the to_js_keys of
# every measure of +query+ (the cubicle itself by default).
def generate_values_string(query = self)
  js_values = query.measures.map { |measure| measure.to_js_keys }.flatten
  "{" + js_values.join(", ") + "}"
end
351
+
352
# Returns the source of the JavaScript map function for +query+ (the
# cubicle itself by default): a function that emits the object built by
# generate_keys_string as key and the one built by generate_values_string
# as value.
+ def generate_map_function(query = self)
353
+ <<MAP
354
+ function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
355
+ MAP
356
+ end
357
+
358
# Returns the source of the JavaScript reduce function: for each property
# found in the emitted values it adds up every non-null occurrence,
# starting from 0, and returns the object holding those totals.
+ def generate_reduce_function()
358
+ <<REDUCE
359
+ function(key,values){
360
+ var output = {};
361
+ values.forEach(function(doc){
362
+ for(var key in doc){
363
+ if (doc[key] != null){
364
+ output[key] = output[key] || 0;
365
+ output[key] += doc[key];
366
+ }
367
+ }
368
+ });
369
+ return output;
370
+ }
371
+ REDUCE
372
+ end
374
+
375
# Returns the source of the JavaScript finalize function for +query+ (the
# cubicle itself by default). The generated function contains one statement
# per :average measure dividing value.<name> by value.<name>_count, then
# one per :calculation measure assigning its expression to value.<name>,
# and finally returns value.
# NOTE(review): the generated division has no guard for a zero or missing
# <name>_count - confirm whether that case can occur.
+ def generate_finalize_function(query = self)
375
+ <<FINALIZE
376
+ function(key,value)
377
+ {
378
+
379
+ #{ (query.measures.select{|m|m.aggregation_method == :average}).map do |m|
380
+ "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
381
+ end.join("\n")}
382
+ #{ (query.measures.select{|m|m.aggregation_method == :calculation}).map do|m|
383
+ "value.#{m.name}=#{m.expression};";
384
+ end.join("\n")}
385
+ return value;
386
+ }
387
+ FINALIZE
388
+ end
389
390
  end