cubicle 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cubicle}
8
- s.version = "0.1.0"
8
+ s.version = "0.1.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Nathan Stults"]
12
- s.date = %q{2010-03-13}
12
+ s.date = %q{2010-03-14}
13
13
  s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
14
14
  s.email = %q{hereiam@sonic.net}
15
15
  s.extra_rdoc_files = [
@@ -1,389 +1,390 @@
1
- require "rubygems"
2
- require "active_support"
3
- require "mongo"
4
-
5
- dir = File.dirname(__FILE__)
6
- ["mongo_environment",
7
- "member",
8
- "member_list",
9
- "measure",
10
- "calculated_measure",
11
- "dimension",
12
- "ratio",
13
- "query",
14
- "data_level",
15
- "data",
16
- "aggregation",
17
- "date_time",
18
- "support"].each {|lib|require File.join(dir,'cubicle',lib)}
19
-
20
- require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
21
-
22
- module Cubicle
23
-
24
- def self.register_cubicle_directory(directory_path, recursive=true)
25
- searcher = "#{recursive ? "*" : "**/*"}.rb"
26
- Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
27
- end
28
-
29
- def self.mongo
30
- @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
31
- end
32
-
33
- def self.logger
34
- Cubicle.mongo.logger
35
- end
36
-
37
- def database
38
- Cubicle.mongo.database
39
- end
40
-
41
- def collection
42
- database[target_collection_name]
43
- end
44
-
45
- def transient?
46
- @transient ||= false
47
- end
48
-
49
- def transient!
50
- @transient = true
51
- end
52
-
53
- def expire!
54
- collection.drop
55
- expire_aggregations!
56
- end
57
-
58
- def aggregations
59
- return (@aggregations ||= [])
60
- end
61
-
62
- #DSL
63
- def source_collection_name(collection_name = nil)
64
- return @source_collection = collection_name if collection_name
65
- @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
66
- end
67
- alias source_collection_name= source_collection_name
68
-
69
- def target_collection_name(collection_name = nil)
70
- return nil if transient?
71
- return @target_name = collection_name if collection_name
72
- @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
73
- end
74
- alias target_collection_name= target_collection_name
75
-
76
- def dimension(*args)
77
- dimensions << Cubicle::Dimension.new(*args)
78
- dimensions[-1]
79
- end
80
-
81
- def dimension_names
82
- return @dimensions.map{|dim|dim.name.to_s}
83
- end
84
-
85
- def dimensions(*args)
86
- return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
87
- args = args[0] if args.length == 1 && args[0].is_a?(Array)
88
- args.each {|dim| dimension dim }
89
- @dimensions
90
- end
91
-
92
- def measure(*args)
93
- measures << Measure.new(*args)
94
- measures[-1]
95
- end
96
-
97
- def measures(*args)
98
- return (@measures ||= Cubicle::MemberList.new) if args.length < 1
99
- args = args[0] if args.length == 1 && args[0].is_a?(Array)
100
- args.each {|m| measure m}
101
- @measures
102
- end
103
-
104
- def count(*args)
105
- options = args.extract_options!
106
- options[:aggregation_method] = :count
107
- measure(*(args << options))
108
- end
109
-
110
- def average(*args)
111
- options = args.extract_options!
112
- options[:aggregation_method] = :average
113
- measure(*(args << options))
114
- #Averaged fields need a count of non-null values to properly calculate the average
115
- args[0] = "#{args[0]}_count".to_sym
116
- count *args
117
- end
118
- alias avg average
119
-
120
- def sum(*args)
121
- options = args.extract_options!
122
- options[:aggregation_method] = :sum
123
- measure(*(args << options))
124
- end
125
-
126
- def ratio(member_name, numerator, denominator)
127
- measures << Ratio.new(member_name, numerator, denominator)
128
- end
129
-
130
- def aggregation(*member_list)
131
- member_list = member_list[0] if member_list[0].is_a?(Array)
132
- aggregations << member_list
133
- end
134
-
135
- def time_dimension(*args)
136
- return (@time_dimension ||= nil) unless args.length > 0
137
- @time_dimension = dimension(*args)
138
- end
139
- alias time_dimension= time_dimension
140
- alias date time_dimension
141
- alias time time_dimension
142
-
143
- def find_member(member_name)
144
- @dimensions[member_name] ||
145
- @measures[member_name]
146
- end
147
-
148
- def query(*args,&block)
149
- options = args.extract_options!
150
- query = Cubicle::Query.new(self)
151
- query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
152
- query.select(*args) if args.length > 0
153
- if block_given?
154
- block.arity == 1 ? (yield query) : (query.instance_eval(&block))
155
- end
156
- query.select_all unless query.selected?
157
- return query if options[:defer]
158
- results = execute_query(query,options)
159
- #If the 'by' clause was used in the the query,
160
- #we'll hierarchize by the members indicated,
161
- #as the next step would otherwise almost certainly
162
- #need to be a call to hierarchize anyway.
163
- query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
164
- end
165
-
166
- def execute_query(query,options={})
167
- count = 0
168
-
169
- find_options = {
170
- :limit=>query.limit || 0,
171
- :skip=>query.offset || 0
172
- }
173
-
174
- find_options[:sort] = prepare_order_by(query)
175
- filter = {}
176
- if query == self || query.transient?
177
- aggregation = aggregate(query,options)
178
- else
179
- process_if_required
180
- aggregation = aggregation_for(query)
181
- #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
182
- #otherwise, a second map reduce is required to reduce the data set one last time
183
- if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
184
- filter = prepare_filter(query,options[:where] || {})
185
- else
186
- aggregation = aggregate(query,:source_collection=>collection.name)
187
- end
188
- end
189
- count = aggregation.count
190
- #noinspection RubyArgCount
191
- data = aggregation.find(filter,find_options).to_a
192
- #noinspection RubyArgCount
193
- aggregation.drop if aggregation.name =~ /^tmp.mr.*/
194
- Cubicle::Data.new(query, data, count)
195
- end
196
-
197
- def process(options={})
198
- Cubicle.logger.info "Processing #{self.name} @ #{Time.now}"
199
- start = Time.now
200
- expire!
201
- aggregate(self,options)
202
- #Sort desc by length of array, so that larget
203
- #aggregations are processed first, hopefully increasing efficiency
204
- #of the processing step
205
- aggregations.sort!{|a,b|b.length<=>a.length}
206
- aggregations.each do |member_list|
207
- agg_start = Time.now
208
- aggregation_for(query(:defer=>true){select member_list})
209
- Cubicle.logger.info "#{self.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
210
- end
211
- duration = Time.now - start
212
- Cubicle.logger.info "#{self.name} processed @ #{Time.now}in #{duration} seconds."
213
- end
214
-
215
- protected
216
-
217
- def aggregation_collection_names
218
- database.collection_names.select {|col_name|col_name=~/#{target_collection_name}_aggregation_(.*)/}
219
- end
220
-
221
- def expire_aggregations!
222
- aggregation_collection_names.each{|agg_col|database[agg_col].drop}
223
- end
224
-
225
- def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
226
- #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
227
- #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
228
- existing = existing_aggregations.map do |agg_col_name|
229
- agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
230
- end
231
-
232
- #This will select all the aggregations that contain ALL of the desired dimension names
233
- #we are sorting by length because the aggregation with the least number of members
234
- #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
235
- #this will not always be true, and situations may exist where it is rarely true, however the alternative
236
- #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
237
- #but until there is some reason to believe the aggregation caching process needs be highly performant,
238
- #this should do for now.
239
- candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
240
-
241
- #If no suitable aggregation exists to base this one off of,
242
- #we'll just use the base cubes aggregation collection
243
- return target_collection_name if candidates.blank?
244
- "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
245
-
246
- end
247
-
248
- def aggregation_for(query)
249
- return collection if query.all_dimensions?
250
-
251
- aggregation_query = query.clone
252
- #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
253
- filter = (query.where if query.respond_to?(:where))
254
- filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/^\$.*/} unless filter.blank?
255
-
256
- dimension_names = aggregation_query.dimension_names.sort
257
- agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"
258
-
259
- unless database.collection_names.include?(agg_col_name)
260
- source_col_name = find_best_source_collection(dimension_names)
261
- exec_query = query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
262
- aggregate(exec_query, :target_collection=>agg_col_name)
263
- end
264
-
265
- database[agg_col_name]
266
- end
267
-
268
- def ensure_indexes(collection_name,dimension_names)
269
- #an index for each dimension
270
- dimension_names.each {|dim|database[collection_name].create_index([dim,Mongo::ASCENDING])}
271
- #and a composite
272
- database[collection_name].create_index(dimension_names)
273
- end
274
-
275
- def aggregate(query,options={})
276
- map, reduce = generate_map_function(query), generate_reduce_function
277
- options[:finalize] = generate_finalize_function(query)
278
- options["query"] = prepare_filter(query,options[:where] || {})
279
-
280
- query.source_collection_name ||= source_collection_name
281
-
282
- target_collection = options.delete(:target_collection)
283
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
284
-
285
- options[:out] = target_collection unless target_collection.blank? || query.transient?
286
-
287
- #This is defensive - some tests run without ever initializing any collections
288
- return [] unless database.collection_names.include?(query.source_collection_name)
289
-
290
- result = database[query.source_collection_name].map_reduce(map,reduce,options)
291
-
292
- ensure_indexes(target_collection,query.dimension_names) if target_collection
293
-
294
- result
295
- end
296
-
297
- def prepare_filter(query,filter={})
298
- filter.merge!(query.where) if query.respond_to?(:where) && query.where
299
- filter.stringify_keys!
300
- transient = (query.transient? || query == self)
301
- filter.keys.each do |key|
302
- next if key=~/^\$.*/
303
- prefix = nil
304
- prefix = "_id" if (member = self.dimensions[key])
305
- prefix = "value" if (member = self.measures[key]) unless member
306
-
307
- raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
308
-
309
- filter_value = filter.delete(key)
310
- if transient
311
- if (member.expression_type == :javascript)
312
- filter_name = "$where"
313
- filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
314
- filter_value = "(#{member.expression})==#{filter_value}"
315
- else
316
- filter_name = member.expression
317
- end
318
- else
319
- filter_name = "#{prefix}.#{member.name}"
320
- end
321
- filter[filter_name] = filter_value
322
- end
323
- filter
324
- end
325
-
326
- def prepare_order_by(query)
327
- order_by = []
328
- query.order_by.each do |order|
329
- prefix = "_id" if (member = self.dimensions[order[0]])
330
- prefix = "value" if (member = self.measures[order[0]]) unless member
331
- raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
332
- order_by << ["#{prefix}.#{order[0]}",order[1]]
333
- end
334
- order_by
335
- end
336
-
337
- def process_if_required
338
- return if database.collection_names.include?(target_collection_name)
339
- process
340
- end
341
-
342
-
343
- def generate_keys_string(query)
344
- "{#{query.dimensions.map{|dim|dim.to_js_keys}.flatten.join(", ")}}"
345
- end
346
-
347
- def generate_values_string(query = self)
348
- "{#{query.measures.map{|measure|measure.to_js_keys}.flatten.join(", ")}}"
349
- end
350
-
351
- def generate_map_function(query = self)
352
- <<MAP
353
- function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
354
- MAP
355
- end
356
-
357
- def generate_reduce_function()
358
- <<REDUCE
359
- function(key,values){
360
- var output = {};
361
- values.forEach(function(doc){
362
- for(var key in doc){
363
- if (doc[key] != null){
364
- output[key] = output[key] || 0;
365
- output[key] += doc[key];
366
- }
367
- }
368
- });
369
- return output;
370
- }
371
- REDUCE
372
- end
373
-
374
- def generate_finalize_function(query = self)
375
- <<FINALIZE
376
- function(key,value)
377
- {
378
-
379
- #{ (query.measures.select{|m|m.aggregation_method == :average}).map do |m|
380
- "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
381
- end.join("\n")}
382
- #{ (query.measures.select{|m|m.aggregation_method == :calculation}).map do|m|
383
- "value.#{m.name}=#{m.expression};";
384
- end.join("\n")}
385
- return value;
386
- }
387
- FINALIZE
388
- end
1
+ require "rubygems"
2
+ require "active_support"
3
+ require "mongo"
4
+ require "logger"
5
+
6
+ dir = File.dirname(__FILE__)
7
+ ["mongo_environment",
8
+ "member",
9
+ "member_list",
10
+ "measure",
11
+ "calculated_measure",
12
+ "dimension",
13
+ "ratio",
14
+ "query",
15
+ "data_level",
16
+ "data",
17
+ "aggregation",
18
+ "date_time",
19
+ "support"].each {|lib|require File.join(dir,'cubicle',lib)}
20
+
21
+ require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
22
+
23
+ module Cubicle
24
+
25
+ def self.register_cubicle_directory(directory_path, recursive=true)
26
+ searcher = "#{recursive ? "*" : "**/*"}.rb"
27
+ Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
28
+ end
29
+
30
+ def self.mongo
31
+ @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
32
+ end
33
+
34
+ def self.logger
35
+ Cubicle.mongo.logger || Logger.new("cubicle.log")
36
+ end
37
+
38
+ def database
39
+ Cubicle.mongo.database
40
+ end
41
+
42
+ def collection
43
+ database[target_collection_name]
44
+ end
45
+
46
+ def transient?
47
+ @transient ||= false
48
+ end
49
+
50
+ def transient!
51
+ @transient = true
52
+ end
53
+
54
+ def expire!
55
+ collection.drop
56
+ expire_aggregations!
57
+ end
58
+
59
+ def aggregations
60
+ return (@aggregations ||= [])
61
+ end
62
+
63
+ #DSL
64
+ def source_collection_name(collection_name = nil)
65
+ return @source_collection = collection_name if collection_name
66
+ @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
67
+ end
68
+ alias source_collection_name= source_collection_name
69
+
70
+ def target_collection_name(collection_name = nil)
71
+ return nil if transient?
72
+ return @target_name = collection_name if collection_name
73
+ @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
74
+ end
75
+ alias target_collection_name= target_collection_name
76
+
77
+ def dimension(*args)
78
+ dimensions << Cubicle::Dimension.new(*args)
79
+ dimensions[-1]
80
+ end
81
+
82
+ def dimension_names
83
+ return @dimensions.map{|dim|dim.name.to_s}
84
+ end
85
+
86
+ def dimensions(*args)
87
+ return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
88
+ args = args[0] if args.length == 1 && args[0].is_a?(Array)
89
+ args.each {|dim| dimension dim }
90
+ @dimensions
91
+ end
92
+
93
+ def measure(*args)
94
+ measures << Measure.new(*args)
95
+ measures[-1]
96
+ end
97
+
98
+ def measures(*args)
99
+ return (@measures ||= Cubicle::MemberList.new) if args.length < 1
100
+ args = args[0] if args.length == 1 && args[0].is_a?(Array)
101
+ args.each {|m| measure m}
102
+ @measures
103
+ end
104
+
105
+ def count(*args)
106
+ options = args.extract_options!
107
+ options[:aggregation_method] = :count
108
+ measure(*(args << options))
109
+ end
110
+
111
+ def average(*args)
112
+ options = args.extract_options!
113
+ options[:aggregation_method] = :average
114
+ measure(*(args << options))
115
+ #Averaged fields need a count of non-null values to properly calculate the average
116
+ args[0] = "#{args[0]}_count".to_sym
117
+ count *args
118
+ end
119
+ alias avg average
120
+
121
+ def sum(*args)
122
+ options = args.extract_options!
123
+ options[:aggregation_method] = :sum
124
+ measure(*(args << options))
125
+ end
126
+
127
+ def ratio(member_name, numerator, denominator)
128
+ measures << Ratio.new(member_name, numerator, denominator)
129
+ end
130
+
131
+ def aggregation(*member_list)
132
+ member_list = member_list[0] if member_list[0].is_a?(Array)
133
+ aggregations << member_list
134
+ end
135
+
136
+ def time_dimension(*args)
137
+ return (@time_dimension ||= nil) unless args.length > 0
138
+ @time_dimension = dimension(*args)
139
+ end
140
+ alias time_dimension= time_dimension
141
+ alias date time_dimension
142
+ alias time time_dimension
143
+
144
+ def find_member(member_name)
145
+ @dimensions[member_name] ||
146
+ @measures[member_name]
147
+ end
148
+
149
+ def query(*args,&block)
150
+ options = args.extract_options!
151
+ query = Cubicle::Query.new(self)
152
+ query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
153
+ query.select(*args) if args.length > 0
154
+ if block_given?
155
+ block.arity == 1 ? (yield query) : (query.instance_eval(&block))
156
+ end
157
+ query.select_all unless query.selected?
158
+ return query if options[:defer]
159
+ results = execute_query(query,options)
160
+ #If the 'by' clause was used in the query,
161
+ #we'll hierarchize by the members indicated,
162
+ #as the next step would otherwise almost certainly
163
+ #need to be a call to hierarchize anyway.
164
+ query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
165
+ end
166
+
167
+ def execute_query(query,options={})
168
+ count = 0
169
+
170
+ find_options = {
171
+ :limit=>query.limit || 0,
172
+ :skip=>query.offset || 0
173
+ }
174
+
175
+ find_options[:sort] = prepare_order_by(query)
176
+ filter = {}
177
+ if query == self || query.transient?
178
+ aggregation = aggregate(query,options)
179
+ else
180
+ process_if_required
181
+ aggregation = aggregation_for(query)
182
+ #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
183
+ #otherwise, a second map reduce is required to reduce the data set one last time
184
+ if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
185
+ filter = prepare_filter(query,options[:where] || {})
186
+ else
187
+ aggregation = aggregate(query,:source_collection=>collection.name)
188
+ end
189
+ end
190
+ count = aggregation.count
191
+ #noinspection RubyArgCount
192
+ data = aggregation.find(filter,find_options).to_a
193
+ #noinspection RubyArgCount
194
+ aggregation.drop if aggregation.name =~ /^tmp.mr.*/
195
+ Cubicle::Data.new(query, data, count)
196
+ end
197
+
198
+ def process(options={})
199
+ Cubicle.logger.info "Processing #{self.name} @ #{Time.now}"
200
+ start = Time.now
201
+ expire!
202
+ aggregate(self,options)
203
+ #Sort desc by length of array, so that larger
204
+ #aggregations are processed first, hopefully increasing efficiency
205
+ #of the processing step
206
+ aggregations.sort!{|a,b|b.length<=>a.length}
207
+ aggregations.each do |member_list|
208
+ agg_start = Time.now
209
+ aggregation_for(query(:defer=>true){select member_list})
210
+ Cubicle.logger.info "#{self.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
211
+ end
212
+ duration = Time.now - start
213
+ Cubicle.logger.info "#{self.name} processed @ #{Time.now}in #{duration} seconds."
214
+ end
215
+
216
+ protected
217
+
218
+ def aggregation_collection_names
219
+ database.collection_names.select {|col_name|col_name=~/#{target_collection_name}_aggregation_(.*)/}
220
+ end
221
+
222
+ def expire_aggregations!
223
+ aggregation_collection_names.each{|agg_col|database[agg_col].drop}
224
+ end
225
+
226
+ def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
227
+ #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
228
+ #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
229
+ existing = existing_aggregations.map do |agg_col_name|
230
+ agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
231
+ end
232
+
233
+ #This will select all the aggregations that contain ALL of the desired dimension names
234
+ #we are sorting by length because the aggregation with the least number of members
235
+ #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
236
+ #this will not always be true, and situations may exist where it is rarely true, however the alternative
237
+ #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
238
+ #but until there is some reason to believe the aggregation caching process needs to be highly performant,
239
+ #this should do for now.
240
+ candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
241
+
242
+ #If no suitable aggregation exists to base this one off of,
243
+ #we'll just use the base cubes aggregation collection
244
+ return target_collection_name if candidates.blank?
245
+ "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
246
+
247
+ end
248
+
249
+ def aggregation_for(query)
250
+ return collection if query.all_dimensions?
251
+
252
+ aggregation_query = query.clone
253
+ #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
254
+ filter = (query.where if query.respond_to?(:where))
255
+ filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/^\$.*/} unless filter.blank?
256
+
257
+ dimension_names = aggregation_query.dimension_names.sort
258
+ agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"
259
+
260
+ unless database.collection_names.include?(agg_col_name)
261
+ source_col_name = find_best_source_collection(dimension_names)
262
+ exec_query = query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
263
+ aggregate(exec_query, :target_collection=>agg_col_name)
264
+ end
265
+
266
+ database[agg_col_name]
267
+ end
268
+
269
+ def ensure_indexes(collection_name,dimension_names)
270
+ #an index for each dimension
271
+ dimension_names.each {|dim|database[collection_name].create_index([dim,Mongo::ASCENDING])}
272
+ #and a composite
273
+ database[collection_name].create_index(dimension_names)
274
+ end
275
+
276
+ def aggregate(query,options={})
277
+ map, reduce = generate_map_function(query), generate_reduce_function
278
+ options[:finalize] = generate_finalize_function(query)
279
+ options["query"] = prepare_filter(query,options[:where] || {})
280
+
281
+ query.source_collection_name ||= source_collection_name
282
+
283
+ target_collection = options.delete(:target_collection)
284
+ target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
285
+
286
+ options[:out] = target_collection unless target_collection.blank? || query.transient?
287
+
288
+ #This is defensive - some tests run without ever initializing any collections
289
+ return [] unless database.collection_names.include?(query.source_collection_name)
290
+
291
+ result = database[query.source_collection_name].map_reduce(map,reduce,options)
292
+
293
+ ensure_indexes(target_collection,query.dimension_names) if target_collection
294
+
295
+ result
296
+ end
297
+
298
+ def prepare_filter(query,filter={})
299
+ filter.merge!(query.where) if query.respond_to?(:where) && query.where
300
+ filter.stringify_keys!
301
+ transient = (query.transient? || query == self)
302
+ filter.keys.each do |key|
303
+ next if key=~/^\$.*/
304
+ prefix = nil
305
+ prefix = "_id" if (member = self.dimensions[key])
306
+ prefix = "value" if (member = self.measures[key]) unless member
307
+
308
+ raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
309
+
310
+ filter_value = filter.delete(key)
311
+ if transient
312
+ if (member.expression_type == :javascript)
313
+ filter_name = "$where"
314
+ filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
315
+ filter_value = "(#{member.expression})==#{filter_value}"
316
+ else
317
+ filter_name = member.expression
318
+ end
319
+ else
320
+ filter_name = "#{prefix}.#{member.name}"
321
+ end
322
+ filter[filter_name] = filter_value
323
+ end
324
+ filter
325
+ end
326
+
327
+ def prepare_order_by(query)
328
+ order_by = []
329
+ query.order_by.each do |order|
330
+ prefix = "_id" if (member = self.dimensions[order[0]])
331
+ prefix = "value" if (member = self.measures[order[0]]) unless member
332
+ raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
333
+ order_by << ["#{prefix}.#{order[0]}",order[1]]
334
+ end
335
+ order_by
336
+ end
337
+
338
+ def process_if_required
339
+ return if database.collection_names.include?(target_collection_name)
340
+ process
341
+ end
342
+
343
+
344
+ def generate_keys_string(query)
345
+ "{#{query.dimensions.map{|dim|dim.to_js_keys}.flatten.join(", ")}}"
346
+ end
347
+
348
+ def generate_values_string(query = self)
349
+ "{#{query.measures.map{|measure|measure.to_js_keys}.flatten.join(", ")}}"
350
+ end
351
+
352
+ def generate_map_function(query = self)
353
+ <<MAP
354
+ function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
355
+ MAP
356
+ end
357
+
358
+ def generate_reduce_function()
359
+ <<REDUCE
360
+ function(key,values){
361
+ var output = {};
362
+ values.forEach(function(doc){
363
+ for(var key in doc){
364
+ if (doc[key] != null){
365
+ output[key] = output[key] || 0;
366
+ output[key] += doc[key];
367
+ }
368
+ }
369
+ });
370
+ return output;
371
+ }
372
+ REDUCE
373
+ end
374
+
375
+ def generate_finalize_function(query = self)
376
+ <<FINALIZE
377
+ function(key,value)
378
+ {
379
+
380
+ #{ (query.measures.select{|m|m.aggregation_method == :average}).map do |m|
381
+ "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
382
+ end.join("\n")}
383
+ #{ (query.measures.select{|m|m.aggregation_method == :calculation}).map do|m|
384
+ "value.#{m.name}=#{m.expression};";
385
+ end.join("\n")}
386
+ return value;
387
+ }
388
+ FINALIZE
389
+ end
389
390
  end