cubicle 0.1.19 → 0.1.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,6 @@
1
+ ==0.1.20
2
+ *Updated to work with mongo driver 1.0 (and therefore latest versions of MongoMapper)
3
+
1
4
  ==0.1.19
2
5
  *Fixed bug that caused cubicle to hang when grouping by days
3
6
 
data/cubicle.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cubicle}
8
- s.version = "0.1.19"
8
+ s.version = "0.1.20"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Nathan Stults"]
12
- s.date = %q{2010-04-28}
12
+ s.date = %q{2010-05-03}
13
13
  s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
14
14
  s.email = %q{hereiam@sonic.net}
15
15
  s.extra_rdoc_files = [
@@ -1,249 +1,254 @@
1
- module Cubicle
2
- module Aggregation
3
- class AggregationManager
4
-
5
- attr_reader :aggregation
6
-
7
- def initialize(aggregation)
8
- @aggregation = aggregation
9
- end
10
-
11
- def database
12
- Cubicle.mongo.database
13
- end
14
-
15
- def collection
16
- database[aggregation.target_collection_name]
17
- end
18
-
19
- def target_collection_name
20
- aggregation.target_collection_name
21
- end
22
-
23
-
24
- #noinspection RubyArgCount
25
- def execute_query(query,options={})
26
- count = 0
27
-
28
- find_options = {
29
- :limit=>query.limit || 0,
30
- :skip=>query.offset || 0
31
- }
32
-
33
- find_options[:sort] = prepare_order_by(query)
34
- filter = {}
35
- if query == aggregation || query.transient?
36
- aggregation = aggregate(query,options)
37
- else
38
- process_if_required
39
- aggregation = aggregation_for(query)
40
- #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
41
- #otherwise, a second map reduce is required to reduce the data set one last time
42
- if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
43
- filter = prepare_filter(query,options[:where] || {})
44
- else
45
- aggregation = aggregate(query,:source_collection=>aggregation.name)
46
- end
47
- end
48
-
49
- if aggregation.blank?
50
- Cubicle::Data::Table.new(query,[],0) if aggregation == []
51
- else
52
- count = aggregation.count
53
- results = aggregation.find(filter,find_options).to_a
54
- aggregation.drop if aggregation.name =~ /^tmp.mr.*/
55
- Cubicle::Data::Table.new(query, results, count)
56
- end
57
-
58
- end
59
-
60
- def process(options={})
61
- Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
62
- start = Time.now
63
- expire!
64
- aggregate(aggregation,options)
65
- #Sort desc by length of array, so that larget
66
- #aggregations are processed first, hopefully increasing efficiency
67
- #of the processing step
68
- aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
69
- aggregation.aggregations.each do |member_list|
70
- agg_start = Time.now
71
- aggregation_for(aggregation.query(:defer=>true){select member_list})
72
- Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
73
- end
74
- duration = Time.now - start
75
- Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now}in #{duration} seconds."
76
- end
77
-
78
- def expire!
79
- collection.drop
80
- expire_aggregations!
81
- end
82
-
83
- protected
84
-
85
- def aggregation_collection_names
86
- database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
87
- end
88
-
89
- def expire_aggregations!
90
- aggregation_collection_names.each{|agg_col|database[agg_col].drop}
91
- end
92
-
93
- def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
94
- #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
95
- #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
96
- existing = existing_aggregations.map do |agg_col_name|
97
- agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
98
- end
99
-
100
- #This will select all the aggregations that contain ALL of the desired dimension names
101
- #we are sorting by length because the aggregation with the least number of members
102
- #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
103
- #this will not always be true, and situations may exist where it is rarely true, however the alternative
104
- #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
105
- #but until there is some reason to believe the aggregation caching process needs be highly performant,
106
- #this should do for now.
107
- candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
108
-
109
- #If no suitable aggregation exists to base this one off of,
110
- #we'll just use the base cubes aggregation collection
111
- return target_collection_name if candidates.blank?
112
- "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
113
-
114
- end
115
-
116
- def aggregation_for(query)
117
- return collection if query.all_dimensions?
118
-
119
- aggregation_query = query.clone
120
- #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
121
- filter = (query.where if query.respond_to?(:where))
122
- filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
123
-
124
- dimension_names = aggregation_query.dimension_names.sort
125
- agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
126
-
127
- unless database.collection_names.include?(agg_col_name)
128
- source_col_name = find_best_source_collection(dimension_names)
129
- exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
130
- aggregate(exec_query, :target_collection=>agg_col_name)
131
- end
132
-
133
- database[agg_col_name]
134
- end
135
-
136
- def ensure_indexes(collection_name,dimension_names)
137
- col = database[collection_name]
138
- #an index for each dimension
139
- dimension_names.each {|dim|col.create_index([dim,Mongo::ASCENDING])}
140
- #and a composite
141
- col.create_index(dimension_names)
142
- end
143
-
144
- def aggregate(query,options={})
145
- view = AggregationView.new(aggregation,query)
146
-
147
- map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
148
-
149
- options[:finalize] = MapReduceHelper.generate_finalize_function(query)
150
- options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
151
-
152
- query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
153
-
154
- target_collection = options.delete(:target_collection)
155
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
156
-
157
- options[:out] = target_collection unless target_collection.blank? || query.transient?
158
-
159
- #This is defensive - some tests run without ever initializing any collections
160
- return [] unless database.collection_names.include?(query.source_collection_name)
161
-
162
- result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
163
-
164
- ensure_indexes(target_collection,query.dimension_names) if target_collection
165
-
166
- result
167
- end
168
-
169
- def expand_template(template,view)
170
- return "" unless template
171
- return Mustache.render(template,view) if template.is_a?(String)
172
- if (template.is_a?(Hash))
173
- template.each {|key,val|template[key] = expand_template(val,view)}
174
- return template
175
- end
176
- template
177
- end
178
-
179
- def prepare_filter(query,filter={})
180
- filter.merge!(query.where) if query.respond_to?(:where) && query.where
181
- filter.stringify_keys!
182
- transient = (query.transient? || query == aggregation)
183
- filter.keys.each do |key|
184
- next if key=~/^\$.*/
185
- prefix = nil
186
- prefix = "_id" if (member = aggregation.dimensions[key])
187
- prefix = "value" if (member = aggregation.measures[key]) unless member
188
-
189
- raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
190
-
191
- filter_value = filter.delete(key)
192
- if transient
193
- if (member.expression_type == :javascript)
194
- filter_name = "$where"
195
- filter_value = make_filter_transient(member.expression,filter_value)
196
- else
197
- filter_name = member.field_name
198
- end
199
- else
200
- filter_name = "#{prefix}.#{member.name}"
201
- end
202
- filter[filter_name] = filter_value
203
- end
204
- filter
205
- end
206
-
207
- def prepare_order_by(query)
208
- order_by = []
209
- query.order_by.each do |order|
210
- prefix = "_id" if (member = aggregation.dimensions[order[0]])
211
- prefix = "value" if (member = aggregation.measures[order[0]]) unless member
212
- raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
213
- order_by << ["#{prefix}.#{order[0]}",order[1]]
214
- end
215
- order_by
216
- end
217
-
218
- def process_if_required
219
- return if database.collection_names.include?(target_collection_name)
220
- process
221
- end
222
-
223
- def make_filter_transient(filter_expression,filter_value)
224
- filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
225
- conditions = filter_value.keys.map do |operator|
226
- "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
227
- end
228
- return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
229
- end
230
-
231
- def make_operator_transient(operator)
232
- case operator
233
- when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
234
- when "$ne" then "!="
235
- when "$lt" then "<"
236
- when "$gt" then ">"
237
- when "$lte" then "<="
238
- when "$gte" then ">="
239
- else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
240
- end
241
- end
242
-
243
- def quote_if_required(filter_value)
244
- (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" :filter_value
245
- end
246
-
247
- end
248
- end
1
module Cubicle
  module Aggregation
    # Builds, queries and expires the MongoDB collections that back a single
    # cubicle aggregation.
    #
    # The base collection is named +aggregation.target_collection_name+. Cached
    # sub-aggregations are stored next to it in collections named
    # "<target_collection_name>_aggregation_<dim1>.<dim2>...", where the
    # dimension names are sorted and joined with ".".
    class AggregationManager

      # Characters that have to be backslash-escaped before a value can be
      # embedded in a single-quoted javascript string literal.
      JS_STRING_ESCAPES = {"\\"=>"\\\\", "'"=>"\\'", "\n"=>"\\n", "\r"=>"\\r"}.freeze

      attr_reader :aggregation

      # aggregation - the cubicle aggregation definition this manager serves.
      def initialize(aggregation)
        @aggregation = aggregation
      end

      # The database handle exposed by Cubicle.mongo.
      def database
        Cubicle.mongo.database
      end

      # The base collection of this aggregation.
      def collection
        database[aggregation.target_collection_name]
      end

      # Name of the base collection of this aggregation.
      def target_collection_name
        aggregation.target_collection_name
      end

      # Runs +query+ and wraps the rows in a Cubicle::Data::Table.
      #
      # query   - the aggregation itself, a transient query, or a query against
      #           the cached aggregation.
      # options - forwarded to #aggregate for transient queries; options[:where]
      #           is used as an additional filter.
      #
      # Returns a Cubicle::Data::Table. Returns nil when the data source is
      # blank but is not an empty array.
      #noinspection RubyArgCount
      def execute_query(query,options={})
        find_options = {
          :limit=>query.limit || 0,
          :skip=>query.offset || 0
        }
        find_options[:sort] = prepare_order_by(query)
        filter = {}

        # A distinct local name is used for the data source so the
        # #aggregation reader is not shadowed inside this method.
        if query == aggregation || query.transient?
          source = aggregate(query,options)
        else
          process_if_required
          source = aggregation_for(query)
          #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
          #otherwise, a second map reduce is required to reduce the data set one last time
          #The cached dimension names are recovered by stripping the collection
          #name prefix; splitting on "_" would mangle dimension names that
          #themselves contain an underscore.
          if query.all_dimensions? || (dimension_names_of(source.name) - query.member_names - [:all_measures]).blank?
            filter = prepare_filter(query,options[:where] || {})
          else
            source = aggregate(query,:source_collection=>source.name)
          end
        end

        if source.blank?
          Cubicle::Data::Table.new(query,[],0) if source == []
        else
          count = source.count
          results = source.find(filter,find_options).to_a
          #temporary map reduce output is dropped once it has been read
          source.drop if source.name =~ /^tmp.mr.*/
          Cubicle::Data::Table.new(query, results, count)
        end
      end

      # Drops everything cached for this aggregation, rebuilds the base
      # collection and then each aggregation listed in
      # +aggregation.aggregations+, logging the timings.
      def process(options={})
        Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
        start = Time.now
        expire!
        aggregate(aggregation,options)
        #Sort desc by length of array, so that largest
        #aggregations are processed first, hopefully increasing efficiency
        #of the processing step
        aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
        aggregation.aggregations.each do |member_list|
          agg_start = Time.now
          aggregation_for(aggregation.query(:defer=>true){select member_list})
          Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
        end
        duration = Time.now - start
        Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now} in #{duration} seconds."
      end

      # Drops the base collection and every cached aggregation collection.
      def expire!
        collection.drop
        expire_aggregations!
      end

      protected

      # Names of the cached aggregation collections that belong to this cubicle.
      #
      # The collection name is escaped and anchored so that a cubicle called
      # "foo" does not pick up (and later drop) "my_foo_aggregation_..." and so
      # that regex metacharacters in the name are matched literally.
      def aggregation_collection_names
        pattern = /\A#{Regexp.escape(target_collection_name)}_aggregation_/
        database.collection_names.select {|col_name|col_name =~ pattern}
      end

      # Drops every cached aggregation collection of this cubicle.
      def expire_aggregations!
        aggregation_collection_names.each{|agg_col|database[agg_col].drop}
      end

      # Dimension names encoded in the name of a cached aggregation collection,
      # i.e. the collection name without its "<target>_aggregation_" prefix,
      # split on ".".
      def dimension_names_of(agg_col_name)
        agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
      end

      # Picks the collection a new aggregation over +dimension_names+ should be
      # computed from.
      #
      # Returns the name of the existing aggregation collection with the fewest
      # dimensions that still contains all requested dimensions, or the base
      # collection name when no such aggregation exists.
      def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
        #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
        #this creates a 2d array containing the list of dimension names in each existing aggregation
        existing = existing_aggregations.map {|agg_col_name|dimension_names_of(agg_col_name)}

        #This will select all the aggregations that contain ALL of the desired dimension names
        #we are sorting by length because the aggregation with the least number of members
        #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
        #this will not always be true, and situations may exist where it is rarely true, however the alternative
        #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
        #but until there is some reason to believe the aggregation caching process needs be highly performant,
        #this should do for now.
        candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}

        #If no suitable aggregation exists to base this one off of,
        #we'll just use the base cubes aggregation collection
        return target_collection_name if candidates.blank?
        "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
      end

      # Returns the collection that caches +query+'s dimensions, building it
      # first when it does not exist yet. Queries over all dimensions are served
      # straight from the base collection.
      def aggregation_for(query)
        return collection if query.all_dimensions?

        aggregation_query = query.clone
        #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
        filter = (query.where if query.respond_to?(:where))
        filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?

        dimension_names = aggregation_query.dimension_names.sort
        agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"

        unless database.collection_names.include?(agg_col_name)
          source_col_name = find_best_source_collection(dimension_names)
          exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
          aggregate(exec_query, :target_collection=>agg_col_name)
        end

        database[agg_col_name]
      end

      # Creates one single-field index per dimension on +collection_name+.
      #
      # NOTE(review): #prepare_filter and #prepare_order_by address dimensions
      # of persisted aggregations as "_id.<name>", while the indexes here are
      # created on the bare dimension name - confirm which key is intended.
      def ensure_indexes(collection_name,dimension_names)
        col = database[collection_name]
        #an index for each dimension
        dimension_names.each {|dim|col.create_index(dim)}
        #The below composite isn't working, I think because of too many fields being
        #indexed. After some thought, I think maybe this is overkill anyway. However,
        #there should be SOME way to build composite indexes for common queries,
        #so more thought is needed. Maybe cubicle can compile and analyze query
        #stats and choose indexes automatically based on usage. For now, however,
        #I'm just going to turn the thing off.
        #col.create_index(dimension_names.map{|dim|[dim,1]})
      end

      # Runs the map reduce job for +query+.
      #
      # options - :source_collection and :target_collection override the
      #           collection names taken from the query; :where is an extra
      #           filter; the remaining entries are handed to map_reduce. The
      #           hash is modified in place (:finalize, "query" and :out are
      #           set, the two collection overrides are removed).
      #
      # Returns the map_reduce result, or [] when the source collection does
      # not exist.
      def aggregate(query,options={})
        view = AggregationView.new(aggregation,query)

        map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function

        options[:finalize] = MapReduceHelper.generate_finalize_function(query)
        options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)

        query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name

        target_collection = options.delete(:target_collection)
        target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)

        options[:out] = target_collection unless target_collection.blank? || query.transient?

        #This is defensive - some tests run without ever initializing any collections
        return [] unless database.collection_names.include?(query.source_collection_name)

        result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)

        ensure_indexes(target_collection,query.dimension_names) if target_collection

        result
      end

      # Renders mustache templates against +view+.
      #
      # Strings are rendered, hashes have each value expanded in place
      # (recursively), nil/false becomes "", anything else is returned as is.
      def expand_template(template,view)
        return "" unless template
        return Mustache.render(template,view) if template.is_a?(String)
        if template.is_a?(Hash)
          template.each {|key,val|template[key] = expand_template(val,view)}
          return template
        end
        template
      end

      # Translates a filter expressed in member names into the keys used by the
      # collection that will be searched.
      #
      # Keys starting with "$" are passed through. For transient queries a
      # member maps to its field name, or to a "$where" clause when the member
      # is a javascript expression; otherwise dimensions map to "_id.<name>"
      # and measures to "value.<name>". Every javascript-expression member is
      # written to the single "$where" key, so a later one replaces an earlier
      # one.
      #
      # Works on a copy, so the hash passed in by the caller is left untouched
      # and can be reused for another query.
      #
      # Raises RuntimeError when a key is neither a dimension nor a measure.
      def prepare_filter(query,filter={})
        filter = filter.dup
        filter.merge!(query.where) if query.respond_to?(:where) && query.where
        filter.stringify_keys!
        transient = (query.transient? || query == aggregation)
        filter.keys.each do |key|
          next if key=~/^\$.*/
          prefix = nil
          member = aggregation.dimensions[key]
          prefix = "_id" if member
          unless member
            member = aggregation.measures[key]
            prefix = "value" if member
          end

          raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member

          filter_value = filter.delete(key)
          if transient
            if member.expression_type == :javascript
              filter_name = "$where"
              filter_value = make_filter_transient(member.expression,filter_value)
            else
              filter_name = member.field_name
            end
          else
            filter_name = "#{prefix}.#{member.name}"
          end
          filter[filter_name] = filter_value
        end
        filter
      end

      # Converts query.order_by ([member_name, direction] pairs) into sort
      # pairs keyed by "_id.<name>" for dimensions and "value.<name>" for
      # measures.
      #
      # Raises RuntimeError naming the offending field when it is neither a
      # dimension nor a measure.
      def prepare_order_by(query)
        order_by = []
        query.order_by.each do |order|
          field_name = order[0]
          member = aggregation.dimensions[field_name]
          prefix = "_id" if member
          unless member
            member = aggregation.measures[field_name]
            prefix = "value" if member
          end
          raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{field_name}" unless member
          order_by << ["#{prefix}.#{field_name}",order[1]]
        end
        order_by
      end

      # Processes the aggregation unless its base collection already exists.
      def process_if_required
        return if database.collection_names.include?(target_collection_name)
        process
      end

      # Builds the javascript body of a "$where" clause that applies
      # +filter_value+ to the result of +filter_expression+.
      #
      # filter_value - a plain value (compared with ==) or a hash of
      #                operator=>value pairs, all of which must hold.
      def make_filter_transient(filter_expression,filter_value)
        filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
        conditions = filter_value.keys.map do |operator|
          "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
        end
        "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
      end

      # Maps a mongo style comparison operator to its javascript counterpart.
      #
      # Raises RuntimeError for any operator other than the six listed.
      def make_operator_transient(operator)
        case operator
          when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
          when "$ne" then "!="
          when "$lt" then "<"
          when "$gt" then ">"
          when "$lte" then "<="
          when "$gte" then ">="
          else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
        end
      end

      # Renders +filter_value+ for inclusion in generated javascript.
      #
      # Strings and symbols become single-quoted literals with backslashes,
      # single quotes and line breaks escaped, so a value such as "O'Brien"
      # can neither break nor extend the generated expression. Any other value
      # is returned unchanged.
      def quote_if_required(filter_value)
        return filter_value unless filter_value.is_a?(String) || filter_value.is_a?(Symbol)
        escaped = filter_value.to_s.gsub(/[\\'\n\r]/) {|char|JS_STRING_ESCAPES[char]}
        "'#{escaped}'"
      end

    end
  end
end