cubicle 0.1.19 → 0.1.20

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,6 @@
1
+ ==0.1.20
2
+ *Updated to work with mongo driver 1.0 (and therefore latest versions of MongoMapper)
3
+
1
4
  ==0.1.19
2
5
  *Fixed bug that caused cubicle to hang when grouping by days
3
6
 
data/cubicle.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cubicle}
8
- s.version = "0.1.19"
8
+ s.version = "0.1.20"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Nathan Stults"]
12
- s.date = %q{2010-04-28}
12
+ s.date = %q{2010-05-03}
13
13
  s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
14
14
  s.email = %q{hereiam@sonic.net}
15
15
  s.extra_rdoc_files = [
@@ -1,249 +1,254 @@
1
- module Cubicle
2
- module Aggregation
3
- class AggregationManager
4
-
5
- attr_reader :aggregation
6
-
7
- def initialize(aggregation)
8
- @aggregation = aggregation
9
- end
10
-
11
- def database
12
- Cubicle.mongo.database
13
- end
14
-
15
- def collection
16
- database[aggregation.target_collection_name]
17
- end
18
-
19
- def target_collection_name
20
- aggregation.target_collection_name
21
- end
22
-
23
-
24
- #noinspection RubyArgCount
25
- def execute_query(query,options={})
26
- count = 0
27
-
28
- find_options = {
29
- :limit=>query.limit || 0,
30
- :skip=>query.offset || 0
31
- }
32
-
33
- find_options[:sort] = prepare_order_by(query)
34
- filter = {}
35
- if query == aggregation || query.transient?
36
- aggregation = aggregate(query,options)
37
- else
38
- process_if_required
39
- aggregation = aggregation_for(query)
40
- #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
41
- #otherwise, a second map reduce is required to reduce the data set one last time
42
- if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
43
- filter = prepare_filter(query,options[:where] || {})
44
- else
45
- aggregation = aggregate(query,:source_collection=>aggregation.name)
46
- end
47
- end
48
-
49
- if aggregation.blank?
50
- Cubicle::Data::Table.new(query,[],0) if aggregation == []
51
- else
52
- count = aggregation.count
53
- results = aggregation.find(filter,find_options).to_a
54
- aggregation.drop if aggregation.name =~ /^tmp.mr.*/
55
- Cubicle::Data::Table.new(query, results, count)
56
- end
57
-
58
- end
59
-
60
- def process(options={})
61
- Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
62
- start = Time.now
63
- expire!
64
- aggregate(aggregation,options)
65
- #Sort desc by length of array, so that larger
66
- #aggregations are processed first, hopefully increasing efficiency
67
- #of the processing step
68
- aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
69
- aggregation.aggregations.each do |member_list|
70
- agg_start = Time.now
71
- aggregation_for(aggregation.query(:defer=>true){select member_list})
72
- Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
73
- end
74
- duration = Time.now - start
75
- Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now}in #{duration} seconds."
76
- end
77
-
78
- def expire!
79
- collection.drop
80
- expire_aggregations!
81
- end
82
-
83
- protected
84
-
85
- def aggregation_collection_names
86
- database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
87
- end
88
-
89
- def expire_aggregations!
90
- aggregation_collection_names.each{|agg_col|database[agg_col].drop}
91
- end
92
-
93
- def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
94
- #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
95
- #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
96
- existing = existing_aggregations.map do |agg_col_name|
97
- agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
98
- end
99
-
100
- #This will select all the aggregations that contain ALL of the desired dimension names
101
- #we are sorting by length because the aggregation with the least number of members
102
- #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
103
- #this will not always be true, and situations may exist where it is rarely true, however the alternative
104
- #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
105
- #but until there is some reason to believe the aggregation caching process needs be highly performant,
106
- #this should do for now.
107
- candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
108
-
109
- #If no suitable aggregation exists to base this one off of,
110
- #we'll just use the base cubes aggregation collection
111
- return target_collection_name if candidates.blank?
112
- "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
113
-
114
- end
115
-
116
- def aggregation_for(query)
117
- return collection if query.all_dimensions?
118
-
119
- aggregation_query = query.clone
120
- #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
121
- filter = (query.where if query.respond_to?(:where))
122
- filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
123
-
124
- dimension_names = aggregation_query.dimension_names.sort
125
- agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
126
-
127
- unless database.collection_names.include?(agg_col_name)
128
- source_col_name = find_best_source_collection(dimension_names)
129
- exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
130
- aggregate(exec_query, :target_collection=>agg_col_name)
131
- end
132
-
133
- database[agg_col_name]
134
- end
135
-
136
- def ensure_indexes(collection_name,dimension_names)
137
- col = database[collection_name]
138
- #an index for each dimension
139
- dimension_names.each {|dim|col.create_index([dim,Mongo::ASCENDING])}
140
- #and a composite
141
- col.create_index(dimension_names)
142
- end
143
-
144
- def aggregate(query,options={})
145
- view = AggregationView.new(aggregation,query)
146
-
147
- map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
148
-
149
- options[:finalize] = MapReduceHelper.generate_finalize_function(query)
150
- options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
151
-
152
- query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
153
-
154
- target_collection = options.delete(:target_collection)
155
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
156
-
157
- options[:out] = target_collection unless target_collection.blank? || query.transient?
158
-
159
- #This is defensive - some tests run without ever initializing any collections
160
- return [] unless database.collection_names.include?(query.source_collection_name)
161
-
162
- result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
163
-
164
- ensure_indexes(target_collection,query.dimension_names) if target_collection
165
-
166
- result
167
- end
168
-
169
- def expand_template(template,view)
170
- return "" unless template
171
- return Mustache.render(template,view) if template.is_a?(String)
172
- if (template.is_a?(Hash))
173
- template.each {|key,val|template[key] = expand_template(val,view)}
174
- return template
175
- end
176
- template
177
- end
178
-
179
- def prepare_filter(query,filter={})
180
- filter.merge!(query.where) if query.respond_to?(:where) && query.where
181
- filter.stringify_keys!
182
- transient = (query.transient? || query == aggregation)
183
- filter.keys.each do |key|
184
- next if key=~/^\$.*/
185
- prefix = nil
186
- prefix = "_id" if (member = aggregation.dimensions[key])
187
- prefix = "value" if (member = aggregation.measures[key]) unless member
188
-
189
- raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
190
-
191
- filter_value = filter.delete(key)
192
- if transient
193
- if (member.expression_type == :javascript)
194
- filter_name = "$where"
195
- filter_value = make_filter_transient(member.expression,filter_value)
196
- else
197
- filter_name = member.field_name
198
- end
199
- else
200
- filter_name = "#{prefix}.#{member.name}"
201
- end
202
- filter[filter_name] = filter_value
203
- end
204
- filter
205
- end
206
-
207
- def prepare_order_by(query)
208
- order_by = []
209
- query.order_by.each do |order|
210
- prefix = "_id" if (member = aggregation.dimensions[order[0]])
211
- prefix = "value" if (member = aggregation.measures[order[0]]) unless member
212
- raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
213
- order_by << ["#{prefix}.#{order[0]}",order[1]]
214
- end
215
- order_by
216
- end
217
-
218
- def process_if_required
219
- return if database.collection_names.include?(target_collection_name)
220
- process
221
- end
222
-
223
- def make_filter_transient(filter_expression,filter_value)
224
- filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
225
- conditions = filter_value.keys.map do |operator|
226
- "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
227
- end
228
- return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
229
- end
230
-
231
- def make_operator_transient(operator)
232
- case operator
233
- when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
234
- when "$ne" then "!="
235
- when "$lt" then "<"
236
- when "$gt" then ">"
237
- when "$lte" then "<="
238
- when "$gte" then ">="
239
- else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
240
- end
241
- end
242
-
243
- def quote_if_required(filter_value)
244
- (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" :filter_value
245
- end
246
-
247
- end
248
- end
1
+ module Cubicle
2
+ module Aggregation
3
+ class AggregationManager
4
+
5
+ attr_reader :aggregation
6
+
7
+ def initialize(aggregation)
8
+ @aggregation = aggregation
9
+ end
10
+
11
+ def database
12
+ Cubicle.mongo.database
13
+ end
14
+
15
+ def collection
16
+ database[aggregation.target_collection_name]
17
+ end
18
+
19
+ def target_collection_name
20
+ aggregation.target_collection_name
21
+ end
22
+
23
+
24
+ #noinspection RubyArgCount
25
+ def execute_query(query,options={})
26
+ count = 0
27
+
28
+ find_options = {
29
+ :limit=>query.limit || 0,
30
+ :skip=>query.offset || 0
31
+ }
32
+
33
+ find_options[:sort] = prepare_order_by(query)
34
+ filter = {}
35
+ if query == aggregation || query.transient?
36
+ aggregation = aggregate(query,options)
37
+ else
38
+ process_if_required
39
+ aggregation = aggregation_for(query)
40
+ #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
41
+ #otherwise, a second map reduce is required to reduce the data set one last time
42
+ if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
43
+ filter = prepare_filter(query,options[:where] || {})
44
+ else
45
+ aggregation = aggregate(query,:source_collection=>aggregation.name)
46
+ end
47
+ end
48
+
49
+ if aggregation.blank?
50
+ Cubicle::Data::Table.new(query,[],0) if aggregation == []
51
+ else
52
+ count = aggregation.count
53
+ results = aggregation.find(filter,find_options).to_a
54
+ aggregation.drop if aggregation.name =~ /^tmp.mr.*/
55
+ Cubicle::Data::Table.new(query, results, count)
56
+ end
57
+
58
+ end
59
+
60
+ def process(options={})
61
+ Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
62
+ start = Time.now
63
+ expire!
64
+ aggregate(aggregation,options)
65
+ #Sort desc by length of array, so that larger
66
+ #aggregations are processed first, hopefully increasing efficiency
67
+ #of the processing step
68
+ aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
69
+ aggregation.aggregations.each do |member_list|
70
+ agg_start = Time.now
71
+ aggregation_for(aggregation.query(:defer=>true){select member_list})
72
+ Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
73
+ end
74
+ duration = Time.now - start
75
+ Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now}in #{duration} seconds."
76
+ end
77
+
78
+ def expire!
79
+ collection.drop
80
+ expire_aggregations!
81
+ end
82
+
83
+ protected
84
+
85
+ def aggregation_collection_names
86
+ database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
87
+ end
88
+
89
+ def expire_aggregations!
90
+ aggregation_collection_names.each{|agg_col|database[agg_col].drop}
91
+ end
92
+
93
+ def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
94
+ #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
95
+ #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
96
+ existing = existing_aggregations.map do |agg_col_name|
97
+ agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
98
+ end
99
+
100
+ #This will select all the aggregations that contain ALL of the desired dimension names
101
+ #we are sorting by length because the aggregation with the least number of members
102
+ #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
103
+ #this will not always be true, and situations may exist where it is rarely true, however the alternative
104
+ #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
105
+ #but until there is some reason to believe the aggregation caching process needs be highly performant,
106
+ #this should do for now.
107
+ candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
108
+
109
+ #If no suitable aggregation exists to base this one off of,
110
+ #we'll just use the base cubes aggregation collection
111
+ return target_collection_name if candidates.blank?
112
+ "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
113
+
114
+ end
115
+
116
+ def aggregation_for(query)
117
+ return collection if query.all_dimensions?
118
+
119
+ aggregation_query = query.clone
120
+ #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
121
+ filter = (query.where if query.respond_to?(:where))
122
+ filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
123
+
124
+ dimension_names = aggregation_query.dimension_names.sort
125
+ agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
126
+
127
+ unless database.collection_names.include?(agg_col_name)
128
+ source_col_name = find_best_source_collection(dimension_names)
129
+ exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
130
+ aggregate(exec_query, :target_collection=>agg_col_name)
131
+ end
132
+
133
+ database[agg_col_name]
134
+ end
135
+
136
+ def ensure_indexes(collection_name,dimension_names)
137
+ col = database[collection_name]
138
+ #an index for each dimension
139
+ dimension_names.each {|dim|col.create_index(dim)}
140
+ #The below composite isn't working, I think because of too many fields being
141
+ #indexed. After some thought, I think maybe this is overkill anyway. However,
142
+ #there should be SOME way to build composite indexes for common queries,
143
+ #so more thought is needed. Maybe cubicle can compile and analyze query
144
+ #stats and choose indexes automatically based on usage. For now, however,
145
+ #I'm just going to turn the thing off.
146
+ #col.create_index(dimension_names.map{|dim|[dim,1]})
147
+ end
148
+
149
+ def aggregate(query,options={})
150
+ view = AggregationView.new(aggregation,query)
151
+
152
+ map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
153
+
154
+ options[:finalize] = MapReduceHelper.generate_finalize_function(query)
155
+ options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
156
+
157
+ query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
158
+
159
+ target_collection = options.delete(:target_collection)
160
+ target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
161
+
162
+ options[:out] = target_collection unless target_collection.blank? || query.transient?
163
+
164
+ #This is defensive - some tests run without ever initializing any collections
165
+ return [] unless database.collection_names.include?(query.source_collection_name)
166
+
167
+ result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
168
+
169
+ ensure_indexes(target_collection,query.dimension_names) if target_collection
170
+
171
+ result
172
+ end
173
+
174
+ def expand_template(template,view)
175
+ return "" unless template
176
+ return Mustache.render(template,view) if template.is_a?(String)
177
+ if (template.is_a?(Hash))
178
+ template.each {|key,val|template[key] = expand_template(val,view)}
179
+ return template
180
+ end
181
+ template
182
+ end
183
+
184
+ def prepare_filter(query,filter={})
185
+ filter.merge!(query.where) if query.respond_to?(:where) && query.where
186
+ filter.stringify_keys!
187
+ transient = (query.transient? || query == aggregation)
188
+ filter.keys.each do |key|
189
+ next if key=~/^\$.*/
190
+ prefix = nil
191
+ prefix = "_id" if (member = aggregation.dimensions[key])
192
+ prefix = "value" if (member = aggregation.measures[key]) unless member
193
+
194
+ raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
195
+
196
+ filter_value = filter.delete(key)
197
+ if transient
198
+ if (member.expression_type == :javascript)
199
+ filter_name = "$where"
200
+ filter_value = make_filter_transient(member.expression,filter_value)
201
+ else
202
+ filter_name = member.field_name
203
+ end
204
+ else
205
+ filter_name = "#{prefix}.#{member.name}"
206
+ end
207
+ filter[filter_name] = filter_value
208
+ end
209
+ filter
210
+ end
211
+
212
+ def prepare_order_by(query)
213
+ order_by = []
214
+ query.order_by.each do |order|
215
+ prefix = "_id" if (member = aggregation.dimensions[order[0]])
216
+ prefix = "value" if (member = aggregation.measures[order[0]]) unless member
217
+ raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
218
+ order_by << ["#{prefix}.#{order[0]}",order[1]]
219
+ end
220
+ order_by
221
+ end
222
+
223
+ def process_if_required
224
+ return if database.collection_names.include?(target_collection_name)
225
+ process
226
+ end
227
+
228
+ def make_filter_transient(filter_expression,filter_value)
229
+ filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
230
+ conditions = filter_value.keys.map do |operator|
231
+ "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
232
+ end
233
+ return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
234
+ end
235
+
236
+ def make_operator_transient(operator)
237
+ case operator
238
+ when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
239
+ when "$ne" then "!="
240
+ when "$lt" then "<"
241
+ when "$gt" then ">"
242
+ when "$lte" then "<="
243
+ when "$gte" then ">="
244
+ else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
245
+ end
246
+ end
247
+
248
+ def quote_if_required(filter_value)
249
+ (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" :filter_value
250
+ end
251
+
252
+ end
253
+ end
249
254
  end