cubicle 0.1.30 → 0.1.31

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,7 @@
1
+ ==0.1.31
2
+ *Added ability to filter aggregations from within the cubicle definition. Particularly useful in STI situations
3
+ to prepare a cubicle for a particular value of _type.
4
+
1
5
  ==0.1.30
2
6
  *Added duration in weeks.
3
7
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cubicle}
8
- s.version = "0.1.30"
8
+ s.version = "0.1.31"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Nathan Stults"]
12
- s.date = %q{2010-07-07}
12
+ s.date = %q{2010-07-15}
13
13
  s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
14
14
  s.email = %q{hereiam@sonic.net}
15
15
  s.extra_rdoc_files = [
@@ -1,261 +1,265 @@
1
- module Cubicle
2
- module Aggregation
3
- class AggregationManager
4
-
5
- attr_reader :aggregation, :metadata, :profiler
6
-
7
- def initialize(aggregation)
8
- @aggregation = aggregation
9
- @metadata = Cubicle::Aggregation::CubicleMetadata.new(aggregation)
10
- @profiler = Cubicle::Aggregation::Profiler.new(aggregation)
11
- end
12
-
13
- def database
14
- Cubicle.mongo.database
15
- end
16
-
17
- def collection
18
- database[aggregation.target_collection_name]
19
- end
20
-
21
- def target_collection_name
22
- aggregation.target_collection_name
23
- end
24
-
25
-
26
- #noinspection RubyArgCount
27
- def execute_query(query,options={})
28
- count = 0
29
-
30
- find_options = {
31
- :limit=>query.limit || 0,
32
- :skip=>query.offset || 0
33
- }
34
-
35
- find_options[:sort] = prepare_order_by(query)
36
- filter = {}
37
-
38
- if query == aggregation || query.transient?
39
- reduction = aggregate(query,options.merge(:reason=>"Transient query"))
40
- else
41
- process_if_required
42
- agg_data = aggregation_for(query)
43
- reduction = agg_data.collection
44
- #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
45
- #otherwise, a second map reduce is required to reduce the data set one last time
46
- if query.all_dimensions? || (agg_data.member_names - query.member_names - [:all_measures]).blank?
47
- filter = prepare_filter(query,options[:where] || {})
48
- else
49
- reduction = aggregate(query,:source_collection=>agg_data.target_collection_name, :reason=>"Last mile reduction - source aggregation has too many members (#{agg_data.member_names.join(",").inspect})")
50
- end
51
- end
52
-
53
- if reduction.blank?
54
- Cubicle::Data::Table.new(query,[],0)
55
- else
56
-
57
- @profiler.measure(:find, :source=>reduction.name, :reason=>"Fetch final query results", :query=>find_options) do
58
- count = reduction.count
59
- results = reduction.find(filter,find_options).to_a
60
- #reduction.drop if reduction.name =~ /^tmp.mr.*/
61
- Cubicle::Data::Table.new(query, results, count)
62
- end
63
-
64
- end
65
-
66
- end
67
-
68
- def process(options={})
69
- @metadata.update_processing_stats do
70
- expire!
71
- aggregate(aggregation,options.merge(:reason=>"Processing fact collection"))
72
- #Sort desc by length of array, so that larger
73
- #aggregations are processed first, hopefully increasing efficiency
74
- #of the processing step
75
- aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
76
- aggregation.aggregations.each do |member_list|
77
- agg_start = Time.now
78
- aggregation_for(aggregation.query(:defer=>true){select member_list})
79
- Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
80
- end
81
- end
82
- end
83
-
84
- def expire!
85
- @profiler.measure(:expire_aggregations, :reason=>"Expire aggregations") do
86
- collection.drop
87
- @metadata.expire!
88
- end
89
- end
90
-
91
- def aggregate(query,options={})
92
- view = AggregationView.new(aggregation,query)
93
-
94
- map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
95
-
96
- options[:finalize] = MapReduceHelper.generate_finalize_function(query)
97
- options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
98
-
99
- query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
100
-
101
- target_collection = options.delete(:target_collection)
102
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
103
-
104
- options[:out] = target_collection unless target_collection.blank? || query.transient?
105
-
106
- #This is defensive - some tests run without ever initializing any collections
107
- unless database.collection_names.include?(query.source_collection_name)
108
- Cubicle.logger.info "No collection was found in the database with a name of #{query.source_collection_name}"
109
- return []
110
- end
111
-
112
- reason = options.delete(:reason) || "Unknown"
113
- agg_info= options.delete(:aggregation_info)
114
-
115
- result = map_reduce(query.source_collection_name,expand_template(map, view),reduce,options)
116
-
117
- @profiler.record_map_reduce_result(query,options,result,reason,agg_info)
118
-
119
- @profiler.measure(:create_indexes, :target_collection=>options[:out] || "transient", :reason=>:finalize_aggregation) do
120
- ensure_indexes(target_collection,query.dimension_names)
121
- end if target_collection && !query.transient?
122
-
123
- #A bug, possibly in Mongo, does not produce a count on MR collections
124
- #sometimes, so we'll just add it from the result.
125
- output = database[result["result"]]
126
- output.instance_eval "def count; #{result["counts"]["output"]}; end"
127
- output
128
- end
129
-
130
- protected
131
-
132
-
133
- def aggregation_for(query)
134
- #return collection if query.all_dimensions?
135
-
136
- aggregation_query = query.clone
137
- #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
138
- filter = (query.where if query.respond_to?(:where))
139
- filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
140
-
141
- dimension_names = aggregation_query.dimension_names.sort
142
- @metadata.aggregation_for(dimension_names)
143
- end
144
-
145
- def ensure_indexes(collection_name,dimension_names)
146
- col = database[collection_name]
147
- #an index for each dimension
148
- dimension_names.each {|dim|col.create_index(dim)}
149
- #The below composite isn't working, I think because of too many fields being
150
- #indexed. After some thought, I think maybe this is overkill anyway. However,
151
- #there should be SOME way to build composite indexes for common queries,
152
- #so more thought is needed. Maybe cubicle can compile and analyze query
153
- #stats and choose indexes automatically based on usage. For now, however,
154
- #I'm just going to turn the thing off.
155
- #col.create_index(dimension_names.map{|dim|[dim,1]})
156
- end
157
-
158
- def expand_template(template,view)
159
- return "" unless template
160
- return Mustache.render(template,view) if template.is_a?(String)
161
- if (template.is_a?(Hash))
162
- template.each {|key,val|template[key] = expand_template(val,view)}
163
- return template
164
- end
165
- template
166
- end
167
-
168
- def prepare_filter(query,filter={})
169
- filter.merge!(query.where) if query.respond_to?(:where) && query.where
170
- filter.stringify_keys!
171
- transient = (query.transient? || query == aggregation)
172
- filter.keys.each do |key|
173
- next if key=~/^\$.*/
174
- prefix = nil
175
- prefix = "_id" if (member = aggregation.dimensions[key])
176
- prefix = "value" if (member = aggregation.measures[key]) unless member
177
-
178
- raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
179
-
180
- filter_value = filter.delete(key)
181
- if transient
182
- if (member.expression_type == :javascript)
183
- filter_name = "$where"
184
- filter_value = make_filter_transient(member.expression,filter_value)
185
- else
186
- filter_name = member.field_name
187
- end
188
- else
189
- filter_name = "#{prefix}.#{member.name}"
190
- end
191
- filter[filter_name] = filter_value
192
- end
193
- filter
194
- end
195
-
196
- def prepare_order_by(query)
197
- order_by = []
198
- query.order_by.each do |order|
199
- prefix = "_id" if (member = aggregation.dimensions[order[0]])
200
- prefix = "value" if (member = aggregation.measures[order[0]]) unless member
201
- raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
202
- order_by << ["#{prefix}.#{order[0]}",order[1]]
203
- end
204
- order_by
205
- end
206
-
207
- def process_if_required
208
- return if database.collection_names.include?(target_collection_name)
209
- process
210
- end
211
-
212
- def make_filter_transient(filter_expression,filter_value)
213
- filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
214
- conditions = filter_value.keys.map do |operator|
215
- "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
216
- end
217
- return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
218
- end
219
-
220
- def make_operator_transient(operator)
221
- case operator
222
- when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
223
- when "$ne" then "!="
224
- when "$lt" then "<"
225
- when "$gt" then ">"
226
- when "$lte" then "<="
227
- when "$gte" then ">="
228
- else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
229
- end
230
- end
231
-
232
- def quote_if_required(filter_value)
233
- (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" :filter_value
234
- end
235
-
236
- #this is just the Mongo driver's implementation of the MapReduce
237
- #method, but instead of returning the resulting collection,
238
- #I'm returning the full 'results' so that I can capture
239
- #the delicious stats contained within its delicate hash shell
240
- def map_reduce(source_collection_name,map, reduce, opts={})
241
-
242
- map = BSON::Code.new(map) unless map.is_a?(BSON::Code)
243
- reduce = BSON::Code.new(reduce) unless reduce.is_a?(BSON::Code)
244
-
245
- hash = BSON::OrderedHash.new
246
- hash['mapreduce'] = source_collection_name
247
- hash['map'] = map
248
- hash['reduce'] = reduce
249
- hash.merge! opts
250
-
251
- result = database.command(hash)
252
- unless result["ok"] == 1
253
- raise Mongo::OperationFailure, "map-reduce failed: #{result['errmsg']}"
254
- end
255
-
256
- result
257
- end
258
-
259
- end
260
- end
1
+ module Cubicle
2
+ module Aggregation
3
+ class AggregationManager
4
+
5
+ attr_reader :aggregation, :metadata, :profiler
6
+
7
+ def initialize(aggregation)
8
+ @aggregation = aggregation
9
+ @metadata = Cubicle::Aggregation::CubicleMetadata.new(aggregation)
10
+ @profiler = Cubicle::Aggregation::Profiler.new(aggregation)
11
+ end
12
+
13
+ def database
14
+ Cubicle.mongo.database
15
+ end
16
+
17
+ def collection
18
+ database[aggregation.target_collection_name]
19
+ end
20
+
21
+ def target_collection_name
22
+ aggregation.target_collection_name
23
+ end
24
+
25
+
26
+ #noinspection RubyArgCount
27
+ def execute_query(query,options={})
28
+ count = 0
29
+
30
+ find_options = {
31
+ :limit=>query.limit || 0,
32
+ :skip=>query.offset || 0
33
+ }
34
+
35
+ find_options[:sort] = prepare_order_by(query)
36
+ filter = {}
37
+
38
+ if query == aggregation || query.transient?
39
+ reduction = aggregate(query,options.merge(:reason=>"Transient query"))
40
+ else
41
+ process_if_required
42
+ agg_data = aggregation_for(query)
43
+ reduction = agg_data.collection
44
+ #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
45
+ #otherwise, a second map reduce is required to reduce the data set one last time
46
+ if query.all_dimensions? || (agg_data.member_names - query.member_names - [:all_measures]).blank?
47
+ filter = prepare_filter(query,options[:where] || {})
48
+ else
49
+ reduction = aggregate(query,:source_collection=>agg_data.target_collection_name, :reason=>"Last mile reduction - source aggregation has too many members (#{agg_data.member_names.join(",").inspect})")
50
+ end
51
+ end
52
+
53
+ if reduction.blank?
54
+ Cubicle::Data::Table.new(query,[],0)
55
+ else
56
+
57
+ @profiler.measure(:find, :source=>reduction.name, :reason=>"Fetch final query results", :query=>find_options) do
58
+ count = reduction.count
59
+ results = reduction.find(filter,find_options).to_a
60
+ #reduction.drop if reduction.name =~ /^tmp.mr.*/
61
+ Cubicle::Data::Table.new(query, results, count)
62
+ end
63
+
64
+ end
65
+
66
+ end
67
+
68
+ def process(options={})
69
+ @metadata.update_processing_stats do
70
+ expire!
71
+ aggregate(aggregation,options.merge(:reason=>"Processing fact collection"))
72
+ #Sort desc by length of array, so that larger
73
+ #aggregations are processed first, hopefully increasing efficiency
74
+ #of the processing step
75
+ aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
76
+ aggregation.aggregations.each do |member_list|
77
+ agg_start = Time.now
78
+ aggregation_for(aggregation.query(:defer=>true){select member_list})
79
+ Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
80
+ end
81
+ end
82
+ end
83
+
84
+ def expire!
85
+ @profiler.measure(:expire_aggregations, :reason=>"Expire aggregations") do
86
+ collection.drop
87
+ @metadata.expire!
88
+ end
89
+ end
90
+
91
+ def aggregate(query,options={})
92
+ view = AggregationView.new(aggregation,query)
93
+
94
+ map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
95
+
96
+ options[:finalize] = MapReduceHelper.generate_finalize_function(query)
97
+ options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
98
+
99
+ query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
100
+
101
+ target_collection = options.delete(:target_collection)
102
+ target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
103
+
104
+ options[:out] = target_collection unless target_collection.blank? || query.transient?
105
+
106
+ #This is defensive - some tests run without ever initializing any collections
107
+ unless database.collection_names.include?(query.source_collection_name)
108
+ Cubicle.logger.info "No collection was found in the database with a name of #{query.source_collection_name}"
109
+ return []
110
+ end
111
+
112
+ reason = options.delete(:reason) || "Unknown"
113
+ agg_info= options.delete(:aggregation_info)
114
+
115
+ if aggregation.filter && (query.transient? || query == aggregation)
116
+ (options['query'] ||= {}).merge!(aggregation.filter)
117
+ end
118
+
119
+ result = map_reduce(query.source_collection_name,expand_template(map, view),reduce,options)
120
+
121
+ @profiler.record_map_reduce_result(query,options,result,reason,agg_info)
122
+
123
+ @profiler.measure(:create_indexes, :target_collection=>options[:out] || "transient", :reason=>:finalize_aggregation) do
124
+ ensure_indexes(target_collection,query.dimension_names)
125
+ end if target_collection && !query.transient?
126
+
127
+ #A bug, possibly in Mongo, does not produce a count on MR collections
128
+ #sometimes, so we'll just add it from the result.
129
+ output = database[result["result"]]
130
+ output.instance_eval "def count; #{result["counts"]["output"]}; end"
131
+ output
132
+ end
133
+
134
+ protected
135
+
136
+
137
+ def aggregation_for(query)
138
+ #return collection if query.all_dimensions?
139
+
140
+ aggregation_query = query.clone
141
+ #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
142
+ filter = (query.where if query.respond_to?(:where))
143
+ filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
144
+
145
+ dimension_names = aggregation_query.dimension_names.sort
146
+ @metadata.aggregation_for(dimension_names)
147
+ end
148
+
149
+ def ensure_indexes(collection_name,dimension_names)
150
+ col = database[collection_name]
151
+ #an index for each dimension
152
+ dimension_names.each {|dim|col.create_index(dim)}
153
+ #The below composite isn't working, I think because of too many fields being
154
+ #indexed. After some thought, I think maybe this is overkill anyway. However,
155
+ #there should be SOME way to build composite indexes for common queries,
156
+ #so more thought is needed. Maybe cubicle can compile and analyze query
157
+ #stats and choose indexes automatically based on usage. For now, however,
158
+ #I'm just going to turn the thing off.
159
+ #col.create_index(dimension_names.map{|dim|[dim,1]})
160
+ end
161
+
162
+ def expand_template(template,view)
163
+ return "" unless template
164
+ return Mustache.render(template,view) if template.is_a?(String)
165
+ if (template.is_a?(Hash))
166
+ template.each {|key,val|template[key] = expand_template(val,view)}
167
+ return template
168
+ end
169
+ template
170
+ end
171
+
172
+ def prepare_filter(query,filter={})
173
+ filter.merge!(query.where) if query.respond_to?(:where) && query.where
174
+ filter.stringify_keys!
175
+ transient = (query.transient? || query == aggregation)
176
+ filter.keys.each do |key|
177
+ next if key=~/^\$.*/
178
+ prefix = nil
179
+ prefix = "_id" if (member = aggregation.dimensions[key])
180
+ prefix = "value" if (member = aggregation.measures[key]) unless member
181
+
182
+ raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
183
+
184
+ filter_value = filter.delete(key)
185
+ if transient
186
+ if (member.expression_type == :javascript)
187
+ filter_name = "$where"
188
+ filter_value = make_filter_transient(member.expression,filter_value)
189
+ else
190
+ filter_name = member.field_name
191
+ end
192
+ else
193
+ filter_name = "#{prefix}.#{member.name}"
194
+ end
195
+ filter[filter_name] = filter_value
196
+ end
197
+ filter
198
+ end
199
+
200
+ def prepare_order_by(query)
201
+ order_by = []
202
+ query.order_by.each do |order|
203
+ prefix = "_id" if (member = aggregation.dimensions[order[0]])
204
+ prefix = "value" if (member = aggregation.measures[order[0]]) unless member
205
+ raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
206
+ order_by << ["#{prefix}.#{order[0]}",order[1]]
207
+ end
208
+ order_by
209
+ end
210
+
211
+ def process_if_required
212
+ return if database.collection_names.include?(target_collection_name)
213
+ process
214
+ end
215
+
216
+ def make_filter_transient(filter_expression,filter_value)
217
+ filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
218
+ conditions = filter_value.keys.map do |operator|
219
+ "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
220
+ end
221
+ return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
222
+ end
223
+
224
+ def make_operator_transient(operator)
225
+ case operator
226
+ when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
227
+ when "$ne" then "!="
228
+ when "$lt" then "<"
229
+ when "$gt" then ">"
230
+ when "$lte" then "<="
231
+ when "$gte" then ">="
232
+ else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
233
+ end
234
+ end
235
+
236
+ def quote_if_required(filter_value)
237
+ (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" :filter_value
238
+ end
239
+
240
+ #this is just the Mongo driver's implementation of the MapReduce
241
+ #method, but instead of returning the resulting collection,
242
+ #I'm returning the full 'results' so that I can capture
243
+ #the delicious stats contained within its delicate hash shell
244
+ def map_reduce(source_collection_name,map, reduce, opts={})
245
+
246
+ map = BSON::Code.new(map) unless map.is_a?(BSON::Code)
247
+ reduce = BSON::Code.new(reduce) unless reduce.is_a?(BSON::Code)
248
+
249
+ hash = BSON::OrderedHash.new
250
+ hash['mapreduce'] = source_collection_name
251
+ hash['map'] = map
252
+ hash['reduce'] = reduce
253
+ hash.merge! opts
254
+
255
+ result = database.command(hash)
256
+ unless result["ok"] == 1
257
+ raise Mongo::OperationFailure, "map-reduce failed: #{result['errmsg']}"
258
+ end
259
+
260
+ result
261
+ end
262
+
263
+ end
264
+ end
261
265
  end