cubicle 0.1.30 → 0.1.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,7 @@
1
+ ==0.1.31
2
+ *Added ability to filter aggregations from within the cubicle definition. Particularly useful in STI situations
3
+ to prepare a cubicle for a particular value of _type.
4
+
1
5
  ==0.1.30
2
6
  *Added duration in weeks.
3
7
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cubicle}
8
- s.version = "0.1.30"
8
+ s.version = "0.1.31"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Nathan Stults"]
12
- s.date = %q{2010-07-07}
12
+ s.date = %q{2010-07-15}
13
13
  s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
14
14
  s.email = %q{hereiam@sonic.net}
15
15
  s.extra_rdoc_files = [
@@ -1,261 +1,265 @@
1
- module Cubicle
2
- module Aggregation
3
- class AggregationManager
4
-
5
- attr_reader :aggregation, :metadata, :profiler
6
-
7
- def initialize(aggregation)
8
- @aggregation = aggregation
9
- @metadata = Cubicle::Aggregation::CubicleMetadata.new(aggregation)
10
- @profiler = Cubicle::Aggregation::Profiler.new(aggregation)
11
- end
12
-
13
- def database
14
- Cubicle.mongo.database
15
- end
16
-
17
- def collection
18
- database[aggregation.target_collection_name]
19
- end
20
-
21
- def target_collection_name
22
- aggregation.target_collection_name
23
- end
24
-
25
-
26
- #noinspection RubyArgCount
27
- def execute_query(query,options={})
28
- count = 0
29
-
30
- find_options = {
31
- :limit=>query.limit || 0,
32
- :skip=>query.offset || 0
33
- }
34
-
35
- find_options[:sort] = prepare_order_by(query)
36
- filter = {}
37
-
38
- if query == aggregation || query.transient?
39
- reduction = aggregate(query,options.merge(:reason=>"Transient query"))
40
- else
41
- process_if_required
42
- agg_data = aggregation_for(query)
43
- reduction = agg_data.collection
44
- #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
45
- #otherwise, a second map reduce is required to reduce the data set one last time
46
- if query.all_dimensions? || (agg_data.member_names - query.member_names - [:all_measures]).blank?
47
- filter = prepare_filter(query,options[:where] || {})
48
- else
49
- reduction = aggregate(query,:source_collection=>agg_data.target_collection_name, :reason=>"Last mile reduction - source aggregation has too many members (#{agg_data.member_names.join(",").inspect})")
50
- end
51
- end
52
-
53
- if reduction.blank?
54
- Cubicle::Data::Table.new(query,[],0)
55
- else
56
-
57
- @profiler.measure(:find, :source=>reduction.name, :reason=>"Fetch final query results", :query=>find_options) do
58
- count = reduction.count
59
- results = reduction.find(filter,find_options).to_a
60
- #reduction.drop if reduction.name =~ /^tmp.mr.*/
61
- Cubicle::Data::Table.new(query, results, count)
62
- end
63
-
64
- end
65
-
66
- end
67
-
68
- def process(options={})
69
- @metadata.update_processing_stats do
70
- expire!
71
- aggregate(aggregation,options.merge(:reason=>"Processing fact collection"))
72
- #Sort desc by length of array, so that largest
73
- #aggregations are processed first, hopefully increasing efficiency
74
- #of the processing step
75
- aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
76
- aggregation.aggregations.each do |member_list|
77
- agg_start = Time.now
78
- aggregation_for(aggregation.query(:defer=>true){select member_list})
79
- Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
80
- end
81
- end
82
- end
83
-
84
- def expire!
85
- @profiler.measure(:expire_aggregations, :reason=>"Expire aggregations") do
86
- collection.drop
87
- @metadata.expire!
88
- end
89
- end
90
-
91
- def aggregate(query,options={})
92
- view = AggregationView.new(aggregation,query)
93
-
94
- map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
95
-
96
- options[:finalize] = MapReduceHelper.generate_finalize_function(query)
97
- options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
98
-
99
- query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
100
-
101
- target_collection = options.delete(:target_collection)
102
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
103
-
104
- options[:out] = target_collection unless target_collection.blank? || query.transient?
105
-
106
- #This is defensive - some tests run without ever initializing any collections
107
- unless database.collection_names.include?(query.source_collection_name)
108
- Cubicle.logger.info "No collection was found in the database with a name of #{query.source_collection_name}"
109
- return []
110
- end
111
-
112
- reason = options.delete(:reason) || "Unknown"
113
- agg_info= options.delete(:aggregation_info)
114
-
115
- result = map_reduce(query.source_collection_name,expand_template(map, view),reduce,options)
116
-
117
- @profiler.record_map_reduce_result(query,options,result,reason,agg_info)
118
-
119
- @profiler.measure(:create_indexes, :target_collection=>options[:out] || "transient", :reason=>:finalize_aggregation) do
120
- ensure_indexes(target_collection,query.dimension_names)
121
- end if target_collection && !query.transient?
122
-
123
- #A bug, possibly in Mongo, does not produce a count on MR collections
124
- #sometimes, so we'll just add it from the result.
125
- output = database[result["result"]]
126
- output.instance_eval "def count; #{result["counts"]["output"]}; end"
127
- output
128
- end
129
-
130
- protected
131
-
132
-
133
- def aggregation_for(query)
134
- #return collection if query.all_dimensions?
135
-
136
- aggregation_query = query.clone
137
- #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
138
- filter = (query.where if query.respond_to?(:where))
139
- filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
140
-
141
- dimension_names = aggregation_query.dimension_names.sort
142
- @metadata.aggregation_for(dimension_names)
143
- end
144
-
145
- def ensure_indexes(collection_name,dimension_names)
146
- col = database[collection_name]
147
- #an index for each dimension
148
- dimension_names.each {|dim|col.create_index(dim)}
149
- #The below composite isn't working, I think because of too many fields being
150
- #indexed. After some thought, I think maybe this is overkill anyway. However,
151
- #there should be SOME way to build composite indexes for common queries,
152
- #so more thought is needed. Maybe cubicle can compile and analyze query
153
- #stats and choose indexes automatically based on usage. For now, however,
154
- #I'm just going to turn the thing off.
155
- #col.create_index(dimension_names.map{|dim|[dim,1]})
156
- end
157
-
158
- def expand_template(template,view)
159
- return "" unless template
160
- return Mustache.render(template,view) if template.is_a?(String)
161
- if (template.is_a?(Hash))
162
- template.each {|key,val|template[key] = expand_template(val,view)}
163
- return template
164
- end
165
- template
166
- end
167
-
168
- def prepare_filter(query,filter={})
169
- filter.merge!(query.where) if query.respond_to?(:where) && query.where
170
- filter.stringify_keys!
171
- transient = (query.transient? || query == aggregation)
172
- filter.keys.each do |key|
173
- next if key=~/^\$.*/
174
- prefix = nil
175
- prefix = "_id" if (member = aggregation.dimensions[key])
176
- prefix = "value" if (member = aggregation.measures[key]) unless member
177
-
178
- raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
179
-
180
- filter_value = filter.delete(key)
181
- if transient
182
- if (member.expression_type == :javascript)
183
- filter_name = "$where"
184
- filter_value = make_filter_transient(member.expression,filter_value)
185
- else
186
- filter_name = member.field_name
187
- end
188
- else
189
- filter_name = "#{prefix}.#{member.name}"
190
- end
191
- filter[filter_name] = filter_value
192
- end
193
- filter
194
- end
195
-
196
- def prepare_order_by(query)
197
- order_by = []
198
- query.order_by.each do |order|
199
- prefix = "_id" if (member = aggregation.dimensions[order[0]])
200
- prefix = "value" if (member = aggregation.measures[order[0]]) unless member
201
- raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
202
- order_by << ["#{prefix}.#{order[0]}",order[1]]
203
- end
204
- order_by
205
- end
206
-
207
- def process_if_required
208
- return if database.collection_names.include?(target_collection_name)
209
- process
210
- end
211
-
212
- def make_filter_transient(filter_expression,filter_value)
213
- filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
214
- conditions = filter_value.keys.map do |operator|
215
- "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
216
- end
217
- return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
218
- end
219
-
220
- def make_operator_transient(operator)
221
- case operator
222
- when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
223
- when "$ne" then "!="
224
- when "$lt" then "<"
225
- when "$gt" then ">"
226
- when "$lte" then "<="
227
- when "$gte" then ">="
228
- else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
229
- end
230
- end
231
-
232
- def quote_if_required(filter_value)
233
- (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" :filter_value
234
- end
235
-
236
- #this is just the Mongo driver's implementation of the MapReduce
237
- #method, but instead of returning the resulting collection,
238
- #I'm returning the full 'results' so that I can capture
239
- #the delicious stats contained within its delicate hash shell
240
- def map_reduce(source_collection_name,map, reduce, opts={})
241
-
242
- map = BSON::Code.new(map) unless map.is_a?(BSON::Code)
243
- reduce = BSON::Code.new(reduce) unless reduce.is_a?(BSON::Code)
244
-
245
- hash = BSON::OrderedHash.new
246
- hash['mapreduce'] = source_collection_name
247
- hash['map'] = map
248
- hash['reduce'] = reduce
249
- hash.merge! opts
250
-
251
- result = database.command(hash)
252
- unless result["ok"] == 1
253
- raise Mongo::OperationFailure, "map-reduce failed: #{result['errmsg']}"
254
- end
255
-
256
- result
257
- end
258
-
259
- end
260
- end
1
+ module Cubicle
2
+ module Aggregation
3
+ class AggregationManager
4
+
5
+ attr_reader :aggregation, :metadata, :profiler
6
+
7
+ def initialize(aggregation)
8
+ @aggregation = aggregation
9
+ @metadata = Cubicle::Aggregation::CubicleMetadata.new(aggregation)
10
+ @profiler = Cubicle::Aggregation::Profiler.new(aggregation)
11
+ end
12
+
13
+ def database
14
+ Cubicle.mongo.database
15
+ end
16
+
17
+ def collection
18
+ database[aggregation.target_collection_name]
19
+ end
20
+
21
+ def target_collection_name
22
+ aggregation.target_collection_name
23
+ end
24
+
25
+
26
+ #noinspection RubyArgCount
27
+ def execute_query(query,options={})
28
+ count = 0
29
+
30
+ find_options = {
31
+ :limit=>query.limit || 0,
32
+ :skip=>query.offset || 0
33
+ }
34
+
35
+ find_options[:sort] = prepare_order_by(query)
36
+ filter = {}
37
+
38
+ if query == aggregation || query.transient?
39
+ reduction = aggregate(query,options.merge(:reason=>"Transient query"))
40
+ else
41
+ process_if_required
42
+ agg_data = aggregation_for(query)
43
+ reduction = agg_data.collection
44
+ #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
45
+ #otherwise, a second map reduce is required to reduce the data set one last time
46
+ if query.all_dimensions? || (agg_data.member_names - query.member_names - [:all_measures]).blank?
47
+ filter = prepare_filter(query,options[:where] || {})
48
+ else
49
+ reduction = aggregate(query,:source_collection=>agg_data.target_collection_name, :reason=>"Last mile reduction - source aggregation has too many members (#{agg_data.member_names.join(",").inspect})")
50
+ end
51
+ end
52
+
53
+ if reduction.blank?
54
+ Cubicle::Data::Table.new(query,[],0)
55
+ else
56
+
57
+ @profiler.measure(:find, :source=>reduction.name, :reason=>"Fetch final query results", :query=>find_options) do
58
+ count = reduction.count
59
+ results = reduction.find(filter,find_options).to_a
60
+ #reduction.drop if reduction.name =~ /^tmp.mr.*/
61
+ Cubicle::Data::Table.new(query, results, count)
62
+ end
63
+
64
+ end
65
+
66
+ end
67
+
68
+ def process(options={})
69
+ @metadata.update_processing_stats do
70
+ expire!
71
+ aggregate(aggregation,options.merge(:reason=>"Processing fact collection"))
72
+ #Sort desc by length of array, so that largest
73
+ #aggregations are processed first, hopefully increasing efficiency
74
+ #of the processing step
75
+ aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
76
+ aggregation.aggregations.each do |member_list|
77
+ agg_start = Time.now
78
+ aggregation_for(aggregation.query(:defer=>true){select member_list})
79
+ Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
80
+ end
81
+ end
82
+ end
83
+
84
+ def expire!
85
+ @profiler.measure(:expire_aggregations, :reason=>"Expire aggregations") do
86
+ collection.drop
87
+ @metadata.expire!
88
+ end
89
+ end
90
+
91
+ def aggregate(query,options={})
92
+ view = AggregationView.new(aggregation,query)
93
+
94
+ map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
95
+
96
+ options[:finalize] = MapReduceHelper.generate_finalize_function(query)
97
+ options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
98
+
99
+ query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
100
+
101
+ target_collection = options.delete(:target_collection)
102
+ target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
103
+
104
+ options[:out] = target_collection unless target_collection.blank? || query.transient?
105
+
106
+ #This is defensive - some tests run without ever initializing any collections
107
+ unless database.collection_names.include?(query.source_collection_name)
108
+ Cubicle.logger.info "No collection was found in the database with a name of #{query.source_collection_name}"
109
+ return []
110
+ end
111
+
112
+ reason = options.delete(:reason) || "Unknown"
113
+ agg_info= options.delete(:aggregation_info)
114
+
115
+ if aggregation.filter && (query.transient? || query == aggregation)
116
+ (options['query'] ||= {}).merge!(aggregation.filter)
117
+ end
118
+
119
+ result = map_reduce(query.source_collection_name,expand_template(map, view),reduce,options)
120
+
121
+ @profiler.record_map_reduce_result(query,options,result,reason,agg_info)
122
+
123
+ @profiler.measure(:create_indexes, :target_collection=>options[:out] || "transient", :reason=>:finalize_aggregation) do
124
+ ensure_indexes(target_collection,query.dimension_names)
125
+ end if target_collection && !query.transient?
126
+
127
+ #A bug, possibly in Mongo, does not produce a count on MR collections
128
+ #sometimes, so we'll just add it from the result.
129
+ output = database[result["result"]]
130
+ output.instance_eval "def count; #{result["counts"]["output"]}; end"
131
+ output
132
+ end
133
+
134
+ protected
135
+
136
+
137
+ def aggregation_for(query)
138
+ #return collection if query.all_dimensions?
139
+
140
+ aggregation_query = query.clone
141
+ #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
142
+ filter = (query.where if query.respond_to?(:where))
143
+ filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
144
+
145
+ dimension_names = aggregation_query.dimension_names.sort
146
+ @metadata.aggregation_for(dimension_names)
147
+ end
148
+
149
+ def ensure_indexes(collection_name,dimension_names)
150
+ col = database[collection_name]
151
+ #an index for each dimension
152
+ dimension_names.each {|dim|col.create_index(dim)}
153
+ #The below composite isn't working, I think because of too many fields being
154
+ #indexed. After some thought, I think maybe this is overkill anyway. However,
155
+ #there should be SOME way to build composite indexes for common queries,
156
+ #so more thought is needed. Maybe cubicle can compile and analyze query
157
+ #stats and choose indexes automatically based on usage. For now, however,
158
+ #I'm just going to turn the thing off.
159
+ #col.create_index(dimension_names.map{|dim|[dim,1]})
160
+ end
161
+
162
+ def expand_template(template,view)
163
+ return "" unless template
164
+ return Mustache.render(template,view) if template.is_a?(String)
165
+ if (template.is_a?(Hash))
166
+ template.each {|key,val|template[key] = expand_template(val,view)}
167
+ return template
168
+ end
169
+ template
170
+ end
171
+
172
+ def prepare_filter(query,filter={})
173
+ filter.merge!(query.where) if query.respond_to?(:where) && query.where
174
+ filter.stringify_keys!
175
+ transient = (query.transient? || query == aggregation)
176
+ filter.keys.each do |key|
177
+ next if key=~/^\$.*/
178
+ prefix = nil
179
+ prefix = "_id" if (member = aggregation.dimensions[key])
180
+ prefix = "value" if (member = aggregation.measures[key]) unless member
181
+
182
+ raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
183
+
184
+ filter_value = filter.delete(key)
185
+ if transient
186
+ if (member.expression_type == :javascript)
187
+ filter_name = "$where"
188
+ filter_value = make_filter_transient(member.expression,filter_value)
189
+ else
190
+ filter_name = member.field_name
191
+ end
192
+ else
193
+ filter_name = "#{prefix}.#{member.name}"
194
+ end
195
+ filter[filter_name] = filter_value
196
+ end
197
+ filter
198
+ end
199
+
200
+ def prepare_order_by(query)
201
+ order_by = []
202
+ query.order_by.each do |order|
203
+ prefix = "_id" if (member = aggregation.dimensions[order[0]])
204
+ prefix = "value" if (member = aggregation.measures[order[0]]) unless member
205
+ raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
206
+ order_by << ["#{prefix}.#{order[0]}",order[1]]
207
+ end
208
+ order_by
209
+ end
210
+
211
+ def process_if_required
212
+ return if database.collection_names.include?(target_collection_name)
213
+ process
214
+ end
215
+
216
+ def make_filter_transient(filter_expression,filter_value)
217
+ filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
218
+ conditions = filter_value.keys.map do |operator|
219
+ "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
220
+ end
221
+ return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
222
+ end
223
+
224
+ def make_operator_transient(operator)
225
+ case operator
226
+ when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
227
+ when "$ne" then "!="
228
+ when "$lt" then "<"
229
+ when "$gt" then ">"
230
+ when "$lte" then "<="
231
+ when "$gte" then ">="
232
+ else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
233
+ end
234
+ end
235
+
236
+ def quote_if_required(filter_value)
237
+ (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" :filter_value
238
+ end
239
+
240
+ #this is just the Mongo driver's implementation of the MapReduce
241
+ #method, but instead of returning the resulting collection,
242
+ #I'm returning the full 'results' so that I can capture
243
+ #the delicious stats contained within its delicate hash shell
244
+ def map_reduce(source_collection_name,map, reduce, opts={})
245
+
246
+ map = BSON::Code.new(map) unless map.is_a?(BSON::Code)
247
+ reduce = BSON::Code.new(reduce) unless reduce.is_a?(BSON::Code)
248
+
249
+ hash = BSON::OrderedHash.new
250
+ hash['mapreduce'] = source_collection_name
251
+ hash['map'] = map
252
+ hash['reduce'] = reduce
253
+ hash.merge! opts
254
+
255
+ result = database.command(hash)
256
+ unless result["ok"] == 1
257
+ raise Mongo::OperationFailure, "map-reduce failed: #{result['errmsg']}"
258
+ end
259
+
260
+ result
261
+ end
262
+
263
+ end
264
+ end
261
265
  end