cubicle 0.1.30 → 0.1.31
- data/CHANGELOG.rdoc +4 -0
- data/cubicle.gemspec +2 -2
- data/lib/cubicle/aggregation/aggregation_manager.rb +264 -260
- data/lib/cubicle/aggregation/dsl.rb +142 -135
- data/lib/cubicle/version.rb +1 -1
- data/test/cubicle/aggregation/ad_hoc_test.rb +1 -0
- data/test/cubicle/cubicle_query_test.rb +487 -485
- data/test/cubicles/defect_cubicle.rb +2 -0
- data/test/models/defect.rb +28 -8
- metadata +4 -4
data/CHANGELOG.rdoc CHANGED
data/cubicle.gemspec CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{cubicle}
-  s.version = "0.1.30"
+  s.version = "0.1.31"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Nathan Stults"]
-  s.date = %q{2010-07-
+  s.date = %q{2010-07-15}
   s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
   s.email = %q{hereiam@sonic.net}
   s.extra_rdoc_files = [
data/lib/cubicle/aggregation/aggregation_manager.rb CHANGED
@@ -1,261 +1,265 @@
(The removed 0.1.30 side of this hunk (261 lines) was truncated in the original rendering; where legible, its text matches the corresponding lines of the new version. The full 0.1.31 version of the file follows.)

module Cubicle
  module Aggregation
    class AggregationManager

      attr_reader :aggregation, :metadata, :profiler

      def initialize(aggregation)
        @aggregation = aggregation
        @metadata = Cubicle::Aggregation::CubicleMetadata.new(aggregation)
        @profiler = Cubicle::Aggregation::Profiler.new(aggregation)
      end

      def database
        Cubicle.mongo.database
      end

      def collection
        database[aggregation.target_collection_name]
      end

      def target_collection_name
        aggregation.target_collection_name
      end

      #noinspection RubyArgCount
      def execute_query(query,options={})
        count = 0

        find_options = {
          :limit=>query.limit || 0,
          :skip=>query.offset || 0
        }

        find_options[:sort] = prepare_order_by(query)
        filter = {}

        if query == aggregation || query.transient?
          reduction = aggregate(query,options.merge(:reason=>"Transient query"))
        else
          process_if_required
          agg_data = aggregation_for(query)
          reduction = agg_data.collection
          #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
          #otherwise, a second map reduce is required to reduce the data set one last time
          if query.all_dimensions? || (agg_data.member_names - query.member_names - [:all_measures]).blank?
            filter = prepare_filter(query,options[:where] || {})
          else
            reduction = aggregate(query,:source_collection=>agg_data.target_collection_name, :reason=>"Last mile reduction - source aggregation has too many members (#{agg_data.member_names.join(",").inspect})")
          end
        end

        if reduction.blank?
          Cubicle::Data::Table.new(query,[],0)
        else
          @profiler.measure(:find, :source=>reduction.name, :reason=>"Fetch final query results", :query=>find_options) do
            count = reduction.count
            results = reduction.find(filter,find_options).to_a
            #reduction.drop if reduction.name =~ /^tmp.mr.*/
            Cubicle::Data::Table.new(query, results, count)
          end
        end
      end

      def process(options={})
        @metadata.update_processing_stats do
          expire!
          aggregate(aggregation,options.merge(:reason=>"Processing fact collection"))
          #Sort desc by length of array, so that larget
          #aggregations are processed first, hopefully increasing efficiency
          #of the processing step
          aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
          aggregation.aggregations.each do |member_list|
            agg_start = Time.now
            aggregation_for(aggregation.query(:defer=>true){select member_list})
            Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
          end
        end
      end

      def expire!
        @profiler.measure(:expire_aggregations, :reason=>"Expire aggregations") do
          collection.drop
          @metadata.expire!
        end
      end

      def aggregate(query,options={})
        view = AggregationView.new(aggregation,query)

        map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function

        options[:finalize] = MapReduceHelper.generate_finalize_function(query)
        options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)

        query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name

        target_collection = options.delete(:target_collection)
        target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)

        options[:out] = target_collection unless target_collection.blank? || query.transient?

        #This is defensive - some tests run without ever initializing any collections
        unless database.collection_names.include?(query.source_collection_name)
          Cubicle.logger.info "No collection was found in the database with a name of #{query.source_collection_name}"
          return []
        end

        reason = options.delete(:reason) || "Unknown"
        agg_info = options.delete(:aggregation_info)

        if aggregation.filter && (query.transient? || query == aggregation)
          (options['query'] ||= {}).merge!(aggregation.filter)
        end

        result = map_reduce(query.source_collection_name,expand_template(map, view),reduce,options)

        @profiler.record_map_reduce_result(query,options,result,reason,agg_info)

        @profiler.measure(:create_indexes, :target_collection=>options[:out] || "transient", :reason=>:finalize_aggregation) do
          ensure_indexes(target_collection,query.dimension_names)
        end if target_collection && !query.transient?

        #A bug, possibly in Mongo, does not produce a count on MR collections
        #sometimes, so we'll just add it from the result.
        output = database[result["result"]]
        output.instance_eval "def count; #{result["counts"]["output"]}; end"
        output
      end

      protected

      def aggregation_for(query)
        #return collection if query.all_dimensions?

        aggregation_query = query.clone
        #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
        filter = (query.where if query.respond_to?(:where))
        filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?

        dimension_names = aggregation_query.dimension_names.sort
        @metadata.aggregation_for(dimension_names)
      end

      def ensure_indexes(collection_name,dimension_names)
        col = database[collection_name]
        #an index for each dimension
        dimension_names.each {|dim|col.create_index(dim)}
        #The below composite isn't working, I think because of too many fields being
        #indexed. After some thought, I think maybe this is overkill anyway. However,
        #there should be SOME way to build composite indexes for common queries,
        #so more thought is needed. Maybe cubicle can compile and analyze query
        #stats and choose indexes automatically based on usage. For now, however,
        #I'm just going to turn the thing off.
        #col.create_index(dimension_names.map{|dim|[dim,1]})
      end

      def expand_template(template,view)
        return "" unless template
        return Mustache.render(template,view) if template.is_a?(String)
        if (template.is_a?(Hash))
          template.each {|key,val|template[key] = expand_template(val,view)}
          return template
        end
        template
      end

      def prepare_filter(query,filter={})
        filter.merge!(query.where) if query.respond_to?(:where) && query.where
        filter.stringify_keys!
        transient = (query.transient? || query == aggregation)
        filter.keys.each do |key|
          next if key=~/^\$.*/
          prefix = nil
          prefix = "_id" if (member = aggregation.dimensions[key])
          prefix = "value" if (member = aggregation.measures[key]) unless member

          raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member

          filter_value = filter.delete(key)
          if transient
            if (member.expression_type == :javascript)
              filter_name = "$where"
              filter_value = make_filter_transient(member.expression,filter_value)
            else
              filter_name = member.field_name
            end
          else
            filter_name = "#{prefix}.#{member.name}"
          end
          filter[filter_name] = filter_value
        end
        filter
      end

      def prepare_order_by(query)
        order_by = []
        query.order_by.each do |order|
          prefix = "_id" if (member = aggregation.dimensions[order[0]])
          prefix = "value" if (member = aggregation.measures[order[0]]) unless member
          raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
          order_by << ["#{prefix}.#{order[0]}",order[1]]
        end
        order_by
      end

      def process_if_required
        return if database.collection_names.include?(target_collection_name)
        process
      end

      def make_filter_transient(filter_expression,filter_value)
        filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
        conditions = filter_value.keys.map do |operator|
          "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
        end
        return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
      end

      def make_operator_transient(operator)
        case operator
          when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
          when "$ne" then "!="
          when "$lt" then "<"
          when "$gt" then ">"
          when "$lte" then "<="
          when "$gte" then ">="
          else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
        end
      end

      def quote_if_required(filter_value)
        (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" : filter_value
      end

      #this is just the Mongo driver's implementation of the MapReduce
      #method, but instead of returning the resulting collection,
      #I'm returning the full 'results' so that I can capture
      #the delicious stats contained within its delicate hash shell
      def map_reduce(source_collection_name,map, reduce, opts={})
        map = BSON::Code.new(map) unless map.is_a?(BSON::Code)
        reduce = BSON::Code.new(reduce) unless reduce.is_a?(BSON::Code)

        hash = BSON::OrderedHash.new
        hash['mapreduce'] = source_collection_name
        hash['map'] = map
        hash['reduce'] = reduce
        hash.merge! opts

        result = database.command(hash)
        unless result["ok"] == 1
          raise Mongo::OperationFailure, "map-reduce failed: #{result['errmsg']}"
        end

        result
      end

    end
  end
end
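
The net growth of this file from 261 to 265 lines appears to come from the new aggregation.filter handling in aggregate, which merges an aggregation-level filter into the map-reduce "query" option whenever the fact collection is processed or a transient query runs. The standalone Ruby sketch below illustrates that merge and the $where rewriting performed by make_filter_transient for javascript-expression members. The sample filter values and the "this.severity" expression are illustrative assumptions (not values from the gem or its tests), and quote_if_required is omitted for brevity.

# Sketch only: reproduces the two filter paths in isolation, with
# hypothetical sample data.

# 1) Aggregation-level filter: during processing or a transient query,
#    aggregation.filter is merged into the map-reduce "query" option so that
#    unwanted facts never reach the map step.
options = { "query" => { "defect_type" => "software" } }    # as built by prepare_filter
aggregation_filter = { "status" => { "$ne" => "closed" } }  # hypothetical aggregation.filter
(options["query"] ||= {}).merge!(aggregation_filter)
puts options["query"].inspect
# => {"defect_type"=>"software", "status"=>{"$ne"=>"closed"}}

# 2) Transient $where rewriting: a Mongo-style condition on a javascript
#    expression member is rewritten into a JavaScript predicate string.
def make_operator_transient(operator)
  { "$eq" => "==", "$ne" => "!=", "$lt" => "<",
    "$gt" => ">", "$lte" => "<=", "$gte" => ">=" }.fetch(operator)
end

def make_filter_transient(filter_expression, filter_value)
  filter_value = { "$eq" => filter_value } unless filter_value.is_a?(Hash)
  conditions = filter_value.keys.map do |operator|
    "val #{make_operator_transient(operator)} #{filter_value[operator]}"
  end
  "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
end

puts make_filter_transient("this.severity", { "$gte" => 3 })
# => return (function(val){return val >= 3;})(this.severity)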