cubicle 0.1.19 → 0.1.20
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +3 -0
- data/cubicle.gemspec +2 -2
- data/lib/cubicle/aggregation/aggregation_manager.rb +253 -248
- data/lib/cubicle/version.rb +1 -1
- data/test/log/test.log +1738 -64537
- metadata +3 -3
data/CHANGELOG.rdoc
CHANGED
data/cubicle.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{cubicle}
|
8
|
-
s.version = "0.1.19"
|
8
|
+
s.version = "0.1.20"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Nathan Stults"]
|
12
|
-
s.date = %q{2010-
|
12
|
+
s.date = %q{2010-05-03}
|
13
13
|
s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
|
14
14
|
s.email = %q{hereiam@sonic.net}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -1,249 +1,254 @@
|
|
1
|
-
module Cubicle
|
2
|
-
module Aggregation
|
3
|
-
class AggregationManager
|
4
|
-
|
5
|
-
attr_reader :aggregation
|
6
|
-
|
7
|
-
def initialize(aggregation)
|
8
|
-
@aggregation = aggregation
|
9
|
-
end
|
10
|
-
|
11
|
-
def database
|
12
|
-
Cubicle.mongo.database
|
13
|
-
end
|
14
|
-
|
15
|
-
def collection
|
16
|
-
database[aggregation.target_collection_name]
|
17
|
-
end
|
18
|
-
|
19
|
-
def target_collection_name
|
20
|
-
aggregation.target_collection_name
|
21
|
-
end
|
22
|
-
|
23
|
-
|
24
|
-
#noinspection RubyArgCount
|
25
|
-
def execute_query(query,options={})
|
26
|
-
count = 0
|
27
|
-
|
28
|
-
find_options = {
|
29
|
-
:limit=>query.limit || 0,
|
30
|
-
:skip=>query.offset || 0
|
31
|
-
}
|
32
|
-
|
33
|
-
find_options[:sort] = prepare_order_by(query)
|
34
|
-
filter = {}
|
35
|
-
if query == aggregation || query.transient?
|
36
|
-
aggregation = aggregate(query,options)
|
37
|
-
else
|
38
|
-
process_if_required
|
39
|
-
aggregation = aggregation_for(query)
|
40
|
-
#if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
|
41
|
-
#otherwise, a second map reduce is required to reduce the data set one last time
|
42
|
-
if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
|
43
|
-
filter = prepare_filter(query,options[:where] || {})
|
44
|
-
else
|
45
|
-
aggregation = aggregate(query,:source_collection=>aggregation.name)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
if aggregation.blank?
|
50
|
-
Cubicle::Data::Table.new(query,[],0) if aggregation == []
|
51
|
-
else
|
52
|
-
count = aggregation.count
|
53
|
-
results = aggregation.find(filter,find_options).to_a
|
54
|
-
aggregation.drop if aggregation.name =~ /^tmp.mr.*/
|
55
|
-
Cubicle::Data::Table.new(query, results, count)
|
56
|
-
end
|
57
|
-
|
58
|
-
end
|
59
|
-
|
60
|
-
def process(options={})
|
61
|
-
Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
|
62
|
-
start = Time.now
|
63
|
-
expire!
|
64
|
-
aggregate(aggregation,options)
|
65
|
-
#Sort desc by length of array, so that larget
|
66
|
-
#aggregations are processed first, hopefully increasing efficiency
|
67
|
-
#of the processing step
|
68
|
-
aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
|
69
|
-
aggregation.aggregations.each do |member_list|
|
70
|
-
agg_start = Time.now
|
71
|
-
aggregation_for(aggregation.query(:defer=>true){select member_list})
|
72
|
-
Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
|
73
|
-
end
|
74
|
-
duration = Time.now - start
|
75
|
-
Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now}in #{duration} seconds."
|
76
|
-
end
|
77
|
-
|
78
|
-
def expire!
|
79
|
-
collection.drop
|
80
|
-
expire_aggregations!
|
81
|
-
end
|
82
|
-
|
83
|
-
protected
|
84
|
-
|
85
|
-
def aggregation_collection_names
|
86
|
-
database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
|
87
|
-
end
|
88
|
-
|
89
|
-
def expire_aggregations!
|
90
|
-
aggregation_collection_names.each{|agg_col|database[agg_col].drop}
|
91
|
-
end
|
92
|
-
|
93
|
-
def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
|
94
|
-
#format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
|
95
|
-
#this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
|
96
|
-
existing = existing_aggregations.map do |agg_col_name|
|
97
|
-
agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
|
98
|
-
end
|
99
|
-
|
100
|
-
#This will select all the aggregations that contain ALL of the desired dimension names
|
101
|
-
#we are sorting by length because the aggregation with the least number of members
|
102
|
-
#is likely to be the most efficient data source as it will likely contain the smallest number of rows.
|
103
|
-
#this will not always be true, and situations may exist where it is rarely true, however the alternative
|
104
|
-
#is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
|
105
|
-
#but until there is some reason to believe the aggregation caching process needs be highly performant,
|
106
|
-
#this should do for now.
|
107
|
-
candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
|
108
|
-
|
109
|
-
#If no suitable aggregation exists to base this one off of,
|
110
|
-
#we'll just use the base cubes aggregation collection
|
111
|
-
return target_collection_name if candidates.blank?
|
112
|
-
"#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
|
113
|
-
|
114
|
-
end
|
115
|
-
|
116
|
-
def aggregation_for(query)
|
117
|
-
return collection if query.all_dimensions?
|
118
|
-
|
119
|
-
aggregation_query = query.clone
|
120
|
-
#If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
|
121
|
-
filter = (query.where if query.respond_to?(:where))
|
122
|
-
filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
|
123
|
-
|
124
|
-
dimension_names = aggregation_query.dimension_names.sort
|
125
|
-
agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
|
126
|
-
|
127
|
-
unless database.collection_names.include?(agg_col_name)
|
128
|
-
source_col_name = find_best_source_collection(dimension_names)
|
129
|
-
exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
|
130
|
-
aggregate(exec_query, :target_collection=>agg_col_name)
|
131
|
-
end
|
132
|
-
|
133
|
-
database[agg_col_name]
|
134
|
-
end
|
135
|
-
|
136
|
-
def ensure_indexes(collection_name,dimension_names)
|
137
|
-
col = database[collection_name]
|
138
|
-
#an index for each dimension
|
139
|
-
dimension_names.each {|dim|col.create_index(
|
140
|
-
#
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
template
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
if
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
end
|
222
|
-
|
223
|
-
def
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
when "$
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
1
|
+
module Cubicle
|
2
|
+
module Aggregation
|
3
|
+
class AggregationManager
|
4
|
+
|
5
|
+
attr_reader :aggregation
|
6
|
+
|
7
|
+
def initialize(aggregation)
|
8
|
+
@aggregation = aggregation
|
9
|
+
end
|
10
|
+
|
11
|
+
def database
|
12
|
+
Cubicle.mongo.database
|
13
|
+
end
|
14
|
+
|
15
|
+
def collection
|
16
|
+
database[aggregation.target_collection_name]
|
17
|
+
end
|
18
|
+
|
19
|
+
def target_collection_name
|
20
|
+
aggregation.target_collection_name
|
21
|
+
end
|
22
|
+
|
23
|
+
|
24
|
+
#noinspection RubyArgCount
|
25
|
+
def execute_query(query,options={})
|
26
|
+
count = 0
|
27
|
+
|
28
|
+
find_options = {
|
29
|
+
:limit=>query.limit || 0,
|
30
|
+
:skip=>query.offset || 0
|
31
|
+
}
|
32
|
+
|
33
|
+
find_options[:sort] = prepare_order_by(query)
|
34
|
+
filter = {}
|
35
|
+
if query == aggregation || query.transient?
|
36
|
+
aggregation = aggregate(query,options)
|
37
|
+
else
|
38
|
+
process_if_required
|
39
|
+
aggregation = aggregation_for(query)
|
40
|
+
#if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
|
41
|
+
#otherwise, a second map reduce is required to reduce the data set one last time
|
42
|
+
if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
|
43
|
+
filter = prepare_filter(query,options[:where] || {})
|
44
|
+
else
|
45
|
+
aggregation = aggregate(query,:source_collection=>aggregation.name)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
if aggregation.blank?
|
50
|
+
Cubicle::Data::Table.new(query,[],0) if aggregation == []
|
51
|
+
else
|
52
|
+
count = aggregation.count
|
53
|
+
results = aggregation.find(filter,find_options).to_a
|
54
|
+
aggregation.drop if aggregation.name =~ /^tmp.mr.*/
|
55
|
+
Cubicle::Data::Table.new(query, results, count)
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
# Rebuilds this aggregation from scratch: expires the existing data, runs the
# base aggregation, then builds each configured sub-aggregation, logging the
# time taken by every step.
#
# options - Hash passed through to aggregate for the base aggregation.
def process(options={})
  Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
  start = Time.now
  expire!
  aggregate(aggregation,options)
  # Sort descending by member-list length so that larger aggregations are
  # processed first, hopefully increasing the efficiency of the processing step.
  aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
  aggregation.aggregations.each do |member_list|
    agg_start = Time.now
    aggregation_for(aggregation.query(:defer=>true){select member_list})
    Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
  end
  duration = Time.now - start
  # Keep a space between the timestamp and "in" so the log line stays readable.
  Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now} in #{duration} seconds."
end
|
77
|
+
|
78
|
+
def expire!
|
79
|
+
collection.drop
|
80
|
+
expire_aggregations!
|
81
|
+
end
|
82
|
+
|
83
|
+
protected
|
84
|
+
|
85
|
+
# Lists the names of the cached aggregation collections that belong to this
# aggregation, i.e. collections named "<target_collection_name>_aggregation_...".
#
# Returns an Array of collection name Strings.
def aggregation_collection_names
  # Anchor at the start of the name and escape the collection name so that
  # regex metacharacters in it (such as ".") are matched literally and
  # collections of a differently named cube are never picked up.
  own_prefix = /\A#{Regexp.escape(aggregation.target_collection_name)}_aggregation_/
  database.collection_names.select { |col_name| col_name =~ own_prefix }
end
|
88
|
+
|
89
|
+
def expire_aggregations!
|
90
|
+
aggregation_collection_names.each{|agg_col|database[agg_col].drop}
|
91
|
+
end
|
92
|
+
|
93
|
+
def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
|
94
|
+
#format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
|
95
|
+
#this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
|
96
|
+
existing = existing_aggregations.map do |agg_col_name|
|
97
|
+
agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
|
98
|
+
end
|
99
|
+
|
100
|
+
#This will select all the aggregations that contain ALL of the desired dimension names
|
101
|
+
#we are sorting by length because the aggregation with the least number of members
|
102
|
+
#is likely to be the most efficient data source as it will likely contain the smallest number of rows.
|
103
|
+
#this will not always be true, and situations may exist where it is rarely true, however the alternative
|
104
|
+
#is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
|
105
|
+
#but until there is some reason to believe the aggregation caching process needs be highly performant,
|
106
|
+
#this should do for now.
|
107
|
+
candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
|
108
|
+
|
109
|
+
#If no suitable aggregation exists to base this one off of,
|
110
|
+
#we'll just use the base cubes aggregation collection
|
111
|
+
return target_collection_name if candidates.blank?
|
112
|
+
"#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
|
113
|
+
|
114
|
+
end
|
115
|
+
|
116
|
+
def aggregation_for(query)
|
117
|
+
return collection if query.all_dimensions?
|
118
|
+
|
119
|
+
aggregation_query = query.clone
|
120
|
+
#If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
|
121
|
+
filter = (query.where if query.respond_to?(:where))
|
122
|
+
filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
|
123
|
+
|
124
|
+
dimension_names = aggregation_query.dimension_names.sort
|
125
|
+
agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
|
126
|
+
|
127
|
+
unless database.collection_names.include?(agg_col_name)
|
128
|
+
source_col_name = find_best_source_collection(dimension_names)
|
129
|
+
exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
|
130
|
+
aggregate(exec_query, :target_collection=>agg_col_name)
|
131
|
+
end
|
132
|
+
|
133
|
+
database[agg_col_name]
|
134
|
+
end
|
135
|
+
|
136
|
+
def ensure_indexes(collection_name,dimension_names)
|
137
|
+
col = database[collection_name]
|
138
|
+
#an index for each dimension
|
139
|
+
dimension_names.each {|dim|col.create_index(dim)}
|
140
|
+
#The below composite isn't working, I think because of too many fields being
|
141
|
+
#indexed. After some thought, I think maybe this is overkill anyway. However,
|
142
|
+
#there should be SOME way to build composite indexes for common queries,
|
143
|
+
#so more thought is needed. Maybe cubicle can compile and analyze query
|
144
|
+
#stats and choose indexes automatically based on usage. For now, however,
|
145
|
+
#I'm just going to turn the thing off.
|
146
|
+
#col.create_index(dimension_names.map{|dim|[dim,1]})
|
147
|
+
end
|
148
|
+
|
149
|
+
def aggregate(query,options={})
|
150
|
+
view = AggregationView.new(aggregation,query)
|
151
|
+
|
152
|
+
map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
|
153
|
+
|
154
|
+
options[:finalize] = MapReduceHelper.generate_finalize_function(query)
|
155
|
+
options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
|
156
|
+
|
157
|
+
query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
|
158
|
+
|
159
|
+
target_collection = options.delete(:target_collection)
|
160
|
+
target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
|
161
|
+
|
162
|
+
options[:out] = target_collection unless target_collection.blank? || query.transient?
|
163
|
+
|
164
|
+
#This is defensive - some tests run without ever initializing any collections
|
165
|
+
return [] unless database.collection_names.include?(query.source_collection_name)
|
166
|
+
|
167
|
+
result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
|
168
|
+
|
169
|
+
ensure_indexes(target_collection,query.dimension_names) if target_collection
|
170
|
+
|
171
|
+
result
|
172
|
+
end
|
173
|
+
|
174
|
+
def expand_template(template,view)
|
175
|
+
return "" unless template
|
176
|
+
return Mustache.render(template,view) if template.is_a?(String)
|
177
|
+
if (template.is_a?(Hash))
|
178
|
+
template.each {|key,val|template[key] = expand_template(val,view)}
|
179
|
+
return template
|
180
|
+
end
|
181
|
+
template
|
182
|
+
end
|
183
|
+
|
184
|
+
def prepare_filter(query,filter={})
|
185
|
+
filter.merge!(query.where) if query.respond_to?(:where) && query.where
|
186
|
+
filter.stringify_keys!
|
187
|
+
transient = (query.transient? || query == aggregation)
|
188
|
+
filter.keys.each do |key|
|
189
|
+
next if key=~/^\$.*/
|
190
|
+
prefix = nil
|
191
|
+
prefix = "_id" if (member = aggregation.dimensions[key])
|
192
|
+
prefix = "value" if (member = aggregation.measures[key]) unless member
|
193
|
+
|
194
|
+
raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
|
195
|
+
|
196
|
+
filter_value = filter.delete(key)
|
197
|
+
if transient
|
198
|
+
if (member.expression_type == :javascript)
|
199
|
+
filter_name = "$where"
|
200
|
+
filter_value = make_filter_transient(member.expression,filter_value)
|
201
|
+
else
|
202
|
+
filter_name = member.field_name
|
203
|
+
end
|
204
|
+
else
|
205
|
+
filter_name = "#{prefix}.#{member.name}"
|
206
|
+
end
|
207
|
+
filter[filter_name] = filter_value
|
208
|
+
end
|
209
|
+
filter
|
210
|
+
end
|
211
|
+
|
212
|
+
# Translates the query's order_by clauses into a sort specification keyed by
# the field paths used in aggregation collections.
#
# query - object whose order_by yields [member_name, direction] pairs.
#
# Returns an Array of ["<prefix>.<member_name>", direction] pairs, where the
# prefix is "_id" for dimensions and "value" for measures.
# Raises RuntimeError when a clause names a member that is neither a
# dimension nor a measure of this aggregation.
def prepare_order_by(query)
  query.order_by.map do |order|
    member_name = order[0]
    # Dimensions take precedence over measures when both define the name.
    if aggregation.dimensions[member_name]
      prefix = "_id"
    elsif aggregation.measures[member_name]
      prefix = "value"
    else
      # Interpolate the offending member name so the caller can see which
      # order_by field was rejected.
      raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{member_name}"
    end
    ["#{prefix}.#{member_name}", order[1]]
  end
end
|
222
|
+
|
223
|
+
def process_if_required
|
224
|
+
return if database.collection_names.include?(target_collection_name)
|
225
|
+
process
|
226
|
+
end
|
227
|
+
|
228
|
+
def make_filter_transient(filter_expression,filter_value)
|
229
|
+
filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
|
230
|
+
conditions = filter_value.keys.map do |operator|
|
231
|
+
"val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
|
232
|
+
end
|
233
|
+
return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
|
234
|
+
end
|
235
|
+
|
236
|
+
def make_operator_transient(operator)
|
237
|
+
case operator
|
238
|
+
when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
|
239
|
+
when "$ne" then "!="
|
240
|
+
when "$lt" then "<"
|
241
|
+
when "$gt" then ">"
|
242
|
+
when "$lte" then "<="
|
243
|
+
when "$gte" then ">="
|
244
|
+
else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
|
245
|
+
end
|
246
|
+
end
|
247
|
+
|
248
|
+
# Renders a filter value for embedding in a generated JavaScript condition.
#
# filter_value - the value to embed. Strings and Symbols are wrapped in
#                single quotes; any other value is returned unchanged.
#
# Returns a single-quoted String for String/Symbol input, otherwise
# filter_value itself.
def quote_if_required(filter_value)
  return filter_value unless filter_value.is_a?(String) || filter_value.is_a?(Symbol)

  # Escape characters that would otherwise terminate or corrupt the
  # single-quoted JavaScript literal (backslash, quote, line breaks).
  escaped = filter_value.to_s.gsub(/[\\'\n\r]/) do |char|
    case char
    when "\n" then "\\n"
    when "\r" then "\\r"
    else "\\#{char}"
    end
  end
  "'#{escaped}'"
end
|
251
|
+
|
252
|
+
end
|
253
|
+
end
|
249
254
|
end
|