cubicle 0.1.19 → 0.1.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +3 -0
- data/cubicle.gemspec +2 -2
- data/lib/cubicle/aggregation/aggregation_manager.rb +253 -248
- data/lib/cubicle/version.rb +1 -1
- data/test/log/test.log +1738 -64537
- metadata +3 -3
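The comparison can be reproduced locally by fetching and unpacking both released gems. A minimal sketch, assuming the `gem` and `diff` executables are on the PATH (the scratch directory is hypothetical):

```ruby
# Fetch both released gems, unpack them, and diff the unpacked trees.
require "fileutils"

workdir = "tmp/cubicle-diff" # hypothetical scratch directory
FileUtils.mkdir_p(workdir)
Dir.chdir(workdir) do
  %w[0.1.19 0.1.20].each do |version|
    system("gem", "fetch", "cubicle", "-v", version) or abort "fetch failed"
    system("gem", "unpack", "cubicle-#{version}.gem") or abort "unpack failed"
  end
  # Recursive unified diff of the two trees; should match the summary above.
  system("diff", "-ru", "cubicle-0.1.19", "cubicle-0.1.20")
end
```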
data/CHANGELOG.rdoc
CHANGED
data/cubicle.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{cubicle}
-  s.version = "0.1.19"
+  s.version = "0.1.20"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Nathan Stults"]
-  s.date = %q{2010-
+  s.date = %q{2010-05-03}
   s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
   s.email = %q{hereiam@sonic.net}
   s.extra_rdoc_files = [
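The gemspec delta is confined to the version string and the release date. After upgrading, the new version can be confirmed through the RubyGems API (a minimal sketch; `Gem::Specification.find_by_name` is available in RubyGems 1.8+ and the gem must be installed locally):

```ruby
# Confirm the installed cubicle release.
require "rubygems"

spec = Gem::Specification.find_by_name("cubicle")
puts spec.version # => 0.1.20 once this release is installed
```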
data/lib/cubicle/aggregation/aggregation_manager.rb
CHANGED
@@ -1,249 +1,254 @@
-module Cubicle
-  module Aggregation
-    class AggregationManager
-
-      attr_reader :aggregation
-
-      def initialize(aggregation)
-        @aggregation = aggregation
-      end
-
-      def database
-        Cubicle.mongo.database
-      end
-
-      def collection
-        database[aggregation.target_collection_name]
-      end
-
-      def target_collection_name
-        aggregation.target_collection_name
-      end
-
-
-      #noinspection RubyArgCount
-      def execute_query(query,options={})
-        count = 0
-
-        find_options = {
-          :limit=>query.limit || 0,
-          :skip=>query.offset || 0
-        }
-
-        find_options[:sort] = prepare_order_by(query)
-        filter = {}
-        if query == aggregation || query.transient?
-          aggregation = aggregate(query,options)
-        else
-          process_if_required
-          aggregation = aggregation_for(query)
-          #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
-          #otherwise, a second map reduce is required to reduce the data set one last time
-          if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
-            filter = prepare_filter(query,options[:where] || {})
-          else
-            aggregation = aggregate(query,:source_collection=>aggregation.name)
-          end
-        end
-
-        if aggregation.blank?
-          Cubicle::Data::Table.new(query,[],0) if aggregation == []
-        else
-          count = aggregation.count
-          results = aggregation.find(filter,find_options).to_a
-          aggregation.drop if aggregation.name =~ /^tmp.mr.*/
-          Cubicle::Data::Table.new(query, results, count)
-        end
-
-      end
-
-      def process(options={})
-        Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
-        start = Time.now
-        expire!
-        aggregate(aggregation,options)
-        #Sort desc by length of array, so that larget
-        #aggregations are processed first, hopefully increasing efficiency
-        #of the processing step
-        aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
-        aggregation.aggregations.each do |member_list|
-          agg_start = Time.now
-          aggregation_for(aggregation.query(:defer=>true){select member_list})
-          Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
-        end
-        duration = Time.now - start
-        Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now}in #{duration} seconds."
-      end
-
-      def expire!
-        collection.drop
-        expire_aggregations!
-      end
-
-      protected
-
-      def aggregation_collection_names
-        database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
-      end
-
-      def expire_aggregations!
-        aggregation_collection_names.each{|agg_col|database[agg_col].drop}
-      end
-
-      def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
-        #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
-        #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
-        existing = existing_aggregations.map do |agg_col_name|
-          agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
-        end
-
-        #This will select all the aggregations that contain ALL of the desired dimension names
-        #we are sorting by length because the aggregation with the least number of members
-        #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
-        #this will not always be true, and situations may exist where it is rarely true, however the alternative
-        #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
-        #but until there is some reason to believe the aggregation caching process needs be highly performant,
-        #this should do for now.
-        candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
-
-        #If no suitable aggregation exists to base this one off of,
-        #we'll just use the base cubes aggregation collection
-        return target_collection_name if candidates.blank?
-        "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
-
-      end
-
-      def aggregation_for(query)
-        return collection if query.all_dimensions?
-
-        aggregation_query = query.clone
-        #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
-        filter = (query.where if query.respond_to?(:where))
-        filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
-
-        dimension_names = aggregation_query.dimension_names.sort
-        agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
-
-        unless database.collection_names.include?(agg_col_name)
-          source_col_name = find_best_source_collection(dimension_names)
-          exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
-          aggregate(exec_query, :target_collection=>agg_col_name)
-        end
-
-        database[agg_col_name]
-      end
-
-      def ensure_indexes(collection_name,dimension_names)
-        col = database[collection_name]
-        #an index for each dimension
-        dimension_names.each {|dim|col.create_index(
[old lines 140–248 are truncated in the source extraction; only scattered fragments survive]
+module Cubicle
+  module Aggregation
+    class AggregationManager
+
+      attr_reader :aggregation
+
+      def initialize(aggregation)
+        @aggregation = aggregation
+      end
+
+      def database
+        Cubicle.mongo.database
+      end
+
+      def collection
+        database[aggregation.target_collection_name]
+      end
+
+      def target_collection_name
+        aggregation.target_collection_name
+      end
+
+
+      #noinspection RubyArgCount
+      def execute_query(query,options={})
+        count = 0
+
+        find_options = {
+          :limit=>query.limit || 0,
+          :skip=>query.offset || 0
+        }
+
+        find_options[:sort] = prepare_order_by(query)
+        filter = {}
+        if query == aggregation || query.transient?
+          aggregation = aggregate(query,options)
+        else
+          process_if_required
+          aggregation = aggregation_for(query)
+          #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
+          #otherwise, a second map reduce is required to reduce the data set one last time
+          if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
+            filter = prepare_filter(query,options[:where] || {})
+          else
+            aggregation = aggregate(query,:source_collection=>aggregation.name)
+          end
+        end
+
+        if aggregation.blank?
+          Cubicle::Data::Table.new(query,[],0) if aggregation == []
+        else
+          count = aggregation.count
+          results = aggregation.find(filter,find_options).to_a
+          aggregation.drop if aggregation.name =~ /^tmp.mr.*/
+          Cubicle::Data::Table.new(query, results, count)
+        end
+
+      end
+
+      def process(options={})
+        Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
+        start = Time.now
+        expire!
+        aggregate(aggregation,options)
+        #Sort desc by length of array, so that larget
+        #aggregations are processed first, hopefully increasing efficiency
+        #of the processing step
+        aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
+        aggregation.aggregations.each do |member_list|
+          agg_start = Time.now
+          aggregation_for(aggregation.query(:defer=>true){select member_list})
+          Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
+        end
+        duration = Time.now - start
+        Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now}in #{duration} seconds."
+      end
+
+      def expire!
+        collection.drop
+        expire_aggregations!
+      end
+
+      protected
+
+      def aggregation_collection_names
+        database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
+      end
+
+      def expire_aggregations!
+        aggregation_collection_names.each{|agg_col|database[agg_col].drop}
+      end
+
+      def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
+        #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
+        #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
+        existing = existing_aggregations.map do |agg_col_name|
+          agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
+        end
+
+        #This will select all the aggregations that contain ALL of the desired dimension names
+        #we are sorting by length because the aggregation with the least number of members
+        #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
+        #this will not always be true, and situations may exist where it is rarely true, however the alternative
+        #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
+        #but until there is some reason to believe the aggregation caching process needs be highly performant,
+        #this should do for now.
+        candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
+
+        #If no suitable aggregation exists to base this one off of,
+        #we'll just use the base cubes aggregation collection
+        return target_collection_name if candidates.blank?
+        "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
+
+      end
+
+      def aggregation_for(query)
+        return collection if query.all_dimensions?
+
+        aggregation_query = query.clone
+        #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
+        filter = (query.where if query.respond_to?(:where))
+        filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
+
+        dimension_names = aggregation_query.dimension_names.sort
+        agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
+
+        unless database.collection_names.include?(agg_col_name)
+          source_col_name = find_best_source_collection(dimension_names)
+          exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
+          aggregate(exec_query, :target_collection=>agg_col_name)
+        end
+
+        database[agg_col_name]
+      end
+
+      def ensure_indexes(collection_name,dimension_names)
+        col = database[collection_name]
+        #an index for each dimension
+        dimension_names.each {|dim|col.create_index(dim)}
+        #The below composite isn't working, I think because of too many fields being
+        #indexed. After some thought, I think maybe this is overkill anyway. However,
+        #there should be SOME way to build composite indexes for common queries,
+        #so more thought is needed. Maybe cubicle can compile and analyze query
+        #stats and choose indexes automatically based on usage. For now, however,
+        #I'm just going to turn the thing off.
+        #col.create_index(dimension_names.map{|dim|[dim,1]})
+      end
+
+      def aggregate(query,options={})
+        view = AggregationView.new(aggregation,query)
+
+        map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
+
+        options[:finalize] = MapReduceHelper.generate_finalize_function(query)
+        options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
+
+        query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
+
+        target_collection = options.delete(:target_collection)
+        target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
+
+        options[:out] = target_collection unless target_collection.blank? || query.transient?
+
+        #This is defensive - some tests run without ever initializing any collections
+        return [] unless database.collection_names.include?(query.source_collection_name)
+
+        result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
+
+        ensure_indexes(target_collection,query.dimension_names) if target_collection
+
+        result
+      end
+
+      def expand_template(template,view)
+        return "" unless template
+        return Mustache.render(template,view) if template.is_a?(String)
+        if (template.is_a?(Hash))
+          template.each {|key,val|template[key] = expand_template(val,view)}
+          return template
+        end
+        template
+      end
+
+      def prepare_filter(query,filter={})
+        filter.merge!(query.where) if query.respond_to?(:where) && query.where
+        filter.stringify_keys!
+        transient = (query.transient? || query == aggregation)
+        filter.keys.each do |key|
+          next if key=~/^\$.*/
+          prefix = nil
+          prefix = "_id" if (member = aggregation.dimensions[key])
+          prefix = "value" if (member = aggregation.measures[key]) unless member
+
+          raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
+
+          filter_value = filter.delete(key)
+          if transient
+            if (member.expression_type == :javascript)
+              filter_name = "$where"
+              filter_value = make_filter_transient(member.expression,filter_value)
+            else
+              filter_name = member.field_name
+            end
+          else
+            filter_name = "#{prefix}.#{member.name}"
+          end
+          filter[filter_name] = filter_value
+        end
+        filter
+      end
+
+      def prepare_order_by(query)
+        order_by = []
+        query.order_by.each do |order|
+          prefix = "_id" if (member = aggregation.dimensions[order[0]])
+          prefix = "value" if (member = aggregation.measures[order[0]]) unless member
+          raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
+          order_by << ["#{prefix}.#{order[0]}",order[1]]
+        end
+        order_by
+      end
+
+      def process_if_required
+        return if database.collection_names.include?(target_collection_name)
+        process
+      end
+
+      def make_filter_transient(filter_expression,filter_value)
+        filter_value = {"$eq"=>filter_value} unless filter_value.is_a?(Hash)
+        conditions = filter_value.keys.map do |operator|
+          "val #{make_operator_transient(operator)} #{quote_if_required(filter_value[operator])}"
+        end
+        return "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
+      end
+
+      def make_operator_transient(operator)
+        case operator
+          when "$eq" then "==" #not actually a mongo operator, but added for keeping things consistent
+          when "$ne" then "!="
+          when "$lt" then "<"
+          when "$gt" then ">"
+          when "$lte" then "<="
+          when "$gte" then ">="
+          else raise "unsupported filter operator for filtering members of expression based members in a transient query: #{operator}"
+        end
+      end
+
+      def quote_if_required(filter_value)
+        (filter_value.is_a?(String) || filter_value.is_a?(Symbol)) ? "'#{filter_value}'" :filter_value
+      end
+
+    end
+  end
 end
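The most intricate piece of the new file is the translation of Mongo-style comparison filters into a JavaScript `$where` clause for transient queries. The sketch below restates the logic of `make_filter_transient`, `make_operator_transient`, and `quote_if_required` from the diff above as standalone Ruby (it mirrors the gem's code rather than calling it; the example expression and values are hypothetical):

```ruby
# Mirrors the $where translation for transient queries shown in the diff.
OPERATOR_MAP = {
  "$eq"  => "==", # not a real Mongo operator; covers the bare-value case
  "$ne"  => "!=",
  "$lt"  => "<",
  "$gt"  => ">",
  "$lte" => "<=",
  "$gte" => ">="
}.freeze

def quote_if_required(value)
  (value.is_a?(String) || value.is_a?(Symbol)) ? "'#{value}'" : value
end

def make_filter_transient(filter_expression, filter_value)
  # Bare values become {"$eq" => value} so every case is operator-shaped.
  filter_value = { "$eq" => filter_value } unless filter_value.is_a?(Hash)
  conditions = filter_value.keys.map do |operator|
    op = OPERATOR_MAP.fetch(operator) { raise "unsupported operator: #{operator}" }
    "val #{op} #{quote_if_required(filter_value[operator])}"
  end
  "return (function(val){return #{conditions.join(" && ")};})(#{filter_expression})"
end

puts make_filter_transient("this.unit_price * this.qty", { "$gte" => 10, "$lt" => 100 })
# => return (function(val){return val >= 10 && val < 100;})(this.unit_price * this.qty)
```

Because the value of a JavaScript-expression member does not exist as a queryable field in the source collection, the generated function re-evaluates the member expression per document and applies the comparisons to the result, which is why these filters must go through `$where` rather than an ordinary field match.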