cubicle 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +14 -0
- data/README.rdoc +188 -174
- data/cubicle.gemspec +26 -10
- data/lib/cubicle.rb +47 -422
- data/lib/cubicle/aggregation.rb +58 -7
- data/lib/cubicle/aggregation/ad_hoc.rb +12 -0
- data/lib/cubicle/aggregation/aggregation_manager.rb +212 -0
- data/lib/cubicle/aggregation/dsl.rb +108 -0
- data/lib/cubicle/aggregation/map_reduce_helper.rb +55 -0
- data/lib/cubicle/data.rb +29 -84
- data/lib/cubicle/data/hierarchy.rb +55 -0
- data/lib/cubicle/data/level.rb +62 -0
- data/lib/cubicle/data/member.rb +28 -0
- data/lib/cubicle/data/table.rb +56 -0
- data/lib/cubicle/measure.rb +30 -20
- data/lib/cubicle/mongo_mapper/aggregate_plugin.rb +1 -1
- data/lib/cubicle/ordered_hash_with_indifferent_access.rb +27 -0
- data/lib/cubicle/query.rb +21 -194
- data/lib/cubicle/query/dsl.rb +118 -0
- data/lib/cubicle/query/dsl/time_intelligence.rb +89 -0
- data/lib/cubicle/ratio.rb +28 -12
- data/lib/cubicle/version.rb +2 -2
- data/test/cubicle/aggregation/ad_hoc_test.rb +21 -0
- data/test/cubicle/cubicle_aggregation_test.rb +84 -20
- data/test/cubicle/cubicle_query_test.rb +36 -0
- data/test/cubicle/data/data_test.rb +30 -0
- data/test/cubicle/data/level_test.rb +42 -0
- data/test/cubicle/data/member_test.rb +40 -0
- data/test/cubicle/{cubicle_data_test.rb → data/table_test.rb} +50 -50
- data/test/cubicle/duration_test.rb +46 -48
- data/test/cubicle/ordered_hash_with_indifferent_access_test.rb +19 -0
- data/test/cubicles/defect_cubicle.rb +31 -31
- data/test/log/test.log +102066 -0
- metadata +26 -10
- data/lib/cubicle/data_level.rb +0 -60
- data/test/cubicle/cubicle_data_level_test.rb +0 -58
- data/test/cubicle/cubicle_test.rb +0 -85
data/lib/cubicle.rb
CHANGED
@@ -1,423 +1,48 @@
|
|
1
|
-
require "rubygems"
|
2
|
-
require "active_support"
|
3
|
-
require "mongo"
|
4
|
-
require "logger"
|
5
|
-
|
6
|
-
dir = File.dirname(__FILE__)
|
7
|
-
["mongo_environment",
|
8
|
-
"
|
9
|
-
"
|
10
|
-
"
|
11
|
-
"
|
12
|
-
"
|
13
|
-
"
|
14
|
-
"
|
15
|
-
"
|
16
|
-
"
|
17
|
-
"
|
18
|
-
"
|
19
|
-
"
|
20
|
-
"
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
@transient ||= false
|
49
|
-
end
|
50
|
-
|
51
|
-
# Flags this cubicle as transient by setting @transient to true.
# Returns true.
def transient!
  @transient = true
end
|
54
|
-
|
55
|
-
# Drops this cubicle's collection, then expires its aggregations.
# Returns whatever expire_aggregations! returns.
def expire!
  collection.drop
  expire_aggregations!
end
|
59
|
-
|
60
|
-
# Lazily initialised list held in @aggregations (an empty Array on first use).
def aggregations
  @aggregations ||= []
end
|
63
|
-
|
64
|
-
#DSL
|
65
|
-
# Combined reader/writer for the source collection name.
#
# With an argument, stores it in @source_collection and returns it.
# Without one, returns the stored name, defaulting (once) to the cubicle's
# name with a trailing "Cubicle"/"Cube" removed, underscored and pluralized.
def source_collection_name(collection_name = nil)
  if collection_name
    @source_collection = collection_name
  else
    @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
  end
end
alias source_collection_name= source_collection_name
|
70
|
-
|
71
|
-
# Combined reader/writer for the target collection name.
#
# Always returns nil when transient? is true. With an argument, stores it in
# @target_name and returns it. Without one, returns the stored name,
# defaulting (once) to "<base>_cubicle", where <base> is the source collection
# name when this cubicle's name is blank, else the underscored, pluralized name.
def target_collection_name(collection_name = nil)
  if transient?
    nil
  elsif collection_name
    @target_name = collection_name
  else
    @target_name ||= begin
      base = name.blank? ? source_collection_name : name.underscore.pluralize
      "#{base}_cubicle"
    end
  end
end
alias target_collection_name= target_collection_name
|
77
|
-
|
78
|
-
# Builds a Cubicle::Dimension from the given arguments, appends it to
# the dimensions list and returns the list's last element.
def dimension(*args)
  list = dimensions
  list << Cubicle::Dimension.new(*args)
  dimensions[-1]
end
|
82
|
-
|
83
|
-
# Names of all declared dimensions, as Strings.
#
# Returns an empty Array when no dimension has been declared yet; the
# previous version called map on a nil @dimensions in that case.
def dimension_names
  (@dimensions || []).map { |dim| dim.name.to_s }
end
|
86
|
-
|
87
|
-
# Reader and bulk declaration helper for dimensions.
#
# With no arguments, returns @dimensions, creating a Cubicle::MemberList on
# first use. Otherwise each argument (or each element of a single Array
# argument) is passed to dimension, and @dimensions is returned.
def dimensions(*args)
  if args.empty?
    return @dimensions ||= Cubicle::MemberList.new
  end

  requested = (args.length == 1 && args[0].is_a?(Array)) ? args[0] : args
  requested.each do |dim|
    dimension dim
  end
  @dimensions
end
|
93
|
-
|
94
|
-
# Builds a Measure from the given arguments, appends it to the measures
# list and returns the list's last element.
def measure(*args)
  list = measures
  list << Measure.new(*args)
  measures[-1]
end
|
98
|
-
|
99
|
-
# Reader and bulk declaration helper for measures.
#
# With no arguments, returns @measures, creating a Cubicle::MemberList on
# first use. Otherwise each argument (or each element of a single Array
# argument) is passed to measure, and @measures is returned.
def measures(*args)
  if args.empty?
    return @measures ||= Cubicle::MemberList.new
  end

  requested = (args.length == 1 && args[0].is_a?(Array)) ? args[0] : args
  requested.each do |m|
    measure m
  end
  @measures
end
|
105
|
-
|
106
|
-
# Declares a measure whose :aggregation_method option is forced to :count.
# Any trailing options Hash in the arguments is kept and extended.
def count(*args)
  settings = args.extract_options!
  settings[:aggregation_method] = :count
  args.push(settings)
  measure(*args)
end
|
111
|
-
|
112
|
-
# Declares an averaged measure plus the companion "<name>_count" measure.
#
# The first positional argument is the measure name; a trailing Hash is
# treated as options. Returns whatever the count call returns.
def average(*args)
  options = args.extract_options!
  options[:aggregation_method] = :average
  measure(*(args + [options]))

  # Averaged fields need a count of non-null values to properly calculate the average.
  count_args = args.dup
  count_args[0] = "#{args[0]}_count".to_sym
  # The count gets its own copy of the options: count overwrites
  # :aggregation_method, and the original Hash was already handed to the
  # averaged measure above.
  count(*(count_args << options.dup))
end
alias avg average
|
121
|
-
|
122
|
-
# Declares a measure whose :aggregation_method option is forced to :sum.
# Any trailing options Hash in the arguments is kept and extended.
def sum(*args)
  settings = args.extract_options!
  settings[:aggregation_method] = :sum
  args.push(settings)
  measure(*args)
end
|
127
|
-
|
128
|
-
# Declares a Duration measure.
#
# The :in option defaults to durations_in. When the resulting Duration's
# aggregation_method is :average, a "<name>_count" count over the same
# expression is declared as well and its result is returned; otherwise
# the method returns nil.
def duration(*args)
  options = args.extract_options!
  options[:in] ||= durations_in
  args.push(options)

  list = measures
  dur = Duration.new(*args)
  list << dur

  return nil unless dur.aggregation_method == :average
  count("#{dur.name}_count".to_sym, :expression => dur.expression)
end
|
135
|
-
|
136
|
-
# Forwards all arguments unchanged to duration and returns its result.
def average_duration(*args)
  duration(*args)
end
alias avg_duration average_duration
|
140
|
-
|
141
|
-
# Declares a duration whose :aggregation_method option is forced to :sum.
def total_duration(*args)
  settings = args.extract_options!
  settings[:aggregation_method] = :sum
  args.push(settings)
  duration(*args)
end
|
146
|
-
|
147
|
-
# Combined reader/writer for the duration unit.
#
# With an argument, stores its pluralized Symbol form in @duration_unit.
# Without one, returns the stored unit, defaulting to :seconds.
def durations_in(unit_of_time = nil)
  if unit_of_time
    @duration_unit = unit_of_time.to_s.pluralize.to_sym
  else
    @duration_unit ||= :seconds
  end
end
alias :duration_unit :durations_in
|
152
|
-
|
153
|
-
|
154
|
-
# Appends a Ratio built from the given name, numerator and denominator
# to the measures list. Returns the result of the append.
def ratio(member_name, numerator, denominator)
  list = measures
  list << Ratio.new(member_name, numerator, denominator)
end
|
157
|
-
|
158
|
-
# Registers a member list in aggregations. Members may be given as
# separate arguments or as a single Array (detected via the first argument).
def aggregation(*member_list)
  members = member_list[0].is_a?(Array) ? member_list[0] : member_list
  aggregations << members
end
|
162
|
-
|
163
|
-
# Combined reader/writer for the time dimension.
#
# Without arguments, returns @time_dimension (nil until one is declared).
# With arguments, declares a dimension from them, stores it in
# @time_dimension and returns it.
def time_dimension(*args)
  if args.length > 0
    @time_dimension = dimension(*args)
  else
    @time_dimension ||= nil
  end
end
alias time_dimension= time_dimension
alias date time_dimension
alias time time_dimension
|
170
|
-
|
171
|
-
# Looks a member up by name, checking dimensions first and then measures.
#
# Goes through the dimensions/measures readers rather than the raw instance
# variables, which are nil until a member of that kind has been declared.
# Returns the member, or whatever the measures lookup yields when neither
# list contains it.
def find_member(member_name)
  dimensions[member_name] || measures[member_name]
end
|
175
|
-
|
176
|
-
# Builds a Cubicle::Query against this cubicle and, unless deferred, runs it.
#
# Positional arguments are passed to the query's select; a trailing Hash is
# treated as options. A :source_collection option is removed from the options
# and set on the query. A block is yielded the query when it takes exactly
# one argument, and is otherwise instance_eval'd on the query. If nothing
# was selected, select_all is called.
#
# Returns the query itself when options[:defer] is truthy, else the result
# of execute_query.
def query(*args, &block)
  options = args.extract_options!
  built = Cubicle::Query.new(self)
  if options[:source_collection]
    built.source_collection_name = options.delete(:source_collection)
  end
  built.select(*args) unless args.empty?

  if block_given?
    if block.arity == 1
      yield built
    else
      built.instance_eval(&block)
    end
  end

  built.select_all unless built.selected?
  return built if options[:defer]

  results = execute_query(built, options)
  # If the 'by' clause was used in the query, hierarchize by the members
  # indicated, as the next step would otherwise almost certainly need to be
  # a call to hierarchize anyway.
  if built.respond_to?(:by) && built.by.length > 0
    results.hierarchize(*built.by)
  else
    results
  end
end
|
194
|
-
|
195
|
-
#noinspection RubyArgCount
|
196
|
-
# Runs a query and wraps the rows in a Cubicle::Data.
#
# Transient queries (or the cubicle itself) are aggregated directly.
# Otherwise the cubicle is processed if required and a matching aggregation
# collection is looked up; when the query's members do not line up with that
# aggregation, a further aggregate pass is run over this cubicle's collection.
#
# Always returns a Cubicle::Data. A blank aggregation yields an empty one;
# previously anything blank other than [] made the method return nil.
def execute_query(query, options = {})
  find_options = {
    :limit => query.limit || 0,
    :skip  => query.offset || 0
  }
  find_options[:sort] = prepare_order_by(query)
  filter = {}

  if query == self || query.transient?
    aggregation = aggregate(query, options)
  else
    process_if_required
    aggregation = aggregation_for(query)
    # If the query exactly matches the aggregation in terms of requested
    # members, we can issue a simple find; otherwise a second map reduce is
    # required to reduce the data set one last time.
    if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
      filter = prepare_filter(query, options[:where] || {})
    else
      aggregation = aggregate(query, :source_collection => collection.name)
    end
  end

  return Cubicle::Data.new(query, [], 0) if aggregation.blank?

  count = aggregation.count
  results = aggregation.find(filter, find_options).to_a
  # Collections whose name starts with "tmp" + any char + "mr" are dropped once read.
  aggregation.drop if aggregation.name =~ /^tmp.mr.*/
  Cubicle::Data.new(query, results, count)
end
|
230
|
-
|
231
|
-
# Rebuilds this cubicle: expires existing data, re-aggregates, then builds
# each declared aggregation, logging timings along the way.
#
# options are passed through to aggregate. Returns the result of the final
# logger call.
def process(options = {})
  Cubicle.logger.info "Processing #{name} @ #{Time.now}"
  start = Time.now
  expire!
  aggregate(self, options)

  # Sort desc by length of array, so that larger aggregations are processed
  # first, hopefully increasing efficiency of the processing step.
  aggregations.sort! { |a, b| b.length <=> a.length }
  aggregations.each do |member_list|
    agg_start = Time.now
    aggregation_for(query(:defer => true) { select member_list })
    Cubicle.logger.info "#{name} aggregation #{member_list.inspect} processed in #{Time.now - agg_start} seconds"
  end

  # Named `elapsed` so it does not shadow the `duration` DSL method.
  elapsed = Time.now - start
  Cubicle.logger.info "#{name} processed @ #{Time.now} in #{elapsed} seconds."
end
|
248
|
-
|
249
|
-
protected
|
250
|
-
|
251
|
-
# Names of the database collections holding this cubicle's aggregations,
# i.e. those starting with "<target_collection_name>_aggregation_".
#
# The prefix is regex-escaped and anchored to the start of the name, so a
# collection belonging to another cubicle whose name merely contains this
# cubicle's target name is not picked up (and later dropped by expiry).
def aggregation_collection_names
  prefix = /\A#{Regexp.escape(target_collection_name.to_s)}_aggregation_/
  database.collection_names.select { |col_name| col_name =~ prefix }
end
|
254
|
-
|
255
|
-
# Drops every collection listed by aggregation_collection_names.
def expire_aggregations!
  aggregation_collection_names.each do |agg_col|
    database[agg_col].drop
  end
end
|
258
|
-
|
259
|
-
# Picks the collection to build a new aggregation from.
#
# dimension_names       - Array of dimension name Strings the new aggregation needs.
# existing_aggregations - Array of existing aggregation collection names
#                         (defaults to aggregation_collection_names).
#
# Aggregation collections are named
# "<target_collection_name>_aggregation_dim1.dim2...dimN". Among the existing
# ones containing ALL requested dimensions, the one with the fewest members
# is chosen, on the assumption that it likely holds the fewest rows; on a tie
# the first listed wins. Returns target_collection_name when none qualifies.
def find_best_source_collection(dimension_names, existing_aggregations = self.aggregation_collection_names)
  prefix = "#{target_collection_name}_aggregation_"
  # Strip only the leading prefix; the dimension list itself is left alone.
  leading_prefix = /\A#{Regexp.escape(prefix)}/

  candidates = existing_aggregations.map do |agg_col_name|
    agg_col_name.sub(leading_prefix, "").split(".")
  end.select { |members| (dimension_names - members).empty? }

  # If no suitable aggregation exists to base this one off of,
  # we'll just use the base cube's aggregation collection.
  return target_collection_name if candidates.empty?

  smallest = candidates.min_by { |members| members.length }
  "#{prefix}#{smallest.join('.')}"
end
|
281
|
-
|
282
|
-
# Returns the collection that should serve the given query, building the
# aggregation collection first when it does not exist yet.
#
# Queries over all dimensions are served by the cubicle's own collection.
# Otherwise the aggregation is named after the sorted dimension names of a
# clone of the query that has been extended with every non-"$" filter key.
def aggregation_for(query)
  return collection if query.all_dimensions?

  aggregation_query = query.clone
  # If the query needs to filter on a field, it had better be in the
  # aggregation...if it isn't a $where filter...
  filter = query.respond_to?(:where) ? query.where : nil
  unless filter.blank?
    filter.keys.each do |filter_key|
      aggregation_query.select(filter_key) unless filter_key =~ /^\$.*/
    end
  end

  dimension_names = aggregation_query.dimension_names.sort
  agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"

  if !database.collection_names.include?(agg_col_name)
    source_col_name = find_best_source_collection(dimension_names)
    exec_query = query(dimension_names + [:all_measures], :source_collection => source_col_name, :defer => true)
    aggregate(exec_query, :target_collection => agg_col_name)
  end

  database[agg_col_name]
end
|
301
|
-
|
302
|
-
# Creates indexes on the named collection: one ascending index per
# dimension, then one index over the full list of dimension names.
# Returns the result of the last create_index call.
def ensure_indexes(collection_name, dimension_names)
  dimension_names.each do |dim|
    database[collection_name].create_index([dim, Mongo::ASCENDING])
  end
  database[collection_name].create_index(dimension_names)
end
|
308
|
-
|
309
|
-
# Runs a map/reduce for the given query over its source collection.
#
# options is modified in place: :finalize and "query" are set,
# :target_collection is removed, and :out is set when there is a non-blank
# target and the query is not transient. The query's source collection name
# defaults to this cubicle's.
#
# Returns [] when the source collection does not exist in the database,
# otherwise the map_reduce result. Indexes are ensured on the target
# collection when one was determined.
def aggregate(query, options = {})
  map = generate_map_function(query)
  reduce = generate_reduce_function
  options[:finalize] = generate_finalize_function(query)
  options["query"] = prepare_filter(query, options[:where] || {})

  query.source_collection_name ||= source_collection_name

  target_collection = options.delete(:target_collection)
  if query.respond_to?(:target_collection_name)
    target_collection ||= query.target_collection_name
  end

  unless target_collection.blank? || query.transient?
    options[:out] = target_collection
  end

  # This is defensive - some tests run without ever initializing any collections
  if !database.collection_names.include?(query.source_collection_name)
    return []
  end

  result = database[query.source_collection_name].map_reduce(map, reduce, options)

  ensure_indexes(target_collection, query.dimension_names) if target_collection

  result
end
|
330
|
-
|
331
|
-
# Translates a member-keyed filter into a selector keyed for the collection
# being queried.
#
# query  - the query (or the cubicle itself); its where clause, if any, is
#          merged over the supplied filter.
# filter - Hash of member name => value. Keys starting with "$" are passed
#          through untouched.
#
# The supplied Hash is copied first, so the caller's filter is never
# modified; the previous version rewrote it in place, which corrupted any
# :where Hash that was used for more than one query.
#
# For non-transient queries, keys become "_id.<name>" for dimensions and
# "value.<name>" for measures. For transient queries (or the cubicle
# itself), the key is the member's expression, or a "$where" equality test
# when the member's expression_type is :javascript.
#
# Returns the translated Hash. Raises a RuntimeError for a key that is
# neither a dimension nor a measure.
def prepare_filter(query, filter = {})
  filter = filter.dup
  filter.merge!(query.where) if query.respond_to?(:where) && query.where
  filter.stringify_keys!
  transient = (query.transient? || query == self)

  filter.keys.each do |key|
    next if key =~ /^\$.*/

    if (member = dimensions[key])
      prefix = "_id"
    elsif (member = measures[key])
      prefix = "value"
    else
      raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}"
    end

    filter_value = filter.delete(key)
    if !transient
      filter_name = "#{prefix}.#{member.name}"
    elsif member.expression_type == :javascript
      filter_name = "$where"
      filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
      filter_value = "(#{member.expression})==#{filter_value}"
    else
      filter_name = member.expression
    end
    filter[filter_name] = filter_value
  end
  filter
end
|
359
|
-
|
360
|
-
# Translates the query's order_by pairs into sort pairs keyed for an
# aggregation collection.
#
# Each entry of query.order_by is a [member_name, direction] pair. Dimension
# names are prefixed with "_id.", measure names with "value.".
#
# Returns an Array of [prefixed_name, direction] pairs. Raises a RuntimeError
# naming the offending field when it is neither a dimension nor a measure;
# the previous message interpolated an undefined variable and so raised a
# NameError instead.
def prepare_order_by(query)
  query.order_by.map do |order|
    field = order[0]
    if dimensions[field]
      prefix = "_id"
    elsif measures[field]
      prefix = "value"
    else
      raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{field}"
    end
    ["#{prefix}.#{field}", order[1]]
  end
end
|
370
|
-
|
371
|
-
# Runs process unless the database already lists the target collection.
# Returns nil when nothing had to be done.
def process_if_required
  process unless database.collection_names.include?(target_collection_name)
end
|
375
|
-
|
376
|
-
|
377
|
-
# Builds a "{...}" string from the to_js_keys output of every dimension
# of the query, flattened and joined with ", ".
def generate_keys_string(query)
  js_keys = query.dimensions.map { |dim| dim.to_js_keys }.flatten
  "{" + js_keys.join(", ") + "}"
end
|
380
|
-
|
381
|
-
# Builds a "{...}" string from the to_js_keys output of every measure
# of the query (this cubicle by default), flattened and joined with ", ".
def generate_values_string(query = self)
  js_values = query.measures.map { |measure| measure.to_js_keys }.flatten
  "{" + js_values.join(", ") + "}"
end
|
384
|
-
|
385
|
-
# Returns the JavaScript map function source for the query (this cubicle by
# default): a single emit of the keys string and the values string,
# followed by a newline.
def generate_map_function(query = self)
  keys = generate_keys_string(query)
  values = generate_values_string(query)
  "function(){emit(#{keys},#{values});}\n"
end
|
390
|
-
|
391
|
-
# Returns the JavaScript reduce function source. The emitted function sums,
# per field, every value that is truthy or equal to 0 across the reduced
# documents.
def generate_reduce_function
  js_lines = [
    "function(key,values){",
    "var output = {};",
    "values.forEach(function(doc){",
    "for(var key in doc){",
    "if (doc[key] || doc[key] == 0){",
    "output[key] = output[key] || 0;",
    "output[key] += doc[key];",
    "}",
    "}",
    "});",
    "return output;",
    "}"
  ]
  js_lines.join("\n") + "\n"
end
|
407
|
-
|
408
|
-
# Returns the JavaScript finalize function source for the query (this
# cubicle by default).
#
# For each measure with aggregation_method :average it emits a statement
# dividing the value by its "<name>_count" field; for each :calculation
# measure it emits an assignment of the measure's expression.
def generate_finalize_function(query = self)
  averages = query.measures.select { |m| m.aggregation_method == :average }
  calculations = query.measures.select { |m| m.aggregation_method == :calculation }

  average_js = averages.map do |m|
    "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
  end.join("\n")
  calculation_js = calculations.map do |m|
    "value.#{m.name}=#{m.expression};"
  end.join("\n")

  "function(key,value)\n{\n\n#{average_js}\n#{calculation_js}\nreturn value;\n}\n"
end
|
1
|
+
require "rubygems"
|
2
|
+
require "active_support"
|
3
|
+
require "mongo"
|
4
|
+
require "logger"
|
5
|
+
|
6
|
+
dir = File.dirname(__FILE__)
|
7
|
+
["mongo_environment",
|
8
|
+
"ordered_hash_with_indifferent_access",
|
9
|
+
"member",
|
10
|
+
"member_list",
|
11
|
+
"measure",
|
12
|
+
"calculated_measure",
|
13
|
+
"dimension",
|
14
|
+
"ratio",
|
15
|
+
"duration",
|
16
|
+
"query/dsl/time_intelligence",
|
17
|
+
"query/dsl",
|
18
|
+
"query",
|
19
|
+
"data",
|
20
|
+
"data/member",
|
21
|
+
"data/level",
|
22
|
+
"data/hierarchy",
|
23
|
+
"data/table",
|
24
|
+
"aggregation/aggregation_manager",
|
25
|
+
"aggregation/map_reduce_helper",
|
26
|
+
"aggregation/dsl",
|
27
|
+
"aggregation",
|
28
|
+
"aggregation/ad_hoc",
|
29
|
+
"date_time",
|
30
|
+
"support"].each {|lib|require File.join(dir,'cubicle',lib)}
|
31
|
+
|
32
|
+
require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
|
33
|
+
|
34
|
+
module Cubicle

  # Requires every ".rb" file found in directory_path.
  #
  # directory_path - directory to search.
  # recursive      - when true (the default) subdirectories are searched as
  #                  well; when false only files directly inside
  #                  directory_path are loaded. The two glob patterns were
  #                  previously swapped.
  #
  # Files are required in sorted path order. Returns the list of paths.
  def self.register_cubicle_directory(directory_path, recursive=true)
    searcher = "#{recursive ? "**/*" : "*"}.rb"
    Dir[File.join(directory_path,searcher)].sort.each {|cubicle| require cubicle}
  end

  # The Mongo environment in use: MongoMapper when MongoMapper::Document is
  # defined, otherwise MongoEnvironment. Memoized on first call.
  def self.mongo
    @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
  end

  # The logger in use: the Mongo environment's logger when it has one,
  # otherwise a Logger writing to "cubicle.log". Memoized on first call.
  def self.logger
    @logger ||= (Cubicle.mongo.logger || Logger.new("cubicle.log"))
  end
end
|