cubicle 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +14 -0
- data/README.rdoc +188 -174
- data/cubicle.gemspec +26 -10
- data/lib/cubicle.rb +47 -422
- data/lib/cubicle/aggregation.rb +58 -7
- data/lib/cubicle/aggregation/ad_hoc.rb +12 -0
- data/lib/cubicle/aggregation/aggregation_manager.rb +212 -0
- data/lib/cubicle/aggregation/dsl.rb +108 -0
- data/lib/cubicle/aggregation/map_reduce_helper.rb +55 -0
- data/lib/cubicle/data.rb +29 -84
- data/lib/cubicle/data/hierarchy.rb +55 -0
- data/lib/cubicle/data/level.rb +62 -0
- data/lib/cubicle/data/member.rb +28 -0
- data/lib/cubicle/data/table.rb +56 -0
- data/lib/cubicle/measure.rb +30 -20
- data/lib/cubicle/mongo_mapper/aggregate_plugin.rb +1 -1
- data/lib/cubicle/ordered_hash_with_indifferent_access.rb +27 -0
- data/lib/cubicle/query.rb +21 -194
- data/lib/cubicle/query/dsl.rb +118 -0
- data/lib/cubicle/query/dsl/time_intelligence.rb +89 -0
- data/lib/cubicle/ratio.rb +28 -12
- data/lib/cubicle/version.rb +2 -2
- data/test/cubicle/aggregation/ad_hoc_test.rb +21 -0
- data/test/cubicle/cubicle_aggregation_test.rb +84 -20
- data/test/cubicle/cubicle_query_test.rb +36 -0
- data/test/cubicle/data/data_test.rb +30 -0
- data/test/cubicle/data/level_test.rb +42 -0
- data/test/cubicle/data/member_test.rb +40 -0
- data/test/cubicle/{cubicle_data_test.rb → data/table_test.rb} +50 -50
- data/test/cubicle/duration_test.rb +46 -48
- data/test/cubicle/ordered_hash_with_indifferent_access_test.rb +19 -0
- data/test/cubicles/defect_cubicle.rb +31 -31
- data/test/log/test.log +102066 -0
- metadata +26 -10
- data/lib/cubicle/data_level.rb +0 -60
- data/test/cubicle/cubicle_data_level_test.rb +0 -58
- data/test/cubicle/cubicle_test.rb +0 -85
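The listing shows 0.1.3 breaking the old monolithic lib/cubicle.rb (diffed below) into focused namespaces: the cube-definition DSL under lib/cubicle/aggregation/, the query DSL under lib/cubicle/query/, and result handling under lib/cubicle/data/. As rough orientation only, the sketch below shows the kind of query those DSL methods support. It is hypothetical: DefectCubicle stands in for a cube defined elsewhere (compare data/test/cubicles/defect_cubicle.rb), and the member names and exact DSL signatures are assumptions, not taken from the package source.

    # Hypothetical usage sketch (class, members and signatures are assumed)
    data = DefectCubicle.query do
      select :product, :defect_count   # choose dimensions and measures
      where  :product => "widget-a"    # filter on a dimension member
      limit  10                        # honored via the query's find options
    end
    # In 0.1.2 the equivalent `query` class method (removed below) returned a
    # Cubicle::Data result set and hierarchized it when a `by` clause was used.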
data/lib/cubicle.rb
CHANGED
@@ -1,423 +1,48 @@
-require "rubygems"
-require "active_support"
-require "mongo"
-require "logger"
-
-dir = File.dirname(__FILE__)
-["mongo_environment",
[ lines 8-47 of the removed file are truncated in this diff view ]
-  @transient ||= false
-end
-
-def transient!
-  @transient = true
-end
-
-def expire!
-  collection.drop
-  expire_aggregations!
-end
-
-def aggregations
-  return (@aggregations ||= [])
-end
-
-#DSL
-def source_collection_name(collection_name = nil)
-  return @source_collection = collection_name if collection_name
-  @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
-end
-alias source_collection_name= source_collection_name
-
-def target_collection_name(collection_name = nil)
-  return nil if transient?
-  return @target_name = collection_name if collection_name
-  @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
-end
-alias target_collection_name= target_collection_name
-
-def dimension(*args)
-  dimensions << Cubicle::Dimension.new(*args)
-  dimensions[-1]
-end
-
-def dimension_names
-  return @dimensions.map{|dim|dim.name.to_s}
-end
-
-def dimensions(*args)
-  return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
-  args = args[0] if args.length == 1 && args[0].is_a?(Array)
-  args.each {|dim| dimension dim }
-  @dimensions
-end
-
-def measure(*args)
-  measures << Measure.new(*args)
-  measures[-1]
-end
-
-def measures(*args)
-  return (@measures ||= Cubicle::MemberList.new) if args.length < 1
-  args = args[0] if args.length == 1 && args[0].is_a?(Array)
-  args.each {|m| measure m}
-  @measures
-end
-
-def count(*args)
-  options = args.extract_options!
-  options[:aggregation_method] = :count
-  measure(*(args << options))
-end
-
-def average(*args)
-  options = args.extract_options!
-  options[:aggregation_method] = :average
-  measure(*(args << options))
-  #Averaged fields need a count of non-null values to properly calculate the average
-  args[0] = "#{args[0]}_count".to_sym
-  count *args
-end
-alias avg average
-
-def sum(*args)
-  options = args.extract_options!
-  options[:aggregation_method] = :sum
-  measure(*(args << options))
-end
-
-def duration(*args)
-  options = args.extract_options!
-  options[:in] ||= durations_in
-  args << options
-  measures << (dur = Duration.new(*args))
-  count("#{dur.name}_count".to_sym, :expression=>dur.expression) if dur.aggregation_method == :average
-end
-
-def average_duration(*args)
-  duration(*args)
-end
-alias avg_duration average_duration
-
-def total_duration(*args)
-  options = args.extract_options!
-  options[:aggregation_method] = :sum
-  duration(*(args<<options))
-end
-
-def durations_in(unit_of_time = nil)
-  return (@duration_unit ||= :seconds) unless unit_of_time
-  @duration_unit = unit_of_time.to_s.pluralize.to_sym
-end
-alias :duration_unit :durations_in
-
-
-def ratio(member_name, numerator, denominator)
-  measures << Ratio.new(member_name, numerator, denominator)
-end
-
-def aggregation(*member_list)
-  member_list = member_list[0] if member_list[0].is_a?(Array)
-  aggregations << member_list
-end
-
-def time_dimension(*args)
-  return (@time_dimension ||= nil) unless args.length > 0
-  @time_dimension = dimension(*args)
-end
-alias time_dimension= time_dimension
-alias date time_dimension
-alias time time_dimension
-
-def find_member(member_name)
-  @dimensions[member_name] ||
-    @measures[member_name]
-end
-
-def query(*args,&block)
-  options = args.extract_options!
-  query = Cubicle::Query.new(self)
-  query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
-  query.select(*args) if args.length > 0
-  if block_given?
-    block.arity == 1 ? (yield query) : (query.instance_eval(&block))
-  end
-  query.select_all unless query.selected?
-  return query if options[:defer]
-  results = execute_query(query,options)
-  #return results if results.blank?
-  #If the 'by' clause was used in the the query,
-  #we'll hierarchize by the members indicated,
-  #as the next step would otherwise almost certainly
-  #need to be a call to hierarchize anyway.
-  query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
-end
-
-#noinspection RubyArgCount
-def execute_query(query,options={})
-  count = 0
-
-  find_options = {
-    :limit=>query.limit || 0,
-    :skip=>query.offset || 0
-  }
-
-  find_options[:sort] = prepare_order_by(query)
-  filter = {}
-  if query == self || query.transient?
-    aggregation = aggregate(query,options)
-  else
-    process_if_required
-    aggregation = aggregation_for(query)
-    #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
-    #otherwise, a second map reduce is required to reduce the data set one last time
-    if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
-      filter = prepare_filter(query,options[:where] || {})
-    else
-      aggregation = aggregate(query,:source_collection=>collection.name)
-    end
-  end
-
-  if aggregation.blank?
-    Cubicle::Data.new(query,[],0) if aggregation == []
-  else
-    count = aggregation.count
-    results = aggregation.find(filter,find_options).to_a
-    aggregation.drop if aggregation.name =~ /^tmp.mr.*/
-    Cubicle::Data.new(query, results, count)
-  end
-
-end
-
-def process(options={})
-  Cubicle.logger.info "Processing #{self.name} @ #{Time.now}"
-  start = Time.now
-  expire!
-  aggregate(self,options)
-  #Sort desc by length of array, so that larget
-  #aggregations are processed first, hopefully increasing efficiency
-  #of the processing step
-  aggregations.sort!{|a,b|b.length<=>a.length}
-  aggregations.each do |member_list|
-    agg_start = Time.now
-    aggregation_for(query(:defer=>true){select member_list})
-    Cubicle.logger.info "#{self.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
-  end
-  duration = Time.now - start
-  Cubicle.logger.info "#{self.name} processed @ #{Time.now}in #{duration} seconds."
-end
-
-protected
-
-def aggregation_collection_names
-  database.collection_names.select {|col_name|col_name=~/#{target_collection_name}_aggregation_(.*)/}
-end
-
-def expire_aggregations!
-  aggregation_collection_names.each{|agg_col|database[agg_col].drop}
-end
-
-def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
-  #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
-  #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
-  existing = existing_aggregations.map do |agg_col_name|
-    agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
-  end
-
-  #This will select all the aggregations that contain ALL of the desired dimension names
-  #we are sorting by length because the aggregation with the least number of members
-  #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
-  #this will not always be true, and situations may exist where it is rarely true, however the alternative
-  #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
-  #but until there is some reason to believe the aggregation caching process needs be highly performant,
-  #this should do for now.
-  candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
-
-  #If no suitable aggregation exists to base this one off of,
-  #we'll just use the base cubes aggregation collection
-  return target_collection_name if candidates.blank?
-  "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
-
-end
-
-def aggregation_for(query)
-  return collection if query.all_dimensions?
-
-  aggregation_query = query.clone
-  #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
-  filter = (query.where if query.respond_to?(:where))
-  filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/^\$.*/} unless filter.blank?
-
-  dimension_names = aggregation_query.dimension_names.sort
-  agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"
-
-  unless database.collection_names.include?(agg_col_name)
-    source_col_name = find_best_source_collection(dimension_names)
-    exec_query = query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
-    aggregate(exec_query, :target_collection=>agg_col_name)
-  end
-
-  database[agg_col_name]
-end
-
-def ensure_indexes(collection_name,dimension_names)
-  #an index for each dimension
-  dimension_names.each {|dim|database[collection_name].create_index([dim,Mongo::ASCENDING])}
-  #and a composite
-  database[collection_name].create_index(dimension_names)
-end
-
-def aggregate(query,options={})
-  map, reduce = generate_map_function(query), generate_reduce_function
-  options[:finalize] = generate_finalize_function(query)
-  options["query"] = prepare_filter(query,options[:where] || {})
-
-  query.source_collection_name ||= source_collection_name
-
-  target_collection = options.delete(:target_collection)
-  target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
-
-  options[:out] = target_collection unless target_collection.blank? || query.transient?
-
-  #This is defensive - some tests run without ever initializing any collections
-  return [] unless database.collection_names.include?(query.source_collection_name)
-
-  result = database[query.source_collection_name].map_reduce(map,reduce,options)
-
-  ensure_indexes(target_collection,query.dimension_names) if target_collection
-
-  result
-end
-
-def prepare_filter(query,filter={})
-  filter.merge!(query.where) if query.respond_to?(:where) && query.where
-  filter.stringify_keys!
-  transient = (query.transient? || query == self)
-  filter.keys.each do |key|
-    next if key=~/^\$.*/
-    prefix = nil
-    prefix = "_id" if (member = self.dimensions[key])
-    prefix = "value" if (member = self.measures[key]) unless member
-
-    raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
-
-    filter_value = filter.delete(key)
-    if transient
-      if (member.expression_type == :javascript)
-        filter_name = "$where"
-        filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
-        filter_value = "(#{member.expression})==#{filter_value}"
-      else
-        filter_name = member.expression
-      end
-    else
-      filter_name = "#{prefix}.#{member.name}"
-    end
-    filter[filter_name] = filter_value
-  end
-  filter
-end
-
-def prepare_order_by(query)
-  order_by = []
-  query.order_by.each do |order|
-    prefix = "_id" if (member = self.dimensions[order[0]])
-    prefix = "value" if (member = self.measures[order[0]]) unless member
-    raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
-    order_by << ["#{prefix}.#{order[0]}",order[1]]
-  end
-  order_by
-end
-
-def process_if_required
-  return if database.collection_names.include?(target_collection_name)
-  process
-end
-
-
-def generate_keys_string(query)
-  "{#{query.dimensions.map{|dim|dim.to_js_keys}.flatten.join(", ")}}"
-end
-
-def generate_values_string(query = self)
-  "{#{query.measures.map{|measure|measure.to_js_keys}.flatten.join(", ")}}"
-end
-
-def generate_map_function(query = self)
-  <<MAP
-function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
-MAP
-end
-
-def generate_reduce_function()
-  <<REDUCE
-function(key,values){
-  var output = {};
-  values.forEach(function(doc){
-    for(var key in doc){
-      if (doc[key] || doc[key] == 0){
-        output[key] = output[key] || 0;
-        output[key] += doc[key];
-      }
-    }
-  });
-  return output;
-}
-REDUCE
-end
-
-def generate_finalize_function(query = self)
-  <<FINALIZE
-function(key,value)
-{
-
-#{ (query.measures.select{|m|m.aggregation_method == :average}).map do |m|
-  "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
-end.join("\n")}
-#{ (query.measures.select{|m|m.aggregation_method == :calculation}).map do|m|
-  "value.#{m.name}=#{m.expression};";
-end.join("\n")}
-return value;
-}
-FINALIZE
-end
+require "rubygems"
+require "active_support"
+require "mongo"
+require "logger"
+
+dir = File.dirname(__FILE__)
+["mongo_environment",
+ "ordered_hash_with_indifferent_access",
+ "member",
+ "member_list",
+ "measure",
+ "calculated_measure",
+ "dimension",
+ "ratio",
+ "duration",
+ "query/dsl/time_intelligence",
+ "query/dsl",
+ "query",
+ "data",
+ "data/member",
+ "data/level",
+ "data/hierarchy",
+ "data/table",
+ "aggregation/aggregation_manager",
+ "aggregation/map_reduce_helper",
+ "aggregation/dsl",
+ "aggregation",
+ "aggregation/ad_hoc",
+ "date_time",
+ "support"].each {|lib|require File.join(dir,'cubicle',lib)}
+
+require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
+
+module Cubicle
+
+  def self.register_cubicle_directory(directory_path, recursive=true)
+    searcher = "#{recursive ? "*" : "**/*"}.rb"
+    Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
+  end
+
+  def self.mongo
+    @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
+  end
+
+  def self.logger
+    @logger ||= (Cubicle.mongo.logger || Logger.new("cubicle.log"))
+  end
 end
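After the rewrite, the top-level lib/cubicle.rb is reduced to the require list plus three module-level helpers. Below is a minimal usage sketch of that surface, based only on the added lines above; the directory path is an invented example, not something shipped with the gem.

    require "cubicle"

    # Register cube definitions from a project directory; the path is an
    # invented example, the method comes from the added code above.
    Cubicle.register_cubicle_directory(File.join(File.dirname(__FILE__), "cubicles"))

    # Cubicle.mongo resolves to MongoMapper when it is defined, otherwise to
    # the bundled MongoEnvironment; Cubicle.logger falls back to cubicle.log.
    Cubicle.logger.info "cubicle definitions registered"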