cubicle 0.1.2 → 0.1.3

Files changed (37)
  1. data/CHANGELOG.rdoc +14 -0
  2. data/README.rdoc +188 -174
  3. data/cubicle.gemspec +26 -10
  4. data/lib/cubicle.rb +47 -422
  5. data/lib/cubicle/aggregation.rb +58 -7
  6. data/lib/cubicle/aggregation/ad_hoc.rb +12 -0
  7. data/lib/cubicle/aggregation/aggregation_manager.rb +212 -0
  8. data/lib/cubicle/aggregation/dsl.rb +108 -0
  9. data/lib/cubicle/aggregation/map_reduce_helper.rb +55 -0
  10. data/lib/cubicle/data.rb +29 -84
  11. data/lib/cubicle/data/hierarchy.rb +55 -0
  12. data/lib/cubicle/data/level.rb +62 -0
  13. data/lib/cubicle/data/member.rb +28 -0
  14. data/lib/cubicle/data/table.rb +56 -0
  15. data/lib/cubicle/measure.rb +30 -20
  16. data/lib/cubicle/mongo_mapper/aggregate_plugin.rb +1 -1
  17. data/lib/cubicle/ordered_hash_with_indifferent_access.rb +27 -0
  18. data/lib/cubicle/query.rb +21 -194
  19. data/lib/cubicle/query/dsl.rb +118 -0
  20. data/lib/cubicle/query/dsl/time_intelligence.rb +89 -0
  21. data/lib/cubicle/ratio.rb +28 -12
  22. data/lib/cubicle/version.rb +2 -2
  23. data/test/cubicle/aggregation/ad_hoc_test.rb +21 -0
  24. data/test/cubicle/cubicle_aggregation_test.rb +84 -20
  25. data/test/cubicle/cubicle_query_test.rb +36 -0
  26. data/test/cubicle/data/data_test.rb +30 -0
  27. data/test/cubicle/data/level_test.rb +42 -0
  28. data/test/cubicle/data/member_test.rb +40 -0
  29. data/test/cubicle/{cubicle_data_test.rb → data/table_test.rb} +50 -50
  30. data/test/cubicle/duration_test.rb +46 -48
  31. data/test/cubicle/ordered_hash_with_indifferent_access_test.rb +19 -0
  32. data/test/cubicles/defect_cubicle.rb +31 -31
  33. data/test/log/test.log +102066 -0
  34. metadata +26 -10
  35. data/lib/cubicle/data_level.rb +0 -60
  36. data/test/cubicle/cubicle_data_level_test.rb +0 -58
  37. data/test/cubicle/cubicle_test.rb +0 -85
data/lib/cubicle.rb CHANGED
@@ -1,423 +1,48 @@
- require "rubygems"
- require "active_support"
- require "mongo"
- require "logger"
-
- dir = File.dirname(__FILE__)
- ["mongo_environment",
- "member",
- "member_list",
- "measure",
- "calculated_measure",
- "dimension",
- "ratio",
- "duration",
- "query",
- "data_level",
- "data",
- "aggregation",
- "date_time",
- "support"].each {|lib|require File.join(dir,'cubicle',lib)}
-
- require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
-
- module Cubicle
-
- def self.register_cubicle_directory(directory_path, recursive=true)
- searcher = "#{recursive ? "*" : "**/*"}.rb"
- Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
- end
-
- def self.mongo
- @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
- end
-
- def self.logger
- Cubicle.mongo.logger || Logger.new("cubicle.log")
- end
-
- def database
- Cubicle.mongo.database
- end
-
- def collection
- database[target_collection_name]
- end
-
- def transient?
- @transient ||= false
- end
-
- def transient!
- @transient = true
- end
-
- def expire!
- collection.drop
- expire_aggregations!
- end
-
- def aggregations
- return (@aggregations ||= [])
- end
-
- #DSL
- def source_collection_name(collection_name = nil)
- return @source_collection = collection_name if collection_name
- @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
- end
- alias source_collection_name= source_collection_name
-
- def target_collection_name(collection_name = nil)
- return nil if transient?
- return @target_name = collection_name if collection_name
- @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
- end
- alias target_collection_name= target_collection_name
-
- def dimension(*args)
- dimensions << Cubicle::Dimension.new(*args)
- dimensions[-1]
- end
-
- def dimension_names
- return @dimensions.map{|dim|dim.name.to_s}
- end
-
- def dimensions(*args)
- return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
- args = args[0] if args.length == 1 && args[0].is_a?(Array)
- args.each {|dim| dimension dim }
- @dimensions
- end
-
- def measure(*args)
- measures << Measure.new(*args)
- measures[-1]
- end
-
- def measures(*args)
- return (@measures ||= Cubicle::MemberList.new) if args.length < 1
- args = args[0] if args.length == 1 && args[0].is_a?(Array)
- args.each {|m| measure m}
- @measures
- end
-
- def count(*args)
- options = args.extract_options!
- options[:aggregation_method] = :count
- measure(*(args << options))
- end
-
- def average(*args)
- options = args.extract_options!
- options[:aggregation_method] = :average
- measure(*(args << options))
- #Averaged fields need a count of non-null values to properly calculate the average
- args[0] = "#{args[0]}_count".to_sym
- count *args
- end
- alias avg average
-
- def sum(*args)
- options = args.extract_options!
- options[:aggregation_method] = :sum
- measure(*(args << options))
- end
-
- def duration(*args)
- options = args.extract_options!
- options[:in] ||= durations_in
- args << options
- measures << (dur = Duration.new(*args))
- count("#{dur.name}_count".to_sym, :expression=>dur.expression) if dur.aggregation_method == :average
- end
-
- def average_duration(*args)
- duration(*args)
- end
- alias avg_duration average_duration
-
- def total_duration(*args)
- options = args.extract_options!
- options[:aggregation_method] = :sum
- duration(*(args<<options))
- end
-
- def durations_in(unit_of_time = nil)
- return (@duration_unit ||= :seconds) unless unit_of_time
- @duration_unit = unit_of_time.to_s.pluralize.to_sym
- end
- alias :duration_unit :durations_in
-
-
- def ratio(member_name, numerator, denominator)
- measures << Ratio.new(member_name, numerator, denominator)
- end
-
- def aggregation(*member_list)
- member_list = member_list[0] if member_list[0].is_a?(Array)
- aggregations << member_list
- end
-
- def time_dimension(*args)
- return (@time_dimension ||= nil) unless args.length > 0
- @time_dimension = dimension(*args)
- end
- alias time_dimension= time_dimension
- alias date time_dimension
- alias time time_dimension
-
- def find_member(member_name)
- @dimensions[member_name] ||
- @measures[member_name]
- end
-
- def query(*args,&block)
- options = args.extract_options!
- query = Cubicle::Query.new(self)
- query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
- query.select(*args) if args.length > 0
- if block_given?
- block.arity == 1 ? (yield query) : (query.instance_eval(&block))
- end
- query.select_all unless query.selected?
- return query if options[:defer]
- results = execute_query(query,options)
- #return results if results.blank?
- #If the 'by' clause was used in the the query,
- #we'll hierarchize by the members indicated,
- #as the next step would otherwise almost certainly
- #need to be a call to hierarchize anyway.
- query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
- end
-
- #noinspection RubyArgCount
- def execute_query(query,options={})
- count = 0
-
- find_options = {
- :limit=>query.limit || 0,
- :skip=>query.offset || 0
- }
-
- find_options[:sort] = prepare_order_by(query)
- filter = {}
- if query == self || query.transient?
- aggregation = aggregate(query,options)
- else
- process_if_required
- aggregation = aggregation_for(query)
- #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
- #otherwise, a second map reduce is required to reduce the data set one last time
- if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
- filter = prepare_filter(query,options[:where] || {})
- else
- aggregation = aggregate(query,:source_collection=>collection.name)
- end
- end
-
- if aggregation.blank?
- Cubicle::Data.new(query,[],0) if aggregation == []
- else
- count = aggregation.count
- results = aggregation.find(filter,find_options).to_a
- aggregation.drop if aggregation.name =~ /^tmp.mr.*/
- Cubicle::Data.new(query, results, count)
- end
-
- end
-
- def process(options={})
- Cubicle.logger.info "Processing #{self.name} @ #{Time.now}"
- start = Time.now
- expire!
- aggregate(self,options)
- #Sort desc by length of array, so that larget
- #aggregations are processed first, hopefully increasing efficiency
- #of the processing step
- aggregations.sort!{|a,b|b.length<=>a.length}
- aggregations.each do |member_list|
- agg_start = Time.now
- aggregation_for(query(:defer=>true){select member_list})
- Cubicle.logger.info "#{self.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
- end
- duration = Time.now - start
- Cubicle.logger.info "#{self.name} processed @ #{Time.now}in #{duration} seconds."
- end
-
- protected
-
- def aggregation_collection_names
- database.collection_names.select {|col_name|col_name=~/#{target_collection_name}_aggregation_(.*)/}
- end
-
- def expire_aggregations!
- aggregation_collection_names.each{|agg_col|database[agg_col].drop}
- end
-
- def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
- #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
- #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
- existing = existing_aggregations.map do |agg_col_name|
- agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
- end
-
- #This will select all the aggregations that contain ALL of the desired dimension names
- #we are sorting by length because the aggregation with the least number of members
- #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
- #this will not always be true, and situations may exist where it is rarely true, however the alternative
- #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
- #but until there is some reason to believe the aggregation caching process needs be highly performant,
- #this should do for now.
- candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
-
- #If no suitable aggregation exists to base this one off of,
- #we'll just use the base cubes aggregation collection
- return target_collection_name if candidates.blank?
- "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
-
- end
-
- def aggregation_for(query)
- return collection if query.all_dimensions?
-
- aggregation_query = query.clone
- #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
- filter = (query.where if query.respond_to?(:where))
- filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/^\$.*/} unless filter.blank?
-
- dimension_names = aggregation_query.dimension_names.sort
- agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"
-
- unless database.collection_names.include?(agg_col_name)
- source_col_name = find_best_source_collection(dimension_names)
- exec_query = query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
- aggregate(exec_query, :target_collection=>agg_col_name)
- end
-
- database[agg_col_name]
- end
-
- def ensure_indexes(collection_name,dimension_names)
- #an index for each dimension
- dimension_names.each {|dim|database[collection_name].create_index([dim,Mongo::ASCENDING])}
- #and a composite
- database[collection_name].create_index(dimension_names)
- end
-
- def aggregate(query,options={})
- map, reduce = generate_map_function(query), generate_reduce_function
- options[:finalize] = generate_finalize_function(query)
- options["query"] = prepare_filter(query,options[:where] || {})
-
- query.source_collection_name ||= source_collection_name
-
- target_collection = options.delete(:target_collection)
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
-
- options[:out] = target_collection unless target_collection.blank? || query.transient?
-
- #This is defensive - some tests run without ever initializing any collections
- return [] unless database.collection_names.include?(query.source_collection_name)
-
- result = database[query.source_collection_name].map_reduce(map,reduce,options)
-
- ensure_indexes(target_collection,query.dimension_names) if target_collection
-
- result
- end
-
- def prepare_filter(query,filter={})
- filter.merge!(query.where) if query.respond_to?(:where) && query.where
- filter.stringify_keys!
- transient = (query.transient? || query == self)
- filter.keys.each do |key|
- next if key=~/^\$.*/
- prefix = nil
- prefix = "_id" if (member = self.dimensions[key])
- prefix = "value" if (member = self.measures[key]) unless member
-
- raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
-
- filter_value = filter.delete(key)
- if transient
- if (member.expression_type == :javascript)
- filter_name = "$where"
- filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
- filter_value = "(#{member.expression})==#{filter_value}"
- else
- filter_name = member.expression
- end
- else
- filter_name = "#{prefix}.#{member.name}"
- end
- filter[filter_name] = filter_value
- end
- filter
- end
-
- def prepare_order_by(query)
- order_by = []
- query.order_by.each do |order|
- prefix = "_id" if (member = self.dimensions[order[0]])
- prefix = "value" if (member = self.measures[order[0]]) unless member
- raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
- order_by << ["#{prefix}.#{order[0]}",order[1]]
- end
- order_by
- end
-
- def process_if_required
- return if database.collection_names.include?(target_collection_name)
- process
- end
-
-
- def generate_keys_string(query)
- "{#{query.dimensions.map{|dim|dim.to_js_keys}.flatten.join(", ")}}"
- end
-
- def generate_values_string(query = self)
- "{#{query.measures.map{|measure|measure.to_js_keys}.flatten.join(", ")}}"
- end
-
- def generate_map_function(query = self)
- <<MAP
- function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
- MAP
- end
-
- def generate_reduce_function()
- <<REDUCE
- function(key,values){
- var output = {};
- values.forEach(function(doc){
- for(var key in doc){
- if (doc[key] || doc[key] == 0){
- output[key] = output[key] || 0;
- output[key] += doc[key];
- }
- }
- });
- return output;
- }
- REDUCE
- end
-
- def generate_finalize_function(query = self)
- <<FINALIZE
- function(key,value)
- {
-
- #{ (query.measures.select{|m|m.aggregation_method == :average}).map do |m|
- "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
- end.join("\n")}
- #{ (query.measures.select{|m|m.aggregation_method == :calculation}).map do|m|
- "value.#{m.name}=#{m.expression};";
- end.join("\n")}
- return value;
- }
- FINALIZE
- end
+ require "rubygems"
+ require "active_support"
+ require "mongo"
+ require "logger"
+
+ dir = File.dirname(__FILE__)
+ ["mongo_environment",
+ "ordered_hash_with_indifferent_access",
+ "member",
+ "member_list",
+ "measure",
+ "calculated_measure",
+ "dimension",
+ "ratio",
+ "duration",
+ "query/dsl/time_intelligence",
+ "query/dsl",
+ "query",
+ "data",
+ "data/member",
+ "data/level",
+ "data/hierarchy",
+ "data/table",
+ "aggregation/aggregation_manager",
+ "aggregation/map_reduce_helper",
+ "aggregation/dsl",
+ "aggregation",
+ "aggregation/ad_hoc",
+ "date_time",
+ "support"].each {|lib|require File.join(dir,'cubicle',lib)}
+
+ require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
+
+ module Cubicle
+
+ def self.register_cubicle_directory(directory_path, recursive=true)
+ searcher = "#{recursive ? "*" : "**/*"}.rb"
+ Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
+ end
+
+ def self.mongo
+ @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
+ end
+
+ def self.logger
+ @logger ||= (Cubicle.mongo.logger || Logger.new("cubicle.log"))
+ end
  end
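
For context, the slimmed-down cubicle.rb above keeps only the library loader and the module-level helpers (register_cubicle_directory, Cubicle.mongo, Cubicle.logger); the DSL, query, data, and aggregation logic moves into the new files listed in the change set. A minimal boot-time sketch based on those helpers, assuming a hypothetical app/cubicles directory holding the cubicle definitions:

  require "cubicle"

  # Require every cubicle definition found under the (hypothetical) app/cubicles
  # directory; the second argument is the recursive flag declared by
  # register_cubicle_directory in the diff above.
  Cubicle.register_cubicle_directory(File.join(File.dirname(__FILE__), "app", "cubicles"), true)

  # Per the new cubicle.rb, Cubicle.mongo resolves to MongoMapper when
  # MongoMapper::Document is defined and to Cubicle's own MongoEnvironment
  # otherwise; Cubicle.logger now memoizes that logger, falling back to a
  # local "cubicle.log" file.
  Cubicle.logger.info "cubicle definitions loaded"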