cubicle 0.1.2 → 0.1.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package content as it appears in the respective public registries.
Files changed (37)
  1. data/CHANGELOG.rdoc +14 -0
  2. data/README.rdoc +188 -174
  3. data/cubicle.gemspec +26 -10
  4. data/lib/cubicle.rb +47 -422
  5. data/lib/cubicle/aggregation.rb +58 -7
  6. data/lib/cubicle/aggregation/ad_hoc.rb +12 -0
  7. data/lib/cubicle/aggregation/aggregation_manager.rb +212 -0
  8. data/lib/cubicle/aggregation/dsl.rb +108 -0
  9. data/lib/cubicle/aggregation/map_reduce_helper.rb +55 -0
  10. data/lib/cubicle/data.rb +29 -84
  11. data/lib/cubicle/data/hierarchy.rb +55 -0
  12. data/lib/cubicle/data/level.rb +62 -0
  13. data/lib/cubicle/data/member.rb +28 -0
  14. data/lib/cubicle/data/table.rb +56 -0
  15. data/lib/cubicle/measure.rb +30 -20
  16. data/lib/cubicle/mongo_mapper/aggregate_plugin.rb +1 -1
  17. data/lib/cubicle/ordered_hash_with_indifferent_access.rb +27 -0
  18. data/lib/cubicle/query.rb +21 -194
  19. data/lib/cubicle/query/dsl.rb +118 -0
  20. data/lib/cubicle/query/dsl/time_intelligence.rb +89 -0
  21. data/lib/cubicle/ratio.rb +28 -12
  22. data/lib/cubicle/version.rb +2 -2
  23. data/test/cubicle/aggregation/ad_hoc_test.rb +21 -0
  24. data/test/cubicle/cubicle_aggregation_test.rb +84 -20
  25. data/test/cubicle/cubicle_query_test.rb +36 -0
  26. data/test/cubicle/data/data_test.rb +30 -0
  27. data/test/cubicle/data/level_test.rb +42 -0
  28. data/test/cubicle/data/member_test.rb +40 -0
  29. data/test/cubicle/{cubicle_data_test.rb → data/table_test.rb} +50 -50
  30. data/test/cubicle/duration_test.rb +46 -48
  31. data/test/cubicle/ordered_hash_with_indifferent_access_test.rb +19 -0
  32. data/test/cubicles/defect_cubicle.rb +31 -31
  33. data/test/log/test.log +102066 -0
  34. metadata +26 -10
  35. data/lib/cubicle/data_level.rb +0 -60
  36. data/test/cubicle/cubicle_data_level_test.rb +0 -58
  37. data/test/cubicle/cubicle_test.rb +0 -85
data/lib/cubicle.rb CHANGED
@@ -1,423 +1,48 @@
- require "rubygems"
- require "active_support"
- require "mongo"
- require "logger"
-
- dir = File.dirname(__FILE__)
- ["mongo_environment",
- "member",
- "member_list",
- "measure",
- "calculated_measure",
- "dimension",
- "ratio",
- "duration",
- "query",
- "data_level",
- "data",
- "aggregation",
- "date_time",
- "support"].each {|lib|require File.join(dir,'cubicle',lib)}
-
- require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
-
- module Cubicle
-
- def self.register_cubicle_directory(directory_path, recursive=true)
- searcher = "#{recursive ? "*" : "**/*"}.rb"
- Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
- end
-
- def self.mongo
- @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
- end
-
- def self.logger
- Cubicle.mongo.logger || Logger.new("cubicle.log")
- end
-
- def database
- Cubicle.mongo.database
- end
-
- def collection
- database[target_collection_name]
- end
-
- def transient?
- @transient ||= false
- end
-
- def transient!
- @transient = true
- end
-
- def expire!
- collection.drop
- expire_aggregations!
- end
-
- def aggregations
- return (@aggregations ||= [])
- end
-
- #DSL
- def source_collection_name(collection_name = nil)
- return @source_collection = collection_name if collection_name
- @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
- end
- alias source_collection_name= source_collection_name
-
- def target_collection_name(collection_name = nil)
- return nil if transient?
- return @target_name = collection_name if collection_name
- @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
- end
- alias target_collection_name= target_collection_name
-
- def dimension(*args)
- dimensions << Cubicle::Dimension.new(*args)
- dimensions[-1]
- end
-
- def dimension_names
- return @dimensions.map{|dim|dim.name.to_s}
- end
-
- def dimensions(*args)
- return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
- args = args[0] if args.length == 1 && args[0].is_a?(Array)
- args.each {|dim| dimension dim }
- @dimensions
- end
-
- def measure(*args)
- measures << Measure.new(*args)
- measures[-1]
- end
-
- def measures(*args)
- return (@measures ||= Cubicle::MemberList.new) if args.length < 1
- args = args[0] if args.length == 1 && args[0].is_a?(Array)
- args.each {|m| measure m}
- @measures
- end
-
- def count(*args)
- options = args.extract_options!
- options[:aggregation_method] = :count
- measure(*(args << options))
- end
-
- def average(*args)
- options = args.extract_options!
- options[:aggregation_method] = :average
- measure(*(args << options))
- #Averaged fields need a count of non-null values to properly calculate the average
- args[0] = "#{args[0]}_count".to_sym
- count *args
- end
- alias avg average
-
- def sum(*args)
- options = args.extract_options!
- options[:aggregation_method] = :sum
- measure(*(args << options))
- end
-
- def duration(*args)
- options = args.extract_options!
- options[:in] ||= durations_in
- args << options
- measures << (dur = Duration.new(*args))
- count("#{dur.name}_count".to_sym, :expression=>dur.expression) if dur.aggregation_method == :average
- end
-
- def average_duration(*args)
- duration(*args)
- end
- alias avg_duration average_duration
-
- def total_duration(*args)
- options = args.extract_options!
- options[:aggregation_method] = :sum
- duration(*(args<<options))
- end
-
- def durations_in(unit_of_time = nil)
- return (@duration_unit ||= :seconds) unless unit_of_time
- @duration_unit = unit_of_time.to_s.pluralize.to_sym
- end
- alias :duration_unit :durations_in
-
-
- def ratio(member_name, numerator, denominator)
- measures << Ratio.new(member_name, numerator, denominator)
- end
-
- def aggregation(*member_list)
- member_list = member_list[0] if member_list[0].is_a?(Array)
- aggregations << member_list
- end
-
- def time_dimension(*args)
- return (@time_dimension ||= nil) unless args.length > 0
- @time_dimension = dimension(*args)
- end
- alias time_dimension= time_dimension
- alias date time_dimension
- alias time time_dimension
-
- def find_member(member_name)
- @dimensions[member_name] ||
- @measures[member_name]
- end
-
- def query(*args,&block)
- options = args.extract_options!
- query = Cubicle::Query.new(self)
- query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
- query.select(*args) if args.length > 0
- if block_given?
- block.arity == 1 ? (yield query) : (query.instance_eval(&block))
- end
- query.select_all unless query.selected?
- return query if options[:defer]
- results = execute_query(query,options)
- #return results if results.blank?
- #If the 'by' clause was used in the the query,
- #we'll hierarchize by the members indicated,
- #as the next step would otherwise almost certainly
- #need to be a call to hierarchize anyway.
- query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
- end
-
- #noinspection RubyArgCount
- def execute_query(query,options={})
- count = 0
-
- find_options = {
- :limit=>query.limit || 0,
- :skip=>query.offset || 0
- }
-
- find_options[:sort] = prepare_order_by(query)
- filter = {}
- if query == self || query.transient?
- aggregation = aggregate(query,options)
- else
- process_if_required
- aggregation = aggregation_for(query)
- #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
- #otherwise, a second map reduce is required to reduce the data set one last time
- if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
- filter = prepare_filter(query,options[:where] || {})
- else
- aggregation = aggregate(query,:source_collection=>collection.name)
- end
- end
-
- if aggregation.blank?
- Cubicle::Data.new(query,[],0) if aggregation == []
- else
- count = aggregation.count
- results = aggregation.find(filter,find_options).to_a
- aggregation.drop if aggregation.name =~ /^tmp.mr.*/
- Cubicle::Data.new(query, results, count)
- end
-
- end
-
- def process(options={})
- Cubicle.logger.info "Processing #{self.name} @ #{Time.now}"
- start = Time.now
- expire!
- aggregate(self,options)
- #Sort desc by length of array, so that larget
- #aggregations are processed first, hopefully increasing efficiency
- #of the processing step
- aggregations.sort!{|a,b|b.length<=>a.length}
- aggregations.each do |member_list|
- agg_start = Time.now
- aggregation_for(query(:defer=>true){select member_list})
- Cubicle.logger.info "#{self.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
- end
- duration = Time.now - start
- Cubicle.logger.info "#{self.name} processed @ #{Time.now}in #{duration} seconds."
- end
-
- protected
-
- def aggregation_collection_names
- database.collection_names.select {|col_name|col_name=~/#{target_collection_name}_aggregation_(.*)/}
- end
-
- def expire_aggregations!
- aggregation_collection_names.each{|agg_col|database[agg_col].drop}
- end
-
- def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
- #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
- #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
- existing = existing_aggregations.map do |agg_col_name|
- agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
- end
-
- #This will select all the aggregations that contain ALL of the desired dimension names
- #we are sorting by length because the aggregation with the least number of members
- #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
- #this will not always be true, and situations may exist where it is rarely true, however the alternative
- #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
- #but until there is some reason to believe the aggregation caching process needs be highly performant,
- #this should do for now.
- candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
-
- #If no suitable aggregation exists to base this one off of,
- #we'll just use the base cubes aggregation collection
- return target_collection_name if candidates.blank?
- "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
-
- end
-
- def aggregation_for(query)
- return collection if query.all_dimensions?
-
- aggregation_query = query.clone
- #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
- filter = (query.where if query.respond_to?(:where))
- filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/^\$.*/} unless filter.blank?
-
- dimension_names = aggregation_query.dimension_names.sort
- agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"
-
- unless database.collection_names.include?(agg_col_name)
- source_col_name = find_best_source_collection(dimension_names)
- exec_query = query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
- aggregate(exec_query, :target_collection=>agg_col_name)
- end
-
- database[agg_col_name]
- end
-
- def ensure_indexes(collection_name,dimension_names)
- #an index for each dimension
- dimension_names.each {|dim|database[collection_name].create_index([dim,Mongo::ASCENDING])}
- #and a composite
- database[collection_name].create_index(dimension_names)
- end
-
- def aggregate(query,options={})
- map, reduce = generate_map_function(query), generate_reduce_function
- options[:finalize] = generate_finalize_function(query)
- options["query"] = prepare_filter(query,options[:where] || {})
-
- query.source_collection_name ||= source_collection_name
-
- target_collection = options.delete(:target_collection)
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
-
- options[:out] = target_collection unless target_collection.blank? || query.transient?
-
- #This is defensive - some tests run without ever initializing any collections
- return [] unless database.collection_names.include?(query.source_collection_name)
-
- result = database[query.source_collection_name].map_reduce(map,reduce,options)
-
- ensure_indexes(target_collection,query.dimension_names) if target_collection
-
- result
- end
-
- def prepare_filter(query,filter={})
- filter.merge!(query.where) if query.respond_to?(:where) && query.where
- filter.stringify_keys!
- transient = (query.transient? || query == self)
- filter.keys.each do |key|
- next if key=~/^\$.*/
- prefix = nil
- prefix = "_id" if (member = self.dimensions[key])
- prefix = "value" if (member = self.measures[key]) unless member
-
- raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
-
- filter_value = filter.delete(key)
- if transient
- if (member.expression_type == :javascript)
- filter_name = "$where"
- filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
- filter_value = "(#{member.expression})==#{filter_value}"
- else
- filter_name = member.expression
- end
- else
- filter_name = "#{prefix}.#{member.name}"
- end
- filter[filter_name] = filter_value
- end
- filter
- end
-
- def prepare_order_by(query)
- order_by = []
- query.order_by.each do |order|
- prefix = "_id" if (member = self.dimensions[order[0]])
- prefix = "value" if (member = self.measures[order[0]]) unless member
- raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
- order_by << ["#{prefix}.#{order[0]}",order[1]]
- end
- order_by
- end
-
- def process_if_required
- return if database.collection_names.include?(target_collection_name)
- process
- end
-
-
- def generate_keys_string(query)
- "{#{query.dimensions.map{|dim|dim.to_js_keys}.flatten.join(", ")}}"
- end
-
- def generate_values_string(query = self)
- "{#{query.measures.map{|measure|measure.to_js_keys}.flatten.join(", ")}}"
- end
-
- def generate_map_function(query = self)
- <<MAP
- function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
- MAP
- end
-
- def generate_reduce_function()
- <<REDUCE
- function(key,values){
- var output = {};
- values.forEach(function(doc){
- for(var key in doc){
- if (doc[key] || doc[key] == 0){
- output[key] = output[key] || 0;
- output[key] += doc[key];
- }
- }
- });
- return output;
- }
- REDUCE
- end
-
- def generate_finalize_function(query = self)
- <<FINALIZE
- function(key,value)
- {
-
- #{ (query.measures.select{|m|m.aggregation_method == :average}).map do |m|
- "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
- end.join("\n")}
- #{ (query.measures.select{|m|m.aggregation_method == :calculation}).map do|m|
- "value.#{m.name}=#{m.expression};";
- end.join("\n")}
- return value;
- }
- FINALIZE
- end
+ require "rubygems"
+ require "active_support"
+ require "mongo"
+ require "logger"
+
+ dir = File.dirname(__FILE__)
+ ["mongo_environment",
+ "ordered_hash_with_indifferent_access",
+ "member",
+ "member_list",
+ "measure",
+ "calculated_measure",
+ "dimension",
+ "ratio",
+ "duration",
+ "query/dsl/time_intelligence",
+ "query/dsl",
+ "query",
+ "data",
+ "data/member",
+ "data/level",
+ "data/hierarchy",
+ "data/table",
+ "aggregation/aggregation_manager",
+ "aggregation/map_reduce_helper",
+ "aggregation/dsl",
+ "aggregation",
+ "aggregation/ad_hoc",
+ "date_time",
+ "support"].each {|lib|require File.join(dir,'cubicle',lib)}
+
+ require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
+
+ module Cubicle
+
+ def self.register_cubicle_directory(directory_path, recursive=true)
+ searcher = "#{recursive ? "*" : "**/*"}.rb"
+ Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
+ end
+
+ def self.mongo
+ @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
+ end
+
+ def self.logger
+ @logger ||= (Cubicle.mongo.logger || Logger.new("cubicle.log"))
+ end
  end