cubicle 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. data/CHANGELOG.rdoc +14 -0
  2. data/README.rdoc +188 -174
  3. data/cubicle.gemspec +26 -10
  4. data/lib/cubicle.rb +47 -422
  5. data/lib/cubicle/aggregation.rb +58 -7
  6. data/lib/cubicle/aggregation/ad_hoc.rb +12 -0
  7. data/lib/cubicle/aggregation/aggregation_manager.rb +212 -0
  8. data/lib/cubicle/aggregation/dsl.rb +108 -0
  9. data/lib/cubicle/aggregation/map_reduce_helper.rb +55 -0
  10. data/lib/cubicle/data.rb +29 -84
  11. data/lib/cubicle/data/hierarchy.rb +55 -0
  12. data/lib/cubicle/data/level.rb +62 -0
  13. data/lib/cubicle/data/member.rb +28 -0
  14. data/lib/cubicle/data/table.rb +56 -0
  15. data/lib/cubicle/measure.rb +30 -20
  16. data/lib/cubicle/mongo_mapper/aggregate_plugin.rb +1 -1
  17. data/lib/cubicle/ordered_hash_with_indifferent_access.rb +27 -0
  18. data/lib/cubicle/query.rb +21 -194
  19. data/lib/cubicle/query/dsl.rb +118 -0
  20. data/lib/cubicle/query/dsl/time_intelligence.rb +89 -0
  21. data/lib/cubicle/ratio.rb +28 -12
  22. data/lib/cubicle/version.rb +2 -2
  23. data/test/cubicle/aggregation/ad_hoc_test.rb +21 -0
  24. data/test/cubicle/cubicle_aggregation_test.rb +84 -20
  25. data/test/cubicle/cubicle_query_test.rb +36 -0
  26. data/test/cubicle/data/data_test.rb +30 -0
  27. data/test/cubicle/data/level_test.rb +42 -0
  28. data/test/cubicle/data/member_test.rb +40 -0
  29. data/test/cubicle/{cubicle_data_test.rb → data/table_test.rb} +50 -50
  30. data/test/cubicle/duration_test.rb +46 -48
  31. data/test/cubicle/ordered_hash_with_indifferent_access_test.rb +19 -0
  32. data/test/cubicles/defect_cubicle.rb +31 -31
  33. data/test/log/test.log +102066 -0
  34. metadata +26 -10
  35. data/lib/cubicle/data_level.rb +0 -60
  36. data/test/cubicle/cubicle_data_level_test.rb +0 -58
  37. data/test/cubicle/cubicle_test.rb +0 -85
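
The file list shows the monolithic lib/cubicle.rb of 0.1.2 (-422 lines) being split into an Aggregation DSL, an AggregationManager, and dedicated Data classes. For orientation before the hunks below, here is a minimal, hypothetical usage sketch inferred from the new Aggregation::Dsl and AdHoc code in this diff; it assumes a configured MongoDB connection, and the collection and field names ('defects', :status, :severity, :fix_time) are illustrative only, not taken from the gem:

  require 'cubicle'

  # Transient, ad hoc aggregation against a source collection, built with the
  # new Cubicle::Aggregation::AdHoc class and the Aggregation::Dsl methods.
  defects = Cubicle::Aggregation::AdHoc.new('defects') do
    dimension :status
    dimension :severity
    count     :defect_count
    avg       :fix_time   # Dsl#average also adds a backing :fix_time_count measure
  end

  # Aggregation#query instance_evals a zero-arity block against a Cubicle::Query;
  # pass :defer => true to get the query object back instead of executing it.
  table = defects.query { select :status, :defect_count }

A named, persistent cubicle uses the same DSL, with the AggregationManager (see the aggregation_manager.rb hunk) caching pre-computed results in a "<name>_cubicle" collection.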
data/lib/cubicle/aggregation.rb
@@ -1,10 +1,61 @@
 module Cubicle
-  class Aggregation
-    include Cubicle
-    def initialize(source_collection,&block)
-      transient!
-      source_collection_name source_collection
-      instance_eval(&block) if block_given?
+  module Aggregation
+    include Dsl
+
+    def aggregator
+      @aggregator ||= AggregationManager.new(self)
+    end
+
+    def transient?
+      @transient ||= false
+    end
+
+    def transient!
+      @transient = true
+    end
+
+    def expire!
+      aggregator.expire!
+    end
+
+    def process(*args)
+      aggregator.process(*args)
+    end
+
+    def aggregations
+      return (@aggregations ||= [])
+    end
+
+    def dimension_names
+      return @dimensions.map{|dim|dim.name.to_s}
+    end
+
+    def find_member(member_name)
+      @dimensions[member_name] ||
+        @measures[member_name]
+    end
+
+    def query(*args,&block)
+      options = args.extract_options!
+      query = Cubicle::Query.new(self)
+      query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
+      query.select(*args) if args.length > 0
+      if block_given?
+        block.arity == 1 ? (yield query) : (query.instance_eval(&block))
+      end
+      query.select_all unless query.selected?
+      return query if options[:defer]
+      results = execute_query(query,options)
+      #return results if results.blank?
+      #If the 'by' clause was used in the the query,
+      #we'll hierarchize by the members indicated,
+      #as the next step would otherwise almost certainly
+      #need to be a call to hierarchize anyway.
+      query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
+    end
+
+    def execute_query(query,options)
+      aggregator.execute_query(query,options)
     end
   end
-end
+end
data/lib/cubicle/aggregation/ad_hoc.rb
@@ -0,0 +1,12 @@
+module Cubicle
+  module Aggregation
+    class AdHoc
+      include Cubicle::Aggregation
+      def initialize(source_collection,&block)
+        transient!
+        source_collection_name source_collection
+        instance_eval(&block) if block_given?
+      end
+    end
+  end
+end
data/lib/cubicle/aggregation/aggregation_manager.rb
@@ -0,0 +1,212 @@
+module Cubicle
+  module Aggregation
+    class AggregationManager
+
+      attr_reader :aggregation
+
+      def initialize(aggregation)
+        @aggregation = aggregation
+      end
+
+      def database
+        Cubicle.mongo.database
+      end
+
+      def collection
+        database[aggregation.target_collection_name]
+      end
+
+      def target_collection_name
+        aggregation.target_collection_name
+      end
+
+
+      #noinspection RubyArgCount
+      def execute_query(query,options={})
+        count = 0
+
+        find_options = {
+          :limit=>query.limit || 0,
+          :skip=>query.offset || 0
+        }
+
+        find_options[:sort] = prepare_order_by(query)
+        filter = {}
+        if query == aggregation || query.transient?
+          aggregation = aggregate(query,options)
+        else
+          process_if_required
+          aggregation = aggregation_for(query)
+          #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
+          #otherwise, a second map reduce is required to reduce the data set one last time
+          if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
+            filter = prepare_filter(query,options[:where] || {})
+          else
+            aggregation = aggregate(query,:source_collection=>collection.name)
+          end
+        end
+
+        if aggregation.blank?
+          Cubicle::Data::Table.new(query,[],0) if aggregation == []
+        else
+          count = aggregation.count
+          results = aggregation.find(filter,find_options).to_a
+          aggregation.drop if aggregation.name =~ /^tmp.mr.*/
+          Cubicle::Data::Table.new(query, results, count)
+        end
+
+      end
+
+      def process(options={})
+        Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
+        start = Time.now
+        expire!
+        aggregate(aggregation,options)
+        #Sort desc by length of array, so that larget
+        #aggregations are processed first, hopefully increasing efficiency
+        #of the processing step
+        aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
+        aggregation.aggregations.each do |member_list|
+          agg_start = Time.now
+          aggregation_for(aggregation.query(:defer=>true){select member_list})
+          Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
+        end
+        duration = Time.now - start
+        Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now}in #{duration} seconds."
+      end
+
+      def expire!
+        collection.drop
+        expire_aggregations!
+      end
+
+      protected
+
+      def aggregation_collection_names
+        database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
+      end
+
+      def expire_aggregations!
+        aggregation_collection_names.each{|agg_col|database[agg_col].drop}
+      end
+
+      def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
+        #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
+        #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
+        existing = existing_aggregations.map do |agg_col_name|
+          agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
+        end
+
+        #This will select all the aggregations that contain ALL of the desired dimension names
+        #we are sorting by length because the aggregation with the least number of members
+        #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
+        #this will not always be true, and situations may exist where it is rarely true, however the alternative
+        #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
+        #but until there is some reason to believe the aggregation caching process needs be highly performant,
+        #this should do for now.
+        candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}

+        #If no suitable aggregation exists to base this one off of,
+        #we'll just use the base cubes aggregation collection
+        return target_collection_name if candidates.blank?
+        "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
+
+      end
+
+      def aggregation_for(query)
+        return collection if query.all_dimensions?
+
+        aggregation_query = query.clone
+        #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
+        filter = (query.where if query.respond_to?(:where))
+        filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
+
+        dimension_names = aggregation_query.dimension_names.sort
+        agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
+
+        unless database.collection_names.include?(agg_col_name)
+          source_col_name = find_best_source_collection(dimension_names)
+          exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
+          aggregate(exec_query, :target_collection=>agg_col_name)
+        end
+
+        database[agg_col_name]
+      end
+
+      def ensure_indexes(collection_name,dimension_names)
+        col = database[collection_name]
+        #an index for each dimension
+        dimension_names.each {|dim|col.create_index([dim,Mongo::ASCENDING])}
+        #and a composite
+        col.create_index(dimension_names)
+      end
+
+      def aggregate(query,options={})
+        map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
+        options[:finalize] = MapReduceHelper.generate_finalize_function(query)
+        options["query"] = prepare_filter(query,options[:where] || {})
+
+        query.source_collection_name ||= aggregation.source_collection_name
+
+        target_collection = options.delete(:target_collection)
+        target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
+
+        options[:out] = target_collection unless target_collection.blank? || query.transient?
+
+        #This is defensive - some tests run without ever initializing any collections
+        return [] unless database.collection_names.include?(query.source_collection_name)
+
+        result = database[query.source_collection_name].map_reduce(map,reduce,options)
+
+        ensure_indexes(target_collection,query.dimension_names) if target_collection
+
+        result
+      end
+
+      def prepare_filter(query,filter={})
+        filter.merge!(query.where) if query.respond_to?(:where) && query.where
+        filter.stringify_keys!
+        transient = (query.transient? || query == aggregation)
+        filter.keys.each do |key|
+          next if key=~/^\$.*/
+          prefix = nil
+          prefix = "_id" if (member = aggregation.dimensions[key])
+          prefix = "value" if (member = aggregation.measures[key]) unless member
+
+          raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member
+
+          filter_value = filter.delete(key)
+          if transient
+            if (member.expression_type == :javascript)
+              filter_name = "$where"
+              filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
+              filter_value = "(#{member.expression})==#{filter_value}"
+            else
+              filter_name = member.expression
+            end
+          else
+            filter_name = "#{prefix}.#{member.name}"
+          end
+          filter[filter_name] = filter_value
+        end
+        filter
+      end
+
+      def prepare_order_by(query)
+        order_by = []
+        query.order_by.each do |order|
+          prefix = "_id" if (member = aggregation.dimensions[order[0]])
+          prefix = "value" if (member = aggregation.measures[order[0]]) unless member
+          raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{key}" unless member
+          order_by << ["#{prefix}.#{order[0]}",order[1]]
+        end
+        order_by
+      end
+
+      def process_if_required
+        return if database.collection_names.include?(target_collection_name)
+        process
+      end
+    end
+  end
+end
data/lib/cubicle/aggregation/dsl.rb
@@ -0,0 +1,108 @@
+module Cubicle
+  module Aggregation
+    module Dsl
+
+      def source_collection_name(collection_name = nil)
+        return @source_collection = collection_name if collection_name
+        @source_collection ||= name.chomp("Cubicle").chomp("Cube").chomp("Aggregation").underscore.pluralize
+      end
+      alias source_collection_name= source_collection_name
+
+      def target_collection_name(collection_name = nil)
+        return nil if transient?
+        return @target_name = collection_name if collection_name
+        @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
+      end
+      alias target_collection_name= target_collection_name
+
+      def dimension(*args)
+        dimensions << Cubicle::Dimension.new(*args)
+        dimensions[-1]
+      end
+
+      def dimensions(*args)
+        return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
+        args = args[0] if args.length == 1 && args[0].is_a?(Array)
+        args.each {|dim| dimension dim }
+        @dimensions
+      end
+
+      def measure(*args)
+        measures << Measure.new(*args)
+        measures[-1]
+      end
+
+      def measures(*args)
+        return (@measures ||= Cubicle::MemberList.new) if args.length < 1
+        args = args[0] if args.length == 1 && args[0].is_a?(Array)
+        args.each {|m| measure m}
+        @measures
+      end
+
+      def count(*args)
+        options = args.extract_options!
+        options[:aggregation_method] = :count
+        measure(*(args << options))
+      end
+
+      def average(*args)
+        options = args.extract_options!
+        options[:aggregation_method] = :average
+        measure(*(args << options))
+        #Averaged fields need a count of non-null values to properly calculate the average
+        args[0] = "#{args[0]}_count".to_sym
+        count *args
+      end
+      alias avg average
+
+      def sum(*args)
+        options = args.extract_options!
+        options[:aggregation_method] = :sum
+        measure(*(args << options))
+      end
+
+      def duration(*args)
+        options = args.extract_options!
+        options[:in] ||= durations_in
+        args << options
+        measures << (dur = Duration.new(*args))
+        count("#{dur.name}_count".to_sym, :expression=>dur.expression) if dur.aggregation_method == :average
+      end
+
+      def average_duration(*args)
+        duration(*args)
+      end
+      alias avg_duration average_duration
+
+      def total_duration(*args)
+        options = args.extract_options!
+        options[:aggregation_method] = :sum
+        duration(*(args<<options))
+      end
+
+      def durations_in(unit_of_time = nil)
+        return (@duration_unit ||= :seconds) unless unit_of_time
+        @duration_unit = unit_of_time.to_s.pluralize.to_sym
+      end
+      alias :duration_unit :durations_in
+
+
+      def ratio(member_name, numerator, denominator)
+        measures << Ratio.new(member_name, numerator, denominator)
+      end
+
+      def aggregation(*member_list)
+        member_list = member_list[0] if member_list[0].is_a?(Array)
+        aggregations << member_list
+      end
+
+      def time_dimension(*args)
+        return (@time_dimension ||= nil) unless args.length > 0
+        @time_dimension = dimension(*args)
+      end
+      alias time_dimension= time_dimension
+      alias date time_dimension
+      alias time time_dimension
+    end
+  end
+end
data/lib/cubicle/aggregation/map_reduce_helper.rb
@@ -0,0 +1,55 @@
+module Cubicle
+  module Aggregation
+    class MapReduceHelper
+      class << self
+
+        def generate_keys_string(query)
+          "{#{query.dimensions.map{|dim|dim.to_js_keys}.flatten.join(", ")}}"
+        end
+
+        def generate_values_string(query)
+          "{#{query.measures.map{|measure|measure.to_js_keys}.flatten.join(", ")}}"
+        end
+
+        def generate_map_function(query)
+          <<MAP
+            function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
+MAP
+        end
+
+        def generate_reduce_function()
+          <<REDUCE
+            function(key,values){
+              var output = {};
+              values.forEach(function(doc){
+                for(var key in doc){
+                  if (doc[key] || doc[key] == 0){
+                    output[key] = output[key] || 0;
+                    output[key] += doc[key];
+                  }
+                }
+              });
+              return output;
+            }
+REDUCE
+        end
+
+        def generate_finalize_function(query)
+          <<FINALIZE
+            function(key,value)
+            {
+
+              #{ (query.measures.select{|m|m.aggregation_method == :average}).map do |m|
+                "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
+              end.join("\n")}
+              #{ (query.measures.select{|m|m.aggregation_method == :calculation}).map do|m|
+                "value.#{m.name}=#{m.expression};";
+              end.join("\n")}
+              return value;
+            }
+FINALIZE
+        end
+      end
+    end
+  end
+end