cubicle 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. data/CHANGELOG.rdoc +14 -0
  2. data/README.rdoc +188 -174
  3. data/cubicle.gemspec +26 -10
  4. data/lib/cubicle.rb +47 -422
  5. data/lib/cubicle/aggregation.rb +58 -7
  6. data/lib/cubicle/aggregation/ad_hoc.rb +12 -0
  7. data/lib/cubicle/aggregation/aggregation_manager.rb +212 -0
  8. data/lib/cubicle/aggregation/dsl.rb +108 -0
  9. data/lib/cubicle/aggregation/map_reduce_helper.rb +55 -0
  10. data/lib/cubicle/data.rb +29 -84
  11. data/lib/cubicle/data/hierarchy.rb +55 -0
  12. data/lib/cubicle/data/level.rb +62 -0
  13. data/lib/cubicle/data/member.rb +28 -0
  14. data/lib/cubicle/data/table.rb +56 -0
  15. data/lib/cubicle/measure.rb +30 -20
  16. data/lib/cubicle/mongo_mapper/aggregate_plugin.rb +1 -1
  17. data/lib/cubicle/ordered_hash_with_indifferent_access.rb +27 -0
  18. data/lib/cubicle/query.rb +21 -194
  19. data/lib/cubicle/query/dsl.rb +118 -0
  20. data/lib/cubicle/query/dsl/time_intelligence.rb +89 -0
  21. data/lib/cubicle/ratio.rb +28 -12
  22. data/lib/cubicle/version.rb +2 -2
  23. data/test/cubicle/aggregation/ad_hoc_test.rb +21 -0
  24. data/test/cubicle/cubicle_aggregation_test.rb +84 -20
  25. data/test/cubicle/cubicle_query_test.rb +36 -0
  26. data/test/cubicle/data/data_test.rb +30 -0
  27. data/test/cubicle/data/level_test.rb +42 -0
  28. data/test/cubicle/data/member_test.rb +40 -0
  29. data/test/cubicle/{cubicle_data_test.rb → data/table_test.rb} +50 -50
  30. data/test/cubicle/duration_test.rb +46 -48
  31. data/test/cubicle/ordered_hash_with_indifferent_access_test.rb +19 -0
  32. data/test/cubicles/defect_cubicle.rb +31 -31
  33. data/test/log/test.log +102066 -0
  34. metadata +26 -10
  35. data/lib/cubicle/data_level.rb +0 -60
  36. data/test/cubicle/cubicle_data_level_test.rb +0 -58
  37. data/test/cubicle/cubicle_test.rb +0 -85
@@ -1,10 +1,61 @@
1
1
  module Cubicle
2
- class Aggregation
3
- include Cubicle
4
- def initialize(source_collection,&block)
5
- transient!
6
- source_collection_name source_collection
7
- instance_eval(&block) if block_given?
2
+ module Aggregation
3
+ include Dsl
4
+
5
+ def aggregator
6
+ @aggregator ||= AggregationManager.new(self)
7
+ end
8
+
9
+ def transient?
10
+ @transient ||= false
11
+ end
12
+
13
+ def transient!
14
+ @transient = true
15
+ end
16
+
17
+ def expire!
18
+ aggregator.expire!
19
+ end
20
+
21
+ def process(*args)
22
+ aggregator.process(*args)
23
+ end
24
+
25
+ def aggregations
26
+ return (@aggregations ||= [])
27
+ end
28
+
29
+ def dimension_names
30
+ return @dimensions.map{|dim|dim.name.to_s}
31
+ end
32
+
33
+ def find_member(member_name)
34
+ @dimensions[member_name] ||
35
+ @measures[member_name]
36
+ end
37
+
38
+ def query(*args,&block)
39
+ options = args.extract_options!
40
+ query = Cubicle::Query.new(self)
41
+ query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
42
+ query.select(*args) if args.length > 0
43
+ if block_given?
44
+ block.arity == 1 ? (yield query) : (query.instance_eval(&block))
45
+ end
46
+ query.select_all unless query.selected?
47
+ return query if options[:defer]
48
+ results = execute_query(query,options)
49
+ #return results if results.blank?
50
+ #If the 'by' clause was used in the the query,
51
+ #we'll hierarchize by the members indicated,
52
+ #as the next step would otherwise almost certainly
53
+ #need to be a call to hierarchize anyway.
54
+ query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
55
+ end
56
+
57
+ def execute_query(query,options)
58
+ aggregator.execute_query(query,options)
8
59
  end
9
60
  end
10
- end
61
+ end
@@ -0,0 +1,12 @@
1
module Cubicle
  module Aggregation
    # An aggregation built on the fly for a given source collection.
    # Instances are always marked transient.
    class AdHoc
      include Cubicle::Aggregation

      # source_collection - name of the collection to aggregate from.
      # block             - optional DSL block, evaluated in the context of the
      #                     new instance after the source collection is set.
      def initialize(source_collection, &block)
        transient!
        source_collection_name(source_collection)
        instance_eval(&block) unless block.nil?
      end
    end
  end
end
@@ -0,0 +1,212 @@
1
module Cubicle
  module Aggregation
    # Runs queries for a single aggregation and maintains the collections that
    # back it: the aggregation's target collection plus any derived
    # "<target>_aggregation_<dim1.dim2...>" collections.
    class AggregationManager

      attr_reader :aggregation

      # aggregation - the aggregation this manager works for.
      def initialize(aggregation)
        @aggregation = aggregation
      end

      # The database handle exposed by Cubicle.mongo.
      def database
        Cubicle.mongo.database
      end

      # The aggregation's target collection.
      def collection
        database[aggregation.target_collection_name]
      end

      def target_collection_name
        aggregation.target_collection_name
      end


      # Executes +query+ and wraps the outcome in a Cubicle::Data::Table.
      #
      # query   - the query (or the aggregation itself) to run.
      # options - hash; :where supplies additional filter criteria.
      #
      # Returns a Cubicle::Data::Table, or nil when the data source is blank
      # but not an empty array.
      #noinspection RubyArgCount
      def execute_query(query,options={})
        find_options = {
          :limit=>query.limit || 0,
          :skip=>query.offset || 0
        }

        find_options[:sort] = prepare_order_by(query)
        filter = {}
        # NOTE: the data source is held in +source+ rather than a local called
        # +aggregation+, which would shadow the attr_reader for the rest of the method.
        if query == aggregation || query.transient?
          source = aggregate(query,options)
        else
          process_if_required
          source = aggregation_for(query)
          #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
          #otherwise, a second map reduce is required to reduce the data set one last time
          if ((source.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
            filter = prepare_filter(query,options[:where] || {})
          else
            source = aggregate(query,:source_collection=>collection.name)
          end
        end

        if source.blank?
          # aggregate returns [] when the source collection does not exist
          Cubicle::Data::Table.new(query,[],0) if source == []
        else
          count = source.count
          results = source.find(filter,find_options).to_a
          # collections whose name starts with tmp.mr are dropped once read
          source.drop if source.name =~ /^tmp.mr.*/
          Cubicle::Data::Table.new(query, results, count)
        end

      end

      # Rebuilds the aggregation from scratch: expires existing data, runs the
      # base aggregation and then every registered member-list aggregation,
      # logging timings along the way.
      #
      # options - hash passed through to aggregate.
      def process(options={})
        Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
        start = Time.now
        expire!
        aggregate(aggregation,options)
        #Sort desc by length of array, so that larger
        #aggregations are processed first, hopefully increasing efficiency
        #of the processing step
        aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
        aggregation.aggregations.each do |member_list|
          agg_start = Time.now
          aggregation_for(aggregation.query(:defer=>true){select member_list})
          Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
        end
        duration = Time.now - start
        Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now} in #{duration} seconds."
      end

      # Drops the target collection and every derived aggregation collection.
      def expire!
        collection.drop
        expire_aggregations!
      end

      protected

      # Names of the derived aggregation collections belonging to this
      # aggregation. The prefix is escaped and anchored at the start of the
      # name so that collections of a differently named cubicle which merely
      # contain this prefix are never picked up (and hence never dropped).
      def aggregation_collection_names
        prefix = /\A#{Regexp.escape("#{aggregation.target_collection_name}_aggregation_")}/
        database.collection_names.select {|col_name|col_name=~prefix}
      end

      def expire_aggregations!
        aggregation_collection_names.each{|agg_col|database[agg_col].drop}
      end

      # Picks the collection to build a new aggregation from.
      #
      # dimension_names       - dimension names the new aggregation needs.
      # existing_aggregations - candidate collection names (defaults to the
      #                         aggregation collections currently in the database).
      #
      # Returns the name of the existing aggregation collection with the fewest
      # dimensions that still contains all requested ones, or the target
      # collection name when there is no such collection.
      def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
        #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
        #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
        existing = existing_aggregations.map do |agg_col_name|
          agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
        end

        #This will select all the aggregations that contain ALL of the desired dimension names
        #we are sorting by length because the aggregation with the least number of members
        #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
        #this will not always be true, and situations may exist where it is rarely true, however the alternative
        #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
        #but until there is some reason to believe the aggregation caching process needs be highly performant,
        #this should do for now.
        candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}

        #If no suitable aggregation exists to base this one off of,
        #we'll just use the base cubes aggregation collection
        return target_collection_name if candidates.blank?
        "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"

      end

      # Returns the collection that can serve +query+, creating the matching
      # "<target>_aggregation_<dims>" collection first if it does not exist.
      # Queries over all dimensions are served from the target collection.
      def aggregation_for(query)
        return collection if query.all_dimensions?

        aggregation_query = query.clone
        #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
        filter = (query.where if query.respond_to?(:where))
        filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?

        dimension_names = aggregation_query.dimension_names.sort
        agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"

        unless database.collection_names.include?(agg_col_name)
          source_col_name = find_best_source_collection(dimension_names)
          exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
          aggregate(exec_query, :target_collection=>agg_col_name)
        end

        database[agg_col_name]
      end

      # Creates one ascending index per dimension plus one index over all of them.
      def ensure_indexes(collection_name,dimension_names)
        col = database[collection_name]
        #an index for each dimension
        dimension_names.each {|dim|col.create_index([dim,Mongo::ASCENDING])}
        #and a composite
        col.create_index(dimension_names)
      end

      # Runs the map/reduce for +query+ against its source collection.
      #
      # query   - query or aggregation describing dimensions and measures.
      # options - hash handed to map_reduce; :target_collection (removed from
      #           the hash) names the output collection and :where adds filter
      #           criteria. The hash is modified in place.
      #
      # Returns the map_reduce result, or [] when the source collection does
      # not exist.
      def aggregate(query,options={})
        map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
        options[:finalize] = MapReduceHelper.generate_finalize_function(query)
        options["query"] = prepare_filter(query,options[:where] || {})

        query.source_collection_name ||= aggregation.source_collection_name

        target_collection = options.delete(:target_collection)
        target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)

        options[:out] = target_collection unless target_collection.blank? || query.transient?

        #This is defensive - some tests run without ever initializing any collections
        return [] unless database.collection_names.include?(query.source_collection_name)

        result = database[query.source_collection_name].map_reduce(map,reduce,options)

        ensure_indexes(target_collection,query.dimension_names) if target_collection

        result
      end

      # Translates member-name filter keys into the field names used by the
      # collection being searched.
      #
      # query  - the query; its +where+ clause (if any) is merged into +filter+.
      # filter - hash of criteria; modified in place and returned.
      #
      # Keys starting with "$" are left untouched. For transient queries (or the
      # aggregation itself) a member maps to its expression, with javascript
      # expressions rewritten into a "$where" comparison; otherwise members map
      # to "_id.<name>" (dimensions) or "value.<name>" (measures).
      #
      # Raises RuntimeError for a key that is neither a dimension nor a measure.
      def prepare_filter(query,filter={})
        filter.merge!(query.where) if query.respond_to?(:where) && query.where
        filter.stringify_keys!
        transient = (query.transient? || query == aggregation)
        filter.keys.each do |key|
          next if key=~/^\$.*/
          prefix = nil
          prefix = "_id" if (member = aggregation.dimensions[key])
          prefix = "value" if (member = aggregation.measures[key]) unless member

          raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member

          filter_value = filter.delete(key)
          if transient
            if (member.expression_type == :javascript)
              filter_name = "$where"
              filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
              filter_value = "(#{member.expression})==#{filter_value}"
            else
              filter_name = member.expression
            end
          else
            filter_name = "#{prefix}.#{member.name}"
          end
          filter[filter_name] = filter_value
        end
        filter
      end

      # Builds the sort specification for +query+: each [field, direction] pair
      # of query.order_by with the field prefixed by "_id." (dimension) or
      # "value." (measure).
      #
      # Raises RuntimeError naming the offending field when it is neither a
      # dimension nor a measure.
      def prepare_order_by(query)
        order_by = []
        query.order_by.each do |order|
          prefix = "_id" if (member = aggregation.dimensions[order[0]])
          prefix = "value" if (member = aggregation.measures[order[0]]) unless member
          # order[0] is the field name; the message previously referenced an undefined local
          raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{order[0]}" unless member
          order_by << ["#{prefix}.#{order[0]}",order[1]]
        end
        order_by
      end

      # Processes the aggregation unless its target collection already exists.
      def process_if_required
        return if database.collection_names.include?(target_collection_name)
        process
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
module Cubicle
  module Aggregation
    # Declarative methods used to describe an aggregation: where its data
    # comes from, where results are stored, and which dimensions and measures
    # it exposes.
    module Dsl

      # Gets or sets the source collection name. With no argument, falls back
      # to a name derived from +name+ (a trailing "Cubicle", "Cube" and
      # "Aggregation" are chomped in that order, then underscored and pluralized).
      def source_collection_name(collection_name = nil)
        return @source_collection = collection_name if collection_name
        @source_collection ||= name.chomp("Cubicle").chomp("Cube").chomp("Aggregation").underscore.pluralize
      end
      alias source_collection_name= source_collection_name

      # Gets or sets the target collection name. Always nil for transient
      # aggregations; otherwise defaults to "<name or source collection>_cubicle".
      def target_collection_name(collection_name = nil)
        return nil if transient?
        return @target_name = collection_name if collection_name
        @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
      end
      alias target_collection_name= target_collection_name

      # Declares one dimension; arguments go to Cubicle::Dimension.new.
      # Returns the new dimension.
      def dimension(*args)
        dimensions << Cubicle::Dimension.new(*args)
        dimensions[-1]
      end

      # With no arguments returns the dimension list (created on first use);
      # otherwise declares a dimension for each argument (or each element of a
      # single array argument) and returns the list.
      def dimensions(*args)
        return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
        args = args[0] if args.length == 1 && args[0].is_a?(Array)
        args.each {|dim| dimension dim }
        @dimensions
      end

      # Declares one measure; arguments go to Measure.new.
      # Returns the new measure.
      def measure(*args)
        measures << Measure.new(*args)
        measures[-1]
      end

      # With no arguments returns the measure list (created on first use);
      # otherwise declares a measure for each argument (or each element of a
      # single array argument) and returns the list.
      def measures(*args)
        return (@measures ||= Cubicle::MemberList.new) if args.length < 1
        args = args[0] if args.length == 1 && args[0].is_a?(Array)
        args.each {|m| measure m}
        @measures
      end

      # Declares a measure with :aggregation_method => :count.
      def count(*args)
        options = args.extract_options!
        options[:aggregation_method] = :count
        measure(*(args << options))
      end

      # Declares a measure with :aggregation_method => :average together with
      # a companion "<name>_count" count measure.
      #
      # The companion receives its own copy of the options: +count+ overwrites
      # :aggregation_method in the hash it is given, and that hash must not be
      # the one already handed to the average measure.
      #
      # Returns the companion count measure.
      def average(*args)
        options = args.extract_options!
        options[:aggregation_method] = :average
        measure(*(args + [options]))
        #Averaged fields need a count of non-null values to properly calculate the average
        count_args = args.dup
        count_args[0] = "#{args[0]}_count".to_sym
        count(*(count_args + [options.dup]))
      end
      alias avg average

      # Declares a measure with :aggregation_method => :sum.
      def sum(*args)
        options = args.extract_options!
        options[:aggregation_method] = :sum
        measure(*(args << options))
      end

      # Declares a Duration measure, defaulting its :in option to the current
      # durations_in unit. When the duration's aggregation method is :average a
      # "<name>_count" count measure over the same expression is added as well.
      def duration(*args)
        options = args.extract_options!
        options[:in] ||= durations_in
        args << options
        measures << (dur = Duration.new(*args))
        count("#{dur.name}_count".to_sym, :expression=>dur.expression) if dur.aggregation_method == :average
      end

      # Same as +duration+; no aggregation method is forced here.
      def average_duration(*args)
        duration(*args)
      end
      alias avg_duration average_duration

      # Declares a duration with :aggregation_method => :sum.
      def total_duration(*args)
        options = args.extract_options!
        options[:aggregation_method] = :sum
        duration(*(args<<options))
      end

      # Gets the duration unit (:seconds unless set) or sets it from
      # +unit_of_time+, stored as a pluralized symbol.
      def durations_in(unit_of_time = nil)
        return (@duration_unit ||= :seconds) unless unit_of_time
        @duration_unit = unit_of_time.to_s.pluralize.to_sym
      end
      alias :duration_unit :durations_in


      # Declares a Ratio measure named +member_name+.
      def ratio(member_name, numerator, denominator)
        measures << Ratio.new(member_name, numerator, denominator)
      end

      # Registers a member list (given as separate arguments or one array) in
      # +aggregations+.
      def aggregation(*member_list)
        member_list = member_list[0] if member_list[0].is_a?(Array)
        aggregations << member_list
      end

      # With no arguments returns the time dimension (nil if none was set);
      # otherwise declares a dimension from the arguments and remembers it as
      # the time dimension.
      def time_dimension(*args)
        return (@time_dimension ||= nil) unless args.length > 0
        @time_dimension = dimension(*args)
      end
      alias time_dimension= time_dimension
      alias date time_dimension
      alias time time_dimension
    end
  end
end
@@ -0,0 +1,55 @@
1
module Cubicle
  module Aggregation
    # Generates the JavaScript source of the map, reduce and finalize
    # functions used for a query's map/reduce run.
    class MapReduceHelper

      # JavaScript object literal built from every dimension's to_js_keys.
      def self.generate_keys_string(query)
        key_entries = query.dimensions.map { |dimension| dimension.to_js_keys }.flatten
        "{" + key_entries.join(", ") + "}"
      end

      # JavaScript object literal built from every measure's to_js_keys.
      def self.generate_values_string(query)
        value_entries = query.measures.map { |member| member.to_js_keys }.flatten
        "{" + value_entries.join(", ") + "}"
      end

      # Map function: emits the keys object with the values object.
      def self.generate_map_function(query)
        emitted_key = generate_keys_string(query)
        emitted_value = generate_values_string(query)
        "function(){emit(#{emitted_key},#{emitted_value});}\n"
      end

      # Reduce function: for each property of each value document, adds the
      # property to the output when it is truthy or equal to 0.
      def self.generate_reduce_function()
        js_lines = [
          "function(key,values){",
          "var output = {};",
          "values.forEach(function(doc){",
          "for(var key in doc){",
          "if (doc[key] || doc[key] == 0){",
          "output[key] = output[key] || 0;",
          "output[key] += doc[key];",
          "}",
          "}",
          "});",
          "return output;",
          "}"
        ]
        js_lines.join("\n") + "\n"
      end

      # Finalize function: divides each :average measure by its "<name>_count"
      # companion, then assigns each :calculation measure its expression.
      def self.generate_finalize_function(query)
        averaged = query.measures.select { |member| member.aggregation_method == :average }
        calculated = query.measures.select { |member| member.aggregation_method == :calculation }

        average_statements = averaged.map do |member|
          "value.#{member.name}=value.#{member.name}/value.#{member.name}_count;"
        end
        calculation_statements = calculated.map do |member|
          "value.#{member.name}=#{member.expression};"
        end

        [
          "function(key,value)",
          "{",
          "",
          average_statements.join("\n"),
          calculation_statements.join("\n"),
          "return value;",
          "}"
        ].join("\n") + "\n"
      end
    end
  end
end