cubicle 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +14 -0
- data/README.rdoc +188 -174
- data/cubicle.gemspec +26 -10
- data/lib/cubicle.rb +47 -422
- data/lib/cubicle/aggregation.rb +58 -7
- data/lib/cubicle/aggregation/ad_hoc.rb +12 -0
- data/lib/cubicle/aggregation/aggregation_manager.rb +212 -0
- data/lib/cubicle/aggregation/dsl.rb +108 -0
- data/lib/cubicle/aggregation/map_reduce_helper.rb +55 -0
- data/lib/cubicle/data.rb +29 -84
- data/lib/cubicle/data/hierarchy.rb +55 -0
- data/lib/cubicle/data/level.rb +62 -0
- data/lib/cubicle/data/member.rb +28 -0
- data/lib/cubicle/data/table.rb +56 -0
- data/lib/cubicle/measure.rb +30 -20
- data/lib/cubicle/mongo_mapper/aggregate_plugin.rb +1 -1
- data/lib/cubicle/ordered_hash_with_indifferent_access.rb +27 -0
- data/lib/cubicle/query.rb +21 -194
- data/lib/cubicle/query/dsl.rb +118 -0
- data/lib/cubicle/query/dsl/time_intelligence.rb +89 -0
- data/lib/cubicle/ratio.rb +28 -12
- data/lib/cubicle/version.rb +2 -2
- data/test/cubicle/aggregation/ad_hoc_test.rb +21 -0
- data/test/cubicle/cubicle_aggregation_test.rb +84 -20
- data/test/cubicle/cubicle_query_test.rb +36 -0
- data/test/cubicle/data/data_test.rb +30 -0
- data/test/cubicle/data/level_test.rb +42 -0
- data/test/cubicle/data/member_test.rb +40 -0
- data/test/cubicle/{cubicle_data_test.rb → data/table_test.rb} +50 -50
- data/test/cubicle/duration_test.rb +46 -48
- data/test/cubicle/ordered_hash_with_indifferent_access_test.rb +19 -0
- data/test/cubicles/defect_cubicle.rb +31 -31
- data/test/log/test.log +102066 -0
- metadata +26 -10
- data/lib/cubicle/data_level.rb +0 -60
- data/test/cubicle/cubicle_data_level_test.rb +0 -58
- data/test/cubicle/cubicle_test.rb +0 -85
data/lib/cubicle/aggregation.rb
CHANGED
@@ -1,10 +1,61 @@
|
|
1
1
|
module Cubicle
  # Mixed into cubicle definition classes; provides the public facade for
  # querying and processing, delegating the heavy lifting to an
  # AggregationManager instance.
  module Aggregation
    include Dsl

    # Lazily built manager that owns map/reduce execution and caching.
    def aggregator
      @aggregator ||= AggregationManager.new(self)
    end

    # True when this cubicle never persists aggregated output.
    def transient?
      @transient ||= false
    end

    # Marks this cubicle as transient (no cached output collection).
    def transient!
      @transient = true
    end

    # Drops all cached data for this cubicle.
    def expire!
      aggregator.expire!
    end

    # Rebuilds the cached aggregations from the source collection.
    def process(*args)
      aggregator.process(*args)
    end

    # Member lists registered for pre-aggregation during processing.
    def aggregations
      @aggregations ||= []
    end

    # String names of every declared dimension.
    def dimension_names
      @dimensions.map { |dimension| dimension.name.to_s }
    end

    # Looks a member up by name; dimensions win over measures.
    def find_member(member_name)
      @dimensions[member_name] || @measures[member_name]
    end

    # Builds a Cubicle::Query against this cubicle and, unless :defer is
    # given, executes it immediately. Accepts member names to select, an
    # options hash (:source_collection, :defer, :where, ...) and an optional
    # DSL block (arity 1 => yielded the query, arity 0 => instance_eval'd).
    def query(*args, &block)
      options = args.extract_options!
      q = Cubicle::Query.new(self)
      q.source_collection_name = options.delete(:source_collection) if options[:source_collection]
      q.select(*args) unless args.empty?
      if block_given?
        block.arity == 1 ? (yield q) : q.instance_eval(&block)
      end
      q.select_all unless q.selected?
      return q if options[:defer]

      results = execute_query(q, options)
      #return results if results.blank?
      #If the 'by' clause was used in the the query,
      #we'll hierarchize by the members indicated,
      #as the next step would otherwise almost certainly
      #need to be a call to hierarchize anyway.
      if q.respond_to?(:by) && q.by.length > 0
        results.hierarchize(*q.by)
      else
        results
      end
    end

    # Delegates execution to the aggregation manager.
    def execute_query(query, options)
      aggregator.execute_query(query, options)
    end
  end
end
|
@@ -0,0 +1,212 @@
|
|
1
|
+
module Cubicle
  module Aggregation
    # Executes queries against, and maintains the lifecycle of, the cached
    # map/reduce output collections backing a single Cubicle aggregation.
    class AggregationManager

      attr_reader :aggregation

      def initialize(aggregation)
        @aggregation = aggregation
      end

      # The MongoDB database Cubicle is configured against.
      def database
        Cubicle.mongo.database
      end

      # The fully processed output collection for this aggregation.
      def collection
        database[aggregation.target_collection_name]
      end

      def target_collection_name
        aggregation.target_collection_name
      end


      # Executes +query+. Ad hoc/transient queries trigger a fresh map/reduce;
      # otherwise a cached aggregation collection is located (and materialized
      # if missing) and queried with a plain find. Returns a
      # Cubicle::Data::Table, or nil when no usable data source exists.
      #noinspection RubyArgCount
      def execute_query(query,options={})
        count = 0

        find_options = {
          :limit=>query.limit || 0,
          :skip=>query.offset || 0
        }

        find_options[:sort] = prepare_order_by(query)
        filter = {}
        # FIX: local renamed from 'aggregation' to 'data_source' - the old
        # name shadowed the attr_reader, making it unreachable in this method.
        if query == aggregation || query.transient?
          data_source = aggregate(query,options)
        else
          process_if_required
          data_source = aggregation_for(query)
          #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
          #otherwise, a second map reduce is required to reduce the data set one last time
          if ((data_source.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
            filter = prepare_filter(query,options[:where] || {})
          else
            data_source = aggregate(query,:source_collection=>collection.name)
          end
        end

        if data_source.blank?
          Cubicle::Data::Table.new(query,[],0) if data_source == []
        else
          count = data_source.count
          results = data_source.find(filter,find_options).to_a
          #clean up temporary collections left behind by ad hoc map/reduce runs
          data_source.drop if data_source.name =~ /^tmp.mr.*/
          Cubicle::Data::Table.new(query, results, count)
        end

      end

      # Rebuilds the base cubicle collection, then re-materializes every
      # pre-declared aggregation.
      def process(options={})
        Cubicle.logger.info "Processing #{aggregation.name} @ #{Time.now}"
        start = Time.now
        expire!
        aggregate(aggregation,options)
        #Sort desc by length of array, so that larger
        #aggregations are processed first, hopefully increasing efficiency
        #of the processing step
        aggregation.aggregations.sort!{|a,b|b.length<=>a.length}
        aggregation.aggregations.each do |member_list|
          agg_start = Time.now
          aggregation_for(aggregation.query(:defer=>true){select member_list})
          Cubicle.logger.info "#{aggregation.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
        end
        duration = Time.now - start
        # FIX: added the missing space before 'in' in the log message.
        Cubicle.logger.info "#{aggregation.name} processed @ #{Time.now} in #{duration} seconds."
      end

      # Drops the cubicle collection and every cached aggregation collection.
      def expire!
        collection.drop
        expire_aggregations!
      end

      protected

      # Names of all cached aggregation collections derived from this cubicle.
      def aggregation_collection_names
        database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
      end

      def expire_aggregations!
        aggregation_collection_names.each{|agg_col|database[agg_col].drop}
      end

      # Picks the smallest existing aggregation collection that contains all
      # of +dimension_names+, falling back to the base cubicle collection.
      def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
        #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
        #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
        existing = existing_aggregations.map do |agg_col_name|
          agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
        end

        #This will select all the aggregations that contain ALL of the desired dimension names
        #we are sorting by length because the aggregation with the least number of members
        #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
        #this will not always be true, and situations may exist where it is rarely true, however the alternative
        #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
        #but until there is some reason to believe the aggregation caching process needs be highly performant,
        #this should do for now.
        candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}

        #If no suitable aggregation exists to base this one off of,
        #we'll just use the base cubes aggregation collection
        return target_collection_name if candidates.blank?
        "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"

      end

      # Returns the collection holding the aggregation needed to satisfy
      # +query+, materializing it via map/reduce if it does not exist yet.
      def aggregation_for(query)
        return collection if query.all_dimensions?

        aggregation_query = query.clone
        #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
        filter = (query.where if query.respond_to?(:where))
        filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?

        dimension_names = aggregation_query.dimension_names.sort
        agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"

        unless database.collection_names.include?(agg_col_name)
          source_col_name = find_best_source_collection(dimension_names)
          exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
          aggregate(exec_query, :target_collection=>agg_col_name)
        end

        database[agg_col_name]
      end

      # Creates per-dimension indexes plus one composite index on the target.
      def ensure_indexes(collection_name,dimension_names)
        col = database[collection_name]
        #an index for each dimension
        dimension_names.each {|dim|col.create_index([dim,Mongo::ASCENDING])}
        #and a composite
        col.create_index(dimension_names)
      end

      # Runs the map/reduce for +query+, writing to a target collection
      # unless the query is transient. Returns the driver's result (or []
      # when the source collection does not exist).
      def aggregate(query,options={})
        map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
        options[:finalize] = MapReduceHelper.generate_finalize_function(query)
        options["query"] = prepare_filter(query,options[:where] || {})

        query.source_collection_name ||= aggregation.source_collection_name

        target_collection = options.delete(:target_collection)
        target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)

        options[:out] = target_collection unless target_collection.blank? || query.transient?

        #This is defensive - some tests run without ever initializing any collections
        return [] unless database.collection_names.include?(query.source_collection_name)

        result = database[query.source_collection_name].map_reduce(map,reduce,options)

        ensure_indexes(target_collection,query.dimension_names) if target_collection

        result
      end

      # Translates a member-name-keyed filter into one the underlying
      # collection understands ("_id."/"value." prefixes for cached data,
      # raw expressions or $where clauses for transient queries).
      # Raises when a filter key is not a member of the cubicle.
      def prepare_filter(query,filter={})
        filter.merge!(query.where) if query.respond_to?(:where) && query.where
        filter.stringify_keys!
        transient = (query.transient? || query == aggregation)
        filter.keys.each do |key|
          next if key=~/^\$.*/
          prefix = nil
          prefix = "_id" if (member = aggregation.dimensions[key])
          prefix = "value" if (member = aggregation.measures[key]) unless member

          raise "You supplied a filter that does not appear to be a member of this cubicle:#{key}" unless member

          filter_value = filter.delete(key)
          if transient
            if (member.expression_type == :javascript)
              filter_name = "$where"
              filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
              filter_value = "(#{member.expression})==#{filter_value}"
            else
              filter_name = member.expression
            end
          else
            filter_name = "#{prefix}.#{member.name}"
          end
          filter[filter_name] = filter_value
        end
        filter
      end

      # Maps the query's order_by members onto their prefixed field names.
      # Raises when an order_by field is not a member of the cubicle.
      def prepare_order_by(query)
        order_by = []
        query.order_by.each do |order|
          prefix = "_id" if (member = aggregation.dimensions[order[0]])
          prefix = "value" if (member = aggregation.measures[order[0]]) unless member
          # FIX: previously interpolated an undefined local 'key' here, which
          # raised NameError instead of the intended descriptive message.
          raise "You supplied a field to order_by that does not appear to be a member of this cubicle:#{order[0]}" unless member
          order_by << ["#{prefix}.#{order[0]}",order[1]]
        end
        order_by
      end

      # Processes the cubicle on demand if its output collection is missing.
      def process_if_required
        return if database.collection_names.include?(target_collection_name)
        process
      end
    end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
module Cubicle
  module Aggregation
    # Class-level DSL used by cubicle definitions to declare their source
    # data, dimensions, measures and pre-computed aggregations.
    module Dsl

      # Reads (no argument) or sets (with an argument) the source collection
      # name; defaults to a name inferred from the class name.
      def source_collection_name(collection_name = nil)
        if collection_name
          @source_collection = collection_name
        else
          @source_collection ||= name.chomp("Cubicle").chomp("Cube").chomp("Aggregation").underscore.pluralize
        end
      end
      alias source_collection_name= source_collection_name

      # Reads/sets the collection the processed cubicle is written to.
      # Transient cubicles have no target collection.
      def target_collection_name(collection_name = nil)
        return nil if transient?
        if collection_name
          @target_name = collection_name
        else
          @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
        end
      end
      alias target_collection_name= target_collection_name

      # Declares a single dimension and returns it.
      def dimension(*args)
        dimensions << Cubicle::Dimension.new(*args)
        dimensions[-1]
      end

      # With no args, returns the dimension list; otherwise declares each
      # given dimension (a single array argument is splatted).
      def dimensions(*args)
        @dimensions ||= Cubicle::MemberList.new
        return @dimensions if args.empty?
        entries = (args.length == 1 && args[0].is_a?(Array)) ? args[0] : args
        entries.each { |entry| dimension entry }
        @dimensions
      end

      # Declares a single measure and returns it.
      def measure(*args)
        measures << Measure.new(*args)
        measures[-1]
      end

      # With no args, returns the measure list; otherwise declares each
      # given measure (a single array argument is splatted).
      def measures(*args)
        @measures ||= Cubicle::MemberList.new
        return @measures if args.empty?
        entries = (args.length == 1 && args[0].is_a?(Array)) ? args[0] : args
        entries.each { |entry| measure entry }
        @measures
      end

      # Declares a measure aggregated by counting.
      def count(*args)
        opts = args.extract_options!
        opts[:aggregation_method] = :count
        args << opts
        measure(*args)
      end

      # Declares an averaged measure plus the companion count measure
      # required to compute the mean.
      def average(*args)
        opts = args.extract_options!
        opts[:aggregation_method] = :average
        args << opts
        measure(*args)
        #Averaged fields need a count of non-null values to properly calculate the average
        args[0] = :"#{args[0]}_count"
        count(*args)
      end
      alias avg average

      # Declares a summed measure.
      def sum(*args)
        opts = args.extract_options!
        opts[:aggregation_method] = :sum
        args << opts
        measure(*args)
      end

      # Declares a duration measure, defaulting its unit to #durations_in.
      # Averaged durations also get a companion count measure.
      def duration(*args)
        opts = args.extract_options!
        opts[:in] ||= durations_in
        args << opts
        dur = Duration.new(*args)
        measures << dur
        count(:"#{dur.name}_count", :expression=>dur.expression) if dur.aggregation_method == :average
      end

      def average_duration(*args)
        duration(*args)
      end
      alias avg_duration average_duration

      # Declares a duration aggregated by summing.
      def total_duration(*args)
        opts = args.extract_options!
        opts[:aggregation_method] = :sum
        args << opts
        duration(*args)
      end

      # Reads/sets the default unit for duration measures (default :seconds).
      def durations_in(unit_of_time = nil)
        return (@duration_unit ||= :seconds) unless unit_of_time
        @duration_unit = unit_of_time.to_s.pluralize.to_sym
      end
      alias_method :duration_unit, :durations_in


      # Declares a derived measure computed as numerator/denominator.
      def ratio(member_name, numerator, denominator)
        measures << Ratio.new(member_name, numerator, denominator)
      end

      # Registers a member list to be pre-aggregated during processing.
      def aggregation(*member_list)
        member_list = member_list[0] if member_list[0].is_a?(Array)
        aggregations << member_list
      end

      # Reads (no args) or declares (with args) the cubicle's time dimension.
      def time_dimension(*args)
        return (@time_dimension ||= nil) if args.empty?
        @time_dimension = dimension(*args)
      end
      alias time_dimension= time_dimension
      alias date time_dimension
      alias time time_dimension
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Cubicle
  module Aggregation
    # Builds the JavaScript map, reduce and finalize functions handed to
    # MongoDB's map/reduce for a given cubicle query.
    class MapReduceHelper

      # JS object literal of emit keys, one entry per dimension.
      def self.generate_keys_string(query)
        key_parts = query.dimensions.map { |dimension| dimension.to_js_keys }
        "{#{key_parts.flatten.join(", ")}}"
      end

      # JS object literal of emitted values, one entry per measure.
      def self.generate_values_string(query)
        value_parts = query.measures.map { |measure| measure.to_js_keys }
        "{#{value_parts.flatten.join(", ")}}"
      end

      # The map function: emits the dimension keys with the measure values.
      def self.generate_map_function(query)
        <<MAP
function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
MAP
      end

      # The reduce function: sums every numeric field across emitted docs.
      def self.generate_reduce_function
        <<REDUCE
function(key,values){
  var output = {};
  values.forEach(function(doc){
    for(var key in doc){
      if (doc[key] || doc[key] == 0){
        output[key] = output[key] || 0;
        output[key] += doc[key];
      }
    }
  });
  return output;
}
REDUCE
      end

      # The finalize function: converts averaged sums into means and
      # evaluates calculated measures.
      def self.generate_finalize_function(query)
        average_fixups = query.measures.select { |m| m.aggregation_method == :average }.map do |m|
          "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
        end.join("\n")
        calculation_fixups = query.measures.select { |m| m.aggregation_method == :calculation }.map do |m|
          "value.#{m.name}=#{m.expression};"
        end.join("\n")
        <<FINALIZE
function(key,value)
{

#{average_fixups}
#{calculation_fixups}
return value;
}
FINALIZE
      end
    end
  end
end
|