cubicle 0.1.20 → 0.1.21

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,8 @@
1
+ ==0.1.21
2
+ *Added metadata tables in the database for cubicle to manage aggregation info. This was necessary because previously
3
+ I was trying to overload the collection name with metadata, which was making the names longer than MongoDB could support
4
+ and causing errors. This change will enable richer monitoring, profiling, and optimization in the near future.
5
+
1
6
  ==0.1.20
2
7
  *Updated to work with mongo driver 1.0 (and therefore latest versions of MongoMapper)
3
8
 
data/cubicle.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cubicle}
8
- s.version = "0.1.20"
8
+ s.version = "0.1.21"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Nathan Stults"]
12
- s.date = %q{2010-05-03}
12
+ s.date = %q{2010-05-05}
13
13
  s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
14
14
  s.email = %q{hereiam@sonic.net}
15
15
  s.extra_rdoc_files = [
@@ -27,7 +27,9 @@ Gem::Specification.new do |s|
27
27
  "lib/cubicle/aggregation.rb",
28
28
  "lib/cubicle/aggregation/ad_hoc.rb",
29
29
  "lib/cubicle/aggregation/aggregation_manager.rb",
30
+ "lib/cubicle/aggregation/aggregation_metadata.rb",
30
31
  "lib/cubicle/aggregation/aggregation_view.rb",
32
+ "lib/cubicle/aggregation/cubicle_metadata.rb",
31
33
  "lib/cubicle/aggregation/dsl.rb",
32
34
  "lib/cubicle/aggregation/map_reduce_helper.rb",
33
35
  "lib/cubicle/bucketized_dimension.rb",
@@ -55,6 +57,8 @@ Gem::Specification.new do |s|
55
57
  "lib/cubicle/version.rb",
56
58
  "test/config/database.yml",
57
59
  "test/cubicle/aggregation/ad_hoc_test.rb",
60
+ "test/cubicle/aggregation/aggregation_metadata_test.rb",
61
+ "test/cubicle/aggregation/cubicle_metadata_test.rb",
58
62
  "test/cubicle/bucketized_dimension_test.rb",
59
63
  "test/cubicle/cubicle_aggregation_test.rb",
60
64
  "test/cubicle/cubicle_query_test.rb",
@@ -77,6 +81,8 @@ Gem::Specification.new do |s|
77
81
  s.summary = %q{Pseudo-Multi Dimensional analysis / simplified aggregation for MongoDB in Ruby (NOLAP ;))}
78
82
  s.test_files = [
79
83
  "test/cubicle/aggregation/ad_hoc_test.rb",
84
+ "test/cubicle/aggregation/aggregation_metadata_test.rb",
85
+ "test/cubicle/aggregation/cubicle_metadata_test.rb",
80
86
  "test/cubicle/bucketized_dimension_test.rb",
81
87
  "test/cubicle/cubicle_aggregation_test.rb",
82
88
  "test/cubicle/cubicle_query_test.rb",
data/lib/cubicle.rb CHANGED
@@ -25,6 +25,8 @@ dir = File.dirname(__FILE__)
25
25
  "data/level",
26
26
  "data/hierarchy",
27
27
  "data/table",
28
+ "aggregation/aggregation_metadata",
29
+ "aggregation/cubicle_metadata",
28
30
  "aggregation/aggregation_view",
29
31
  "aggregation/aggregation_manager",
30
32
  "aggregation/map_reduce_helper",
@@ -2,10 +2,11 @@ module Cubicle
2
2
  module Aggregation
3
3
  class AggregationManager
4
4
 
5
- attr_reader :aggregation
5
+ attr_reader :aggregation, :metadata
6
6
 
7
7
  def initialize(aggregation)
8
8
  @aggregation = aggregation
9
+ @metadata = Cubicle::Aggregation::CubicleMetadata.new(aggregation)
9
10
  end
10
11
 
11
12
  def database
@@ -32,26 +33,28 @@ module Cubicle
32
33
 
33
34
  find_options[:sort] = prepare_order_by(query)
34
35
  filter = {}
36
+
35
37
  if query == aggregation || query.transient?
36
- aggregation = aggregate(query,options)
38
+ reduction = aggregate(query,options)
37
39
  else
38
40
  process_if_required
39
- aggregation = aggregation_for(query)
41
+ agg_data = aggregation_for(query)
42
+ reduction = agg_data.collection
40
43
  #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
41
44
  #otherwise, a second map reduce is required to reduce the data set one last time
42
- if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
45
+ if query.all_dimensions? || (agg_data.member_names - query.member_names - [:all_measures]).blank?
43
46
  filter = prepare_filter(query,options[:where] || {})
44
47
  else
45
- aggregation = aggregate(query,:source_collection=>aggregation.name)
48
+ reduction = aggregate(query,:source_collection=>agg_data.target_collection_name)
46
49
  end
47
50
  end
48
51
 
49
- if aggregation.blank?
50
- Cubicle::Data::Table.new(query,[],0) if aggregation == []
52
+ if reduction.blank?
53
+ Cubicle::Data::Table.new(query,[],0)
51
54
  else
52
- count = aggregation.count
53
- results = aggregation.find(filter,find_options).to_a
54
- aggregation.drop if aggregation.name =~ /^tmp.mr.*/
55
+ count = reduction.count
56
+ results = reduction.find(filter,find_options).to_a
57
+ reduction.drop if reduction.name =~ /^tmp.mr.*/
55
58
  Cubicle::Data::Table.new(query, results, count)
56
59
  end
57
60
 
@@ -77,44 +80,42 @@ module Cubicle
77
80
 
78
81
  def expire!
79
82
  collection.drop
80
- expire_aggregations!
83
+ @metadata.expire!
81
84
  end
82
85
 
83
- protected
86
+ def aggregate(query,options={})
87
+ view = AggregationView.new(aggregation,query)
84
88
 
85
- def aggregation_collection_names
86
- database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
87
- end
89
+ map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
88
90
 
89
- def expire_aggregations!
90
- aggregation_collection_names.each{|agg_col|database[agg_col].drop}
91
- end
91
+ options[:finalize] = MapReduceHelper.generate_finalize_function(query)
92
+ options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
93
+
94
+ query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
92
95
 
93
- def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
94
- #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
95
- #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
96
- existing = existing_aggregations.map do |agg_col_name|
97
- agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
96
+ target_collection = options.delete(:target_collection)
97
+ target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
98
+
99
+ options[:out] = target_collection unless target_collection.blank? || query.transient?
100
+
101
+ #This is defensive - some tests run without ever initializing any collections
102
+ unless database.collection_names.include?(query.source_collection_name)
103
+ Cubicle.logger.info "No collection was found in the database with a name of #{query.source_collection_name}"
104
+ return []
98
105
  end
99
106
 
100
- #This will select all the aggregations that contain ALL of the desired dimension names
101
- #we are sorting by length because the aggregation with the least number of members
102
- #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
103
- #this will not always be true, and situations may exist where it is rarely true, however the alternative
104
- #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
105
- #but until there is some reason to believe the aggregation caching process needs be highly performant,
106
- #this should do for now.
107
- candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
107
+ result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
108
108
 
109
- #If no suitable aggregation exists to base this one off of,
110
- #we'll just use the base cubes aggregation collection
111
- return target_collection_name if candidates.blank?
112
- "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
109
+ ensure_indexes(target_collection,query.dimension_names) if target_collection
113
110
 
111
+ result
114
112
  end
115
113
 
114
+ protected
115
+
116
+
116
117
  def aggregation_for(query)
117
- return collection if query.all_dimensions?
118
+ #return collection if query.all_dimensions?
118
119
 
119
120
  aggregation_query = query.clone
120
121
  #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
@@ -122,15 +123,7 @@ module Cubicle
122
123
  filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
123
124
 
124
125
  dimension_names = aggregation_query.dimension_names.sort
125
- agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
126
-
127
- unless database.collection_names.include?(agg_col_name)
128
- source_col_name = find_best_source_collection(dimension_names)
129
- exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
130
- aggregate(exec_query, :target_collection=>agg_col_name)
131
- end
132
-
133
- database[agg_col_name]
126
+ @metadata.aggregation_for(dimension_names)
134
127
  end
135
128
 
136
129
  def ensure_indexes(collection_name,dimension_names)
@@ -146,31 +139,6 @@ module Cubicle
146
139
  #col.create_index(dimension_names.map{|dim|[dim,1]})
147
140
  end
148
141
 
149
- def aggregate(query,options={})
150
- view = AggregationView.new(aggregation,query)
151
-
152
- map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
153
-
154
- options[:finalize] = MapReduceHelper.generate_finalize_function(query)
155
- options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
156
-
157
- query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
158
-
159
- target_collection = options.delete(:target_collection)
160
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
161
-
162
- options[:out] = target_collection unless target_collection.blank? || query.transient?
163
-
164
- #This is defensive - some tests run without ever initializing any collections
165
- return [] unless database.collection_names.include?(query.source_collection_name)
166
-
167
- result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
168
-
169
- ensure_indexes(target_collection,query.dimension_names) if target_collection
170
-
171
- result
172
- end
173
-
174
142
  def expand_template(template,view)
175
143
  return "" unless template
176
144
  return Mustache.render(template,view) if template.is_a?(String)
@@ -0,0 +1,121 @@
1
+ module Cubicle
2
+ module Aggregation
3
+ class AggregationMetadata
4
+ class << self
5
+
6
+ def collection
7
+ @@aggregations_collection_name ||= "#{Cubicle::Aggregation::CubicleMetadata.collection.name}.aggregations"
8
+ Cubicle.mongo.database[@@aggregations_collection_name]
9
+ end
10
+
11
+ def collection=(collection_name)
12
+ @@aggregations_collection_name = collection_name
13
+ end
14
+
15
+ def min_records_to_reduce
16
+ @min_records_to_reduce ||= 100
17
+ end
18
+
19
+ def min_records_to_reduce=(min)
20
+ @min_records_to_reduce = min
21
+ end
22
+
23
+ def expire(aggregation)
24
+ aggregation_name = case aggregation
25
+ when String then aggregation
26
+ when Symbol then aggregation.to_s
27
+ when Cubicle::Aggregation::CubicleMetadata then aggregation.aggregation.name
28
+ else aggregation.name
29
+ end
30
+ Cubicle.mongo.database.collection_names.each do |col|
31
+ Cubicle.mongo.database[col].drop if col =~ /cubicle.aggregation.#{aggregation_name}._*/i
32
+ collection.remove(:aggregation=>aggregation_name)
33
+ end
34
+ end
35
+ end
36
+
37
+ def initialize(cubicle_metadata,member_names_or_attribute_hash)
38
+ @cubicle_metadata = cubicle_metadata
39
+ if (member_names_or_attribute_hash.kind_of?(Hash))
40
+ @attributes = member_names_or_attribute_hash
41
+ else
42
+ member_names = member_names_or_attribute_hash
43
+ @candidate_aggregation = self.class.collection.find(
44
+ :aggregation=>@cubicle_metadata.aggregation.name,
45
+ :member_names=>{"$all"=>member_names}, :document_count=>{"$gte"=>0}).sort([:document_count, :asc]).limit(1).next_document
46
+
47
+
48
+ #since the operator used in the query was $all, having equal lengths in the original and returned
49
+ #member array means that they are identical, which means that regardless of the number of documents
50
+ #in the aggregation, it is the candidate we want. Otherwise, we'll check to see if we
51
+ #boil down the data further, or just make our soup with what we've got.
52
+ @attributes = @candidate_aggregation if @candidate_aggregation &&
53
+ (@candidate_aggregation["member_names"].length == member_names.length ||
54
+ @candidate_aggregation["document_count"] < self.class.min_records_to_reduce)
55
+
56
+ unless @attributes
57
+ @attributes = HashWithIndifferentAccess.new({:aggregation=>@cubicle_metadata.aggregation.name,
58
+ :member_names=>member_names,
59
+ :document_count=>-1})
60
+
61
+ #materialize the aggregation, and, if the operation was successful,
62
+ #register it as available for use by future queries
63
+ @attributes[:_id] = self.class.collection.insert(@attributes)
64
+ materialize!
65
+ end
66
+
67
+ end
68
+ end
69
+
70
+ def target_collection_name
71
+ "cubicle.aggregation.#{@cubicle_metadata.aggregation.name}._#{@attributes["_id"].to_s}"
72
+ end
73
+
74
+ def source_collection_name
75
+ if @candidate_aggregation
76
+ candidate = Cubicle::Aggregation::AggregationMetadata.new(@cubicle_metadata,@candidate_aggregation)
77
+ return candidate.target_collection_name
78
+ end
79
+ @cubicle_metadata.aggregation.target_collection_name
80
+ end
81
+
82
+ def member_names; @attributes["member_names"] || []; end
83
+
84
+ def materialized?
85
+ document_count >= 0 &&
86
+ (!@collection.blank? ||
87
+ Cubicle.mongo.database.collection_names.include?(target_collection_name))
88
+ end
89
+
90
+ def collection
91
+ @collection ||= Cubicle.mongo.database[target_collection_name] if materialized?
92
+ end
93
+
94
+ def collection=(collection)
95
+ @collection = collection
96
+ end
97
+
98
+ def document_count
99
+ @attributes["document_count"]
100
+ end
101
+
102
+ protected
103
+ def update_document_count!(new_doc_count)
104
+ self.class.collection.update({:_id=>@attributes[:_id]}, "$set"=>{:document_count=>new_doc_count})
105
+ @attributes["document_count"]=new_doc_count
106
+ end
107
+
108
+ def materialize!
109
+ unless materialized?
110
+ exec_query = @cubicle_metadata.aggregation.query(member_names + [:all_measures],
111
+ :source_collection=>source_collection_name,
112
+ :defer=>true)
113
+ self.collection = @cubicle_metadata.aggregation.aggregator.aggregate(exec_query,
114
+ :target_collection=>target_collection_name)
115
+ end
116
+ update_document_count!(@collection.count) unless @collection.blank?
117
+ end
118
+
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,30 @@
1
+ module Cubicle
2
+ module Aggregation
3
+ class CubicleMetadata
4
+
5
+ class << self
6
+
7
+ def collection
8
+ @@collection_name ||= "cubicle.metadata"
9
+ Cubicle.mongo.database[@@collection_name]
10
+ end
11
+ def collection=(collection_name)
12
+ @@collection_name = collection_name
13
+ end
14
+ end
15
+
16
+ attr_reader :aggregation
17
+ def initialize(aggregation)
18
+ @aggregation = aggregation
19
+ end
20
+
21
+ def aggregation_for(member_names = [])
22
+ AggregationMetadata.new(self,member_names)
23
+ end
24
+
25
+ def expire!
26
+ AggregationMetadata.expire(self)
27
+ end
28
+ end
29
+ end
30
+ end
@@ -11,7 +11,7 @@ module Cubicle
11
11
  def target_collection_name(collection_name = nil)
12
12
  return nil if transient?
13
13
  return @target_name = collection_name if collection_name
14
- @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
14
+ @target_name ||= "cubicle.fact.#{name.blank? ? source_collection_name : name.underscore}"
15
15
  end
16
16
  alias target_collection_name= target_collection_name
17
17
 
@@ -1,3 +1,3 @@
1
1
  module Cubicle
2
- VERSION = '0.1.20'
2
+ VERSION = '0.1.21'
3
3
  end
@@ -0,0 +1,89 @@
1
+ require "test_helper"
2
+
3
+ class AggregationMetadataTest < ActiveSupport::TestCase
4
+ context "Class level collection names" do
5
+ should "use appropriate default values for the aggregations collection" do
6
+ assert_equal "cubicle.metadata.aggregations", Cubicle::Aggregation::AggregationMetadata.collection.name
7
+ end
8
+ end
9
+
10
+ context "AggregationMetadata.update_document_count" do
11
+ setup do
12
+ @cm = Cubicle::Aggregation::CubicleMetadata.new(DefectCubicle)
13
+ end
14
+ should "update the document count for a given aggregation instance" do
15
+ agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
16
+ agg_info.send(:update_document_count!,1024)
17
+ assert_equal 1024, agg_info.document_count
18
+ assert_equal false,agg_info.materialized?
19
+ end
20
+ end
21
+
22
+ context "AggregationMetadata#new" do
23
+ setup do
24
+ @cm = Cubicle::Aggregation::CubicleMetadata.new(DefectCubicle)
25
+ end
26
+ should "create initialize an instance of AggregationMetadata in the database" do
27
+ agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region])
28
+ assert /cubicle.aggregation.DefectCubicle._+/ =~ agg_info.target_collection_name
29
+ assert_equal [:product,:region], agg_info.member_names
30
+ assert_equal false, agg_info.materialized?
31
+ assert_nil agg_info.collection
32
+ end
33
+ should "fetch an existing aggregation from the database" do
34
+ ag = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region])
35
+ ag.send(:update_document_count!,1)
36
+ col_name = ag.target_collection_name
37
+ assert_equal col_name, Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region]).target_collection_name
38
+ end
39
+ should "ignore an existing aggregation that does not satisfy all fields" do
40
+ ag = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
41
+ ag.send(:update_document_count!,1)
42
+ col_name = ag.target_collection_name
43
+ assert col_name != Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region]).target_collection_name
44
+ end
45
+ should "select an existing aggregation with rows below the minimum threshold instead of creating a new one" do
46
+ agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region,:operator])
47
+ agg_info.send(:update_document_count!,99)
48
+ assert_equal agg_info.target_collection_name, Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product]).target_collection_name
49
+ end
50
+
51
+ should "ignore an existing aggregation with too many rows, but store that aggregation as a candidate source for use when materializing the aggregation" do
52
+ agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region,:operator])
53
+ agg_info.send(:update_document_count!,101)
54
+ new_agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
55
+ assert agg_info.target_collection_name != new_agg_info.target_collection_name
56
+ assert_equal agg_info.target_collection_name, new_agg_info.source_collection_name
57
+ end
58
+ end
59
+
60
+ context "AggregationMetadata#materialize!" do
61
+ should "run a map reduce and produce the resulting collection" do
62
+ Defect.create_test_data
63
+ DefectCubicle.process
64
+ @cm = Cubicle::Aggregation::CubicleMetadata.new(DefectCubicle)
65
+ agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
66
+ aggregation = agg_info.collection
67
+ assert_not_nil aggregation
68
+ assert aggregation.count > 0
69
+ assert_equal aggregation.count, agg_info.document_count
70
+ end
71
+ end
72
+
73
+ context "AggregationMetadata.expire" do
74
+ should "drop any aggregation columns and remove metadata rows from the database" do
75
+ Defect.create_test_data
76
+ DefectCubicle.process
77
+ @cm = Cubicle::Aggregation::CubicleMetadata.new(DefectCubicle)
78
+ agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
79
+
80
+ assert Cubicle.mongo.database.collection_names.include?(agg_info.target_collection_name)
81
+ assert Cubicle::Aggregation::AggregationMetadata.collection.find(:aggregation=>"DefectCubicle").count > 0
82
+
83
+ Cubicle::Aggregation::AggregationMetadata.expire(@cm)
84
+
85
+ assert !Cubicle.mongo.database.collection_names.include?(agg_info.target_collection_name)
86
+ assert_equal 0, Cubicle::Aggregation::AggregationMetadata.collection.find(:aggregation=>"DefectCubicle").count
87
+ end
88
+ end
89
+ end