cubicle 0.1.20 → 0.1.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,8 @@
1
+ ==0.1.21
2
+ *Added metadata tables in the database for cubicle to manage aggregation info. This was necessary because previously
3
+ I was trying to overload the collection name with metadata, which was making the names longer than MongoDB could support
4
+ and causing errors. This change will enable richer monitoring, profiling, and optimization in the near future.
5
+
1
6
  ==0.1.20
2
7
  *Updated to work with mongo driver 1.0 (and therefore latest versions of MongoMapper)
3
8
 
data/cubicle.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{cubicle}
8
- s.version = "0.1.20"
8
+ s.version = "0.1.21"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Nathan Stults"]
12
- s.date = %q{2010-05-03}
12
+ s.date = %q{2010-05-05}
13
13
  s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
14
14
  s.email = %q{hereiam@sonic.net}
15
15
  s.extra_rdoc_files = [
@@ -27,7 +27,9 @@ Gem::Specification.new do |s|
27
27
  "lib/cubicle/aggregation.rb",
28
28
  "lib/cubicle/aggregation/ad_hoc.rb",
29
29
  "lib/cubicle/aggregation/aggregation_manager.rb",
30
+ "lib/cubicle/aggregation/aggregation_metadata.rb",
30
31
  "lib/cubicle/aggregation/aggregation_view.rb",
32
+ "lib/cubicle/aggregation/cubicle_metadata.rb",
31
33
  "lib/cubicle/aggregation/dsl.rb",
32
34
  "lib/cubicle/aggregation/map_reduce_helper.rb",
33
35
  "lib/cubicle/bucketized_dimension.rb",
@@ -55,6 +57,8 @@ Gem::Specification.new do |s|
55
57
  "lib/cubicle/version.rb",
56
58
  "test/config/database.yml",
57
59
  "test/cubicle/aggregation/ad_hoc_test.rb",
60
+ "test/cubicle/aggregation/aggregation_metadata_test.rb",
61
+ "test/cubicle/aggregation/cubicle_metadata_test.rb",
58
62
  "test/cubicle/bucketized_dimension_test.rb",
59
63
  "test/cubicle/cubicle_aggregation_test.rb",
60
64
  "test/cubicle/cubicle_query_test.rb",
@@ -77,6 +81,8 @@ Gem::Specification.new do |s|
77
81
  s.summary = %q{Pseudo-Multi Dimensional analysis / simplified aggregation for MongoDB in Ruby (NOLAP ;))}
78
82
  s.test_files = [
79
83
  "test/cubicle/aggregation/ad_hoc_test.rb",
84
+ "test/cubicle/aggregation/aggregation_metadata_test.rb",
85
+ "test/cubicle/aggregation/cubicle_metadata_test.rb",
80
86
  "test/cubicle/bucketized_dimension_test.rb",
81
87
  "test/cubicle/cubicle_aggregation_test.rb",
82
88
  "test/cubicle/cubicle_query_test.rb",
data/lib/cubicle.rb CHANGED
@@ -25,6 +25,8 @@ dir = File.dirname(__FILE__)
25
25
  "data/level",
26
26
  "data/hierarchy",
27
27
  "data/table",
28
+ "aggregation/aggregation_metadata",
29
+ "aggregation/cubicle_metadata",
28
30
  "aggregation/aggregation_view",
29
31
  "aggregation/aggregation_manager",
30
32
  "aggregation/map_reduce_helper",
@@ -2,10 +2,11 @@ module Cubicle
2
2
  module Aggregation
3
3
  class AggregationManager
4
4
 
5
- attr_reader :aggregation
5
+ attr_reader :aggregation, :metadata
6
6
 
7
7
  def initialize(aggregation)
8
8
  @aggregation = aggregation
9
+ @metadata = Cubicle::Aggregation::CubicleMetadata.new(aggregation)
9
10
  end
10
11
 
11
12
  def database
@@ -32,26 +33,28 @@ module Cubicle
32
33
 
33
34
  find_options[:sort] = prepare_order_by(query)
34
35
  filter = {}
36
+
35
37
  if query == aggregation || query.transient?
36
- aggregation = aggregate(query,options)
38
+ reduction = aggregate(query,options)
37
39
  else
38
40
  process_if_required
39
- aggregation = aggregation_for(query)
41
+ agg_data = aggregation_for(query)
42
+ reduction = agg_data.collection
40
43
  #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
41
44
  #otherwise, a second map reduce is required to reduce the data set one last time
42
- if query.all_dimensions? || ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
45
+ if query.all_dimensions? || (agg_data.member_names - query.member_names - [:all_measures]).blank?
43
46
  filter = prepare_filter(query,options[:where] || {})
44
47
  else
45
- aggregation = aggregate(query,:source_collection=>aggregation.name)
48
+ reduction = aggregate(query,:source_collection=>agg_data.target_collection_name)
46
49
  end
47
50
  end
48
51
 
49
- if aggregation.blank?
50
- Cubicle::Data::Table.new(query,[],0) if aggregation == []
52
+ if reduction.blank?
53
+ Cubicle::Data::Table.new(query,[],0)
51
54
  else
52
- count = aggregation.count
53
- results = aggregation.find(filter,find_options).to_a
54
- aggregation.drop if aggregation.name =~ /^tmp.mr.*/
55
+ count = reduction.count
56
+ results = reduction.find(filter,find_options).to_a
57
+ reduction.drop if reduction.name =~ /^tmp.mr.*/
55
58
  Cubicle::Data::Table.new(query, results, count)
56
59
  end
57
60
 
@@ -77,44 +80,42 @@ module Cubicle
77
80
 
78
81
  def expire!
79
82
  collection.drop
80
- expire_aggregations!
83
+ @metadata.expire!
81
84
  end
82
85
 
83
- protected
86
+ def aggregate(query,options={})
87
+ view = AggregationView.new(aggregation,query)
84
88
 
85
- def aggregation_collection_names
86
- database.collection_names.select {|col_name|col_name=~/#{aggregation.target_collection_name}_aggregation_(.*)/}
87
- end
89
+ map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
88
90
 
89
- def expire_aggregations!
90
- aggregation_collection_names.each{|agg_col|database[agg_col].drop}
91
- end
91
+ options[:finalize] = MapReduceHelper.generate_finalize_function(query)
92
+ options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
93
+
94
+ query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
92
95
 
93
- def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
94
- #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
95
- #this next ugly bit of algebra will create 2d array containing a list of the dimension names in each existing aggregation
96
- existing = existing_aggregations.map do |agg_col_name|
97
- agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
96
+ target_collection = options.delete(:target_collection)
97
+ target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
98
+
99
+ options[:out] = target_collection unless target_collection.blank? || query.transient?
100
+
101
+ #This is defensive - some tests run without ever initializing any collections
102
+ unless database.collection_names.include?(query.source_collection_name)
103
+ Cubicle.logger.info "No collection was found in the database with a name of #{query.source_collection_name}"
104
+ return []
98
105
  end
99
106
 
100
- #This will select all the aggregations that contain ALL of the desired dimension names
101
- #we are sorting by length because the aggregation with the least number of members
102
- #is likely to be the most efficient data source as it will likely contain the smallest number of rows.
103
- #this will not always be true, and situations may exist where it is rarely true, however the alternative
104
- #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
105
- #but until there is some reason to believe the aggregation caching process needs be highly performant,
106
- #this should do for now.
107
- candidates = existing.select {|candidate|(dimension_names - candidate).blank?}.sort {|a,b|a.length <=> b.length}
107
+ result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
108
108
 
109
- #If no suitable aggregation exists to base this one off of,
110
- #we'll just use the base cubes aggregation collection
111
- return target_collection_name if candidates.blank?
112
- "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
109
+ ensure_indexes(target_collection,query.dimension_names) if target_collection
113
110
 
111
+ result
114
112
  end
115
113
 
114
+ protected
115
+
116
+
116
117
  def aggregation_for(query)
117
- return collection if query.all_dimensions?
118
+ #return collection if query.all_dimensions?
118
119
 
119
120
  aggregation_query = query.clone
120
121
  #If the query needs to filter on a field, it had better be in the aggregation...if it isn't a $where filter...
@@ -122,15 +123,7 @@ module Cubicle
122
123
  filter.keys.each {|filter_key|aggregation_query.select(filter_key) unless filter_key=~/\$where/} unless filter.blank?
123
124
 
124
125
  dimension_names = aggregation_query.dimension_names.sort
125
- agg_col_name = "#{aggregation.target_collection_name}_aggregation_#{dimension_names.join('.')}"
126
-
127
- unless database.collection_names.include?(agg_col_name)
128
- source_col_name = find_best_source_collection(dimension_names)
129
- exec_query = aggregation.query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
130
- aggregate(exec_query, :target_collection=>agg_col_name)
131
- end
132
-
133
- database[agg_col_name]
126
+ @metadata.aggregation_for(dimension_names)
134
127
  end
135
128
 
136
129
  def ensure_indexes(collection_name,dimension_names)
@@ -146,31 +139,6 @@ module Cubicle
146
139
  #col.create_index(dimension_names.map{|dim|[dim,1]})
147
140
  end
148
141
 
149
- def aggregate(query,options={})
150
- view = AggregationView.new(aggregation,query)
151
-
152
- map, reduce = MapReduceHelper.generate_map_function(query), MapReduceHelper.generate_reduce_function
153
-
154
- options[:finalize] = MapReduceHelper.generate_finalize_function(query)
155
- options["query"] = expand_template(prepare_filter(query,options[:where] || {}),view)
156
-
157
- query.source_collection_name = options.delete(:source_collection) || query.source_collection_name || aggregation.source_collection_name
158
-
159
- target_collection = options.delete(:target_collection)
160
- target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
161
-
162
- options[:out] = target_collection unless target_collection.blank? || query.transient?
163
-
164
- #This is defensive - some tests run without ever initializing any collections
165
- return [] unless database.collection_names.include?(query.source_collection_name)
166
-
167
- result = database[query.source_collection_name].map_reduce(expand_template(map, view),reduce,options)
168
-
169
- ensure_indexes(target_collection,query.dimension_names) if target_collection
170
-
171
- result
172
- end
173
-
174
142
  def expand_template(template,view)
175
143
  return "" unless template
176
144
  return Mustache.render(template,view) if template.is_a?(String)
@@ -0,0 +1,121 @@
1
module Cubicle
  module Aggregation
    # Describes one cached aggregation of a cubicle: the members it contains,
    # its document count, and the name of the collection that holds its data.
    # Each instance is backed by a metadata document stored in
    # AggregationMetadata.collection.
    class AggregationMetadata
      class << self

        # Returns the Mongo collection that stores aggregation metadata documents.
        # Unless a name was assigned through collection=, the name defaults to
        # "<CubicleMetadata collection name>.aggregations" and is resolved once,
        # on first use.
        def collection
          @@aggregations_collection_name ||= "#{Cubicle::Aggregation::CubicleMetadata.collection.name}.aggregations"
          Cubicle.mongo.database[@@aggregations_collection_name]
        end

        # Overrides the name of the metadata collection.
        def collection=(collection_name)
          @@aggregations_collection_name = collection_name
        end

        # Document-count threshold (default 100). An existing aggregation holding
        # fewer documents than this is reused as-is instead of being reduced
        # further into a new, narrower aggregation.
        def min_records_to_reduce
          @min_records_to_reduce ||= 100
        end

        def min_records_to_reduce=(min)
          @min_records_to_reduce = min
        end

        # Drops every cached aggregation collection belonging to the given
        # aggregation and removes its metadata documents.
        #
        # aggregation - a String or Symbol name, a CubicleMetadata, or any object
        #               responding to #name.
        #
        # Returns the list of collection names that was scanned.
        def expire(aggregation)
          aggregation_name = case aggregation
                             when String then aggregation
                             when Symbol then aggregation.to_s
                             when Cubicle::Aggregation::CubicleMetadata then aggregation.aggregation.name
                             else aggregation.name
                             end

          # Anchor and escape the pattern so that only collections named
          # "cubicle.aggregation.<name>._<id>" are dropped. Without this, expiring
          # "Defect" would also drop the collections of "DefectCubicle".
          # Matching stays case-insensitive.
          owned_by_aggregation = /\Acubicle\.aggregation\.#{Regexp.escape(aggregation_name)}\._/i

          database = Cubicle.mongo.database
          scanned = database.collection_names
          scanned.each do |col|
            database[col].drop if col =~ owned_by_aggregation
          end

          # The metadata rows are removed exactly once, after the data collections,
          # and regardless of how many collections the database contains.
          collection.remove(:aggregation => aggregation_name)
          scanned
        end
      end

      # cubicle_metadata               - the owning CubicleMetadata.
      # member_names_or_attribute_hash - either an attribute Hash (an already known
      #                                  metadata document, used as-is) or a list of
      #                                  member names to find or build an aggregation for.
      #
      # When given member names, this looks up the smallest existing aggregation
      # containing all of them. If none is suitable, a new metadata document is
      # inserted and the aggregation is materialized immediately.
      def initialize(cubicle_metadata, member_names_or_attribute_hash)
        @cubicle_metadata = cubicle_metadata
        if member_names_or_attribute_hash.kind_of?(Hash)
          @attributes = member_names_or_attribute_hash
        else
          member_names = member_names_or_attribute_hash

          # Only aggregations with document_count >= 0 are considered; a count of
          # -1 marks a metadata row whose aggregation has not been materialized.
          @candidate_aggregation = self.class.collection.find(
            :aggregation    => @cubicle_metadata.aggregation.name,
            :member_names   => {"$all" => member_names},
            :document_count => {"$gte" => 0}
          ).sort([:document_count, :asc]).limit(1).next_document

          #since the operator used in the query was $all, having equal lengths in the original and returned
          #member array means that they are identical, which means that regardless of the number of documents
          #in the aggregation, it is the candidate we want. Otherwise, we'll check to see if we
          #boil down the data further, or just make our soup with what we've got.
          @attributes = @candidate_aggregation if @candidate_aggregation &&
              (@candidate_aggregation["member_names"].length == member_names.length ||
               @candidate_aggregation["document_count"] < self.class.min_records_to_reduce)

          unless @attributes
            @attributes = HashWithIndifferentAccess.new({:aggregation    => @cubicle_metadata.aggregation.name,
                                                         :member_names   => member_names,
                                                         :document_count => -1})

            #materialize the aggregation, and, if the operation was successful,
            #register it as available for use by future queries
            @attributes[:_id] = self.class.collection.insert(@attributes)
            materialize!
          end
        end
      end

      # Name of the collection holding this aggregation's data, derived from the
      # aggregation name and the metadata document's _id.
      def target_collection_name
        "cubicle.aggregation.#{@cubicle_metadata.aggregation.name}._#{@attributes["_id"].to_s}"
      end

      # Name of the collection this aggregation is reduced from: the candidate
      # aggregation found during initialization when there is one, otherwise the
      # cubicle's own target collection.
      def source_collection_name
        if @candidate_aggregation
          candidate = Cubicle::Aggregation::AggregationMetadata.new(@cubicle_metadata, @candidate_aggregation)
          return candidate.target_collection_name
        end
        @cubicle_metadata.aggregation.target_collection_name
      end

      def member_names; @attributes["member_names"] || []; end

      # True when a document count has been recorded and the data collection is
      # either already held by this instance or present in the database.
      def materialized?
        document_count >= 0 &&
            (!@collection.blank? ||
             Cubicle.mongo.database.collection_names.include?(target_collection_name))
      end

      # The Mongo collection holding the aggregated data, or nil when the
      # aggregation has not been materialized.
      def collection
        @collection ||= Cubicle.mongo.database[target_collection_name] if materialized?
      end

      def collection=(collection)
        @collection = collection
      end

      def document_count
        @attributes["document_count"]
      end

      protected

      # Persists and caches a new document count for this aggregation.
      def update_document_count!(new_doc_count)
        # Use the string key, like every other attribute read in this class, so the
        # lookup also works when @attributes is a string-keyed document fetched
        # from Mongo rather than a HashWithIndifferentAccess.
        self.class.collection.update({:_id => @attributes["_id"]}, "$set" => {:document_count => new_doc_count})
        @attributes["document_count"] = new_doc_count
      end

      # Runs the aggregation query into target_collection_name (unless already
      # materialized) and records the resulting document count.
      def materialize!
        unless materialized?
          exec_query = @cubicle_metadata.aggregation.query(member_names + [:all_measures],
                                                           :source_collection => source_collection_name,
                                                           :defer => true)
          self.collection = @cubicle_metadata.aggregation.aggregator.aggregate(exec_query,
                                                                               :target_collection => target_collection_name)
        end
        # NOTE(review): aggregate presumably returns an empty result when nothing
        # could be reduced; in that case the count is left untouched - confirm.
        update_document_count!(@collection.count) unless @collection.blank?
      end

    end
  end
end
@@ -0,0 +1,30 @@
1
module Cubicle
  module Aggregation
    # Entry point to the metadata kept for a single cubicle aggregation.
    # It hands out AggregationMetadata instances for requested member sets
    # and expires all of them on demand.
    class CubicleMetadata
      # The aggregation this metadata belongs to.
      attr_reader :aggregation

      # Returns the Mongo collection used for cubicle metadata. The collection
      # name falls back to "cubicle.metadata" when none has been assigned.
      def self.collection
        @@collection_name = "cubicle.metadata" unless defined?(@@collection_name) && @@collection_name
        Cubicle.mongo.database[@@collection_name]
      end

      # Assigns the name of the collection used for cubicle metadata.
      def self.collection=(collection_name)
        @@collection_name = collection_name
      end

      def initialize(aggregation)
        @aggregation = aggregation
      end

      # Builds the AggregationMetadata for the given member names
      # (an empty list by default).
      def aggregation_for(member_names = [])
        AggregationMetadata.new(self, member_names)
      end

      # Expires everything AggregationMetadata tracks for this cubicle.
      def expire!
        AggregationMetadata.expire(self)
      end
    end
  end
end
@@ -11,7 +11,7 @@ module Cubicle
11
11
  def target_collection_name(collection_name = nil)
12
12
  return nil if transient?
13
13
  return @target_name = collection_name if collection_name
14
- @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
14
+ @target_name ||= "cubicle.fact.#{name.blank? ? source_collection_name : name.underscore}"
15
15
  end
16
16
  alias target_collection_name= target_collection_name
17
17
 
@@ -1,3 +1,3 @@
1
1
  module Cubicle
2
- VERSION = '0.1.20'
2
+ VERSION = '0.1.21'
3
3
  end
@@ -0,0 +1,89 @@
1
+ require "test_helper"
2
+
3
# Exercises AggregationMetadata against the test database: default collection
# naming, creation and lookup of metadata documents, materialization, and expiry.
class AggregationMetadataTest < ActiveSupport::TestCase
  context "Class level collection names" do
    should "use appropriate default values for the aggregations collection" do
      assert_equal "cubicle.metadata.aggregations", Cubicle::Aggregation::AggregationMetadata.collection.name
    end
  end

  context "AggregationMetadata.update_document_count" do
    setup do
      @cm = Cubicle::Aggregation::CubicleMetadata.new(DefectCubicle)
    end
    should "update the document count for a given aggregation instance" do
      agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
      # update_document_count! is protected, hence the send
      agg_info.send(:update_document_count!,1024)
      assert_equal 1024, agg_info.document_count
      # a recorded count alone does not make the aggregation materialized
      assert_equal false, agg_info.materialized?
    end
  end

  context "AggregationMetadata#new" do
    setup do
      @cm = Cubicle::Aggregation::CubicleMetadata.new(DefectCubicle)
    end
    should "initialize an instance of AggregationMetadata in the database" do
      agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region])
      # anchored, with literal dots, so only "cubicle.aggregation.DefectCubicle._<id>" passes
      assert_match(/\Acubicle\.aggregation\.DefectCubicle\._\S+\z/, agg_info.target_collection_name)
      assert_equal [:product,:region], agg_info.member_names
      assert_equal false, agg_info.materialized?
      assert_nil agg_info.collection
    end
    should "fetch an existing aggregation from the database" do
      ag = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region])
      ag.send(:update_document_count!,1)
      col_name = ag.target_collection_name
      assert_equal col_name, Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region]).target_collection_name
    end
    should "ignore an existing aggregation that does not satisfy all fields" do
      ag = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
      ag.send(:update_document_count!,1)
      col_name = ag.target_collection_name
      assert_not_equal col_name, Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region]).target_collection_name
    end
    should "select an existing aggregation with rows below the minimum threshold instead of creating a new one" do
      agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region,:operator])
      # 99 is just below the default min_records_to_reduce of 100
      agg_info.send(:update_document_count!,99)
      assert_equal agg_info.target_collection_name, Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product]).target_collection_name
    end

    should "ignore an existing aggregation with too many rows, but store that aggregation as a candidate source for use when materializing the aggregation" do
      agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product,:region,:operator])
      # 101 is just above the default min_records_to_reduce of 100
      agg_info.send(:update_document_count!,101)
      new_agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
      assert_not_equal agg_info.target_collection_name, new_agg_info.target_collection_name
      assert_equal agg_info.target_collection_name, new_agg_info.source_collection_name
    end
  end

  context "AggregationMetadata#materialize!" do
    should "run a map reduce and produce the resulting collection" do
      Defect.create_test_data
      DefectCubicle.process
      @cm = Cubicle::Aggregation::CubicleMetadata.new(DefectCubicle)
      agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])
      aggregation = agg_info.collection
      assert_not_nil aggregation
      assert aggregation.count > 0
      assert_equal aggregation.count, agg_info.document_count
    end
  end

  context "AggregationMetadata.expire" do
    should "drop any aggregation collections and remove metadata rows from the database" do
      Defect.create_test_data
      DefectCubicle.process
      @cm = Cubicle::Aggregation::CubicleMetadata.new(DefectCubicle)
      agg_info = Cubicle::Aggregation::AggregationMetadata.new(@cm,[:product])

      assert Cubicle.mongo.database.collection_names.include?(agg_info.target_collection_name)
      assert Cubicle::Aggregation::AggregationMetadata.collection.find(:aggregation=>"DefectCubicle").count > 0

      Cubicle::Aggregation::AggregationMetadata.expire(@cm)

      assert !Cubicle.mongo.database.collection_names.include?(agg_info.target_collection_name)
      assert_equal 0, Cubicle::Aggregation::AggregationMetadata.collection.find(:aggregation=>"DefectCubicle").count
    end
  end
end