cubicle 0.1.0

@@ -0,0 +1,2 @@
+ == 0.1.0
+ * Initial release
@@ -0,0 +1,22 @@
+ The MIT LICENSE
+
+ Copyright (c) 2010 Gabriel Horner
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,173 @@
+ == Overview
+ Cubicle is a Ruby library and DSL for automating the generation, execution and caching of common aggregations of MongoDB documents. Cubicle was born from the need to easily extract simple, processed statistical views of raw, real-time business data collected from a variety of systems.
+
+ == Motivation
+ Aggregating data in MongoDB, unlike in relational or multidimensional (OLAP) databases, requires writing custom reduce functions in JavaScript for the simplest cases, and full map reduce functions in more complex cases, even for common aggregations like sums or averages.
+
+ While writing such map reduce functions isn't particularly difficult, it can be tedious and error prone, and it requires switching from Ruby to JavaScript. Cubicle presents a simplified Ruby DSL for generating the JavaScript required for most common aggregation tasks, and also handles processing, caching and presenting the results. JavaScript is still required in some cases, but is limited to constructing simple data transformation expressions.
+
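To make the generated reduce step less mysterious, here is an illustrative translation of what a typical generated reduce function does, expressed in plain Ruby rather than JavaScript. This is a sketch for explanation only, not Cubicle's code; the `reduce_values` name is hypothetical.

```ruby
# Illustrative only: each emitted value is a hash of partial sums/counts
# for one group of documents; the reduce step folds them together by
# summing every non-nil numeric field, key by key.
def reduce_values(values)
  values.inject({}) do |output, doc|
    doc.each do |field, val|
      output[field] = (output[field] || 0) + val unless val.nil?
    end
    output
  end
end

emitted = [
  {:total_hands => 1, :total_winnings => 20.0},
  {:total_hands => 1, :total_winnings => 5.5}
]
reduce_values(emitted) # => {:total_hands=>2, :total_winnings=>25.5}
```

Cubicle's job is to generate the JavaScript equivalent of this (plus the matching emit and finalize steps) from the DSL, so you never write it by hand.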
+
+ == Approach
+ Cubicle breaks the task of defining and executing aggregation queries into two pieces. The first is the Cubicle, an analysis-friendly 'view' of the underlying collection, which defines the attributes that will be used for grouping (dimensions), the numerical fields that will be aggregated (measures), and the kind of aggregation that will be applied to each measure. The second piece of the Cubicle puzzle is a Query, which specifies which particular dimensions and measures will be selected from the Cubicle for a given data request, along with how the resulting data will be filtered, ordered, paginated and organized.
+
+ == Install
+
+ Install the gem with:
+
+     gem install cubicle
+ or
+     sudo gem install cubicle
+
+
+ == An Example
+ Given a document with the following structure (I'm using MongoMapper here as the ORM, but neither MongoMapper nor any other ORM is required by Cubicle; it works directly with the Mongo Ruby driver):
+
+     class PokerHand
+       include MongoMapper::Document
+
+       key :match_date, String # we use iso8601 strings for dates, but Time works too
+       key :table, String
+       key :winner, Person # {:person=>{:name=>'Jim', :address=>{...}...}}
+       key :winning_hand, Symbol # :two_of_a_kind, :full_house, etc...
+       key :amount_won, Float
+     end
+
+ == The Cubicle
+ Here's how a Cubicle designed to analyze these poker hands might look:
+
+     class PokerHandCubicle
+       extend Cubicle
+
+       date :date, :field_name=>'match_date'
+       dimension :month, :expression=>'this.match_date.substring(0,7)'
+       dimension :year, :expression=>'this.match_date.substring(0,4)'
+
+       dimensions :table, :winning_hand
+       dimension :winner, :field_name=>'winner.name'
+
+       count :total_hands, :expression=>'true'
+       count :total_draws, :expression=>'this.winning_hand=="draw"'
+       sum :total_winnings, :field_name=>'amount_won'
+       avg :avg_winnings, :field_name=>'amount_won'
+
+       ratio :draw_pct, :total_draws, :total_hands
+     end
+
+ == The Queries
+ And here's how you would use this cubicle to query the underlying data:
+
+     aggregated_data = PokerHandCubicle.query
+
+
+ Issuing an empty query to the cubicle like the one above will return a list of measures, aggregated according to type, for each combination of dimensions. However, once a Cubicle has been defined, you can query it in many different ways. For instance, if you wanted to see the total number of hands by type, you could do this:
+
+     hands_by_type = PokerHandCubicle.query { select :winning_hand, :total_hands }
+
+ Or, if you wanted to see the total amount won with a full house, by player, sorted by amount won, you could do this:
+
+     full_houses_by_player = PokerHandCubicle.query do
+       select :winner
+       where :winning_hand=>'full_house'
+       order_by :total_winnings
+     end
+
+ Cubicle can return your data in a hierarchy (tree) too, if you want. If you wanted to see the percentage of hands resulting in a draw, by date and then by table, you could do this:
+
+     draw_pct_by_date_by_table = PokerHandCubicle.query do
+       select :draw_pct
+       by :date, :table
+     end
+
+ In addition to the basic query primitives such as select, where, by and order_by, Cubicle has a basic understanding of time. As long as you have a dimension in your cubicle defined using 'date', and that dimension is either an iso8601 string or an instance of Time, you can easily perform some handy date filtering in the DSL:
+
+     winnings_last_30_days_by_player = PokerHandCubicle.query do
+       select :winner, :total_winnings
+       for_the_last 30.days
+     end
+
+ or
+
+     winnings_ytd_by_player = PokerHandCubicle.query do
+       select :winner, :all_measures
+       year_to_date
+       order_by [:total_winnings, :desc]
+     end
+
+ == The Results
+ Cubicle data is returned either as an array of hashes, for a two dimensional query, or as a hash-based tree, the leaves of which are arrays of hashes, for hierarchical data (via queries using the 'by' keyword).
+
+ Flat data:
+     [{:dimension1=>'d1', :dimension2=>'d1', :measure1=>'1.0'},{:dimension1=>'d2'...
+
+ Hierarchical data 2 levels deep:
+     {'dimension1'=>{'dimension2'=>[{:measure1=>'1.0'}],'dimension2b'=>[{:measure1=>'2.0'}],...
+
+ When you request two dimensional data (i.e. you do not use the 'by' keyword), you can transform your two dimensional data set into hierarchical data at any time using the 'hierarchize' method, specifying the dimensions you want to use in your hierarchy:
+
+     data = MyCubicle.query {select :date, :name, :all_measures}
+     hierarchized_data = data.hierarchize :name, :date
+
+ This will result in a hash containing each unique value for :name in your source collection; for each unique :name, a hash containing each unique :date seen with that :name; and for each :date, an array of hashes keyed by the measures in your Cubicle.
+
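To make that shape concrete, here is a minimal, hand-rolled sketch of the same grouping in plain Ruby. It is illustrative only, not Cubicle's implementation (Cubicle's Data class does this for you); the `hierarchize` function and sample rows here are hypothetical.

```ruby
# Illustrative only: recursively group flat rows into the nested-hash shape
# described above. Each level of the tree is keyed by one dimension's values;
# the leaves are arrays of hashes containing the remaining (measure) fields.
def hierarchize(rows, *dims)
  first, *rest = dims
  return rows if first.nil?
  rows.group_by { |row| row[first] }.each_with_object({}) do |(key, group), tree|
    # Drop the grouped dimension from each row, then recurse on the rest
    leaves = group.map { |row| row.reject { |k, _| k == first } }
    tree[key] = rest.empty? ? leaves : hierarchize(leaves, *rest)
  end
end

rows = [
  {:name => 'Jim', :date => '2010-03', :total_winnings => 10.0},
  {:name => 'Jim', :date => '2010-04', :total_winnings => 5.0},
  {:name => 'Bob', :date => '2010-03', :total_winnings => 7.5}
]

tree = hierarchize(rows, :name, :date)
tree['Jim']['2010-04'] # => [{:total_winnings=>5.0}]
```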
+ == Caching & Processing
+ Map reduce operations, especially over large or very large data sets, can take time to complete. Sometimes a long time. However, very often what you want to do is present a graph or a table of numbers to an interactive user on your website, and you probably don't want to make them wait while all your bazillion rows of raw data are reduced down to the handful of numbers they are actually interested in seeing. For this reason, Cubicle has two modes of operation: the default mode, in which aggregations are automatically cached until YourCubicle.expire! or YourCubicle.process is called, and transient mode, which bypasses the caching mechanisms and executes real time queries against the raw source data.
+
+ == Preprocessed Aggregations
+ The expected normal mode of operation, however, is cached mode. While far from anything actually resembling an OLAP cube, Cubicle was designed to process data on some periodic schedule and provide quick access to stored, aggregated data in between each processing, much like a real OLAP cube. Also reminiscent of an OLAP cube, Cubicle will cache aggregations at various levels of resolution, depending on the aggregations that were set up when defining a cubicle and on what queries are executed. For example, if a given Cubicle has three dimensions, Name, City and Date, then when the Cubicle is processed it will calculate aggregated measures for each combination of values on those three fields. If a query is executed that only requires Name and Date, then Cubicle will aggregate and cache measures by just Name and Date. If a third query asks for just Name, then Cubicle will create an aggregation based just on Name, but rather than using the original data source with its many rows, it will execute its map reduce against the previously cached Name-Date aggregation, which by definition will have fewer rows and should therefore perform faster. If you know ahead of time which aggregations your queries will need, you can specify them in the Cubicle definition, like this:
+
+     class MyCubicle
+       extend Cubicle
+
+       dimension :name
+       dimension :date
+       ...
+       avg :my_measure
+       ...
+       aggregation :name, :date
+       aggregation :name
+       aggregation :date
+     end
+
+ When aggregations are specified in this way, Cubicle will pre-aggregate your data for each of the specified combinations of dimensions whenever MyCubicle.process is called, eliminating the first-hit penalty that would otherwise be incurred when Cubicle encountered a given aggregation for the first time.
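The re-aggregation trick described above works because any cached aggregation whose dimensions are a superset of the requested ones can serve as a source, and the one with the fewest dimensions will usually have the fewest rows. The selection idea can be sketched in a few lines of plain Ruby (illustrative only, not the library's code; `best_source` is a hypothetical name):

```ruby
# Pick the smallest cached aggregation that contains ALL requested dimensions.
# Returns nil when no cached aggregation qualifies, in which case the base
# collection would have to be used instead.
def best_source(requested, cached)
  candidates = cached.select { |dims| (requested - dims).empty? }
  candidates.min_by { |dims| dims.length } # fewest dimensions ~ fewest rows
end

cached = [%w[name city date], %w[name date], %w[date]]
best_source(%w[name], cached)       # => ["name", "date"]
best_source(%w[name city], cached)  # => ["name", "city", "date"]
best_source(%w[table], cached)      # => nil; fall back to the base collection
```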
+
+ == Transient (Real Time) Queries
+ Sometimes you may not want to query cached data. In our application, we are using Cubicle to provide data for our performance management Key Performance Indicators (KPIs), which consist of both a historical trend of a particular metric and the current, real time value of the same metric for, say, the current month or a rolling 30 day period. For performance reasons, we fetch our trend, which is usually 12 months, from cached data, but we want up to the minute freshness for our real time KPI values, so we need to query the live source data. To accomplish this with Cubicle, you simply insert 'transient!' into your query definition, like so:
+
+     MyCubicle.query do
+       transient!
+       select :this, :that, :the_other
+     end
+
+ This will bypass cached aggregations and execute a map reduce query directly against the cubicle's source collection.
+
+ == MongoMapper plugin
+ If MongoMapper is detected, Cubicle will use its connection to MongoDB. Additionally, Cubicle will install a simple MongoMapper plugin for doing ad hoc, non-cached aggregations on the fly from a MongoMapper document, like this:
+
+     MyMongoMapperModel.aggregate do
+       dimension :my_dimension
+       count :measure1
+       avg :measure2
+     end.query { order_by [:measure2, :desc]; limit 10 }
+
+ == Limitations
+ * Cubicle cannot currently cause child documents to be emitted in the map reduce. This is a pretty big limitation, and will be resolved shortly.
+ * Documentation is non-existent. This is being worked on (heard that one before?)
+ * Test coverage is OK, but the tests could be better organized
+ * Code needs to be modularized a bit; the main classes are a bit hairy at the moment
+
+
+ == Credits
+
+ * Alex Wang and Patrick Gannon, for features, fixes & testing
+
+ == Bugs/Issues
+ Please report them {on github}[http://github.com/plasticlizard/cubicle/issues].
+
+ == Todo
+ * Support for emitting child / descendant documents
+ * Work with the native Date type, instead of just iso strings
+ * Hirb support
+ * Member format strings
+ * Auto-generation of a cubicle definition based on existing keys/key types in the MongoMapper plugin
+ * DSL support for topcount and bottomcount queries
+ * Support for a 'duration' aggregation that will calculate durations between timestamps
@@ -0,0 +1,49 @@
+ require 'rubygems'
+ require 'rake'
+ require 'rake/testtask'
+ require 'rake/rdoctask'
+
+ begin
+   require 'jeweler'
+   require File.dirname(__FILE__) + "/lib/cubicle/version"
+
+   Jeweler::Tasks.new do |s|
+     s.name = "cubicle"
+     s.version = Cubicle::VERSION
+     s.summary = "Pseudo-Multi Dimensional analysis / simplified aggregation for MongoDB in Ruby (NOLAP ;))"
+     s.description = "Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model."
+     s.email = "hereiam@sonic.net"
+     s.homepage = "http://github.com/PlasticLizard/cubicle"
+     s.authors = ["Nathan Stults"]
+     s.has_rdoc = false #=> Should be true, someday
+     s.extra_rdoc_files = ["README.rdoc", "LICENSE.txt"]
+     s.files = FileList["[A-Z]*", "{bin,lib,test}/**/*"]
+
+     s.add_dependency('activesupport', '>= 2.3')
+     s.add_dependency('mongo', '>= 0.18.3')
+
+     s.add_development_dependency('shoulda', '2.10.3')
+   end
+
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler not available. Install it for jeweler-related tasks with: sudo gem install jeweler"
+ end
+
+ Rake::TestTask.new do |t|
+   t.libs << 'lib' << 'test'
+   t.pattern = 'test/**/*_test.rb'
+   t.verbose = false
+ end
+
+ Rake::RDocTask.new do |rdoc|
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = 'cubicle'
+   rdoc.options << '--line-numbers' << '--inline-source'
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
+ task :default => :test
@@ -0,0 +1,91 @@
+ # Generated by jeweler
+ # DO NOT EDIT THIS FILE DIRECTLY
+ # Instead, edit Jeweler::Tasks in rakefile, and run the gemspec command
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{cubicle}
+   s.version = "0.1.0"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["Nathan Stults"]
+   s.date = %q{2010-03-13}
+   s.description = %q{Cubicle provides a dsl and aggregation caching framework for automating the generation, execution and caching of map reduce queries when using MongoDB in Ruby. Cubicle also includes a MongoMapper plugin for quickly performing ad-hoc, multi-level group-by queries against a MongoMapper model.}
+   s.email = %q{hereiam@sonic.net}
+   s.extra_rdoc_files = [
+     "LICENSE.txt",
+     "README.rdoc"
+   ]
+   s.files = [
+     "CHANGELOG.rdoc",
+     "LICENSE.txt",
+     "README.rdoc",
+     "Rakefile",
+     "cubicle.gemspec",
+     "lib/cubicle.rb",
+     "lib/cubicle/aggregation.rb",
+     "lib/cubicle/calculated_measure.rb",
+     "lib/cubicle/data.rb",
+     "lib/cubicle/data_level.rb",
+     "lib/cubicle/date_time.rb",
+     "lib/cubicle/dimension.rb",
+     "lib/cubicle/measure.rb",
+     "lib/cubicle/member.rb",
+     "lib/cubicle/member_list.rb",
+     "lib/cubicle/mongo_environment.rb",
+     "lib/cubicle/mongo_mapper/aggregate_plugin.rb",
+     "lib/cubicle/query.rb",
+     "lib/cubicle/ratio.rb",
+     "lib/cubicle/support.rb",
+     "lib/cubicle/version.rb",
+     "test/config/database.yml",
+     "test/cubicle/cubicle_aggregation_test.rb",
+     "test/cubicle/cubicle_data_level_test.rb",
+     "test/cubicle/cubicle_data_test.rb",
+     "test/cubicle/cubicle_query_test.rb",
+     "test/cubicle/cubicle_test.rb",
+     "test/cubicle/mongo_mapper/aggregate_plugin_test.rb",
+     "test/cubicles/defect_cubicle.rb",
+     "test/log/test.log",
+     "test/models/defect.rb",
+     "test/test_helper.rb"
+   ]
+   s.homepage = %q{http://github.com/PlasticLizard/cubicle}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.6}
+   s.summary = %q{Pseudo-Multi Dimensional analysis / simplified aggregation for MongoDB in Ruby (NOLAP ;))}
+   s.test_files = [
+     "test/cubicle/cubicle_aggregation_test.rb",
+     "test/cubicle/cubicle_data_level_test.rb",
+     "test/cubicle/cubicle_data_test.rb",
+     "test/cubicle/cubicle_query_test.rb",
+     "test/cubicle/cubicle_test.rb",
+     "test/cubicle/mongo_mapper/aggregate_plugin_test.rb",
+     "test/cubicles/defect_cubicle.rb",
+     "test/models/defect.rb",
+     "test/test_helper.rb",
+     "examples/cubicles/poker_hand_cubicle.rb",
+     "examples/models/poker_hand.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+       s.add_runtime_dependency(%q<activesupport>, [">= 2.3"])
+       s.add_runtime_dependency(%q<mongo>, [">= 0.18.3"])
+       s.add_development_dependency(%q<shoulda>, ["= 2.10.3"])
+     else
+       s.add_dependency(%q<activesupport>, [">= 2.3"])
+       s.add_dependency(%q<mongo>, [">= 0.18.3"])
+       s.add_dependency(%q<shoulda>, ["= 2.10.3"])
+     end
+   else
+     s.add_dependency(%q<activesupport>, [">= 2.3"])
+     s.add_dependency(%q<mongo>, [">= 0.18.3"])
+     s.add_dependency(%q<shoulda>, ["= 2.10.3"])
+   end
+ end
+
@@ -0,0 +1,16 @@
+ class PokerHandCubicle
+   extend Cubicle
+
+   date :date, :field_name=>'match_date'
+   dimension :month, :expression=>'this.match_date.substring(0,7)'
+   dimension :year, :expression=>'this.match_date.substring(0,4)'
+
+   dimensions :table, :winner, :winning_hand
+
+   count :total_hands, :expression=>'true'
+   count :total_draws, :expression=>'this.winning_hand=="draw"'
+   count :royal_flushes, :expression=>'this.winning_hand=="royal_flush"'
+   sum :total_winnings, :field_name=>'amount_won'
+   avg :avg_winnings, :field_name=>'amount_won'
+
+   ratio :royal_flush_pct, :royal_flushes, :total_hands
+ end
@@ -0,0 +1,3 @@
+ class CookieSale
+   # Code here
+ end
@@ -0,0 +1,389 @@
+ require "rubygems"
+ require "active_support"
+ require "mongo"
+
+ dir = File.dirname(__FILE__)
+ ["mongo_environment",
+  "member",
+  "member_list",
+  "measure",
+  "calculated_measure",
+  "dimension",
+  "ratio",
+  "query",
+  "data_level",
+  "data",
+  "aggregation",
+  "date_time",
+  "support"].each {|lib| require File.join(dir,'cubicle',lib)}
+
+ require File.join(dir,"cubicle","mongo_mapper","aggregate_plugin") if defined?(MongoMapper::Document)
+
+ module Cubicle
+
+   def self.register_cubicle_directory(directory_path, recursive=true)
+     # "**/*.rb" recurses into subdirectories; "*.rb" stays at the top level
+     searcher = "#{recursive ? "**/*" : "*"}.rb"
+     Dir[File.join(directory_path,searcher)].each {|cubicle| require cubicle}
+   end
+
+   def self.mongo
+     @mongo ||= defined?(::MongoMapper::Document) ? ::MongoMapper : MongoEnvironment
+   end
+
+   def self.logger
+     Cubicle.mongo.logger
+   end
+
+   def database
+     Cubicle.mongo.database
+   end
+
+   def collection
+     database[target_collection_name]
+   end
+
+   def transient?
+     @transient ||= false
+   end
+
+   def transient!
+     @transient = true
+   end
+
+   def expire!
+     collection.drop
+     expire_aggregations!
+   end
+
+   def aggregations
+     @aggregations ||= []
+   end
+
+   #DSL
+   def source_collection_name(collection_name = nil)
+     return @source_collection = collection_name if collection_name
+     @source_collection ||= name.chomp("Cubicle").chomp("Cube").underscore.pluralize
+   end
+   alias source_collection_name= source_collection_name
+
+   def target_collection_name(collection_name = nil)
+     return nil if transient?
+     return @target_name = collection_name if collection_name
+     @target_name ||= "#{name.blank? ? source_collection_name : name.underscore.pluralize}_cubicle"
+   end
+   alias target_collection_name= target_collection_name
+
+   def dimension(*args)
+     dimensions << Cubicle::Dimension.new(*args)
+     dimensions[-1]
+   end
+
+   def dimension_names
+     dimensions.map {|dim| dim.name.to_s}
+   end
+
+   def dimensions(*args)
+     return (@dimensions ||= Cubicle::MemberList.new) if args.length < 1
+     args = args[0] if args.length == 1 && args[0].is_a?(Array)
+     args.each {|dim| dimension dim}
+     @dimensions
+   end
+
+   def measure(*args)
+     measures << Measure.new(*args)
+     measures[-1]
+   end
+
+   def measures(*args)
+     return (@measures ||= Cubicle::MemberList.new) if args.length < 1
+     args = args[0] if args.length == 1 && args[0].is_a?(Array)
+     args.each {|m| measure m}
+     @measures
+   end
+
+   def count(*args)
+     options = args.extract_options!
+     options[:aggregation_method] = :count
+     measure(*(args << options))
+   end
+
+   def average(*args)
+     options = args.extract_options!
+     options[:aggregation_method] = :average
+     measure(*(args << options))
+     #Averaged fields need a count of non-null values to properly calculate the average
+     args[0] = "#{args[0]}_count".to_sym
+     count *args
+   end
+   alias avg average
+
+   def sum(*args)
+     options = args.extract_options!
+     options[:aggregation_method] = :sum
+     measure(*(args << options))
+   end
+
+   def ratio(member_name, numerator, denominator)
+     measures << Ratio.new(member_name, numerator, denominator)
+   end
+
+   def aggregation(*member_list)
+     member_list = member_list[0] if member_list[0].is_a?(Array)
+     aggregations << member_list
+   end
+
+   def time_dimension(*args)
+     return (@time_dimension ||= nil) unless args.length > 0
+     @time_dimension = dimension(*args)
+   end
+   alias time_dimension= time_dimension
+   alias date time_dimension
+   alias time time_dimension
+
+   def find_member(member_name)
+     @dimensions[member_name] ||
+       @measures[member_name]
+   end
+
+   def query(*args, &block)
+     options = args.extract_options!
+     query = Cubicle::Query.new(self)
+     query.source_collection_name = options.delete(:source_collection) if options[:source_collection]
+     query.select(*args) if args.length > 0
+     if block_given?
+       block.arity == 1 ? (yield query) : (query.instance_eval(&block))
+     end
+     query.select_all unless query.selected?
+     return query if options[:defer]
+     results = execute_query(query, options)
+     #If the 'by' clause was used in the query,
+     #we'll hierarchize by the members indicated,
+     #as the next step would otherwise almost certainly
+     #need to be a call to hierarchize anyway.
+     query.respond_to?(:by) && query.by.length > 0 ? results.hierarchize(*query.by) : results
+   end
+
+   def execute_query(query, options={})
+     find_options = {
+       :limit=>query.limit || 0,
+       :skip=>query.offset || 0
+     }
+     find_options[:sort] = prepare_order_by(query)
+
+     filter = {}
+     if query == self || query.transient?
+       aggregation = aggregate(query, options)
+     else
+       process_if_required
+       aggregation = aggregation_for(query)
+       #if the query exactly matches the aggregation in terms of requested members, we can issue a simple find
+       #otherwise, a second map reduce is required to reduce the data set one last time
+       if ((aggregation.name.split("_")[-1].split(".")) - query.member_names - [:all_measures]).blank?
+         filter = prepare_filter(query, options[:where] || {})
+       else
+         aggregation = aggregate(query, :source_collection=>collection.name)
+       end
+     end
+
+     count = aggregation.count
+     data = aggregation.find(filter, find_options).to_a
+     #temporary map reduce output collections are dropped once they have been read
+     aggregation.drop if aggregation.name =~ /^tmp.mr.*/
+     Cubicle::Data.new(query, data, count)
+   end
+
+   def process(options={})
+     Cubicle.logger.info "Processing #{self.name} @ #{Time.now}"
+     start = Time.now
+     expire!
+     aggregate(self, options)
+     #Sort desc by length of array, so that the largest
+     #aggregations are processed first, hopefully increasing efficiency
+     #of the processing step
+     aggregations.sort!{|a,b| b.length<=>a.length}
+     aggregations.each do |member_list|
+       agg_start = Time.now
+       aggregation_for(query(:defer=>true){select member_list})
+       Cubicle.logger.info "#{self.name} aggregation #{member_list.inspect} processed in #{Time.now-agg_start} seconds"
+     end
+     duration = Time.now - start
+     Cubicle.logger.info "#{self.name} processed @ #{Time.now} in #{duration} seconds."
+   end
+
+   protected
+
+   def aggregation_collection_names
+     database.collection_names.select {|col_name| col_name =~ /#{target_collection_name}_aggregation_(.*)/}
+   end
+
+   def expire_aggregations!
+     aggregation_collection_names.each {|agg_col| database[agg_col].drop}
+   end
+
+   def find_best_source_collection(dimension_names, existing_aggregations=self.aggregation_collection_names)
+     #format of aggregation collection names is source_cubicle_collection_aggregation_dim1.dim2.dim3.dimn
+     #this next bit of algebra will create a 2d array containing the list of dimension names in each existing aggregation
+     existing = existing_aggregations.map do |agg_col_name|
+       agg_col_name.gsub("#{target_collection_name}_aggregation_","").split(".")
+     end
+
+     #This will select all the aggregations that contain ALL of the desired dimension names.
+     #We are sorting by length because the aggregation with the least number of members
+     #is likely to be the most efficient data source, as it will likely contain the smallest number of rows.
+     #This will not always be true, and situations may exist where it is rarely true; however, the alternative
+     #is to actually count rows of candidates, which seems a bit wasteful. Of course only the profiler knows,
+     #but until there is some reason to believe the aggregation caching process needs to be highly performant,
+     #this should do for now.
+     candidates = existing.select {|candidate| (dimension_names - candidate).blank?}.sort {|a,b| a.length <=> b.length}
+
+     #If no suitable aggregation exists to base this one off of,
+     #we'll just use the base cube's aggregation collection
+     return target_collection_name if candidates.blank?
+     "#{target_collection_name}_aggregation_#{candidates[0].join('.')}"
+   end
+
+   def aggregation_for(query)
+     return collection if query.all_dimensions?
+
+     aggregation_query = query.clone
+     #If the query needs to filter on a field, it had better be in the aggregation...unless it is a $where filter...
+     filter = (query.where if query.respond_to?(:where))
+     filter.keys.each {|filter_key| aggregation_query.select(filter_key) unless filter_key =~ /^\$.*/} unless filter.blank?
+
+     dimension_names = aggregation_query.dimension_names.sort
+     agg_col_name = "#{target_collection_name}_aggregation_#{dimension_names.join('.')}"
+
+     unless database.collection_names.include?(agg_col_name)
+       source_col_name = find_best_source_collection(dimension_names)
+       exec_query = query(dimension_names + [:all_measures], :source_collection=>source_col_name, :defer=>true)
+       aggregate(exec_query, :target_collection=>agg_col_name)
+     end
+
+     database[agg_col_name]
+   end
+
+   def ensure_indexes(collection_name, dimension_names)
+     #an index for each dimension
+     dimension_names.each {|dim| database[collection_name].create_index([dim, Mongo::ASCENDING])}
+     #and a composite
+     database[collection_name].create_index(dimension_names)
+   end
+
+   def aggregate(query, options={})
+     map, reduce = generate_map_function(query), generate_reduce_function
+     options[:finalize] = generate_finalize_function(query)
+     options["query"] = prepare_filter(query, options[:where] || {})
+
+     query.source_collection_name ||= source_collection_name
+
+     target_collection = options.delete(:target_collection)
+     target_collection ||= query.target_collection_name if query.respond_to?(:target_collection_name)
+
+     options[:out] = target_collection unless target_collection.blank? || query.transient?
+
+     #This is defensive - some tests run without ever initializing any collections
+     return [] unless database.collection_names.include?(query.source_collection_name)
+
+     result = database[query.source_collection_name].map_reduce(map, reduce, options)
+
+     ensure_indexes(target_collection, query.dimension_names) if target_collection
+
+     result
+   end
+
+   def prepare_filter(query, filter={})
+     filter.merge!(query.where) if query.respond_to?(:where) && query.where
+     filter.stringify_keys!
+     transient = (query.transient? || query == self)
+     filter.keys.each do |key|
+       next if key =~ /^\$.*/
+       prefix = nil
+       prefix = "_id" if (member = self.dimensions[key])
+       prefix = "value" if (member = self.measures[key]) unless member
+
+       raise "You supplied a filter that does not appear to be a member of this cubicle: #{key}" unless member
+
+       filter_value = filter.delete(key)
+       if transient
+         if member.expression_type == :javascript
+           filter_name = "$where"
+           filter_value = "'#{filter_value}'" if filter_value.is_a?(String) || filter_value.is_a?(Symbol)
+           filter_value = "(#{member.expression})==#{filter_value}"
+         else
+           filter_name = member.expression
+         end
+       else
+         filter_name = "#{prefix}.#{member.name}"
+       end
+       filter[filter_name] = filter_value
+     end
+     filter
+   end
+
+   def prepare_order_by(query)
+     order_by = []
+     query.order_by.each do |order|
+       prefix = "_id" if (member = self.dimensions[order[0]])
+       prefix = "value" if (member = self.measures[order[0]]) unless member
+       raise "You supplied a field to order_by that does not appear to be a member of this cubicle: #{order[0]}" unless member
+       order_by << ["#{prefix}.#{order[0]}", order[1]]
+     end
+     order_by
+   end
+
+   def process_if_required
+     return if database.collection_names.include?(target_collection_name)
+     process
+   end
+
+   def generate_keys_string(query)
+     "{#{query.dimensions.map {|dim| dim.to_js_keys}.flatten.join(", ")}}"
+   end
+
+   def generate_values_string(query = self)
+     "{#{query.measures.map {|measure| measure.to_js_keys}.flatten.join(", ")}}"
+   end
+
+   def generate_map_function(query = self)
+     <<MAP
+ function(){emit(#{generate_keys_string(query)},#{generate_values_string(query)});}
+ MAP
+   end
+
+   def generate_reduce_function
+     <<REDUCE
+ function(key,values){
+   var output = {};
+   values.forEach(function(doc){
+     for(var key in doc){
+       if (doc[key] != null){
+         output[key] = output[key] || 0;
+         output[key] += doc[key];
+       }
+     }
+   });
+   return output;
+ }
+ REDUCE
+   end
+
+   def generate_finalize_function(query = self)
+     <<FINALIZE
+ function(key,value){
+   #{(query.measures.select {|m| m.aggregation_method == :average}).map do |m|
+       "value.#{m.name}=value.#{m.name}/value.#{m.name}_count;"
+     end.join("\n")}
+   #{(query.measures.select {|m| m.aggregation_method == :calculation}).map do |m|
+       "value.#{m.name}=#{m.expression};"
+     end.join("\n")}
+   return value;
+ }
+ FINALIZE
+   end
+ end