RubyGems - statlysis - Versions diffs - 0.0.2 → 0.0.3 - Mend

statlysis 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

data/README.markdown +18 -5
data/lib/statlysis.rb +11 -13
data/lib/statlysis/clock.rb +4 -0
data/lib/statlysis/configuration.rb +56 -28
data/lib/statlysis/constants.rb +12 -0
data/lib/statlysis/cron.rb +4 -6
data/lib/statlysis/cron/timely.rb +171 -0
data/lib/statlysis/cron/timely/multiple_dimensions.rb +52 -0
data/lib/statlysis/cron/timely/one_dimension.rb +60 -0
data/lib/statlysis/cron_set.rb +4 -3
data/lib/statlysis/javascript/count.rb +50 -15
data/lib/statlysis/map_reduce.rb +1 -1
data/lib/statlysis/utils.rb +6 -0
data/statlysis.gemspec +3 -3
data/test/config/database.yml +1 -2
data/test/helper.rb +15 -4
data/test/migrate/1_active_record.rb +1 -0
data/test/models/code_gist.rb +12 -0
data/test/models/eoe_log.rb +2 -4
data/test/test_daily_count.rb +4 -2
data/test/test_mapreduce.rb +8 -1
metadata +8 -8
data/lib/statlysis/cron/count.rb +0 -51
data/lib/statlysis/cron/count/dimensions.rb +0 -7
data/lib/statlysis/cron/count/timely.rb +0 -63

data/README.markdown CHANGED

@@ -10,11 +10,13 @@ Usage
 Statlysis.setup do
   set_database :statlysis
-  hourly :time_column => :t
+  daily CodeGist
+  hourly EoeLog, :time_column => :t # support custom time_column
   [EoeLog,
-   EoeLog.where(:ui => 0),
+   EoeLog.where(:ui => 0), # support query scope
    EoeLog.where(:ui => {"$ne" => 0}),
-   Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}),
+   Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}), # support collection name regexp
    EoeLog.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}),
   ].each do |s|
     daily s, :time_column => :t
@@ -45,8 +47,6 @@ TODO
 * Admin interface
 * statistical query api in Ruby and HTTP
 * Interacting with Javascript charting library, e.g. Highcharts, D3.
-* More tests
-* Add @criteria to MultipleDataset
 Statistical Process
@@ -68,6 +68,11 @@ Q: Why do you recommend using multiple collections to store logs rather than a s
 A: MongoDB can effectively reuse space freed by removing entire collections without leading to data fragmentation, see details at http://docs.mongodb.org/manual/use-cases/storing-log-data/#multiple-collections-single-database
+Q: In Mongodb, why use MapReduce instead of Aggregation?
+A: The result of aggregation pipeline is a document and is subject to the BSON Document size limit, which is currently 16 megabytes, see more details at http://docs.mongodb.org/manual/core/aggregation-pipeline/#pipeline
 Copyright
 -----------------------------------------------
 MIT. David Chen at eoe.cn.
@@ -91,3 +96,11 @@ Related
 ### Admin interface
 * http://three.kibana.org/ browser based analytics and search interface to Logstash and other timestamped data sets stored in ElasticSearch.
+### ETL
+* https://github.com/activewarehouse/activewarehouse-etl/
+* http://jisraelsen.github.io/drudgery/ ruby ETL DSL, support csv, sqlite3, ActiveRecord, without support time range
+* https://github.com/square/ETL Simply encapsulates the SQL procedures

data/lib/statlysis.rb CHANGED

@@ -20,34 +20,33 @@ require 'activerecord_idnamecache'
 module Rails; end
 require 'statlysis/constants'
+require 'statlysis/utils'
+require 'statlysis/configuration'
+require 'statlysis/common'
 module Statlysis
   class << self
     def setup &blk
       raise "Need to setup proc" if not blk
-      logger.info "Start to setup Statlysis"
+      logger.info "Start to setup Statlysis" if ENV['DEBUG']
       time_log do
         self.config.instance_exec(&blk)
       end
-      logger.info
     end
     def time_log text = nil
       t = Time.now
       logger.info text if text
       yield if block_given?
-      logger.info "Time spend #{(Time.now - t).round(2)} seconds."
-      logger.info "-" * 42
+      logger.info "Time spend #{(Time.now - t).round(2)} seconds." if ENV['DEBUG']
+      logger.info "-" * 42 if ENV['DEBUG']
     end
     # delagate config methods to Configuration
     def config; Configuration.instance end
     require 'active_support/core_ext/module/delegation.rb'
-    [:sequel, :set_database, :check_set_database,
-     :default_time_zone,
-     :set_tablename_default_pre, :tablename_default_pre
-    ].each do |sym|
+    Configuration::DelegateMethods.each do |sym|
       delegate sym, :to => :config
     end
@@ -56,18 +55,17 @@ module Statlysis
     def source_to_database_type; @_source_to_database_type ||= {} end
+    # 代理访问 各个时间类型的 crons
     def daily; CronSet.new(Statlysis.config.day_crons) end
     def hourly; CronSet.new(Statlysis.config.hour_crons) end
+    def always; CronSet.new(Statlysis.config.always_crons) end
   end
 end
-require 'statlysis/utils'
-require 'statlysis/configuration'
-require 'statlysis/common'
 require 'statlysis/timeseries'
+require 'statlysis/map_reduce'
 require 'statlysis/clock'
 require 'statlysis/rake'
 require 'statlysis/cron'
@@ -77,7 +75,7 @@ require 'statlysis/multiple_dataset'
 module Statlysis
   require 'short_inspect'
-  ShortInspect.apply_to Cron, CronSet, MultipleDataset
+  ShortInspect.apply_to Cron, MultipleDataset
   ShortInspect.apply_minimal_to ActiveRecord::Relation # lazy load
 end

data/lib/statlysis/clock.rb CHANGED

@@ -8,6 +8,8 @@ module Statlysis
     # feature is a string
     def initialize feature, default_time
       raise "Please assign default_time params" if not default_time
+      # init table & model
       cron.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")
       unless Statlysis.sequel.table_exists?(cron.stat_table_name)
         Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts.merge(:engine => "InnoDB") do
@@ -19,6 +21,8 @@ module Statlysis
       end
       h = Utils.setup_pattern_table_and_model cron.stat_table_name
       cron.stat_model = h[:model]
+      # init default_time
       cron.clock = cron.stat_model.find_or_create(:feature => feature)
       cron.clock.update :t => default_time if cron.current.nil?
       cron

data/lib/statlysis/configuration.rb CHANGED

@@ -9,16 +9,21 @@ module Statlysis
   class Configuration
     include Singleton
+    # variables
     attr_accessor :sequel, :default_time_columns, :default_time_zone, :database_opts, :tablename_default_pre
     attr_accessor :is_skip_database_index
-    TimeUnits.each {|unit| module_eval "attr_accessor :#{unit}_crons; self.instance.#{unit}_crons = []" }
-    [:realtime, :similar, :hotest].each do |sym|
-      sym = "#{sym}_crons"
-      attr_accessor sym; self.instance.send "#{sym}=", []
+    (TimeUnits + %W[always] + [:realtime, :similar, :hotest]).each do |unit|
+      sym = "#{unit}_crons"; attr_accessor sym; self.instance.send "#{sym}=", []
     end
     self.instance.send "tablename_default_pre=", "st"
     self.instance.send "is_skip_database_index=", false
+    DelegateMethods = [
+      :sequel, :set_database, :check_set_database,
+      :default_time_zone,
+      :set_tablename_default_pre, :tablename_default_pre
+    ]
     # 会在自动拼接统计数据库表名时去除这些时间字段
     def update_time_columns *columns
       self.default_time_columns ||= [:created_at, :updated_at]
@@ -26,43 +31,44 @@ module Statlysis
       self.default_time_columns = self.default_time_columns.uniq
     end
-    def set_database sym_or_hash
-      self.database_opts = if sym_or_hash.is_a? Symbol
-        YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
-      elsif Hash
-        sym_or_hash
+    def set_database obj
+      self.database_opts = case obj
+                           when Hash
+                             obj
+                           when Symbol, String
+                             YAML.load_file(Rails.root.join("config/database.yml"))[Rails.env].merge('database' => obj.to_s)
+                           else
+                             raise "Statlysis#set_database only support symbol or hash params"
+                           end
+      raise "database_opts should not be blank" if self.database_opts.blank?
+      # sqlite dont support regular creating database in mysql style
+      self.sequel = if (self.database_opts['adapter'].match(/sqlite/) && self.database_opts['database'].match(/\A:memory:\Z/)) # only for test envrionment
+        Sequel.sqlite
       else
-        raise "Statlysis#set_database only support symbol or hash params"
+        # create database, copied from http://stackoverflow.com/a/14435522/595618
+        require 'mysql2'
+        mysql2_client = Mysql2::Client.new(self.database_opts.except('database'))
+        mysql2_client.query("CREATE DATABASE IF NOT EXISTS #{self.database_opts['database']}")
+        Sequel.connect(self.database_opts)
       end
-      self.sequel = Sequel.connect(self.database_opts)
       # 初始化键值model
       ["#{self.tablename_default_pre}_single_kvs", "#{self.tablename_default_pre}_single_kv_histories"].each do |tn|
         Utils.setup_pattern_table_and_model tn
       end
-      return self
-    end
-    def set_default_time_zone zone
-      self.default_time_zone = zone
       return self
     end
-    def set_tablename_default_pre str
-      self.tablename_default_pre = str.to_s
-    end
-    def daily  source, opts = {}; timely source, {:time_unit => :day }.merge(opts) end
-    def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
+    def set_default_time_zone zone; self.default_time_zone = zone; return self; end
+    def set_tablename_default_pre str; self.tablename_default_pre = str.to_s; return self end
     def check_set_database; raise "Please setup database first" if sequel.nil?  end
-    def timely source, opts
-      self.check_set_database
-      opts.reverse_merge! :time_column => :created_at, :time_unit => :day
-      t = Timely.new source, opts
-      self.send("#{opts[:time_unit]}_crons").push t
-    end
+    def daily  source, opts = {}; timely source, {:time_unit => :day}.merge(opts) end
+    def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
+    def always source, opts = {}; timely source, {:time_unit => false, :time_column => false}.merge(opts) end # IMPORTANT set :time_unit to false
     # the real requirement is to compute lastest items group by special pattens, like user_id, url prefix, ...
     def lastest_visits source, opts
@@ -101,5 +107,27 @@ module Statlysis
       self.similar_crons.push Similar.new(model_name, _p)
     end
+    private
+    def timely source, opts
+      self.check_set_database
+      opts.reverse_merge! :time_column => :created_at,
+                          :time_unit => :day,
+                          :sum_columns => [],
+                          :group_by_columns => [],
+                          :group_concat_columns => []
+      opts.each {|k, v| opts[k] = v.map(&:to_sym) if (Timely::SqlColumns - [:group_by_columns]).include?(k) } # Sequel use symbol as column names
+      # e.g. convert [:user_id] to [{:column_name => :user_id, :type => :integer}]
+      if (opts[:group_by_columns].first || {})[:type].blank?
+        opts[:group_by_columns] = opts[:group_by_columns].map {|i| {:column_name => i.to_sym, :type => :integer} }
+      end
+      t = Timely.new source, opts
+      self.send("#{opts[:time_unit] || 'always'}_crons").push t
+    end
   end
 end

data/lib/statlysis/constants.rb CHANGED

@@ -3,8 +3,20 @@
 module Statlysis
   TimeUnits = %w[hour day week month year]
   DateTime1970 = Time.zone.parse("19700101").in_time_zone
+  TimeUnitToTableSuffixHash = (TimeUnits + [false]).inject({}) {|_h, _i| _h[_i] = (_i ? _i[0] : 'a'); _h }
   DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
   DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
+  SymbolToClassInDataType = {
+    :string =>   String,
+    :datetime => DateTime,
+    :time =>     Time,
+    :integer =>  Integer,
+    :float =>    Float,
+    :text =>     String
+  }
 end

data/lib/statlysis/cron.rb CHANGED

@@ -29,10 +29,8 @@ module Statlysis
     def is_activerecord?; @source_type == :activerecord; end
     def is_mongoid?; @source_type == :mongoid; end
     def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
+    def _source; cron.multiple_dataset.sources.first end
-    def _source
-      cron.multiple_dataset.sources.first
-    end
     def source_where_array
       # TODO follow index seq
       a = _source.where("").where_values.map do |equality|
@@ -64,8 +62,6 @@ module Statlysis
       TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
     end
-    protected
     # 兼容采用整数类型作时间字段
     def is_time_column_integer?
       if is_activerecord?
@@ -74,11 +70,13 @@ module Statlysis
         false
       end
     end
+    def time_column?; !!@time_column end
+    def group_by_columns?; !!@group_by_columns.any? end
   end
 end
-require 'statlysis/cron/count'
+require 'statlysis/cron/timely'
 require 'statlysis/cron/top'

data/lib/statlysis/cron/timely.rb ADDED

@@ -0,0 +1,171 @@
+# encoding: UTF-8
+module Statlysis
+  class Timely < Cron
+    SqlColumns = [:sum_columns, :group_by_columns, :group_concat_columns]
+    attr_reader(*SqlColumns)
+    def initialize source, opts = {}
+      super
+      Statlysis.check_set_database
+      SqlColumns.each {|sym| instance_variable_set "@#{sym}", (opts[sym] || []) }
+      cron.setup_stat_model
+      cron
+    end
+    # 设置数据源，并保存结果入数据库
+    def run
+      (logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
+      raise "cron.output has no Enumerable" if not cron.output.class.included_modules.include? Enumerable
+      num_i = 0; num_add = 999
+      Statlysis.sequel.transaction do
+        # delete first in range
+        cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete if cron.time_column?
+        # TODO partial delete
+        cron.stat_model.where("").delete if cron.group_by_columns?
+        while !(_a = cron.output[num_i..(num_i+num_add)]).blank? do
+          # batch insert all
+          cron.stat_model.multi_insert _a
+          num_i += (num_add + 1)
+        end
+      end
+      return self
+    end
+    def setup_stat_model
+      cron.stat_table_name = Utils.normalise_name cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array, cron.group_by_columns.map {|i| i[:column_name] }, TimeUnitToTableSuffixHash[cron.time_unit]
+      raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
+      # create basic unchangeable table structure
+      if not Statlysis.sequel.table_exists?(cron.stat_table_name)
+        Statlysis.sequel.transaction do
+          Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
+            primary_key :id # Add one column at least in this block to avoid `SQLite3::SQLException: near ")": syntax error (Sequel::DatabaseError)`
+          end
+          Statlysis.sequel.add_column   cron.stat_table_name, :t, DateTime if cron.time_column? # alias for :time
+          # add count columns
+          if cron.time_column?
+            count_columns = [:timely_c, :totally_c] # alias for :count
+            count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
+          else
+            Statlysis.sequel.add_column cron.stat_table_name, :c, Integer # alias for :count
+          end
+        end
+      end
+      # add group_by columns & indexes
+      remodel
+      cron.stat_model.cron = cron
+      if cron.group_by_columns.any?
+        cron.group_by_columns.each do |_h|
+          if not cron.stat_model.columns.include?(_h[:column_name])
+            _h[:type] = SymbolToClassInDataType[_h[:type]] if _h[:type].is_a?(Symbol) # && (Statlysis.sequel.opts[:adapter] == :sqlite)
+            Statlysis.sequel.add_column cron.stat_table_name, _h[:column_name], _h[:type]
+          end
+        end
+      end
+      # add sum columns
+      remodel
+      sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
+        _result_cols.each do |_result_col|
+          if not cron.stat_model.columns.include?(_result_col)
+            # convert to Interger type in view if needed
+            Statlysis.sequel.add_column cron.stat_table_name, _result_col, Float
+          end
+        end
+      end
+      # Fix there should be uniq index name between tables
+      # `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
+      _group_by_columns_index_name = cron.group_by_columns.reject {|i| i[:no_index] }.map {|i| i[:column_name] }
+      _truncated_columns = _group_by_columns_index_name.dup # only String column
+      _group_by_columns_index_name = _group_by_columns_index_name.unshift :t if cron.time_column?
+      # TODO use https://github.com/german/redis_orm to support full string indexes
+      if !Statlysis.config.is_skip_database_index && _group_by_columns_index_name.any?
+        mysql_per_column_length_limit_in_one_index = (1000 / 3.0 / _group_by_columns_index_name.size.to_f).to_i
+        index_columns_str = _group_by_columns_index_name.map {|s| _truncated_columns.include?(s) ? "#{s.to_s}(#{mysql_per_column_length_limit_in_one_index})" : s.to_s }.join(", ")
+        index_columns_str = "(#{index_columns_str})"
+        begin
+          # NOTE mysql indexes key length limit is 1000 bytes
+          cron.stat_model.dataset.with_sql("CREATE INDEX #{Utils.sha1_name(_group_by_columns_index_name)} ON #{cron.stat_table_name} #{index_columns_str};").to_a
+        rescue => e
+          raise e if not e.inspect.match(/exists|duplicate/i)
+        end
+      end
+      # add group_concat column
+      remodel
+      if cron.group_concat_columns.any? && !cron.stat_model.columns.include?(:other_json)
+        Statlysis.sequel.add_column cron.stat_table_name, :other_json, :text
+      end
+      # add access to group_concat values in other_json
+      remodel.class_eval do
+        define_method("other_json_hash") do
+          @__other_json_hash_cache ||= (JSON.parse(self.other_json) rescue {})
+        end
+        cron.group_concat_columns.each do |_group_concat_column|
+          define_method("#{_group_concat_column}_values") do
+            self.other_json_hash[_group_concat_column.to_s]
+          end
+        end
+      end
+      remodel
+    end
+    def output
+      @output ||= (cron.group_by_columns.any? ? multiple_dimensions_output : one_dimension_output)
+    end
+    protected
+    def unit_range_query time, time_begin = nil
+      # time begin and end
+      tb = time
+      te = (time+1.send(cron.time_unit)-1.second)
+      tb, te = tb.to_i, te.to_i if is_time_column_integer?
+      tb = time_begin || tb
+      return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
+      return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc  [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
+    end
+    # e.g. {:fav_count=>[:timely_favcount_s, :totally_favcount_s]}
+    def sum_column_to_result_columns_hash
+      cron.sum_columns.inject({}) do |h, _col|
+        [:timely, :totally].each do |_pre|
+          h[_col] ||= []
+          h[_col] << Utils.normalise_name(_pre, _col, 's').to_sym
+        end
+        h
+      end
+    end
+    private
+    def remodel
+      n = cron.stat_table_name.to_s.singularize.camelize
+      cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
+        class ::#{n} < Sequel::Model;
+          self.set_dataset :#{cron.stat_table_name}
+          cattr_accessor :cron
+        end
+        #{n}
+      MODEL
+    end
+  end
+end
+require 'statlysis/cron/timely/one_dimension'
+require 'statlysis/cron/timely/multiple_dimensions'

data/lib/statlysis/cron/timely/multiple_dimensions.rb ADDED

@@ -0,0 +1,52 @@
+# encoding: UTF-8
+module Statlysis
+  class Timely
+    def multiple_dimensions_output
+      self.send "multiple_dimensions_output_with#{cron.time_column ? '' : 'out'}_time_column"
+    end
+    private
+    def multiple_dimensions_output_with_time_column
+      cron.time_range.map do |time|
+        raise DefaultNotImplementWrongMessage # TODO
+      end
+    end
+    # TODO encapsulate Mongoid MapReduce in collection output mode
+    # TODO support large dataset, e.g. a million.
+    def multiple_dimensions_output_without_time_column
+      mr = Javascript::MultiDimensionalCount.new(cron)
+      array = []
+      cron.multiple_dataset.sources.each do |_source|
+        # _source = _source.time_range # TODO
+        array += _source.map_reduce(mr.map_func, mr.reduce_func)
+                        .out(inline: 1) # TODO use replace mode
+                        .to_a.map do |i|
+                          v = i['value']
+                          _h = {:c => v['count']}
+                          cron.group_by_columns.each do |_group_by_column|
+                            _h[_group_by_column[:column_name]] = v[_group_by_column[:column_name].to_s]
+                          end
+                          _h[:other_json] = {}
+                          cron.group_concat_columns.each do |_group_concat_column|
+                            _h[:other_json][_group_concat_column] = v["#{_group_concat_column}_values"].inject({}) {|_h2, i2| _h2[i2] ||= 0; _h2[i2] += 1; _h2 }
+                          end
+                          _h[:other_json] = _h[:other_json].to_json
+                          _h
+                        end
+      end
+      array
+      # TODO support sum_columns
+    end
+  end
+end

data/lib/statlysis/cron/timely/one_dimension.rb ADDED

@@ -0,0 +1,60 @@
+# encoding: UTF-8
+module Statlysis
+  class Timely
+    # one dimension **must** have `time_column`, or there's nothing to do
+    #
+    # TODO add to FAQ
+    # * if you want to statistics one column through `group_by_columns`
+    # params, and dont need time column, then you could use `always` DSL.
+    #
+    def one_dimension_output
+      cron.time_range.map do |time|
+        _hash = {:t => time, :timely_c => 0, :totally_c => 0}
+        sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
+          _result_cols.each do |_result_col|
+            _hash[_result_col] = 0.0
+          end
+        end
+        # support multiple data sources
+        _first_source = nil
+        cron.multiple_dataset.sources.each do |s|
+          _t = DateTime1970
+          _t = is_time_column_integer? ? _t.to_i : _t
+          _scope_one = s.where(unit_range_query(time))
+          # TODO cache pre-result
+          _scope_all = s.where(unit_range_query(time, _t))
+          # 1. count
+          _hash[:timely_c]  += _scope_one.count
+          _hash[:totally_c] += _scope_all.count
+          # 2. sum
+          sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
+            _hash[_result_cols[0]] = _scope_one.map(&_sum_col).reduce(:+).to_f
+            _hash[_result_cols[1]] = _scope_all.map(&_sum_col).reduce(:+).to_f
+          end
+          # 3. group_concat
+          _other_json = {}
+          _other_json[:group_concat_columns] ||= {}
+          cron.group_concat_columns.each do |_group_concat_column|
+            _other_json[:group_concat_columns][_group_concat_column] = _scope_one.map(&_group_concat_column).uniq
+          end
+          _hash[:other_json] = _other_json.to_json
+          _first_source ||= s.where(unit_range_query(time))
+        end
+        logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{_hash[:timely_c]} totally_c:#{_hash[:totally_c]}" if ENV['DEBUG']
+        _hash
+      end.select {|r1| r1.except(:t, :other_json).values.reject {|r2| r2.zero? }.any? }
+    end
+  end
+end

data/lib/statlysis/cron_set.rb CHANGED

@@ -10,13 +10,14 @@ module Statlysis
       when Fixnum, Integer # support array idx access
         self.to_a[pattern]
       else
-        CronSet.new(select do |cron_set|
-          cron_set.multiple_dataset.name.to_s.match Regexp.new(pattern.to_s)
+        CronSet.new(self.select do |cron|
+          reg = Regexp.new(pattern.to_s)
+          cron.stat_table_name.match(reg) || cron.multiple_dataset.name.to_s.match(reg)
         end)
       end
     end
-    def last; [-1]; end
+    def last; self[-1]; end
     def run
       map(&:run)

data/lib/statlysis/javascript/count.rb CHANGED

@@ -4,34 +4,69 @@ module Statlysis
   module Javascript
     class MultiDimensionalCount
       attr_reader :map_func, :reduce_func
+      attr_reader :cron
-      def initialize *fields
-        fields = :_id if fields.blank?
-        emit_key = case fields
-        when Array
-          emit_key = fields.map {|dc| "#{dc}: this.#{dc}" }.join(", ")
-          emit_key = "{#{emit_key}}"
-        when Symbol, String
-          "this.#{fields}"
-        else
-          raise "Please assign symbol, string, or array of them"
+      def initialize cron
+        @cron = cron
+        # setup group_by_columns
+        _group_by_columns = :_id if cron.group_by_columns.blank?
+        _group_by_columns ||= cron.group_by_columns.map {|i| i[:column_name] }
+        emit_key = _group_by_columns.map {|dc| "#{dc}: this.#{dc}" }.join(", ")
+        emit_key = "{#{emit_key}}"
+        # TODO setup sum_columns
+        # default_emit_values_array += cron.sum_columns.map {|_sum_column| "#{_sum_column}: this.#{_sum_column}" }
+        # setup group_concat_columns
+        # NOTE if only one uniq emit value, then it'll never be appeared in reduce function
+        emit_values_init_array = cron.group_concat_columns.map do |_group_concat_column|
+          "emit_value.#{_group_concat_column}_values = [this.#{_group_concat_column}];\n"
         end
+        emit_values_init_array += (_group_by_columns.map do |_group_by_column|
+          "emit_value.#{_group_by_column} = this.#{_group_by_column};\n"
+        end)
         @map_func = "function() {
-          emit (#{emit_key}, {count: 1});
+          var emit_value = {count: 1};
+          #{emit_values_init_array.join}
+          emit (#{emit_key}, emit_value);
         }"
+        # sum_init_values = cron.sum_columns.map {|_sum_column| "#{_sum_column} = 0.0" }
+        # sum_init_values = "var #{sum_init_values};" if cron.sum_columns.any?
+        # 如果使用Hash，将导致group_concat最终的数目和group_by数目不一致，因为多个任务并行时会导致覆盖(常见于个数多的分类，一个的则不会有这个问题），而可并行化的数组则不会。
+        group_concat_values_init_array = cron.group_concat_columns.map {|_group_concat_column| "reducedObject.#{_group_concat_column}_values = [];" }
+        group_concat_values_process_array = cron.group_concat_columns.map do |_group_concat_column|
+          "reducedObject.#{_group_concat_column}_values = reducedObject.#{_group_concat_column}_values.concat(v['#{_group_concat_column}_values']);\n"
+        end
+        group_by_values_process_array = _group_by_columns.map do |_group_by_column|
+          "reducedObject.#{_group_by_column} = v.#{_group_by_column};\n"
+        end
+        # emit value in map func should be the same structure as the
+        # return value in reduce func, see more details in
+        # http://rickosborne.org/download/SQL-to-MongoDB.pdf and
+        # http://docs.mongodb.org/manual/tutorial/perform-incremental-map-reduce/
         @reduce_func = "function(key, values) {
-          var count = 0;
+          var reducedObject = key;
+          reducedObject.count = 0;
+          #{group_concat_values_init_array.join}
           values.forEach(function(v) {
-            count += v['count'];
+            reducedObject.count += v['count'];
+            #{group_by_values_process_array.join}
+            #{group_concat_values_process_array.join}
           });
-          return {count: count};
+          return reducedObject;
         }"
-        self
+        return self
       end
     end
   end
 end

data/lib/statlysis/map_reduce.rb CHANGED

@@ -1,6 +1,6 @@
 # encoding: utf-8
-require 'javascript/count'
+require 'statlysis/javascript/count'
 module Statlysis
   class MapReduce

data/lib/statlysis/utils.rb CHANGED

@@ -35,6 +35,12 @@ module Statlysis
         {:table => tn, :model => str.constantize}
       end
+      def normalise_name *name
+        Array(name).flatten.compact.map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
+      end
+      def sha1_name name; Digest::SHA1.hexdigest Array(name).map(&:to_s).join end
     end
   end
 end

data/statlysis.gemspec CHANGED

@@ -4,13 +4,13 @@ $:.push File.expand_path("../lib", __FILE__)
 Gem::Specification.new do |s|
   s.name          = 'statlysis'
-  s.version       = '0.0.2'
-  s.date          = '2013-07-26'
+  s.version       = '0.0.3'
+  s.date          = '2013-12-03'
   s.summary       = File.read("README.markdown").split(/===+/)[1].strip.split("\n")[0]
   s.description   = s.summary
   s.authors       = ["David Chen"]
   s.email         = 'mvjome@gmail.com'
-  s.homepage      = 'https://github.com/eoecn/statlysis'
+  s.homepage      = 'https://github.com/SunshineLibrary/statlysis'
   s.license       = 'MIT'
   s.files         = `git ls-files`.split("\n")

data/test/config/database.yml CHANGED

@@ -4,6 +4,5 @@ production: &defaults
   encoding: utf8
   collation: utf8_general_ci
   database: ":memory:"
-statlysis:
+development:
   <<: *defaults
-#  database: statlysis

data/test/helper.rb CHANGED

@@ -12,6 +12,7 @@ require 'test/unit'
 $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
 $LOAD_PATH.unshift File.dirname(__FILE__) # test dirs
+require 'pry-debugger'
 # load mongoid setup
 require 'mongoid'
@@ -22,10 +23,11 @@ require 'statlysis'
 # load rails
 def Rails.root; Pathname.new(File.expand_path('../.', __FILE__)) end
+def Rails.env; 'development' end
 require 'sqlite3'
 # load ActiveRecord setup
-Statlysis.set_database :statlysis
+Statlysis.set_database ":memory:"
 Statlysis.config.is_skip_database_index = true
 ActiveRecord::Base.establish_connection(Statlysis.config.database_opts.merge("adapter" => "sqlite3"))
 Dir[File.expand_path("../migrate/*.rb", __FILE__).to_s].each { |f| require f }
@@ -35,13 +37,20 @@ Dir[File.expand_path("../models/*.rb", __FILE__).to_s].each { |f| require f }
 # copied from http://stackoverflow.com/questions/4410794/ruby-on-rails-import-data-from-a-csv-file/4410880#4410880
 require 'csv'
 csv = CSV.parse(File.read(File.expand_path('../data/code_gists_20130724.csv', __FILE__)), :headers => true) # data from code.eoe.cn
-csv.each {|row| CodeGist.create!(row.to_hash) }
+csv.each do |row|
+  _h = row.to_hash.merge(:fav_count => rand(5).to_i)
+  CodeGist.create! _h
+  _h[:category_id] = rand(10).to_i + 1
+  CodeGistMongoid.create! _h
+end
 Statlysis.setup do
   hourly EoeLog, :time_column => :t
-  daily  CodeGist
+  daily  CodeGist, :sum_columns => [:fav_count], :group_concat_columns => [:user_id]
+  always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}], :group_concat_columns => [:user_id]
+  always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}, {:column_name => :category_id, :type => :integer}], :group_concat_columns => [:user_id]
   [EoeLog,
    EoeLog.where(:do => 3),
@@ -50,6 +59,8 @@ Statlysis.setup do
   ].each do |s|
     daily s, :time_column => :t
   end
-  cron = Statlysis.daily['mul'][1]
+  cron1 = Statlysis.daily['mul'][1]
+  cron2 = Statlysis.daily['cod'][0]
+  cron3 = Statlysis.always['code']['mongoid'][0]
   require 'pry-debugger';binding.pry
 end

data/test/migrate/1_active_record.rb CHANGED

@@ -4,5 +4,6 @@ class CreateActiveRecord < ActiveRecord::Migration
     t.integer :user_id
     t.timestamps
     t.string :author
+    t.integer :fav_count
   end
 end

data/test/models/code_gist.rb CHANGED

@@ -3,3 +3,15 @@
 class CodeGist < ActiveRecord::Base
 end
+class CodeGistMongoid
+  include Mongoid::Document
+  include Mongoid::Timestamps
+  field :id,          :type => Integer
+  field :description, :type => String
+  field :user_id,     :type => Integer
+  field :author,      :type => String
+  field :fav_count,   :type => Integer
+  field :category_id, :type => Integer
+end

data/test/models/eoe_log.rb CHANGED

@@ -43,10 +43,8 @@ EoeLog.create
     collection_class = collection_class_name.constantize
     t = Time.zone.parse(date_str)
-    1.upto(day) do |i|
-      puts "#{month} #{day_range} #{day} #{i}" if ENV['DEBUG']
-      collection_class.create :t => (t.to_time+rand(60*60*24-1)).to_datetime, :url => '/'
-    end
+    values = (1..day).map {|i| (t.to_time+rand(60*60*24-1)).to_datetime }.sort.map {|i| {:t => i, :url => '/' }  }
+    collection_class.create values
     collection_class.count
   end

data/test/test_daily_count.rb CHANGED

@@ -10,13 +10,15 @@ class TestDailyCount < Test::Unit::TestCase
   def test_timely
     o = @output.map {|i| i[:timely_c] }
     r = (o - [5,11,0,1,8,2,3,4,16,10,26,13,7,9,20,15,30,33,14,6,12,17,19,59,65,84,62,114,69,52,61,67,154,70]).reject(&:zero?).blank?
-    assert_equal r, true
+    assert r
   end
   def test_totally
     o = @output.map {|i| i[:totally_c] }
     r = (o - [5,16,17,25,27,30,34,36,37,53,55,56,57,59,60,64,66,67,68,70,71,73,74,75,80,90,116,129,136,145,165,185,200,230,234,235,236,237,270,273,274,288,299,304,305,312,327,337,345,359,374,380,392,418,435,446,452,463,466,473,493,506,512,520,525,545,549,553,558,577,636,701,785,805,867,981,1050,1102,1163,1230,1384,1454,1455,1457,1458]).reject(&:zero?).blank?
-    assert_equal r, true
+    assert r
+    assert_equal @output[-1][:totally_favcount_s].to_i, CodeGist.all.map(&:fav_count).reduce(:+)
   end
 end

data/test/test_mapreduce.rb CHANGED

@@ -6,7 +6,14 @@ class TestMapReduce < Test::Unit::TestCase
   def setup
   end
-  def test_hotest_items_mapreduce
+  def test_multiple_dimensions_output_without_time_column
+    cron = Statlysis.always['mongoid']['code'][0]
+    assert_equal cron.time_column, false
+    assert_equal cron.time_unit, false
+    assert_equal cron.stat_table_name, 'timely_codegistmongoids_author_a'
+    cron.run
+    assert_equal cron.output.detect {|h| h[:author] == 'mvj3' }[:c].to_i, cron.multiple_dataset.sources.first.where(:author => 'mvj3').count
   end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: statlysis
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-26 00:00:00.000000000 Z
+date: 2013-12-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -269,9 +269,9 @@ files:
 - lib/statlysis/configuration.rb
 - lib/statlysis/constants.rb
 - lib/statlysis/cron.rb
-- lib/statlysis/cron/count.rb
-- lib/statlysis/cron/count/dimensions.rb
-- lib/statlysis/cron/count/timely.rb
+- lib/statlysis/cron/timely.rb
+- lib/statlysis/cron/timely/multiple_dimensions.rb
+- lib/statlysis/cron/timely/one_dimension.rb
 - lib/statlysis/cron/top.rb
 - lib/statlysis/cron/top/hotest_items.rb
 - lib/statlysis/cron/top/lastest_visits.rb
@@ -303,7 +303,7 @@ files:
 - test/test_single_log_in_multiple_collections.rb
 - test/test_statlysis.rb
 - test/test_timeseries.rb
-homepage: https://github.com/eoecn/statlysis
+homepage: https://github.com/SunshineLibrary/statlysis
 licenses:
 - MIT
 post_install_message:
@@ -318,7 +318,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 59716176471030881
+      hash: -1643509325996557122
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -327,7 +327,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 59716176471030881
+      hash: -1643509325996557122
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.23

data/lib/statlysis/cron/count.rb DELETED

@@ -1,51 +0,0 @@
-# encoding: UTF-8
-module Statlysis
-  class Count < Cron
-    def initialize source, opts = {}
-      super
-      Statlysis.check_set_database
-      cron.setup_stat_model
-      cron
-    end
-    # 设置数据源，并保存结果入数据库
-    def run
-      (logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
-      # delete first in range
-      @output = cron.output
-      unless @output.any?
-        logger.info "没有数据"; return
-      end
-      num_i = 0; num_add = 999
-      Statlysis.sequel.transaction do
-        cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
-        while !(_a = @output[num_i..(num_i+num_add)]).blank? do
-          # batch insert all
-          cron.stat_model.multi_insert _a
-          num_i += (num_add + 1)
-        end
-      end
-      return self
-    end
-    protected
-    def unit_range_query time, time_begin = nil
-      # time begin and end
-      tb = time
-      te = (time+1.send(cron.time_unit)-1.second)
-      tb, te = tb.to_i, te.to_i if is_time_column_integer?
-      tb = time_begin || tb
-      return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
-      return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc  [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
-    end
-  end
-end
-require 'statlysis/cron/count/timely'
-require 'statlysis/cron/count/dimensions'

data/lib/statlysis/cron/count/dimensions.rb DELETED

@@ -1,7 +0,0 @@
-# encoding: UTF-8
-module Statlysis
-  class Dimensions < Count
-  end
-end

data/lib/statlysis/cron/count/timely.rb DELETED

@@ -1,63 +0,0 @@
-# encoding: UTF-8
-module Statlysis
-  class Timely < Count
-    def setup_stat_model
-      cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
-      raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
-      if not Statlysis.sequel.table_exists?(cron.stat_table_name)
-        Statlysis.sequel.transaction do
-          Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
-            DateTime :t # alias for :time
-          end
-          # TODO Add cron.source_where_array before count_columns
-          count_columns = [:timely_c, :totally_c] # alias for :count
-          count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
-          index_column_names = [:t] + count_columns
-          index_column_names_name = index_column_names.join("_")
-          index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
-          # Fix there should be uniq index name between tables
-          # `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
-          if not Statlysis.config.is_skip_database_index
-            Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
-          end
-        end
-      end
-      n = cron.stat_table_name.to_s.singularize.camelize
-      cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
-        class ::#{n} < Sequel::Model;
-          self.set_dataset :#{cron.stat_table_name}
-        end
-        #{n}
-      MODEL
-    end
-    def output
-      @output ||= (cron.time_range.map do |time|
-        timely_c = 0
-        totally_c = 0
-        # support multiple data sources
-        _first_source = nil
-        cron.multiple_dataset.sources.each do |s|
-          timely_c  += s.where(unit_range_query(time)).count
-          _t = DateTime1970
-          _t = is_time_column_integer? ? _t.to_i : _t
-          totally_c += s.where(unit_range_query(time, _t)).count
-          _first_source ||= s.where(unit_range_query(time))
-        end
-        logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
-        if timely_c.zero? && totally_c.zero?
-          nil
-        else
-          {:t => time, :timely_c => timely_c, :totally_c => totally_c}
-        end
-      end.compact)
-    end
-  end
-end