RubyGems - statlysis - Versions diffs - 0.0.1 - Mend

statlysis 0.0.1

Files changed (29) hide show

data/.document +5 -0
data/.gitignore +51 -0
data/Gemfile +2 -0
data/Gemfile.lock +110 -0
data/LICENSE.txt +20 -0
data/README.markdown +43 -0
data/Rakefile +11 -0
data/lib/statlysis.rb +134 -0
data/lib/statlysis/clock.rb +36 -0
data/lib/statlysis/common.rb +27 -0
data/lib/statlysis/configuration.rb +10 -0
data/lib/statlysis/cron.rb +86 -0
data/lib/statlysis/cron/count.rb +93 -0
data/lib/statlysis/cron/top.rb +154 -0
data/lib/statlysis/formula.rb +6 -0
data/lib/statlysis/javascript/count.rb +37 -0
data/lib/statlysis/map_reduce.rb +32 -0
data/lib/statlysis/rake.rb +28 -0
data/lib/statlysis/results.rb +17 -0
data/lib/statlysis/similar.rb +89 -0
data/lib/statlysis/timeseries.rb +41 -0
data/statlysis.gemspec +30 -0
data/test/helper.rb +17 -0
data/test/models/company.rb +12 -0
data/test/models/employee.rb +14 -0
data/test/test_mapreduce.rb +26 -0
data/test/test_statlysis.rb +76 -0
data/test/test_timeseries.rb +6 -0
metadata +216 -0

data/lib/statlysis/configuration.rb ADDED

@@ -0,0 +1,10 @@
+# encoding: UTF-8
+module Statlysis
+  # TODO config methods here
+  # Holder for gem-wide configuration. No settings are defined yet; the class
+  # currently only provides a subclass hook.
+  class Configuration
+    # Subclass hook. Ruby calls +inherited+ on the class object itself, so it
+    # has to be a singleton method; written as an instance method it is never
+    # invoked when Configuration is subclassed.
+    #
+    # base - the newly created subclass.
+    def self.inherited(base)
+      super
+    end
+  end
+end

data/lib/statlysis/cron.rb ADDED

@@ -0,0 +1,86 @@
+# encoding: UTF-8
+module Statlysis
+  # Abstract base class of the statistics jobs. It stores the data source and
+  # the time settings, detects whether the source is ActiveRecord or Mongoid
+  # backed, and derives the time range that has to be processed.
+  # Subclasses have to implement #output, #setup_stat_table and #run.
+  # NOTE(review): +cron+, +stat_table_name+ and +stat_table+ are not defined
+  # here; presumably they come from Common — confirm.
+  class Cron
+    attr_accessor :source, :time_column, :time_unit
+    include Common
+    DefaultWrongMessage = "not implement yet, please config it by subclass".freeze
+    # source - the model class or scope the statistics are computed from.
+    # opts   - :time_column and :time_unit, plus an optional :stat_table_name.
+    def initialize source, opts = {}
+      cron.stat_table_name = opts[:stat_table_name] if opts[:stat_table_name]
+      cron.time_column     = opts[:time_column]
+      cron.source          = source
+      cron.time_unit       = opts[:time_unit]
+      cron
+    end
+    # The three methods below are the contract every subclass has to fulfil.
+    def output; raise DefaultWrongMessage end
+    def setup_stat_table; raise DefaultWrongMessage end
+    def run; raise DefaultWrongMessage end
+    # overwrite to lazy load @source: an ActiveRecord source is printed as its
+    # SQL text instead of being inspected directly
+    def inspect
+      source_inspect = is_mysql? ? cron.source.to_sql : cron.source
+      str = "#<#{cron.class} @source=#{source_inspect} @stat_table_name=#{cron.stat_table_name} @time_column=#{cron.time_column} @stat_table=#{cron.stat_table}"
+      str << " @stat_model=#{cron.stat_model}" if cron.methods.include?(:stat_model)
+      str << ">"
+    end
+    # Returns the conditions attached to the source as a sorted array of
+    # symbols that contain only letters and digits.
+    # Raises RuntimeError when the source is neither ActiveRecord nor Mongoid.
+    def source_where_array
+      # TODO follow index seq
+      conditions = nil
+      if is_mysql?
+        conditions = cron.source.where("").where_values.map do |equality|
+          # use full keyvalue index name
+          equality.is_a?(String) ? equality.to_sym : "#{equality.operand1.name}#{equality.operand2}"
+        end
+      end
+      conditions = cron.source.all.selector.reject {|k, v| k == 't' } if is_mongodb?
+      raise "#{cron.class} only supports ActiveRecord or Mongoid sources, can not read the conditions of #{cron.source.inspect}" if conditions.nil?
+      # keep only the alphanumeric characters of every condition
+      conditions.map {|condition| condition.to_s.scan(/[a-z0-9]/i).join }.sort.map(&:to_sym)
+    end
+    # Table name (ActiveRecord) or collection name (Mongoid) of the source,
+    # memoized after the first call.
+    # Raises RuntimeError when the source is neither ActiveRecord nor Mongoid.
+    def source_name
+      @source_name ||= begin
+        name_method = nil
+        name_method = :table_name if is_mysql?
+        name_method = :collection_name if is_mongodb?
+        raise "#{cron.class} only supports ActiveRecord or Mongoid sources, can not name #{cron.source.inspect}" if name_method.nil?
+        cron.source.send(name_method)
+      end
+    end
+    # automode
+    # or
+    # specify TIME_RANGE and TIME_UNIT in shell to run
+    def time_range
+      return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day')) if ENV['TIME_RANGE']
+      # Choose the start time: compare the time read from the statistics table
+      # with the first time of the source data and take whichever is later.
+      begin_day = DateTime.now.beginning_of_day
+      stat_row = cron.stat_table.order(:t).where("t >= ?", begin_day.yesterday).first
+      st_timebegin = stat_row ? stat_row[:t] : nil
+      cron.stat_table.where("t >= ?", begin_day.tomorrow).delete # tomorrow's data does not exist yet, so it can not be counted
+      first_record = cron.source.first
+      timebegin = first_record ? first_record.send(cron.time_column) : (DateTime.now - 1.second)
+      timebegin = Time.at(timebegin) if is_time_column_integer?
+      timebegin = st_timebegin if st_timebegin && st_timebegin > timebegin
+      timeend = DateTime.now
+      puts "#{cron.source_name}'s range #{timebegin..timeend}"
+      # The time point taken from the statistics table is part of the range, so
+      # it is computed again.
+      TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
+    end
+    protected
+    def is_mysql?; @_is_mysql ||= modules.grep(/ActiveRecord::Store/).any? end
+    def is_mongodb?; @_is_mongodb ||= modules.grep(/Mongoid::Document/).any? end
+    def modules; @_modules ||= cron.source.included_modules.map(&:to_s) end
+    # Supports time columns that are stored as integers.
+    def is_time_column_integer?
+      if is_mysql?
+        cron.source.columns_hash[cron.time_column.to_s].type == :integer
+      else
+        false
+      end
+    end
+  end
+end
+require 'statlysis/cron/count'
+require 'statlysis/cron/top'

data/lib/statlysis/cron/count.rb ADDED

@@ -0,0 +1,93 @@
+# encoding: UTF-8
+module Statlysis
+  # Base class of the counting jobs: it prepares the statistics table when it
+  # is created and stores the rows returned by #output.
+  class Count < Cron
+    def initialize source, opts = {}
+      super
+      Statlysis.check_set_database
+      cron.setup_stat_table
+      Statlysis.setup_stat_table_and_model cron
+      cron
+    end
+    # Orders the data source by time and saves the result into the database.
+    # Returns false when there is nothing to save.
+    def run
+      cron.source = cron.source.order("#{cron.time_column} ASC") if is_mysql?
+      cron.source = cron.source.asc(cron.time_column) if is_mongodb?
+      (puts("#{cron.source_name} have no result!"); return false) if cron.output.blank?
+      @output = cron.output
+      unless @output.any?
+        puts "没有数据"; return
+      end
+      Statlysis.sequel.transaction do
+        # delete first in range
+        cron.stat_table.where("t >= ? AND t <= ?", @output[0][:t], @output[-1][:t]).delete
+        # batch insert all, 1000 rows per statement
+        @output.each_slice(1000) {|rows| cron.stat_table.multi_insert rows }
+        nil
+      end
+    end
+    # Drops the memoized result and computes it again.
+    def reoutput; @output = nil; output end
+    protected
+    # Builds the condition that selects the records of one time unit.
+    #
+    # time       - start of the unit.
+    # time_begin - optional replacement for the lower bound (used for totals).
+    #
+    # Returns a SQL condition array for ActiveRecord sources, a selector hash
+    # for Mongoid sources and nil otherwise.
+    def unit_range_query time, time_begin = nil
+      # time begin and end
+      tb = time # TODO off by eight hours [.in_time_zone, .localtime, .utc]; for Rails the computed result is the same.
+      # The upper bound is exclusive ("<" / "$lt"), so it has to be the start of
+      # the next unit; subtracting one second here would drop the last second.
+      te = time + 1.send(cron.time_unit)
+      tb, te = tb.to_i, te.to_i if is_time_column_integer?
+      tb = time_begin || tb
+      return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_mysql?
+      return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongodb? # .utc  [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
+    end
+  end
+  # Counts the records of every time unit (timely_c) together with the running
+  # total up to the end of that unit (totally_c).
+  class Timely < Count
+    # Derives the statistics table name and creates the table, its count
+    # columns and its index when the table does not exist yet.
+    # Raises RuntimeError when the derived name is longer than 64 characters.
+    def setup_stat_table
+      # TODO migration proc, merge into setup_stat_table_and_model
+      name_parts = [cron.class.name.split("::")[-1], cron.source_name, cron.source_where_array.join, cron.time_unit[0]]
+      cron.stat_table_name = name_parts.map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
+      raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
+      return if Statlysis.sequel.table_exists?(cron.stat_table_name)
+      Statlysis.sequel.transaction do
+        Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
+          DateTime :t # alias for :time
+        end
+        # TODO Add cron.source_where_array before count_columns
+        count_columns = [:timely_c, :totally_c] # alias for :count
+        count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
+        index_column_names = [:t] + count_columns
+        index_column_names_name = index_column_names.join("_")
+        index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
+        Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
+      end
+    end
+    # Returns one row {:t, :timely_c, :totally_c} per time point of the range;
+    # time points where both counts are zero are left out. Memoized.
+    def output
+      @output ||= begin
+        # lower bound of the running total; it is the same for every time
+        # point, so it is computed once instead of once per iteration
+        epoch = DateTime.parse("19700101")
+        epoch = epoch.to_i if is_time_column_integer?
+        cron.time_range.map do |time|
+          timely_c  = cron.source.where(unit_range_query(time)).count
+          totally_c = cron.source.where(unit_range_query(time, epoch)).count
+          puts "#{time.in_time_zone} #{cron.source_name} timely_c:#{timely_c} totally_c:#{totally_c}"
+          next nil if timely_c.zero? && totally_c.zero?
+          {:t => time, :timely_c => timely_c, :totally_c => totally_c}
+        end.compact
+      end
+    end
+  end
+  # Placeholder subclass of Count; it adds no behaviour yet.
+  class Dimensions < Count; end
+end

data/lib/statlysis/cron/top.rb ADDED

@@ -0,0 +1,154 @@
+# encoding: UTF-8
+# TODO support ActiveRecord
+module Statlysis
+  # Base class of the ranking jobs. It holds the result limit and the procs
+  # that extract the pattern, the user id and the user information from a log
+  # record. Subclasses have to implement #write.
+  class Top < Cron
+    attr_accessor :result_limit, :logs
+    attr_accessor :stat_model
+    attr_accessor :pattern_proc, :user_id_proc, :user_info_proc
+    # source - the log source.
+    # opts   - :result_limit (default 100), :pattern_proc, :user_id_proc,
+    #          :user_info_proc and :stat_table_name. With :test => true the
+    #          procs and the table name are not required.
+    # Raises RuntimeError when a required option is missing.
+    def initialize source, opts = {}
+      cron.result_limit = opts[:result_limit] || 100
+      unless opts[:test]
+        [:pattern_proc, :user_id_proc, :user_info_proc].each do |o|
+          raise "Please assign :#{o} params!" if opts[o].nil? && !cron.send(o)
+          # a proc that was assigned before this constructor ran is accepted by
+          # the check above, so it must not be replaced by a missing option
+          cron.send("#{o}=", opts[o]) unless opts[o].nil?
+        end
+        default_assign_attr :stat_table_name, opts
+      end
+      super
+      cron
+    end
+    def run
+      cron.write
+    end
+    def write; raise DefaultWrongMessage end
+    # Builds a throw-away instance in test mode and lets it set up the table
+    # and model named +tn+.
+    def self.ensure_statlysis_table_and_model tn
+      Top.new("FakeLogSource", :test => true, :stat_table_name => tn).pattern_table_and_model tn
+    end
+    def ensure_statlysis_table_and_model tn
+      Top.ensure_statlysis_table_and_model tn
+    end
+    # Copies opts[key_symbol] into the attribute of the same name.
+    # Raises RuntimeError when the option is missing.
+    def default_assign_attr key_symbol, opts
+      raise "Please assign opts[:#{key_symbol}]" unless opts[key_symbol]
+      cron.send("#{key_symbol}=", opts[key_symbol])
+    end
+  end
+  # Discussion of how the "recent visitors of a blog" feature is computed.
+  # There are two sides, back end and front end. On the back end every blog/index|show visit produces a visit record, and the back end has to de-duplicate the visitors and drop users who are not logged in. Doing that inside the request itself would cause IO on every visit (disk or network, because several processes have to share the information), especially when a blog suddenly becomes popular, so it has to be asynchronous.
+  # The front end has to take caching into account, usually page fragment caching or loading through ajax.
+  # How the back end computes the recent visitors of each blog asynchronously: log.js records the recent visits, and a resident background process loops over the log table in time order, reads the blog visit information and refreshes the recent visitors of each blog. Compared with handling everything in each request this runs less often and saves resources; the bottleneck is then updating and reading the index of the log table.
+  class LastestVisits < Top
+    attr_accessor :clock
+    attr_accessor :reject_proc
+    # *pattern_proc* is a proc to extract user_id or url_prefix to compute the
+    # top visitors from log
+    # *user_id_proc* is a proc to extract user_id from log
+    # *user_info_proc* is a proc to extract visitor informations(like id, name, ...)
+    # *reject_proc* filter visitors
+    def initialize source, opts = {}
+      # set variables
+      # NOTE(review): reclock reads cron.stat_table_name, which the Top and Cron
+      # constructors only assign later, inside super — confirm that the value
+      # is already available at this point.
+      cron.reclock opts[:default_time]
+      cron.reject_proc = opts[:reject_proc] || proc {|pattern, user_id| pattern.to_i == user_id.to_i }
+      super
+      cron.pattern_table_and_model cron.stat_table_name
+      cron
+    end
+    # Reads up to 1000 log records at or after the clock position and groups
+    # the non-zero user ids by pattern. The records are kept in cron.logs.
+    # Returns a Hash of pattern => array of user ids.
+    def output
+      cron.logs = cron.source.asc(cron.time_column).where(cron.time_column => {"$gte" => cron.clock.current}).limit(1000).to_a
+      return {} if cron.logs.blank?
+      cron.logs.each_with_object({}) do |log, h|
+        pattern = cron.pattern_proc.call(log)
+        next unless pattern
+        h[pattern] ||= []
+        user_id = cron.user_id_proc.call(log).to_i
+        h[pattern] << user_id unless user_id.zero?
+      end
+    end
+    # Merges the freshly read visitors into the stored result of every pattern
+    # and moves the clock to the time of the last log record that was read.
+    def write
+      # query once: every call of #output reads the logs again, so calling it
+      # twice could print one result and store another
+      visits = cron.output
+      puts "#{Time.now.strftime('%H:%M:%S')} #{cron.stat_model} #{visits.inspect}"
+      visits.each do |pattern, user_ids|
+        s = cron.stat_model.find_or_create(:pattern => pattern)
+        old_array = (JSON.parse(s.result) rescue []).map {|i| Array(i)[0] }
+        new_user_ids = (old_array + user_ids).reverse.uniq.reverse # ensure the right items will overwrite the left [1,4,5,7,4,3,3,2,1,5].uniq => [1, 4, 5, 7, 3, 2]
+        kept_user_ids = new_user_ids.reject {|user_id| cron.reject_proc.call(pattern, user_id) rescue false }
+        user_infos = kept_user_ids.map {|user_id| cron.user_info_proc.call(user_id) }.compact
+        # exclusive range: keep at most result_limit entries
+        s.update :result => user_infos[0...cron.result_limit].to_json
+      end
+      cron.clock.update cron.logs.last.try(cron.time_column)
+    end
+    # Replaces the clock; without +default_time+ the new clock starts at the
+    # position of the current one.
+    def reclock default_time = nil
+      cron.clock = Clock.new cron.stat_table_name, (default_time || cron.clock.current)
+    end
+  end
+  # Top job whose table name is built from Statlysis.tablename_default_pre and
+  # 'single_kvs' instead of being passed in by the caller.
+  class SingleKv < Top
+    attr_accessor :time_ago, :stat_column_name
+    # opts has to contain :time_ago and :stat_column_name.
+    # Raises RuntimeError when one of them is missing or when a statistics
+    # table name has already been set.
+    def initialize source, opts = {}
+      default_assign_attr :time_ago, opts
+      default_assign_attr :stat_column_name, opts
+      raise "#{cron.class} only is kv store" if cron.stat_table_name # TODO
+      super
+      kv_table_name = [Statlysis.tablename_default_pre, 'single_kvs'].compact.join("_").freeze
+      cron.ensure_statlysis_table_and_model kv_table_name
+      cron
+    end
+  end
+  # A "recently hot" list is usually built by simply counting the visits in a
+  # single field, but that invites problems such as artificially inflated counts.
+  #
+  # The approach taken here is to analyse the user behaviour as a whole:
+  # extract the item_id from the URI, extract de-duplicated IPs and user_ids
+  # from the access log, fetch deeper user behaviour from the like, fav and
+  # comment tables, and add the former two together in a certain proportion to
+  # get the ranking.
+  # Finally a time-based cool-down is applied to avoid the Matthew effect; the
+  # proportion can also be raised dynamically so that items that became
+  # somewhat hot recently replace items that were very hot earlier.
+  #
+  # The computation is linear, so it is fast.
+  #
+  class HotestItems < SingleKv
+    attr_accessor :key, :id_to_score_and_time_hash_proc
+    attr_accessor :limit
+    # key                            - the pattern the ranking is stored under.
+    # id_to_score_and_time_hash_proc - proc (possibly nested) that finally
+    #                                  yields id => [score, time].
+    # NOTE(review): the bare +super+ forwards both arguments to
+    # SingleKv#initialize(source, opts), so the proc arrives there as +opts+;
+    # confirm this is intended.
+    def initialize key, id_to_score_and_time_hash_proc
+      cron.key = key
+      cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
+      cron.limit = 20
+      super
+      cron
+    end
+    # Ranks the ids by score divided by the square root of their age in days.
+    # Returns {key => ids ordered from hottest to coldest}.
+    def output
+      scores = cron.id_to_score_and_time_hash_proc
+      # unwrap nested procs until the actual data appears
+      scores = scores.call while scores.is_a?(Proc)
+      @id_to_score_and_time_hash = scores
+      # age in whole days, starting at 1
+      @id_to_day_hash = @id_to_score_and_time_hash.each_with_object({}) do |(id, score_and_time), days|
+        days[id] = ((Time.now - score_and_time[1]) / (3600*24)).round + 1
+      end
+      @id_to_timecooldown_hash = @id_to_score_and_time_hash.each_with_object({}) do |(id, score_and_time), cooled|
+        cooled[id] = score_and_time[0] / Math.sqrt(@id_to_day_hash[id])
+      end
+      ranked_ids = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
+      {cron.key => ranked_ids}
+    end
+    # Stores the first 141 ids of every ranking under its key and under a
+    # per-day history key.
+    # NOTE(review): the +limit+ attribute is not used here — confirm whether
+    # the fixed 0..140 slice is intended.
+    def write
+      cron.output.each do |key, array|
+        json = array[0..140].to_json
+        current = StSingleKv.find_or_create(:pattern => key)
+        current.update :result => json
+        history = StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}")
+        history.update :result => json
+      end
+    end
+  end
+end

data/lib/statlysis/formula.rb ADDED

@@ -0,0 +1,6 @@
+# encoding: UTF-8
+module Statlysis
+  # Placeholder class; no formula behaviour is implemented yet.
+  class Formula; end
+end

data/lib/statlysis/javascript/count.rb ADDED

@@ -0,0 +1,37 @@
+# encoding: UTF-8
+module Statlysis
+  module Javascript
+    # Builds the JavaScript map and reduce functions that count documents
+    # grouped by one or more fields.
+    class MultiDimensionalCount
+      attr_accessor :map_func, :reduce_func
+      # fields - the field names to group by, given as symbols or strings
+      #          (arrays of them are accepted too). Without any field the
+      #          documents are grouped by _id.
+      # Raises RuntimeError when a field is neither a symbol nor a string.
+      def initialize *fields
+        # The splat always produces an Array, so switching on the type of
+        # +fields+ can never reject anything; flatten and check each element.
+        fields = fields.flatten
+        unless fields.all? {|dc| dc.is_a?(Symbol) || dc.is_a?(String) }
+          raise "Please assign symbol, string, or array of them"
+        end
+        emit_key = if fields.empty?
+          "this._id"
+        else
+          "{#{fields.map {|dc| "#{dc}: this.#{dc}" }.join(", ")}}"
+        end
+        self.map_func = "function() {
+          emit (#{emit_key}, {count: 1});
+        }"
+        self.reduce_func = "function(key, values) {
+          var count = 0;
+          values.forEach(function(v) {
+            count += v['count'];
+          });
+          return {count: count};
+        }"
+        self
+      end
+    end
+  end
+end

data/lib/statlysis/map_reduce.rb ADDED

@@ -0,0 +1,32 @@
+# encoding: utf-8
+require 'javascript/count'
+module Statlysis
+  # Runs a map/reduce job on a Mongoid scope and wraps the outcome in Results.
+  class MapReduce
+    attr_reader :mongoid_scope, :mapreduce_javascript
+    attr_accessor :mr_collection, :results
+    attr_accessor :is_use_inline, :identify
+    # mongoid_scope        - the scope the job runs on.
+    # mapreduce_javascript - object that provides map_func and reduce_func.
+    def initialize mongoid_scope, mapreduce_javascript
+      # Both attributes are declared with attr_reader only, so no writer
+      # exists; calling one raised NoMethodError. Set the variables directly.
+      @mongoid_scope = mongoid_scope
+      @mapreduce_javascript = mapreduce_javascript
+      mr.is_use_inline = true
+      # timestamp that makes the output collection name unique per instance
+      mr.identify = Time.now.strftime("%m%d_%H%M%S")
+      mr
+    end
+    # Executes the job, writing into #out_collection_name, and stores the
+    # wrapped result in #results. Returns self.
+    def run
+      # TODO collection for large
+      mr.results = Results.new mr.mongoid_scope.map_reduce(mapreduce_javascript.map_func, mapreduce_javascript.reduce_func).out(:replace => out_collection_name)
+      self
+    end
+    def output
+      mr.results.output
+    end
+    def out_collection_name; "mr_#{mr.mongoid_scope.collection_name}_#{mr.identify}" end
+    # Alias for self, used to make the attribute accesses above read uniformly.
+    def mr; self end
+  end
+end

data/lib/statlysis/rake.rb ADDED

@@ -0,0 +1,28 @@
+# encoding: UTF-8
+require 'rake'
+# Rake tasks that run the registered statistics jobs. +only_one_task+ is not
+# defined here; presumably it keeps a task from running twice at the same
+# time — confirm against its definition.
+namespace :statlysis do
+  # one counting task per time unit listed in Statlysis::Units
+  Statlysis::Units.each do |unit|
+    desc "statistical in #{unit}"
+    only_one_task "#{unit}_count" => :environment do
+      # each instead of map: the jobs are run for their side effects only
+      Statlysis.send("#{unit}_crons").each(&:run)
+    end
+  end
+  desc "realtime process"
+  only_one_task :realtime_process => :environment do
+    # never returns: runs every realtime job, waits one second and repeats
+    loop { Statlysis.realtime_crons.each(&:run); sleep 1 }
+  end
+  desc "similar process"
+  only_one_task :similar_process => :environment do
+    Statlysis.similar_crons.each(&:run)
+  end
+  desc "hotest process"
+  only_one_task :hotest_process => :environment do
+    Statlysis.hotest_crons.each(&:run)
+  end
+end