statlysis 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,10 @@
1
+ # encoding: UTF-8
2
+
3
module Statlysis
  # Placeholder for gem-wide configuration.
  # TODO config methods here
  class Configuration
    # Subclass hook. Ruby calls +inherited+ on the superclass object itself,
    # so it must be a class-level method; as an instance method it would
    # never be invoked when Configuration is subclassed.
    #
    # base - the newly created subclass.
    def self.inherited(base)
      super
    end
  end
end
@@ -0,0 +1,86 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
# Base class for a statistics job over a single data source.
#
# Subclasses must implement #output, #setup_stat_table and #run.
# `cron`, `stat_table`, `stat_table_name` and friends come from Common
# (not shown here); presumably `cron` returns self -- TODO confirm.
class Cron
  attr_accessor :source, :time_column, :time_unit
  include Common

  DefaultWrongMessage = "not implement yet, please config it by subclass".freeze

  # source - the model/scope to read from (its included modules are used
  #          to tell ActiveRecord from Mongoid).
  # opts   - Hash; reads :stat_table_name (optional), :time_column and
  #          :time_unit.
  def initialize source, opts = {}
    cron.stat_table_name = opts[:stat_table_name] if opts[:stat_table_name]
    cron.time_column = opts[:time_column]
    cron.source = source
    cron.time_unit = opts[:time_unit]
  end

  def output; raise DefaultWrongMessage end
  def setup_stat_table; raise DefaultWrongMessage end
  def run; raise DefaultWrongMessage end

  # overwrite to lazy load @source
  def inspect
    source_inspect = is_mysql? ? cron.source.to_sql : cron.source
    str = "#<#{cron.class} @source=#{source_inspect} @stat_table_name=#{cron.stat_table_name} @time_column=#{cron.time_column} @stat_table=#{cron.stat_table}"
    str << " @stat_model=#{cron.stat_model}" if cron.methods.include?(:stat_model)
    str << ">"
    str
  end

  # Returns a sorted Array of Symbols, one per condition on the source,
  # with every character other than a-z, A-Z and 0-9 stripped.
  #
  # Raises RuntimeError when the source is neither ActiveRecord nor Mongoid
  # (this used to surface as a NoMethodError on nil).
  def source_where_array
    # TODO follow index seq
    conditions = nil
    conditions = cron.source.where("").where_values.map do |equality|
      # use full keyvalue index name
      equality.is_a?(String) ? equality.to_sym : "#{equality.operand1.name}#{equality.operand2}"
    end if is_mysql?
    conditions = cron.source.all.selector.reject {|k, v| k == 't' } if is_mongodb?
    raise "unsupported source, expected an ActiveRecord or Mongoid model" if conditions.nil?

    conditions.map {|condition| condition.to_s.scan(/[a-z0-9]/i).join }.sort.map(&:to_sym)
  end

  # Table name (ActiveRecord) or collection name (Mongoid) of the source,
  # memoized.
  #
  # Raises RuntimeError when the source is neither ActiveRecord nor Mongoid.
  def source_name
    @source_name ||= begin
      m = :table_name if is_mysql?
      m = :collection_name if is_mongodb?
      raise "unsupported source, expected an ActiveRecord or Mongoid model" if m.nil?
      cron.source.send(m)
    end
  end

  # automode
  # or
  # specify TIME_RANGE and TIME_UNIT in shell to run
  def time_range
    return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day')) if ENV['TIME_RANGE']
    # Choose the start time: compare the stat table's time with the first
    # time in the source table and pick whichever is later.
    # NOTE(review): the query below takes the earliest stat row at or after
    # yesterday (ascending order, first), not the very last row -- confirm
    # this is the intent.
    begin_day = DateTime.now.beginning_of_day
    st_timebegin = (a = cron.stat_table.order(:t).where("t >= ?", begin_day.yesterday).first) ? a[:t] : nil
    # Tomorrow's data does not exist yet, so it cannot have been counted.
    cron.stat_table.where("t >= ?", begin_day.tomorrow).delete
    timebegin = (a = cron.source.first) ? a.send(cron.time_column) : (DateTime.now - 1.second)
    timebegin = Time.at(timebegin) if is_time_column_integer?
    timebegin = (st_timebegin > timebegin) ? st_timebegin : timebegin if st_timebegin

    timeend = DateTime.now
    puts "#{cron.source_name}'s range #{timebegin..timeend}"
    # Include the stat table's own time point so it gets recomputed.
    TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
  end

  protected

  # True when the source includes an ActiveRecord::Store module.
  # Memoized with an explicit nil check so a false answer is cached too
  # (`||=` would recompute it on every call).
  def is_mysql?
    @_is_mysql = modules.grep(/ActiveRecord::Store/).any? if @_is_mysql.nil?
    @_is_mysql
  end

  # True when the source includes Mongoid::Document; memoized like is_mysql?.
  def is_mongodb?
    @_is_mongodb = modules.grep(/Mongoid::Document/).any? if @_is_mongodb.nil?
    @_is_mongodb
  end

  # Names of the modules included by the source, memoized.
  def modules; @_modules ||= cron.source.included_modules.map(&:to_s) end

  # Support time columns stored as integers.
  def is_time_column_integer?
    if is_mysql?
      cron.source.columns_hash[cron.time_column.to_s].type == :integer
    else
      false
    end
  end

end
81
+
82
+ end
83
+
84
+
85
+ require 'statlysis/cron/count'
86
+ require 'statlysis/cron/top'
@@ -0,0 +1,93 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
# Cron that counts source records per time unit and stores the rows in a
# stat table.
class Count < Cron
  # Sets the attributes via Cron#initialize, then prepares the database,
  # the stat table and its model.
  def initialize source, opts = {}
    super
    Statlysis.check_set_database
    cron.setup_stat_table
    Statlysis.setup_stat_table_and_model cron
  end

  # Orders the source by the time column, computes the output and saves it
  # to the stat table.
  #
  # Returns false when there is nothing to save; otherwise the return value
  # is not meaningful.
  def run
    cron.source = cron.source.order("#{cron.time_column} ASC") if is_mysql?
    cron.source = cron.source.asc(cron.time_column) if is_mongodb?

    @output = cron.output
    if @output.blank?
      puts "#{cron.source_name} have no result!"
      return false
    end

    batch_size = 1000
    Statlysis.sequel.transaction do
      # delete first in range
      cron.stat_table.where("t >= ? AND t <= ?", @output[0][:t], @output[-1][:t]).delete
      # batch insert all
      @output.each_slice(batch_size) {|rows| cron.stat_table.multi_insert rows }
    end
  end


  # Drops the memoized output and computes it again.
  def reoutput; @output = nil; output end

  protected

  # Builds the condition selecting records inside one time unit.
  #
  # time       - start of the unit (inclusive).
  # time_begin - optional override for the lower bound; callers pass it
  #              already converted to the column's type.
  #
  # Returns a SQL condition Array for ActiveRecord sources, a selector Hash
  # for Mongoid sources, and nil otherwise.
  def unit_range_query time, time_begin = nil
    # time begin and end
    tb = time # TODO off by eight hours [.in_time_zone, .localtime, .utc]; for Rails the computed result is the same either way.
    # The upper bound is exclusive (`<` / "$lt"), so it must be the start of
    # the NEXT unit. Subtracting one second here would silently drop every
    # record that falls in the last second of the unit.
    te = time + 1.send(cron.time_unit)
    tb, te = tb.to_i, te.to_i if is_time_column_integer?
    tb = time_begin || tb
    return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_mysql?
    return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongodb? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
  end

end
49
+
50
# Count job that stores, per time unit, the number of records inside that
# unit (timely_c) and the running total up to the end of it (totally_c).
class Timely < Count
  # Derives the stat table name and creates the table, its count columns
  # and an index when the table does not exist yet.
  #
  # Raises RuntimeError when the derived name is longer than 64 characters.
  def setup_stat_table
    # TODO migration proc, merge into setup_stat_table_and_model
    name_parts = [cron.class.name.split("::")[-1], cron.source_name, cron.source_where_array.join, cron.time_unit[0]]
    cleaned_parts = name_parts.map {|part| part.to_s.gsub('_','') }.reject {|part| part.blank? }
    cron.stat_table_name = cleaned_parts.join('_').downcase
    if cron.stat_table_name.to_s.size > 64
      raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance"
    end
    return if Statlysis.sequel.table_exists?(cron.stat_table_name)

    Statlysis.sequel.transaction do
      Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
        DateTime :t # alias for :time
      end

      # TODO Add cron.source_where_array before count_columns
      count_columns = [:timely_c, :totally_c] # alias for :count
      count_columns.each do |column|
        Statlysis.sequel.add_column cron.stat_table_name, column, Integer
      end

      indexed_columns = [:t] + count_columns
      index_name = indexed_columns.join("_")
      index_name = index_name[-63..-1] if index_name.size > 64

      Statlysis.sequel.add_index cron.stat_table_name, indexed_columns, :name => index_name
    end
  end

  # Memoized Array of {:t, :timely_c, :totally_c} rows, one per time unit
  # in cron.time_range; units whose two counts are both zero are left out.
  def output
    @output ||= begin
      # Lower bound used for the running total.
      epoch = DateTime.parse("19700101")
      epoch = epoch.to_i if is_time_column_integer?

      rows = cron.time_range.map do |time|
        timely_c = cron.source.where(unit_range_query(time)).count
        totally_c = cron.source.where(unit_range_query(time, epoch)).count

        puts "#{time.in_time_zone} #{cron.source_name} timely_c:#{timely_c} totally_c:#{totally_c}"
        next if timely_c.zero? && totally_c.zero?

        {:t => time, :timely_c => timely_c, :totally_c => totally_c}
      end
      rows.compact
    end
  end
end
89
+
90
# Empty Count subclass; it adds no behavior of its own yet.
class Dimensions < Count; end
92
+
93
+ end
@@ -0,0 +1,154 @@
1
+ # encoding: UTF-8
2
+ # TODO support ActiveRecord
3
+
4
+ module Statlysis
5
# Base class for "top N" style jobs driven by three procs
# (pattern_proc, user_id_proc, user_info_proc). Subclasses implement #write.
class Top < Cron
  attr_accessor :result_limit, :logs
  attr_accessor :stat_model
  attr_accessor :pattern_proc, :user_id_proc, :user_info_proc

  # source - passed on to Cron#initialize.
  # opts   - Hash; reads :result_limit (default 100), :test, :pattern_proc,
  #          :user_id_proc, :user_info_proc and :stat_table_name.
  #
  # Unless opts[:test] is set, raises RuntimeError when one of the procs is
  # neither given in opts nor already set, or when :stat_table_name is
  # missing.
  def initialize source, opts = {}
    cron.result_limit = opts[:result_limit] || 100
    unless opts[:test]
      [:pattern_proc, :user_id_proc, :user_info_proc].each do |o|
        if opts[o].nil?
          # Keep a value that was set beforehand; assigning opts[o] here
          # would overwrite it with nil right after checking it exists.
          raise "Please assign :#{o} params!" unless cron.send(o)
        else
          cron.send "#{o}=", opts[o]
        end
      end
      default_assign_attr :stat_table_name, opts
    end
    super
  end

  # Runs the job by delegating to #write.
  def run
    cron.write
  end

  def write; raise DefaultWrongMessage end


  # Builds a throwaway Top in test mode for table +tn+ and calls
  # pattern_table_and_model (defined outside this class) on it.
  def self.ensure_statlysis_table_and_model tn
    Top.new("FakeLogSource", :test => true, :stat_table_name => tn).pattern_table_and_model tn
  end

  # Instance-level shortcut for Top.ensure_statlysis_table_and_model.
  def ensure_statlysis_table_and_model tn
    Top.ensure_statlysis_table_and_model tn
  end

  # Assigns opts[key_symbol] to the attribute of the same name.
  #
  # Raises RuntimeError when the option is missing or falsy.
  def default_assign_attr key_symbol, opts
    if opts[key_symbol]
      cron.send("#{key_symbol}=", opts[key_symbol])
    else
      raise "Please assign opts[:#{key_symbol}]"
    end
  end
end
45
+
46
# Discussion of how a blog's recent visitors are computed.
# There are two parts, backend and frontend. On the backend, every
# blog/index|show visit generates a visit record, and the backend has to
# deduplicate visitors and drop users who are not logged in. Doing that
# inside the visit request itself -- especially when a blog suddenly becomes
# popular -- would cause IO on every visit (disk or network, because several
# processes have to share the information), so it has to be asynchronous.
# Frontend display has to account for caching, usually page fragment caching
# or loading via ajax.
# How the backend computes each blog's recent visitors asynchronously: log.js
# records the recent visits, and a resident background process loops over the
# log table in time order, reads the blog visit information and refreshes the
# recent-visitor information of each blog. Compared with doing all the work
# in every single request, this runs fewer times and saves resources; the
# bottleneck is then the index updates and reads of the log table.
class LastestVisits < Top
  attr_accessor :clock
  attr_accessor :reject_proc

  # *pattern_proc* is a proc to extract user_id or url_prefix to compute the
  # top visitors from log
  # *user_id_proc* is a proc to extract user_id from log
  # *user_info_proc* is a proc to extract visitor informations(like id, name, ...)
  # *reject_proc* filter visitors
  def initialize source, opts = {}
    # set variables
    # NOTE(review): reclock reads cron.stat_table_name, which Top#initialize
    # only assigns during `super` below -- confirm this ordering is intended.
    cron.reclock opts[:default_time]
    cron.reject_proc = opts[:reject_proc] || proc {|pattern, user_id| pattern.to_i == user_id.to_i }
    super
    cron.pattern_table_and_model cron.stat_table_name
  end

  # Loads up to 1000 log records at or after the clock's current time into
  # cron.logs and groups their non-zero user ids by pattern.
  #
  # Returns a Hash of pattern => Array of user ids ({} when no logs).
  def output
    cron.logs = cron.source.asc(cron.time_column).where(cron.time_column => {"$gte" => cron.clock.current}).limit(1000).to_a
    return {} if cron.logs.blank?
    cron.logs.inject({}) do |h, log|
      pattern = cron.pattern_proc.call(log)
      if pattern
        h[pattern] ||= []
        user_id = cron.user_id_proc.call(log).to_i
        h[pattern] << user_id unless user_id.zero?
      end
      h
    end
  end

  # Merges the newly seen visitors into each pattern's stored result and
  # moves the clock to the time of the last processed log.
  def write
    # Compute the output once: every call queries the source again and
    # reassigns cron.logs, so logging and persisting could disagree.
    visits = cron.output
    puts "#{Time.now.strftime('%H:%M:%S')} #{cron.stat_model} #{visits.inspect}"
    visits.each do |pattern, user_ids|
      s = cron.stat_model.find_or_create(:pattern => pattern)
      old_array = (JSON.parse(s.result) rescue []).map {|i| Array(i)[0] }
      new_user_ids = (old_array + user_ids).reverse.uniq.reverse # ensure the right items will overwrite the left [1,4,5,7,4,3,3,2,1,5].uniq => [1, 4, 5, 7, 3, 2]
      visitors = new_user_ids.reject {|user_id| cron.reject_proc.call(pattern, user_id) rescue false }.map {|user_id| cron.user_info_proc.call(user_id) }.compact
      # first(n) keeps exactly result_limit entries; the inclusive range
      # [0..result_limit] kept one too many.
      s.update :result => visitors.first(cron.result_limit).to_json
    end
    cron.clock.update cron.logs.last.try(cron.time_column)
  end

  # Replaces the clock with a new Clock for the stat table, starting at
  # default_time or, when that is nil, at the current clock's time.
  def reclock default_time = nil
    cron.clock = Clock.new cron.stat_table_name, (default_time || cron.clock.current)
  end
end
97
+
98
# Top job whose results are kept in the shared "single_kvs" table.
class SingleKv < Top
  attr_accessor :time_ago, :stat_column_name

  # opts must provide :time_ago and :stat_column_name (default_assign_attr
  # raises otherwise). Raises RuntimeError when a stat table name has
  # already been set on the instance.
  def initialize source, opts = {}
    default_assign_attr :time_ago, opts
    default_assign_attr :stat_column_name, opts
    raise "#{cron.class} only is kv store" if cron.stat_table_name # TODO
    super
    kv_table_name = [Statlysis.tablename_default_pre, 'single_kvs'].compact.join("_").freeze
    cron.ensure_statlysis_table_and_model kv_table_name
    cron
  end

end
110
+
111
# A "recently hot" list is usually built by simply counting visits in one
# field, but that invites problems such as artificially inflated counts.
#
# The remedy is to analyze user behavior as a whole. The flow is:
# extract the item_id from the URI, extract deduplicated IPs and user_ids
# from the access log, get deeper user behavior from the like, fav and
# comment tables, and add the former two together in some proportion to get
# the ranking.
# Finally, cool scores down over time to avoid the Matthew effect; the ratio
# can be raised dynamically so that recently somewhat-hot items replace items
# that used to be very hot.
#
# The computation is linear, so it is fast.
#
class HotestItems < SingleKv
  attr_accessor :key, :id_to_score_and_time_hash_proc
  attr_accessor :limit

  # key                            - pattern under which the ranking is stored.
  # id_to_score_and_time_hash_proc - a Proc (possibly returning further Procs)
  #                                  that resolves to id => [score, time].
  def initialize key, id_to_score_and_time_hash_proc
    cron.key = key
    cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
    cron.limit = 20
    # NOTE(review): bare `super` forwards (key, proc) to
    # SingleKv#initialize(source, opts), so the proc arrives as `opts` --
    # confirm this is intended.
    super
    cron
  end

  # Resolves the proc chain, then ranks the ids by score divided by the
  # square root of their age in days (rounded, plus one), highest first.
  #
  # Returns {cron.key => Array of ids}.
  def output
    resolved = cron.id_to_score_and_time_hash_proc
    resolved = resolved.call while resolved.is_a?(Proc)
    @id_to_score_and_time_hash = resolved

    @id_to_day_hash = @id_to_score_and_time_hash.each_with_object({}) do |entry, days|
      days[entry[0]] = ((Time.now - entry[1][1]) / (3600*24)).round + 1
    end

    @id_to_timecooldown_hash = @id_to_score_and_time_hash.each_with_object({}) do |entry, cooled|
      cooled[entry[0]] = entry[1][0] / Math.sqrt(@id_to_day_hash[entry[0]])
    end

    ranked_ids = @id_to_timecooldown_hash.sort {|left, right| right[1] <=> left[1] }.map(&:first)
    {cron.key => ranked_ids}
  end

  # Stores the first 141 ranked ids as JSON under the key, plus a copy under
  # "<key>_<YYYYmmdd>" in the history model.
  def write
    cron.output.each do |key, array|
      json = array.first(141).to_json
      StSingleKv.find_or_create(:pattern => key).update :result => json
      history_key = "#{key}_#{Time.now.strftime('%Y%m%d')}"
      StSingleKvHistory.find_or_create(:pattern => history_key).update :result => json
    end
  end

end
153
+
154
+ end
@@ -0,0 +1,6 @@
1
+ # encoding: UTF-8
2
+
3
module Statlysis
  # Empty placeholder class; no behavior is defined here.
  class Formula; end
end
@@ -0,0 +1,37 @@
1
+ # encoding: UTF-8
2
+
3
module Statlysis
  module Javascript
    # Builds the JavaScript map and reduce functions that count documents
    # grouped by one or more fields.
    class MultiDimensionalCount
      attr_accessor :map_func, :reduce_func

      # fields - zero or more Symbols/Strings, or Arrays of them. With no
      #          field the documents are grouped by `this._id`.
      #
      # Raises RuntimeError when a field is neither a Symbol nor a String.
      def initialize *fields
        # A splat always yields an Array, so flatten it to accept both
        # new(:a, :b) and new([:a, :b]), then validate each element.
        fields = fields.flatten
        unless fields.all? {|field| field.is_a?(Symbol) || field.is_a?(String) }
          raise "Please assign symbol, string, or array of them"
        end

        emit_key = if fields.empty?
          "this._id"
        else
          "{#{fields.map {|dc| "#{dc}: this.#{dc}" }.join(", ")}}"
        end

        self.map_func = [
          "function() {",
          "  emit (#{emit_key}, {count: 1});",
          "}"
        ].join("\n")

        self.reduce_func = [
          "function(key, values) {",
          "  var count = 0;",
          "",
          "  values.forEach(function(v) {",
          "    count += v['count'];",
          "  });",
          "",
          "  return {count: count};",
          "}"
        ].join("\n")
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+
3
+ require 'javascript/count'
4
+
5
module Statlysis
  # Runs a map/reduce job, described by an object that provides map_func and
  # reduce_func, against a scope and wraps the outcome in Results.
  class MapReduce
    attr_reader :mongoid_scope, :mapreduce_javascript
    attr_accessor :mr_collection, :results
    attr_accessor :is_use_inline, :identify

    # mongoid_scope        - the scope to run map_reduce on.
    # mapreduce_javascript - object responding to map_func and reduce_func.
    def initialize mongoid_scope, mapreduce_javascript
      # Both attributes are read-only (attr_reader), so set the instance
      # variables directly; going through `mr.mongoid_scope = ...` raises
      # NoMethodError because no writer exists.
      @mongoid_scope = mongoid_scope
      @mapreduce_javascript = mapreduce_javascript
      mr.is_use_inline = true
      mr.identify = Time.now.strftime("%m%d_%H%M%S")
    end

    # Runs the job, replacing the output collection, and stores the wrapped
    # result in #results. Returns self.
    def run
      # TODO collection for large
      mr.results = Results.new mr.mongoid_scope.map_reduce(mapreduce_javascript.map_func, mapreduce_javascript.reduce_func).out(:replace => out_collection_name)
      self
    end

    # Output of the last run (delegates to results).
    def output
      mr.results.output
    end

    # Name of the output collection: "mr_<collection>_<identify>".
    def out_collection_name; "mr_#{mr.mongoid_scope.collection_name}_#{mr.identify}" end

    # Readability alias for self.
    def mr; self end
  end

end
@@ -0,0 +1,28 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'rake'
4
+
5
# Rake tasks that run the registered Statlysis crons.
namespace :statlysis do
  # One "<unit>_count" task per configured time unit.
  Statlysis::Units.each do |unit|
    desc "statistical in #{unit}"
    only_one_task "#{unit}_count" => :environment do
      Statlysis.send("#{unit}_crons").map(&:run)
    end
  end

  # Runs the realtime crons once per second, forever.
  desc "realtime process"
  only_one_task :realtime_process => :environment do
    loop do
      Statlysis.realtime_crons.map(&:run)
      sleep 1
    end
  end

  # The "similar_process" and "hotest_process" tasks each run their cron
  # list once.
  ["similar", "hotest"].each do |kind|
    desc "#{kind} process"
    only_one_task :"#{kind}_process" => :environment do
      Statlysis.send("#{kind}_crons").map(&:run)
    end
  end

end