statlysis 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/Guardfile +14 -0
- data/README.markdown +77 -27
- data/Rakefile +1 -1
- data/lib/statlysis.rb +59 -101
- data/lib/statlysis/clock.rb +3 -3
- data/lib/statlysis/common.rb +4 -16
- data/lib/statlysis/configuration.rb +97 -2
- data/lib/statlysis/constants.rb +10 -0
- data/lib/statlysis/cron.rb +40 -42
- data/lib/statlysis/cron/count.rb +16 -58
- data/lib/statlysis/cron/count/dimensions.rb +7 -0
- data/lib/statlysis/cron/count/timely.rb +63 -0
- data/lib/statlysis/cron/top.rb +4 -104
- data/lib/statlysis/cron/top/hotest_items.rb +47 -0
- data/lib/statlysis/cron/top/lastest_visits.rb +53 -0
- data/lib/statlysis/cron_set.rb +26 -0
- data/lib/statlysis/dataset.rb +6 -0
- data/lib/statlysis/javascript/count.rb +3 -3
- data/lib/statlysis/multiple_dataset.rb +69 -0
- data/lib/statlysis/multiple_dataset/active_record.rb +36 -0
- data/lib/statlysis/multiple_dataset/mongoid.rb +54 -0
- data/lib/statlysis/rake.rb +6 -5
- data/lib/statlysis/similar.rb +11 -11
- data/lib/statlysis/timeseries.rb +12 -9
- data/lib/statlysis/utils.rb +40 -0
- data/statlysis.gemspec +13 -3
- data/test/config/database.yml +9 -0
- data/test/config/mongoid.yml +36 -0
- data/test/data/.gitkeep +0 -0
- data/test/data/code_gists_20130724.csv +1459 -0
- data/test/helper.rb +41 -3
- data/test/migrate/1_active_record.rb +8 -0
- data/test/models/.gitkeep +0 -0
- data/test/models/code_gist.rb +5 -0
- data/test/models/eoe_log.rb +53 -0
- data/test/test_daily_count.rb +22 -0
- data/test/test_mapreduce.rb +0 -13
- data/test/test_single_log_in_multiple_collections.rb +22 -0
- data/test/test_statlysis.rb +5 -50
- data/test/test_timeseries.rb +46 -0
- metadata +133 -12
- data/Gemfile.lock +0 -110
- data/LICENSE.txt +0 -20
- data/test/models/company.rb +0 -12
- data/test/models/employee.rb +0 -14
@@ -0,0 +1,10 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
TimeUnits = %w[hour day week month year]
|
5
|
+
DateTime1970 = Time.zone.parse("19700101").in_time_zone
|
6
|
+
|
7
|
+
DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
|
8
|
+
|
9
|
+
DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
|
10
|
+
end
|
data/lib/statlysis/cron.rb
CHANGED
@@ -2,76 +2,74 @@
|
|
2
2
|
|
3
3
|
module Statlysis
|
4
4
|
class Cron
|
5
|
-
|
5
|
+
attr_reader :multiple_dataset, :source_type, :time_column, :time_unit, :time_zone
|
6
6
|
include Common
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
def initialize s, opts = {}
|
9
|
+
# detect the source's ORM type (ActiveRecord or Mongoid; :unknown otherwise)
|
10
|
+
@source_type = ({Utils.is_activerecord?(s) => :activerecord, Utils.is_mongoid?(s) => :mongoid}.detect {|k, v| k } || {})[1] || :unknown
|
11
|
+
|
12
|
+
@time_column = opts[:time_column]
|
13
|
+
@time_unit = opts[:time_unit]
|
14
|
+
@time_zone = opts[:time_zone] || Statlysis.default_time_zone || Time.zone || Time.now.utc_offset
|
15
|
+
|
16
|
+
# insert source as a dataset
|
17
|
+
@multiple_dataset = (s.is_a?(ActiveRecordDataset) ? s : ActiveRecordDataset.new(cron).add_source(s)) if is_activerecord?
|
18
|
+
@multiple_dataset = (s.is_a?(MongoidDataset) ? s : MongoidDataset.new(cron).add_source(s)) if is_mongoid?
|
19
|
+
@multiple_dataset.instance_variable_set("@cron", cron) if is_orm? && @multiple_dataset.cron.nil?
|
20
|
+
|
21
|
+
@stat_table_name = opts[:stat_table_name] if opts[:stat_table_name]
|
22
|
+
|
14
23
|
cron
|
15
24
|
end
|
16
|
-
def output; raise
|
17
|
-
def
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
def
|
22
|
-
|
23
|
-
str = "#<#{cron.class} @source=#{source_inspect} @stat_table_name=#{cron.stat_table_name} @time_column=#{cron.time_column} @stat_table=#{cron.stat_table}"
|
24
|
-
str << " @stat_model=#{cron.stat_model}" if cron.methods.index(:stat_model)
|
25
|
-
str << ">"
|
26
|
-
str
|
27
|
-
end
|
25
|
+
def output; raise DefaultNotImplementWrongMessage end
|
26
|
+
def reoutput; @output = nil; output end
|
27
|
+
def setup_stat_model; raise DefaultNotImplementWrongMessage end
|
28
|
+
def run; raise DefaultNotImplementWrongMessage end
|
29
|
+
def is_activerecord?; @source_type == :activerecord; end
|
30
|
+
def is_mongoid?; @source_type == :mongoid; end
|
31
|
+
def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
|
28
32
|
|
33
|
+
def _source
|
34
|
+
cron.multiple_dataset.sources.first
|
35
|
+
end
|
29
36
|
def source_where_array
|
30
37
|
# TODO follow index seq
|
31
|
-
a =
|
38
|
+
a = _source.where("").where_values.map do |equality|
|
32
39
|
# use full keyvalue index name
|
33
40
|
equality.is_a?(String) ? equality.to_sym : "#{equality.operand1.name}#{equality.operand2}"
|
34
|
-
end if
|
35
|
-
a =
|
36
|
-
a.map {|
|
37
|
-
end
|
38
|
-
|
39
|
-
def source_name
|
40
|
-
@source_name ||= begin
|
41
|
-
m = :table_name if is_mysql?
|
42
|
-
m = :collection_name if is_mongodb?
|
43
|
-
cron.source.send(m)
|
44
|
-
end
|
41
|
+
end if is_activerecord?
|
42
|
+
a = _source.all.selector.reject {|k, v| k == 't' } if is_mongoid?
|
43
|
+
a.map {|s1| s1.to_s.split(//).select {|s2| s2.match(/[a-z0-9]/i) }.join }.sort.map(&:to_sym)
|
45
44
|
end
|
46
45
|
|
47
46
|
# automatic mode: derive the range from the stat table and the data source's first time
|
48
47
|
# or
|
49
48
|
# specify TIME_RANGE and TIME_UNIT in shell to run
|
50
49
|
def time_range
|
51
|
-
return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day')) if ENV['TIME_RANGE']
|
52
|
-
#
|
50
|
+
return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day'), :zone => cron.time_zone) if ENV['TIME_RANGE']
|
51
|
+
# 选择开始时间。取出统计表的最后时间,和数据表的最先时间对比,哪个在后就选择哪个
|
53
52
|
begin_day = DateTime.now.beginning_of_day
|
54
|
-
st_timebegin = (a = cron.
|
55
|
-
|
56
|
-
|
53
|
+
st_timebegin = (a = cron.stat_model.order(:t).where("t >= ?", begin_day.yesterday).first) ? a[:t] : nil
|
54
|
+
|
55
|
+
# TODO support multiple log
|
56
|
+
cron.stat_model.where("t >= ?", begin_day.tomorrow).delete # 明天的数据没出来肯定统计不了
|
57
|
+
timebegin = (multiple_dataset.first_time != DateTime1970) ? multiple_dataset.first_time : (DateTime.now - 1.second)
|
57
58
|
timebegin = Time.at(timebegin) if is_time_column_integer?
|
58
59
|
timebegin = (st_timebegin > timebegin) ? st_timebegin : timebegin if st_timebegin
|
59
60
|
|
60
61
|
timeend = DateTime.now
|
61
|
-
|
62
|
+
logger.info "#{multiple_dataset.name}'s range #{timebegin..timeend}"
|
62
63
|
# 把统计表的最后时间点也包含进去重新计算下
|
63
64
|
TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
|
64
65
|
end
|
65
66
|
|
66
67
|
protected
|
67
|
-
def is_mysql?; @_is_mysql ||= modules.grep(/ActiveRecord::Store/).any? end
|
68
|
-
def is_mongodb?; @_is_mongodb ||= modules.grep(/Mongoid::Document/).any? end
|
69
|
-
def modules; @_modules ||= cron.source.included_modules.map(&:to_s) end
|
70
68
|
|
71
69
|
# 兼容采用整数类型作时间字段
|
72
70
|
def is_time_column_integer?
|
73
|
-
if
|
74
|
-
|
71
|
+
if is_activerecord?
|
72
|
+
_source.columns_hash[cron.time_column.to_s].type == :integer
|
75
73
|
else
|
76
74
|
false
|
77
75
|
end
|
data/lib/statlysis/cron/count.rb
CHANGED
@@ -5,89 +5,47 @@ module Statlysis
|
|
5
5
|
def initialize source, opts = {}
|
6
6
|
super
|
7
7
|
Statlysis.check_set_database
|
8
|
-
cron.
|
9
|
-
Statlysis.setup_stat_table_and_model cron
|
8
|
+
cron.setup_stat_model
|
10
9
|
cron
|
11
10
|
end
|
12
11
|
|
13
12
|
# 设置数据源,并保存结果入数据库
|
14
13
|
def run
|
15
|
-
|
16
|
-
cron.source = cron.source.asc(cron.time_column) if is_mongodb?
|
17
|
-
|
18
|
-
(puts("#{cron.source_name} have no result!"); return false) if cron.output.blank?
|
14
|
+
(logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
|
19
15
|
# first delete the existing stat rows that fall within the output's time range
|
20
16
|
@output = cron.output
|
21
17
|
unless @output.any?
|
22
|
-
|
18
|
+
logger.info "没有数据"; return
|
23
19
|
end
|
24
|
-
|
20
|
+
num_i = 0; num_add = 999
|
25
21
|
Statlysis.sequel.transaction do
|
26
|
-
cron.
|
27
|
-
while !(_a = @output[
|
22
|
+
cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
|
23
|
+
while !(_a = @output[num_i..(num_i+num_add)]).blank? do
|
28
24
|
# batch insert all
|
29
|
-
cron.
|
30
|
-
|
25
|
+
cron.stat_model.multi_insert _a
|
26
|
+
num_i += (num_add + 1)
|
31
27
|
end
|
32
28
|
end
|
29
|
+
|
30
|
+
return self
|
33
31
|
end
|
34
32
|
|
35
33
|
|
36
|
-
def reoutput; @output = nil; output end
|
37
34
|
protected
|
38
35
|
def unit_range_query time, time_begin = nil
|
39
36
|
# time begin and end
|
40
|
-
tb = time
|
37
|
+
tb = time
|
41
38
|
te = (time+1.send(cron.time_unit)-1.second)
|
42
39
|
tb, te = tb.to_i, te.to_i if is_time_column_integer?
|
43
40
|
tb = time_begin || tb
|
44
|
-
return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if
|
45
|
-
return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if
|
41
|
+
return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
|
42
|
+
return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
|
46
43
|
end
|
47
44
|
|
48
45
|
end
|
49
46
|
|
50
|
-
|
51
|
-
def setup_stat_table
|
52
|
-
# TODO migration proc, merge into setup_stat_table_and_model
|
53
|
-
cron.stat_table_name = [cron.class.name.split("::")[-1], cron.source_name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
|
54
|
-
raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
|
55
|
-
unless Statlysis.sequel.table_exists?(cron.stat_table_name)
|
56
|
-
Statlysis.sequel.transaction do
|
57
|
-
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
|
58
|
-
DateTime :t # alias for :time
|
59
|
-
end
|
60
|
-
|
61
|
-
# TODO Add cron.source_where_array before count_columns
|
62
|
-
count_columns = [:timely_c, :totally_c] # alias for :count
|
63
|
-
count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
|
64
|
-
index_column_names = [:t] + count_columns
|
65
|
-
index_column_names_name = index_column_names.join("_")
|
66
|
-
index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
|
67
|
-
|
68
|
-
Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
def output
|
74
|
-
@output ||= (cron.time_range.map do |time|
|
75
|
-
timely_c = cron.source.where(unit_range_query(time)).count
|
76
|
-
_t = DateTime.parse("19700101")
|
77
|
-
_t = is_time_column_integer? ? _t.to_i : _t
|
78
|
-
totally_c = cron.source.where(unit_range_query(time, _t)).count
|
79
|
-
|
80
|
-
puts "#{time.in_time_zone} #{cron.source_name} timely_c:#{timely_c} totally_c:#{totally_c}"
|
81
|
-
if timely_c.zero? && totally_c.zero?
|
82
|
-
nil
|
83
|
-
else
|
84
|
-
{:t => time, :timely_c => timely_c, :totally_c => totally_c}
|
85
|
-
end
|
86
|
-
end.compact)
|
87
|
-
end
|
88
|
-
end
|
47
|
+
end
|
89
48
|
|
90
|
-
class Dimensions < Count
|
91
|
-
end
|
92
49
|
|
93
|
-
|
50
|
+
require 'statlysis/cron/count/timely'
|
51
|
+
require 'statlysis/cron/count/dimensions'
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
class Timely < Count
|
5
|
+
def setup_stat_model
|
6
|
+
cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
|
7
|
+
raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
|
8
|
+
|
9
|
+
if not Statlysis.sequel.table_exists?(cron.stat_table_name)
|
10
|
+
Statlysis.sequel.transaction do
|
11
|
+
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
|
12
|
+
DateTime :t # alias for :time
|
13
|
+
end
|
14
|
+
|
15
|
+
# TODO Add cron.source_where_array before count_columns
|
16
|
+
count_columns = [:timely_c, :totally_c] # alias for :count
|
17
|
+
count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
|
18
|
+
index_column_names = [:t] + count_columns
|
19
|
+
index_column_names_name = index_column_names.join("_")
|
20
|
+
index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
|
21
|
+
|
22
|
+
# FIXME: index names should be unique across tables; a duplicate name fails with e.g.
|
23
|
+
# `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
|
24
|
+
if not Statlysis.config.is_skip_database_index
|
25
|
+
Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
n = cron.stat_table_name.to_s.singularize.camelize
|
31
|
+
cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
|
32
|
+
class ::#{n} < Sequel::Model;
|
33
|
+
self.set_dataset :#{cron.stat_table_name}
|
34
|
+
end
|
35
|
+
#{n}
|
36
|
+
MODEL
|
37
|
+
end
|
38
|
+
|
39
|
+
def output
|
40
|
+
@output ||= (cron.time_range.map do |time|
|
41
|
+
timely_c = 0
|
42
|
+
totally_c = 0
|
43
|
+
# support multiple data sources
|
44
|
+
_first_source = nil
|
45
|
+
cron.multiple_dataset.sources.each do |s|
|
46
|
+
timely_c += s.where(unit_range_query(time)).count
|
47
|
+
_t = DateTime1970
|
48
|
+
_t = is_time_column_integer? ? _t.to_i : _t
|
49
|
+
totally_c += s.where(unit_range_query(time, _t)).count
|
50
|
+
_first_source ||= s.where(unit_range_query(time))
|
51
|
+
end
|
52
|
+
logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
|
53
|
+
|
54
|
+
if timely_c.zero? && totally_c.zero?
|
55
|
+
nil
|
56
|
+
else
|
57
|
+
{:t => time, :timely_c => timely_c, :totally_c => totally_c}
|
58
|
+
end
|
59
|
+
end.compact)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
data/lib/statlysis/cron/top.rb
CHANGED
@@ -24,15 +24,7 @@ module Statlysis
|
|
24
24
|
cron.write
|
25
25
|
end
|
26
26
|
|
27
|
-
def write; raise
|
28
|
-
|
29
|
-
|
30
|
-
def self.ensure_statlysis_table_and_model tn
|
31
|
-
Top.new("FakeLogSource", :test => true, :stat_table_name => tn).pattern_table_and_model tn
|
32
|
-
end
|
33
|
-
def ensure_statlysis_table_and_model tn
|
34
|
-
Top.ensure_statlysis_table_and_model tn
|
35
|
-
end
|
27
|
+
def write; raise DefaultNotImplementWrongMessage end
|
36
28
|
|
37
29
|
def default_assign_attr key_symbol, opts
|
38
30
|
if opts[key_symbol]
|
@@ -43,58 +35,6 @@ module Statlysis
|
|
43
35
|
end
|
44
36
|
end
|
45
37
|
|
46
|
-
# 博客最近用户访问计算实现流程讨论
|
47
|
-
# 问题分两个,一个是后端,一个是前端。对后端来说,用户每次blog/index|show访问都生成访问记录,后端需要进行排重和去掉未登陆用户。如果在该次访问里进行,特别是某个博客突然火了,必然每次访问都产生IO(磁盘或网络,因为多进程要共享信息),所以必定是异步的。
|
48
|
-
# 前端展示考虑到缓存,一般是页面片段缓存,或者ajax载入。
|
49
|
-
# 后端异步如何计算每个blog的最近访客,log.js记录了最近访问,一个后台常驻进程循环对日志表按时间记录来读取blog访问信息,把最近访客信息刷新到blog。相对单次请求全部处理,这里处理次数更少,资源更节约,当然瓶颈也在日志表的索引更新和读取。
|
50
|
-
class LastestVisits < Top
|
51
|
-
attr_accessor :clock
|
52
|
-
attr_accessor :reject_proc
|
53
|
-
|
54
|
-
# *pattern_proc* is a proc to extract user_id or url_prefix to compute the
|
55
|
-
# top visitors from log
|
56
|
-
# *user_id_proc* is a proc to extract user_id from log
|
57
|
-
# *user_info_proc* is a proc to extract visitor informations(like id, name, ...)
|
58
|
-
# *reject_proc* filter visitors
|
59
|
-
def initialize source, opts = {}
|
60
|
-
# set variables
|
61
|
-
cron.reclock opts[:default_time]
|
62
|
-
cron.reject_proc = opts[:reject_proc] || proc {|pattern, user_id| pattern.to_i == user_id.to_i }
|
63
|
-
super
|
64
|
-
cron.pattern_table_and_model cron.stat_table_name
|
65
|
-
cron
|
66
|
-
end
|
67
|
-
|
68
|
-
def output
|
69
|
-
cron.logs = cron.source.asc(cron.time_column).where(cron.time_column => {"$gte" => cron.clock.current}).limit(1000).to_a
|
70
|
-
return {} if cron.logs.blank?
|
71
|
-
cron.logs.inject({}) do |h, log|
|
72
|
-
pattern = cron.pattern_proc.call(log)
|
73
|
-
if pattern
|
74
|
-
h[pattern] ||= []
|
75
|
-
user_id = cron.user_id_proc.call(log).to_i
|
76
|
-
h[pattern] << user_id if not user_id.zero?
|
77
|
-
end
|
78
|
-
h
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
def write
|
83
|
-
puts "#{Time.now.strftime('%H:%M:%S')} #{cron.stat_model} #{cron.output.inspect}"
|
84
|
-
cron.output.each do |pattern, user_ids|
|
85
|
-
s = cron.stat_model.find_or_create(:pattern => pattern)
|
86
|
-
old_array = (JSON.parse(s.result) rescue []).map {|i| Array(i)[0] }
|
87
|
-
new_user_ids = (old_array + user_ids).reverse.uniq.reverse # ensure the right items will overwrite the left [1,4,5,7,4,3,3,2,1,5].uniq => [1, 4, 5, 7, 3, 2]
|
88
|
-
s.update :result => new_user_ids.reject {|user_id| cron.reject_proc.call(pattern, user_id) rescue false }.map {|user_id| cron.user_info_proc.call(user_id) }.compact[0..cron.result_limit].to_json
|
89
|
-
end
|
90
|
-
cron.clock.update cron.logs.last.try(cron.time_column)
|
91
|
-
end
|
92
|
-
|
93
|
-
def reclock default_time = nil
|
94
|
-
cron.clock = Clock.new cron.stat_table_name, (default_time || cron.clock.current)
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
38
|
class SingleKv < Top
|
99
39
|
attr_accessor :time_ago, :stat_column_name
|
100
40
|
|
@@ -102,53 +42,13 @@ module Statlysis
|
|
102
42
|
[:time_ago, :stat_column_name].each {|key_symbol| default_assign_attr key_symbol, opts }
|
103
43
|
raise "#{cron.class} only is kv store" if cron.stat_table_name # TODO
|
104
44
|
super
|
105
|
-
cron.ensure_statlysis_table_and_model [Statlysis.tablename_default_pre, 'single_kvs'].compact.join("_").freeze
|
106
45
|
cron
|
107
46
|
end
|
108
47
|
|
109
48
|
end
|
110
49
|
|
111
|
-
|
112
|
-
#
|
113
|
-
# 解决方法为从用户行为中去综合分析,具体流程为:
|
114
|
-
# 从URI中抽取item_id, 从访问日志抽取排重IP和user_id,从like,fav,comment表获取更深的用户行为,把前两者通过一定比例相加得到排行。
|
115
|
-
# 最后用时间降温来避免马太效应,必可动态提升比例以使最近稍微热门的替换掉之前太热门的。
|
116
|
-
#
|
117
|
-
# 线性计算速度很快
|
118
|
-
#
|
119
|
-
class HotestItems < SingleKv
|
120
|
-
attr_accessor :key, :id_to_score_and_time_hash_proc
|
121
|
-
attr_accessor :limit
|
122
|
-
|
123
|
-
def initialize key, id_to_score_and_time_hash_proc
|
124
|
-
cron.key = key
|
125
|
-
cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
|
126
|
-
cron.limit = 20
|
127
|
-
super
|
128
|
-
cron
|
129
|
-
end
|
130
|
-
|
131
|
-
def output
|
132
|
-
t = cron.id_to_score_and_time_hash_proc
|
133
|
-
while t.is_a?(Proc) do
|
134
|
-
t = t.call
|
135
|
-
end
|
136
|
-
@id_to_score_and_time_hash = t
|
137
|
-
@id_to_day_hash = @id_to_score_and_time_hash.inject({}) {|h, ab| h[ab[0]] = (((Time.now - ab[1][1]) / (3600*24)).round + 1); h }
|
138
|
-
|
139
|
-
@id_to_timecooldown_hash = @id_to_score_and_time_hash.inject({}) {|h, kv| h[kv[0]] = (kv[1][0] / Math.sqrt(@id_to_day_hash[kv[0]])); h }
|
140
|
-
array = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
|
141
|
-
{cron.key => array}
|
142
|
-
end
|
143
|
-
|
144
|
-
def write
|
145
|
-
cron.output.each do |key, array|
|
146
|
-
json = array[0..140].to_json
|
147
|
-
StSingleKv.find_or_create(:pattern => key).update :result => json
|
148
|
-
StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}").update :result => json
|
149
|
-
end
|
150
|
-
end
|
50
|
+
end
|
151
51
|
|
152
|
-
end
|
153
52
|
|
154
|
-
|
53
|
+
require 'statlysis/cron/top/lastest_visits.rb'
|
54
|
+
require 'statlysis/cron/top/hotest_items.rb'
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
# 一般最近热门列表通常采用简单对一个字段记录访问数的算法,但是这可能会导致刷量等问题。
|
5
|
+
#
|
6
|
+
# 解决方法为从用户行为中去综合分析,具体流程为:
|
7
|
+
# 从URI中抽取item_id, 从访问日志抽取排重IP和user_id,从like,fav,comment表获取更深的用户行为,把前两者通过一定比例相加得到排行。
|
8
|
+
# 最后用时间降温来避免马太效应,必可动态提升比例以使最近稍微热门的替换掉之前太热门的。
|
9
|
+
#
|
10
|
+
# 线性计算速度很快
|
11
|
+
#
|
12
|
+
class HotestItems < SingleKv
|
13
|
+
attr_accessor :key, :id_to_score_and_time_hash_proc
|
14
|
+
attr_accessor :limit
|
15
|
+
|
16
|
+
def initialize key, id_to_score_and_time_hash_proc
|
17
|
+
cron.key = key
|
18
|
+
cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
|
19
|
+
cron.limit = 20
|
20
|
+
super
|
21
|
+
cron
|
22
|
+
end
|
23
|
+
|
24
|
+
def output
|
25
|
+
t = cron.id_to_score_and_time_hash_proc
|
26
|
+
while t.is_a?(Proc) do
|
27
|
+
t = t.call
|
28
|
+
end
|
29
|
+
@id_to_score_and_time_hash = t
|
30
|
+
@id_to_day_hash = @id_to_score_and_time_hash.inject({}) {|h, ab| h[ab[0]] = (((Time.now - ab[1][1]) / (3600*24)).round + 1); h }
|
31
|
+
|
32
|
+
@id_to_timecooldown_hash = @id_to_score_and_time_hash.inject({}) {|h, kv| h[kv[0]] = (kv[1][0] / Math.sqrt(@id_to_day_hash[kv[0]])); h }
|
33
|
+
array = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
|
34
|
+
{cron.key => array}
|
35
|
+
end
|
36
|
+
|
37
|
+
def write
|
38
|
+
cron.output.each do |key, array|
|
39
|
+
json = array[0..140].to_json
|
40
|
+
StSingleKv.find_or_create(:pattern => key).update :result => json
|
41
|
+
StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}").update :result => json
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|