statlysis 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Guardfile +14 -0
- data/README.markdown +77 -27
- data/Rakefile +1 -1
- data/lib/statlysis.rb +59 -101
- data/lib/statlysis/clock.rb +3 -3
- data/lib/statlysis/common.rb +4 -16
- data/lib/statlysis/configuration.rb +97 -2
- data/lib/statlysis/constants.rb +10 -0
- data/lib/statlysis/cron.rb +40 -42
- data/lib/statlysis/cron/count.rb +16 -58
- data/lib/statlysis/cron/count/dimensions.rb +7 -0
- data/lib/statlysis/cron/count/timely.rb +63 -0
- data/lib/statlysis/cron/top.rb +4 -104
- data/lib/statlysis/cron/top/hotest_items.rb +47 -0
- data/lib/statlysis/cron/top/lastest_visits.rb +53 -0
- data/lib/statlysis/cron_set.rb +26 -0
- data/lib/statlysis/dataset.rb +6 -0
- data/lib/statlysis/javascript/count.rb +3 -3
- data/lib/statlysis/multiple_dataset.rb +69 -0
- data/lib/statlysis/multiple_dataset/active_record.rb +36 -0
- data/lib/statlysis/multiple_dataset/mongoid.rb +54 -0
- data/lib/statlysis/rake.rb +6 -5
- data/lib/statlysis/similar.rb +11 -11
- data/lib/statlysis/timeseries.rb +12 -9
- data/lib/statlysis/utils.rb +40 -0
- data/statlysis.gemspec +13 -3
- data/test/config/database.yml +9 -0
- data/test/config/mongoid.yml +36 -0
- data/test/data/.gitkeep +0 -0
- data/test/data/code_gists_20130724.csv +1459 -0
- data/test/helper.rb +41 -3
- data/test/migrate/1_active_record.rb +8 -0
- data/test/models/.gitkeep +0 -0
- data/test/models/code_gist.rb +5 -0
- data/test/models/eoe_log.rb +53 -0
- data/test/test_daily_count.rb +22 -0
- data/test/test_mapreduce.rb +0 -13
- data/test/test_single_log_in_multiple_collections.rb +22 -0
- data/test/test_statlysis.rb +5 -50
- data/test/test_timeseries.rb +46 -0
- metadata +133 -12
- data/Gemfile.lock +0 -110
- data/LICENSE.txt +0 -20
- data/test/models/company.rb +0 -12
- data/test/models/employee.rb +0 -14
@@ -0,0 +1,10 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
TimeUnits = %w[hour day week month year]
|
5
|
+
DateTime1970 = Time.zone.parse("19700101").in_time_zone
|
6
|
+
|
7
|
+
DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
|
8
|
+
|
9
|
+
DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
|
10
|
+
end
|
data/lib/statlysis/cron.rb
CHANGED
@@ -2,76 +2,74 @@
|
|
2
2
|
|
3
3
|
module Statlysis
|
4
4
|
class Cron
|
5
|
-
|
5
|
+
attr_reader :multiple_dataset, :source_type, :time_column, :time_unit, :time_zone
|
6
6
|
include Common
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
def initialize s, opts = {}
|
9
|
+
# setup data type related
|
10
|
+
@source_type = ({Utils.is_activerecord?(s) => :activerecord, Utils.is_mongoid?(s) => :mongoid}.detect {|k, v| k } || {})[1] || :unknown
|
11
|
+
|
12
|
+
@time_column = opts[:time_column]
|
13
|
+
@time_unit = opts[:time_unit]
|
14
|
+
@time_zone = opts[:time_zone] || Statlysis.default_time_zone || Time.zone || Time.now.utc_offset
|
15
|
+
|
16
|
+
# insert source as a dataset
|
17
|
+
@multiple_dataset = (s.is_a?(ActiveRecordDataset) ? s : ActiveRecordDataset.new(cron).add_source(s)) if is_activerecord?
|
18
|
+
@multiple_dataset = (s.is_a?(MongoidDataset) ? s : MongoidDataset.new(cron).add_source(s)) if is_mongoid?
|
19
|
+
@multiple_dataset.instance_variable_set("@cron", cron) if is_orm? && @multiple_dataset.cron.nil?
|
20
|
+
|
21
|
+
@stat_table_name = opts[:stat_table_name] if opts[:stat_table_name]
|
22
|
+
|
14
23
|
cron
|
15
24
|
end
|
16
|
-
def output; raise
|
17
|
-
def
|
18
|
-
def
|
19
|
-
|
20
|
-
|
21
|
-
def
|
22
|
-
|
23
|
-
str = "#<#{cron.class} @source=#{source_inspect} @stat_table_name=#{cron.stat_table_name} @time_column=#{cron.time_column} @stat_table=#{cron.stat_table}"
|
24
|
-
str << " @stat_model=#{cron.stat_model}" if cron.methods.index(:stat_model)
|
25
|
-
str << ">"
|
26
|
-
str
|
27
|
-
end
|
25
|
+
def output; raise DefaultNotImplementWrongMessage end
|
26
|
+
def reoutput; @output = nil; output end
|
27
|
+
def setup_stat_model; raise DefaultNotImplementWrongMessage end
|
28
|
+
def run; raise DefaultNotImplementWrongMessage end
|
29
|
+
def is_activerecord?; @source_type == :activerecord; end
|
30
|
+
def is_mongoid?; @source_type == :mongoid; end
|
31
|
+
def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
|
28
32
|
|
33
|
+
def _source
|
34
|
+
cron.multiple_dataset.sources.first
|
35
|
+
end
|
29
36
|
def source_where_array
|
30
37
|
# TODO follow index seq
|
31
|
-
a =
|
38
|
+
a = _source.where("").where_values.map do |equality|
|
32
39
|
# use full keyvalue index name
|
33
40
|
equality.is_a?(String) ? equality.to_sym : "#{equality.operand1.name}#{equality.operand2}"
|
34
|
-
end if
|
35
|
-
a =
|
36
|
-
a.map {|
|
37
|
-
end
|
38
|
-
|
39
|
-
def source_name
|
40
|
-
@source_name ||= begin
|
41
|
-
m = :table_name if is_mysql?
|
42
|
-
m = :collection_name if is_mongodb?
|
43
|
-
cron.source.send(m)
|
44
|
-
end
|
41
|
+
end if is_activerecord?
|
42
|
+
a = _source.all.selector.reject {|k, v| k == 't' } if is_mongoid?
|
43
|
+
a.map {|s1| s1.to_s.split(//).select {|s2| s2.match(/[a-z0-9]/i) }.join }.sort.map(&:to_sym)
|
45
44
|
end
|
46
45
|
|
47
46
|
# automode
|
48
47
|
# or
|
49
48
|
# specify TIME_RANGE and TIME_UNIT in shell to run
|
50
49
|
def time_range
|
51
|
-
return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day')) if ENV['TIME_RANGE']
|
52
|
-
#
|
50
|
+
return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day'), :zone => cron.time_zone) if ENV['TIME_RANGE']
|
51
|
+
# 选择开始时间。取出统计表的最后时间,和数据表的最先时间对比,哪个在后就选择哪个
|
53
52
|
begin_day = DateTime.now.beginning_of_day
|
54
|
-
st_timebegin = (a = cron.
|
55
|
-
|
56
|
-
|
53
|
+
st_timebegin = (a = cron.stat_model.order(:t).where("t >= ?", begin_day.yesterday).first) ? a[:t] : nil
|
54
|
+
|
55
|
+
# TODO support multiple log
|
56
|
+
cron.stat_model.where("t >= ?", begin_day.tomorrow).delete # 明天的数据没出来肯定统计不了
|
57
|
+
timebegin = (multiple_dataset.first_time != DateTime1970) ? multiple_dataset.first_time : (DateTime.now - 1.second)
|
57
58
|
timebegin = Time.at(timebegin) if is_time_column_integer?
|
58
59
|
timebegin = (st_timebegin > timebegin) ? st_timebegin : timebegin if st_timebegin
|
59
60
|
|
60
61
|
timeend = DateTime.now
|
61
|
-
|
62
|
+
logger.info "#{multiple_dataset.name}'s range #{timebegin..timeend}"
|
62
63
|
# 把统计表的最后时间点也包含进去重新计算下
|
63
64
|
TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
|
64
65
|
end
|
65
66
|
|
66
67
|
protected
|
67
|
-
def is_mysql?; @_is_mysql ||= modules.grep(/ActiveRecord::Store/).any? end
|
68
|
-
def is_mongodb?; @_is_mongodb ||= modules.grep(/Mongoid::Document/).any? end
|
69
|
-
def modules; @_modules ||= cron.source.included_modules.map(&:to_s) end
|
70
68
|
|
71
69
|
# 兼容采用整数类型作时间字段
|
72
70
|
def is_time_column_integer?
|
73
|
-
if
|
74
|
-
|
71
|
+
if is_activerecord?
|
72
|
+
_source.columns_hash[cron.time_column.to_s].type == :integer
|
75
73
|
else
|
76
74
|
false
|
77
75
|
end
|
data/lib/statlysis/cron/count.rb
CHANGED
@@ -5,89 +5,47 @@ module Statlysis
|
|
5
5
|
def initialize source, opts = {}
|
6
6
|
super
|
7
7
|
Statlysis.check_set_database
|
8
|
-
cron.
|
9
|
-
Statlysis.setup_stat_table_and_model cron
|
8
|
+
cron.setup_stat_model
|
10
9
|
cron
|
11
10
|
end
|
12
11
|
|
13
12
|
# 设置数据源,并保存结果入数据库
|
14
13
|
def run
|
15
|
-
|
16
|
-
cron.source = cron.source.asc(cron.time_column) if is_mongodb?
|
17
|
-
|
18
|
-
(puts("#{cron.source_name} have no result!"); return false) if cron.output.blank?
|
14
|
+
(logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
|
19
15
|
# delete first in range
|
20
16
|
@output = cron.output
|
21
17
|
unless @output.any?
|
22
|
-
|
18
|
+
logger.info "没有数据"; return
|
23
19
|
end
|
24
|
-
|
20
|
+
num_i = 0; num_add = 999
|
25
21
|
Statlysis.sequel.transaction do
|
26
|
-
cron.
|
27
|
-
while !(_a = @output[
|
22
|
+
cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
|
23
|
+
while !(_a = @output[num_i..(num_i+num_add)]).blank? do
|
28
24
|
# batch insert all
|
29
|
-
cron.
|
30
|
-
|
25
|
+
cron.stat_model.multi_insert _a
|
26
|
+
num_i += (num_add + 1)
|
31
27
|
end
|
32
28
|
end
|
29
|
+
|
30
|
+
return self
|
33
31
|
end
|
34
32
|
|
35
33
|
|
36
|
-
def reoutput; @output = nil; output end
|
37
34
|
protected
|
38
35
|
def unit_range_query time, time_begin = nil
|
39
36
|
# time begin and end
|
40
|
-
tb = time
|
37
|
+
tb = time
|
41
38
|
te = (time+1.send(cron.time_unit)-1.second)
|
42
39
|
tb, te = tb.to_i, te.to_i if is_time_column_integer?
|
43
40
|
tb = time_begin || tb
|
44
|
-
return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if
|
45
|
-
return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if
|
41
|
+
return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
|
42
|
+
return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
|
46
43
|
end
|
47
44
|
|
48
45
|
end
|
49
46
|
|
50
|
-
|
51
|
-
def setup_stat_table
|
52
|
-
# TODO migration proc, merge into setup_stat_table_and_model
|
53
|
-
cron.stat_table_name = [cron.class.name.split("::")[-1], cron.source_name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
|
54
|
-
raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
|
55
|
-
unless Statlysis.sequel.table_exists?(cron.stat_table_name)
|
56
|
-
Statlysis.sequel.transaction do
|
57
|
-
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
|
58
|
-
DateTime :t # alias for :time
|
59
|
-
end
|
60
|
-
|
61
|
-
# TODO Add cron.source_where_array before count_columns
|
62
|
-
count_columns = [:timely_c, :totally_c] # alias for :count
|
63
|
-
count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
|
64
|
-
index_column_names = [:t] + count_columns
|
65
|
-
index_column_names_name = index_column_names.join("_")
|
66
|
-
index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
|
67
|
-
|
68
|
-
Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
def output
|
74
|
-
@output ||= (cron.time_range.map do |time|
|
75
|
-
timely_c = cron.source.where(unit_range_query(time)).count
|
76
|
-
_t = DateTime.parse("19700101")
|
77
|
-
_t = is_time_column_integer? ? _t.to_i : _t
|
78
|
-
totally_c = cron.source.where(unit_range_query(time, _t)).count
|
79
|
-
|
80
|
-
puts "#{time.in_time_zone} #{cron.source_name} timely_c:#{timely_c} totally_c:#{totally_c}"
|
81
|
-
if timely_c.zero? && totally_c.zero?
|
82
|
-
nil
|
83
|
-
else
|
84
|
-
{:t => time, :timely_c => timely_c, :totally_c => totally_c}
|
85
|
-
end
|
86
|
-
end.compact)
|
87
|
-
end
|
88
|
-
end
|
47
|
+
end
|
89
48
|
|
90
|
-
class Dimensions < Count
|
91
|
-
end
|
92
49
|
|
93
|
-
|
50
|
+
require 'statlysis/cron/count/timely'
|
51
|
+
require 'statlysis/cron/count/dimensions'
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
class Timely < Count
|
5
|
+
def setup_stat_model
|
6
|
+
cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
|
7
|
+
raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
|
8
|
+
|
9
|
+
if not Statlysis.sequel.table_exists?(cron.stat_table_name)
|
10
|
+
Statlysis.sequel.transaction do
|
11
|
+
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
|
12
|
+
DateTime :t # alias for :time
|
13
|
+
end
|
14
|
+
|
15
|
+
# TODO Add cron.source_where_array before count_columns
|
16
|
+
count_columns = [:timely_c, :totally_c] # alias for :count
|
17
|
+
count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
|
18
|
+
index_column_names = [:t] + count_columns
|
19
|
+
index_column_names_name = index_column_names.join("_")
|
20
|
+
index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
|
21
|
+
|
22
|
+
# Fix there should be uniq index name between tables
|
23
|
+
# `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
|
24
|
+
if not Statlysis.config.is_skip_database_index
|
25
|
+
Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
n = cron.stat_table_name.to_s.singularize.camelize
|
31
|
+
cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
|
32
|
+
class ::#{n} < Sequel::Model;
|
33
|
+
self.set_dataset :#{cron.stat_table_name}
|
34
|
+
end
|
35
|
+
#{n}
|
36
|
+
MODEL
|
37
|
+
end
|
38
|
+
|
39
|
+
def output
|
40
|
+
@output ||= (cron.time_range.map do |time|
|
41
|
+
timely_c = 0
|
42
|
+
totally_c = 0
|
43
|
+
# support multiple data sources
|
44
|
+
_first_source = nil
|
45
|
+
cron.multiple_dataset.sources.each do |s|
|
46
|
+
timely_c += s.where(unit_range_query(time)).count
|
47
|
+
_t = DateTime1970
|
48
|
+
_t = is_time_column_integer? ? _t.to_i : _t
|
49
|
+
totally_c += s.where(unit_range_query(time, _t)).count
|
50
|
+
_first_source ||= s.where(unit_range_query(time))
|
51
|
+
end
|
52
|
+
logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
|
53
|
+
|
54
|
+
if timely_c.zero? && totally_c.zero?
|
55
|
+
nil
|
56
|
+
else
|
57
|
+
{:t => time, :timely_c => timely_c, :totally_c => totally_c}
|
58
|
+
end
|
59
|
+
end.compact)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
data/lib/statlysis/cron/top.rb
CHANGED
@@ -24,15 +24,7 @@ module Statlysis
|
|
24
24
|
cron.write
|
25
25
|
end
|
26
26
|
|
27
|
-
def write; raise
|
28
|
-
|
29
|
-
|
30
|
-
def self.ensure_statlysis_table_and_model tn
|
31
|
-
Top.new("FakeLogSource", :test => true, :stat_table_name => tn).pattern_table_and_model tn
|
32
|
-
end
|
33
|
-
def ensure_statlysis_table_and_model tn
|
34
|
-
Top.ensure_statlysis_table_and_model tn
|
35
|
-
end
|
27
|
+
def write; raise DefaultNotImplementWrongMessage end
|
36
28
|
|
37
29
|
def default_assign_attr key_symbol, opts
|
38
30
|
if opts[key_symbol]
|
@@ -43,58 +35,6 @@ module Statlysis
|
|
43
35
|
end
|
44
36
|
end
|
45
37
|
|
46
|
-
# 博客最近用户访问计算实现流程讨论
|
47
|
-
# 问题分两个,一个是后端,一个是前端。对后端来说,用户每次blog/index|show访问都生成访问记录,后端需要进行排重和去掉未登陆用户。如果在该次访问里进行,特别是某个博客突然火了,必然每次访问都产生IO(磁盘或网络,因为多进程要共享信息),所以必定是异步的。
|
48
|
-
# 前端展示考虑到缓存,一般是页面片段缓存,或者ajax载入。
|
49
|
-
# 后端异步如何计算每个blog的最近访客,log.js记录了最近访问,一个后台常驻进程循环对日志表按时间记录来读取blog访问信息,把最近访客信息刷新到blog。相对单次请求全部处理,这里处理次数更少,资源更节约,当然瓶颈也在日志表的索引更新和读取。
|
50
|
-
class LastestVisits < Top
|
51
|
-
attr_accessor :clock
|
52
|
-
attr_accessor :reject_proc
|
53
|
-
|
54
|
-
# *pattern_proc* is a proc to extract user_id or url_prefix to compute the
|
55
|
-
# top visitors from log
|
56
|
-
# *user_id_proc* is a proc to extract user_id from log
|
57
|
-
# *user_info_proc* is a proc to extract visitor informations(like id, name, ...)
|
58
|
-
# *reject_proc* filter visitors
|
59
|
-
def initialize source, opts = {}
|
60
|
-
# set variables
|
61
|
-
cron.reclock opts[:default_time]
|
62
|
-
cron.reject_proc = opts[:reject_proc] || proc {|pattern, user_id| pattern.to_i == user_id.to_i }
|
63
|
-
super
|
64
|
-
cron.pattern_table_and_model cron.stat_table_name
|
65
|
-
cron
|
66
|
-
end
|
67
|
-
|
68
|
-
def output
|
69
|
-
cron.logs = cron.source.asc(cron.time_column).where(cron.time_column => {"$gte" => cron.clock.current}).limit(1000).to_a
|
70
|
-
return {} if cron.logs.blank?
|
71
|
-
cron.logs.inject({}) do |h, log|
|
72
|
-
pattern = cron.pattern_proc.call(log)
|
73
|
-
if pattern
|
74
|
-
h[pattern] ||= []
|
75
|
-
user_id = cron.user_id_proc.call(log).to_i
|
76
|
-
h[pattern] << user_id if not user_id.zero?
|
77
|
-
end
|
78
|
-
h
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
def write
|
83
|
-
puts "#{Time.now.strftime('%H:%M:%S')} #{cron.stat_model} #{cron.output.inspect}"
|
84
|
-
cron.output.each do |pattern, user_ids|
|
85
|
-
s = cron.stat_model.find_or_create(:pattern => pattern)
|
86
|
-
old_array = (JSON.parse(s.result) rescue []).map {|i| Array(i)[0] }
|
87
|
-
new_user_ids = (old_array + user_ids).reverse.uniq.reverse # ensure the right items will overwrite the left [1,4,5,7,4,3,3,2,1,5].uniq => [1, 4, 5, 7, 3, 2]
|
88
|
-
s.update :result => new_user_ids.reject {|user_id| cron.reject_proc.call(pattern, user_id) rescue false }.map {|user_id| cron.user_info_proc.call(user_id) }.compact[0..cron.result_limit].to_json
|
89
|
-
end
|
90
|
-
cron.clock.update cron.logs.last.try(cron.time_column)
|
91
|
-
end
|
92
|
-
|
93
|
-
def reclock default_time = nil
|
94
|
-
cron.clock = Clock.new cron.stat_table_name, (default_time || cron.clock.current)
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
38
|
class SingleKv < Top
|
99
39
|
attr_accessor :time_ago, :stat_column_name
|
100
40
|
|
@@ -102,53 +42,13 @@ module Statlysis
|
|
102
42
|
[:time_ago, :stat_column_name].each {|key_symbol| default_assign_attr key_symbol, opts }
|
103
43
|
raise "#{cron.class} only is kv store" if cron.stat_table_name # TODO
|
104
44
|
super
|
105
|
-
cron.ensure_statlysis_table_and_model [Statlysis.tablename_default_pre, 'single_kvs'].compact.join("_").freeze
|
106
45
|
cron
|
107
46
|
end
|
108
47
|
|
109
48
|
end
|
110
49
|
|
111
|
-
|
112
|
-
#
|
113
|
-
# 解决方法为从用户行为中去综合分析,具体流程为:
|
114
|
-
# 从URI中抽取item_id, 从访问日志抽取排重IP和user_id,从like,fav,comment表获取更深的用户行为,把前两者通过一定比例相加得到排行。
|
115
|
-
# 最后用时间降温来避免马太效应,必可动态提升比例以使最近稍微热门的替换掉之前太热门的。
|
116
|
-
#
|
117
|
-
# 线性计算速度很快
|
118
|
-
#
|
119
|
-
class HotestItems < SingleKv
|
120
|
-
attr_accessor :key, :id_to_score_and_time_hash_proc
|
121
|
-
attr_accessor :limit
|
122
|
-
|
123
|
-
def initialize key, id_to_score_and_time_hash_proc
|
124
|
-
cron.key = key
|
125
|
-
cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
|
126
|
-
cron.limit = 20
|
127
|
-
super
|
128
|
-
cron
|
129
|
-
end
|
130
|
-
|
131
|
-
def output
|
132
|
-
t = cron.id_to_score_and_time_hash_proc
|
133
|
-
while t.is_a?(Proc) do
|
134
|
-
t = t.call
|
135
|
-
end
|
136
|
-
@id_to_score_and_time_hash = t
|
137
|
-
@id_to_day_hash = @id_to_score_and_time_hash.inject({}) {|h, ab| h[ab[0]] = (((Time.now - ab[1][1]) / (3600*24)).round + 1); h }
|
138
|
-
|
139
|
-
@id_to_timecooldown_hash = @id_to_score_and_time_hash.inject({}) {|h, kv| h[kv[0]] = (kv[1][0] / Math.sqrt(@id_to_day_hash[kv[0]])); h }
|
140
|
-
array = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
|
141
|
-
{cron.key => array}
|
142
|
-
end
|
143
|
-
|
144
|
-
def write
|
145
|
-
cron.output.each do |key, array|
|
146
|
-
json = array[0..140].to_json
|
147
|
-
StSingleKv.find_or_create(:pattern => key).update :result => json
|
148
|
-
StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}").update :result => json
|
149
|
-
end
|
150
|
-
end
|
50
|
+
end
|
151
51
|
|
152
|
-
end
|
153
52
|
|
154
|
-
|
53
|
+
require 'statlysis/cron/top/lastest_visits.rb'
|
54
|
+
require 'statlysis/cron/top/hotest_items.rb'
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
# 一般最近热门列表通常采用简单对一个字段记录访问数的算法,但是这可能会导致刷量等问题。
|
5
|
+
#
|
6
|
+
# 解决方法为从用户行为中去综合分析,具体流程为:
|
7
|
+
# 从URI中抽取item_id, 从访问日志抽取排重IP和user_id,从like,fav,comment表获取更深的用户行为,把前两者通过一定比例相加得到排行。
|
8
|
+
# 最后用时间降温来避免马太效应,必可动态提升比例以使最近稍微热门的替换掉之前太热门的。
|
9
|
+
#
|
10
|
+
# 线性计算速度很快
|
11
|
+
#
|
12
|
+
class HotestItems < SingleKv
|
13
|
+
attr_accessor :key, :id_to_score_and_time_hash_proc
|
14
|
+
attr_accessor :limit
|
15
|
+
|
16
|
+
def initialize key, id_to_score_and_time_hash_proc
|
17
|
+
cron.key = key
|
18
|
+
cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
|
19
|
+
cron.limit = 20
|
20
|
+
super
|
21
|
+
cron
|
22
|
+
end
|
23
|
+
|
24
|
+
def output
|
25
|
+
t = cron.id_to_score_and_time_hash_proc
|
26
|
+
while t.is_a?(Proc) do
|
27
|
+
t = t.call
|
28
|
+
end
|
29
|
+
@id_to_score_and_time_hash = t
|
30
|
+
@id_to_day_hash = @id_to_score_and_time_hash.inject({}) {|h, ab| h[ab[0]] = (((Time.now - ab[1][1]) / (3600*24)).round + 1); h }
|
31
|
+
|
32
|
+
@id_to_timecooldown_hash = @id_to_score_and_time_hash.inject({}) {|h, kv| h[kv[0]] = (kv[1][0] / Math.sqrt(@id_to_day_hash[kv[0]])); h }
|
33
|
+
array = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
|
34
|
+
{cron.key => array}
|
35
|
+
end
|
36
|
+
|
37
|
+
def write
|
38
|
+
cron.output.each do |key, array|
|
39
|
+
json = array[0..140].to_json
|
40
|
+
StSingleKv.find_or_create(:pattern => key).update :result => json
|
41
|
+
StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}").update :result => json
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|