statlysis 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46)
  1. data/.gitignore +3 -0
  2. data/Guardfile +14 -0
  3. data/README.markdown +77 -27
  4. data/Rakefile +1 -1
  5. data/lib/statlysis.rb +59 -101
  6. data/lib/statlysis/clock.rb +3 -3
  7. data/lib/statlysis/common.rb +4 -16
  8. data/lib/statlysis/configuration.rb +97 -2
  9. data/lib/statlysis/constants.rb +10 -0
  10. data/lib/statlysis/cron.rb +40 -42
  11. data/lib/statlysis/cron/count.rb +16 -58
  12. data/lib/statlysis/cron/count/dimensions.rb +7 -0
  13. data/lib/statlysis/cron/count/timely.rb +63 -0
  14. data/lib/statlysis/cron/top.rb +4 -104
  15. data/lib/statlysis/cron/top/hotest_items.rb +47 -0
  16. data/lib/statlysis/cron/top/lastest_visits.rb +53 -0
  17. data/lib/statlysis/cron_set.rb +26 -0
  18. data/lib/statlysis/dataset.rb +6 -0
  19. data/lib/statlysis/javascript/count.rb +3 -3
  20. data/lib/statlysis/multiple_dataset.rb +69 -0
  21. data/lib/statlysis/multiple_dataset/active_record.rb +36 -0
  22. data/lib/statlysis/multiple_dataset/mongoid.rb +54 -0
  23. data/lib/statlysis/rake.rb +6 -5
  24. data/lib/statlysis/similar.rb +11 -11
  25. data/lib/statlysis/timeseries.rb +12 -9
  26. data/lib/statlysis/utils.rb +40 -0
  27. data/statlysis.gemspec +13 -3
  28. data/test/config/database.yml +9 -0
  29. data/test/config/mongoid.yml +36 -0
  30. data/test/data/.gitkeep +0 -0
  31. data/test/data/code_gists_20130724.csv +1459 -0
  32. data/test/helper.rb +41 -3
  33. data/test/migrate/1_active_record.rb +8 -0
  34. data/test/models/.gitkeep +0 -0
  35. data/test/models/code_gist.rb +5 -0
  36. data/test/models/eoe_log.rb +53 -0
  37. data/test/test_daily_count.rb +22 -0
  38. data/test/test_mapreduce.rb +0 -13
  39. data/test/test_single_log_in_multiple_collections.rb +22 -0
  40. data/test/test_statlysis.rb +5 -50
  41. data/test/test_timeseries.rb +46 -0
  42. metadata +133 -12
  43. data/Gemfile.lock +0 -110
  44. data/LICENSE.txt +0 -20
  45. data/test/models/company.rb +0 -12
  46. data/test/models/employee.rb +0 -14
@@ -0,0 +1,10 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ TimeUnits = %w[hour day week month year]
5
+ DateTime1970 = Time.zone.parse("19700101").in_time_zone
6
+
7
+ DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
8
+
9
+ DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
10
+ end
@@ -2,76 +2,74 @@
2
2
 
3
3
  module Statlysis
4
4
  class Cron
5
- attr_accessor :source, :time_column, :time_unit
5
+ attr_reader :multiple_dataset, :source_type, :time_column, :time_unit, :time_zone
6
6
  include Common
7
7
 
8
- DefaultWrongMessage = "not implement yet, please config it by subclass".freeze
9
- def initialize source, opts = {}
10
- cron.stat_table_name = opts[:stat_table_name] if opts[:stat_table_name]
11
- cron.time_column = opts[:time_column]
12
- cron.source = source
13
- cron.time_unit = opts[:time_unit]
8
+ def initialize s, opts = {}
9
+ # setup data type related
10
+ @source_type = ({Utils.is_activerecord?(s) => :activerecord, Utils.is_mongoid?(s) => :mongoid}.detect {|k, v| k } || {})[1] || :unknown
11
+
12
+ @time_column = opts[:time_column]
13
+ @time_unit = opts[:time_unit]
14
+ @time_zone = opts[:time_zone] || Statlysis.default_time_zone || Time.zone || Time.now.utc_offset
15
+
16
+ # insert source as a dataset
17
+ @multiple_dataset = (s.is_a?(ActiveRecordDataset) ? s : ActiveRecordDataset.new(cron).add_source(s)) if is_activerecord?
18
+ @multiple_dataset = (s.is_a?(MongoidDataset) ? s : MongoidDataset.new(cron).add_source(s)) if is_mongoid?
19
+ @multiple_dataset.instance_variable_set("@cron", cron) if is_orm? && @multiple_dataset.cron.nil?
20
+
21
+ @stat_table_name = opts[:stat_table_name] if opts[:stat_table_name]
22
+
14
23
  cron
15
24
  end
16
- def output; raise DefaultWrongMessage end
17
- def setup_stat_table; raise DefaultWrongMessage end
18
- def run; raise DefaultWrongMessage end
19
-
20
- # overwrite to lazy load @source
21
- def inspect
22
- source_inspect = is_mysql? ? cron.source.to_sql : cron.source
23
- str = "#<#{cron.class} @source=#{source_inspect} @stat_table_name=#{cron.stat_table_name} @time_column=#{cron.time_column} @stat_table=#{cron.stat_table}"
24
- str << " @stat_model=#{cron.stat_model}" if cron.methods.index(:stat_model)
25
- str << ">"
26
- str
27
- end
25
+ def output; raise DefaultNotImplementWrongMessage end
26
+ def reoutput; @output = nil; output end
27
+ def setup_stat_model; raise DefaultNotImplementWrongMessage end
28
+ def run; raise DefaultNotImplementWrongMessage end
29
+ def is_activerecord?; @source_type == :activerecord; end
30
+ def is_mongoid?; @source_type == :mongoid; end
31
+ def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
28
32
 
33
+ def _source
34
+ cron.multiple_dataset.sources.first
35
+ end
29
36
  def source_where_array
30
37
  # TODO follow index seq
31
- a = cron.source.where("").where_values.map do |equality|
38
+ a = _source.where("").where_values.map do |equality|
32
39
  # use full keyvalue index name
33
40
  equality.is_a?(String) ? equality.to_sym : "#{equality.operand1.name}#{equality.operand2}"
34
- end if is_mysql?
35
- a = cron.source.all.selector.reject {|k, v| k == 't' } if is_mongodb?
36
- a.map {|s| s.to_s.split(//).select {|s| s.match(/[a-z0-9]/i) }.join }.sort.map(&:to_sym)
37
- end
38
-
39
- def source_name
40
- @source_name ||= begin
41
- m = :table_name if is_mysql?
42
- m = :collection_name if is_mongodb?
43
- cron.source.send(m)
44
- end
41
+ end if is_activerecord?
42
+ a = _source.all.selector.reject {|k, v| k == 't' } if is_mongoid?
43
+ a.map {|s1| s1.to_s.split(//).select {|s2| s2.match(/[a-z0-9]/i) }.join }.sort.map(&:to_sym)
45
44
  end
46
45
 
47
46
  # automode
48
47
  # or
49
48
  # specify TIME_RANGE and TIME_UNIT in shell to run
50
49
  def time_range
51
- return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day')) if ENV['TIME_RANGE']
52
- # 选择开始时间。取出统计表的最后时间,和数据表的最先时间对比,哪个最后就选择
50
+ return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day'), :zone => cron.time_zone) if ENV['TIME_RANGE']
51
+ # 选择开始时间。取出统计表的最后时间,和数据表的最先时间对比,哪个在后就选择哪个
53
52
  begin_day = DateTime.now.beginning_of_day
54
- st_timebegin = (a = cron.stat_table.order(:t).where("t >= ?", begin_day.yesterday).first) ? a[:t] : nil
55
- cron.stat_table.where("t >= ?", begin_day.tomorrow).delete # 明天的数据没出来肯定统计不了
56
- timebegin = (a = cron.source.first) ? a.send(cron.time_column) : (DateTime.now - 1.second)
53
+ st_timebegin = (a = cron.stat_model.order(:t).where("t >= ?", begin_day.yesterday).first) ? a[:t] : nil
54
+
55
+ # TODO support multiple log
56
+ cron.stat_model.where("t >= ?", begin_day.tomorrow).delete # 明天的数据没出来肯定统计不了
57
+ timebegin = (multiple_dataset.first_time != DateTime1970) ? multiple_dataset.first_time : (DateTime.now - 1.second)
57
58
  timebegin = Time.at(timebegin) if is_time_column_integer?
58
59
  timebegin = (st_timebegin > timebegin) ? st_timebegin : timebegin if st_timebegin
59
60
 
60
61
  timeend = DateTime.now
61
- puts "#{cron.source_name}'s range #{timebegin..timeend}"
62
+ logger.info "#{multiple_dataset.name}'s range #{timebegin..timeend}"
62
63
  # 把统计表的最后时间点也包含进去重新计算下
63
64
  TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
64
65
  end
65
66
 
66
67
  protected
67
- def is_mysql?; @_is_mysql ||= modules.grep(/ActiveRecord::Store/).any? end
68
- def is_mongodb?; @_is_mongodb ||= modules.grep(/Mongoid::Document/).any? end
69
- def modules; @_modules ||= cron.source.included_modules.map(&:to_s) end
70
68
 
71
69
  # 兼容采用整数类型作时间字段
72
70
  def is_time_column_integer?
73
- if is_mysql?
74
- cron.source.columns_hash[cron.time_column.to_s].type == :integer
71
+ if is_activerecord?
72
+ _source.columns_hash[cron.time_column.to_s].type == :integer
75
73
  else
76
74
  false
77
75
  end
@@ -5,89 +5,47 @@ module Statlysis
5
5
  def initialize source, opts = {}
6
6
  super
7
7
  Statlysis.check_set_database
8
- cron.setup_stat_table
9
- Statlysis.setup_stat_table_and_model cron
8
+ cron.setup_stat_model
10
9
  cron
11
10
  end
12
11
 
13
12
  # 设置数据源,并保存结果入数据库
14
13
  def run
15
- cron.source = cron.source.order("#{cron.time_column} ASC") if is_mysql?
16
- cron.source = cron.source.asc(cron.time_column) if is_mongodb?
17
-
18
- (puts("#{cron.source_name} have no result!"); return false) if cron.output.blank?
14
+ (logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
19
15
  # delete first in range
20
16
  @output = cron.output
21
17
  unless @output.any?
22
- puts "没有数据"; return
18
+ logger.info "没有数据"; return
23
19
  end
24
- @num_i = 0; @num_add = 999
20
+ num_i = 0; num_add = 999
25
21
  Statlysis.sequel.transaction do
26
- cron.stat_table.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
27
- while !(_a = @output[@num_i..(@num_i+@num_add)]).blank? do
22
+ cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
23
+ while !(_a = @output[num_i..(num_i+num_add)]).blank? do
28
24
  # batch insert all
29
- cron.stat_table.multi_insert _a
30
- @num_i += (@num_add + 1)
25
+ cron.stat_model.multi_insert _a
26
+ num_i += (num_add + 1)
31
27
  end
32
28
  end
29
+
30
+ return self
33
31
  end
34
32
 
35
33
 
36
- def reoutput; @output = nil; output end
37
34
  protected
38
35
  def unit_range_query time, time_begin = nil
39
36
  # time begin and end
40
- tb = time # TODO 差八个小时 [.in_time_zone, .localtime, .utc] 对于Rails,计算结果还是一样的。
37
+ tb = time
41
38
  te = (time+1.send(cron.time_unit)-1.second)
42
39
  tb, te = tb.to_i, te.to_i if is_time_column_integer?
43
40
  tb = time_begin || tb
44
- return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_mysql?
45
- return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongodb? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
41
+ return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
42
+ return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
46
43
  end
47
44
 
48
45
  end
49
46
 
50
- class Timely < Count
51
- def setup_stat_table
52
- # TODO migration proc, merge into setup_stat_table_and_model
53
- cron.stat_table_name = [cron.class.name.split("::")[-1], cron.source_name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
54
- raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
55
- unless Statlysis.sequel.table_exists?(cron.stat_table_name)
56
- Statlysis.sequel.transaction do
57
- Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
58
- DateTime :t # alias for :time
59
- end
60
-
61
- # TODO Add cron.source_where_array before count_columns
62
- count_columns = [:timely_c, :totally_c] # alias for :count
63
- count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
64
- index_column_names = [:t] + count_columns
65
- index_column_names_name = index_column_names.join("_")
66
- index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
67
-
68
- Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
69
- end
70
- end
71
- end
72
-
73
- def output
74
- @output ||= (cron.time_range.map do |time|
75
- timely_c = cron.source.where(unit_range_query(time)).count
76
- _t = DateTime.parse("19700101")
77
- _t = is_time_column_integer? ? _t.to_i : _t
78
- totally_c = cron.source.where(unit_range_query(time, _t)).count
79
-
80
- puts "#{time.in_time_zone} #{cron.source_name} timely_c:#{timely_c} totally_c:#{totally_c}"
81
- if timely_c.zero? && totally_c.zero?
82
- nil
83
- else
84
- {:t => time, :timely_c => timely_c, :totally_c => totally_c}
85
- end
86
- end.compact)
87
- end
88
- end
47
+ end
89
48
 
90
- class Dimensions < Count
91
- end
92
49
 
93
- end
50
+ require 'statlysis/cron/count/timely'
51
+ require 'statlysis/cron/count/dimensions'
@@ -0,0 +1,7 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Dimensions < Count
5
+ end
6
+
7
+ end
@@ -0,0 +1,63 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Timely < Count
5
+ def setup_stat_model
6
+ cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
7
+ raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
8
+
9
+ if not Statlysis.sequel.table_exists?(cron.stat_table_name)
10
+ Statlysis.sequel.transaction do
11
+ Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
12
+ DateTime :t # alias for :time
13
+ end
14
+
15
+ # TODO Add cron.source_where_array before count_columns
16
+ count_columns = [:timely_c, :totally_c] # alias for :count
17
+ count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
18
+ index_column_names = [:t] + count_columns
19
+ index_column_names_name = index_column_names.join("_")
20
+ index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
21
+
22
+ # Fix there should be uniq index name between tables
23
+ # `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
24
+ if not Statlysis.config.is_skip_database_index
25
+ Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
26
+ end
27
+ end
28
+ end
29
+
30
+ n = cron.stat_table_name.to_s.singularize.camelize
31
+ cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
32
+ class ::#{n} < Sequel::Model;
33
+ self.set_dataset :#{cron.stat_table_name}
34
+ end
35
+ #{n}
36
+ MODEL
37
+ end
38
+
39
+ def output
40
+ @output ||= (cron.time_range.map do |time|
41
+ timely_c = 0
42
+ totally_c = 0
43
+ # support multiple data sources
44
+ _first_source = nil
45
+ cron.multiple_dataset.sources.each do |s|
46
+ timely_c += s.where(unit_range_query(time)).count
47
+ _t = DateTime1970
48
+ _t = is_time_column_integer? ? _t.to_i : _t
49
+ totally_c += s.where(unit_range_query(time, _t)).count
50
+ _first_source ||= s.where(unit_range_query(time))
51
+ end
52
+ logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
53
+
54
+ if timely_c.zero? && totally_c.zero?
55
+ nil
56
+ else
57
+ {:t => time, :timely_c => timely_c, :totally_c => totally_c}
58
+ end
59
+ end.compact)
60
+ end
61
+ end
62
+
63
+ end
@@ -24,15 +24,7 @@ module Statlysis
24
24
  cron.write
25
25
  end
26
26
 
27
- def write; raise DefaultWrongMessage end
28
-
29
-
30
- def self.ensure_statlysis_table_and_model tn
31
- Top.new("FakeLogSource", :test => true, :stat_table_name => tn).pattern_table_and_model tn
32
- end
33
- def ensure_statlysis_table_and_model tn
34
- Top.ensure_statlysis_table_and_model tn
35
- end
27
+ def write; raise DefaultNotImplementWrongMessage end
36
28
 
37
29
  def default_assign_attr key_symbol, opts
38
30
  if opts[key_symbol]
@@ -43,58 +35,6 @@ module Statlysis
43
35
  end
44
36
  end
45
37
 
46
- # 博客最近用户访问计算实现流程讨论
47
- # 问题分两个,一个是后端,一个是前端。对后端来说,用户每次blog/index|show访问都生成访问记录,后端需要进行排重和去掉未登陆用户。如果在该次访问里进行,特别是某个博客突然火了,必然每次访问都产生IO(磁盘或网络,因为多进程要共享信息),所以必定是异步的。
48
- # 前端展示考虑到缓存,一般是页面片段缓存,或者ajax载入。
49
- # 后端异步如何计算每个blog的最近访客,log.js记录了最近访问,一个后台常驻进程循环对日志表按时间记录来读取blog访问信息,把最近访客信息刷新到blog。相对单次请求全部处理,这里处理次数更少,资源更节约,当然瓶颈也在日志表的索引更新和读取。
50
- class LastestVisits < Top
51
- attr_accessor :clock
52
- attr_accessor :reject_proc
53
-
54
- # *pattern_proc* is a proc to extract user_id or url_prefix to compute the
55
- # top visitors from log
56
- # *user_id_proc* is a proc to extract user_id from log
57
- # *user_info_proc* is a proc to extract visitor informations(like id, name, ...)
58
- # *reject_proc* filter visitors
59
- def initialize source, opts = {}
60
- # set variables
61
- cron.reclock opts[:default_time]
62
- cron.reject_proc = opts[:reject_proc] || proc {|pattern, user_id| pattern.to_i == user_id.to_i }
63
- super
64
- cron.pattern_table_and_model cron.stat_table_name
65
- cron
66
- end
67
-
68
- def output
69
- cron.logs = cron.source.asc(cron.time_column).where(cron.time_column => {"$gte" => cron.clock.current}).limit(1000).to_a
70
- return {} if cron.logs.blank?
71
- cron.logs.inject({}) do |h, log|
72
- pattern = cron.pattern_proc.call(log)
73
- if pattern
74
- h[pattern] ||= []
75
- user_id = cron.user_id_proc.call(log).to_i
76
- h[pattern] << user_id if not user_id.zero?
77
- end
78
- h
79
- end
80
- end
81
-
82
- def write
83
- puts "#{Time.now.strftime('%H:%M:%S')} #{cron.stat_model} #{cron.output.inspect}"
84
- cron.output.each do |pattern, user_ids|
85
- s = cron.stat_model.find_or_create(:pattern => pattern)
86
- old_array = (JSON.parse(s.result) rescue []).map {|i| Array(i)[0] }
87
- new_user_ids = (old_array + user_ids).reverse.uniq.reverse # ensure the right items will overwrite the left [1,4,5,7,4,3,3,2,1,5].uniq => [1, 4, 5, 7, 3, 2]
88
- s.update :result => new_user_ids.reject {|user_id| cron.reject_proc.call(pattern, user_id) rescue false }.map {|user_id| cron.user_info_proc.call(user_id) }.compact[0..cron.result_limit].to_json
89
- end
90
- cron.clock.update cron.logs.last.try(cron.time_column)
91
- end
92
-
93
- def reclock default_time = nil
94
- cron.clock = Clock.new cron.stat_table_name, (default_time || cron.clock.current)
95
- end
96
- end
97
-
98
38
  class SingleKv < Top
99
39
  attr_accessor :time_ago, :stat_column_name
100
40
 
@@ -102,53 +42,13 @@ module Statlysis
102
42
  [:time_ago, :stat_column_name].each {|key_symbol| default_assign_attr key_symbol, opts }
103
43
  raise "#{cron.class} only is kv store" if cron.stat_table_name # TODO
104
44
  super
105
- cron.ensure_statlysis_table_and_model [Statlysis.tablename_default_pre, 'single_kvs'].compact.join("_").freeze
106
45
  cron
107
46
  end
108
47
 
109
48
  end
110
49
 
111
- # 一般最近热门列表通常采用简单对一个字段记录访问数的算法,但是这可能会导致刷量等问题。
112
- #
113
- # 解决方法为从用户行为中去综合分析,具体流程为:
114
- # 从URI中抽取item_id, 从访问日志抽取排重IP和user_id,从like,fav,comment表获取更深的用户行为,把前两者通过一定比例相加得到排行。
115
- # 最后用时间降温来避免马太效应,必可动态提升比例以使最近稍微热门的替换掉之前太热门的。
116
- #
117
- # 线性计算速度很快
118
- #
119
- class HotestItems < SingleKv
120
- attr_accessor :key, :id_to_score_and_time_hash_proc
121
- attr_accessor :limit
122
-
123
- def initialize key, id_to_score_and_time_hash_proc
124
- cron.key = key
125
- cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
126
- cron.limit = 20
127
- super
128
- cron
129
- end
130
-
131
- def output
132
- t = cron.id_to_score_and_time_hash_proc
133
- while t.is_a?(Proc) do
134
- t = t.call
135
- end
136
- @id_to_score_and_time_hash = t
137
- @id_to_day_hash = @id_to_score_and_time_hash.inject({}) {|h, ab| h[ab[0]] = (((Time.now - ab[1][1]) / (3600*24)).round + 1); h }
138
-
139
- @id_to_timecooldown_hash = @id_to_score_and_time_hash.inject({}) {|h, kv| h[kv[0]] = (kv[1][0] / Math.sqrt(@id_to_day_hash[kv[0]])); h }
140
- array = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
141
- {cron.key => array}
142
- end
143
-
144
- def write
145
- cron.output.each do |key, array|
146
- json = array[0..140].to_json
147
- StSingleKv.find_or_create(:pattern => key).update :result => json
148
- StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}").update :result => json
149
- end
150
- end
50
+ end
151
51
 
152
- end
153
52
 
154
- end
53
+ require 'statlysis/cron/top/lastest_visits.rb'
54
+ require 'statlysis/cron/top/hotest_items.rb'
@@ -0,0 +1,47 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ # 一般最近热门列表通常采用简单对一个字段记录访问数的算法,但是这可能会导致刷量等问题。
5
+ #
6
+ # 解决方法为从用户行为中去综合分析,具体流程为:
7
+ # 从URI中抽取item_id, 从访问日志抽取排重IP和user_id,从like,fav,comment表获取更深的用户行为,把前两者通过一定比例相加得到排行。
8
+ # 最后用时间降温来避免马太效应,必可动态提升比例以使最近稍微热门的替换掉之前太热门的。
9
+ #
10
+ # 线性计算速度很快
11
+ #
12
+ class HotestItems < SingleKv
13
+ attr_accessor :key, :id_to_score_and_time_hash_proc
14
+ attr_accessor :limit
15
+
16
+ def initialize key, id_to_score_and_time_hash_proc
17
+ cron.key = key
18
+ cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
19
+ cron.limit = 20
20
+ super
21
+ cron
22
+ end
23
+
24
+ def output
25
+ t = cron.id_to_score_and_time_hash_proc
26
+ while t.is_a?(Proc) do
27
+ t = t.call
28
+ end
29
+ @id_to_score_and_time_hash = t
30
+ @id_to_day_hash = @id_to_score_and_time_hash.inject({}) {|h, ab| h[ab[0]] = (((Time.now - ab[1][1]) / (3600*24)).round + 1); h }
31
+
32
+ @id_to_timecooldown_hash = @id_to_score_and_time_hash.inject({}) {|h, kv| h[kv[0]] = (kv[1][0] / Math.sqrt(@id_to_day_hash[kv[0]])); h }
33
+ array = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
34
+ {cron.key => array}
35
+ end
36
+
37
+ def write
38
+ cron.output.each do |key, array|
39
+ json = array[0..140].to_json
40
+ StSingleKv.find_or_create(:pattern => key).update :result => json
41
+ StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}").update :result => json
42
+ end
43
+ end
44
+
45
+ end
46
+
47
+ end