statlysis 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/.gitignore +3 -0
  2. data/Guardfile +14 -0
  3. data/README.markdown +77 -27
  4. data/Rakefile +1 -1
  5. data/lib/statlysis.rb +59 -101
  6. data/lib/statlysis/clock.rb +3 -3
  7. data/lib/statlysis/common.rb +4 -16
  8. data/lib/statlysis/configuration.rb +97 -2
  9. data/lib/statlysis/constants.rb +10 -0
  10. data/lib/statlysis/cron.rb +40 -42
  11. data/lib/statlysis/cron/count.rb +16 -58
  12. data/lib/statlysis/cron/count/dimensions.rb +7 -0
  13. data/lib/statlysis/cron/count/timely.rb +63 -0
  14. data/lib/statlysis/cron/top.rb +4 -104
  15. data/lib/statlysis/cron/top/hotest_items.rb +47 -0
  16. data/lib/statlysis/cron/top/lastest_visits.rb +53 -0
  17. data/lib/statlysis/cron_set.rb +26 -0
  18. data/lib/statlysis/dataset.rb +6 -0
  19. data/lib/statlysis/javascript/count.rb +3 -3
  20. data/lib/statlysis/multiple_dataset.rb +69 -0
  21. data/lib/statlysis/multiple_dataset/active_record.rb +36 -0
  22. data/lib/statlysis/multiple_dataset/mongoid.rb +54 -0
  23. data/lib/statlysis/rake.rb +6 -5
  24. data/lib/statlysis/similar.rb +11 -11
  25. data/lib/statlysis/timeseries.rb +12 -9
  26. data/lib/statlysis/utils.rb +40 -0
  27. data/statlysis.gemspec +13 -3
  28. data/test/config/database.yml +9 -0
  29. data/test/config/mongoid.yml +36 -0
  30. data/test/data/.gitkeep +0 -0
  31. data/test/data/code_gists_20130724.csv +1459 -0
  32. data/test/helper.rb +41 -3
  33. data/test/migrate/1_active_record.rb +8 -0
  34. data/test/models/.gitkeep +0 -0
  35. data/test/models/code_gist.rb +5 -0
  36. data/test/models/eoe_log.rb +53 -0
  37. data/test/test_daily_count.rb +22 -0
  38. data/test/test_mapreduce.rb +0 -13
  39. data/test/test_single_log_in_multiple_collections.rb +22 -0
  40. data/test/test_statlysis.rb +5 -50
  41. data/test/test_timeseries.rb +46 -0
  42. metadata +133 -12
  43. data/Gemfile.lock +0 -110
  44. data/LICENSE.txt +0 -20
  45. data/test/models/company.rb +0 -12
  46. data/test/models/employee.rb +0 -14
@@ -0,0 +1,10 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ TimeUnits = %w[hour day week month year]
5
+ DateTime1970 = Time.zone.parse("19700101").in_time_zone
6
+
7
+ DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
8
+
9
+ DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
10
+ end
@@ -2,76 +2,74 @@
2
2
 
3
3
  module Statlysis
4
4
  class Cron
5
- attr_accessor :source, :time_column, :time_unit
5
+ attr_reader :multiple_dataset, :source_type, :time_column, :time_unit, :time_zone
6
6
  include Common
7
7
 
8
- DefaultWrongMessage = "not implement yet, please config it by subclass".freeze
9
- def initialize source, opts = {}
10
- cron.stat_table_name = opts[:stat_table_name] if opts[:stat_table_name]
11
- cron.time_column = opts[:time_column]
12
- cron.source = source
13
- cron.time_unit = opts[:time_unit]
8
+ def initialize s, opts = {}
9
+ # setup data type related
10
+ @source_type = ({Utils.is_activerecord?(s) => :activerecord, Utils.is_mongoid?(s) => :mongoid}.detect {|k, v| k } || {})[1] || :unknown
11
+
12
+ @time_column = opts[:time_column]
13
+ @time_unit = opts[:time_unit]
14
+ @time_zone = opts[:time_zone] || Statlysis.default_time_zone || Time.zone || Time.now.utc_offset
15
+
16
+ # insert source as a dataset
17
+ @multiple_dataset = (s.is_a?(ActiveRecordDataset) ? s : ActiveRecordDataset.new(cron).add_source(s)) if is_activerecord?
18
+ @multiple_dataset = (s.is_a?(MongoidDataset) ? s : MongoidDataset.new(cron).add_source(s)) if is_mongoid?
19
+ @multiple_dataset.instance_variable_set("@cron", cron) if is_orm? && @multiple_dataset.cron.nil?
20
+
21
+ @stat_table_name = opts[:stat_table_name] if opts[:stat_table_name]
22
+
14
23
  cron
15
24
  end
16
- def output; raise DefaultWrongMessage end
17
- def setup_stat_table; raise DefaultWrongMessage end
18
- def run; raise DefaultWrongMessage end
19
-
20
- # overwrite to lazy load @source
21
- def inspect
22
- source_inspect = is_mysql? ? cron.source.to_sql : cron.source
23
- str = "#<#{cron.class} @source=#{source_inspect} @stat_table_name=#{cron.stat_table_name} @time_column=#{cron.time_column} @stat_table=#{cron.stat_table}"
24
- str << " @stat_model=#{cron.stat_model}" if cron.methods.index(:stat_model)
25
- str << ">"
26
- str
27
- end
25
+ def output; raise DefaultNotImplementWrongMessage end
26
+ def reoutput; @output = nil; output end
27
+ def setup_stat_model; raise DefaultNotImplementWrongMessage end
28
+ def run; raise DefaultNotImplementWrongMessage end
29
+ def is_activerecord?; @source_type == :activerecord; end
30
+ def is_mongoid?; @source_type == :mongoid; end
31
+ def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
28
32
 
33
+ def _source
34
+ cron.multiple_dataset.sources.first
35
+ end
29
36
  def source_where_array
30
37
  # TODO follow index seq
31
- a = cron.source.where("").where_values.map do |equality|
38
+ a = _source.where("").where_values.map do |equality|
32
39
  # use full keyvalue index name
33
40
  equality.is_a?(String) ? equality.to_sym : "#{equality.operand1.name}#{equality.operand2}"
34
- end if is_mysql?
35
- a = cron.source.all.selector.reject {|k, v| k == 't' } if is_mongodb?
36
- a.map {|s| s.to_s.split(//).select {|s| s.match(/[a-z0-9]/i) }.join }.sort.map(&:to_sym)
37
- end
38
-
39
- def source_name
40
- @source_name ||= begin
41
- m = :table_name if is_mysql?
42
- m = :collection_name if is_mongodb?
43
- cron.source.send(m)
44
- end
41
+ end if is_activerecord?
42
+ a = _source.all.selector.reject {|k, v| k == 't' } if is_mongoid?
43
+ a.map {|s1| s1.to_s.split(//).select {|s2| s2.match(/[a-z0-9]/i) }.join }.sort.map(&:to_sym)
45
44
  end
46
45
 
47
46
  # automode
48
47
  # or
49
48
  # specify TIME_RANGE and TIME_UNIT in shell to run
50
49
  def time_range
51
- return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day')) if ENV['TIME_RANGE']
52
- # 选择开始时间。取出统计表的最后时间,和数据表的最先时间对比,哪个最后就选择
50
+ return TimeSeries.parse(ENV['TIME_RANGE'], :unit => (ENV['TIME_UNIT'] || 'day'), :zone => cron.time_zone) if ENV['TIME_RANGE']
51
+ # 选择开始时间。取出统计表的最后时间,和数据表的最先时间对比,哪个在后就选择哪个
53
52
  begin_day = DateTime.now.beginning_of_day
54
- st_timebegin = (a = cron.stat_table.order(:t).where("t >= ?", begin_day.yesterday).first) ? a[:t] : nil
55
- cron.stat_table.where("t >= ?", begin_day.tomorrow).delete # 明天的数据没出来肯定统计不了
56
- timebegin = (a = cron.source.first) ? a.send(cron.time_column) : (DateTime.now - 1.second)
53
+ st_timebegin = (a = cron.stat_model.order(:t).where("t >= ?", begin_day.yesterday).first) ? a[:t] : nil
54
+
55
+ # TODO support multiple log
56
+ cron.stat_model.where("t >= ?", begin_day.tomorrow).delete # 明天的数据没出来肯定统计不了
57
+ timebegin = (multiple_dataset.first_time != DateTime1970) ? multiple_dataset.first_time : (DateTime.now - 1.second)
57
58
  timebegin = Time.at(timebegin) if is_time_column_integer?
58
59
  timebegin = (st_timebegin > timebegin) ? st_timebegin : timebegin if st_timebegin
59
60
 
60
61
  timeend = DateTime.now
61
- puts "#{cron.source_name}'s range #{timebegin..timeend}"
62
+ logger.info "#{multiple_dataset.name}'s range #{timebegin..timeend}"
62
63
  # 把统计表的最后时间点也包含进去重新计算下
63
64
  TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
64
65
  end
65
66
 
66
67
  protected
67
- def is_mysql?; @_is_mysql ||= modules.grep(/ActiveRecord::Store/).any? end
68
- def is_mongodb?; @_is_mongodb ||= modules.grep(/Mongoid::Document/).any? end
69
- def modules; @_modules ||= cron.source.included_modules.map(&:to_s) end
70
68
 
71
69
  # 兼容采用整数类型作时间字段
72
70
  def is_time_column_integer?
73
- if is_mysql?
74
- cron.source.columns_hash[cron.time_column.to_s].type == :integer
71
+ if is_activerecord?
72
+ _source.columns_hash[cron.time_column.to_s].type == :integer
75
73
  else
76
74
  false
77
75
  end
@@ -5,89 +5,47 @@ module Statlysis
5
5
  def initialize source, opts = {}
6
6
  super
7
7
  Statlysis.check_set_database
8
- cron.setup_stat_table
9
- Statlysis.setup_stat_table_and_model cron
8
+ cron.setup_stat_model
10
9
  cron
11
10
  end
12
11
 
13
12
  # 设置数据源,并保存结果入数据库
14
13
  def run
15
- cron.source = cron.source.order("#{cron.time_column} ASC") if is_mysql?
16
- cron.source = cron.source.asc(cron.time_column) if is_mongodb?
17
-
18
- (puts("#{cron.source_name} have no result!"); return false) if cron.output.blank?
14
+ (logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
19
15
  # delete first in range
20
16
  @output = cron.output
21
17
  unless @output.any?
22
- puts "没有数据"; return
18
+ logger.info "没有数据"; return
23
19
  end
24
- @num_i = 0; @num_add = 999
20
+ num_i = 0; num_add = 999
25
21
  Statlysis.sequel.transaction do
26
- cron.stat_table.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
27
- while !(_a = @output[@num_i..(@num_i+@num_add)]).blank? do
22
+ cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
23
+ while !(_a = @output[num_i..(num_i+num_add)]).blank? do
28
24
  # batch insert all
29
- cron.stat_table.multi_insert _a
30
- @num_i += (@num_add + 1)
25
+ cron.stat_model.multi_insert _a
26
+ num_i += (num_add + 1)
31
27
  end
32
28
  end
29
+
30
+ return self
33
31
  end
34
32
 
35
33
 
36
- def reoutput; @output = nil; output end
37
34
  protected
38
35
  def unit_range_query time, time_begin = nil
39
36
  # time begin and end
40
- tb = time # TODO 差八个小时 [.in_time_zone, .localtime, .utc] 对于Rails,计算结果还是一样的。
37
+ tb = time
41
38
  te = (time+1.send(cron.time_unit)-1.second)
42
39
  tb, te = tb.to_i, te.to_i if is_time_column_integer?
43
40
  tb = time_begin || tb
44
- return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_mysql?
45
- return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongodb? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
41
+ return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
42
+ return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
46
43
  end
47
44
 
48
45
  end
49
46
 
50
- class Timely < Count
51
- def setup_stat_table
52
- # TODO migration proc, merge into setup_stat_table_and_model
53
- cron.stat_table_name = [cron.class.name.split("::")[-1], cron.source_name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
54
- raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
55
- unless Statlysis.sequel.table_exists?(cron.stat_table_name)
56
- Statlysis.sequel.transaction do
57
- Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
58
- DateTime :t # alias for :time
59
- end
60
-
61
- # TODO Add cron.source_where_array before count_columns
62
- count_columns = [:timely_c, :totally_c] # alias for :count
63
- count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
64
- index_column_names = [:t] + count_columns
65
- index_column_names_name = index_column_names.join("_")
66
- index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
67
-
68
- Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
69
- end
70
- end
71
- end
72
-
73
- def output
74
- @output ||= (cron.time_range.map do |time|
75
- timely_c = cron.source.where(unit_range_query(time)).count
76
- _t = DateTime.parse("19700101")
77
- _t = is_time_column_integer? ? _t.to_i : _t
78
- totally_c = cron.source.where(unit_range_query(time, _t)).count
79
-
80
- puts "#{time.in_time_zone} #{cron.source_name} timely_c:#{timely_c} totally_c:#{totally_c}"
81
- if timely_c.zero? && totally_c.zero?
82
- nil
83
- else
84
- {:t => time, :timely_c => timely_c, :totally_c => totally_c}
85
- end
86
- end.compact)
87
- end
88
- end
47
+ end
89
48
 
90
- class Dimensions < Count
91
- end
92
49
 
93
- end
50
+ require 'statlysis/cron/count/timely'
51
+ require 'statlysis/cron/count/dimensions'
@@ -0,0 +1,7 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Dimensions < Count
5
+ end
6
+
7
+ end
@@ -0,0 +1,63 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Timely < Count
5
+ def setup_stat_model
6
+ cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
7
+ raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
8
+
9
+ if not Statlysis.sequel.table_exists?(cron.stat_table_name)
10
+ Statlysis.sequel.transaction do
11
+ Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
12
+ DateTime :t # alias for :time
13
+ end
14
+
15
+ # TODO Add cron.source_where_array before count_columns
16
+ count_columns = [:timely_c, :totally_c] # alias for :count
17
+ count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
18
+ index_column_names = [:t] + count_columns
19
+ index_column_names_name = index_column_names.join("_")
20
+ index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
21
+
22
+ # Fix there should be uniq index name between tables
23
+ # `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
24
+ if not Statlysis.config.is_skip_database_index
25
+ Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
26
+ end
27
+ end
28
+ end
29
+
30
+ n = cron.stat_table_name.to_s.singularize.camelize
31
+ cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
32
+ class ::#{n} < Sequel::Model;
33
+ self.set_dataset :#{cron.stat_table_name}
34
+ end
35
+ #{n}
36
+ MODEL
37
+ end
38
+
39
+ def output
40
+ @output ||= (cron.time_range.map do |time|
41
+ timely_c = 0
42
+ totally_c = 0
43
+ # support multiple data sources
44
+ _first_source = nil
45
+ cron.multiple_dataset.sources.each do |s|
46
+ timely_c += s.where(unit_range_query(time)).count
47
+ _t = DateTime1970
48
+ _t = is_time_column_integer? ? _t.to_i : _t
49
+ totally_c += s.where(unit_range_query(time, _t)).count
50
+ _first_source ||= s.where(unit_range_query(time))
51
+ end
52
+ logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
53
+
54
+ if timely_c.zero? && totally_c.zero?
55
+ nil
56
+ else
57
+ {:t => time, :timely_c => timely_c, :totally_c => totally_c}
58
+ end
59
+ end.compact)
60
+ end
61
+ end
62
+
63
+ end
@@ -24,15 +24,7 @@ module Statlysis
24
24
  cron.write
25
25
  end
26
26
 
27
- def write; raise DefaultWrongMessage end
28
-
29
-
30
- def self.ensure_statlysis_table_and_model tn
31
- Top.new("FakeLogSource", :test => true, :stat_table_name => tn).pattern_table_and_model tn
32
- end
33
- def ensure_statlysis_table_and_model tn
34
- Top.ensure_statlysis_table_and_model tn
35
- end
27
+ def write; raise DefaultNotImplementWrongMessage end
36
28
 
37
29
  def default_assign_attr key_symbol, opts
38
30
  if opts[key_symbol]
@@ -43,58 +35,6 @@ module Statlysis
43
35
  end
44
36
  end
45
37
 
46
- # 博客最近用户访问计算实现流程讨论
47
- # 问题分两个,一个是后端,一个是前端。对后端来说,用户每次blog/index|show访问都生成访问记录,后端需要进行排重和去掉未登陆用户。如果在该次访问里进行,特别是某个博客突然火了,必然每次访问都产生IO(磁盘或网络,因为多进程要共享信息),所以必定是异步的。
48
- # 前端展示考虑到缓存,一般是页面片段缓存,或者ajax载入。
49
- # 后端异步如何计算每个blog的最近访客,log.js记录了最近访问,一个后台常驻进程循环对日志表按时间记录来读取blog访问信息,把最近访客信息刷新到blog。相对单次请求全部处理,这里处理次数更少,资源更节约,当然瓶颈也在日志表的索引更新和读取。
50
- class LastestVisits < Top
51
- attr_accessor :clock
52
- attr_accessor :reject_proc
53
-
54
- # *pattern_proc* is a proc to extract user_id or url_prefix to compute the
55
- # top visitors from log
56
- # *user_id_proc* is a proc to extract user_id from log
57
- # *user_info_proc* is a proc to extract visitor informations(like id, name, ...)
58
- # *reject_proc* filter visitors
59
- def initialize source, opts = {}
60
- # set variables
61
- cron.reclock opts[:default_time]
62
- cron.reject_proc = opts[:reject_proc] || proc {|pattern, user_id| pattern.to_i == user_id.to_i }
63
- super
64
- cron.pattern_table_and_model cron.stat_table_name
65
- cron
66
- end
67
-
68
- def output
69
- cron.logs = cron.source.asc(cron.time_column).where(cron.time_column => {"$gte" => cron.clock.current}).limit(1000).to_a
70
- return {} if cron.logs.blank?
71
- cron.logs.inject({}) do |h, log|
72
- pattern = cron.pattern_proc.call(log)
73
- if pattern
74
- h[pattern] ||= []
75
- user_id = cron.user_id_proc.call(log).to_i
76
- h[pattern] << user_id if not user_id.zero?
77
- end
78
- h
79
- end
80
- end
81
-
82
- def write
83
- puts "#{Time.now.strftime('%H:%M:%S')} #{cron.stat_model} #{cron.output.inspect}"
84
- cron.output.each do |pattern, user_ids|
85
- s = cron.stat_model.find_or_create(:pattern => pattern)
86
- old_array = (JSON.parse(s.result) rescue []).map {|i| Array(i)[0] }
87
- new_user_ids = (old_array + user_ids).reverse.uniq.reverse # ensure the right items will overwrite the left [1,4,5,7,4,3,3,2,1,5].uniq => [1, 4, 5, 7, 3, 2]
88
- s.update :result => new_user_ids.reject {|user_id| cron.reject_proc.call(pattern, user_id) rescue false }.map {|user_id| cron.user_info_proc.call(user_id) }.compact[0..cron.result_limit].to_json
89
- end
90
- cron.clock.update cron.logs.last.try(cron.time_column)
91
- end
92
-
93
- def reclock default_time = nil
94
- cron.clock = Clock.new cron.stat_table_name, (default_time || cron.clock.current)
95
- end
96
- end
97
-
98
38
  class SingleKv < Top
99
39
  attr_accessor :time_ago, :stat_column_name
100
40
 
@@ -102,53 +42,13 @@ module Statlysis
102
42
  [:time_ago, :stat_column_name].each {|key_symbol| default_assign_attr key_symbol, opts }
103
43
  raise "#{cron.class} only is kv store" if cron.stat_table_name # TODO
104
44
  super
105
- cron.ensure_statlysis_table_and_model [Statlysis.tablename_default_pre, 'single_kvs'].compact.join("_").freeze
106
45
  cron
107
46
  end
108
47
 
109
48
  end
110
49
 
111
- # 一般最近热门列表通常采用简单对一个字段记录访问数的算法,但是这可能会导致刷量等问题。
112
- #
113
- # 解决方法为从用户行为中去综合分析,具体流程为:
114
- # 从URI中抽取item_id, 从访问日志抽取排重IP和user_id,从like,fav,comment表获取更深的用户行为,把前两者通过一定比例相加得到排行。
115
- # 最后用时间降温来避免马太效应,必可动态提升比例以使最近稍微热门的替换掉之前太热门的。
116
- #
117
- # 线性计算速度很快
118
- #
119
- class HotestItems < SingleKv
120
- attr_accessor :key, :id_to_score_and_time_hash_proc
121
- attr_accessor :limit
122
-
123
- def initialize key, id_to_score_and_time_hash_proc
124
- cron.key = key
125
- cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
126
- cron.limit = 20
127
- super
128
- cron
129
- end
130
-
131
- def output
132
- t = cron.id_to_score_and_time_hash_proc
133
- while t.is_a?(Proc) do
134
- t = t.call
135
- end
136
- @id_to_score_and_time_hash = t
137
- @id_to_day_hash = @id_to_score_and_time_hash.inject({}) {|h, ab| h[ab[0]] = (((Time.now - ab[1][1]) / (3600*24)).round + 1); h }
138
-
139
- @id_to_timecooldown_hash = @id_to_score_and_time_hash.inject({}) {|h, kv| h[kv[0]] = (kv[1][0] / Math.sqrt(@id_to_day_hash[kv[0]])); h }
140
- array = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
141
- {cron.key => array}
142
- end
143
-
144
- def write
145
- cron.output.each do |key, array|
146
- json = array[0..140].to_json
147
- StSingleKv.find_or_create(:pattern => key).update :result => json
148
- StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}").update :result => json
149
- end
150
- end
50
+ end
151
51
 
152
- end
153
52
 
154
- end
53
+ require 'statlysis/cron/top/lastest_visits.rb'
54
+ require 'statlysis/cron/top/hotest_items.rb'
@@ -0,0 +1,47 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ # 一般最近热门列表通常采用简单对一个字段记录访问数的算法,但是这可能会导致刷量等问题。
5
+ #
6
+ # 解决方法为从用户行为中去综合分析,具体流程为:
7
+ # 从URI中抽取item_id, 从访问日志抽取排重IP和user_id,从like,fav,comment表获取更深的用户行为,把前两者通过一定比例相加得到排行。
8
+ # 最后用时间降温来避免马太效应,必可动态提升比例以使最近稍微热门的替换掉之前太热门的。
9
+ #
10
+ # 线性计算速度很快
11
+ #
12
+ class HotestItems < SingleKv
13
+ attr_accessor :key, :id_to_score_and_time_hash_proc
14
+ attr_accessor :limit
15
+
16
+ def initialize key, id_to_score_and_time_hash_proc
17
+ cron.key = key
18
+ cron.id_to_score_and_time_hash_proc = id_to_score_and_time_hash_proc
19
+ cron.limit = 20
20
+ super
21
+ cron
22
+ end
23
+
24
+ def output
25
+ t = cron.id_to_score_and_time_hash_proc
26
+ while t.is_a?(Proc) do
27
+ t = t.call
28
+ end
29
+ @id_to_score_and_time_hash = t
30
+ @id_to_day_hash = @id_to_score_and_time_hash.inject({}) {|h, ab| h[ab[0]] = (((Time.now - ab[1][1]) / (3600*24)).round + 1); h }
31
+
32
+ @id_to_timecooldown_hash = @id_to_score_and_time_hash.inject({}) {|h, kv| h[kv[0]] = (kv[1][0] / Math.sqrt(@id_to_day_hash[kv[0]])); h }
33
+ array = @id_to_timecooldown_hash.sort {|a, b| b[1] <=> a[1] }.map(&:first)
34
+ {cron.key => array}
35
+ end
36
+
37
+ def write
38
+ cron.output.each do |key, array|
39
+ json = array[0..140].to_json
40
+ StSingleKv.find_or_create(:pattern => key).update :result => json
41
+ StSingleKvHistory.find_or_create(:pattern => "#{key}_#{Time.now.strftime('%Y%m%d')}").update :result => json
42
+ end
43
+ end
44
+
45
+ end
46
+
47
+ end