statlysis 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/.gitignore +3 -0
  2. data/Guardfile +14 -0
  3. data/README.markdown +77 -27
  4. data/Rakefile +1 -1
  5. data/lib/statlysis.rb +59 -101
  6. data/lib/statlysis/clock.rb +3 -3
  7. data/lib/statlysis/common.rb +4 -16
  8. data/lib/statlysis/configuration.rb +97 -2
  9. data/lib/statlysis/constants.rb +10 -0
  10. data/lib/statlysis/cron.rb +40 -42
  11. data/lib/statlysis/cron/count.rb +16 -58
  12. data/lib/statlysis/cron/count/dimensions.rb +7 -0
  13. data/lib/statlysis/cron/count/timely.rb +63 -0
  14. data/lib/statlysis/cron/top.rb +4 -104
  15. data/lib/statlysis/cron/top/hotest_items.rb +47 -0
  16. data/lib/statlysis/cron/top/lastest_visits.rb +53 -0
  17. data/lib/statlysis/cron_set.rb +26 -0
  18. data/lib/statlysis/dataset.rb +6 -0
  19. data/lib/statlysis/javascript/count.rb +3 -3
  20. data/lib/statlysis/multiple_dataset.rb +69 -0
  21. data/lib/statlysis/multiple_dataset/active_record.rb +36 -0
  22. data/lib/statlysis/multiple_dataset/mongoid.rb +54 -0
  23. data/lib/statlysis/rake.rb +6 -5
  24. data/lib/statlysis/similar.rb +11 -11
  25. data/lib/statlysis/timeseries.rb +12 -9
  26. data/lib/statlysis/utils.rb +40 -0
  27. data/statlysis.gemspec +13 -3
  28. data/test/config/database.yml +9 -0
  29. data/test/config/mongoid.yml +36 -0
  30. data/test/data/.gitkeep +0 -0
  31. data/test/data/code_gists_20130724.csv +1459 -0
  32. data/test/helper.rb +41 -3
  33. data/test/migrate/1_active_record.rb +8 -0
  34. data/test/models/.gitkeep +0 -0
  35. data/test/models/code_gist.rb +5 -0
  36. data/test/models/eoe_log.rb +53 -0
  37. data/test/test_daily_count.rb +22 -0
  38. data/test/test_mapreduce.rb +0 -13
  39. data/test/test_single_log_in_multiple_collections.rb +22 -0
  40. data/test/test_statlysis.rb +5 -50
  41. data/test/test_timeseries.rb +46 -0
  42. metadata +133 -12
  43. data/Gemfile.lock +0 -110
  44. data/LICENSE.txt +0 -20
  45. data/test/models/company.rb +0 -12
  46. data/test/models/employee.rb +0 -14
@@ -0,0 +1,53 @@
1
+ # encoding: UTF-8
2
+
3
module Statlysis
  # Maintains, per pattern, a JSON list of visitor infos built incrementally
  # from a time-ordered log source.
  #
  # See tech details at http://mvj3.github.io/2013/01/30/recent-visitors-implement/
  class LastestVisits < Top
    attr_accessor :clock
    attr_accessor :reject_proc

    # *pattern_proc* is a proc to extract user_id or url_prefix to compute the
    #   top visitors from log
    # *user_id_proc* is a proc to extract user_id from log
    # *user_info_proc* is a proc to extract visitor information (like id, name, ...)
    # *reject_proc* filters visitors; it receives (pattern, user_id) and a
    #   truthy result drops that visitor. By default a visitor whose id equals
    #   the pattern (both compared via to_i) is dropped.
    #
    # source - the log source; handed on to the superclass via `super`.
    # opts   - options hash; :default_time and :reject_proc are read here, the
    #          whole hash is also handed on to the superclass.
    def initialize source, opts = {}
      # set variables
      # NOTE(review): reclock runs before `super` and reads cron.stat_table_name
      # and, when opts[:default_time] is nil, cron.clock.current -- confirm the
      # superclass state these rely on is available at this point.
      cron.reclock opts[:default_time]
      cron.reject_proc = opts[:reject_proc] || proc {|pattern, user_id| pattern.to_i == user_id.to_i }
      super
      Utils.setup_pattern_table_and_model cron.stat_table_name
      cron
    end

    # Reads up to 1000 logs at or after the clock's current time (ascending by
    # the time column), stores them in cron.logs, and groups the visitor ids by
    # pattern.
    #
    # Returns a Hash of pattern => Array of non-zero Integer user ids, or {}
    # when no logs were found. Logs whose pattern is nil/false are skipped.
    def output
      cron.logs = cron.source.asc(cron.time_column).where(cron.time_column => {"$gte" => cron.clock.current}).limit(1000).to_a
      return {} if cron.logs.blank?

      cron.logs.each_with_object({}) do |log, visits|
        pattern = cron.pattern_proc.call(log)
        next unless pattern

        visits[pattern] ||= []
        user_id = cron.user_id_proc.call(log).to_i
        visits[pattern] << user_id unless user_id.zero?
      end
    end

    # Merges the freshly read visitors into the stored result of each pattern
    # and then moves the clock to the time of the last log read.
    def write
      # Evaluate #output exactly once: every call re-runs the query and
      # replaces cron.logs, so logging one call and storing another could
      # disagree (and doubled the database work).
      visits = cron.output
      logger.info "#{Time.now.strftime('%H:%M:%S')} #{cron.stat_model} #{visits.inspect}"

      visits.each do |pattern, user_ids|
        s = cron.stat_model.find_or_create(:pattern => pattern)
        # Stored entries may be scalars or arrays; the first element is taken as the id.
        old_array = (JSON.parse(s.result) rescue []).map {|i| Array(i)[0] }
        # Keep only the LAST occurrence of every id, so newer visits override
        # older ones: [1,4,5,7,4,3,3,2,1,5] => [7, 4, 3, 2, 1, 5]
        new_user_ids = (old_array + user_ids).reverse.uniq.reverse
        # A reject_proc that raises is treated as "do not reject".
        kept_user_ids = new_user_ids.reject {|user_id| cron.reject_proc.call(pattern, user_id) rescue false }
        user_infos = kept_user_ids.map {|user_id| cron.user_info_proc.call(user_id) }.compact
        # NOTE(review): the inclusive range keeps result_limit + 1 entries, and
        # it keeps the head (oldest) of the list -- confirm both are intended.
        s.update :result => user_infos[0..cron.result_limit].to_json
      end

      # NOTE(review): when no logs were read this passes nil to Clock#update.
      cron.clock.update cron.logs.last.try(cron.time_column)
    end

    # Replaces the clock with a new one for this cron's stat table.
    #
    # default_time - start time for the new clock; falls back to the current
    #                clock's time when nil.
    def reclock default_time = nil
      cron.clock = Clock.new cron.stat_table_name, (default_time || cron.clock.current)
    end
  end

end
@@ -0,0 +1,26 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'statlysis/cron'
4
+
5
module Statlysis
  # A Set of crons that can be looked up by position or by dataset name and
  # run as a group.
  class CronSet < Set
    # filter cron_sets by pattern
    #
    # pattern - an Integer index into the set's elements (negative values count
    #           from the end), or any object whose +to_s+ is used as the source
    #           of a Regexp matched against each element's
    #           +multiple_dataset.name+ (nil therefore matches everything).
    #
    # Returns the element at that index (nil when out of range) for an Integer,
    # otherwise a new CronSet holding the matching elements.
    def [] pattern = nil
      case pattern
      when Integer # support array idx access (Fixnum no longer exists on Ruby >= 3.2)
        to_a[pattern]
      else
        matcher = Regexp.new(pattern.to_s) # built once, not once per element
        CronSet.new(select {|cron_set| cron_set.multiple_dataset.name.to_s =~ matcher })
      end
    end

    # Returns the last element of the set, or nil when the set is empty.
    def last; self[-1]; end

    # Runs every cron and returns the Array of their results.
    def run
      map(&:run)
    end
  end

end
@@ -0,0 +1,6 @@
1
+ # encoding: UTF-8
2
+
3
module Statlysis
  # Placeholder class; it declares no methods or state of its own.
  class Dataset; end
end
@@ -3,7 +3,7 @@
3
3
  module Statlysis
4
4
  module Javascript
5
5
  class MultiDimensionalCount
6
- attr_accessor :map_func, :reduce_func
6
+ attr_reader :map_func, :reduce_func
7
7
 
8
8
  def initialize *fields
9
9
  fields = :_id if fields.blank?
@@ -17,11 +17,11 @@ module Statlysis
17
17
  raise "Please assign symbol, string, or array of them"
18
18
  end
19
19
 
20
- self.map_func = "function() {
20
+ @map_func = "function() {
21
21
  emit (#{emit_key}, {count: 1});
22
22
  }"
23
23
 
24
- self.reduce_func = "function(key, values) {
24
+ @reduce_func = "function(key, values) {
25
25
  var count = 0;
26
26
 
27
27
  values.forEach(function(v) {
@@ -0,0 +1,69 @@
1
+ # encoding: UTF-8
2
+
3
module Statlysis
  # A group of data sources handled as one dataset, e.g. one log that is split
  # over several tables/collections whose names match a regexp. ORM specific
  # subclasses implement #resort_source_order.
  class MultipleDataset
    attr_reader :cron, :regexp, :sources

    # cron - the cron this dataset belongs to; may be nil and assigned later,
    #        in which case #_resort_source_order is a no-op.
    def initialize cron = nil
      @cron = cron
      @sources ||= Set.new
    end

    # Sets the pattern that source names have to match.
    #
    # regexp - a Regexp, or a String that is compiled into one.
    #
    # Returns self (chainable). Raises RuntimeError for any other argument.
    def set_regexp regexp
      @regexp =
        case regexp
        when Regexp then regexp
        when String then Regexp.new(regexp) # was Regexp.new(string): undefined local
        else raise "regexp #{regexp} should be a Regexp!"
        end

      self
    end

    # Adds one source to the dataset. Returns self (chainable).
    def add_source s
      @sources.add s

      self
    end

    # The dataset name:
    # * no source      - nil (a warning is logged)
    # * one source     - that source's own collection/table name
    # * several        - the regexp source with digits, '-', brackets and
    #                    braces removed and trailing underscores stripped
    def name
      if @sources.size.zero?
        Statlysis.logger.warn "Add source to #{self} first!"
        return nil
      elsif @sources.size == 1
        @sources.first.send(Utils.name(@sources.first))
      elsif regexp.nil?
        # Several sources were added by hand and there is no regexp to derive a name from.
        Statlysis.logger.warn "Set regexp of #{self} first!"
        return nil
      else
        # /multiple_log_2013[0-9]{4}/ => 'multiple_log'
        # Regexp#source (unlike inspect[1..-2]) never contains the closing
        # slash or option letters, so /.../i yields the same name.
        regexp.source.gsub(/[\-\[\]{}0-9]/, '').sub(/_+$/, '')
      end
    end
    # Access dataset name, compatible with several ORMs
    alias collection_name name # mongoid
    alias table_name name # activerecord


    # Earliest value of cron.time_column among the first record of every
    # source, converted to cron.time_zone; DateTime1970 when there is none.
    # Needs an assigned cron.
    def first_time
      t = _resort_source_order.map(&:first).compact.map {|i| i.send(cron.time_column) }.compact.min || DateTime1970
      t.in_time_zone(cron.time_zone)
    end
    def _resort_source_order; resort_source_order if cron; end # lazy load if cron is unassigned
    def resort_source_order; raise DefaultNotImplementWrongMessage; end

    # select ORM models from ::Object namespace
    #
    # _module - the module an ORM model includes (e.g. Mongoid::Document).
    #
    # Returns the Array of top-level classes that include _module.
    def _select_orm _module
      ::Object.constants.map do |c|
        c.to_s.constantize rescue nil # NameError: uninitialized constant ClassMethods
      end.compact.select do |c|
        # `c.class === Class` was also true for plain modules; only real
        # classes can be ORM models.
        c.is_a?(Class) && c.included_modules.include?(_module)
      end
    end

  end
end
67
+
68
+ require 'statlysis/multiple_dataset/mongoid'
69
+ require 'statlysis/multiple_dataset/active_record'
@@ -0,0 +1,36 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'active_record'
4
+
5
module Statlysis
  # MultipleDataset whose sources are ActiveRecord models / relations.
  class ActiveRecordDataset < MultipleDataset
    # Returns the Arel object of an unrestricted query on the first source
    # (also kept in @arel).
    def arel
      @arel = @sources.first.where("").arel
    end

    # TODO delegate query methods to @sources.
    # Until then unknown calls take the default path. The former zero-argument
    # definition made every such call fail with ArgumentError instead of
    # NoMethodError.
    def method_missing m, *args, &blk
      super
    end

    # Sets the regexp and adds every top-level ActiveRecord model whose table
    # name matches it as a source.
    #
    # Returns self (chainable).
    def set_regexp regexp
      super

      # TODO test it
      _select_orm(ActiveRecord::Store).each do |_model|
        @sources.add _model if _model.table_name.to_s.match(@regexp)
      end

      _resort_source_order

      self
    end

    # Orders every source ascending by cron.time_column.
    # NOTE(review): this turns @sources from a Set into an Array, so a later
    # @sources.add (add_source / set_regexp) would fail -- confirm call order.
    def resort_source_order; @sources = @sources.map {|s| s.order("#{cron.time_column} ASC") } end

  end

  # ActiveRecord[/regexp/] builds a dataset of all ActiveRecord models whose
  # table name matches; mirrors Mongoid.[] (the stub used to return nil).
  def ActiveRecord.[] regexp
    ActiveRecordDataset.new.set_regexp(regexp)
  end

end
@@ -0,0 +1,54 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'mongoid'
4
+
5
+ # http://mongoid.org/en/origin/index.html
6
+ # Origin provides a DSL to mix in to any object to give it the ability to build MongoDB queries easily. It was extracted from Mongoid in an attempt to allow others to leverage the DSL in their own applications without needing a mapper.
7
+ require 'origin'
8
+
9
module Statlysis
  # MultipleDataset whose sources are Mongoid models / criteria.
  class MongoidDataset < MultipleDataset
    # Notice: Origin::Queryable overwrites MongoidDataset#initialize, so it is
    # mixed into this nested helper class instead.
    class Query; include Origin::Queryable end

    # delegate mongoid query to @sources
    # see document at http://rubydoc.info/github/mongoid/origin/Origin/Queryable & http://rubydoc.info/github/mongoid/origin/Origin/Forwardable
    attr_reader :criteria

    # Applies an Origin query method (where, asc, limit, ...) to the recorded
    # criteria and to every source. Returns self so calls can be chained; any
    # other method goes to the default handling.
    def method_missing m, *args, &blk
      @criteria ||= Query.new
      if _origin_query_method?(m)
        @criteria = @criteria.__send__(m, *args, &blk)
        # NOTE(review): @sources becomes an Array here (it starts as a Set), so
        # a later @sources.add would fail -- confirm call order.
        @sources = @sources.map {|s| s.__send__(m, *args, &blk) }
        return self # support method chain
      else
        super
      end
    end

    # Keeps respond_to? / method(...) in sync with #method_missing.
    def respond_to_missing? m, include_private = false
      _origin_query_method?(m) || super
    end

    # Sets the regexp and adds the Mongoid model of every collection in the
    # default session whose name matches it.
    #
    # Returns self (chainable). Raises RuntimeError when a matching collection
    # has no top-level Mongoid model.
    def set_regexp regexp
      super

      _collections = Mongoid.default_session.collections.select {|_collection| _collection.name.match(@regexp) }
      mongoid_models = _select_orm(Mongoid::Document)

      _collections.each do |_collection|
        # Compare as strings so a Symbol collection_name still matches a String name.
        _mongoid_model = mongoid_models.detect {|m| m.collection_name.to_s == _collection.name.to_s }
        raise "Please define Mongoid model for #{_collection}.collection under ::Object namespace!" if _mongoid_model.nil?
        mongoid_models.delete _mongoid_model
        @sources.add _mongoid_model
      end

      _resort_source_order

      self
    end

    # Orders every source ascending by cron.time_column.
    def resort_source_order; @sources = @sources.map {|s| s.asc(cron.time_column) } end

    # True when m is one of the query methods Origin declares as forwardable.
    def _origin_query_method? m
      (Origin::Selectable.forwardables + Origin::Optional.forwardables).include?(m)
    end

  end

  # Mongoid[/regexp/] builds a dataset of all collections whose name matches.
  def Mongoid.[] regexp
    MongoidDataset.new.set_regexp(regexp)
  end

end
@@ -1,28 +1,29 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  require 'rake'
4
+ require 'only_one_rake'
4
5
 
5
6
  namespace :statlysis do
6
- Statlysis::Units.each do |unit|
7
+ Statlysis::TimeUnits.each do |unit|
7
8
  desc "statistical in #{unit}"
8
9
  only_one_task "#{unit}_count" => :environment do
9
- Statlysis.send("#{unit}_crons").map(&:run)
10
+ Statlysis.config.send("#{unit}_crons").map(&:run)
10
11
  end
11
12
  end
12
13
 
13
14
  desc "realtime process"
14
15
  only_one_task :realtime_process => :environment do
15
- loop { Statlysis.realtime_crons.map(&:run); sleep 1 }
16
+ loop { Statlysis.config.realtime_crons.map(&:run); sleep 1 }
16
17
  end
17
18
 
18
19
  desc "similar process"
19
20
  only_one_task :similar_process => :environment do
20
- Statlysis.similar_crons.map(&:run)
21
+ Statlysis.config.similar_crons.map(&:run)
21
22
  end
22
23
 
23
24
  desc "hotest process"
24
25
  only_one_task :hotest_process => :environment do
25
- Statlysis.hotest_crons.map(&:run)
26
+ Statlysis.config.hotest_crons.map(&:run)
26
27
  end
27
28
 
28
29
  end
@@ -12,27 +12,27 @@ module Statlysis
12
12
 
13
13
  # 初始化表和模型
14
14
  cron.stat_table_name = [Statlysis.tablename_default_pre, "similar", model_name].compact.join("_")
15
- cron.pattern_table_and_model cron.stat_table_name
15
+ Utils.setup_pattern_table_and_model cron.stat_table_name
16
16
 
17
17
  cron.id_to_similar_ids = {}
18
18
  cron
19
19
  end
20
20
 
21
21
  def process
22
- puts "SimilarProcess #{cron.stat_model} at #{DateTime.now}"
22
+ logger.info "SimilarProcess #{cron.stat_model} at #{DateTime.now}"
23
23
  require 'gsl'
24
24
  require 'tf-idf-similarity'
25
25
 
26
26
  # 初始化文档
27
- puts "开始取出 cron.id_to_text_hash_proc"
27
+ logger.info "开始取出 cron.id_to_text_hash_proc"
28
28
  @id_to_text_hash = cron.id_to_text_hash_proc.call
29
29
 
30
- puts "开始把@id_to_text_hash转化为数组"
30
+ logger.info "开始把@id_to_text_hash转化为数组"
31
31
  as = @id_to_text_hash.to_a
32
32
 
33
- puts "开始把as slice为1200每次"
33
+ logger.info "开始把as slice为1200每次"
34
34
  as.each_slice(1200) do |a|
35
- puts "开始跑 #{a.size} 个条目的相似性"
35
+ logger.info "开始跑 #{a.size} 个条目的相似性"
36
36
  cron.corpus = TfIdfSimilarity::Collection.new
37
37
  a.each do |id, text|
38
38
  cron.corpus << TfIdfSimilarity::Document.new(text, :id => id)
@@ -59,15 +59,15 @@ module Statlysis
59
59
  _item_id_to_score[matrix_idx_to_item_id_hash[idx2]] = (num.nan? ? 0.0 : num)
60
60
  end
61
61
  _item_id_to_score.delete document.id
62
- puts "对比文档:"
63
- puts "#{document.id} # #{summary(document.id)}"
64
- puts "相关文档:"
62
+ logger.info "对比文档:"
63
+ logger.info "#{document.id} # #{summary(document.id)}"
64
+ logger.info "相关文档:"
65
65
  _item_ids = _item_id_to_score.sort {|a1, b1| b1[1] <=> a1[1] }
66
66
  _item_ids[0..9].each do |item_id, score|
67
- puts "#{score} # #{summary(item_id)}"
67
+ logger.info "#{score} # #{summary(item_id)}"
68
68
  end
69
69
  cron.id_to_similar_ids[document.id] = _item_ids[0..99].map(&:first)
70
- puts
70
+ logger.info
71
71
  end
72
72
 
73
73
  # save results to database
@@ -4,21 +4,24 @@ module Statlysis
4
4
  module TimeSeries
5
5
  # range支持如下三种时间范围格式
6
6
  # 20121201 20121221
7
- # DateTime.parse('20121221')
8
- # DateTime.parse('20121201')..DateTime.parse('20121221')
7
+ # Time.zone.parse('20121221')
8
+ # Time.zone.parse('20121201')..Time.zone.parse('20121221')
9
9
  # opts[:unit]支持:hour, :day, :week, :month等时间单位
10
10
  # 返回的结果为时间范围内的序列数组
11
11
  def self.parse range, opts = {}
12
- opts = opts.reverse_merge :unit => :day, :utc => true, :offset => nil
12
+ # Removed the :utc => true default because it had no effect,
13
+ # and likewise the :offset => nil default.
14
+ opts = opts.reverse_merge :unit => :day
13
15
  unit = opts[:unit]
16
+ zone = opts[:zone] || Statlysis.default_time_zone || Time.zone
14
17
 
15
- range = Range.new(*range.split.map {|i| DateTime.parse(i).to_time_in_current_zone }) if range.is_a?(String)
18
+ range = Range.new(*range.split.map {|i| Time.zone.parse(i).in_time_zone(zone) }) if range.is_a?(String)
16
19
 
17
20
  begin_unit = "beginning_of_#{unit}".to_sym
18
21
  array = if range.respond_to?(:to_datetime)
19
- [range.in_time_zone.send(begin_unit)]
22
+ [range.in_time_zone(zone).send(begin_unit)]
20
23
  elsif range.is_a?(Range)
21
- ary = [range.first.in_time_zone, range.last.in_time_zone].map(&begin_unit).uniq
24
+ ary = [range.first.in_time_zone(zone), range.last.in_time_zone(zone)].map(&begin_unit).uniq
22
25
 
23
26
  _ary = []
24
27
  _ary.push ary[0]
@@ -32,9 +35,9 @@ module Statlysis
32
35
  _ary.compact.reject {|i| (i < range.first) && (i >= range.last) }
33
36
  end
34
37
 
35
- array = array.map {|s| s.to_time } if opts[:utc]
36
- array = array.map {|i| i + opts[:offset] } if opts[:offset]
37
- array.map(&:in_time_zone)
38
+ # array = array.map {|s| s.to_time } if opts[:utc]
39
+ # array = array.map {|i| i + opts[:offset] } if opts[:offset]
40
+ array.map(&:to_datetime)
38
41
  end
39
42
 
40
43
  end
@@ -0,0 +1,40 @@
1
+ # encoding: UTF-8
2
+
3
module Statlysis
  # Helpers for telling the supported ORMs apart and for creating the
  # pattern => result tables together with their Sequel models.
  module Utils
    class << self
      # True when data is an ActiveRecordDataset or includes ActiveRecord::Store.
      def is_activerecord?(data)
        return true if data.is_a?(ActiveRecordDataset)

        mixins = data.respond_to?(:included_modules) ? data.included_modules : []
        mixins.include?(ActiveRecord::Store)
      end

      # True when data is a MongoidDataset or includes Mongoid::Document.
      def is_mongoid?(data)
        return true if data.is_a?(MongoidDataset)

        mixins = data.respond_to?(:included_modules) ? data.included_modules : []
        mixins.include?(Mongoid::Document)
      end

      # Name of the method that returns the dataset name for data's ORM:
      # :collection_name (Mongoid), :table_name (ActiveRecord), otherwise nil.
      def name(data)
        if Utils.is_mongoid?(data)
          :collection_name
        elsif Utils.is_activerecord?(data)
          :table_name
        end
      end

      # Makes sure the (pluralized) table tn exists, with id, indexed pattern
      # and text result columns, and defines a top-level Sequel::Model for it
      # whose .[] returns the parsed JSON result of a pattern ([] on failure).
      #
      # Returns a Hash with the :table name and the :model class.
      def setup_pattern_table_and_model tn
        # ensure statlysis table
        tn = tn.pluralize
        unless Statlysis.sequel.table_exists?(tn)
          Statlysis.sequel.create_table tn, DefaultTableOpts.merge(:engine => "InnoDB") do
            primary_key :id
            String :pattern
            index :pattern
          end
          Statlysis.sequel.add_column tn, :result, String, :text => true
        end

        # generate a statlysis kv model
        model_class_name = tn.to_s.singularize.camelize
        class_eval <<-MODEL, __FILE__, __LINE__ + 1
          class ::#{model_class_name} < Sequel::Model;
            self.set_dataset :#{tn}
            def self.[] item_id
              JSON.parse(find_or_create(:pattern => item_id).result) rescue []
            end
          end;
        MODEL
        {:table => tn, :model => model_class_name.constantize}
      end

    end
  end
end