statlysis 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/.gitignore +3 -0
  2. data/Guardfile +14 -0
  3. data/README.markdown +77 -27
  4. data/Rakefile +1 -1
  5. data/lib/statlysis.rb +59 -101
  6. data/lib/statlysis/clock.rb +3 -3
  7. data/lib/statlysis/common.rb +4 -16
  8. data/lib/statlysis/configuration.rb +97 -2
  9. data/lib/statlysis/constants.rb +10 -0
  10. data/lib/statlysis/cron.rb +40 -42
  11. data/lib/statlysis/cron/count.rb +16 -58
  12. data/lib/statlysis/cron/count/dimensions.rb +7 -0
  13. data/lib/statlysis/cron/count/timely.rb +63 -0
  14. data/lib/statlysis/cron/top.rb +4 -104
  15. data/lib/statlysis/cron/top/hotest_items.rb +47 -0
  16. data/lib/statlysis/cron/top/lastest_visits.rb +53 -0
  17. data/lib/statlysis/cron_set.rb +26 -0
  18. data/lib/statlysis/dataset.rb +6 -0
  19. data/lib/statlysis/javascript/count.rb +3 -3
  20. data/lib/statlysis/multiple_dataset.rb +69 -0
  21. data/lib/statlysis/multiple_dataset/active_record.rb +36 -0
  22. data/lib/statlysis/multiple_dataset/mongoid.rb +54 -0
  23. data/lib/statlysis/rake.rb +6 -5
  24. data/lib/statlysis/similar.rb +11 -11
  25. data/lib/statlysis/timeseries.rb +12 -9
  26. data/lib/statlysis/utils.rb +40 -0
  27. data/statlysis.gemspec +13 -3
  28. data/test/config/database.yml +9 -0
  29. data/test/config/mongoid.yml +36 -0
  30. data/test/data/.gitkeep +0 -0
  31. data/test/data/code_gists_20130724.csv +1459 -0
  32. data/test/helper.rb +41 -3
  33. data/test/migrate/1_active_record.rb +8 -0
  34. data/test/models/.gitkeep +0 -0
  35. data/test/models/code_gist.rb +5 -0
  36. data/test/models/eoe_log.rb +53 -0
  37. data/test/test_daily_count.rb +22 -0
  38. data/test/test_mapreduce.rb +0 -13
  39. data/test/test_single_log_in_multiple_collections.rb +22 -0
  40. data/test/test_statlysis.rb +5 -50
  41. data/test/test_timeseries.rb +46 -0
  42. metadata +133 -12
  43. data/Gemfile.lock +0 -110
  44. data/LICENSE.txt +0 -20
  45. data/test/models/company.rb +0 -12
  46. data/test/models/employee.rb +0 -14
data/.gitignore CHANGED
@@ -49,3 +49,6 @@ tmtags
49
49
  *.rbc
50
50
 
51
51
  coverage
52
+ *.gem
53
+ Gemfile.lock
54
+ tmp
data/Guardfile ADDED
@@ -0,0 +1,14 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard :test do
5
+ watch(%r{^lib/(.+)\.rb$}) { |m| "test/#{m[1]}_test.rb" }
6
+ watch(%r{^test/.+_test\.rb$})
7
+ watch('test/test_helper.rb') { "test" }
8
+
9
+ # Rails example
10
+ watch(%r{^app/models/(.+)\.rb$}) { |m| "test/unit/#{m[1]}_test.rb" }
11
+ watch(%r{^app/controllers/(.+)\.rb$}) { |m| "test/functional/#{m[1]}_test.rb" }
12
+ watch(%r{^app/views/.+\.rb$}) { "test/integration" }
13
+ watch('app/controllers/application_controller.rb') { ["test/functional", "test/integration"] }
14
+ end
data/README.markdown CHANGED
@@ -1,43 +1,93 @@
1
- statlysis
1
+ Statlysis
2
2
  ===============================================
3
- statistical analysis in ruby dsl
3
+ Statistical & Analysis in Ruby DSL
4
4
 
5
5
  Usage
6
6
  -----------------------------------------------
7
+ ### setup
8
+
7
9
  ```ruby
8
- module Statlysis
10
+ Statlysis.setup do
9
11
  set_database :statlysis
10
- update_time_columns :t
11
- set_tablename_default_pre :st
12
-
13
- # 初始化键值model
14
- Statlysis::Top.new('', :test => true).pattern_table_and_model 'st_single_kvs'
15
- Statlysis::Top.new('', :test => true).pattern_table_and_model 'st_single_kv_histories'
16
-
17
- # 日常count
18
- EoeLog.class # preload EoeLogTest
19
- @log_model = IS_DEVELOP ? EoeLogTest : EoeLog
20
- hourly @log_model, :t
21
- daily @log_model, :t
22
- daily @log_model.where(:ui => 0), :t
23
- daily @log_model.where(:ui => {"$ne" => 0}), :t
24
-
25
- # 统计各个模块
26
- daily @log_model.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}), :t
27
- [:www, :code, :skill, :book, :edu, :news, :wiki, :salon, :android].each do |site|
28
- daily @log_model.where(:do => DOMAINS_HASH[site]), :t
12
+
13
+ hourly :time_column => :t
14
+ [EoeLog,
15
+ EoeLog.where(:ui => 0),
16
+ EoeLog.where(:ui => {"$ne" => 0}),
17
+ Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}),
18
+ EoeLog.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}),
19
+ ].each do |s|
20
+ daily s, :time_column => :t
29
21
  end
22
+ end
23
+ ```
24
+
25
+ ### access
26
+
27
+ ```ruby
28
+ Statlysis.daily # => return daily crons
29
+ Statlysis.daily.run # => run daily crons
30
+ Statlysis.daily[/name_regexp/] # => return matched daily crons
31
+ ```
32
+
33
+ ### process
34
+
35
+ ```irb
36
+ [23] pry(#<Statlysis::Configuration>)> Statlysis.daily['multi'].first
30
37
  ```
31
38
 
39
+ Features
40
+ -----------------------------------------------
41
+ * Support time columns that are stored as integers.
42
+
32
43
  TODO
33
44
  -----------------------------------------------
34
- 1. Admin interface
35
- 2. statistical query api in Ruby and HTTP
36
- 3. Interacting with Javascript charting library, e.g. Highcharts, D3.
37
- 4. Add namespace to DSL, like rake
38
- 5. More tests
45
+ * Admin interface
46
+ * statistical query api in Ruby and HTTP
47
+ * Interacting with Javascript charting library, e.g. Highcharts, D3.
48
+ * More tests
49
+ * Add @criteria to MultipleDataset
50
+
51
+
52
+ Statistical Process
53
+ -----------------------------------------------
54
+ 1. Delete invalid statistical data, e.g. data dated tomorrow
55
+ 2. Count data within the specified time by the dimensions
56
+ 3. Delete overlapping data, and insert new data
57
+
58
+
59
+ FAQ
60
+ -----------------------------------------------
61
+ Q: Why use Sequel instead of ActiveRecord?
62
+
63
+ A: When initializing an ORM object, ActiveRecord is 3 times slower than Sequel, and we just need the basic operations, including read, write, enumerate, etc. See more details in [Quick dive into Ruby ORM object initialization](http://merbist.com/2012/02/23/quick-dive-into-ruby-orm-object-initialization/).
64
+
65
+
66
+ Q: Why do you recommend using multiple collections to store logs rather than a single collection, or a capped collection?
67
+
68
+ A: MongoDB can effectively reuse space freed by removing entire collections without leading to data fragmentation, see details at http://docs.mongodb.org/manual/use-cases/storing-log-data/#multiple-collections-single-database
39
69
 
40
70
 
41
71
  Copyright
42
72
  -----------------------------------------------
43
73
  MIT. David Chen at eoe.cn.
74
+
75
+
76
+ Related
77
+ -----------------------------------------------
78
+ ### Projects
79
+ * https://github.com/paulasmuth/fnordmetric FnordMetric is a redis/ruby-based realtime Event-Tracking app
80
+ * https://github.com/thirtysixthspan/descriptive_statistics adds methods to the Enumerable module to allow easy calculation of basic descriptive statistics for a set of data
81
+ * https://github.com/tmcw/simple-statistics simple statistics for javascript in node and the browser
82
+ * https://github.com/clbustos/statsample/ A suite for basic and advanced statistics on Ruby.
83
+ * https://github.com/SciRuby/sciruby Tools for scientific computation in Ruby/Rails
84
+
85
+ ### Articles
86
+ * http://www.slideshare.net/WombatNation/logging-app-behavior-to-mongo-db
87
+
88
+ ### Event collector
89
+ * https://github.com/fluent
90
+ * https://github.com/logstash/logstash
91
+
92
+ ### Admin interface
93
+ * http://three.kibana.org/ browser based analytics and search interface to Logstash and other timestamped data sets stored in ElasticSearch.
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
1
  # encoding: UTF-8
2
2
 
3
- require 'rake'
3
+ require "bundler/gem_tasks"
4
4
  require 'rake/testtask'
5
5
 
6
6
  Rake::TestTask.new do |t|
data/lib/statlysis.rb CHANGED
@@ -1,134 +1,92 @@
1
1
  # encoding: UTF-8
2
+ #
2
3
  # Sequel的操作均需通过Symbol
3
4
  #
4
5
  # 删除匹配的统计表
5
6
  # Statlysis.sequel.tables.select {|i| i.to_s.match(//i) }.each {|i| Statlysis.sequel.drop_table i }
6
7
 
8
+ # TODO Statlysis.sequel.tables.map {|t| eval "class ::#{t.to_s.camelize} < ActiveRecord::Base; self.establish_connection Statlysis.database_opts; self.table_name = :#{t}; end; #{t.to_s.camelize}" }
9
+
7
10
  require "active_support/all"
11
+ Time.zone ||= Time.now.utc_offset # require activesupport
12
+
13
+ require "active_support/core_ext"
8
14
  require 'active_support/core_ext/module/attribute_accessors.rb'
9
15
  require 'active_record'
10
- require 'rails'
11
- %w[yaml sequel only_one_rake mongoid].map(&method(:require))
16
+ require 'activerecord_idnamecache'
17
+ %w[yaml sequel mongoid].map(&method(:require))
12
18
 
13
- module Statlysis
14
- Units = %w[hour day week month year]
15
- DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
16
-
17
- def self.setup_stat_table_and_model cron, tablename = nil
18
- tablename = cron.stat_table_name if tablename.nil?
19
- tablename ||= cron.stat_table.first_source_table
20
- cron.stat_table = Statlysis.sequel[tablename.to_sym]
21
-
22
- str = tablename.to_s.singularize.camelize
23
- eval("class ::#{str} < Sequel::Model;
24
- self.set_dataset :#{tablename}
25
- def self.[] item_id
26
- JSON.parse(find_or_create(:pattern => item_id).result) rescue []
27
- end
28
- end; ")
29
- cron.stat_model = str.constantize
30
- end
31
-
32
- end
19
+ # Fake a Rails environment
20
+ module Rails; end
33
21
 
34
- require 'statlysis/common'
35
- require 'statlysis/timeseries'
36
- require 'statlysis/clock'
37
- require 'statlysis/rake'
38
- require 'statlysis/cron'
39
- require 'statlysis/similar'
22
+ require 'statlysis/constants'
40
23
 
41
24
  module Statlysis
42
- mattr_accessor :sequel, :default_time_columns, :database_opts, :tablename_default_pre
43
- Units.each {|unit| module_eval "mattr_accessor :#{unit}_crons; self.#{unit}_crons = []" }
44
- [:realtime, :similar, :hotest].each do |sym|
45
- sym = "#{sym}_crons".to_sym
46
- mattr_accessor sym; self.send "#{sym}=", []
47
- end
48
- # TODO _crons uniq, no readd
49
- extend self
50
-
51
- # 会在自动拼接统计数据库表名时去除这些时间字段
52
- def update_time_columns *columns
53
- self.default_time_columns ||= [:created_at, :updated_at]
54
- columns.each {|column| self.default_time_columns.push column }
55
- self.default_time_columns = self.default_time_columns.uniq
56
- end
25
+ class << self
26
+ def setup &blk
27
+ raise "Need to setup proc" if not blk
57
28
 
58
- def set_database sym_or_hash
59
- self.database_opts = if sym_or_hash.is_a? Symbol
60
- YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
61
- elsif Hash
62
- sym_or_hash
63
- else
64
- raise "Statlysis#set_database only support symbol or hash params"
29
+ logger.info "Start to setup Statlysis"
30
+ time_log do
31
+ self.config.instance_exec(&blk)
32
+ end
33
+ logger.info
65
34
  end
66
- self.sequel = Sequel.connect self.database_opts.except('database')
67
- self.sequel.execute("CREATE DATABASE IF NOT EXISTS #{self.database_opts['database']} DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;")
68
- self.sequel.use self.database_opts['database']
69
- # Statlysis.sequel.tables.map {|t| eval "class ::#{t.to_s.camelize} < ActiveRecord::Base; self.establish_connection Statlysis.database_opts; self.table_name = :#{t}; end; #{t.to_s.camelize}" }
70
- end
71
35
 
72
- def set_tablename_default_pre str
73
- self.tablename_default_pre = str.to_s
74
- end
36
+ def time_log text = nil
37
+ t = Time.now
38
+ logger.info text if text
39
+ yield if block_given?
40
+ logger.info "Time spend #{(Time.now - t).round(2)} seconds."
41
+ logger.info "-" * 42
42
+ end
75
43
 
76
- def daily source, time_column = :created_at; timely source, :time_unit => :day, :time_column => time_column end
77
- def hourly source, time_column = :created_at; timely source, :time_unit => :hour, :time_column => time_column end
44
+ # delegate config methods to Configuration
45
+ def config; Configuration.instance end
46
+ require 'active_support/core_ext/module/delegation.rb'
47
+ [:sequel, :set_database, :check_set_database,
48
+ :default_time_zone,
49
+ :set_tablename_default_pre, :tablename_default_pre
50
+ ].each do |sym|
51
+ delegate sym, :to => :config
52
+ end
78
53
 
79
- def check_set_database; raise "Please setup database first" if sequel.nil? end
54
+ attr_accessor :logger
55
+ Statlysis.logger ||= Logger.new($stdout)
80
56
 
81
- def timely source, opts
82
- self.check_set_database
83
- opts.reverse_merge! :time_column => :created_at, :time_unit => :day
84
- t = Timely.new source, opts
85
- module_eval("self.#{opts[:time_unit]}_crons").push t
86
- end
57
+ def source_to_database_type; @_source_to_database_type ||= {} end
87
58
 
88
- # the real requirement is to compute lastest items group by special pattens, like user_id, url prefix, ...
89
- def lastest_visits source, opts
90
- self.check_set_database
91
- opts.reverse_merge! :time_column => :created_at
92
- self.realtime_crons.push LastestVisits.new(source, opts)
93
- end
94
59
 
95
- # TODO 为什么一层proc的话会直接执行的
96
- def hotest_items key, id_to_score_and_time_hash = {}
97
- _p = proc { if block_given?
98
- (proc do
99
- id_to_score_and_time_hash = Hash.new
100
- yield id_to_score_and_time_hash
101
- id_to_score_and_time_hash
102
- end)
103
- else
104
- (proc { id_to_score_and_time_hash })
105
- end}
106
-
107
- self.hotest_crons.push HotestItems.new(key, _p)
60
+ def daily; CronSet.new(Statlysis.config.day_crons) end
61
+ def hourly; CronSet.new(Statlysis.config.hour_crons) end
62
+
108
63
  end
109
64
 
110
- # TODO support mongoid
111
- def similar_items model_name, id_to_text_hash = {}
112
- _p = if block_given?
113
- (proc do
114
- id_to_text_hash = Hash.new {|hash, key| hash[key] = "" }
115
- yield id_to_text_hash
116
- id_to_text_hash
117
- end)
118
- else
119
- (proc { id_to_text_hash })
120
- end
65
+ end
121
66
 
122
- self.similar_crons.push Similar.new(model_name, _p)
123
- end
67
+ require 'statlysis/utils'
68
+ require 'statlysis/configuration'
69
+ require 'statlysis/common'
70
+ require 'statlysis/timeseries'
71
+ require 'statlysis/clock'
72
+ require 'statlysis/rake'
73
+ require 'statlysis/cron'
74
+ require 'statlysis/cron_set'
75
+ require 'statlysis/similar'
76
+ require 'statlysis/multiple_dataset'
124
77
 
78
+ module Statlysis
79
+ require 'short_inspect'
80
+ ShortInspect.apply_to Cron, CronSet, MultipleDataset
81
+ ShortInspect.apply_minimal_to ActiveRecord::Relation # lazy load
125
82
  end
126
83
 
127
84
 
85
+ # load rake tasks
128
86
  module Statlysis
129
87
  class Railtie < Rails::Railtie
130
88
  rake_tasks do
131
89
  load File.expand_path('../statlysis/rake.rb', __FILE__)
132
90
  end
133
- end if defined? Rails
134
- end
91
+ end
92
+ end if defined? Rails::Railtie
@@ -1,12 +1,11 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module Statlysis
4
- DateTime1970 = DateTime.parse("19700101").in_time_zone
5
-
6
4
  class Clock
7
5
  attr_accessor :clock
8
6
  include Common
9
7
 
8
+ # feature is a string
10
9
  def initialize feature, default_time
11
10
  raise "Please assign default_time params" if not default_time
12
11
  cron.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")
@@ -18,7 +17,8 @@ module Statlysis
18
17
  index :feature, :unique => true
19
18
  end
20
19
  end
21
- Statlysis.setup_stat_table_and_model cron
20
+ h = Utils.setup_pattern_table_and_model cron.stat_table_name
21
+ cron.stat_model = h[:model]
22
22
  cron.clock = cron.stat_model.find_or_create(:feature => feature)
23
23
  cron.clock.update :t => default_time if cron.current.nil?
24
24
  cron
@@ -2,26 +2,14 @@
2
2
 
3
3
  module Statlysis
4
4
  module Common
5
- attr_accessor :stat_table_name, :stat_model, :stat_table
6
- def pattern_table_and_model tn
7
- # ensure statlysis table
8
- tn = tn.pluralize
9
- unless Statlysis.sequel.table_exists?(tn)
10
- Statlysis.sequel.create_table tn, DefaultTableOpts.merge(:engine => "InnoDB") do
11
- primary_key :id
12
- String :pattern
13
- index :pattern
14
- end
15
- Statlysis.sequel.add_column tn, :result, String, :text => true
16
- end
5
+ extend ActiveSupport::Concern
17
6
 
18
- # generate a statlysis model
19
- cron.stat_model = Statlysis.setup_stat_table_and_model cron, tn
7
+ self.included do
8
+ attr_accessor :stat_table_name, :stat_model
20
9
  end
21
10
 
22
11
  def cron; self end
23
- # TODO remove puts, conflict user, user logger
24
- def puts(*strs); $stdout.puts(*strs) if ENV['DEBUG'] end
12
+ delegate :logger, :to => Statlysis
25
13
 
26
14
  end
27
15
  end
@@ -1,9 +1,104 @@
1
1
  # encoding: UTF-8
2
+ #
3
+ # see original implementation at http://mvj3.github.io/2013/04/17/statlysis-analysis-design-solve-two-problems-lazy-loading-and-scope/
4
+ #
5
+
6
+ require 'singleton'
2
7
 
3
8
  module Statlysis
4
- # TODO config methods here
5
9
  class Configuration
6
- def inherited(base)
10
+ include Singleton
11
+
12
+ attr_accessor :sequel, :default_time_columns, :default_time_zone, :database_opts, :tablename_default_pre
13
+ attr_accessor :is_skip_database_index
14
+ TimeUnits.each {|unit| module_eval "attr_accessor :#{unit}_crons; self.instance.#{unit}_crons = []" }
15
+ [:realtime, :similar, :hotest].each do |sym|
16
+ sym = "#{sym}_crons"
17
+ attr_accessor sym; self.instance.send "#{sym}=", []
18
+ end
19
+ self.instance.send "tablename_default_pre=", "st"
20
+ self.instance.send "is_skip_database_index=", false
21
+
22
+ # 会在自动拼接统计数据库表名时去除这些时间字段
23
+ def update_time_columns *columns
24
+ self.default_time_columns ||= [:created_at, :updated_at]
25
+ columns.each {|column| self.default_time_columns.push column }
26
+ self.default_time_columns = self.default_time_columns.uniq
27
+ end
28
+
29
+ def set_database sym_or_hash
30
+ self.database_opts = if sym_or_hash.is_a? Symbol
31
+ YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
32
+ elsif Hash
33
+ sym_or_hash
34
+ else
35
+ raise "Statlysis#set_database only support symbol or hash params"
36
+ end
37
+ self.sequel = Sequel.connect(self.database_opts)
38
+
39
+ # 初始化键值model
40
+ ["#{self.tablename_default_pre}_single_kvs", "#{self.tablename_default_pre}_single_kv_histories"].each do |tn|
41
+ Utils.setup_pattern_table_and_model tn
42
+ end
43
+ return self
44
+ end
45
+
46
+ def set_default_time_zone zone
47
+ self.default_time_zone = zone
48
+ return self
49
+ end
50
+
51
+ def set_tablename_default_pre str
52
+ self.tablename_default_pre = str.to_s
53
+ end
54
+
55
+ def daily source, opts = {}; timely source, {:time_unit => :day }.merge(opts) end
56
+ def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
57
+
58
+ def check_set_database; raise "Please setup database first" if sequel.nil? end
59
+
60
+ def timely source, opts
61
+ self.check_set_database
62
+ opts.reverse_merge! :time_column => :created_at, :time_unit => :day
63
+ t = Timely.new source, opts
64
+ self.send("#{opts[:time_unit]}_crons").push t
65
+ end
66
+
67
+ # the real requirement is to compute lastest items group by special pattens, like user_id, url prefix, ...
68
+ def lastest_visits source, opts
69
+ self.check_set_database
70
+ opts.reverse_merge! :time_column => :created_at
71
+ self.realtime_crons.push LastestVisits.new(source, opts)
72
+ end
73
+
74
+ # TODO 为什么一层proc的话会直接执行的
75
+ def hotest_items key, id_to_score_and_time_hash = {}
76
+ _p = proc { if block_given?
77
+ (proc do
78
+ id_to_score_and_time_hash = Hash.new
79
+ yield id_to_score_and_time_hash
80
+ id_to_score_and_time_hash
81
+ end)
82
+ else
83
+ (proc { id_to_score_and_time_hash })
84
+ end}
85
+
86
+ self.hotest_crons.push HotestItems.new(key, _p)
87
+ end
88
+
89
+ # TODO support mongoid
90
+ def similar_items model_name, id_to_text_hash = {}
91
+ _p = if block_given?
92
+ (proc do
93
+ id_to_text_hash = Hash.new {|hash, key| hash[key] = "" }
94
+ yield id_to_text_hash
95
+ id_to_text_hash
96
+ end)
97
+ else
98
+ (proc { id_to_text_hash })
99
+ end
100
+
101
+ self.similar_crons.push Similar.new(model_name, _p)
7
102
  end
8
103
 
9
104
  end