statlysis 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/.gitignore +3 -0
  2. data/Guardfile +14 -0
  3. data/README.markdown +77 -27
  4. data/Rakefile +1 -1
  5. data/lib/statlysis.rb +59 -101
  6. data/lib/statlysis/clock.rb +3 -3
  7. data/lib/statlysis/common.rb +4 -16
  8. data/lib/statlysis/configuration.rb +97 -2
  9. data/lib/statlysis/constants.rb +10 -0
  10. data/lib/statlysis/cron.rb +40 -42
  11. data/lib/statlysis/cron/count.rb +16 -58
  12. data/lib/statlysis/cron/count/dimensions.rb +7 -0
  13. data/lib/statlysis/cron/count/timely.rb +63 -0
  14. data/lib/statlysis/cron/top.rb +4 -104
  15. data/lib/statlysis/cron/top/hotest_items.rb +47 -0
  16. data/lib/statlysis/cron/top/lastest_visits.rb +53 -0
  17. data/lib/statlysis/cron_set.rb +26 -0
  18. data/lib/statlysis/dataset.rb +6 -0
  19. data/lib/statlysis/javascript/count.rb +3 -3
  20. data/lib/statlysis/multiple_dataset.rb +69 -0
  21. data/lib/statlysis/multiple_dataset/active_record.rb +36 -0
  22. data/lib/statlysis/multiple_dataset/mongoid.rb +54 -0
  23. data/lib/statlysis/rake.rb +6 -5
  24. data/lib/statlysis/similar.rb +11 -11
  25. data/lib/statlysis/timeseries.rb +12 -9
  26. data/lib/statlysis/utils.rb +40 -0
  27. data/statlysis.gemspec +13 -3
  28. data/test/config/database.yml +9 -0
  29. data/test/config/mongoid.yml +36 -0
  30. data/test/data/.gitkeep +0 -0
  31. data/test/data/code_gists_20130724.csv +1459 -0
  32. data/test/helper.rb +41 -3
  33. data/test/migrate/1_active_record.rb +8 -0
  34. data/test/models/.gitkeep +0 -0
  35. data/test/models/code_gist.rb +5 -0
  36. data/test/models/eoe_log.rb +53 -0
  37. data/test/test_daily_count.rb +22 -0
  38. data/test/test_mapreduce.rb +0 -13
  39. data/test/test_single_log_in_multiple_collections.rb +22 -0
  40. data/test/test_statlysis.rb +5 -50
  41. data/test/test_timeseries.rb +46 -0
  42. metadata +133 -12
  43. data/Gemfile.lock +0 -110
  44. data/LICENSE.txt +0 -20
  45. data/test/models/company.rb +0 -12
  46. data/test/models/employee.rb +0 -14
data/.gitignore CHANGED
@@ -49,3 +49,6 @@ tmtags
49
49
  *.rbc
50
50
 
51
51
  coverage
52
+ *.gem
53
+ Gemfile.lock
54
+ tmp
data/Guardfile ADDED
@@ -0,0 +1,14 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard :test do
5
+ watch(%r{^lib/(.+)\.rb$}) { |m| "test/#{m[1]}_test.rb" }
6
+ watch(%r{^test/.+_test\.rb$})
7
+ watch('test/test_helper.rb') { "test" }
8
+
9
+ # Rails example
10
+ watch(%r{^app/models/(.+)\.rb$}) { |m| "test/unit/#{m[1]}_test.rb" }
11
+ watch(%r{^app/controllers/(.+)\.rb$}) { |m| "test/functional/#{m[1]}_test.rb" }
12
+ watch(%r{^app/views/.+\.rb$}) { "test/integration" }
13
+ watch('app/controllers/application_controller.rb') { ["test/functional", "test/integration"] }
14
+ end
data/README.markdown CHANGED
@@ -1,43 +1,93 @@
1
- statlysis
1
+ Statlysis
2
2
  ===============================================
3
- statistical analysis in ruby dsl
3
+ Statistical & Analysis in Ruby DSL
4
4
 
5
5
  Usage
6
6
  -----------------------------------------------
7
+ ### setup
8
+
7
9
  ```ruby
8
- module Statlysis
10
+ Statlysis.setup do
9
11
  set_database :statlysis
10
- update_time_columns :t
11
- set_tablename_default_pre :st
12
-
13
- # 初始化键值model
14
- Statlysis::Top.new('', :test => true).pattern_table_and_model 'st_single_kvs'
15
- Statlysis::Top.new('', :test => true).pattern_table_and_model 'st_single_kv_histories'
16
-
17
- # 日常count
18
- EoeLog.class # preload EoeLogTest
19
- @log_model = IS_DEVELOP ? EoeLogTest : EoeLog
20
- hourly @log_model, :t
21
- daily @log_model, :t
22
- daily @log_model.where(:ui => 0), :t
23
- daily @log_model.where(:ui => {"$ne" => 0}), :t
24
-
25
- # 统计各个模块
26
- daily @log_model.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}), :t
27
- [:www, :code, :skill, :book, :edu, :news, :wiki, :salon, :android].each do |site|
28
- daily @log_model.where(:do => DOMAINS_HASH[site]), :t
12
+
13
+ hourly :time_column => :t
14
+ [EoeLog,
15
+ EoeLog.where(:ui => 0),
16
+ EoeLog.where(:ui => {"$ne" => 0}),
17
+ Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}),
18
+ EoeLog.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}),
19
+ ].each do |s|
20
+ daily s, :time_column => :t
29
21
  end
22
+ end
23
+ ```
24
+
25
+ ### access
26
+
27
+ ```ruby
28
+ Statlysis.daily # => return daily crons
29
+ Statlysis.daily.run # => run daily crons
30
+ Statlysis.daily[/name_regexp/] # => return matched daily crons
31
+ ```
32
+
33
+ ### process
34
+
35
+ ```irb
36
+ [23] pry(#<Statlysis::Configuration>)> Statlysis.daily['multi'].first
30
37
  ```
31
38
 
39
+ Features
40
+ -----------------------------------------------
41
+ * Supports time columns that are stored as integers.
42
+
32
43
  TODO
33
44
  -----------------------------------------------
34
- 1. Admin interface
35
- 2. statistical query api in Ruby and HTTP
36
- 3. Interacting with Javascript charting library, e.g. Highcharts, D3.
37
- 4. Add namespace to DSL, like rake
38
- 5. More tests
45
+ * Admin interface
46
+ * statistical query api in Ruby and HTTP
47
+ * Interacting with Javascript charting library, e.g. Highcharts, D3.
48
+ * More tests
49
+ * Add @criteria to MultipleDataset
50
+
51
+
52
+ Statistical Process
53
+ -----------------------------------------------
54
+ 1. Delete invalid statistical data, e.g. data dated tomorrow
55
+ 2. Count data within the specified time by the dimensions
56
+ 3. Delete overlapping data, and insert new data
57
+
58
+
59
+ FAQ
60
+ -----------------------------------------------
61
+ Q: Why use Sequel instead of ActiveRecord?
62
+
63
+ A: When initializing an ORM object, ActiveRecord is 3 times slower than Sequel, and we only need the basic operations, including read, write, enumerate, etc. See more details in [Quick dive into Ruby ORM object initialization](http://merbist.com/2012/02/23/quick-dive-into-ruby-orm-object-initialization/).
64
+
65
+
66
+ Q: Why do you recommend using multiple collections to store logs rather than a single collection, or a capped collection?
67
+
68
+ A: MongoDB can effectively reuse the space freed by removing entire collections without causing data fragmentation. See details at http://docs.mongodb.org/manual/use-cases/storing-log-data/#multiple-collections-single-database
39
69
 
40
70
 
41
71
  Copyright
42
72
  -----------------------------------------------
43
73
  MIT. David Chen at eoe.cn.
74
+
75
+
76
+ Related
77
+ -----------------------------------------------
78
+ ### Projects
79
+ * https://github.com/paulasmuth/fnordmetric FnordMetric is a redis/ruby-based realtime Event-Tracking app
80
+ * https://github.com/thirtysixthspan/descriptive_statistics adds methods to the Enumerable module to allow easy calculation of basic descriptive statistics for a set of data
81
+ * https://github.com/tmcw/simple-statistics simple statistics for javascript in node and the browser
82
+ * https://github.com/clbustos/statsample/ A suite for basic and advanced statistics on Ruby.
83
+ * https://github.com/SciRuby/sciruby Tools for scientific computation in Ruby/Rails
84
+
85
+ ### Articles
86
+ * http://www.slideshare.net/WombatNation/logging-app-behavior-to-mongo-db
87
+
88
+ ### Event collector
89
+ * https://github.com/fluent
90
+ * https://github.com/logstash/logstash
91
+
92
+ ### Admin interface
93
+ * http://three.kibana.org/ browser based analytics and search interface to Logstash and other timestamped data sets stored in ElasticSearch.
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
1
  # encoding: UTF-8
2
2
 
3
- require 'rake'
3
+ require "bundler/gem_tasks"
4
4
  require 'rake/testtask'
5
5
 
6
6
  Rake::TestTask.new do |t|
data/lib/statlysis.rb CHANGED
@@ -1,134 +1,92 @@
1
1
  # encoding: UTF-8
2
+ #
2
3
  # Sequel的操作均需通过Symbol
3
4
  #
4
5
  # 删除匹配的统计表
5
6
  # Statlysis.sequel.tables.select {|i| i.to_s.match(//i) }.each {|i| Statlysis.sequel.drop_table i }
6
7
 
8
+ # TODO Statlysis.sequel.tables.map {|t| eval "class ::#{t.to_s.camelize} < ActiveRecord::Base; self.establish_connection Statlysis.database_opts; self.table_name = :#{t}; end; #{t.to_s.camelize}" }
9
+
7
10
  require "active_support/all"
11
+ Time.zone ||= Time.now.utc_offset # require activesupport
12
+
13
+ require "active_support/core_ext"
8
14
  require 'active_support/core_ext/module/attribute_accessors.rb'
9
15
  require 'active_record'
10
- require 'rails'
11
- %w[yaml sequel only_one_rake mongoid].map(&method(:require))
16
+ require 'activerecord_idnamecache'
17
+ %w[yaml sequel mongoid].map(&method(:require))
12
18
 
13
- module Statlysis
14
- Units = %w[hour day week month year]
15
- DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
16
-
17
- def self.setup_stat_table_and_model cron, tablename = nil
18
- tablename = cron.stat_table_name if tablename.nil?
19
- tablename ||= cron.stat_table.first_source_table
20
- cron.stat_table = Statlysis.sequel[tablename.to_sym]
21
-
22
- str = tablename.to_s.singularize.camelize
23
- eval("class ::#{str} < Sequel::Model;
24
- self.set_dataset :#{tablename}
25
- def self.[] item_id
26
- JSON.parse(find_or_create(:pattern => item_id).result) rescue []
27
- end
28
- end; ")
29
- cron.stat_model = str.constantize
30
- end
31
-
32
- end
19
+ # Fake a Rails environment
20
+ module Rails; end
33
21
 
34
- require 'statlysis/common'
35
- require 'statlysis/timeseries'
36
- require 'statlysis/clock'
37
- require 'statlysis/rake'
38
- require 'statlysis/cron'
39
- require 'statlysis/similar'
22
+ require 'statlysis/constants'
40
23
 
41
24
  module Statlysis
42
- mattr_accessor :sequel, :default_time_columns, :database_opts, :tablename_default_pre
43
- Units.each {|unit| module_eval "mattr_accessor :#{unit}_crons; self.#{unit}_crons = []" }
44
- [:realtime, :similar, :hotest].each do |sym|
45
- sym = "#{sym}_crons".to_sym
46
- mattr_accessor sym; self.send "#{sym}=", []
47
- end
48
- # TODO _crons uniq, no readd
49
- extend self
50
-
51
- # 会在自动拼接统计数据库表名时去除这些时间字段
52
- def update_time_columns *columns
53
- self.default_time_columns ||= [:created_at, :updated_at]
54
- columns.each {|column| self.default_time_columns.push column }
55
- self.default_time_columns = self.default_time_columns.uniq
56
- end
25
+ class << self
26
+ def setup &blk
27
+ raise "Need to setup proc" if not blk
57
28
 
58
- def set_database sym_or_hash
59
- self.database_opts = if sym_or_hash.is_a? Symbol
60
- YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
61
- elsif Hash
62
- sym_or_hash
63
- else
64
- raise "Statlysis#set_database only support symbol or hash params"
29
+ logger.info "Start to setup Statlysis"
30
+ time_log do
31
+ self.config.instance_exec(&blk)
32
+ end
33
+ logger.info
65
34
  end
66
- self.sequel = Sequel.connect self.database_opts.except('database')
67
- self.sequel.execute("CREATE DATABASE IF NOT EXISTS #{self.database_opts['database']} DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;")
68
- self.sequel.use self.database_opts['database']
69
- # Statlysis.sequel.tables.map {|t| eval "class ::#{t.to_s.camelize} < ActiveRecord::Base; self.establish_connection Statlysis.database_opts; self.table_name = :#{t}; end; #{t.to_s.camelize}" }
70
- end
71
35
 
72
- def set_tablename_default_pre str
73
- self.tablename_default_pre = str.to_s
74
- end
36
+ def time_log text = nil
37
+ t = Time.now
38
+ logger.info text if text
39
+ yield if block_given?
40
+ logger.info "Time spend #{(Time.now - t).round(2)} seconds."
41
+ logger.info "-" * 42
42
+ end
75
43
 
76
- def daily source, time_column = :created_at; timely source, :time_unit => :day, :time_column => time_column end
77
- def hourly source, time_column = :created_at; timely source, :time_unit => :hour, :time_column => time_column end
44
+ # delegate config methods to Configuration
45
+ def config; Configuration.instance end
46
+ require 'active_support/core_ext/module/delegation.rb'
47
+ [:sequel, :set_database, :check_set_database,
48
+ :default_time_zone,
49
+ :set_tablename_default_pre, :tablename_default_pre
50
+ ].each do |sym|
51
+ delegate sym, :to => :config
52
+ end
78
53
 
79
- def check_set_database; raise "Please setup database first" if sequel.nil? end
54
+ attr_accessor :logger
55
+ Statlysis.logger ||= Logger.new($stdout)
80
56
 
81
- def timely source, opts
82
- self.check_set_database
83
- opts.reverse_merge! :time_column => :created_at, :time_unit => :day
84
- t = Timely.new source, opts
85
- module_eval("self.#{opts[:time_unit]}_crons").push t
86
- end
57
+ def source_to_database_type; @_source_to_database_type ||= {} end
87
58
 
88
- # the real requirement is to compute lastest items group by special pattens, like user_id, url prefix, ...
89
- def lastest_visits source, opts
90
- self.check_set_database
91
- opts.reverse_merge! :time_column => :created_at
92
- self.realtime_crons.push LastestVisits.new(source, opts)
93
- end
94
59
 
95
- # TODO 为什么一层proc的话会直接执行的
96
- def hotest_items key, id_to_score_and_time_hash = {}
97
- _p = proc { if block_given?
98
- (proc do
99
- id_to_score_and_time_hash = Hash.new
100
- yield id_to_score_and_time_hash
101
- id_to_score_and_time_hash
102
- end)
103
- else
104
- (proc { id_to_score_and_time_hash })
105
- end}
106
-
107
- self.hotest_crons.push HotestItems.new(key, _p)
60
+ def daily; CronSet.new(Statlysis.config.day_crons) end
61
+ def hourly; CronSet.new(Statlysis.config.hour_crons) end
62
+
108
63
  end
109
64
 
110
- # TODO support mongoid
111
- def similar_items model_name, id_to_text_hash = {}
112
- _p = if block_given?
113
- (proc do
114
- id_to_text_hash = Hash.new {|hash, key| hash[key] = "" }
115
- yield id_to_text_hash
116
- id_to_text_hash
117
- end)
118
- else
119
- (proc { id_to_text_hash })
120
- end
65
+ end
121
66
 
122
- self.similar_crons.push Similar.new(model_name, _p)
123
- end
67
+ require 'statlysis/utils'
68
+ require 'statlysis/configuration'
69
+ require 'statlysis/common'
70
+ require 'statlysis/timeseries'
71
+ require 'statlysis/clock'
72
+ require 'statlysis/rake'
73
+ require 'statlysis/cron'
74
+ require 'statlysis/cron_set'
75
+ require 'statlysis/similar'
76
+ require 'statlysis/multiple_dataset'
124
77
 
78
+ module Statlysis
79
+ require 'short_inspect'
80
+ ShortInspect.apply_to Cron, CronSet, MultipleDataset
81
+ ShortInspect.apply_minimal_to ActiveRecord::Relation # lazy load
125
82
  end
126
83
 
127
84
 
85
+ # load rake tasks
128
86
  module Statlysis
129
87
  class Railtie < Rails::Railtie
130
88
  rake_tasks do
131
89
  load File.expand_path('../statlysis/rake.rb', __FILE__)
132
90
  end
133
- end if defined? Rails
134
- end
91
+ end
92
+ end if defined? Rails::Railtie
data/lib/statlysis/clock.rb CHANGED
@@ -1,12 +1,11 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module Statlysis
4
- DateTime1970 = DateTime.parse("19700101").in_time_zone
5
-
6
4
  class Clock
7
5
  attr_accessor :clock
8
6
  include Common
9
7
 
8
+ # feature is a string
10
9
  def initialize feature, default_time
11
10
  raise "Please assign default_time params" if not default_time
12
11
  cron.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")
@@ -18,7 +17,8 @@ module Statlysis
18
17
  index :feature, :unique => true
19
18
  end
20
19
  end
21
- Statlysis.setup_stat_table_and_model cron
20
+ h = Utils.setup_pattern_table_and_model cron.stat_table_name
21
+ cron.stat_model = h[:model]
22
22
  cron.clock = cron.stat_model.find_or_create(:feature => feature)
23
23
  cron.clock.update :t => default_time if cron.current.nil?
24
24
  cron
data/lib/statlysis/common.rb CHANGED
@@ -2,26 +2,14 @@
2
2
 
3
3
  module Statlysis
4
4
  module Common
5
- attr_accessor :stat_table_name, :stat_model, :stat_table
6
- def pattern_table_and_model tn
7
- # ensure statlysis table
8
- tn = tn.pluralize
9
- unless Statlysis.sequel.table_exists?(tn)
10
- Statlysis.sequel.create_table tn, DefaultTableOpts.merge(:engine => "InnoDB") do
11
- primary_key :id
12
- String :pattern
13
- index :pattern
14
- end
15
- Statlysis.sequel.add_column tn, :result, String, :text => true
16
- end
5
+ extend ActiveSupport::Concern
17
6
 
18
- # generate a statlysis model
19
- cron.stat_model = Statlysis.setup_stat_table_and_model cron, tn
7
+ self.included do
8
+ attr_accessor :stat_table_name, :stat_model
20
9
  end
21
10
 
22
11
  def cron; self end
23
- # TODO remove puts, conflict user, user logger
24
- def puts(*strs); $stdout.puts(*strs) if ENV['DEBUG'] end
12
+ delegate :logger, :to => Statlysis
25
13
 
26
14
  end
27
15
  end
data/lib/statlysis/configuration.rb CHANGED
@@ -1,9 +1,104 @@
1
1
  # encoding: UTF-8
2
+ #
3
+ # see original implementation at http://mvj3.github.io/2013/04/17/statlysis-analysis-design-solve-two-problems-lazy-loading-and-scope/
4
+ #
5
+
6
+ require 'singleton'
2
7
 
3
8
  module Statlysis
4
- # TODO config methods here
5
9
  class Configuration
6
- def inherited(base)
10
+ include Singleton
11
+
12
+ attr_accessor :sequel, :default_time_columns, :default_time_zone, :database_opts, :tablename_default_pre
13
+ attr_accessor :is_skip_database_index
14
+ TimeUnits.each {|unit| module_eval "attr_accessor :#{unit}_crons; self.instance.#{unit}_crons = []" }
15
+ [:realtime, :similar, :hotest].each do |sym|
16
+ sym = "#{sym}_crons"
17
+ attr_accessor sym; self.instance.send "#{sym}=", []
18
+ end
19
+ self.instance.send "tablename_default_pre=", "st"
20
+ self.instance.send "is_skip_database_index=", false
21
+
22
+ # 会在自动拼接统计数据库表名时去除这些时间字段
23
+ def update_time_columns *columns
24
+ self.default_time_columns ||= [:created_at, :updated_at]
25
+ columns.each {|column| self.default_time_columns.push column }
26
+ self.default_time_columns = self.default_time_columns.uniq
27
+ end
28
+
29
+ def set_database sym_or_hash
30
+ self.database_opts = if sym_or_hash.is_a? Symbol
31
+ YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
32
+ elsif Hash
33
+ sym_or_hash
34
+ else
35
+ raise "Statlysis#set_database only support symbol or hash params"
36
+ end
37
+ self.sequel = Sequel.connect(self.database_opts)
38
+
39
+ # 初始化键值model
40
+ ["#{self.tablename_default_pre}_single_kvs", "#{self.tablename_default_pre}_single_kv_histories"].each do |tn|
41
+ Utils.setup_pattern_table_and_model tn
42
+ end
43
+ return self
44
+ end
45
+
46
+ def set_default_time_zone zone
47
+ self.default_time_zone = zone
48
+ return self
49
+ end
50
+
51
+ def set_tablename_default_pre str
52
+ self.tablename_default_pre = str.to_s
53
+ end
54
+
55
+ def daily source, opts = {}; timely source, {:time_unit => :day }.merge(opts) end
56
+ def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
57
+
58
+ def check_set_database; raise "Please setup database first" if sequel.nil? end
59
+
60
+ def timely source, opts
61
+ self.check_set_database
62
+ opts.reverse_merge! :time_column => :created_at, :time_unit => :day
63
+ t = Timely.new source, opts
64
+ self.send("#{opts[:time_unit]}_crons").push t
65
+ end
66
+
67
+ # the real requirement is to compute lastest items group by special pattens, like user_id, url prefix, ...
68
+ def lastest_visits source, opts
69
+ self.check_set_database
70
+ opts.reverse_merge! :time_column => :created_at
71
+ self.realtime_crons.push LastestVisits.new(source, opts)
72
+ end
73
+
74
+ # TODO 为什么一层proc的话会直接执行的
75
+ def hotest_items key, id_to_score_and_time_hash = {}
76
+ _p = proc { if block_given?
77
+ (proc do
78
+ id_to_score_and_time_hash = Hash.new
79
+ yield id_to_score_and_time_hash
80
+ id_to_score_and_time_hash
81
+ end)
82
+ else
83
+ (proc { id_to_score_and_time_hash })
84
+ end}
85
+
86
+ self.hotest_crons.push HotestItems.new(key, _p)
87
+ end
88
+
89
+ # TODO support mongoid
90
+ def similar_items model_name, id_to_text_hash = {}
91
+ _p = if block_given?
92
+ (proc do
93
+ id_to_text_hash = Hash.new {|hash, key| hash[key] = "" }
94
+ yield id_to_text_hash
95
+ id_to_text_hash
96
+ end)
97
+ else
98
+ (proc { id_to_text_hash })
99
+ end
100
+
101
+ self.similar_crons.push Similar.new(model_name, _p)
7
102
  end
8
103
 
9
104
  end