statlysis 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +51 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +110 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +43 -0
- data/Rakefile +11 -0
- data/lib/statlysis.rb +134 -0
- data/lib/statlysis/clock.rb +36 -0
- data/lib/statlysis/common.rb +27 -0
- data/lib/statlysis/configuration.rb +10 -0
- data/lib/statlysis/cron.rb +86 -0
- data/lib/statlysis/cron/count.rb +93 -0
- data/lib/statlysis/cron/top.rb +154 -0
- data/lib/statlysis/formula.rb +6 -0
- data/lib/statlysis/javascript/count.rb +37 -0
- data/lib/statlysis/map_reduce.rb +32 -0
- data/lib/statlysis/rake.rb +28 -0
- data/lib/statlysis/results.rb +17 -0
- data/lib/statlysis/similar.rb +89 -0
- data/lib/statlysis/timeseries.rb +41 -0
- data/statlysis.gemspec +30 -0
- data/test/helper.rb +17 -0
- data/test/models/company.rb +12 -0
- data/test/models/employee.rb +14 -0
- data/test/test_mapreduce.rb +26 -0
- data/test/test_statlysis.rb +76 -0
- data/test/test_timeseries.rb +6 -0
- metadata +216 -0
data/.document
ADDED
data/.gitignore
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# rcov generated
|
2
|
+
coverage
|
3
|
+
coverage.data
|
4
|
+
|
5
|
+
# rdoc generated
|
6
|
+
rdoc
|
7
|
+
|
8
|
+
# yard generated
|
9
|
+
doc
|
10
|
+
.yardoc
|
11
|
+
|
12
|
+
# bundler
|
13
|
+
.bundle
|
14
|
+
|
15
|
+
# jeweler generated
|
16
|
+
pkg
|
17
|
+
|
18
|
+
# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
|
19
|
+
#
|
20
|
+
# * Create a file at ~/.gitignore
|
21
|
+
# * Include files you want ignored
|
22
|
+
# * Run: git config --global core.excludesfile ~/.gitignore
|
23
|
+
#
|
24
|
+
# After doing this, these files will be ignored in all your git projects,
|
25
|
+
# saving you from having to 'pollute' every project you touch with them
|
26
|
+
#
|
27
|
+
# Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # of the line.)
|
28
|
+
#
|
29
|
+
# For MacOS:
|
30
|
+
#
|
31
|
+
.DS_Store
|
32
|
+
|
33
|
+
# For TextMate
|
34
|
+
*.tmproj
|
35
|
+
tmtags
|
36
|
+
|
37
|
+
# For emacs:
|
38
|
+
*~
|
39
|
+
\#*
|
40
|
+
.\#*
|
41
|
+
|
42
|
+
# For vim:
|
43
|
+
*.swp
|
44
|
+
|
45
|
+
# For redcar:
|
46
|
+
.redcar
|
47
|
+
|
48
|
+
# For rubinius:
|
49
|
+
*.rbc
|
50
|
+
|
51
|
+
coverage
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
statlysis (0.0.1)
|
5
|
+
activerecord
|
6
|
+
activerecord_idnamecache
|
7
|
+
activesupport
|
8
|
+
mongoid
|
9
|
+
mysql2
|
10
|
+
only_one_rake
|
11
|
+
rails
|
12
|
+
rake
|
13
|
+
sequel
|
14
|
+
|
15
|
+
GEM
|
16
|
+
remote: http://rubygems.org/
|
17
|
+
specs:
|
18
|
+
actionmailer (3.2.13)
|
19
|
+
actionpack (= 3.2.13)
|
20
|
+
mail (~> 2.5.3)
|
21
|
+
actionpack (3.2.13)
|
22
|
+
activemodel (= 3.2.13)
|
23
|
+
activesupport (= 3.2.13)
|
24
|
+
builder (~> 3.0.0)
|
25
|
+
erubis (~> 2.7.0)
|
26
|
+
journey (~> 1.0.4)
|
27
|
+
rack (~> 1.4.5)
|
28
|
+
rack-cache (~> 1.2)
|
29
|
+
rack-test (~> 0.6.1)
|
30
|
+
sprockets (~> 2.2.1)
|
31
|
+
activemodel (3.2.13)
|
32
|
+
activesupport (= 3.2.13)
|
33
|
+
builder (~> 3.0.0)
|
34
|
+
activerecord (3.2.13)
|
35
|
+
activemodel (= 3.2.13)
|
36
|
+
activesupport (= 3.2.13)
|
37
|
+
arel (~> 3.0.2)
|
38
|
+
tzinfo (~> 0.3.29)
|
39
|
+
activerecord_idnamecache (0.1)
|
40
|
+
activeresource (3.2.13)
|
41
|
+
activemodel (= 3.2.13)
|
42
|
+
activesupport (= 3.2.13)
|
43
|
+
activesupport (3.2.13)
|
44
|
+
i18n (= 0.6.1)
|
45
|
+
multi_json (~> 1.0)
|
46
|
+
arel (3.0.2)
|
47
|
+
builder (3.0.4)
|
48
|
+
erubis (2.7.0)
|
49
|
+
hike (1.2.3)
|
50
|
+
i18n (0.6.1)
|
51
|
+
journey (1.0.4)
|
52
|
+
json (1.8.0)
|
53
|
+
mail (2.5.4)
|
54
|
+
mime-types (~> 1.16)
|
55
|
+
treetop (~> 1.4.8)
|
56
|
+
mime-types (1.23)
|
57
|
+
mongoid (3.1.4)
|
58
|
+
activemodel (~> 3.2)
|
59
|
+
moped (~> 1.4)
|
60
|
+
origin (~> 1.0)
|
61
|
+
tzinfo (~> 0.3.22)
|
62
|
+
moped (1.5.0)
|
63
|
+
multi_json (1.7.7)
|
64
|
+
mysql2 (0.3.11)
|
65
|
+
only_one_rake (0.0.4)
|
66
|
+
origin (1.1.0)
|
67
|
+
polyglot (0.3.3)
|
68
|
+
rack (1.4.5)
|
69
|
+
rack-cache (1.2)
|
70
|
+
rack (>= 0.4)
|
71
|
+
rack-ssl (1.3.3)
|
72
|
+
rack
|
73
|
+
rack-test (0.6.2)
|
74
|
+
rack (>= 1.0)
|
75
|
+
rails (3.2.13)
|
76
|
+
actionmailer (= 3.2.13)
|
77
|
+
actionpack (= 3.2.13)
|
78
|
+
activerecord (= 3.2.13)
|
79
|
+
activeresource (= 3.2.13)
|
80
|
+
activesupport (= 3.2.13)
|
81
|
+
bundler (~> 1.0)
|
82
|
+
railties (= 3.2.13)
|
83
|
+
railties (3.2.13)
|
84
|
+
actionpack (= 3.2.13)
|
85
|
+
activesupport (= 3.2.13)
|
86
|
+
rack-ssl (~> 1.3.2)
|
87
|
+
rake (>= 0.8.7)
|
88
|
+
rdoc (~> 3.4)
|
89
|
+
thor (>= 0.14.6, < 2.0)
|
90
|
+
rake (10.1.0)
|
91
|
+
rdoc (3.12.2)
|
92
|
+
json (~> 1.4)
|
93
|
+
sequel (4.0.0)
|
94
|
+
sprockets (2.2.2)
|
95
|
+
hike (~> 1.2)
|
96
|
+
multi_json (~> 1.0)
|
97
|
+
rack (~> 1.0)
|
98
|
+
tilt (~> 1.1, != 1.3.0)
|
99
|
+
thor (0.18.1)
|
100
|
+
tilt (1.4.1)
|
101
|
+
treetop (1.4.14)
|
102
|
+
polyglot
|
103
|
+
polyglot (>= 0.3.1)
|
104
|
+
tzinfo (0.3.37)
|
105
|
+
|
106
|
+
PLATFORMS
|
107
|
+
ruby
|
108
|
+
|
109
|
+
DEPENDENCIES
|
110
|
+
statlysis!
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 David Chen at eoe.cn
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
statlysis
|
2
|
+
===============================================
|
3
|
+
statistical analysis in ruby dsl
|
4
|
+
|
5
|
+
Usage
|
6
|
+
-----------------------------------------------
|
7
|
+
```ruby
|
8
|
+
module Statlysis
|
9
|
+
set_database :statlysis
|
10
|
+
update_time_columns :t
|
11
|
+
set_tablename_default_pre :st
|
12
|
+
|
13
|
+
# 初始化键值model
|
14
|
+
Statlysis::Top.new('', :test => true).pattern_table_and_model 'st_single_kvs'
|
15
|
+
Statlysis::Top.new('', :test => true).pattern_table_and_model 'st_single_kv_histories'
|
16
|
+
|
17
|
+
# 日常count
|
18
|
+
EoeLog.class # preload EoeLogTest
|
19
|
+
@log_model = IS_DEVELOP ? EoeLogTest : EoeLog
|
20
|
+
hourly @log_model, :t
|
21
|
+
daily @log_model, :t
|
22
|
+
daily @log_model.where(:ui => 0), :t
|
23
|
+
daily @log_model.where(:ui => {"$ne" => 0}), :t
|
24
|
+
|
25
|
+
# 统计各个模块
|
26
|
+
daily @log_model.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}), :t
|
27
|
+
[:www, :code, :skill, :book, :edu, :news, :wiki, :salon, :android].each do |site|
|
28
|
+
daily @log_model.where(:do => DOMAINS_HASH[site]), :t
|
29
|
+
end
|
30
|
+
```
|
31
|
+
|
32
|
+
TODO
|
33
|
+
-----------------------------------------------
|
34
|
+
1. Admin interface
|
35
|
+
2. statistical query api in Ruby and HTTP
|
36
|
+
3. Interacting with Javascript charting library, e.g. Highcharts, D3.
|
37
|
+
4. Add namespace to DSL, like rake
|
38
|
+
5. More tests
|
39
|
+
|
40
|
+
|
41
|
+
Copyright
|
42
|
+
-----------------------------------------------
|
43
|
+
MIT. David Chen at eoe.cn.
|
data/Rakefile
ADDED
data/lib/statlysis.rb
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# Sequel的操作均需通过Symbol
|
3
|
+
#
|
4
|
+
# 删除匹配的统计表
|
5
|
+
# Statlysis.sequel.tables.select {|i| i.to_s.match(//i) }.each {|i| Statlysis.sequel.drop_table i }
|
6
|
+
|
7
|
+
require "active_support/all"
|
8
|
+
require 'active_support/core_ext/module/attribute_accessors.rb'
|
9
|
+
require 'active_record'
|
10
|
+
require 'rails'
|
11
|
+
%w[yaml sequel only_one_rake mongoid].map(&method(:require))
|
12
|
+
|
13
|
+
module Statlysis
  # Time units known to the library.
  Units = %w[hour day week month year]
  # Base options handed to Sequel's create_table.
  DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}

  # Binds +cron+ to a statistics table and defines a top-level Sequel::Model for it.
  #
  # The table name is +tablename+ when given; otherwise +cron.stat_table_name+,
  # and finally +cron.stat_table.first_source_table+.
  #
  # A class named after the singularized, camelized table name is opened at the
  # top level through +eval+. Its +[]+ class method finds or creates the row whose
  # +pattern+ equals the given id and returns its JSON-decoded +result+, or +[]+
  # when anything in that lookup raises.
  #
  # NOTE(review): the class source is built by string interpolation and eval'd,
  # so the table name must only ever come from trusted code or configuration.
  #
  # Assigns +cron.stat_table+ and +cron.stat_model+; returns the model class.
  def self.setup_stat_table_and_model cron, tablename = nil
    resolved = tablename.nil? ? cron.stat_table_name : tablename
    resolved ||= cron.stat_table.first_source_table
    cron.stat_table = Statlysis.sequel[resolved.to_sym]

    model_name = resolved.to_s.singularize.camelize
    eval("class ::#{model_name} < Sequel::Model;
      self.set_dataset :#{resolved}
      def self.[] item_id
        JSON.parse(find_or_create(:pattern => item_id).result) rescue []
      end
    end; ")
    cron.stat_model = model_name.constantize
  end

end
|
33
|
+
|
34
|
+
require 'statlysis/common'
|
35
|
+
require 'statlysis/timeseries'
|
36
|
+
require 'statlysis/clock'
|
37
|
+
require 'statlysis/rake'
|
38
|
+
require 'statlysis/cron'
|
39
|
+
require 'statlysis/similar'
|
40
|
+
|
41
|
+
module Statlysis
|
42
|
+
# Module-level configuration shared by the whole DSL.
mattr_accessor :sequel, :default_time_columns, :database_opts, :tablename_default_pre

# One registry (initially an empty Array) per time unit and per special cron kind:
# hour_crons, day_crons, ..., realtime_crons, similar_crons, hotest_crons.
(Units + %w[realtime similar hotest]).each do |prefix|
  registry = "#{prefix}_crons"
  mattr_accessor registry.to_sym
  send "#{registry}=", []
end
# TODO: keep the *_crons registries unique (do not re-add an existing cron).

# Make the DSL methods defined in this module callable on the module itself.
extend self
|
50
|
+
|
51
|
+
# 会在自动拼接统计数据库表名时去除这些时间字段
|
52
|
+
# Registers additional time column names.
#
# The list starts as [:created_at, :updated_at] the first time it is needed;
# the given columns are appended and duplicates are removed.
# Returns the resulting list.
def update_time_columns *columns
  self.default_time_columns ||= [:created_at, :updated_at]
  self.default_time_columns = default_time_columns.concat(columns).uniq
end
|
57
|
+
|
58
|
+
# Connects Sequel to the statistics database, creating the database if needed.
#
# @param sym_or_hash [Symbol, Hash]
#   * Symbol: name of a section in the Rails application's config/database.yml.
#   * Hash: connection options used as-is. The database name is read with the
#     string key 'database'.
# @raise [ArgumentError] when the argument is neither a Symbol nor a Hash, or
#   when no options could be found for it.
#
# Stores the options in +database_opts+ and the connection in +sequel+.
def set_database sym_or_hash
  # The original `elsif Hash` tested the constant (always truthy), so invalid
  # arguments were silently accepted; match on the argument's class instead.
  opts = case sym_or_hash
         when Symbol
           YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
         when Hash
           sym_or_hash
         else
           raise ArgumentError, "Statlysis#set_database only support symbol or hash params"
         end
  raise ArgumentError, "Statlysis#set_database found no database options for #{sym_or_hash.inspect}" if opts.nil?

  self.database_opts = opts
  # Connect without selecting a database first, so that it can be created below.
  self.sequel = Sequel.connect opts.except('database')
  self.sequel.execute("CREATE DATABASE IF NOT EXISTS #{opts['database']} DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;")
  self.sequel.use opts['database']
  # Statlysis.sequel.tables.map {|t| eval "class ::#{t.to_s.camelize} < ActiveRecord::Base; self.establish_connection Statlysis.database_opts; self.table_name = :#{t}; end; #{t.to_s.camelize}" }
end
|
71
|
+
|
72
|
+
# Stores the default table-name prefix, converted to a String via #to_s.
def set_tablename_default_pre str
  prefix = str.to_s
  self.tablename_default_pre = prefix
end
|
75
|
+
|
76
|
+
# Registers a per-day count on +source+, bucketed by +time_column+
# (defaults to :created_at). Delegates to #timely.
def daily source, time_column = :created_at
  timely source, :time_unit => :day, :time_column => time_column
end
|
77
|
+
# Registers a per-hour count on +source+, bucketed by +time_column+
# (defaults to :created_at). Delegates to #timely.
def hourly source, time_column = :created_at
  timely source, :time_unit => :hour, :time_column => time_column
end
|
78
|
+
|
79
|
+
# Guard used by the DSL methods: raises unless a Sequel connection
# has been assigned to +sequel+.
def check_set_database
  return unless sequel.nil?
  raise "Please setup database first"
end
|
80
|
+
|
81
|
+
# Registers a Timely cron for +source+ in the registry of its time unit.
#
# @param source the data source handed to Timely.new unchanged
# @param opts [Hash] options; :time_column defaults to :created_at and
#   :time_unit defaults to :day. The caller's hash is left untouched.
# @return the `<time_unit>_crons` registry after the new cron was appended
# @raise [RuntimeError] via #check_set_database when no database is configured
# @raise [NoMethodError] when no `<time_unit>_crons` registry exists
def timely source, opts
  check_set_database
  # Non-mutating merge: do not write defaults back into the caller's hash.
  opts = opts.reverse_merge :time_column => :created_at, :time_unit => :day
  t = Timely.new source, opts
  # Look the registry up by name instead of eval'ing an interpolated string.
  send("#{opts[:time_unit]}_crons").push t
end
|
87
|
+
|
88
|
+
# The real requirement is to compute the latest items grouped by special patterns, such as user_id, URL prefix, ...
|
89
|
+
# Registers a LastestVisits cron for +source+ in +realtime_crons+.
#
# @param source the data source handed to LastestVisits.new unchanged
# @param opts [Hash] options; :time_column defaults to :created_at.
#   The caller's hash is left untouched.
# @return the +realtime_crons+ registry after the new cron was appended
# @raise [RuntimeError] via #check_set_database when no database is configured
def lastest_visits source, opts
  check_set_database
  # Non-mutating merge: do not write defaults back into the caller's hash.
  opts = opts.reverse_merge :time_column => :created_at
  self.realtime_crons.push LastestVisits.new(source, opts)
end
|
94
|
+
|
95
|
+
# TODO 为什么一层proc的话会直接执行的
|
96
|
+
# Registers a HotestItems cron under +key+ in +hotest_crons+.
#
# HotestItems receives a proc that, when called, returns a second proc.
# Calling that inner proc yields the id => (score, time) hash:
# * with a block: a fresh Hash is passed to the block to be filled, then returned;
# * without a block: the +id_to_score_and_time_hash+ argument is returned.
#
# NOTE(review): the two-level proc is kept as-is; how HotestItems unwraps it
# is not visible here.
def hotest_items key, id_to_score_and_time_hash = {}
  deferred = proc do
    next(proc { id_to_score_and_time_hash }) unless block_given?

    proc do
      id_to_score_and_time_hash = {}
      yield id_to_score_and_time_hash
      id_to_score_and_time_hash
    end
  end

  self.hotest_crons.push HotestItems.new(key, deferred)
end
|
109
|
+
|
110
|
+
# TODO support mongoid
|
111
|
+
# Registers a Similar cron for +model_name+ in +similar_crons+.
#
# Similar receives a proc producing the id => text hash:
# * with a block: a fresh Hash whose missing keys default to "" (and are stored)
#   is passed to the block to be filled, then returned;
# * without a block: the +id_to_text_hash+ argument is returned.
def similar_items model_name, id_to_text_hash = {}
  text_source = proc { id_to_text_hash }
  if block_given?
    text_source = proc do
      id_to_text_hash = Hash.new {|hash, key| hash[key] = "" }
      yield id_to_text_hash
      id_to_text_hash
    end
  end

  self.similar_crons.push Similar.new(model_name, text_source)
end
|
124
|
+
|
125
|
+
end
|
126
|
+
|
127
|
+
|
128
|
+
module Statlysis
  # When Rails is loaded, hook the gem's rake tasks into the application by
  # loading statlysis/rake.rb (resolved relative to this file).
  if defined? Rails
    class Railtie < Rails::Railtie
      rake_tasks { load File.expand_path('../statlysis/rake.rb', __FILE__) }
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
  # 1970-01-01 converted with #in_time_zone. Clock#update treats a time equal
  # to this value as "use DateTime.now instead".
  DateTime1970 = DateTime.parse("19700101").in_time_zone

  # A persisted timestamp per +feature+, stored in the "<prefix>_clocks" table
  # (one row per feature, unique index on :feature).
  class Clock
    include Common
    attr_accessor :clock

    # Ensures the clocks table and its model exist, loads (or creates) the row
    # for +feature+, and seeds its time with +default_time+ when it has none.
    #
    # Raises when +default_time+ is nil or false.
    def initialize feature, default_time
      raise "Please assign default_time params" unless default_time

      self.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")
      if !Statlysis.sequel.table_exists?(stat_table_name)
        table_opts = DefaultTableOpts.merge(:engine => "InnoDB")
        Statlysis.sequel.create_table(stat_table_name, table_opts) do
          primary_key :id
          String :feature
          DateTime :t
          index :feature, :unique => true
        end
      end

      Statlysis.setup_stat_table_and_model self
      self.clock = stat_model.find_or_create(:feature => feature)
      clock.update :t => default_time if current.nil?
      self
    end

    # Moves the clock to +time+ (or to DateTime.now when +time+ equals
    # DateTime1970). Returns false without writing when +time+ is earlier
    # than the stored value.
    def update time
      moment = (time == DateTime1970) ? DateTime.now : time
      stale = moment && moment < current
      return false if stale
      clock.update :t => moment
    end

    # The stored time of this clock's row.
    def current
      clock.t
    end
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
  # Behaviour shared by the cron-like classes of this gem.
  module Common
    attr_accessor :stat_table_name, :stat_model, :stat_table

    # Ensures a pattern/result table named after the pluralized +tn+ exists
    # (columns: id, indexed +pattern+, text +result+), then defines its model
    # through Statlysis.setup_stat_table_and_model and stores it in +stat_model+.
    def pattern_table_and_model tn
      table = tn.pluralize

      if !Statlysis.sequel.table_exists?(table)
        table_opts = DefaultTableOpts.merge(:engine => "InnoDB")
        Statlysis.sequel.create_table(table, table_opts) do
          primary_key :id
          String :pattern
          index :pattern
        end
        # The text column is added after the table has been created.
        Statlysis.sequel.add_column table, :result, String, :text => true
      end

      self.stat_model = Statlysis.setup_stat_table_and_model(self, table)
    end

    # The including object itself; lets code address the receiver as `cron`.
    def cron
      self
    end

    # TODO: remove this puts override (it conflicts with user code); use a logger.
    # Prints to $stdout only when the DEBUG environment variable is set.
    def puts(*strs)
      $stdout.puts(*strs) if ENV['DEBUG']
    end

  end
end
|