statlysis 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +51 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +110 -0
- data/LICENSE.txt +20 -0
- data/README.markdown +43 -0
- data/Rakefile +11 -0
- data/lib/statlysis.rb +134 -0
- data/lib/statlysis/clock.rb +36 -0
- data/lib/statlysis/common.rb +27 -0
- data/lib/statlysis/configuration.rb +10 -0
- data/lib/statlysis/cron.rb +86 -0
- data/lib/statlysis/cron/count.rb +93 -0
- data/lib/statlysis/cron/top.rb +154 -0
- data/lib/statlysis/formula.rb +6 -0
- data/lib/statlysis/javascript/count.rb +37 -0
- data/lib/statlysis/map_reduce.rb +32 -0
- data/lib/statlysis/rake.rb +28 -0
- data/lib/statlysis/results.rb +17 -0
- data/lib/statlysis/similar.rb +89 -0
- data/lib/statlysis/timeseries.rb +41 -0
- data/statlysis.gemspec +30 -0
- data/test/helper.rb +17 -0
- data/test/models/company.rb +12 -0
- data/test/models/employee.rb +14 -0
- data/test/test_mapreduce.rb +26 -0
- data/test/test_statlysis.rb +76 -0
- data/test/test_timeseries.rb +6 -0
- metadata +216 -0
data/.document
ADDED
data/.gitignore
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# rcov generated
|
2
|
+
coverage
|
3
|
+
coverage.data
|
4
|
+
|
5
|
+
# rdoc generated
|
6
|
+
rdoc
|
7
|
+
|
8
|
+
# yard generated
|
9
|
+
doc
|
10
|
+
.yardoc
|
11
|
+
|
12
|
+
# bundler
|
13
|
+
.bundle
|
14
|
+
|
15
|
+
# jeweler generated
|
16
|
+
pkg
|
17
|
+
|
18
|
+
# Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
|
19
|
+
#
|
20
|
+
# * Create a file at ~/.gitignore
|
21
|
+
# * Include files you want ignored
|
22
|
+
# * Run: git config --global core.excludesfile ~/.gitignore
|
23
|
+
#
|
24
|
+
# After doing this, these files will be ignored in all your git projects,
|
25
|
+
# saving you from having to 'pollute' every project you touch with them
|
26
|
+
#
|
27
|
+
# Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # from the line.)
|
28
|
+
#
|
29
|
+
# For MacOS:
|
30
|
+
#
|
31
|
+
.DS_Store
|
32
|
+
|
33
|
+
# For TextMate
|
34
|
+
*.tmproj
|
35
|
+
tmtags
|
36
|
+
|
37
|
+
# For emacs:
|
38
|
+
*~
|
39
|
+
\#*
|
40
|
+
.\#*
|
41
|
+
|
42
|
+
# For vim:
|
43
|
+
*.swp
|
44
|
+
|
45
|
+
# For redcar:
|
46
|
+
.redcar
|
47
|
+
|
48
|
+
# For rubinius:
|
49
|
+
*.rbc
|
50
|
+
|
51
|
+
coverage
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
statlysis (0.0.1)
|
5
|
+
activerecord
|
6
|
+
activerecord_idnamecache
|
7
|
+
activesupport
|
8
|
+
mongoid
|
9
|
+
mysql2
|
10
|
+
only_one_rake
|
11
|
+
rails
|
12
|
+
rake
|
13
|
+
sequel
|
14
|
+
|
15
|
+
GEM
|
16
|
+
remote: http://rubygems.org/
|
17
|
+
specs:
|
18
|
+
actionmailer (3.2.13)
|
19
|
+
actionpack (= 3.2.13)
|
20
|
+
mail (~> 2.5.3)
|
21
|
+
actionpack (3.2.13)
|
22
|
+
activemodel (= 3.2.13)
|
23
|
+
activesupport (= 3.2.13)
|
24
|
+
builder (~> 3.0.0)
|
25
|
+
erubis (~> 2.7.0)
|
26
|
+
journey (~> 1.0.4)
|
27
|
+
rack (~> 1.4.5)
|
28
|
+
rack-cache (~> 1.2)
|
29
|
+
rack-test (~> 0.6.1)
|
30
|
+
sprockets (~> 2.2.1)
|
31
|
+
activemodel (3.2.13)
|
32
|
+
activesupport (= 3.2.13)
|
33
|
+
builder (~> 3.0.0)
|
34
|
+
activerecord (3.2.13)
|
35
|
+
activemodel (= 3.2.13)
|
36
|
+
activesupport (= 3.2.13)
|
37
|
+
arel (~> 3.0.2)
|
38
|
+
tzinfo (~> 0.3.29)
|
39
|
+
activerecord_idnamecache (0.1)
|
40
|
+
activeresource (3.2.13)
|
41
|
+
activemodel (= 3.2.13)
|
42
|
+
activesupport (= 3.2.13)
|
43
|
+
activesupport (3.2.13)
|
44
|
+
i18n (= 0.6.1)
|
45
|
+
multi_json (~> 1.0)
|
46
|
+
arel (3.0.2)
|
47
|
+
builder (3.0.4)
|
48
|
+
erubis (2.7.0)
|
49
|
+
hike (1.2.3)
|
50
|
+
i18n (0.6.1)
|
51
|
+
journey (1.0.4)
|
52
|
+
json (1.8.0)
|
53
|
+
mail (2.5.4)
|
54
|
+
mime-types (~> 1.16)
|
55
|
+
treetop (~> 1.4.8)
|
56
|
+
mime-types (1.23)
|
57
|
+
mongoid (3.1.4)
|
58
|
+
activemodel (~> 3.2)
|
59
|
+
moped (~> 1.4)
|
60
|
+
origin (~> 1.0)
|
61
|
+
tzinfo (~> 0.3.22)
|
62
|
+
moped (1.5.0)
|
63
|
+
multi_json (1.7.7)
|
64
|
+
mysql2 (0.3.11)
|
65
|
+
only_one_rake (0.0.4)
|
66
|
+
origin (1.1.0)
|
67
|
+
polyglot (0.3.3)
|
68
|
+
rack (1.4.5)
|
69
|
+
rack-cache (1.2)
|
70
|
+
rack (>= 0.4)
|
71
|
+
rack-ssl (1.3.3)
|
72
|
+
rack
|
73
|
+
rack-test (0.6.2)
|
74
|
+
rack (>= 1.0)
|
75
|
+
rails (3.2.13)
|
76
|
+
actionmailer (= 3.2.13)
|
77
|
+
actionpack (= 3.2.13)
|
78
|
+
activerecord (= 3.2.13)
|
79
|
+
activeresource (= 3.2.13)
|
80
|
+
activesupport (= 3.2.13)
|
81
|
+
bundler (~> 1.0)
|
82
|
+
railties (= 3.2.13)
|
83
|
+
railties (3.2.13)
|
84
|
+
actionpack (= 3.2.13)
|
85
|
+
activesupport (= 3.2.13)
|
86
|
+
rack-ssl (~> 1.3.2)
|
87
|
+
rake (>= 0.8.7)
|
88
|
+
rdoc (~> 3.4)
|
89
|
+
thor (>= 0.14.6, < 2.0)
|
90
|
+
rake (10.1.0)
|
91
|
+
rdoc (3.12.2)
|
92
|
+
json (~> 1.4)
|
93
|
+
sequel (4.0.0)
|
94
|
+
sprockets (2.2.2)
|
95
|
+
hike (~> 1.2)
|
96
|
+
multi_json (~> 1.0)
|
97
|
+
rack (~> 1.0)
|
98
|
+
tilt (~> 1.1, != 1.3.0)
|
99
|
+
thor (0.18.1)
|
100
|
+
tilt (1.4.1)
|
101
|
+
treetop (1.4.14)
|
102
|
+
polyglot
|
103
|
+
polyglot (>= 0.3.1)
|
104
|
+
tzinfo (0.3.37)
|
105
|
+
|
106
|
+
PLATFORMS
|
107
|
+
ruby
|
108
|
+
|
109
|
+
DEPENDENCIES
|
110
|
+
statlysis!
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012 David Chen at eoe.cn
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
statlysis
|
2
|
+
===============================================
|
3
|
+
Statistical analysis in a Ruby DSL.
|
4
|
+
|
5
|
+
Usage
|
6
|
+
-----------------------------------------------
|
7
|
+
```ruby
|
8
|
+
module Statlysis
|
9
|
+
set_database :statlysis
|
10
|
+
update_time_columns :t
|
11
|
+
set_tablename_default_pre :st
|
12
|
+
|
13
|
+
# 初始化键值model
|
14
|
+
Statlysis::Top.new('', :test => true).pattern_table_and_model 'st_single_kvs'
|
15
|
+
Statlysis::Top.new('', :test => true).pattern_table_and_model 'st_single_kv_histories'
|
16
|
+
|
17
|
+
# 日常count
|
18
|
+
EoeLog.class # preload EoeLogTest
|
19
|
+
@log_model = IS_DEVELOP ? EoeLogTest : EoeLog
|
20
|
+
hourly @log_model, :t
|
21
|
+
daily @log_model, :t
|
22
|
+
daily @log_model.where(:ui => 0), :t
|
23
|
+
daily @log_model.where(:ui => {"$ne" => 0}), :t
|
24
|
+
|
25
|
+
# 统计各个模块
|
26
|
+
daily @log_model.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}), :t
|
27
|
+
[:www, :code, :skill, :book, :edu, :news, :wiki, :salon, :android].each do |site|
|
28
|
+
daily @log_model.where(:do => DOMAINS_HASH[site]), :t
|
29
|
+
end
|
30
|
+
```
|
31
|
+
|
32
|
+
TODO
|
33
|
+
-----------------------------------------------
|
34
|
+
1. Admin interface
|
35
|
+
2. Statistical query API in Ruby and HTTP
|
36
|
+
3. Interacting with Javascript charting library, e.g. Highcharts, D3.
|
37
|
+
4. Add namespace to DSL, like rake
|
38
|
+
5. More tests
|
39
|
+
|
40
|
+
|
41
|
+
Copyright
|
42
|
+
-----------------------------------------------
|
43
|
+
MIT. David Chen at eoe.cn.
|
data/Rakefile
ADDED
data/lib/statlysis.rb
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# Sequel的操作均需通过Symbol
|
3
|
+
#
|
4
|
+
# 删除匹配的统计表
|
5
|
+
# Statlysis.sequel.tables.select {|i| i.to_s.match(//i) }.each {|i| Statlysis.sequel.drop_table i }
|
6
|
+
|
7
|
+
require "active_support/all"
|
8
|
+
require 'active_support/core_ext/module/attribute_accessors.rb'
|
9
|
+
require 'active_record'
|
10
|
+
require 'rails'
|
11
|
+
%w[yaml sequel only_one_rake mongoid].map(&method(:require))
|
12
|
+
|
13
|
+
# Shared constants and the table/model bootstrap used by the Statlysis crons.
module Statlysis
  # Time granularities known to the library.
  # Frozen so the shared list cannot be modified by accident.
  Units = %w[hour day week month year].freeze

  # Default options passed to Sequel when a statistics table is created.
  # Frozen; callers derive variants with Hash#merge, which returns a copy.
  DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}.freeze

  # Binds +cron+ to its statistics table and defines a matching Sequel model.
  #
  # Sets +cron.stat_table+ to the Sequel dataset for the table and
  # +cron.stat_model+ to a top-level Sequel::Model subclass named after the
  # singularized, camelized table name. The generated class overrides +.[]+:
  # it finds or creates the row whose +pattern+ equals the given id, parses
  # that row's +result+ column as JSON, and returns +[]+ if anything raises.
  #
  # @param cron [Object] object exposing stat_table_name, stat_table= and stat_model=
  # @param tablename [String, Symbol, nil] table to bind; when nil it falls back
  #   to cron.stat_table_name, then to cron.stat_table.first_source_table
  # @return [Class] the model class that was assigned to cron.stat_model
  def self.setup_stat_table_and_model(cron, tablename = nil)
    tablename = cron.stat_table_name if tablename.nil?
    tablename ||= cron.stat_table.first_source_table
    cron.stat_table = Statlysis.sequel[tablename.to_sym]

    model_name = tablename.to_s.singularize.camelize
    # NOTE(review): the class is built through a string eval, so +tablename+
    # must come from trusted code or configuration, never from user input.
    eval("class ::#{model_name} < Sequel::Model;
      self.set_dataset :#{tablename}
      def self.[] item_id
        JSON.parse(find_or_create(:pattern => item_id).result) rescue []
      end
    end; ")
    cron.stat_model = model_name.constantize
  end
end
|
33
|
+
|
34
|
+
require 'statlysis/common'
|
35
|
+
require 'statlysis/timeseries'
|
36
|
+
require 'statlysis/clock'
|
37
|
+
require 'statlysis/rake'
|
38
|
+
require 'statlysis/cron'
|
39
|
+
require 'statlysis/similar'
|
40
|
+
|
41
|
+
module Statlysis
|
42
|
+
# Module-level configuration shared by the whole DSL.
mattr_accessor :sequel, :default_time_columns, :database_opts, :tablename_default_pre

# One registry per time unit (hour_crons, day_crons, ...), each starting empty.
Units.each do |unit|
  registry = :"#{unit}_crons"
  mattr_accessor registry
  send "#{registry}=", []
end

# Registries for the remaining cron kinds, also starting empty.
%i[realtime similar hotest].each do |kind|
  registry = "#{kind}_crons".to_sym
  mattr_accessor registry
  send "#{registry}=", []
end

# TODO: keep the *_crons registries unique so the same cron is not added twice.
# Make the instance methods defined in this module callable on the module itself.
extend self
|
50
|
+
|
51
|
+
# Registers additional time columns on top of the defaults
# (:created_at and :updated_at).
# Per the original author's note, these time fields are removed when a
# statistics table name is assembled automatically.
#
# @param columns [Array<Symbol>] extra time column names
# @return [Array] the de-duplicated list now stored in default_time_columns
def update_time_columns(*columns)
  self.default_time_columns ||= [:created_at, :updated_at]
  self.default_time_columns.concat columns
  self.default_time_columns = default_time_columns.uniq
end
|
57
|
+
|
58
|
+
# Connects Statlysis to its statistics database.
#
# A Symbol selects the entry of that name from the Rails application's
# config/database.yml; a Hash is used as the connection options directly.
# The options are read with string keys: the connection is first opened
# without the 'database' entry, the database is created if it does not exist
# yet, and then it is selected with +use+.
#
# @param sym_or_hash [Symbol, Hash] database.yml entry name or connection options
# @raise [ArgumentError] when the argument is neither a Symbol nor a Hash
#   (previously this branch was unreachable because the check was +elsif Hash+,
#   which is always truthy)
# @return whatever the connection's +use+ call returns
def set_database(sym_or_hash)
  self.database_opts =
    case sym_or_hash
    when Symbol
      YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
    when Hash
      sym_or_hash
    else
      raise ArgumentError, "Statlysis#set_database only support symbol or hash params"
    end

  self.sequel = Sequel.connect self.database_opts.except('database')
  # NOTE(review): the database name is interpolated into the statement unquoted;
  # it is expected to come from trusted configuration.
  self.sequel.execute("CREATE DATABASE IF NOT EXISTS #{self.database_opts['database']} DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;")
  self.sequel.use self.database_opts['database']
  # Statlysis.sequel.tables.map {|t| eval "class ::#{t.to_s.camelize} < ActiveRecord::Base; self.establish_connection Statlysis.database_opts; self.table_name = :#{t}; end; #{t.to_s.camelize}" }
end
|
71
|
+
|
72
|
+
# Stores the default table-name prefix, coerced to a String.
#
# @param str [#to_s] the prefix
# @return [String] the stored prefix
def set_tablename_default_pre(str)
  prefix = str.to_s
  self.tablename_default_pre = prefix
end
|
75
|
+
|
76
|
+
# Shortcut for +timely+ with a :day time unit.
#
# @param source the data source handed through to +timely+
# @param time_column [Symbol] column holding the record time
def daily(source, time_column = :created_at)
  timely source, :time_unit => :day, :time_column => time_column
end
|
77
|
+
# Shortcut for +timely+ with an :hour time unit.
#
# @param source the data source handed through to +timely+
# @param time_column [Symbol] column holding the record time
def hourly(source, time_column = :created_at)
  timely source, :time_unit => :hour, :time_column => time_column
end
|
78
|
+
|
79
|
+
# Guard used by the DSL methods: fails unless a Sequel connection has been set.
#
# @raise [RuntimeError] when +sequel+ is nil
# @return [nil]
def check_set_database
  return unless sequel.nil?

  raise "Please setup database first"
end
|
80
|
+
|
81
|
+
# Registers a Timely cron for +source+ in the registry of its time unit.
#
# Missing options default to :time_column => :created_at and
# :time_unit => :day. The defaults are merged into a copy, so the caller's
# hash is left untouched (and may be frozen).
#
# @param source the data source handed to Timely.new
# @param opts [Hash] options; :time_unit selects the "<unit>_crons" registry
# @raise [RuntimeError] when no database has been set up
# @raise [NoMethodError] when there is no registry for the given :time_unit
# @return [Array] the registry the new cron was appended to
def timely(source, opts)
  check_set_database
  opts = opts.reverse_merge :time_column => :created_at, :time_unit => :day
  job = Timely.new source, opts
  # Plain dynamic dispatch replaces the former string module_eval.
  public_send("#{opts[:time_unit]}_crons").push job
end
|
87
|
+
|
88
|
+
# The real requirement is to compute the latest items grouped by specific
# patterns, such as user_id or a URL prefix.
#
# Registers a LastestVisits cron for +source+ in +realtime_crons+.
# :time_column defaults to :created_at; the default is merged into a copy, so
# the caller's hash is left untouched (and may be frozen).
#
# @param source the data source handed to LastestVisits.new
# @param opts [Hash] options passed on to the cron
# @raise [RuntimeError] when no database has been set up
# @return [Array] the realtime_crons registry
def lastest_visits(source, opts)
  check_set_database
  opts = opts.reverse_merge :time_column => :created_at
  realtime_crons.push LastestVisits.new(source, opts)
end
|
94
|
+
|
95
|
+
# TODO: find out why a single-level proc would be executed immediately
# (which is why the loader below is a proc that returns another proc).
#
# Registers a HotestItems cron under +key+ in +hotest_crons+.
#
# The cron receives a proc which, when called, returns a second proc. With a
# block given, each call of that second proc builds a fresh Hash, yields it to
# the block so it can be filled, and returns it. Without a block it returns
# +id_to_score_and_time_hash+ as passed in.
#
# @param key passed through to HotestItems.new
# @param id_to_score_and_time_hash [Hash] static data used when no block is given
# @return [Array] the hotest_crons registry
def hotest_items(key, id_to_score_and_time_hash = {})
  loader_factory = proc do
    if block_given?
      proc do
        id_to_score_and_time_hash = {}
        yield id_to_score_and_time_hash
        id_to_score_and_time_hash
      end
    else
      proc { id_to_score_and_time_hash }
    end
  end

  hotest_crons.push HotestItems.new(key, loader_factory)
end
|
109
|
+
|
110
|
+
# TODO: support mongoid
#
# Registers a Similar cron for +model_name+ in +similar_crons+.
#
# The cron receives a proc that produces the id => text hash. With a block
# given, each call builds a Hash whose missing keys default to "" (stored on
# first access), yields it to the block so it can be filled, and returns it.
# Without a block the proc returns +id_to_text_hash+ as passed in.
#
# @param model_name passed through to Similar.new
# @param id_to_text_hash [Hash] static data used when no block is given
# @return [Array] the similar_crons registry
def similar_items(model_name, id_to_text_hash = {})
  text_loader =
    if block_given?
      proc do
        id_to_text_hash = Hash.new { |texts, id| texts[id] = "" }
        yield id_to_text_hash
        id_to_text_hash
      end
    else
      proc { id_to_text_hash }
    end

  similar_crons.push Similar.new(model_name, text_loader)
end
|
124
|
+
|
125
|
+
end
|
126
|
+
|
127
|
+
|
128
|
+
module Statlysis
  # Only defined when Rails is loaded: hooks this gem's rake tasks into the
  # host application's rake task loading.
  if defined? Rails
    class Railtie < Rails::Railtie
      rake_tasks { load File.expand_path('../statlysis/rake.rb', __FILE__) }
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
  # Epoch sentinel: Clock#update substitutes the current time when handed this value.
  DateTime1970 = DateTime.parse("19700101").in_time_zone

  # Persists one timestamp per named feature in the "<prefix>_clocks" table
  # (just "clocks" when no default prefix is configured).
  class Clock
    include Common
    attr_accessor :clock

    # Ensures the clocks table exists, then loads or creates the row for
    # +feature+ and seeds its time with +default_time+ when it has none yet.
    #
    # @param feature name identifying this clock's row
    # @param default_time initial time for a row without one; must be truthy
    # @raise [RuntimeError] when default_time is nil or false
    def initialize(feature, default_time)
      raise "Please assign default_time params" unless default_time

      cron.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")

      database = Statlysis.sequel
      if !database.table_exists?(cron.stat_table_name)
        database.create_table(cron.stat_table_name, DefaultTableOpts.merge(:engine => "InnoDB")) do
          primary_key :id
          String :feature
          DateTime :t
          index :feature, :unique => true
        end
      end

      Statlysis.setup_stat_table_and_model cron
      cron.clock = cron.stat_model.find_or_create(:feature => feature)
      cron.clock.update(:t => default_time) if cron.current.nil?
      cron
    end

    # Moves the stored time to +time+, refusing to move it backwards.
    #
    # @param time new time; DateTime1970 is replaced with the current time
    # @return [false] when +time+ is earlier than the stored time, otherwise
    #   the result of updating the row
    def update(time)
      time = DateTime.now if time == DateTime1970
      is_stale = time && (time < cron.current)
      return false if is_stale

      cron.clock.update(:t => time)
    end

    # @return the time currently stored in this clock's row
    def current
      cron.clock.t
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
  # Helpers mixed into the cron-like classes: bookkeeping for the statistics
  # table and model, plus debug-only output.
  module Common
    attr_accessor :stat_table_name, :stat_model, :stat_table

    # Ensures the pattern/result table exists and binds a model to it.
    #
    # The name is pluralized; a missing table is created with an +id+ primary
    # key and an indexed +pattern+ column, after which a text +result+ column
    # is added.
    #
    # @param tn [String, Symbol] table name; Symbols are accepted as well
    #   (previously only objects responding to +pluralize+ worked)
    # @return the model returned by Statlysis.setup_stat_table_and_model, which
    #   is also stored in stat_model
    def pattern_table_and_model(tn)
      # ensure statlysis table
      tn = tn.to_s.pluralize
      unless Statlysis.sequel.table_exists?(tn)
        Statlysis.sequel.create_table tn, DefaultTableOpts.merge(:engine => "InnoDB") do
          primary_key :id
          String :pattern
          index :pattern
        end
        Statlysis.sequel.add_column tn, :result, String, :text => true
      end

      # generate a statlysis model
      cron.stat_model = Statlysis.setup_stat_table_and_model cron, tn
    end

    # @return [self] the including object itself
    def cron
      self
    end

    # TODO: remove this +puts+ override (it can conflict with user code) and
    # use a logger instead.
    # Writes to $stdout only when the DEBUG environment variable is set.
    def puts(*strs)
      $stdout.puts(*strs) if ENV['DEBUG']
    end
  end
end
|