statlysis 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +18 -5
- data/lib/statlysis.rb +11 -13
- data/lib/statlysis/clock.rb +4 -0
- data/lib/statlysis/configuration.rb +56 -28
- data/lib/statlysis/constants.rb +12 -0
- data/lib/statlysis/cron.rb +4 -6
- data/lib/statlysis/cron/timely.rb +171 -0
- data/lib/statlysis/cron/timely/multiple_dimensions.rb +52 -0
- data/lib/statlysis/cron/timely/one_dimension.rb +60 -0
- data/lib/statlysis/cron_set.rb +4 -3
- data/lib/statlysis/javascript/count.rb +50 -15
- data/lib/statlysis/map_reduce.rb +1 -1
- data/lib/statlysis/utils.rb +6 -0
- data/statlysis.gemspec +3 -3
- data/test/config/database.yml +1 -2
- data/test/helper.rb +15 -4
- data/test/migrate/1_active_record.rb +1 -0
- data/test/models/code_gist.rb +12 -0
- data/test/models/eoe_log.rb +2 -4
- data/test/test_daily_count.rb +4 -2
- data/test/test_mapreduce.rb +8 -1
- metadata +8 -8
- data/lib/statlysis/cron/count.rb +0 -51
- data/lib/statlysis/cron/count/dimensions.rb +0 -7
- data/lib/statlysis/cron/count/timely.rb +0 -63
data/README.markdown
CHANGED
@@ -10,11 +10,13 @@ Usage
|
|
10
10
|
Statlysis.setup do
|
11
11
|
set_database :statlysis
|
12
12
|
|
13
|
-
|
13
|
+
daily CodeGist
|
14
|
+
hourly EoeLog, :time_column => :t # support custom time_column
|
15
|
+
|
14
16
|
[EoeLog,
|
15
|
-
EoeLog.where(:ui => 0),
|
17
|
+
EoeLog.where(:ui => 0), # support query scope
|
16
18
|
EoeLog.where(:ui => {"$ne" => 0}),
|
17
|
-
Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}),
|
19
|
+
Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}), # support collection name regexp
|
18
20
|
EoeLog.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}),
|
19
21
|
].each do |s|
|
20
22
|
daily s, :time_column => :t
|
@@ -45,8 +47,6 @@ TODO
|
|
45
47
|
* Admin interface
|
46
48
|
* statistical query api in Ruby and HTTP
|
47
49
|
* Interacting with Javascript charting library, e.g. Highcharts, D3.
|
48
|
-
* More tests
|
49
|
-
* Add @criteria to MultipleDataset
|
50
50
|
|
51
51
|
|
52
52
|
Statistical Process
|
@@ -68,6 +68,11 @@ Q: Why do you recommend using multiple collections to store logs rather than a s
|
|
68
68
|
A: MongoDB can effectively reuse space freed by removing entire collections without leading to data fragmentation. See details at http://docs.mongodb.org/manual/use-cases/storing-log-data/#multiple-collections-single-database
|
69
69
|
|
70
70
|
|
71
|
+
Q: In MongoDB, why use MapReduce instead of the aggregation pipeline?
|
72
|
+
|
73
|
+
A: The result of an aggregation pipeline is a document and is therefore subject to the BSON document size limit, which is currently 16 megabytes. See more details at http://docs.mongodb.org/manual/core/aggregation-pipeline/#pipeline
|
74
|
+
|
75
|
+
|
71
76
|
Copyright
|
72
77
|
-----------------------------------------------
|
73
78
|
MIT. David Chen at eoe.cn.
|
@@ -91,3 +96,11 @@ Related
|
|
91
96
|
|
92
97
|
### Admin interface
|
93
98
|
* http://three.kibana.org/ browser based analytics and search interface to Logstash and other timestamped data sets stored in ElasticSearch.
|
99
|
+
|
100
|
+
|
101
|
+
### ETL
|
102
|
+
* https://github.com/activewarehouse/activewarehouse-etl/
|
103
|
+
* http://jisraelsen.github.io/drudgery/ A Ruby ETL DSL that supports CSV, SQLite3, and ActiveRecord, but has no time-range support
|
104
|
+
* https://github.com/square/ETL Simply encapsulates the SQL procedures
|
105
|
+
|
106
|
+
|
data/lib/statlysis.rb
CHANGED
@@ -20,34 +20,33 @@ require 'activerecord_idnamecache'
|
|
20
20
|
module Rails; end
|
21
21
|
|
22
22
|
require 'statlysis/constants'
|
23
|
+
require 'statlysis/utils'
|
24
|
+
require 'statlysis/configuration'
|
25
|
+
require 'statlysis/common'
|
23
26
|
|
24
27
|
module Statlysis
|
25
28
|
class << self
|
26
29
|
def setup &blk
|
27
30
|
raise "Need to setup proc" if not blk
|
28
31
|
|
29
|
-
logger.info "Start to setup Statlysis"
|
32
|
+
logger.info "Start to setup Statlysis" if ENV['DEBUG']
|
30
33
|
time_log do
|
31
34
|
self.config.instance_exec(&blk)
|
32
35
|
end
|
33
|
-
logger.info
|
34
36
|
end
|
35
37
|
|
36
38
|
def time_log text = nil
|
37
39
|
t = Time.now
|
38
40
|
logger.info text if text
|
39
41
|
yield if block_given?
|
40
|
-
logger.info "Time spend #{(Time.now - t).round(2)} seconds."
|
41
|
-
logger.info "-" * 42
|
42
|
+
logger.info "Time spend #{(Time.now - t).round(2)} seconds." if ENV['DEBUG']
|
43
|
+
logger.info "-" * 42 if ENV['DEBUG']
|
42
44
|
end
|
43
45
|
|
44
46
|
# delegate config methods to Configuration
|
45
47
|
def config; Configuration.instance end
|
46
48
|
require 'active_support/core_ext/module/delegation.rb'
|
47
|
-
|
48
|
-
:default_time_zone,
|
49
|
-
:set_tablename_default_pre, :tablename_default_pre
|
50
|
-
].each do |sym|
|
49
|
+
Configuration::DelegateMethods.each do |sym|
|
51
50
|
delegate sym, :to => :config
|
52
51
|
end
|
53
52
|
|
@@ -56,18 +55,17 @@ module Statlysis
|
|
56
55
|
|
57
56
|
def source_to_database_type; @_source_to_database_type ||= {} end
|
58
57
|
|
59
|
-
|
58
|
+
# 代理访问 各个时间类型的 crons
|
60
59
|
def daily; CronSet.new(Statlysis.config.day_crons) end
|
61
60
|
def hourly; CronSet.new(Statlysis.config.hour_crons) end
|
61
|
+
def always; CronSet.new(Statlysis.config.always_crons) end
|
62
62
|
|
63
63
|
end
|
64
64
|
|
65
65
|
end
|
66
66
|
|
67
|
-
require 'statlysis/utils'
|
68
|
-
require 'statlysis/configuration'
|
69
|
-
require 'statlysis/common'
|
70
67
|
require 'statlysis/timeseries'
|
68
|
+
require 'statlysis/map_reduce'
|
71
69
|
require 'statlysis/clock'
|
72
70
|
require 'statlysis/rake'
|
73
71
|
require 'statlysis/cron'
|
@@ -77,7 +75,7 @@ require 'statlysis/multiple_dataset'
|
|
77
75
|
|
78
76
|
module Statlysis
|
79
77
|
require 'short_inspect'
|
80
|
-
ShortInspect.apply_to Cron,
|
78
|
+
ShortInspect.apply_to Cron, MultipleDataset
|
81
79
|
ShortInspect.apply_minimal_to ActiveRecord::Relation # lazy load
|
82
80
|
end
|
83
81
|
|
data/lib/statlysis/clock.rb
CHANGED
@@ -8,6 +8,8 @@ module Statlysis
|
|
8
8
|
# feature is a string
|
9
9
|
def initialize feature, default_time
|
10
10
|
raise "Please assign default_time params" if not default_time
|
11
|
+
|
12
|
+
# init table & model
|
11
13
|
cron.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")
|
12
14
|
unless Statlysis.sequel.table_exists?(cron.stat_table_name)
|
13
15
|
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts.merge(:engine => "InnoDB") do
|
@@ -19,6 +21,8 @@ module Statlysis
|
|
19
21
|
end
|
20
22
|
h = Utils.setup_pattern_table_and_model cron.stat_table_name
|
21
23
|
cron.stat_model = h[:model]
|
24
|
+
|
25
|
+
# init default_time
|
22
26
|
cron.clock = cron.stat_model.find_or_create(:feature => feature)
|
23
27
|
cron.clock.update :t => default_time if cron.current.nil?
|
24
28
|
cron
|
@@ -9,16 +9,21 @@ module Statlysis
|
|
9
9
|
class Configuration
|
10
10
|
include Singleton
|
11
11
|
|
12
|
+
# variables
|
12
13
|
attr_accessor :sequel, :default_time_columns, :default_time_zone, :database_opts, :tablename_default_pre
|
13
14
|
attr_accessor :is_skip_database_index
|
14
|
-
TimeUnits
|
15
|
-
|
16
|
-
sym = "#{sym}_crons"
|
17
|
-
attr_accessor sym; self.instance.send "#{sym}=", []
|
15
|
+
(TimeUnits + %W[always] + [:realtime, :similar, :hotest]).each do |unit|
|
16
|
+
sym = "#{unit}_crons"; attr_accessor sym; self.instance.send "#{sym}=", []
|
18
17
|
end
|
19
18
|
self.instance.send "tablename_default_pre=", "st"
|
20
19
|
self.instance.send "is_skip_database_index=", false
|
21
20
|
|
21
|
+
DelegateMethods = [
|
22
|
+
:sequel, :set_database, :check_set_database,
|
23
|
+
:default_time_zone,
|
24
|
+
:set_tablename_default_pre, :tablename_default_pre
|
25
|
+
]
|
26
|
+
|
22
27
|
# 会在自动拼接统计数据库表名时去除这些时间字段
|
23
28
|
def update_time_columns *columns
|
24
29
|
self.default_time_columns ||= [:created_at, :updated_at]
|
@@ -26,43 +31,44 @@ module Statlysis
|
|
26
31
|
self.default_time_columns = self.default_time_columns.uniq
|
27
32
|
end
|
28
33
|
|
29
|
-
def set_database
|
30
|
-
self.database_opts =
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
+
def set_database obj
|
35
|
+
self.database_opts = case obj
|
36
|
+
when Hash
|
37
|
+
obj
|
38
|
+
when Symbol, String
|
39
|
+
YAML.load_file(Rails.root.join("config/database.yml"))[Rails.env].merge('database' => obj.to_s)
|
40
|
+
else
|
41
|
+
raise "Statlysis#set_database only support symbol or hash params"
|
42
|
+
end
|
43
|
+
|
44
|
+
raise "database_opts should not be blank" if self.database_opts.blank?
|
45
|
+
|
46
|
+
# SQLite doesn't support creating databases in the regular MySQL style
|
47
|
+
self.sequel = if (self.database_opts['adapter'].match(/sqlite/) && self.database_opts['database'].match(/\A:memory:\Z/)) # only for test envrionment
|
48
|
+
Sequel.sqlite
|
34
49
|
else
|
35
|
-
|
50
|
+
# create database, copied from http://stackoverflow.com/a/14435522/595618
|
51
|
+
require 'mysql2'
|
52
|
+
mysql2_client = Mysql2::Client.new(self.database_opts.except('database'))
|
53
|
+
mysql2_client.query("CREATE DATABASE IF NOT EXISTS #{self.database_opts['database']}")
|
54
|
+
Sequel.connect(self.database_opts)
|
36
55
|
end
|
37
|
-
self.sequel = Sequel.connect(self.database_opts)
|
38
56
|
|
39
57
|
# 初始化键值model
|
40
58
|
["#{self.tablename_default_pre}_single_kvs", "#{self.tablename_default_pre}_single_kv_histories"].each do |tn|
|
41
59
|
Utils.setup_pattern_table_and_model tn
|
42
60
|
end
|
43
|
-
return self
|
44
|
-
end
|
45
61
|
|
46
|
-
def set_default_time_zone zone
|
47
|
-
self.default_time_zone = zone
|
48
62
|
return self
|
49
63
|
end
|
50
64
|
|
51
|
-
def
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
def daily source, opts = {}; timely source, {:time_unit => :day }.merge(opts) end
|
56
|
-
def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
|
57
|
-
|
65
|
+
def set_default_time_zone zone; self.default_time_zone = zone; return self; end
|
66
|
+
def set_tablename_default_pre str; self.tablename_default_pre = str.to_s; return self end
|
58
67
|
def check_set_database; raise "Please setup database first" if sequel.nil? end
|
59
68
|
|
60
|
-
def timely source, opts
|
61
|
-
|
62
|
-
|
63
|
-
t = Timely.new source, opts
|
64
|
-
self.send("#{opts[:time_unit]}_crons").push t
|
65
|
-
end
|
69
|
+
def daily source, opts = {}; timely source, {:time_unit => :day}.merge(opts) end
|
70
|
+
def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
|
71
|
+
def always source, opts = {}; timely source, {:time_unit => false, :time_column => false}.merge(opts) end # IMPORTANT set :time_unit to false
|
66
72
|
|
67
73
|
# the real requirement is to compute the latest items grouped by special patterns, like user_id, url prefix, ...
|
68
74
|
def lastest_visits source, opts
|
@@ -101,5 +107,27 @@ module Statlysis
|
|
101
107
|
self.similar_crons.push Similar.new(model_name, _p)
|
102
108
|
end
|
103
109
|
|
110
|
+
|
111
|
+
private
|
112
|
+
def timely source, opts
|
113
|
+
self.check_set_database
|
114
|
+
|
115
|
+
opts.reverse_merge! :time_column => :created_at,
|
116
|
+
:time_unit => :day,
|
117
|
+
:sum_columns => [],
|
118
|
+
:group_by_columns => [],
|
119
|
+
:group_concat_columns => []
|
120
|
+
|
121
|
+
opts.each {|k, v| opts[k] = v.map(&:to_sym) if (Timely::SqlColumns - [:group_by_columns]).include?(k) } # Sequel use symbol as column names
|
122
|
+
|
123
|
+
# e.g. convert [:user_id] to [{:column_name => :user_id, :type => :integer}]
|
124
|
+
if (opts[:group_by_columns].first || {})[:type].blank?
|
125
|
+
opts[:group_by_columns] = opts[:group_by_columns].map {|i| {:column_name => i.to_sym, :type => :integer} }
|
126
|
+
end
|
127
|
+
|
128
|
+
t = Timely.new source, opts
|
129
|
+
self.send("#{opts[:time_unit] || 'always'}_crons").push t
|
130
|
+
end
|
131
|
+
|
104
132
|
end
|
105
133
|
end
|
data/lib/statlysis/constants.rb
CHANGED
@@ -3,8 +3,20 @@
|
|
3
3
|
module Statlysis
|
4
4
|
TimeUnits = %w[hour day week month year]
|
5
5
|
DateTime1970 = Time.zone.parse("19700101").in_time_zone
|
6
|
+
TimeUnitToTableSuffixHash = (TimeUnits + [false]).inject({}) {|_h, _i| _h[_i] = (_i ? _i[0] : 'a'); _h }
|
6
7
|
|
7
8
|
DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
|
8
9
|
|
9
10
|
DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
|
11
|
+
|
12
|
+
SymbolToClassInDataType = {
|
13
|
+
:string => String,
|
14
|
+
:datetime => DateTime,
|
15
|
+
:time => Time,
|
16
|
+
:integer => Integer,
|
17
|
+
:float => Float,
|
18
|
+
:text => String
|
19
|
+
}
|
20
|
+
|
21
|
+
|
10
22
|
end
|
data/lib/statlysis/cron.rb
CHANGED
@@ -29,10 +29,8 @@ module Statlysis
|
|
29
29
|
def is_activerecord?; @source_type == :activerecord; end
|
30
30
|
def is_mongoid?; @source_type == :mongoid; end
|
31
31
|
def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
|
32
|
+
def _source; cron.multiple_dataset.sources.first end
|
32
33
|
|
33
|
-
def _source
|
34
|
-
cron.multiple_dataset.sources.first
|
35
|
-
end
|
36
34
|
def source_where_array
|
37
35
|
# TODO follow index seq
|
38
36
|
a = _source.where("").where_values.map do |equality|
|
@@ -64,8 +62,6 @@ module Statlysis
|
|
64
62
|
TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
|
65
63
|
end
|
66
64
|
|
67
|
-
protected
|
68
|
-
|
69
65
|
# 兼容采用整数类型作时间字段
|
70
66
|
def is_time_column_integer?
|
71
67
|
if is_activerecord?
|
@@ -74,11 +70,13 @@ module Statlysis
|
|
74
70
|
false
|
75
71
|
end
|
76
72
|
end
|
73
|
+
def time_column?; !!@time_column end
|
74
|
+
def group_by_columns?; !!@group_by_columns.any? end
|
77
75
|
|
78
76
|
end
|
79
77
|
|
80
78
|
end
|
81
79
|
|
82
80
|
|
83
|
-
require 'statlysis/cron/
|
81
|
+
require 'statlysis/cron/timely'
|
84
82
|
require 'statlysis/cron/top'
|
@@ -0,0 +1,171 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
class Timely < Cron
|
5
|
+
SqlColumns = [:sum_columns, :group_by_columns, :group_concat_columns]
|
6
|
+
attr_reader(*SqlColumns)
|
7
|
+
|
8
|
+
def initialize source, opts = {}
|
9
|
+
super
|
10
|
+
Statlysis.check_set_database
|
11
|
+
SqlColumns.each {|sym| instance_variable_set "@#{sym}", (opts[sym] || []) }
|
12
|
+
cron.setup_stat_model
|
13
|
+
cron
|
14
|
+
end
|
15
|
+
|
16
|
+
# 设置数据源,并保存结果入数据库
|
17
|
+
def run
|
18
|
+
(logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
|
19
|
+
|
20
|
+
raise "cron.output has no Enumerable" if not cron.output.class.included_modules.include? Enumerable
|
21
|
+
|
22
|
+
num_i = 0; num_add = 999
|
23
|
+
Statlysis.sequel.transaction do
|
24
|
+
# delete first in range
|
25
|
+
cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete if cron.time_column?
|
26
|
+
|
27
|
+
# TODO partial delete
|
28
|
+
cron.stat_model.where("").delete if cron.group_by_columns?
|
29
|
+
|
30
|
+
while !(_a = cron.output[num_i..(num_i+num_add)]).blank? do
|
31
|
+
# batch insert all
|
32
|
+
cron.stat_model.multi_insert _a
|
33
|
+
num_i += (num_add + 1)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
return self
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
def setup_stat_model
|
42
|
+
cron.stat_table_name = Utils.normalise_name cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array, cron.group_by_columns.map {|i| i[:column_name] }, TimeUnitToTableSuffixHash[cron.time_unit]
|
43
|
+
raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
|
44
|
+
|
45
|
+
|
46
|
+
# create basic unchangeable table structure
|
47
|
+
if not Statlysis.sequel.table_exists?(cron.stat_table_name)
|
48
|
+
Statlysis.sequel.transaction do
|
49
|
+
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
|
50
|
+
primary_key :id # Add one column at least in this block to avoid `SQLite3::SQLException: near ")": syntax error (Sequel::DatabaseError)`
|
51
|
+
end
|
52
|
+
Statlysis.sequel.add_column cron.stat_table_name, :t, DateTime if cron.time_column? # alias for :time
|
53
|
+
|
54
|
+
# add count columns
|
55
|
+
if cron.time_column?
|
56
|
+
count_columns = [:timely_c, :totally_c] # alias for :count
|
57
|
+
count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
|
58
|
+
else
|
59
|
+
Statlysis.sequel.add_column cron.stat_table_name, :c, Integer # alias for :count
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
end
|
64
|
+
# add group_by columns & indexes
|
65
|
+
remodel
|
66
|
+
cron.stat_model.cron = cron
|
67
|
+
if cron.group_by_columns.any?
|
68
|
+
cron.group_by_columns.each do |_h|
|
69
|
+
if not cron.stat_model.columns.include?(_h[:column_name])
|
70
|
+
_h[:type] = SymbolToClassInDataType[_h[:type]] if _h[:type].is_a?(Symbol) # && (Statlysis.sequel.opts[:adapter] == :sqlite)
|
71
|
+
Statlysis.sequel.add_column cron.stat_table_name, _h[:column_name], _h[:type]
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
# add sum columns
|
77
|
+
remodel
|
78
|
+
sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
|
79
|
+
_result_cols.each do |_result_col|
|
80
|
+
if not cron.stat_model.columns.include?(_result_col)
|
81
|
+
# convert to Integer type in the view if needed
|
82
|
+
Statlysis.sequel.add_column cron.stat_table_name, _result_col, Float
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
# Fix: index names must be unique across tables, otherwise we get
|
88
|
+
# `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
|
89
|
+
_group_by_columns_index_name = cron.group_by_columns.reject {|i| i[:no_index] }.map {|i| i[:column_name] }
|
90
|
+
_truncated_columns = _group_by_columns_index_name.dup # only String column
|
91
|
+
_group_by_columns_index_name = _group_by_columns_index_name.unshift :t if cron.time_column?
|
92
|
+
# TODO use https://github.com/german/redis_orm to support full string indexes
|
93
|
+
if !Statlysis.config.is_skip_database_index && _group_by_columns_index_name.any?
|
94
|
+
mysql_per_column_length_limit_in_one_index = (1000 / 3.0 / _group_by_columns_index_name.size.to_f).to_i
|
95
|
+
index_columns_str = _group_by_columns_index_name.map {|s| _truncated_columns.include?(s) ? "#{s.to_s}(#{mysql_per_column_length_limit_in_one_index})" : s.to_s }.join(", ")
|
96
|
+
index_columns_str = "(#{index_columns_str})"
|
97
|
+
begin
|
98
|
+
# NOTE MySQL's index key length limit is 1000 bytes
|
99
|
+
cron.stat_model.dataset.with_sql("CREATE INDEX #{Utils.sha1_name(_group_by_columns_index_name)} ON #{cron.stat_table_name} #{index_columns_str};").to_a
|
100
|
+
rescue => e
|
101
|
+
raise e if not e.inspect.match(/exists|duplicate/i)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
# add group_concat column
|
106
|
+
remodel
|
107
|
+
if cron.group_concat_columns.any? && !cron.stat_model.columns.include?(:other_json)
|
108
|
+
Statlysis.sequel.add_column cron.stat_table_name, :other_json, :text
|
109
|
+
end
|
110
|
+
|
111
|
+
# add access to group_concat values in other_json
|
112
|
+
remodel.class_eval do
|
113
|
+
define_method("other_json_hash") do
|
114
|
+
@__other_json_hash_cache ||= (JSON.parse(self.other_json) rescue {})
|
115
|
+
end
|
116
|
+
cron.group_concat_columns.each do |_group_concat_column|
|
117
|
+
define_method("#{_group_concat_column}_values") do
|
118
|
+
self.other_json_hash[_group_concat_column.to_s]
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
remodel
|
124
|
+
end
|
125
|
+
|
126
|
+
def output
|
127
|
+
@output ||= (cron.group_by_columns.any? ? multiple_dimensions_output : one_dimension_output)
|
128
|
+
end
|
129
|
+
|
130
|
+
protected
|
131
|
+
def unit_range_query time, time_begin = nil
|
132
|
+
# time begin and end
|
133
|
+
tb = time
|
134
|
+
te = (time+1.send(cron.time_unit)-1.second)
|
135
|
+
tb, te = tb.to_i, te.to_i if is_time_column_integer?
|
136
|
+
tb = time_begin || tb
|
137
|
+
return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
|
138
|
+
return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
|
139
|
+
end
|
140
|
+
|
141
|
+
# e.g. {:fav_count=>[:timely_favcount_s, :totally_favcount_s]}
|
142
|
+
def sum_column_to_result_columns_hash
|
143
|
+
cron.sum_columns.inject({}) do |h, _col|
|
144
|
+
[:timely, :totally].each do |_pre|
|
145
|
+
h[_col] ||= []
|
146
|
+
h[_col] << Utils.normalise_name(_pre, _col, 's').to_sym
|
147
|
+
end
|
148
|
+
h
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
private
|
153
|
+
def remodel
|
154
|
+
n = cron.stat_table_name.to_s.singularize.camelize
|
155
|
+
cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
|
156
|
+
class ::#{n} < Sequel::Model;
|
157
|
+
self.set_dataset :#{cron.stat_table_name}
|
158
|
+
|
159
|
+
cattr_accessor :cron
|
160
|
+
end
|
161
|
+
#{n}
|
162
|
+
MODEL
|
163
|
+
end
|
164
|
+
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
|
169
|
+
|
170
|
+
require 'statlysis/cron/timely/one_dimension'
|
171
|
+
require 'statlysis/cron/timely/multiple_dimensions'
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
class Timely
|
5
|
+
|
6
|
+
|
7
|
+
def multiple_dimensions_output
|
8
|
+
self.send "multiple_dimensions_output_with#{cron.time_column ? '' : 'out'}_time_column"
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
12
|
+
def multiple_dimensions_output_with_time_column
|
13
|
+
cron.time_range.map do |time|
|
14
|
+
raise DefaultNotImplementWrongMessage # TODO
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# TODO encapsulate Mongoid MapReduce in collection output mode
|
19
|
+
# TODO support large dataset, e.g. a million.
|
20
|
+
def multiple_dimensions_output_without_time_column
|
21
|
+
mr = Javascript::MultiDimensionalCount.new(cron)
|
22
|
+
|
23
|
+
array = []
|
24
|
+
cron.multiple_dataset.sources.each do |_source|
|
25
|
+
# _source = _source.time_range # TODO
|
26
|
+
array += _source.map_reduce(mr.map_func, mr.reduce_func)
|
27
|
+
.out(inline: 1) # TODO use replace mode
|
28
|
+
.to_a.map do |i|
|
29
|
+
v = i['value']
|
30
|
+
_h = {:c => v['count']}
|
31
|
+
|
32
|
+
cron.group_by_columns.each do |_group_by_column|
|
33
|
+
_h[_group_by_column[:column_name]] = v[_group_by_column[:column_name].to_s]
|
34
|
+
end
|
35
|
+
|
36
|
+
_h[:other_json] = {}
|
37
|
+
cron.group_concat_columns.each do |_group_concat_column|
|
38
|
+
_h[:other_json][_group_concat_column] = v["#{_group_concat_column}_values"].inject({}) {|_h2, i2| _h2[i2] ||= 0; _h2[i2] += 1; _h2 }
|
39
|
+
end
|
40
|
+
_h[:other_json] = _h[:other_json].to_json
|
41
|
+
|
42
|
+
_h
|
43
|
+
end
|
44
|
+
end
|
45
|
+
array
|
46
|
+
|
47
|
+
# TODO support sum_columns
|
48
|
+
end
|
49
|
+
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
class Timely
|
5
|
+
|
6
|
+
|
7
|
+
# one dimension **must** have `time_column`, or there's nothing to do
|
8
|
+
#
|
9
|
+
# TODO add to FAQ
|
10
|
+
# * if you want to compute statistics on one column through `group_by_columns`
|
11
|
+
# params, and don't need a time column, then you can use the `always` DSL.
|
12
|
+
#
|
13
|
+
def one_dimension_output
|
14
|
+
cron.time_range.map do |time|
|
15
|
+
_hash = {:t => time, :timely_c => 0, :totally_c => 0}
|
16
|
+
sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
|
17
|
+
_result_cols.each do |_result_col|
|
18
|
+
_hash[_result_col] = 0.0
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# support multiple data sources
|
23
|
+
_first_source = nil
|
24
|
+
cron.multiple_dataset.sources.each do |s|
|
25
|
+
_t = DateTime1970
|
26
|
+
_t = is_time_column_integer? ? _t.to_i : _t
|
27
|
+
|
28
|
+
_scope_one = s.where(unit_range_query(time))
|
29
|
+
# TODO cache pre-result
|
30
|
+
_scope_all = s.where(unit_range_query(time, _t))
|
31
|
+
|
32
|
+
# 1. count
|
33
|
+
_hash[:timely_c] += _scope_one.count
|
34
|
+
_hash[:totally_c] += _scope_all.count
|
35
|
+
|
36
|
+
# 2. sum
|
37
|
+
sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
|
38
|
+
_hash[_result_cols[0]] = _scope_one.map(&_sum_col).reduce(:+).to_f
|
39
|
+
_hash[_result_cols[1]] = _scope_all.map(&_sum_col).reduce(:+).to_f
|
40
|
+
end
|
41
|
+
|
42
|
+
# 3. group_concat
|
43
|
+
_other_json = {}
|
44
|
+
_other_json[:group_concat_columns] ||= {}
|
45
|
+
cron.group_concat_columns.each do |_group_concat_column|
|
46
|
+
_other_json[:group_concat_columns][_group_concat_column] = _scope_one.map(&_group_concat_column).uniq
|
47
|
+
end
|
48
|
+
_hash[:other_json] = _other_json.to_json
|
49
|
+
|
50
|
+
_first_source ||= s.where(unit_range_query(time))
|
51
|
+
end
|
52
|
+
logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{_hash[:timely_c]} totally_c:#{_hash[:totally_c]}" if ENV['DEBUG']
|
53
|
+
|
54
|
+
_hash
|
55
|
+
end.select {|r1| r1.except(:t, :other_json).values.reject {|r2| r2.zero? }.any? }
|
56
|
+
end
|
57
|
+
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
data/lib/statlysis/cron_set.rb
CHANGED
@@ -10,13 +10,14 @@ module Statlysis
|
|
10
10
|
when Fixnum, Integer # support array idx access
|
11
11
|
self.to_a[pattern]
|
12
12
|
else
|
13
|
-
CronSet.new(select do |
|
14
|
-
|
13
|
+
CronSet.new(self.select do |cron|
|
14
|
+
reg = Regexp.new(pattern.to_s)
|
15
|
+
cron.stat_table_name.match(reg) || cron.multiple_dataset.name.to_s.match(reg)
|
15
16
|
end)
|
16
17
|
end
|
17
18
|
end
|
18
19
|
|
19
|
-
def last; [-1]; end
|
20
|
+
def last; self[-1]; end
|
20
21
|
|
21
22
|
def run
|
22
23
|
map(&:run)
|
@@ -4,34 +4,69 @@ module Statlysis
|
|
4
4
|
module Javascript
|
5
5
|
class MultiDimensionalCount
|
6
6
|
attr_reader :map_func, :reduce_func
|
7
|
+
attr_reader :cron
|
7
8
|
|
8
|
-
def initialize
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
9
|
+
def initialize cron
|
10
|
+
@cron = cron
|
11
|
+
|
12
|
+
# setup group_by_columns
|
13
|
+
_group_by_columns = :_id if cron.group_by_columns.blank?
|
14
|
+
_group_by_columns ||= cron.group_by_columns.map {|i| i[:column_name] }
|
15
|
+
emit_key = _group_by_columns.map {|dc| "#{dc}: this.#{dc}" }.join(", ")
|
16
|
+
emit_key = "{#{emit_key}}"
|
17
|
+
|
18
|
+
# TODO setup sum_columns
|
19
|
+
# default_emit_values_array += cron.sum_columns.map {|_sum_column| "#{_sum_column}: this.#{_sum_column}" }
|
20
|
+
|
21
|
+
# setup group_concat_columns
|
22
|
+
# NOTE if a key has only one emitted value, that value is never passed to the reduce function
|
23
|
+
emit_values_init_array = cron.group_concat_columns.map do |_group_concat_column|
|
24
|
+
"emit_value.#{_group_concat_column}_values = [this.#{_group_concat_column}];\n"
|
18
25
|
end
|
26
|
+
emit_values_init_array += (_group_by_columns.map do |_group_by_column|
|
27
|
+
"emit_value.#{_group_by_column} = this.#{_group_by_column};\n"
|
28
|
+
end)
|
19
29
|
|
20
30
|
@map_func = "function() {
|
21
|
-
|
31
|
+
var emit_value = {count: 1};
|
32
|
+
#{emit_values_init_array.join}
|
33
|
+
|
34
|
+
emit (#{emit_key}, emit_value);
|
22
35
|
}"
|
23
36
|
|
37
|
+
# sum_init_values = cron.sum_columns.map {|_sum_column| "#{_sum_column} = 0.0" }
|
38
|
+
# sum_init_values = "var #{sum_init_values};" if cron.sum_columns.any?
|
39
|
+
|
40
|
+
# 如果使用Hash,将导致group_concat最终的数目和group_by数目不一致,因为多个任务并行时会导致覆盖(常见于个数多的分类,一个的则不会有这个问题),而可并行化的数组则不会。
|
41
|
+
group_concat_values_init_array = cron.group_concat_columns.map {|_group_concat_column| "reducedObject.#{_group_concat_column}_values = [];" }
|
42
|
+
group_concat_values_process_array = cron.group_concat_columns.map do |_group_concat_column|
|
43
|
+
"reducedObject.#{_group_concat_column}_values = reducedObject.#{_group_concat_column}_values.concat(v['#{_group_concat_column}_values']);\n"
|
44
|
+
end
|
45
|
+
group_by_values_process_array = _group_by_columns.map do |_group_by_column|
|
46
|
+
"reducedObject.#{_group_by_column} = v.#{_group_by_column};\n"
|
47
|
+
end
|
48
|
+
|
49
|
+
# emit value in map func should be the same structure as the
|
50
|
+
# return value in reduce func, see more details in
|
51
|
+
# http://rickosborne.org/download/SQL-to-MongoDB.pdf and
|
52
|
+
# http://docs.mongodb.org/manual/tutorial/perform-incremental-map-reduce/
|
24
53
|
@reduce_func = "function(key, values) {
|
25
|
-
var
|
54
|
+
var reducedObject = key;
|
55
|
+
reducedObject.count = 0;
|
56
|
+
#{group_concat_values_init_array.join}
|
26
57
|
|
27
58
|
values.forEach(function(v) {
|
28
|
-
count += v['count'];
|
59
|
+
reducedObject.count += v['count'];
|
60
|
+
#{group_by_values_process_array.join}
|
61
|
+
#{group_concat_values_process_array.join}
|
29
62
|
});
|
30
63
|
|
31
|
-
return
|
64
|
+
return reducedObject;
|
32
65
|
}"
|
33
|
-
|
66
|
+
|
67
|
+
return self
|
34
68
|
end
|
69
|
+
|
35
70
|
end
|
36
71
|
end
|
37
72
|
end
|
data/lib/statlysis/map_reduce.rb
CHANGED
data/lib/statlysis/utils.rb
CHANGED
@@ -35,6 +35,12 @@ module Statlysis
|
|
35
35
|
{:table => tn, :model => str.constantize}
|
36
36
|
end
|
37
37
|
|
38
|
+
def normalise_name *name
|
39
|
+
Array(name).flatten.compact.map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
|
40
|
+
end
|
41
|
+
|
42
|
+
def sha1_name name; Digest::SHA1.hexdigest Array(name).map(&:to_s).join end
|
43
|
+
|
38
44
|
end
|
39
45
|
end
|
40
46
|
end
|
data/statlysis.gemspec
CHANGED
@@ -4,13 +4,13 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = 'statlysis'
|
7
|
-
s.version = '0.0.
|
8
|
-
s.date = '2013-
|
7
|
+
s.version = '0.0.3'
|
8
|
+
s.date = '2013-12-03'
|
9
9
|
s.summary = File.read("README.markdown").split(/===+/)[1].strip.split("\n")[0]
|
10
10
|
s.description = s.summary
|
11
11
|
s.authors = ["David Chen"]
|
12
12
|
s.email = 'mvjome@gmail.com'
|
13
|
-
s.homepage = 'https://github.com/
|
13
|
+
s.homepage = 'https://github.com/SunshineLibrary/statlysis'
|
14
14
|
s.license = 'MIT'
|
15
15
|
|
16
16
|
s.files = `git ls-files`.split("\n")
|
data/test/config/database.yml
CHANGED
data/test/helper.rb
CHANGED
@@ -12,6 +12,7 @@ require 'test/unit'
|
|
12
12
|
|
13
13
|
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
|
14
14
|
$LOAD_PATH.unshift File.dirname(__FILE__) # test dirs
|
15
|
+
require 'pry-debugger'
|
15
16
|
|
16
17
|
# load mongoid setup
|
17
18
|
require 'mongoid'
|
@@ -22,10 +23,11 @@ require 'statlysis'
|
|
22
23
|
|
23
24
|
# load rails
|
24
25
|
def Rails.root; Pathname.new(File.expand_path('../.', __FILE__)) end
|
26
|
+
def Rails.env; 'development' end
|
25
27
|
require 'sqlite3'
|
26
28
|
|
27
29
|
# load ActiveRecord setup
|
28
|
-
Statlysis.set_database :
|
30
|
+
Statlysis.set_database ":memory:"
|
29
31
|
Statlysis.config.is_skip_database_index = true
|
30
32
|
ActiveRecord::Base.establish_connection(Statlysis.config.database_opts.merge("adapter" => "sqlite3"))
|
31
33
|
Dir[File.expand_path("../migrate/*.rb", __FILE__).to_s].each { |f| require f }
|
@@ -35,13 +37,20 @@ Dir[File.expand_path("../models/*.rb", __FILE__).to_s].each { |f| require f }
|
|
35
37
|
# copied from http://stackoverflow.com/questions/4410794/ruby-on-rails-import-data-from-a-csv-file/4410880#4410880
|
36
38
|
require 'csv'
|
37
39
|
csv = CSV.parse(File.read(File.expand_path('../data/code_gists_20130724.csv', __FILE__)), :headers => true) # data from code.eoe.cn
|
38
|
-
csv.each
|
40
|
+
csv.each do |row|
|
41
|
+
_h = row.to_hash.merge(:fav_count => rand(5).to_i)
|
42
|
+
CodeGist.create! _h
|
43
|
+
_h[:category_id] = rand(10).to_i + 1
|
44
|
+
CodeGistMongoid.create! _h
|
45
|
+
end
|
39
46
|
|
40
47
|
|
41
48
|
Statlysis.setup do
|
42
49
|
hourly EoeLog, :time_column => :t
|
43
50
|
|
44
|
-
daily CodeGist
|
51
|
+
daily CodeGist, :sum_columns => [:fav_count], :group_concat_columns => [:user_id]
|
52
|
+
always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}], :group_concat_columns => [:user_id]
|
53
|
+
always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}, {:column_name => :category_id, :type => :integer}], :group_concat_columns => [:user_id]
|
45
54
|
|
46
55
|
[EoeLog,
|
47
56
|
EoeLog.where(:do => 3),
|
@@ -50,6 +59,8 @@ Statlysis.setup do
|
|
50
59
|
].each do |s|
|
51
60
|
daily s, :time_column => :t
|
52
61
|
end
|
53
|
-
|
62
|
+
cron1 = Statlysis.daily['mul'][1]
|
63
|
+
cron2 = Statlysis.daily['cod'][0]
|
64
|
+
cron3 = Statlysis.always['code']['mongoid'][0]
|
54
65
|
require 'pry-debugger';binding.pry
|
55
66
|
end
|
data/test/models/code_gist.rb
CHANGED
@@ -3,3 +3,15 @@
|
|
3
3
|
class CodeGist < ActiveRecord::Base
|
4
4
|
|
5
5
|
end
|
6
|
+
|
7
|
+
|
8
|
+
class CodeGistMongoid
|
9
|
+
include Mongoid::Document
|
10
|
+
include Mongoid::Timestamps
|
11
|
+
field :id, :type => Integer
|
12
|
+
field :description, :type => String
|
13
|
+
field :user_id, :type => Integer
|
14
|
+
field :author, :type => String
|
15
|
+
field :fav_count, :type => Integer
|
16
|
+
field :category_id, :type => Integer
|
17
|
+
end
|
data/test/models/eoe_log.rb
CHANGED
@@ -43,10 +43,8 @@ EoeLog.create
|
|
43
43
|
|
44
44
|
collection_class = collection_class_name.constantize
|
45
45
|
t = Time.zone.parse(date_str)
|
46
|
-
1.
|
47
|
-
|
48
|
-
collection_class.create :t => (t.to_time+rand(60*60*24-1)).to_datetime, :url => '/'
|
49
|
-
end
|
46
|
+
values = (1..day).map {|i| (t.to_time+rand(60*60*24-1)).to_datetime }.sort.map {|i| {:t => i, :url => '/' } }
|
47
|
+
collection_class.create values
|
50
48
|
|
51
49
|
collection_class.count
|
52
50
|
end
|
data/test/test_daily_count.rb
CHANGED
@@ -10,13 +10,15 @@ class TestDailyCount < Test::Unit::TestCase
|
|
10
10
|
def test_timely
|
11
11
|
o = @output.map {|i| i[:timely_c] }
|
12
12
|
r = (o - [5,11,0,1,8,2,3,4,16,10,26,13,7,9,20,15,30,33,14,6,12,17,19,59,65,84,62,114,69,52,61,67,154,70]).reject(&:zero?).blank?
|
13
|
-
|
13
|
+
assert r
|
14
14
|
end
|
15
15
|
|
16
16
|
def test_totally
|
17
17
|
o = @output.map {|i| i[:totally_c] }
|
18
18
|
r = (o - [5,16,17,25,27,30,34,36,37,53,55,56,57,59,60,64,66,67,68,70,71,73,74,75,80,90,116,129,136,145,165,185,200,230,234,235,236,237,270,273,274,288,299,304,305,312,327,337,345,359,374,380,392,418,435,446,452,463,466,473,493,506,512,520,525,545,549,553,558,577,636,701,785,805,867,981,1050,1102,1163,1230,1384,1454,1455,1457,1458]).reject(&:zero?).blank?
|
19
|
-
|
19
|
+
assert r
|
20
|
+
assert_equal @output[-1][:totally_favcount_s].to_i, CodeGist.all.map(&:fav_count).reduce(:+)
|
20
21
|
end
|
21
22
|
|
23
|
+
|
22
24
|
end
|
data/test/test_mapreduce.rb
CHANGED
@@ -6,7 +6,14 @@ class TestMapReduce < Test::Unit::TestCase
|
|
6
6
|
def setup
|
7
7
|
end
|
8
8
|
|
9
|
-
def
|
9
|
+
def test_multiple_dimensions_output_without_time_column
|
10
|
+
cron = Statlysis.always['mongoid']['code'][0]
|
11
|
+
assert_equal cron.time_column, false
|
12
|
+
assert_equal cron.time_unit, false
|
13
|
+
assert_equal cron.stat_table_name, 'timely_codegistmongoids_author_a'
|
14
|
+
|
15
|
+
cron.run
|
16
|
+
assert_equal cron.output.detect {|h| h[:author] == 'mvj3' }[:c].to_i, cron.multiple_dataset.sources.first.where(:author => 'mvj3').count
|
10
17
|
end
|
11
18
|
|
12
19
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statlysis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-12-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -269,9 +269,9 @@ files:
|
|
269
269
|
- lib/statlysis/configuration.rb
|
270
270
|
- lib/statlysis/constants.rb
|
271
271
|
- lib/statlysis/cron.rb
|
272
|
-
- lib/statlysis/cron/count.rb
|
273
|
-
- lib/statlysis/cron/count/dimensions.rb
|
274
|
-
- lib/statlysis/cron/count/timely.rb
|
272
|
+
- lib/statlysis/cron/timely.rb
|
273
|
+
- lib/statlysis/cron/timely/multiple_dimensions.rb
|
274
|
+
- lib/statlysis/cron/timely/one_dimension.rb
|
275
275
|
- lib/statlysis/cron/top.rb
|
276
276
|
- lib/statlysis/cron/top/hotest_items.rb
|
277
277
|
- lib/statlysis/cron/top/lastest_visits.rb
|
@@ -303,7 +303,7 @@ files:
|
|
303
303
|
- test/test_single_log_in_multiple_collections.rb
|
304
304
|
- test/test_statlysis.rb
|
305
305
|
- test/test_timeseries.rb
|
306
|
-
homepage: https://github.com/
|
306
|
+
homepage: https://github.com/SunshineLibrary/statlysis
|
307
307
|
licenses:
|
308
308
|
- MIT
|
309
309
|
post_install_message:
|
@@ -318,7 +318,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
318
318
|
version: '0'
|
319
319
|
segments:
|
320
320
|
- 0
|
321
|
-
hash:
|
321
|
+
hash: -1643509325996557122
|
322
322
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
323
323
|
none: false
|
324
324
|
requirements:
|
@@ -327,7 +327,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
327
327
|
version: '0'
|
328
328
|
segments:
|
329
329
|
- 0
|
330
|
-
hash:
|
330
|
+
hash: -1643509325996557122
|
331
331
|
requirements: []
|
332
332
|
rubyforge_project:
|
333
333
|
rubygems_version: 1.8.23
|
data/lib/statlysis/cron/count.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
module Statlysis
|
4
|
-
class Count < Cron
|
5
|
-
def initialize source, opts = {}
|
6
|
-
super
|
7
|
-
Statlysis.check_set_database
|
8
|
-
cron.setup_stat_model
|
9
|
-
cron
|
10
|
-
end
|
11
|
-
|
12
|
-
# 设置数据源,并保存结果入数据库
|
13
|
-
def run
|
14
|
-
(logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
|
15
|
-
# delete first in range
|
16
|
-
@output = cron.output
|
17
|
-
unless @output.any?
|
18
|
-
logger.info "没有数据"; return
|
19
|
-
end
|
20
|
-
num_i = 0; num_add = 999
|
21
|
-
Statlysis.sequel.transaction do
|
22
|
-
cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
|
23
|
-
while !(_a = @output[num_i..(num_i+num_add)]).blank? do
|
24
|
-
# batch insert all
|
25
|
-
cron.stat_model.multi_insert _a
|
26
|
-
num_i += (num_add + 1)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
return self
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
protected
|
35
|
-
def unit_range_query time, time_begin = nil
|
36
|
-
# time begin and end
|
37
|
-
tb = time
|
38
|
-
te = (time+1.send(cron.time_unit)-1.second)
|
39
|
-
tb, te = tb.to_i, te.to_i if is_time_column_integer?
|
40
|
-
tb = time_begin || tb
|
41
|
-
return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
|
42
|
-
return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
|
43
|
-
end
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
end
|
48
|
-
|
49
|
-
|
50
|
-
require 'statlysis/cron/count/timely'
|
51
|
-
require 'statlysis/cron/count/dimensions'
|
@@ -1,63 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
module Statlysis
|
4
|
-
class Timely < Count
|
5
|
-
def setup_stat_model
|
6
|
-
cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
|
7
|
-
raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
|
8
|
-
|
9
|
-
if not Statlysis.sequel.table_exists?(cron.stat_table_name)
|
10
|
-
Statlysis.sequel.transaction do
|
11
|
-
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
|
12
|
-
DateTime :t # alias for :time
|
13
|
-
end
|
14
|
-
|
15
|
-
# TODO Add cron.source_where_array before count_columns
|
16
|
-
count_columns = [:timely_c, :totally_c] # alias for :count
|
17
|
-
count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
|
18
|
-
index_column_names = [:t] + count_columns
|
19
|
-
index_column_names_name = index_column_names.join("_")
|
20
|
-
index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
|
21
|
-
|
22
|
-
# Fix there should be uniq index name between tables
|
23
|
-
# `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
|
24
|
-
if not Statlysis.config.is_skip_database_index
|
25
|
-
Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
n = cron.stat_table_name.to_s.singularize.camelize
|
31
|
-
cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
|
32
|
-
class ::#{n} < Sequel::Model;
|
33
|
-
self.set_dataset :#{cron.stat_table_name}
|
34
|
-
end
|
35
|
-
#{n}
|
36
|
-
MODEL
|
37
|
-
end
|
38
|
-
|
39
|
-
def output
|
40
|
-
@output ||= (cron.time_range.map do |time|
|
41
|
-
timely_c = 0
|
42
|
-
totally_c = 0
|
43
|
-
# support multiple data sources
|
44
|
-
_first_source = nil
|
45
|
-
cron.multiple_dataset.sources.each do |s|
|
46
|
-
timely_c += s.where(unit_range_query(time)).count
|
47
|
-
_t = DateTime1970
|
48
|
-
_t = is_time_column_integer? ? _t.to_i : _t
|
49
|
-
totally_c += s.where(unit_range_query(time, _t)).count
|
50
|
-
_first_source ||= s.where(unit_range_query(time))
|
51
|
-
end
|
52
|
-
logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
|
53
|
-
|
54
|
-
if timely_c.zero? && totally_c.zero?
|
55
|
-
nil
|
56
|
-
else
|
57
|
-
{:t => time, :timely_c => timely_c, :totally_c => totally_c}
|
58
|
-
end
|
59
|
-
end.compact)
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
end
|