statlysis 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +18 -5
- data/lib/statlysis.rb +11 -13
- data/lib/statlysis/clock.rb +4 -0
- data/lib/statlysis/configuration.rb +56 -28
- data/lib/statlysis/constants.rb +12 -0
- data/lib/statlysis/cron.rb +4 -6
- data/lib/statlysis/cron/timely.rb +171 -0
- data/lib/statlysis/cron/timely/multiple_dimensions.rb +52 -0
- data/lib/statlysis/cron/timely/one_dimension.rb +60 -0
- data/lib/statlysis/cron_set.rb +4 -3
- data/lib/statlysis/javascript/count.rb +50 -15
- data/lib/statlysis/map_reduce.rb +1 -1
- data/lib/statlysis/utils.rb +6 -0
- data/statlysis.gemspec +3 -3
- data/test/config/database.yml +1 -2
- data/test/helper.rb +15 -4
- data/test/migrate/1_active_record.rb +1 -0
- data/test/models/code_gist.rb +12 -0
- data/test/models/eoe_log.rb +2 -4
- data/test/test_daily_count.rb +4 -2
- data/test/test_mapreduce.rb +8 -1
- metadata +8 -8
- data/lib/statlysis/cron/count.rb +0 -51
- data/lib/statlysis/cron/count/dimensions.rb +0 -7
- data/lib/statlysis/cron/count/timely.rb +0 -63
data/README.markdown
CHANGED
@@ -10,11 +10,13 @@ Usage
|
|
10
10
|
Statlysis.setup do
|
11
11
|
set_database :statlysis
|
12
12
|
|
13
|
-
|
13
|
+
daily CodeGist
|
14
|
+
hourly EoeLog, :time_column => :t # support custom time_column
|
15
|
+
|
14
16
|
[EoeLog,
|
15
|
-
EoeLog.where(:ui => 0),
|
17
|
+
EoeLog.where(:ui => 0), # support query scope
|
16
18
|
EoeLog.where(:ui => {"$ne" => 0}),
|
17
|
-
Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}),
|
19
|
+
Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}), # support collection name regexp
|
18
20
|
EoeLog.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}),
|
19
21
|
].each do |s|
|
20
22
|
daily s, :time_column => :t
|
@@ -45,8 +47,6 @@ TODO
|
|
45
47
|
* Admin interface
|
46
48
|
* statistical query api in Ruby and HTTP
|
47
49
|
* Interacting with Javascript charting library, e.g. Highcharts, D3.
|
48
|
-
* More tests
|
49
|
-
* Add @criteria to MultipleDataset
|
50
50
|
|
51
51
|
|
52
52
|
Statistical Process
|
@@ -68,6 +68,11 @@ Q: Why do you recommend using multiple collections to store logs rather than a s
|
|
68
68
|
A: MongoDB can effectively reuse space freed by removing entire collections without leading to data fragmentation, see details at http://docs.mongodb.org/manual/use-cases/storing-log-data/#multiple-collections-single-database
|
69
69
|
|
70
70
|
|
71
|
+
Q: In Mongodb, why use MapReduce instead of Aggregation?
|
72
|
+
|
73
|
+
A: The result of aggregation pipeline is a document and is subject to the BSON Document size limit, which is currently 16 megabytes, see more details at http://docs.mongodb.org/manual/core/aggregation-pipeline/#pipeline
|
74
|
+
|
75
|
+
|
71
76
|
Copyright
|
72
77
|
-----------------------------------------------
|
73
78
|
MIT. David Chen at eoe.cn.
|
@@ -91,3 +96,11 @@ Related
|
|
91
96
|
|
92
97
|
### Admin interface
|
93
98
|
* http://three.kibana.org/ browser based analytics and search interface to Logstash and other timestamped data sets stored in ElasticSearch.
|
99
|
+
|
100
|
+
|
101
|
+
### ETL
|
102
|
+
* https://github.com/activewarehouse/activewarehouse-etl/
|
103
|
+
* http://jisraelsen.github.io/drudgery/ ruby ETL DSL, support csv, sqlite3, ActiveRecord, without support time range
|
104
|
+
* https://github.com/square/ETL Simply encapsulates the SQL procedures
|
105
|
+
|
106
|
+
|
data/lib/statlysis.rb
CHANGED
@@ -20,34 +20,33 @@ require 'activerecord_idnamecache'
|
|
20
20
|
module Rails; end
|
21
21
|
|
22
22
|
require 'statlysis/constants'
|
23
|
+
require 'statlysis/utils'
|
24
|
+
require 'statlysis/configuration'
|
25
|
+
require 'statlysis/common'
|
23
26
|
|
24
27
|
module Statlysis
|
25
28
|
class << self
|
26
29
|
def setup &blk
|
27
30
|
raise "Need to setup proc" if not blk
|
28
31
|
|
29
|
-
logger.info "Start to setup Statlysis"
|
32
|
+
logger.info "Start to setup Statlysis" if ENV['DEBUG']
|
30
33
|
time_log do
|
31
34
|
self.config.instance_exec(&blk)
|
32
35
|
end
|
33
|
-
logger.info
|
34
36
|
end
|
35
37
|
|
36
38
|
def time_log text = nil
|
37
39
|
t = Time.now
|
38
40
|
logger.info text if text
|
39
41
|
yield if block_given?
|
40
|
-
logger.info "Time spend #{(Time.now - t).round(2)} seconds."
|
41
|
-
logger.info "-" * 42
|
42
|
+
logger.info "Time spend #{(Time.now - t).round(2)} seconds." if ENV['DEBUG']
|
43
|
+
logger.info "-" * 42 if ENV['DEBUG']
|
42
44
|
end
|
43
45
|
|
44
46
|
# delegate config methods to Configuration
|
45
47
|
def config; Configuration.instance end
|
46
48
|
require 'active_support/core_ext/module/delegation.rb'
|
47
|
-
|
48
|
-
:default_time_zone,
|
49
|
-
:set_tablename_default_pre, :tablename_default_pre
|
50
|
-
].each do |sym|
|
49
|
+
Configuration::DelegateMethods.each do |sym|
|
51
50
|
delegate sym, :to => :config
|
52
51
|
end
|
53
52
|
|
@@ -56,18 +55,17 @@ module Statlysis
|
|
56
55
|
|
57
56
|
def source_to_database_type; @_source_to_database_type ||= {} end
|
58
57
|
|
59
|
-
|
58
|
+
# 代理访问 各个时间类型的 crons
|
60
59
|
def daily; CronSet.new(Statlysis.config.day_crons) end
|
61
60
|
def hourly; CronSet.new(Statlysis.config.hour_crons) end
|
61
|
+
def always; CronSet.new(Statlysis.config.always_crons) end
|
62
62
|
|
63
63
|
end
|
64
64
|
|
65
65
|
end
|
66
66
|
|
67
|
-
require 'statlysis/utils'
|
68
|
-
require 'statlysis/configuration'
|
69
|
-
require 'statlysis/common'
|
70
67
|
require 'statlysis/timeseries'
|
68
|
+
require 'statlysis/map_reduce'
|
71
69
|
require 'statlysis/clock'
|
72
70
|
require 'statlysis/rake'
|
73
71
|
require 'statlysis/cron'
|
@@ -77,7 +75,7 @@ require 'statlysis/multiple_dataset'
|
|
77
75
|
|
78
76
|
module Statlysis
|
79
77
|
require 'short_inspect'
|
80
|
-
ShortInspect.apply_to Cron,
|
78
|
+
ShortInspect.apply_to Cron, MultipleDataset
|
81
79
|
ShortInspect.apply_minimal_to ActiveRecord::Relation # lazy load
|
82
80
|
end
|
83
81
|
|
data/lib/statlysis/clock.rb
CHANGED
@@ -8,6 +8,8 @@ module Statlysis
|
|
8
8
|
# feature is a string
|
9
9
|
def initialize feature, default_time
|
10
10
|
raise "Please assign default_time params" if not default_time
|
11
|
+
|
12
|
+
# init table & model
|
11
13
|
cron.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")
|
12
14
|
unless Statlysis.sequel.table_exists?(cron.stat_table_name)
|
13
15
|
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts.merge(:engine => "InnoDB") do
|
@@ -19,6 +21,8 @@ module Statlysis
|
|
19
21
|
end
|
20
22
|
h = Utils.setup_pattern_table_and_model cron.stat_table_name
|
21
23
|
cron.stat_model = h[:model]
|
24
|
+
|
25
|
+
# init default_time
|
22
26
|
cron.clock = cron.stat_model.find_or_create(:feature => feature)
|
23
27
|
cron.clock.update :t => default_time if cron.current.nil?
|
24
28
|
cron
|
@@ -9,16 +9,21 @@ module Statlysis
|
|
9
9
|
class Configuration
|
10
10
|
include Singleton
|
11
11
|
|
12
|
+
# variables
|
12
13
|
attr_accessor :sequel, :default_time_columns, :default_time_zone, :database_opts, :tablename_default_pre
|
13
14
|
attr_accessor :is_skip_database_index
|
14
|
-
TimeUnits
|
15
|
-
|
16
|
-
sym = "#{sym}_crons"
|
17
|
-
attr_accessor sym; self.instance.send "#{sym}=", []
|
15
|
+
(TimeUnits + %W[always] + [:realtime, :similar, :hotest]).each do |unit|
|
16
|
+
sym = "#{unit}_crons"; attr_accessor sym; self.instance.send "#{sym}=", []
|
18
17
|
end
|
19
18
|
self.instance.send "tablename_default_pre=", "st"
|
20
19
|
self.instance.send "is_skip_database_index=", false
|
21
20
|
|
21
|
+
DelegateMethods = [
|
22
|
+
:sequel, :set_database, :check_set_database,
|
23
|
+
:default_time_zone,
|
24
|
+
:set_tablename_default_pre, :tablename_default_pre
|
25
|
+
]
|
26
|
+
|
22
27
|
# 会在自动拼接统计数据库表名时去除这些时间字段
|
23
28
|
def update_time_columns *columns
|
24
29
|
self.default_time_columns ||= [:created_at, :updated_at]
|
@@ -26,43 +31,44 @@ module Statlysis
|
|
26
31
|
self.default_time_columns = self.default_time_columns.uniq
|
27
32
|
end
|
28
33
|
|
29
|
-
def set_database
|
30
|
-
self.database_opts =
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
+
def set_database obj
|
35
|
+
self.database_opts = case obj
|
36
|
+
when Hash
|
37
|
+
obj
|
38
|
+
when Symbol, String
|
39
|
+
YAML.load_file(Rails.root.join("config/database.yml"))[Rails.env].merge('database' => obj.to_s)
|
40
|
+
else
|
41
|
+
raise "Statlysis#set_database only support symbol or hash params"
|
42
|
+
end
|
43
|
+
|
44
|
+
raise "database_opts should not be blank" if self.database_opts.blank?
|
45
|
+
|
46
|
+
# sqlite dont support regular creating database in mysql style
|
47
|
+
self.sequel = if (self.database_opts['adapter'].match(/sqlite/) && self.database_opts['database'].match(/\A:memory:\Z/)) # only for test environment
|
48
|
+
Sequel.sqlite
|
34
49
|
else
|
35
|
-
|
50
|
+
# create database, copied from http://stackoverflow.com/a/14435522/595618
|
51
|
+
require 'mysql2'
|
52
|
+
mysql2_client = Mysql2::Client.new(self.database_opts.except('database'))
|
53
|
+
mysql2_client.query("CREATE DATABASE IF NOT EXISTS #{self.database_opts['database']}")
|
54
|
+
Sequel.connect(self.database_opts)
|
36
55
|
end
|
37
|
-
self.sequel = Sequel.connect(self.database_opts)
|
38
56
|
|
39
57
|
# 初始化键值model
|
40
58
|
["#{self.tablename_default_pre}_single_kvs", "#{self.tablename_default_pre}_single_kv_histories"].each do |tn|
|
41
59
|
Utils.setup_pattern_table_and_model tn
|
42
60
|
end
|
43
|
-
return self
|
44
|
-
end
|
45
61
|
|
46
|
-
def set_default_time_zone zone
|
47
|
-
self.default_time_zone = zone
|
48
62
|
return self
|
49
63
|
end
|
50
64
|
|
51
|
-
def
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
def daily source, opts = {}; timely source, {:time_unit => :day }.merge(opts) end
|
56
|
-
def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
|
57
|
-
|
65
|
+
def set_default_time_zone zone; self.default_time_zone = zone; return self; end
|
66
|
+
def set_tablename_default_pre str; self.tablename_default_pre = str.to_s; return self end
|
58
67
|
def check_set_database; raise "Please setup database first" if sequel.nil? end
|
59
68
|
|
60
|
-
def timely source, opts
|
61
|
-
|
62
|
-
|
63
|
-
t = Timely.new source, opts
|
64
|
-
self.send("#{opts[:time_unit]}_crons").push t
|
65
|
-
end
|
69
|
+
def daily source, opts = {}; timely source, {:time_unit => :day}.merge(opts) end
|
70
|
+
def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
|
71
|
+
def always source, opts = {}; timely source, {:time_unit => false, :time_column => false}.merge(opts) end # IMPORTANT set :time_unit to false
|
66
72
|
|
67
73
|
# the real requirement is to compute latest items grouped by special patterns, like user_id, url prefix, ...
|
68
74
|
def lastest_visits source, opts
|
@@ -101,5 +107,27 @@ module Statlysis
|
|
101
107
|
self.similar_crons.push Similar.new(model_name, _p)
|
102
108
|
end
|
103
109
|
|
110
|
+
|
111
|
+
private
|
112
|
+
def timely source, opts
|
113
|
+
self.check_set_database
|
114
|
+
|
115
|
+
opts.reverse_merge! :time_column => :created_at,
|
116
|
+
:time_unit => :day,
|
117
|
+
:sum_columns => [],
|
118
|
+
:group_by_columns => [],
|
119
|
+
:group_concat_columns => []
|
120
|
+
|
121
|
+
opts.each {|k, v| opts[k] = v.map(&:to_sym) if (Timely::SqlColumns - [:group_by_columns]).include?(k) } # Sequel use symbol as column names
|
122
|
+
|
123
|
+
# e.g. convert [:user_id] to [{:column_name => :user_id, :type => :integer}]
|
124
|
+
if (opts[:group_by_columns].first || {})[:type].blank?
|
125
|
+
opts[:group_by_columns] = opts[:group_by_columns].map {|i| {:column_name => i.to_sym, :type => :integer} }
|
126
|
+
end
|
127
|
+
|
128
|
+
t = Timely.new source, opts
|
129
|
+
self.send("#{opts[:time_unit] || 'always'}_crons").push t
|
130
|
+
end
|
131
|
+
|
104
132
|
end
|
105
133
|
end
|
data/lib/statlysis/constants.rb
CHANGED
@@ -3,8 +3,20 @@
|
|
3
3
|
module Statlysis
|
4
4
|
TimeUnits = %w[hour day week month year]
|
5
5
|
DateTime1970 = Time.zone.parse("19700101").in_time_zone
|
6
|
+
TimeUnitToTableSuffixHash = (TimeUnits + [false]).inject({}) {|_h, _i| _h[_i] = (_i ? _i[0] : 'a'); _h }
|
6
7
|
|
7
8
|
DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
|
8
9
|
|
9
10
|
DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
|
11
|
+
|
12
|
+
SymbolToClassInDataType = {
|
13
|
+
:string => String,
|
14
|
+
:datetime => DateTime,
|
15
|
+
:time => Time,
|
16
|
+
:integer => Integer,
|
17
|
+
:float => Float,
|
18
|
+
:text => String
|
19
|
+
}
|
20
|
+
|
21
|
+
|
10
22
|
end
|
data/lib/statlysis/cron.rb
CHANGED
@@ -29,10 +29,8 @@ module Statlysis
|
|
29
29
|
def is_activerecord?; @source_type == :activerecord; end
|
30
30
|
def is_mongoid?; @source_type == :mongoid; end
|
31
31
|
def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
|
32
|
+
def _source; cron.multiple_dataset.sources.first end
|
32
33
|
|
33
|
-
def _source
|
34
|
-
cron.multiple_dataset.sources.first
|
35
|
-
end
|
36
34
|
def source_where_array
|
37
35
|
# TODO follow index seq
|
38
36
|
a = _source.where("").where_values.map do |equality|
|
@@ -64,8 +62,6 @@ module Statlysis
|
|
64
62
|
TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
|
65
63
|
end
|
66
64
|
|
67
|
-
protected
|
68
|
-
|
69
65
|
# 兼容采用整数类型作时间字段
|
70
66
|
def is_time_column_integer?
|
71
67
|
if is_activerecord?
|
@@ -74,11 +70,13 @@ module Statlysis
|
|
74
70
|
false
|
75
71
|
end
|
76
72
|
end
|
73
|
+
def time_column?; !!@time_column end
|
74
|
+
def group_by_columns?; !!@group_by_columns.any? end
|
77
75
|
|
78
76
|
end
|
79
77
|
|
80
78
|
end
|
81
79
|
|
82
80
|
|
83
|
-
require 'statlysis/cron/
|
81
|
+
require 'statlysis/cron/timely'
|
84
82
|
require 'statlysis/cron/top'
|
@@ -0,0 +1,171 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
class Timely < Cron
|
5
|
+
SqlColumns = [:sum_columns, :group_by_columns, :group_concat_columns]
|
6
|
+
attr_reader(*SqlColumns)
|
7
|
+
|
8
|
+
def initialize source, opts = {}
|
9
|
+
super
|
10
|
+
Statlysis.check_set_database
|
11
|
+
SqlColumns.each {|sym| instance_variable_set "@#{sym}", (opts[sym] || []) }
|
12
|
+
cron.setup_stat_model
|
13
|
+
cron
|
14
|
+
end
|
15
|
+
|
16
|
+
# 设置数据源,并保存结果入数据库
|
17
|
+
def run
|
18
|
+
(logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
|
19
|
+
|
20
|
+
raise "cron.output has no Enumerable" if not cron.output.class.included_modules.include? Enumerable
|
21
|
+
|
22
|
+
num_i = 0; num_add = 999
|
23
|
+
Statlysis.sequel.transaction do
|
24
|
+
# delete first in range
|
25
|
+
cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete if cron.time_column?
|
26
|
+
|
27
|
+
# TODO partial delete
|
28
|
+
cron.stat_model.where("").delete if cron.group_by_columns?
|
29
|
+
|
30
|
+
while !(_a = cron.output[num_i..(num_i+num_add)]).blank? do
|
31
|
+
# batch insert all
|
32
|
+
cron.stat_model.multi_insert _a
|
33
|
+
num_i += (num_add + 1)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
return self
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
def setup_stat_model
|
42
|
+
cron.stat_table_name = Utils.normalise_name cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array, cron.group_by_columns.map {|i| i[:column_name] }, TimeUnitToTableSuffixHash[cron.time_unit]
|
43
|
+
raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
|
44
|
+
|
45
|
+
|
46
|
+
# create basic unchangeable table structure
|
47
|
+
if not Statlysis.sequel.table_exists?(cron.stat_table_name)
|
48
|
+
Statlysis.sequel.transaction do
|
49
|
+
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
|
50
|
+
primary_key :id # Add one column at least in this block to avoid `SQLite3::SQLException: near ")": syntax error (Sequel::DatabaseError)`
|
51
|
+
end
|
52
|
+
Statlysis.sequel.add_column cron.stat_table_name, :t, DateTime if cron.time_column? # alias for :time
|
53
|
+
|
54
|
+
# add count columns
|
55
|
+
if cron.time_column?
|
56
|
+
count_columns = [:timely_c, :totally_c] # alias for :count
|
57
|
+
count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
|
58
|
+
else
|
59
|
+
Statlysis.sequel.add_column cron.stat_table_name, :c, Integer # alias for :count
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
end
|
64
|
+
# add group_by columns & indexes
|
65
|
+
remodel
|
66
|
+
cron.stat_model.cron = cron
|
67
|
+
if cron.group_by_columns.any?
|
68
|
+
cron.group_by_columns.each do |_h|
|
69
|
+
if not cron.stat_model.columns.include?(_h[:column_name])
|
70
|
+
_h[:type] = SymbolToClassInDataType[_h[:type]] if _h[:type].is_a?(Symbol) # && (Statlysis.sequel.opts[:adapter] == :sqlite)
|
71
|
+
Statlysis.sequel.add_column cron.stat_table_name, _h[:column_name], _h[:type]
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
# add sum columns
|
77
|
+
remodel
|
78
|
+
sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
|
79
|
+
_result_cols.each do |_result_col|
|
80
|
+
if not cron.stat_model.columns.include?(_result_col)
|
81
|
+
# convert to Integer type in view if needed
|
82
|
+
Statlysis.sequel.add_column cron.stat_table_name, _result_col, Float
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
# Fix there should be uniq index name between tables
|
88
|
+
# `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
|
89
|
+
_group_by_columns_index_name = cron.group_by_columns.reject {|i| i[:no_index] }.map {|i| i[:column_name] }
|
90
|
+
_truncated_columns = _group_by_columns_index_name.dup # only String column
|
91
|
+
_group_by_columns_index_name = _group_by_columns_index_name.unshift :t if cron.time_column?
|
92
|
+
# TODO use https://github.com/german/redis_orm to support full string indexes
|
93
|
+
if !Statlysis.config.is_skip_database_index && _group_by_columns_index_name.any?
|
94
|
+
mysql_per_column_length_limit_in_one_index = (1000 / 3.0 / _group_by_columns_index_name.size.to_f).to_i
|
95
|
+
index_columns_str = _group_by_columns_index_name.map {|s| _truncated_columns.include?(s) ? "#{s.to_s}(#{mysql_per_column_length_limit_in_one_index})" : s.to_s }.join(", ")
|
96
|
+
index_columns_str = "(#{index_columns_str})"
|
97
|
+
begin
|
98
|
+
# NOTE mysql indexes key length limit is 1000 bytes
|
99
|
+
cron.stat_model.dataset.with_sql("CREATE INDEX #{Utils.sha1_name(_group_by_columns_index_name)} ON #{cron.stat_table_name} #{index_columns_str};").to_a
|
100
|
+
rescue => e
|
101
|
+
raise e if not e.inspect.match(/exists|duplicate/i)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
# add group_concat column
|
106
|
+
remodel
|
107
|
+
if cron.group_concat_columns.any? && !cron.stat_model.columns.include?(:other_json)
|
108
|
+
Statlysis.sequel.add_column cron.stat_table_name, :other_json, :text
|
109
|
+
end
|
110
|
+
|
111
|
+
# add access to group_concat values in other_json
|
112
|
+
remodel.class_eval do
|
113
|
+
define_method("other_json_hash") do
|
114
|
+
@__other_json_hash_cache ||= (JSON.parse(self.other_json) rescue {})
|
115
|
+
end
|
116
|
+
cron.group_concat_columns.each do |_group_concat_column|
|
117
|
+
define_method("#{_group_concat_column}_values") do
|
118
|
+
self.other_json_hash[_group_concat_column.to_s]
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
remodel
|
124
|
+
end
|
125
|
+
|
126
|
+
def output
|
127
|
+
@output ||= (cron.group_by_columns.any? ? multiple_dimensions_output : one_dimension_output)
|
128
|
+
end
|
129
|
+
|
130
|
+
protected
|
131
|
+
def unit_range_query time, time_begin = nil
|
132
|
+
# time begin and end
|
133
|
+
tb = time
|
134
|
+
te = (time+1.send(cron.time_unit)-1.second)
|
135
|
+
tb, te = tb.to_i, te.to_i if is_time_column_integer?
|
136
|
+
tb = time_begin || tb
|
137
|
+
return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
|
138
|
+
return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
|
139
|
+
end
|
140
|
+
|
141
|
+
# e.g. {:fav_count=>[:timely_favcount_s, :totally_favcount_s]}
|
142
|
+
def sum_column_to_result_columns_hash
|
143
|
+
cron.sum_columns.inject({}) do |h, _col|
|
144
|
+
[:timely, :totally].each do |_pre|
|
145
|
+
h[_col] ||= []
|
146
|
+
h[_col] << Utils.normalise_name(_pre, _col, 's').to_sym
|
147
|
+
end
|
148
|
+
h
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
private
|
153
|
+
def remodel
|
154
|
+
n = cron.stat_table_name.to_s.singularize.camelize
|
155
|
+
cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
|
156
|
+
class ::#{n} < Sequel::Model;
|
157
|
+
self.set_dataset :#{cron.stat_table_name}
|
158
|
+
|
159
|
+
cattr_accessor :cron
|
160
|
+
end
|
161
|
+
#{n}
|
162
|
+
MODEL
|
163
|
+
end
|
164
|
+
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
|
169
|
+
|
170
|
+
require 'statlysis/cron/timely/one_dimension'
|
171
|
+
require 'statlysis/cron/timely/multiple_dimensions'
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
class Timely
|
5
|
+
|
6
|
+
|
7
|
+
def multiple_dimensions_output
|
8
|
+
self.send "multiple_dimensions_output_with#{cron.time_column ? '' : 'out'}_time_column"
|
9
|
+
end
|
10
|
+
|
11
|
+
private
|
12
|
+
def multiple_dimensions_output_with_time_column
|
13
|
+
cron.time_range.map do |time|
|
14
|
+
raise DefaultNotImplementWrongMessage # TODO
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# TODO encapsulate Mongoid MapReduce in collection output mode
|
19
|
+
# TODO support large dataset, e.g. a million.
|
20
|
+
def multiple_dimensions_output_without_time_column
|
21
|
+
mr = Javascript::MultiDimensionalCount.new(cron)
|
22
|
+
|
23
|
+
array = []
|
24
|
+
cron.multiple_dataset.sources.each do |_source|
|
25
|
+
# _source = _source.time_range # TODO
|
26
|
+
array += _source.map_reduce(mr.map_func, mr.reduce_func)
|
27
|
+
.out(inline: 1) # TODO use replace mode
|
28
|
+
.to_a.map do |i|
|
29
|
+
v = i['value']
|
30
|
+
_h = {:c => v['count']}
|
31
|
+
|
32
|
+
cron.group_by_columns.each do |_group_by_column|
|
33
|
+
_h[_group_by_column[:column_name]] = v[_group_by_column[:column_name].to_s]
|
34
|
+
end
|
35
|
+
|
36
|
+
_h[:other_json] = {}
|
37
|
+
cron.group_concat_columns.each do |_group_concat_column|
|
38
|
+
_h[:other_json][_group_concat_column] = v["#{_group_concat_column}_values"].inject({}) {|_h2, i2| _h2[i2] ||= 0; _h2[i2] += 1; _h2 }
|
39
|
+
end
|
40
|
+
_h[:other_json] = _h[:other_json].to_json
|
41
|
+
|
42
|
+
_h
|
43
|
+
end
|
44
|
+
end
|
45
|
+
array
|
46
|
+
|
47
|
+
# TODO support sum_columns
|
48
|
+
end
|
49
|
+
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
|
4
|
+
class Timely
|
5
|
+
|
6
|
+
|
7
|
+
# one dimension **must** have `time_column`, or there's nothing to do
|
8
|
+
#
|
9
|
+
# TODO add to FAQ
|
10
|
+
# * if you want to statistics one column through `group_by_columns`
|
11
|
+
# params, and dont need time column, then you could use `always` DSL.
|
12
|
+
#
|
13
|
+
def one_dimension_output
|
14
|
+
cron.time_range.map do |time|
|
15
|
+
_hash = {:t => time, :timely_c => 0, :totally_c => 0}
|
16
|
+
sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
|
17
|
+
_result_cols.each do |_result_col|
|
18
|
+
_hash[_result_col] = 0.0
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# support multiple data sources
|
23
|
+
_first_source = nil
|
24
|
+
cron.multiple_dataset.sources.each do |s|
|
25
|
+
_t = DateTime1970
|
26
|
+
_t = is_time_column_integer? ? _t.to_i : _t
|
27
|
+
|
28
|
+
_scope_one = s.where(unit_range_query(time))
|
29
|
+
# TODO cache pre-result
|
30
|
+
_scope_all = s.where(unit_range_query(time, _t))
|
31
|
+
|
32
|
+
# 1. count
|
33
|
+
_hash[:timely_c] += _scope_one.count
|
34
|
+
_hash[:totally_c] += _scope_all.count
|
35
|
+
|
36
|
+
# 2. sum
|
37
|
+
sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
|
38
|
+
_hash[_result_cols[0]] = _scope_one.map(&_sum_col).reduce(:+).to_f
|
39
|
+
_hash[_result_cols[1]] = _scope_all.map(&_sum_col).reduce(:+).to_f
|
40
|
+
end
|
41
|
+
|
42
|
+
# 3. group_concat
|
43
|
+
_other_json = {}
|
44
|
+
_other_json[:group_concat_columns] ||= {}
|
45
|
+
cron.group_concat_columns.each do |_group_concat_column|
|
46
|
+
_other_json[:group_concat_columns][_group_concat_column] = _scope_one.map(&_group_concat_column).uniq
|
47
|
+
end
|
48
|
+
_hash[:other_json] = _other_json.to_json
|
49
|
+
|
50
|
+
_first_source ||= s.where(unit_range_query(time))
|
51
|
+
end
|
52
|
+
logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{_hash[:timely_c]} totally_c:#{_hash[:totally_c]}" if ENV['DEBUG']
|
53
|
+
|
54
|
+
_hash
|
55
|
+
end.select {|r1| r1.except(:t, :other_json).values.reject {|r2| r2.zero? }.any? }
|
56
|
+
end
|
57
|
+
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|
data/lib/statlysis/cron_set.rb
CHANGED
@@ -10,13 +10,14 @@ module Statlysis
|
|
10
10
|
when Fixnum, Integer # support array idx access
|
11
11
|
self.to_a[pattern]
|
12
12
|
else
|
13
|
-
CronSet.new(select do |
|
14
|
-
|
13
|
+
CronSet.new(self.select do |cron|
|
14
|
+
reg = Regexp.new(pattern.to_s)
|
15
|
+
cron.stat_table_name.match(reg) || cron.multiple_dataset.name.to_s.match(reg)
|
15
16
|
end)
|
16
17
|
end
|
17
18
|
end
|
18
19
|
|
19
|
-
def last; [-1]; end
|
20
|
+
def last; self[-1]; end
|
20
21
|
|
21
22
|
def run
|
22
23
|
map(&:run)
|
@@ -4,34 +4,69 @@ module Statlysis
|
|
4
4
|
module Javascript
|
5
5
|
class MultiDimensionalCount
|
6
6
|
attr_reader :map_func, :reduce_func
|
7
|
+
attr_reader :cron
|
7
8
|
|
8
|
-
def initialize
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
9
|
+
def initialize cron
|
10
|
+
@cron = cron
|
11
|
+
|
12
|
+
# setup group_by_columns
|
13
|
+
_group_by_columns = :_id if cron.group_by_columns.blank?
|
14
|
+
_group_by_columns ||= cron.group_by_columns.map {|i| i[:column_name] }
|
15
|
+
emit_key = _group_by_columns.map {|dc| "#{dc}: this.#{dc}" }.join(", ")
|
16
|
+
emit_key = "{#{emit_key}}"
|
17
|
+
|
18
|
+
# TODO setup sum_columns
|
19
|
+
# default_emit_values_array += cron.sum_columns.map {|_sum_column| "#{_sum_column}: this.#{_sum_column}" }
|
20
|
+
|
21
|
+
# setup group_concat_columns
|
22
|
+
# NOTE if only one uniq emit value, then it'll never be appeared in reduce function
|
23
|
+
emit_values_init_array = cron.group_concat_columns.map do |_group_concat_column|
|
24
|
+
"emit_value.#{_group_concat_column}_values = [this.#{_group_concat_column}];\n"
|
18
25
|
end
|
26
|
+
emit_values_init_array += (_group_by_columns.map do |_group_by_column|
|
27
|
+
"emit_value.#{_group_by_column} = this.#{_group_by_column};\n"
|
28
|
+
end)
|
19
29
|
|
20
30
|
@map_func = "function() {
|
21
|
-
|
31
|
+
var emit_value = {count: 1};
|
32
|
+
#{emit_values_init_array.join}
|
33
|
+
|
34
|
+
emit (#{emit_key}, emit_value);
|
22
35
|
}"
|
23
36
|
|
37
|
+
# sum_init_values = cron.sum_columns.map {|_sum_column| "#{_sum_column} = 0.0" }
|
38
|
+
# sum_init_values = "var #{sum_init_values};" if cron.sum_columns.any?
|
39
|
+
|
40
|
+
# 如果使用Hash,将导致group_concat最终的数目和group_by数目不一致,因为多个任务并行时会导致覆盖(常见于个数多的分类,一个的则不会有这个问题),而可并行化的数组则不会。
|
41
|
+
group_concat_values_init_array = cron.group_concat_columns.map {|_group_concat_column| "reducedObject.#{_group_concat_column}_values = [];" }
|
42
|
+
group_concat_values_process_array = cron.group_concat_columns.map do |_group_concat_column|
|
43
|
+
"reducedObject.#{_group_concat_column}_values = reducedObject.#{_group_concat_column}_values.concat(v['#{_group_concat_column}_values']);\n"
|
44
|
+
end
|
45
|
+
group_by_values_process_array = _group_by_columns.map do |_group_by_column|
|
46
|
+
"reducedObject.#{_group_by_column} = v.#{_group_by_column};\n"
|
47
|
+
end
|
48
|
+
|
49
|
+
# emit value in map func should be the same structure as the
|
50
|
+
# return value in reduce func, see more details in
|
51
|
+
# http://rickosborne.org/download/SQL-to-MongoDB.pdf and
|
52
|
+
# http://docs.mongodb.org/manual/tutorial/perform-incremental-map-reduce/
|
24
53
|
@reduce_func = "function(key, values) {
|
25
|
-
var
|
54
|
+
var reducedObject = key;
|
55
|
+
reducedObject.count = 0;
|
56
|
+
#{group_concat_values_init_array.join}
|
26
57
|
|
27
58
|
values.forEach(function(v) {
|
28
|
-
count += v['count'];
|
59
|
+
reducedObject.count += v['count'];
|
60
|
+
#{group_by_values_process_array.join}
|
61
|
+
#{group_concat_values_process_array.join}
|
29
62
|
});
|
30
63
|
|
31
|
-
return
|
64
|
+
return reducedObject;
|
32
65
|
}"
|
33
|
-
|
66
|
+
|
67
|
+
return self
|
34
68
|
end
|
69
|
+
|
35
70
|
end
|
36
71
|
end
|
37
72
|
end
|
data/lib/statlysis/map_reduce.rb
CHANGED
data/lib/statlysis/utils.rb
CHANGED
@@ -35,6 +35,12 @@ module Statlysis
|
|
35
35
|
{:table => tn, :model => str.constantize}
|
36
36
|
end
|
37
37
|
|
38
|
+
def normalise_name *name
|
39
|
+
Array(name).flatten.compact.map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
|
40
|
+
end
|
41
|
+
|
42
|
+
def sha1_name name; Digest::SHA1.hexdigest Array(name).map(&:to_s).join end
|
43
|
+
|
38
44
|
end
|
39
45
|
end
|
40
46
|
end
|
data/statlysis.gemspec
CHANGED
@@ -4,13 +4,13 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = 'statlysis'
|
7
|
-
s.version = '0.0.
|
8
|
-
s.date = '2013-
|
7
|
+
s.version = '0.0.3'
|
8
|
+
s.date = '2013-12-03'
|
9
9
|
s.summary = File.read("README.markdown").split(/===+/)[1].strip.split("\n")[0]
|
10
10
|
s.description = s.summary
|
11
11
|
s.authors = ["David Chen"]
|
12
12
|
s.email = 'mvjome@gmail.com'
|
13
|
-
s.homepage = 'https://github.com/
|
13
|
+
s.homepage = 'https://github.com/SunshineLibrary/statlysis'
|
14
14
|
s.license = 'MIT'
|
15
15
|
|
16
16
|
s.files = `git ls-files`.split("\n")
|
data/test/config/database.yml
CHANGED
data/test/helper.rb
CHANGED
@@ -12,6 +12,7 @@ require 'test/unit'
|
|
12
12
|
|
13
13
|
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
|
14
14
|
$LOAD_PATH.unshift File.dirname(__FILE__) # test dirs
|
15
|
+
require 'pry-debugger'
|
15
16
|
|
16
17
|
# load mongoid setup
|
17
18
|
require 'mongoid'
|
@@ -22,10 +23,11 @@ require 'statlysis'
|
|
22
23
|
|
23
24
|
# load rails
|
24
25
|
def Rails.root; Pathname.new(File.expand_path('../.', __FILE__)) end
|
26
|
+
def Rails.env; 'development' end
|
25
27
|
require 'sqlite3'
|
26
28
|
|
27
29
|
# load ActiveRecord setup
|
28
|
-
Statlysis.set_database :
|
30
|
+
Statlysis.set_database ":memory:"
|
29
31
|
Statlysis.config.is_skip_database_index = true
|
30
32
|
ActiveRecord::Base.establish_connection(Statlysis.config.database_opts.merge("adapter" => "sqlite3"))
|
31
33
|
Dir[File.expand_path("../migrate/*.rb", __FILE__).to_s].each { |f| require f }
|
@@ -35,13 +37,20 @@ Dir[File.expand_path("../models/*.rb", __FILE__).to_s].each { |f| require f }
|
|
35
37
|
# copied from http://stackoverflow.com/questions/4410794/ruby-on-rails-import-data-from-a-csv-file/4410880#4410880
|
36
38
|
require 'csv'
|
37
39
|
csv = CSV.parse(File.read(File.expand_path('../data/code_gists_20130724.csv', __FILE__)), :headers => true) # data from code.eoe.cn
|
38
|
-
csv.each
|
40
|
+
csv.each do |row|
|
41
|
+
_h = row.to_hash.merge(:fav_count => rand(5).to_i)
|
42
|
+
CodeGist.create! _h
|
43
|
+
_h[:category_id] = rand(10).to_i + 1
|
44
|
+
CodeGistMongoid.create! _h
|
45
|
+
end
|
39
46
|
|
40
47
|
|
41
48
|
Statlysis.setup do
|
42
49
|
hourly EoeLog, :time_column => :t
|
43
50
|
|
44
|
-
daily CodeGist
|
51
|
+
daily CodeGist, :sum_columns => [:fav_count], :group_concat_columns => [:user_id]
|
52
|
+
always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}], :group_concat_columns => [:user_id]
|
53
|
+
always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}, {:column_name => :category_id, :type => :integer}], :group_concat_columns => [:user_id]
|
45
54
|
|
46
55
|
[EoeLog,
|
47
56
|
EoeLog.where(:do => 3),
|
@@ -50,6 +59,8 @@ Statlysis.setup do
|
|
50
59
|
].each do |s|
|
51
60
|
daily s, :time_column => :t
|
52
61
|
end
|
53
|
-
|
62
|
+
cron1 = Statlysis.daily['mul'][1]
|
63
|
+
cron2 = Statlysis.daily['cod'][0]
|
64
|
+
cron3 = Statlysis.always['code']['mongoid'][0]
|
54
65
|
require 'pry-debugger';binding.pry
|
55
66
|
end
|
data/test/models/code_gist.rb
CHANGED
@@ -3,3 +3,15 @@
|
|
3
3
|
class CodeGist < ActiveRecord::Base
|
4
4
|
|
5
5
|
end
|
6
|
+
|
7
|
+
|
8
|
+
class CodeGistMongoid
|
9
|
+
include Mongoid::Document
|
10
|
+
include Mongoid::Timestamps
|
11
|
+
field :id, :type => Integer
|
12
|
+
field :description, :type => String
|
13
|
+
field :user_id, :type => Integer
|
14
|
+
field :author, :type => String
|
15
|
+
field :fav_count, :type => Integer
|
16
|
+
field :category_id, :type => Integer
|
17
|
+
end
|
data/test/models/eoe_log.rb
CHANGED
@@ -43,10 +43,8 @@ EoeLog.create
|
|
43
43
|
|
44
44
|
collection_class = collection_class_name.constantize
|
45
45
|
t = Time.zone.parse(date_str)
|
46
|
-
1.
|
47
|
-
|
48
|
-
collection_class.create :t => (t.to_time+rand(60*60*24-1)).to_datetime, :url => '/'
|
49
|
-
end
|
46
|
+
values = (1..day).map {|i| (t.to_time+rand(60*60*24-1)).to_datetime }.sort.map {|i| {:t => i, :url => '/' } }
|
47
|
+
collection_class.create values
|
50
48
|
|
51
49
|
collection_class.count
|
52
50
|
end
|
data/test/test_daily_count.rb
CHANGED
@@ -10,13 +10,15 @@ class TestDailyCount < Test::Unit::TestCase
|
|
10
10
|
def test_timely
|
11
11
|
o = @output.map {|i| i[:timely_c] }
|
12
12
|
r = (o - [5,11,0,1,8,2,3,4,16,10,26,13,7,9,20,15,30,33,14,6,12,17,19,59,65,84,62,114,69,52,61,67,154,70]).reject(&:zero?).blank?
|
13
|
-
|
13
|
+
assert r
|
14
14
|
end
|
15
15
|
|
16
16
|
def test_totally
|
17
17
|
o = @output.map {|i| i[:totally_c] }
|
18
18
|
r = (o - [5,16,17,25,27,30,34,36,37,53,55,56,57,59,60,64,66,67,68,70,71,73,74,75,80,90,116,129,136,145,165,185,200,230,234,235,236,237,270,273,274,288,299,304,305,312,327,337,345,359,374,380,392,418,435,446,452,463,466,473,493,506,512,520,525,545,549,553,558,577,636,701,785,805,867,981,1050,1102,1163,1230,1384,1454,1455,1457,1458]).reject(&:zero?).blank?
|
19
|
-
|
19
|
+
assert r
|
20
|
+
assert_equal @output[-1][:totally_favcount_s].to_i, CodeGist.all.map(&:fav_count).reduce(:+)
|
20
21
|
end
|
21
22
|
|
23
|
+
|
22
24
|
end
|
data/test/test_mapreduce.rb
CHANGED
@@ -6,7 +6,14 @@ class TestMapReduce < Test::Unit::TestCase
|
|
6
6
|
def setup
|
7
7
|
end
|
8
8
|
|
9
|
-
def
|
9
|
+
def test_multiple_dimensions_output_without_time_column
|
10
|
+
cron = Statlysis.always['mongoid']['code'][0]
|
11
|
+
assert_equal cron.time_column, false
|
12
|
+
assert_equal cron.time_unit, false
|
13
|
+
assert_equal cron.stat_table_name, 'timely_codegistmongoids_author_a'
|
14
|
+
|
15
|
+
cron.run
|
16
|
+
assert_equal cron.output.detect {|h| h[:author] == 'mvj3' }[:c].to_i, cron.multiple_dataset.sources.first.where(:author => 'mvj3').count
|
10
17
|
end
|
11
18
|
|
12
19
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statlysis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-12-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -269,9 +269,9 @@ files:
|
|
269
269
|
- lib/statlysis/configuration.rb
|
270
270
|
- lib/statlysis/constants.rb
|
271
271
|
- lib/statlysis/cron.rb
|
272
|
-
- lib/statlysis/cron/
|
273
|
-
- lib/statlysis/cron/
|
274
|
-
- lib/statlysis/cron/
|
272
|
+
- lib/statlysis/cron/timely.rb
|
273
|
+
- lib/statlysis/cron/timely/multiple_dimensions.rb
|
274
|
+
- lib/statlysis/cron/timely/one_dimension.rb
|
275
275
|
- lib/statlysis/cron/top.rb
|
276
276
|
- lib/statlysis/cron/top/hotest_items.rb
|
277
277
|
- lib/statlysis/cron/top/lastest_visits.rb
|
@@ -303,7 +303,7 @@ files:
|
|
303
303
|
- test/test_single_log_in_multiple_collections.rb
|
304
304
|
- test/test_statlysis.rb
|
305
305
|
- test/test_timeseries.rb
|
306
|
-
homepage: https://github.com/
|
306
|
+
homepage: https://github.com/SunshineLibrary/statlysis
|
307
307
|
licenses:
|
308
308
|
- MIT
|
309
309
|
post_install_message:
|
@@ -318,7 +318,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
318
318
|
version: '0'
|
319
319
|
segments:
|
320
320
|
- 0
|
321
|
-
hash:
|
321
|
+
hash: -1643509325996557122
|
322
322
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
323
323
|
none: false
|
324
324
|
requirements:
|
@@ -327,7 +327,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
327
327
|
version: '0'
|
328
328
|
segments:
|
329
329
|
- 0
|
330
|
-
hash:
|
330
|
+
hash: -1643509325996557122
|
331
331
|
requirements: []
|
332
332
|
rubyforge_project:
|
333
333
|
rubygems_version: 1.8.23
|
data/lib/statlysis/cron/count.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
module Statlysis
|
4
|
-
class Count < Cron
|
5
|
-
def initialize source, opts = {}
|
6
|
-
super
|
7
|
-
Statlysis.check_set_database
|
8
|
-
cron.setup_stat_model
|
9
|
-
cron
|
10
|
-
end
|
11
|
-
|
12
|
-
# 设置数据源,并保存结果入数据库
|
13
|
-
def run
|
14
|
-
(logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
|
15
|
-
# delete first in range
|
16
|
-
@output = cron.output
|
17
|
-
unless @output.any?
|
18
|
-
logger.info "没有数据"; return
|
19
|
-
end
|
20
|
-
num_i = 0; num_add = 999
|
21
|
-
Statlysis.sequel.transaction do
|
22
|
-
cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
|
23
|
-
while !(_a = @output[num_i..(num_i+num_add)]).blank? do
|
24
|
-
# batch insert all
|
25
|
-
cron.stat_model.multi_insert _a
|
26
|
-
num_i += (num_add + 1)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
return self
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
protected
|
35
|
-
def unit_range_query time, time_begin = nil
|
36
|
-
# time begin and end
|
37
|
-
tb = time
|
38
|
-
te = (time+1.send(cron.time_unit)-1.second)
|
39
|
-
tb, te = tb.to_i, te.to_i if is_time_column_integer?
|
40
|
-
tb = time_begin || tb
|
41
|
-
return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
|
42
|
-
return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
|
43
|
-
end
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
end
|
48
|
-
|
49
|
-
|
50
|
-
require 'statlysis/cron/count/timely'
|
51
|
-
require 'statlysis/cron/count/dimensions'
|
@@ -1,63 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
module Statlysis
|
4
|
-
class Timely < Count
|
5
|
-
def setup_stat_model
|
6
|
-
cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
|
7
|
-
raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
|
8
|
-
|
9
|
-
if not Statlysis.sequel.table_exists?(cron.stat_table_name)
|
10
|
-
Statlysis.sequel.transaction do
|
11
|
-
Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
|
12
|
-
DateTime :t # alias for :time
|
13
|
-
end
|
14
|
-
|
15
|
-
# TODO Add cron.source_where_array before count_columns
|
16
|
-
count_columns = [:timely_c, :totally_c] # alias for :count
|
17
|
-
count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
|
18
|
-
index_column_names = [:t] + count_columns
|
19
|
-
index_column_names_name = index_column_names.join("_")
|
20
|
-
index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
|
21
|
-
|
22
|
-
# Fix there should be uniq index name between tables
|
23
|
-
# `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
|
24
|
-
if not Statlysis.config.is_skip_database_index
|
25
|
-
Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
n = cron.stat_table_name.to_s.singularize.camelize
|
31
|
-
cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
|
32
|
-
class ::#{n} < Sequel::Model;
|
33
|
-
self.set_dataset :#{cron.stat_table_name}
|
34
|
-
end
|
35
|
-
#{n}
|
36
|
-
MODEL
|
37
|
-
end
|
38
|
-
|
39
|
-
def output
|
40
|
-
@output ||= (cron.time_range.map do |time|
|
41
|
-
timely_c = 0
|
42
|
-
totally_c = 0
|
43
|
-
# support multiple data sources
|
44
|
-
_first_source = nil
|
45
|
-
cron.multiple_dataset.sources.each do |s|
|
46
|
-
timely_c += s.where(unit_range_query(time)).count
|
47
|
-
_t = DateTime1970
|
48
|
-
_t = is_time_column_integer? ? _t.to_i : _t
|
49
|
-
totally_c += s.where(unit_range_query(time, _t)).count
|
50
|
-
_first_source ||= s.where(unit_range_query(time))
|
51
|
-
end
|
52
|
-
logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
|
53
|
-
|
54
|
-
if timely_c.zero? && totally_c.zero?
|
55
|
-
nil
|
56
|
-
else
|
57
|
-
{:t => time, :timely_c => timely_c, :totally_c => totally_c}
|
58
|
-
end
|
59
|
-
end.compact)
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
end
|