statlysis 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,11 +10,13 @@ Usage
10
10
  Statlysis.setup do
11
11
  set_database :statlysis
12
12
 
13
- hourly :time_column => :t
13
+ daily CodeGist
14
+ hourly EoeLog, :time_column => :t # support custom time_column
15
+
14
16
  [EoeLog,
15
- EoeLog.where(:ui => 0),
17
+ EoeLog.where(:ui => 0), # support query scope
16
18
  EoeLog.where(:ui => {"$ne" => 0}),
17
- Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}),
19
+ Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}), # support collection name regexp
18
20
  EoeLog.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}),
19
21
  ].each do |s|
20
22
  daily s, :time_column => :t
@@ -45,8 +47,6 @@ TODO
45
47
  * Admin interface
46
48
  * statistical query api in Ruby and HTTP
47
49
  * Interacting with Javascript charting library, e.g. Highcharts, D3.
48
- * More tests
49
- * Add @criteria to MultipleDataset
50
50
 
51
51
 
52
52
  Statistical Process
@@ -68,6 +68,11 @@ Q: Why do you recommend using multiple collections to store logs rather than a s
68
68
  A: MongoDB can effectively reuse space freed by removing entire collections without leading to data fragmentation, see details at http://docs.mongodb.org/manual/use-cases/storing-log-data/#multiple-collections-single-database
69
69
 
70
70
 
71
+ Q: In MongoDB, why use MapReduce instead of Aggregation?
72
+
73
+ A: The result of an aggregation pipeline is a document and is therefore subject to the BSON document size limit, which is currently 16 megabytes. See more details at http://docs.mongodb.org/manual/core/aggregation-pipeline/#pipeline
74
+
75
+
71
76
  Copyright
72
77
  -----------------------------------------------
73
78
  MIT. David Chen at eoe.cn.
@@ -91,3 +96,11 @@ Related
91
96
 
92
97
  ### Admin interface
93
98
  * http://three.kibana.org/ browser based analytics and search interface to Logstash and other timestamped data sets stored in ElasticSearch.
99
+
100
+
101
+ ### ETL
102
+ * https://github.com/activewarehouse/activewarehouse-etl/
103
+ * http://jisraelsen.github.io/drudgery/ A Ruby ETL DSL that supports CSV, SQLite3, and ActiveRecord, but has no time-range support.
104
+ * https://github.com/square/ETL Simply encapsulates the SQL procedures
105
+
106
+
@@ -20,34 +20,33 @@ require 'activerecord_idnamecache'
20
20
  module Rails; end
21
21
 
22
22
  require 'statlysis/constants'
23
+ require 'statlysis/utils'
24
+ require 'statlysis/configuration'
25
+ require 'statlysis/common'
23
26
 
24
27
  module Statlysis
25
28
  class << self
26
29
  def setup &blk
27
30
  raise "Need to setup proc" if not blk
28
31
 
29
- logger.info "Start to setup Statlysis"
32
+ logger.info "Start to setup Statlysis" if ENV['DEBUG']
30
33
  time_log do
31
34
  self.config.instance_exec(&blk)
32
35
  end
33
- logger.info
34
36
  end
35
37
 
36
38
  def time_log text = nil
37
39
  t = Time.now
38
40
  logger.info text if text
39
41
  yield if block_given?
40
- logger.info "Time spend #{(Time.now - t).round(2)} seconds."
41
- logger.info "-" * 42
42
+ logger.info "Time spend #{(Time.now - t).round(2)} seconds." if ENV['DEBUG']
43
+ logger.info "-" * 42 if ENV['DEBUG']
42
44
  end
43
45
 
44
46
  # delegate config methods to Configuration
45
47
  def config; Configuration.instance end
46
48
  require 'active_support/core_ext/module/delegation.rb'
47
- [:sequel, :set_database, :check_set_database,
48
- :default_time_zone,
49
- :set_tablename_default_pre, :tablename_default_pre
50
- ].each do |sym|
49
+ Configuration::DelegateMethods.each do |sym|
51
50
  delegate sym, :to => :config
52
51
  end
53
52
 
@@ -56,18 +55,17 @@ module Statlysis
56
55
 
57
56
  def source_to_database_type; @_source_to_database_type ||= {} end
58
57
 
59
-
58
+ # Proxy access to the crons of each time type
60
59
  def daily; CronSet.new(Statlysis.config.day_crons) end
61
60
  def hourly; CronSet.new(Statlysis.config.hour_crons) end
61
+ def always; CronSet.new(Statlysis.config.always_crons) end
62
62
 
63
63
  end
64
64
 
65
65
  end
66
66
 
67
- require 'statlysis/utils'
68
- require 'statlysis/configuration'
69
- require 'statlysis/common'
70
67
  require 'statlysis/timeseries'
68
+ require 'statlysis/map_reduce'
71
69
  require 'statlysis/clock'
72
70
  require 'statlysis/rake'
73
71
  require 'statlysis/cron'
@@ -77,7 +75,7 @@ require 'statlysis/multiple_dataset'
77
75
 
78
76
  module Statlysis
79
77
  require 'short_inspect'
80
- ShortInspect.apply_to Cron, CronSet, MultipleDataset
78
+ ShortInspect.apply_to Cron, MultipleDataset
81
79
  ShortInspect.apply_minimal_to ActiveRecord::Relation # lazy load
82
80
  end
83
81
 
@@ -8,6 +8,8 @@ module Statlysis
8
8
  # feature is a string
9
9
  def initialize feature, default_time
10
10
  raise "Please assign default_time params" if not default_time
11
+
12
+ # init table & model
11
13
  cron.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")
12
14
  unless Statlysis.sequel.table_exists?(cron.stat_table_name)
13
15
  Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts.merge(:engine => "InnoDB") do
@@ -19,6 +21,8 @@ module Statlysis
19
21
  end
20
22
  h = Utils.setup_pattern_table_and_model cron.stat_table_name
21
23
  cron.stat_model = h[:model]
24
+
25
+ # init default_time
22
26
  cron.clock = cron.stat_model.find_or_create(:feature => feature)
23
27
  cron.clock.update :t => default_time if cron.current.nil?
24
28
  cron
@@ -9,16 +9,21 @@ module Statlysis
9
9
  class Configuration
10
10
  include Singleton
11
11
 
12
+ # variables
12
13
  attr_accessor :sequel, :default_time_columns, :default_time_zone, :database_opts, :tablename_default_pre
13
14
  attr_accessor :is_skip_database_index
14
- TimeUnits.each {|unit| module_eval "attr_accessor :#{unit}_crons; self.instance.#{unit}_crons = []" }
15
- [:realtime, :similar, :hotest].each do |sym|
16
- sym = "#{sym}_crons"
17
- attr_accessor sym; self.instance.send "#{sym}=", []
15
+ (TimeUnits + %W[always] + [:realtime, :similar, :hotest]).each do |unit|
16
+ sym = "#{unit}_crons"; attr_accessor sym; self.instance.send "#{sym}=", []
18
17
  end
19
18
  self.instance.send "tablename_default_pre=", "st"
20
19
  self.instance.send "is_skip_database_index=", false
21
20
 
21
+ DelegateMethods = [
22
+ :sequel, :set_database, :check_set_database,
23
+ :default_time_zone,
24
+ :set_tablename_default_pre, :tablename_default_pre
25
+ ]
26
+
22
27
  # These time columns are stripped out when the statistics table name is assembled automatically
23
28
  def update_time_columns *columns
24
29
  self.default_time_columns ||= [:created_at, :updated_at]
@@ -26,43 +31,44 @@ module Statlysis
26
31
  self.default_time_columns = self.default_time_columns.uniq
27
32
  end
28
33
 
29
- def set_database sym_or_hash
30
- self.database_opts = if sym_or_hash.is_a? Symbol
31
- YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
32
- elsif Hash
33
- sym_or_hash
34
+ def set_database obj
35
+ self.database_opts = case obj
36
+ when Hash
37
+ obj
38
+ when Symbol, String
39
+ YAML.load_file(Rails.root.join("config/database.yml"))[Rails.env].merge('database' => obj.to_s)
40
+ else
41
+ raise "Statlysis#set_database only support symbol or hash params"
42
+ end
43
+
44
+ raise "database_opts should not be blank" if self.database_opts.blank?
45
+
46
+ # SQLite doesn't support creating a database the regular MySQL way
47
+ self.sequel = if (self.database_opts['adapter'].match(/sqlite/) && self.database_opts['database'].match(/\A:memory:\Z/)) # only for test envrionment
48
+ Sequel.sqlite
34
49
  else
35
- raise "Statlysis#set_database only support symbol or hash params"
50
+ # create database, copied from http://stackoverflow.com/a/14435522/595618
51
+ require 'mysql2'
52
+ mysql2_client = Mysql2::Client.new(self.database_opts.except('database'))
53
+ mysql2_client.query("CREATE DATABASE IF NOT EXISTS #{self.database_opts['database']}")
54
+ Sequel.connect(self.database_opts)
36
55
  end
37
- self.sequel = Sequel.connect(self.database_opts)
38
56
 
39
57
  # 初始化键值model
40
58
  ["#{self.tablename_default_pre}_single_kvs", "#{self.tablename_default_pre}_single_kv_histories"].each do |tn|
41
59
  Utils.setup_pattern_table_and_model tn
42
60
  end
43
- return self
44
- end
45
61
 
46
- def set_default_time_zone zone
47
- self.default_time_zone = zone
48
62
  return self
49
63
  end
50
64
 
51
- def set_tablename_default_pre str
52
- self.tablename_default_pre = str.to_s
53
- end
54
-
55
- def daily source, opts = {}; timely source, {:time_unit => :day }.merge(opts) end
56
- def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
57
-
65
+ def set_default_time_zone zone; self.default_time_zone = zone; return self; end
66
+ def set_tablename_default_pre str; self.tablename_default_pre = str.to_s; return self end
58
67
  def check_set_database; raise "Please setup database first" if sequel.nil? end
59
68
 
60
- def timely source, opts
61
- self.check_set_database
62
- opts.reverse_merge! :time_column => :created_at, :time_unit => :day
63
- t = Timely.new source, opts
64
- self.send("#{opts[:time_unit]}_crons").push t
65
- end
69
+ def daily source, opts = {}; timely source, {:time_unit => :day}.merge(opts) end
70
+ def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
71
+ def always source, opts = {}; timely source, {:time_unit => false, :time_column => false}.merge(opts) end # IMPORTANT set :time_unit to false
66
72
 
67
73
  # the real requirement is to compute the latest items grouped by special patterns, like user_id, url prefix, ...
68
74
  def lastest_visits source, opts
@@ -101,5 +107,27 @@ module Statlysis
101
107
  self.similar_crons.push Similar.new(model_name, _p)
102
108
  end
103
109
 
110
+
111
+ private
112
+ def timely source, opts
113
+ self.check_set_database
114
+
115
+ opts.reverse_merge! :time_column => :created_at,
116
+ :time_unit => :day,
117
+ :sum_columns => [],
118
+ :group_by_columns => [],
119
+ :group_concat_columns => []
120
+
121
+ opts.each {|k, v| opts[k] = v.map(&:to_sym) if (Timely::SqlColumns - [:group_by_columns]).include?(k) } # Sequel use symbol as column names
122
+
123
+ # e.g. convert [:user_id] to [{:column_name => :user_id, :type => :integer}]
124
+ if (opts[:group_by_columns].first || {})[:type].blank?
125
+ opts[:group_by_columns] = opts[:group_by_columns].map {|i| {:column_name => i.to_sym, :type => :integer} }
126
+ end
127
+
128
+ t = Timely.new source, opts
129
+ self.send("#{opts[:time_unit] || 'always'}_crons").push t
130
+ end
131
+
104
132
  end
105
133
  end
@@ -3,8 +3,20 @@
3
3
  module Statlysis
4
4
  TimeUnits = %w[hour day week month year]
5
5
  DateTime1970 = Time.zone.parse("19700101").in_time_zone
6
+ TimeUnitToTableSuffixHash = (TimeUnits + [false]).inject({}) {|_h, _i| _h[_i] = (_i ? _i[0] : 'a'); _h }
6
7
 
7
8
  DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
8
9
 
9
10
  DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
11
+
12
+ SymbolToClassInDataType = {
13
+ :string => String,
14
+ :datetime => DateTime,
15
+ :time => Time,
16
+ :integer => Integer,
17
+ :float => Float,
18
+ :text => String
19
+ }
20
+
21
+
10
22
  end
@@ -29,10 +29,8 @@ module Statlysis
29
29
  def is_activerecord?; @source_type == :activerecord; end
30
30
  def is_mongoid?; @source_type == :mongoid; end
31
31
  def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
32
+ def _source; cron.multiple_dataset.sources.first end
32
33
 
33
- def _source
34
- cron.multiple_dataset.sources.first
35
- end
36
34
  def source_where_array
37
35
  # TODO follow index seq
38
36
  a = _source.where("").where_values.map do |equality|
@@ -64,8 +62,6 @@ module Statlysis
64
62
  TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
65
63
  end
66
64
 
67
- protected
68
-
69
65
  # 兼容采用整数类型作时间字段
70
66
  def is_time_column_integer?
71
67
  if is_activerecord?
@@ -74,11 +70,13 @@ module Statlysis
74
70
  false
75
71
  end
76
72
  end
73
+ def time_column?; !!@time_column end
74
+ def group_by_columns?; !!@group_by_columns.any? end
77
75
 
78
76
  end
79
77
 
80
78
  end
81
79
 
82
80
 
83
- require 'statlysis/cron/count'
81
+ require 'statlysis/cron/timely'
84
82
  require 'statlysis/cron/top'
@@ -0,0 +1,171 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Timely < Cron
5
+ SqlColumns = [:sum_columns, :group_by_columns, :group_concat_columns]
6
+ attr_reader(*SqlColumns)
7
+
8
+ def initialize source, opts = {}
9
+ super
10
+ Statlysis.check_set_database
11
+ SqlColumns.each {|sym| instance_variable_set "@#{sym}", (opts[sym] || []) }
12
+ cron.setup_stat_model
13
+ cron
14
+ end
15
+
16
+ # Set up the data source and save the results into the database
17
+ def run
18
+ (logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
19
+
20
+ raise "cron.output has no Enumerable" if not cron.output.class.included_modules.include? Enumerable
21
+
22
+ num_i = 0; num_add = 999
23
+ Statlysis.sequel.transaction do
24
+ # delete first in range
25
+ cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete if cron.time_column?
26
+
27
+ # TODO partial delete
28
+ cron.stat_model.where("").delete if cron.group_by_columns?
29
+
30
+ while !(_a = cron.output[num_i..(num_i+num_add)]).blank? do
31
+ # batch insert all
32
+ cron.stat_model.multi_insert _a
33
+ num_i += (num_add + 1)
34
+ end
35
+ end
36
+
37
+ return self
38
+ end
39
+
40
+
41
+ def setup_stat_model
42
+ cron.stat_table_name = Utils.normalise_name cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array, cron.group_by_columns.map {|i| i[:column_name] }, TimeUnitToTableSuffixHash[cron.time_unit]
43
+ raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
44
+
45
+
46
+ # create basic unchangeable table structure
47
+ if not Statlysis.sequel.table_exists?(cron.stat_table_name)
48
+ Statlysis.sequel.transaction do
49
+ Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
50
+ primary_key :id # Add one column at least in this block to avoid `SQLite3::SQLException: near ")": syntax error (Sequel::DatabaseError)`
51
+ end
52
+ Statlysis.sequel.add_column cron.stat_table_name, :t, DateTime if cron.time_column? # alias for :time
53
+
54
+ # add count columns
55
+ if cron.time_column?
56
+ count_columns = [:timely_c, :totally_c] # alias for :count
57
+ count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
58
+ else
59
+ Statlysis.sequel.add_column cron.stat_table_name, :c, Integer # alias for :count
60
+ end
61
+
62
+ end
63
+ end
64
+ # add group_by columns & indexes
65
+ remodel
66
+ cron.stat_model.cron = cron
67
+ if cron.group_by_columns.any?
68
+ cron.group_by_columns.each do |_h|
69
+ if not cron.stat_model.columns.include?(_h[:column_name])
70
+ _h[:type] = SymbolToClassInDataType[_h[:type]] if _h[:type].is_a?(Symbol) # && (Statlysis.sequel.opts[:adapter] == :sqlite)
71
+ Statlysis.sequel.add_column cron.stat_table_name, _h[:column_name], _h[:type]
72
+ end
73
+ end
74
+ end
75
+
76
+ # add sum columns
77
+ remodel
78
+ sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
79
+ _result_cols.each do |_result_col|
80
+ if not cron.stat_model.columns.include?(_result_col)
81
+ # convert to Integer type in view if needed
82
+ Statlysis.sequel.add_column cron.stat_table_name, _result_col, Float
83
+ end
84
+ end
85
+ end
86
+
87
+ # Fix: index names must be unique across tables, otherwise we get
88
+ # `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
89
+ _group_by_columns_index_name = cron.group_by_columns.reject {|i| i[:no_index] }.map {|i| i[:column_name] }
90
+ _truncated_columns = _group_by_columns_index_name.dup # only String column
91
+ _group_by_columns_index_name = _group_by_columns_index_name.unshift :t if cron.time_column?
92
+ # TODO use https://github.com/german/redis_orm to support full string indexes
93
+ if !Statlysis.config.is_skip_database_index && _group_by_columns_index_name.any?
94
+ mysql_per_column_length_limit_in_one_index = (1000 / 3.0 / _group_by_columns_index_name.size.to_f).to_i
95
+ index_columns_str = _group_by_columns_index_name.map {|s| _truncated_columns.include?(s) ? "#{s.to_s}(#{mysql_per_column_length_limit_in_one_index})" : s.to_s }.join(", ")
96
+ index_columns_str = "(#{index_columns_str})"
97
+ begin
98
+ # NOTE mysql indexes key length limit is 1000 bytes
99
+ cron.stat_model.dataset.with_sql("CREATE INDEX #{Utils.sha1_name(_group_by_columns_index_name)} ON #{cron.stat_table_name} #{index_columns_str};").to_a
100
+ rescue => e
101
+ raise e if not e.inspect.match(/exists|duplicate/i)
102
+ end
103
+ end
104
+
105
+ # add group_concat column
106
+ remodel
107
+ if cron.group_concat_columns.any? && !cron.stat_model.columns.include?(:other_json)
108
+ Statlysis.sequel.add_column cron.stat_table_name, :other_json, :text
109
+ end
110
+
111
+ # add access to group_concat values in other_json
112
+ remodel.class_eval do
113
+ define_method("other_json_hash") do
114
+ @__other_json_hash_cache ||= (JSON.parse(self.other_json) rescue {})
115
+ end
116
+ cron.group_concat_columns.each do |_group_concat_column|
117
+ define_method("#{_group_concat_column}_values") do
118
+ self.other_json_hash[_group_concat_column.to_s]
119
+ end
120
+ end
121
+ end
122
+
123
+ remodel
124
+ end
125
+
126
+ def output
127
+ @output ||= (cron.group_by_columns.any? ? multiple_dimensions_output : one_dimension_output)
128
+ end
129
+
130
+ protected
131
+ def unit_range_query time, time_begin = nil
132
+ # time begin and end
133
+ tb = time
134
+ te = (time+1.send(cron.time_unit)-1.second)
135
+ tb, te = tb.to_i, te.to_i if is_time_column_integer?
136
+ tb = time_begin || tb
137
+ return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
138
+ return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
139
+ end
140
+
141
+ # e.g. {:fav_count=>[:timely_favcount_s, :totally_favcount_s]}
142
+ def sum_column_to_result_columns_hash
143
+ cron.sum_columns.inject({}) do |h, _col|
144
+ [:timely, :totally].each do |_pre|
145
+ h[_col] ||= []
146
+ h[_col] << Utils.normalise_name(_pre, _col, 's').to_sym
147
+ end
148
+ h
149
+ end
150
+ end
151
+
152
+ private
153
+ def remodel
154
+ n = cron.stat_table_name.to_s.singularize.camelize
155
+ cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
156
+ class ::#{n} < Sequel::Model;
157
+ self.set_dataset :#{cron.stat_table_name}
158
+
159
+ cattr_accessor :cron
160
+ end
161
+ #{n}
162
+ MODEL
163
+ end
164
+
165
+ end
166
+ end
167
+
168
+
169
+
170
+ require 'statlysis/cron/timely/one_dimension'
171
+ require 'statlysis/cron/timely/multiple_dimensions'
@@ -0,0 +1,52 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Timely
5
+
6
+
7
+ def multiple_dimensions_output
8
+ self.send "multiple_dimensions_output_with#{cron.time_column ? '' : 'out'}_time_column"
9
+ end
10
+
11
+ private
12
+ def multiple_dimensions_output_with_time_column
13
+ cron.time_range.map do |time|
14
+ raise DefaultNotImplementWrongMessage # TODO
15
+ end
16
+ end
17
+
18
+ # TODO encapsulate Mongoid MapReduce in collection output mode
19
+ # TODO support large dataset, e.g. a million.
20
+ def multiple_dimensions_output_without_time_column
21
+ mr = Javascript::MultiDimensionalCount.new(cron)
22
+
23
+ array = []
24
+ cron.multiple_dataset.sources.each do |_source|
25
+ # _source = _source.time_range # TODO
26
+ array += _source.map_reduce(mr.map_func, mr.reduce_func)
27
+ .out(inline: 1) # TODO use replace mode
28
+ .to_a.map do |i|
29
+ v = i['value']
30
+ _h = {:c => v['count']}
31
+
32
+ cron.group_by_columns.each do |_group_by_column|
33
+ _h[_group_by_column[:column_name]] = v[_group_by_column[:column_name].to_s]
34
+ end
35
+
36
+ _h[:other_json] = {}
37
+ cron.group_concat_columns.each do |_group_concat_column|
38
+ _h[:other_json][_group_concat_column] = v["#{_group_concat_column}_values"].inject({}) {|_h2, i2| _h2[i2] ||= 0; _h2[i2] += 1; _h2 }
39
+ end
40
+ _h[:other_json] = _h[:other_json].to_json
41
+
42
+ _h
43
+ end
44
+ end
45
+ array
46
+
47
+ # TODO support sum_columns
48
+ end
49
+
50
+
51
+ end
52
+ end
@@ -0,0 +1,60 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Timely
5
+
6
+
7
+ # one dimension **must** have `time_column`, or there's nothing to do
8
+ #
9
+ # TODO add to FAQ
10
+ # * if you want statistics on one column through the `group_by_columns`
11
+ # param and don't need a time column, then you can use the `always` DSL.
12
+ #
13
+ def one_dimension_output
14
+ cron.time_range.map do |time|
15
+ _hash = {:t => time, :timely_c => 0, :totally_c => 0}
16
+ sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
17
+ _result_cols.each do |_result_col|
18
+ _hash[_result_col] = 0.0
19
+ end
20
+ end
21
+
22
+ # support multiple data sources
23
+ _first_source = nil
24
+ cron.multiple_dataset.sources.each do |s|
25
+ _t = DateTime1970
26
+ _t = is_time_column_integer? ? _t.to_i : _t
27
+
28
+ _scope_one = s.where(unit_range_query(time))
29
+ # TODO cache pre-result
30
+ _scope_all = s.where(unit_range_query(time, _t))
31
+
32
+ # 1. count
33
+ _hash[:timely_c] += _scope_one.count
34
+ _hash[:totally_c] += _scope_all.count
35
+
36
+ # 2. sum
37
+ sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
38
+ _hash[_result_cols[0]] = _scope_one.map(&_sum_col).reduce(:+).to_f
39
+ _hash[_result_cols[1]] = _scope_all.map(&_sum_col).reduce(:+).to_f
40
+ end
41
+
42
+ # 3. group_concat
43
+ _other_json = {}
44
+ _other_json[:group_concat_columns] ||= {}
45
+ cron.group_concat_columns.each do |_group_concat_column|
46
+ _other_json[:group_concat_columns][_group_concat_column] = _scope_one.map(&_group_concat_column).uniq
47
+ end
48
+ _hash[:other_json] = _other_json.to_json
49
+
50
+ _first_source ||= s.where(unit_range_query(time))
51
+ end
52
+ logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{_hash[:timely_c]} totally_c:#{_hash[:totally_c]}" if ENV['DEBUG']
53
+
54
+ _hash
55
+ end.select {|r1| r1.except(:t, :other_json).values.reject {|r2| r2.zero? }.any? }
56
+ end
57
+
58
+
59
+ end
60
+ end
@@ -10,13 +10,14 @@ module Statlysis
10
10
  when Fixnum, Integer # support array idx access
11
11
  self.to_a[pattern]
12
12
  else
13
- CronSet.new(select do |cron_set|
14
- cron_set.multiple_dataset.name.to_s.match Regexp.new(pattern.to_s)
13
+ CronSet.new(self.select do |cron|
14
+ reg = Regexp.new(pattern.to_s)
15
+ cron.stat_table_name.match(reg) || cron.multiple_dataset.name.to_s.match(reg)
15
16
  end)
16
17
  end
17
18
  end
18
19
 
19
- def last; [-1]; end
20
+ def last; self[-1]; end
20
21
 
21
22
  def run
22
23
  map(&:run)
@@ -4,34 +4,69 @@ module Statlysis
4
4
  module Javascript
5
5
  class MultiDimensionalCount
6
6
  attr_reader :map_func, :reduce_func
7
+ attr_reader :cron
7
8
 
8
- def initialize *fields
9
- fields = :_id if fields.blank?
10
- emit_key = case fields
11
- when Array
12
- emit_key = fields.map {|dc| "#{dc}: this.#{dc}" }.join(", ")
13
- emit_key = "{#{emit_key}}"
14
- when Symbol, String
15
- "this.#{fields}"
16
- else
17
- raise "Please assign symbol, string, or array of them"
9
+ def initialize cron
10
+ @cron = cron
11
+
12
+ # setup group_by_columns
13
+ _group_by_columns = :_id if cron.group_by_columns.blank?
14
+ _group_by_columns ||= cron.group_by_columns.map {|i| i[:column_name] }
15
+ emit_key = _group_by_columns.map {|dc| "#{dc}: this.#{dc}" }.join(", ")
16
+ emit_key = "{#{emit_key}}"
17
+
18
+ # TODO setup sum_columns
19
+ # default_emit_values_array += cron.sum_columns.map {|_sum_column| "#{_sum_column}: this.#{_sum_column}" }
20
+
21
+ # setup group_concat_columns
22
+ # NOTE if a key has only one emitted value, that value never passes through the reduce function
23
+ emit_values_init_array = cron.group_concat_columns.map do |_group_concat_column|
24
+ "emit_value.#{_group_concat_column}_values = [this.#{_group_concat_column}];\n"
18
25
  end
26
+ emit_values_init_array += (_group_by_columns.map do |_group_by_column|
27
+ "emit_value.#{_group_by_column} = this.#{_group_by_column};\n"
28
+ end)
19
29
 
20
30
  @map_func = "function() {
21
- emit (#{emit_key}, {count: 1});
31
+ var emit_value = {count: 1};
32
+ #{emit_values_init_array.join}
33
+
34
+ emit (#{emit_key}, emit_value);
22
35
  }"
23
36
 
37
+ # sum_init_values = cron.sum_columns.map {|_sum_column| "#{_sum_column} = 0.0" }
38
+ # sum_init_values = "var #{sum_init_values};" if cron.sum_columns.any?
39
+
40
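+ # Using a Hash here would make the final group_concat count disagree with the group_by count, because parallel tasks overwrite each other's entries (common for categories with many items; categories with a single item are unaffected), whereas a parallelizable array does not have this problem.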
41
+ group_concat_values_init_array = cron.group_concat_columns.map {|_group_concat_column| "reducedObject.#{_group_concat_column}_values = [];" }
42
+ group_concat_values_process_array = cron.group_concat_columns.map do |_group_concat_column|
43
+ "reducedObject.#{_group_concat_column}_values = reducedObject.#{_group_concat_column}_values.concat(v['#{_group_concat_column}_values']);\n"
44
+ end
45
+ group_by_values_process_array = _group_by_columns.map do |_group_by_column|
46
+ "reducedObject.#{_group_by_column} = v.#{_group_by_column};\n"
47
+ end
48
+
49
+ # emit value in map func should be the same structure as the
50
+ # return value in reduce func, see more details in
51
+ # http://rickosborne.org/download/SQL-to-MongoDB.pdf and
52
+ # http://docs.mongodb.org/manual/tutorial/perform-incremental-map-reduce/
24
53
  @reduce_func = "function(key, values) {
25
- var count = 0;
54
+ var reducedObject = key;
55
+ reducedObject.count = 0;
56
+ #{group_concat_values_init_array.join}
26
57
 
27
58
  values.forEach(function(v) {
28
- count += v['count'];
59
+ reducedObject.count += v['count'];
60
+ #{group_by_values_process_array.join}
61
+ #{group_concat_values_process_array.join}
29
62
  });
30
63
 
31
- return {count: count};
64
+ return reducedObject;
32
65
  }"
33
- self
66
+
67
+ return self
34
68
  end
69
+
35
70
  end
36
71
  end
37
72
  end
@@ -1,6 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'javascript/count'
3
+ require 'statlysis/javascript/count'
4
4
 
5
5
  module Statlysis
6
6
  class MapReduce
@@ -35,6 +35,12 @@ module Statlysis
35
35
  {:table => tn, :model => str.constantize}
36
36
  end
37
37
 
38
+ def normalise_name *name
39
+ Array(name).flatten.compact.map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
40
+ end
41
+
42
+ def sha1_name name; Digest::SHA1.hexdigest Array(name).map(&:to_s).join end
43
+
38
44
  end
39
45
  end
40
46
  end
@@ -4,13 +4,13 @@ $:.push File.expand_path("../lib", __FILE__)
4
4
 
5
5
  Gem::Specification.new do |s|
6
6
  s.name = 'statlysis'
7
- s.version = '0.0.2'
8
- s.date = '2013-07-26'
7
+ s.version = '0.0.3'
8
+ s.date = '2013-12-03'
9
9
  s.summary = File.read("README.markdown").split(/===+/)[1].strip.split("\n")[0]
10
10
  s.description = s.summary
11
11
  s.authors = ["David Chen"]
12
12
  s.email = 'mvjome@gmail.com'
13
- s.homepage = 'https://github.com/eoecn/statlysis'
13
+ s.homepage = 'https://github.com/SunshineLibrary/statlysis'
14
14
  s.license = 'MIT'
15
15
 
16
16
  s.files = `git ls-files`.split("\n")
@@ -4,6 +4,5 @@ production: &defaults
4
4
  encoding: utf8
5
5
  collation: utf8_general_ci
6
6
  database: ":memory:"
7
- statlysis:
7
+ development:
8
8
  <<: *defaults
9
- # database: statlysis
@@ -12,6 +12,7 @@ require 'test/unit'
12
12
 
13
13
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
14
14
  $LOAD_PATH.unshift File.dirname(__FILE__) # test dirs
15
+ require 'pry-debugger'
15
16
 
16
17
  # load mongoid setup
17
18
  require 'mongoid'
@@ -22,10 +23,11 @@ require 'statlysis'
22
23
 
23
24
  # load rails
24
25
  def Rails.root; Pathname.new(File.expand_path('../.', __FILE__)) end
26
+ def Rails.env; 'development' end
25
27
  require 'sqlite3'
26
28
 
27
29
  # load ActiveRecord setup
28
- Statlysis.set_database :statlysis
30
+ Statlysis.set_database ":memory:"
29
31
  Statlysis.config.is_skip_database_index = true
30
32
  ActiveRecord::Base.establish_connection(Statlysis.config.database_opts.merge("adapter" => "sqlite3"))
31
33
  Dir[File.expand_path("../migrate/*.rb", __FILE__).to_s].each { |f| require f }
@@ -35,13 +37,20 @@ Dir[File.expand_path("../models/*.rb", __FILE__).to_s].each { |f| require f }
35
37
  # copied from http://stackoverflow.com/questions/4410794/ruby-on-rails-import-data-from-a-csv-file/4410880#4410880
36
38
  require 'csv'
37
39
  csv = CSV.parse(File.read(File.expand_path('../data/code_gists_20130724.csv', __FILE__)), :headers => true) # data from code.eoe.cn
38
- csv.each {|row| CodeGist.create!(row.to_hash) }
40
+ csv.each do |row|
41
+ _h = row.to_hash.merge(:fav_count => rand(5).to_i)
42
+ CodeGist.create! _h
43
+ _h[:category_id] = rand(10).to_i + 1
44
+ CodeGistMongoid.create! _h
45
+ end
39
46
 
40
47
 
41
48
  Statlysis.setup do
42
49
  hourly EoeLog, :time_column => :t
43
50
 
44
- daily CodeGist
51
+ daily CodeGist, :sum_columns => [:fav_count], :group_concat_columns => [:user_id]
52
+ always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}], :group_concat_columns => [:user_id]
53
+ always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}, {:column_name => :category_id, :type => :integer}], :group_concat_columns => [:user_id]
45
54
 
46
55
  [EoeLog,
47
56
  EoeLog.where(:do => 3),
@@ -50,6 +59,8 @@ Statlysis.setup do
50
59
  ].each do |s|
51
60
  daily s, :time_column => :t
52
61
  end
53
- cron = Statlysis.daily['mul'][1]
62
+ cron1 = Statlysis.daily['mul'][1]
63
+ cron2 = Statlysis.daily['cod'][0]
64
+ cron3 = Statlysis.always['code']['mongoid'][0]
54
65
  require 'pry-debugger';binding.pry
55
66
  end
@@ -4,5 +4,6 @@ class CreateActiveRecord < ActiveRecord::Migration
4
4
  t.integer :user_id
5
5
  t.timestamps
6
6
  t.string :author
7
+ t.integer :fav_count
7
8
  end
8
9
  end
@@ -3,3 +3,15 @@
3
3
  class CodeGist < ActiveRecord::Base
4
4
 
5
5
  end
6
+
7
+
8
+ class CodeGistMongoid
9
+ include Mongoid::Document
10
+ include Mongoid::Timestamps
11
+ field :id, :type => Integer
12
+ field :description, :type => String
13
+ field :user_id, :type => Integer
14
+ field :author, :type => String
15
+ field :fav_count, :type => Integer
16
+ field :category_id, :type => Integer
17
+ end
@@ -43,10 +43,8 @@ EoeLog.create
43
43
 
44
44
  collection_class = collection_class_name.constantize
45
45
  t = Time.zone.parse(date_str)
46
- 1.upto(day) do |i|
47
- puts "#{month} #{day_range} #{day} #{i}" if ENV['DEBUG']
48
- collection_class.create :t => (t.to_time+rand(60*60*24-1)).to_datetime, :url => '/'
49
- end
46
+ values = (1..day).map {|i| (t.to_time+rand(60*60*24-1)).to_datetime }.sort.map {|i| {:t => i, :url => '/' } }
47
+ collection_class.create values
50
48
 
51
49
  collection_class.count
52
50
  end
@@ -10,13 +10,15 @@ class TestDailyCount < Test::Unit::TestCase
10
10
  def test_timely
11
11
  o = @output.map {|i| i[:timely_c] }
12
12
  r = (o - [5,11,0,1,8,2,3,4,16,10,26,13,7,9,20,15,30,33,14,6,12,17,19,59,65,84,62,114,69,52,61,67,154,70]).reject(&:zero?).blank?
13
- assert_equal r, true
13
+ assert r
14
14
  end
15
15
 
16
16
  def test_totally
17
17
  o = @output.map {|i| i[:totally_c] }
18
18
  r = (o - [5,16,17,25,27,30,34,36,37,53,55,56,57,59,60,64,66,67,68,70,71,73,74,75,80,90,116,129,136,145,165,185,200,230,234,235,236,237,270,273,274,288,299,304,305,312,327,337,345,359,374,380,392,418,435,446,452,463,466,473,493,506,512,520,525,545,549,553,558,577,636,701,785,805,867,981,1050,1102,1163,1230,1384,1454,1455,1457,1458]).reject(&:zero?).blank?
19
- assert_equal r, true
19
+ assert r
20
+ assert_equal @output[-1][:totally_favcount_s].to_i, CodeGist.all.map(&:fav_count).reduce(:+)
20
21
  end
21
22
 
23
+
22
24
  end
@@ -6,7 +6,14 @@ class TestMapReduce < Test::Unit::TestCase
6
6
  def setup
7
7
  end
8
8
 
9
- def test_hotest_items_mapreduce
9
+ def test_multiple_dimensions_output_without_time_column
10
+ cron = Statlysis.always['mongoid']['code'][0]
11
+ assert_equal cron.time_column, false
12
+ assert_equal cron.time_unit, false
13
+ assert_equal cron.stat_table_name, 'timely_codegistmongoids_author_a'
14
+
15
+ cron.run
16
+ assert_equal cron.output.detect {|h| h[:author] == 'mvj3' }[:c].to_i, cron.multiple_dataset.sources.first.where(:author => 'mvj3').count
10
17
  end
11
18
 
12
19
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: statlysis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-26 00:00:00.000000000 Z
12
+ date: 2013-12-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -269,9 +269,9 @@ files:
269
269
  - lib/statlysis/configuration.rb
270
270
  - lib/statlysis/constants.rb
271
271
  - lib/statlysis/cron.rb
272
- - lib/statlysis/cron/count.rb
273
- - lib/statlysis/cron/count/dimensions.rb
274
- - lib/statlysis/cron/count/timely.rb
272
+ - lib/statlysis/cron/timely.rb
273
+ - lib/statlysis/cron/timely/multiple_dimensions.rb
274
+ - lib/statlysis/cron/timely/one_dimension.rb
275
275
  - lib/statlysis/cron/top.rb
276
276
  - lib/statlysis/cron/top/hotest_items.rb
277
277
  - lib/statlysis/cron/top/lastest_visits.rb
@@ -303,7 +303,7 @@ files:
303
303
  - test/test_single_log_in_multiple_collections.rb
304
304
  - test/test_statlysis.rb
305
305
  - test/test_timeseries.rb
306
- homepage: https://github.com/eoecn/statlysis
306
+ homepage: https://github.com/SunshineLibrary/statlysis
307
307
  licenses:
308
308
  - MIT
309
309
  post_install_message:
@@ -318,7 +318,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
318
318
  version: '0'
319
319
  segments:
320
320
  - 0
321
- hash: 59716176471030881
321
+ hash: -1643509325996557122
322
322
  required_rubygems_version: !ruby/object:Gem::Requirement
323
323
  none: false
324
324
  requirements:
@@ -327,7 +327,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
327
327
  version: '0'
328
328
  segments:
329
329
  - 0
330
- hash: 59716176471030881
330
+ hash: -1643509325996557122
331
331
  requirements: []
332
332
  rubyforge_project:
333
333
  rubygems_version: 1.8.23
@@ -1,51 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- module Statlysis
4
- class Count < Cron
5
- def initialize source, opts = {}
6
- super
7
- Statlysis.check_set_database
8
- cron.setup_stat_model
9
- cron
10
- end
11
-
12
- # Set up the data source and save the results into the database
13
- def run
14
- (logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
15
- # first delete the existing rows that fall within the output's time range
16
- @output = cron.output
17
- unless @output.any?
18
- logger.info "没有数据"; return
19
- end
20
- num_i = 0; num_add = 999
21
- Statlysis.sequel.transaction do
22
- cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
23
- while !(_a = @output[num_i..(num_i+num_add)]).blank? do
24
- # batch-insert the current slice of rows (num_add + 1 rows per slice)
25
- cron.stat_model.multi_insert _a
26
- num_i += (num_add + 1)
27
- end
28
- end
29
-
30
- return self
31
- end
32
-
33
-
34
- protected
35
- def unit_range_query time, time_begin = nil
36
- # beginning and end of the time range
37
- tb = time
38
- te = (time+1.send(cron.time_unit)-1.second)
39
- tb, te = tb.to_i, te.to_i if is_time_column_integer?
40
- tb = time_begin || tb
41
- return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
42
- return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
43
- end
44
-
45
- end
46
-
47
- end
48
-
49
-
50
- require 'statlysis/cron/count/timely'
51
- require 'statlysis/cron/count/dimensions'
@@ -1,7 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- module Statlysis
4
- class Dimensions < Count
5
- end
6
-
7
- end
@@ -1,63 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- module Statlysis
4
- class Timely < Count
5
- def setup_stat_model
6
- cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
7
- raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
8
-
9
- if not Statlysis.sequel.table_exists?(cron.stat_table_name)
10
- Statlysis.sequel.transaction do
11
- Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
12
- DateTime :t # alias for :time
13
- end
14
-
15
- # TODO Add cron.source_where_array before count_columns
16
- count_columns = [:timely_c, :totally_c] # alias for :count
17
- count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
18
- index_column_names = [:t] + count_columns
19
- index_column_names_name = index_column_names.join("_")
20
- index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
21
-
22
- # Index names should be unique across tables; otherwise the database raises an error such as:
23
- # `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
24
- if not Statlysis.config.is_skip_database_index
25
- Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
26
- end
27
- end
28
- end
29
-
30
- n = cron.stat_table_name.to_s.singularize.camelize
31
- cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
32
- class ::#{n} < Sequel::Model;
33
- self.set_dataset :#{cron.stat_table_name}
34
- end
35
- #{n}
36
- MODEL
37
- end
38
-
39
- def output
40
- @output ||= (cron.time_range.map do |time|
41
- timely_c = 0
42
- totally_c = 0
43
- # support multiple data sources
44
- _first_source = nil
45
- cron.multiple_dataset.sources.each do |s|
46
- timely_c += s.where(unit_range_query(time)).count
47
- _t = DateTime1970
48
- _t = is_time_column_integer? ? _t.to_i : _t
49
- totally_c += s.where(unit_range_query(time, _t)).count
50
- _first_source ||= s.where(unit_range_query(time))
51
- end
52
- logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
53
-
54
- if timely_c.zero? && totally_c.zero?
55
- nil
56
- else
57
- {:t => time, :timely_c => timely_c, :totally_c => totally_c}
58
- end
59
- end.compact)
60
- end
61
- end
62
-
63
- end