statlysis 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -10,11 +10,13 @@ Usage
10
10
  Statlysis.setup do
11
11
  set_database :statlysis
12
12
 
13
- hourly :time_column => :t
13
+ daily CodeGist
14
+ hourly EoeLog, :time_column => :t # support custom time_column
15
+
14
16
  [EoeLog,
15
- EoeLog.where(:ui => 0),
17
+ EoeLog.where(:ui => 0), # support query scope
16
18
  EoeLog.where(:ui => {"$ne" => 0}),
17
- Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}),
19
+ Mongoid[/eoe_logs_[0-9]+$/].where(:ui => {"$ne" => 0}), # support collection name regexp
18
20
  EoeLog.where(:do => {"$in" => [DOMAINS_HASH[:blog], DOMAINS_HASH[:my]]}),
19
21
  ].each do |s|
20
22
  daily s, :time_column => :t
@@ -45,8 +47,6 @@ TODO
45
47
  * Admin interface
46
48
  * statistical query api in Ruby and HTTP
47
49
  * Interacting with Javascript charting library, e.g. Highcharts, D3.
48
- * More tests
49
- * Add @criteria to MultipleDataset
50
50
 
51
51
 
52
52
  Statistical Process
@@ -68,6 +68,11 @@ Q: Why do you recommend using multiple collections to store logs rather than a s
68
68
  A: MongoDB can effectively reuse space freed by removing entire collections without leading to data fragmentation, see details at http://docs.mongodb.org/manual/use-cases/storing-log-data/#multiple-collections-single-database
69
69
 
70
70
 
71
+ Q: In MongoDB, why use MapReduce instead of the Aggregation framework?
72
+
73
+ A: The result of an aggregation pipeline is a document and is therefore subject to the BSON document size limit, which is currently 16 megabytes. See more details at http://docs.mongodb.org/manual/core/aggregation-pipeline/#pipeline
74
+
75
+
71
76
  Copyright
72
77
  -----------------------------------------------
73
78
  MIT. David Chen at eoe.cn.
@@ -91,3 +96,11 @@ Related
91
96
 
92
97
  ### Admin interface
93
98
  * http://three.kibana.org/ browser based analytics and search interface to Logstash and other timestamped data sets stored in ElasticSearch.
99
+
100
+
101
+ ### ETL
102
+ * https://github.com/activewarehouse/activewarehouse-etl/
103
+ * http://jisraelsen.github.io/drudgery/ a Ruby ETL DSL; supports CSV, sqlite3 and ActiveRecord, but does not support time ranges
104
+ * https://github.com/square/ETL Simply encapsulates the SQL procedures
105
+
106
+
@@ -20,34 +20,33 @@ require 'activerecord_idnamecache'
20
20
  module Rails; end
21
21
 
22
22
  require 'statlysis/constants'
23
+ require 'statlysis/utils'
24
+ require 'statlysis/configuration'
25
+ require 'statlysis/common'
23
26
 
24
27
  module Statlysis
25
28
  class << self
26
29
  def setup &blk
27
30
  raise "Need to setup proc" if not blk
28
31
 
29
- logger.info "Start to setup Statlysis"
32
+ logger.info "Start to setup Statlysis" if ENV['DEBUG']
30
33
  time_log do
31
34
  self.config.instance_exec(&blk)
32
35
  end
33
- logger.info
34
36
  end
35
37
 
36
38
  def time_log text = nil
37
39
  t = Time.now
38
40
  logger.info text if text
39
41
  yield if block_given?
40
- logger.info "Time spend #{(Time.now - t).round(2)} seconds."
41
- logger.info "-" * 42
42
+ logger.info "Time spend #{(Time.now - t).round(2)} seconds." if ENV['DEBUG']
43
+ logger.info "-" * 42 if ENV['DEBUG']
42
44
  end
43
45
 
44
46
  # delegate config methods to Configuration
45
47
  def config; Configuration.instance end
46
48
  require 'active_support/core_ext/module/delegation.rb'
47
- [:sequel, :set_database, :check_set_database,
48
- :default_time_zone,
49
- :set_tablename_default_pre, :tablename_default_pre
50
- ].each do |sym|
49
+ Configuration::DelegateMethods.each do |sym|
51
50
  delegate sym, :to => :config
52
51
  end
53
52
 
@@ -56,18 +55,17 @@ module Statlysis
56
55
 
57
56
  def source_to_database_type; @_source_to_database_type ||= {} end
58
57
 
59
-
58
+ # proxy access to the crons of each time unit
60
59
  def daily; CronSet.new(Statlysis.config.day_crons) end
61
60
  def hourly; CronSet.new(Statlysis.config.hour_crons) end
61
+ def always; CronSet.new(Statlysis.config.always_crons) end
62
62
 
63
63
  end
64
64
 
65
65
  end
66
66
 
67
- require 'statlysis/utils'
68
- require 'statlysis/configuration'
69
- require 'statlysis/common'
70
67
  require 'statlysis/timeseries'
68
+ require 'statlysis/map_reduce'
71
69
  require 'statlysis/clock'
72
70
  require 'statlysis/rake'
73
71
  require 'statlysis/cron'
@@ -77,7 +75,7 @@ require 'statlysis/multiple_dataset'
77
75
 
78
76
  module Statlysis
79
77
  require 'short_inspect'
80
- ShortInspect.apply_to Cron, CronSet, MultipleDataset
78
+ ShortInspect.apply_to Cron, MultipleDataset
81
79
  ShortInspect.apply_minimal_to ActiveRecord::Relation # lazy load
82
80
  end
83
81
 
@@ -8,6 +8,8 @@ module Statlysis
8
8
  # feature is a string
9
9
  def initialize feature, default_time
10
10
  raise "Please assign default_time params" if not default_time
11
+
12
+ # init table & model
11
13
  cron.stat_table_name = [Statlysis.tablename_default_pre, 'clocks'].compact.join("_")
12
14
  unless Statlysis.sequel.table_exists?(cron.stat_table_name)
13
15
  Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts.merge(:engine => "InnoDB") do
@@ -19,6 +21,8 @@ module Statlysis
19
21
  end
20
22
  h = Utils.setup_pattern_table_and_model cron.stat_table_name
21
23
  cron.stat_model = h[:model]
24
+
25
+ # init default_time
22
26
  cron.clock = cron.stat_model.find_or_create(:feature => feature)
23
27
  cron.clock.update :t => default_time if cron.current.nil?
24
28
  cron
@@ -9,16 +9,21 @@ module Statlysis
9
9
  class Configuration
10
10
  include Singleton
11
11
 
12
+ # variables
12
13
  attr_accessor :sequel, :default_time_columns, :default_time_zone, :database_opts, :tablename_default_pre
13
14
  attr_accessor :is_skip_database_index
14
- TimeUnits.each {|unit| module_eval "attr_accessor :#{unit}_crons; self.instance.#{unit}_crons = []" }
15
- [:realtime, :similar, :hotest].each do |sym|
16
- sym = "#{sym}_crons"
17
- attr_accessor sym; self.instance.send "#{sym}=", []
15
+ (TimeUnits + %W[always] + [:realtime, :similar, :hotest]).each do |unit|
16
+ sym = "#{unit}_crons"; attr_accessor sym; self.instance.send "#{sym}=", []
18
17
  end
19
18
  self.instance.send "tablename_default_pre=", "st"
20
19
  self.instance.send "is_skip_database_index=", false
21
20
 
21
+ DelegateMethods = [
22
+ :sequel, :set_database, :check_set_database,
23
+ :default_time_zone,
24
+ :set_tablename_default_pre, :tablename_default_pre
25
+ ]
26
+
22
27
  # 会在自动拼接统计数据库表名时去除这些时间字段
23
28
  def update_time_columns *columns
24
29
  self.default_time_columns ||= [:created_at, :updated_at]
@@ -26,43 +31,44 @@ module Statlysis
26
31
  self.default_time_columns = self.default_time_columns.uniq
27
32
  end
28
33
 
29
- def set_database sym_or_hash
30
- self.database_opts = if sym_or_hash.is_a? Symbol
31
- YAML.load_file(Rails.root.join("config/database.yml"))[sym_or_hash.to_s]
32
- elsif Hash
33
- sym_or_hash
34
+ def set_database obj
35
+ self.database_opts = case obj
36
+ when Hash
37
+ obj
38
+ when Symbol, String
39
+ YAML.load_file(Rails.root.join("config/database.yml"))[Rails.env].merge('database' => obj.to_s)
40
+ else
41
+ raise "Statlysis#set_database only support symbol or hash params"
42
+ end
43
+
44
+ raise "database_opts should not be blank" if self.database_opts.blank?
45
+
46
+ # SQLite doesn't support creating a database the regular MySQL way
47
+ self.sequel = if (self.database_opts['adapter'].match(/sqlite/) && self.database_opts['database'].match(/\A:memory:\Z/)) # only for test environment
48
+ Sequel.sqlite
34
49
  else
35
- raise "Statlysis#set_database only support symbol or hash params"
50
+ # create database, copied from http://stackoverflow.com/a/14435522/595618
51
+ require 'mysql2'
52
+ mysql2_client = Mysql2::Client.new(self.database_opts.except('database'))
53
+ mysql2_client.query("CREATE DATABASE IF NOT EXISTS #{self.database_opts['database']}")
54
+ Sequel.connect(self.database_opts)
36
55
  end
37
- self.sequel = Sequel.connect(self.database_opts)
38
56
 
39
57
  # 初始化键值model
40
58
  ["#{self.tablename_default_pre}_single_kvs", "#{self.tablename_default_pre}_single_kv_histories"].each do |tn|
41
59
  Utils.setup_pattern_table_and_model tn
42
60
  end
43
- return self
44
- end
45
61
 
46
- def set_default_time_zone zone
47
- self.default_time_zone = zone
48
62
  return self
49
63
  end
50
64
 
51
- def set_tablename_default_pre str
52
- self.tablename_default_pre = str.to_s
53
- end
54
-
55
- def daily source, opts = {}; timely source, {:time_unit => :day }.merge(opts) end
56
- def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
57
-
65
+ def set_default_time_zone zone; self.default_time_zone = zone; return self; end
66
+ def set_tablename_default_pre str; self.tablename_default_pre = str.to_s; return self end
58
67
  def check_set_database; raise "Please setup database first" if sequel.nil? end
59
68
 
60
- def timely source, opts
61
- self.check_set_database
62
- opts.reverse_merge! :time_column => :created_at, :time_unit => :day
63
- t = Timely.new source, opts
64
- self.send("#{opts[:time_unit]}_crons").push t
65
- end
69
+ def daily source, opts = {}; timely source, {:time_unit => :day}.merge(opts) end
70
+ def hourly source, opts = {}; timely source, {:time_unit => :hour}.merge(opts) end
71
+ def always source, opts = {}; timely source, {:time_unit => false, :time_column => false}.merge(opts) end # IMPORTANT set :time_unit to false
66
72
 
67
73
  # the real requirement is to compute the latest items grouped by specific patterns, like user_id, url prefix, ...
68
74
  def lastest_visits source, opts
@@ -101,5 +107,27 @@ module Statlysis
101
107
  self.similar_crons.push Similar.new(model_name, _p)
102
108
  end
103
109
 
110
+
111
+ private
112
+ def timely source, opts
113
+ self.check_set_database
114
+
115
+ opts.reverse_merge! :time_column => :created_at,
116
+ :time_unit => :day,
117
+ :sum_columns => [],
118
+ :group_by_columns => [],
119
+ :group_concat_columns => []
120
+
121
+ opts.each {|k, v| opts[k] = v.map(&:to_sym) if (Timely::SqlColumns - [:group_by_columns]).include?(k) } # Sequel use symbol as column names
122
+
123
+ # e.g. convert [:user_id] to [{:column_name => :user_id, :type => :integer}]
124
+ if (opts[:group_by_columns].first || {})[:type].blank?
125
+ opts[:group_by_columns] = opts[:group_by_columns].map {|i| {:column_name => i.to_sym, :type => :integer} }
126
+ end
127
+
128
+ t = Timely.new source, opts
129
+ self.send("#{opts[:time_unit] || 'always'}_crons").push t
130
+ end
131
+
104
132
  end
105
133
  end
@@ -3,8 +3,20 @@
3
3
  module Statlysis
4
4
  TimeUnits = %w[hour day week month year]
5
5
  DateTime1970 = Time.zone.parse("19700101").in_time_zone
6
+ TimeUnitToTableSuffixHash = (TimeUnits + [false]).inject({}) {|_h, _i| _h[_i] = (_i ? _i[0] : 'a'); _h }
6
7
 
7
8
  DefaultTableOpts = {:charset => "utf8", :collate => "utf8_general_ci", :engine => "MyISAM"}
8
9
 
9
10
  DefaultNotImplementWrongMessage = "Not implement yet, please config it by subclass".freeze
11
+
12
+ SymbolToClassInDataType = {
13
+ :string => String,
14
+ :datetime => DateTime,
15
+ :time => Time,
16
+ :integer => Integer,
17
+ :float => Float,
18
+ :text => String
19
+ }
20
+
21
+
10
22
  end
@@ -29,10 +29,8 @@ module Statlysis
29
29
  def is_activerecord?; @source_type == :activerecord; end
30
30
  def is_mongoid?; @source_type == :mongoid; end
31
31
  def is_orm?; [:activerecord, :mongoid].include?(@source_type); end
32
+ def _source; cron.multiple_dataset.sources.first end
32
33
 
33
- def _source
34
- cron.multiple_dataset.sources.first
35
- end
36
34
  def source_where_array
37
35
  # TODO follow index seq
38
36
  a = _source.where("").where_values.map do |equality|
@@ -64,8 +62,6 @@ module Statlysis
64
62
  TimeSeries.parse(timebegin..timeend, :unit => cron.time_unit)
65
63
  end
66
64
 
67
- protected
68
-
69
65
  # 兼容采用整数类型作时间字段
70
66
  def is_time_column_integer?
71
67
  if is_activerecord?
@@ -74,11 +70,13 @@ module Statlysis
74
70
  false
75
71
  end
76
72
  end
73
+ def time_column?; !!@time_column end
74
+ def group_by_columns?; !!@group_by_columns.any? end
77
75
 
78
76
  end
79
77
 
80
78
  end
81
79
 
82
80
 
83
- require 'statlysis/cron/count'
81
+ require 'statlysis/cron/timely'
84
82
  require 'statlysis/cron/top'
@@ -0,0 +1,171 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Timely < Cron
5
+ SqlColumns = [:sum_columns, :group_by_columns, :group_concat_columns]
6
+ attr_reader(*SqlColumns)
7
+
8
+ def initialize source, opts = {}
9
+ super
10
+ Statlysis.check_set_database
11
+ SqlColumns.each {|sym| instance_variable_set "@#{sym}", (opts[sym] || []) }
12
+ cron.setup_stat_model
13
+ cron
14
+ end
15
+
16
+ # 设置数据源,并保存结果入数据库
17
+ def run
18
+ (logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
19
+
20
+ raise "cron.output has no Enumerable" if not cron.output.class.included_modules.include? Enumerable
21
+
22
+ num_i = 0; num_add = 999
23
+ Statlysis.sequel.transaction do
24
+ # delete first in range
25
+ cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete if cron.time_column?
26
+
27
+ # TODO partial delete
28
+ cron.stat_model.where("").delete if cron.group_by_columns?
29
+
30
+ while !(_a = cron.output[num_i..(num_i+num_add)]).blank? do
31
+ # batch insert all
32
+ cron.stat_model.multi_insert _a
33
+ num_i += (num_add + 1)
34
+ end
35
+ end
36
+
37
+ return self
38
+ end
39
+
40
+
41
+ def setup_stat_model
42
+ cron.stat_table_name = Utils.normalise_name cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array, cron.group_by_columns.map {|i| i[:column_name] }, TimeUnitToTableSuffixHash[cron.time_unit]
43
+ raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
44
+
45
+
46
+ # create basic unchangeable table structure
47
+ if not Statlysis.sequel.table_exists?(cron.stat_table_name)
48
+ Statlysis.sequel.transaction do
49
+ Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
50
+ primary_key :id # Add one column at least in this block to avoid `SQLite3::SQLException: near ")": syntax error (Sequel::DatabaseError)`
51
+ end
52
+ Statlysis.sequel.add_column cron.stat_table_name, :t, DateTime if cron.time_column? # alias for :time
53
+
54
+ # add count columns
55
+ if cron.time_column?
56
+ count_columns = [:timely_c, :totally_c] # alias for :count
57
+ count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
58
+ else
59
+ Statlysis.sequel.add_column cron.stat_table_name, :c, Integer # alias for :count
60
+ end
61
+
62
+ end
63
+ end
64
+ # add group_by columns & indexes
65
+ remodel
66
+ cron.stat_model.cron = cron
67
+ if cron.group_by_columns.any?
68
+ cron.group_by_columns.each do |_h|
69
+ if not cron.stat_model.columns.include?(_h[:column_name])
70
+ _h[:type] = SymbolToClassInDataType[_h[:type]] if _h[:type].is_a?(Symbol) # && (Statlysis.sequel.opts[:adapter] == :sqlite)
71
+ Statlysis.sequel.add_column cron.stat_table_name, _h[:column_name], _h[:type]
72
+ end
73
+ end
74
+ end
75
+
76
+ # add sum columns
77
+ remodel
78
+ sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
79
+ _result_cols.each do |_result_col|
80
+ if not cron.stat_model.columns.include?(_result_col)
81
+ # convert to Integer type in view if needed
82
+ Statlysis.sequel.add_column cron.stat_table_name, _result_col, Float
83
+ end
84
+ end
85
+ end
86
+
87
+ # Fix: index names must be unique across tables, otherwise:
88
+ # `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
89
+ _group_by_columns_index_name = cron.group_by_columns.reject {|i| i[:no_index] }.map {|i| i[:column_name] }
90
+ _truncated_columns = _group_by_columns_index_name.dup # only String column
91
+ _group_by_columns_index_name = _group_by_columns_index_name.unshift :t if cron.time_column?
92
+ # TODO use https://github.com/german/redis_orm to support full string indexes
93
+ if !Statlysis.config.is_skip_database_index && _group_by_columns_index_name.any?
94
+ mysql_per_column_length_limit_in_one_index = (1000 / 3.0 / _group_by_columns_index_name.size.to_f).to_i
95
+ index_columns_str = _group_by_columns_index_name.map {|s| _truncated_columns.include?(s) ? "#{s.to_s}(#{mysql_per_column_length_limit_in_one_index})" : s.to_s }.join(", ")
96
+ index_columns_str = "(#{index_columns_str})"
97
+ begin
98
+ # NOTE mysql indexes key length limit is 1000 bytes
99
+ cron.stat_model.dataset.with_sql("CREATE INDEX #{Utils.sha1_name(_group_by_columns_index_name)} ON #{cron.stat_table_name} #{index_columns_str};").to_a
100
+ rescue => e
101
+ raise e if not e.inspect.match(/exists|duplicate/i)
102
+ end
103
+ end
104
+
105
+ # add group_concat column
106
+ remodel
107
+ if cron.group_concat_columns.any? && !cron.stat_model.columns.include?(:other_json)
108
+ Statlysis.sequel.add_column cron.stat_table_name, :other_json, :text
109
+ end
110
+
111
+ # add access to group_concat values in other_json
112
+ remodel.class_eval do
113
+ define_method("other_json_hash") do
114
+ @__other_json_hash_cache ||= (JSON.parse(self.other_json) rescue {})
115
+ end
116
+ cron.group_concat_columns.each do |_group_concat_column|
117
+ define_method("#{_group_concat_column}_values") do
118
+ self.other_json_hash[_group_concat_column.to_s]
119
+ end
120
+ end
121
+ end
122
+
123
+ remodel
124
+ end
125
+
126
+ def output
127
+ @output ||= (cron.group_by_columns.any? ? multiple_dimensions_output : one_dimension_output)
128
+ end
129
+
130
+ protected
131
+ def unit_range_query time, time_begin = nil
132
+ # time begin and end
133
+ tb = time
134
+ te = (time+1.send(cron.time_unit)-1.second)
135
+ tb, te = tb.to_i, te.to_i if is_time_column_integer?
136
+ tb = time_begin || tb
137
+ return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
138
+ return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
139
+ end
140
+
141
+ # e.g. {:fav_count=>[:timely_favcount_s, :totally_favcount_s]}
142
+ def sum_column_to_result_columns_hash
143
+ cron.sum_columns.inject({}) do |h, _col|
144
+ [:timely, :totally].each do |_pre|
145
+ h[_col] ||= []
146
+ h[_col] << Utils.normalise_name(_pre, _col, 's').to_sym
147
+ end
148
+ h
149
+ end
150
+ end
151
+
152
+ private
153
+ def remodel
154
+ n = cron.stat_table_name.to_s.singularize.camelize
155
+ cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
156
+ class ::#{n} < Sequel::Model;
157
+ self.set_dataset :#{cron.stat_table_name}
158
+
159
+ cattr_accessor :cron
160
+ end
161
+ #{n}
162
+ MODEL
163
+ end
164
+
165
+ end
166
+ end
167
+
168
+
169
+
170
+ require 'statlysis/cron/timely/one_dimension'
171
+ require 'statlysis/cron/timely/multiple_dimensions'
@@ -0,0 +1,52 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Timely
5
+
6
+
7
+ def multiple_dimensions_output
8
+ self.send "multiple_dimensions_output_with#{cron.time_column ? '' : 'out'}_time_column"
9
+ end
10
+
11
+ private
12
+ def multiple_dimensions_output_with_time_column
13
+ cron.time_range.map do |time|
14
+ raise DefaultNotImplementWrongMessage # TODO
15
+ end
16
+ end
17
+
18
+ # TODO encapsulate Mongoid MapReduce in collection output mode
19
+ # TODO support large dataset, e.g. a million.
20
+ def multiple_dimensions_output_without_time_column
21
+ mr = Javascript::MultiDimensionalCount.new(cron)
22
+
23
+ array = []
24
+ cron.multiple_dataset.sources.each do |_source|
25
+ # _source = _source.time_range # TODO
26
+ array += _source.map_reduce(mr.map_func, mr.reduce_func)
27
+ .out(inline: 1) # TODO use replace mode
28
+ .to_a.map do |i|
29
+ v = i['value']
30
+ _h = {:c => v['count']}
31
+
32
+ cron.group_by_columns.each do |_group_by_column|
33
+ _h[_group_by_column[:column_name]] = v[_group_by_column[:column_name].to_s]
34
+ end
35
+
36
+ _h[:other_json] = {}
37
+ cron.group_concat_columns.each do |_group_concat_column|
38
+ _h[:other_json][_group_concat_column] = v["#{_group_concat_column}_values"].inject({}) {|_h2, i2| _h2[i2] ||= 0; _h2[i2] += 1; _h2 }
39
+ end
40
+ _h[:other_json] = _h[:other_json].to_json
41
+
42
+ _h
43
+ end
44
+ end
45
+ array
46
+
47
+ # TODO support sum_columns
48
+ end
49
+
50
+
51
+ end
52
+ end
@@ -0,0 +1,60 @@
1
+ # encoding: UTF-8
2
+
3
+ module Statlysis
4
+ class Timely
5
+
6
+
7
+ # one dimension **must** have `time_column`, or there's nothing to do
8
+ #
9
+ # TODO add to FAQ
10
+ # * if you want statistics for a single column via the `group_by_columns`
11
+ # param and don't need a time column, you can use the `always` DSL.
12
+ #
13
+ def one_dimension_output
14
+ cron.time_range.map do |time|
15
+ _hash = {:t => time, :timely_c => 0, :totally_c => 0}
16
+ sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
17
+ _result_cols.each do |_result_col|
18
+ _hash[_result_col] = 0.0
19
+ end
20
+ end
21
+
22
+ # support multiple data sources
23
+ _first_source = nil
24
+ cron.multiple_dataset.sources.each do |s|
25
+ _t = DateTime1970
26
+ _t = is_time_column_integer? ? _t.to_i : _t
27
+
28
+ _scope_one = s.where(unit_range_query(time))
29
+ # TODO cache pre-result
30
+ _scope_all = s.where(unit_range_query(time, _t))
31
+
32
+ # 1. count
33
+ _hash[:timely_c] += _scope_one.count
34
+ _hash[:totally_c] += _scope_all.count
35
+
36
+ # 2. sum
37
+ sum_column_to_result_columns_hash.each do |_sum_col, _result_cols|
38
+ _hash[_result_cols[0]] = _scope_one.map(&_sum_col).reduce(:+).to_f
39
+ _hash[_result_cols[1]] = _scope_all.map(&_sum_col).reduce(:+).to_f
40
+ end
41
+
42
+ # 3. group_concat
43
+ _other_json = {}
44
+ _other_json[:group_concat_columns] ||= {}
45
+ cron.group_concat_columns.each do |_group_concat_column|
46
+ _other_json[:group_concat_columns][_group_concat_column] = _scope_one.map(&_group_concat_column).uniq
47
+ end
48
+ _hash[:other_json] = _other_json.to_json
49
+
50
+ _first_source ||= s.where(unit_range_query(time))
51
+ end
52
+ logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{_hash[:timely_c]} totally_c:#{_hash[:totally_c]}" if ENV['DEBUG']
53
+
54
+ _hash
55
+ end.select {|r1| r1.except(:t, :other_json).values.reject {|r2| r2.zero? }.any? }
56
+ end
57
+
58
+
59
+ end
60
+ end
@@ -10,13 +10,14 @@ module Statlysis
10
10
  when Fixnum, Integer # support array idx access
11
11
  self.to_a[pattern]
12
12
  else
13
- CronSet.new(select do |cron_set|
14
- cron_set.multiple_dataset.name.to_s.match Regexp.new(pattern.to_s)
13
+ CronSet.new(self.select do |cron|
14
+ reg = Regexp.new(pattern.to_s)
15
+ cron.stat_table_name.match(reg) || cron.multiple_dataset.name.to_s.match(reg)
15
16
  end)
16
17
  end
17
18
  end
18
19
 
19
- def last; [-1]; end
20
+ def last; self[-1]; end
20
21
 
21
22
  def run
22
23
  map(&:run)
@@ -4,34 +4,69 @@ module Statlysis
4
4
  module Javascript
5
5
  class MultiDimensionalCount
6
6
  attr_reader :map_func, :reduce_func
7
+ attr_reader :cron
7
8
 
8
- def initialize *fields
9
- fields = :_id if fields.blank?
10
- emit_key = case fields
11
- when Array
12
- emit_key = fields.map {|dc| "#{dc}: this.#{dc}" }.join(", ")
13
- emit_key = "{#{emit_key}}"
14
- when Symbol, String
15
- "this.#{fields}"
16
- else
17
- raise "Please assign symbol, string, or array of them"
9
+ def initialize cron
10
+ @cron = cron
11
+
12
+ # setup group_by_columns
13
+ _group_by_columns = :_id if cron.group_by_columns.blank?
14
+ _group_by_columns ||= cron.group_by_columns.map {|i| i[:column_name] }
15
+ emit_key = _group_by_columns.map {|dc| "#{dc}: this.#{dc}" }.join(", ")
16
+ emit_key = "{#{emit_key}}"
17
+
18
+ # TODO setup sum_columns
19
+ # default_emit_values_array += cron.sum_columns.map {|_sum_column| "#{_sum_column}: this.#{_sum_column}" }
20
+
21
+ # setup group_concat_columns
22
+ # NOTE: if a key has only one emitted value, that value never passes through the reduce function
23
+ emit_values_init_array = cron.group_concat_columns.map do |_group_concat_column|
24
+ "emit_value.#{_group_concat_column}_values = [this.#{_group_concat_column}];\n"
18
25
  end
26
+ emit_values_init_array += (_group_by_columns.map do |_group_by_column|
27
+ "emit_value.#{_group_by_column} = this.#{_group_by_column};\n"
28
+ end)
19
29
 
20
30
  @map_func = "function() {
21
- emit (#{emit_key}, {count: 1});
31
+ var emit_value = {count: 1};
32
+ #{emit_values_init_array.join}
33
+
34
+ emit (#{emit_key}, emit_value);
22
35
  }"
23
36
 
37
+ # sum_init_values = cron.sum_columns.map {|_sum_column| "#{_sum_column} = 0.0" }
38
+ # sum_init_values = "var #{sum_init_values};" if cron.sum_columns.any?
39
+
40
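+ # If a Hash were used, the final group_concat count would not match the group_by count, because parallel tasks would overwrite each other (common for categories with many items; a category with a single item does not have this problem), whereas arrays, which can be merged in parallel, do not.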
41
+ group_concat_values_init_array = cron.group_concat_columns.map {|_group_concat_column| "reducedObject.#{_group_concat_column}_values = [];" }
42
+ group_concat_values_process_array = cron.group_concat_columns.map do |_group_concat_column|
43
+ "reducedObject.#{_group_concat_column}_values = reducedObject.#{_group_concat_column}_values.concat(v['#{_group_concat_column}_values']);\n"
44
+ end
45
+ group_by_values_process_array = _group_by_columns.map do |_group_by_column|
46
+ "reducedObject.#{_group_by_column} = v.#{_group_by_column};\n"
47
+ end
48
+
49
+ # emit value in map func should be the same structure as the
50
+ # return value in reduce func, see more details in
51
+ # http://rickosborne.org/download/SQL-to-MongoDB.pdf and
52
+ # http://docs.mongodb.org/manual/tutorial/perform-incremental-map-reduce/
24
53
  @reduce_func = "function(key, values) {
25
- var count = 0;
54
+ var reducedObject = key;
55
+ reducedObject.count = 0;
56
+ #{group_concat_values_init_array.join}
26
57
 
27
58
  values.forEach(function(v) {
28
- count += v['count'];
59
+ reducedObject.count += v['count'];
60
+ #{group_by_values_process_array.join}
61
+ #{group_concat_values_process_array.join}
29
62
  });
30
63
 
31
- return {count: count};
64
+ return reducedObject;
32
65
  }"
33
- self
66
+
67
+ return self
34
68
  end
69
+
35
70
  end
36
71
  end
37
72
  end
@@ -1,6 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'javascript/count'
3
+ require 'statlysis/javascript/count'
4
4
 
5
5
  module Statlysis
6
6
  class MapReduce
@@ -35,6 +35,12 @@ module Statlysis
35
35
  {:table => tn, :model => str.constantize}
36
36
  end
37
37
 
38
+ def normalise_name *name
39
+ Array(name).flatten.compact.map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
40
+ end
41
+
42
+ def sha1_name name; Digest::SHA1.hexdigest Array(name).map(&:to_s).join end
43
+
38
44
  end
39
45
  end
40
46
  end
@@ -4,13 +4,13 @@ $:.push File.expand_path("../lib", __FILE__)
4
4
 
5
5
  Gem::Specification.new do |s|
6
6
  s.name = 'statlysis'
7
- s.version = '0.0.2'
8
- s.date = '2013-07-26'
7
+ s.version = '0.0.3'
8
+ s.date = '2013-12-03'
9
9
  s.summary = File.read("README.markdown").split(/===+/)[1].strip.split("\n")[0]
10
10
  s.description = s.summary
11
11
  s.authors = ["David Chen"]
12
12
  s.email = 'mvjome@gmail.com'
13
- s.homepage = 'https://github.com/eoecn/statlysis'
13
+ s.homepage = 'https://github.com/SunshineLibrary/statlysis'
14
14
  s.license = 'MIT'
15
15
 
16
16
  s.files = `git ls-files`.split("\n")
@@ -4,6 +4,5 @@ production: &defaults
4
4
  encoding: utf8
5
5
  collation: utf8_general_ci
6
6
  database: ":memory:"
7
- statlysis:
7
+ development:
8
8
  <<: *defaults
9
- # database: statlysis
@@ -12,6 +12,7 @@ require 'test/unit'
12
12
 
13
13
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
14
14
  $LOAD_PATH.unshift File.dirname(__FILE__) # test dirs
15
+ require 'pry-debugger'
15
16
 
16
17
  # load mongoid setup
17
18
  require 'mongoid'
@@ -22,10 +23,11 @@ require 'statlysis'
22
23
 
23
24
  # load rails
24
25
  def Rails.root; Pathname.new(File.expand_path('../.', __FILE__)) end
26
+ def Rails.env; 'development' end
25
27
  require 'sqlite3'
26
28
 
27
29
  # load ActiveRecord setup
28
- Statlysis.set_database :statlysis
30
+ Statlysis.set_database ":memory:"
29
31
  Statlysis.config.is_skip_database_index = true
30
32
  ActiveRecord::Base.establish_connection(Statlysis.config.database_opts.merge("adapter" => "sqlite3"))
31
33
  Dir[File.expand_path("../migrate/*.rb", __FILE__).to_s].each { |f| require f }
@@ -35,13 +37,20 @@ Dir[File.expand_path("../models/*.rb", __FILE__).to_s].each { |f| require f }
35
37
  # copied from http://stackoverflow.com/questions/4410794/ruby-on-rails-import-data-from-a-csv-file/4410880#4410880
36
38
  require 'csv'
37
39
  csv = CSV.parse(File.read(File.expand_path('../data/code_gists_20130724.csv', __FILE__)), :headers => true) # data from code.eoe.cn
38
- csv.each {|row| CodeGist.create!(row.to_hash) }
40
+ csv.each do |row|
41
+ _h = row.to_hash.merge(:fav_count => rand(5).to_i)
42
+ CodeGist.create! _h
43
+ _h[:category_id] = rand(10).to_i + 1
44
+ CodeGistMongoid.create! _h
45
+ end
39
46
 
40
47
 
41
48
  Statlysis.setup do
42
49
  hourly EoeLog, :time_column => :t
43
50
 
44
- daily CodeGist
51
+ daily CodeGist, :sum_columns => [:fav_count], :group_concat_columns => [:user_id]
52
+ always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}], :group_concat_columns => [:user_id]
53
+ always CodeGistMongoid, :group_by_columns => [{:column_name => :author, :type => :string}, {:column_name => :category_id, :type => :integer}], :group_concat_columns => [:user_id]
45
54
 
46
55
  [EoeLog,
47
56
  EoeLog.where(:do => 3),
@@ -50,6 +59,8 @@ Statlysis.setup do
50
59
  ].each do |s|
51
60
  daily s, :time_column => :t
52
61
  end
53
- cron = Statlysis.daily['mul'][1]
62
+ cron1 = Statlysis.daily['mul'][1]
63
+ cron2 = Statlysis.daily['cod'][0]
64
+ cron3 = Statlysis.always['code']['mongoid'][0]
54
65
  require 'pry-debugger';binding.pry
55
66
  end
@@ -4,5 +4,6 @@ class CreateActiveRecord < ActiveRecord::Migration
4
4
  t.integer :user_id
5
5
  t.timestamps
6
6
  t.string :author
7
+ t.integer :fav_count
7
8
  end
8
9
  end
@@ -3,3 +3,15 @@
3
3
  class CodeGist < ActiveRecord::Base
4
4
 
5
5
  end
6
+
7
+
8
+ class CodeGistMongoid
9
+ include Mongoid::Document
10
+ include Mongoid::Timestamps
11
+ field :id, :type => Integer
12
+ field :description, :type => String
13
+ field :user_id, :type => Integer
14
+ field :author, :type => String
15
+ field :fav_count, :type => Integer
16
+ field :category_id, :type => Integer
17
+ end
@@ -43,10 +43,8 @@ EoeLog.create
43
43
 
44
44
  collection_class = collection_class_name.constantize
45
45
  t = Time.zone.parse(date_str)
46
- 1.upto(day) do |i|
47
- puts "#{month} #{day_range} #{day} #{i}" if ENV['DEBUG']
48
- collection_class.create :t => (t.to_time+rand(60*60*24-1)).to_datetime, :url => '/'
49
- end
46
+ values = (1..day).map {|i| (t.to_time+rand(60*60*24-1)).to_datetime }.sort.map {|i| {:t => i, :url => '/' } }
47
+ collection_class.create values
50
48
 
51
49
  collection_class.count
52
50
  end
@@ -10,13 +10,15 @@ class TestDailyCount < Test::Unit::TestCase
10
10
  def test_timely
11
11
  o = @output.map {|i| i[:timely_c] }
12
12
  r = (o - [5,11,0,1,8,2,3,4,16,10,26,13,7,9,20,15,30,33,14,6,12,17,19,59,65,84,62,114,69,52,61,67,154,70]).reject(&:zero?).blank?
13
- assert_equal r, true
13
+ assert r
14
14
  end
15
15
 
16
16
  def test_totally
17
17
  o = @output.map {|i| i[:totally_c] }
18
18
  r = (o - [5,16,17,25,27,30,34,36,37,53,55,56,57,59,60,64,66,67,68,70,71,73,74,75,80,90,116,129,136,145,165,185,200,230,234,235,236,237,270,273,274,288,299,304,305,312,327,337,345,359,374,380,392,418,435,446,452,463,466,473,493,506,512,520,525,545,549,553,558,577,636,701,785,805,867,981,1050,1102,1163,1230,1384,1454,1455,1457,1458]).reject(&:zero?).blank?
19
- assert_equal r, true
19
+ assert r
20
+ assert_equal @output[-1][:totally_favcount_s].to_i, CodeGist.all.map(&:fav_count).reduce(:+)
20
21
  end
21
22
 
23
+
22
24
  end
@@ -6,7 +6,14 @@ class TestMapReduce < Test::Unit::TestCase
6
6
  def setup
7
7
  end
8
8
 
9
- def test_hotest_items_mapreduce
9
+ def test_multiple_dimensions_output_without_time_column
10
+ cron = Statlysis.always['mongoid']['code'][0]
11
+ assert_equal cron.time_column, false
12
+ assert_equal cron.time_unit, false
13
+ assert_equal cron.stat_table_name, 'timely_codegistmongoids_author_a'
14
+
15
+ cron.run
16
+ assert_equal cron.output.detect {|h| h[:author] == 'mvj3' }[:c].to_i, cron.multiple_dataset.sources.first.where(:author => 'mvj3').count
10
17
  end
11
18
 
12
19
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: statlysis
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-26 00:00:00.000000000 Z
12
+ date: 2013-12-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -269,9 +269,9 @@ files:
269
269
  - lib/statlysis/configuration.rb
270
270
  - lib/statlysis/constants.rb
271
271
  - lib/statlysis/cron.rb
272
- - lib/statlysis/cron/count.rb
273
- - lib/statlysis/cron/count/dimensions.rb
274
- - lib/statlysis/cron/count/timely.rb
272
+ - lib/statlysis/cron/timely.rb
273
+ - lib/statlysis/cron/timely/multiple_dimensions.rb
274
+ - lib/statlysis/cron/timely/one_dimension.rb
275
275
  - lib/statlysis/cron/top.rb
276
276
  - lib/statlysis/cron/top/hotest_items.rb
277
277
  - lib/statlysis/cron/top/lastest_visits.rb
@@ -303,7 +303,7 @@ files:
303
303
  - test/test_single_log_in_multiple_collections.rb
304
304
  - test/test_statlysis.rb
305
305
  - test/test_timeseries.rb
306
- homepage: https://github.com/eoecn/statlysis
306
+ homepage: https://github.com/SunshineLibrary/statlysis
307
307
  licenses:
308
308
  - MIT
309
309
  post_install_message:
@@ -318,7 +318,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
318
318
  version: '0'
319
319
  segments:
320
320
  - 0
321
- hash: 59716176471030881
321
+ hash: -1643509325996557122
322
322
  required_rubygems_version: !ruby/object:Gem::Requirement
323
323
  none: false
324
324
  requirements:
@@ -327,7 +327,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
327
327
  version: '0'
328
328
  segments:
329
329
  - 0
330
- hash: 59716176471030881
330
+ hash: -1643509325996557122
331
331
  requirements: []
332
332
  rubyforge_project:
333
333
  rubygems_version: 1.8.23
@@ -1,51 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- module Statlysis
4
- class Count < Cron
5
- def initialize source, opts = {}
6
- super
7
- Statlysis.check_set_database
8
- cron.setup_stat_model
9
- cron
10
- end
11
-
12
- # 设置数据源,并保存结果入数据库
13
- def run
14
- (logger.info("#{cron.multiple_dataset.name} have no result!"); return false) if cron.output.blank?
15
- # delete first in range
16
- @output = cron.output
17
- unless @output.any?
18
- logger.info "没有数据"; return
19
- end
20
- num_i = 0; num_add = 999
21
- Statlysis.sequel.transaction do
22
- cron.stat_model.where("t >= ? AND t <= ?", cron.output[0][:t], cron.output[-1][:t]).delete
23
- while !(_a = @output[num_i..(num_i+num_add)]).blank? do
24
- # batch insert all
25
- cron.stat_model.multi_insert _a
26
- num_i += (num_add + 1)
27
- end
28
- end
29
-
30
- return self
31
- end
32
-
33
-
34
- protected
35
- def unit_range_query time, time_begin = nil
36
- # time begin and end
37
- tb = time
38
- te = (time+1.send(cron.time_unit)-1.second)
39
- tb, te = tb.to_i, te.to_i if is_time_column_integer?
40
- tb = time_begin || tb
41
- return ["#{cron.time_column} >= ? AND #{cron.time_column} < ?", tb, te] if is_activerecord?
42
- return {cron.time_column => {"$gte" => tb.utc, "$lt" => te.utc}} if is_mongoid? # .utc [fix undefined method `__bson_dump__' for Sun, 16 Dec 2012 16:00:00 +0000:DateTime]
43
- end
44
-
45
- end
46
-
47
- end
48
-
49
-
50
- require 'statlysis/cron/count/timely'
51
- require 'statlysis/cron/count/dimensions'
@@ -1,7 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- module Statlysis
4
- class Dimensions < Count
5
- end
6
-
7
- end
@@ -1,63 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- module Statlysis
4
- class Timely < Count
5
- def setup_stat_model
6
- cron.stat_table_name = [cron.class.name.split("::")[-1], cron.multiple_dataset.name, cron.source_where_array.join, cron.time_unit[0]].map {|s| s.to_s.gsub('_','') }.reject {|s| s.blank? }.join('_').downcase
7
- raise "mysql only support table_name in 64 characters, the size of '#{cron.stat_table_name}' is #{cron.stat_table_name.to_s.size}. please set cron.stat_table_name when you create a Cron instance" if cron.stat_table_name.to_s.size > 64
8
-
9
- if not Statlysis.sequel.table_exists?(cron.stat_table_name)
10
- Statlysis.sequel.transaction do
11
- Statlysis.sequel.create_table cron.stat_table_name, DefaultTableOpts do
12
- DateTime :t # alias for :time
13
- end
14
-
15
- # TODO Add cron.source_where_array before count_columns
16
- count_columns = [:timely_c, :totally_c] # alias for :count
17
- count_columns.each {|w| Statlysis.sequel.add_column cron.stat_table_name, w, Integer }
18
- index_column_names = [:t] + count_columns
19
- index_column_names_name = index_column_names.join("_")
20
- index_column_names_name = index_column_names_name[-63..-1] if index_column_names_name.size > 64
21
-
22
- # Fix there should be uniq index name between tables
23
- # `SQLite3::SQLException: index t_timely_c_totally_c already exists (Sequel::DatabaseError)`
24
- if not Statlysis.config.is_skip_database_index
25
- Statlysis.sequel.add_index cron.stat_table_name, index_column_names, :name => index_column_names_name
26
- end
27
- end
28
- end
29
-
30
- n = cron.stat_table_name.to_s.singularize.camelize
31
- cron.stat_model = class_eval <<-MODEL, __FILE__, __LINE__+1
32
- class ::#{n} < Sequel::Model;
33
- self.set_dataset :#{cron.stat_table_name}
34
- end
35
- #{n}
36
- MODEL
37
- end
38
-
39
- def output
40
- @output ||= (cron.time_range.map do |time|
41
- timely_c = 0
42
- totally_c = 0
43
- # support multiple data sources
44
- _first_source = nil
45
- cron.multiple_dataset.sources.each do |s|
46
- timely_c += s.where(unit_range_query(time)).count
47
- _t = DateTime1970
48
- _t = is_time_column_integer? ? _t.to_i : _t
49
- totally_c += s.where(unit_range_query(time, _t)).count
50
- _first_source ||= s.where(unit_range_query(time))
51
- end
52
- logger.info "#{time.in_time_zone(cron.time_zone)} multiple_dataset:#{cron.multiple_dataset.name} _first_source:#{_first_source.inspect} timely_c:#{timely_c} totally_c:#{totally_c}" if ENV['DEBUG']
53
-
54
- if timely_c.zero? && totally_c.zero?
55
- nil
56
- else
57
- {:t => time, :timely_c => timely_c, :totally_c => totally_c}
58
- end
59
- end.compact)
60
- end
61
- end
62
-
63
- end