statlysis 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Guardfile +14 -0
- data/README.markdown +77 -27
- data/Rakefile +1 -1
- data/lib/statlysis.rb +59 -101
- data/lib/statlysis/clock.rb +3 -3
- data/lib/statlysis/common.rb +4 -16
- data/lib/statlysis/configuration.rb +97 -2
- data/lib/statlysis/constants.rb +10 -0
- data/lib/statlysis/cron.rb +40 -42
- data/lib/statlysis/cron/count.rb +16 -58
- data/lib/statlysis/cron/count/dimensions.rb +7 -0
- data/lib/statlysis/cron/count/timely.rb +63 -0
- data/lib/statlysis/cron/top.rb +4 -104
- data/lib/statlysis/cron/top/hotest_items.rb +47 -0
- data/lib/statlysis/cron/top/lastest_visits.rb +53 -0
- data/lib/statlysis/cron_set.rb +26 -0
- data/lib/statlysis/dataset.rb +6 -0
- data/lib/statlysis/javascript/count.rb +3 -3
- data/lib/statlysis/multiple_dataset.rb +69 -0
- data/lib/statlysis/multiple_dataset/active_record.rb +36 -0
- data/lib/statlysis/multiple_dataset/mongoid.rb +54 -0
- data/lib/statlysis/rake.rb +6 -5
- data/lib/statlysis/similar.rb +11 -11
- data/lib/statlysis/timeseries.rb +12 -9
- data/lib/statlysis/utils.rb +40 -0
- data/statlysis.gemspec +13 -3
- data/test/config/database.yml +9 -0
- data/test/config/mongoid.yml +36 -0
- data/test/data/.gitkeep +0 -0
- data/test/data/code_gists_20130724.csv +1459 -0
- data/test/helper.rb +41 -3
- data/test/migrate/1_active_record.rb +8 -0
- data/test/models/.gitkeep +0 -0
- data/test/models/code_gist.rb +5 -0
- data/test/models/eoe_log.rb +53 -0
- data/test/test_daily_count.rb +22 -0
- data/test/test_mapreduce.rb +0 -13
- data/test/test_single_log_in_multiple_collections.rb +22 -0
- data/test/test_statlysis.rb +5 -50
- data/test/test_timeseries.rb +46 -0
- metadata +133 -12
- data/Gemfile.lock +0 -110
- data/LICENSE.txt +0 -20
- data/test/models/company.rb +0 -12
- data/test/models/employee.rb +0 -14
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
  # Maintains, per pattern, a list of the most recent visitors extracted from logs.
  # See tech details at http://mvj3.github.io/2013/01/30/recent-visitors-implement/
  #
  # NOTE(review): `cron`, `logger`, `source`, `logs`, `time_column`, `stat_model`,
  # `stat_table_name`, `result_limit` and the *_proc readers are not defined in this
  # file; presumably they come from Top / its ancestors and `cron` is the cron
  # object itself -- confirm against cron.rb.
  class LastestVisits < Top
    attr_accessor :clock
    attr_accessor :reject_proc

    # *pattern_proc* is a proc to extract user_id or url_prefix to compute the
    #                top visitors from log
    # *user_id_proc* is a proc to extract user_id from log
    # *user_info_proc* is a proc to extract visitor information (like id, name, ...)
    # *reject_proc* filters visitors; it defaults to rejecting a visitor whose id
    #               equals the pattern (both compared via to_i)
    #
    # opts[:default_time] is handed to #reclock as the initial clock time.
    def initialize source, opts = {}
      # set variables
      cron.reclock opts[:default_time]
      cron.reject_proc = opts[:reject_proc] || proc {|pattern, user_id| pattern.to_i == user_id.to_i }
      super
      Utils.setup_pattern_table_and_model cron.stat_table_name
      cron
    end

    # Loads at most 1000 logs whose time column is >= the clock's current time
    # (ascending by time), stores them in cron.logs and groups the visitor ids.
    #
    # Returns a Hash of pattern => Array of non-zero Integer user ids, or {} when
    # no logs were found. Logs whose pattern is nil/false are skipped.
    def output
      cron.logs = cron.source.asc(cron.time_column).where(cron.time_column => {"$gte" => cron.clock.current}).limit(1000).to_a
      return {} if cron.logs.blank?

      cron.logs.each_with_object({}) do |log, visits|
        pattern = cron.pattern_proc.call(log)
        next unless pattern

        visits[pattern] ||= []
        user_id = cron.user_id_proc.call(log).to_i
        visits[pattern] << user_id unless user_id.zero?
      end
    end

    # Merges the freshly collected visitors into the stored result of every
    # pattern, then moves the clock to the time of the last processed log.
    def write
      # Run the query exactly once: calling cron.output separately for the log
      # line and for the iteration would hit the database twice, and the two
      # results may differ when new logs arrive in between.
      visits = cron.output
      logger.info "#{Time.now.strftime('%H:%M:%S')} #{cron.stat_model} #{visits.inspect}"

      visits.each do |pattern, user_ids|
        s = cron.stat_model.find_or_create(:pattern => pattern)
        # Stored items may be scalars or arrays; only the first element is kept
        # (presumably the user id -- depends on what user_info_proc returns).
        old_array = (JSON.parse(s.result) rescue []).map {|i| Array(i)[0] }
        # ensure the right items will overwrite the left: [1,4,5,7,4,3,3,2,1,5] => [7, 4, 3, 2, 1, 5]
        new_user_ids = (old_array + user_ids).reverse.uniq.reverse
        # NOTE(review): [0..result_limit] keeps result_limit + 1 entries -- confirm this is intended.
        s.update :result => new_user_ids.reject {|user_id| cron.reject_proc.call(pattern, user_id) rescue false }.map {|user_id| cron.user_info_proc.call(user_id) }.compact[0..cron.result_limit].to_json
      end

      cron.clock.update cron.logs.last.try(cron.time_column)
    end

    # Replaces the clock with a new one starting at *default_time*, or at the
    # current clock's time when *default_time* is nil.
    # NOTE(review): with no default_time and no clock assigned yet, `cron.clock`
    # is nil here and `.current` would raise -- verify how callers initialize it.
    def reclock default_time = nil
      cron.clock = Clock.new cron.stat_table_name, (default_time || cron.clock.current)
    end
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'statlysis/cron'
|
4
|
+
|
5
|
+
module Statlysis
  # A Set of crons that can be accessed by position or filtered by the name of
  # each cron's multiple_dataset.
  class CronSet < Set
    # filter cron_sets by pattern
    #
    # * an Integer is treated as an array index (negative indexes allowed) and
    #   returns the single element at that position (or nil);
    # * anything else is converted with to_s into a Regexp and a new CronSet of
    #   the elements whose multiple_dataset.name matches is returned. The default
    #   (nil) becomes an empty pattern and therefore matches every element.
    def [] pattern = nil
      case pattern
      when Integer # support array idx access (Integer also covers the legacy Fixnum/Bignum)
        to_a[pattern]
      else
        matcher = Regexp.new(pattern.to_s)
        CronSet.new(select {|cron_set| cron_set.multiple_dataset.name.to_s.match(matcher) })
      end
    end

    # Returns the last element in insertion order (nil when empty).
    def last; self[-1]; end

    # Runs every cron and returns the Array of their results.
    def run
      map(&:run)
    end
  end

end
|
@@ -3,7 +3,7 @@
|
|
3
3
|
module Statlysis
|
4
4
|
module Javascript
|
5
5
|
class MultiDimensionalCount
|
6
|
-
|
6
|
+
attr_reader :map_func, :reduce_func
|
7
7
|
|
8
8
|
def initialize *fields
|
9
9
|
fields = :_id if fields.blank?
|
@@ -17,11 +17,11 @@ module Statlysis
|
|
17
17
|
raise "Please assign symbol, string, or array of them"
|
18
18
|
end
|
19
19
|
|
20
|
-
|
20
|
+
@map_func = "function() {
|
21
21
|
emit (#{emit_key}, {count: 1});
|
22
22
|
}"
|
23
23
|
|
24
|
-
|
24
|
+
@reduce_func = "function(key, values) {
|
25
25
|
var count = 0;
|
26
26
|
|
27
27
|
values.forEach(function(v) {
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
  # Base class that groups several ORM sources (models / collections) sharing a
  # naming pattern so they can be handled as one dataset. ORM specific
  # subclasses must implement #resort_source_order.
  class MultipleDataset
    # *cron* is the cron this dataset belongs to; it may be assigned later (nil).
    def initialize cron = nil
      @cron = cron
      @sources ||= Set.new
      self
    end

    attr_reader :cron, :regexp, :sources

    # Stores the pattern used to pick sources by name.
    #
    # *regexp* must be a Regexp or a String (compiled with Regexp.new).
    # Raises RuntimeError for any other type. Returns self to allow chaining.
    def set_regexp regexp
      case regexp
      when Regexp
        # already usable as is
      when String
        regexp = Regexp.new(regexp)
      else
        raise "regexp #{regexp} should be a Regexp!"
      end
      @regexp = regexp

      return self
    end

    # Adds one source and returns self to allow chaining.
    def add_source s
      @sources.add s

      return self
    end

    # Name of the dataset:
    # * nil (plus a warning in the log) when no source has been added;
    # * the single source's own collection/table name when there is exactly one;
    # * otherwise a name derived from the regexp by stripping digits, '-', brackets,
    #   braces and trailing underscores.
    def name
      if @sources.size.zero?
        Statlysis.logger.warn "Add source to #{self} first!"
        return nil
      elsif @sources.size == 1
        @sources.first.send(Utils.name(@sources.first))
      else
        # /multiple_log_2013[0-9]{4}/ => 'multiple_log'
        regexp.inspect[1..-2].gsub(/\-|\[|\]|\{|\}|[0-9]/, '').sub(/\_+$/, '')
      end
    end
    # Access dataset name, compatible with many ORMs
    alias collection_name name # mongoid
    alias table_name name # activerecord


    # Earliest value of cron.time_column among the first record of every source,
    # falling back to DateTime1970 when there is none, converted to cron.time_zone.
    def first_time
      t = _resort_source_order.map(&:first).compact.map {|i| i.send(cron.time_column) }.compact.min || DateTime1970
      t.in_time_zone(cron.time_zone)
    end
    # lazy load if cron is unassigned: returns nil without sorting in that case
    def _resort_source_order; resort_source_order if cron; end
    # To be implemented by subclasses.
    def resort_source_order; raise DefaultNotImplementWrongMessage; end

    # select ORM models from ::Object namespace
    #
    # Returns the top-level constants that respond to included_modules and
    # include *_module*. Requires String#constantize (ActiveSupport).
    def _select_orm _module
      ::Object.constants.map do |c|
        c.to_s.constantize rescue nil # NameError: uninitialized constant ClassMethods
      end.compact.select do |c|
        # `c.class === Class` is true when c's class is Class or Module (or Object),
        # i.e. it lets classes and modules through and drops most plain values.
        (c.class === Class) &&
          c.respond_to?(:included_modules) &&
          c.included_modules.index(_module)
      end
    end

  end
end
|
67
|
+
|
68
|
+
require 'statlysis/multiple_dataset/mongoid'
|
69
|
+
require 'statlysis/multiple_dataset/active_record'
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'active_record'
|
4
|
+
|
5
|
+
module Statlysis
  # MultipleDataset backed by ActiveRecord models whose table names match the regexp.
  class ActiveRecordDataset < MultipleDataset
    # Arel of a no-op `where("")` relation on the first source; recomputed and
    # stored in @arel on every call.
    def arel
      @arel = @sources.first.where("").arel
    end

    # TODO delegate queries to @sources.
    # Until that is implemented fall back to the default behaviour (NoMethodError).
    # The signature must accept the method name and its arguments, otherwise
    # every unknown call fails with an unrelated ArgumentError.
    def method_missing m, *args, &blk
      super
    end

    # Stores the regexp (see MultipleDataset#set_regexp) and adds every model
    # returned by _select_orm(ActiveRecord::Store) whose table_name matches it.
    # Returns self.
    def set_regexp regexp
      super

      # TODO test it
      _select_orm(ActiveRecord::Store).each do |_model|
        @sources.add _model if _model.table_name.to_s.match(@regexp)
      end

      _resort_source_order

      return self
    end

    # Orders every source ascending by cron.time_column.
    # Note that `map` returns an Array, so @sources is an Array afterwards.
    def resort_source_order; @sources = @sources.map {|s| s.order("#{cron.time_column} ASC") } end

  end

  # NOTE(review): still an empty stub that returns nil, unlike Mongoid.[] in
  # multiple_dataset/mongoid.rb -- confirm whether it should build a dataset.
  def ActiveRecord.[] regexp
  end

end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'mongoid'
|
4
|
+
|
5
|
+
# http://mongoid.org/en/origin/index.html
|
6
|
+
# Origin provides a DSL to mix in to any object to give it the ability to build MongoDB queries easily. It was extracted from Mongoid in an attempt to allow others to leverage the DSL in their own applications without needing a mapper.
|
7
|
+
require 'origin'
|
8
|
+
|
9
|
+
module Statlysis
  # MultipleDataset backed by Mongoid models whose collections match the regexp.
  # Origin query methods called on the dataset are applied to every source.
  class MongoidDataset < MultipleDataset
    # Notice: Origin::Queryable overwrite MongoidDataset#initialize
    class Query; include Origin::Queryable end

    # delegate mongoid query to @sources
    # see document at http://rubydoc.info/github/mongoid/origin/Origin/Queryable & http://rubydoc.info/github/mongoid/origin/Origin/Forwardable
    attr_reader :criteria

    # Forwards the methods listed in Origin::Selectable.forwardables and
    # Origin::Optional.forwardables to the accumulated criteria and to each
    # source; everything else goes to the default implementation.
    # Note that `map` returns an Array, so @sources is an Array afterwards.
    def method_missing m, *args, &blk
      @criteria ||= Query.new
      if _forwardable?(m)
        @criteria = @criteria.__send__(m, *args, &blk)
        @sources = @sources.map {|s| s.__send__(m, *args, &blk) }
        return self # support method chain
      else
        super
      end
    end

    # Keep respond_to? consistent with the methods handled by method_missing.
    def respond_to_missing? m, include_private = false
      _forwardable?(m) || super
    end

    # Stores the regexp (see MultipleDataset#set_regexp), looks up the collections
    # of the default session whose name matches it and adds the Mongoid model of
    # each one as a source. Raises RuntimeError when a matching collection has no
    # model among the top-level constants. Returns self.
    def set_regexp regexp
      super

      _collections = Mongoid.default_session.collections.select {|_collection| _collection.name.match(@regexp) }
      mongoid_models = _select_orm(Mongoid::Document)

      _collections.each do |_collection|
        # NOTE(review): `===` here is plain equality for String/Symbol receivers;
        # confirm both names have the same type.
        _mongoid_model = mongoid_models.detect {|m| m.collection_name === _collection.name }
        raise "Please define Mongoid model for #{_collection}.collection under ::Object namespace!" if _mongoid_model.nil?
        # each model is consumed at most once
        mongoid_models.delete _mongoid_model
        @sources.add _mongoid_model
      end

      _resort_source_order

      return self
    end

    # Orders every source ascending by cron.time_column (@sources becomes an Array).
    def resort_source_order; @sources = @sources.map {|s| s.asc(cron.time_column) } end

    private

    # true when *m* is one of Origin's forwardable query methods
    def _forwardable? m
      (Origin::Selectable.forwardables + Origin::Optional.forwardables).include?(m)
    end

  end

  # Shortcut: Mongoid[/pattern/] builds a MongoidDataset for the matching collections.
  def Mongoid.[] regexp
    MongoidDataset.new.set_regexp(regexp)
  end

end
|
data/lib/statlysis/rake.rb
CHANGED
@@ -1,28 +1,29 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
require 'rake'
|
4
|
+
require 'only_one_rake'
|
4
5
|
|
5
6
|
namespace :statlysis do
|
6
|
-
Statlysis::
|
7
|
+
Statlysis::TimeUnits.each do |unit|
|
7
8
|
desc "statistical in #{unit}"
|
8
9
|
only_one_task "#{unit}_count" => :environment do
|
9
|
-
Statlysis.send("#{unit}_crons").map(&:run)
|
10
|
+
Statlysis.config.send("#{unit}_crons").map(&:run)
|
10
11
|
end
|
11
12
|
end
|
12
13
|
|
13
14
|
desc "realtime process"
|
14
15
|
only_one_task :realtime_process => :environment do
|
15
|
-
loop { Statlysis.realtime_crons.map(&:run); sleep 1 }
|
16
|
+
loop { Statlysis.config.realtime_crons.map(&:run); sleep 1 }
|
16
17
|
end
|
17
18
|
|
18
19
|
desc "similar process"
|
19
20
|
only_one_task :similar_process => :environment do
|
20
|
-
Statlysis.similar_crons.map(&:run)
|
21
|
+
Statlysis.config.similar_crons.map(&:run)
|
21
22
|
end
|
22
23
|
|
23
24
|
desc "hotest process"
|
24
25
|
only_one_task :hotest_process => :environment do
|
25
|
-
Statlysis.hotest_crons.map(&:run)
|
26
|
+
Statlysis.config.hotest_crons.map(&:run)
|
26
27
|
end
|
27
28
|
|
28
29
|
end
|
data/lib/statlysis/similar.rb
CHANGED
@@ -12,27 +12,27 @@ module Statlysis
|
|
12
12
|
|
13
13
|
# 初始化表和模型
|
14
14
|
cron.stat_table_name = [Statlysis.tablename_default_pre, "similar", model_name].compact.join("_")
|
15
|
-
|
15
|
+
Utils.setup_pattern_table_and_model cron.stat_table_name
|
16
16
|
|
17
17
|
cron.id_to_similar_ids = {}
|
18
18
|
cron
|
19
19
|
end
|
20
20
|
|
21
21
|
def process
|
22
|
-
|
22
|
+
logger.info "SimilarProcess #{cron.stat_model} at #{DateTime.now}"
|
23
23
|
require 'gsl'
|
24
24
|
require 'tf-idf-similarity'
|
25
25
|
|
26
26
|
# 初始化文档
|
27
|
-
|
27
|
+
logger.info "开始取出 cron.id_to_text_hash_proc"
|
28
28
|
@id_to_text_hash = cron.id_to_text_hash_proc.call
|
29
29
|
|
30
|
-
|
30
|
+
logger.info "开始把@id_to_text_hash转化为数组"
|
31
31
|
as = @id_to_text_hash.to_a
|
32
32
|
|
33
|
-
|
33
|
+
logger.info "开始把as slice为1200每次"
|
34
34
|
as.each_slice(1200) do |a|
|
35
|
-
|
35
|
+
logger.info "开始跑 #{a.size} 个条目的相似性"
|
36
36
|
cron.corpus = TfIdfSimilarity::Collection.new
|
37
37
|
a.each do |id, text|
|
38
38
|
cron.corpus << TfIdfSimilarity::Document.new(text, :id => id)
|
@@ -59,15 +59,15 @@ module Statlysis
|
|
59
59
|
_item_id_to_score[matrix_idx_to_item_id_hash[idx2]] = (num.nan? ? 0.0 : num)
|
60
60
|
end
|
61
61
|
_item_id_to_score.delete document.id
|
62
|
-
|
63
|
-
|
64
|
-
|
62
|
+
logger.info "对比文档:"
|
63
|
+
logger.info "#{document.id} # #{summary(document.id)}"
|
64
|
+
logger.info "相关文档:"
|
65
65
|
_item_ids = _item_id_to_score.sort {|a1, b1| b1[1] <=> a1[1] }
|
66
66
|
_item_ids[0..9].each do |item_id, score|
|
67
|
-
|
67
|
+
logger.info "#{score} # #{summary(item_id)}"
|
68
68
|
end
|
69
69
|
cron.id_to_similar_ids[document.id] = _item_ids[0..99].map(&:first)
|
70
|
-
|
70
|
+
logger.info
|
71
71
|
end
|
72
72
|
|
73
73
|
# save results to database
|
data/lib/statlysis/timeseries.rb
CHANGED
@@ -4,21 +4,24 @@ module Statlysis
|
|
4
4
|
module TimeSeries
|
5
5
|
# range支持如下三种时间范围格式
|
6
6
|
# 20121201 20121221
|
7
|
-
#
|
8
|
-
#
|
7
|
+
# Time.zone.parse('20121221')
|
8
|
+
# Time.zone.parse('20121201')..Time.zone.parse('20121221')
|
9
9
|
# opts[:unit]支持:hour, :day, :week, :month等时间单位
|
10
10
|
# 返回的结果为时间范围内的序列数组
|
11
11
|
def self.parse range, opts = {}
|
12
|
-
|
12
|
+
# removed :utc => true, no effect.
|
13
|
+
# and so does :offset => nil
|
14
|
+
opts = opts.reverse_merge :unit => :day
|
13
15
|
unit = opts[:unit]
|
16
|
+
zone = opts[:zone] || Statlysis.default_time_zone || Time.zone
|
14
17
|
|
15
|
-
range = Range.new(*range.split.map {|i|
|
18
|
+
range = Range.new(*range.split.map {|i| Time.zone.parse(i).in_time_zone(zone) }) if range.is_a?(String)
|
16
19
|
|
17
20
|
begin_unit = "beginning_of_#{unit}".to_sym
|
18
21
|
array = if range.respond_to?(:to_datetime)
|
19
|
-
[range.in_time_zone.send(begin_unit)]
|
22
|
+
[range.in_time_zone(zone).send(begin_unit)]
|
20
23
|
elsif range.is_a?(Range)
|
21
|
-
ary = [range.first.in_time_zone, range.last.in_time_zone].map(&begin_unit).uniq
|
24
|
+
ary = [range.first.in_time_zone(zone), range.last.in_time_zone(zone)].map(&begin_unit).uniq
|
22
25
|
|
23
26
|
_ary = []
|
24
27
|
_ary.push ary[0]
|
@@ -32,9 +35,9 @@ module Statlysis
|
|
32
35
|
_ary.compact.reject {|i| (i < range.first) && (i >= range.last) }
|
33
36
|
end
|
34
37
|
|
35
|
-
array = array.map {|s| s.to_time } if opts[:utc]
|
36
|
-
array = array.map {|i| i + opts[:offset] } if opts[:offset]
|
37
|
-
array.map(&:
|
38
|
+
# array = array.map {|s| s.to_time } if opts[:utc]
|
39
|
+
# array = array.map {|i| i + opts[:offset] } if opts[:offset]
|
40
|
+
array.map(&:to_datetime)
|
38
41
|
end
|
39
42
|
|
40
43
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Statlysis
  # Helpers to detect the ORM behind a data source and to prepare the
  # pattern/result storage used by the crons.
  module Utils
    class << self
      # true when *data* is an ActiveRecordDataset or lists ActiveRecord::Store
      # among its included modules.
      def is_activerecord?(data)
        return true if data.is_a?(ActiveRecordDataset)

        mixins = data.respond_to?(:included_modules) ? data.included_modules : []
        mixins.include?(ActiveRecord::Store)
      end

      # true when *data* is a MongoidDataset or lists Mongoid::Document among
      # its included modules.
      def is_mongoid?(data)
        return true if data.is_a?(MongoidDataset)

        mixins = data.respond_to?(:included_modules) ? data.included_modules : []
        mixins.include?(Mongoid::Document)
      end

      # Name of the method that returns the storage name of *data*:
      # :collection_name for Mongoid, :table_name for ActiveRecord, nil otherwise.
      def name(data)
        if Utils.is_mongoid?(data)
          :collection_name
        elsif Utils.is_activerecord?(data)
          :table_name
        end
      end

      # Makes sure the (pluralized) table *tn* exists with the columns id,
      # pattern (indexed) and result, then defines a top-level Sequel model for
      # it whose .[] returns the parsed JSON result of a pattern ([] on error).
      #
      # Returns {:table => table name, :model => model class}.
      def setup_pattern_table_and_model tn
        # ensure statlysis table
        table = tn.pluralize
        unless Statlysis.sequel.table_exists?(table)
          Statlysis.sequel.create_table table, DefaultTableOpts.merge(:engine => "InnoDB") do
            primary_key :id
            String :pattern
            index :pattern
          end
          Statlysis.sequel.add_column table, :result, String, :text => true
        end

        # generate a statlysis kv model
        model_name = table.to_s.singularize.camelize
        class_eval <<-MODEL, __FILE__, __LINE__ + 1
          class ::#{model_name} < Sequel::Model;
            self.set_dataset :#{table}
            def self.[] item_id
              JSON.parse(find_or_create(:pattern => item_id).result) rescue []
            end
          end;
        MODEL
        {:table => table, :model => model_name.constantize}
      end

    end
  end
end
|