statlysis 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
# encoding: utf-8

module Statlysis
  # Minimal wrapper around a result set that exposes it as an Array.
  class Results
    # The wrapped result data; anything responding to +to_a+.
    attr_accessor :data

    # The original author's note lists two kinds of data: 1. inline, 2. collection.
    #
    # @param data [#to_a] the raw result set to wrap
    def initialize(data)
      self.data = data
    end

    # @return [Array] the wrapped data converted with +to_a+
    def output
      data.to_a
    end
  end
end
@@ -0,0 +1,89 @@
# encoding: UTF-8
# TODO support mongoid

module Statlysis
  # Ranks documents by TF-IDF similarity and stores, for every document id,
  # the ids of its most similar documents in the stat table.
  #
  # NOTE(review): +cron+, +stat_table_name=+, +stat_model+ and
  # +pattern_table_and_model+ are not defined in this file; presumably they
  # come from +Common+ -- confirm.
  class Similar
    attr_accessor :id_to_text_hash_proc, :corpus, :matrix, :id_to_similar_ids
    include Common

    # Number of documents compared against each other in one pass.
    SLICE_SIZE = 1200
    # Number of similar documents printed per document.
    PREVIEW_COUNT = 10
    # Maximum number of similar ids stored per document.
    MAX_SIMILAR_IDS = 100

    # @param model_name [#to_s] used as the last segment of the stat table name
    # @param id_to_text_hash_proc [#call] returns the id => text mapping to analyse
    def initialize(model_name, id_to_text_hash_proc)
      # Initialize the data source.
      cron.id_to_text_hash_proc = id_to_text_hash_proc

      # Initialize the table and the model.
      cron.stat_table_name = [Statlysis.tablename_default_pre, "similar", model_name].compact.join("_")
      cron.pattern_table_and_model cron.stat_table_name

      cron.id_to_similar_ids = {}
    end

    # Computes the similarity ranking slice by slice and saves each slice's
    # results as soon as they are available.
    #
    # Documents are only compared with documents of the same slice, because
    # the corpus is rebuilt for every slice.
    #
    # @return [true]
    def process
      puts "SimilarProcess #{cron.stat_model} at #{DateTime.now}"
      # Required here rather than at file load, so the libraries are only
      # needed when a similarity run actually happens.
      require 'gsl'
      require 'tf-idf-similarity'

      # Load the documents.
      puts "开始取出 cron.id_to_text_hash_proc"
      @id_to_text_hash = cron.id_to_text_hash_proc.call

      puts "开始把@id_to_text_hash转化为数组"
      as = @id_to_text_hash.to_a

      puts "开始把as slice为1200每次"
      as.each_slice(SLICE_SIZE) do |a|
        puts "开始跑 #{a.size} 个条目的相似性"
        # Persist only this slice's rows; earlier slices were already saved.
        save_results rank_slice(a)
      end

      true
    end
    alias run process

    # @param doc_id the id of a document loaded by the current run
    # @return [String] the first 42 characters of the document text with
    #   line breaks removed
    def summary(doc_id)
      @id_to_text_hash[doc_id].mb_chars[0..41].split("\n").join
    end

    private

    # Builds the corpus and similarity matrix for one slice and ranks every
    # document of the slice against the others.
    #
    # @param pairs [Array<Array>] [id, text] pairs
    # @return [Hash] document id => ids of the most similar documents, most
    #   similar first (also recorded in +id_to_similar_ids+)
    def rank_slice(pairs)
      cron.corpus = TfIdfSimilarity::Collection.new
      pairs.each do |id, text|
        cron.corpus << TfIdfSimilarity::Document.new(text, :id => id)
      end

      cron.matrix = cron.corpus.similarity_matrix
      matrix_array = cron.matrix.to_a

      # Matrix row/column indexes follow the order of the corpus documents,
      # so position N of this array is the real item id for index N.
      documents = cron.corpus.documents
      item_ids = documents.map(&:id)

      slice_results = {}
      documents.each_with_index do |document, row|
        scores = {}
        matrix_array[row].each_with_index do |num, col|
          scores[item_ids[col]] = (num.nan? ? 0.0 : num)
        end
        # A document is not a candidate for its own similar list.
        scores.delete document.id

        puts "对比文档:"
        puts "#{document.id} # #{summary(document.id)}"
        puts "相关文档:"
        # Highest score first.
        ranked = scores.sort { |a1, b1| b1[1] <=> a1[1] }
        ranked.first(PREVIEW_COUNT).each do |item_id, score|
          puts "#{score} # #{summary(item_id)}"
        end

        similar_ids = ranked.first(MAX_SIMILAR_IDS).map(&:first)
        cron.id_to_similar_ids[document.id] = similar_ids
        slice_results[document.id] = similar_ids
        puts
      end
      slice_results
    end

    # Saves the given rankings to the stat table, one row per document id
    # with the similar ids serialized as JSON.
    def save_results(results)
      results.each do |id, similar_ids|
        s = cron.stat_model.find_or_create(:pattern => id)
        s.update :result => similar_ids.to_json
      end
    end
  end
end
@@ -0,0 +1,41 @@
# encoding: UTF-8

module Statlysis
  module TimeSeries
    # Expands a time or a time range into the list of unit boundaries
    # (beginning of hour/day/week/month ...) that it covers.
    #
    # +range+ supports the following formats:
    #   "20121201 20121221"                                    (two dates in one String)
    #   "20121221"                                             (a single date String)
    #   DateTime.parse('20121221')                             (anything responding to to_datetime)
    #   DateTime.parse('20121201')..DateTime.parse('20121221') (a Range)
    #
    # @param range [String, Range, #to_datetime]
    # @param opts [Hash]
    #   :unit   time unit such as :hour, :day, :week, :month (default :day)
    #   :utc    when truthy, every element is converted with +to_time+ (default true)
    #   :offset when set, added to every element (default nil)
    # @return [Array] the sequence of unit boundaries within the range, each
    #   passed through +in_time_zone+
    # @raise [ArgumentError] when +range+ is none of the supported formats
    def self.parse(range, opts = {})
      opts = opts.reverse_merge :unit => :day, :utc => true, :offset => nil
      unit = opts[:unit]

      if range.is_a?(String)
        times = range.split.map { |i| DateTime.parse(i).to_time_in_current_zone }
        # A single date is treated as a single point in time instead of
        # failing inside Range.new.
        range = times.size == 1 ? times.first : Range.new(*times)
      end

      begin_unit = "beginning_of_#{unit}".to_sym
      array = if range.respond_to?(:to_datetime)
        [range.in_time_zone.send(begin_unit)]
      elsif range.is_a?(Range)
        # Boundaries of the first and the last unit; a single element when
        # both ends fall into the same unit.
        ary = [range.first.in_time_zone, range.last.in_time_zone].map(&begin_unit).uniq

        _ary = [ary[0]]
        tmp = ary[0]
        loop do
          tmp += 1.send(unit)
          break if tmp >= ary[-1]
          _ary << tmp
        end
        # ary[1] is nil when both ends share one unit; compact drops it below.
        _ary.push(ary[1])
        # NOTE(review): this predicate can only be true when range.first is
        # later than range.last, so for an ascending range nothing is
        # rejected. Kept as-is; confirm the intended filter before changing.
        _ary.compact.reject { |i| (i < range.first) && (i >= range.last) }
      else
        raise ArgumentError, "unsupported range: #{range.inspect}"
      end

      array = array.map { |s| s.to_time } if opts[:utc]
      array = array.map { |i| i + opts[:offset] } if opts[:offset]
      array.map(&:in_time_zone)
    end

  end
end
@@ -0,0 +1,30 @@
# -*- encoding: utf-8 -*-

$:.push File.expand_path("../lib", __FILE__)

Gem::Specification.new do |s|
  s.name = 'statlysis'
  s.version = '0.0.1'
  s.date = '2013-07-10'
  # The summary is the first line that follows the "===" title underline in
  # the README. The README is resolved next to this gemspec so that the
  # lookup does not depend on the current working directory.
  readme = File.read(File.expand_path("../README.markdown", __FILE__))
  s.summary = readme.split(/===+/)[1].strip.split("\n")[0]
  s.description = s.summary
  s.authors = ["David Chen"]
  s.email = 'mvjome@gmail.com'
  s.homepage = 'https://github.com/eoecn/statlysis'

  s.files = `git ls-files`.split("\n")
  # The tests live directly under test/ (no functional/ or unit/
  # sub-directories), so list everything tracked below test/.
  s.test_files = `git ls-files -- test/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "rake"
  s.add_dependency "rails"
  s.add_dependency "mysql2"
  s.add_dependency "mongoid"
  s.add_dependency "activerecord"
  s.add_dependency "activerecord_idnamecache"
  s.add_dependency "activesupport"
  s.add_dependency "sequel"
  s.add_dependency 'only_one_rake'

end
@@ -0,0 +1,17 @@
require 'rubygems'
require 'bundler'

# Activate the default and development gem groups; when that fails, print the
# reason plus a hint and exit with Bundler's status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end
require 'test/unit'

# Make both the library (../lib) and the test directory itself requirable.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)
require 'statlysis'

# Reopened so shared test helpers can be added here; currently empty.
class Test::Unit::TestCase
end
@@ -0,0 +1,12 @@
# copied from git://github.com/joe1chen/mongoid-mapreduce.git

# Mongoid fixture model: a company that has many employees.
class Company
  include Mongoid::Document

  # Text attributes.
  [:name, :market].each { |attribute| field attribute, :type => String }
  # Numeric attributes.
  field :shares, :type => Integer
  field :quote, :type => Float

  has_many :employees
end
@@ -0,0 +1,14 @@
# copied from git://github.com/joe1chen/mongoid-mapreduce.git

# Mongoid fixture model: an employee that belongs to a company.
class Employee
  include Mongoid::Document

  # Fields declared without an explicit type.
  [:name, :division].each { |attribute| field attribute }
  # Integer counters.
  [:awards, :age].each { |attribute| field attribute, :type => Integer }
  field :rooms, :type => Array
  field :active, :type => Boolean

  belongs_to :company
end
@@ -0,0 +1,26 @@
# encoding: UTF-8

require 'helper'

Mongoid.connect_to "mongoid-mapreduce-test"

# Load the fixture models that live next to this file.
models_glob = "#{File.dirname(__FILE__)}/models/*.rb"
Dir[models_glob].each { |model_file| require model_file }

# Start from empty collections, leaving the system collections alone.
user_collections = Mongoid.default_session.collections.select { |collection| collection.name !~ /system/ }
user_collections.each { |collection| collection.drop }

class TestMapReduce < Test::Unit::TestCase
  include Statlysis

  # Creates three companies and three Apple employees before every test.
  def setup
    # copied from git://github.com/joe1chen/mongoid-mapreduce.git
    @aapl = Company.create :name => 'Apple', :market => 'Technology', :quote => 401.82, :shares => 972_090_000
    @msft = Company.create :name => 'Microsoft', :market => 'Technology', :quote => 25.06, :shares => 8_380_000_000
    @sbux = Company.create :name => 'Starbucks', :market => 'Food', :quote => 38.60, :shares => 746_010_000

    employees = [
      { :name => 'Alan', :division => 'Software', :age => 30, :awards => 5, :rooms => [1,2], :active => true },
      { :name => 'Bob', :division => 'Software', :age => 30, :awards => 4, :rooms => [3,4,5], :active => true },
      { :name => 'Chris', :division => 'Hardware', :age => 30, :awards => 3, :rooms => [1,2,3,4], :active => false }
    ]
    employees.each do |attributes|
      Employee.create attributes.merge(:company => @aapl)
    end
  end

  # Placeholder: no assertions yet.
  def test_hotest_items_mapreduce
  end

end
@@ -0,0 +1,76 @@
# encoding: UTF-8

require 'helper'

# Point Rails.root at the host application: RAILS_ROOT when set, otherwise
# two directories above the current working directory.
def Rails.root
  Pathname.new(ENV['RAILS_ROOT'] || "#{Dir.pwd}/../..")
end
# File.exist? is used because File.exists? was removed in Ruby 3.2.
raise "Please setup RAILS_ROOT shell env first!" unless File.exist?(Rails.root.join("config/database.yml"))

class TestStatlysis < Test::Unit::TestCase
  include Statlysis

  def setup
    super
    @dt = DateTime.parse "20121221 +0800"
    @dt1 = DateTime.parse "20111221 +0800"
    @dt2 = DateTime.parse "20121221 +0800"
    Statlysis.set_database :statlysis
    @old_datetime = DateTime.parse("20130105")
  end

  # A single datetime yields exactly that datetime.
  def test_parse_datetime
    assert_equal [@dt], TimeSeries.parse(@dt), "抽取单个时间没通过"
  end

  # A datetime with an hour component still yields one element.
  def test_parse_special_datetime
    assert_equal 1, TimeSeries.parse(DateTime.parse('2012122110')).length, "抽取单个时间没通过"
  end

  def test_parse_range_in_hour
    # (@dt2 - @dt1).to_i == 366
    assert_equal 24, TimeSeries.parse(@dt1..(@dt1+1.day-1.second), :unit => :hour).length, "抽取小时的时间范围没通过"
  end

  def test_parse_range_in_day
    # (@dt2 - @dt1).to_i == 366
    assert_equal 366, TimeSeries.parse(@dt1..(@dt2-1.second)).length, "抽取天的时间范围没通过"
  end

  def test_parse_range_in_week
    # (@dt2 - @dt1).to_i / 7.0 == 52.285714285714285
    assert_equal 53, TimeSeries.parse(@dt1..(@dt2-1.second), :unit => :week).length, "抽取周的时间范围没通过"
  end

  def test_parse_range_in_201212_week
    w1 = DateTime.parse "20121201 +0800"
    w2 = DateTime.parse "20121231 +0800"
    assert_equal 6, TimeSeries.parse(w1..w2, :unit => :week).length, "2012十二月应该有六周"
  end

  def test_setup_count_stat_table
    # A class body is not allowed inside a method, hence the eval of a
    # constant string.
    eval("class CodeGist < ActiveRecord::Base; end")
    t = Statlysis::Timely.new CodeGist.where(:user_id => 470700), :time_column => :created_at, :time_unit => :day
    t.setup_stat_table
    is_created = Statlysis.sequel.table_exists?(t.stat_table_name)
    # Only drop what exists, so a missing table is reported by the assertion
    # below instead of a database error.
    Statlysis.sequel.drop_table t.stat_table_name if is_created

    assert(is_created, "统计表#{t.stat_table_name}没有成功创建")
  end

  def test_setup_lastest_visits_stat_table
    tn = 'st_blog_lastest_visits_tests'
    lv = Statlysis::LastestVisits.new "FakeLogCollection", :stat_table_name => tn, :test => true, :default_time => @old_datetime
    lv.pattern_table_and_model tn
    is_sequel_model = lv.stat_model.respond_to?(:count)
    # Guarded for the same reason as in test_setup_count_stat_table.
    Statlysis.sequel.drop_table tn if Statlysis.sequel.table_exists?(tn)

    assert(is_sequel_model, "统计表#{lv.stat_table_name}没有成功创建")
  end

  # After updating the clock with @old_datetime, clock.current is expected
  # to differ from @old_datetime.
  def test_clock_set_time
    clock = Statlysis::Clock.new "mvj3", Time.now
    clock.update @old_datetime
    update_old_time = (@old_datetime != clock.current)
    assert(update_old_time, "Can't update old time")
  end

end
@@ -0,0 +1,6 @@
# encoding: UTF-8

require 'helper'

# Placeholder suite for the time series code; it declares no test methods yet.
class TestTimeSeries < Test::Unit::TestCase; end
metadata ADDED
@@ -0,0 +1,216 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: statlysis
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - David Chen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-07-10 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rails
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: mysql2
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: mongoid
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: activerecord
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: activerecord_idnamecache
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: activesupport
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: sequel
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: only_one_rake
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ type: :runtime
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
158
+ description: statistical analysis in ruby dsl
159
+ email: mvjome@gmail.com
160
+ executables: []
161
+ extensions: []
162
+ extra_rdoc_files: []
163
+ files:
164
+ - .document
165
+ - .gitignore
166
+ - Gemfile
167
+ - Gemfile.lock
168
+ - LICENSE.txt
169
+ - README.markdown
170
+ - Rakefile
171
+ - lib/statlysis.rb
172
+ - lib/statlysis/clock.rb
173
+ - lib/statlysis/common.rb
174
+ - lib/statlysis/configuration.rb
175
+ - lib/statlysis/cron.rb
176
+ - lib/statlysis/cron/count.rb
177
+ - lib/statlysis/cron/top.rb
178
+ - lib/statlysis/formula.rb
179
+ - lib/statlysis/javascript/count.rb
180
+ - lib/statlysis/map_reduce.rb
181
+ - lib/statlysis/rake.rb
182
+ - lib/statlysis/results.rb
183
+ - lib/statlysis/similar.rb
184
+ - lib/statlysis/timeseries.rb
185
+ - statlysis.gemspec
186
+ - test/helper.rb
187
+ - test/models/company.rb
188
+ - test/models/employee.rb
189
+ - test/test_mapreduce.rb
190
+ - test/test_statlysis.rb
191
+ - test/test_timeseries.rb
192
+ homepage: https://github.com/eoecn/statlysis
193
+ licenses: []
194
+ post_install_message:
195
+ rdoc_options: []
196
+ require_paths:
197
+ - lib
198
+ required_ruby_version: !ruby/object:Gem::Requirement
199
+ none: false
200
+ requirements:
201
+ - - ! '>='
202
+ - !ruby/object:Gem::Version
203
+ version: '0'
204
+ required_rubygems_version: !ruby/object:Gem::Requirement
205
+ none: false
206
+ requirements:
207
+ - - ! '>='
208
+ - !ruby/object:Gem::Version
209
+ version: '0'
210
+ requirements: []
211
+ rubyforge_project:
212
+ rubygems_version: 1.8.25
213
+ signing_key:
214
+ specification_version: 3
215
+ summary: statistical analysis in ruby dsl
216
+ test_files: []