statlysis 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ # encoding: utf-8
2
+
3
module Statlysis
  # Thin wrapper around a result set.
  #
  # Stores whatever object it is given in +data+ and exposes it as a plain
  # Array through #output.
  class Results
    # The wrapped result object; it only has to respond to +to_a+.
    attr_accessor :data

    # NOTE(review): the original comment listed two kinds of payload,
    # "1, inline" and "2, collection" -- presumably the two output shapes
    # of a map/reduce run; confirm against the callers.
    #
    # @param data [#to_a] the result set to wrap
    def initialize(data)
      self.data = data
    end

    # @return [Array] the wrapped data converted with +to_a+
    def output
      data.to_a
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # encoding: UTF-8
2
+ # TODO support mongoid
3
+
4
module Statlysis
  # Computes, for every document returned by a user supplied proc, the ids of
  # its most similar documents (TF-IDF similarity) and stores them in a stat
  # table, one row per document.
  #
  # NOTE(review): +cron+, +stat_table_name+, +stat_model+ and
  # +pattern_table_and_model+ are not defined in this class; presumably they
  # are provided by the included Common module -- confirm there.
  class Similar
    attr_accessor :id_to_text_hash_proc, :corpus, :matrix, :id_to_similar_ids
    include Common

    # Number of documents that are compared against each other in one batch.
    SLICE_SIZE = 1200
    # Number of similar documents printed to stdout for each document.
    PREVIEW_COUNT = 10
    # Maximum number of similar document ids stored for each document.
    MAX_SIMILAR = 100

    # @param model_name [#to_s] used as the last part of the stat table name
    # @param id_to_text_hash_proc [#call] returns a Hash of document id => text
    def initialize(model_name, id_to_text_hash_proc)
      # initialize the data source
      cron.id_to_text_hash_proc = id_to_text_hash_proc

      # initialize the stat table and its model
      cron.stat_table_name = [Statlysis.tablename_default_pre, "similar", model_name].compact.join("_")
      cron.pattern_table_and_model cron.stat_table_name

      cron.id_to_similar_ids = {}
    end

    # Runs the similarity computation and saves the results.
    #
    # Documents are processed in batches of SLICE_SIZE; a fresh corpus is
    # built for every batch, so a document is only compared with the other
    # documents of its own batch.
    #
    # @return [true]
    def process
      puts "SimilarProcess #{cron.stat_model} at #{DateTime.now}"
      # These gems are required here, when the job runs, not at file load.
      require 'gsl'
      require 'tf-idf-similarity'

      # load the documents
      puts "开始取出 cron.id_to_text_hash_proc"
      @id_to_text_hash = cron.id_to_text_hash_proc.call

      puts "开始把@id_to_text_hash转化为数组"
      as = @id_to_text_hash.to_a

      puts "开始把as slice为#{SLICE_SIZE}每次"
      as.each_slice(SLICE_SIZE) do |a|
        puts "开始跑 #{a.size} 个条目的相似性"
        cron.corpus = TfIdfSimilarity::Collection.new
        a.each do |id, text|
          cron.corpus << TfIdfSimilarity::Document.new(text, :id => id)
        end

        cron.matrix = cron.corpus.similarity_matrix
        matrix_array = cron.matrix.to_a

        # position in the matrix (row/column index) => real item id
        item_ids = cron.corpus.documents.map(&:id)

        # for every item take the other items ordered by descending
        # similarity and remember their ids
        slice_results = {}
        cron.corpus.documents.each_with_index do |document, row_idx|
          item_id_to_score = {}
          matrix_array[row_idx].each_with_index do |num, col_idx|
            item_id_to_score[item_ids[col_idx]] = (num.nan? ? 0.0 : num)
          end
          # a document is not a candidate for its own similar list
          item_id_to_score.delete document.id

          puts "对比文档:"
          puts "#{document.id} # #{summary(document.id)}"
          puts "相关文档:"
          ranked = item_id_to_score.sort { |a1, b1| b1[1] <=> a1[1] }
          ranked.first(PREVIEW_COUNT).each do |item_id, score|
            puts "#{score} # #{summary(item_id)}"
          end
          slice_results[document.id] = ranked.first(MAX_SIMILAR).map(&:first)
          puts
        end

        cron.id_to_similar_ids.update slice_results
        # Only the rows of the current batch are written; the previous
        # batches were already saved in their own iteration.
        save_similar_ids slice_results
      end

      true
    end
    alias run process

    # Short single-line preview of a document's text, used in the log output.
    #
    # @param doc_id a key of the hash returned by +id_to_text_hash_proc+
    # @return [String] the first 42 characters of the text with newlines removed
    def summary(doc_id)
      @id_to_text_hash[doc_id].mb_chars[0..41].split("\n").join
    end

    private

    # Writes each id => similar ids pair to the stat model as a JSON array.
    def save_similar_ids(results)
      results.each do |id, similar_ids|
        s = cron.stat_model.find_or_create(:pattern => id)
        s.update :result => similar_ids.to_json
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # encoding: UTF-8
2
+
3
module Statlysis
  module TimeSeries
    # Expands a time, or a range of times, into the list of unit-aligned
    # times it covers.
    #
    # +range+ is accepted in three forms:
    #   "20121201 20121221"                                      two dates in one String
    #   DateTime.parse('20121221')                               a single time
    #   DateTime.parse('20121201')..DateTime.parse('20121221')   a Range of times
    #
    # Options:
    #   opts[:unit]   :hour, :day, :week, :month, ... (default :day)
    #   opts[:utc]    when truthy every element is passed through +to_time+ (default true)
    #   opts[:offset] when set it is added to every element (default nil)
    #
    # Returns the Array of times inside the range, each one at the beginning
    # of its unit.
    #
    # Raises ArgumentError when +range+ is none of the supported forms.
    def self.parse(range, opts = {})
      # Fail early with a clear message; previously an unsupported argument
      # ended in a NoMethodError on nil further down.
      unless range.is_a?(String) || range.is_a?(Range) || range.respond_to?(:to_datetime)
        raise ArgumentError, "unsupported range #{range.inspect}, expected a String, a time or a Range of times"
      end

      opts = opts.reverse_merge :unit => :day, :utc => true, :offset => nil
      unit = opts[:unit]

      range = Range.new(*range.split.map { |i| DateTime.parse(i).to_time_in_current_zone }) if range.is_a?(String)

      begin_unit = "beginning_of_#{unit}".to_sym
      array =
        if range.respond_to?(:to_datetime)
          [range.in_time_zone.send(begin_unit)]
        else
          # Unit-aligned start of both ends; +uniq+ leaves a single element
          # when both ends fall into the same unit.
          ary = [range.first.in_time_zone, range.last.in_time_zone].map(&begin_unit).uniq

          series = [ary[0]]
          tmp = ary[0] + 1.send(unit)
          while tmp < ary[-1]
            series << tmp
            tmp += 1.send(unit)
          end
          # ary[1] is nil when both ends share one unit, hence the compact.
          series.push(ary[1])

          # NOTE(review): with `&&` this only removes elements of a reversed
          # range (last <= i < first). Kept unchanged on purpose: using `||`
          # would drop the first bucket whenever range.first is not aligned
          # to the unit.
          series.compact.reject { |i| (i < range.first) && (i >= range.last) }
        end

      array = array.map { |s| s.to_time } if opts[:utc]
      array = array.map { |i| i + opts[:offset] } if opts[:offset]
      array.map(&:in_time_zone)
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Make lib/ loadable while the specification is evaluated.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)

Gem::Specification.new do |s|
  s.name        = 'statlysis'
  s.version     = '0.0.1'
  s.date        = '2013-07-10'
  # The summary is the first line below the "===" title underline of the README.
  s.summary     = File.read("README.markdown").split(/===+/)[1].strip.split("\n")[0]
  s.description = s.summary
  s.authors     = ["David Chen"]
  s.email       = 'mvjome@gmail.com'
  s.homepage    = 'https://github.com/eoecn/statlysis'

  # File lists are taken from git, so the gem has to be built inside the repository.
  s.files         = `git ls-files`.split("\n")
  # The tests live directly below test/ (there is no test/functional or
  # test/unit directory), so match everything under test/.
  s.test_files    = `git ls-files -- test/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "rake"
  s.add_dependency "rails"
  s.add_dependency "mysql2"
  s.add_dependency "mongoid"
  s.add_dependency "activerecord"
  s.add_dependency "activerecord_idnamecache"
  s.add_dependency "activesupport"
  s.add_dependency "sequel"
  s.add_dependency 'only_one_rake'
end
@@ -0,0 +1,17 @@
1
# Shared bootstrap for the test files: activates the bundled gems, loads
# test/unit and puts lib/ and test/ on the load path before requiring the gem.
require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report the problem and leave with the status code Bundler provides.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'test/unit'

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'statlysis'

# Reopened without additions; a place for helpers shared by all test cases.
class Test::Unit::TestCase
end
@@ -0,0 +1,12 @@
1
+ # copied from git://github.com/joe1chen/mongoid-mapreduce.git
2
+
3
# Mongoid document used as a fixture model by the tests.
class Company
  include Mongoid::Document

  field :name,   :type => String
  field :market, :type => String
  field :shares, :type => Integer
  field :quote,  :type => Float

  has_many :employees
end
@@ -0,0 +1,14 @@
1
+ # copied from git://github.com/joe1chen/mongoid-mapreduce.git
2
+
3
# Mongoid document used as a fixture model by the tests; belongs to a Company.
class Employee
  include Mongoid::Document

  # +name+ and +division+ are declared without an explicit type.
  field :name
  field :division
  field :awards, :type => Integer
  field :age,    :type => Integer
  field :rooms,  :type => Array
  field :active, :type => Boolean

  belongs_to :company
end
@@ -0,0 +1,26 @@
1
+ # encoding: UTF-8
2
+
3
require 'helper'

# Connect to the test database and load the fixture models.
Mongoid.connect_to "mongoid-mapreduce-test"
# Sorted so that the models are always required in the same order.
Dir["#{File.dirname(__FILE__)}/models/*.rb"].sort.each { |f| require f }
# Start from empty collections; collections whose name contains "system" are kept.
Mongoid.default_session.collections.select { |c| c.name !~ /system/ }.each(&:drop)
8
+
9
# Test case for the map/reduce support; at the moment it only builds the
# fixture data, the single test method is still empty.
class TestMapReduce < Test::Unit::TestCase
  include Statlysis

  # Creates three companies and three employees of the first company.
  def setup
    # copied from git://github.com/joe1chen/mongoid-mapreduce.git
    @aapl = Company.create :name => 'Apple', :market => 'Technology', :quote => 401.82, :shares => 972_090_000
    @msft = Company.create :name => 'Microsoft', :market => 'Technology', :quote => 25.06, :shares => 8_380_000_000
    @sbux = Company.create :name => 'Starbucks', :market => 'Food', :quote => 38.60, :shares => 746_010_000
    Employee.create :name => 'Alan', :division => 'Software', :age => 30, :awards => 5, :rooms => [1,2], :active => true, :company => @aapl
    Employee.create :name => 'Bob', :division => 'Software', :age => 30, :awards => 4, :rooms => [3,4,5], :active => true, :company => @aapl
    Employee.create :name => 'Chris', :division => 'Hardware', :age => 30, :awards => 3, :rooms => [1,2,3,4], :active => false, :company => @aapl
  end

  # Placeholder without assertions.
  def test_hotest_items_mapreduce
  end
end
@@ -0,0 +1,76 @@
1
+ # encoding: UTF-8
2
+
3
require 'helper'

# Root of the Rails application the tests run against: taken from the
# RAILS_ROOT environment variable, otherwise two directories above the
# current working directory.
def Rails.root
  Pathname.new(ENV['RAILS_ROOT'] || "#{Dir.pwd}/../..")
end

# The tests need the application's database configuration.
# File.exist? is used because File.exists? is deprecated and no longer
# available in current Ruby versions.
raise "Please setup RAILS_ROOT shell env first!" unless File.exist?(Rails.root.join("config/database.yml"))
7
+
8
# Tests for TimeSeries.parse, the stat table setup and Clock.
class TestStatlysis < Test::Unit::TestCase
  include Statlysis

  # Prepares the reference times and selects the :statlysis database.
  def setup
    super
    @dt = DateTime.parse "20121221 +0800"
    @dt1 = DateTime.parse "20111221 +0800"
    @dt2 = DateTime.parse "20121221 +0800"
    Statlysis.set_database :statlysis
    @old_datetime = DateTime.parse("20130105")
  end

  # A single time yields a one-element series.
  def test_parse_datetime
    assert_equal [@dt], TimeSeries.parse(@dt), "抽取单个时间没通过"
  end

  # A single time that carries an hour still yields exactly one element.
  def test_parse_special_datetime
    assert_equal 1, TimeSeries.parse(DateTime.parse('2012122110')).length, "抽取单个时间没通过"
  end

  # One day minus one second is split into 24 hours.
  def test_parse_range_in_hour
    # (@dt2 - @dt1).to_i == 366
    assert_equal 24, TimeSeries.parse(@dt1..(@dt1+1.day-1.second), :unit => :hour).length, "抽取小时的时间范围没通过"
  end

  # The range between the two reference dates is split into 366 days.
  def test_parse_range_in_day
    # (@dt2 - @dt1).to_i == 366
    assert_equal 366, TimeSeries.parse(@dt1..(@dt2-1.second)).length, "抽取天的时间范围没通过"
  end

  # The same range is split into 53 weeks.
  def test_parse_range_in_week
    # (@dt2 - @dt1).to_i / 7.0 == 52.285714285714285
    assert_equal 53, TimeSeries.parse(@dt1..(@dt2-1.second), :unit => :week).length, "抽取周的时间范围没通过"
  end

  # December 2012 is expected to yield six weeks.
  def test_parse_range_in_201212_week
    w1 = DateTime.parse "20121201 +0800"
    w2 = DateTime.parse "20121231 +0800"
    assert_equal 6, TimeSeries.parse(w1..w2, :unit => :week).length, "2012十二月应该有六周"
  end

  # Timely#setup_stat_table has to create the stat table.
  def test_setup_count_stat_table
    # A class cannot be defined with the `class` keyword inside a method
    # body, so the model is defined through eval of a constant string.
    eval("class CodeGist < ActiveRecord::Base; end")
    t = Statlysis::Timely.new CodeGist.where(:user_id => 470700), :time_column => :created_at, :time_unit => :day
    t.setup_stat_table
    # Remember the result and drop the table before asserting, so the table
    # is removed even when the assertion fails.
    is_created = Statlysis.sequel.table_exists?(t.stat_table_name)
    Statlysis.sequel.drop_table t.stat_table_name

    assert(is_created, "统计表#{t.stat_table_name}没有成功创建")
  end

  # LastestVisits#pattern_table_and_model has to provide a stat model that responds to +count+.
  def test_setup_lastest_visits_stat_table
    tn = 'st_blog_lastest_visits_tests'
    lv = Statlysis::LastestVisits.new "FakeLogCollection", :stat_table_name => tn, :test => true, :default_time => @old_datetime
    lv.pattern_table_and_model tn
    is_sequel_model = lv.stat_model.respond_to?(:count)
    Statlysis.sequel.drop_table tn

    assert(is_sequel_model, "统计表#{lv.stat_table_name}没有成功创建")
  end

  # After Clock#update with the old time, Clock#current must differ from it.
  def test_clock_set_time
    clock = Statlysis::Clock.new "mvj3", Time.now
    clock.update @old_datetime
    update_old_time = (@old_datetime != clock.current)
    assert(update_old_time, "Can't update old time")
  end
end
@@ -0,0 +1,6 @@
1
+ # encoding: UTF-8
2
+
3
require 'helper'

# Placeholder test case for Statlysis::TimeSeries; it has no tests yet.
class TestTimeSeries < Test::Unit::TestCase
end
metadata ADDED
@@ -0,0 +1,216 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: statlysis
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - David Chen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-07-10 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rails
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: mysql2
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: mongoid
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: activerecord
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: activerecord_idnamecache
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: activesupport
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: sequel
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: only_one_rake
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ type: :runtime
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
158
+ description: statistical analysis in ruby dsl
159
+ email: mvjome@gmail.com
160
+ executables: []
161
+ extensions: []
162
+ extra_rdoc_files: []
163
+ files:
164
+ - .document
165
+ - .gitignore
166
+ - Gemfile
167
+ - Gemfile.lock
168
+ - LICENSE.txt
169
+ - README.markdown
170
+ - Rakefile
171
+ - lib/statlysis.rb
172
+ - lib/statlysis/clock.rb
173
+ - lib/statlysis/common.rb
174
+ - lib/statlysis/configuration.rb
175
+ - lib/statlysis/cron.rb
176
+ - lib/statlysis/cron/count.rb
177
+ - lib/statlysis/cron/top.rb
178
+ - lib/statlysis/formula.rb
179
+ - lib/statlysis/javascript/count.rb
180
+ - lib/statlysis/map_reduce.rb
181
+ - lib/statlysis/rake.rb
182
+ - lib/statlysis/results.rb
183
+ - lib/statlysis/similar.rb
184
+ - lib/statlysis/timeseries.rb
185
+ - statlysis.gemspec
186
+ - test/helper.rb
187
+ - test/models/company.rb
188
+ - test/models/employee.rb
189
+ - test/test_mapreduce.rb
190
+ - test/test_statlysis.rb
191
+ - test/test_timeseries.rb
192
+ homepage: https://github.com/eoecn/statlysis
193
+ licenses: []
194
+ post_install_message:
195
+ rdoc_options: []
196
+ require_paths:
197
+ - lib
198
+ required_ruby_version: !ruby/object:Gem::Requirement
199
+ none: false
200
+ requirements:
201
+ - - ! '>='
202
+ - !ruby/object:Gem::Version
203
+ version: '0'
204
+ required_rubygems_version: !ruby/object:Gem::Requirement
205
+ none: false
206
+ requirements:
207
+ - - ! '>='
208
+ - !ruby/object:Gem::Version
209
+ version: '0'
210
+ requirements: []
211
+ rubyforge_project:
212
+ rubygems_version: 1.8.25
213
+ signing_key:
214
+ specification_version: 3
215
+ summary: statistical analysis in ruby dsl
216
+ test_files: []