statsample-glm 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +51 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +2 -9
  5. data/Gemfile +2 -20
  6. data/LICENSE.txt +1 -1
  7. data/README.rdoc +14 -11
  8. data/Rakefile +16 -24
  9. data/lib/statsample-glm.rb +1 -11
  10. data/lib/statsample-glm/{regression.rb → glm.rb} +13 -46
  11. data/lib/statsample-glm/glm/base.rb +99 -0
  12. data/lib/statsample-glm/glm/irls/base.rb +54 -0
  13. data/lib/statsample-glm/glm/irls/logistic.rb +46 -0
  14. data/lib/statsample-glm/glm/irls/poisson.rb +48 -0
  15. data/lib/statsample-glm/glm/logistic.rb +16 -0
  16. data/lib/statsample-glm/glm/mle/base.rb +157 -0
  17. data/lib/statsample-glm/glm/mle/logistic.rb +113 -0
  18. data/lib/statsample-glm/glm/mle/normal.rb +94 -0
  19. data/lib/statsample-glm/glm/mle/probit.rb +100 -0
  20. data/lib/statsample-glm/glm/normal.rb +17 -0
  21. data/lib/statsample-glm/glm/poisson.rb +17 -0
  22. data/lib/statsample-glm/glm/probit.rb +16 -0
  23. data/lib/statsample-glm/version.rb +5 -0
  24. data/spec/data/logistic.csv +51 -0
  25. data/spec/data/logistic_mle.csv +201 -0
  26. data/spec/data/normal.csv +30 -0
  27. data/spec/logistic_spec.rb +37 -0
  28. data/spec/normal_spec.rb +15 -0
  29. data/spec/poisson_spec.rb +32 -0
  30. data/spec/probit_spec.rb +19 -0
  31. data/spec/spec_helper.rb +50 -0
  32. data/statsample-glm.gemspec +35 -0
  33. metadata +71 -145
  34. data/VERSION +0 -1
  35. data/features/bio-statsample-glm.feature +0 -9
  36. data/features/step_definitions/bio-statsample-glm_steps.rb +0 -0
  37. data/features/support/env.rb +0 -15
  38. data/lib/statsample-glm/regression/logistic.rb +0 -108
  39. data/lib/statsample-glm/regression/poisson.rb +0 -90
  40. data/test/helper.rb +0 -87
  41. data/test/test_glm.rb +0 -4
  42. data/test/test_glm_logistic.rb +0 -23
  43. data/test/test_glm_poisson.rb +0 -25
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a514ad693d8ef698d63b3c51c903ead6837bf498
4
+ data.tar.gz: 9a06cc01573d053da123a6f91f50dce703141244
5
+ SHA512:
6
+ metadata.gz: 4330fe635519e054cd8f01e0dddde160cc1f61c21efbc2cedbae8f509c855e4b145982b99b8867fda6d6ecf03f9eea395de062ed0bf675a8309e4c921c26daf8
7
+ data.tar.gz: 979f0af03a502f92b1b09d1e4561ff3d8f77e45a832f1756136591db5ac1adcf807cd513a634f838339ff252214dc8356bf3f4a34088cc0c33fdf5dade192c35
@@ -0,0 +1,51 @@
1
+ # rcov generated
2
+ coverage
3
+ coverage.data
4
+
5
+ # rdoc generated
6
+ rdoc
7
+
8
+ # yard generated
9
+ doc
10
+ .yardoc
11
+
12
+ # bundler
13
+ .bundle
14
+
15
+ # jeweler generated
16
+ pkg
17
+
18
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
19
+ #
20
+ # * Create a file at ~/.gitignore
21
+ # * Include files you want ignored
22
+ # * Run: git config --global core.excludesfile ~/.gitignore
23
+ #
24
+ # After doing this, these files will be ignored in all your git projects,
25
+ # saving you from having to 'pollute' every project you touch with them
26
+ #
27
+ # Not sure what needs to be ignored for particular editors/OSes? Here are some ideas to get you started. (Remember to remove the leading # from the line.)
28
+ #
29
+ # For MacOS:
30
+ #
31
+ #.DS_Store
32
+
33
+ # For TextMate
34
+ #*.tmproj
35
+ #tmtags
36
+
37
+ # For emacs:
38
+ #*~
39
+ #\#*
40
+ #.\#*
41
+
42
+ # For vim:
43
+ #*.swp
44
+
45
+ # For redcar:
46
+ #.redcar
47
+
48
+ # For rubinius:
49
+ #*.rbc
50
+ # Ignore Gemfile.lock for gems. See http://yehudakatz.com/2010/12/16/clarifying-the-roles-of-the-gemspec-and-gemfile/
51
+ Gemfile.lock
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -1,13 +1,6 @@
1
1
  language: ruby
2
+ cache: bundler
2
3
  rvm:
3
- - 1.9.2
4
4
  - 1.9.3
5
5
  - 2.0.0
6
- - jruby-19mode # JRuby in 1.9 mode
7
- - rbx-19mode
8
- # - 1.8.7
9
- # - jruby-18mode # JRuby in 1.8 mode
10
- # - rbx-18mode
11
-
12
- # uncomment this line if your project needs to run something other than `rake`:
13
- # script: bundle exec rspec spec
6
+ - 2.1.1
data/Gemfile CHANGED
@@ -1,20 +1,2 @@
1
- source "http://rubygems.org"
2
-
3
- gem 'statsample', '>=1.2.0'
4
- # Add dependencies required to use your gem here.
5
- # Example:
6
- gem "activesupport", "= 3.2.10"
7
-
8
- # Add dependencies to develop your gem here.
9
- # Include everything needed to run rake, tests, features, etc.
10
- group :development do
11
- gem "shoulda", ">= 0"
12
- gem "rdoc", "~> 3.12"
13
- gem "minitest", "~> 4.7.5"
14
- gem "cucumber", ">= 0"
15
- gem "bundler", "~> 1.3.5"
16
- gem "jeweler", "~> 1.8.4"
17
- gem "bio", ">= 1.4.2"
18
- gem "rdoc", "~> 3.12"
19
- gem "mocha", "~> 0.14.0"
20
- end
1
+ source "https://rubygems.org"
2
+ gemspec
@@ -9,7 +9,7 @@ You *must* read the Contributor Agreement before contributing code to the SciRub
9
9
 
10
10
  -----
11
11
 
12
- Copyright (c) 2010 - 2013, Ruby Science Foundation
12
+ Copyright (c) 2013, Ankur Goel and the Ruby Science Foundation
13
13
  All rights reserved.
14
14
 
15
15
  Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -6,6 +6,8 @@ src="https://secure.travis-ci.org/AnkurGel/statsample-glm.png"
6
6
 
7
7
  Statsample-GLM is an extension of *Generalized Linear Models* to {Statsample}[https://github.com/SciRuby/statsample], a suite of advanced statistics in Ruby.
8
8
 
9
+ Requires ruby 1.9.3 or higher.
10
+
9
11
  * {sciruby.com}[http://sciruby.com]
10
12
  * {Google+}[https://plus.google.com/109304769076178160953/posts]
11
13
  * {Ankur Goel}[http://ankurgoel.com]
@@ -13,14 +15,17 @@ Statsample-GLM is an extension of *Generalized Linear Models* to {Statsample}[ht
13
15
 
14
16
 
15
17
  == Description
16
- Statsample-GLM is an extension of Statsample, and includes many helpful regression techniques for Generalized Linear models such as:
18
+ Statsample-glm includes the following Generalized Linear Models:
17
19
 
18
- * Poisson Regression
19
- * Logistic Regression
20
- * Exponential Regression
21
20
  * Iteratively Reweighted Least Squares
21
+ * Poisson Regression
22
+ * Logistic Regression
23
+ * Maximum Likelihood Models (Newton Raphson)
24
+ * Logistic Regression
25
+ * Probit Regression
26
+ * Normal Regression
22
27
 
23
- Statsample-GLM was created by Ankur Goel as part of Google's Summer of Code 2013. It is the part of {SciRuby}[http://sciruby.com]
28
+ Statsample-GLM was created by Ankur Goel as part of Google's Summer of Code 2013. It is part of {the SciRuby Project}[http://sciruby.com].
24
29
 
25
30
  Note: This is under active development!
26
31
 
@@ -40,7 +45,7 @@ You can also go through the blog-posts on {my blog}[http://ankurgoel.com] for de
40
45
 
41
46
  == Documentation
42
47
 
43
- The API doc is {online}[http://rubygems.org/gems/statsample-glm]. For more code examples see also the test files in the source tree.
48
+ The API doc is {online}[http://rubygems.org/gems/statsample-glm]. For more code examples see also the spec files in the source tree.
44
49
 
45
50
 
46
51
  == Contributing
@@ -49,7 +54,7 @@ The API doc is {online}[http://rubygems.org/gems/statsample-glm]. For more code
49
54
  * Create your feature branch
50
55
  * Add/Modify code.
51
56
  * Write equivalent documentation and **tests**.
52
- * Run `rake test` to verify that all test case passes.
57
+ * Run `rspec` to verify that all test cases pass.
53
58
  * Push your branch.
54
59
  * Pull request. :)
55
60
 
@@ -60,12 +65,10 @@ Information on the source tree, documentation, issues and how to contribute, see
60
65
 
61
66
  http://github.com/ankurgel/statsample-glm
62
67
 
63
-
64
- == Biogems.info
65
-
66
68
  This Biogem is published at http://biogems.info/index.html#statsample-glm
67
69
 
68
70
  == Copyright
69
71
 
70
- Copyright (c) 2013 Ankur Goel. See LICENSE.txt for further details.
72
+ Copyright (c) 2013 Ankur Goel and the Ruby Science Foundation. See LICENSE.txt for further details.
71
73
 
74
+ Statsample is (c) 2009-2013 Claudio Bustos and the Ruby Science Foundation.
data/Rakefile CHANGED
@@ -1,6 +1,7 @@
1
1
  # encoding: utf-8
2
+ require 'rake'
3
+ require 'bundler/gem_tasks'
2
4
 
3
- require 'rubygems'
4
5
  require 'bundler'
5
6
  begin
6
7
  Bundler.setup(:default, :development)
@@ -9,37 +10,28 @@ rescue Bundler::BundlerError => e
9
10
  $stderr.puts "Run `bundle install` to install missing gems"
10
11
  exit e.status_code
11
12
  end
12
- require 'rake'
13
13
 
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "statsample-glm"
18
- gem.homepage = "http://github.com/AnkurGel/statsample-glm"
19
- gem.license = "MIT"
20
- gem.summary = %Q{Generalized Linear Models for Statsample}
21
- gem.description = %Q{Statsample-GLM is an extension to Statsample, an advance statistics suite in Ruby. This gem includes modules for Regression techniques such as Poisson Regression, Logistic Regression and Exponential Regression}
22
- gem.email = "ankurgel@gmail.com"
23
- gem.authors = ["Ankur Goel"]
24
- # dependencies defined in Gemfile
14
+ desc "Open IRB with statsample-timeseries loaded."
15
+ task :console do
16
+ require 'irb'
17
+ require 'irb/completion'
18
+ $:.unshift File.expand_path("../lib", __FILE__)
19
+ require 'statsample-glm'
20
+ ARGV.clear
21
+ IRB.start
25
22
  end
26
- Jeweler::RubygemsDotOrgTasks.new
27
23
 
28
- require 'rake/testtask'
29
- Rake::TestTask.new(:test) do |test|
30
- test.libs << 'lib' << 'test'
31
- test.pattern = 'test/**/test_*.rb'
32
- test.verbose = true
33
- end
34
24
 
35
- require 'cucumber/rake/task'
36
- Cucumber::Rake::Task.new(:features)
25
+ require 'rspec/core/rake_task'
26
+
27
+ RSpec::Core::RakeTask.new(:spec)
37
28
 
38
- task :default => :test
29
+ task :default => :spec
39
30
 
40
31
  require 'rdoc/task'
41
32
  Rake::RDocTask.new do |rdoc|
42
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
33
+ $:.unshift File.expand_path("../lib", __FILE__)
34
+ version = Statsample::GLM::VERSION
43
35
 
44
36
  rdoc.rdoc_dir = 'rdoc'
45
37
  rdoc.title = "statsample-glm #{version}"
@@ -1,12 +1,2 @@
1
- # Please require your code below, respecting the naming conventions in the
2
- # bioruby directory tree.
3
- #
4
- # For example, say you have a plugin named bio-plugin, the only uncommented
5
- # line in this file would be
6
- #
7
- # require 'bio/bio-plugin/plugin'
8
- #
9
- # In this file only require other files. Avoid other source code.
10
-
11
1
  require 'statsample'
12
- require 'statsample-glm/regression'
2
+ require 'statsample-glm/glm'
@@ -1,7 +1,10 @@
1
- require 'statsample-glm/regression/poisson'
2
- require 'statsample-glm/regression/logistic'
1
+ require 'statsample-glm/glm/logistic'
2
+ require 'statsample-glm/glm/probit'
3
+ require 'statsample-glm/glm/poisson'
4
+ require 'statsample-glm/glm/normal'
5
+
3
6
  module Statsample
4
- module Regression
7
+ module GLM
5
8
  include Statsample::VectorShorthands
6
9
 
7
10
  # = Generalized linear models
@@ -15,55 +18,19 @@ module Statsample
15
18
  # require 'statsample-glm'
16
19
  # x1=Statsample::Vector.new([0.537322309644812,-0.717124209978434,-0.519166718891331,0.434970973986765,-0.761822002215759,1.51170030921189,0.883854199811195,-0.908689798854196,1.70331977539793,-0.246971150634099,-1.59077593922623,-0.721548040910253,0.467025703920194,-0.510132788447137,0.430106510266798,-0.144353683251536,-1.54943800728303,0.849307651309298,-0.640304240933579,1.31462478279425,-0.399783455165345,0.0453055645017902,-2.58212161987746,-1.16484414309359,-1.08829266466281,-0.243893919684792,-1.96655661929441,0.301335373291024,-0.665832694463588,-0.0120650855753837,1.5116066367604,0.557300353673344,1.12829931872045,0.234443748015922,-2.03486690662651,0.275544751380246,-0.231465849558696,-0.356880153225012,-0.57746647541923,1.35758352580655,1.23971669378224,-0.662466275100489,0.313263561921793,-1.08783223256362,1.41964722846899,1.29325100940785,0.72153880625103,0.440580131022748,0.0351917814720056, -0.142353224879252],:scale)
17
20
  # x2=Statsample::Vector.new([-0.866655707911859,-0.367820249977585,0.361486610435,0.857332626245179,0.133438466268095,0.716104533073575,1.77206093023382,-0.10136697295802,-0.777086491435508,-0.204573554913706,0.963353531412233,-1.10103024900542,-0.404372761837392,-0.230226345183469,0.0363730246866971,-0.838265540390497,1.12543549657924,-0.57929175648001,-0.747060244805248,0.58946979365152,-0.531952663697324,1.53338594419818,0.521992029051441,1.41631763288724,0.611402316795129,-0.518355638373296,-0.515192557101107,-0.672697937866108,1.84347042325327,-0.21195540664804,-0.269869371631611,0.296155694010096,-2.18097898069634,-1.21314663927206,1.49193669881581,1.38969280369493,-0.400680808117106,-1.87282814976479,1.82394870451051,0.637864732838274,-0.141155946382493,0.0699950644281617,1.32568550595165,-0.412599258349398,0.14436832227506,-1.16507785388489,-2.16782049922428,0.24318371493798,0.258954871320764,-0.151966534521183],:scale)
18
- # y=Statsample::Vector.new([0,0,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,1,0,0,0,1,1],:scale)
19
- # x=Statsample::Dataset.new({"i"=>intercept,"x1"=>x1,"x2"=>x2})
20
- # obj = Statsample::Regression.glm(x, y, :binomial)
21
+ # x=Statsample::Dataset.new({"x1"=>x1,"x2"=>x2})
22
+ # obj = Statsample::GLM.compute(x, y, :logit, {algorithm: :irls})
21
23
  # #=> Logistic Regression object
22
24
  #
23
25
  # == Returns
24
26
  # GLM object for given method.
25
- def self.glm(x, y, method=:gaussian)
26
-
27
- if method.downcase.to_sym == :poisson
28
- obj = Statsample::Regression::GLM::Poisson.new(x,y)
29
- elsif method.downcase.to_sym == :binomial
30
- obj = Statsample::Regression::GLM::Logistic.new(x,y)
31
- else
32
- raise("Not implemented yet")
33
- end
34
- obj.irwls
35
- obj
36
- end
37
-
38
-
39
- def self.irwls(x, y, mu, w, j, h, epsilon = 1e-7, max_iter = 100)
40
- b = Matrix.column_vector(Array.new(x.column_size,0.0))
41
- converged = false
42
- 1.upto(max_iter) do |i|
43
- #conversion from : (solve(j(x,b)) %*% h(x,b,y))
27
+ def self.compute(data_set, dependent_column, method, opts={})
28
+ opts[:method] = method
44
29
 
45
- intermediate = (j.call(x,b).inverse * h.call(x,b,y))
46
- b_new = b - intermediate
47
-
48
- if((b_new - b).map(&:abs)).to_a.flatten.inject(:+) < epsilon
49
- converged = true
50
- b = b_new
51
- break
52
- end
53
- b = b_new
54
- end
55
- ss = j.call(x,b).inverse.diagonal.map{ |x| -x}.map{ |y| Math.sqrt(y) }
56
- values = mu.call(x,b)
57
-
58
- residuals = y - values.column_vectors.map(&:to_a).flatten
59
- df_residuals = y.count - x.column_size
60
- return [create_vector(b.column_vectors[0]), create_vector(ss), create_vector(values.to_a.flatten),
61
- residuals, max_iter, df_residuals, converged]
62
- end
30
+ # TODO: Replace this chained const_get workaround ("jugaad") with a direct constant lookup once Ruby 1.9.3 support is dropped.
63
31
 
64
- private
65
- def self.create_vector(arr)
66
- Statsample::Vector.new(arr, :scale)
32
+ Kernel.const_get("Statsample").const_get("GLM").const_get("#{method.capitalize}").new data_set,
33
+ dependent_column, opts
67
34
  end
68
35
  end
69
36
  end
@@ -0,0 +1,99 @@
1
+ require 'statsample-glm/glm/irls/logistic'
2
+ require 'statsample-glm/glm/irls/poisson'
3
+ require 'statsample-glm/glm/mle/logistic'
4
+ require 'statsample-glm/glm/mle/probit'
5
+ require 'statsample-glm/glm/mle/normal'
6
+
7
+ module Statsample
8
+ module GLM
9
+ class Base
10
+
11
+ def initialize ds, y, opts={}
12
+ @opts = opts
13
+
14
+ set_default_opts_if_any
15
+
16
+ @data_set = ds.dup(ds.fields - [y.to_s])
17
+ @dependent = ds[y.to_s]
18
+
19
+ add_constant_vector if @opts[:constant]
20
+ add_constant_vector(1) if self.is_a? Statsample::GLM::Normal
21
+
22
+ algorithm = @opts[:algorithm].upcase
23
+ method = @opts[:method].capitalize
24
+
25
+ # TODO: Replace this chained const_get workaround ("jugaad") with a direct constant lookup once Ruby 1.9.3 support is dropped.
26
+
27
+ @regression = Kernel.const_get("Statsample").const_get("GLM")
28
+ .const_get("#{algorithm}").const_get("#{method}")
29
+ .new(@data_set, @dependent, @opts)
30
+ end
31
+
32
+ def coefficients as_a=:array
33
+ if as_a == :hash
34
+ c = {}
35
+ @data_set.fields.each_with_index do |f,i|
36
+ c[f.to_sym] = @regression.coefficients[i]
37
+ end
38
+ return c
39
+ end
40
+ create_vector @regression.coefficients
41
+ end
42
+
43
+ def standard_error as_a=:array
44
+ if as_a == :hash
45
+ se = {}
46
+ @data_set.fields.each_with_index do |f,i|
47
+ se[f.to_sym] = @regression.standard_error[i]
48
+ end
49
+ return se
50
+ end
51
+
52
+ create_vector @regression.standard_error
53
+ end
54
+
55
+ def iterations
56
+ @regression.iterations
57
+ end
58
+
59
+ def fitted_mean_values
60
+ @regression.fitted_mean_values
61
+ end
62
+
63
+ def residuals
64
+ @regression.residuals
65
+ end
66
+
67
+ def degree_of_freedom
68
+ @regression.degree_of_freedom
69
+ end
70
+
71
+ def log_likelihood
72
+ @regression.log_likelihood if @opts[:algorithm] == :mle
73
+ end
74
+
75
+ private
76
+
77
+ def set_default_opts_if_any
78
+ @opts[:algorithm] ||= :irls
79
+ @opts[:iterations] ||= 100
80
+ @opts[:epsilon] ||= 1e-7
81
+ @opts[:link] ||= :log
82
+ end
83
+
84
+ def create_vector arr
85
+ Statsample::Vector.new(arr, :scale)
86
+ end
87
+
88
+ def add_constant_vector x=nil
89
+ @data_set.add_vector "constant",
90
+ (([@opts[:constant]]*@data_set.cases).to_vector(:scale))
91
+
92
+ unless x.nil?
93
+ @data_set.add_vector "constant",
94
+ (([1]*@data_set.cases).to_vector(:scale))
95
+ end
96
+ end
97
+ end
98
+ end
99
+ end
@@ -0,0 +1,54 @@
1
+ module Statsample
2
+ module GLM
3
+ module IRLS
4
+ class Base
5
+
6
+ attr_reader :coefficients, :standard_error, :iterations,
7
+ :fitted_mean_values, :residuals, :degree_of_freedom
8
+
9
+ def initialize data_set, dependent, opts={}
10
+ @data_set = data_set.to_matrix
11
+ @dependent = dependent
12
+ @opts = opts
13
+
14
+ irls
15
+ end
16
+
17
+ private
18
+
19
+ def irls
20
+
21
+ max_iter = @opts[:iterations]
22
+ b = Matrix.column_vector Array.new(@data_set.column_size,0.0)
23
+
24
+ 1.upto(max_iter) do
25
+ intermediate = (hessian(@data_set,b).inverse *
26
+ jacobian(@data_set, b, @dependent))
27
+
28
+ b_new = b - intermediate
29
+
30
+ if((b_new - b).map(&:abs)).to_a.flatten.inject(:+) < @opts[:epsilon]
31
+ b = b_new
32
+ break
33
+ end
34
+ b = b_new
35
+ end
36
+
37
+ @coefficients = create_vector(b.column_vectors[0])
38
+ @iterations = max_iter
39
+ @standard_error = hessian(@data_set,b).inverse
40
+ .diagonal
41
+ .map{ |x| -x}
42
+ .map{ |y| Math.sqrt(y) }
43
+ @fitted_mean_values = create_vector measurement(@data_set,b).to_a.flatten
44
+ @residuals = @dependent - @fitted_mean_values
45
+ @degree_of_freedom = @dependent.count - @data_set.column_size
46
+ end
47
+
48
+ def create_vector arr
49
+ Statsample::Vector.new(arr, :scale)
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end