statsample 0.5.1 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
@@ -263,7 +263,7 @@ module Statsample
263
263
  s_size=@strata_sizes[s_name]
264
264
  (s_size**2 * (1-(nh/s_size)) * prop * (1-prop) / (nh -1 ))
265
265
  }
266
- (1/@population_size.to_f) * Math::sqrt(sum)
266
+ (1.quo(@population_size)) * Math::sqrt(sum)
267
267
  end
268
268
  # Cochran(1971), p. 150
269
269
  def variance_pst(field,v=1)
@@ -0,0 +1,96 @@
1
+ module Statsample
2
+ # Permutation class systematically generates all permutations
3
+ # of the elements of an array, using Dijkstra's algorithm (1997).
4
+ #
5
+ # As argument, you can pass either:
6
+ # * Number of elements: an array with numbers from 0 to n-1 will be used
7
+ # * Array: if sorted, the permutations are obtained in lexicographic order;
8
+ # repeated elements are allowed.
9
+ #
10
+ # Use:
11
+ # perm=Statsample::Permutation.new(3)
12
+ # perm.permutations
13
+ # => [[0,1,2],[0,2,1],[1,0,2],[1,2,0],[2,0,1],[2,1,0]]
14
+ # Statsample::Permutation.new([0,0,1,1]).permutations
15
+ # => [[0,0,1,1],[0,1,0,1],[0,1,1,0],[1,0,0,1],[1,0,1,0],[1,1,0,0]]
16
+ #
17
+ # Reference: http://www.cut-the-knot.org/do_you_know/AllPerm.shtml
18
+ class Permutation
19
+ attr_reader :permutation_number
20
+ def initialize(v)
21
+ if v.is_a? Numeric
22
+ @original=(0...v.to_i).to_a
23
+ @permutation_number=factorial(v)
24
+ else
25
+ @original=v
26
+ calculate_max_iterations_from_array
27
+ end
28
+ @n=@original.size
29
+ reset
30
+ end
31
+ def calculate_max_iterations_from_array
32
+ if @original.respond_to? :frequencies
33
+ freq=@original.frequencies
34
+ else
35
+ freq=@original.to_vector.frequencies
36
+ end
37
+ if freq.length==@original.size
38
+ @permutation_number=factorial(@original.size)
39
+ else
40
+ numerator=factorial(@original.size)
41
+ denominator=freq.inject(1) {|a,v|
42
+ a*factorial(v[1])
43
+ }
44
+ @permutation_number=numerator/denominator
45
+ end
46
+ end
47
+ def factorial (n)
48
+ (1..n).inject(1){|a,v| a*v}
49
+ end
50
+ def reset
51
+ @iterations=0
52
+ @data=@original.dup
53
+ end
54
+ def each
55
+ reset
56
+ @permutation_number.times do
57
+ yield next_value
58
+ end
59
+ end
60
+ def permutations
61
+ a=Array.new
62
+ each {|c| a.push(c)}
63
+ a
64
+ end
65
+ def next_value
66
+ prev=@data.dup
67
+ i = @n-1
68
+ while @data[i-1] >= @data[i]
69
+ #return false if i<0
70
+ i=i-1
71
+ end
72
+ j=@n
73
+ while @data[j-1] <= @data[i-1]
74
+ j=j-1
75
+ end
76
+ # swap values at positions (i-1) and (j-1)
77
+ swap(i-1, j-1);
78
+
79
+ i+=1
80
+ j = @n
81
+
82
+ while (i < j)
83
+ swap(i-1, j-1);
84
+ i+=1;
85
+ j-=1;
86
+ sprintf("%d %d",i,j)
87
+ end
88
+ prev
89
+ end
90
+ def swap(i,j)
91
+ tmp=@data[i]
92
+ @data[i]=@data[j]
93
+ @data[j]=tmp
94
+ end
95
+ end
96
+ end
@@ -8,7 +8,7 @@ require 'statsample/regression/binomial/logit'
8
8
  require 'statsample/regression/binomial/probit'
9
9
 
10
10
  module Statsample
11
- # Module for regression procedures
11
+ # Module for regression procedures.
12
12
  module Regression
13
13
  end
14
14
  end
@@ -1,91 +1,91 @@
1
-
2
1
  module Statsample
3
- module Regression
4
- module Binomial
5
- # Create a Logit model object.
6
- # ds:: Dataset
7
- # y:: Name of dependent vector
8
- # Use
9
- # dataset=Statsample::CSV.read("data.csv")
10
- # y="y"
11
- # lr=Statsample::Regression::Binomial.logit(dataset,y)
12
- #
13
- def self.logit(ds,y_var)
14
- Logit.new(ds,y_var)
15
- end
16
- # Create a Probit model object.
17
- # ds:: Dataset
18
- # y:: Name of dependent vector
19
- # Use
20
- # dataset=Statsample::CSV.read("data.csv")
21
- # y="y"
22
- # lr=Statsample::Regression::Binomial.probit(dataset,y)
23
- #
24
-
25
- def self.probit(ds,y_var)
26
- Probit.new(ds,y_var)
27
- end
28
- # Base Engine for binomial regression analysis.
29
- # See Statsample::Regression::Binomial.logit() and
30
- # Statsample::Regression::Binomial.probit for fast
31
- # access methods.
32
- #
33
- # Use:
34
- # dataset=Statsample::CSV.read("data.csv")
35
- # y="y"
36
- # model=Statsample::MLE::Logit.new
37
- # lr=Statsample::Regression::Binomial::BaseEngine(dataset, y, model)
38
- class BaseEngine
39
- attr_reader :log_likehood, :iterations
40
- def initialize(ds,y_var,model)
41
- @ds=ds
42
- @y_var=y_var
43
- @dy=@ds[@y_var]
44
- @ds_indep=ds.dup(ds.fields-[y_var])
45
- constant=([1.0]*ds.cases).to_vector(:scale)
46
- @ds_indep.add_vector("_constant",constant)
47
- mat_x=@ds_indep.to_matrix
48
- mat_y=@dy.to_matrix(:vertical)
49
- @fields=@ds_indep.fields
50
- @model=model
51
- coeffs=model.newton_raphson(mat_x, mat_y)
52
- @coeffs=assign_names(coeffs.column(0).to_a)
53
- @iterations=model.iterations
54
- @var_cov_matrix=model.var_cov_matrix
55
- @log_likehood=model.log_likehood(mat_x, mat_y, coeffs)
56
- end # init
57
- # Coefficients standard error
58
- def coeffs_se
59
- out={}
60
- @fields.each_index{|i|
61
- f=@fields[i]
62
- out[f]=Math::sqrt(@var_cov_matrix[i,i])
63
- }
64
- out.delete("_constant")
65
- out
66
- end
67
- def constant
68
- @coeffs['_constant']
69
- end
70
- def coeffs
71
- c=@coeffs.dup
72
- c.delete("_constant")
73
- c
74
- end
75
- # Constant standard error
76
- def constant_se
77
- i=@fields.index :_constant
78
- Math::sqrt(@var_cov_matrix[i,i])
79
- end
80
- def assign_names(c)
81
- a={}
82
- @fields.each_index {|i|
83
- a[@fields[i]]=c[i]
84
- }
85
- a
86
- end
87
- end # Base Engine
88
-
89
- end # Dichotomic
90
- end # Regression
2
+ module Regression
3
+ module Binomial
4
+ # Create a Logit model object.
5
+ # ds:: Dataset
6
+ # y:: Name of dependent vector
7
+ # Use
8
+ # dataset=Statsample::CSV.read("data.csv")
9
+ # y="y"
10
+ # lr=Statsample::Regression::Binomial.logit(dataset,y)
11
+ #
12
+ def self.logit(ds,y_var)
13
+ Logit.new(ds,y_var)
14
+ end
15
+ # Create a Probit model object.
16
+ # ds:: Dataset
17
+ # y:: Name of dependent vector
18
+ # Use
19
+ # dataset=Statsample::CSV.read("data.csv")
20
+ # y="y"
21
+ # lr=Statsample::Regression::Binomial.probit(dataset,y)
22
+ #
23
+
24
+ def self.probit(ds,y_var)
25
+ Probit.new(ds,y_var)
26
+ end
27
+ # Base Engine for binomial regression analysis.
28
+ # See Statsample::Regression::Binomial.logit() and
29
+ # Statsample::Regression::Binomial.probit for fast
30
+ # access methods.
31
+ #
32
+ # Use:
33
+ # dataset=Statsample::CSV.read("data.csv")
34
+ # y="y"
35
+ # model=Statsample::MLE::Logit.new
36
+ # lr=Statsample::Regression::Binomial::BaseEngine.new(dataset, y, model)
37
+ class BaseEngine
38
+ attr_reader :log_likehood, :iterations
39
+ def initialize(ds,y_var,model)
40
+ @ds=ds
41
+ @y_var=y_var
42
+ @dy=@ds[@y_var]
43
+ @ds_indep=ds.dup(ds.fields-[y_var])
44
+ constant=([1.0]*ds.cases).to_vector(:scale)
45
+ @ds_indep.add_vector("_constant",constant)
46
+ mat_x=@ds_indep.to_matrix
47
+ mat_y=@dy.to_matrix(:vertical)
48
+ @fields=@ds_indep.fields
49
+ @model=model
50
+ coeffs=model.newton_raphson(mat_x, mat_y)
51
+ @coeffs=assign_names(coeffs.column(0).to_a)
52
+ @iterations=model.iterations
53
+ @var_cov_matrix=model.var_cov_matrix
54
+ @log_likehood=model.log_likehood(mat_x, mat_y, coeffs)
55
+ end # init
56
+ # Coefficients standard error
57
+ def coeffs_se
58
+ out={}
59
+ @fields.each_index{|i|
60
+ f=@fields[i]
61
+ out[f]=Math::sqrt(@var_cov_matrix[i,i])
62
+ }
63
+ out.delete("_constant")
64
+ out
65
+ end
66
+ # Constant value
67
+ def constant
68
+ @coeffs['_constant']
69
+ end
70
+ # Regression coefficients
71
+ def coeffs
72
+ c=@coeffs.dup
73
+ c.delete("_constant")
74
+ c
75
+ end
76
+ # Constant standard error
77
+ def constant_se
78
+ i=@fields.index :_constant
79
+ Math::sqrt(@var_cov_matrix[i,i])
80
+ end
81
+ def assign_names(c)
82
+ a={}
83
+ @fields.each_index do |i|
84
+ a[@fields[i]]=c[i]
85
+ end
86
+ a
87
+ end
88
+ end # Base Engine
89
+ end # Binomial
90
+ end # Regression
91
91
  end # Statsample
@@ -1,13 +1,13 @@
1
1
  module Statsample
2
- module Regression
3
- module Binomial
4
- # Logistic Regression
5
- class Logit < BaseEngine
6
- def initialize(ds,y_var)
7
- model=Statsample::MLE::Logit.new
8
- super(ds,y_var,model)
9
- end
10
- end
2
+ module Regression
3
+ module Binomial
4
+ # Logistic Regression
5
+ class Logit < BaseEngine
6
+ def initialize(ds,y_var)
7
+ model=Statsample::MLE::Logit.new
8
+ super(ds,y_var,model)
11
9
  end
10
+ end
12
11
  end
12
+ end
13
13
  end
@@ -1,13 +1,13 @@
1
1
  module Statsample
2
- module Regression
3
- module Binomial
4
- # Logistic Regression
5
- class Probit < BaseEngine
6
- def initialize(ds,y_var)
7
- model=Statsample::MLE::Probit.new
8
- super(ds,y_var,model)
9
- end
10
- end
2
+ module Regression
3
+ module Binomial
4
+ # Probit Regression
5
+ class Probit < BaseEngine
6
+ def initialize(ds,y_var)
7
+ model=Statsample::MLE::Probit.new
8
+ super(ds,y_var,model)
11
9
  end
10
+ end
12
11
  end
12
+ end
13
13
  end
@@ -1,8 +1,10 @@
1
1
  require 'statsample/regression/multiple/baseengine'
2
2
  module Statsample
3
3
  module Regression
4
- # Module for Linear Multiple Regression Analysis
5
- # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise or instance directly the engines
4
+ # Module for Linear Multiple Regression Analysis.
5
+ #
6
+ # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise, or instantiate the engines directly.
7
+ #
6
8
  # Example.
7
9
  #
8
10
  # require 'statsample'
@@ -37,18 +39,10 @@ module Statsample
37
39
  def self.listwise_by_exp(ds,exp)
38
40
  raise "Not implemented yet"
39
41
  end
40
- # Returns a dataset and name of criteria using a expression.
41
- # All nominal vectors are replaced by dummy coding
42
- # and interactions are calculated
43
-
44
- def self.ds_by_exp(ds,exp)
45
- raise "Not implemented"
46
- parts=exp.split(/[\+=]/)
47
- dependent=parts.pop
48
- ds_out=[]
49
- parts.each{|p|
50
-
51
- }
42
+ # Obtain r2 for regressors
43
+ def self.r2_from_matrices(rxx,rxy)
44
+ matrix=(rxy.transpose*rxx.inverse*rxy)
45
+ matrix[0,0]
52
46
  end
53
47
 
54
48
  end
@@ -78,7 +78,7 @@ class GslEngine < BaseEngine
78
78
  r**2
79
79
  end
80
80
  def r
81
- Bivariate::pearson(@dy,predicted)
81
+ Bivariate::pearson(@dy, predicted)
82
82
  end
83
83
  def sst
84
84
  @dy.ss
@@ -16,53 +16,53 @@ module Multiple
16
16
  # lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
17
17
 
18
18
  class RubyEngine < BaseEngine
19
- def initialize(ds,y_var)
19
+ def initialize(ds,y_var)
20
20
  super
21
- @dy=ds[@y_var]
22
- @ds_valid=ds.dup_only_valid
23
- @ds_indep=ds.dup(ds.fields-[y_var])
24
- @fields=@ds_indep.fields
25
- set_dep_columns
26
- obtain_y_vector
27
- @matrix_x = Bivariate.correlation_matrix(@ds_indep)
28
- @coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
29
- @min_n_valid=nil
30
- end
31
- def min_n_valid
32
- if @min_n_valid.nil?
33
- min=@ds.cases
34
- m=Bivariate::n_valid_matrix(@ds)
35
- for x in 0...m.row_size
36
- for y in 0...m.column_size
37
- min=m[x,y] if m[x,y] < min
38
- end
39
- end
40
- @min_n_valid=min
21
+ @dy=ds[@y_var]
22
+ @ds_valid=ds.dup_only_valid
23
+ @ds_indep=ds.dup(ds.fields-[y_var])
24
+ @fields=@ds_indep.fields
25
+ set_dep_columns
26
+ obtain_y_vector
27
+ @matrix_x = Bivariate.correlation_matrix(@ds_indep)
28
+ @coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
29
+ @min_n_valid=nil
30
+ end
31
+ def min_n_valid
32
+ if @min_n_valid.nil?
33
+ min=@ds.cases
34
+ m=Bivariate::n_valid_matrix(@ds)
35
+ for x in 0...m.row_size
36
+ for y in 0...m.column_size
37
+ min=m[x,y] if m[x,y] < min
41
38
  end
42
- @min_n_valid
43
- end
44
- def set_dep_columns
45
- @dep_columns=[]
46
- @ds_indep.each_vector{|k,v|
47
- @dep_columns.push(v.data_with_nils)
48
- }
39
+ end
40
+ @min_n_valid=min
49
41
  end
42
+ @min_n_valid
43
+ end
44
+ def set_dep_columns
45
+ @dep_columns=[]
46
+ @ds_indep.each_vector{|k,v|
47
+ @dep_columns.push(v.data_with_nils)
48
+ }
49
+ end
50
50
  # Total sum of squares
51
- def sst
52
- #if @sst.nil?
53
- @sst=@dy.variance*(min_n_valid-1.0)
54
- #end
55
- @sst
56
- end
57
- def r2
58
- if @r2.nil?
59
- c=@matrix_y
60
- rxx=obtain_predictor_matrix
61
- matrix=(c.t*rxx.inverse*c)
62
- @r2=matrix[0,0]
63
- end
64
- @r2
51
+ def sst
52
+ #if @sst.nil?
53
+ @sst=@dy.variance*(min_n_valid-1.0)
54
+ #end
55
+ @sst
56
+ end
57
+ def r2
58
+ if @r2.nil?
59
+ c=@matrix_y
60
+ rxx=obtain_predictor_matrix
61
+ matrix=(c.t*rxx.inverse*c)
62
+ @r2=matrix[0,0]
65
63
  end
64
+ @r2
65
+ end
66
66
  def r
67
67
  Math::sqrt(r2)
68
68
  end
@@ -71,19 +71,19 @@ class RubyEngine < BaseEngine
71
71
  min_n_valid-@dep_columns.size-1
72
72
  end
73
73
  def fix_with_mean
74
- i=0
75
- @ds_indep.each{|row|
76
- empty=[]
77
- row.each{|k,v|
78
- empty.push(k) if v.nil?
79
- }
80
- if empty.size==1
81
- @ds_indep[empty[0]][i]=@ds[empty[0]].mean
82
- end
83
- i+=1
84
- }
85
- @ds_indep.update_valid_data
86
- set_dep_columns
74
+ i=0
75
+ @ds_indep.each do |row|
76
+ empty=[]
77
+ row.each do |k,v|
78
+ empty.push(k) if v.nil?
79
+ end
80
+ if empty.size==1
81
+ @ds_indep[empty[0]][i]=@ds[empty[0]].mean
82
+ end
83
+ i+=1
84
+ end
85
+ @ds_indep.update_valid_data
86
+ set_dep_columns
87
87
  end
88
88
  def fix_with_regression
89
89
  i=0