statsample 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
@@ -263,7 +263,7 @@ module Statsample
263
263
  s_size=@strata_sizes[s_name]
264
264
  (s_size**2 * (1-(nh/s_size)) * prop * (1-prop) / (nh -1 ))
265
265
  }
266
- (1/@population_size.to_f) * Math::sqrt(sum)
266
+ (1.quo(@population_size)) * Math::sqrt(sum)
267
267
  end
268
268
  # Cochran(1971), p. 150
269
269
  def variance_pst(field,v=1)
@@ -0,0 +1,96 @@
1
+ module Statsample
2
+ # Permutation class systematically generates all permutations
3
+ # of the elements of an array, using Dijkstra's algorithm (1997).
4
+ #
5
+ # As argument, you could use
6
+ # * Number of elements: an array with numbers from 0 to n-1 will be used
7
+ # * Array: if it is sorted, the permutations are generated in lexicographic order.
8
+ # The array may contain repeated elements.
9
+ #
10
+ # Use:
11
+ # perm=Statsample::Permutation.new(3)
12
+ # perm.permutations
13
+ # => [[0,1,2],[0,2,1],[1,0,2],[1,2,0],[2,0,1],[2,1,0]]
14
+ # Statsample::Permutation.new([0,0,1,1]).permutations
15
+ # => [[0,0,1,1],[0,1,0,1],[0,1,1,0],[1,0,0,1],[1,0,1,0],[1,1,0,0]]
16
+ #
17
+ # Reference: http://www.cut-the-knot.org/do_you_know/AllPerm.shtml
18
+ class Permutation
19
+ attr_reader :permutation_number
20
+ def initialize(v)
21
+ if v.is_a? Numeric
22
+ @original=(0...v.to_i).to_a
23
+ @permutation_number=factorial(v)
24
+ else
25
+ @original=v
26
+ calculate_max_iterations_from_array
27
+ end
28
+ @n=@original.size
29
+ reset
30
+ end
31
+ def calculate_max_iterations_from_array
32
+ if @original.respond_to? :frequencies
33
+ freq=@original.frequencies
34
+ else
35
+ freq=@original.to_vector.frequencies
36
+ end
37
+ if freq.length==@original.size
38
+ @permutation_number=factorial(@original.size)
39
+ else
40
+ numerator=factorial(@original.size)
41
+ denominator=freq.inject(1) {|a,v|
42
+ a*factorial(v[1])
43
+ }
44
+ @permutation_number=numerator/denominator
45
+ end
46
+ end
47
+ def factorial (n)
48
+ (1..n).inject(1){|a,v| a*v}
49
+ end
50
+ def reset
51
+ @iterations=0
52
+ @data=@original.dup
53
+ end
54
+ def each
55
+ reset
56
+ @permutation_number.times do
57
+ yield next_value
58
+ end
59
+ end
60
+ def permutations
61
+ a=Array.new
62
+ each {|c| a.push(c)}
63
+ a
64
+ end
65
+ def next_value
66
+ prev=@data.dup
67
+ i = @n-1
68
+ while @data[i-1] >= @data[i]
69
+ #return false if i<0
70
+ i=i-1
71
+ end
72
+ j=@n
73
+ while @data[j-1] <= @data[i-1]
74
+ j=j-1
75
+ end
76
+ # swap values at positions (i-1) and (j-1)
77
+ swap(i-1, j-1);
78
+
79
+ i+=1
80
+ j = @n
81
+
82
+ while (i < j)
83
+ swap(i-1, j-1);
84
+ i+=1;
85
+ j-=1;
86
+ sprintf("%d %d",i,j)
87
+ end
88
+ prev
89
+ end
90
+ def swap(i,j)
91
+ tmp=@data[i]
92
+ @data[i]=@data[j]
93
+ @data[j]=tmp
94
+ end
95
+ end
96
+ end
@@ -8,7 +8,7 @@ require 'statsample/regression/binomial/logit'
8
8
  require 'statsample/regression/binomial/probit'
9
9
 
10
10
  module Statsample
11
- # Module for regression procedures
11
+ # Module for regression procedures.
12
12
  module Regression
13
13
  end
14
14
  end
@@ -1,91 +1,91 @@
1
-
2
1
  module Statsample
3
- module Regression
4
- module Binomial
5
- # Create a Logit model object.
6
- # ds:: Dataset
7
- # y:: Name of dependent vector
8
- # Use
9
- # dataset=Statsample::CSV.read("data.csv")
10
- # y="y"
11
- # lr=Statsample::Regression::Binomial.logit(dataset,y)
12
- #
13
- def self.logit(ds,y_var)
14
- Logit.new(ds,y_var)
15
- end
16
- # Create a Probit model object.
17
- # ds:: Dataset
18
- # y:: Name of dependent vector
19
- # Use
20
- # dataset=Statsample::CSV.read("data.csv")
21
- # y="y"
22
- # lr=Statsample::Regression::Binomial.probit(dataset,y)
23
- #
24
-
25
- def self.probit(ds,y_var)
26
- Probit.new(ds,y_var)
27
- end
28
- # Base Engine for binomial regression analysis.
29
- # See Statsample::Regression::Binomial.logit() and
30
- # Statsample::Regression::Binomial.probit for fast
31
- # access methods.
32
- #
33
- # Use:
34
- # dataset=Statsample::CSV.read("data.csv")
35
- # y="y"
36
- # model=Statsample::MLE::Logit.new
37
- # lr=Statsample::Regression::Binomial::BaseEngine(dataset, y, model)
38
- class BaseEngine
39
- attr_reader :log_likehood, :iterations
40
- def initialize(ds,y_var,model)
41
- @ds=ds
42
- @y_var=y_var
43
- @dy=@ds[@y_var]
44
- @ds_indep=ds.dup(ds.fields-[y_var])
45
- constant=([1.0]*ds.cases).to_vector(:scale)
46
- @ds_indep.add_vector("_constant",constant)
47
- mat_x=@ds_indep.to_matrix
48
- mat_y=@dy.to_matrix(:vertical)
49
- @fields=@ds_indep.fields
50
- @model=model
51
- coeffs=model.newton_raphson(mat_x, mat_y)
52
- @coeffs=assign_names(coeffs.column(0).to_a)
53
- @iterations=model.iterations
54
- @var_cov_matrix=model.var_cov_matrix
55
- @log_likehood=model.log_likehood(mat_x, mat_y, coeffs)
56
- end # init
57
- # Coefficients standard error
58
- def coeffs_se
59
- out={}
60
- @fields.each_index{|i|
61
- f=@fields[i]
62
- out[f]=Math::sqrt(@var_cov_matrix[i,i])
63
- }
64
- out.delete("_constant")
65
- out
66
- end
67
- def constant
68
- @coeffs['_constant']
69
- end
70
- def coeffs
71
- c=@coeffs.dup
72
- c.delete("_constant")
73
- c
74
- end
75
- # Constant standard error
76
- def constant_se
77
- i=@fields.index :_constant
78
- Math::sqrt(@var_cov_matrix[i,i])
79
- end
80
- def assign_names(c)
81
- a={}
82
- @fields.each_index {|i|
83
- a[@fields[i]]=c[i]
84
- }
85
- a
86
- end
87
- end # Base Engine
88
-
89
- end # Dichotomic
90
- end # Regression
2
+ module Regression
3
+ module Binomial
4
+ # Create a Logit model object.
5
+ # ds:: Dataset
6
+ # y:: Name of dependent vector
7
+ # Use
8
+ # dataset=Statsample::CSV.read("data.csv")
9
+ # y="y"
10
+ # lr=Statsample::Regression::Binomial.logit(dataset,y)
11
+ #
12
+ def self.logit(ds,y_var)
13
+ Logit.new(ds,y_var)
14
+ end
15
+ # Create a Probit model object.
16
+ # ds:: Dataset
17
+ # y:: Name of dependent vector
18
+ # Use
19
+ # dataset=Statsample::CSV.read("data.csv")
20
+ # y="y"
21
+ # lr=Statsample::Regression::Binomial.probit(dataset,y)
22
+ #
23
+
24
+ def self.probit(ds,y_var)
25
+ Probit.new(ds,y_var)
26
+ end
27
+ # Base Engine for binomial regression analysis.
28
+ # See Statsample::Regression::Binomial.logit() and
29
+ # Statsample::Regression::Binomial.probit for fast
30
+ # access methods.
31
+ #
32
+ # Use:
33
+ # dataset=Statsample::CSV.read("data.csv")
34
+ # y="y"
35
+ # model=Statsample::MLE::Logit.new
36
+ # lr=Statsample::Regression::Binomial::BaseEngine.new(dataset, y, model)
37
+ class BaseEngine
38
+ attr_reader :log_likehood, :iterations
39
+ def initialize(ds,y_var,model)
40
+ @ds=ds
41
+ @y_var=y_var
42
+ @dy=@ds[@y_var]
43
+ @ds_indep=ds.dup(ds.fields-[y_var])
44
+ constant=([1.0]*ds.cases).to_vector(:scale)
45
+ @ds_indep.add_vector("_constant",constant)
46
+ mat_x=@ds_indep.to_matrix
47
+ mat_y=@dy.to_matrix(:vertical)
48
+ @fields=@ds_indep.fields
49
+ @model=model
50
+ coeffs=model.newton_raphson(mat_x, mat_y)
51
+ @coeffs=assign_names(coeffs.column(0).to_a)
52
+ @iterations=model.iterations
53
+ @var_cov_matrix=model.var_cov_matrix
54
+ @log_likehood=model.log_likehood(mat_x, mat_y, coeffs)
55
+ end # init
56
+ # Coefficients standard error
57
+ def coeffs_se
58
+ out={}
59
+ @fields.each_index{|i|
60
+ f=@fields[i]
61
+ out[f]=Math::sqrt(@var_cov_matrix[i,i])
62
+ }
63
+ out.delete("_constant")
64
+ out
65
+ end
66
+ # Constant value
67
+ def constant
68
+ @coeffs['_constant']
69
+ end
70
+ # Regression coefficients
71
+ def coeffs
72
+ c=@coeffs.dup
73
+ c.delete("_constant")
74
+ c
75
+ end
76
+ # Constant standard error
77
+ def constant_se
78
+ i=@fields.index :_constant
79
+ Math::sqrt(@var_cov_matrix[i,i])
80
+ end
81
+ def assign_names(c)
82
+ a={}
83
+ @fields.each_index do |i|
84
+ a[@fields[i]]=c[i]
85
+ end
86
+ a
87
+ end
88
+ end # Base Engine
89
+ end # Binomial
90
+ end # Regression
91
91
  end # Statsample
@@ -1,13 +1,13 @@
1
1
  module Statsample
2
- module Regression
3
- module Binomial
4
- # Logistic Regression
5
- class Logit < BaseEngine
6
- def initialize(ds,y_var)
7
- model=Statsample::MLE::Logit.new
8
- super(ds,y_var,model)
9
- end
10
- end
2
+ module Regression
3
+ module Binomial
4
+ # Logistic Regression
5
+ class Logit < BaseEngine
6
+ def initialize(ds,y_var)
7
+ model=Statsample::MLE::Logit.new
8
+ super(ds,y_var,model)
11
9
  end
10
+ end
12
11
  end
12
+ end
13
13
  end
@@ -1,13 +1,13 @@
1
1
  module Statsample
2
- module Regression
3
- module Binomial
4
- # Logistic Regression
5
- class Probit < BaseEngine
6
- def initialize(ds,y_var)
7
- model=Statsample::MLE::Probit.new
8
- super(ds,y_var,model)
9
- end
10
- end
2
+ module Regression
3
+ module Binomial
4
+ # Probit Regression
5
+ class Probit < BaseEngine
6
+ def initialize(ds,y_var)
7
+ model=Statsample::MLE::Probit.new
8
+ super(ds,y_var,model)
11
9
  end
10
+ end
12
11
  end
12
+ end
13
13
  end
@@ -1,8 +1,10 @@
1
1
  require 'statsample/regression/multiple/baseengine'
2
2
  module Statsample
3
3
  module Regression
4
- # Module for Linear Multiple Regression Analysis
5
- # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise or instance directly the engines
4
+ # Module for Linear Multiple Regression Analysis.
5
+ #
6
+ # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise, or instantiate the engines directly.
7
+ #
6
8
  # Example.
7
9
  #
8
10
  # require 'statsample'
@@ -37,18 +39,10 @@ module Statsample
37
39
  def self.listwise_by_exp(ds,exp)
38
40
  raise "Not implemented yet"
39
41
  end
40
- # Returns a dataset and name of criteria using a expression.
41
- # All nominal vectors are replaced by dummy coding
42
- # and interactions are calculated
43
-
44
- def self.ds_by_exp(ds,exp)
45
- raise "Not implemented"
46
- parts=exp.split(/[\+=]/)
47
- dependent=parts.pop
48
- ds_out=[]
49
- parts.each{|p|
50
-
51
- }
42
+ # Obtain r2 for regressors
43
+ def self.r2_from_matrices(rxx,rxy)
44
+ matrix=(rxy.transpose*rxx.inverse*rxy)
45
+ matrix[0,0]
52
46
  end
53
47
 
54
48
  end
@@ -78,7 +78,7 @@ class GslEngine < BaseEngine
78
78
  r**2
79
79
  end
80
80
  def r
81
- Bivariate::pearson(@dy,predicted)
81
+ Bivariate::pearson(@dy, predicted)
82
82
  end
83
83
  def sst
84
84
  @dy.ss
@@ -16,53 +16,53 @@ module Multiple
16
16
  # lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
17
17
 
18
18
  class RubyEngine < BaseEngine
19
- def initialize(ds,y_var)
19
+ def initialize(ds,y_var)
20
20
  super
21
- @dy=ds[@y_var]
22
- @ds_valid=ds.dup_only_valid
23
- @ds_indep=ds.dup(ds.fields-[y_var])
24
- @fields=@ds_indep.fields
25
- set_dep_columns
26
- obtain_y_vector
27
- @matrix_x = Bivariate.correlation_matrix(@ds_indep)
28
- @coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
29
- @min_n_valid=nil
30
- end
31
- def min_n_valid
32
- if @min_n_valid.nil?
33
- min=@ds.cases
34
- m=Bivariate::n_valid_matrix(@ds)
35
- for x in 0...m.row_size
36
- for y in 0...m.column_size
37
- min=m[x,y] if m[x,y] < min
38
- end
39
- end
40
- @min_n_valid=min
21
+ @dy=ds[@y_var]
22
+ @ds_valid=ds.dup_only_valid
23
+ @ds_indep=ds.dup(ds.fields-[y_var])
24
+ @fields=@ds_indep.fields
25
+ set_dep_columns
26
+ obtain_y_vector
27
+ @matrix_x = Bivariate.correlation_matrix(@ds_indep)
28
+ @coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
29
+ @min_n_valid=nil
30
+ end
31
+ def min_n_valid
32
+ if @min_n_valid.nil?
33
+ min=@ds.cases
34
+ m=Bivariate::n_valid_matrix(@ds)
35
+ for x in 0...m.row_size
36
+ for y in 0...m.column_size
37
+ min=m[x,y] if m[x,y] < min
41
38
  end
42
- @min_n_valid
43
- end
44
- def set_dep_columns
45
- @dep_columns=[]
46
- @ds_indep.each_vector{|k,v|
47
- @dep_columns.push(v.data_with_nils)
48
- }
39
+ end
40
+ @min_n_valid=min
49
41
  end
42
+ @min_n_valid
43
+ end
44
+ def set_dep_columns
45
+ @dep_columns=[]
46
+ @ds_indep.each_vector{|k,v|
47
+ @dep_columns.push(v.data_with_nils)
48
+ }
49
+ end
50
50
  # Sum of square total
51
- def sst
52
- #if @sst.nil?
53
- @sst=@dy.variance*(min_n_valid-1.0)
54
- #end
55
- @sst
56
- end
57
- def r2
58
- if @r2.nil?
59
- c=@matrix_y
60
- rxx=obtain_predictor_matrix
61
- matrix=(c.t*rxx.inverse*c)
62
- @r2=matrix[0,0]
63
- end
64
- @r2
51
+ def sst
52
+ #if @sst.nil?
53
+ @sst=@dy.variance*(min_n_valid-1.0)
54
+ #end
55
+ @sst
56
+ end
57
+ def r2
58
+ if @r2.nil?
59
+ c=@matrix_y
60
+ rxx=obtain_predictor_matrix
61
+ matrix=(c.t*rxx.inverse*c)
62
+ @r2=matrix[0,0]
65
63
  end
64
+ @r2
65
+ end
66
66
  def r
67
67
  Math::sqrt(r2)
68
68
  end
@@ -71,19 +71,19 @@ class RubyEngine < BaseEngine
71
71
  min_n_valid-@dep_columns.size-1
72
72
  end
73
73
  def fix_with_mean
74
- i=0
75
- @ds_indep.each{|row|
76
- empty=[]
77
- row.each{|k,v|
78
- empty.push(k) if v.nil?
79
- }
80
- if empty.size==1
81
- @ds_indep[empty[0]][i]=@ds[empty[0]].mean
82
- end
83
- i+=1
84
- }
85
- @ds_indep.update_valid_data
86
- set_dep_columns
74
+ i=0
75
+ @ds_indep.each do |row|
76
+ empty=[]
77
+ row.each do |k,v|
78
+ empty.push(k) if v.nil?
79
+ end
80
+ if empty.size==1
81
+ @ds_indep[empty[0]][i]=@ds[empty[0]].mean
82
+ end
83
+ i+=1
84
+ end
85
+ @ds_indep.update_valid_data
86
+ set_dep_columns
87
87
  end
88
88
  def fix_with_regression
89
89
  i=0