statsample 0.5.1 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +12 -0
- data/Manifest.txt +13 -0
- data/README.txt +2 -1
- data/demo/pca.rb +29 -0
- data/demo/umann.rb +8 -0
- data/lib/distribution.rb +0 -1
- data/lib/matrix_extension.rb +35 -21
- data/lib/statsample.rb +31 -28
- data/lib/statsample/anova.rb +7 -2
- data/lib/statsample/bivariate.rb +17 -11
- data/lib/statsample/codification.rb +136 -87
- data/lib/statsample/combination.rb +0 -2
- data/lib/statsample/converter/csv18.rb +1 -1
- data/lib/statsample/converter/csv19.rb +1 -1
- data/lib/statsample/converters.rb +176 -171
- data/lib/statsample/crosstab.rb +227 -154
- data/lib/statsample/dataset.rb +94 -12
- data/lib/statsample/dominanceanalysis.rb +69 -62
- data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
- data/lib/statsample/factor.rb +18 -0
- data/lib/statsample/factor/pca.rb +128 -0
- data/lib/statsample/factor/principalaxis.rb +133 -0
- data/lib/statsample/factor/rotation.rb +125 -0
- data/lib/statsample/histogram.rb +99 -0
- data/lib/statsample/mle.rb +125 -126
- data/lib/statsample/mle/logit.rb +91 -91
- data/lib/statsample/mle/probit.rb +84 -85
- data/lib/statsample/multiset.rb +1 -1
- data/lib/statsample/permutation.rb +96 -0
- data/lib/statsample/regression.rb +1 -1
- data/lib/statsample/regression/binomial.rb +89 -89
- data/lib/statsample/regression/binomial/logit.rb +9 -9
- data/lib/statsample/regression/binomial/probit.rb +9 -9
- data/lib/statsample/regression/multiple.rb +8 -14
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
- data/lib/statsample/resample.rb +12 -17
- data/lib/statsample/srs.rb +4 -1
- data/lib/statsample/test.rb +23 -22
- data/lib/statsample/test/umannwhitney.rb +182 -0
- data/lib/statsample/vector.rb +854 -815
- data/test/test_bivariate.rb +132 -132
- data/test/test_codification.rb +71 -50
- data/test/test_dataset.rb +19 -1
- data/test/test_factor.rb +44 -0
- data/test/test_histogram.rb +26 -0
- data/test/test_permutation.rb +37 -0
- data/test/test_statistics.rb +74 -63
- data/test/test_umannwhitney.rb +17 -0
- data/test/test_vector.rb +46 -30
- metadata +31 -4
data/lib/statsample/multiset.rb
CHANGED
@@ -263,7 +263,7 @@ module Statsample
|
|
263
263
|
s_size=@strata_sizes[s_name]
|
264
264
|
(s_size**2 * (1-(nh/s_size)) * prop * (1-prop) / (nh -1 ))
|
265
265
|
}
|
266
|
-
(1
|
266
|
+
(1.quo(@population_size)) * Math::sqrt(sum)
|
267
267
|
end
|
268
268
|
# Cochran(1971), p. 150
|
269
269
|
def variance_pst(field,v=1)
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Permutation class systematically generates all permutations
|
3
|
+
# of elements on an array, using Dijkstra algorithm (1997).
|
4
|
+
#
|
5
|
+
# As argument, you could use
|
6
|
+
# * Number of elements: an array with numbers from 0 to n-1 will be used
|
7
|
+
# * Array: if ordered, you obtain permutations in lexicographic order
|
8
|
+
# you can repeat elements, if you will.
|
9
|
+
#
|
10
|
+
# Use:
|
11
|
+
# perm=Statsample::Permutation.new(3)
|
12
|
+
# perm.permutations
|
13
|
+
# => [[0,1,2],[0,2,1],[1,0,2],[1,2,0],[2,0,1],[2,1,0]]
|
14
|
+
# perm=Statsample::Permutation.new([0,0,1,1]).permutations
|
15
|
+
# => [[0,0,1,1],[0,1,0,1],[0,1,1,0],[1,0,0,1],[1,0,1,0],[1,1,0,0]]
|
16
|
+
#
|
17
|
+
# Reference: http://www.cut-the-knot.org/do_you_know/AllPerm.shtml
|
18
|
+
class Permutation
  # Number of distinct arrangements of the source elements:
  # n! divided by the product of the factorials of each element's
  # multiplicity.
  attr_reader :permutation_number

  # v:: either a Numeric (the elements 0...v.to_i are permuted) or an
  #     array-like object whose elements can be compared with <= and >=.
  def initialize(v)
    if v.is_a? Numeric
      @original = (0...v.to_i).to_a
      @permutation_number = factorial(@original.size)
    else
      @original = v
      calculate_max_iterations_from_array
    end
    @n = @original.size
    reset
  end

  # Sets @permutation_number from the multiplicity of every element of
  # @original. Frequencies are taken from the object itself when it
  # responds to +frequencies+, from its +to_vector+ conversion when that
  # is available, and from a plain count otherwise.
  def calculate_max_iterations_from_array
    freq = if @original.respond_to? :frequencies
             @original.frequencies
           elsif @original.respond_to? :to_vector
             @original.to_vector.frequencies
           else
             counts = Hash.new(0)
             @original.each { |element| counts[element] += 1 }
             counts
           end
    # When every element is unique each factor is 1!, so the same formula
    # covers both the distinct and the repeated-elements case.
    denominator = freq.inject(1) { |acc, pair| acc * factorial(pair[1]) }
    @permutation_number = factorial(@original.size) / denominator
  end

  # Factorial of n; returns 1 for n < 1.
  def factorial(n)
    (1..n).inject(1) { |acc, k| acc * k }
  end

  # Rewinds the generator to the original arrangement.
  def reset
    @iterations = 0
    @data = @original.dup
  end

  # Yields every distinct permutation once, starting from the original
  # arrangement. Returns an enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?
    reset
    @permutation_number.times do
      yield next_value
    end
  end

  # Array with every distinct permutation, in generation order.
  def permutations
    collected = []
    each { |perm| collected.push(perm) }
    collected
  end

  # Returns the current arrangement and advances the internal state to
  # its lexicographic successor. After the last (non-increasing)
  # arrangement the state wraps around to the first (non-decreasing)
  # one, so a full cycle visits every arrangement whatever the starting
  # order was.
  def next_value
    current = @data.dup
    # Rightmost position whose element is smaller than its right neighbour.
    pivot = @n - 2
    pivot -= 1 while pivot >= 0 && @data[pivot] >= @data[pivot + 1]
    if pivot < 0
      # No pivot: last arrangement, or fewer than two distinct elements.
      @data.reverse!
    else
      # Rightmost element strictly greater than the pivot element.
      successor = @n - 1
      successor -= 1 while @data[successor] <= @data[pivot]
      swap(pivot, successor)
      # The tail is non-increasing; reversing it gives the smallest suffix.
      @data[(pivot + 1)..-1] = @data[(pivot + 1)..-1].reverse
    end
    current
  end

  # Exchanges the elements at positions i and j of the working array.
  def swap(i,j)
    tmp = @data[i]
    @data[i] = @data[j]
    @data[j] = tmp
  end
end
|
96
|
+
end
|
@@ -1,91 +1,91 @@
|
|
1
|
-
|
2
1
|
module Statsample
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
end #
|
2
|
+
module Regression
|
3
|
+
module Binomial
|
4
|
+
# Create a Logit model object.
|
5
|
+
# ds:: Dataset
|
6
|
+
# y_var:: Name of dependent vector
|
7
|
+
# Use
|
8
|
+
# dataset=Statsample::CSV.read("data.csv")
|
9
|
+
# y="y"
|
10
|
+
# lr=Statsample::Regression::Binomial.logit(dataset,y)
|
11
|
+
#
|
12
|
+
def self.logit(ds,y_var)
|
13
|
+
Logit.new(ds,y_var)
|
14
|
+
end
|
15
|
+
# Create a Probit model object.
|
16
|
+
# ds:: Dataset
|
17
|
+
# y_var:: Name of dependent vector
|
18
|
+
# Use
|
19
|
+
# dataset=Statsample::CSV.read("data.csv")
|
20
|
+
# y="y"
|
21
|
+
# lr=Statsample::Regression::Binomial.probit(dataset,y)
|
22
|
+
#
|
23
|
+
|
24
|
+
def self.probit(ds,y_var)
|
25
|
+
Probit.new(ds,y_var)
|
26
|
+
end
|
27
|
+
# Base Engine for binomial regression analysis.
|
28
|
+
# See Statsample::Regression::Binomial.logit() and
|
29
|
+
# Statsample::Regression::Binomial.probit for fast
|
30
|
+
# access methods.
|
31
|
+
#
|
32
|
+
# Use:
|
33
|
+
# dataset=Statsample::CSV.read("data.csv")
|
34
|
+
# y="y"
|
35
|
+
# model=Statsample::MLE::Logit.new
|
36
|
+
# lr=Statsample::Regression::Binomial::BaseEngine.new(dataset, y, model)
|
37
|
+
class BaseEngine
  # log_likehood:: value returned by model.log_likehood for the fitted
  #                coefficients (the spelling follows the model's method).
  # iterations::   value reported by model.iterations after fitting.
  attr_reader :log_likehood, :iterations

  # ds::    dataset holding the dependent and independent vectors
  # y_var:: name of the dependent vector inside ds
  # model:: object responding to newton_raphson, iterations,
  #         var_cov_matrix and log_likehood
  #
  # A vector of ones named "_constant" is appended to the independent
  # dataset before it is converted to a matrix and handed to the model.
  def initialize(ds,y_var,model)
    @ds=ds
    @y_var=y_var
    @dy=@ds[@y_var]
    @ds_indep=ds.dup(ds.fields-[y_var])
    constant=([1.0]*ds.cases).to_vector(:scale)
    @ds_indep.add_vector("_constant",constant)
    mat_x=@ds_indep.to_matrix
    mat_y=@dy.to_matrix(:vertical)
    @fields=@ds_indep.fields
    @model=model
    coeffs=model.newton_raphson(mat_x, mat_y)
    @coeffs=assign_names(coeffs.column(0).to_a)
    @iterations=model.iterations
    @var_cov_matrix=model.var_cov_matrix
    @log_likehood=model.log_likehood(mat_x, mat_y, coeffs)
  end # init

  # Coefficients standard error: hash of field name => square root of
  # the matching diagonal entry of the variance-covariance matrix.
  # The "_constant" entry is left out.
  def coeffs_se
    out={}
    @fields.each_with_index do |field, i|
      out[field]=Math::sqrt(@var_cov_matrix[i,i])
    end
    out.delete("_constant")
    out
  end

  # Constant value
  def constant
    @coeffs['_constant']
  end

  # Regression coefficients, without the "_constant" entry.
  def coeffs
    c=@coeffs.dup
    c.delete("_constant")
    c
  end

  # Constant standard error.
  # The constant is stored under the String "_constant" (see initialize),
  # so it has to be looked up by String; a Symbol never matches.
  def constant_se
    i=@fields.index("_constant")
    Math::sqrt(@var_cov_matrix[i,i])
  end

  # Builds a hash of field name => value, pairing @fields with the
  # positional values in c.
  def assign_names(c)
    a={}
    @fields.each_index do |i|
      a[@fields[i]]=c[i]
    end
    a
  end
end # Base Engine
|
89
|
+
end # Binomial
|
90
|
+
end # Regression
|
91
91
|
end # Statsample
|
@@ -1,13 +1,13 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
end
|
2
|
+
module Regression
|
3
|
+
module Binomial
|
4
|
+
# Logistic Regression
|
5
|
+
class Logit < BaseEngine
|
6
|
+
def initialize(ds,y_var)
|
7
|
+
model=Statsample::MLE::Logit.new
|
8
|
+
super(ds,y_var,model)
|
11
9
|
end
|
10
|
+
end
|
12
11
|
end
|
12
|
+
end
|
13
13
|
end
|
@@ -1,13 +1,13 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
end
|
2
|
+
module Regression
|
3
|
+
module Binomial
|
4
|
+
# Probit Regression
|
5
|
+
class Probit < BaseEngine
|
6
|
+
def initialize(ds,y_var)
|
7
|
+
model=Statsample::MLE::Probit.new
|
8
|
+
super(ds,y_var,model)
|
11
9
|
end
|
10
|
+
end
|
12
11
|
end
|
12
|
+
end
|
13
13
|
end
|
@@ -1,8 +1,10 @@
|
|
1
1
|
require 'statsample/regression/multiple/baseengine'
|
2
2
|
module Statsample
|
3
3
|
module Regression
|
4
|
-
# Module for Linear Multiple Regression Analysis
|
5
|
-
#
|
4
|
+
# Module for Linear Multiple Regression Analysis.
|
5
|
+
#
|
6
|
+
# You can call Regression::Multiple.listwise or Regression::Multiple.pairwise, or instantiate the engines directly.
|
7
|
+
#
|
6
8
|
# Example.
|
7
9
|
#
|
8
10
|
# require 'statsample'
|
@@ -37,18 +39,10 @@ module Statsample
|
|
37
39
|
def self.listwise_by_exp(ds,exp)
|
38
40
|
raise "Not implemented yet"
|
39
41
|
end
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
def self.ds_by_exp(ds,exp)
|
45
|
-
raise "Not implemented"
|
46
|
-
parts=exp.split(/[\+=]/)
|
47
|
-
dependent=parts.pop
|
48
|
-
ds_out=[]
|
49
|
-
parts.each{|p|
|
50
|
-
|
51
|
-
}
|
42
|
+
# Obtain r2 for regressors
|
43
|
+
def self.r2_from_matrices(rxx,rxy)
|
44
|
+
matrix=(rxy.transpose*rxx.inverse*rxy)
|
45
|
+
matrix[0,0]
|
52
46
|
end
|
53
47
|
|
54
48
|
end
|
@@ -16,53 +16,53 @@ module Multiple
|
|
16
16
|
# lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
|
17
17
|
|
18
18
|
class RubyEngine < BaseEngine
|
19
|
-
|
19
|
+
def initialize(ds,y_var)
|
20
20
|
super
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
end
|
39
|
-
end
|
40
|
-
@min_n_valid=min
|
21
|
+
@dy=ds[@y_var]
|
22
|
+
@ds_valid=ds.dup_only_valid
|
23
|
+
@ds_indep=ds.dup(ds.fields-[y_var])
|
24
|
+
@fields=@ds_indep.fields
|
25
|
+
set_dep_columns
|
26
|
+
obtain_y_vector
|
27
|
+
@matrix_x = Bivariate.correlation_matrix(@ds_indep)
|
28
|
+
@coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
|
29
|
+
@min_n_valid=nil
|
30
|
+
end
|
31
|
+
def min_n_valid
|
32
|
+
if @min_n_valid.nil?
|
33
|
+
min=@ds.cases
|
34
|
+
m=Bivariate::n_valid_matrix(@ds)
|
35
|
+
for x in 0...m.row_size
|
36
|
+
for y in 0...m.column_size
|
37
|
+
min=m[x,y] if m[x,y] < min
|
41
38
|
end
|
42
|
-
|
43
|
-
|
44
|
-
def set_dep_columns
|
45
|
-
@dep_columns=[]
|
46
|
-
@ds_indep.each_vector{|k,v|
|
47
|
-
@dep_columns.push(v.data_with_nils)
|
48
|
-
}
|
39
|
+
end
|
40
|
+
@min_n_valid=min
|
49
41
|
end
|
42
|
+
@min_n_valid
|
43
|
+
end
|
44
|
+
def set_dep_columns
|
45
|
+
@dep_columns=[]
|
46
|
+
@ds_indep.each_vector{|k,v|
|
47
|
+
@dep_columns.push(v.data_with_nils)
|
48
|
+
}
|
49
|
+
end
|
50
50
|
# Sum of square total
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
64
|
-
@r2
|
51
|
+
def sst
|
52
|
+
#if @sst.nil?
|
53
|
+
@sst=@dy.variance*(min_n_valid-1.0)
|
54
|
+
#end
|
55
|
+
@sst
|
56
|
+
end
|
57
|
+
def r2
|
58
|
+
if @r2.nil?
|
59
|
+
c=@matrix_y
|
60
|
+
rxx=obtain_predictor_matrix
|
61
|
+
matrix=(c.t*rxx.inverse*c)
|
62
|
+
@r2=matrix[0,0]
|
65
63
|
end
|
64
|
+
@r2
|
65
|
+
end
|
66
66
|
def r
|
67
67
|
Math::sqrt(r2)
|
68
68
|
end
|
@@ -71,19 +71,19 @@ class RubyEngine < BaseEngine
|
|
71
71
|
min_n_valid-@dep_columns.size-1
|
72
72
|
end
|
73
73
|
def fix_with_mean
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
74
|
+
i=0
|
75
|
+
@ds_indep.each do |row|
|
76
|
+
empty=[]
|
77
|
+
row.each do |k,v|
|
78
|
+
empty.push(k) if v.nil?
|
79
|
+
end
|
80
|
+
if empty.size==1
|
81
|
+
@ds_indep[empty[0]][i]=@ds[empty[0]].mean
|
82
|
+
end
|
83
|
+
i+=1
|
84
|
+
end
|
85
|
+
@ds_indep.update_valid_data
|
86
|
+
set_dep_columns
|
87
87
|
end
|
88
88
|
def fix_with_regression
|
89
89
|
i=0
|