statsample 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +12 -0
- data/Manifest.txt +13 -0
- data/README.txt +2 -1
- data/demo/pca.rb +29 -0
- data/demo/umann.rb +8 -0
- data/lib/distribution.rb +0 -1
- data/lib/matrix_extension.rb +35 -21
- data/lib/statsample.rb +31 -28
- data/lib/statsample/anova.rb +7 -2
- data/lib/statsample/bivariate.rb +17 -11
- data/lib/statsample/codification.rb +136 -87
- data/lib/statsample/combination.rb +0 -2
- data/lib/statsample/converter/csv18.rb +1 -1
- data/lib/statsample/converter/csv19.rb +1 -1
- data/lib/statsample/converters.rb +176 -171
- data/lib/statsample/crosstab.rb +227 -154
- data/lib/statsample/dataset.rb +94 -12
- data/lib/statsample/dominanceanalysis.rb +69 -62
- data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
- data/lib/statsample/factor.rb +18 -0
- data/lib/statsample/factor/pca.rb +128 -0
- data/lib/statsample/factor/principalaxis.rb +133 -0
- data/lib/statsample/factor/rotation.rb +125 -0
- data/lib/statsample/histogram.rb +99 -0
- data/lib/statsample/mle.rb +125 -126
- data/lib/statsample/mle/logit.rb +91 -91
- data/lib/statsample/mle/probit.rb +84 -85
- data/lib/statsample/multiset.rb +1 -1
- data/lib/statsample/permutation.rb +96 -0
- data/lib/statsample/regression.rb +1 -1
- data/lib/statsample/regression/binomial.rb +89 -89
- data/lib/statsample/regression/binomial/logit.rb +9 -9
- data/lib/statsample/regression/binomial/probit.rb +9 -9
- data/lib/statsample/regression/multiple.rb +8 -14
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
- data/lib/statsample/resample.rb +12 -17
- data/lib/statsample/srs.rb +4 -1
- data/lib/statsample/test.rb +23 -22
- data/lib/statsample/test/umannwhitney.rb +182 -0
- data/lib/statsample/vector.rb +854 -815
- data/test/test_bivariate.rb +132 -132
- data/test/test_codification.rb +71 -50
- data/test/test_dataset.rb +19 -1
- data/test/test_factor.rb +44 -0
- data/test/test_histogram.rb +26 -0
- data/test/test_permutation.rb +37 -0
- data/test/test_statistics.rb +74 -63
- data/test/test_umannwhitney.rb +17 -0
- data/test/test_vector.rb +46 -30
- metadata +31 -4
data/lib/statsample/multiset.rb
CHANGED
@@ -263,7 +263,7 @@ module Statsample
|
|
263
263
|
s_size=@strata_sizes[s_name]
|
264
264
|
(s_size**2 * (1-(nh/s_size)) * prop * (1-prop) / (nh -1 ))
|
265
265
|
}
|
266
|
-
(1
|
266
|
+
(1.quo(@population_size)) * Math::sqrt(sum)
|
267
267
|
end
|
268
268
|
# Cochran(1971), p. 150
|
269
269
|
def variance_pst(field,v=1)
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Permutation class systematically generates all permutations
|
3
|
+
# of the elements of an array, using Dijkstra's algorithm (1997).
|
4
|
+
#
|
5
|
+
# As argument, you can use either:
|
6
|
+
# * Number of elements: an array with numbers from 0 to n-1 will be used
|
7
|
+
# * Array: if ordered, you obtain the permutations in lexicographic order;
|
8
|
+
# you can repeat elements, if you will.
|
9
|
+
#
|
10
|
+
# Use:
|
11
|
+
# perm=Statsample::Permutation.new(3)
|
12
|
+
# perm.permutations
|
13
|
+
# => [[0,1,2],[0,2,1],[1,0,2],[1,2,0],[2,0,1],[2,1,0]]
|
14
|
+
# perm=Statsample::Permutation.new([0,0,1,1])
|
15
|
+
# => [[0,0,1,1],[0,1,0,1],[0,1,1,0],[1,0,0,1],[1,0,1,0],[1,1,0,0]]
|
16
|
+
#
|
17
|
+
# Reference: http://www.cut-the-knot.org/do_you_know/AllPerm.shtml
|
18
|
+
class Permutation
|
19
|
+
attr_reader :permutation_number
|
20
|
+
def initialize(v)
|
21
|
+
if v.is_a? Numeric
|
22
|
+
@original=(0...v.to_i).to_a
|
23
|
+
@permutation_number=factorial(v)
|
24
|
+
else
|
25
|
+
@original=v
|
26
|
+
calculate_max_iterations_from_array
|
27
|
+
end
|
28
|
+
@n=@original.size
|
29
|
+
reset
|
30
|
+
end
|
31
|
+
def calculate_max_iterations_from_array
|
32
|
+
if @original.respond_to? :frequencies
|
33
|
+
freq=@original.frequencies
|
34
|
+
else
|
35
|
+
freq=@original.to_vector.frequencies
|
36
|
+
end
|
37
|
+
if freq.length==@original.size
|
38
|
+
@permutation_number=factorial(@original.size)
|
39
|
+
else
|
40
|
+
numerator=factorial(@original.size)
|
41
|
+
denominator=freq.inject(1) {|a,v|
|
42
|
+
a*factorial(v[1])
|
43
|
+
}
|
44
|
+
@permutation_number=numerator/denominator
|
45
|
+
end
|
46
|
+
end
|
47
|
+
def factorial (n)
|
48
|
+
(1..n).inject(1){|a,v| a*v}
|
49
|
+
end
|
50
|
+
def reset
|
51
|
+
@iterations=0
|
52
|
+
@data=@original.dup
|
53
|
+
end
|
54
|
+
def each
|
55
|
+
reset
|
56
|
+
@permutation_number.times do
|
57
|
+
yield next_value
|
58
|
+
end
|
59
|
+
end
|
60
|
+
def permutations
|
61
|
+
a=Array.new
|
62
|
+
each {|c| a.push(c)}
|
63
|
+
a
|
64
|
+
end
|
65
|
+
def next_value
|
66
|
+
prev=@data.dup
|
67
|
+
i = @n-1
|
68
|
+
while @data[i-1] >= @data[i]
|
69
|
+
#return false if i<0
|
70
|
+
i=i-1
|
71
|
+
end
|
72
|
+
j=@n
|
73
|
+
while @data[j-1] <= @data[i-1]
|
74
|
+
j=j-1
|
75
|
+
end
|
76
|
+
# swap values at positions (i-1) and (j-1)
|
77
|
+
swap(i-1, j-1);
|
78
|
+
|
79
|
+
i+=1
|
80
|
+
j = @n
|
81
|
+
|
82
|
+
while (i < j)
|
83
|
+
swap(i-1, j-1);
|
84
|
+
i+=1;
|
85
|
+
j-=1;
|
86
|
+
sprintf("%d %d",i,j)
|
87
|
+
end
|
88
|
+
prev
|
89
|
+
end
|
90
|
+
def swap(i,j)
|
91
|
+
tmp=@data[i]
|
92
|
+
@data[i]=@data[j]
|
93
|
+
@data[j]=tmp
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
@@ -1,91 +1,91 @@
|
|
1
|
-
|
2
1
|
module Statsample
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
end #
|
2
|
+
module Regression
|
3
|
+
module Binomial
|
4
|
+
# Create a Logit model object.
|
5
|
+
# ds:: Dataset
|
6
|
+
# y:: Name of dependent vector
|
7
|
+
# Use
|
8
|
+
# dataset=Statsample::CSV.read("data.csv")
|
9
|
+
# y="y"
|
10
|
+
# lr=Statsample::Regression::Binomial.logit(dataset,y)
|
11
|
+
#
|
12
|
+
def self.logit(ds,y_var)
|
13
|
+
Logit.new(ds,y_var)
|
14
|
+
end
|
15
|
+
# Create a Probit model object.
|
16
|
+
# ds:: Dataset
|
17
|
+
# y:: Name of dependent vector
|
18
|
+
# Use
|
19
|
+
# dataset=Statsample::CSV.read("data.csv")
|
20
|
+
# y="y"
|
21
|
+
# lr=Statsample::Regression::Binomial.probit(dataset,y)
|
22
|
+
#
|
23
|
+
|
24
|
+
def self.probit(ds,y_var)
|
25
|
+
Probit.new(ds,y_var)
|
26
|
+
end
|
27
|
+
# Base Engine for binomial regression analysis.
|
28
|
+
# See Statsample::Regression::Binomial.logit() and
|
29
|
+
# Statsample::Regression::Binomial.probit for fast
|
30
|
+
# access methods.
|
31
|
+
#
|
32
|
+
# Use:
|
33
|
+
# dataset=Statsample::CSV.read("data.csv")
|
34
|
+
# y="y"
|
35
|
+
# model=Statsample::MLE::Logit.new
|
36
|
+
# lr=Statsample::Regression::Binomial::BaseEngine.new(dataset, y, model)
|
37
|
+
class BaseEngine
|
38
|
+
attr_reader :log_likehood, :iterations
|
39
|
+
def initialize(ds,y_var,model)
|
40
|
+
@ds=ds
|
41
|
+
@y_var=y_var
|
42
|
+
@dy=@ds[@y_var]
|
43
|
+
@ds_indep=ds.dup(ds.fields-[y_var])
|
44
|
+
constant=([1.0]*ds.cases).to_vector(:scale)
|
45
|
+
@ds_indep.add_vector("_constant",constant)
|
46
|
+
mat_x=@ds_indep.to_matrix
|
47
|
+
mat_y=@dy.to_matrix(:vertical)
|
48
|
+
@fields=@ds_indep.fields
|
49
|
+
@model=model
|
50
|
+
coeffs=model.newton_raphson(mat_x, mat_y)
|
51
|
+
@coeffs=assign_names(coeffs.column(0).to_a)
|
52
|
+
@iterations=model.iterations
|
53
|
+
@var_cov_matrix=model.var_cov_matrix
|
54
|
+
@log_likehood=model.log_likehood(mat_x, mat_y, coeffs)
|
55
|
+
end # init
|
56
|
+
# Coefficients standard error
|
57
|
+
def coeffs_se
|
58
|
+
out={}
|
59
|
+
@fields.each_index{|i|
|
60
|
+
f=@fields[i]
|
61
|
+
out[f]=Math::sqrt(@var_cov_matrix[i,i])
|
62
|
+
}
|
63
|
+
out.delete("_constant")
|
64
|
+
out
|
65
|
+
end
|
66
|
+
# Constant value
|
67
|
+
def constant
|
68
|
+
@coeffs['_constant']
|
69
|
+
end
|
70
|
+
# Regression coefficients
|
71
|
+
def coeffs
|
72
|
+
c=@coeffs.dup
|
73
|
+
c.delete("_constant")
|
74
|
+
c
|
75
|
+
end
|
76
|
+
# Constant standard error
|
77
|
+
def constant_se
|
78
|
+
i=@fields.index :_constant
|
79
|
+
Math::sqrt(@var_cov_matrix[i,i])
|
80
|
+
end
|
81
|
+
def assign_names(c)
|
82
|
+
a={}
|
83
|
+
@fields.each_index do |i|
|
84
|
+
a[@fields[i]]=c[i]
|
85
|
+
end
|
86
|
+
a
|
87
|
+
end
|
88
|
+
end # Base Engine
|
89
|
+
end # Binomial
|
90
|
+
end # Regression
|
91
91
|
end # Statsample
|
@@ -1,13 +1,13 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
end
|
2
|
+
module Regression
|
3
|
+
module Binomial
|
4
|
+
# Logistic Regression
|
5
|
+
class Logit < BaseEngine
|
6
|
+
def initialize(ds,y_var)
|
7
|
+
model=Statsample::MLE::Logit.new
|
8
|
+
super(ds,y_var,model)
|
11
9
|
end
|
10
|
+
end
|
12
11
|
end
|
12
|
+
end
|
13
13
|
end
|
@@ -1,13 +1,13 @@
|
|
1
1
|
module Statsample
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
end
|
2
|
+
module Regression
|
3
|
+
module Binomial
|
4
|
+
# Probit Regression
|
5
|
+
class Probit < BaseEngine
|
6
|
+
def initialize(ds,y_var)
|
7
|
+
model=Statsample::MLE::Probit.new
|
8
|
+
super(ds,y_var,model)
|
11
9
|
end
|
10
|
+
end
|
12
11
|
end
|
12
|
+
end
|
13
13
|
end
|
@@ -1,8 +1,10 @@
|
|
1
1
|
require 'statsample/regression/multiple/baseengine'
|
2
2
|
module Statsample
|
3
3
|
module Regression
|
4
|
-
# Module for Linear Multiple Regression Analysis
|
5
|
-
#
|
4
|
+
# Module for Linear Multiple Regression Analysis.
|
5
|
+
#
|
6
|
+
# You can call Regression::Multiple.listwise or Regression::Multiple.pairwise, or instantiate the engines directly.
|
7
|
+
#
|
6
8
|
# Example.
|
7
9
|
#
|
8
10
|
# require 'statsample'
|
@@ -37,18 +39,10 @@ module Statsample
|
|
37
39
|
def self.listwise_by_exp(ds,exp)
|
38
40
|
raise "Not implemented yet"
|
39
41
|
end
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
def self.ds_by_exp(ds,exp)
|
45
|
-
raise "Not implemented"
|
46
|
-
parts=exp.split(/[\+=]/)
|
47
|
-
dependent=parts.pop
|
48
|
-
ds_out=[]
|
49
|
-
parts.each{|p|
|
50
|
-
|
51
|
-
}
|
42
|
+
# Obtain r2 for regressors
|
43
|
+
def self.r2_from_matrices(rxx,rxy)
|
44
|
+
matrix=(rxy.transpose*rxx.inverse*rxy)
|
45
|
+
matrix[0,0]
|
52
46
|
end
|
53
47
|
|
54
48
|
end
|
@@ -16,53 +16,53 @@ module Multiple
|
|
16
16
|
# lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
|
17
17
|
|
18
18
|
class RubyEngine < BaseEngine
|
19
|
-
|
19
|
+
def initialize(ds,y_var)
|
20
20
|
super
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
end
|
39
|
-
end
|
40
|
-
@min_n_valid=min
|
21
|
+
@dy=ds[@y_var]
|
22
|
+
@ds_valid=ds.dup_only_valid
|
23
|
+
@ds_indep=ds.dup(ds.fields-[y_var])
|
24
|
+
@fields=@ds_indep.fields
|
25
|
+
set_dep_columns
|
26
|
+
obtain_y_vector
|
27
|
+
@matrix_x = Bivariate.correlation_matrix(@ds_indep)
|
28
|
+
@coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
|
29
|
+
@min_n_valid=nil
|
30
|
+
end
|
31
|
+
def min_n_valid
|
32
|
+
if @min_n_valid.nil?
|
33
|
+
min=@ds.cases
|
34
|
+
m=Bivariate::n_valid_matrix(@ds)
|
35
|
+
for x in 0...m.row_size
|
36
|
+
for y in 0...m.column_size
|
37
|
+
min=m[x,y] if m[x,y] < min
|
41
38
|
end
|
42
|
-
|
43
|
-
|
44
|
-
def set_dep_columns
|
45
|
-
@dep_columns=[]
|
46
|
-
@ds_indep.each_vector{|k,v|
|
47
|
-
@dep_columns.push(v.data_with_nils)
|
48
|
-
}
|
39
|
+
end
|
40
|
+
@min_n_valid=min
|
49
41
|
end
|
42
|
+
@min_n_valid
|
43
|
+
end
|
44
|
+
def set_dep_columns
|
45
|
+
@dep_columns=[]
|
46
|
+
@ds_indep.each_vector{|k,v|
|
47
|
+
@dep_columns.push(v.data_with_nils)
|
48
|
+
}
|
49
|
+
end
|
50
50
|
# Sum of square total
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
64
|
-
@r2
|
51
|
+
def sst
|
52
|
+
#if @sst.nil?
|
53
|
+
@sst=@dy.variance*(min_n_valid-1.0)
|
54
|
+
#end
|
55
|
+
@sst
|
56
|
+
end
|
57
|
+
def r2
|
58
|
+
if @r2.nil?
|
59
|
+
c=@matrix_y
|
60
|
+
rxx=obtain_predictor_matrix
|
61
|
+
matrix=(c.t*rxx.inverse*c)
|
62
|
+
@r2=matrix[0,0]
|
65
63
|
end
|
64
|
+
@r2
|
65
|
+
end
|
66
66
|
def r
|
67
67
|
Math::sqrt(r2)
|
68
68
|
end
|
@@ -71,19 +71,19 @@ class RubyEngine < BaseEngine
|
|
71
71
|
min_n_valid-@dep_columns.size-1
|
72
72
|
end
|
73
73
|
def fix_with_mean
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
74
|
+
i=0
|
75
|
+
@ds_indep.each do |row|
|
76
|
+
empty=[]
|
77
|
+
row.each do |k,v|
|
78
|
+
empty.push(k) if v.nil?
|
79
|
+
end
|
80
|
+
if empty.size==1
|
81
|
+
@ds_indep[empty[0]][i]=@ds[empty[0]].mean
|
82
|
+
end
|
83
|
+
i+=1
|
84
|
+
end
|
85
|
+
@ds_indep.update_valid_data
|
86
|
+
set_dep_columns
|
87
87
|
end
|
88
88
|
def fix_with_regression
|
89
89
|
i=0
|