statsample 0.18.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +23 -0
- data/Manifest.txt +28 -17
- data/Rakefile +3 -2
- data/benchmarks/correlation_matrix_15_variables.rb +31 -0
- data/benchmarks/correlation_matrix_5_variables.rb +32 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +75 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/examples/boxplot.rb +13 -14
- data/examples/correlation_matrix.rb +16 -8
- data/examples/dataset.rb +13 -4
- data/examples/dominance_analysis.rb +23 -17
- data/examples/dominance_analysis_bootstrap.rb +28 -22
- data/examples/histogram.rb +8 -9
- data/examples/icc.rb +20 -21
- data/examples/levene.rb +10 -4
- data/examples/multiple_regression.rb +9 -28
- data/examples/multivariate_correlation.rb +9 -3
- data/examples/parallel_analysis.rb +20 -16
- data/examples/polychoric.rb +15 -9
- data/examples/principal_axis.rb +18 -6
- data/examples/reliability.rb +26 -13
- data/examples/scatterplot.rb +10 -6
- data/examples/t_test.rb +15 -6
- data/examples/tetrachoric.rb +9 -2
- data/examples/u_test.rb +12 -4
- data/examples/vector.rb +13 -2
- data/examples/velicer_map_test.rb +33 -26
- data/lib/statsample.rb +32 -12
- data/lib/statsample/analysis.rb +79 -0
- data/lib/statsample/analysis/suite.rb +72 -0
- data/lib/statsample/analysis/suitereportbuilder.rb +38 -0
- data/lib/statsample/bivariate.rb +70 -16
- data/lib/statsample/dataset.rb +25 -19
- data/lib/statsample/dominanceanalysis.rb +2 -2
- data/lib/statsample/factor.rb +2 -0
- data/lib/statsample/factor/map.rb +16 -10
- data/lib/statsample/factor/parallelanalysis.rb +9 -3
- data/lib/statsample/factor/pca.rb +28 -32
- data/lib/statsample/factor/rotation.rb +15 -8
- data/lib/statsample/graph/boxplot.rb +3 -4
- data/lib/statsample/graph/histogram.rb +2 -1
- data/lib/statsample/graph/scatterplot.rb +1 -0
- data/lib/statsample/matrix.rb +106 -16
- data/lib/statsample/regression.rb +4 -1
- data/lib/statsample/regression/binomial.rb +1 -1
- data/lib/statsample/regression/multiple/baseengine.rb +19 -9
- data/lib/statsample/regression/multiple/gslengine.rb +127 -126
- data/lib/statsample/regression/multiple/matrixengine.rb +8 -5
- data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
- data/lib/statsample/regression/simple.rb +31 -6
- data/lib/statsample/reliability.rb +11 -3
- data/lib/statsample/reliability/scaleanalysis.rb +4 -4
- data/lib/statsample/shorthand.rb +81 -0
- data/lib/statsample/test/chisquare.rb +1 -1
- data/lib/statsample/vector.rb +163 -163
- data/lib/statsample/vector/gsl.rb +106 -0
- data/references.txt +2 -2
- data/{data → test/fixtures}/crime.txt +0 -0
- data/{data → test/fixtures}/hartman_23.matrix +0 -0
- data/{data → test/fixtures}/repeated_fields.csv +0 -0
- data/{data → test/fixtures}/test_binomial.csv +0 -0
- data/test/{test_csv.csv → fixtures/test_csv.csv} +0 -0
- data/test/{test_xls.xls → fixtures/test_xls.xls} +0 -0
- data/{data → test/fixtures}/tetmat_matrix.txt +0 -0
- data/{data → test/fixtures}/tetmat_test.txt +0 -0
- data/test/helpers_tests.rb +18 -2
- data/test/test_analysis.rb +118 -0
- data/test/test_anovatwoway.rb +1 -1
- data/test/test_anovatwowaywithdataset.rb +1 -1
- data/test/test_anovawithvectors.rb +1 -2
- data/test/test_bartlettsphericity.rb +1 -2
- data/test/test_bivariate.rb +64 -22
- data/test/test_codification.rb +1 -2
- data/test/test_crosstab.rb +1 -2
- data/test/test_csv.rb +3 -4
- data/test/test_dataset.rb +24 -3
- data/test/test_dominance_analysis.rb +1 -2
- data/test/test_factor.rb +8 -69
- data/test/test_factor_map.rb +43 -0
- data/test/test_factor_pa.rb +54 -0
- data/test/test_ggobi.rb +1 -1
- data/test/test_gsl.rb +12 -18
- data/test/test_histogram.rb +1 -2
- data/test/test_logit.rb +62 -18
- data/test/test_matrix.rb +4 -5
- data/test/test_mle.rb +3 -4
- data/test/test_regression.rb +21 -2
- data/test/test_reliability.rb +3 -3
- data/test/test_reliability_icc.rb +1 -1
- data/test/test_reliability_skillscale.rb +20 -4
- data/test/test_resample.rb +1 -2
- data/test/test_rserve_extension.rb +1 -2
- data/test/test_srs.rb +1 -2
- data/test/test_statistics.rb +1 -2
- data/test/test_stest.rb +1 -2
- data/test/test_stratified.rb +1 -2
- data/test/test_test_f.rb +1 -2
- data/test/test_test_t.rb +1 -2
- data/test/test_umannwhitney.rb +1 -2
- data/test/test_vector.rb +117 -18
- data/test/test_xls.rb +2 -3
- data/web/Rakefile +39 -0
- metadata +109 -29
- metadata.gz.sig +0 -0
- data/examples/parallel_analysis_tetrachoric.rb +0 -31
- data/lib/distribution.rb +0 -25
- data/lib/distribution/chisquare.rb +0 -23
- data/lib/distribution/f.rb +0 -35
- data/lib/distribution/normal.rb +0 -60
- data/lib/distribution/normalbivariate.rb +0 -284
- data/lib/distribution/normalmultivariate.rb +0 -73
- data/lib/distribution/t.rb +0 -55
- data/test/test_distribution.rb +0 -73
@@ -30,13 +30,13 @@ class MatrixEngine < BaseEngine
|
|
30
30
|
|
31
31
|
# Number of cases
|
32
32
|
attr_writer :cases
|
33
|
-
|
33
|
+
attr_writer :digits
|
34
34
|
# Create object
|
35
35
|
#
|
36
36
|
def initialize(matrix,y_var, opts=Hash.new)
|
37
37
|
matrix.extend Statsample::CovariateMatrix
|
38
38
|
raise "#{y_var} variable should be on data" unless matrix.fields.include? y_var
|
39
|
-
if matrix.
|
39
|
+
if matrix._type==:covariance
|
40
40
|
@matrix_cov=matrix
|
41
41
|
@matrix_cor=matrix.correlation
|
42
42
|
@no_covariance=false
|
@@ -53,6 +53,8 @@ class MatrixEngine < BaseEngine
|
|
53
53
|
@predictors_n=@n_predictors
|
54
54
|
@matrix_x= @matrix_cor.submatrix(@fields)
|
55
55
|
@matrix_x_cov= @matrix_cov.submatrix(@fields)
|
56
|
+
raise LinearDependency, "Regressors are linearly dependent" if @matrix_x.determinant<1e-15
|
57
|
+
|
56
58
|
|
57
59
|
@matrix_y = @matrix_cor.submatrix(@fields, [y_var])
|
58
60
|
@matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var])
|
@@ -75,13 +77,14 @@ class MatrixEngine < BaseEngine
|
|
75
77
|
@y_mean=0.0
|
76
78
|
@name=_("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var]
|
77
79
|
|
78
|
-
|
80
|
+
opts_default={:digits=>3}
|
81
|
+
opts=opts_default.merge opts
|
79
82
|
opts.each{|k,v|
|
80
83
|
self.send("#{k}=",v) if self.respond_to? k
|
81
84
|
}
|
82
85
|
result_matrix=@matrix_x_cov.inverse * @matrix_y_cov
|
83
86
|
|
84
|
-
if matrix.
|
87
|
+
if matrix._type==:covariance
|
85
88
|
@coeffs=result_matrix.column(0).to_a
|
86
89
|
@coeffs_stan=coeffs.collect {|k,v|
|
87
90
|
coeffs[k]*@x_sd[k].quo(@y_sd)
|
@@ -141,7 +144,7 @@ class MatrixEngine < BaseEngine
|
|
141
144
|
# Tolerance for a given variable
|
142
145
|
# defined as (1-R^2) of regression of other independent variables
|
143
146
|
# over the selected
|
144
|
-
# Reference:
|
147
|
+
# == Reference:
|
145
148
|
# * http://talkstats.com/showthread.php?t=5056
|
146
149
|
def tolerance(var)
|
147
150
|
return 1 if @matrix_x.column_size==1
|
@@ -8,8 +8,10 @@ module Statsample
|
|
8
8
|
# * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
|
9
9
|
#
|
10
10
|
class Simple
|
11
|
+
include Summarizable
|
11
12
|
attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
|
12
|
-
|
13
|
+
attr_accessor :name
|
14
|
+
attr_accessor :digits
|
13
15
|
def initialize(init_method, *argv)
|
14
16
|
self.send(init_method, *argv)
|
15
17
|
end
|
@@ -61,15 +63,15 @@ module Statsample
|
|
61
63
|
new(:init_gsl, *ar)
|
62
64
|
end
|
63
65
|
# Create a simple regression using two vectors
|
64
|
-
def new_from_vectors(vx,vy)
|
65
|
-
new(:init_vectors,vx,vy)
|
66
|
+
def new_from_vectors(vx,vy, opts=Hash.new)
|
67
|
+
new(:init_vectors,vx,vy, opts)
|
66
68
|
end
|
67
69
|
# Create a simple regression using a dataset and two vector names.
|
68
|
-
def new_from_dataset(ds,x,y)
|
69
|
-
new(:init_vectors,ds[x],ds[y])
|
70
|
+
def new_from_dataset(ds,x,y, opts=Hash.new)
|
71
|
+
new(:init_vectors,ds[x],ds[y], opts)
|
70
72
|
end
|
71
73
|
end
|
72
|
-
def init_vectors(vx,vy)
|
74
|
+
def init_vectors(vx,vy, opts=Hash.new)
|
73
75
|
@vx,@vy=Statsample.only_valid_clone(vx,vy)
|
74
76
|
x_m=@vx.mean
|
75
77
|
y_m=@vy.mean
|
@@ -80,6 +82,17 @@ module Statsample
|
|
80
82
|
}
|
81
83
|
@b=num.to_f/den
|
82
84
|
@a=y_m - @b*x_m
|
85
|
+
|
86
|
+
opts_default={
|
87
|
+
:digits=>3,
|
88
|
+
:name=>_("Regression of %s over %s") % [@vx.name, @vy.name]
|
89
|
+
}
|
90
|
+
@opts=opts_default.merge opts
|
91
|
+
|
92
|
+
@opts.each{|k,v|
|
93
|
+
self.send("#{k}=",v) if self.respond_to? k
|
94
|
+
}
|
95
|
+
|
83
96
|
end
|
84
97
|
def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
|
85
98
|
@a=a
|
@@ -90,6 +103,18 @@ module Statsample
|
|
90
103
|
@chisq=chisq
|
91
104
|
@status=status
|
92
105
|
end
|
106
|
+
def report_building(gen)
|
107
|
+
f="%0.#{digits}f"
|
108
|
+
gen.section(:name=>name) do |s|
|
109
|
+
s.table(:header=>[_("Variable"), _("Value")]) do |t|
|
110
|
+
t.row [_("r"), f % r]
|
111
|
+
t.row [_("r^2"), f % r2]
|
112
|
+
t.row [_("a"), f % a]
|
113
|
+
t.row [_("b"), f % a]
|
114
|
+
t.row [_("s.e"), f % standard_error]
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
93
118
|
private :init_vectors, :init_gsl
|
94
119
|
end
|
95
120
|
end
|
@@ -5,6 +5,7 @@ module Statsample
|
|
5
5
|
# only uses tuples without missing data
|
6
6
|
def cronbach_alpha(ods)
|
7
7
|
ds=ods.dup_only_valid
|
8
|
+
return nil if ds.vectors.any? {|k,v| v.variance==0}
|
8
9
|
n_items=ds.fields.size
|
9
10
|
return nil if n_items<=1
|
10
11
|
s2_items=ds.vectors.inject(0) {|ac,v|
|
@@ -16,11 +17,18 @@ module Statsample
|
|
16
17
|
# Calculate Chonbach's alpha for a given dataset
|
17
18
|
# using standarized values for every vector.
|
18
19
|
# Only uses tuples without missing data
|
19
|
-
|
20
|
+
# Return nil if one or more vectors has 0 variance
|
20
21
|
def cronbach_alpha_standarized(ods)
|
21
|
-
|
22
|
-
|
22
|
+
|
23
|
+
ds=ods.dup_only_valid
|
24
|
+
|
25
|
+
return nil if ds.vectors.any? {|k,v| v.variance==0}
|
26
|
+
|
27
|
+
ds=ds.fields.inject({}){|a,f|
|
28
|
+
a[f]=ods[f].standarized;
|
29
|
+
a
|
23
30
|
}.to_dataset
|
31
|
+
|
24
32
|
cronbach_alpha(ds)
|
25
33
|
end
|
26
34
|
# Predicted reliability of a test by replicating
|
@@ -53,12 +53,12 @@ module Statsample
|
|
53
53
|
@variances=@k.times.map {|i| @cov_m[i,i]}.to_scale
|
54
54
|
@variances_mean=@variances.mean
|
55
55
|
@covariances_mean=(@variance-@variances.sum).quo(@k**2-@k)
|
56
|
-
begin
|
56
|
+
#begin
|
57
57
|
@alpha = Statsample::Reliability.cronbach_alpha(@ds)
|
58
58
|
@alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(@ds)
|
59
|
-
rescue => e
|
60
|
-
|
61
|
-
end
|
59
|
+
#rescue => e
|
60
|
+
# raise DatasetException.new(@ds,e), "Error calculating alpha"
|
61
|
+
#end
|
62
62
|
end
|
63
63
|
# Returns a hash with structure
|
64
64
|
def item_characteristic_curve
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Module which provide shorthands for many methods.
|
3
|
+
module Shorthand
|
4
|
+
###
|
5
|
+
# :section: R like methods
|
6
|
+
###
|
7
|
+
|
8
|
+
# Retrieve names (fields) from dataset
|
9
|
+
def names(ds)
|
10
|
+
ds.fields
|
11
|
+
end
|
12
|
+
# Create a correlation matrix from a dataset
|
13
|
+
def cor(ds)
|
14
|
+
Statsample::Bivariate.correlation_matrix(ds)
|
15
|
+
end
|
16
|
+
# Create a variance/covariance matrix from a dataset
|
17
|
+
def cov(ds)
|
18
|
+
Statsample::Bivariate.covariate_matrix(ds)
|
19
|
+
end
|
20
|
+
# Create a Statsample::Vector
|
21
|
+
# Analog to R's c
|
22
|
+
def c(*args)
|
23
|
+
Statsample::Vector[*args]
|
24
|
+
end
|
25
|
+
# Random generation for the normal distribution
|
26
|
+
def rnorm(n,mean=0,sd=1)
|
27
|
+
rng=Distribution::Normal.rng(mean,sd)
|
28
|
+
Statsample::Vector.new_scale(n) { rng.call}
|
29
|
+
end
|
30
|
+
# Creates a new Statsample::Dataset
|
31
|
+
# Each key is transformed into string
|
32
|
+
def dataset(vectors=Hash.new)
|
33
|
+
vectors=vectors.inject({}) {|ac,v| ac[v[0].to_s]=v[1];ac}
|
34
|
+
Statsample::Dataset.new(vectors)
|
35
|
+
end
|
36
|
+
alias :data_frame :dataset
|
37
|
+
# Returns a Statsample::Graph::Boxplot
|
38
|
+
def boxplot(*args)
|
39
|
+
Statsample::Graph::Boxplot.new(*args)
|
40
|
+
end
|
41
|
+
# Returns a Statsample::Graph::Histogram
|
42
|
+
def histogram(*args)
|
43
|
+
Statsample::Graph::Histogram.new(*args)
|
44
|
+
end
|
45
|
+
|
46
|
+
# Returns a Statsample::Graph::Scatterplot
|
47
|
+
def scatterplot(*args)
|
48
|
+
Statsample::Graph::Scatterplot.new(*args)
|
49
|
+
end
|
50
|
+
# Returns a Statsample::Test::Levene
|
51
|
+
def levene(*args)
|
52
|
+
Statsample::Test::Levene.new(*args)
|
53
|
+
end
|
54
|
+
def principal_axis(*args)
|
55
|
+
Statsample::Factor::PrincipalAxis.new(*args)
|
56
|
+
|
57
|
+
end
|
58
|
+
def polychoric(*args)
|
59
|
+
Statsample::Bivariate::Polychoric.new(*args)
|
60
|
+
end
|
61
|
+
def tetrachoric(*args)
|
62
|
+
Statsample::Bivariate::Tetrachoric.new(*args)
|
63
|
+
end
|
64
|
+
|
65
|
+
###
|
66
|
+
# Other Shortcuts
|
67
|
+
###
|
68
|
+
def lr(*args)
|
69
|
+
Statsample::Regression.multiple(*args)
|
70
|
+
end
|
71
|
+
def pca(ds,opts=Hash.new)
|
72
|
+
Statsample::Factor::PCA.new(ds,opts)
|
73
|
+
end
|
74
|
+
def dominance_analysis(*args)
|
75
|
+
Statsample::DominanceAnalysis.new(*args)
|
76
|
+
end
|
77
|
+
def dominance_analysis_bootstrap(*args)
|
78
|
+
Statsample::DominanceAnalysis::Bootstrap.new(*args)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
data/lib/statsample/vector.rb
CHANGED
@@ -1,17 +1,31 @@
|
|
1
1
|
require 'date'
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
2
|
+
require 'statsample/vector/gsl'
|
3
|
+
|
4
|
+
module Statsample::VectorShorthands
|
5
|
+
# Creates a new Statsample::Vector object
|
6
|
+
# Argument should be equal to Vector.new
|
7
|
+
def to_vector(*args)
|
6
8
|
Statsample::Vector.new(self,*args)
|
7
9
|
end
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
# Creates a new Statsample::Vector object of type :scale
|
11
|
+
def to_scale(*args)
|
12
|
+
Statsample::Vector.new(self, :scale, *args)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
class Array
|
17
|
+
include Statsample::VectorShorthands
|
12
18
|
end
|
13
19
|
|
20
|
+
if Statsample.has_gsl?
|
21
|
+
module GSL
|
22
|
+
class Vector
|
23
|
+
include Statsample::VectorShorthands
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
14
27
|
module Statsample
|
28
|
+
|
15
29
|
|
16
30
|
# Collection of values on one dimension. Works as a column on a Spreadsheet.
|
17
31
|
#
|
@@ -41,26 +55,21 @@ module Statsample
|
|
41
55
|
attr_reader :data_with_nils
|
42
56
|
# Date date, with all missing values replaced by nils
|
43
57
|
attr_reader :date_data_with_nils
|
44
|
-
# GSL Object, only available with rbgsl extension and type==:scale
|
45
|
-
attr_reader :gsl
|
46
58
|
# Change label for specific values
|
47
59
|
attr_accessor :labels
|
48
60
|
# Name of vector. Should be used for output by many classes
|
49
61
|
attr_accessor :name
|
50
62
|
|
51
|
-
#
|
52
63
|
# Creates a new Vector object.
|
53
|
-
# * <tt>data</tt>
|
64
|
+
# * <tt>data</tt> Any data which can be converted on Array
|
54
65
|
# * <tt>type</tt> Level of meausurement. See Vector#type
|
55
66
|
# * <tt>opts</tt> Hash of options
|
56
67
|
# * <tt>:missing_values</tt> Array of missing values. See Vector#missing_values
|
57
68
|
# * <tt>:today_values</tt> Array of 'today' values. See Vector#today_values
|
58
69
|
# * <tt>:labels</tt> Labels for data values
|
59
70
|
# * <tt>:name</tt> Name of vector
|
60
|
-
#
|
61
71
|
def initialize(data=[], type=:nominal, opts=Hash.new)
|
62
|
-
|
63
|
-
@data=data
|
72
|
+
@data=data.is_a?(Array) ? data : data.to_a
|
64
73
|
@type=type
|
65
74
|
opts_default={
|
66
75
|
:missing_values=>[],
|
@@ -84,9 +93,46 @@ module Statsample
|
|
84
93
|
@missing_data=[]
|
85
94
|
@has_missing_data=nil
|
86
95
|
@scale_data=nil
|
87
|
-
|
96
|
+
set_valid_data
|
88
97
|
self.type=type
|
89
98
|
end
|
99
|
+
# Create a vector using (almost) any object
|
100
|
+
# * Array: flattened
|
101
|
+
# * Range: transformed using to_a
|
102
|
+
# * Statsample::Vector
|
103
|
+
# * Numeric and string values
|
104
|
+
def self.[](*args)
|
105
|
+
values=[]
|
106
|
+
args.each do |a|
|
107
|
+
case a
|
108
|
+
when Array
|
109
|
+
values.concat a.flatten
|
110
|
+
when Statsample::Vector
|
111
|
+
values.concat a.to_a
|
112
|
+
when Range
|
113
|
+
values.concat a.to_a
|
114
|
+
else
|
115
|
+
values << a
|
116
|
+
end
|
117
|
+
end
|
118
|
+
vector=new(values)
|
119
|
+
vector.type=:scale if vector.can_be_scale?
|
120
|
+
vector
|
121
|
+
end
|
122
|
+
# Create a new scale type vector
|
123
|
+
# Parameters
|
124
|
+
# [n] Size
|
125
|
+
# [val] Value of each value
|
126
|
+
# [&block] If block provided, is used to set the values of vector
|
127
|
+
def self.new_scale(n,val=nil, &block)
|
128
|
+
if block
|
129
|
+
vector=n.times.map {|i| block.call(i)}.to_scale
|
130
|
+
else
|
131
|
+
vector=n.times.map { val}.to_scale
|
132
|
+
end
|
133
|
+
vector.type=:scale
|
134
|
+
vector
|
135
|
+
end
|
90
136
|
# Creates a duplicate of the Vector.
|
91
137
|
# Note: data, missing_values and labels are duplicated, so
|
92
138
|
# changes on original vector doesn't propages to copies.
|
@@ -98,40 +144,48 @@ module Statsample
|
|
98
144
|
def dup_empty
|
99
145
|
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
|
100
146
|
end
|
101
|
-
|
102
|
-
|
103
|
-
|
147
|
+
|
148
|
+
if Statsample::STATSAMPLE__.respond_to?(:check_type)
|
149
|
+
# Raises an exception if type of vector is inferior to t type
|
150
|
+
def check_type(t)
|
151
|
+
Statsample::STATSAMPLE__.check_type(self,t)
|
152
|
+
end
|
153
|
+
else
|
154
|
+
def check_type(t) #:nodoc:
|
155
|
+
_check_type(t)
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
|
160
|
+
def _check_type(t) #:nodoc:
|
161
|
+
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date) or (:date==@type)
|
104
162
|
end
|
105
|
-
private :check_type
|
106
163
|
|
164
|
+
def vector_standarized_compute(m,sd) # :nodoc:
|
165
|
+
@data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:scale)
|
166
|
+
end
|
107
167
|
# Return a vector usign the standarized values for data
|
108
168
|
# with sd with denominator n-1. With variance=0 or mean nil,
|
109
169
|
# returns a vector of equal size full of nils
|
110
170
|
#
|
111
|
-
|
112
171
|
def vector_standarized(use_population=false)
|
113
172
|
check_type :scale
|
114
|
-
return ([nil]*size).to_scale if mean.nil?
|
115
173
|
m=mean
|
116
174
|
sd=use_population ? sdp : sds
|
117
|
-
return ([nil]*size).to_scale if sd==0.0
|
118
|
-
vector
|
119
|
-
if !x.nil?
|
120
|
-
(x.to_f - m).quo(sd)
|
121
|
-
else
|
122
|
-
nil
|
123
|
-
end
|
124
|
-
}.to_vector(:scale)
|
175
|
+
return ([nil]*size).to_scale if mean.nil? or sd==0.0
|
176
|
+
vector=vector_standarized_compute(m,sd)
|
125
177
|
vector.name=_("%s(standarized)") % @name
|
126
178
|
vector
|
127
179
|
end
|
180
|
+
def vector_centered_compute(m) #:nodoc:
|
181
|
+
@data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_scale
|
182
|
+
end
|
128
183
|
# Return a centered vector
|
129
184
|
def vector_centered
|
130
185
|
check_type :scale
|
131
186
|
m=mean
|
132
|
-
|
133
|
-
|
134
|
-
}.to_scale
|
187
|
+
return ([nil]*size).to_scale if mean.nil?
|
188
|
+
vector=vector_centered_compute(m)
|
135
189
|
vector.name=_("%s(centered)") % @name
|
136
190
|
vector
|
137
191
|
end
|
@@ -148,18 +202,18 @@ module Statsample
|
|
148
202
|
vector
|
149
203
|
end
|
150
204
|
def box_cox_transformation(lambda) # :nodoc:
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
205
|
+
raise "Should be a scale" unless @type==:scale
|
206
|
+
@data_with_nils.collect{|x|
|
207
|
+
if !x.nil?
|
208
|
+
if(lambda==0)
|
209
|
+
Math.log(x)
|
210
|
+
else
|
211
|
+
(x**lambda-1).quo(lambda)
|
212
|
+
end
|
156
213
|
else
|
157
|
-
|
214
|
+
nil
|
158
215
|
end
|
159
|
-
|
160
|
-
nil
|
161
|
-
end
|
162
|
-
}.to_vector(:scale)
|
216
|
+
}.to_vector(:scale)
|
163
217
|
end
|
164
218
|
|
165
219
|
# Vector equality.
|
@@ -193,6 +247,10 @@ module Statsample
|
|
193
247
|
}
|
194
248
|
set_valid_data
|
195
249
|
end
|
250
|
+
def push(v)
|
251
|
+
@data.push(v)
|
252
|
+
set_valid_data
|
253
|
+
end
|
196
254
|
# Dicotomize the vector with 0 and 1, based on lowest value
|
197
255
|
# If parameter if defined, this value and lower
|
198
256
|
# will be 0 and higher, 1
|
@@ -250,7 +308,6 @@ module Statsample
|
|
250
308
|
@missing_data.clear
|
251
309
|
@data_with_nils.clear
|
252
310
|
@date_data_with_nils.clear
|
253
|
-
@gsl=nil
|
254
311
|
set_valid_data_intern
|
255
312
|
set_scale_data if(@type==:scale)
|
256
313
|
set_date_data if(@type==:date)
|
@@ -281,11 +338,14 @@ module Statsample
|
|
281
338
|
def has_missing_data?
|
282
339
|
@has_missing_data
|
283
340
|
end
|
341
|
+
alias :flawed? :has_missing_data?
|
342
|
+
|
284
343
|
# Retrieves label for value x. Retrieves x if
|
285
344
|
# no label defined.
|
286
345
|
def labeling(x)
|
287
346
|
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
288
347
|
end
|
348
|
+
alias :label :labeling
|
289
349
|
# Returns a Vector with data with labels replaced by the label.
|
290
350
|
def vector_labeled
|
291
351
|
d=@data.collect{|x|
|
@@ -317,8 +377,7 @@ module Statsample
|
|
317
377
|
!(x.nil? or @missing_values.include? x)
|
318
378
|
end
|
319
379
|
# Set missing_values.
|
320
|
-
#
|
321
|
-
# set_valid_data after all changes
|
380
|
+
# set_valid_data is called after changes
|
322
381
|
def missing_values=(vals)
|
323
382
|
@missing_values = vals
|
324
383
|
set_valid_data
|
@@ -335,7 +394,11 @@ module Statsample
|
|
335
394
|
set_date_data if (t==:date)
|
336
395
|
end
|
337
396
|
def to_a
|
338
|
-
@data.
|
397
|
+
if @data.is_a? Array
|
398
|
+
@data.dup
|
399
|
+
else
|
400
|
+
@data.to_a
|
401
|
+
end
|
339
402
|
end
|
340
403
|
alias_method :to_ary, :to_a
|
341
404
|
|
@@ -357,6 +420,10 @@ module Statsample
|
|
357
420
|
def -(v)
|
358
421
|
_vector_ari("-",v)
|
359
422
|
end
|
423
|
+
|
424
|
+
def *(v)
|
425
|
+
_vector_ari("*",v)
|
426
|
+
end
|
360
427
|
# Reports all values that doesn't comply with a condition.
|
361
428
|
# Returns a hash with the index of data and the invalid data.
|
362
429
|
def verify
|
@@ -370,20 +437,16 @@ module Statsample
|
|
370
437
|
end
|
371
438
|
def _vector_ari(method,v) # :nodoc:
|
372
439
|
if(v.is_a? Vector or v.is_a? Array)
|
373
|
-
|
374
|
-
# i=0
|
440
|
+
raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
|
375
441
|
sum=[]
|
376
|
-
|
442
|
+
v.size.times {|i|
|
377
443
|
if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
|
378
444
|
sum.push(@data[i].send(method,v[i]))
|
379
445
|
else
|
380
446
|
sum.push(nil)
|
381
447
|
end
|
382
448
|
}
|
383
|
-
Statsample::Vector.new(sum, :scale
|
384
|
-
else
|
385
|
-
raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
|
386
|
-
end
|
449
|
+
Statsample::Vector.new(sum, :scale)
|
387
450
|
elsif(v.respond_to? method )
|
388
451
|
Statsample::Vector.new(
|
389
452
|
@data.collect {|x|
|
@@ -451,10 +514,10 @@ module Statsample
|
|
451
514
|
}
|
452
515
|
end
|
453
516
|
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
517
|
+
split_by_separator(sep).inject({}) {|a,v|
|
518
|
+
a[v[0]]=v[1].inject {|s,x| s+x.to_i}
|
519
|
+
a
|
520
|
+
}
|
458
521
|
end
|
459
522
|
|
460
523
|
# Returns an random sample of size n, with replacement,
|
@@ -463,13 +526,8 @@ module Statsample
|
|
463
526
|
# In all the trails, every item have the same probability
|
464
527
|
# of been selected.
|
465
528
|
def sample_with_replacement(sample=1)
|
466
|
-
|
467
|
-
|
468
|
-
(0...sample).collect{ @valid_data[rand(vds)] }
|
469
|
-
else
|
470
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
471
|
-
r.sample(@gsl, sample).to_a
|
472
|
-
end
|
529
|
+
vds=@valid_data.size
|
530
|
+
(0...sample).collect{ @valid_data[rand(vds)] }
|
473
531
|
end
|
474
532
|
# Returns an random sample of size n, without replacement,
|
475
533
|
# only with valid data.
|
@@ -479,7 +537,6 @@ module Statsample
|
|
479
537
|
# A sample of the same size of the vector is the vector itself.
|
480
538
|
|
481
539
|
def sample_without_replacement(sample=1)
|
482
|
-
if(@type!=:scale or !Statsample.has_gsl?)
|
483
540
|
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
484
541
|
out=[]
|
485
542
|
size=@valid_data.size
|
@@ -487,11 +544,7 @@ module Statsample
|
|
487
544
|
value=rand(size)
|
488
545
|
out.push(value) if !out.include?value
|
489
546
|
end
|
490
|
-
out.collect{|i
|
491
|
-
else
|
492
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
493
|
-
r.choose(@gsl, sample).to_a
|
494
|
-
end
|
547
|
+
out.collect{|i| @data[i]}
|
495
548
|
end
|
496
549
|
# Retrieves number of cases which comply condition.
|
497
550
|
# If block given, retrieves number of instances where
|
@@ -535,11 +588,11 @@ module Statsample
|
|
535
588
|
end
|
536
589
|
# Return true if all data is Numeric or nil
|
537
590
|
def can_be_scale?
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
591
|
+
if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
|
592
|
+
false
|
593
|
+
else
|
594
|
+
true
|
595
|
+
end
|
543
596
|
end
|
544
597
|
|
545
598
|
def to_s
|
@@ -560,13 +613,13 @@ module Statsample
|
|
560
613
|
end
|
561
614
|
# Retrieves uniques values for data.
|
562
615
|
def factors
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
616
|
+
if @type==:scale
|
617
|
+
@scale_data.uniq.sort
|
618
|
+
elsif @type==:date
|
619
|
+
@date_data_with_nils.uniq.sort
|
620
|
+
else
|
621
|
+
@valid_data.uniq.sort
|
622
|
+
end
|
570
623
|
end
|
571
624
|
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
572
625
|
# Returns a hash with the distribution of frecuencies for
|
@@ -579,6 +632,8 @@ module Statsample
|
|
579
632
|
_frequencies
|
580
633
|
end
|
581
634
|
end
|
635
|
+
|
636
|
+
|
582
637
|
def _frequencies #:nodoc:
|
583
638
|
@valid_data.inject(Hash.new) {|a,x|
|
584
639
|
a[x]||=0
|
@@ -589,7 +644,7 @@ module Statsample
|
|
589
644
|
|
590
645
|
# Returns the most frequent item.
|
591
646
|
def mode
|
592
|
-
frequencies.max{|a,b| a[1]<=>b[1]}
|
647
|
+
frequencies.max{|a,b| a[1]<=>b[1]}.first
|
593
648
|
end
|
594
649
|
# The numbers of item with valid data.
|
595
650
|
def n_valid
|
@@ -678,22 +733,17 @@ module Statsample
|
|
678
733
|
# Return the median (percentil 50)
|
679
734
|
def median
|
680
735
|
check_type :ordinal
|
681
|
-
|
682
|
-
sorted=GSL::Vector.alloc(@scale_data.sort)
|
683
|
-
GSL::Stats::median_from_sorted_data(sorted)
|
684
|
-
else
|
685
|
-
percentil(50)
|
686
|
-
end
|
736
|
+
percentil(50)
|
687
737
|
end
|
688
738
|
# Minimun value
|
689
739
|
def min
|
690
740
|
check_type :ordinal
|
691
|
-
@valid_data.min
|
741
|
+
@valid_data.min
|
692
742
|
end
|
693
743
|
# Maximum value
|
694
744
|
def max
|
695
745
|
check_type :ordinal
|
696
|
-
@valid_data.max
|
746
|
+
@valid_data.max
|
697
747
|
end
|
698
748
|
|
699
749
|
def set_date_data
|
@@ -722,9 +772,6 @@ module Statsample
|
|
722
772
|
x.to_f
|
723
773
|
end
|
724
774
|
end
|
725
|
-
if Statsample.has_gsl?
|
726
|
-
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
727
|
-
end
|
728
775
|
end
|
729
776
|
|
730
777
|
private :set_date_data, :set_scale_data
|
@@ -791,7 +838,6 @@ module Statsample
|
|
791
838
|
# Sample Standard deviation (denominator n-1)
|
792
839
|
def standard_deviation_sample(m=nil)
|
793
840
|
check_type :scale
|
794
|
-
|
795
841
|
m||=mean
|
796
842
|
Math::sqrt(variance_sample(m))
|
797
843
|
end
|
@@ -816,76 +862,30 @@ module Statsample
|
|
816
862
|
check_type :scale
|
817
863
|
@scale_data.inject(1){|a,x| a*x }
|
818
864
|
end
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
}
|
824
|
-
def sum # :nodoc:
|
825
|
-
check_type :scale
|
826
|
-
|
827
|
-
@gsl.sum
|
828
|
-
end
|
829
|
-
def mean # :nodoc:
|
865
|
+
|
866
|
+
# With a fixnum, creates X bins within the range of data
|
867
|
+
# With an Array, each value will be a cut point
|
868
|
+
def histogram(bins=10)
|
830
869
|
check_type :scale
|
831
|
-
@gsl.nil? ? nil : @gsl.mean
|
832
|
-
end
|
833
|
-
def variance_sample(m=nil) # :nodoc:
|
834
|
-
check_type :scale
|
835
|
-
m||=mean
|
836
|
-
@gsl.variance_m
|
837
|
-
end
|
838
|
-
def standard_deviation_sample(m=nil) # :nodoc:
|
839
|
-
check_type :scale
|
840
|
-
return nil if @gsl.nil?
|
841
|
-
m||=mean
|
842
|
-
@gsl.sd(m)
|
843
|
-
end
|
844
870
|
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
check_type :scale
|
857
|
-
@gsl.skew
|
858
|
-
end
|
859
|
-
def kurtosis # :nodoc:
|
860
|
-
check_type :scale
|
861
|
-
@gsl.kurtosis
|
862
|
-
end
|
863
|
-
# Create a GSL::Histogram
|
864
|
-
# With a fixnum, creates X bins within the range of data
|
865
|
-
# With an Array, each value will be a cut point
|
866
|
-
def histogram(bins=10)
|
867
|
-
check_type :scale
|
868
|
-
|
869
|
-
if bins.is_a? Array
|
870
|
-
#h=Statsample::Histogram.new(self, bins)
|
871
|
-
h=Statsample::Histogram.alloc(bins)
|
872
|
-
else
|
873
|
-
# ugly patch. The upper limit for a bin has the form
|
874
|
-
# x < range
|
875
|
-
#h=Statsample::Histogram.new(self, bins)
|
876
|
-
min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
|
877
|
-
# fix last data
|
878
|
-
if max==@valid_data.max
|
879
|
-
max+=1e-10
|
880
|
-
end
|
881
|
-
h=Statsample::Histogram.alloc(bins,[min,max])
|
882
|
-
# Fix last bin
|
883
|
-
|
871
|
+
if bins.is_a? Array
|
872
|
+
#h=Statsample::Histogram.new(self, bins)
|
873
|
+
h=Statsample::Histogram.alloc(bins)
|
874
|
+
else
|
875
|
+
# ugly patch. The upper limit for a bin has the form
|
876
|
+
# x < range
|
877
|
+
#h=Statsample::Histogram.new(self, bins)
|
878
|
+
min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
|
879
|
+
# fix last data
|
880
|
+
if max==@valid_data.max
|
881
|
+
max+=1e-10
|
884
882
|
end
|
885
|
-
h.
|
886
|
-
|
883
|
+
h=Statsample::Histogram.alloc(bins,[min,max])
|
884
|
+
# Fix last bin
|
885
|
+
|
887
886
|
end
|
888
|
-
|
887
|
+
h.increment(@valid_data)
|
888
|
+
h
|
889
889
|
end
|
890
890
|
|
891
891
|
# Coefficient of variation
|
@@ -894,7 +894,6 @@ module Statsample
|
|
894
894
|
check_type :scale
|
895
895
|
standard_deviation_sample.quo(mean)
|
896
896
|
end
|
897
|
-
|
898
897
|
alias_method :sdp, :standard_deviation_population
|
899
898
|
alias_method :sds, :standard_deviation_sample
|
900
899
|
alias_method :adp, :average_deviation_population
|
@@ -902,5 +901,6 @@ module Statsample
|
|
902
901
|
alias_method :variance, :variance_sample
|
903
902
|
alias_method :sd, :standard_deviation_sample
|
904
903
|
alias_method :ss, :sum_of_squares
|
904
|
+
include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
|
905
905
|
end
|
906
906
|
end
|