statsample 0.18.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +23 -0
- data/Manifest.txt +28 -17
- data/Rakefile +3 -2
- data/benchmarks/correlation_matrix_15_variables.rb +31 -0
- data/benchmarks/correlation_matrix_5_variables.rb +32 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +75 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/examples/boxplot.rb +13 -14
- data/examples/correlation_matrix.rb +16 -8
- data/examples/dataset.rb +13 -4
- data/examples/dominance_analysis.rb +23 -17
- data/examples/dominance_analysis_bootstrap.rb +28 -22
- data/examples/histogram.rb +8 -9
- data/examples/icc.rb +20 -21
- data/examples/levene.rb +10 -4
- data/examples/multiple_regression.rb +9 -28
- data/examples/multivariate_correlation.rb +9 -3
- data/examples/parallel_analysis.rb +20 -16
- data/examples/polychoric.rb +15 -9
- data/examples/principal_axis.rb +18 -6
- data/examples/reliability.rb +26 -13
- data/examples/scatterplot.rb +10 -6
- data/examples/t_test.rb +15 -6
- data/examples/tetrachoric.rb +9 -2
- data/examples/u_test.rb +12 -4
- data/examples/vector.rb +13 -2
- data/examples/velicer_map_test.rb +33 -26
- data/lib/statsample.rb +32 -12
- data/lib/statsample/analysis.rb +79 -0
- data/lib/statsample/analysis/suite.rb +72 -0
- data/lib/statsample/analysis/suitereportbuilder.rb +38 -0
- data/lib/statsample/bivariate.rb +70 -16
- data/lib/statsample/dataset.rb +25 -19
- data/lib/statsample/dominanceanalysis.rb +2 -2
- data/lib/statsample/factor.rb +2 -0
- data/lib/statsample/factor/map.rb +16 -10
- data/lib/statsample/factor/parallelanalysis.rb +9 -3
- data/lib/statsample/factor/pca.rb +28 -32
- data/lib/statsample/factor/rotation.rb +15 -8
- data/lib/statsample/graph/boxplot.rb +3 -4
- data/lib/statsample/graph/histogram.rb +2 -1
- data/lib/statsample/graph/scatterplot.rb +1 -0
- data/lib/statsample/matrix.rb +106 -16
- data/lib/statsample/regression.rb +4 -1
- data/lib/statsample/regression/binomial.rb +1 -1
- data/lib/statsample/regression/multiple/baseengine.rb +19 -9
- data/lib/statsample/regression/multiple/gslengine.rb +127 -126
- data/lib/statsample/regression/multiple/matrixengine.rb +8 -5
- data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
- data/lib/statsample/regression/simple.rb +31 -6
- data/lib/statsample/reliability.rb +11 -3
- data/lib/statsample/reliability/scaleanalysis.rb +4 -4
- data/lib/statsample/shorthand.rb +81 -0
- data/lib/statsample/test/chisquare.rb +1 -1
- data/lib/statsample/vector.rb +163 -163
- data/lib/statsample/vector/gsl.rb +106 -0
- data/references.txt +2 -2
- data/{data → test/fixtures}/crime.txt +0 -0
- data/{data → test/fixtures}/hartman_23.matrix +0 -0
- data/{data → test/fixtures}/repeated_fields.csv +0 -0
- data/{data → test/fixtures}/test_binomial.csv +0 -0
- data/test/{test_csv.csv → fixtures/test_csv.csv} +0 -0
- data/test/{test_xls.xls → fixtures/test_xls.xls} +0 -0
- data/{data → test/fixtures}/tetmat_matrix.txt +0 -0
- data/{data → test/fixtures}/tetmat_test.txt +0 -0
- data/test/helpers_tests.rb +18 -2
- data/test/test_analysis.rb +118 -0
- data/test/test_anovatwoway.rb +1 -1
- data/test/test_anovatwowaywithdataset.rb +1 -1
- data/test/test_anovawithvectors.rb +1 -2
- data/test/test_bartlettsphericity.rb +1 -2
- data/test/test_bivariate.rb +64 -22
- data/test/test_codification.rb +1 -2
- data/test/test_crosstab.rb +1 -2
- data/test/test_csv.rb +3 -4
- data/test/test_dataset.rb +24 -3
- data/test/test_dominance_analysis.rb +1 -2
- data/test/test_factor.rb +8 -69
- data/test/test_factor_map.rb +43 -0
- data/test/test_factor_pa.rb +54 -0
- data/test/test_ggobi.rb +1 -1
- data/test/test_gsl.rb +12 -18
- data/test/test_histogram.rb +1 -2
- data/test/test_logit.rb +62 -18
- data/test/test_matrix.rb +4 -5
- data/test/test_mle.rb +3 -4
- data/test/test_regression.rb +21 -2
- data/test/test_reliability.rb +3 -3
- data/test/test_reliability_icc.rb +1 -1
- data/test/test_reliability_skillscale.rb +20 -4
- data/test/test_resample.rb +1 -2
- data/test/test_rserve_extension.rb +1 -2
- data/test/test_srs.rb +1 -2
- data/test/test_statistics.rb +1 -2
- data/test/test_stest.rb +1 -2
- data/test/test_stratified.rb +1 -2
- data/test/test_test_f.rb +1 -2
- data/test/test_test_t.rb +1 -2
- data/test/test_umannwhitney.rb +1 -2
- data/test/test_vector.rb +117 -18
- data/test/test_xls.rb +2 -3
- data/web/Rakefile +39 -0
- metadata +109 -29
- metadata.gz.sig +0 -0
- data/examples/parallel_analysis_tetrachoric.rb +0 -31
- data/lib/distribution.rb +0 -25
- data/lib/distribution/chisquare.rb +0 -23
- data/lib/distribution/f.rb +0 -35
- data/lib/distribution/normal.rb +0 -60
- data/lib/distribution/normalbivariate.rb +0 -284
- data/lib/distribution/normalmultivariate.rb +0 -73
- data/lib/distribution/t.rb +0 -55
- data/test/test_distribution.rb +0 -73
|
@@ -30,13 +30,13 @@ class MatrixEngine < BaseEngine
|
|
|
30
30
|
|
|
31
31
|
# Number of cases
|
|
32
32
|
attr_writer :cases
|
|
33
|
-
|
|
33
|
+
attr_writer :digits
|
|
34
34
|
# Create object
|
|
35
35
|
#
|
|
36
36
|
def initialize(matrix,y_var, opts=Hash.new)
|
|
37
37
|
matrix.extend Statsample::CovariateMatrix
|
|
38
38
|
raise "#{y_var} variable should be on data" unless matrix.fields.include? y_var
|
|
39
|
-
if matrix.
|
|
39
|
+
if matrix._type==:covariance
|
|
40
40
|
@matrix_cov=matrix
|
|
41
41
|
@matrix_cor=matrix.correlation
|
|
42
42
|
@no_covariance=false
|
|
@@ -53,6 +53,8 @@ class MatrixEngine < BaseEngine
|
|
|
53
53
|
@predictors_n=@n_predictors
|
|
54
54
|
@matrix_x= @matrix_cor.submatrix(@fields)
|
|
55
55
|
@matrix_x_cov= @matrix_cov.submatrix(@fields)
|
|
56
|
+
raise LinearDependency, "Regressors are linearly dependent" if @matrix_x.determinant<1e-15
|
|
57
|
+
|
|
56
58
|
|
|
57
59
|
@matrix_y = @matrix_cor.submatrix(@fields, [y_var])
|
|
58
60
|
@matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var])
|
|
@@ -75,13 +77,14 @@ class MatrixEngine < BaseEngine
|
|
|
75
77
|
@y_mean=0.0
|
|
76
78
|
@name=_("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var]
|
|
77
79
|
|
|
78
|
-
|
|
80
|
+
opts_default={:digits=>3}
|
|
81
|
+
opts=opts_default.merge opts
|
|
79
82
|
opts.each{|k,v|
|
|
80
83
|
self.send("#{k}=",v) if self.respond_to? k
|
|
81
84
|
}
|
|
82
85
|
result_matrix=@matrix_x_cov.inverse * @matrix_y_cov
|
|
83
86
|
|
|
84
|
-
if matrix.
|
|
87
|
+
if matrix._type==:covariance
|
|
85
88
|
@coeffs=result_matrix.column(0).to_a
|
|
86
89
|
@coeffs_stan=coeffs.collect {|k,v|
|
|
87
90
|
coeffs[k]*@x_sd[k].quo(@y_sd)
|
|
@@ -141,7 +144,7 @@ class MatrixEngine < BaseEngine
|
|
|
141
144
|
# Tolerance for a given variable
|
|
142
145
|
# defined as (1-R^2) of regression of other independent variables
|
|
143
146
|
# over the selected
|
|
144
|
-
# Reference:
|
|
147
|
+
# == Reference:
|
|
145
148
|
# * http://talkstats.com/showthread.php?t=5056
|
|
146
149
|
def tolerance(var)
|
|
147
150
|
return 1 if @matrix_x.column_size==1
|
|
@@ -8,8 +8,10 @@ module Statsample
|
|
|
8
8
|
# * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
|
|
9
9
|
#
|
|
10
10
|
class Simple
|
|
11
|
+
include Summarizable
|
|
11
12
|
attr_accessor :a,:b,:cov00, :cov01, :covx1, :chisq, :status
|
|
12
|
-
|
|
13
|
+
attr_accessor :name
|
|
14
|
+
attr_accessor :digits
|
|
13
15
|
def initialize(init_method, *argv)
|
|
14
16
|
self.send(init_method, *argv)
|
|
15
17
|
end
|
|
@@ -61,15 +63,15 @@ module Statsample
|
|
|
61
63
|
new(:init_gsl, *ar)
|
|
62
64
|
end
|
|
63
65
|
# Create a simple regression using two vectors
|
|
64
|
-
def new_from_vectors(vx,vy)
|
|
65
|
-
new(:init_vectors,vx,vy)
|
|
66
|
+
def new_from_vectors(vx,vy, opts=Hash.new)
|
|
67
|
+
new(:init_vectors,vx,vy, opts)
|
|
66
68
|
end
|
|
67
69
|
# Create a simple regression using a dataset and two vector names.
|
|
68
|
-
def new_from_dataset(ds,x,y)
|
|
69
|
-
new(:init_vectors,ds[x],ds[y])
|
|
70
|
+
def new_from_dataset(ds,x,y, opts=Hash.new)
|
|
71
|
+
new(:init_vectors,ds[x],ds[y], opts)
|
|
70
72
|
end
|
|
71
73
|
end
|
|
72
|
-
def init_vectors(vx,vy)
|
|
74
|
+
def init_vectors(vx,vy, opts=Hash.new)
|
|
73
75
|
@vx,@vy=Statsample.only_valid_clone(vx,vy)
|
|
74
76
|
x_m=@vx.mean
|
|
75
77
|
y_m=@vy.mean
|
|
@@ -80,6 +82,17 @@ module Statsample
|
|
|
80
82
|
}
|
|
81
83
|
@b=num.to_f/den
|
|
82
84
|
@a=y_m - @b*x_m
|
|
85
|
+
|
|
86
|
+
opts_default={
|
|
87
|
+
:digits=>3,
|
|
88
|
+
:name=>_("Regression of %s over %s") % [@vx.name, @vy.name]
|
|
89
|
+
}
|
|
90
|
+
@opts=opts_default.merge opts
|
|
91
|
+
|
|
92
|
+
@opts.each{|k,v|
|
|
93
|
+
self.send("#{k}=",v) if self.respond_to? k
|
|
94
|
+
}
|
|
95
|
+
|
|
83
96
|
end
|
|
84
97
|
def init_gsl(a,b,cov00, cov01, covx1, chisq, status)
|
|
85
98
|
@a=a
|
|
@@ -90,6 +103,18 @@ module Statsample
|
|
|
90
103
|
@chisq=chisq
|
|
91
104
|
@status=status
|
|
92
105
|
end
|
|
106
|
+
def report_building(gen)
|
|
107
|
+
f="%0.#{digits}f"
|
|
108
|
+
gen.section(:name=>name) do |s|
|
|
109
|
+
s.table(:header=>[_("Variable"), _("Value")]) do |t|
|
|
110
|
+
t.row [_("r"), f % r]
|
|
111
|
+
t.row [_("r^2"), f % r2]
|
|
112
|
+
t.row [_("a"), f % a]
|
|
113
|
+
t.row [_("b"), f % a]
|
|
114
|
+
t.row [_("s.e"), f % standard_error]
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
93
118
|
private :init_vectors, :init_gsl
|
|
94
119
|
end
|
|
95
120
|
end
|
|
@@ -5,6 +5,7 @@ module Statsample
|
|
|
5
5
|
# only uses tuples without missing data
|
|
6
6
|
def cronbach_alpha(ods)
|
|
7
7
|
ds=ods.dup_only_valid
|
|
8
|
+
return nil if ds.vectors.any? {|k,v| v.variance==0}
|
|
8
9
|
n_items=ds.fields.size
|
|
9
10
|
return nil if n_items<=1
|
|
10
11
|
s2_items=ds.vectors.inject(0) {|ac,v|
|
|
@@ -16,11 +17,18 @@ module Statsample
|
|
|
16
17
|
# Calculate Cronbach's alpha for a given dataset
|
|
17
18
|
# using standarized values for every vector.
|
|
18
19
|
# Only uses tuples without missing data
|
|
19
|
-
|
|
20
|
+
# Return nil if one or more vectors has 0 variance
|
|
20
21
|
def cronbach_alpha_standarized(ods)
|
|
21
|
-
|
|
22
|
-
|
|
22
|
+
|
|
23
|
+
ds=ods.dup_only_valid
|
|
24
|
+
|
|
25
|
+
return nil if ds.vectors.any? {|k,v| v.variance==0}
|
|
26
|
+
|
|
27
|
+
ds=ds.fields.inject({}){|a,f|
|
|
28
|
+
a[f]=ods[f].standarized;
|
|
29
|
+
a
|
|
23
30
|
}.to_dataset
|
|
31
|
+
|
|
24
32
|
cronbach_alpha(ds)
|
|
25
33
|
end
|
|
26
34
|
# Predicted reliability of a test by replicating
|
|
@@ -53,12 +53,12 @@ module Statsample
|
|
|
53
53
|
@variances=@k.times.map {|i| @cov_m[i,i]}.to_scale
|
|
54
54
|
@variances_mean=@variances.mean
|
|
55
55
|
@covariances_mean=(@variance-@variances.sum).quo(@k**2-@k)
|
|
56
|
-
begin
|
|
56
|
+
#begin
|
|
57
57
|
@alpha = Statsample::Reliability.cronbach_alpha(@ds)
|
|
58
58
|
@alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(@ds)
|
|
59
|
-
rescue => e
|
|
60
|
-
|
|
61
|
-
end
|
|
59
|
+
#rescue => e
|
|
60
|
+
# raise DatasetException.new(@ds,e), "Error calculating alpha"
|
|
61
|
+
#end
|
|
62
62
|
end
|
|
63
63
|
# Returns a hash with structure
|
|
64
64
|
def item_characteristic_curve
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
module Statsample
|
|
2
|
+
# Module which provides shorthands for many methods.
|
|
3
|
+
module Shorthand
|
|
4
|
+
###
|
|
5
|
+
# :section: R like methods
|
|
6
|
+
###
|
|
7
|
+
|
|
8
|
+
# Retrieve names (fields) from dataset
|
|
9
|
+
def names(ds)
|
|
10
|
+
ds.fields
|
|
11
|
+
end
|
|
12
|
+
# Create a correlation matrix from a dataset
|
|
13
|
+
def cor(ds)
|
|
14
|
+
Statsample::Bivariate.correlation_matrix(ds)
|
|
15
|
+
end
|
|
16
|
+
# Create a variance/covariance matrix from a dataset
|
|
17
|
+
def cov(ds)
|
|
18
|
+
Statsample::Bivariate.covariate_matrix(ds)
|
|
19
|
+
end
|
|
20
|
+
# Create a Statsample::Vector
|
|
21
|
+
# Analog to R's c
|
|
22
|
+
def c(*args)
|
|
23
|
+
Statsample::Vector[*args]
|
|
24
|
+
end
|
|
25
|
+
# Random generation for the normal distribution
|
|
26
|
+
def rnorm(n,mean=0,sd=1)
|
|
27
|
+
rng=Distribution::Normal.rng(mean,sd)
|
|
28
|
+
Statsample::Vector.new_scale(n) { rng.call}
|
|
29
|
+
end
|
|
30
|
+
# Creates a new Statsample::Dataset
|
|
31
|
+
# Each key is transformed into string
|
|
32
|
+
def dataset(vectors=Hash.new)
|
|
33
|
+
vectors=vectors.inject({}) {|ac,v| ac[v[0].to_s]=v[1];ac}
|
|
34
|
+
Statsample::Dataset.new(vectors)
|
|
35
|
+
end
|
|
36
|
+
alias :data_frame :dataset
|
|
37
|
+
# Returns a Statsample::Graph::Boxplot
|
|
38
|
+
def boxplot(*args)
|
|
39
|
+
Statsample::Graph::Boxplot.new(*args)
|
|
40
|
+
end
|
|
41
|
+
# Returns a Statsample::Graph::Histogram
|
|
42
|
+
def histogram(*args)
|
|
43
|
+
Statsample::Graph::Histogram.new(*args)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# Returns a Statsample::Graph::Scatterplot
|
|
47
|
+
def scatterplot(*args)
|
|
48
|
+
Statsample::Graph::Scatterplot.new(*args)
|
|
49
|
+
end
|
|
50
|
+
# Returns a Statsample::Test::Levene
|
|
51
|
+
def levene(*args)
|
|
52
|
+
Statsample::Test::Levene.new(*args)
|
|
53
|
+
end
|
|
54
|
+
def principal_axis(*args)
|
|
55
|
+
Statsample::Factor::PrincipalAxis.new(*args)
|
|
56
|
+
|
|
57
|
+
end
|
|
58
|
+
def polychoric(*args)
|
|
59
|
+
Statsample::Bivariate::Polychoric.new(*args)
|
|
60
|
+
end
|
|
61
|
+
def tetrachoric(*args)
|
|
62
|
+
Statsample::Bivariate::Tetrachoric.new(*args)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
###
|
|
66
|
+
# Other Shortcuts
|
|
67
|
+
###
|
|
68
|
+
def lr(*args)
|
|
69
|
+
Statsample::Regression.multiple(*args)
|
|
70
|
+
end
|
|
71
|
+
def pca(ds,opts=Hash.new)
|
|
72
|
+
Statsample::Factor::PCA.new(ds,opts)
|
|
73
|
+
end
|
|
74
|
+
def dominance_analysis(*args)
|
|
75
|
+
Statsample::DominanceAnalysis.new(*args)
|
|
76
|
+
end
|
|
77
|
+
def dominance_analysis_bootstrap(*args)
|
|
78
|
+
Statsample::DominanceAnalysis::Bootstrap.new(*args)
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
data/lib/statsample/vector.rb
CHANGED
|
@@ -1,17 +1,31 @@
|
|
|
1
1
|
require 'date'
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
2
|
+
require 'statsample/vector/gsl'
|
|
3
|
+
|
|
4
|
+
module Statsample::VectorShorthands
|
|
5
|
+
# Creates a new Statsample::Vector object
|
|
6
|
+
# Argument should be equal to Vector.new
|
|
7
|
+
def to_vector(*args)
|
|
6
8
|
Statsample::Vector.new(self,*args)
|
|
7
9
|
end
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
# Creates a new Statsample::Vector object of type :scale
|
|
11
|
+
def to_scale(*args)
|
|
12
|
+
Statsample::Vector.new(self, :scale, *args)
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
class Array
|
|
17
|
+
include Statsample::VectorShorthands
|
|
12
18
|
end
|
|
13
19
|
|
|
20
|
+
if Statsample.has_gsl?
|
|
21
|
+
module GSL
|
|
22
|
+
class Vector
|
|
23
|
+
include Statsample::VectorShorthands
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
14
27
|
module Statsample
|
|
28
|
+
|
|
15
29
|
|
|
16
30
|
# Collection of values on one dimension. Works as a column on a Spreadsheet.
|
|
17
31
|
#
|
|
@@ -41,26 +55,21 @@ module Statsample
|
|
|
41
55
|
attr_reader :data_with_nils
|
|
42
56
|
# Date data, with all missing values replaced by nils
|
|
43
57
|
attr_reader :date_data_with_nils
|
|
44
|
-
# GSL Object, only available with rbgsl extension and type==:scale
|
|
45
|
-
attr_reader :gsl
|
|
46
58
|
# Change label for specific values
|
|
47
59
|
attr_accessor :labels
|
|
48
60
|
# Name of vector. Should be used for output by many classes
|
|
49
61
|
attr_accessor :name
|
|
50
62
|
|
|
51
|
-
#
|
|
52
63
|
# Creates a new Vector object.
|
|
53
|
-
# * <tt>data</tt>
|
|
64
|
+
# * <tt>data</tt> Any data which can be converted to an Array
|
|
54
65
|
# * <tt>type</tt> Level of measurement. See Vector#type
|
|
55
66
|
# * <tt>opts</tt> Hash of options
|
|
56
67
|
# * <tt>:missing_values</tt> Array of missing values. See Vector#missing_values
|
|
57
68
|
# * <tt>:today_values</tt> Array of 'today' values. See Vector#today_values
|
|
58
69
|
# * <tt>:labels</tt> Labels for data values
|
|
59
70
|
# * <tt>:name</tt> Name of vector
|
|
60
|
-
#
|
|
61
71
|
def initialize(data=[], type=:nominal, opts=Hash.new)
|
|
62
|
-
|
|
63
|
-
@data=data
|
|
72
|
+
@data=data.is_a?(Array) ? data : data.to_a
|
|
64
73
|
@type=type
|
|
65
74
|
opts_default={
|
|
66
75
|
:missing_values=>[],
|
|
@@ -84,9 +93,46 @@ module Statsample
|
|
|
84
93
|
@missing_data=[]
|
|
85
94
|
@has_missing_data=nil
|
|
86
95
|
@scale_data=nil
|
|
87
|
-
|
|
96
|
+
set_valid_data
|
|
88
97
|
self.type=type
|
|
89
98
|
end
|
|
99
|
+
# Create a vector using (almost) any object
|
|
100
|
+
# * Array: flattened
|
|
101
|
+
# * Range: transformed using to_a
|
|
102
|
+
# * Statsample::Vector
|
|
103
|
+
# * Numeric and string values
|
|
104
|
+
def self.[](*args)
|
|
105
|
+
values=[]
|
|
106
|
+
args.each do |a|
|
|
107
|
+
case a
|
|
108
|
+
when Array
|
|
109
|
+
values.concat a.flatten
|
|
110
|
+
when Statsample::Vector
|
|
111
|
+
values.concat a.to_a
|
|
112
|
+
when Range
|
|
113
|
+
values.concat a.to_a
|
|
114
|
+
else
|
|
115
|
+
values << a
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
vector=new(values)
|
|
119
|
+
vector.type=:scale if vector.can_be_scale?
|
|
120
|
+
vector
|
|
121
|
+
end
|
|
122
|
+
# Create a new scale type vector
|
|
123
|
+
# Parameters
|
|
124
|
+
# [n] Size
|
|
125
|
+
# [val] Value of each value
|
|
126
|
+
# [&block] If block provided, is used to set the values of vector
|
|
127
|
+
def self.new_scale(n,val=nil, &block)
|
|
128
|
+
if block
|
|
129
|
+
vector=n.times.map {|i| block.call(i)}.to_scale
|
|
130
|
+
else
|
|
131
|
+
vector=n.times.map { val}.to_scale
|
|
132
|
+
end
|
|
133
|
+
vector.type=:scale
|
|
134
|
+
vector
|
|
135
|
+
end
|
|
90
136
|
# Creates a duplicate of the Vector.
|
|
91
137
|
# Note: data, missing_values and labels are duplicated, so
|
|
92
138
|
# changes on original vector don't propagate to copies.
|
|
@@ -98,40 +144,48 @@ module Statsample
|
|
|
98
144
|
def dup_empty
|
|
99
145
|
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
|
|
100
146
|
end
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
147
|
+
|
|
148
|
+
if Statsample::STATSAMPLE__.respond_to?(:check_type)
|
|
149
|
+
# Raises an exception if type of vector is inferior to t type
|
|
150
|
+
def check_type(t)
|
|
151
|
+
Statsample::STATSAMPLE__.check_type(self,t)
|
|
152
|
+
end
|
|
153
|
+
else
|
|
154
|
+
def check_type(t) #:nodoc:
|
|
155
|
+
_check_type(t)
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _check_type(t) #:nodoc:
|
|
161
|
+
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date) or (:date==@type)
|
|
104
162
|
end
|
|
105
|
-
private :check_type
|
|
106
163
|
|
|
164
|
+
def vector_standarized_compute(m,sd) # :nodoc:
|
|
165
|
+
@data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:scale)
|
|
166
|
+
end
|
|
107
167
|
# Return a vector using the standarized values for data
|
|
108
168
|
# with sd with denominator n-1. With variance=0 or mean nil,
|
|
109
169
|
# returns a vector of equal size full of nils
|
|
110
170
|
#
|
|
111
|
-
|
|
112
171
|
def vector_standarized(use_population=false)
|
|
113
172
|
check_type :scale
|
|
114
|
-
return ([nil]*size).to_scale if mean.nil?
|
|
115
173
|
m=mean
|
|
116
174
|
sd=use_population ? sdp : sds
|
|
117
|
-
return ([nil]*size).to_scale if sd==0.0
|
|
118
|
-
vector
|
|
119
|
-
if !x.nil?
|
|
120
|
-
(x.to_f - m).quo(sd)
|
|
121
|
-
else
|
|
122
|
-
nil
|
|
123
|
-
end
|
|
124
|
-
}.to_vector(:scale)
|
|
175
|
+
return ([nil]*size).to_scale if mean.nil? or sd==0.0
|
|
176
|
+
vector=vector_standarized_compute(m,sd)
|
|
125
177
|
vector.name=_("%s(standarized)") % @name
|
|
126
178
|
vector
|
|
127
179
|
end
|
|
180
|
+
def vector_centered_compute(m) #:nodoc:
|
|
181
|
+
@data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_scale
|
|
182
|
+
end
|
|
128
183
|
# Return a centered vector
|
|
129
184
|
def vector_centered
|
|
130
185
|
check_type :scale
|
|
131
186
|
m=mean
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
}.to_scale
|
|
187
|
+
return ([nil]*size).to_scale if mean.nil?
|
|
188
|
+
vector=vector_centered_compute(m)
|
|
135
189
|
vector.name=_("%s(centered)") % @name
|
|
136
190
|
vector
|
|
137
191
|
end
|
|
@@ -148,18 +202,18 @@ module Statsample
|
|
|
148
202
|
vector
|
|
149
203
|
end
|
|
150
204
|
def box_cox_transformation(lambda) # :nodoc:
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
205
|
+
raise "Should be a scale" unless @type==:scale
|
|
206
|
+
@data_with_nils.collect{|x|
|
|
207
|
+
if !x.nil?
|
|
208
|
+
if(lambda==0)
|
|
209
|
+
Math.log(x)
|
|
210
|
+
else
|
|
211
|
+
(x**lambda-1).quo(lambda)
|
|
212
|
+
end
|
|
156
213
|
else
|
|
157
|
-
|
|
214
|
+
nil
|
|
158
215
|
end
|
|
159
|
-
|
|
160
|
-
nil
|
|
161
|
-
end
|
|
162
|
-
}.to_vector(:scale)
|
|
216
|
+
}.to_vector(:scale)
|
|
163
217
|
end
|
|
164
218
|
|
|
165
219
|
# Vector equality.
|
|
@@ -193,6 +247,10 @@ module Statsample
|
|
|
193
247
|
}
|
|
194
248
|
set_valid_data
|
|
195
249
|
end
|
|
250
|
+
def push(v)
|
|
251
|
+
@data.push(v)
|
|
252
|
+
set_valid_data
|
|
253
|
+
end
|
|
196
254
|
# Dicotomize the vector with 0 and 1, based on lowest value
|
|
197
255
|
# If parameter is defined, this value and lower
|
|
198
256
|
# will be 0 and higher, 1
|
|
@@ -250,7 +308,6 @@ module Statsample
|
|
|
250
308
|
@missing_data.clear
|
|
251
309
|
@data_with_nils.clear
|
|
252
310
|
@date_data_with_nils.clear
|
|
253
|
-
@gsl=nil
|
|
254
311
|
set_valid_data_intern
|
|
255
312
|
set_scale_data if(@type==:scale)
|
|
256
313
|
set_date_data if(@type==:date)
|
|
@@ -281,11 +338,14 @@ module Statsample
|
|
|
281
338
|
def has_missing_data?
|
|
282
339
|
@has_missing_data
|
|
283
340
|
end
|
|
341
|
+
alias :flawed? :has_missing_data?
|
|
342
|
+
|
|
284
343
|
# Retrieves label for value x. Retrieves x if
|
|
285
344
|
# no label defined.
|
|
286
345
|
def labeling(x)
|
|
287
346
|
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
|
288
347
|
end
|
|
348
|
+
alias :label :labeling
|
|
289
349
|
# Returns a Vector with data with labels replaced by the label.
|
|
290
350
|
def vector_labeled
|
|
291
351
|
d=@data.collect{|x|
|
|
@@ -317,8 +377,7 @@ module Statsample
|
|
|
317
377
|
!(x.nil? or @missing_values.include? x)
|
|
318
378
|
end
|
|
319
379
|
# Set missing_values.
|
|
320
|
-
#
|
|
321
|
-
# set_valid_data after all changes
|
|
380
|
+
# set_valid_data is called after changes
|
|
322
381
|
def missing_values=(vals)
|
|
323
382
|
@missing_values = vals
|
|
324
383
|
set_valid_data
|
|
@@ -335,7 +394,11 @@ module Statsample
|
|
|
335
394
|
set_date_data if (t==:date)
|
|
336
395
|
end
|
|
337
396
|
def to_a
|
|
338
|
-
@data.
|
|
397
|
+
if @data.is_a? Array
|
|
398
|
+
@data.dup
|
|
399
|
+
else
|
|
400
|
+
@data.to_a
|
|
401
|
+
end
|
|
339
402
|
end
|
|
340
403
|
alias_method :to_ary, :to_a
|
|
341
404
|
|
|
@@ -357,6 +420,10 @@ module Statsample
|
|
|
357
420
|
def -(v)
|
|
358
421
|
_vector_ari("-",v)
|
|
359
422
|
end
|
|
423
|
+
|
|
424
|
+
def *(v)
|
|
425
|
+
_vector_ari("*",v)
|
|
426
|
+
end
|
|
360
427
|
# Reports all values that don't comply with a condition.
|
|
361
428
|
# Returns a hash with the index of data and the invalid data.
|
|
362
429
|
def verify
|
|
@@ -370,20 +437,16 @@ module Statsample
|
|
|
370
437
|
end
|
|
371
438
|
def _vector_ari(method,v) # :nodoc:
|
|
372
439
|
if(v.is_a? Vector or v.is_a? Array)
|
|
373
|
-
|
|
374
|
-
# i=0
|
|
440
|
+
raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
|
|
375
441
|
sum=[]
|
|
376
|
-
|
|
442
|
+
v.size.times {|i|
|
|
377
443
|
if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
|
|
378
444
|
sum.push(@data[i].send(method,v[i]))
|
|
379
445
|
else
|
|
380
446
|
sum.push(nil)
|
|
381
447
|
end
|
|
382
448
|
}
|
|
383
|
-
Statsample::Vector.new(sum, :scale
|
|
384
|
-
else
|
|
385
|
-
raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
|
|
386
|
-
end
|
|
449
|
+
Statsample::Vector.new(sum, :scale)
|
|
387
450
|
elsif(v.respond_to? method )
|
|
388
451
|
Statsample::Vector.new(
|
|
389
452
|
@data.collect {|x|
|
|
@@ -451,10 +514,10 @@ module Statsample
|
|
|
451
514
|
}
|
|
452
515
|
end
|
|
453
516
|
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
517
|
+
split_by_separator(sep).inject({}) {|a,v|
|
|
518
|
+
a[v[0]]=v[1].inject {|s,x| s+x.to_i}
|
|
519
|
+
a
|
|
520
|
+
}
|
|
458
521
|
end
|
|
459
522
|
|
|
460
523
|
# Returns a random sample of size n, with replacement,
|
|
@@ -463,13 +526,8 @@ module Statsample
|
|
|
463
526
|
# In all the trials, every item has the same probability
|
|
464
527
|
# of being selected.
|
|
465
528
|
def sample_with_replacement(sample=1)
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
(0...sample).collect{ @valid_data[rand(vds)] }
|
|
469
|
-
else
|
|
470
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
|
471
|
-
r.sample(@gsl, sample).to_a
|
|
472
|
-
end
|
|
529
|
+
vds=@valid_data.size
|
|
530
|
+
(0...sample).collect{ @valid_data[rand(vds)] }
|
|
473
531
|
end
|
|
474
532
|
# Returns a random sample of size n, without replacement,
|
|
475
533
|
# only with valid data.
|
|
@@ -479,7 +537,6 @@ module Statsample
|
|
|
479
537
|
# A sample of the same size of the vector is the vector itself.
|
|
480
538
|
|
|
481
539
|
def sample_without_replacement(sample=1)
|
|
482
|
-
if(@type!=:scale or !Statsample.has_gsl?)
|
|
483
540
|
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
|
484
541
|
out=[]
|
|
485
542
|
size=@valid_data.size
|
|
@@ -487,11 +544,7 @@ module Statsample
|
|
|
487
544
|
value=rand(size)
|
|
488
545
|
out.push(value) if !out.include?value
|
|
489
546
|
end
|
|
490
|
-
out.collect{|i
|
|
491
|
-
else
|
|
492
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
|
493
|
-
r.choose(@gsl, sample).to_a
|
|
494
|
-
end
|
|
547
|
+
out.collect{|i| @data[i]}
|
|
495
548
|
end
|
|
496
549
|
# Retrieves number of cases which comply condition.
|
|
497
550
|
# If block given, retrieves number of instances where
|
|
@@ -535,11 +588,11 @@ module Statsample
|
|
|
535
588
|
end
|
|
536
589
|
# Return true if all data is Numeric or nil
|
|
537
590
|
def can_be_scale?
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
591
|
+
if @data.find {|v| !v.nil? and !v.is_a? Numeric and !@missing_values.include? v}
|
|
592
|
+
false
|
|
593
|
+
else
|
|
594
|
+
true
|
|
595
|
+
end
|
|
543
596
|
end
|
|
544
597
|
|
|
545
598
|
def to_s
|
|
@@ -560,13 +613,13 @@ module Statsample
|
|
|
560
613
|
end
|
|
561
614
|
# Retrieves unique values for data.
|
|
562
615
|
def factors
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
616
|
+
if @type==:scale
|
|
617
|
+
@scale_data.uniq.sort
|
|
618
|
+
elsif @type==:date
|
|
619
|
+
@date_data_with_nils.uniq.sort
|
|
620
|
+
else
|
|
621
|
+
@valid_data.uniq.sort
|
|
622
|
+
end
|
|
570
623
|
end
|
|
571
624
|
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
|
572
625
|
# Returns a hash with the distribution of frequencies for
|
|
@@ -579,6 +632,8 @@ module Statsample
|
|
|
579
632
|
_frequencies
|
|
580
633
|
end
|
|
581
634
|
end
|
|
635
|
+
|
|
636
|
+
|
|
582
637
|
def _frequencies #:nodoc:
|
|
583
638
|
@valid_data.inject(Hash.new) {|a,x|
|
|
584
639
|
a[x]||=0
|
|
@@ -589,7 +644,7 @@ module Statsample
|
|
|
589
644
|
|
|
590
645
|
# Returns the most frequent item.
|
|
591
646
|
def mode
|
|
592
|
-
frequencies.max{|a,b| a[1]<=>b[1]}
|
|
647
|
+
frequencies.max{|a,b| a[1]<=>b[1]}.first
|
|
593
648
|
end
|
|
594
649
|
# The numbers of item with valid data.
|
|
595
650
|
def n_valid
|
|
@@ -678,22 +733,17 @@ module Statsample
|
|
|
678
733
|
# Return the median (percentil 50)
|
|
679
734
|
def median
|
|
680
735
|
check_type :ordinal
|
|
681
|
-
|
|
682
|
-
sorted=GSL::Vector.alloc(@scale_data.sort)
|
|
683
|
-
GSL::Stats::median_from_sorted_data(sorted)
|
|
684
|
-
else
|
|
685
|
-
percentil(50)
|
|
686
|
-
end
|
|
736
|
+
percentil(50)
|
|
687
737
|
end
|
|
688
738
|
# Minimum value
|
|
689
739
|
def min
|
|
690
740
|
check_type :ordinal
|
|
691
|
-
@valid_data.min
|
|
741
|
+
@valid_data.min
|
|
692
742
|
end
|
|
693
743
|
# Maximum value
|
|
694
744
|
def max
|
|
695
745
|
check_type :ordinal
|
|
696
|
-
@valid_data.max
|
|
746
|
+
@valid_data.max
|
|
697
747
|
end
|
|
698
748
|
|
|
699
749
|
def set_date_data
|
|
@@ -722,9 +772,6 @@ module Statsample
|
|
|
722
772
|
x.to_f
|
|
723
773
|
end
|
|
724
774
|
end
|
|
725
|
-
if Statsample.has_gsl?
|
|
726
|
-
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
|
727
|
-
end
|
|
728
775
|
end
|
|
729
776
|
|
|
730
777
|
private :set_date_data, :set_scale_data
|
|
@@ -791,7 +838,6 @@ module Statsample
|
|
|
791
838
|
# Sample Standard deviation (denominator n-1)
|
|
792
839
|
def standard_deviation_sample(m=nil)
|
|
793
840
|
check_type :scale
|
|
794
|
-
|
|
795
841
|
m||=mean
|
|
796
842
|
Math::sqrt(variance_sample(m))
|
|
797
843
|
end
|
|
@@ -816,76 +862,30 @@ module Statsample
|
|
|
816
862
|
check_type :scale
|
|
817
863
|
@scale_data.inject(1){|a,x| a*x }
|
|
818
864
|
end
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
}
|
|
824
|
-
def sum # :nodoc:
|
|
825
|
-
check_type :scale
|
|
826
|
-
|
|
827
|
-
@gsl.sum
|
|
828
|
-
end
|
|
829
|
-
def mean # :nodoc:
|
|
865
|
+
|
|
866
|
+
# With a fixnum, creates X bins within the range of data
|
|
867
|
+
# With an Array, each value will be a cut point
|
|
868
|
+
def histogram(bins=10)
|
|
830
869
|
check_type :scale
|
|
831
|
-
@gsl.nil? ? nil : @gsl.mean
|
|
832
|
-
end
|
|
833
|
-
def variance_sample(m=nil) # :nodoc:
|
|
834
|
-
check_type :scale
|
|
835
|
-
m||=mean
|
|
836
|
-
@gsl.variance_m
|
|
837
|
-
end
|
|
838
|
-
def standard_deviation_sample(m=nil) # :nodoc:
|
|
839
|
-
check_type :scale
|
|
840
|
-
return nil if @gsl.nil?
|
|
841
|
-
m||=mean
|
|
842
|
-
@gsl.sd(m)
|
|
843
|
-
end
|
|
844
870
|
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
check_type :scale
|
|
857
|
-
@gsl.skew
|
|
858
|
-
end
|
|
859
|
-
def kurtosis # :nodoc:
|
|
860
|
-
check_type :scale
|
|
861
|
-
@gsl.kurtosis
|
|
862
|
-
end
|
|
863
|
-
# Create a GSL::Histogram
|
|
864
|
-
# With a fixnum, creates X bins within the range of data
|
|
865
|
-
# With an Array, each value will be a cut point
|
|
866
|
-
def histogram(bins=10)
|
|
867
|
-
check_type :scale
|
|
868
|
-
|
|
869
|
-
if bins.is_a? Array
|
|
870
|
-
#h=Statsample::Histogram.new(self, bins)
|
|
871
|
-
h=Statsample::Histogram.alloc(bins)
|
|
872
|
-
else
|
|
873
|
-
# ugly patch. The upper limit for a bin has the form
|
|
874
|
-
# x < range
|
|
875
|
-
#h=Statsample::Histogram.new(self, bins)
|
|
876
|
-
min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
|
|
877
|
-
# fix last data
|
|
878
|
-
if max==@valid_data.max
|
|
879
|
-
max+=1e-10
|
|
880
|
-
end
|
|
881
|
-
h=Statsample::Histogram.alloc(bins,[min,max])
|
|
882
|
-
# Fix last bin
|
|
883
|
-
|
|
871
|
+
if bins.is_a? Array
|
|
872
|
+
#h=Statsample::Histogram.new(self, bins)
|
|
873
|
+
h=Statsample::Histogram.alloc(bins)
|
|
874
|
+
else
|
|
875
|
+
# ugly patch. The upper limit for a bin has the form
|
|
876
|
+
# x < range
|
|
877
|
+
#h=Statsample::Histogram.new(self, bins)
|
|
878
|
+
min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
|
|
879
|
+
# fix last data
|
|
880
|
+
if max==@valid_data.max
|
|
881
|
+
max+=1e-10
|
|
884
882
|
end
|
|
885
|
-
h.
|
|
886
|
-
|
|
883
|
+
h=Statsample::Histogram.alloc(bins,[min,max])
|
|
884
|
+
# Fix last bin
|
|
885
|
+
|
|
887
886
|
end
|
|
888
|
-
|
|
887
|
+
h.increment(@valid_data)
|
|
888
|
+
h
|
|
889
889
|
end
|
|
890
890
|
|
|
891
891
|
# Coefficient of variation
|
|
@@ -894,7 +894,6 @@ module Statsample
|
|
|
894
894
|
check_type :scale
|
|
895
895
|
standard_deviation_sample.quo(mean)
|
|
896
896
|
end
|
|
897
|
-
|
|
898
897
|
alias_method :sdp, :standard_deviation_population
|
|
899
898
|
alias_method :sds, :standard_deviation_sample
|
|
900
899
|
alias_method :adp, :average_deviation_population
|
|
@@ -902,5 +901,6 @@ module Statsample
|
|
|
902
901
|
alias_method :variance, :variance_sample
|
|
903
902
|
alias_method :sd, :standard_deviation_sample
|
|
904
903
|
alias_method :ss, :sum_of_squares
|
|
904
|
+
include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
|
|
905
905
|
end
|
|
906
906
|
end
|