statsample 0.9.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +20 -1
- data/Manifest.txt +8 -1
- data/README.txt +11 -7
- data/Rakefile +2 -2
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/examples/dataset.rb +8 -0
- data/examples/multiple_regression.rb +1 -1
- data/examples/parallel_analysis.rb +29 -0
- data/examples/parallel_analysis_tetrachoric.rb +30 -0
- data/examples/vector.rb +6 -0
- data/lib/distribution.rb +16 -6
- data/lib/distribution/normal.rb +27 -20
- data/lib/distribution/normalbivariate.rb +1 -1
- data/lib/statsample.rb +19 -2
- data/lib/statsample/anova.rb +118 -16
- data/lib/statsample/bivariate.rb +27 -13
- data/lib/statsample/bivariate/polychoric.rb +18 -5
- data/lib/statsample/crosstab.rb +66 -74
- data/lib/statsample/dataset.rb +52 -45
- data/lib/statsample/dominanceanalysis.rb +2 -5
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/factor/parallelanalysis.rb +122 -0
- data/lib/statsample/factor/pca.rb +23 -28
- data/lib/statsample/factor/principalaxis.rb +8 -3
- data/lib/statsample/matrix.rb +27 -24
- data/lib/statsample/mle.rb +11 -11
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression.rb +10 -8
- data/lib/statsample/regression/multiple/baseengine.rb +36 -25
- data/lib/statsample/regression/multiple/gslengine.rb +14 -0
- data/lib/statsample/regression/multiple/matrixengine.rb +4 -32
- data/lib/statsample/regression/multiple/rubyengine.rb +2 -6
- data/lib/statsample/regression/simple.rb +1 -1
- data/lib/statsample/reliability.rb +42 -54
- data/lib/statsample/test.rb +10 -6
- data/lib/statsample/test/f.rb +16 -26
- data/lib/statsample/test/levene.rb +4 -8
- data/lib/statsample/test/t.rb +30 -24
- data/lib/statsample/test/umannwhitney.rb +13 -6
- data/lib/statsample/vector.rb +86 -76
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +127 -94
- data/po/statsample.pot +114 -79
- data/test/test_anovaoneway.rb +27 -0
- data/test/test_anovawithvectors.rb +97 -0
- data/test/test_bivariate.rb +6 -57
- data/test/test_bivariate_polychoric.rb +65 -0
- data/test/test_crosstab.rb +6 -0
- data/test/test_dataset.rb +29 -1
- data/test/test_distribution.rb +6 -13
- data/test/test_dominance_analysis.rb +1 -1
- data/test/test_factor.rb +3 -3
- data/test/test_helpers.rb +18 -18
- data/test/test_matrix.rb +33 -20
- data/test/test_permutation.rb +36 -30
- data/test/test_regression.rb +26 -8
- data/test/test_reliability.rb +104 -14
- data/test/test_test_f.rb +11 -14
- data/test/test_test_t.rb +42 -35
- data/test/test_umannwhitney.rb +22 -10
- data/test/test_vector.rb +204 -102
- metadata +57 -81
- metadata.gz.sig +0 -0
- data/test/test_anova.rb +0 -24
data/lib/statsample/test/f.rb
CHANGED
@@ -3,34 +3,27 @@ module Statsample
|
|
3
3
|
# From Wikipedia:
|
4
4
|
# An F-test is any statistical test in which the test statistic has an F-distribution under the null hypothesis. It is most often used when comparing statistical models that have been fit to a data set, in order to identify the model that best fits the population from which the data were sampled.
|
5
5
|
class F
|
6
|
-
include GetText
|
7
|
-
bindtextdomain("statsample")
|
8
|
-
|
9
6
|
include Statsample::Test
|
10
7
|
|
11
|
-
attr_reader :
|
8
|
+
attr_reader :var_num, :var_den, :df_num, :df_den, :var_total, :df_total
|
12
9
|
# Tails for probability (:both, :left or :right)
|
13
10
|
attr_accessor :tails
|
14
11
|
# Name of F analysis
|
15
12
|
attr_accessor :name
|
16
|
-
# Name of numerator
|
17
|
-
attr_accessor :name_numerator
|
18
|
-
# Name of denominator
|
19
|
-
attr_accessor :name_denominator
|
20
13
|
|
21
14
|
# Parameters:
|
22
|
-
# *
|
23
|
-
# *
|
24
|
-
# * df_num: degrees of freedom
|
25
|
-
# * df_den: degrees of freedom
|
26
|
-
def initialize(
|
27
|
-
@
|
28
|
-
@
|
15
|
+
# * var_num: variance numerator
|
16
|
+
# * var_den: variance denominator
|
17
|
+
# * df_num: degrees of freedom numerator
|
18
|
+
# * df_den: degrees of freedom denominator
|
19
|
+
def initialize(var_num, var_den, df_num, df_den, opts=Hash.new)
|
20
|
+
@var_num=var_num
|
21
|
+
@var_den=var_den
|
29
22
|
@df_num=df_num
|
30
23
|
@df_den=df_den
|
31
|
-
@
|
24
|
+
@var_total=var_num+var_den
|
32
25
|
@df_total=df_num+df_den
|
33
|
-
opts_default={:tails=>:right, :
|
26
|
+
opts_default={:tails=>:right, :name=>"F Test"}
|
34
27
|
@opts=opts_default.merge(opts)
|
35
28
|
raise "Tails should be right or left, not both" if @opts[:tails]==:both
|
36
29
|
opts_default.keys.each {|k|
|
@@ -41,20 +34,17 @@ module Statsample
|
|
41
34
|
ReportBuilder.new(:no_title=>true).add(self).to_text
|
42
35
|
end
|
43
36
|
def f
|
44
|
-
|
37
|
+
@var_num.quo(@var_den)
|
38
|
+
end
|
39
|
+
def to_f
|
40
|
+
f
|
45
41
|
end
|
46
42
|
# probability
|
47
43
|
def probability
|
48
44
|
p_using_cdf(Distribution::F.cdf(f, @df_num, @df_den), tails)
|
49
45
|
end
|
50
|
-
def report_building(builder)#:nodoc:
|
51
|
-
builder.
|
52
|
-
b.table(:name=>_("%s Table") % @name, :header=>%w{source ss df f p}.map {|v| _(v)}) do |t|
|
53
|
-
t.row([@name_numerator, sprintf("%0.3f",@ss_num), @df_num, sprintf("%0.3f",f), sprintf("%0.3f", probability)])
|
54
|
-
t.row([@name_denominator, sprintf("%0.3f",@ss_den), @df_den, "", ""])
|
55
|
-
t.row([_("Total"), sprintf("%0.3f",@ss_total), @df_total,"",""])
|
56
|
-
end
|
57
|
-
end
|
46
|
+
def report_building(builder) #:nodoc:
|
47
|
+
builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @df_num, @df_den, f, probability]
|
58
48
|
end
|
59
49
|
end
|
60
50
|
end
|
@@ -19,6 +19,7 @@ module Statsample
|
|
19
19
|
# Reference:
|
20
20
|
# * NIST/SEMATECH e-Handbook of Statistical Methods. Available on http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
|
21
21
|
class Levene
|
22
|
+
include Statsample::Test
|
22
23
|
# Degrees of freedom 1 (k-1)
|
23
24
|
attr_reader :d1
|
24
25
|
# Degrees of freedom 2 (n-k)
|
@@ -42,18 +43,13 @@ module Statsample
|
|
42
43
|
def f
|
43
44
|
@w
|
44
45
|
end
|
45
|
-
|
46
|
-
|
47
|
-
g.text @name
|
48
|
-
g.text "F: #{"%0.4f" % f}"
|
49
|
-
g.text "p: #{"%0.4f" % probability}"
|
50
|
-
|
46
|
+
def report_building(builder) # :nodoc:
|
47
|
+
builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability]
|
51
48
|
end
|
52
49
|
# Summary of results
|
53
50
|
def summary
|
54
51
|
ReportBuilder.new(:no_title=>true).add(self).to_text
|
55
52
|
end
|
56
|
-
|
57
53
|
def compute
|
58
54
|
n=@vectors.inject(0) {|ac,v| ac+v.n_valid}
|
59
55
|
|
@@ -86,7 +82,7 @@ module Statsample
|
|
86
82
|
# Probability.
|
87
83
|
# With H_0 = Sum(s2)=0, probability of getting a value of the test greater than or equal to the one obtained on the sample
|
88
84
|
def probability
|
89
|
-
|
85
|
+
p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right)
|
90
86
|
end
|
91
87
|
|
92
88
|
end
|
data/lib/statsample/test/t.rb
CHANGED
@@ -31,7 +31,7 @@ module Statsample
|
|
31
31
|
end
|
32
32
|
num.quo(den)
|
33
33
|
end
|
34
|
-
|
34
|
+
# Degrees of freedom for equal variance on t test
|
35
35
|
def df_equal_variance(n1,n2)
|
36
36
|
n1+n2-2
|
37
37
|
end
|
@@ -67,7 +67,6 @@ module Statsample
|
|
67
67
|
class OneSample
|
68
68
|
include Math
|
69
69
|
include Statsample::Test
|
70
|
-
include DirtyMemoize
|
71
70
|
# Options
|
72
71
|
attr_accessor :opts
|
73
72
|
# Name of test
|
@@ -76,15 +75,9 @@ module Statsample
|
|
76
75
|
attr_accessor :u
|
77
76
|
# Degrees of freedom
|
78
77
|
attr_reader :df
|
79
|
-
# Value of t
|
80
|
-
attr_reader :t
|
81
|
-
# Probability
|
82
|
-
attr_reader :probability
|
83
78
|
# Tails for probability (:both, :left or :right)
|
84
79
|
attr_accessor :tails
|
85
80
|
|
86
|
-
dirty_writer :u, :tails
|
87
|
-
dirty_memoize :t, :probability
|
88
81
|
# Create a One Sample T Test
|
89
82
|
# Options:
|
90
83
|
# * :u = Mean to compare. Default= 0
|
@@ -100,14 +93,14 @@ module Statsample
|
|
100
93
|
@df= @vector.n_valid-1
|
101
94
|
@t=nil
|
102
95
|
end
|
103
|
-
|
96
|
+
def t
|
97
|
+
T.one_sample(@vector.mean, @u, @vector.sd, @vector.n_valid)
|
98
|
+
end
|
104
99
|
|
105
|
-
|
106
|
-
|
107
|
-
@t = T.one_sample(@vector.mean, @u, @vector.sd, @vector.n_valid)
|
108
|
-
@probability = p_using_cdf(Distribution::T.cdf(@t, @df), tails)
|
100
|
+
def probability
|
101
|
+
p_using_cdf(Distribution::T.cdf(t, @df), tails)
|
109
102
|
end
|
110
|
-
#
|
103
|
+
# Summary of analysis
|
111
104
|
#
|
112
105
|
def summary
|
113
106
|
ReportBuilder.new(:no_title=>true).add(self).to_text
|
@@ -154,6 +147,8 @@ module Statsample
|
|
154
147
|
include Math
|
155
148
|
include Statsample::Test
|
156
149
|
include DirtyMemoize
|
150
|
+
include GetText
|
151
|
+
bindtextdomain("statsample")
|
157
152
|
# Options
|
158
153
|
attr_accessor :opts
|
159
154
|
# Name of test
|
@@ -204,25 +199,36 @@ module Statsample
|
|
204
199
|
@probability_not_equal_variance = p_using_cdf(Distribution::T.cdf(@t_not_equal_variance, @df_not_equal_variance), tails)
|
205
200
|
|
206
201
|
end
|
202
|
+
# Cohen's d is a measure of effect size. It is defined as the difference between two means divided by a standard deviation for the data
|
203
|
+
def d
|
204
|
+
n1=@v1.n_valid
|
205
|
+
n2=@v2.n_valid
|
206
|
+
num=@v1.mean-@v2.mean
|
207
|
+
den=Math::sqrt( ((n1-1)*@v1.sd+(n2-1)*@v2.sd).quo(n1+n2))
|
208
|
+
num.quo(den)
|
209
|
+
end
|
210
|
+
|
207
211
|
# Presents summary of analysis
|
208
|
-
#
|
209
212
|
def summary
|
210
213
|
ReportBuilder.new(:no_title=>true).add(self).to_text
|
211
214
|
end
|
215
|
+
|
212
216
|
def report_building(b) # :nodoc:
|
213
217
|
b.section(:name=>@name) {|g|
|
214
|
-
g.table(:name=>"Mean and standard deviation", :header=>["Variable", "m", "sd","n"]) {|t|
|
215
|
-
t.row([
|
216
|
-
t.row([
|
217
|
-
}
|
218
|
-
g.section(:name=>"Levene Test") {|g1|
|
219
|
-
g1.parse_element(Statsample::Test.levene([@v1,@v2]))
|
218
|
+
g.table(:name=>_("Mean and standard deviation"), :header=>["Variable", "m", "sd","n"]) {|t|
|
219
|
+
t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd,@v1.n_valid])
|
220
|
+
t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.n_valid])
|
220
221
|
}
|
222
|
+
g.parse_element(Statsample::Test.levene([@v1,@v2],:name=>_("Levene test for equality of variances")))
|
221
223
|
|
222
|
-
g.table(:name=>"T statistics",:header=>["Type","t","df", "p (#{tails} tails)"]) {|t|
|
223
|
-
t.row(["Equal variance", "%0.4f" % t_equal_variance, df_equal_variance, "%0.4f" % probability_equal_variance])
|
224
|
-
t.row(["Non equal variance", "%0.4f" % t_not_equal_variance, "%0.4f" % df_not_equal_variance, "%0.4f" % probability_not_equal_variance])
|
224
|
+
g.table(:name=>_("T statistics"),:header=>["Type","t","df", "p (#{tails} tails)"].map{|v| _(v)}) {|t|
|
225
|
+
t.row([_("Equal variance"), "%0.4f" % t_equal_variance, df_equal_variance, "%0.4f" % probability_equal_variance])
|
226
|
+
t.row([_("Non equal variance"), "%0.4f" % t_not_equal_variance, "%0.4f" % df_not_equal_variance, "%0.4f" % probability_not_equal_variance])
|
225
227
|
}
|
228
|
+
g.table(:name=>_("Effect size")) do |t|
|
229
|
+
t.row ['x1-x2', "%0.4f" % (@v1.mean-@v2.mean)]
|
230
|
+
t.row ['d', "%0.4f" % d]
|
231
|
+
end
|
226
232
|
}
|
227
233
|
end
|
228
234
|
end
|
@@ -107,14 +107,15 @@ module Statsample
|
|
107
107
|
attr_reader :u
|
108
108
|
# Value of compensation for ties (useful for demonstration)
|
109
109
|
attr_reader :t
|
110
|
+
# Name of test
|
111
|
+
attr_accessor :name
|
110
112
|
#
|
111
113
|
# Create a new U Mann-Whitney test
|
112
114
|
# Params: Two Statsample::Vectors
|
113
115
|
#
|
114
|
-
def initialize(v1,v2)
|
116
|
+
def initialize(v1,v2, opts=Hash.new)
|
115
117
|
@n1=v1.valid_data.size
|
116
118
|
@n2=v2.valid_data.size
|
117
|
-
|
118
119
|
data=(v1.valid_data+v2.valid_data).to_scale
|
119
120
|
groups=(([0]*@n1)+([1]*@n2)).to_vector
|
120
121
|
ds={'g'=>groups, 'data'=>data}.to_dataset
|
@@ -132,11 +133,17 @@ module Statsample
|
|
132
133
|
@u1=r1-((@n1*(@n1+1)).quo(2))
|
133
134
|
@u2=r2-((@n2*(@n2+1)).quo(2))
|
134
135
|
@u=(u1<u2) ? u1 : u2
|
136
|
+
opts_default={:name=>"Mann-Whitney's U"}
|
137
|
+
@opts=opts_default.merge(opts)
|
138
|
+
opts_default.keys.each {|k|
|
139
|
+
send("#{k}=", @opts[k])
|
140
|
+
}
|
141
|
+
|
135
142
|
end
|
136
143
|
# Report results.
|
137
144
|
def summary
|
138
145
|
out=<<-HEREDOC
|
139
|
-
|
146
|
+
@name
|
140
147
|
Sum of ranks v1: #{@r1.to_f}
|
141
148
|
Sum of ranks v1: #{@r2.to_f}
|
142
149
|
U Value: #{@u.to_f}
|
@@ -152,7 +159,7 @@ Z: #{sprintf("%0.3f",z)} (p: #{sprintf("%0.3f",z_probability)})
|
|
152
159
|
end
|
153
160
|
# Exact probability of finding values of U lower or equal to sample on U distribution. Use with caution with m*n>100000.
|
154
161
|
# Uses u_sampling_distribution_as62
|
155
|
-
def
|
162
|
+
def probability_exact
|
156
163
|
dist=UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
|
157
164
|
sum=0
|
158
165
|
(0..@u.to_i).each {|i|
|
@@ -190,9 +197,9 @@ Z: #{sprintf("%0.3f",z)} (p: #{sprintf("%0.3f",z_probability)})
|
|
190
197
|
(@u-mu).quo(ou)
|
191
198
|
end
|
192
199
|
# Assuming H_0, the proportion of cdf with values of U lower
|
193
|
-
# than the sample.
|
200
|
+
# than the sample, using normal approximation.
|
194
201
|
# Use with more than 30 cases per group.
|
195
|
-
def
|
202
|
+
def probability_z
|
196
203
|
(1-Distribution::Normal.cdf(z.abs()))*2
|
197
204
|
end
|
198
205
|
end
|
data/lib/statsample/vector.rb
CHANGED
@@ -7,7 +7,7 @@ class Array
|
|
7
7
|
end
|
8
8
|
# Creates a new Statsample::Vector object of type :scale
|
9
9
|
def to_scale(*args)
|
10
|
-
Statsample::Vector.new(self
|
10
|
+
Statsample::Vector.new(self, :scale,*args)
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
@@ -24,12 +24,7 @@ module Statsample
|
|
24
24
|
class Vector
|
25
25
|
include Enumerable
|
26
26
|
include Writable
|
27
|
-
|
28
|
-
DEFAULT_OPTIONS={
|
29
|
-
:missing_values=>[],
|
30
|
-
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
31
|
-
:labels=>{}
|
32
|
-
}
|
27
|
+
include Summarizable
|
33
28
|
# Level of measurement. Could be :nominal, :ordinal or :scale
|
34
29
|
attr_reader :type
|
35
30
|
# Original data.
|
@@ -50,23 +45,39 @@ module Statsample
|
|
50
45
|
attr_reader :gsl
|
51
46
|
# Change label for specific values
|
52
47
|
attr_accessor :labels
|
48
|
+
# Name of vector. Should be used for output by many classes
|
49
|
+
attr_accessor :name
|
50
|
+
|
53
51
|
#
|
54
52
|
# Creates a new Vector object.
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
53
|
+
# * <tt>data</tt> Array of data.
|
54
|
+
# * <tt>type</tt> Level of measurement. See Vector#type
|
55
|
+
# * <tt>opts</tt> Hash of options
|
56
|
+
# * <tt>:missing_values</tt> Array of missing values. See Vector#missing_values
|
57
|
+
# * <tt>:today_values</tt> Array of 'today' values. See Vector#today_values
|
58
|
+
# * <tt>:labels</tt> Labels for data values
|
59
|
+
# * <tt>:name</tt> Name of vector
|
61
60
|
#
|
62
61
|
def initialize(data=[], type=:nominal, opts=Hash.new)
|
63
62
|
raise "Data should be an array" unless data.is_a? Array
|
64
63
|
@data=data
|
65
64
|
@type=type
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
65
|
+
opts_default={
|
66
|
+
:missing_values=>[],
|
67
|
+
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
68
|
+
:labels=>{},
|
69
|
+
:name=>nil
|
70
|
+
}
|
71
|
+
@opts=opts_default.merge(opts)
|
72
|
+
if @opts[:name].nil?
|
73
|
+
@@n_table||=0
|
74
|
+
@@n_table+=1
|
75
|
+
@opts[:name]="Vector #{@@n_table}"
|
76
|
+
end
|
77
|
+
@missing_values=@opts[:missing_values]
|
78
|
+
@labels=@opts[:labels]
|
79
|
+
@today_values=@opts[:today_values]
|
80
|
+
@name=@opts[:name]
|
70
81
|
@valid_data=[]
|
71
82
|
@data_with_nils=[]
|
72
83
|
@date_data_with_nils=[]
|
@@ -80,12 +91,12 @@ module Statsample
|
|
80
91
|
# Note: data, missing_values and labels are duplicated, so
|
81
92
|
# changes on the original vector don't propagate to copies.
|
82
93
|
def dup
|
83
|
-
|
94
|
+
Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name.dup)
|
84
95
|
end
|
85
96
|
# Returns an empty duplicate of the vector. Maintains the type,
|
86
97
|
# missing values and labels.
|
87
98
|
def dup_empty
|
88
|
-
|
99
|
+
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name.dup)
|
89
100
|
end
|
90
101
|
# Raises an exception if type of vector is inferior to t type
|
91
102
|
def check_type(t)
|
@@ -128,8 +139,8 @@ module Statsample
|
|
128
139
|
# Vector equality.
|
129
140
|
# Two vectors will be the same if their data, missing values, type and labels are equal
|
130
141
|
def ==(v2)
|
131
|
-
|
132
|
-
|
142
|
+
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
143
|
+
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
|
133
144
|
end
|
134
145
|
|
135
146
|
def _dump(i) # :nodoc:
|
@@ -189,8 +200,8 @@ module Statsample
|
|
189
200
|
# Vector.set_valid_data at the end of your insertion cycle
|
190
201
|
#
|
191
202
|
def add(v,update_valid=true)
|
192
|
-
|
193
|
-
|
203
|
+
@data.push(v)
|
204
|
+
set_valid_data if update_valid
|
194
205
|
end
|
195
206
|
# Update valid_data, missing_data, data_with_nils and gsl
|
196
207
|
# at the end of an insertion.
|
@@ -208,14 +219,14 @@ module Statsample
|
|
208
219
|
# v.valid_data
|
209
220
|
# => [2,3]
|
210
221
|
def set_valid_data
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
222
|
+
@valid_data.clear
|
223
|
+
@missing_data.clear
|
224
|
+
@data_with_nils.clear
|
225
|
+
@date_data_with_nils.clear
|
226
|
+
@gsl=nil
|
227
|
+
set_valid_data_intern
|
228
|
+
set_scale_data if(@type==:scale)
|
229
|
+
set_date_data if(@type==:date)
|
219
230
|
end
|
220
231
|
|
221
232
|
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
@@ -228,21 +239,21 @@ module Statsample
|
|
228
239
|
end
|
229
240
|
end
|
230
241
|
def _set_valid_data_intern #:nodoc:
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
242
|
+
@data.each do |n|
|
243
|
+
if is_valid? n
|
244
|
+
@valid_data.push(n)
|
245
|
+
@data_with_nils.push(n)
|
246
|
+
else
|
247
|
+
@data_with_nils.push(nil)
|
248
|
+
@missing_data.push(n)
|
249
|
+
end
|
238
250
|
end
|
239
|
-
|
240
|
-
@has_missing_data=@missing_data.size>0
|
251
|
+
@has_missing_data=@missing_data.size>0
|
241
252
|
end
|
242
253
|
|
243
254
|
# Retrieves true if data has one or more missing values
|
244
255
|
def has_missing_data?
|
245
|
-
|
256
|
+
@has_missing_data
|
246
257
|
end
|
247
258
|
# Retrieves label for value x. Retrieves x if
|
248
259
|
# no label defined.
|
@@ -251,14 +262,14 @@ module Statsample
|
|
251
262
|
end
|
252
263
|
# Returns a Vector with data with labels replaced by the label.
|
253
264
|
def vector_labeled
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
265
|
+
d=@data.collect{|x|
|
266
|
+
if @labels.has_key? x
|
267
|
+
@labels[x]
|
268
|
+
else
|
269
|
+
x
|
270
|
+
end
|
271
|
+
}
|
272
|
+
Vector.new(d,@type)
|
262
273
|
end
|
263
274
|
# Size of total data
|
264
275
|
def size
|
@@ -427,13 +438,13 @@ module Statsample
|
|
427
438
|
# In all the trials, every item has the same probability
|
428
439
|
# of being selected.
|
429
440
|
def sample_with_replacement(sample=1)
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
441
|
+
if(@type!=:scale or !Statsample.has_gsl?)
|
442
|
+
vds=@valid_data.size
|
443
|
+
(0...sample).collect{ @valid_data[rand(vds)] }
|
444
|
+
else
|
445
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
446
|
+
r.sample(@gsl, sample).to_a
|
447
|
+
end
|
437
448
|
end
|
438
449
|
# Returns an random sample of size n, without replacement,
|
439
450
|
# only with valid data.
|
@@ -597,24 +608,24 @@ module Statsample
|
|
597
608
|
def proportion(v=1)
|
598
609
|
frequencies[v].quo(@valid_data.size)
|
599
610
|
end
|
600
|
-
def
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
611
|
+
def report_building(b)
|
612
|
+
b.section(:name=>name) do |s|
|
613
|
+
s.text _("n :%d") % n
|
614
|
+
s.text _("n valid:%d") % n_valid
|
615
|
+
s.text _("factors:%s") % factors.join(",")
|
616
|
+
s.text _("mode: %s") % mode
|
617
|
+
s.table(:name=>_("Distribution")) do |t|
|
618
|
+
frequencies.sort.each do |k,v|
|
619
|
+
key=labels.has_key?(k) ? labels[k]:k
|
620
|
+
t.row [key,v, ("%0.2f%%" % (v.quo(n_valid)*100))]
|
621
|
+
end
|
622
|
+
end
|
623
|
+
s.text _("median: %s") % median.to_s if(@type==:ordinal)
|
624
|
+
if(@type==:scale)
|
625
|
+
s.text _("mean: %0.4f") % mean
|
626
|
+
s.text _("sd: %0.4f") % sd.to_s
|
627
|
+
end
|
616
628
|
end
|
617
|
-
out
|
618
629
|
end
|
619
630
|
|
620
631
|
# Variance of p, according to population size
|
@@ -817,8 +828,7 @@ module Statsample
|
|
817
828
|
@gsl.mean
|
818
829
|
end
|
819
830
|
def variance_sample(m=nil) # :nodoc:
|
820
|
-
|
821
|
-
|
831
|
+
check_type :scale
|
822
832
|
m||=mean
|
823
833
|
@gsl.variance_m
|
824
834
|
end
|
@@ -881,7 +891,7 @@ module Statsample
|
|
881
891
|
alias_method :sdp, :standard_deviation_population
|
882
892
|
alias_method :sds, :standard_deviation_sample
|
883
893
|
alias_method :cov, :coefficient_of_variation
|
884
|
-
alias_method :variance, :variance_sample
|
894
|
+
alias_method :variance, :variance_sample
|
885
895
|
alias_method :sd, :standard_deviation_sample
|
886
896
|
alias_method :ss, :sum_of_squares
|
887
897
|
end
|