statsample 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +20 -1
- data/Manifest.txt +8 -1
- data/README.txt +11 -7
- data/Rakefile +2 -2
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/examples/dataset.rb +8 -0
- data/examples/multiple_regression.rb +1 -1
- data/examples/parallel_analysis.rb +29 -0
- data/examples/parallel_analysis_tetrachoric.rb +30 -0
- data/examples/vector.rb +6 -0
- data/lib/distribution.rb +16 -6
- data/lib/distribution/normal.rb +27 -20
- data/lib/distribution/normalbivariate.rb +1 -1
- data/lib/statsample.rb +19 -2
- data/lib/statsample/anova.rb +118 -16
- data/lib/statsample/bivariate.rb +27 -13
- data/lib/statsample/bivariate/polychoric.rb +18 -5
- data/lib/statsample/crosstab.rb +66 -74
- data/lib/statsample/dataset.rb +52 -45
- data/lib/statsample/dominanceanalysis.rb +2 -5
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/factor/parallelanalysis.rb +122 -0
- data/lib/statsample/factor/pca.rb +23 -28
- data/lib/statsample/factor/principalaxis.rb +8 -3
- data/lib/statsample/matrix.rb +27 -24
- data/lib/statsample/mle.rb +11 -11
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression.rb +10 -8
- data/lib/statsample/regression/multiple/baseengine.rb +36 -25
- data/lib/statsample/regression/multiple/gslengine.rb +14 -0
- data/lib/statsample/regression/multiple/matrixengine.rb +4 -32
- data/lib/statsample/regression/multiple/rubyengine.rb +2 -6
- data/lib/statsample/regression/simple.rb +1 -1
- data/lib/statsample/reliability.rb +42 -54
- data/lib/statsample/test.rb +10 -6
- data/lib/statsample/test/f.rb +16 -26
- data/lib/statsample/test/levene.rb +4 -8
- data/lib/statsample/test/t.rb +30 -24
- data/lib/statsample/test/umannwhitney.rb +13 -6
- data/lib/statsample/vector.rb +86 -76
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +127 -94
- data/po/statsample.pot +114 -79
- data/test/test_anovaoneway.rb +27 -0
- data/test/test_anovawithvectors.rb +97 -0
- data/test/test_bivariate.rb +6 -57
- data/test/test_bivariate_polychoric.rb +65 -0
- data/test/test_crosstab.rb +6 -0
- data/test/test_dataset.rb +29 -1
- data/test/test_distribution.rb +6 -13
- data/test/test_dominance_analysis.rb +1 -1
- data/test/test_factor.rb +3 -3
- data/test/test_helpers.rb +18 -18
- data/test/test_matrix.rb +33 -20
- data/test/test_permutation.rb +36 -30
- data/test/test_regression.rb +26 -8
- data/test/test_reliability.rb +104 -14
- data/test/test_test_f.rb +11 -14
- data/test/test_test_t.rb +42 -35
- data/test/test_umannwhitney.rb +22 -10
- data/test/test_vector.rb +204 -102
- metadata +57 -81
- metadata.gz.sig +0 -0
- data/test/test_anova.rb +0 -24
data/lib/statsample/test/f.rb
CHANGED
@@ -3,34 +3,27 @@ module Statsample
|
|
3
3
|
# From Wikipedia:
|
4
4
|
# An F-test is any statistical test in which the test statistic has an F-distribution under the null hypothesis. It is most often used when comparing statistical models that have been fit to a data set, in order to identify the model that best fits the population from which the data were sampled.
|
5
5
|
class F
|
6
|
-
include GetText
|
7
|
-
bindtextdomain("statsample")
|
8
|
-
|
9
6
|
include Statsample::Test
|
10
7
|
|
11
|
-
attr_reader :
|
8
|
+
attr_reader :var_num, :var_den, :df_num, :df_den, :var_total, :df_total
|
12
9
|
# Tails for probability (:both, :left or :right)
|
13
10
|
attr_accessor :tails
|
14
11
|
# Name of F analysis
|
15
12
|
attr_accessor :name
|
16
|
-
# Name of numerator
|
17
|
-
attr_accessor :name_numerator
|
18
|
-
# Name of denominator
|
19
|
-
attr_accessor :name_denominator
|
20
13
|
|
21
14
|
# Parameters:
|
22
|
-
# *
|
23
|
-
# *
|
24
|
-
# * df_num: degrees of freedom
|
25
|
-
# * df_den: degrees of freedom
|
26
|
-
def initialize(
|
27
|
-
@
|
28
|
-
@
|
15
|
+
# * var_num: variance numerator
|
16
|
+
# * var_den: variance denominator
|
17
|
+
# * df_num: degrees of freedom numerator
|
18
|
+
# * df_den: degrees of freedom denominator
|
19
|
+
def initialize(var_num, var_den, df_num, df_den, opts=Hash.new)
|
20
|
+
@var_num=var_num
|
21
|
+
@var_den=var_den
|
29
22
|
@df_num=df_num
|
30
23
|
@df_den=df_den
|
31
|
-
@
|
24
|
+
@var_total=var_num+var_den
|
32
25
|
@df_total=df_num+df_den
|
33
|
-
opts_default={:tails=>:right, :
|
26
|
+
opts_default={:tails=>:right, :name=>"F Test"}
|
34
27
|
@opts=opts_default.merge(opts)
|
35
28
|
raise "Tails should be right or left, not both" if @opts[:tails]==:both
|
36
29
|
opts_default.keys.each {|k|
|
@@ -41,20 +34,17 @@ module Statsample
|
|
41
34
|
ReportBuilder.new(:no_title=>true).add(self).to_text
|
42
35
|
end
|
43
36
|
def f
|
44
|
-
|
37
|
+
@var_num.quo(@var_den)
|
38
|
+
end
|
39
|
+
def to_f
|
40
|
+
f
|
45
41
|
end
|
46
42
|
# probability
|
47
43
|
def probability
|
48
44
|
p_using_cdf(Distribution::F.cdf(f, @df_num, @df_den), tails)
|
49
45
|
end
|
50
|
-
def report_building(builder)#:nodoc:
|
51
|
-
builder.
|
52
|
-
b.table(:name=>_("%s Table") % @name, :header=>%w{source ss df f p}.map {|v| _(v)}) do |t|
|
53
|
-
t.row([@name_numerator, sprintf("%0.3f",@ss_num), @df_num, sprintf("%0.3f",f), sprintf("%0.3f", probability)])
|
54
|
-
t.row([@name_denominator, sprintf("%0.3f",@ss_den), @df_den, "", ""])
|
55
|
-
t.row([_("Total"), sprintf("%0.3f",@ss_total), @df_total,"",""])
|
56
|
-
end
|
57
|
-
end
|
46
|
+
def report_building(builder) #:nodoc:
|
47
|
+
builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @df_num, @df_den, f, probability]
|
58
48
|
end
|
59
49
|
end
|
60
50
|
end
|
@@ -19,6 +19,7 @@ module Statsample
|
|
19
19
|
# Reference:
|
20
20
|
# * NIST/SEMATECH e-Handbook of Statistical Methods. Available on http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
|
21
21
|
class Levene
|
22
|
+
include Statsample::Test
|
22
23
|
# Degrees of freedom 1 (k-1)
|
23
24
|
attr_reader :d1
|
24
25
|
# Degrees of freedom 2 (n-k)
|
@@ -42,18 +43,13 @@ module Statsample
|
|
42
43
|
def f
|
43
44
|
@w
|
44
45
|
end
|
45
|
-
|
46
|
-
|
47
|
-
g.text @name
|
48
|
-
g.text "F: #{"%0.4f" % f}"
|
49
|
-
g.text "p: #{"%0.4f" % probability}"
|
50
|
-
|
46
|
+
def report_building(builder) # :nodoc:
|
47
|
+
builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability]
|
51
48
|
end
|
52
49
|
# Summary of results
|
53
50
|
def summary
|
54
51
|
ReportBuilder.new(:no_title=>true).add(self).to_text
|
55
52
|
end
|
56
|
-
|
57
53
|
def compute
|
58
54
|
n=@vectors.inject(0) {|ac,v| ac+v.n_valid}
|
59
55
|
|
@@ -86,7 +82,7 @@ module Statsample
|
|
86
82
|
# Probability.
|
87
83
|
# With H_0 = Sum(s2)=0, probability of getting a value of the test greater than or equal to the one obtained on the sample
|
88
84
|
def probability
|
89
|
-
|
85
|
+
p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right)
|
90
86
|
end
|
91
87
|
|
92
88
|
end
|
data/lib/statsample/test/t.rb
CHANGED
@@ -31,7 +31,7 @@ module Statsample
|
|
31
31
|
end
|
32
32
|
num.quo(den)
|
33
33
|
end
|
34
|
-
|
34
|
+
# Degrees of freedom for equal variance on t test
|
35
35
|
def df_equal_variance(n1,n2)
|
36
36
|
n1+n2-2
|
37
37
|
end
|
@@ -67,7 +67,6 @@ module Statsample
|
|
67
67
|
class OneSample
|
68
68
|
include Math
|
69
69
|
include Statsample::Test
|
70
|
-
include DirtyMemoize
|
71
70
|
# Options
|
72
71
|
attr_accessor :opts
|
73
72
|
# Name of test
|
@@ -76,15 +75,9 @@ module Statsample
|
|
76
75
|
attr_accessor :u
|
77
76
|
# Degrees of freedom
|
78
77
|
attr_reader :df
|
79
|
-
# Value of t
|
80
|
-
attr_reader :t
|
81
|
-
# Probability
|
82
|
-
attr_reader :probability
|
83
78
|
# Tails for probability (:both, :left or :right)
|
84
79
|
attr_accessor :tails
|
85
80
|
|
86
|
-
dirty_writer :u, :tails
|
87
|
-
dirty_memoize :t, :probability
|
88
81
|
# Create a One Sample T Test
|
89
82
|
# Options:
|
90
83
|
# * :u = Mean to compare. Default= 0
|
@@ -100,14 +93,14 @@ module Statsample
|
|
100
93
|
@df= @vector.n_valid-1
|
101
94
|
@t=nil
|
102
95
|
end
|
103
|
-
|
96
|
+
def t
|
97
|
+
T.one_sample(@vector.mean, @u, @vector.sd, @vector.n_valid)
|
98
|
+
end
|
104
99
|
|
105
|
-
|
106
|
-
|
107
|
-
@t = T.one_sample(@vector.mean, @u, @vector.sd, @vector.n_valid)
|
108
|
-
@probability = p_using_cdf(Distribution::T.cdf(@t, @df), tails)
|
100
|
+
def probability
|
101
|
+
p_using_cdf(Distribution::T.cdf(t, @df), tails)
|
109
102
|
end
|
110
|
-
#
|
103
|
+
# Summary of analysis
|
111
104
|
#
|
112
105
|
def summary
|
113
106
|
ReportBuilder.new(:no_title=>true).add(self).to_text
|
@@ -154,6 +147,8 @@ module Statsample
|
|
154
147
|
include Math
|
155
148
|
include Statsample::Test
|
156
149
|
include DirtyMemoize
|
150
|
+
include GetText
|
151
|
+
bindtextdomain("statsample")
|
157
152
|
# Options
|
158
153
|
attr_accessor :opts
|
159
154
|
# Name of test
|
@@ -204,25 +199,36 @@ module Statsample
|
|
204
199
|
@probability_not_equal_variance = p_using_cdf(Distribution::T.cdf(@t_not_equal_variance, @df_not_equal_variance), tails)
|
205
200
|
|
206
201
|
end
|
202
|
+
# Cohen's d is a measure of effect size. It is defined as the difference between two means divided by a standard deviation for the data
|
203
|
+
def d
|
204
|
+
n1=@v1.n_valid
|
205
|
+
n2=@v2.n_valid
|
206
|
+
num=@v1.mean-@v2.mean
|
207
|
+
den=Math::sqrt( ((n1-1)*@v1.sd+(n2-1)*@v2.sd).quo(n1+n2))
|
208
|
+
num.quo(den)
|
209
|
+
end
|
210
|
+
|
207
211
|
# Presents summary of analysis
|
208
|
-
#
|
209
212
|
def summary
|
210
213
|
ReportBuilder.new(:no_title=>true).add(self).to_text
|
211
214
|
end
|
215
|
+
|
212
216
|
def report_building(b) # :nodoc:
|
213
217
|
b.section(:name=>@name) {|g|
|
214
|
-
g.table(:name=>"Mean and standard deviation", :header=>["Variable", "m", "sd","n"]) {|t|
|
215
|
-
t.row([
|
216
|
-
t.row([
|
217
|
-
}
|
218
|
-
g.section(:name=>"Levene Test") {|g1|
|
219
|
-
g1.parse_element(Statsample::Test.levene([@v1,@v2]))
|
218
|
+
g.table(:name=>_("Mean and standard deviation"), :header=>["Variable", "m", "sd","n"]) {|t|
|
219
|
+
t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd,@v1.n_valid])
|
220
|
+
t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.n_valid])
|
220
221
|
}
|
222
|
+
g.parse_element(Statsample::Test.levene([@v1,@v2],:name=>_("Levene test for equality of variances")))
|
221
223
|
|
222
|
-
g.table(:name=>"T statistics",:header=>["Type","t","df", "p (#{tails} tails)"]) {|t|
|
223
|
-
t.row(["Equal variance", "%0.4f" % t_equal_variance, df_equal_variance, "%0.4f" % probability_equal_variance])
|
224
|
-
t.row(["Non equal variance", "%0.4f" % t_not_equal_variance, "%0.4f" % df_not_equal_variance, "%0.4f" % probability_not_equal_variance])
|
224
|
+
g.table(:name=>_("T statistics"),:header=>["Type","t","df", "p (#{tails} tails)"].map{|v| _(v)}) {|t|
|
225
|
+
t.row([_("Equal variance"), "%0.4f" % t_equal_variance, df_equal_variance, "%0.4f" % probability_equal_variance])
|
226
|
+
t.row([_("Non equal variance"), "%0.4f" % t_not_equal_variance, "%0.4f" % df_not_equal_variance, "%0.4f" % probability_not_equal_variance])
|
225
227
|
}
|
228
|
+
g.table(:name=>_("Effect size")) do |t|
|
229
|
+
t.row ['x1-x2', "%0.4f" % (@v1.mean-@v2.mean)]
|
230
|
+
t.row ['d', "%0.4f" % d]
|
231
|
+
end
|
226
232
|
}
|
227
233
|
end
|
228
234
|
end
|
@@ -107,14 +107,15 @@ module Statsample
|
|
107
107
|
attr_reader :u
|
108
108
|
# Value of compensation for ties (useful for demonstration)
|
109
109
|
attr_reader :t
|
110
|
+
# Name of test
|
111
|
+
attr_accessor :name
|
110
112
|
#
|
111
113
|
# Create a new U Mann-Whitney test
|
112
114
|
# Params: Two Statsample::Vectors
|
113
115
|
#
|
114
|
-
def initialize(v1,v2)
|
116
|
+
def initialize(v1,v2, opts=Hash.new)
|
115
117
|
@n1=v1.valid_data.size
|
116
118
|
@n2=v2.valid_data.size
|
117
|
-
|
118
119
|
data=(v1.valid_data+v2.valid_data).to_scale
|
119
120
|
groups=(([0]*@n1)+([1]*@n2)).to_vector
|
120
121
|
ds={'g'=>groups, 'data'=>data}.to_dataset
|
@@ -132,11 +133,17 @@ module Statsample
|
|
132
133
|
@u1=r1-((@n1*(@n1+1)).quo(2))
|
133
134
|
@u2=r2-((@n2*(@n2+1)).quo(2))
|
134
135
|
@u=(u1<u2) ? u1 : u2
|
136
|
+
opts_default={:name=>"Mann-Whitney's U"}
|
137
|
+
@opts=opts_default.merge(opts)
|
138
|
+
opts_default.keys.each {|k|
|
139
|
+
send("#{k}=", @opts[k])
|
140
|
+
}
|
141
|
+
|
135
142
|
end
|
136
143
|
# Report results.
|
137
144
|
def summary
|
138
145
|
out=<<-HEREDOC
|
139
|
-
|
146
|
+
@name
|
140
147
|
Sum of ranks v1: #{@r1.to_f}
|
141
148
|
Sum of ranks v1: #{@r2.to_f}
|
142
149
|
U Value: #{@u.to_f}
|
@@ -152,7 +159,7 @@ Z: #{sprintf("%0.3f",z)} (p: #{sprintf("%0.3f",z_probability)})
|
|
152
159
|
end
|
153
160
|
# Exact probability of finding values of U lower or equal to sample on U distribution. Use with caution with m*n>100000.
|
154
161
|
# Uses u_sampling_distribution_as62
|
155
|
-
def
|
162
|
+
def probability_exact
|
156
163
|
dist=UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
|
157
164
|
sum=0
|
158
165
|
(0..@u.to_i).each {|i|
|
@@ -190,9 +197,9 @@ Z: #{sprintf("%0.3f",z)} (p: #{sprintf("%0.3f",z_probability)})
|
|
190
197
|
(@u-mu).quo(ou)
|
191
198
|
end
|
192
199
|
# Assuming H_0, the proportion of cdf with values of U lower
|
193
|
-
# than the sample.
|
200
|
+
# than the sample, using normal approximation.
|
194
201
|
# Use with more than 30 cases per group.
|
195
|
-
def
|
202
|
+
def probability_z
|
196
203
|
(1-Distribution::Normal.cdf(z.abs()))*2
|
197
204
|
end
|
198
205
|
end
|
data/lib/statsample/vector.rb
CHANGED
@@ -7,7 +7,7 @@ class Array
|
|
7
7
|
end
|
8
8
|
# Creates a new Statsample::Vector object of type :scale
|
9
9
|
def to_scale(*args)
|
10
|
-
Statsample::Vector.new(self
|
10
|
+
Statsample::Vector.new(self, :scale,*args)
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
@@ -24,12 +24,7 @@ module Statsample
|
|
24
24
|
class Vector
|
25
25
|
include Enumerable
|
26
26
|
include Writable
|
27
|
-
|
28
|
-
DEFAULT_OPTIONS={
|
29
|
-
:missing_values=>[],
|
30
|
-
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
31
|
-
:labels=>{}
|
32
|
-
}
|
27
|
+
include Summarizable
|
33
28
|
# Level of measurement. Could be :nominal, :ordinal or :scale
|
34
29
|
attr_reader :type
|
35
30
|
# Original data.
|
@@ -50,23 +45,39 @@ module Statsample
|
|
50
45
|
attr_reader :gsl
|
51
46
|
# Change label for specific values
|
52
47
|
attr_accessor :labels
|
48
|
+
# Name of vector. Should be used for output by many classes
|
49
|
+
attr_accessor :name
|
50
|
+
|
53
51
|
#
|
54
52
|
# Creates a new Vector object.
|
55
|
-
#
|
56
|
-
#
|
57
|
-
#
|
58
|
-
#
|
59
|
-
#
|
60
|
-
#
|
53
|
+
# * <tt>data</tt> Array of data.
|
54
|
+
# * <tt>type</tt> Level of measurement. See Vector#type
|
55
|
+
# * <tt>opts</tt> Hash of options
|
56
|
+
# * <tt>:missing_values</tt> Array of missing values. See Vector#missing_values
|
57
|
+
# * <tt>:today_values</tt> Array of 'today' values. See Vector#today_values
|
58
|
+
# * <tt>:labels</tt> Labels for data values
|
59
|
+
# * <tt>:name</tt> Name of vector
|
61
60
|
#
|
62
61
|
def initialize(data=[], type=:nominal, opts=Hash.new)
|
63
62
|
raise "Data should be an array" unless data.is_a? Array
|
64
63
|
@data=data
|
65
64
|
@type=type
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
65
|
+
opts_default={
|
66
|
+
:missing_values=>[],
|
67
|
+
:today_values=>['NOW','TODAY', :NOW, :TODAY],
|
68
|
+
:labels=>{},
|
69
|
+
:name=>nil
|
70
|
+
}
|
71
|
+
@opts=opts_default.merge(opts)
|
72
|
+
if @opts[:name].nil?
|
73
|
+
@@n_table||=0
|
74
|
+
@@n_table+=1
|
75
|
+
@opts[:name]="Vector #{@@n_table}"
|
76
|
+
end
|
77
|
+
@missing_values=@opts[:missing_values]
|
78
|
+
@labels=@opts[:labels]
|
79
|
+
@today_values=@opts[:today_values]
|
80
|
+
@name=@opts[:name]
|
70
81
|
@valid_data=[]
|
71
82
|
@data_with_nils=[]
|
72
83
|
@date_data_with_nils=[]
|
@@ -80,12 +91,12 @@ module Statsample
|
|
80
91
|
# Note: data, missing_values and labels are duplicated, so
|
81
92
|
# changes on the original vector don't propagate to copies.
|
82
93
|
def dup
|
83
|
-
|
94
|
+
Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name.dup)
|
84
95
|
end
|
85
96
|
# Returns an empty duplicate of the vector. Maintains the type,
|
86
97
|
# missing values and labels.
|
87
98
|
def dup_empty
|
88
|
-
|
99
|
+
Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name.dup)
|
89
100
|
end
|
90
101
|
# Raises an exception if type of vector is inferior to t type
|
91
102
|
def check_type(t)
|
@@ -128,8 +139,8 @@ module Statsample
|
|
128
139
|
# Vector equality.
|
129
140
|
# Two vectors are equal if their data, missing values, type and labels are equal
|
130
141
|
def ==(v2)
|
131
|
-
|
132
|
-
|
142
|
+
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
143
|
+
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
|
133
144
|
end
|
134
145
|
|
135
146
|
def _dump(i) # :nodoc:
|
@@ -189,8 +200,8 @@ module Statsample
|
|
189
200
|
# Vector.set_valid_data at the end of your insertion cycle
|
190
201
|
#
|
191
202
|
def add(v,update_valid=true)
|
192
|
-
|
193
|
-
|
203
|
+
@data.push(v)
|
204
|
+
set_valid_data if update_valid
|
194
205
|
end
|
195
206
|
# Update valid_data, missing_data, data_with_nils and gsl
|
196
207
|
# at the end of an insertion.
|
@@ -208,14 +219,14 @@ module Statsample
|
|
208
219
|
# v.valid_data
|
209
220
|
# => [2,3]
|
210
221
|
def set_valid_data
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
222
|
+
@valid_data.clear
|
223
|
+
@missing_data.clear
|
224
|
+
@data_with_nils.clear
|
225
|
+
@date_data_with_nils.clear
|
226
|
+
@gsl=nil
|
227
|
+
set_valid_data_intern
|
228
|
+
set_scale_data if(@type==:scale)
|
229
|
+
set_date_data if(@type==:date)
|
219
230
|
end
|
220
231
|
|
221
232
|
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
@@ -228,21 +239,21 @@ module Statsample
|
|
228
239
|
end
|
229
240
|
end
|
230
241
|
def _set_valid_data_intern #:nodoc:
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
242
|
+
@data.each do |n|
|
243
|
+
if is_valid? n
|
244
|
+
@valid_data.push(n)
|
245
|
+
@data_with_nils.push(n)
|
246
|
+
else
|
247
|
+
@data_with_nils.push(nil)
|
248
|
+
@missing_data.push(n)
|
249
|
+
end
|
238
250
|
end
|
239
|
-
|
240
|
-
@has_missing_data=@missing_data.size>0
|
251
|
+
@has_missing_data=@missing_data.size>0
|
241
252
|
end
|
242
253
|
|
243
254
|
# Returns true if data has one or more missing values
|
244
255
|
def has_missing_data?
|
245
|
-
|
256
|
+
@has_missing_data
|
246
257
|
end
|
247
258
|
# Retrieves label for value x. Retrieves x if
|
248
259
|
# no label defined.
|
@@ -251,14 +262,14 @@ module Statsample
|
|
251
262
|
end
|
252
263
|
# Returns a Vector with data with labels replaced by the label.
|
253
264
|
def vector_labeled
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
265
|
+
d=@data.collect{|x|
|
266
|
+
if @labels.has_key? x
|
267
|
+
@labels[x]
|
268
|
+
else
|
269
|
+
x
|
270
|
+
end
|
271
|
+
}
|
272
|
+
Vector.new(d,@type)
|
262
273
|
end
|
263
274
|
# Size of total data
|
264
275
|
def size
|
@@ -427,13 +438,13 @@ module Statsample
|
|
427
438
|
# In all the trials, every item has the same probability
|
428
439
|
# of being selected.
|
429
440
|
def sample_with_replacement(sample=1)
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
441
|
+
if(@type!=:scale or !Statsample.has_gsl?)
|
442
|
+
vds=@valid_data.size
|
443
|
+
(0...sample).collect{ @valid_data[rand(vds)] }
|
444
|
+
else
|
445
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
446
|
+
r.sample(@gsl, sample).to_a
|
447
|
+
end
|
437
448
|
end
|
438
449
|
# Returns a random sample of size n, without replacement,
|
439
450
|
# only with valid data.
|
@@ -597,24 +608,24 @@ module Statsample
|
|
597
608
|
def proportion(v=1)
|
598
609
|
frequencies[v].quo(@valid_data.size)
|
599
610
|
end
|
600
|
-
def
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
611
|
+
def report_building(b)
|
612
|
+
b.section(:name=>name) do |s|
|
613
|
+
s.text _("n :%d") % n
|
614
|
+
s.text _("n valid:%d") % n_valid
|
615
|
+
s.text _("factors:%s") % factors.join(",")
|
616
|
+
s.text _("mode: %s") % mode
|
617
|
+
s.table(:name=>_("Distribution")) do |t|
|
618
|
+
frequencies.sort.each do |k,v|
|
619
|
+
key=labels.has_key?(k) ? labels[k]:k
|
620
|
+
t.row [key,v, ("%0.2f%%" % (v.quo(n_valid)*100))]
|
621
|
+
end
|
622
|
+
end
|
623
|
+
s.text _("median: %s") % median.to_s if(@type==:ordinal)
|
624
|
+
if(@type==:scale)
|
625
|
+
s.text _("mean: %0.4f") % mean
|
626
|
+
s.text _("sd: %0.4f") % sd.to_s
|
627
|
+
end
|
616
628
|
end
|
617
|
-
out
|
618
629
|
end
|
619
630
|
|
620
631
|
# Variance of p, according to population size
|
@@ -817,8 +828,7 @@ module Statsample
|
|
817
828
|
@gsl.mean
|
818
829
|
end
|
819
830
|
def variance_sample(m=nil) # :nodoc:
|
820
|
-
|
821
|
-
|
831
|
+
check_type :scale
|
822
832
|
m||=mean
|
823
833
|
@gsl.variance_m
|
824
834
|
end
|
@@ -881,7 +891,7 @@ module Statsample
|
|
881
891
|
alias_method :sdp, :standard_deviation_population
|
882
892
|
alias_method :sds, :standard_deviation_sample
|
883
893
|
alias_method :cov, :coefficient_of_variation
|
884
|
-
alias_method :variance, :variance_sample
|
894
|
+
alias_method :variance, :variance_sample
|
885
895
|
alias_method :sd, :standard_deviation_sample
|
886
896
|
alias_method :ss, :sum_of_squares
|
887
897
|
end
|