statsample 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +8 -0
- data/Manifest.txt +20 -2
- data/data/crime.txt +47 -0
- data/data/test_binomial.csv +201 -0
- data/demo/distribution_t.rb +2 -2
- data/demo/regression.rb +2 -1
- data/lib/distribution.rb +8 -0
- data/lib/distribution/chisquare.rb +24 -0
- data/lib/distribution/f.rb +25 -0
- data/lib/distribution/normal.rb +25 -0
- data/lib/distribution/t.rb +22 -0
- data/lib/matrix_extension.rb +78 -0
- data/lib/statistics2.rb +531 -0
- data/lib/statsample.rb +12 -9
- data/lib/statsample/anova.rb +1 -5
- data/lib/statsample/bivariate.rb +24 -20
- data/lib/statsample/combination.rb +14 -4
- data/lib/statsample/converters.rb +17 -1
- data/lib/statsample/dataset.rb +66 -10
- data/lib/statsample/dominanceanalysis/bootstrap.rb +1 -3
- data/lib/statsample/graph/gdchart.rb +2 -3
- data/lib/statsample/graph/svggraph.rb +8 -4
- data/lib/statsample/mle.rb +137 -0
- data/lib/statsample/mle/logit.rb +95 -0
- data/lib/statsample/mle/normal.rb +83 -0
- data/lib/statsample/mle/probit.rb +93 -0
- data/lib/statsample/regression.rb +3 -1
- data/lib/statsample/regression/binomial.rb +65 -0
- data/lib/statsample/regression/binomial/logit.rb +13 -0
- data/lib/statsample/regression/binomial/probit.rb +13 -0
- data/lib/statsample/regression/multiple.rb +61 -58
- data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
- data/lib/statsample/srs.rb +5 -5
- data/lib/statsample/vector.rb +129 -59
- data/test/test_anova.rb +0 -5
- data/test/test_dataset.rb +13 -1
- data/test/test_distribution.rb +57 -0
- data/test/test_gsl.rb +22 -0
- data/test/test_logit.rb +22 -0
- data/test/test_mle.rb +140 -0
- data/test/test_r.rb +9 -0
- data/test/test_regression.rb +12 -4
- data/test/test_srs.rb +0 -4
- data/test/test_stata.rb +11 -0
- data/test/test_statistics.rb +0 -15
- data/test/test_vector.rb +11 -0
- metadata +28 -4
- data/lib/statsample/chidistribution.rb +0 -39
- data/lib/statsample/regression/logit.rb +0 -35
data/lib/statsample/regression/multiple.rb
CHANGED
@@ -1,6 +1,6 @@
 module Statsample
 module Regression
-# Module for Multiple Regression Analysis
+# Module for Linear Multiple Regression Analysis
 # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise or instance directly the engines
 # Example.
 #
@@ -35,7 +35,21 @@ module Multiple
 def self.pairwise(ds,y_var)
 RubyEngine.new(ds,y_var)
 end
-
+def self.listwise_by_exp(ds,exp)
+end
+# Returns a dataset and name of criteria using a expression.
+# All nominal vectors are replaced by dummy coding
+# and interactions are calculated
+
+def self.ds_by_exp(ds,exp)
+raise "Not implemented"
+parts=exp.split(/[\+=]/)
+dependent=parts.pop
+ds_out=[]
+parts.each{|p|
+
+}
+end
 # Base class for Multiple Regression Engines
 class BaseEngine
 def initialize(ds,y_var)
@@ -119,11 +133,7 @@ module Multiple
 end
 # Significance of Fisher
 def significance
-
-GSL::Cdf.fdist_Q(f,df_r,df_e)
-else
-raise "Need Ruby/GSL"
-end
+1.0-Distribution::F.cdf(f,df_r,df_e)
 end
 # Tolerance for a given variable
 # http://talkstats.com/showthread.php?t=5056
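
Worth noting: the F-test p-value above no longer needs Ruby/GSL; it comes from the new pure-Ruby Distribution facade (lib/distribution/f.rb). A minimal sketch of the same computation, with made-up F and degrees of freedom:

    require 'statsample'   # loads the new Distribution module

    # Illustrative values only: an F statistic with 3 regression df and 96 error df.
    f    = 4.27
    df_r = 3
    df_e = 96

    # Same call the new #significance uses: P(F >= f) under the null hypothesis.
    p_value = 1.0 - Distribution::F.cdf(f, df_r, df_e)
    puts p_value
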
@@ -136,13 +146,13 @@ module Multiple
 1-lr.r2
 end
 # Tolerances for each coefficient
-
-
-
-
-
-
-
+def coeffs_tolerances
+@fields.inject({}) {|a,f|
+a[f]=tolerance(f);
+a
+}
+end
+# Standard Error for coefficients
 def coeffs_se
 out={}
 mse=sse.quo(df_e)
@@ -163,7 +173,6 @@ module Multiple
 x=Matrix.columns(columns)
 matrix=((x.t*x)).inverse * mse
 matrix.collect {|i|
-
 Math::sqrt(i) if i>0
 }
 end
@@ -177,10 +186,10 @@ module Multiple
 end
 # Retrieves a summary for Regression
 def summary(report_type=ConsoleSummary)
-
-
-
-out.add <<HEREDOC
+c=coeffs
+out=""
+out.extend report_type
+out.add <<HEREDOC
 Summary for regression of #{@fields.join(',')} over #{@y_var}
 *************************************************************
 Engine: #{self.class}
@@ -190,45 +199,39 @@ r2=#{sprintf("%0.3f",r2)}
 Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
 HEREDOC
 
-out.add_line
-out.add "ANOVA TABLE"
-
-t=Statsample::ReportTable.new(%w{source ss df ms f s})
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-a={}
-@fields.each_index {|i|
-a[@fields[i]]=c[i]
-}
-a
-end
+out.add_line
+out.add "ANOVA TABLE"
+
+t=Statsample::ReportTable.new(%w{source ss df ms f s})
+t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
+t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
+
+t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
+
+out.parse_table(t)
+
+begin
+out.add "Beta coefficientes"
+sc=standarized_coeffs
+cse=coeffs_se
+t=Statsample::ReportTable.new(%w{coeff b beta se t})
+t.add_row(["Constant", sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
+@fields.each{|f|
+t.add_row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
+}
+out.parse_table(t)
+
+rescue
+end
+out
+end
+def assign_names(c)
+a={}
+@fields.each_index {|i|
+a[@fields[i]]=c[i]
+}
+a
+end
 
 
 # Deprecated
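
With the engine, tolerance, standard-error and report changes above, a full run without rb-gsl looks roughly like this. The data and field names are invented, and Dataset construction from a hash of vectors is assumed (as in the bundled demo scripts):

    require 'statsample'

    a  = [1, 3, 2, 4, 3, 5].to_vector(:scale)
    b  = [2, 2, 3, 3, 4, 5].to_vector(:scale)
    y  = [4, 8, 7, 10, 11, 13].to_vector(:scale)
    ds = Statsample::Dataset.new({'a' => a, 'b' => b, 'y' => y})

    lr = Statsample::Regression::Multiple.listwise(ds, 'y')
    puts lr.summary   # equation, ANOVA table and beta coefficients, as built above
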
data/lib/statsample/srs.rb
CHANGED
@@ -26,7 +26,7 @@ module Statsample
 end
 # Sample size estimation for proportions, infinite poblation
 def estimation_n0(d,prop,margin=0.95)
-t=
+t=Distribution::Normal.p_value(1-(1-margin).quo(2))
 var=prop*(1-prop)
 t**2*var.quo(d**2)
 end
@@ -39,13 +39,13 @@ module Statsample
 # Uses estimated proportion, sample without replacement.
 
 def proportion_confidence_interval_t(prop, n_sample, n_population, margin=0.95)
-t=
+t = Distribution::T.p_value(1-((1-margin).quo(2)) , n_sample-1)
 proportion_confidence_interval(prop,n_sample,n_population, t)
 end
 # Proportion confidence interval with z values
 # Uses estimated proportion, sample without replacement.
 def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
-z=
+z=Distribution::Normal.p_value(1-((1-margin).quo(2)))
 proportion_confidence_interval(p,n_sample,n_population, z)
 end
 # Proportion confidence interval with x value
@@ -137,13 +137,13 @@ module Statsample
 # Confidence Interval using T-Student
 # Use with n < 60
 def mean_confidence_interval_t(mean,s,n_sample,n_population,margin=0.95)
-t=
+t=Distribution::T.p_value(1-((1-margin) / 2),n_sample-1)
 mean_confidence_interval(mean,s,n_sample,n_population,t)
 end
 # Confidente Interval using Z
 # Use with n > 60
 def mean_confidence_interval_z(mean,s,n_sample,n_population,margin=0.95)
-z=
+z=Distribution::Normal.p_value(1-((1-margin) / 2))
 mean_confidence_interval(mean,s,n_sample,n_population, z)
 end
 # Confidente interval using X.
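
Each helper now derives its critical value from the pure-Ruby Distribution module instead of GSL. The two quantile calls in isolation (the 95% margin and n=25 are arbitrary choices):

    require 'statsample'

    margin = 0.95
    # Two-tailed z quantile, as used by the *_z methods above (~1.96 for 95%)
    z = Distribution::Normal.p_value(1 - (1 - margin).quo(2))
    # Two-tailed t quantile for a sample of 25, as used by the *_t methods (~2.06)
    t = Distribution::T.p_value(1 - (1 - margin).quo(2), 25 - 1)
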
data/lib/statsample/vector.rb
CHANGED
@@ -42,20 +42,38 @@ module Statsample
 
 class Vector
 include Enumerable
-
+# Level of measurement. Could be :nominal, :ordinal or :scale
+attr_reader :type
+# Original data.
+attr_reader :data
+# Valid data. Equal to data, minus values assigned as missing values
+attr_reader :valid_data
+# Array of values considered as missing. Nil is a missing value, by default
+attr_reader :missing_values
+# Missing values array
+attr_reader :missing_data
+# Original data, with all missing values replaced by nils
+attr_reader :data_with_nils
+# GSL Object, only available with rbgsl extension and type==:scale
+attr_reader :gsl
+# Change label for specific values
 attr_accessor :labels
-
-
-
-
-
-
-
-
+# Creates a new Vector object.
+# [data] Array of data.
+# [type] Level of meausurement. See Vector#type
+# [missing_values] Array of missing values. See Vector#missing_values
+# [labels] Labels for data values
+#
+# The fast way to create a vector uses Array#to_vector. Remember
+# to include as the first argument the level of measurement
+#
+# v=[1,2,3,4].to_vector(:scale)
+#
+def initialize(data=[], t=:nominal,missing_values=[],labels={})
 raise "Data should be an array" unless data.is_a? Array
-
-
-
+@data=data
+@missing_values=missing_values
+@labels=labels
 @type=t
 @valid_data=[]
 @data_with_nils=[]
@@ -65,6 +83,9 @@ class Vector
 set_valid_data_intern
 self.type=t
 end
+# Creates a duplicate of the Vector.
+# Note: data, missing_values and labels are duplicated, so
+# changes on original vector doesn't propages to copies.
 def dup
 Vector.new(@data.dup,@type,@missing_values.dup,@labels.dup)
 end
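
A short sketch of the newly documented readers; the values are invented and the behaviour follows the comments above (nil always counts as missing, plus anything listed in missing_values):

    require 'statsample'

    v = Statsample::Vector.new([1, 2, 3, nil, 99], :scale, [99])
    v.type            # => :scale
    v.data            # => [1, 2, 3, nil, 99]
    v.valid_data      # => [1, 2, 3]
    v.data_with_nils  # => [1, 2, 3, nil, nil]
    v.missing_values  # => [99]

    # Shortcut mentioned in the constructor docs:
    w = [1, 2, 3, 4].to_vector(:scale)
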
@@ -73,14 +94,17 @@ class Vector
 def dup_empty
 Vector.new([],@type,@missing_values.dup,@labels.dup)
 end
+# Raises an exception if type of vector is inferior to t type
+def check_type(t)
+raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal)
+end
+private :check_type
+
 # Return a vector usign the standarized values for data
 # with sd with denominator N
 def vector_standarized_pop
 vector_standarized(true)
 end
-def check_type(t)
-raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal)
-end
 # Return a vector usign the standarized values for data
 # with sd with denominator n-1
 
@@ -114,48 +138,63 @@ class Vector
 }.to_vector(:scale)
 end
 
-# Vector equality
+# Vector equality.
 # Two vector will be the same if their data, missing values, type, labels are equals
 def ==(v2)
 raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
 @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
 end
 
-def _dump(i)
+def _dump(i) # :nodoc:
 Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
 end
-
+
+def self._load(data) # :nodoc:
 h=Marshal.load(data)
 Vector.new(h['data'], h['type'], h['missing_values'], h['labels'])
 end
+# Returns a new vector, with data modified by block.
+# Equivalent to create a Vector after #collect on data
 def recode
 @data.collect{|x|
 yield x
 }.to_vector(@type)
 end
+# Modifies current vector, with data modified by block.
+# Equivalent to #collect! on @data
+
 def recode!
 @data.collect!{|x|
 yield x
 }
 set_valid_data
 end
+# Iterate on each item
+# Equivalent to
+# @data.each{|x| yield x}
 def each
-@data.each{|x|
-yield(x)
-}
+@data.each{|x| yield(x) }
 end
+
+# Iterate on each item_index
+
 def each_index
 (0...@data.size).each {|i|
 yield(i)
 }
 end
-# Add a value at the end of the vector
-# If second argument set to false, you should update
+# Add a value at the end of the vector.
+# If second argument set to false, you should update the Vector usign
 # Vector#set_valid_data at the end of your insertion cycle
+#
 def add(v,update_valid=true)
 @data.push(v)
 set_valid_data if update_valid
 end
+# Update valid_data, missing_data, data_with_nils and gsl
+# at the end of an insertion
+#
+# Use after add(v,false)
 def set_valid_data
 @valid_data.clear
 @missing_data.clear
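
The recode/recode! pair and the add/set_valid_data cycle documented above combine like this (a sketch; the loop size is only there to show why deferring the bookkeeping pays off):

    require 'statsample'

    v = [1, 2, 3].to_vector(:scale)
    doubled = v.recode { |x| x * 2 }   # new Vector [2, 4, 6]; v is untouched
    v.recode! { |x| x + 1 }            # v now holds [2, 3, 4]

    # Bulk insertion: skip per-push bookkeeping, rebuild once at the end.
    w = Statsample::Vector.new([], :scale)
    1000.times { |i| w.add(i, false) }
    w.set_valid_data                   # refreshes valid_data, missing_data, data_with_nils, gsl
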
@@ -186,6 +225,7 @@ class Vector
 end
 @has_missing_data=@missing_data.size>0
 end
+
 # Retrieves true if data has one o more missing values
 def has_missing_data?
 @has_missing_data
@@ -193,7 +233,7 @@ class Vector
 def labeling(x)
 @labels.has_key?(x) ? @labels[x].to_s : x.to_s
 end
-# Returns a Vector with the data with labels replaced by the label
+# Returns a Vector with the data with labels replaced by the label.
 def vector_labeled
 d=@data.collect{|x|
 if @labels.has_key? x
@@ -204,12 +244,18 @@ class Vector
 }
 Vector.new(d,@type)
 end
+# Size of total data
 def size
 @data.size
 end
+alias_method :n, :size
+
+# Retrieves i element of data
 def [](i)
 @data[i]
 end
+# Set i element of data.
+# Note: Use set_valid_data if you include missing values
 def []=(i,v)
 @data[i]=v
 end
@@ -227,7 +273,7 @@ class Vector
 @type=t
 set_scale_data if(t==:scale)
 end
-
+
 def to_a
 @data.dup
 end
@@ -292,10 +338,11 @@ class Vector
 end
 
 end
-# Return an array with the data splitted by a separator
-#
-#
-#
+# Return an array with the data splitted by a separator.
+# a=Vector.new(["a,b","c,d","a,b","d"])
+# a.splitted
+# =>
+# [["a","b"],["c","d"],["a","b"],["d"]]
 def splitted(sep=Statsample::SPLIT_TOKEN)
 @data.collect{|x|
 if x.nil?
@@ -311,11 +358,14 @@ class Vector
 # defined on the fields
 # Example:
 #
-#
+# a=Vector.new(["a,b","c,d","a,b"])
 # a.split_by_separator
-#
-#
-#
+# => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88
+# @data=[1, 0, 1]>,
+# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48
+# @data=[1, 1, 0]>,
+# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08
+# @data=[0, 1, 1]>}
 #
 def split_by_separator(sep=Statsample::SPLIT_TOKEN)
 split_data=splitted(sep)
@@ -353,7 +403,7 @@ class Vector
 # In all the trails, every item have the same probability
 # of been selected
 def sample_with_replacement(sample=1)
-if(@type!=:scale)
+if(@type!=:scale or !HAS_GSL)
 vds=@valid_data.size
 (0...sample).collect{ @valid_data[rand(vds)] }
 else
@@ -368,7 +418,7 @@ class Vector
 # A sample of the same size of the vector is the vector itself
 
 def sample_without_replacement(sample=1)
-if(@type!=:scale)
+if(@type!=:scale or !HAS_GSL)
 raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
 out=[]
 size=@valid_data.size
@@ -393,7 +443,8 @@ class Vector
 frequencies[x].nil? ? 0 : frequencies[x]
 end
 end
-# returns the
+# returns the database type for the vector, according to its content
+
 def db_type(dbs='mysql')
 # first, detect any character not number
 if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
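
With the extra guards, sampling no longer assumes rb-gsl: non-scale vectors, or any vector when HAS_GSL is false, take the plain-Ruby branch. A sketch (output varies, it is random):

    require 'statsample'

    v = %w{a b b c c c}.to_vector(:nominal)
    v.sample_with_replacement(3)     # e.g. ["c", "b", "c"]
    v.sample_without_replacement(2)  # two draws without reusing a position
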
@@ -416,15 +467,28 @@ class Vector
 end
 def to_s
 sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
+end
+# Ugly name. Really, create a Vector for standard 'matrix' package.
+# <tt>dir</tt> could. be :horizontal or :vertical
+def to_matrix(dir=:horizontal)
+case dir
+when :horizontal
+Matrix[@data]
+when :vertical
+Matrix.columns([@data])
+end
 end
 def inspect
 self.to_s
 end
+def as_r
+@data.dup
+end
 def factors
 if @type==:scale
 @scale_data.uniq.sort
 else
-
+@valid_data.uniq.sort
 end
 end
 if Statsample::STATSAMPLE__.respond_to?(:frequencies)
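
The new to_matrix bridges a Vector to Ruby's standard matrix library; following the code added above:

    require 'statsample'
    require 'matrix'

    v = [1, 2, 3].to_vector(:scale)
    v.to_matrix              # => Matrix[[1, 2, 3]]      (one row, the default)
    v.to_matrix(:vertical)   # => Matrix[[1], [2], [3]]  (one column)
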
@@ -472,16 +536,16 @@ class Vector
 end
 
 
-# Returns the most frequent item
+# Returns the most frequent item.
 def mode
 frequencies.max{|a,b| a[1]<=>b[1]}[0]
 end
-# The numbers of item with valid data
+# The numbers of item with valid data.
 def n_valid
 @valid_data.size
 end
 # Returns a hash with the distribution of proportions of
-# the sample
+# the sample.
 def proportions
 frequencies.inject({}){|a,v|
 a[v[0]] = v[1].quo(n_valid)
@@ -512,13 +576,11 @@ class Vector
 out
 end
 
-
-
-
 # Variance of p, according to poblation size
 def variance_proportion(n_poblation, v=1)
 Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
 end
+# Variance of p, according to poblation size
 def variance_total(n_poblation, v=1)
 Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
 end
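
For a nominal vector the descriptive helpers touched above behave like this (a small sketch with invented data):

    require 'statsample'

    v = %w{a a b c c c}.to_vector(:nominal)
    v.mode         # => "c"   (most frequent value)
    v.n_valid      # => 6
    v.proportions  # hash of value => share of valid cases, e.g. "c" => 0.5
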
@@ -534,7 +596,10 @@ class Vector
 alias_method met_or, met
 end
 }
-
+######
+### Ordinal Methods
+######
+
 # Return the value of the percentil q
 def percentil(q)
 check_type :ordinal
@@ -546,7 +611,7 @@ class Vector
 (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
 end
 end
-# Returns a ranked vector
+# Returns a ranked vector.
 def ranked(type=:ordinal)
 check_type :ordinal
 i=0
@@ -593,6 +658,8 @@ class Vector
 @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
 end
 end
+private :set_scale_data
+
 # The range of the data (max - min)
 def range;
 check_type :scale
@@ -608,9 +675,12 @@ class Vector
 
 sum.to_f.quo(n_valid)
 end
+# Sum of squares for the data around a value.
+# By default, this value is the mean
+# ss= sum{(xi-m)^2}
+#
 def sum_of_squares(m=nil)
 check_type :scale
-
 m||=mean
 @scale_data.inject(0){|a,x| a+(x-m).square}
 end
@@ -618,27 +688,25 @@ class Vector
 # Sum of squared deviation
 def sum_of_squared_deviation
 check_type :scale
-
 @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
 end
 
-# Population variance (
+# Population variance (denominator N)
 def variance_population(m=nil)
 check_type :scale
-
 m||=mean
 squares=@scale_data.inject(0){|a,x| x.square+a}
 squares.quo(n_valid) - m.square
 end
 
 
-# Population Standard deviation (
+# Population Standard deviation (denominator N)
 def standard_deviation_population(m=nil)
 check_type :scale
 
 Math::sqrt( variance_population(m) )
 end
-# Sample Variance (
+# Sample Variance (denominator n-1)
 
 def variance_sample(m=nil)
 check_type :scale
@@ -647,7 +715,7 @@ class Vector
 sum_of_squares(m).quo(n_valid - 1)
 end
 
-# Sample Standard deviation (
+# Sample Standard deviation (denominator n-1)
 
 def standard_deviation_sample(m=nil)
 check_type :scale
@@ -655,13 +723,14 @@ class Vector
 m||=m
 Math::sqrt(variance_sample(m))
 end
+# Skewness of the sample
 def skew
 check_type :scale
-
 m=mean
 thirds=@scale_data.inject(0){|a,x| a+((x-mean)**3)}
 thirds.quo((@scale_data.size-1)*sd**3)
 end
+# Kurtosis of the sample
 def kurtosis
 check_type :scale
 
@@ -670,9 +739,10 @@ class Vector
 thirds.quo((@scale_data.size-1)*sd**4)
 
 end
+# Product of all values on the sample
+#
 def product
 check_type :scale
-
 @scale_data.inject(1){|a,x| a*x }
 end
 if HAS_GSL
@@ -712,11 +782,11 @@ class Vector
 m||=mean
 @gsl.sd_with_fixed_mean(m)
 end
-def skew
+def skew # :nodoc:
 check_type :scale
 @gsl.skew
 end
-def kurtosis
+def kurtosis # :nodoc:
 check_type :scale
 @gsl.kurtosis
 end
@@ -752,8 +822,8 @@ class Vector
 alias_method :sdp, :standard_deviation_population
 alias_method :sds, :standard_deviation_sample
 alias_method :cov, :coefficient_of_variation
-
-
-
+alias_method :variance, :variance_sample
+alias_method :sd, :standard_deviation_sample
+alias_method :ss, :sum_of_squares
 end
 end
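
Finally, the variance/sd/ss aliases let the scale statistics read naturally. A worked sketch using the formulas above (denominator n-1 for the sample versions, N for the population ones):

    require 'statsample'

    v = [1, 2, 3, 4, 5].to_vector(:scale)
    v.mean                  # => 3.0
    v.sum_of_squares        # => 10.0   (same as v.ss)
    v.variance              # => 2.5    alias for variance_sample: 10/(5-1)
    v.sd                    # => ~1.581 alias for standard_deviation_sample
    v.variance_population   # => 2.0    denominator N: 10/5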