statsample 0.3.3 → 0.3.4
- data/History.txt +7 -1
- data/Manifest.txt +3 -0
- data/demo/benchmark.rb +7 -5
- data/demo/regression.rb +30 -4
- data/lib/statsample.rb +23 -15
- data/lib/statsample/bivariate.rb +28 -9
- data/lib/statsample/combination.rb +103 -0
- data/lib/statsample/converters.rb +16 -1
- data/lib/statsample/dataset.rb +29 -11
- data/lib/statsample/dominanceanalysis.rb +15 -11
- data/lib/statsample/dominanceanalysis/bootstrap.rb +9 -7
- data/lib/statsample/graph/svggraph.rb +1 -3
- data/lib/statsample/regression.rb +1 -0
- data/lib/statsample/regression/logit.rb +35 -0
- data/lib/statsample/regression/multiple.rb +21 -2
- data/lib/statsample/regression/multiple/alglibengine.rb +3 -1
- data/lib/statsample/vector.rb +168 -183
- data/test/test_combination.rb +42 -0
- data/test/test_csv.rb +1 -1
- data/test/test_dataset.rb +5 -0
- data/test/test_statistics.rb +19 -2
- data/test/test_svg_graph.rb +5 -2
- data/test/test_vector.rb +6 -1
- metadata +6 -2
data/lib/statsample/dominanceanalysis.rb
CHANGED
@@ -150,17 +150,16 @@ module Statsample
       @models=[]
       @models_data={}
       for i in 1..@fields.size
-        c
-
-
-
+        c=Statsample::Combination.new(i,@fields.size)
+        c.each{|data|
+          convert=data.collect {|i|
+            @fields[i]
+          }
+          @models.push(convert)
+          ds_prev=@ds.dup(convert+[@y_var])
+          modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
+          @models_data[convert.sort]=modeldata
         }
-        @models.push(convert)
-        ds_prev=@ds.dup(convert+[@y_var])
-        modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
-
-        @models_data[convert.sort]=modeldata
-        end while c.next == GSL::SUCCESS
       end
     end
     def summary(report_type=ConsoleSummary)
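The loop above is the new pattern for building the candidate models: instead of driving a GSL combination object with `end while c.next == GSL::SUCCESS`, the analysis now walks every k-subset of predictor indexes through the new `Statsample::Combination` class (added as data/lib/statsample/combination.rb). A minimal sketch of that usage pattern, with illustrative arguments:

```ruby
require 'statsample'

# Enumerate every 2-element combination of the indexes 0...4, as the diff does
# with Combination.new(i, @fields.size); enumeration order is up to the class.
c = Statsample::Combination.new(2, 4)
c.each do |data|
  p data   # each yielded value is an array of field indexes, e.g. [0, 1]
end
```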
@@ -232,7 +231,12 @@ module Statsample
         @lr.r2
       end
       def add_table_row
-
+        begin
+          sign=sprintf("%0.3f", @lr.significance)
+        rescue RuntimeError
+          sign="???"
+        end
+        [@name.join("*"), sprintf("%0.3f",r2), sign] + @fields.collect{|k|
           v=@contributions[k]
           if v.nil?
             "--"
data/lib/statsample/dominanceanalysis/bootstrap.rb
CHANGED
@@ -51,26 +51,28 @@ class DominanceAnalysis
       @samples_cd={}
       @samples_gd={}
       @pairs=[]
-      c
-
-      convert=
+      c=Statsample::Combination.new(2,@fields.size)
+      c.each{|data|
+        convert=data.collect {|i|
          @fields[i]
        }
        @pairs.push(convert)
        [@samples_td,@samples_cd,@samples_gd].each{|s|
          s[convert]=[]
        }
-
-
+      }
+    end
     def summary(report_type=ConsoleSummary)
       out =""
       raise "You should bootstrap first" if @n_samples==0
       alfa=0.95
-      t=GSL::Cdf.tdist_Pinv(1-((1-alfa) / 2),@n_samples - 1)
       out.extend report_type
       out.add _("Summary for Bootstrap Dominance Analysis of %s on %s\n") % [@fields.join(", "), @y_var]
       out.add _("Sample size: %d\n") % @n_samples
-
+      if HAS_GSL
+        t=GSL::Cdf.tdist_Pinv(1-((1-alfa) / 2),@n_samples - 1)
+        out.add "t:#{t}\n"
+      end
       out.add "Linear Regression Engine: #{@lr_class.name}"
       out.nl
       table=ReportTable.new
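The bootstrap summary now prints the t critical value only when rb-gsl is present (`HAS_GSL`). The value is the two-sided Student's t quantile used for the confidence intervals; a small sketch of the same call with an assumed sample count:

```ruby
require 'gsl'

alfa      = 0.95
n_samples = 100   # assumed number of bootstrap samples
t = GSL::Cdf.tdist_Pinv(1 - ((1 - alfa) / 2), n_samples - 1)
# => roughly 1.98 for 99 degrees of freedom
```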
data/lib/statsample/graph/svggraph.rb
CHANGED
@@ -6,7 +6,7 @@ require 'SVG/Graph/Plot'
 require 'statsample/graph/svghistogram'
 
 module Statsample
-  class
+  class Vector
     # Creates a barchart using ruby-gdchart
     def svggraph_frequencies(file, width=600, height=300, chart_type=SVG::Graph::BarNoOp, options={})
       labels,data=[],[]
@@ -26,8 +26,6 @@ module Statsample
         f.puts(graph.burn)
       }
     end
-  end
-  class Scale < Ordinal
     def svggraph_histogram(bins, options={})
       options={:graph_title=>"Histogram", :show_graph_title=>true,:show_normal=>true, :mean=>self.mean, :sigma=>sdp }.merge! options
       graph = Statsample::Graph::SvgHistogram.new(options)
data/lib/statsample/regression/logit.rb
ADDED
@@ -0,0 +1,35 @@
+module Statsample
+  module Regression
+    class Logit
+      def initialize(ds,y_var)
+        @ds=ds
+        @y_var=y_var
+      end
+      def vp(x1,x2)
+        sum=0
+        x1.each_index{|i|
+          sum+=x1[i]*x2[i]
+        }
+        sum
+      end
+      # F(B'Xi)
+      def f(b,x)
+        Math::exp(vp(b,x)) / (1+Math::exp(vp(b,x)))
+      end
+      # f(B'Xi)
+      def fa(b,x)
+        f(b,x)*(1-f(b,x))
+      end
+      def l(b)
+        prod=1
+        y=@ds[@y_var]
+        @ds.each_array{|x|
+          x.unshift(1) # add constant
+          l=(f(b,x)**y[@ds.i])*((1.0-f(b,x))**(1.0-y[@ds.i]))
+          prod=prod*l
+        }
+        prod
+      end
+    end
+  end
+end
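The new `Logit` class is a first cut at logistic regression: `f(b,x)` is the logistic link F(b'x) = exp(b'x) / (1 + exp(b'x)), `fa(b,x)` its derivative F(1-F), and `l(b)` the likelihood L(b) = prod_i F(b'x_i)^y_i * (1 - F(b'x_i))^(1 - y_i) over the dataset rows. A standalone plain-Ruby sketch of that likelihood (helper names are illustrative, not statsample API):

```ruby
def logistic(z)
  Math.exp(z) / (1 + Math.exp(z))
end

# rows: arrays of predictor values; y: 0/1 responses; b: coefficients (constant first)
def logit_likelihood(b, rows, y)
  prod = 1.0
  rows.each_with_index do |x, i|
    x = [1] + x                                # prepend the constant, as Logit#l does
    z = 0.0
    x.each_index { |j| z += b[j] * x[j] }      # inner product b'x, like Logit#vp
    f = logistic(z)
    prod *= (f ** y[i]) * ((1.0 - f) ** (1.0 - y[i]))
  end
  prod
end

logit_likelihood([0.0, 0.5], [[1.0], [2.0], [3.0]], [0, 1, 1])  # => a value in (0, 1)
```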
data/lib/statsample/regression/multiple.rb
CHANGED
@@ -194,13 +194,32 @@ out.add_line
       out.add "ANOVA TABLE"
 
       t=Statsample::ReportTable.new(%w{source ss df ms f s})
-
-
+      begin
+        t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
+      rescue RuntimeError
+        t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), "???", "???"])
+      end
       t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
 
       t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
 
       out.parse_table(t)
+
+      begin
+        out.add "Beta coefficientes"
+        sc=standarized_coeffs
+        cse=coeffs_se
+        t=Statsample::ReportTable.new(%w{coeff beta se t})
+        t.add_row(["Constant", "-",constant_se, constant_t])
+        @fields.each{|f|
+          t.add_row([f, sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
+        }
+        out.parse_table(t)
+
+      rescue
+
+      end
+
       out
     end
     def assign_names(c)
data/lib/statsample/regression/multiple/alglibengine.rb
CHANGED
@@ -37,6 +37,8 @@ class AlglibEngine < BaseEngine
       matrix=Matrix.columns(columns)
       @lr_s=nil
       @lr=::Alglib::LinearRegression.build_from_matrix(matrix)
+      @coeffs=assign_names(@lr.coeffs)
+
     end
 
     def _dump(i)
@@ -48,7 +50,7 @@ class AlglibEngine < BaseEngine
     end
 
     def coeffs
-
+      @coeffs
     end
     # Coefficients using a constant
     # Based on http://www.xycoon.com/ols1.htm
data/lib/statsample/vector.rb
CHANGED
@@ -39,11 +39,11 @@ module Statsample
     ds=Statsample::Dataset.new(h).dup_only_valid
     ds.vectors.values
   end
-
-
+
+  class Vector
     include Enumerable
-    attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils
-
+    attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils, :gsl
+    attr_accessor :labels
     # Creates a new
     # data = Array of data
     # t = level of meausurement. Could be:
@@ -61,9 +61,9 @@ class Vector < DelegateClass(Array)
       @data_with_nils=[]
       @missing_data=[]
       @has_missing_data=nil
-
+      @scale_data=nil
+      set_valid_data_intern
       self.type=t
-      super(@delegate)
     end
     def dup
       Vector.new(@data.dup,@type,@missing_values.dup,@labels.dup)
@@ -78,23 +78,27 @@ class Vector < DelegateClass(Array)
     def vector_standarized_pop
       vector_standarized(true)
     end
-
+    def check_type(t)
+      raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal)
+    end
     # Return a vector usign the standarized values for data
     # with sd with denominator n-1
 
     def vector_standarized(use_population=false)
       raise "Should be a scale" unless @type==:scale
-      mean
-      sd=use_population ?
+      m=mean
+      sd=use_population ? sdp : sds
       @data_with_nils.collect{|x|
         if !x.nil?
-          (x.to_f -
+          (x.to_f - m).quo(sd)
         else
           nil
         end
       }.to_vector(:scale)
     end
+
     alias_method :standarized, :vector_standarized
+
     def box_cox_transformation(lambda)
       raise "Should be a scale" unless @type==:scale
       @data_with_nils.collect{|x|
@@ -116,6 +120,7 @@ class Vector < DelegateClass(Array)
       raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
       @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
     end
+
     def _dump(i)
       Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
     end
@@ -155,24 +160,31 @@ class Vector < DelegateClass(Array)
       @valid_data.clear
       @missing_data.clear
       @data_with_nils.clear
-
-
+      @gsl=nil
+      set_valid_data_intern
+      set_scale_data if(@type==:scale)
     end
-
-
-
-
+
+    if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
+      def set_valid_data_intern
+        Statsample::STATSAMPLE__.set_valid_data_intern(self)
+      end
+    else
+      def set_valid_data_intern
+        _set_valid_data_intern
+      end
+    end
+    def _set_valid_data_intern
       @data.each do |n|
-
+        if is_valid? n
           @valid_data.push(n)
           @data_with_nils.push(n)
-
+        else
           @data_with_nils.push(nil)
           @missing_data.push(n)
-
-        end
-        @has_missing_data=@missing_data.size>0
+        end
       end
+      @has_missing_data=@missing_data.size>0
     end
     # Retrieves true if data has one o more missing values
     def has_missing_data?
@@ -212,29 +224,13 @@ class Vector < DelegateClass(Array)
     end
     # Set level of measurement.
     def type=(t)
-
-
-        @delegate=Nominal.new(@valid_data)
-      when :ordinal
-        @delegate=Ordinal.new(@valid_data)
-      when :scale
-        @delegate=Scale.new(@valid_data)
-      else
-        raise "Type doesn't exists"
-      end
-      __setobj__(@delegate)
-      @type=t
+      @type=t
+      set_scale_data if(t==:scale)
     end
     def n; @data.size ; end
     def to_a
       @data.dup
-
-    # Redundant, but necessary
-    # Spreadsheet creates Array#sum, so calling sum
-    # doesn't call the delegates method
-    def sum
-      @delegate.sum
-    end
+    end
     alias_method :to_ary, :to_a
     # Vector sum.
     # - If v is a scalar, add this value to all elements
@@ -357,7 +353,13 @@ class Vector < DelegateClass(Array)
     # In all the trails, every item have the same probability
     # of been selected
     def sample_with_replacement(sample=1)
-
+      if(@type!=:scale)
+        vds=@valid_data.size
+        (0...sample).collect{ @valid_data[rand(vds)] }
+      else
+        r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
+        r.sample(@gsl, sample).to_a
+      end
     end
     # Returns an random sample of size n, without replacement,
     # only with valid data.
@@ -366,9 +368,20 @@ class Vector < DelegateClass(Array)
     # A sample of the same size of the vector is the vector itself
 
     def sample_without_replacement(sample=1)
-
+      if(@type!=:scale)
+        raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
+        out=[]
+        size=@valid_data.size
+        while out.size<sample
+          value=rand(size)
+          out.push(value) if !out.include?value
+        end
+        out.collect{|i|@data[i]}
+      else
+        r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
+        r.choose(@gsl, sample).to_a
+      end
     end
-
     def count(x=false)
       if block_given?
         r=@data.inject(0) {|s, i|
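Both sampling methods now live directly on `Vector` and pick an implementation by type: plain `rand`-based sampling for nominal and ordinal data, GSL's `sample`/`choose` on the cached `@gsl` vector for `:scale` data. The calling convention is unchanged; a quick sketch (results are random):

```ruby
v = [1, 2, 3, 4, 5].to_vector(:scale)

v.sample_with_replacement(3)     # three draws, duplicates possible
v.sample_without_replacement(3)  # three distinct elements
```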
@@ -401,41 +414,37 @@ class Vector < DelegateClass(Array)
         true
       end
     end
-    def summary(out="")
-      @delegate.summary(@labels,out)
-    end
     def to_s
       sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
     end
     def inspect
       self.to_s
     end
-
-  end
-
-
-
-  class Nominal
-    def initialize(data)
-      @data=data
-      # @factors=data.uniq
-    end
-    def delegate_data
-      @data
-    end
-    # Return an array of the different values of the data
     def factors
-      @
-
-
-
-
-
-
-
-
-
+      if @type==:scale
+        @scale_data.uniq.sort
+      else
+        @valid_data.uniq.sort
+      end
+    end
+    if Statsample::STATSAMPLE__.respond_to?(:frequencies)
+      # Returns a hash with the distribution of frecuencies of
+      # the sample
+      def frequencies
+        Statsample::STATSAMPLE__.frequencies(@valid_data)
+      end
+    else
+      def frequencies
+        _frequencies
+      end
     end
+    def _frequencies
+      @valid_data.inject(Hash.new) {|a,x|
+        a[x]||=0
+        a[x]=a[x]+1
+        a
+      }
+    end
     # Plot frequencies on a chart, using gnuplot
     def plot_frequencies
       require 'gnuplot'
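`factors` and `frequencies` are now type-aware instance methods of `Vector`, with an optional C-backed fast path (`Statsample::STATSAMPLE__`) and the pure-Ruby fallback `_frequencies`. A small sketch of what they return on made-up data:

```ruby
v = %w{a a b c c c}.to_vector(:nominal)

v.factors      # => ["a", "b", "c"]
v.frequencies  # => {"a"=>2, "b"=>1, "c"=>3}
v.proportions  # => counts divided by n_valid, e.g. "a" => 1/3
```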
@@ -469,21 +478,21 @@ class Vector < DelegateClass(Array)
     end
     # The numbers of item with valid data
     def n_valid
-      @
+      @valid_data.size
     end
     # Returns a hash with the distribution of proportions of
     # the sample
     def proportions
       frequencies.inject({}){|a,v|
-        a[v[0]] = v[1].quo(
+        a[v[0]] = v[1].quo(n_valid)
         a
       }
     end
     # Proportion of a given value.
     def proportion(v=1)
-      frequencies[v].quo(@
+      frequencies[v].quo(@valid_data.size)
     end
-    def summary(
+    def summary(out="")
       out << sprintf("n valid:%d\n",n_valid)
       out << sprintf("factors:%s\n",factors.join(","))
       out << "mode:"+mode.to_s+"\n"
@@ -492,47 +501,32 @@ class Vector < DelegateClass(Array)
         key=labels.has_key?(k) ? labels[k]:k
         out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
       }
+      if(@type==:ordinal)
+        out << "median:"+median.to_s+"\n"
+      end
+      if(@type==:scale)
+        out << "mean:"+mean.to_s+"\n"
+        out << "sd:"+sd.to_s+"\n"
+
+      end
       out
     end
 
-
-    # only with valid data.
-    #
-    # In all the trails, every item have the same probability
-    # of been selected
-    def sample_with_replacement(sample)
-      (0...sample).collect{ @data[rand(@data.size)] }
-    end
-    # Returns an random sample of size n, without replacement,
-    # only with valid data.
-    #
-    # Every element could only be selected once
-    # A sample of the same size of the vector is the vector itself
-
-    def sample_without_replacement(sample)
-      raise ArgumentError, "Sample size couldn't be greater than n" if sample>@data.size
-      out=[]
-      size=@data.size
-      while out.size<sample
-        value=rand(size)
-        out.push(value) if !out.include?value
-      end
-      out.collect{|i|@data[i]}
-    end
+
 
 
     # Variance of p, according to poblation size
     def variance_proportion(n_poblation, v=1)
-      Statsample::proportion_variance_sample(self.proportion(v), @
+      Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
     end
     def variance_total(n_poblation, v=1)
-      Statsample::total_variance_sample(self.proportion(v), @
+      Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
     end
     def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
-      Statsample::proportion_confidence_interval_t(proportion(v), @
+      Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
     end
     def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
-      Statsample::proportion_confidence_interval_z(proportion(v), @
+      Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
     end
     self.instance_methods.find_all{|met| met=~/_slow$/}.each{|met|
       met_or=met.gsub("_slow","")
@@ -540,12 +534,11 @@ class Vector < DelegateClass(Array)
       alias_method met_or, met
     end
   }
-
-
-  class Ordinal <Nominal
+    # Ordinal Methods
     # Return the value of the percentil q
     def percentil(q)
-
+      check_type :ordinal
+      sorted=@valid_data.sort
       v= (n_valid * q).quo(100)
       if(v.to_i!=v)
         sorted[v.to_i]
@@ -555,6 +548,7 @@ class Vector < DelegateClass(Array)
     end
     # Returns a ranked vector
     def ranked(type=:ordinal)
+      check_type :ordinal
       i=0
       r=frequencies.sort.inject({}){|a,v|
         a[v[0]]=(i+1 + i+v[1]).quo(2)
@@ -567,100 +561,88 @@ class Vector < DelegateClass(Array)
     end
     # Return the median (percentil 50)
     def median
+      check_type :ordinal
+      if HAS_GSL and @type==:scale
+        GSL::Stats::median_from_sorted_data(@gsl)
+      else
        percentil(50)
-      end
-      if HAS_GSL
-        %w{median}.each{|m|
-          m_nuevo=(m+"_slow").intern
-          alias_method m_nuevo, m.intern
-        }
-
-        #def percentil(p)
-        #  v=GSL::Vector.alloc(@data.sort)
-        #  v.stats_quantile_from_sorted_data(p)
-        #end
-        def median # :nodoc:
-          GSL::Stats::median_from_sorted_data(GSL::Vector.alloc(@data.sort))
       end
     end
     # Minimun value
-    def min;
-
-
-
-
-    def summary(labels,out="")
-      out << sprintf("n valid:%d\n",n_valid)
-      out << "median:"+median.to_s+"\n"
-      out << "percentil 25:"+percentil(25).to_s+"\n"
-      out << "percentil 75:"+percentil(75).to_s+"\n"
-      out
-    end
-  end
-  class Scale <Ordinal
-    attr_reader :gsl
-    def initialize(data)
-      # puts "Inicializando Scale..."
-      super(data)
-
-      set_gsl
+    def min;
+      check_type :ordinal
+      @valid_data.min;
     end
-
-    def
-
-
-    def _load(data)
-      @data=Marshal.restore(data)
-      set_gsl
+    # Maximum value
+    def max;
+      check_type :ordinal
+      @valid_data.max;
     end
-
-
-
+
+    def set_scale_data # :nodoc
+      @scale_data=@valid_data.collect{|x|
+        if x.is_a? Numeric
          x
        elsif x.is_a? String and x.to_i==x.to_f
          x.to_i
        else
          x.to_f
        end
-
-
-
-    end
+      }
+      if HAS_GSL
+        @gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
      end
+    end
     # The range of the data (max - min)
-    def range;
+    def range;
+      check_type :scale
+      @scale_data.max - @scale_data.min
+    end
     # The sum of values for the data
     def sum
-
+      check_type :scale
+      @scale_data.inject(0){|a,x|x+a} ; end
     # The arithmetical mean of data
     def mean
+      check_type :scale
+
       sum.to_f.quo(n_valid)
     end
     def sum_of_squares(m=nil)
+      check_type :scale
+
       m||=mean
-      @
+      @scale_data.inject(0){|a,x| a+(x-m).square}
     end
 
     # Sum of squared deviation
     def sum_of_squared_deviation
-
+      check_type :scale
+
+      @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
     end
 
     # Population variance (divided by n)
     def variance_population(m=nil)
+      check_type :scale
+
       m||=mean
-      squares=@
+      squares=@scale_data.inject(0){|a,x| x.square+a}
       squares.quo(n_valid) - m.square
     end
 
 
     # Population Standard deviation (divided by n)
     def standard_deviation_population(m=nil)
+      check_type :scale
+
       Math::sqrt( variance_population(m) )
     end
     # Sample Variance (divided by n-1)
 
     def variance_sample(m=nil)
+      check_type :scale
+
       m||=mean
       sum_of_squares(m).quo(n_valid - 1)
     end
@@ -668,22 +650,30 @@ class Vector < DelegateClass(Array)
     # Sample Standard deviation (divided by n-1)
 
     def standard_deviation_sample(m=nil)
+      check_type :scale
+
       m||=m
       Math::sqrt(variance_sample(m))
     end
     def skew
+      check_type :scale
+
       m=mean
-      thirds=@
-      thirds.quo((@
+      thirds=@scale_data.inject(0){|a,x| a+((x-mean)**3)}
+      thirds.quo((@scale_data.size-1)*sd**3)
     end
     def kurtosis
+      check_type :scale
+
       m=mean
-      thirds=@
-      thirds.quo((@
+      thirds=@scale_data.inject(0){|a,x| a+((x-mean)**4)}
+      thirds.quo((@scale_data.size-1)*sd**4)
 
     end
     def product
-
+      check_type :scale
+
+      @scale_data.inject(1){|a,x| a*x }
     end
     if HAS_GSL
       %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
@@ -691,38 +681,50 @@ class Vector < DelegateClass(Array)
         alias_method m_nuevo, m.intern
       }
       def sum # :nodoc:
+        check_type :scale
+
         @gsl.sum
       end
       def mean # :nodoc:
+        check_type :scale
+
        @gsl.mean
       end
       def variance_sample(m=nil) # :nodoc:
+        check_type :scale
+
        m||=mean
        @gsl.variance_m
       end
       def standard_deviation_sample(m=nil) # :nodoc:
+        check_type :scale
        m||=mean
        @gsl.sd(m)
       end
 
       def variance_population(m=nil) # :nodoc:
+        check_type :scale
        m||=mean
        @gsl.variance_with_fixed_mean(m)
       end
       def standard_deviation_population(m=nil) # :nodoc:
+        check_type :scale
        m||=mean
        @gsl.sd_with_fixed_mean(m)
       end
       def skew
+        check_type :scale
        @gsl.skew
       end
       def kurtosis
+        check_type :scale
        @gsl.kurtosis
       end
       # Create a GSL::Histogram
       # With a fixnum, creates X bins within the range of data
       # With an Array, each value will be a cut point
       def histogram(bins=10)
+        check_type :scale
        if bins.is_a? Array
          h=GSL::Histogram.alloc(bins)
        else
@@ -734,35 +736,18 @@ class Vector < DelegateClass(Array)
       h
     end
     def plot_histogram(bins=10,options="")
+      check_type :scale
      self.histogram(bins).graph(options)
     end
-
-      r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
-      r.sample(@gsl, k).to_a
-    end
-    def sample_without_replacement(k)
-      r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
-      r.choose(@gsl, k).to_a
-    end
+
   end
 
   # Coefficient of variation
   # Calculed with the sample standard deviation
   def coefficient_of_variation
+    check_type :scale
     standard_deviation_sample.quo(mean)
   end
-  def summary(labels,out="")
-    out << sprintf("n valid:%d\n",n_valid)
-    out << "mean:"+mean.to_s+"\n"
-    out << "sum:"+sum.to_s+"\n"
-    out << "range:"+range.to_s+"\n"
-    out << "variance (pop):"+variance_population.to_s+"\n"
-    out << "sd (pop):"+sdp.to_s+"\n"
-    out << "variance (sample):"+variance_sample.to_s+"\n"
-    out << "sd (sample):"+sds.to_s+"\n"
-
-    out
-  end
 
   alias_method :sdp, :standard_deviation_population
   alias_method :sds, :standard_deviation_sample