statsample 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -1
- data/Manifest.txt +3 -0
- data/demo/benchmark.rb +7 -5
- data/demo/regression.rb +30 -4
- data/lib/statsample.rb +23 -15
- data/lib/statsample/bivariate.rb +28 -9
- data/lib/statsample/combination.rb +103 -0
- data/lib/statsample/converters.rb +16 -1
- data/lib/statsample/dataset.rb +29 -11
- data/lib/statsample/dominanceanalysis.rb +15 -11
- data/lib/statsample/dominanceanalysis/bootstrap.rb +9 -7
- data/lib/statsample/graph/svggraph.rb +1 -3
- data/lib/statsample/regression.rb +1 -0
- data/lib/statsample/regression/logit.rb +35 -0
- data/lib/statsample/regression/multiple.rb +21 -2
- data/lib/statsample/regression/multiple/alglibengine.rb +3 -1
- data/lib/statsample/vector.rb +168 -183
- data/test/test_combination.rb +42 -0
- data/test/test_csv.rb +1 -1
- data/test/test_dataset.rb +5 -0
- data/test/test_statistics.rb +19 -2
- data/test/test_svg_graph.rb +5 -2
- data/test/test_vector.rb +6 -1
- metadata +6 -2
@@ -150,17 +150,16 @@ module Statsample
|
|
150
150
|
@models=[]
|
151
151
|
@models_data={}
|
152
152
|
for i in 1..@fields.size
|
153
|
-
c
|
154
|
-
|
155
|
-
|
156
|
-
|
153
|
+
c=Statsample::Combination.new(i,@fields.size)
|
154
|
+
c.each{|data|
|
155
|
+
convert=data.collect {|i|
|
156
|
+
@fields[i]
|
157
|
+
}
|
158
|
+
@models.push(convert)
|
159
|
+
ds_prev=@ds.dup(convert+[@y_var])
|
160
|
+
modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
|
161
|
+
@models_data[convert.sort]=modeldata
|
157
162
|
}
|
158
|
-
@models.push(convert)
|
159
|
-
ds_prev=@ds.dup(convert+[@y_var])
|
160
|
-
modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
|
161
|
-
|
162
|
-
@models_data[convert.sort]=modeldata
|
163
|
-
end while c.next == GSL::SUCCESS
|
164
163
|
end
|
165
164
|
end
|
166
165
|
def summary(report_type=ConsoleSummary)
|
@@ -232,7 +231,12 @@ module Statsample
|
|
232
231
|
@lr.r2
|
233
232
|
end
|
234
233
|
def add_table_row
|
235
|
-
|
234
|
+
begin
|
235
|
+
sign=sprintf("%0.3f", @lr.significance)
|
236
|
+
rescue RuntimeError
|
237
|
+
sign="???"
|
238
|
+
end
|
239
|
+
[@name.join("*"), sprintf("%0.3f",r2), sign] + @fields.collect{|k|
|
236
240
|
v=@contributions[k]
|
237
241
|
if v.nil?
|
238
242
|
"--"
|
@@ -51,26 +51,28 @@ class DominanceAnalysis
|
|
51
51
|
@samples_cd={}
|
52
52
|
@samples_gd={}
|
53
53
|
@pairs=[]
|
54
|
-
c
|
55
|
-
|
56
|
-
convert=
|
54
|
+
c=Statsample::Combination.new(2,@fields.size)
|
55
|
+
c.each{|data|
|
56
|
+
convert=data.collect {|i|
|
57
57
|
@fields[i]
|
58
58
|
}
|
59
59
|
@pairs.push(convert)
|
60
60
|
[@samples_td,@samples_cd,@samples_gd].each{|s|
|
61
61
|
s[convert]=[]
|
62
62
|
}
|
63
|
-
|
64
|
-
|
63
|
+
}
|
64
|
+
end
|
65
65
|
def summary(report_type=ConsoleSummary)
|
66
66
|
out =""
|
67
67
|
raise "You should bootstrap first" if @n_samples==0
|
68
68
|
alfa=0.95
|
69
|
-
t=GSL::Cdf.tdist_Pinv(1-((1-alfa) / 2),@n_samples - 1)
|
70
69
|
out.extend report_type
|
71
70
|
out.add _("Summary for Bootstrap Dominance Analysis of %s on %s\n") % [@fields.join(", "), @y_var]
|
72
71
|
out.add _("Sample size: %d\n") % @n_samples
|
73
|
-
|
72
|
+
if HAS_GSL
|
73
|
+
t=GSL::Cdf.tdist_Pinv(1-((1-alfa) / 2),@n_samples - 1)
|
74
|
+
out.add "t:#{t}\n"
|
75
|
+
end
|
74
76
|
out.add "Linear Regression Engine: #{@lr_class.name}"
|
75
77
|
out.nl
|
76
78
|
table=ReportTable.new
|
@@ -6,7 +6,7 @@ require 'SVG/Graph/Plot'
|
|
6
6
|
require 'statsample/graph/svghistogram'
|
7
7
|
|
8
8
|
module Statsample
|
9
|
-
class
|
9
|
+
class Vector
|
10
10
|
# Creates a chart (bar chart by default) using SVG::Graph
|
11
11
|
def svggraph_frequencies(file, width=600, height=300, chart_type=SVG::Graph::BarNoOp, options={})
|
12
12
|
labels,data=[],[]
|
@@ -26,8 +26,6 @@ module Statsample
|
|
26
26
|
f.puts(graph.burn)
|
27
27
|
}
|
28
28
|
end
|
29
|
-
end
|
30
|
-
class Scale < Ordinal
|
31
29
|
def svggraph_histogram(bins, options={})
|
32
30
|
options={:graph_title=>"Histogram", :show_graph_title=>true,:show_normal=>true, :mean=>self.mean, :sigma=>sdp }.merge! options
|
33
31
|
graph = Statsample::Graph::SvgHistogram.new(options)
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Regression
|
3
|
+
class Logit
|
4
|
+
def initialize(ds,y_var)
|
5
|
+
@ds=ds
|
6
|
+
@y_var=y_var
|
7
|
+
end
|
8
|
+
def vp(x1,x2)
|
9
|
+
sum=0
|
10
|
+
x1.each_index{|i|
|
11
|
+
sum+=x1[i]*x2[i]
|
12
|
+
}
|
13
|
+
sum
|
14
|
+
end
|
15
|
+
# F(B'Xi)
|
16
|
+
def f(b,x)
|
17
|
+
Math::exp(vp(b,x)) / (1+Math::exp(vp(b,x)))
|
18
|
+
end
|
19
|
+
# f(B'Xi)
|
20
|
+
def fa(b,x)
|
21
|
+
f(b,x)*(1-f(b,x))
|
22
|
+
end
|
23
|
+
def l(b)
|
24
|
+
prod=1
|
25
|
+
y=@ds[@y_var]
|
26
|
+
@ds.each_array{|x|
|
27
|
+
x.unshift(1) # add constant
|
28
|
+
l=(f(b,x)**y[@ds.i])*((1.0-f(b,x))**(1.0-y[@ds.i]))
|
29
|
+
prod=prod*l
|
30
|
+
}
|
31
|
+
prod
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -194,13 +194,32 @@ out.add_line
|
|
194
194
|
out.add "ANOVA TABLE"
|
195
195
|
|
196
196
|
t=Statsample::ReportTable.new(%w{source ss df ms f s})
|
197
|
-
|
198
|
-
|
197
|
+
begin
|
198
|
+
t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
|
199
|
+
rescue RuntimeError
|
200
|
+
t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), "???", "???"])
|
201
|
+
end
|
199
202
|
t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
|
200
203
|
|
201
204
|
t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
|
202
205
|
|
203
206
|
out.parse_table(t)
|
207
|
+
|
208
|
+
begin
|
209
|
+
out.add "Beta coefficientes"
|
210
|
+
sc=standarized_coeffs
|
211
|
+
cse=coeffs_se
|
212
|
+
t=Statsample::ReportTable.new(%w{coeff beta se t})
|
213
|
+
t.add_row(["Constant", "-",constant_se, constant_t])
|
214
|
+
@fields.each{|f|
|
215
|
+
t.add_row([f, sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
|
216
|
+
}
|
217
|
+
out.parse_table(t)
|
218
|
+
|
219
|
+
rescue
|
220
|
+
|
221
|
+
end
|
222
|
+
|
204
223
|
out
|
205
224
|
end
|
206
225
|
def assign_names(c)
|
@@ -37,6 +37,8 @@ class AlglibEngine < BaseEngine
|
|
37
37
|
matrix=Matrix.columns(columns)
|
38
38
|
@lr_s=nil
|
39
39
|
@lr=::Alglib::LinearRegression.build_from_matrix(matrix)
|
40
|
+
@coeffs=assign_names(@lr.coeffs)
|
41
|
+
|
40
42
|
end
|
41
43
|
|
42
44
|
def _dump(i)
|
@@ -48,7 +50,7 @@ class AlglibEngine < BaseEngine
|
|
48
50
|
end
|
49
51
|
|
50
52
|
def coeffs
|
51
|
-
|
53
|
+
@coeffs
|
52
54
|
end
|
53
55
|
# Coefficients using a constant
|
54
56
|
# Based on http://www.xycoon.com/ols1.htm
|
data/lib/statsample/vector.rb
CHANGED
@@ -39,11 +39,11 @@ module Statsample
|
|
39
39
|
ds=Statsample::Dataset.new(h).dup_only_valid
|
40
40
|
ds.vectors.values
|
41
41
|
end
|
42
|
-
|
43
|
-
|
42
|
+
|
43
|
+
class Vector
|
44
44
|
include Enumerable
|
45
|
-
attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils
|
46
|
-
|
45
|
+
attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils, :gsl
|
46
|
+
attr_accessor :labels
|
47
47
|
# Creates a new Vector
|
48
48
|
# data = Array of data
|
49
49
|
# t = level of measurement. Could be:
|
@@ -61,9 +61,9 @@ class Vector < DelegateClass(Array)
|
|
61
61
|
@data_with_nils=[]
|
62
62
|
@missing_data=[]
|
63
63
|
@has_missing_data=nil
|
64
|
-
|
64
|
+
@scale_data=nil
|
65
|
+
set_valid_data_intern
|
65
66
|
self.type=t
|
66
|
-
super(@delegate)
|
67
67
|
end
|
68
68
|
def dup
|
69
69
|
Vector.new(@data.dup,@type,@missing_values.dup,@labels.dup)
|
@@ -78,23 +78,27 @@ class Vector < DelegateClass(Array)
|
|
78
78
|
def vector_standarized_pop
|
79
79
|
vector_standarized(true)
|
80
80
|
end
|
81
|
-
|
81
|
+
def check_type(t)
|
82
|
+
raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal)
|
83
|
+
end
|
82
84
|
# Returns a vector using the standardized values of the data
|
83
85
|
# with sd with denominator n-1
|
84
86
|
|
85
87
|
def vector_standarized(use_population=false)
|
86
88
|
raise "Should be a scale" unless @type==:scale
|
87
|
-
mean
|
88
|
-
sd=use_population ?
|
89
|
+
m=mean
|
90
|
+
sd=use_population ? sdp : sds
|
89
91
|
@data_with_nils.collect{|x|
|
90
92
|
if !x.nil?
|
91
|
-
(x.to_f -
|
93
|
+
(x.to_f - m).quo(sd)
|
92
94
|
else
|
93
95
|
nil
|
94
96
|
end
|
95
97
|
}.to_vector(:scale)
|
96
98
|
end
|
99
|
+
|
97
100
|
alias_method :standarized, :vector_standarized
|
101
|
+
|
98
102
|
def box_cox_transformation(lambda)
|
99
103
|
raise "Should be a scale" unless @type==:scale
|
100
104
|
@data_with_nils.collect{|x|
|
@@ -116,6 +120,7 @@ class Vector < DelegateClass(Array)
|
|
116
120
|
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
117
121
|
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
|
118
122
|
end
|
123
|
+
|
119
124
|
def _dump(i)
|
120
125
|
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
|
121
126
|
end
|
@@ -155,24 +160,31 @@ class Vector < DelegateClass(Array)
|
|
155
160
|
@valid_data.clear
|
156
161
|
@missing_data.clear
|
157
162
|
@data_with_nils.clear
|
158
|
-
|
159
|
-
|
163
|
+
@gsl=nil
|
164
|
+
set_valid_data_intern
|
165
|
+
set_scale_data if(@type==:scale)
|
160
166
|
end
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
167
|
+
|
168
|
+
if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
|
169
|
+
def set_valid_data_intern
|
170
|
+
Statsample::STATSAMPLE__.set_valid_data_intern(self)
|
171
|
+
end
|
172
|
+
else
|
173
|
+
def set_valid_data_intern
|
174
|
+
_set_valid_data_intern
|
175
|
+
end
|
176
|
+
end
|
177
|
+
def _set_valid_data_intern
|
165
178
|
@data.each do |n|
|
166
|
-
|
179
|
+
if is_valid? n
|
167
180
|
@valid_data.push(n)
|
168
181
|
@data_with_nils.push(n)
|
169
|
-
|
182
|
+
else
|
170
183
|
@data_with_nils.push(nil)
|
171
184
|
@missing_data.push(n)
|
172
|
-
|
173
|
-
end
|
174
|
-
@has_missing_data=@missing_data.size>0
|
185
|
+
end
|
175
186
|
end
|
187
|
+
@has_missing_data=@missing_data.size>0
|
176
188
|
end
|
177
189
|
# Returns true if the data has one or more missing values
|
178
190
|
def has_missing_data?
|
@@ -212,29 +224,13 @@ class Vector < DelegateClass(Array)
|
|
212
224
|
end
|
213
225
|
# Set level of measurement.
|
214
226
|
def type=(t)
|
215
|
-
|
216
|
-
|
217
|
-
@delegate=Nominal.new(@valid_data)
|
218
|
-
when :ordinal
|
219
|
-
@delegate=Ordinal.new(@valid_data)
|
220
|
-
when :scale
|
221
|
-
@delegate=Scale.new(@valid_data)
|
222
|
-
else
|
223
|
-
raise "Type doesn't exists"
|
224
|
-
end
|
225
|
-
__setobj__(@delegate)
|
226
|
-
@type=t
|
227
|
+
@type=t
|
228
|
+
set_scale_data if(t==:scale)
|
227
229
|
end
|
228
230
|
def n; @data.size ; end
|
229
231
|
def to_a
|
230
232
|
@data.dup
|
231
|
-
|
232
|
-
# Redundant, but necessary
|
233
|
-
# Spreadsheet creates Array#sum, so calling sum
|
234
|
-
# doesn't call the delegates method
|
235
|
-
def sum
|
236
|
-
@delegate.sum
|
237
|
-
end
|
233
|
+
end
|
238
234
|
alias_method :to_ary, :to_a
|
239
235
|
# Vector sum.
|
240
236
|
# - If v is a scalar, add this value to all elements
|
@@ -357,7 +353,13 @@ class Vector < DelegateClass(Array)
|
|
357
353
|
# In all the trials, every item has the same probability
|
358
354
|
# of being selected
|
359
355
|
def sample_with_replacement(sample=1)
|
360
|
-
|
356
|
+
if(@type!=:scale)
|
357
|
+
vds=@valid_data.size
|
358
|
+
(0...sample).collect{ @valid_data[rand(vds)] }
|
359
|
+
else
|
360
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
361
|
+
r.sample(@gsl, sample).to_a
|
362
|
+
end
|
361
363
|
end
|
362
364
|
# Returns a random sample of size n, without replacement,
|
363
365
|
# only with valid data.
|
@@ -366,9 +368,20 @@ class Vector < DelegateClass(Array)
|
|
366
368
|
# A sample of the same size of the vector is the vector itself
|
367
369
|
|
368
370
|
def sample_without_replacement(sample=1)
|
369
|
-
|
371
|
+
if(@type!=:scale)
|
372
|
+
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
|
373
|
+
out=[]
|
374
|
+
size=@valid_data.size
|
375
|
+
while out.size<sample
|
376
|
+
value=rand(size)
|
377
|
+
out.push(value) if !out.include?value
|
378
|
+
end
|
379
|
+
out.collect{|i|@data[i]}
|
380
|
+
else
|
381
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
382
|
+
r.choose(@gsl, sample).to_a
|
383
|
+
end
|
370
384
|
end
|
371
|
-
|
372
385
|
def count(x=false)
|
373
386
|
if block_given?
|
374
387
|
r=@data.inject(0) {|s, i|
|
@@ -401,41 +414,37 @@ class Vector < DelegateClass(Array)
|
|
401
414
|
true
|
402
415
|
end
|
403
416
|
end
|
404
|
-
def summary(out="")
|
405
|
-
@delegate.summary(@labels,out)
|
406
|
-
end
|
407
417
|
def to_s
|
408
418
|
sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
|
409
419
|
end
|
410
420
|
def inspect
|
411
421
|
self.to_s
|
412
422
|
end
|
413
|
-
|
414
|
-
end
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
class Nominal
|
419
|
-
def initialize(data)
|
420
|
-
@data=data
|
421
|
-
# @factors=data.uniq
|
422
|
-
end
|
423
|
-
def delegate_data
|
424
|
-
@data
|
425
|
-
end
|
426
|
-
# Return an array of the different values of the data
|
427
423
|
def factors
|
428
|
-
@
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
424
|
+
if @type==:scale
|
425
|
+
@scale_data.uniq.sort
|
426
|
+
else
|
427
|
+
@valid_data.uniq.sort
|
428
|
+
end
|
429
|
+
end
|
430
|
+
if Statsample::STATSAMPLE__.respond_to?(:frequencies)
|
431
|
+
# Returns a hash with the distribution of frequencies of
|
432
|
+
# the sample
|
433
|
+
def frequencies
|
434
|
+
Statsample::STATSAMPLE__.frequencies(@valid_data)
|
435
|
+
end
|
436
|
+
else
|
437
|
+
def frequencies
|
438
|
+
_frequencies
|
439
|
+
end
|
438
440
|
end
|
441
|
+
def _frequencies
|
442
|
+
@valid_data.inject(Hash.new) {|a,x|
|
443
|
+
a[x]||=0
|
444
|
+
a[x]=a[x]+1
|
445
|
+
a
|
446
|
+
}
|
447
|
+
end
|
439
448
|
# Plot frequencies on a chart, using gnuplot
|
440
449
|
def plot_frequencies
|
441
450
|
require 'gnuplot'
|
@@ -469,21 +478,21 @@ class Vector < DelegateClass(Array)
|
|
469
478
|
end
|
470
479
|
# The number of items with valid data
|
471
480
|
def n_valid
|
472
|
-
@
|
481
|
+
@valid_data.size
|
473
482
|
end
|
474
483
|
# Returns a hash with the distribution of proportions of
|
475
484
|
# the sample
|
476
485
|
def proportions
|
477
486
|
frequencies.inject({}){|a,v|
|
478
|
-
a[v[0]] = v[1].quo(
|
487
|
+
a[v[0]] = v[1].quo(n_valid)
|
479
488
|
a
|
480
489
|
}
|
481
490
|
end
|
482
491
|
# Proportion of a given value.
|
483
492
|
def proportion(v=1)
|
484
|
-
frequencies[v].quo(@
|
493
|
+
frequencies[v].quo(@valid_data.size)
|
485
494
|
end
|
486
|
-
def summary(
|
495
|
+
def summary(out="")
|
487
496
|
out << sprintf("n valid:%d\n",n_valid)
|
488
497
|
out << sprintf("factors:%s\n",factors.join(","))
|
489
498
|
out << "mode:"+mode.to_s+"\n"
|
@@ -492,47 +501,32 @@ class Vector < DelegateClass(Array)
|
|
492
501
|
key=labels.has_key?(k) ? labels[k]:k
|
493
502
|
out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
|
494
503
|
}
|
504
|
+
if(@type==:ordinal)
|
505
|
+
out << "median:"+median.to_s+"\n"
|
506
|
+
end
|
507
|
+
if(@type==:scale)
|
508
|
+
out << "mean:"+mean.to_s+"\n"
|
509
|
+
out << "sd:"+sd.to_s+"\n"
|
510
|
+
|
511
|
+
end
|
495
512
|
out
|
496
513
|
end
|
497
514
|
|
498
|
-
|
499
|
-
# only with valid data.
|
500
|
-
#
|
501
|
-
# In all the trails, every item have the same probability
|
502
|
-
# of been selected
|
503
|
-
def sample_with_replacement(sample)
|
504
|
-
(0...sample).collect{ @data[rand(@data.size)] }
|
505
|
-
end
|
506
|
-
# Returns an random sample of size n, without replacement,
|
507
|
-
# only with valid data.
|
508
|
-
#
|
509
|
-
# Every element could only be selected once
|
510
|
-
# A sample of the same size of the vector is the vector itself
|
511
|
-
|
512
|
-
def sample_without_replacement(sample)
|
513
|
-
raise ArgumentError, "Sample size couldn't be greater than n" if sample>@data.size
|
514
|
-
out=[]
|
515
|
-
size=@data.size
|
516
|
-
while out.size<sample
|
517
|
-
value=rand(size)
|
518
|
-
out.push(value) if !out.include?value
|
519
|
-
end
|
520
|
-
out.collect{|i|@data[i]}
|
521
|
-
end
|
515
|
+
|
522
516
|
|
523
517
|
|
524
518
|
# Variance of p, according to population size
|
525
519
|
def variance_proportion(n_poblation, v=1)
|
526
|
-
Statsample::proportion_variance_sample(self.proportion(v), @
|
520
|
+
Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
527
521
|
end
|
528
522
|
def variance_total(n_poblation, v=1)
|
529
|
-
Statsample::total_variance_sample(self.proportion(v), @
|
523
|
+
Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
|
530
524
|
end
|
531
525
|
def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
|
532
|
-
Statsample::proportion_confidence_interval_t(proportion(v), @
|
526
|
+
Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
|
533
527
|
end
|
534
528
|
def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
|
535
|
-
Statsample::proportion_confidence_interval_z(proportion(v), @
|
529
|
+
Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
|
536
530
|
end
|
537
531
|
self.instance_methods.find_all{|met| met=~/_slow$/}.each{|met|
|
538
532
|
met_or=met.gsub("_slow","")
|
@@ -540,12 +534,11 @@ class Vector < DelegateClass(Array)
|
|
540
534
|
alias_method met_or, met
|
541
535
|
end
|
542
536
|
}
|
543
|
-
|
544
|
-
|
545
|
-
class Ordinal <Nominal
|
537
|
+
# Ordinal Methods
|
546
538
|
# Returns the value of percentile q
|
547
539
|
def percentil(q)
|
548
|
-
|
540
|
+
check_type :ordinal
|
541
|
+
sorted=@valid_data.sort
|
549
542
|
v= (n_valid * q).quo(100)
|
550
543
|
if(v.to_i!=v)
|
551
544
|
sorted[v.to_i]
|
@@ -555,6 +548,7 @@ class Vector < DelegateClass(Array)
|
|
555
548
|
end
|
556
549
|
# Returns a ranked vector
|
557
550
|
def ranked(type=:ordinal)
|
551
|
+
check_type :ordinal
|
558
552
|
i=0
|
559
553
|
r=frequencies.sort.inject({}){|a,v|
|
560
554
|
a[v[0]]=(i+1 + i+v[1]).quo(2)
|
@@ -567,100 +561,88 @@ class Vector < DelegateClass(Array)
|
|
567
561
|
end
|
568
562
|
# Returns the median (50th percentile)
|
569
563
|
def median
|
564
|
+
check_type :ordinal
|
565
|
+
if HAS_GSL and @type==:scale
|
566
|
+
GSL::Stats::median_from_sorted_data(@gsl)
|
567
|
+
else
|
570
568
|
percentil(50)
|
571
|
-
end
|
572
|
-
if HAS_GSL
|
573
|
-
%w{median}.each{|m|
|
574
|
-
m_nuevo=(m+"_slow").intern
|
575
|
-
alias_method m_nuevo, m.intern
|
576
|
-
}
|
577
|
-
|
578
|
-
#def percentil(p)
|
579
|
-
# v=GSL::Vector.alloc(@data.sort)
|
580
|
-
# v.stats_quantile_from_sorted_data(p)
|
581
|
-
#end
|
582
|
-
def median # :nodoc:
|
583
|
-
GSL::Stats::median_from_sorted_data(GSL::Vector.alloc(@data.sort))
|
584
569
|
end
|
585
570
|
end
|
586
571
|
# Minimum value
|
587
|
-
def min;
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
def summary(labels,out="")
|
593
|
-
out << sprintf("n valid:%d\n",n_valid)
|
594
|
-
out << "median:"+median.to_s+"\n"
|
595
|
-
out << "percentil 25:"+percentil(25).to_s+"\n"
|
596
|
-
out << "percentil 75:"+percentil(75).to_s+"\n"
|
597
|
-
out
|
598
|
-
end
|
599
|
-
end
|
600
|
-
class Scale <Ordinal
|
601
|
-
attr_reader :gsl
|
602
|
-
def initialize(data)
|
603
|
-
# puts "Inicializando Scale..."
|
604
|
-
super(data)
|
605
|
-
|
606
|
-
set_gsl
|
572
|
+
def min;
|
573
|
+
check_type :ordinal
|
574
|
+
@valid_data.min;
|
607
575
|
end
|
608
|
-
|
609
|
-
def
|
610
|
-
|
611
|
-
|
612
|
-
def _load(data)
|
613
|
-
@data=Marshal.restore(data)
|
614
|
-
set_gsl
|
576
|
+
# Maximum value
|
577
|
+
def max;
|
578
|
+
check_type :ordinal
|
579
|
+
@valid_data.max;
|
615
580
|
end
|
616
|
-
|
617
|
-
|
618
|
-
|
581
|
+
|
582
|
+
def set_scale_data # :nodoc
|
583
|
+
@scale_data=@valid_data.collect{|x|
|
584
|
+
if x.is_a? Numeric
|
619
585
|
x
|
620
586
|
elsif x.is_a? String and x.to_i==x.to_f
|
621
587
|
x.to_i
|
622
588
|
else
|
623
589
|
x.to_f
|
624
590
|
end
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
end
|
591
|
+
}
|
592
|
+
if HAS_GSL
|
593
|
+
@gsl=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
|
629
594
|
end
|
595
|
+
end
|
630
596
|
# The range of the data (max - min)
|
631
|
-
def range;
|
597
|
+
def range;
|
598
|
+
check_type :scale
|
599
|
+
@scale_data.max - @scale_data.min
|
600
|
+
end
|
632
601
|
# The sum of values for the data
|
633
602
|
def sum
|
634
|
-
|
603
|
+
check_type :scale
|
604
|
+
@scale_data.inject(0){|a,x|x+a} ; end
|
635
605
|
# The arithmetical mean of data
|
636
606
|
def mean
|
607
|
+
check_type :scale
|
608
|
+
|
637
609
|
sum.to_f.quo(n_valid)
|
638
610
|
end
|
639
611
|
def sum_of_squares(m=nil)
|
612
|
+
check_type :scale
|
613
|
+
|
640
614
|
m||=mean
|
641
|
-
@
|
615
|
+
@scale_data.inject(0){|a,x| a+(x-m).square}
|
642
616
|
end
|
643
617
|
|
644
618
|
# Sum of squared deviation
|
645
619
|
def sum_of_squared_deviation
|
646
|
-
|
620
|
+
check_type :scale
|
621
|
+
|
622
|
+
@scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
|
647
623
|
end
|
648
624
|
|
649
625
|
# Population variance (divided by n)
|
650
626
|
def variance_population(m=nil)
|
627
|
+
check_type :scale
|
628
|
+
|
651
629
|
m||=mean
|
652
|
-
squares=@
|
630
|
+
squares=@scale_data.inject(0){|a,x| x.square+a}
|
653
631
|
squares.quo(n_valid) - m.square
|
654
632
|
end
|
655
633
|
|
656
634
|
|
657
635
|
# Population Standard deviation (divided by n)
|
658
636
|
def standard_deviation_population(m=nil)
|
637
|
+
check_type :scale
|
638
|
+
|
659
639
|
Math::sqrt( variance_population(m) )
|
660
640
|
end
|
661
641
|
# Sample Variance (divided by n-1)
|
662
642
|
|
663
643
|
def variance_sample(m=nil)
|
644
|
+
check_type :scale
|
645
|
+
|
664
646
|
m||=mean
|
665
647
|
sum_of_squares(m).quo(n_valid - 1)
|
666
648
|
end
|
@@ -668,22 +650,30 @@ class Vector < DelegateClass(Array)
|
|
668
650
|
# Sample Standard deviation (divided by n-1)
|
669
651
|
|
670
652
|
def standard_deviation_sample(m=nil)
|
653
|
+
check_type :scale
|
654
|
+
|
671
655
|
m||=m
|
672
656
|
Math::sqrt(variance_sample(m))
|
673
657
|
end
|
674
658
|
def skew
|
659
|
+
check_type :scale
|
660
|
+
|
675
661
|
m=mean
|
676
|
-
thirds=@
|
677
|
-
thirds.quo((@
|
662
|
+
thirds=@scale_data.inject(0){|a,x| a+((x-mean)**3)}
|
663
|
+
thirds.quo((@scale_data.size-1)*sd**3)
|
678
664
|
end
|
679
665
|
def kurtosis
|
666
|
+
check_type :scale
|
667
|
+
|
680
668
|
m=mean
|
681
|
-
thirds=@
|
682
|
-
thirds.quo((@
|
669
|
+
thirds=@scale_data.inject(0){|a,x| a+((x-mean)**4)}
|
670
|
+
thirds.quo((@scale_data.size-1)*sd**4)
|
683
671
|
|
684
672
|
end
|
685
673
|
def product
|
686
|
-
|
674
|
+
check_type :scale
|
675
|
+
|
676
|
+
@scale_data.inject(1){|a,x| a*x }
|
687
677
|
end
|
688
678
|
if HAS_GSL
|
689
679
|
%w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
|
@@ -691,38 +681,50 @@ class Vector < DelegateClass(Array)
|
|
691
681
|
alias_method m_nuevo, m.intern
|
692
682
|
}
|
693
683
|
def sum # :nodoc:
|
684
|
+
check_type :scale
|
685
|
+
|
694
686
|
@gsl.sum
|
695
687
|
end
|
696
688
|
def mean # :nodoc:
|
689
|
+
check_type :scale
|
690
|
+
|
697
691
|
@gsl.mean
|
698
692
|
end
|
699
693
|
def variance_sample(m=nil) # :nodoc:
|
694
|
+
check_type :scale
|
695
|
+
|
700
696
|
m||=mean
|
701
697
|
@gsl.variance_m
|
702
698
|
end
|
703
699
|
def standard_deviation_sample(m=nil) # :nodoc:
|
700
|
+
check_type :scale
|
704
701
|
m||=mean
|
705
702
|
@gsl.sd(m)
|
706
703
|
end
|
707
704
|
|
708
705
|
def variance_population(m=nil) # :nodoc:
|
706
|
+
check_type :scale
|
709
707
|
m||=mean
|
710
708
|
@gsl.variance_with_fixed_mean(m)
|
711
709
|
end
|
712
710
|
def standard_deviation_population(m=nil) # :nodoc:
|
711
|
+
check_type :scale
|
713
712
|
m||=mean
|
714
713
|
@gsl.sd_with_fixed_mean(m)
|
715
714
|
end
|
716
715
|
def skew
|
716
|
+
check_type :scale
|
717
717
|
@gsl.skew
|
718
718
|
end
|
719
719
|
def kurtosis
|
720
|
+
check_type :scale
|
720
721
|
@gsl.kurtosis
|
721
722
|
end
|
722
723
|
# Create a GSL::Histogram
|
723
724
|
# With a fixnum, creates X bins within the range of data
|
724
725
|
# With an Array, each value will be a cut point
|
725
726
|
def histogram(bins=10)
|
727
|
+
check_type :scale
|
726
728
|
if bins.is_a? Array
|
727
729
|
h=GSL::Histogram.alloc(bins)
|
728
730
|
else
|
@@ -734,35 +736,18 @@ class Vector < DelegateClass(Array)
|
|
734
736
|
h
|
735
737
|
end
|
736
738
|
def plot_histogram(bins=10,options="")
|
739
|
+
check_type :scale
|
737
740
|
self.histogram(bins).graph(options)
|
738
741
|
end
|
739
|
-
|
740
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
741
|
-
r.sample(@gsl, k).to_a
|
742
|
-
end
|
743
|
-
def sample_without_replacement(k)
|
744
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
|
745
|
-
r.choose(@gsl, k).to_a
|
746
|
-
end
|
742
|
+
|
747
743
|
end
|
748
744
|
|
749
745
|
# Coefficient of variation
|
750
746
|
# Calculated with the sample standard deviation
|
751
747
|
def coefficient_of_variation
|
748
|
+
check_type :scale
|
752
749
|
standard_deviation_sample.quo(mean)
|
753
750
|
end
|
754
|
-
def summary(labels,out="")
|
755
|
-
out << sprintf("n valid:%d\n",n_valid)
|
756
|
-
out << "mean:"+mean.to_s+"\n"
|
757
|
-
out << "sum:"+sum.to_s+"\n"
|
758
|
-
out << "range:"+range.to_s+"\n"
|
759
|
-
out << "variance (pop):"+variance_population.to_s+"\n"
|
760
|
-
out << "sd (pop):"+sdp.to_s+"\n"
|
761
|
-
out << "variance (sample):"+variance_sample.to_s+"\n"
|
762
|
-
out << "sd (sample):"+sds.to_s+"\n"
|
763
|
-
|
764
|
-
out
|
765
|
-
end
|
766
751
|
|
767
752
|
alias_method :sdp, :standard_deviation_population
|
768
753
|
alias_method :sds, :standard_deviation_sample
|