statsample 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +79 -0
- data/Manifest.txt +56 -0
- data/README.txt +77 -0
- data/Rakefile +22 -0
- data/bin/statsample +2 -0
- data/demo/benchmark.rb +52 -0
- data/demo/chi-square.rb +44 -0
- data/demo/dice.rb +13 -0
- data/demo/distribution_t.rb +95 -0
- data/demo/graph.rb +9 -0
- data/demo/item_analysis.rb +30 -0
- data/demo/mean.rb +81 -0
- data/demo/proportion.rb +57 -0
- data/demo/sample_test.csv +113 -0
- data/demo/strata_proportion.rb +152 -0
- data/demo/stratum.rb +141 -0
- data/lib/spss.rb +131 -0
- data/lib/statsample.rb +216 -0
- data/lib/statsample/anova.rb +74 -0
- data/lib/statsample/bivariate.rb +255 -0
- data/lib/statsample/chidistribution.rb +39 -0
- data/lib/statsample/codification.rb +120 -0
- data/lib/statsample/converters.rb +338 -0
- data/lib/statsample/crosstab.rb +122 -0
- data/lib/statsample/dataset.rb +526 -0
- data/lib/statsample/dominanceanalysis.rb +259 -0
- data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
- data/lib/statsample/graph/gdchart.rb +45 -0
- data/lib/statsample/graph/svgboxplot.rb +108 -0
- data/lib/statsample/graph/svggraph.rb +181 -0
- data/lib/statsample/graph/svghistogram.rb +208 -0
- data/lib/statsample/graph/svgscatterplot.rb +111 -0
- data/lib/statsample/htmlreport.rb +232 -0
- data/lib/statsample/multiset.rb +281 -0
- data/lib/statsample/regression.rb +522 -0
- data/lib/statsample/reliability.rb +235 -0
- data/lib/statsample/resample.rb +20 -0
- data/lib/statsample/srs.rb +159 -0
- data/lib/statsample/test.rb +25 -0
- data/lib/statsample/vector.rb +759 -0
- data/test/_test_chart.rb +58 -0
- data/test/test_anova.rb +31 -0
- data/test/test_codification.rb +59 -0
- data/test/test_crosstab.rb +55 -0
- data/test/test_csv.csv +7 -0
- data/test/test_csv.rb +27 -0
- data/test/test_dataset.rb +293 -0
- data/test/test_ggobi.rb +42 -0
- data/test/test_multiset.rb +98 -0
- data/test/test_regression.rb +108 -0
- data/test/test_reliability.rb +32 -0
- data/test/test_resample.rb +23 -0
- data/test/test_srs.rb +14 -0
- data/test/test_statistics.rb +152 -0
- data/test/test_stratified.rb +19 -0
- data/test/test_svg_graph.rb +63 -0
- data/test/test_vector.rb +265 -0
- data/test/test_xls.rb +32 -0
- metadata +158 -0
@@ -0,0 +1,235 @@
|
|
1
|
+
module Statsample
  # Scale-reliability statistics: Cronbach's alpha and classical item analysis.
  module Reliability
    class << self
      # Calculate Cronbach's alpha for a given dataset.
      # Only uses tuples without missing data.
      def cronbach_alpha(ods)
        ds=ods.dup_only_valid
        n_items=ds.fields.size
        # Sum of the sample variances of every item vector.
        sum_var_items=ds.vectors.inject(0) {|ac,v|
          ac+v[1].variance_sample
        }
        total=ds.vector_sum
        # Classic alpha formula: (k/(k-1)) * (1 - sum(var_i)/var(total)).
        (n_items / (n_items-1).to_f) * (1-(sum_var_items/ total.variance_sample))
      end
      # Calculate Cronbach's alpha for a given dataset
      # using standarized values for every vector.
      # Only uses tuples without missing data.

      def cronbach_alpha_standarized(ods)
        # Build a new dataset of z-scored vectors, then reuse cronbach_alpha.
        ds=ods.fields.inject({}){|a,f|
          a[f]=ods[f].vector_standarized
          a
        }.to_dataset
        cronbach_alpha(ds)
      end
    end

    # Empirical item characteristic curve: for each observed total score,
    # the proportion of cases giving each response on each item.
    class ItemCharacteristicCurve
      attr_reader :totals, :counts,:vector_total
      # ds:: dataset of item vectors.
      # vector_total:: optional vector of total scores; defaults to the
      #                row-wise sum of +ds+. Must have one entry per case.
      def initialize (ds, vector_total=nil)
        vector_total||=ds.vector_sum
        raise "Total size != Dataset size" if vector_total.size!=ds.cases
        @vector_total=vector_total
        @ds=ds
        @totals={}
        @counts=@ds.fields.inject({}) {|a,v| a[v]={};a}
        process
      end
      # Tally, per total score, the number of cases obtaining it and how
      # often each item response (stringified) occurs at that total.
      def process
        i=0
        @ds.each{|row|
          tot=@vector_total[i]
          @totals[tot]||=0
          @totals[tot]+=1
          @ds.fields.each {|f|
            item=row[f].to_s
            @counts[f][tot]||={}
            @counts[f][tot][item]||=0
            @counts[f][tot][item] += 1
          }
          i+=1
        }
      end
      # Proportion of cases that answered +item+ on +field+, keyed by
      # total score. Missing combinations count as 0.
      def curve_field(field, item)
        out={}
        item=item.to_s
        @totals.each{|value,n|
          count_value= @counts[field][value][item].nil? ? 0 : @counts[field][value][item]
          out[value]=count_value.to_f/n.to_f
        }
        out
      end
    end
    # Summary statistics and classical item analysis for a scale
    # (mean, sd, alpha, item-total correlations, alpha-if-deleted, ...).
    class ItemAnalysis
      attr_reader :mean, :sd,:valid_n, :alpha , :alpha_standarized
      def initialize(ds)
        @ds=ds.dup_only_valid
        @total=@ds.vector_sum
        @mean=@total.mean
        @median=@total.median
        @skew=@total.skew
        @kurtosis=@total.kurtosis
        # NOTE(review): sd uses the population denominator (sdp) — confirm intended.
        @sd=@total.sdp
        @valid_n=@total.size
        begin
          @alpha=Statsample::Reliability.cronbach_alpha(ds)
          @alpha_standarized=Statsample::Reliability.cronbach_alpha_standarized(ds)
        rescue => e
          # Wrap any calculation failure with the offending dataset attached.
          raise DatasetException.new(@ds,e), "Problem on calculate alpha"
        end
      end
      # Returns a hash with structure field => { total_score => mean item score }.
      def item_characteristic_curve
        i=0
        out={}
        total={}
        @ds.each{|row|
          tot=@total[i]
          @ds.fields.each {|f|
            out[f]||= {}
            total[f]||={}
            out[f][tot]||= 0
            total[f][tot]||=0
            out[f][tot]+= row[f]
            total[f][tot]+=1
          }
          i+=1
        }
        # Convert accumulated sums into means per total score.
        total.each{|f,var|
          var.each{|tot,v|
            out[f][tot]=out[f][tot].to_f / total[f][tot]
          }
        }
        out
      end
      # Plot one item characteristic curve per field using gnuplot.
      # +directory+ and +base+ are accepted but unused here — TODO confirm
      # whether output was meant to be written to files.
      def gnuplot_item_characteristic_curve(directory, base="crd",options={})
        require 'gnuplot'

        crd=item_characteristic_curve
        @ds.fields.each {|f|
          x=[]
          y=[]
          Gnuplot.open do |gp|
            Gnuplot::Plot.new( gp ) do |plot|
              crd[f].sort.each{|tot,prop|
                x.push(tot)
                # Truncate proportion to two decimals.
                y.push((prop*100).to_i.to_f/100)
              }
              plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
                ds.with = "linespoints"
                ds.notitle
              end

            end
          end
        }

      end
      # Write one SVG plot per field to directory/base_<field>.svg,
      # with one data series per response factor.
      def svggraph_item_characteristic_curve(directory, base="icc",options={})
        require 'statsample/graph/svggraph'
        crd=ItemCharacteristicCurve.new(@ds)
        @ds.fields.each {|f|
          factors=@ds[f].factors.sort
          options={
            :height=>500,
            :width=>800,
            :key=>true
          }.update(options)
          graph = ::SVG::Graph::Plot.new(options)
          factors.each{|factor|
            factor=factor.to_s
            dataset=[]
            crd.curve_field(f, factor).each{|tot,prop|
              # Flat [x1,y1,x2,y2,...] list, as SVG::Graph::Plot expects.
              dataset.push(tot)
              dataset.push((prop*100).to_i.to_f/100)
            }
            graph.add_data({
              :title=>"#{factor}",
              :data=>dataset
            })
          }
          File.open(directory+"/"+base+"_#{f}.svg","w") {|fp|
            fp.puts(graph.burn())
          }
        }

      end
      # Pearson correlation of each item with the total of the remaining items.
      def item_total_correlation
        @ds.fields.inject({}) do |a,v|
          vector=@ds[v].dup
          ds2=@ds.dup
          ds2.delete_vector(v)
          total=ds2.vector_sum
          a[v]=Statsample::Bivariate.pearson(vector,total)
          a
        end
      end
      # Mean and sample sd for each item vector.
      def item_statistics
        @ds.fields.inject({}) do |a,v|
          a[v]={:mean=>@ds[v].mean,:sds=>@ds[v].sds}
          a
        end
      end

      # Scale statistics recomputed with each item deleted in turn:
      # mean, sds, variance_sample and Cronbach's alpha of the reduced scale.
      def stats_if_deleted
        @ds.fields.inject({}){|a,v|
          ds2=@ds.dup
          ds2.delete_vector(v)
          total=ds2.vector_sum
          a[v]={}
          a[v][:mean]=total.mean
          a[v][:sds]=total.sds
          a[v][:variance_sample]=total.variance_sample
          a[v][:alpha]=Statsample::Reliability.cronbach_alpha(ds2)
          a
        }
      end
      # Render the whole item analysis as an HTML fragment (summary list
      # plus a per-item table).
      def html_summary
        html = <<EOF
<p><strong>Summary for scale:</strong></p>
<ul>
<li>Mean=#{@mean}</li>
<li>Std.Dv.=#{@sd}</li>
<li>Median=#{@median}</li>
<li>Skewness=#{sprintf("%0.3f",@skew)}</li>
<li>Kurtosis=#{sprintf("%0.3f",@kurtosis)}</li>

<li>Valid n:#{@valid_n}</li>
<li>Cronbach alpha: #{@alpha}</li>
</ul>
<table><thead><th>Variable</th>

<th>Mean</th>
<th>StDv.</th>
<th>Mean if deleted</th><th>Var. if
deleted</th><th> StDv. if
deleted</th><th> Itm-Totl
Correl.</th><th>Alpha if
deleted</th></thead>
EOF

        itc=item_total_correlation
        sid=stats_if_deleted
        is=item_statistics
        @ds.fields.each {|f|
          html << <<EOF
<tr>
<td>#{f}</td>
<td>#{sprintf("%0.5f",is[f][:mean])}</td>
<td>#{sprintf("%0.5f",is[f][:sds])}</td>
<td>#{sprintf("%0.5f",sid[f][:mean])}</td>
<td>#{sprintf("%0.5f",sid[f][:variance_sample])}</td>
<td>#{sprintf("%0.5f",sid[f][:sds])}</td>
<td>#{sprintf("%0.5f",itc[f])}</td>
<td>#{sprintf("%0.5f",sid[f][:alpha])}</td>
</tr>
EOF
        }
        html << "</table><hr />"
        html
      end
    end

  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Statsample
  # Helpers for resampling / random data generation.
  module Resample
    class << self
      # Invokes +action+ once per repetition and returns every
      # result collected into an array, in call order.
      def repeat_and_save(times, &action)
        (1..times).map { action.call }
      end

      # Builds a scale Vector of +size+ random integers drawn
      # uniformly from the inclusive range low..upper.
      def generate(size, low, upper)
        span = upper - low + 1
        values = (0...size).map { low + rand(span) }
        Vector.new(values, :scale)
      end

    end
  end
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
module Statsample
  # Several methods to estimate parameters for simple random sampling.
  module SRS
    class << self

      ########################
      #
      # Proportion estimation
      #
      ########################

      # Finite population correction (over variance).
      # Source: Cochran(1972)
      def fpc_var(sam, pop)
        (pop - sam).quo(pop - 1)
      end
      # Finite population correction (over standard deviation).
      def fpc(sam, pop)
        Math::sqrt((pop - sam).quo(pop - 1))
      end

      # Non sample fraction.
      #
      # 1 - sample fraction
      def qf(sam, pop)
        1 - (sam.quo(pop))
      end
      # Sample size estimation for proportions, infinite population.
      # d:: desired precision; prop:: expected proportion; margin:: confidence level.
      def estimation_n0(d, prop, margin=0.95)
        t = GSL::Cdf.ugaussian_Pinv(1 - (1 - margin).quo(2))
        var = prop * (1 - prop)
        t**2 * var.quo(d**2)
      end
      # Sample size estimation for proportions, finite population.
      def estimation_n(d, prop, n_pobl, margin=0.95)
        n0 = estimation_n0(d, prop, margin)
        n0.quo(1 + ((n0 - 1).quo(n_pobl)))
      end
      # Proportion confidence interval with t values.
      # Uses estimated proportion, sample without replacement.

      def proportion_confidence_interval_t(prop, n_sample, n_population, margin=0.95)
        t = GSL::Cdf.tdist_Pinv(1 - ((1 - margin).quo(2)), n_sample - 1)
        proportion_confidence_interval(prop, n_sample, n_population, t)
      end
      # Proportion confidence interval with z values.
      # Uses estimated proportion, sample without replacement.
      def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
        z = GSL::Cdf.ugaussian_Pinv(1 - ((1 - margin).quo(2)))
        proportion_confidence_interval(p, n_sample, n_population, z)
      end
      # Proportion confidence interval with x value.
      # Uses estimated proportion, sample without replacement.
      # Returns [lower, upper]; includes the 1/(2*sam) continuity correction.

      def proportion_confidence_interval(p, sam, pop, x)
        one_range = x * Math::sqrt((qf(sam, pop) * p * (1 - p)) / (sam - 1)) + (1.quo(sam * 2.0))
        [p - one_range, p + one_range]
      end
      # Standard deviation for sample distribution of a proportion.
      # Known proportion, sample with replacement.
      # Based on http://stattrek.com/Lesson6/SRS.aspx
      def proportion_sd_kp_wr(p, n_sample)
        Math::sqrt(p * (1 - p).quo(n_sample))
      end
      # Standard deviation for sample distribution of a proportion.
      # Known proportion, sample without replacement.
      #
      # Sources:
      # * http://stattrek.com/Lesson6/SRS.aspx
      # * Cochran(1972)
      def proportion_sd_kp_wor(p, sam, pop)
        fpc(sam, pop) * Math::sqrt(p * (1 - p).quo(sam))
      end
      # Standard deviation for sample distribution of a proportion.
      # Estimated proportion, sample with replacement.
      # Based on http://stattrek.com/Lesson6/SRS.aspx.
      def proportion_sd_ep_wr(p, n_sample)
        Math::sqrt(p * (1 - p).quo(n_sample - 1))
      end
      # Standard deviation for sample distribution of a proportion.
      # Estimated proportion, sample without replacement.
      # Source: Cochran, 1972, Técnicas de muestreo
      def proportion_sd_ep_wor(p, sam, pop)
        fsc = (pop - sam).quo((sam - 1) * pop)
        Math::sqrt(fsc * p * (1 - p))
      end

      # Total estimation sd based on sample.
      # Known proportion, sample without replacement.
      # Source: Cochran(1972)
      def proportion_total_sd_kp_wor(prop, sam, pop)
        # Fixed: original referenced undefined locals `pob` and `p`,
        # which raised NameError on every call.
        pop * proportion_sd_kp_wor(prop, sam, pop)
      end
      # Total estimation sd based on sample.
      # Estimated proportion, sample without replacement.
      # Source: Cochran(1972)
      def proportion_total_sd_ep_wor(prop, sam, pop)
        fsc = ((pop - sam).to_f / (sam - 1))
        Math::sqrt(fsc * pop * prop * (1 - prop))
      end

      ########################
      #
      # Mean estimation
      #
      ########################

      # Standard error. Known variance, sample with replacement.
      def standard_error_ksd_wr(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt((pop - 1).quo(pop))
      end

      # Standard error of the mean. Known variance, sample w/o replacement.
      def standard_error_ksd_wor(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam, pop))
      end

      alias_method :standard_error_esd_wr, :standard_error_ksd_wr

      # Standard error of the mean.
      # Estimated variance, without replacement.
      # Cochran (1972) p.47
      def standard_error_esd_wor(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam, pop))
      end

      alias_method :standard_error, :standard_error_esd_wor
      alias_method :se, :standard_error_esd_wor

      # Standard error of total estimation.

      def standard_error_total(s, sam, pop)
        pop * se(s, sam, pop)
      end

      # Confidence interval using T-Student.
      # Use with n < 60.
      def mean_confidence_interval_t(mean, s, n_sample, n_population, margin=0.95)
        t = GSL::Cdf.tdist_Pinv(1 - ((1 - margin) / 2), n_sample - 1)
        mean_confidence_interval(mean, s, n_sample, n_population, t)
      end
      # Confidence interval using Z.
      # Use with n > 60.
      def mean_confidence_interval_z(mean, s, n_sample, n_population, margin=0.95)
        z = GSL::Cdf.ugaussian_Pinv(1 - ((1 - margin) / 2))
        mean_confidence_interval(mean, s, n_sample, n_population, z)
      end
      # Confidence interval using an explicit quantile +x+.
      #
      # Better use mean_confidence_interval_z or mean_confidence_interval_t.
      def mean_confidence_interval(mean, s, n_sample, n_population, x)
        range = x * se(s, n_sample, n_population)
        [mean - range, mean + range]
      end
    end
  end

end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Statsample
  # module for several statistical tests
  module Test
    class << self
      # Chi-square statistic for two Matrix objects:
      # sum over all cells of (observed - expected)^2 / expected.
      def chi_square(real, expected)
        unless real.is_a?(Matrix) && expected.is_a?(Matrix)
          raise TypeError, "Both argument should be Matrix"
        end
        total = 0
        real.row_size.times do |r|
          real.column_size.times do |c|
            observed_cell = real[r, c].to_f
            expected_cell = expected[r, c].to_f
            total += ((observed_cell - expected_cell)**2) / expected_cell
          end
        end
        total
      end
      def t_significance

      end
    end
  end
end
|
@@ -0,0 +1,759 @@
|
|
1
|
+
class Array
  # Convenience conversion to Statsample::Vector.
  # Extra arguments (type, missing values, labels) are forwarded
  # unchanged to Statsample::Vector.new.
  def to_vector(*args)
    Statsample::Vector.new(self,*args)
  end
end
|
6
|
+
|
7
|
+
module Statsample
|
8
|
+
class << self
  # Create a matrix using vectors as columns.
  # Use:
  #
  #   matrix=Statsample.vector_cols_matrix(v1,v2)
  #
  # Every argument must be a Statsample::Vector and all must share
  # the same size, otherwise ArgumentError is raised.
  def vector_cols_matrix(*vs)
    size = vs[0].size
    vs.each do |v|
      raise ArgumentError, "Arguments should be Vector" unless v.instance_of? Statsample::Vector
      raise ArgumentError, "Vectors size should be the same" if v.size != size
    end
    rows = (0...size).map do |i|
      vs.map { |v| v[i] }
    end
    Matrix.rows(rows)
  end
end
|
25
|
+
# Returns a duplicate of the input vectors, without missing data
# for any of the vectors.
#
#   a=[1,2,3,6,7,nil,3,5].to_vector(:scale)
#   b=[nil,nil,5,6,4,5,10,2].to_vector(:scale)
#   c=[2,4,6,7,4,5,6,7].to_vector(:scale)
#   a2,b2,c2=Statsample.only_valid(a,b,c)
#
def self.only_valid(*vs)
  named = {}
  vs.each_with_index { |v, idx| named["v#{idx + 1}"] = v }
  # Dataset#dup_only_valid drops every row with a missing value.
  Statsample::Dataset.new(named).dup_only_valid.vectors.values
end
|
42
|
+
class Vector < DelegateClass(Array)

  include Enumerable
  attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils
  attr_accessor :labels
  # Creates a new Vector.
  # data = Array of data
  # t = level of measurement. Could be:
  # [:nominal] : Nominal level of measurement
  # [:ordinal] : Ordinal level of measurement
  # [:scale] : Scale level of measurement
  #
  # missing_values:: values to treat as missing; labels:: value => label hash.
  def initialize(data=[],t=:nominal,missing_values=[],labels={})
    raise "Data should be an array" unless data.is_a? Array
    @data=data
    @missing_values=missing_values
    @labels=labels
    @type=t
    @valid_data=[]
    @data_with_nils=[]
    @missing_data=[]
    @has_missing_data=nil
    _set_valid_data
    # type= installs the proper delegate (Nominal/Ordinal/Scale).
    self.type=t
    super(@delegate)
  end
  # Deep-ish copy: data, missing values and labels are duplicated.
  def dup
    Vector.new(@data.dup,@type,@missing_values.dup,@labels.dup)
  end
  # Returns an empty duplicate of the vector. Maintains the type, missing
  # values, labels
  def dup_empty
    Vector.new([],@type,@missing_values.dup,@labels.dup)
  end
  # Return a vector using the standarized values for data
  # with sd with denominator N
  def vector_standarized_pop
    vector_standarized(true)
  end

  # Return a vector using the standarized values for data
  # with sd with denominator n-1. Nils are preserved.

  def vector_standarized(use_population=false)
    raise "Should be a scale" unless @type==:scale
    mean=@delegate.mean
    sd=use_population ? @delegate.sdp : @delegate.sds
    @data_with_nils.collect{|x|
      if !x.nil?
        (x.to_f - mean).quo(sd)
      else
        nil
      end
    }.to_vector(:scale)
  end
  alias_method :standarized, :vector_standarized
  # Box-Cox power transformation: log(x) for lambda==0,
  # (x^lambda - 1)/lambda otherwise. Nils are preserved.
  def box_cox_transformation(lambda)
    raise "Should be a scale" unless @type==:scale
    @data_with_nils.collect{|x|
      if !x.nil?
        if(lambda==0)
          Math.log(x)
        else
          (x**lambda-1).quo(lambda)
        end
      else
        nil
      end
    }.to_vector(:scale)
  end

  # Vector equality.
  # Two vectors will be the same if their data, missing values, type and
  # labels are equal.
  def ==(v2)
    raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
    # Fixed: original used `@labels=v2.labels` (assignment), which both
    # ignored labels in the comparison and mutated the receiver's labels.
    @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
  end
  # Custom Marshal serialization: dump only the rebuildable state.
  def _dump(i)
    Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
  end
  def self._load(data)
    h=Marshal.load(data)
    Vector.new(h['data'], h['type'], h['missing_values'], h['labels'])
  end
  # Returns a new Vector with each datum transformed by the block.
  def recode
    @data.collect{|x|
      yield x
    }.to_vector(@type)
  end
  # In-place variant of #recode; refreshes the valid-data caches.
  def recode!
    @data.collect!{|x|
      yield x
    }
    set_valid_data
  end
  def each
    @data.each{|x|
      yield(x)
    }
  end
  # Add a value at the end of the vector.
  # If second argument set to false, you should update valid data using
  # Vector#set_valid_data at the end of your insertion cycle
  def add(v,update_valid=true)
    @data.push(v)
    set_valid_data if update_valid
  end
  # Rebuild the valid/missing/with-nils caches from @data.
  def set_valid_data
    @valid_data.clear
    @missing_data.clear
    @data_with_nils.clear
    _set_valid_data
    # NOTE(review): delegate set_gsl presumably refreshes a GSL-backed
    # representation for scale vectors — confirm against Scale class.
    @delegate.set_gsl if(@type==:scale)
  end
  def _set_valid_data
    if Statsample::OPTIMIZED
      # C-extension fast path.
      Statsample::_set_valid_data(self)
    else
      @data.each do |n|
        if is_valid? n
          @valid_data.push(n)
          @data_with_nils.push(n)
        else
          @data_with_nils.push(nil)
          @missing_data.push(n)
        end
      end
      @has_missing_data=@missing_data.size>0
    end
  end
  # Retrieves true if data has one or more missing values
  def has_missing_data?
    @has_missing_data
  end
  # Label for a value, or the value itself (as string) if unlabeled.
  def labeling(x)
    @labels.has_key?(x) ? @labels[x].to_s : x.to_s
  end
  # Returns a Vector with the data with labels replaced by the label
  def vector_labeled
    d=@data.collect{|x|
      if @labels.has_key? x
        @labels[x]
      else
        x
      end
    }
    Vector.new(d,@type)
  end
  def size
    @data.size
  end
  def [](i)
    @data[i]
  end
  def []=(i,v)
    @data[i]=v
  end
  # Return true if a value is valid (not nil and not included on missing values)
  def is_valid?(x)
    !(x.nil? or @missing_values.include? x)
  end
  # Set missing_values and refresh the valid-data caches.
  def missing_values=(vals)
    @missing_values = vals
    set_valid_data
  end
  # Set level of measurement; swaps in the matching delegate object.
  def type=(t)
    case t
    when :nominal
      @delegate=Nominal.new(@valid_data)
    when :ordinal
      @delegate=Ordinal.new(@valid_data)
    when :scale
      @delegate=Scale.new(@valid_data)
    else
      raise "Type doesn't exists"
    end
    __setobj__(@delegate)
    @type=t
  end
  def n; @data.size ; end
  def to_a
    @data.dup
  end
  # Redundant, but necessary
  # Spreadsheet creates Array#sum, so calling sum
  # doesn't call the delegates method
  def sum
    @delegate.sum
  end
  alias_method :to_ary, :to_a
  # Vector sum.
  # - If v is a scalar, add this value to all elements
  # - If v is a Array or a Vector, should be of the same size of this vector
  #   every item of this vector will be added to the value of the
  #   item at the same position on the other vector
  def +(v)
    _vector_ari("+",v)
  end
  # Vector subtraction.
  # - If v is a scalar, subtract this value from all elements
  # - If v is a Array or a Vector, should be of the same
  #   size of this vector
  #   every item of this vector will have the value of the
  #   item at the same position on the other vector subtracted

  def -(v)
    _vector_ari("-",v)
  end
  # Reports all values that doesn't comply with a condition.
  # Returns a hash with the index of data and the invalid data.
  def verify
    h={}
    (0...@data.size).to_a.each{|i|
      if !(yield @data[i])
        h[i]=@data[i]
      end
    }
    h
  end
  # Shared implementation for element-wise "+" and "-". Positions where
  # either operand is missing/nil yield nil.
  def _vector_ari(method,v) # :nodoc:
    if(v.is_a? Vector or v.is_a? Array)
      if v.size==@data.size
        i=0
        sum=[]
        0.upto(v.size-1) {|i|
          if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
            sum.push(@data[i].send(method,v[i]))
          else
            sum.push(nil)
          end
        }
        Statsample::Vector.new(sum)
      else
        raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
      end
    elsif(v.respond_to? method )
      # Scalar operand: apply element-wise, preserving nils.
      Statsample::Vector.new(
        @data.collect {|x|
          if(!x.nil?)
            x.send(method,v)
          else
            nil
          end
        }
      )
    else
      raise TypeError,"You should pass a scalar or a array/vector"
    end

  end
  # Return an array with the data splitted by a separator.
  #   a=Vector.new(["a,b","c,d","a,b","d"])
  #   a.splitted
  #   => [["a","b"],["c","d"],["a","b"],["d"]]
  def splitted(sep=Statsample::SPLIT_TOKEN)
    @data.collect{|x|
      if x.nil?
        nil
      elsif (x.respond_to? :split)
        x.split(sep)
      else
        [x]
      end
    }
  end
  # Returns a hash of Vectors, defined by the different values
  # defined on the fields
  # Example:
  #
  #   a=Vector.new(["a,b","c,d","a,b"])
  #   a.split_by_separator
  #   => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88 @data=[1, 0, 1]>,
  #       "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48 @data=[1, 1, 0]>,
  #       "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08 @data=[0, 1, 1]>}
  #
  def split_by_separator(sep=Statsample::SPLIT_TOKEN)
    split_data=splitted(sep)
    factors=split_data.flatten.uniq.compact
    out=factors.inject({}) {|a,x|
      a[x]=[]
      a
    }
    split_data.each{|r|
      if r.nil?
        # A nil row yields nil (not 0) for every factor.
        factors.each{|f|
          out[f].push(nil)
        }
      else
        factors.each{|f|
          out[f].push(r.include?(f) ? 1:0)
        }
      end
    }
    out.inject({}){|s,v|
      s[v[0]]=Vector.new(v[1],:nominal)
      s
    }
  end
  # Frequency of each factor across the splitted data.
  def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
    split_by_separator(sep).inject({}) {|a,v|
      a[v[0]]=v[1].inject {|s,x| s+x.to_i}
      a
    }
  end

  # Returns an random sample of size n, with replacement,
  # only with valid data.
  #
  # In all the trails, every item have the same probability
  # of been selected
  def sample_with_replacement(sample=1)
    Vector.new(@delegate.sample_with_replacement(sample) ,@type)
  end
  # Returns an random sample of size n, without replacement,
  # only with valid data.
  #
  # Every element could only be selected once
  # A sample of the same size of the vector is the vector itself

  def sample_without_replacement(sample=1)
    Vector.new(@delegate.sample_without_replacement(sample),@type)
  end

  # With a block: number of data items for which the block is truthy.
  # Without a block: frequency of value +x+ (0 if absent).
  def count(x=false)
    if block_given?
      r=@data.inject(0) {|s, i|
        r=yield i
        s+(r ? 1 : 0)
      }
      r.nil? ? 0 : r
    else
      frequencies[x].nil? ? 0 : frequencies[x]
    end
  end
  # returns the real type for the vector, according to its content
  def db_type(dbs='mysql')
    # first, detect any character not number
    if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
      return "DATE"
    elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
      return "VARCHAR (255)"
    elsif @data.find {|v| v.to_s=~/\./}
      return "DOUBLE"
    else
      return "INTEGER"
    end
  end
  # Text summary, delegated to the underlying Nominal/Ordinal/Scale object.
  def summary(out="")
    @delegate.summary(@labels,out)
  end
  def to_s
    sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
  end
  def inspect
    self.to_s
  end

end
|
402
|
+
|
403
|
+
|
404
|
+
|
405
|
+
# Vector of nominal (categorical, unordered) level-of-measurement data.
# Provides frequency/proportion statistics and random sampling over the
# already-filtered valid data handed to the constructor.
class Nominal
  # data:: Array of valid (non-missing) observations.
  def initialize(data)
    @data=data
    # @factors=data.uniq
  end
  # Raw data array this object computes its statistics from.
  def delegate_data
    @data
  end
  # Return an array of the different values of the data
  def factors
    @data.uniq.sort
  end
  # Returns a hash with the distribution of frequencies of
  # the sample (value => count).
  # Pure-Ruby implementation; installed as +frequencies+ by the alias
  # loop at the bottom of the class unless a faster version exists.
  def frequencies_slow
    @data.inject(Hash.new) {|counts,value|
      counts[value] = (counts[value] || 0) + 1
      counts
    }
  end
  # Plot frequencies on a chart, using gnuplot
  def plot_frequencies
    require 'gnuplot' # lazy-loaded: gnuplot is only needed here
    x=[]
    y=[]
    self.frequencies.sort.each{|k,v|
      x.push(k)
      y.push(v)
    }
    Gnuplot.open do |gp|
      Gnuplot::Plot.new( gp ) do |plot|
        plot.boxwidth("0.9 absolute")
        plot.yrange("[0:#{y.max}]")
        plot.style("fill solid 1.00 border -1")
        plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
        plot.style("histogram")
        plot.style("data histogram")
        i=-1
        # Label each histogram bar with its factor value.
        plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
        plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
        end
      end
    end
  end

  # Returns the most frequent item
  def mode
    frequencies.max{|a,b| a[1]<=>b[1]}[0]
  end
  # The number of items with valid data
  def n_valid
    @data.size
  end
  # Returns a hash with the distribution of proportions of
  # the sample (value => count/n, as Rationals via #quo).
  def proportions
    frequencies.inject({}){|a,v|
      a[v[0]] = v[1].quo(@data.size)
      a
    }
  end
  # Proportion of a given value (defaults to 1, for dichotomic data).
  def proportion(v=1)
    frequencies[v].quo(@data.size)
  end
  # Text summary: n, factors, mode and the frequency distribution.
  # +labels+ is a hash mapping values to human-readable names.
  def summary(labels,out="")
    out << sprintf("n valid:%d\n",n_valid)
    out << sprintf("factors:%s\n",factors.join(","))
    out << "mode:"+mode.to_s+"\n"
    out << "Distribution:\n"
    frequencies.sort.each{|k,v|
      key=labels.has_key?(k) ? labels[k]:k
      out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
    }
    out
  end

  # Returns a random sample of size n, with replacement,
  # only with valid data.
  #
  # In every trial, each item has the same probability
  # of being selected.
  def sample_with_replacement(sample)
    (0...sample).collect{ @data[rand(@data.size)] }
  end
  # Returns a random sample of size n, without replacement,
  # only with valid data.
  #
  # Every element can only be selected once.
  # A sample of the same size as the vector is the vector itself.
  def sample_without_replacement(sample)
    raise ArgumentError, "Sample size couldn't be greater than n" if sample>@data.size
    out=[]
    size=@data.size
    # Draw distinct indices by rejection sampling, then map to values.
    while out.size<sample
      value=rand(size)
      out.push(value) if !out.include?value
    end
    out.collect{|i|@data[i]}
  end

  # Variance of p, according to population size
  def variance_proportion(n_poblation, v=1)
    Statsample::proportion_variance_sample(self.proportion(v), @data.size, n_poblation)
  end
  # Variance of the total estimate, according to population size
  def variance_total(n_poblation, v=1)
    Statsample::total_variance_sample(self.proportion(v), @data.size, n_poblation)
  end
  # Confidence interval for a proportion, using Student's t.
  def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
    Statsample::proportion_confidence_interval_t(proportion(v), @data.size, n_poblation, margin)
  end
  # Confidence interval for a proportion, using the normal (z) approximation.
  def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
    Statsample::proportion_confidence_interval_z(proportion(v), @data.size, n_poblation, margin)
  end
  # For every *_slow method, install it under the plain name unless a
  # faster implementation (e.g. GSL-backed) is already defined.
  # NOTE: instance_methods returns Symbols on Ruby >= 1.9 (and Object#=~
  # was removed in 3.2), so convert to String before matching — without
  # this, the +frequencies+ alias was never created on modern Ruby.
  self.instance_methods.find_all{|met| met.to_s=~/_slow$/}.each{|met|
    met_or=met.to_s.gsub("_slow","")
    if !self.method_defined?(met_or)
      alias_method met_or, met
    end
  }
end
|
531
|
+
|
532
|
+
# Vector of ordinal level-of-measurement data: adds order-dependent
# statistics (percentiles, median, min/max, ranks) on top of Nominal.
class Ordinal <Nominal
  # Return the value of the percentil q
  #
  # Works on the sorted data: position v = n*q/100 (a Rational via #quo).
  # When v is fractional, the element at floor(v) is returned; when v is
  # a whole number, the two neighbouring elements are averaged.
  # NOTE(review): on the integer branch, percentil(100) indexes one past
  # the end of the sorted array — confirm the intended domain of q.
  def percentil(q)
    sorted=@data.sort
    v= (n_valid * q).quo(100)
    if(v.to_i!=v)
      sorted[v.to_i]
    else
      (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
    end
  end
  # Returns a ranked vector
  #
  # Each value is replaced by its rank (1-based); tied values receive
  # the average of the ranks they span, so ranks can be fractional.
  # +type+ is forwarded to #to_vector for the resulting vector's level.
  def ranked(type=:ordinal)
    i=0
    r=frequencies.sort.inject({}){|a,v|
      # v is [value, count]; assign the midpoint of the rank span.
      a[v[0]]=(i+1 + i+v[1]).quo(2)
      i+=v[1]
      a
    }
    @data.collect {|c|
      r[c]
    }.to_vector(type)
  end
  # Return the median (percentil 50)
  def median
    percentil(50)
  end
  if HAS_GSL
    # Keep the pure-Ruby median reachable as median_slow, then override
    # median below with the GSL-backed implementation.
    %w{median}.each{|m|
      m_nuevo=(m+"_slow").intern
      alias_method m_nuevo, m.intern
    }

    #def percentil(p)
    #  v=GSL::Vector.alloc(@data.sort)
    #  v.stats_quantile_from_sorted_data(p)
    #end
    def median # :nodoc:
      GSL::Stats::median_from_sorted_data(GSL::Vector.alloc(@data.sort))
    end
  end
  # Minimum value
  def min; @data.min;end
  # Maximum value
  def max; @data.max; end

  # Text summary: n, median and quartiles. +labels+ is accepted for
  # interface compatibility with Nominal#summary but is not used here.
  def summary(labels,out="")
    out << sprintf("n valid:%d\n",n_valid)
    out << "median:"+median.to_s+"\n"
    out << "percentil 25:"+percentil(25).to_s+"\n"
    out << "percentil 75:"+percentil(75).to_s+"\n"
    out
  end
end
|
587
|
+
# Vector of interval/ratio ("scale") level-of-measurement data.
# Coerces its data to numbers on construction and caches a GSL vector
# (@gsl) for fast statistics when GSL is available.
class Scale <Ordinal
  attr_reader :gsl
  # data:: Array of numeric (or numeric-looking String) observations.
  def initialize(data)
    super(data)
    set_gsl
  end

  # Custom marshalling: only the raw data array is serialized.
  def _dump(i)
    Marshal.dump(@data)
  end
  # Marshal requires _load to be a *class* method that returns the
  # restored instance. The original defined it as an instance method,
  # which made Marshal.load of a dumped Scale raise; building a fresh
  # instance restores @data and the GSL cache via #initialize.
  def self._load(data)
    new(Marshal.restore(data))
  end
  # Coerce @data in place to Integers/Floats and rebuild the GSL cache.
  def set_gsl # :nodoc
    @data.collect!{|x|
      if x.is_a? Numeric
        x
      elsif x.is_a? String and x.to_i==x.to_f
        # integral-looking String -> Integer; anything else -> Float
        x.to_i
      else
        x.to_f
      end
    }
    if HAS_GSL
      @gsl=GSL::Vector.alloc(@data) if @data.size>0
    end
  end
  # The range of the data (max - min)
  def range; @data.max - @data.min; end
  # The sum of values for the data
  def sum
    @data.inject(0){|a,x| x+a}
  end
  # The arithmetical mean of data
  def mean
    sum.to_f.quo(n_valid)
  end
  # Sum of squared deviations from m (defaults to the mean).
  # Numeric#square is presumably a library extension defined elsewhere
  # in statsample.
  def sum_of_squares(m=nil)
    m||=mean
    @data.inject(0){|a,x| a+(x-m).square}
  end

  # Sum of squared deviation (computational formula: sum(x^2) - (sum x)^2/n)
  def sum_of_squared_deviation
    @data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
  end

  # Population variance (divided by n)
  def variance_population(m=nil)
    m||=mean
    squares=@data.inject(0){|a,x| x.square+a}
    squares.quo(n_valid) - m.square
  end

  # Population Standard deviation (divided by n)
  def standard_deviation_population(m=nil)
    Math::sqrt( variance_population(m) )
  end

  # Sample Variance (divided by n-1)
  def variance_sample(m=nil)
    m||=mean
    sum_of_squares(m).quo(n_valid - 1)
  end

  # Sample Standard deviation (divided by n-1)
  def standard_deviation_sample(m=nil)
    m||=mean # fixed: was the no-op typo "m||=m"
    Math::sqrt(variance_sample(m))
  end
  # Sample skewness (third standardized moment, n-1 denominator).
  def skew
    m=mean
    # use the cached mean; the original recomputed mean for every element
    thirds=@data.inject(0){|a,x| a+((x-m)**3)}
    thirds.quo((@data.size-1)*sd**3)
  end
  # Sample kurtosis (fourth standardized moment, n-1 denominator).
  def kurtosis
    m=mean
    fourths=@data.inject(0){|a,x| a+((x-m)**4)}
    fourths.quo((@data.size-1)*sd**4)
  end

  if HAS_GSL
    # Preserve the pure-Ruby implementations as *_slow, then override
    # them with GSL-backed versions below.
    %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
      m_nuevo=(m+"_slow").intern
      alias_method m_nuevo, m.intern
    }
    def sum # :nodoc:
      @gsl.sum
    end
    def mean # :nodoc:
      @gsl.mean
    end
    def variance_sample(m=nil) # :nodoc:
      m||=mean
      # NOTE(review): m is computed but not passed to variance_m —
      # confirm against rb-gsl's GSL::Vector#variance_m signature.
      @gsl.variance_m
    end
    def standard_deviation_sample(m=nil) # :nodoc:
      m||=mean
      @gsl.sd(m)
    end

    def variance_population(m=nil) # :nodoc:
      m||=mean
      @gsl.variance_with_fixed_mean(m)
    end
    def standard_deviation_population(m=nil) # :nodoc:
      m||=mean
      @gsl.sd_with_fixed_mean(m)
    end
    def skew
      @gsl.skew
    end
    def kurtosis
      @gsl.kurtosis
    end
    # Create a GSL::Histogram
    # With a fixnum, creates X bins within the range of data
    # With an Array, each value will be a cut point
    def histogram(bins=10)
      if bins.is_a? Array
        h=GSL::Histogram.alloc(bins)
      else
        # ugly patch. The upper limit for a bin has the form
        # x < range
        h=GSL::Histogram.alloc(bins,[@data.min,@data.max+0.0001])
      end
      h.increment(@gsl)
      h
    end
    # Render the histogram through GSL's graph helper.
    def plot_histogram(bins=10,options="")
      self.histogram(bins).graph(options)
    end
    # GSL-backed sampling with replacement (freshly seeded MT19937).
    def sample_with_replacement(k)
      r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
      r.sample(@gsl, k).to_a
    end
    # GSL-backed sampling without replacement (freshly seeded MT19937).
    def sample_without_replacement(k)
      r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
      r.choose(@gsl, k).to_a
    end
  end

  # Coefficient of variation
  # Calculated with the sample standard deviation
  def coefficient_of_variation
    standard_deviation_sample.quo(mean)
  end
  # Text summary of the main scale statistics. +labels+ is accepted for
  # interface compatibility with Nominal#summary but is not used here.
  def summary(labels,out="")
    out << sprintf("n valid:%d\n",n_valid)
    out << "mean:"+mean.to_s+"\n"
    out << "sum:"+sum.to_s+"\n"
    out << "range:"+range.to_s+"\n"
    out << "variance (pop):"+variance_population.to_s+"\n"
    out << "sd (pop):"+sdp.to_s+"\n"
    out << "variance (sample):"+variance_sample.to_s+"\n"
    out << "sd (sample):"+sds.to_s+"\n"

    out
  end

  alias_method :sdp, :standard_deviation_population
  alias_method :sds, :standard_deviation_sample
  alias_method :cov, :coefficient_of_variation
  alias_method :variance, :variance_sample
  alias_method :sd, :standard_deviation_sample
  alias_method :ss, :sum_of_squares
end
|
759
|
+
end
|