statsample 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +79 -0
- data/Manifest.txt +56 -0
- data/README.txt +77 -0
- data/Rakefile +22 -0
- data/bin/statsample +2 -0
- data/demo/benchmark.rb +52 -0
- data/demo/chi-square.rb +44 -0
- data/demo/dice.rb +13 -0
- data/demo/distribution_t.rb +95 -0
- data/demo/graph.rb +9 -0
- data/demo/item_analysis.rb +30 -0
- data/demo/mean.rb +81 -0
- data/demo/proportion.rb +57 -0
- data/demo/sample_test.csv +113 -0
- data/demo/strata_proportion.rb +152 -0
- data/demo/stratum.rb +141 -0
- data/lib/spss.rb +131 -0
- data/lib/statsample.rb +216 -0
- data/lib/statsample/anova.rb +74 -0
- data/lib/statsample/bivariate.rb +255 -0
- data/lib/statsample/chidistribution.rb +39 -0
- data/lib/statsample/codification.rb +120 -0
- data/lib/statsample/converters.rb +338 -0
- data/lib/statsample/crosstab.rb +122 -0
- data/lib/statsample/dataset.rb +526 -0
- data/lib/statsample/dominanceanalysis.rb +259 -0
- data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
- data/lib/statsample/graph/gdchart.rb +45 -0
- data/lib/statsample/graph/svgboxplot.rb +108 -0
- data/lib/statsample/graph/svggraph.rb +181 -0
- data/lib/statsample/graph/svghistogram.rb +208 -0
- data/lib/statsample/graph/svgscatterplot.rb +111 -0
- data/lib/statsample/htmlreport.rb +232 -0
- data/lib/statsample/multiset.rb +281 -0
- data/lib/statsample/regression.rb +522 -0
- data/lib/statsample/reliability.rb +235 -0
- data/lib/statsample/resample.rb +20 -0
- data/lib/statsample/srs.rb +159 -0
- data/lib/statsample/test.rb +25 -0
- data/lib/statsample/vector.rb +759 -0
- data/test/_test_chart.rb +58 -0
- data/test/test_anova.rb +31 -0
- data/test/test_codification.rb +59 -0
- data/test/test_crosstab.rb +55 -0
- data/test/test_csv.csv +7 -0
- data/test/test_csv.rb +27 -0
- data/test/test_dataset.rb +293 -0
- data/test/test_ggobi.rb +42 -0
- data/test/test_multiset.rb +98 -0
- data/test/test_regression.rb +108 -0
- data/test/test_reliability.rb +32 -0
- data/test/test_resample.rb +23 -0
- data/test/test_srs.rb +14 -0
- data/test/test_statistics.rb +152 -0
- data/test/test_stratified.rb +19 -0
- data/test/test_svg_graph.rb +63 -0
- data/test/test_vector.rb +265 -0
- data/test/test_xls.rb +32 -0
- metadata +158 -0
@@ -0,0 +1,235 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Reliability
|
3
|
+
class << self
|
4
|
+
# Calculate Chonbach's alpha for a given dataset.
|
5
|
+
# only uses tuples without missing data
|
6
|
+
def cronbach_alpha(ods)
|
7
|
+
ds=ods.dup_only_valid
|
8
|
+
n_items=ds.fields.size
|
9
|
+
sum_var_items=ds.vectors.inject(0) {|ac,v|
|
10
|
+
ac+v[1].variance_sample
|
11
|
+
}
|
12
|
+
total=ds.vector_sum
|
13
|
+
(n_items / (n_items-1).to_f) * (1-(sum_var_items/ total.variance_sample))
|
14
|
+
end
|
15
|
+
# Calculate Chonbach's alpha for a given dataset
|
16
|
+
# using standarized values for every vector.
|
17
|
+
# Only uses tuples without missing data
|
18
|
+
|
19
|
+
def cronbach_alpha_standarized(ods)
|
20
|
+
ds=ods.fields.inject({}){|a,f|
|
21
|
+
a[f]=ods[f].vector_standarized
|
22
|
+
a
|
23
|
+
}.to_dataset
|
24
|
+
cronbach_alpha(ds)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
class ItemCharacteristicCurve
|
29
|
+
attr_reader :totals, :counts,:vector_total
|
30
|
+
def initialize (ds, vector_total=nil)
|
31
|
+
vector_total||=ds.vector_sum
|
32
|
+
raise "Total size != Dataset size" if vector_total.size!=ds.cases
|
33
|
+
@vector_total=vector_total
|
34
|
+
@ds=ds
|
35
|
+
@totals={}
|
36
|
+
@counts=@ds.fields.inject({}) {|a,v| a[v]={};a}
|
37
|
+
process
|
38
|
+
end
|
39
|
+
def process
|
40
|
+
i=0
|
41
|
+
@ds.each{|row|
|
42
|
+
tot=@vector_total[i]
|
43
|
+
@totals[tot]||=0
|
44
|
+
@totals[tot]+=1
|
45
|
+
@ds.fields.each {|f|
|
46
|
+
item=row[f].to_s
|
47
|
+
@counts[f][tot]||={}
|
48
|
+
@counts[f][tot][item]||=0
|
49
|
+
@counts[f][tot][item] += 1
|
50
|
+
}
|
51
|
+
i+=1
|
52
|
+
}
|
53
|
+
end
|
54
|
+
def curve_field(field, item)
|
55
|
+
out={}
|
56
|
+
item=item.to_s
|
57
|
+
@totals.each{|value,n|
|
58
|
+
count_value= @counts[field][value][item].nil? ? 0 : @counts[field][value][item]
|
59
|
+
out[value]=count_value.to_f/n.to_f
|
60
|
+
}
|
61
|
+
out
|
62
|
+
end
|
63
|
+
end
|
64
|
+
class ItemAnalysis
|
65
|
+
attr_reader :mean, :sd,:valid_n, :alpha , :alpha_standarized
|
66
|
+
def initialize(ds)
|
67
|
+
@ds=ds.dup_only_valid
|
68
|
+
@total=@ds.vector_sum
|
69
|
+
@mean=@total.mean
|
70
|
+
@median=@total.median
|
71
|
+
@skew=@total.skew
|
72
|
+
@kurtosis=@total.kurtosis
|
73
|
+
@sd=@total.sdp
|
74
|
+
@valid_n=@total.size
|
75
|
+
begin
|
76
|
+
@alpha=Statsample::Reliability.cronbach_alpha(ds)
|
77
|
+
@alpha_standarized=Statsample::Reliability.cronbach_alpha_standarized(ds)
|
78
|
+
rescue => e
|
79
|
+
raise DatasetException.new(@ds,e), "Problem on calculate alpha"
|
80
|
+
end
|
81
|
+
end
|
82
|
+
# Returns a hash with structure
|
83
|
+
def item_characteristic_curve
|
84
|
+
i=0
|
85
|
+
out={}
|
86
|
+
total={}
|
87
|
+
@ds.each{|row|
|
88
|
+
tot=@total[i]
|
89
|
+
@ds.fields.each {|f|
|
90
|
+
out[f]||= {}
|
91
|
+
total[f]||={}
|
92
|
+
out[f][tot]||= 0
|
93
|
+
total[f][tot]||=0
|
94
|
+
out[f][tot]+= row[f]
|
95
|
+
total[f][tot]+=1
|
96
|
+
}
|
97
|
+
i+=1
|
98
|
+
}
|
99
|
+
total.each{|f,var|
|
100
|
+
var.each{|tot,v|
|
101
|
+
out[f][tot]=out[f][tot].to_f / total[f][tot]
|
102
|
+
}
|
103
|
+
}
|
104
|
+
out
|
105
|
+
end
|
106
|
+
def gnuplot_item_characteristic_curve(directory, base="crd",options={})
|
107
|
+
require 'gnuplot'
|
108
|
+
|
109
|
+
crd=item_characteristic_curve
|
110
|
+
@ds.fields.each {|f|
|
111
|
+
x=[]
|
112
|
+
y=[]
|
113
|
+
Gnuplot.open do |gp|
|
114
|
+
Gnuplot::Plot.new( gp ) do |plot|
|
115
|
+
crd[f].sort.each{|tot,prop|
|
116
|
+
x.push(tot)
|
117
|
+
y.push((prop*100).to_i.to_f/100)
|
118
|
+
}
|
119
|
+
plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
|
120
|
+
ds.with = "linespoints"
|
121
|
+
ds.notitle
|
122
|
+
end
|
123
|
+
|
124
|
+
end
|
125
|
+
end
|
126
|
+
}
|
127
|
+
|
128
|
+
end
|
129
|
+
def svggraph_item_characteristic_curve(directory, base="icc",options={})
|
130
|
+
require 'statsample/graph/svggraph'
|
131
|
+
crd=ItemCharacteristicCurve.new(@ds)
|
132
|
+
@ds.fields.each {|f|
|
133
|
+
factors=@ds[f].factors.sort
|
134
|
+
options={
|
135
|
+
:height=>500,
|
136
|
+
:width=>800,
|
137
|
+
:key=>true
|
138
|
+
}.update(options)
|
139
|
+
graph = ::SVG::Graph::Plot.new(options)
|
140
|
+
factors.each{|factor|
|
141
|
+
factor=factor.to_s
|
142
|
+
dataset=[]
|
143
|
+
crd.curve_field(f, factor).each{|tot,prop|
|
144
|
+
dataset.push(tot)
|
145
|
+
dataset.push((prop*100).to_i.to_f/100)
|
146
|
+
}
|
147
|
+
graph.add_data({
|
148
|
+
:title=>"#{factor}",
|
149
|
+
:data=>dataset
|
150
|
+
})
|
151
|
+
}
|
152
|
+
File.open(directory+"/"+base+"_#{f}.svg","w") {|fp|
|
153
|
+
fp.puts(graph.burn())
|
154
|
+
}
|
155
|
+
}
|
156
|
+
|
157
|
+
end
|
158
|
+
def item_total_correlation
|
159
|
+
@ds.fields.inject({}) do |a,v|
|
160
|
+
vector=@ds[v].dup
|
161
|
+
ds2=@ds.dup
|
162
|
+
ds2.delete_vector(v)
|
163
|
+
total=ds2.vector_sum
|
164
|
+
a[v]=Statsample::Bivariate.pearson(vector,total)
|
165
|
+
a
|
166
|
+
end
|
167
|
+
end
|
168
|
+
def item_statistics
|
169
|
+
@ds.fields.inject({}) do |a,v|
|
170
|
+
a[v]={:mean=>@ds[v].mean,:sds=>@ds[v].sds}
|
171
|
+
a
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
def stats_if_deleted
|
176
|
+
@ds.fields.inject({}){|a,v|
|
177
|
+
ds2=@ds.dup
|
178
|
+
ds2.delete_vector(v)
|
179
|
+
total=ds2.vector_sum
|
180
|
+
a[v]={}
|
181
|
+
a[v][:mean]=total.mean
|
182
|
+
a[v][:sds]=total.sds
|
183
|
+
a[v][:variance_sample]=total.variance_sample
|
184
|
+
a[v][:alpha]=Statsample::Reliability.cronbach_alpha(ds2)
|
185
|
+
a
|
186
|
+
}
|
187
|
+
end
|
188
|
+
def html_summary
|
189
|
+
html = <<EOF
|
190
|
+
<p><strong>Summary for scale:</strong></p>
|
191
|
+
<ul>
|
192
|
+
<li>Mean=#{@mean}</li>
|
193
|
+
<li>Std.Dv.=#{@sd}</li>
|
194
|
+
<li>Median=#{@median}</li>
|
195
|
+
<li>Skewness=#{sprintf("%0.3f",@skew)}</li>
|
196
|
+
<li>Kurtosis=#{sprintf("%0.3f",@kurtosis)}</li>
|
197
|
+
|
198
|
+
<li>Valid n:#{@valid_n}</li>
|
199
|
+
<li>Cronbach alpha: #{@alpha}</li>
|
200
|
+
</ul>
|
201
|
+
<table><thead><th>Variable</th>
|
202
|
+
|
203
|
+
<th>Mean</th>
|
204
|
+
<th>StDv.</th>
|
205
|
+
<th>Mean if deleted</th><th>Var. if
|
206
|
+
deleted</th><th> StDv. if
|
207
|
+
deleted</th><th> Itm-Totl
|
208
|
+
Correl.</th><th>Alpha if
|
209
|
+
deleted</th></thead>
|
210
|
+
EOF
|
211
|
+
|
212
|
+
itc=item_total_correlation
|
213
|
+
sid=stats_if_deleted
|
214
|
+
is=item_statistics
|
215
|
+
@ds.fields.each {|f|
|
216
|
+
html << <<EOF
|
217
|
+
<tr>
|
218
|
+
<td>#{f}</td>
|
219
|
+
<td>#{sprintf("%0.5f",is[f][:mean])}</td>
|
220
|
+
<td>#{sprintf("%0.5f",is[f][:sds])}</td>
|
221
|
+
<td>#{sprintf("%0.5f",sid[f][:mean])}</td>
|
222
|
+
<td>#{sprintf("%0.5f",sid[f][:variance_sample])}</td>
|
223
|
+
<td>#{sprintf("%0.5f",sid[f][:sds])}</td>
|
224
|
+
<td>#{sprintf("%0.5f",itc[f])}</td>
|
225
|
+
<td>#{sprintf("%0.5f",sid[f][:alpha])}</td>
|
226
|
+
</tr>
|
227
|
+
EOF
|
228
|
+
}
|
229
|
+
html << "</table><hr />"
|
230
|
+
html
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
end
|
235
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Resample
|
3
|
+
class << self
|
4
|
+
def repeat_and_save(times,&action)
|
5
|
+
(1..times).inject([]) {|a,x|
|
6
|
+
a.push(action.call)
|
7
|
+
a
|
8
|
+
}
|
9
|
+
end
|
10
|
+
|
11
|
+
def generate (size,low,upper)
|
12
|
+
range=upper-low+1
|
13
|
+
Vector.new((0...size).collect {|x|
|
14
|
+
rand(range)+low
|
15
|
+
},:scale)
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
module Statsample
|
2
|
+
# Several methods to estimate parameters for simple random sampling
|
3
|
+
module SRS
|
4
|
+
class << self
|
5
|
+
|
6
|
+
########################
|
7
|
+
#
|
8
|
+
# Proportion estimation
|
9
|
+
#
|
10
|
+
########################
|
11
|
+
# Finite population correction (over variance)
|
12
|
+
# Source: Cochran(1972)
|
13
|
+
def fpc_var(sam,pop)
|
14
|
+
(pop - sam).quo(pop - 1)
|
15
|
+
end
|
16
|
+
# Finite population correction (over standard deviation)
|
17
|
+
def fpc(sam,pop)
|
18
|
+
Math::sqrt((pop-sam).quo(pop-1))
|
19
|
+
end
|
20
|
+
|
21
|
+
# Non sample fraction.
|
22
|
+
#
|
23
|
+
# 1 - sample fraction
|
24
|
+
def qf(sam , pop)
|
25
|
+
1-(sam.quo(pop))
|
26
|
+
end
|
27
|
+
# Sample size estimation for proportions, infinite poblation
|
28
|
+
def estimation_n0(d,prop,margin=0.95)
|
29
|
+
t=GSL::Cdf.ugaussian_Pinv(1-(1-margin).quo(2))
|
30
|
+
var=prop*(1-prop)
|
31
|
+
t**2*var.quo(d**2)
|
32
|
+
end
|
33
|
+
# Sample size estimation for proportions, finite poblation.
|
34
|
+
def estimation_n(d,prop,n_pobl,margin=0.95)
|
35
|
+
n0=estimation_n0(d,prop,margin)
|
36
|
+
n0.quo( 1 + ((n0 - 1).quo(n_pobl)))
|
37
|
+
end
|
38
|
+
# Proportion confidence interval with t values
|
39
|
+
# Uses estimated proportion, sample without replacement.
|
40
|
+
|
41
|
+
def proportion_confidence_interval_t(prop, n_sample, n_population, margin=0.95)
|
42
|
+
t=GSL::Cdf.tdist_Pinv(1-((1-margin).quo(2)) , n_sample-1)
|
43
|
+
proportion_confidence_interval(prop,n_sample,n_population, t)
|
44
|
+
end
|
45
|
+
# Proportion confidence interval with z values
|
46
|
+
# Uses estimated proportion, sample without replacement.
|
47
|
+
def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
|
48
|
+
z=GSL::Cdf.ugaussian_Pinv(1-((1-margin).quo(2)))
|
49
|
+
proportion_confidence_interval(p,n_sample,n_population, z)
|
50
|
+
end
|
51
|
+
# Proportion confidence interval with x value
|
52
|
+
# Uses estimated proportion, sample without replacement
|
53
|
+
|
54
|
+
def proportion_confidence_interval(p, sam,pop , x)
|
55
|
+
f=sam.quo(pop)
|
56
|
+
one_range=x * Math::sqrt((qf(sam, pop) * p * (1-p)) / (sam-1)) + (1.quo(sam * 2.0))
|
57
|
+
[p-one_range, p+one_range]
|
58
|
+
end
|
59
|
+
# Standard deviation for sample distribution of a proportion
|
60
|
+
# Know proportion, sample with replacement.
|
61
|
+
# Based on http://stattrek.com/Lesson6/SRS.aspx
|
62
|
+
def proportion_sd_kp_wr(p, n_sample)
|
63
|
+
Math::sqrt(p*(1-p).quo(n_sample))
|
64
|
+
end
|
65
|
+
# Standard deviation for sample distribution of a proportion
|
66
|
+
# Know proportion, sample without replacement.
|
67
|
+
#
|
68
|
+
# Sources:
|
69
|
+
# * http://stattrek.com/Lesson6/SRS.aspx
|
70
|
+
# * Cochran(1972)
|
71
|
+
def proportion_sd_kp_wor(p, sam, pop)
|
72
|
+
fpc(sam,pop)*Math::sqrt(p*(1-p).quo(sam))
|
73
|
+
end
|
74
|
+
# Standard deviation for sample distribution of a proportion
|
75
|
+
# Estimated proportion, sample with replacement
|
76
|
+
# Based on http://stattrek.com/Lesson6/SRS.aspx.
|
77
|
+
def proportion_sd_ep_wr(p, n_sample)
|
78
|
+
Math::sqrt(p*(1-p).quo(n_sample-1))
|
79
|
+
end
|
80
|
+
# Standard deviation for sample distribution of a proportion.
|
81
|
+
# Estimated proportion, sample without replacement.
|
82
|
+
# Source: Cochran, 1972, Técnicas de muestreo
|
83
|
+
def proportion_sd_ep_wor(p, sam,pop)
|
84
|
+
fsc=(pop-sam).quo((sam-1)*pop)
|
85
|
+
Math::sqrt(fsc*p*(1-p))
|
86
|
+
end
|
87
|
+
|
88
|
+
# Total estimation sd based on sample.
|
89
|
+
# Known proportion, sample without replacement
|
90
|
+
# Source: Cochran(1972)
|
91
|
+
def proportion_total_sd_kp_wor(prop, sam, pop)
|
92
|
+
pob * proportion_sd_kp_wor(p, sam, pop)
|
93
|
+
end
|
94
|
+
# Total estimation sd based on sample.
|
95
|
+
# Estimated proportion, sample without replacement
|
96
|
+
# Source: Cochran(1972)
|
97
|
+
def proportion_total_sd_ep_wor(prop, sam, pop)
|
98
|
+
fsc=((pop - sam).to_f / ( sam - 1))
|
99
|
+
Math::sqrt(fsc*pop*prop*(1-prop))
|
100
|
+
end
|
101
|
+
|
102
|
+
########################
|
103
|
+
#
|
104
|
+
# Mean stimation
|
105
|
+
#
|
106
|
+
########################
|
107
|
+
|
108
|
+
|
109
|
+
# Standard error. Known variance, sample with replacement.
|
110
|
+
def standard_error_ksd_wr(s, sam, pop)
|
111
|
+
s.quo(Math::sqrt(sam)) * Math::sqrt((pop-1).quo(pop))
|
112
|
+
end
|
113
|
+
|
114
|
+
# Standard error of the mean. Known variance, sample w/o replacement
|
115
|
+
def standard_error_ksd_wor(s,sam,pop)
|
116
|
+
s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam,pop))
|
117
|
+
end
|
118
|
+
|
119
|
+
alias_method :standard_error_esd_wr, :standard_error_ksd_wr
|
120
|
+
|
121
|
+
# Standard error of the mean.
|
122
|
+
# Estimated variance, without replacement
|
123
|
+
# Cochran (1972) p.47
|
124
|
+
def standard_error_esd_wor(s,sam,pop)
|
125
|
+
s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam,pop))
|
126
|
+
end
|
127
|
+
|
128
|
+
alias_method :standard_error, :standard_error_esd_wor
|
129
|
+
alias_method :se, :standard_error_esd_wor
|
130
|
+
|
131
|
+
# Standard error of total estimation
|
132
|
+
|
133
|
+
def standard_error_total(s,sam,pop)
|
134
|
+
pop*se(s,sam,pop)
|
135
|
+
end
|
136
|
+
|
137
|
+
# Confidence Interval using T-Student
|
138
|
+
# Use with n < 60
|
139
|
+
def mean_confidence_interval_t(mean,s,n_sample,n_population,margin=0.95)
|
140
|
+
t=GSL::Cdf.tdist_Pinv(1-((1-margin) / 2),n_sample-1)
|
141
|
+
mean_confidence_interval(mean,s,n_sample,n_population,t)
|
142
|
+
end
|
143
|
+
# Confidente Interval using Z
|
144
|
+
# Use with n > 60
|
145
|
+
def mean_confidence_interval_z(mean,s,n_sample,n_population,margin=0.95)
|
146
|
+
z=GSL::Cdf.ugaussian_Pinv(1-((1-margin) / 2))
|
147
|
+
mean_confidence_interval(mean,s,n_sample,n_population, z)
|
148
|
+
end
|
149
|
+
# Confidente interval using X.
|
150
|
+
#
|
151
|
+
# Better use mean_confidence_interval_z or mean_confidence_interval_t
|
152
|
+
def mean_confidence_interval(mean,s,n_sample,n_population,x)
|
153
|
+
range=x*se(s,n_sample,n_population)
|
154
|
+
[mean-range,mean+range]
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Statsample
|
2
|
+
# module for several statistical tests
|
3
|
+
module Test
|
4
|
+
# Calculate chi square for two Matrix
|
5
|
+
class << self
|
6
|
+
def chi_square(real,expected)
|
7
|
+
raise TypeError, "Both argument should be Matrix" unless real.is_a? Matrix and expected.is_a?Matrix
|
8
|
+
sum=0
|
9
|
+
(0...real.row_size).each {|row_i|
|
10
|
+
(0...real.column_size).each {|col_i|
|
11
|
+
|
12
|
+
val=((real[row_i,col_i].to_f - expected[row_i,col_i].to_f)**2) / expected[row_i,col_i].to_f
|
13
|
+
# puts "Real: #{real[row_i,col_i].to_f} ; esperado: #{expected[row_i,col_i].to_f}"
|
14
|
+
# puts "Diferencial al cuadrado: #{(real[row_i,col_i].to_f - expected[row_i,col_i].to_f)**2}"
|
15
|
+
sum+=val
|
16
|
+
}
|
17
|
+
}
|
18
|
+
sum
|
19
|
+
end
|
20
|
+
def t_significance
|
21
|
+
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,759 @@
|
|
1
|
+
class Array
|
2
|
+
def to_vector(*args)
|
3
|
+
Statsample::Vector.new(self,*args)
|
4
|
+
end
|
5
|
+
end
|
6
|
+
|
7
|
+
module Statsample
|
8
|
+
class << self
|
9
|
+
# Create a matrix using vectors as columns
|
10
|
+
# Use:
|
11
|
+
#
|
12
|
+
# matrix=Statsample.vector_cols_matrix(v1,v2)
|
13
|
+
def vector_cols_matrix(*vs)
|
14
|
+
# test
|
15
|
+
size=vs[0].size
|
16
|
+
vs.each{|v|
|
17
|
+
raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
|
18
|
+
raise ArgumentError,"Vectors size should be the same" if v.size!=size
|
19
|
+
}
|
20
|
+
Matrix.rows((0...size).to_a.collect() {|i|
|
21
|
+
vs.collect{|v| v[i]}
|
22
|
+
})
|
23
|
+
end
|
24
|
+
end
|
25
|
+
# Returns a duplicate of the input vectors, without missing data
|
26
|
+
# for any of the vectors
|
27
|
+
#
|
28
|
+
# a=[1,2,3,6,7,nil,3,5].to_vector(:scale)
|
29
|
+
# b=[nil,nil,5,6,4,5,10,2].to_vector(:scale)
|
30
|
+
# c=[2,4,6,7,4,5,6,7].to_vector(:scale)
|
31
|
+
# a2,b2,c2=Statsample.only_valid(a,b,c)
|
32
|
+
# => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
|
33
|
+
# #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
|
34
|
+
# #<Statsample::Scale:0xb748c760 @data=[6, 7, 4, 6, 7]>]
|
35
|
+
#
|
36
|
+
def self.only_valid(*vs)
|
37
|
+
i=1
|
38
|
+
h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a}
|
39
|
+
ds=Statsample::Dataset.new(h).dup_only_valid
|
40
|
+
ds.vectors.values
|
41
|
+
end
|
42
|
+
class Vector < DelegateClass(Array)
|
43
|
+
|
44
|
+
include Enumerable
|
45
|
+
attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils
|
46
|
+
attr_accessor :labels
|
47
|
+
# Creates a new
|
48
|
+
# data = Array of data
|
49
|
+
# t = level of meausurement. Could be:
|
50
|
+
# [:nominal] : Nominal level of measurement
|
51
|
+
# [:ordinal] : Ordinal level of measurement
|
52
|
+
# [:scale] : Scale level of meausurement
|
53
|
+
#
|
54
|
+
def initialize(data=[],t=:nominal,missing_values=[],labels={})
|
55
|
+
raise "Data should be an array" unless data.is_a? Array
|
56
|
+
@data=data
|
57
|
+
@missing_values=missing_values
|
58
|
+
@labels=labels
|
59
|
+
@type=t
|
60
|
+
@valid_data=[]
|
61
|
+
@data_with_nils=[]
|
62
|
+
@missing_data=[]
|
63
|
+
@has_missing_data=nil
|
64
|
+
_set_valid_data
|
65
|
+
self.type=t
|
66
|
+
super(@delegate)
|
67
|
+
end
|
68
|
+
def dup
|
69
|
+
Vector.new(@data.dup,@type,@missing_values.dup,@labels.dup)
|
70
|
+
end
|
71
|
+
# Returns an empty duplicate of the vector. Maintains the type, missing
|
72
|
+
# values, labels
|
73
|
+
def dup_empty
|
74
|
+
Vector.new([],@type,@missing_values.dup,@labels.dup)
|
75
|
+
end
|
76
|
+
# Return a vector usign the standarized values for data
|
77
|
+
# with sd with denominator N
|
78
|
+
def vector_standarized_pop
|
79
|
+
vector_standarized(true)
|
80
|
+
end
|
81
|
+
|
82
|
+
# Return a vector usign the standarized values for data
|
83
|
+
# with sd with denominator n-1
|
84
|
+
|
85
|
+
def vector_standarized(use_population=false)
|
86
|
+
raise "Should be a scale" unless @type==:scale
|
87
|
+
mean=@delegate.mean
|
88
|
+
sd=use_population ? @delegate.sdp : @delegate.sds
|
89
|
+
@data_with_nils.collect{|x|
|
90
|
+
if !x.nil?
|
91
|
+
(x.to_f - mean).quo(sd)
|
92
|
+
else
|
93
|
+
nil
|
94
|
+
end
|
95
|
+
}.to_vector(:scale)
|
96
|
+
end
|
97
|
+
alias_method :standarized, :vector_standarized
|
98
|
+
def box_cox_transformation(lambda)
|
99
|
+
raise "Should be a scale" unless @type==:scale
|
100
|
+
@data_with_nils.collect{|x|
|
101
|
+
if !x.nil?
|
102
|
+
if(lambda==0)
|
103
|
+
Math.log(x)
|
104
|
+
else
|
105
|
+
(x**lambda-1).quo(lambda)
|
106
|
+
end
|
107
|
+
else
|
108
|
+
nil
|
109
|
+
end
|
110
|
+
}.to_vector(:scale)
|
111
|
+
end
|
112
|
+
|
113
|
+
# Vector equality
|
114
|
+
# Two vector will be the same if their data, missing values, type, labels are equals
|
115
|
+
def ==(v2)
|
116
|
+
raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
|
117
|
+
@data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels=v2.labels
|
118
|
+
end
|
119
|
+
def _dump(i)
|
120
|
+
Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
|
121
|
+
end
|
122
|
+
def self._load(data)
|
123
|
+
h=Marshal.load(data)
|
124
|
+
Vector.new(h['data'], h['type'], h['missing_values'], h['labels'])
|
125
|
+
end
|
126
|
+
def recode
|
127
|
+
@data.collect{|x|
|
128
|
+
yield x
|
129
|
+
}.to_vector(@type)
|
130
|
+
end
|
131
|
+
def recode!
|
132
|
+
@data.collect!{|x|
|
133
|
+
yield x
|
134
|
+
}
|
135
|
+
set_valid_data
|
136
|
+
end
|
137
|
+
def each
|
138
|
+
@data.each{|x|
|
139
|
+
yield(x)
|
140
|
+
}
|
141
|
+
end
|
142
|
+
# Add a value at the end of the vector
|
143
|
+
# If second argument set to false, you should update valid data usign
|
144
|
+
# Vector#set_valid_data at the end of your insertion cycle
|
145
|
+
def add(v,update_valid=true)
|
146
|
+
@data.push(v)
|
147
|
+
set_valid_data if update_valid
|
148
|
+
end
|
149
|
+
def set_valid_data
|
150
|
+
@valid_data.clear
|
151
|
+
@missing_data.clear
|
152
|
+
@data_with_nils.clear
|
153
|
+
_set_valid_data
|
154
|
+
@delegate.set_gsl if(@type==:scale)
|
155
|
+
end
|
156
|
+
def _set_valid_data
|
157
|
+
if Statsample::OPTIMIZED
|
158
|
+
Statsample::_set_valid_data(self)
|
159
|
+
else
|
160
|
+
@data.each do |n|
|
161
|
+
if is_valid? n
|
162
|
+
@valid_data.push(n)
|
163
|
+
@data_with_nils.push(n)
|
164
|
+
else
|
165
|
+
@data_with_nils.push(nil)
|
166
|
+
@missing_data.push(n)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
@has_missing_data=@missing_data.size>0
|
170
|
+
end
|
171
|
+
end
|
172
|
+
# Retrieves true if data has one o more missing values
|
173
|
+
def has_missing_data?
|
174
|
+
@has_missing_data
|
175
|
+
end
|
176
|
+
def labeling(x)
|
177
|
+
@labels.has_key?(x) ? @labels[x].to_s : x.to_s
|
178
|
+
end
|
179
|
+
# Returns a Vector with the data with labels replaced by the label
|
180
|
+
def vector_labeled
|
181
|
+
d=@data.collect{|x|
|
182
|
+
if @labels.has_key? x
|
183
|
+
@labels[x]
|
184
|
+
else
|
185
|
+
x
|
186
|
+
end
|
187
|
+
}
|
188
|
+
Vector.new(d,@type)
|
189
|
+
end
|
190
|
+
def size
|
191
|
+
@data.size
|
192
|
+
end
|
193
|
+
def [](i)
|
194
|
+
@data[i]
|
195
|
+
end
|
196
|
+
def []=(i,v)
|
197
|
+
@data[i]=v
|
198
|
+
end
|
199
|
+
# Return true if a value is valid (not nil and not included on missing values)
|
200
|
+
def is_valid?(x)
|
201
|
+
!(x.nil? or @missing_values.include? x)
|
202
|
+
end
|
203
|
+
# Set missing_values
|
204
|
+
def missing_values=(vals)
|
205
|
+
@missing_values = vals
|
206
|
+
set_valid_data
|
207
|
+
end
|
208
|
+
# Set level of measurement.
|
209
|
+
def type=(t)
|
210
|
+
case t
|
211
|
+
when :nominal
|
212
|
+
@delegate=Nominal.new(@valid_data)
|
213
|
+
when :ordinal
|
214
|
+
@delegate=Ordinal.new(@valid_data)
|
215
|
+
when :scale
|
216
|
+
@delegate=Scale.new(@valid_data)
|
217
|
+
else
|
218
|
+
raise "Type doesn't exists"
|
219
|
+
end
|
220
|
+
__setobj__(@delegate)
|
221
|
+
@type=t
|
222
|
+
end
|
223
|
+
def n; @data.size ; end
|
224
|
+
def to_a
|
225
|
+
@data.dup
|
226
|
+
end
|
227
|
+
# Redundant, but necessary
|
228
|
+
# Spreadsheet creates Array#sum, so calling sum
|
229
|
+
# doesn't call the delegates method
|
230
|
+
def sum
|
231
|
+
@delegate.sum
|
232
|
+
end
|
233
|
+
alias_method :to_ary, :to_a
|
234
|
+
# Vector sum.
|
235
|
+
# - If v is a scalar, add this value to all elements
|
236
|
+
# - If v is a Array or a Vector, should be of the same size of this vector
|
237
|
+
# every item of this vector will be added to the value of the
|
238
|
+
# item at the same position on the other vector
|
239
|
+
def +(v)
|
240
|
+
_vector_ari("+",v)
|
241
|
+
end
|
242
|
+
# Vector rest.
|
243
|
+
# - If v is a scalar, rest this value to all elements
|
244
|
+
# - If v is a Array or a Vector, should be of the same
|
245
|
+
# size of this vector
|
246
|
+
# every item of this vector will be rested to the value of the
|
247
|
+
# item at the same position on the other vector
|
248
|
+
|
249
|
+
def -(v)
|
250
|
+
_vector_ari("-",v)
|
251
|
+
end
|
252
|
+
# Reports all values that doesn't comply with a condition
|
253
|
+
# Returns a hash with the index of data and the invalid data
|
254
|
+
def verify
|
255
|
+
h={}
|
256
|
+
(0...@data.size).to_a.each{|i|
|
257
|
+
if !(yield @data[i])
|
258
|
+
h[i]=@data[i]
|
259
|
+
end
|
260
|
+
}
|
261
|
+
h
|
262
|
+
end
|
263
|
+
def _vector_ari(method,v) # :nodoc:
|
264
|
+
if(v.is_a? Vector or v.is_a? Array)
|
265
|
+
if v.size==@data.size
|
266
|
+
i=0
|
267
|
+
sum=[]
|
268
|
+
0.upto(v.size-1) {|i|
|
269
|
+
if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
|
270
|
+
sum.push(@data[i].send(method,v[i]))
|
271
|
+
else
|
272
|
+
sum.push(nil)
|
273
|
+
end
|
274
|
+
}
|
275
|
+
Statsample::Vector.new(sum)
|
276
|
+
else
|
277
|
+
raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
|
278
|
+
end
|
279
|
+
elsif(v.respond_to? method )
|
280
|
+
Statsample::Vector.new(
|
281
|
+
@data.collect {|x|
|
282
|
+
if(!x.nil?)
|
283
|
+
x.send(method,v)
|
284
|
+
else
|
285
|
+
nil
|
286
|
+
end
|
287
|
+
}
|
288
|
+
)
|
289
|
+
else
|
290
|
+
raise TypeError,"You should pass a scalar or a array/vector"
|
291
|
+
end
|
292
|
+
|
293
|
+
end
|
294
|
+
# Return an array with the data splitted by a separator
|
295
|
+
# a=Vector.new(["a,b","c,d","a,b","d"])
|
296
|
+
# a.splitted
|
297
|
+
# [["a","b"],["c","d"],["a","b"],["d"]]
|
298
|
+
def splitted(sep=Statsample::SPLIT_TOKEN)
|
299
|
+
@data.collect{|x|
|
300
|
+
if x.nil?
|
301
|
+
nil
|
302
|
+
elsif (x.respond_to? :split)
|
303
|
+
x.split(sep)
|
304
|
+
else
|
305
|
+
[x]
|
306
|
+
end
|
307
|
+
}
|
308
|
+
end
|
309
|
+
# Returns a hash of Vectors, defined by the different values
|
310
|
+
# defined on the fields
|
311
|
+
# Example:
|
312
|
+
#
|
313
|
+
# a=Vector.new(["a,b","c,d","a,b"])
|
314
|
+
# a.split_by_separator
|
315
|
+
# {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88 @data=[1, 0, 1]>,
|
316
|
+
# "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48 @data=[1, 1, 0]>,
|
317
|
+
# "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08 @data=[0, 1, 1]>}
|
318
|
+
#
|
319
|
+
def split_by_separator(sep=Statsample::SPLIT_TOKEN)
|
320
|
+
split_data=splitted(sep)
|
321
|
+
factors=split_data.flatten.uniq.compact
|
322
|
+
out=factors.inject({}) {|a,x|
|
323
|
+
a[x]=[]
|
324
|
+
a
|
325
|
+
}
|
326
|
+
split_data.each{|r|
|
327
|
+
if r.nil?
|
328
|
+
factors.each{|f|
|
329
|
+
out[f].push(nil)
|
330
|
+
}
|
331
|
+
else
|
332
|
+
factors.each{|f|
|
333
|
+
out[f].push(r.include?(f) ? 1:0)
|
334
|
+
}
|
335
|
+
end
|
336
|
+
}
|
337
|
+
out.inject({}){|s,v|
|
338
|
+
s[v[0]]=Vector.new(v[1],:nominal)
|
339
|
+
s
|
340
|
+
}
|
341
|
+
end
|
342
|
+
def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
|
343
|
+
split_by_separator(sep).inject({}) {|a,v|
|
344
|
+
a[v[0]]=v[1].inject {|s,x| s+x.to_i}
|
345
|
+
a
|
346
|
+
}
|
347
|
+
end
|
348
|
+
|
349
|
+
# Returns an random sample of size n, with replacement,
|
350
|
+
# only with valid data.
|
351
|
+
#
|
352
|
+
# In all the trails, every item have the same probability
|
353
|
+
# of been selected
|
354
|
+
def sample_with_replacement(sample=1)
|
355
|
+
Vector.new(@delegate.sample_with_replacement(sample) ,@type)
|
356
|
+
end
|
357
|
+
# Returns an random sample of size n, without replacement,
|
358
|
+
# only with valid data.
|
359
|
+
#
|
360
|
+
# Every element could only be selected once
|
361
|
+
# A sample of the same size of the vector is the vector itself
|
362
|
+
|
363
|
+
def sample_without_replacement(sample=1)
|
364
|
+
Vector.new(@delegate.sample_without_replacement(sample),@type)
|
365
|
+
end
|
366
|
+
|
367
|
+
def count(x=false)
|
368
|
+
if block_given?
|
369
|
+
r=@data.inject(0) {|s, i|
|
370
|
+
r=yield i
|
371
|
+
s+(r ? 1 : 0)
|
372
|
+
}
|
373
|
+
r.nil? ? 0 : r
|
374
|
+
else
|
375
|
+
frequencies[x].nil? ? 0 : frequencies[x]
|
376
|
+
end
|
377
|
+
end
|
378
|
+
# returns the real type for the vector, according to its content
|
379
|
+
def db_type(dbs='mysql')
|
380
|
+
# first, detect any character not number
|
381
|
+
if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
|
382
|
+
return "DATE"
|
383
|
+
elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
|
384
|
+
return "VARCHAR (255)"
|
385
|
+
elsif @data.find {|v| v.to_s=~/\./}
|
386
|
+
return "DOUBLE"
|
387
|
+
else
|
388
|
+
return "INTEGER"
|
389
|
+
end
|
390
|
+
end
|
391
|
+
def summary(out="")
|
392
|
+
@delegate.summary(@labels,out)
|
393
|
+
end
|
394
|
+
def to_s
|
395
|
+
sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil":d}.join(","))
|
396
|
+
end
|
397
|
+
def inspect
|
398
|
+
self.to_s
|
399
|
+
end
|
400
|
+
|
401
|
+
end
|
402
|
+
|
403
|
+
|
404
|
+
|
405
|
+
# A vector of categorical (nominal level of measurement) data.
# Offers frequency tables, mode, proportions, text summaries and
# random sampling with and without replacement.
class Nominal
  # data: array of observations (callers pass only valid data).
  def initialize(data)
    @data = data
  end

  # Raw array of observations backing this object.
  def delegate_data
    @data
  end

  # Return an array of the different values of the data, sorted.
  def factors
    @data.uniq.sort
  end

  # Returns a hash with the distribution of frecuencies of
  # the sample.
  # Pure-Ruby implementation; aliased to #frequencies below unless a
  # faster native implementation is already defined.
  def frequencies_slow
    @data.inject(Hash.new) { |acc, value|
      acc[value] = (acc[value] || 0) + 1
      acc
    }
  end

  # Plot frequencies on a chart, using gnuplot
  def plot_frequencies
    require 'gnuplot'
    x = []
    y = []
    self.frequencies.sort.each { |k, v|
      x.push(k)
      y.push(v)
    }
    Gnuplot.open do |gp|
      Gnuplot::Plot.new(gp) do |plot|
        plot.boxwidth("0.9 absolute")
        plot.yrange("[0:#{y.max}]")
        plot.style("fill solid 1.00 border -1")
        plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
        plot.style("histogram")
        plot.style("data histogram")
        i = -1
        plot.set("xtics", "(" + x.collect { |v| i += 1; sprintf("\"%s\" %d", v, i) }.join(",") + ")")
        plot.data << Gnuplot::DataSet.new([y]) do |ds|
        end
      end
    end
  end

  # Returns the most frequent item.
  def mode
    frequencies.max { |a, b| a[1] <=> b[1] }[0]
  end

  # The number of items with valid data.
  def n_valid
    @data.size
  end

  # Returns a hash with the distribution of proportions of
  # the sample.
  def proportions
    frequencies.inject({}) { |acc, (value, count)|
      acc[value] = count.quo(@data.size)
      acc
    }
  end

  # Proportion of a given value.
  def proportion(v = 1)
    frequencies[v].quo(@data.size)
  end

  # Appends a plain-text summary (n, factors, mode, distribution) to +out+.
  # +labels+ maps values to display names.
  def summary(labels, out = "")
    out << sprintf("n valid:%d\n", n_valid)
    out << sprintf("factors:%s\n", factors.join(","))
    out << "mode:" + mode.to_s + "\n"
    out << "Distribution:\n"
    frequencies.sort.each { |k, v|
      key = labels.has_key?(k) ? labels[k] : k
      out << sprintf("%s : %s (%0.2f%%)\n", key, v, (v.quo(n_valid)) * 100)
    }
    out
  end

  # Returns an random sample of size n, with replacement,
  # only with valid data.
  #
  # In all the trails, every item have the same probability
  # of been selected
  def sample_with_replacement(sample)
    (0...sample).collect { @data[rand(@data.size)] }
  end

  # Returns an random sample of size n, without replacement,
  # only with valid data.
  #
  # Every element could only be selected once.
  # A sample of the same size of the vector is the vector itself.
  def sample_without_replacement(sample)
    raise ArgumentError, "Sample size couldn't be greater than n" if sample > @data.size
    out = []
    size = @data.size
    while out.size < sample
      value = rand(size)
      out.push(value) if !out.include? value
    end
    out.collect { |i| @data[i] }
  end

  # Variance of p, according to poblation size
  def variance_proportion(n_poblation, v = 1)
    Statsample::proportion_variance_sample(self.proportion(v), @data.size, n_poblation)
  end

  def variance_total(n_poblation, v = 1)
    Statsample::total_variance_sample(self.proportion(v), @data.size, n_poblation)
  end

  def proportion_confidence_interval_t(n_poblation, margin = 0.95, v = 1)
    Statsample::proportion_confidence_interval_t(proportion(v), @data.size, n_poblation, margin)
  end

  def proportion_confidence_interval_z(n_poblation, margin = 0.95, v = 1)
    Statsample::proportion_confidence_interval_z(proportion(v), @data.size, n_poblation, margin)
  end

  # For every *_slow method, alias the plain name to it unless a faster
  # implementation is already defined.
  # BUG FIX: instance_methods yields Symbols on Ruby >= 1.9 and Kernel#=~
  # was removed in Ruby 3.2, so the match and gsub must operate on met.to_s;
  # the original code never created the #frequencies alias on modern Rubies.
  self.instance_methods.find_all { |met| met.to_s =~ /_slow$/ }.each { |met|
    met_or = met.to_s.gsub("_slow", "")
    if !self.method_defined?(met_or)
      alias_method met_or, met
    end
  }
end
|
531
|
+
|
532
|
+
# A vector of data measured at the ordinal level.
# Adds order-dependent statistics (percentiles, median, ranking)
# on top of Nominal.
class Ordinal < Nominal
  # Return the value of the percentil q (q in 0..100).
  def percentil(q)
    sorted = @data.sort
    position = (n_valid * q).quo(100)
    if position.to_i != position
      sorted[position.to_i]
    else
      # Exact integer position: average the two neighbouring values.
      (sorted[(position - 0.5).to_i].to_f + sorted[(position + 0.5).to_i]).quo(2)
    end
  end

  # Returns a ranked vector; tied values share the mean of the
  # ranks they occupy.
  def ranked(type = :ordinal)
    count = 0
    rank_of = frequencies.sort.inject({}) do |acc, pair|
      acc[pair[0]] = (count + 1 + count + pair[1]).quo(2)
      count += pair[1]
      acc
    end
    @data.collect do |value|
      rank_of[value]
    end.to_vector(type)
  end

  # Return the median (percentil 50).
  def median
    percentil(50)
  end

  if HAS_GSL
    # Keep the pure-Ruby implementation reachable under a _slow suffix,
    # then override with the GSL-backed version.
    %w{median}.each do |m|
      alias_method (m + "_slow").intern, m.intern
    end

    #def percentil(p)
    # v=GSL::Vector.alloc(@data.sort)
    # v.stats_quantile_from_sorted_data(p)
    #end
    def median # :nodoc:
      GSL::Stats::median_from_sorted_data(GSL::Vector.alloc(@data.sort))
    end
  end

  # Minimun value
  def min
    @data.min
  end

  # Maximum value
  def max
    @data.max
  end

  # Appends a plain-text summary (n, median, quartiles) to +out+.
  def summary(labels, out = "")
    out << sprintf("n valid:%d\n", n_valid)
    out << "median:" + median.to_s + "\n"
    out << "percentil 25:" + percentil(25).to_s + "\n"
    out << "percentil 75:" + percentil(75).to_s + "\n"
    out
  end
end
|
587
|
+
# A vector of data measured at interval/ratio (scale) level.
# Adds sums, means, variances and higher moments on top of Ordinal.
class Scale < Ordinal
  # GSL vector mirroring @data (only set when GSL is available).
  attr_reader :gsl
  def initialize(data)
    super(data)
    set_gsl
  end

  # Custom marshalling: only the raw data is serialized; the GSL
  # mirror is rebuilt on load.
  def _dump(i)
    Marshal.dump(@data)
  end

  def _load(data)
    @data = Marshal.restore(data)
    set_gsl
  end

  # Coerces every element of @data to a number (in place) and, when GSL
  # is available, builds the mirror GSL vector.
  # (Removed the unused local the original assigned collect!'s result to.)
  def set_gsl # :nodoc
    @data.collect! { |x|
      if x.is_a? Numeric
        x
      elsif x.is_a? String and x.to_i == x.to_f
        x.to_i
      else
        x.to_f
      end
    }
    if HAS_GSL
      @gsl = GSL::Vector.alloc(@data) if @data.size > 0
    end
  end

  # The range of the data (max - min)
  def range
    @data.max - @data.min
  end

  # The sum of values for the data
  def sum
    @data.inject(0) { |a, x| x + a }
  end

  # The arithmetical mean of data
  def mean
    sum.to_f.quo(n_valid)
  end

  # Sum of squared deviations from +m+ (defaults to the mean).
  # NOTE(review): relies on Numeric#square, presumably monkey-patched
  # elsewhere in this library.
  def sum_of_squares(m = nil)
    m ||= mean
    @data.inject(0) { |a, x| a + (x - m).square }
  end

  # Sum of squared deviation, computed from raw sums.
  def sum_of_squared_deviation
    @data.inject(0) { |a, x| x.square + a } - (sum.square.quo(n_valid))
  end

  # Population variance (divided by n)
  def variance_population(m = nil)
    m ||= mean
    squares = @data.inject(0) { |a, x| x.square + a }
    squares.quo(n_valid) - m.square
  end

  # Population Standard deviation (divided by n)
  def standard_deviation_population(m = nil)
    Math::sqrt(variance_population(m))
  end

  # Sample Variance (divided by n-1)
  def variance_sample(m = nil)
    m ||= mean
    sum_of_squares(m).quo(n_valid - 1)
  end

  # Sample Standard deviation (divided by n-1)
  # BUG FIX: was "m ||= m", a self-assignment typo. Harmless in practice
  # only because variance_sample defaults a nil m itself.
  def standard_deviation_sample(m = nil)
    m ||= mean
    Math::sqrt(variance_sample(m))
  end

  # Sample skewness (third standardized moment).
  # Hoisted the mean out of the loop: the original recomputed #mean for
  # every element despite having assigned it to m. Result is identical.
  def skew
    m = mean
    thirds = @data.inject(0) { |a, x| a + ((x - m)**3) }
    thirds.quo((@data.size - 1) * sd**3)
  end

  # Sample kurtosis (fourth standardized moment). Same hoist as #skew.
  def kurtosis
    m = mean
    fourths = @data.inject(0) { |a, x| a + ((x - m)**4) }
    fourths.quo((@data.size - 1) * sd**4)
  end

  if HAS_GSL
    # Keep the pure-Ruby implementations reachable under a _slow suffix,
    # then override with GSL-backed versions.
    %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each { |m|
      m_nuevo = (m + "_slow").intern
      alias_method m_nuevo, m.intern
    }
    def sum # :nodoc:
      @gsl.sum
    end

    def mean # :nodoc:
      @gsl.mean
    end

    def variance_sample(m = nil) # :nodoc:
      m ||= mean
      @gsl.variance_m
    end

    def standard_deviation_sample(m = nil) # :nodoc:
      m ||= mean
      @gsl.sd(m)
    end

    def variance_population(m = nil) # :nodoc:
      m ||= mean
      @gsl.variance_with_fixed_mean(m)
    end

    def standard_deviation_population(m = nil) # :nodoc:
      m ||= mean
      @gsl.sd_with_fixed_mean(m)
    end

    def skew
      @gsl.skew
    end

    def kurtosis
      @gsl.kurtosis
    end

    # Create a GSL::Histogram
    # With a fixnum, creates X bins within the range of data
    # With an Array, each value will be a cut point
    def histogram(bins = 10)
      if bins.is_a? Array
        h = GSL::Histogram.alloc(bins)
      else
        # ugly patch. The upper limit for a bin has the form
        # x < range, so stretch it slightly to include the maximum.
        h = GSL::Histogram.alloc(bins, [@data.min, @data.max + 0.0001])
      end
      h.increment(@gsl)
      h
    end

    def plot_histogram(bins = 10, options = "")
      self.histogram(bins).graph(options)
    end

    def sample_with_replacement(k)
      r = GSL::Rng.alloc(GSL::Rng::MT19937, rand(10000))
      r.sample(@gsl, k).to_a
    end

    def sample_without_replacement(k)
      r = GSL::Rng.alloc(GSL::Rng::MT19937, rand(10000))
      r.choose(@gsl, k).to_a
    end
  end

  # Coefficient of variation
  # Calculed with the sample standard deviation
  def coefficient_of_variation
    standard_deviation_sample.quo(mean)
  end

  # Appends a plain-text summary (n, mean, sum, range, variances, sds)
  # to +out+.
  def summary(labels, out = "")
    out << sprintf("n valid:%d\n", n_valid)
    out << "mean:" + mean.to_s + "\n"
    out << "sum:" + sum.to_s + "\n"
    out << "range:" + range.to_s + "\n"
    out << "variance (pop):" + variance_population.to_s + "\n"
    out << "sd (pop):" + sdp.to_s + "\n"
    out << "variance (sample):" + variance_sample.to_s + "\n"
    out << "sd (sample):" + sds.to_s + "\n"
    out
  end

  alias_method :sdp, :standard_deviation_population
  alias_method :sds, :standard_deviation_sample
  alias_method :cov, :coefficient_of_variation
  alias_method :variance, :variance_sample
  alias_method :sd, :standard_deviation_sample
  alias_method :ss, :sum_of_squares
end
|
759
|
+
end
|