statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,235 @@
1
module Statsample
  module Reliability
    class << self
      # Cronbach's alpha for a given dataset.
      # Only uses tuples (rows) without missing data.
      def cronbach_alpha(ods)
        ds = ods.dup_only_valid
        n_items = ds.fields.size
        sum_var_items = ds.vectors.inject(0) do |acc, (_field, vector)|
          acc + vector.variance_sample
        end
        total = ds.vector_sum
        (n_items / (n_items - 1).to_f) *
          (1 - (sum_var_items / total.variance_sample))
      end

      # Cronbach's alpha for a given dataset, using standardized values
      # for every vector. Only uses tuples without missing data.
      def cronbach_alpha_standarized(ods)
        standardized = ods.fields.each_with_object({}) do |field, acc|
          acc[field] = ods[field].vector_standarized
        end
        cronbach_alpha(standardized.to_dataset)
      end
    end

    # Tabulates, for every field, how often each answer occurs at each
    # value of the total score — the raw material for an item
    # characteristic curve.
    class ItemCharacteristicCurve
      attr_reader :totals, :counts, :vector_total

      # ds:: dataset with one field per item.
      # vector_total:: optional total-score vector; defaults to the
      #                row-wise sum of the dataset.
      def initialize(ds, vector_total = nil)
        vector_total ||= ds.vector_sum
        raise "Total size != Dataset size" if vector_total.size != ds.cases
        @vector_total = vector_total
        @ds = ds
        @totals = {}
        @counts = @ds.fields.each_with_object({}) { |f, acc| acc[f] = {} }
        process
      end

      # Single pass over the dataset, counting totals and per-item answers.
      def process
        row_index = 0
        @ds.each do |row|
          tot = @vector_total[row_index]
          @totals[tot] ||= 0
          @totals[tot] += 1
          @ds.fields.each do |f|
            answer = row[f].to_s
            @counts[f][tot] ||= {}
            @counts[f][tot][answer] ||= 0
            @counts[f][tot][answer] += 1
          end
          row_index += 1
        end
      end

      # Proportion of cases answering +item+ on +field+, keyed by total
      # score.
      def curve_field(field, item)
        item = item.to_s
        @totals.each_with_object({}) do |(value, n), out|
          count_value = @counts[field][value][item].nil? ? 0 : @counts[field][value][item]
          out[value] = count_value.to_f / n.to_f
        end
      end
    end

    # Classical item analysis: scale statistics, per-item statistics and
    # Cronbach's alpha.
    class ItemAnalysis
      attr_reader :mean, :sd, :valid_n, :alpha, :alpha_standarized

      def initialize(ds)
        @ds = ds.dup_only_valid
        @total = @ds.vector_sum
        @mean = @total.mean
        @median = @total.median
        @skew = @total.skew
        @kurtosis = @total.kurtosis
        @sd = @total.sdp
        @valid_n = @total.size
        begin
          @alpha = Statsample::Reliability.cronbach_alpha(ds)
          @alpha_standarized = Statsample::Reliability.cronbach_alpha_standarized(ds)
        rescue => e
          raise DatasetException.new(@ds, e), "Problem on calculate alpha"
        end
      end

      # Returns a hash of field => {total score => mean item score}
      # (an empirical item characteristic curve per item).
      def item_characteristic_curve
        out = {}
        total = {}
        row_index = 0
        @ds.each do |row|
          tot = @total[row_index]
          @ds.fields.each do |f|
            out[f] ||= {}
            total[f] ||= {}
            out[f][tot] ||= 0
            total[f][tot] ||= 0
            out[f][tot] += row[f]
            total[f][tot] += 1
          end
          row_index += 1
        end
        total.each do |f, per_total|
          per_total.each_key do |tot|
            out[f][tot] = out[f][tot].to_f / total[f][tot]
          end
        end
        out
      end

      # Plot every item's characteristic curve using gnuplot.
      def gnuplot_item_characteristic_curve(directory, base = "crd", options = {})
        require 'gnuplot'
        crd = item_characteristic_curve
        @ds.fields.each do |f|
          x = []
          y = []
          Gnuplot.open do |gp|
            Gnuplot::Plot.new(gp) do |plot|
              crd[f].sort.each do |tot, prop|
                x.push(tot)
                y.push((prop * 100).to_i.to_f / 100)
              end
              plot.data << Gnuplot::DataSet.new([x, y]) do |ds|
                ds.with = "linespoints"
                ds.notitle
              end
            end
          end
        end
      end

      # Render every item's characteristic curve as an SVG file inside
      # +directory+.
      def svggraph_item_characteristic_curve(directory, base = "icc", options = {})
        require 'statsample/graph/svggraph'
        crd = ItemCharacteristicCurve.new(@ds)
        @ds.fields.each do |f|
          factors = @ds[f].factors.sort
          options = {
            :height => 500,
            :width => 800,
            :key => true
          }.update(options)
          graph = ::SVG::Graph::Plot.new(options)
          factors.each do |factor|
            factor = factor.to_s
            dataset = []
            crd.curve_field(f, factor).each do |tot, prop|
              dataset.push(tot)
              dataset.push((prop * 100).to_i.to_f / 100)
            end
            graph.add_data({
              :title => "#{factor}",
              :data => dataset
            })
          end
          File.open(directory + "/" + base + "_#{f}.svg", "w") do |fp|
            fp.puts(graph.burn())
          end
        end
      end

      # Pearson correlation of each item with the total of the remaining
      # items.
      def item_total_correlation
        @ds.fields.each_with_object({}) do |field, acc|
          vector = @ds[field].dup
          rest = @ds.dup
          rest.delete_vector(field)
          total = rest.vector_sum
          acc[field] = Statsample::Bivariate.pearson(vector, total)
        end
      end

      # Mean and sample standard deviation for every item.
      def item_statistics
        @ds.fields.each_with_object({}) do |field, acc|
          acc[field] = { :mean => @ds[field].mean, :sds => @ds[field].sds }
        end
      end

      # Scale statistics recomputed with each item deleted in turn.
      def stats_if_deleted
        @ds.fields.each_with_object({}) do |field, acc|
          rest = @ds.dup
          rest.delete_vector(field)
          total = rest.vector_sum
          acc[field] = {}
          acc[field][:mean] = total.mean
          acc[field][:sds] = total.sds
          acc[field][:variance_sample] = total.variance_sample
          acc[field][:alpha] = Statsample::Reliability.cronbach_alpha(rest)
        end
      end

      # HTML report: scale summary plus a per-item table.
      def html_summary
        html = <<EOF
<p><strong>Summary for scale:</strong></p>
<ul>
<li>Mean=#{@mean}</li>
<li>Std.Dv.=#{@sd}</li>
<li>Median=#{@median}</li>
<li>Skewness=#{sprintf("%0.3f",@skew)}</li>
<li>Kurtosis=#{sprintf("%0.3f",@kurtosis)}</li>

<li>Valid n:#{@valid_n}</li>
<li>Cronbach alpha: #{@alpha}</li>
</ul>
<table><thead><th>Variable</th>

<th>Mean</th>
<th>StDv.</th>
<th>Mean if deleted</th><th>Var. if
deleted</th><th> StDv. if
deleted</th><th> Itm-Totl
Correl.</th><th>Alpha if
deleted</th></thead>
EOF
        itc = item_total_correlation
        sid = stats_if_deleted
        is = item_statistics
        @ds.fields.each do |f|
          html << <<EOF
<tr>
<td>#{f}</td>
<td>#{sprintf("%0.5f",is[f][:mean])}</td>
<td>#{sprintf("%0.5f",is[f][:sds])}</td>
<td>#{sprintf("%0.5f",sid[f][:mean])}</td>
<td>#{sprintf("%0.5f",sid[f][:variance_sample])}</td>
<td>#{sprintf("%0.5f",sid[f][:sds])}</td>
<td>#{sprintf("%0.5f",itc[f])}</td>
<td>#{sprintf("%0.5f",sid[f][:alpha])}</td>
</tr>
EOF
        end
        html << "</table><hr />"
        html
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module Statsample
  # Helpers for resampling-based procedures.
  module Resample
    class << self
      # Run +action+ the given number of times and collect every result
      # into an array, preserving call order.
      def repeat_and_save(times, &action)
        (1..times).map { action.call }
      end

      # Build a scale Vector of +size+ random integers drawn uniformly
      # from low..upper (inclusive).
      def generate(size, low, upper)
        span = upper - low + 1
        Vector.new(Array.new(size) { rand(span) + low }, :scale)
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
module Statsample
  # Several methods to estimate parameters for simple random sampling.
  module SRS
    class << self
      ########################
      #
      # Proportion estimation
      #
      ########################

      # Finite population correction (applied to a variance).
      # Source: Cochran (1972).
      def fpc_var(sam, pop)
        (pop - sam).quo(pop - 1)
      end

      # Finite population correction (applied to a standard deviation).
      def fpc(sam, pop)
        Math::sqrt((pop - sam).quo(pop - 1))
      end

      # Non-sample fraction: 1 - sample fraction.
      def qf(sam, pop)
        1 - (sam.quo(pop))
      end

      # Sample size estimation for proportions, infinite population.
      # d:: desired precision; prop:: expected proportion.
      def estimation_n0(d, prop, margin = 0.95)
        t = GSL::Cdf.ugaussian_Pinv(1 - (1 - margin).quo(2))
        var = prop * (1 - prop)
        t**2 * var.quo(d**2)
      end

      # Sample size estimation for proportions, finite population.
      def estimation_n(d, prop, n_pobl, margin = 0.95)
        n0 = estimation_n0(d, prop, margin)
        n0.quo(1 + ((n0 - 1).quo(n_pobl)))
      end

      # Proportion confidence interval with t values.
      # Uses estimated proportion, sample without replacement.
      def proportion_confidence_interval_t(prop, n_sample, n_population, margin = 0.95)
        t = GSL::Cdf.tdist_Pinv(1 - ((1 - margin).quo(2)), n_sample - 1)
        proportion_confidence_interval(prop, n_sample, n_population, t)
      end

      # Proportion confidence interval with z values.
      # Uses estimated proportion, sample without replacement.
      def proportion_confidence_interval_z(p, n_sample, n_population, margin = 0.95)
        z = GSL::Cdf.ugaussian_Pinv(1 - ((1 - margin).quo(2)))
        proportion_confidence_interval(p, n_sample, n_population, z)
      end

      # Proportion confidence interval for a given critical value +x+.
      # Uses estimated proportion, sample without replacement.
      # (An unused local `f = sam.quo(pop)` was removed.)
      def proportion_confidence_interval(p, sam, pop, x)
        one_range = x * Math::sqrt((qf(sam, pop) * p * (1 - p)) / (sam - 1)) + (1.quo(sam * 2.0))
        [p - one_range, p + one_range]
      end

      # Standard deviation for the sample distribution of a proportion.
      # Known proportion, sample with replacement.
      # Based on http://stattrek.com/Lesson6/SRS.aspx
      def proportion_sd_kp_wr(p, n_sample)
        Math::sqrt(p * (1 - p).quo(n_sample))
      end

      # Standard deviation for the sample distribution of a proportion.
      # Known proportion, sample without replacement.
      #
      # Sources:
      # * http://stattrek.com/Lesson6/SRS.aspx
      # * Cochran (1972)
      def proportion_sd_kp_wor(p, sam, pop)
        fpc(sam, pop) * Math::sqrt(p * (1 - p).quo(sam))
      end

      # Standard deviation for the sample distribution of a proportion.
      # Estimated proportion, sample with replacement.
      # Based on http://stattrek.com/Lesson6/SRS.aspx
      def proportion_sd_ep_wr(p, n_sample)
        Math::sqrt(p * (1 - p).quo(n_sample - 1))
      end

      # Standard deviation for the sample distribution of a proportion.
      # Estimated proportion, sample without replacement.
      # Source: Cochran (1972), Técnicas de muestreo.
      def proportion_sd_ep_wor(p, sam, pop)
        fsc = (pop - sam).quo((sam - 1) * pop)
        Math::sqrt(fsc * p * (1 - p))
      end

      # Total estimation sd based on sample.
      # Known proportion, sample without replacement.
      # Source: Cochran (1972).
      def proportion_total_sd_kp_wor(prop, sam, pop)
        # BUG FIX: the original body was `pob * proportion_sd_kp_wor(p, sam, pop)`
        # — both `pob` and `p` are undefined locals, so every call raised
        # NameError. Use the actual parameters.
        pop * proportion_sd_kp_wor(prop, sam, pop)
      end

      # Total estimation sd based on sample.
      # Estimated proportion, sample without replacement.
      # Source: Cochran (1972).
      def proportion_total_sd_ep_wor(prop, sam, pop)
        fsc = ((pop - sam).to_f / (sam - 1))
        Math::sqrt(fsc * pop * prop * (1 - prop))
      end

      ########################
      #
      # Mean estimation
      #
      ########################

      # Standard error. Known variance, sample with replacement.
      def standard_error_ksd_wr(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt((pop - 1).quo(pop))
      end

      # Standard error of the mean. Known variance, sample w/o replacement.
      def standard_error_ksd_wor(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam, pop))
      end

      alias_method :standard_error_esd_wr, :standard_error_ksd_wr

      # Standard error of the mean.
      # Estimated variance, without replacement.
      # Cochran (1972) p. 47.
      def standard_error_esd_wor(s, sam, pop)
        s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam, pop))
      end

      alias_method :standard_error, :standard_error_esd_wor
      alias_method :se, :standard_error_esd_wor

      # Standard error of total estimation.
      def standard_error_total(s, sam, pop)
        pop * se(s, sam, pop)
      end

      # Confidence interval using Student's t.
      # Use with n < 60.
      def mean_confidence_interval_t(mean, s, n_sample, n_population, margin = 0.95)
        t = GSL::Cdf.tdist_Pinv(1 - ((1 - margin) / 2), n_sample - 1)
        mean_confidence_interval(mean, s, n_sample, n_population, t)
      end

      # Confidence interval using z.
      # Use with n > 60.
      def mean_confidence_interval_z(mean, s, n_sample, n_population, margin = 0.95)
        z = GSL::Cdf.ugaussian_Pinv(1 - ((1 - margin) / 2))
        mean_confidence_interval(mean, s, n_sample, n_population, z)
      end

      # Confidence interval given a critical value +x+.
      # Prefer mean_confidence_interval_z or mean_confidence_interval_t.
      def mean_confidence_interval(mean, s, n_sample, n_population, x)
        range = x * se(s, n_sample, n_population)
        [mean - range, mean + range]
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module Statsample
  # Module for several statistical tests.
  module Test
    class << self
      # Chi-square statistic between two matrices of observed (+real+)
      # and +expected+ frequencies: sum of (O - E)^2 / E over all cells.
      def chi_square(real, expected)
        unless real.is_a?(Matrix) && expected.is_a?(Matrix)
          raise TypeError, "Both argument should be Matrix"
        end
        (0...real.row_size).inject(0) do |sum, row_i|
          (0...real.column_size).inject(sum) do |acc, col_i|
            observed = real[row_i, col_i].to_f
            wanted = expected[row_i, col_i].to_f
            acc + ((observed - wanted)**2) / wanted
          end
        end
      end

      # Placeholder: not implemented yet.
      def t_significance
      end
    end
  end
end
@@ -0,0 +1,759 @@
1
class Array
  # Build a Statsample::Vector from this array. Extra arguments are
  # forwarded to Statsample::Vector#initialize (type, missing values,
  # labels).
  def to_vector(*args)
    Statsample::Vector.new(self, *args)
  end
end
6
+
7
+ module Statsample
8
class << self
  # Create a matrix using vectors as columns.
  # Use:
  #
  #   matrix = Statsample.vector_cols_matrix(v1, v2)
  def vector_cols_matrix(*vs)
    size = vs[0].size
    vs.each do |v|
      raise ArgumentError, "Arguments should be Vector" unless v.instance_of? Statsample::Vector
      raise ArgumentError, "Vectors size should be the same" if v.size != size
    end
    Matrix.rows((0...size).map { |row| vs.map { |v| v[row] } })
  end
end
25
# Returns a duplicate of the input vectors, keeping only the rows that
# are valid (non-missing) in every vector.
#
#   a = [1, 2, 3, 6, 7, nil, 3, 5].to_vector(:scale)
#   b = [nil, nil, 5, 6, 4, 5, 10, 2].to_vector(:scale)
#   c = [2, 4, 6, 7, 4, 5, 6, 7].to_vector(:scale)
#   a2, b2, c2 = Statsample.only_valid(a, b, c)
def self.only_valid(*vs)
  fields = {}
  vs.each_with_index { |v, index| fields["v#{index + 1}"] = v }
  Statsample::Dataset.new(fields).dup_only_valid.vectors.values
end
42
class Vector < DelegateClass(Array)
  include Enumerable
  attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils
  attr_accessor :labels

  # Creates a new vector.
  # data = Array of data
  # t = level of measurement. Could be:
  # [:nominal] : Nominal level of measurement
  # [:ordinal] : Ordinal level of measurement
  # [:scale]   : Scale level of measurement
  # missing_values = values treated as missing; labels = value => label.
  def initialize(data = [], t = :nominal, missing_values = [], labels = {})
    raise "Data should be an array" unless data.is_a? Array
    @data = data
    @missing_values = missing_values
    @labels = labels
    @type = t
    @valid_data = []
    @data_with_nils = []
    @missing_data = []
    @has_missing_data = nil
    _set_valid_data
    self.type = t   # builds the delegate (Nominal/Ordinal/Scale)
    super(@delegate)
  end

  def dup
    Vector.new(@data.dup, @type, @missing_values.dup, @labels.dup)
  end

  # Returns an empty duplicate of the vector. Maintains the type,
  # missing values and labels.
  def dup_empty
    Vector.new([], @type, @missing_values.dup, @labels.dup)
  end

  # Return a vector using the standardized values for data,
  # with sd with denominator N.
  def vector_standarized_pop
    vector_standarized(true)
  end

  # Return a vector using the standardized values for data,
  # with sd with denominator n-1 (or N if use_population is true).
  def vector_standarized(use_population = false)
    raise "Should be a scale" unless @type == :scale
    m = @delegate.mean
    sd = use_population ? @delegate.sdp : @delegate.sds
    @data_with_nils.collect { |x|
      x.nil? ? nil : (x.to_f - m).quo(sd)
    }.to_vector(:scale)
  end
  alias_method :standarized, :vector_standarized

  # Box-Cox power transformation for the given lambda
  # (natural log when lambda == 0).
  def box_cox_transformation(lambda)
    raise "Should be a scale" unless @type == :scale
    @data_with_nils.collect { |x|
      if x.nil?
        nil
      elsif lambda == 0
        Math.log(x)
      else
        (x**lambda - 1).quo(lambda)
      end
    }.to_vector(:scale)
  end

  # Vector equality.
  # Two vectors are equal if their data, missing values, type and labels
  # are all equal.
  def ==(v2)
    raise TypeError, "Argument should be a Vector" unless v2.instance_of? Statsample::Vector
    # BUG FIX: the labels term originally read `@labels=v2.labels` —
    # an assignment, which is always truthy — so labels were never
    # compared and this vector's labels were silently overwritten
    # with v2's on every comparison.
    @data == v2.data and @missing_values == v2.missing_values and
      @type == v2.type and @labels == v2.labels
  end

  def _dump(i)
    Marshal.dump({ 'data' => @data, 'missing_values' => @missing_values, 'labels' => @labels, 'type' => @type })
  end

  def self._load(data)
    h = Marshal.load(data)
    Vector.new(h['data'], h['type'], h['missing_values'], h['labels'])
  end

  # Returns a new Vector with every datum replaced by the block's value.
  def recode
    @data.collect { |x| yield x }.to_vector(@type)
  end

  # In-place #recode; revalidates the cached data afterwards.
  def recode!
    @data.collect! { |x| yield x }
    set_valid_data
  end

  def each
    @data.each { |x| yield(x) }
  end

  # Add a value at the end of the vector.
  # If the second argument is false, you should call
  # Vector#set_valid_data at the end of your insertion cycle.
  def add(v, update_valid = true)
    @data.push(v)
    set_valid_data if update_valid
  end

  # Rebuild the valid/missing caches from @data.
  def set_valid_data
    @valid_data.clear
    @missing_data.clear
    @data_with_nils.clear
    _set_valid_data
    @delegate.set_gsl if @type == :scale
  end

  def _set_valid_data
    if Statsample::OPTIMIZED
      Statsample::_set_valid_data(self)
    else
      @data.each do |n|
        if is_valid? n
          @valid_data.push(n)
          @data_with_nils.push(n)
        else
          @data_with_nils.push(nil)
          @missing_data.push(n)
        end
      end
      @has_missing_data = @missing_data.size > 0
    end
  end

  # Returns true if the data has one or more missing values.
  def has_missing_data?
    @has_missing_data
  end

  # Label for a value (the value itself, stringified, when unlabeled).
  def labeling(x)
    @labels.has_key?(x) ? @labels[x].to_s : x.to_s
  end

  # Returns a Vector with labeled values replaced by their labels.
  def vector_labeled
    d = @data.collect { |x| @labels.has_key?(x) ? @labels[x] : x }
    Vector.new(d, @type)
  end

  def size
    @data.size
  end

  def [](i)
    @data[i]
  end

  def []=(i, v)
    @data[i] = v
  end

  # Return true if a value is valid (not nil and not included in the
  # missing values).
  def is_valid?(x)
    !(x.nil? or @missing_values.include? x)
  end

  # Set missing_values and refresh the caches.
  def missing_values=(vals)
    @missing_values = vals
    set_valid_data
  end

  # Set level of measurement, rebuilding the delegate.
  def type=(t)
    case t
    when :nominal
      @delegate = Nominal.new(@valid_data)
    when :ordinal
      @delegate = Ordinal.new(@valid_data)
    when :scale
      @delegate = Scale.new(@valid_data)
    else
      raise "Type doesn't exists"
    end
    __setobj__(@delegate)
    @type = t
  end

  def n; @data.size; end

  def to_a
    @data.dup
  end
  alias_method :to_ary, :to_a

  # Redundant, but necessary: Spreadsheet creates Array#sum, so calling
  # sum doesn't call the delegate's method otherwise.
  def sum
    @delegate.sum
  end

  # Vector sum.
  # - If v is a scalar, add this value to all elements.
  # - If v is an Array or a Vector of the same size, add element-wise.
  def +(v)
    _vector_ari("+", v)
  end

  # Vector subtraction.
  # - If v is a scalar, subtract this value from all elements.
  # - If v is an Array or a Vector of the same size, subtract element-wise.
  def -(v)
    _vector_ari("-", v)
  end

  # Reports all values that don't comply with a condition.
  # Returns a hash with the index of data and the invalid data.
  def verify
    h = {}
    (0...@data.size).each do |i|
      h[i] = @data[i] unless yield @data[i]
    end
    h
  end

  def _vector_ari(method, v) # :nodoc:
    if v.is_a? Vector or v.is_a? Array
      if v.size == @data.size
        result = []
        0.upto(v.size - 1) do |i|
          if (v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?)
            result.push(@data[i].send(method, v[i]))
          else
            result.push(nil)
          end
        end
        Statsample::Vector.new(result)
      else
        raise ArgumentError, "The array/vector parameter should be of the same size of the original vector"
      end
    elsif v.respond_to? method
      Statsample::Vector.new(
        @data.collect { |x| x.nil? ? nil : x.send(method, v) }
      )
    else
      raise TypeError, "You should pass a scalar or a array/vector"
    end
  end

  # Return an array with the data split by a separator.
  #   a = Vector.new(["a,b", "c,d", "a,b", "d"])
  #   a.splitted
  #   => [["a","b"], ["c","d"], ["a","b"], ["d"]]
  def splitted(sep = Statsample::SPLIT_TOKEN)
    @data.collect do |x|
      if x.nil?
        nil
      elsif x.respond_to? :split
        x.split(sep)
      else
        [x]
      end
    end
  end

  # Returns a hash of binary (0/1) Vectors, one per distinct token found
  # when splitting the data by +sep+ (nil rows stay nil).
  def split_by_separator(sep = Statsample::SPLIT_TOKEN)
    split_data = splitted(sep)
    factors = split_data.flatten.uniq.compact
    out = factors.inject({}) { |a, x| a[x] = []; a }
    split_data.each do |r|
      if r.nil?
        factors.each { |f| out[f].push(nil) }
      else
        factors.each { |f| out[f].push(r.include?(f) ? 1 : 0) }
      end
    end
    out.inject({}) { |s, v| s[v[0]] = Vector.new(v[1], :nominal); s }
  end

  # Frequency of each token produced by split_by_separator.
  def split_by_separator_freq(sep = Statsample::SPLIT_TOKEN)
    split_by_separator(sep).inject({}) do |a, v|
      a[v[0]] = v[1].inject { |s, x| s + x.to_i }
      a
    end
  end

  # Returns a random sample of size n, with replacement, only from
  # valid data. Every item has the same probability of being selected.
  def sample_with_replacement(sample = 1)
    Vector.new(@delegate.sample_with_replacement(sample), @type)
  end

  # Returns a random sample of size n, without replacement, only from
  # valid data. Every element can be selected only once; a sample of
  # the same size as the vector is the vector itself.
  def sample_without_replacement(sample = 1)
    Vector.new(@delegate.sample_without_replacement(sample), @type)
  end

  # With a block: number of elements for which the block is truthy.
  # Without a block: frequency of the value +x+.
  def count(x = false)
    if block_given?
      # Simplified from the original, which shadowed and then discarded
      # an intermediate variable; behavior is identical.
      @data.inject(0) { |s, elem| s + (yield(elem) ? 1 : 0) }
    else
      frequencies[x].nil? ? 0 : frequencies[x]
    end
  end

  # Returns a suitable SQL column type for the vector's content.
  def db_type(dbs = 'mysql')
    # first, detect any character not number
    if @data.find { |v| v.to_s =~ /\d{2,2}-\d{2,2}-\d{4,4}/ } or @data.find { |v| v.to_s =~ /\d{4,4}-\d{2,2}-\d{2,2}/ }
      return "DATE"
    elsif @data.find { |v| v.to_s =~ /[^0-9e.-]/ }
      return "VARCHAR (255)"
    elsif @data.find { |v| v.to_s =~ /\./ }
      return "DOUBLE"
    else
      return "INTEGER"
    end
  end

  def summary(out = "")
    @delegate.summary(@labels, out)
  end

  def to_s
    sprintf("Vector(type:%s, n:%d)[%s]", @type.to_s, @data.size, @data.collect { |d| d.nil? ? "nil" : d }.join(","))
  end

  def inspect
    self.to_s
  end
end
402
+
403
+
404
+
405
class Nominal
  def initialize(data)
    @data = data
  end

  def delegate_data
    @data
  end

  # Return an array of the different values of the data.
  def factors
    @data.uniq.sort
  end

  # Returns a hash with the distribution of frequencies of the sample.
  # (Pure-Ruby fallback; aliased to +frequencies+ below when no
  # optimized implementation is loaded.)
  def frequencies_slow
    @data.inject(Hash.new) do |a, x|
      a[x] ||= 0
      a[x] = a[x] + 1
      a
    end
  end

  # Plot frequencies on a chart, using gnuplot.
  def plot_frequencies
    require 'gnuplot'
    x = []
    y = []
    self.frequencies.sort.each { |k, v|
      x.push(k)
      y.push(v)
    }
    Gnuplot.open do |gp|
      Gnuplot::Plot.new(gp) do |plot|
        plot.boxwidth("0.9 absolute")
        plot.yrange("[0:#{y.max}]")
        plot.style("fill solid 1.00 border -1")
        plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
        plot.style("histogram")
        plot.style("data histogram")
        i = -1
        plot.set("xtics", "(" + x.collect { |v| i += 1; sprintf("\"%s\" %d", v, i) }.join(",") + ")")
        plot.data << Gnuplot::DataSet.new([y]) do |ds|
        end
      end
    end
  end

  # Returns the most frequent item.
  def mode
    frequencies.max { |a, b| a[1] <=> b[1] }[0]
  end

  # The number of items with valid data.
  def n_valid
    @data.size
  end

  # Returns a hash with the distribution of proportions of the sample.
  def proportions
    frequencies.inject({}) do |a, v|
      a[v[0]] = v[1].quo(@data.size)
      a
    end
  end

  # Proportion of a given value.
  def proportion(v = 1)
    frequencies[v].quo(@data.size)
  end

  def summary(labels, out = "")
    out << sprintf("n valid:%d\n", n_valid)
    out << sprintf("factors:%s\n", factors.join(","))
    out << "mode:" + mode.to_s + "\n"
    out << "Distribution:\n"
    frequencies.sort.each do |k, v|
      key = labels.has_key?(k) ? labels[k] : k
      out << sprintf("%s : %s (%0.2f%%)\n", key, v, (v.quo(n_valid)) * 100)
    end
    out
  end

  # Returns a random sample of size n, with replacement, only from
  # valid data. Every item has the same probability of being selected.
  def sample_with_replacement(sample)
    (0...sample).collect { @data[rand(@data.size)] }
  end

  # Returns a random sample of size n, without replacement, only from
  # valid data. Every element can be selected only once.
  def sample_without_replacement(sample)
    raise ArgumentError, "Sample size couldn't be greater than n" if sample > @data.size
    out = []
    size = @data.size
    while out.size < sample
      value = rand(size)
      out.push(value) unless out.include?(value)
    end
    out.collect { |i| @data[i] }
  end

  # Variance of p, according to population size.
  def variance_proportion(n_poblation, v = 1)
    Statsample::proportion_variance_sample(self.proportion(v), @data.size, n_poblation)
  end

  def variance_total(n_poblation, v = 1)
    Statsample::total_variance_sample(self.proportion(v), @data.size, n_poblation)
  end

  def proportion_confidence_interval_t(n_poblation, margin = 0.95, v = 1)
    Statsample::proportion_confidence_interval_t(proportion(v), @data.size, n_poblation, margin)
  end

  def proportion_confidence_interval_z(n_poblation, margin = 0.95, v = 1)
    Statsample::proportion_confidence_interval_z(proportion(v), @data.size, n_poblation, margin)
  end

  # Alias every foo_slow method to foo when no optimized foo is defined.
  # BUG FIX: the original used `met =~ /_slow$/` and `met.gsub`, which
  # only worked on Ruby 1.8 where instance_methods returned Strings.
  # Since Ruby 1.9 instance_methods returns Symbols, so the aliases
  # (notably +frequencies+) were never created. Convert explicitly so
  # the fallbacks exist on any Ruby version.
  instance_methods.grep(/_slow$/).each do |met|
    met_or = met.to_s.sub(/_slow\z/, "")
    alias_method met_or, met unless method_defined?(met_or)
  end
end
531
+
532
class Ordinal < Nominal
  # Return the value of the percentile +q+.
  def percentil(q)
    sorted = @data.sort
    pos = (n_valid * q).quo(100)
    if pos.to_i == pos
      # integral position: average the two straddling values
      (sorted[(pos - 0.5).to_i].to_f + sorted[(pos + 0.5).to_i]).quo(2)
    else
      sorted[pos.to_i]
    end
  end

  # Returns a ranked vector (ties receive their mean rank).
  def ranked(type = :ordinal)
    running = 0
    rank_by_value = frequencies.sort.inject({}) do |acc, (value, freq)|
      acc[value] = (running + 1 + running + freq).quo(2)
      running += freq
      acc
    end
    @data.collect { |value| rank_by_value[value] }.to_vector(type)
  end

  # Return the median (percentile 50).
  def median
    percentil(50)
  end

  if HAS_GSL
    %w{median}.each do |m|
      slow_name = (m + "_slow").intern
      alias_method slow_name, m.intern
    end

    # def percentil(p)
    #   v = GSL::Vector.alloc(@data.sort)
    #   v.stats_quantile_from_sorted_data(p)
    # end
    def median # :nodoc:
      GSL::Stats::median_from_sorted_data(GSL::Vector.alloc(@data.sort))
    end
  end

  # Minimum value.
  def min; @data.min; end
  # Maximum value.
  def max; @data.max; end

  def summary(labels, out = "")
    out << sprintf("n valid:%d\n", n_valid)
    out << "median:" + median.to_s + "\n"
    out << "percentil 25:" + percentil(25).to_s + "\n"
    out << "percentil 75:" + percentil(75).to_s + "\n"
    out
  end
end
587
+ class Scale <Ordinal
588
+ attr_reader :gsl
589
+ def initialize(data)
590
+ # puts "Inicializando Scale..."
591
+ super(data)
592
+
593
+ set_gsl
594
+ end
595
+
596
+ def _dump(i)
597
+ Marshal.dump(@data)
598
+ end
599
+ def _load(data)
600
+ @data=Marshal.restore(data)
601
+ set_gsl
602
+ end
603
+ def set_gsl # :nodoc
604
+ data = @data.collect!{|x|
605
+ if x.is_a? Numeric
606
+ x
607
+ elsif x.is_a? String and x.to_i==x.to_f
608
+ x.to_i
609
+ else
610
+ x.to_f
611
+ end
612
+ }
613
+ if HAS_GSL
614
+ @gsl=GSL::Vector.alloc(@data) if @data.size>0
615
+ end
616
+ end
617
+ # The range of the data (max - min)
618
+ def range; @data.max - @data.min; end
619
+ # The sum of values for the data
620
+ def sum
621
+ @data.inject(0){|a,x|x+a} ; end
622
+ # The arithmetical mean of data
623
+ def mean
624
+ sum.to_f.quo(n_valid)
625
+ end
626
+ def sum_of_squares(m=nil)
627
+ m||=mean
628
+ @data.inject(0){|a,x| a+(x-m).square}
629
+ end
630
+
631
+ # Sum of squared deviation
632
+ def sum_of_squared_deviation
633
+ @data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
634
+ end
635
+
636
+ # Population variance (divided by n)
637
+ def variance_population(m=nil)
638
+ m||=mean
639
+ squares=@data.inject(0){|a,x| x.square+a}
640
+ squares.quo(n_valid) - m.square
641
+ end
642
+
643
+
644
+ # Population Standard deviation (divided by n)
645
+ def standard_deviation_population(m=nil)
646
+ Math::sqrt( variance_population(m) )
647
+ end
648
+ # Sample Variance (divided by n-1)
649
+
650
+ def variance_sample(m=nil)
651
+ m||=mean
652
+ sum_of_squares(m).quo(n_valid - 1)
653
+ end
654
+
655
+ # Sample Standard deviation (divided by n-1)
656
+
657
+ def standard_deviation_sample(m=nil)
658
+ m||=m
659
+ Math::sqrt(variance_sample(m))
660
+ end
661
+ def skew
662
+ m=mean
663
+ thirds=@data.inject(0){|a,x| a+((x-mean)**3)}
664
+ thirds.quo((@data.size-1)*sd**3)
665
+ end
666
+ def kurtosis
667
+ m=mean
668
+ thirds=@data.inject(0){|a,x| a+((x-mean)**4)}
669
+ thirds.quo((@data.size-1)*sd**4)
670
+
671
+ end
672
+
673
+ if HAS_GSL
674
+ %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
675
+ m_nuevo=(m+"_slow").intern
676
+ alias_method m_nuevo, m.intern
677
+ }
678
+ def sum # :nodoc:
679
+ @gsl.sum
680
+ end
681
+ def mean # :nodoc:
682
+ @gsl.mean
683
+ end
684
+ def variance_sample(m=nil) # :nodoc:
685
+ m||=mean
686
+ @gsl.variance_m
687
+ end
688
+ def standard_deviation_sample(m=nil) # :nodoc:
689
+ m||=mean
690
+ @gsl.sd(m)
691
+ end
692
+
693
+ def variance_population(m=nil) # :nodoc:
694
+ m||=mean
695
+ @gsl.variance_with_fixed_mean(m)
696
+ end
697
+ def standard_deviation_population(m=nil) # :nodoc:
698
+ m||=mean
699
+ @gsl.sd_with_fixed_mean(m)
700
+ end
701
+ def skew
702
+ @gsl.skew
703
+ end
704
+ def kurtosis
705
+ @gsl.kurtosis
706
+ end
707
+ # Create a GSL::Histogram
708
+ # With a fixnum, creates X bins within the range of data
709
+ # With an Array, each value will be a cut point
710
+ def histogram(bins=10)
711
+ if bins.is_a? Array
712
+ h=GSL::Histogram.alloc(bins)
713
+ else
714
+ # ugly patch. The upper limit for a bin has the form
715
+ # x < range
716
+ h=GSL::Histogram.alloc(bins,[@data.min,@data.max+0.0001])
717
+ end
718
+ h.increment(@gsl)
719
+ h
720
+ end
721
+ def plot_histogram(bins=10,options="")
722
+ self.histogram(bins).graph(options)
723
+ end
724
+ def sample_with_replacement(k)
725
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
726
+ r.sample(@gsl, k).to_a
727
+ end
728
+ def sample_without_replacement(k)
729
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
730
+ r.choose(@gsl, k).to_a
731
+ end
732
+ end
733
+
734
+ # Coefficient of variation
735
+ # Calculed with the sample standard deviation
736
+ def coefficient_of_variation
737
+ standard_deviation_sample.quo(mean)
738
+ end
739
+ def summary(labels,out="")
740
+ out << sprintf("n valid:%d\n",n_valid)
741
+ out << "mean:"+mean.to_s+"\n"
742
+ out << "sum:"+sum.to_s+"\n"
743
+ out << "range:"+range.to_s+"\n"
744
+ out << "variance (pop):"+variance_population.to_s+"\n"
745
+ out << "sd (pop):"+sdp.to_s+"\n"
746
+ out << "variance (sample):"+variance_sample.to_s+"\n"
747
+ out << "sd (sample):"+sds.to_s+"\n"
748
+
749
+ out
750
+ end
751
+
752
+ alias_method :sdp, :standard_deviation_population
753
+ alias_method :sds, :standard_deviation_sample
754
+ alias_method :cov, :coefficient_of_variation
755
+ alias_method :variance, :variance_sample
756
+ alias_method :sd, :standard_deviation_sample
757
+ alias_method :ss, :sum_of_squares
758
+ end
759
+ end