statsample 0.3.0

Files changed (59)
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,235 @@
+ module Statsample
+   module Reliability
+     class << self
+       # Calculates Cronbach's alpha for a given dataset.
+       # Only uses tuples without missing data.
+       def cronbach_alpha(ods)
+         ds=ods.dup_only_valid
+         n_items=ds.fields.size
+         sum_var_items=ds.vectors.inject(0) {|ac,v|
+           ac+v[1].variance_sample
+         }
+         total=ds.vector_sum
+         (n_items / (n_items-1).to_f) * (1-(sum_var_items / total.variance_sample))
+       end
+       # Calculates Cronbach's alpha for a given dataset,
+       # using standardized values for every vector.
+       # Only uses tuples without missing data.
+       def cronbach_alpha_standarized(ods)
+         ds=ods.fields.inject({}){|a,f|
+           a[f]=ods[f].vector_standarized
+           a
+         }.to_dataset
+         cronbach_alpha(ds)
+       end
+     end
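
For reference, `cronbach_alpha` above computes the standard formula, where $k$ is the number of items, $\sigma^2_{Y_i}$ the sample variance of item $i$, and $\sigma^2_X$ the sample variance of the summed scale:

$$\alpha = \frac{k}{k-1}\left(1 - \frac{\sum_{i=1}^{k} \sigma^2_{Y_i}}{\sigma^2_X}\right)$$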
+
+     class ItemCharacteristicCurve
+       attr_reader :totals, :counts, :vector_total
+       def initialize(ds, vector_total=nil)
+         vector_total||=ds.vector_sum
+         raise "Total size != Dataset size" if vector_total.size!=ds.cases
+         @vector_total=vector_total
+         @ds=ds
+         @totals={}
+         @counts=@ds.fields.inject({}) {|a,v| a[v]={};a}
+         process
+       end
+       def process
+         i=0
+         @ds.each{|row|
+           tot=@vector_total[i]
+           @totals[tot]||=0
+           @totals[tot]+=1
+           @ds.fields.each {|f|
+             item=row[f].to_s
+             @counts[f][tot]||={}
+             @counts[f][tot][item]||=0
+             @counts[f][tot][item] += 1
+           }
+           i+=1
+         }
+       end
+       def curve_field(field, item)
+         out={}
+         item=item.to_s
+         @totals.each{|value,n|
+           count_value= @counts[field][value][item].nil? ? 0 : @counts[field][value][item]
+           out[value]=count_value.to_f/n.to_f
+         }
+         out
+       end
+     end
+     class ItemAnalysis
+       attr_reader :mean, :sd, :valid_n, :alpha, :alpha_standarized
+       def initialize(ds)
+         @ds=ds.dup_only_valid
+         @total=@ds.vector_sum
+         @mean=@total.mean
+         @median=@total.median
+         @skew=@total.skew
+         @kurtosis=@total.kurtosis
+         @sd=@total.sdp
+         @valid_n=@total.size
+         begin
+           @alpha=Statsample::Reliability.cronbach_alpha(ds)
+           @alpha_standarized=Statsample::Reliability.cronbach_alpha_standarized(ds)
+         rescue => e
+           raise DatasetException.new(@ds,e), "Problem calculating alpha"
+         end
+       end
+       # Returns a hash: for each field, maps each total score to the
+       # mean item value among the cases with that total.
+       def item_characteristic_curve
+         i=0
+         out={}
+         total={}
+         @ds.each{|row|
+           tot=@total[i]
+           @ds.fields.each {|f|
+             out[f]||={}
+             total[f]||={}
+             out[f][tot]||=0
+             total[f][tot]||=0
+             out[f][tot]+= row[f]
+             total[f][tot]+=1
+           }
+           i+=1
+         }
+         total.each{|f,var|
+           var.each{|tot,v|
+             out[f][tot]=out[f][tot].to_f / total[f][tot]
+           }
+         }
+         out
+       end
+       def gnuplot_item_characteristic_curve(directory, base="crd", options={})
+         require 'gnuplot'
+         crd=item_characteristic_curve
+         @ds.fields.each {|f|
+           x=[]
+           y=[]
+           Gnuplot.open do |gp|
+             Gnuplot::Plot.new( gp ) do |plot|
+               crd[f].sort.each{|tot,prop|
+                 x.push(tot)
+                 y.push((prop*100).to_i.to_f/100)
+               }
+               plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
+                 ds.with = "linespoints"
+                 ds.notitle
+               end
+             end
+           end
+         }
+       end
+       def svggraph_item_characteristic_curve(directory, base="icc", options={})
+         require 'statsample/graph/svggraph'
+         crd=ItemCharacteristicCurve.new(@ds)
+         @ds.fields.each {|f|
+           factors=@ds[f].factors.sort
+           options={
+             :height=>500,
+             :width=>800,
+             :key=>true
+           }.update(options)
+           graph = ::SVG::Graph::Plot.new(options)
+           factors.each{|factor|
+             factor=factor.to_s
+             dataset=[]
+             crd.curve_field(f, factor).each{|tot,prop|
+               dataset.push(tot)
+               dataset.push((prop*100).to_i.to_f/100)
+             }
+             graph.add_data({
+               :title=>"#{factor}",
+               :data=>dataset
+             })
+           }
+           File.open(directory+"/"+base+"_#{f}.svg","w") {|fp|
+             fp.puts(graph.burn())
+           }
+         }
+       end
+       def item_total_correlation
+         @ds.fields.inject({}) do |a,v|
+           vector=@ds[v].dup
+           ds2=@ds.dup
+           ds2.delete_vector(v)
+           total=ds2.vector_sum
+           a[v]=Statsample::Bivariate.pearson(vector,total)
+           a
+         end
+       end
+       def item_statistics
+         @ds.fields.inject({}) do |a,v|
+           a[v]={:mean=>@ds[v].mean, :sds=>@ds[v].sds}
+           a
+         end
+       end
+
+       def stats_if_deleted
+         @ds.fields.inject({}){|a,v|
+           ds2=@ds.dup
+           ds2.delete_vector(v)
+           total=ds2.vector_sum
+           a[v]={}
+           a[v][:mean]=total.mean
+           a[v][:sds]=total.sds
+           a[v][:variance_sample]=total.variance_sample
+           a[v][:alpha]=Statsample::Reliability.cronbach_alpha(ds2)
+           a
+         }
+       end
+       def html_summary
+         html = <<EOF
+ <p><strong>Summary for scale:</strong></p>
+ <ul>
+ <li>Mean=#{@mean}</li>
+ <li>Std.Dv.=#{@sd}</li>
+ <li>Median=#{@median}</li>
+ <li>Skewness=#{sprintf("%0.3f",@skew)}</li>
+ <li>Kurtosis=#{sprintf("%0.3f",@kurtosis)}</li>
+ <li>Valid n:#{@valid_n}</li>
+ <li>Cronbach alpha: #{@alpha}</li>
+ </ul>
+ <table><thead><th>Variable</th>
+ <th>Mean</th>
+ <th>StDv.</th>
+ <th>Mean if deleted</th><th>Var. if deleted</th><th>StDv. if deleted</th><th>Itm-Totl Correl.</th><th>Alpha if deleted</th></thead>
+ EOF
+         itc=item_total_correlation
+         sid=stats_if_deleted
+         is=item_statistics
+         @ds.fields.each {|f|
+           html << <<EOF
+ <tr>
+ <td>#{f}</td>
+ <td>#{sprintf("%0.5f",is[f][:mean])}</td>
+ <td>#{sprintf("%0.5f",is[f][:sds])}</td>
+ <td>#{sprintf("%0.5f",sid[f][:mean])}</td>
+ <td>#{sprintf("%0.5f",sid[f][:variance_sample])}</td>
+ <td>#{sprintf("%0.5f",sid[f][:sds])}</td>
+ <td>#{sprintf("%0.5f",itc[f])}</td>
+ <td>#{sprintf("%0.5f",sid[f][:alpha])}</td>
+ </tr>
+ EOF
+         }
+         html << "</table><hr />"
+         html
+       end
+     end
+
+   end
+ end
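
A minimal usage sketch for this file (the item scores are made up for illustration; `Array#to_vector` and `Hash#to_dataset` come from this gem):

  require 'statsample'
  # Three hypothetical items of a scale, one vector per item.
  ds={
    'i1'=>[3,4,2,5,4].to_vector(:scale),
    'i2'=>[4,4,3,5,3].to_vector(:scale),
    'i3'=>[2,5,2,4,4].to_vector(:scale)
  }.to_dataset
  puts Statsample::Reliability.cronbach_alpha(ds)
  ia=Statsample::Reliability::ItemAnalysis.new(ds)
  puts ia.html_summary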
@@ -0,0 +1,20 @@
+ module Statsample
+   module Resample
+     class << self
+       def repeat_and_save(times,&action)
+         (1..times).inject([]) {|a,x|
+           a.push(action.call)
+           a
+         }
+       end
+
+       def generate(size,low,upper)
+         range=upper-low+1
+         Vector.new((0...size).collect {|x|
+           rand(range)+low
+         },:scale)
+       end
+
+     end
+   end
+ end
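
As a quick sketch, `generate` draws uniform integers and `repeat_and_save` collects the results of repeated trials:

  # 100 simulated rolls of a six-sided die.
  rolls=Statsample::Resample.generate(100,1,6)
  puts rolls.frequencies.inspect
  # Sampling distribution of the mean over 50 repetitions.
  means=Statsample::Resample.repeat_and_save(50) {
    Statsample::Resample.generate(100,1,6).mean
  }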
@@ -0,0 +1,159 @@
+ module Statsample
+   # Several methods to estimate parameters for simple random sampling.
+   module SRS
+     class << self
+
+       ########################
+       #
+       # Proportion estimation
+       #
+       ########################
+
+       # Finite population correction (over variance).
+       # Source: Cochran(1972)
+       def fpc_var(sam,pop)
+         (pop - sam).quo(pop - 1)
+       end
+       # Finite population correction (over standard deviation).
+       def fpc(sam,pop)
+         Math::sqrt((pop-sam).quo(pop-1))
+       end
+
+       # Non-sample fraction:
+       #
+       # 1 - (sample fraction)
+       def qf(sam, pop)
+         1-(sam.quo(pop))
+       end
+       # Sample size estimation for proportions, infinite population.
+       def estimation_n0(d,prop,margin=0.95)
+         t=GSL::Cdf.ugaussian_Pinv(1-(1-margin).quo(2))
+         var=prop*(1-prop)
+         t**2*var.quo(d**2)
+       end
+       # Sample size estimation for proportions, finite population.
+       def estimation_n(d,prop,n_pobl,margin=0.95)
+         n0=estimation_n0(d,prop,margin)
+         n0.quo( 1 + ((n0 - 1).quo(n_pobl)))
+       end
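
These two methods implement the usual Cochran sample-size formulas, where $z$ is the normal quantile for the chosen confidence margin, $p$ the expected proportion, $d$ the desired precision, and $N$ the population size:

$$n_0 = \frac{z^2\,p(1-p)}{d^2}, \qquad n = \frac{n_0}{1+\frac{n_0-1}{N}}$$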
+       # Proportion confidence interval with t values.
+       # Uses estimated proportion, sample without replacement.
+       def proportion_confidence_interval_t(prop, n_sample, n_population, margin=0.95)
+         t=GSL::Cdf.tdist_Pinv(1-((1-margin).quo(2)), n_sample-1)
+         proportion_confidence_interval(prop, n_sample, n_population, t)
+       end
+       # Proportion confidence interval with z values.
+       # Uses estimated proportion, sample without replacement.
+       def proportion_confidence_interval_z(p, n_sample, n_population, margin=0.95)
+         z=GSL::Cdf.ugaussian_Pinv(1-((1-margin).quo(2)))
+         proportion_confidence_interval(p, n_sample, n_population, z)
+       end
+       # Proportion confidence interval with a given x value.
+       # Uses estimated proportion, sample without replacement.
+       def proportion_confidence_interval(p, sam, pop, x)
+         f=sam.quo(pop)
+         one_range=x * Math::sqrt((qf(sam, pop) * p * (1-p)) / (sam-1)) + (1.quo(sam * 2.0))
+         [p-one_range, p+one_range]
+       end
+       # Standard deviation for the sample distribution of a proportion.
+       # Known proportion, sample with replacement.
+       # Based on http://stattrek.com/Lesson6/SRS.aspx
+       def proportion_sd_kp_wr(p, n_sample)
+         Math::sqrt(p*(1-p).quo(n_sample))
+       end
+       # Standard deviation for the sample distribution of a proportion.
+       # Known proportion, sample without replacement.
+       #
+       # Sources:
+       # * http://stattrek.com/Lesson6/SRS.aspx
+       # * Cochran(1972)
+       def proportion_sd_kp_wor(p, sam, pop)
+         fpc(sam,pop)*Math::sqrt(p*(1-p).quo(sam))
+       end
+       # Standard deviation for the sample distribution of a proportion.
+       # Estimated proportion, sample with replacement.
+       # Based on http://stattrek.com/Lesson6/SRS.aspx
+       def proportion_sd_ep_wr(p, n_sample)
+         Math::sqrt(p*(1-p).quo(n_sample-1))
+       end
+       # Standard deviation for the sample distribution of a proportion.
+       # Estimated proportion, sample without replacement.
+       # Source: Cochran, 1972, Técnicas de muestreo
+       def proportion_sd_ep_wor(p, sam, pop)
+         fsc=(pop-sam).quo((sam-1)*pop)
+         Math::sqrt(fsc*p*(1-p))
+       end
+
+       # Standard deviation of the total estimate, based on the sample.
+       # Known proportion, sample without replacement.
+       # Source: Cochran(1972)
+       def proportion_total_sd_kp_wor(prop, sam, pop)
+         pop * proportion_sd_kp_wor(prop, sam, pop)
+       end
+       # Standard deviation of the total estimate, based on the sample.
+       # Estimated proportion, sample without replacement.
+       # Source: Cochran(1972)
+       def proportion_total_sd_ep_wor(prop, sam, pop)
+         fsc=((pop - sam).to_f / ( sam - 1))
+         Math::sqrt(fsc*pop*prop*(1-prop))
+       end
+
+       ########################
+       #
+       # Mean estimation
+       #
+       ########################
+
+       # Standard error. Known variance, sample with replacement.
+       def standard_error_ksd_wr(s, sam, pop)
+         s.quo(Math::sqrt(sam)) * Math::sqrt((pop-1).quo(pop))
+       end
+
+       # Standard error of the mean. Known variance, sample without replacement.
+       def standard_error_ksd_wor(s,sam,pop)
+         s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam,pop))
+       end
+
+       alias_method :standard_error_esd_wr, :standard_error_ksd_wr
+
+       # Standard error of the mean.
+       # Estimated variance, sample without replacement.
+       # Cochran (1972) p.47
+       def standard_error_esd_wor(s,sam,pop)
+         s.quo(Math::sqrt(sam)) * Math::sqrt(qf(sam,pop))
+       end
+
+       alias_method :standard_error, :standard_error_esd_wor
+       alias_method :se, :standard_error_esd_wor
+
+       # Standard error of the total estimate.
+       def standard_error_total(s,sam,pop)
+         pop*se(s,sam,pop)
+       end
+
+       # Confidence interval using Student's t.
+       # Use with n < 60.
+       def mean_confidence_interval_t(mean,s,n_sample,n_population,margin=0.95)
+         t=GSL::Cdf.tdist_Pinv(1-((1-margin) / 2),n_sample-1)
+         mean_confidence_interval(mean,s,n_sample,n_population,t)
+       end
+       # Confidence interval using z.
+       # Use with n > 60.
+       def mean_confidence_interval_z(mean,s,n_sample,n_population,margin=0.95)
+         z=GSL::Cdf.ugaussian_Pinv(1-((1-margin) / 2))
+         mean_confidence_interval(mean,s,n_sample,n_population, z)
+       end
+       # Confidence interval using a given x value.
+       #
+       # Prefer mean_confidence_interval_z or mean_confidence_interval_t.
+       def mean_confidence_interval(mean,s,n_sample,n_population,x)
+         range=x*se(s,n_sample,n_population)
+         [mean-range,mean+range]
+       end
+     end
+   end
+
+ end
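
A brief sketch of the SRS helpers in use (requires the GSL bindings for the quantile functions; the survey numbers are hypothetical):

  # 120 of 300 sampled cases (population 2000) show the trait.
  p=120.quo(300)
  puts Statsample::SRS.proportion_confidence_interval_z(p, 300, 2000).inspect
  # Sample size for +/-5% precision at 95% confidence, assuming p=0.5.
  puts Statsample::SRS.estimation_n(0.05, 0.5, 2000)   # => ~322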
@@ -0,0 +1,25 @@
+ module Statsample
+   # Module for several statistical tests.
+   module Test
+     class << self
+       # Calculates the chi-square statistic for two matrices of
+       # observed and expected frequencies.
+       def chi_square(real,expected)
+         raise TypeError, "Both arguments should be Matrix" unless real.is_a? Matrix and expected.is_a? Matrix
+         sum=0
+         (0...real.row_size).each {|row_i|
+           (0...real.column_size).each {|col_i|
+             val=((real[row_i,col_i].to_f - expected[row_i,col_i].to_f)**2) / expected[row_i,col_i].to_f
+             # puts "Observed: #{real[row_i,col_i].to_f} ; expected: #{expected[row_i,col_i].to_f}"
+             # puts "Squared difference: #{(real[row_i,col_i].to_f - expected[row_i,col_i].to_f)**2}"
+             sum+=val
+           }
+         }
+         sum
+       end
+       def t_significance
+       end
+     end
+   end
+ end
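
For example, with a hypothetical 2x2 table each cell contributes (O-E)²/E:

  require 'matrix'
  observed=Matrix[[20,30],[30,20]]
  expected=Matrix[[25,25],[25,25]]
  # Each cell contributes (5**2)/25 = 1.0
  puts Statsample::Test.chi_square(observed,expected)  # => 4.0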
@@ -0,0 +1,759 @@
+ class Array
+   def to_vector(*args)
+     Statsample::Vector.new(self,*args)
+   end
+ end
+
+ module Statsample
+   class << self
+     # Creates a matrix using vectors as columns.
+     # Use:
+     #
+     #   matrix=Statsample.vector_cols_matrix(v1,v2)
+     def vector_cols_matrix(*vs)
+       size=vs[0].size
+       vs.each{|v|
+         raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
+         raise ArgumentError,"Vectors size should be the same" if v.size!=size
+       }
+       Matrix.rows((0...size).to_a.collect {|i|
+         vs.collect{|v| v[i]}
+       })
+     end
+   end
+   # Returns a duplicate of the input vectors, without missing data
+   # for any of the vectors.
+   #
+   #   a=[1,2,3,6,7,nil,3,5].to_vector(:scale)
+   #   b=[nil,nil,5,6,4,5,10,2].to_vector(:scale)
+   #   c=[2,4,6,7,4,5,6,7].to_vector(:scale)
+   #   a2,b2,c2=Statsample.only_valid(a,b,c)
+   #   => [#<Statsample::Scale:0xb748c8c8 @data=[3, 6, 7, 3, 5]>,
+   #       #<Statsample::Scale:0xb748c814 @data=[5, 6, 4, 10, 2]>,
+   #       #<Statsample::Scale:0xb748c760 @data=[6, 7, 4, 6, 7]>]
+   #
+   def self.only_valid(*vs)
+     i=1
+     h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a}
+     ds=Statsample::Dataset.new(h).dup_only_valid
+     ds.vectors.values
+   end
+   class Vector < DelegateClass(Array)
+     include Enumerable
+     attr_reader :type, :data, :valid_data, :missing_values, :missing_data, :data_with_nils
+     attr_accessor :labels
+     # Creates a new Vector.
+     # data = Array of data.
+     # t = level of measurement. Could be:
+     # [:nominal] : Nominal level of measurement
+     # [:ordinal] : Ordinal level of measurement
+     # [:scale]   : Scale level of measurement
+     def initialize(data=[],t=:nominal,missing_values=[],labels={})
+       raise "Data should be an array" unless data.is_a? Array
+       @data=data
+       @missing_values=missing_values
+       @labels=labels
+       @type=t
+       @valid_data=[]
+       @data_with_nils=[]
+       @missing_data=[]
+       @has_missing_data=nil
+       _set_valid_data
+       self.type=t
+       super(@delegate)
+     end
+     def dup
+       Vector.new(@data.dup,@type,@missing_values.dup,@labels.dup)
+     end
+     # Returns an empty duplicate of the vector. Maintains the type,
+     # missing values and labels.
+     def dup_empty
+       Vector.new([],@type,@missing_values.dup,@labels.dup)
+     end
+     # Returns a vector using the standardized values of the data,
+     # with sd computed with denominator N.
+     def vector_standarized_pop
+       vector_standarized(true)
+     end
+
+     # Returns a vector using the standardized values of the data,
+     # with sd computed with denominator n-1.
+     def vector_standarized(use_population=false)
+       raise "Should be a scale" unless @type==:scale
+       mean=@delegate.mean
+       sd=use_population ? @delegate.sdp : @delegate.sds
+       @data_with_nils.collect{|x|
+         if !x.nil?
+           (x.to_f - mean).quo(sd)
+         else
+           nil
+         end
+       }.to_vector(:scale)
+     end
+     alias_method :standarized, :vector_standarized
+     def box_cox_transformation(lambda)
+       raise "Should be a scale" unless @type==:scale
+       @data_with_nils.collect{|x|
+         if !x.nil?
+           if(lambda==0)
+             Math.log(x)
+           else
+             (x**lambda-1).quo(lambda)
+           end
+         else
+           nil
+         end
+       }.to_vector(:scale)
+     end
+
+     # Vector equality.
+     # Two vectors are equal if their data, missing values, type and
+     # labels are equal.
+     def ==(v2)
+       raise TypeError,"Argument should be a Vector" unless v2.instance_of? Statsample::Vector
+       @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
+     end
+     def _dump(i)
+       Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type})
+     end
+     def self._load(data)
+       h=Marshal.load(data)
+       Vector.new(h['data'], h['type'], h['missing_values'], h['labels'])
+     end
+     def recode
+       @data.collect{|x|
+         yield x
+       }.to_vector(@type)
+     end
+     def recode!
+       @data.collect!{|x|
+         yield x
+       }
+       set_valid_data
+     end
+     def each
+       @data.each{|x|
+         yield(x)
+       }
+     end
+     # Adds a value at the end of the vector.
+     # If the second argument is set to false, you should update the
+     # valid data using Vector#set_valid_data at the end of your
+     # insertion cycle.
+     def add(v,update_valid=true)
+       @data.push(v)
+       set_valid_data if update_valid
+     end
+     def set_valid_data
+       @valid_data.clear
+       @missing_data.clear
+       @data_with_nils.clear
+       _set_valid_data
+       @delegate.set_gsl if(@type==:scale)
+     end
+     def _set_valid_data
+       if Statsample::OPTIMIZED
+         Statsample::_set_valid_data(self)
+       else
+         @data.each do |n|
+           if is_valid? n
+             @valid_data.push(n)
+             @data_with_nils.push(n)
+           else
+             @data_with_nils.push(nil)
+             @missing_data.push(n)
+           end
+         end
+         @has_missing_data=@missing_data.size>0
+       end
+     end
+     # Returns true if the data has one or more missing values.
+     def has_missing_data?
+       @has_missing_data
+     end
+     def labeling(x)
+       @labels.has_key?(x) ? @labels[x].to_s : x.to_s
+     end
+     # Returns a Vector with each value replaced by its label, if any.
+     def vector_labeled
+       d=@data.collect{|x|
+         if @labels.has_key? x
+           @labels[x]
+         else
+           x
+         end
+       }
+       Vector.new(d,@type)
+     end
+     def size
+       @data.size
+     end
+     def [](i)
+       @data[i]
+     end
+     def []=(i,v)
+       @data[i]=v
+     end
+     # Returns true if a value is valid (not nil and not included in
+     # the missing values).
+     def is_valid?(x)
+       !(x.nil? or @missing_values.include? x)
+     end
+     # Sets the missing values.
+     def missing_values=(vals)
+       @missing_values = vals
+       set_valid_data
+     end
+     # Sets the level of measurement.
+     def type=(t)
+       case t
+       when :nominal
+         @delegate=Nominal.new(@valid_data)
+       when :ordinal
+         @delegate=Ordinal.new(@valid_data)
+       when :scale
+         @delegate=Scale.new(@valid_data)
+       else
+         raise "Type doesn't exist"
+       end
+       __setobj__(@delegate)
+       @type=t
+     end
+     def n; @data.size ; end
+     def to_a
+       @data.dup
+     end
+     # Redundant, but necessary:
+     # Spreadsheet creates Array#sum, so calling sum
+     # wouldn't reach the delegate's method.
+     def sum
+       @delegate.sum
+     end
+     alias_method :to_ary, :to_a
+     # Vector addition.
+     # - If v is a scalar, adds this value to all elements.
+     # - If v is an Array or a Vector, it should be of the same size;
+     #   every item of this vector is added to the value of the
+     #   item at the same position on the other vector.
+     def +(v)
+       _vector_ari("+",v)
+     end
+     # Vector subtraction.
+     # - If v is a scalar, subtracts this value from all elements.
+     # - If v is an Array or a Vector, it should be of the same size;
+     #   the item at each position on the other vector is subtracted
+     #   from the item at the same position on this vector.
+     def -(v)
+       _vector_ari("-",v)
+     end
+     # Reports all values that don't comply with a condition.
+     # Returns a hash with the index of the data and the invalid data.
+     def verify
+       h={}
+       (0...@data.size).to_a.each{|i|
+         if !(yield @data[i])
+           h[i]=@data[i]
+         end
+       }
+       h
+     end
+     def _vector_ari(method,v) # :nodoc:
+       if(v.is_a? Vector or v.is_a? Array)
+         if v.size==@data.size
+           sum=[]
+           0.upto(v.size-1) {|i|
+             if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
+               sum.push(@data[i].send(method,v[i]))
+             else
+               sum.push(nil)
+             end
+           }
+           Statsample::Vector.new(sum)
+         else
+           raise ArgumentError, "The array/vector parameter should be of the same size as the original vector"
+         end
+       elsif(v.respond_to? method)
+         Statsample::Vector.new(
+           @data.collect {|x|
+             if(!x.nil?)
+               x.send(method,v)
+             else
+               nil
+             end
+           }
+         )
+       else
+         raise TypeError,"You should pass a scalar or an array/vector"
+       end
+     end
+     # Returns an array with the data split by a separator.
+     #   a=Vector.new(["a,b","c,d","a,b","d"])
+     #   a.splitted
+     #   => [["a","b"],["c","d"],["a","b"],["d"]]
+     def splitted(sep=Statsample::SPLIT_TOKEN)
+       @data.collect{|x|
+         if x.nil?
+           nil
+         elsif (x.respond_to? :split)
+           x.split(sep)
+         else
+           [x]
+         end
+       }
+     end
+     # Returns a hash of Vectors, defined by the different values
+     # present in the data.
+     # Example:
+     #
+     #   a=Vector.new(["a,b","c,d","a,b"])
+     #   a.split_by_separator
+     #   => {"a"=>#<Statsample::Type::Nominal:0x7f2dbcc09d88 @data=[1, 0, 1]>,
+     #       "b"=>#<Statsample::Type::Nominal:0x7f2dbcc09c48 @data=[1, 1, 0]>,
+     #       "c"=>#<Statsample::Type::Nominal:0x7f2dbcc09b08 @data=[0, 1, 1]>}
+     #
+     def split_by_separator(sep=Statsample::SPLIT_TOKEN)
+       split_data=splitted(sep)
+       factors=split_data.flatten.uniq.compact
+       out=factors.inject({}) {|a,x|
+         a[x]=[]
+         a
+       }
+       split_data.each{|r|
+         if r.nil?
+           factors.each{|f|
+             out[f].push(nil)
+           }
+         else
+           factors.each{|f|
+             out[f].push(r.include?(f) ? 1 : 0)
+           }
+         end
+       }
+       out.inject({}){|s,v|
+         s[v[0]]=Vector.new(v[1],:nominal)
+         s
+       }
+     end
+     def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
+       split_by_separator(sep).inject({}) {|a,v|
+         a[v[0]]=v[1].inject {|s,x| s+x.to_i}
+         a
+       }
+     end
+
+     # Returns a random sample of size n, with replacement,
+     # only with valid data.
+     #
+     # In all the trials, every item has the same probability
+     # of being selected.
+     def sample_with_replacement(sample=1)
+       Vector.new(@delegate.sample_with_replacement(sample),@type)
+     end
+     # Returns a random sample of size n, without replacement,
+     # only with valid data.
+     #
+     # Every element can be selected only once.
+     # A sample of the same size as the vector is the vector itself.
+     def sample_without_replacement(sample=1)
+       Vector.new(@delegate.sample_without_replacement(sample),@type)
+     end
+
+     def count(x=false)
+       if block_given?
+         r=@data.inject(0) {|s, i|
+           r=yield i
+           s+(r ? 1 : 0)
+         }
+         r.nil? ? 0 : r
+       else
+         frequencies[x].nil? ? 0 : frequencies[x]
+       end
+     end
+     # Returns the database type for the vector, according to its content.
+     def db_type(dbs='mysql')
+       # dates first, then non-numeric characters, then floats
+       if @data.find {|v| v.to_s=~/\d{2,2}-\d{2,2}-\d{4,4}/} or @data.find {|v| v.to_s=~/\d{4,4}-\d{2,2}-\d{2,2}/}
+         return "DATE"
+       elsif @data.find {|v| v.to_s=~/[^0-9e.-]/ }
+         return "VARCHAR (255)"
+       elsif @data.find {|v| v.to_s=~/\./}
+         return "DOUBLE"
+       else
+         return "INTEGER"
+       end
+     end
+     def summary(out="")
+       @delegate.summary(@labels,out)
+     end
+     def to_s
+       sprintf("Vector(type:%s, n:%d)[%s]",@type.to_s,@data.size, @data.collect{|d| d.nil? ? "nil" : d}.join(","))
+     end
+     def inspect
+       self.to_s
+     end
+   end
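
A small sketch of Vector in use, showing missing-data handling and standardization:

  v=[1,2,3,4,nil,5].to_vector(:scale)
  puts v.mean                     # computed over valid data only
  puts v.vector_standarized.to_s  # z-scores; the nil is preserved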
+
+   class Nominal
+     def initialize(data)
+       @data=data
+     end
+     def delegate_data
+       @data
+     end
+     # Returns an array of the different values of the data.
+     def factors
+       @data.uniq.sort
+     end
+     # Returns a hash with the distribution of frequencies of
+     # the sample.
+     def frequencies_slow
+       @data.inject(Hash.new) {|a,x|
+         a[x]||=0
+         a[x]=a[x]+1
+         a
+       }
+     end
+     # Plots frequencies on a chart, using gnuplot.
+     def plot_frequencies
+       require 'gnuplot'
+       x=[]
+       y=[]
+       self.frequencies.sort.each{|k,v|
+         x.push(k)
+         y.push(v)
+       }
+       Gnuplot.open do |gp|
+         Gnuplot::Plot.new( gp ) do |plot|
+           plot.boxwidth("0.9 absolute")
+           plot.yrange("[0:#{y.max}]")
+           plot.style("fill solid 1.00 border -1")
+           plot.set("xtics border in scale 1,0.5 nomirror rotate by -45 offset character 0, 0, 0")
+           plot.style("histogram")
+           plot.style("data histogram")
+           i=-1
+           plot.set("xtics","("+x.collect{|v| i+=1; sprintf("\"%s\" %d",v,i)}.join(",")+")")
+           plot.data << Gnuplot::DataSet.new( [y] ) do |ds|
+           end
+         end
+       end
+     end
+
+     # Returns the most frequent item.
+     def mode
+       frequencies.max{|a,b| a[1]<=>b[1]}[0]
+     end
+     # The number of items with valid data.
+     def n_valid
+       @data.size
+     end
+     # Returns a hash with the distribution of proportions of
+     # the sample.
+     def proportions
+       frequencies.inject({}){|a,v|
+         a[v[0]] = v[1].quo(@data.size)
+         a
+       }
+     end
+     # Proportion of a given value.
+     def proportion(v=1)
+       frequencies[v].quo(@data.size)
+     end
+     def summary(labels,out="")
+       out << sprintf("n valid:%d\n",n_valid)
+       out << sprintf("factors:%s\n",factors.join(","))
+       out << "mode:"+mode.to_s+"\n"
+       out << "Distribution:\n"
+       frequencies.sort.each{|k,v|
+         key=labels.has_key?(k) ? labels[k] : k
+         out << sprintf("%s : %s (%0.2f%%)\n",key,v, (v.quo(n_valid))*100)
+       }
+       out
+     end
+
+     # Returns a random sample of size n, with replacement,
+     # only with valid data.
+     #
+     # In all the trials, every item has the same probability
+     # of being selected.
+     def sample_with_replacement(sample)
+       (0...sample).collect{ @data[rand(@data.size)] }
+     end
+     # Returns a random sample of size n, without replacement,
+     # only with valid data.
+     #
+     # Every element can be selected only once.
+     # A sample of the same size as the vector is the vector itself.
+     def sample_without_replacement(sample)
+       raise ArgumentError, "Sample size couldn't be greater than n" if sample>@data.size
+       out=[]
+       size=@data.size
+       while out.size<sample
+         value=rand(size)
+         out.push(value) if !out.include? value
+       end
+       out.collect{|i| @data[i]}
+     end
+
+     # Variance of p, according to population size.
+     def variance_proportion(n_poblation, v=1)
+       Statsample::proportion_variance_sample(self.proportion(v), @data.size, n_poblation)
+     end
+     def variance_total(n_poblation, v=1)
+       Statsample::total_variance_sample(self.proportion(v), @data.size, n_poblation)
+     end
+     def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
+       Statsample::proportion_confidence_interval_t(proportion(v), @data.size, n_poblation, margin)
+     end
+     def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
+       Statsample::proportion_confidence_interval_z(proportion(v), @data.size, n_poblation, margin)
+     end
+     self.instance_methods.find_all{|met| met=~/_slow$/}.each{|met|
+       met_or=met.gsub("_slow","")
+       if !self.method_defined?(met_or)
+         alias_method met_or, met
+       end
+     }
+   end
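
A quick sketch of the nominal-level helpers (hash ordering in `inspect` may vary):

  v=%w{a b a c a b}.to_vector(:nominal)
  puts v.frequencies.inspect  # e.g. {"a"=>3, "b"=>2, "c"=>1}
  puts v.mode                 # => "a"
  puts v.proportion("a")      # => 0.5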
+
+   class Ordinal < Nominal
+     # Returns the value of percentile q.
+     def percentil(q)
+       sorted=@data.sort
+       v= (n_valid * q).quo(100)
+       if(v.to_i!=v)
+         sorted[v.to_i]
+       else
+         (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
+       end
+     end
+     # Returns a ranked vector.
+     def ranked(type=:ordinal)
+       i=0
+       r=frequencies.sort.inject({}){|a,v|
+         a[v[0]]=(i+1 + i+v[1]).quo(2)
+         i+=v[1]
+         a
+       }
+       @data.collect {|c|
+         r[c]
+       }.to_vector(type)
+     end
+     # Returns the median (percentile 50).
+     def median
+       percentil(50)
+     end
+     if HAS_GSL
+       %w{median}.each{|m|
+         m_nuevo=(m+"_slow").intern
+         alias_method m_nuevo, m.intern
+       }
+
+       # def percentil(p)
+       #   v=GSL::Vector.alloc(@data.sort)
+       #   v.stats_quantile_from_sorted_data(p)
+       # end
+       def median # :nodoc:
+         GSL::Stats::median_from_sorted_data(GSL::Vector.alloc(@data.sort))
+       end
+     end
+     # Minimum value.
+     def min; @data.min; end
+     # Maximum value.
+     def max; @data.max; end
+
+     def summary(labels,out="")
+       out << sprintf("n valid:%d\n",n_valid)
+       out << "median:"+median.to_s+"\n"
+       out << "percentil 25:"+percentil(25).to_s+"\n"
+       out << "percentil 75:"+percentil(75).to_s+"\n"
+       out
+     end
+   end
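
A short percentile sketch (values chosen so the results are easy to check):

  v=[10,20,30,40,50].to_vector(:ordinal)
  puts v.median         # => 30
  puts v.percentil(25)  # => 20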
+   class Scale < Ordinal
+     attr_reader :gsl
+     def initialize(data)
+       super(data)
+       set_gsl
+     end
+
+     def _dump(i)
+       Marshal.dump(@data)
+     end
+     def self._load(data)
+       Scale.new(Marshal.restore(data))
+     end
+     def set_gsl # :nodoc:
+       @data.collect!{|x|
+         if x.is_a? Numeric
+           x
+         elsif x.is_a? String and x.to_i==x.to_f
+           x.to_i
+         else
+           x.to_f
+         end
+       }
+       if HAS_GSL
+         @gsl=GSL::Vector.alloc(@data) if @data.size>0
+       end
+     end
+     # The range of the data (max - min).
+     def range; @data.max - @data.min; end
+     # The sum of values of the data.
+     def sum
+       @data.inject(0){|a,x| x+a}
+     end
+     # The arithmetic mean of the data.
+     def mean
+       sum.to_f.quo(n_valid)
+     end
+     def sum_of_squares(m=nil)
+       m||=mean
+       @data.inject(0){|a,x| a+(x-m).square}
+     end
+
+     # Sum of squared deviations.
+     def sum_of_squared_deviation
+       @data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
+     end
+
+     # Population variance (divided by n).
+     def variance_population(m=nil)
+       m||=mean
+       squares=@data.inject(0){|a,x| x.square+a}
+       squares.quo(n_valid) - m.square
+     end
+
+     # Population standard deviation (divided by n).
+     def standard_deviation_population(m=nil)
+       Math::sqrt( variance_population(m) )
+     end
+     # Sample variance (divided by n-1).
+     def variance_sample(m=nil)
+       m||=mean
+       sum_of_squares(m).quo(n_valid - 1)
+     end
+
+     # Sample standard deviation (divided by n-1).
+     def standard_deviation_sample(m=nil)
+       m||=mean
+       Math::sqrt(variance_sample(m))
+     end
+     def skew
+       m=mean
+       thirds=@data.inject(0){|a,x| a+((x-m)**3)}
+       thirds.quo((@data.size-1)*sd**3)
+     end
+     def kurtosis
+       m=mean
+       fourths=@data.inject(0){|a,x| a+((x-m)**4)}
+       fourths.quo((@data.size-1)*sd**4)
+     end
+
+     if HAS_GSL
+       %w{skew kurtosis variance_sample standard_deviation_sample variance_population standard_deviation_population mean sum}.each{|m|
+         m_nuevo=(m+"_slow").intern
+         alias_method m_nuevo, m.intern
+       }
+       def sum # :nodoc:
+         @gsl.sum
+       end
+       def mean # :nodoc:
+         @gsl.mean
+       end
+       def variance_sample(m=nil) # :nodoc:
+         m||=mean
+         @gsl.variance_m
+       end
+       def standard_deviation_sample(m=nil) # :nodoc:
+         m||=mean
+         @gsl.sd(m)
+       end
+       def variance_population(m=nil) # :nodoc:
+         m||=mean
+         @gsl.variance_with_fixed_mean(m)
+       end
+       def standard_deviation_population(m=nil) # :nodoc:
+         m||=mean
+         @gsl.sd_with_fixed_mean(m)
+       end
+       def skew
+         @gsl.skew
+       end
+       def kurtosis
+         @gsl.kurtosis
+       end
+       # Creates a GSL::Histogram.
+       # With a Fixnum, creates that many bins within the range of the data;
+       # with an Array, each value is a cut point.
+       def histogram(bins=10)
+         if bins.is_a? Array
+           h=GSL::Histogram.alloc(bins)
+         else
+           # The upper limit of a bin is exclusive (x < upper),
+           # so extend the range slightly to include the maximum.
+           h=GSL::Histogram.alloc(bins,[@data.min,@data.max+0.0001])
+         end
+         h.increment(@gsl)
+         h
+       end
+       def plot_histogram(bins=10,options="")
+         self.histogram(bins).graph(options)
+       end
+       def sample_with_replacement(k)
+         r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
+         r.sample(@gsl, k).to_a
+       end
+       def sample_without_replacement(k)
+         r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
+         r.choose(@gsl, k).to_a
+       end
+     end
+
+     # Coefficient of variation,
+     # calculated with the sample standard deviation.
+     def coefficient_of_variation
+       standard_deviation_sample.quo(mean)
+     end
+     def summary(labels,out="")
+       out << sprintf("n valid:%d\n",n_valid)
+       out << "mean:"+mean.to_s+"\n"
+       out << "sum:"+sum.to_s+"\n"
+       out << "range:"+range.to_s+"\n"
+       out << "variance (pop):"+variance_population.to_s+"\n"
+       out << "sd (pop):"+sdp.to_s+"\n"
+       out << "variance (sample):"+variance_sample.to_s+"\n"
+       out << "sd (sample):"+sds.to_s+"\n"
+       out
+     end
+
+     alias_method :sdp, :standard_deviation_population
+     alias_method :sds, :standard_deviation_sample
+     alias_method :cov, :coefficient_of_variation
+     alias_method :variance, :variance_sample
+     alias_method :sd, :standard_deviation_sample
+     alias_method :ss, :sum_of_squares
+   end
+ end
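
Finally, a sketch of the scale-level statistics (the same values come back with or without GSL):

  v=[1,2,3,4,5].to_vector(:scale)
  puts v.mean             # => 3.0
  puts v.variance_sample  # => 2.5  (sum of squares 10 over n-1=4)
  puts v.sds              # => ~1.5811
  puts v.range            # => 4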