statsample 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,259 @@
1
+ require 'statsample/dominanceanalysis/bootstrap'
2
+ module Statsample
3
class DominanceAnalysis
  # Builds one regression model for every non-empty subset of the predictor
  # fields of +ds+ and stores, for each model, the gain in r2 obtained by
  # adding each predictor that the model does not already contain.
  #
  # ds      :: dataset; accessed through +fields+, +dup(fields)+ and +[]+.
  # y_var   :: name of the dependent field inside +ds+.
  # r_class :: regression class, instantiated as <tt>r_class.new(ds, y_var)</tt>.
  def initialize(ds, y_var, r_class = Regression::MultipleRegressionPairwise)
    @y_var = y_var
    @dy = ds[@y_var]
    @ds = ds
    @r_class = r_class
    @ds_indep = ds.dup(ds.fields - [y_var])
    @fields = @ds_indep.fields
    create_models
    fill_models
  end

  # For every model and every field outside it, registers on the model the
  # r2 of the model extended with that field (ModelData#add_contribution
  # turns it into a difference against the model's own r2).
  def fill_models
    @models.each do |model|
      base_model = md(model)
      @fields.each do |field|
        next if model.include?(field)
        base_model.add_contribution(field, md(model + [field]).r2)
      end
    end
  end

  # Compares the single-predictor models of +i+ and +j+ by r2.
  # Returns 1 if i has the larger r2, 0 if j has, 0.5 otherwise.
  def dominance_for_nil_model(i, j)
    compare_dominance(md(i).r2, md(j).r2)
  end

  # Returns 1 if i D j, 0 if j D i and 0.5 if undetermined.
  # A tie on the single-predictor models is reported as 0.5 right away;
  # otherwise every model that holds a contribution for both fields is
  # compared and the verdict must be unanimous.
  def total_dominance_pairwise(i, j)
    dm = dominance_for_nil_model(i, j)
    return 0.5 if dm == 0.5
    dominances = [dm]
    @models_data.each do |_fields, model|
      contribution_i = model.contributions[i]
      contribution_j = model.contributions[j]
      next if contribution_i.nil? || contribution_j.nil?
      dominances.push(compare_dominance(contribution_i, contribution_j))
    end
    collapse_dominances(dominances)
  end

  # Returns 1 if i cD j, 0 if j cD i and 0.5 if undetermined.
  # Compares the average contributions of i and j for every model size k
  # in 1...@fields.size; the verdict must be unanimous across sizes.
  def conditional_dominance_pairwise(i, j)
    dm = dominance_for_nil_model(i, j)
    return 0.5 if dm == 0.5
    dominances = [dm]
    (1...@fields.size).each do |k|
      averages = average_k(k)
      # A tie at any size is recorded as 0.5 (this branch used to call an
      # undefined method and raise NoMethodError).
      dominances.push(compare_dominance(averages[i], averages[j]))
    end
    collapse_dominances(dominances)
  end

  # Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined,
  # comparing the overall averages returned by #general_averages.
  def general_dominance_pairwise(i, j)
    ga = general_averages
    compare_dominance(ga[i], ga[j])
  end

  # All models made of exactly two fields.
  def pairs
    @models.find_all { |m| m.size == 2 }
  end

  # Hash of pair => result of #total_dominance_pairwise.
  def total_dominance
    pairs.inject({}) do |result, pair|
      result[pair] = total_dominance_pairwise(pair[0], pair[1])
      result
    end
  end

  # Hash of pair => result of #conditional_dominance_pairwise.
  def conditional_dominance
    pairs.inject({}) do |result, pair|
      result[pair] = conditional_dominance_pairwise(pair[0], pair[1])
      result
    end
  end

  # Hash of pair => result of #general_dominance_pairwise.
  def general_dominance
    pairs.inject({}) do |result, pair|
      result[pair] = general_dominance_pairwise(pair[0], pair[1])
      result
    end
  end

  # Looks up the ModelData for +m+, which may be an array of field names
  # (in any order) or a single field name. Wrapping and flattening avoids
  # calling #sort on a bare String, which is only defined on Ruby 1.8.
  def md(m)
    @models_data[[m].flatten.sort]
  end

  # Get all models of size k
  def md_k(k)
    @models.find_all { |m| m.size == k }.collect { |m| md(m) }
  end

  # Hash of field => mean contribution of that field over the models of
  # size +k+. Returns nil when k equals the number of fields.
  def average_k(k)
    return nil if k == @fields.size
    collected = @fields.inject({}) { |acc, field| acc[field] = []; acc }
    md_k(k).each do |model|
      @fields.each do |field|
        contribution = model.contributions[field]
        collected[field].push(contribution) unless contribution.nil?
      end
    end
    mean_of_each(collected)
  end

  # Hash of field => mean of the field's single-predictor r2 and its
  # #average_k values for k in 1...@fields.size. Computed once and cached.
  def general_averages
    if @general_averages.nil?
      collected = @fields.inject({}) { |acc, field| acc[field] = [md(field).r2]; acc }
      (1...@fields.size).each do |k|
        by_field = average_k(k)
        @fields.each { |field| collected[field].push(by_field[field]) }
      end
      @general_averages = mean_of_each(collected)
    end
    @general_averages
  end

  # Enumerates, through GSL::Combination, every combination of 1..n
  # fields; fills @models (field lists in enumeration order) and
  # @models_data (sorted field list => ModelData).
  def create_models
    @models = []
    @models_data = {}
    (1..@fields.size).each do |size|
      c = GSL::Combination.calloc(@fields.size, size)
      loop do
        convert = c.data.to_a.collect { |index| @fields[index] }
        @models.push(convert)
        ds_prev = @ds.dup(convert + [@y_var])
        @models_data[convert.sort] = ModelData.new(convert, ds_prev, @y_var, @fields, @r_class)
        break unless c.next == GSL::SUCCESS
      end
    end
  end

  # Builds a text report: one table with every model, the per-size
  # averages and the overall averages, then one table with the total (T),
  # conditional (C) and general (G) dominance of every pair.
  # +report_type+ is a module mixed into the output string; it must
  # provide +parse_table+ and +nl+.
  def summary(report_type = ConsoleSummary)
    out = ""
    out.extend report_type
    out << "Summary for Dominance Analysis of #{@fields.join(', ')} over #{@y_var}\n"
    t = Statsample::ReportTable.new
    t.header = ["", "r2", "sign"] + @fields
    t.add_row(["Model 0", "", ""] + @fields.collect { |f| sprintf("%0.3f", md(f).r2) })
    t.add_horizontal_line
    (1..@fields.size).each do |size|
      md_k(size).each { |m| t.add_row(m.add_table_row) }
      # Report averages (average_k is nil for the full model)
      averages = average_k(size)
      next if averages.nil?
      t.add_horizontal_line
      t.add_row(["k=#{size} Average", "", ""] + @fields.collect { |f| sprintf("%0.3f", averages[f]) })
      t.add_horizontal_line
    end

    g = general_averages
    t.add_horizontal_line
    t.add_row(["Overall averages", "", ""] + @fields.collect { |f| sprintf("%0.3f", g[f]) })
    out.parse_table(t)

    out.nl
    out << "Pairwise\n"
    td = total_dominance
    cd = conditional_dominance
    gd = general_dominance
    t = Statsample::ReportTable.new(["Pairs", "T", "C", "G"])
    pairs.each do |pair|
      t.add_row([pair.join(" - "), sprintf("%0.1f", td[pair]), sprintf("%0.1f", cd[pair]), sprintf("%0.1f", gd[pair])])
    end
    out.parse_table(t)
    out
  end

  # One regression model over a subset of the predictors, plus the extra
  # r2 each remaining predictor would add to it.
  class ModelData
    attr_reader :contributions

    # name    :: array with the field names that form the model.
    # ds      :: dataset handed to the regression class.
    # y_var   :: dependent field name.
    # fields  :: every predictor of the analysis (keys of +contributions+).
    # r_class :: regression class; nil selects
    #            Regression::MultipleRegressionPairwise.
    def initialize(name, ds, y_var, fields, r_class)
      @name = name
      @fields = fields
      @contributions = @fields.inject({}) { |acc, field| acc[field] = nil; acc }
      r_class = Regression::MultipleRegressionPairwise if r_class.nil?
      @lr = r_class.new(ds, y_var)
    end

    # Stores the contribution of field +f+ as +v+ minus this model's r2.
    def add_contribution(f, v)
      @contributions[f] = v - r2
    end

    # r2 reported by the underlying regression object.
    def r2
      @lr.r2
    end

    # Row for the summary table: name, r2, significance and one cell per
    # field ("--" when no contribution was recorded).
    def add_table_row
      head = [@name.join("*"), sprintf("%0.3f", r2), sprintf("%0.3f", @lr.significance)]
      head + @fields.collect do |k|
        v = @contributions[k]
        v.nil? ? "--" : sprintf("%0.3f", v)
      end
    end

    # Two-line text description of the model and its contributions.
    def summary
      # Exactly three values for the three directives; the former surplus
      # fourth argument was never printed.
      out = sprintf("%s: r2=%0.3f(p=%0.2f)\n", @name.join("*"), r2, @lr.significance)
      cells = @fields.collect do |k|
        v = @contributions[k]
        v.nil? ? "--" : sprintf("%s=%0.3f", k, v)
      end
      out << cells.join(" | ")
      out << "\n"
      out
    end
  end

  private

  # Three-way verdict shared by every pairwise comparison:
  # 1 if a > b, 0 if a < b, 0.5 otherwise.
  def compare_dominance(a, b)
    if a > b
      1
    elsif a < b
      0
    else
      0.5
    end
  end

  # Unanimous verdicts are returned as they are; mixed verdicts give 0.5.
  def collapse_dominances(dominances)
    final = dominances.uniq
    final.size > 1 ? 0.5 : final[0]
  end

  # Maps a hash of field => list of numbers to field => mean of the list,
  # using the library's Array#to_vector(:scale).mean.
  def mean_of_each(lists)
    out = {}
    lists.each { |field, values| out[field] = values.to_vector(:scale).mean }
    out
  end
end
258
+
259
+ end
@@ -0,0 +1,126 @@
1
+ module Statsample
2
+ class DominanceAnalysis
3
class Bootstrap
  include Writable
  attr_reader :samples_td, :samples_cd, :samples_gd, :samples_ga, :fields
  # Regression class handed to every DominanceAnalysis built here.
  # (Declared once; the writer used to be defined twice.)
  attr_writer :lr_class

  # ds    :: dataset; must respond to +cases+, +fields+ and +bootstrap+.
  # y_var :: name of the dependent field inside +ds+.
  def initialize(ds, y_var)
    @ds = ds
    @y_var = y_var
    @n = ds.cases
    @fields = ds.fields - [y_var]
    @samples_ga = @fields.inject({}) { |acc, field| acc[field] = []; acc }
    @n_samples = 0
    @lr_class = Regression::MultipleRegressionPairwise
    create_samples_pairs
  end

  # DominanceAnalysis of the original dataset, built on first use and
  # cached afterwards.
  def da
    @da = DominanceAnalysis.new(@ds, @y_var, @lr_class) if @da.nil?
    @da
  end

  # Draws +number_samples+ resamples with <tt>@ds.bootstrap(n)</tt>, runs a
  # DominanceAnalysis on each and appends its total, conditional and
  # general dominances and its general averages to the sample stores.
  # Prints a progress line per resample to standard output.
  def bootstrap(number_samples, n = nil)
    number_samples.times do |t|
      @n_samples += 1
      puts "Bootstrap #{t+1} of #{number_samples}"
      ds_boot = @ds.bootstrap(n)
      da_1 = DominanceAnalysis.new(ds_boot, @y_var, @lr_class)
      da_1.total_dominance.each { |pair, value| @samples_td[pair].push(value) }
      da_1.conditional_dominance.each { |pair, value| @samples_cd[pair].push(value) }
      da_1.general_dominance.each { |pair, value| @samples_gd[pair].push(value) }
      da_1.general_averages.each { |field, value| @samples_ga[field].push(value) }
    end
  end

  # Enumerates every pair of fields with GSL::Combination, storing the
  # pairs in @pairs and an empty sample list per pair in each of the
  # three dominance stores.
  def create_samples_pairs
    @samples_td = {}
    @samples_cd = {}
    @samples_gd = {}
    @pairs = []
    c = GSL::Combination.calloc(@fields.size, 2)
    loop do
      convert = c.data.to_a.collect { |index| @fields[index] }
      @pairs.push(convert)
      [@samples_td, @samples_cd, @samples_gd].each { |store| store[convert] = [] }
      break unless c.next == GSL::SUCCESS
    end
  end

  # Text report of the bootstrap: one table with a section per kind of
  # dominance and a second table with the general averages of each field.
  # Raises RuntimeError when #bootstrap has not been run yet.
  # +report_type+ is a module mixed into the output string; it must
  # provide +add+, +nl+ and +parse_table+.
  def summary(report_type = ConsoleSummary)
    raise "You should bootstrap first" if @n_samples == 0
    out = ""
    alfa = 0.95
    # Only printed in the header line below; not used in the tables.
    t = GSL::Cdf.tdist_Pinv(1 - ((1 - alfa) / 2), @n_samples - 1)
    out.extend report_type
    out.add "Summary for Bootstrap Dominance Analysis of " + @fields.join(", ") + " over " + @y_var + "\n"
    out.add "Size of sample: #{@n_samples}\n"
    out.add "t:#{t}\n"
    out.nl
    table = ReportTable.new
    table.header = ["pairs", "sD", "Dij", "SE(Dij)", "Pij", "Pji", "Pno", "Reprod"]
    sections = [
      ["Complete dominance", @samples_td, :total_dominance_pairwise],
      ["Conditional dominance", @samples_cd, :conditional_dominance_pairwise],
      ["General Dominance", @samples_gd, :general_dominance_pairwise]
    ]
    sections.each_with_index do |(title, samples, pairwise), index|
      # Sections are separated by a line; none is drawn after the last one.
      table.add_horizontal_line if index > 0
      table.add_row([title])
      table.add_horizontal_line
      @pairs.each do |pair|
        std = samples[pair].to_vector(:scale)
        ttd = da.send(pairwise, pair[0], pair[1])
        table.add_row(summary_pairs(pair, std, ttd))
      end
    end
    out.parse_table(table)
    out.add("General averages")
    table = Statsample::ReportTable.new
    table.header = ["var", "mean", "se", "p.5", "p.95"]
    @fields.each do |field|
      v = @samples_ga[field].to_vector(:scale)
      table.add_row([field, sprintf("%0.3f", v.mean), sprintf("%0.3f", v.sd), sprintf("%0.3f", v.percentil(5)), sprintf("%0.3f", v.percentil(95))])
    end
    out.parse_table(table)
    out
  end

  # One table row for +pair+: its name, the dominance +ttd+ measured on the
  # original data, the mean and sd of the bootstrap values +std+, the
  # proportions of 1, 0 and 0.5 among them, and the proportion equal to
  # +ttd+.
  def summary_pairs(pair, std, ttd)
    freqs = std.proportions
    # Values that never occurred get an explicit proportion of 0.
    [0, 0.5, 1].each { |value| freqs[value] = 0 if freqs[value].nil? }
    name = pair[0] + " - " + pair[1]
    [name, f(ttd, 1), f(std.mean, 4), f(std.sd), f(freqs[1]), f(freqs[0]), f(freqs[0.5]), f(freqs[ttd])]
  end

  # Formats +v+ as a fixed-point number with +n+ decimals.
  def f(v, n = 3)
    sprintf("%0.#{n}f", v)
  end
end
125
+ end
126
+ end
@@ -0,0 +1,45 @@
1
+ require 'GDChart'
2
+ module Statsample
3
module Util
  class << self
    # Renders a chart with ruby-gdchart and writes it to +file+ as a JPEG.
    #
    # file         :: path of the image to create (overwritten if present).
    # width/height :: size passed to GDChart#out_graph.
    # chart_type   :: chart type constant passed to GDChart#out_graph.
    # labels       :: labels passed to GDChart#out_graph.
    # options      :: hash of GDChart attribute name => value; keys may be
    #                 Strings or Symbols. Applied after the defaults set
    #                 here, so they can replace title, bg_color and
    #                 image_type.
    # num_datasets :: number of series stored in +data+.
    # data         :: flat array of values; its length divided by
    #                 +num_datasets+ is passed as the number of points.
    def chart_gdchart(file, width, height, chart_type, labels, options, num_datasets, data)
      gdc = GDChart.new
      gdc.title = "Generic title"
      gdc.bg_color = 0xFFFFFF
      gdc.image_type = GDChart::JPEG
      options.each { |key, value| gdc.send("#{key}=", value) }
      # Binary mode: the output is image data, which text mode would
      # corrupt on platforms that translate line endings.
      File.open(file, "wb") do |io|
        gdc.out_graph(width, height, io, chart_type, data.length / num_datasets, labels, num_datasets, data)
      end
    end
  end
end
20
class Nominal
  # Creates a barchart using ruby-gdchart.
  #
  # Labels are the values returned by #frequencies (sorted, converted with
  # +to_s+) and the bars are their counts. The 'ext_color' option is always
  # set to the palette below; the caller's +options+ hash is left untouched.
  def gdchart_frequencies(file, width = 300, height = 150, chart_type = GDChart::BAR, options = {})
    labels, data = [], []
    frequencies.sort.each do |value, count|
      labels.push(value.to_s)
      data.push(count)
    end
    # merge builds a new hash instead of writing into the argument.
    chart_options = options.merge('ext_color' => [0xFF3399, 0xFF9933, 0xFFEE33, 0x33FF33, 0x9966FF])
    Statsample::Util.chart_gdchart(file, width, height, chart_type, labels, chart_options, 1, data)
  end
end
32
class Scale < Ordinal
  # Draws a histogram of the vector with ruby-gdchart.
  #
  # The vector is binned with <tt>histogram(bins)</tt>; every bin contributes
  # its count as a bar and the midpoint of its range (as a string) as label.
  # The remaining arguments are forwarded to Statsample::Util.chart_gdchart
  # with a single dataset.
  def gdchart_histogram(bins, file, width = 300, height = 150, chart_type = GDChart::BAR, options = {})
    h = histogram(bins)
    counts = []
    midpoints = []
    (0...bins).each do |bin|
      counts << h[bin]
      limits = h.get_range(bin)
      midpoints << ((limits[0] + limits[1]) / 2.0).to_s
    end
    Statsample::Util.chart_gdchart(file, width, height, chart_type, midpoints, options, 1, counts)
  end
end
45
+ end
@@ -0,0 +1,108 @@
1
+ module Statsample
2
+ module Graph
3
# Box-and-whisker plot drawn on top of SVG::Graph::Bar.
# Each entry of @data is read through the keys :title, :data and :vector.
class SvgBoxplot < SVG::Graph::Bar
  # Forces the :fields setting to a single placeholder entry before
  # delegating to the parent. The caller's hash is not modified.
  def initialize(config = {})
    super(config.merge(:fields => ["dummy"]))
  end

  # One x label per dataset, taken from its :title.
  def get_x_labels
    @data.collect { |dataset| dataset[:title] }
  end

  # Lower bound of the value axis: min_scale_value when it is set;
  # otherwise the smallest data value, replaced by that value minus 2 when
  # it is above 10 and by 0 when it lies in (0, 10].
  def min_value
    return min_scale_value unless min_scale_value.nil?
    lowest = @data.collect { |dataset| dataset[:data].min }.min
    return lowest unless lowest > 0
    lowest > 10 ? lowest - 2 : 0
  end

  # Adds to @graph, for every dataset: a circle and a text label per
  # outlier, the rectangle spanning q1..q3, and a path with both whiskers,
  # the stem between them and the median bar.
  def draw_data
    minvalue = min_value
    fieldwidth = field_width
    unit_size = (@graph_height.to_f - font_size * 2 * top_font) /
                (get_y_labels.max - get_y_labels.min)
    bargap = bar_gap ? (fieldwidth < 10 ? fieldwidth / 2 : 10) : 0

    bar_width = fieldwidth - bargap
    bar_width /= @data.length if stack == :side

    @data.each_with_index do |dataset, field_count|
      vector = dataset[:vector]
      min = dataset[:data].min
      max = dataset[:data].max
      median = vector.median
      q1 = vector.percentil(25)
      q3 = vector.percentil(75)
      iqr = q3 - q1
      left = fieldwidth * field_count

      # Whiskers reach 1.5 * IQR beyond the quartiles, clipped to the data.
      top_wisk = (q3 + iqr * 1.5 < max) ? q3 + iqr * 1.5 : max
      down_wisk = (q1 - iqr * 1.5 > min) ? q1 - iqr * 1.5 : min

      # Vertical positions are measured down from the top of the graph.
      top = @graph_height - ((top_wisk - minvalue) * unit_size)
      down = @graph_height - ((down_wisk - minvalue) * unit_size)
      median_bar = @graph_height - ((median - minvalue) * unit_size)

      middle = left + (bar_width / 2)
      left_whis = left + (bar_width * 0.4)
      right_whis = left + (bar_width * 0.6)
      left_rect = left + (bar_width * 0.25)
      right_rect = left + (bar_width * 0.75)
      top_rect = @graph_height - ((q3 - minvalue) * unit_size)
      height_rect = iqr * unit_size
      path = "M #{left_whis} #{top} H #{right_whis} M #{middle} #{top} V #{down} M #{left_whis} #{down} H #{right_whis} M #{left_rect} #{median_bar} H #{right_rect}"

      # Mark outliers: only needed when a whisker was clipped short of the
      # data extremes.
      if top_wisk != max || down_wisk != min
        vector.valid_data.each do |d|
          next unless d < down_wisk || d > top_wisk
          y_out = (@graph_height - (d - minvalue) * unit_size).to_s
          @graph.add_element("circle", {
            "cx" => middle.to_s,
            "cy" => y_out,
            "r" => "3",
            "class" => "dataPoint#{field_count+1}"
          })
          @graph.add_element("text", {
            "x" => (middle + 20).to_s,
            "y" => y_out,
            "class" => "dataPointLabel",
            "style" => "#{style} stroke: #000;"
          }).text = d.to_s
        end
      end

      @graph.add_element("rect", {
        "x" => left_rect.to_s,
        "y" => top_rect.to_s,
        "width" => (bar_width / 2).to_s,
        "height" => height_rect.to_s,
        "class" => "fill#{field_count+1}"
      })
      @graph.add_element("path", { "d" => path, "style" => "stroke:black;stroke-width:2" })
    end
  end
end
107
+ end
108
+ end