statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,259 @@
1
+ require 'statsample/dominanceanalysis/bootstrap'
2
+ module Statsample
3
+ class DominanceAnalysis
4
# Sets up a dominance analysis of +y_var+ against every other field of +ds+.
#
# ds      :: dataset; must respond to [], fields and dup(field_names)
# y_var   :: name of the dependent variable field
# r_class :: regression class later handed to every ModelData
#
# After storing its inputs it derives the dataset of independent variables,
# then runs create_models followed by fill_models.
def initialize(ds, y_var, r_class = Regression::MultipleRegressionPairwise)
  @ds = ds
  @y_var = y_var
  @r_class = r_class
  @dy = ds[y_var]
  independent_names = ds.fields - [y_var]
  @ds_indep = ds.dup(independent_names)
  @fields = @ds_indep.fields
  create_models
  fill_models
end
14
# For every model, and for every field that model does not already contain,
# hands the r2 of the model extended with that field to the base model's
# add_contribution. Returns @models.
def fill_models
  @models.each do |model|
    absent = @fields.reject { |field| model.include?(field) }
    absent.each do |field|
      md(model).add_contribution(field, md(model + [field]).r2)
    end
  end
end
24
# Compares the r2 of the models looked up through md(i) and md(j).
# Returns 1 when i's r2 is larger, 0 when j's is larger and 0.5 otherwise.
def dominance_for_nil_model(i, j)
  r2_i = md(i).r2
  r2_j = md(j).r2
  return 1 if r2_i > r2_j
  return 0 if r2_i < r2_j
  0.5
end
33
# Total (complete) dominance between fields i and j.
# Returns 1 if i D j, 0 if j D i and 0.5 if undetermined.
#
# The verdict of dominance_for_nil_model is combined with one verdict per
# stored model that has a contribution recorded for both fields; any
# disagreement among those verdicts yields 0.5.
def total_dominance_pairwise(i, j)
  baseline = dominance_for_nil_model(i, j)
  return 0.5 if baseline == 0.5
  verdicts = [baseline]
  @models_data.each do |_key, model|
    contrib_i = model.contributions[i]
    contrib_j = model.contributions[j]
    next if contrib_i.nil? || contrib_j.nil?
    verdict = if contrib_i > contrib_j
                1
              elsif contrib_i < contrib_j
                0
              else
                0.5
              end
    verdicts.push(verdict)
  end
  distinct = verdicts.uniq
  distinct.size > 1 ? 0.5 : distinct[0]
end
52
+
53
# Conditional dominance between fields i and j.
# Returns 1 if i cD j, 0 if j cD i and 0.5 if undetermined.
#
# Starts from the verdict of dominance_for_nil_model and adds one verdict per
# model size k (1 up to, but excluding, the number of fields) by comparing
# the two fields' entries in average_k(k). Any disagreement among the
# verdicts yields 0.5.
def conditional_dominance_pairwise(i, j)
  dm = dominance_for_nil_model(i, j)
  return 0.5 if dm == 0.5
  dominances = [dm]
  (1...@fields.size).each do |k|
    a = average_k(k)
    if a[i] > a[j]
      dominances.push(1)
    elsif a[i] < a[j]
      dominances.push(0)
    else
      # A tie at this level is recorded as undetermined. The original code
      # called a non-existent method `a(0.5)` here and raised NoMethodError.
      dominances.push(0.5)
    end
  end
  final = dominances.uniq
  final.size > 1 ? 0.5 : final[0]
end
71
# General dominance between fields i and j, decided by comparing their
# entries in general_averages.
# Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined.
def general_dominance_pairwise(i, j)
  overall = general_averages
  return 1 if overall[i] > overall[j]
  return 0 if overall[i] < overall[j]
  0.5
end
82
# Returns every stored model made of exactly two fields.
def pairs
  @models.select { |model| model.size == 2 }
end
85
# Returns a hash mapping each pair from #pairs to its
# total_dominance_pairwise verdict.
def total_dominance
  verdicts = {}
  pairs.each do |pair|
    verdicts[pair] = total_dominance_pairwise(pair[0], pair[1])
  end
  verdicts
end
91
# Returns a hash mapping each pair from #pairs to its
# conditional_dominance_pairwise verdict.
def conditional_dominance
  verdicts = {}
  pairs.each do |pair|
    verdicts[pair] = conditional_dominance_pairwise(pair[0], pair[1])
  end
  verdicts
end
97
# Returns a hash mapping each pair from #pairs to its
# general_dominance_pairwise verdict.
def general_dominance
  verdicts = {}
  pairs.each do |pair|
    verdicts[pair] = general_dominance_pairwise(pair[0], pair[1])
  end
  verdicts
end
103
+
104
# Looks up the stored model data for +m+.
#
# m :: either an array of field names (any order) or a single field name.
#
# Keys of @models_data are sorted arrays of field names. Callers in this
# class also pass a bare field name (e.g. md(f)); Array() wraps it so the
# lookup does not depend on String#sort, which only exists on Ruby 1.8.
# Returns nil when no such model is stored.
def md(m)
  @models_data[Array(m).sort]
end
107
# Returns the model data (via md) of every stored model that has exactly
# k fields, in the order the models were created.
def md_k(k)
  size_k = @models.select { |model| model.size == k }
  size_k.collect { |model| md(model) }
end
115
# Averages, per field, the non-nil contributions recorded on all models of
# size k. Returns a hash field => mean, or nil when k equals the number of
# fields.
def average_k(k)
  return nil if k == @fields.size
  size_k_models = md_k(k)
  means = {}
  @fields.each do |field|
    recorded = []
    size_k_models.each do |model|
      contribution = model.contributions[field]
      recorded.push(contribution) unless contribution.nil?
    end
    means[field] = recorded.to_vector(:scale).mean
  end
  means
end
130
# Per-field overall average: the mean of the field's own r2 (md(field).r2)
# together with its average_k value for every k from 1 up to, but excluding,
# the number of fields. The result hash is computed once and cached in
# @general_averages.
def general_averages
  return @general_averages unless @general_averages.nil?
  per_level = (1...@fields.size).collect { |k| average_k(k) }
  overall = {}
  @fields.each do |field|
    series = [md(field).r2] + per_level.collect { |level| level[field] }
    overall[field] = series.to_vector(:scale).mean
  end
  @general_averages = overall
end
147
# Enumerates every combination of the independent fields, from size 1 up to
# all of them, using GSL::Combination. For each combination it records the
# field list in @models and stores a ModelData, built on a copy of the
# dataset restricted to those fields plus the dependent variable, in
# @models_data under the sorted field list.
def create_models
  @models = []
  @models_data = {}
  (1..@fields.size).each do |size|
    combination = GSL::Combination.calloc(@fields.size, size)
    exhausted = false
    # The body runs at least once per size, then advances until GSL reports
    # that no further combination exists.
    until exhausted
      names = combination.data.to_a.collect { |index| @fields[index] }
      @models.push(names)
      subset = @ds.dup(names + [@y_var])
      @models_data[names.sort] = ModelData.new(names, subset, @y_var, @fields, @r_class)
      exhausted = (combination.next != GSL::SUCCESS)
    end
  end
end
164
# Builds the text report of the analysis.
#
# report_type :: module mixed into the output string; it must supply
#                parse_table and nl (defaults to ConsoleSummary)
#
# The first table lists the single-field r2 values ("Model 0"), one row per
# model grouped by size, the per-size averages and the overall averages.
# The second table lists the total (T), conditional (C) and general (G)
# dominance verdict for every pair. Returns the extended string.
def summary(report_type=ConsoleSummary)
  out = ""
  out.extend report_type
  out << "Summary for Dominance Analysis of "+@fields.join(", ")+" over "+@y_var+"\n"

  table = Statsample::ReportTable.new
  table.header = ["","r2","sign"] + @fields
  single_r2 = @fields.collect { |field| sprintf("%0.3f", md(field).r2) }
  table.add_row(["Model 0","",""] + single_r2)
  table.add_horizontal_line

  (1..@fields.size).each do |size|
    md_k(size).each { |model| table.add_row(model.add_table_row) }
    # average_k returns nil for the full-size model, which gets no average row
    level = average_k(size)
    next if level.nil?
    table.add_horizontal_line
    level_cells = @fields.collect { |field| sprintf("%0.3f", level[field]) }
    table.add_row(["k=#{size} Average","",""] + level_cells)
    table.add_horizontal_line
  end

  overall = general_averages
  table.add_horizontal_line
  overall_cells = @fields.collect { |field| sprintf("%0.3f", overall[field]) }
  table.add_row(["Overall averages","",""] + overall_cells)
  out.parse_table(table)

  out.nl
  out << "Pairwise\n"
  total = total_dominance
  conditional = conditional_dominance
  general = general_dominance
  pair_table = Statsample::ReportTable.new(["Pairs","T","C","G"])
  pairs.each do |pair|
    cells = [total, conditional, general].collect { |verdicts| sprintf("%0.1f", verdicts[pair]) }
    pair_table.add_row([pair.join(" - ")] + cells)
  end
  out.parse_table(pair_table)
  out
end
217
# One regression model of the dominance analysis: its fitted regression
# object plus, for each candidate field, the contribution recorded for that
# field (nil until add_contribution is called for it).
class ModelData
  attr_reader :contributions

  # name    :: array of field names that make up this model
  # ds      :: dataset handed to the regression class
  # y_var   :: name of the dependent variable
  # fields  :: all candidate field names; each gets a contribution slot
  # r_class :: regression class; nil falls back to
  #            Regression::MultipleRegressionPairwise
  def initialize(name, ds, y_var, fields, r_class)
    @name = name
    @fields = fields
    @contributions = @fields.inject({}) { |a, v| a[v] = nil; a }
    r_class = Regression::MultipleRegressionPairwise if r_class.nil?
    @lr = r_class.new(ds, y_var)
  end

  # Records the contribution of field +f+ as +v+ minus this model's r2.
  def add_contribution(f, v)
    @contributions[f] = v - r2
  end

  # r2 as reported by the underlying regression object.
  def r2
    @lr.r2
  end

  # Row for the report table: model name, r2, significance and one cell per
  # field ("--" where no contribution is recorded).
  def add_table_row
    [@name.join("*"), sprintf("%0.3f", r2), sprintf("%0.3f", @lr.significance)] +
      @fields.collect { |k|
        v = @contributions[k]
        v.nil? ? "--" : sprintf("%0.3f", v)
      }
  end

  # Two-line text description of the model and its contributions.
  def summary
    # The format string has three placeholders. The original passed a fourth
    # argument (@lr.sst) that was never printed and that Kernel#sprintf
    # rejects when $DEBUG is set.
    out = sprintf("%s: r2=%0.3f(p=%0.2f)\n", @name.join("*"), r2, @lr.significance)
    out << @fields.collect { |k|
      v = @contributions[k]
      v.nil? ? "--" : sprintf("%s=%0.3f", k, v)
    }.join(" | ")
    out << "\n"
    out
  end
end
257
+ end
258
+
259
+ end
@@ -0,0 +1,126 @@
1
+ module Statsample
2
+ class DominanceAnalysis
3
+ class Bootstrap
4
+ include Writable
5
+ attr_reader :samples_td,:samples_cd,:samples_gd,:samples_ga, :fields
6
+ attr_writer :lr_class
7
# Prepares a bootstrap of the dominance analysis of +y_var+ on +ds+.
#
# ds    :: dataset; must respond to cases and fields
# y_var :: name of the dependent variable
#
# No sample is drawn here: the sample counter starts at 0, the regression
# class defaults to Regression::MultipleRegressionPairwise and the per-pair
# result stores are created by create_samples_pairs.
def initialize(ds, y_var)
  @ds = ds
  @y_var = y_var
  @n = ds.cases
  @fields = ds.fields - [y_var]
  @samples_ga = {}
  @fields.each { |field| @samples_ga[field] = [] }
  @n_samples = 0
  @lr_class = Regression::MultipleRegressionPairwise
  create_samples_pairs
end
17
# Sets the regression class used for the analyses built by this object.
# NOTE(review): the class also declares attr_writer :lr_class, so this
# explicit writer redefines an identical method.
def lr_class=(lr)
  @lr_class = lr
end
20
# Dominance analysis of the original (non-resampled) dataset, built on first
# use with the current regression class and cached in @da afterwards.
def da
  @da = DominanceAnalysis.new(@ds, @y_var, @lr_class) if @da.nil?
  @da
end
26
# Draws +number_samples+ bootstrap resamples of the dataset and accumulates
# the results of a dominance analysis on each one.
#
# number_samples :: how many resamples to add
# n              :: forwarded to the dataset's bootstrap method
#
# For every resample the total, conditional and general dominance verdicts
# and the general averages are appended to the matching @samples_* store.
# A progress line is printed per resample.
def bootstrap(number_samples, n=nil)
  stores = [
    [:total_dominance, @samples_td],
    [:conditional_dominance, @samples_cd],
    [:general_dominance, @samples_gd],
    [:general_averages, @samples_ga]
  ]
  number_samples.times do |t|
    @n_samples += 1
    puts "Bootstrap #{t+1} of #{number_samples}"
    resample = @ds.bootstrap(n)
    analysis = DominanceAnalysis.new(resample, @y_var, @lr_class)
    stores.each do |reader, store|
      analysis.send(reader).each { |key, value| store[key].push(value) }
    end
  end
end
46
# Enumerates every two-field combination of @fields with GSL::Combination,
# stores them in @pairs and gives each pair an empty result list in
# @samples_td, @samples_cd and @samples_gd.
def create_samples_pairs
  @samples_td = {}
  @samples_cd = {}
  @samples_gd = {}
  @pairs = []
  combination = GSL::Combination.calloc(@fields.size, 2)
  exhausted = false
  # The body runs at least once, then advances until GSL reports that no
  # further combination exists.
  until exhausted
    pair = combination.data.to_a.collect { |index| @fields[index] }
    @pairs.push(pair)
    @samples_td[pair] = []
    @samples_cd[pair] = []
    @samples_gd[pair] = []
    exhausted = (combination.next != GSL::SUCCESS)
  end
end
62
# Builds the text report of the bootstrap results.
#
# report_type :: module mixed into the output string; it must supply add,
#                nl and parse_table (defaults to ConsoleSummary)
#
# Raises RuntimeError when no bootstrap sample has been drawn yet.
# The first table has one section per dominance type (complete, conditional,
# general) with one row per pair, produced by summary_pairs. The second
# table lists mean, sd and 5th/95th percentile of the bootstrapped general
# averages per field. Returns the extended string.
def summary(report_type=ConsoleSummary)
  out = ""
  raise "You should bootstrap first" if @n_samples==0
  alfa = 0.95
  t = GSL::Cdf.tdist_Pinv(1-((1-alfa) / 2), @n_samples - 1)
  out.extend report_type
  out.add "Summary for Bootstrap Dominance Analysis of "+@fields.join(", ")+" over "+@y_var+"\n"
  out.add "Size of sample: #{@n_samples}\n"
  out.add "t:#{t}\n"
  out.nl

  table = ReportTable.new
  table.header = ["pairs","sD","Dij","SE(Dij)","Pij","Pji","Pno","Reprod"]
  sections = [
    ["Complete dominance", @samples_td, :total_dominance_pairwise],
    ["Conditional dominance", @samples_cd, :conditional_dominance_pairwise],
    ["General Dominance", @samples_gd, :general_dominance_pairwise]
  ]
  sections.each_with_index do |(title, samples, pairwise), position|
    # A separator closes the previous section; the last one is left open.
    table.add_horizontal_line if position > 0
    table.add_row([title])
    table.add_horizontal_line
    @pairs.each do |pair|
      std = samples[pair].to_vector(:scale)
      ttd = da.send(pairwise, pair[0], pair[1])
      table.add_row(summary_pairs(pair, std, ttd))
    end
  end
  out.parse_table(table)

  out.add("General averages")
  averages_table = Statsample::ReportTable.new
  averages_table.header = ["var","mean","se","p.5","p.95"]
  @fields.each do |field|
    v = @samples_ga[field].to_vector(:scale)
    stats = [v.mean, v.sd, v.percentil(5), v.percentil(95)]
    averages_table.add_row([field] + stats.collect { |value| sprintf("%0.3f", value) })
  end
  out.parse_table(averages_table)
  out
end
112
# Formats one report row for a pair of fields.
#
# pair :: two field names
# std  :: vector of bootstrapped verdicts; must respond to proportions,
#         mean and sd
# ttd  :: verdict obtained on the original sample (used as a key into the
#         proportions)
#
# Proportions missing for the verdicts 0, 0.5 and 1 are set to 0 on the hash
# returned by std.proportions. Returns the row as an array of strings.
def summary_pairs(pair, std, ttd)
  freqs = std.proportions
  absent = [0, 0.5, 1].select { |verdict| freqs[verdict].nil? }
  absent.each { |verdict| freqs[verdict] = 0 }
  label = pair[0]+" - "+pair[1]
  [
    label,
    f(ttd, 1),
    f(std.mean, 4),
    f(std.sd),
    f(freqs[1]),
    f(freqs[0]),
    f(freqs[0.5]),
    f(freqs[ttd])
  ]
end
120
# Formats +v+ as a fixed-point string with +n+ decimals (3 by default).
def f(v, n=3)
  format("%0.#{n}f", v)
end
124
+ end
125
+ end
126
+ end
@@ -0,0 +1,45 @@
1
+ require 'GDChart'
2
+ module Statsample
3
+ module Util
4
+ class << self
5
# Renders a chart with ruby-gdchart and writes it to +file+ as a JPEG.
#
# file         :: path of the image to create (overwritten if present)
# width/height :: passed to GDChart#out_graph
# chart_type   :: GDChart chart type constant
# labels       :: label strings, one per data point
# options      :: hash of GDChart attribute name (String or Symbol) => value;
#                 applied after the defaults below, so it can override them
# num_datasets :: number of series contained in +data+
# data         :: flat array of values; its length divided by num_datasets
#                 is passed as the number of points
#
# Returns whatever GDChart#out_graph returns.
def chart_gdchart(file, width, height, chart_type, labels, options, num_datasets, data)
  # GDChart is already required at the top of this file, so the per-call
  # require was dropped.
  gdc = GDChart.new
  gdc.title = "Generic title"
  gdc.bg_color = 0xFFFFFF
  gdc.image_type = GDChart::JPEG
  # Interpolation accepts symbol keys as well as strings.
  options.each { |k, v| gdc.send("#{k}=", v) }
  # Binary mode: the output is image data, not text.
  File.open(file, "wb") do |io|
    gdc.out_graph(width, height, io, chart_type, data.length / num_datasets, labels, num_datasets, data)
  end
end
18
+ end
19
+ end
20
class Nominal
  # Creates a bar chart of the vector's frequencies using ruby-gdchart.
  #
  # file         :: path of the image to create
  # width/height :: image size
  # chart_type   :: GDChart chart type (GDChart::BAR by default)
  # options      :: extra GDChart attributes; 'ext_color' is always replaced
  #                 by the fixed palette below
  #
  # Labels and values come from self.frequencies, sorted by key. The
  # caller's options hash is left untouched; the original wrote the palette
  # into it.
  def gdchart_frequencies(file, width=300, height=150, chart_type=GDChart::BAR, options={})
    labels, data = [], []
    self.frequencies.sort.each { |k, v|
      labels.push(k.to_s)
      data.push(v)
    }
    chart_options = options.merge('ext_color' => [0xFF3399, 0xFF9933, 0xFFEE33, 0x33FF33, 0x9966FF])
    Statsample::Util.chart_gdchart(file, width, height, chart_type, labels, chart_options, 1, data)
  end
end
32
class Scale < Ordinal
  # Creates a histogram chart using ruby-gdchart.
  #
  # bins         :: number of bins, passed to #histogram
  # file         :: path of the image to create
  # width/height :: image size
  # chart_type   :: GDChart chart type (GDChart::BAR by default)
  # options      :: extra GDChart attributes
  #
  # Each bar is labelled with the midpoint of its bin range.
  def gdchart_histogram(bins, file, width=300, height=150, chart_type=GDChart::BAR, options={})
    histogram_data = histogram(bins)
    counts = []
    midpoints = []
    (0...bins).each do |bin|
      counts.push(histogram_data[bin])
      bounds = histogram_data.get_range(bin)
      midpoints.push(((bounds[0] + bounds[1]) / 2.0).to_s)
    end
    Statsample::Util.chart_gdchart(file, width, height, chart_type, midpoints, options, 1, counts)
  end
end
45
+ end
@@ -0,0 +1,108 @@
1
+ module Statsample
2
+ module Graph
3
+ class SvgBoxplot < SVG::Graph::Bar
4
# Creates the boxplot graph.
#
# config :: configuration hash forwarded to the parent class; :fields is
#           always forced to ["dummy"]
#
# The caller's hash is left untouched: a merged copy is passed up. The
# original wrote :fields into the caller's hash.
def initialize(config={})
  super(config.merge(:fields => ["dummy"]))
end
8
# X axis labels: the :title of every dataset, in order.
def get_x_labels
  @data.map { |dataset| dataset[:title] }
end
13
+
14
# Lower bound used for the value axis.
#
# Returns min_scale_value when it is set. Otherwise takes the smallest value
# across all datasets: it is returned unchanged when it is zero or negative,
# lowered by 2 when it is above 10, and replaced by 0 in between.
def min_value
  return min_scale_value unless min_scale_value.nil?
  lowest = @data.collect { |dataset| dataset[:data].min }.min
  return lowest unless lowest > 0
  lowest > 10 ? lowest - 2 : 0
end
30
# Draws one box-and-whisker figure per dataset onto @graph.
#
# Each dataset hash must provide :data (responds to min/max) and :vector
# (responds to median, percentil and valid_data). For every dataset:
# * the box spans the 25th to the 75th percentile;
# * each whisker ends at 1.5 * IQR beyond the box, clipped to the data
#   minimum or maximum;
# * values beyond the whiskers are drawn as labelled circles;
# * a horizontal stroke marks the median.
#
# Vertical positions are measured down from @graph_height, scaled by the
# span of the y labels.
def draw_data
  minvalue = min_value
  fieldwidth = field_width
  unit_size = (@graph_height.to_f - font_size * 2 * top_font) /
              (get_y_labels.max - get_y_labels.min)
  bargap = bar_gap ? (fieldwidth < 10 ? fieldwidth / 2 : 10) : 0

  bar_width = fieldwidth - bargap
  bar_width /= @data.length if stack == :side

  field_count = 0
  @data.each do |dataset|
    min = dataset[:data].min
    max = dataset[:data].max
    median = dataset[:vector].median
    q1 = dataset[:vector].percentil(25)
    q3 = dataset[:vector].percentil(75)
    iqr = q3 - q1
    left = fieldwidth * field_count

    # Whiskers never extend past the actual data range.
    top_wisk = (q3 + iqr * 1.5 < max) ? q3 + iqr * 1.5 : max
    down_wisk = (q1 - iqr * 1.5 > min) ? q1 - iqr * 1.5 : min

    top = @graph_height - ((top_wisk - minvalue) * unit_size)
    down = @graph_height - ((down_wisk - minvalue) * unit_size)

    median_bar = @graph_height - ((median - minvalue) * unit_size)
    middle = left + (bar_width / 2)
    left_whis = left + (bar_width * 0.4)
    right_whis = left + (bar_width * 0.6)
    left_rect = left + (bar_width * 0.25)
    right_rect = left + (bar_width * 0.75)
    top_rect = @graph_height - ((q3 - minvalue) * unit_size)
    height_rect = iqr * unit_size
    # Upper whisker cap, vertical stem, lower whisker cap, median stroke.
    path = "M #{left_whis} #{top} H #{right_whis} M #{middle} #{top} V #{down} M #{left_whis} #{down} H #{right_whis} M #{left_rect} #{median_bar} H #{right_rect}"

    # Mark outliers; only needed when a whisker was shortened to the fence.
    if top_wisk != max or down_wisk != min
      dataset[:vector].valid_data.each { |d|
        if (d < down_wisk) or (d > top_wisk)
          y_out = (@graph_height - (d - minvalue) * unit_size).to_s
          @graph.add_element("circle", {
            "cx" => (middle).to_s,
            "cy" => y_out,
            "r" => "3",
            "class" => "dataPoint#{field_count+1}"
          })
          @graph.add_element("text", {
            "x" => (middle + 20).to_s,
            "y" => y_out,
            "class" => "dataPointLabel",
            "style" => "#{style} stroke: #000;"
          }).text = d.to_s
        end
      }
    end

    @graph.add_element("rect", {
      "x" => left_rect.to_s,
      "y" => top_rect.to_s,
      "width" => (bar_width / 2).to_s,
      "height" => (height_rect).to_s,
      "class" => "fill#{field_count+1}"
    })

    @graph.add_element("path", { "d" => path, "style" => "stroke:black;stroke-width:2" })

    field_count += 1
  end
end
106
+ end
107
+ end
108
+ end