statsample 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,232 @@
1
+ require 'statsample/graph/svggraph'
2
+
3
+ module Statsample
4
# Builds a single-page HTML report out of "partials" (HTML fragments) and
# an index of anchors. Graphs are written as SVG files into the report
# directory and referenced from the page with <embed> tags.
class HtmlReport
  # name:: report title; also used to derive the default output directory.
  # dir::  directory where generated SVG files are written. Defaults to
  #        "<name>/". It is created when it does not exist yet.
  def initialize(name, dir = nil)
    require 'fileutils'
    @uniq = 1        # next anchor id
    @uniq_file = 0   # counter used by #uniq_file
    @name = name
    @partials = []   # HTML fragments, in insertion order
    @anchors = []    # [name, level, uniq] triples used to build the index
    dir ||= @name + "/"
    @dir = dir
    @level = 1       # current nesting level of the index
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    FileUtils.mkdir(@dir) unless File.exist?(@dir)
  end

  # Registers an index entry called +name+ and appends the already-rendered
  # HTML fragment +summary+ to the report.
  def add_summary(name, summary)
    add_anchor(name)
    @partials.push(summary)
  end

  # Adds an index entry at the current level and an <a name> target
  # to the report body.
  def add_anchor(name)
    @anchors.push([name, @level, @uniq])
    @partials.push("<a name='#{@uniq}'> </a>")
    @uniq += 1
  end

  # Returns a new file base name (no directory, no extension) built from
  # +prepend+, a per-report counter and the current epoch second.
  def uniq_file(prepend = "file")
    @uniq_file += 1
    "#{prepend}_#{@uniq_file}_#{Time.now.to_i}"
  end

  # Appends a correlation table for dataset +ds+. Each cell shows the
  # coefficient, a significance mark ("**" for p<0.01, "*" for p<0.05)
  # and the probability; significant cells are wrapped in <strong>.
  def add_correlation_matrix(ds)
    add_anchor("Correlation Matrix")
    html = "<h2>Correlation Matrix</h2> <table><thead><th>-</th><th>" +
           ds.fields.join("</th><th>") + "</th> </thead> <tbody>"
    matrix = Statsample::Bivariate.correlation_matrix(ds)
    pmatrix = Statsample::Bivariate.correlation_probability_matrix(ds)

    (0...matrix.row_size).each do |row|
      html << "<tr><td>" + ds.fields[row] + "</td>"
      (0...matrix.column_size).each do |col|
        r = matrix[row, col]
        html << (r.nil? ? "<td>--</td>" : correlation_cell(r, pmatrix[row, col]))
      end
      html << "</tr>"
    end
    html << "</tbody></table>"
    @partials.push(html)
  end

  # Add a scale
  # ds::     dataset holding the items
  # name::   name of the scale
  # fields:: fields of +ds+ that make up the scale (must not be empty)
  # icc::    when true, item characteristic curves are added as well
  #
  # Adds the item-analysis summary and, one index level deeper, a histogram,
  # a run-sequence plot and a normal probability plot of the mean vector.
  def add_scale(ds, name, fields, icc = false)
    raise "Fields are empty" if fields.size == 0
    add_anchor("Scale:#{name}")

    ds_partial = ds.dup(fields)
    ia = Statsample::Reliability::ItemAnalysis.new(ds_partial)
    html = "<h2>Scale: #{name}</h2>"
    html << ia.html_summary
    @partials.push(html)

    @level += 1
    begin
      v = ds_partial.vector_mean
      add_histogram(name, v)
      add_runsequence_plot(name, v)
      add_normalprobability_plot(name, v)
      # add_icc takes (name, ds, fields); the dataset must be passed along.
      add_icc(name, ds, fields) if icc
    ensure
      # Restore the index level even if one of the plots raises.
      @level -= 1
    end
  end

  # Adds a box plot of +vector+.
  def add_boxplot(name, vector, options = {})
    add_graph("Box Plot #{name}", name, vector.svggraph_boxplot(options))
  end

  # Writes +graph+ (anything responding to #burn, #width and #height) to a
  # new SVG file inside the report directory and appends an <embed> for it.
  # +id+ is accepted for interface compatibility and is not used.
  def add_graph(name, id, graph)
    add_anchor(name)
    rs_file = @dir + "/#{uniq_file()}.svg"
    html = "<h3>#{name}</h3> <p><embed src='#{rs_file}' width='#{graph.width}' height='#{graph.height}' type='image/svg+xml' /></p>\n"
    File.open(rs_file, "w") { |f| f.puts(graph.burn) }
    @partials.push(html)
  end

  # Adds a run-sequence plot of +vector+.
  def add_runsequence_plot(name, vector, options = {})
    add_graph("Run-Sequence Plot #{name}", name, vector.svggraph_runsequence_plot(options))
  end

  # Adds a lag plot of +vector+.
  def add_lag_plot(name, vector, options = {})
    add_graph("Lag Plot #{name}", name, vector.svggraph_lag_plot(options))
  end

  # Adds a normal probability plot of +vector+.
  def add_normalprobability_plot(name, vector, options = {})
    add_graph("Normal Probability Plot #{name}", name, vector.svggraph_normalprobability_plot(options))
  end

  # Adds a scatterplot of +y_fields+ against +x_field+.
  # x_field defaults to the first field of +ds+, y_fields to all the others.
  def add_scatterplot(name, ds, x_field = nil, y_fields = nil, config = {})
    add_anchor("Scatterplot: #{name}")
    x_field ||= ds.fields[0]
    y_fields ||= ds.fields - [x_field]
    ds_partial = ds.dup([x_field] + y_fields)
    sc = Statsample::Graph::SvgScatterplot.new(ds_partial, config)
    sc.parse
    sc_file = @dir + "/#{uniq_file("sc")}.svg"
    html = "<h3>Scatterplot #{name}</h3> <p><embed src='#{sc_file}' width='#{sc.width}' height='#{sc.height}' type='image/svg+xml' /></p>\n"
    File.open(sc_file, "w") { |f| f.puts(sc.burn) }
    @partials.push(html)
  end

  # Adds one graph holding a box plot for every field of +ds+ and returns
  # the graph object.
  # NOTE(review): this registers two index entries ("Boxplots: name" here
  # and "name" inside #add_graph) - confirm whether that is intended.
  def add_boxplots(name, ds, options = {})
    add_anchor("Boxplots: #{name}")
    options = { :graph_title => "Boxplots:#{name}", :show_graph_title => true, :height => 500 }.merge!(options)
    graph = Statsample::Graph::SvgBoxplot.new(options)
    ds.fields.each do |f|
      graph.add_data(:title => f, :data => ds[f].valid_data, :vector => ds[f])
    end
    add_graph(name, name, graph)
    graph
  end

  # Adds a histogram of +vector+ followed by its skewness and kurtosis.
  # bins:: number of bins; defaults to one bin per 15 cases (at least 1).
  #        Values above 15 are capped at 15.
  def add_histogram(name, vector, bins = nil, options = {})
    # vector.size / 15 is 0 for fewer than 15 cases; never default to 0 bins.
    bins ||= [vector.size / 15, 1].max
    bins = 15 if bins > 15
    graph = vector.svggraph_histogram(bins, options)
    add_graph("Histogram:#{name}", name, graph)
    html = "<ul><li>Skewness=#{sprintf("%0.3f", vector.skew)}</li>
<li>Kurtosis=#{sprintf("%0.3f", vector.kurtosis)}</li></ul>"
    @partials.push(html)
  end

  # Adds the item characteristic curves of scale +name+, one per field.
  # The SVG files are produced by ItemAnalysis inside the report directory
  # and are expected to be named "<name>_<field>.svg".
  def add_icc(name, ds, fields)
    raise "Fields are empty" if fields.size == 0
    add_anchor("ICC:#{name}")
    ds_partial = ds.dup(fields)
    ia = Statsample::Reliability::ItemAnalysis.new(ds_partial)
    html = "<h3>ICC for scale: #{name}</h3>"
    ia.svggraph_item_characteristic_curve(@dir, name, { :width => 400, :height => 300 })
    ds_partial.fields.sort.each do |f|
      html << "<div><p><strong>#{f}</strong></p><embed src='#{@dir}/#{name}_#{f}.svg' width='400' height='300' type='image/svg+xml' /></div>\n"
    end
    @partials.push(html)
  end

  # Stylesheet embedded in the <head> of the report.
  def css
    <<-HERE
table {
  border-collapse: collapse;
}
th {
  text-align: left;
  padding-right: 1em;
  border-bottom: 3px solid #ccc;
}
th.active img {
  display: inline;
}
tr.even, tr.odd {
  background-color: #eee;
  border-bottom: 1px solid #ccc;
}
tr.even, tr.odd {
  padding: 0.1em 0.6em;
}
td.active {
  background-color: #ddd;
}
table td {
  border:1px solid #aaa;
}
table tr.line td{
  border-top: 2px solid black;
}

    HERE
  end

  # Returns the <ul> / </ul> tags needed to move the index from the level
  # stored in @c_level (maintained by #parse) to +level+.
  def create_uls(level)
    if level > @c_level
      "<ul>\n" * (level - @c_level)
    elsif level < @c_level
      "</ul>\n" * (@c_level - level)
    else
      ""
    end
  end

  # Renders the whole report (head, index and every partial wrapped in a
  # "section" div) and returns it as a String.
  def parse
    html = "<html><head><title>#{@name}</title><style>#{css()}</style></head><body><h1>Report: #{@name}</h1>"
    if @anchors.size > 0
      # The list lives inside the index div so that the closing
      # "</ul></div>" below is balanced.
      html << "<div class='index'>Index<ul>"
      @c_level = 1
      @anchors.each do |name, level, uniq|
        html << create_uls(level)
        @c_level = level
        html << "<li><a href='#" + uniq.to_s + "'>#{name}</a></li>"
      end
      html << create_uls(1)
      html << "</ul></div>"
    end
    html << "<div class='section'>" + @partials.join("</div><div class='section'>") + "</div>"
    html << "</body></html>"
    html
  end

  # Renders the report and writes it to +filename+.
  def save(filename)
    File.open(filename, "w") { |fp| fp.write(parse) }
  end

  private

  # Renders one cell of the correlation table for coefficient +r+ and
  # probability +prob+ (which may be nil).
  def correlation_cell(r, prob)
    sig = ""
    prob_out = ""
    unless prob.nil?
      prob_out = sprintf("%0.3f", prob)
      if prob < 0.01
        sig = "**"
      elsif prob < 0.05
        sig = "*"
      end
    end
    body = "#{sprintf("%0.3f", r)} #{sig}<br /> #{prob_out}"
    sig == "" ? "<td>#{body}</td>" : "<td><strong>#{body}</strong></td>"
  end
end
232
+ end
@@ -0,0 +1,281 @@
1
+ module Statsample
2
# Multiset joins multiple datasets with the same fields and vectors
# but with a different number of cases.
# This is the base class for stratified and cluster sampling estimation.
class Multiset
  # fields::   field names shared by every dataset
  # datasets:: Hash of key => dataset
  attr_reader :fields, :datasets

  # To create a multiset
  # * Multiset.new(%w{f1 f2 f3}) # define only fields
  def initialize(fields)
    @fields = fields
    @datasets = {}
  end

  # Builds a multiset holding one new Dataset (created from +fields+)
  # for every name in +ds_names+.
  def self.new_empty_vectors(fields, ds_names)
    ms = Multiset.new(fields)
    ds_names.each { |d| ms.add_dataset(d, Dataset.new(fields)) }
    ms
  end

  # Sorted list of dataset keys.
  def datasets_names
    @datasets.keys.sort
  end

  # Number of datasets stored.
  def n_datasets
    @datasets.size
  end

  # Stores +ds+ under +key+.
  # Raises ArgumentError when the fields of +ds+ differ from the fields
  # of the multiset.
  def add_dataset(key, ds)
    if ds.fields != @fields
      # inspect keeps the field names distinguishable in the message
      raise ArgumentError, "Dataset (#{ds.fields.inspect}) must have the same fields as the Multiset (#{@fields.inspect})"
    end
    @datasets[key] = ds
  end

  # Yields (dataset key, vector of +field+) for every dataset and returns
  # the sum of the block results, starting from 0.
  def sum_field(field)
    @datasets.inject(0) do |total, (stratum_name, ds)|
      total + yield(stratum_name, ds[field])
    end
  end

  # Yields (dataset key, vector of +field+) for every dataset and returns
  # an Array with the block results.
  def collect_vector(field)
    @datasets.collect { |key, ds| yield key, ds[field] }
  end

  # Dataset stored under key +i+, or nil.
  def [](i)
    @datasets[i]
  end
end
50
# Estimators for stratified samples.
#
# The class methods work on +es+, an Array with one Hash per stratum using
# the keys 'N' (stratum size), 'n' (sample size of the stratum) and either
# 's' (standard deviation) or 'p' (proportion).
# Suffixes: +wr+ / +wor+ = with / without replacement; +esd+ is used for
# the estimated-variance formulas, +ksd+ presumably for the known-variance
# ones (TODO confirm naming).
#
# Instances wrap a Multiset (one dataset per stratum) plus the population
# size of each stratum.
class StratifiedSample
  class << self
    # Mean of all the cases of the given vectors taken together.
    def mean(*v)
      n_total = 0
      total = v.inject(0) do |acc, vector|
        n_total += vector.size
        acc + vector.sum
      end
      total.to_f / n_total
    end

    # Standard error, with replacement: (1/N) * sqrt(sum(N_h^2 * s_h^2 / n_h))
    def standard_error_ksd_wr(es)
      n_total = calculate_n_total(es)
      sum = es.inject(0) do |a, h|
        a + ((h['N']**2 * h['s']**2) / h['n'].to_f)
      end
      (1.to_f / n_total) * Math.sqrt(sum)
    end

    # Variance, with replacement: square of #standard_error_ksd_wr.
    def variance_ksd_wr(es)
      standard_error_ksd_wr(es)**2
    end

    # Variance, without replacement.
    # Source : Cochran (1972)
    def variance_ksd_wor(es)
      n_total = calculate_n_total(es)
      es.inject(0) do |a, h|
        a + ((h['N'].to_f / n_total)**2) * (h['s']**2 / h['n'].to_f) * (1 - (h['n'].to_f / h['N']))
      end
    end

    # Standard error, without replacement: sqrt of #variance_ksd_wor.
    def standard_error_ksd_wor(es)
      Math.sqrt(variance_ksd_wor(es))
    end

    # Variance, without replacement:
    # (1/N^2) * sum(N_h * (N_h - n_h) * s_h^2 / n_h)
    def variance_esd_wor(es)
      n_total = calculate_n_total(es)
      sum = es.inject(0) do |a, h|
        a + h['N'] * (h['N'] - h['n']) * (h['s']**2 / h['n'].to_f)
      end
      (1.0 / (n_total**2)) * sum
    end

    # Standard error, without replacement: sqrt of #variance_esd_wor.
    def standard_error_esd_wor(es)
      Math.sqrt(variance_esd_wor(es))
    end

    # Variance, with replacement: (1/N^2) * sum(s_h^2 * N_h^2 / n_h)
    # Based on http://stattrek.com/Lesson6/STRAnalysis.aspx
    def variance_esd_wr(es)
      n_total = calculate_n_total(es)
      sum = es.inject(0) do |a, h|
        a + ((h['s']**2 * h['N']**2) / h['n'].to_f)
      end
      (1.0 / (n_total**2)) * sum
    end

    # Standard error, with replacement: sqrt of #variance_esd_wr.
    def standard_error_esd_wr(es)
      Math.sqrt(variance_esd_wr(es))
    end

    # Variance of a proportion, without replacement:
    # sum(W_h^2 * p_h * (1 - p_h) / n_h * (1 - n_h / N_h))
    def proportion_variance_ksd_wor(es)
      n_total = calculate_n_total(es)
      es.inject(0) do |a, h|
        a + (((h['N'].to_f / n_total)**2 * h['p'] * (1 - h['p'])) / (h['n'])) * (1 - (h['n'].to_f / h['N']))
      end
    end

    # Standard deviation of a proportion, without replacement.
    def proportion_sd_ksd_wor(es)
      Math.sqrt(proportion_variance_ksd_wor(es))
    end

    # Standard deviation of a proportion, with replacement:
    # (1/N) * sqrt(sum(N_h^2 * p_h * (1 - p_h) / n_h))
    def proportion_sd_ksd_wr(es)
      n_total = calculate_n_total(es)
      sum = es.inject(0) do |a, h|
        a + (h['N']**2 * h['p'] * (1 - h['p'])) / h['n'].to_f
      end
      Math.sqrt(sum) * (1.0 / n_total)
    end

    # Variance of a proportion, with replacement: the square of the
    # with-replacement standard deviation (#proportion_sd_ksd_wr).
    def proportion_variance_ksd_wr(es)
      proportion_sd_ksd_wr(es)**2
    end

    # Variance of a proportion, without replacement:
    # (1/N^2) * sum(N_h^2 * (N_h - n_h) / (N_h - 1) * p_h * (1 - p_h) / (n_h - 1))
    # Same formula as the instance method #variance_pst (Cochran).
    def proportion_variance_esd_wor(es)
      n_total = calculate_n_total(es)
      sum = es.inject(0) do |a, h|
        val = (h['N']**2 * (h['N'] - h['n']) * h['p'] * (1.0 - h['p'])) /
              ((h['n'] - 1) * (h['N'] - 1)).to_f
        a + val
      end
      (1.0 / n_total**2) * sum
    end

    # Standard deviation of a proportion, without replacement:
    # sqrt of #proportion_variance_esd_wor.
    def proportion_sd_esd_wor(es)
      Math.sqrt(proportion_variance_esd_wor(es))
    end

    private

    # Population size: sum of the 'N' entry of every stratum.
    def calculate_n_total(es)
      es.inject(0) { |a, h| a + h['N'] }
    end
  end

  # ms::           a Statsample::Multiset with one dataset per stratum
  # strata_sizes:: Hash of stratum name => population size of the stratum;
  #                its keys must match the dataset names of +ms+.
  # Raises TypeError / ArgumentError when those conditions are not met.
  def initialize(ms, strata_sizes)
    raise TypeError, "ms should be a Multiset" unless ms.is_a? Statsample::Multiset
    @ms = ms
    raise ArgumentError, "You should put a strata size for each dataset" if strata_sizes.keys.sort != ms.datasets_names
    @strata_sizes = strata_sizes
    @population_size = @strata_sizes.inject(0) { |a, x| a + x[1] }
    @strata_number = @ms.n_datasets
    @sample_size = @ms.datasets.inject(0) { |a, x| a + x[1].cases }
  end

  # Number of strata
  def strata_number
    @strata_number
  end

  # Population size. Equal to sum of strata sizes
  # Symbol: N
  def population_size
    @population_size
  end

  # Sample size. Equal to sum of sample of each stratum
  def sample_size
    @sample_size
  end

  # Size of stratum h
  # Symbol: N<sub>h</sub>
  def stratum_size(h)
    @strata_sizes[h]
  end

  # Array with the vector of +field+ of every stratum.
  def vectors_by_field(field)
    @ms.datasets.collect { |_k, ds| ds[field] }
  end

  # Population proportion based on strata
  def proportion(field, v = 1)
    @ms.sum_field(field) do |s_name, vector|
      stratum_ponderation(s_name) * vector.proportion(v)
    end
  end

  # Stratum ponderation.
  # Symbol: W\<sub>h\</sub>
  def stratum_ponderation(h)
    @strata_sizes[h].to_f / @population_size
  end
  alias_method :wh, :stratum_ponderation

  # Population mean based on strata
  def mean(field)
    @ms.sum_field(field) do |s_name, vector|
      stratum_ponderation(s_name) * vector.mean
    end
  end

  # Standard error with estimated population variance and without replacement.
  # Source: Cochran (1972)
  def standard_error_wor(field)
    es = @ms.collect_vector(field) do |s_n, vector|
      { 'N' => @strata_sizes[s_n], 'n' => vector.size, 's' => vector.sds }
    end
    StratifiedSample.standard_error_esd_wor(es)
  end

  # Standard error with estimated population variance and without replacement.
  # Source: http://stattrek.com/Lesson6/STRAnalysis.aspx
  def standard_error_wor_2(field)
    sum = @ms.sum_field(field) do |s_name, vector|
      s_size = @strata_sizes[s_name]
      (s_size**2 * (1 - (vector.size.to_f / s_size)) * vector.variance_sample / vector.size.to_f)
    end
    (1 / @population_size.to_f) * Math.sqrt(sum)
  end

  # Standard error with replacement.
  def standard_error_wr(field)
    es = @ms.collect_vector(field) do |s_n, vector|
      { 'N' => @strata_sizes[s_n], 'n' => vector.size, 's' => vector.sds }
    end
    StratifiedSample.standard_error_esd_wr(es)
  end

  # Standard deviation of the proportion of value +v+, without replacement.
  def proportion_sd_esd_wor(field, v = 1)
    es = @ms.collect_vector(field) do |s_n, vector|
      { 'N' => @strata_sizes[s_n], 'n' => vector.size, 'p' => vector.proportion(v) }
    end
    StratifiedSample.proportion_sd_esd_wor(es)
  end

  # Standard error of the proportion of value +v+, using the overall
  # stratified proportion for every stratum.
  def proportion_standard_error(field, v = 1)
    prop = proportion(field, v)
    sum = @ms.sum_field(field) do |s_name, vector|
      nh = vector.size
      s_size = @strata_sizes[s_name]
      # nh.to_f: integer division would make the sampling fraction 0
      (s_size**2 * (1 - (nh.to_f / s_size)) * prop * (1 - prop) / (nh - 1))
    end
    (1 / @population_size.to_f) * Math.sqrt(sum)
  end

  # Variance of the stratified proportion of value +v+.
  # Cochran(1971), p. 150
  def variance_pst(field, v = 1)
    sum = @ms.datasets.inject(0) do |a, (stratum_name, ds)|
      nh = ds.cases.to_f
      s_size = @strata_sizes[stratum_name]
      prop = ds[field].proportion(v)
      a + (((s_size**2 * (s_size - nh)) / (s_size - 1)) * (prop * (1 - prop) / (nh - 1)))
    end
    (1 / (@population_size.to_f**2)) * sum
  end
end
281
+ end