statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,232 @@
1
+ require 'statsample/graph/svggraph'
2
+
3
+ module Statsample
4
# Builds an HTML report out of independent fragments ("partials").
#
# Every add_* method appends one or more fragments to the report and
# registers an anchor, so that #parse can render an index pointing to each
# of them. Graphs are written as SVG files inside the report directory and
# referenced from the HTML with <embed> tags.
class HtmlReport
  # Creates an empty report.
  #
  # name:: Title of the report. Also used to build the default directory.
  # dir::  Directory where the SVG files are written. Defaults to
  #        "<name>/". It is created when it does not exist yet.
  def initialize(name,dir=nil)
    require 'fileutils'
    @uniq=1
    @uniq_file=0
    @name=name
    @partials=[]
    @anchors=[]
    @dir=dir || @name+"/"
    @level=1
    # File.exists? was removed in Ruby 3.2; File.exist? is available on
    # every Ruby version.
    FileUtils.mkdir(@dir) unless File.exist?(@dir)
  end

  # Adds an arbitrary HTML fragment (+summary+) indexed under +name+.
  def add_summary(name,summary)
    add_anchor(name)
    @partials.push(summary)
  end

  # Registers an index entry called +name+ at the current nesting level
  # and appends the matching <a name='...'> target to the report body.
  def add_anchor(name)
    @anchors.push([name,@level,@uniq])
    @partials.push("<a name='#{@uniq}'> </a>")
    @uniq+=1
  end

  # Returns a new base name (without extension) for a generated file,
  # built from +prepend+, a per-report counter and the current time.
  def uniq_file(prepend="file")
    @uniq_file+=1
    "#{prepend}_#{@uniq_file}_#{Time.now.to_i}"
  end

  # Adds a table with the correlation matrix of dataset +ds+.
  # Each cell shows the coefficient and, below it, its probability.
  # Coefficients with probability < 0.05 are marked with "*", those with
  # probability < 0.01 with "**"; marked cells are rendered in bold.
  def add_correlation_matrix(ds)
    add_anchor("Correlation Matrix")
    html="<h2>Correlation Matrix</h2> <table><thead><th>-</th><th>"+ds.fields.join("</th><th>")+"</th> </thead> <tbody>"
    matrix=Statsample::Bivariate.correlation_matrix(ds)
    pmatrix=Statsample::Bivariate.correlation_probability_matrix(ds)
    matrix.row_size.times {|row|
      html << "<tr><td>#{ds.fields[row]}</td>"
      matrix.column_size.times {|col|
        r=matrix[row,col]
        html << (r.nil? ? "<td>--</td>" : correlation_cell(r, pmatrix[row,col]))
      }
      html << "</tr>"
    }
    html << "</tbody></table>"
    @partials.push(html)
  end

  # Adds a scale: the item analysis summary for +fields+ of +ds+, followed
  # (one index level deeper) by a histogram, a run-sequence plot and a
  # normal probability plot of the per-case mean of those fields.
  #
  # ds::     dataset that contains the items
  # name::   name of the scale
  # fields:: fields of +ds+ that belong to the scale
  # icc::    when true, item characteristic curves are added as well
  #
  # Raises RuntimeError when +fields+ is empty.
  def add_scale(ds,name, fields,icc=false)
    raise "Fields are empty" if fields.size==0
    add_anchor("Scale:#{name}")

    ds_partial=ds.dup(fields)
    ia=Statsample::Reliability::ItemAnalysis.new(ds_partial)
    @partials.push("<h2>Scale: #{name}</h2>" + ia.html_summary)
    @level+=1
    v=ds_partial.vector_mean
    add_histogram(name, v)
    add_runsequence_plot(name, v)
    add_normalprobability_plot(name,v)
    # add_icc needs the dataset too; the former two-argument call raised
    # ArgumentError whenever icc was requested.
    add_icc(name,ds,fields) if icc
    @level-=1
  end

  # Adds the box plot returned by vector.svggraph_boxplot(options).
  def add_boxplot(name,vector,options={})
    add_graph("Box Plot #{name}", name, vector.svggraph_boxplot(options))
  end

  # Writes +graph+ (an object answering to width, height and burn) to a
  # new SVG file inside the report directory and embeds it under a
  # heading called +name+. +id+ is accepted for interface compatibility
  # and is not used.
  def add_graph(name,id,graph)
    add_anchor(name)
    rs_file=File.join(@dir, "#{uniq_file()}.svg")
    html = "<h3>#{name}</h3> <p><embed src='#{rs_file}' width='#{graph.width}' height='#{graph.height}' type='image/svg+xml' /></p>\n"
    File.open(rs_file, "w") {|f| f.puts(graph.burn) }
    @partials.push(html)
  end

  # Adds the graph returned by vector.svggraph_runsequence_plot(options).
  def add_runsequence_plot(name, vector,options={})
    add_graph("Run-Sequence Plot #{name}", name, vector.svggraph_runsequence_plot(options))
  end

  # Adds the graph returned by vector.svggraph_lag_plot(options).
  def add_lag_plot(name,vector, options={})
    add_graph("Lag Plot #{name}", name,vector.svggraph_lag_plot(options))
  end

  # Adds the graph returned by
  # vector.svggraph_normalprobability_plot(options).
  def add_normalprobability_plot(name,vector,options={})
    add_graph("Normal Probability Plot #{name}", name, vector.svggraph_normalprobability_plot(options))
  end

  # Adds a scatterplot of +y_fields+ against +x_field+.
  #
  # x_field::  defaults to the first field of +ds+
  # y_fields:: defaults to every field of +ds+ except +x_field+
  # config::   passed as is to Statsample::Graph::SvgScatterplot.new
  def add_scatterplot(name, ds,x_field=nil, y_fields=nil,config={})
    add_anchor("Scatterplot: #{name}")
    x_field||=ds.fields[0]
    y_fields||=ds.fields-[x_field]
    ds_partial=ds.dup([x_field]+y_fields)
    sc=Statsample::Graph::SvgScatterplot.new(ds_partial, config)
    sc.parse
    sc_file=File.join(@dir, "#{uniq_file("sc")}.svg")
    html = "<h3>Scatterplot #{name}</h3> <p><embed src='#{sc_file}' width='#{sc.width}' height='#{sc.height}' type='image/svg+xml' /></p>\n"
    File.open(sc_file, "w") {|f| f.puts(sc.burn) }
    @partials.push(html)
  end

  # Adds one graph with a box plot for every field of +ds+ and returns
  # the graph object. +options+ override the default title and height.
  def add_boxplots(name, ds,options={})
    add_anchor("Boxplots: #{name}")
    options={:graph_title=>"Boxplots:#{name}", :show_graph_title=>true, :height=>500}.merge(options)
    graph = Statsample::Graph::SvgBoxplot.new(options)
    ds.fields.each{|f|
      graph.add_data(:title=>f, :data=>ds[f].valid_data, :vector=>ds[f])
    }
    add_graph(name,name,graph)
    graph
  end

  # Adds a histogram of +vector+ followed by its skewness and kurtosis.
  #
  # bins:: number of bins. Defaults to one bin per 15 cases. The value is
  #        always kept between 1 and 15.
  def add_histogram(name,vector,bins=nil,options={})
    bins||=vector.size / 15
    bins=15 if bins>15
    # Vectors with fewer than 15 cases would otherwise ask for 0 bins.
    bins=1 if bins<1
    graph=vector.svggraph_histogram(bins,options)
    add_graph("Histogram:#{name}",name,graph)
    html = "<ul><li>Skewness=#{sprintf("%0.3f",vector.skew)}</li>
<li>Kurtosis=#{sprintf("%0.3f",vector.kurtosis)}</li></ul>"
    @partials.push(html)
  end

  # Adds the item characteristic curve of every field in +fields+.
  # The SVG files are produced by
  # ItemAnalysis#svggraph_item_characteristic_curve, which receives the
  # report directory and +name+; this method embeds one file per field,
  # expected at "<dir>/<name>_<field>.svg".
  #
  # Raises RuntimeError when +fields+ is empty.
  def add_icc(name,ds, fields)
    raise "Fields are empty" if fields.size==0
    add_anchor("ICC:#{name}")
    ds_partial=ds.dup(fields)
    ia=Statsample::Reliability::ItemAnalysis.new(ds_partial)
    html="<h3>ICC for scale: #{name}</h3>"
    ia.svggraph_item_characteristic_curve(@dir ,name, {:width=>400,:height=>300})
    ds_partial.fields.sort.each{|f|
      html << "<div><p><strong>#{f}</strong></p><embed src='#{@dir}/#{name}_#{f}.svg' width='400' height='300' type='image/svg+xml' /></div>\n"
    }
    @partials.push(html)
  end

  # Stylesheet embedded in the <head> of the report.
  def css
    <<-HERE
table {
border-collapse: collapse;
}
th {
text-align: left;
padding-right: 1em;
border-bottom: 3px solid #ccc;
}
th.active img {
display: inline;
}
tr.even, tr.odd {
background-color: #eee;
border-bottom: 1px solid #ccc;
}
tr.even, tr.odd {
padding: 0.1em 0.6em;
}
td.active {
background-color: #ddd;
}
table td {
border:1px solid #aaa;
}
table tr.line td{
border-top: 2px solid black;
}

    HERE
  end

  # Returns the <ul> / </ul> tags needed to move the index from the
  # current nesting level (@c_level, maintained by #parse) to +level+.
  def create_uls(level)
    diff=level-@c_level
    if diff>0
      "<ul>\n" * diff
    elsif diff<0
      "</ul>\n" * (-diff)
    else
      ""
    end
  end

  # Returns the complete HTML document: title, index of anchors (when
  # there is at least one) and every partial wrapped in a
  # <div class='section'>.
  def parse
    html="<html><head><title>#{@name}</title><style>#{css()}</style></head><body><h1>Report: #{@name}</h1>"
    if @anchors.size>0
      html << "<div class='index'>Index</div><ul>"
      @c_level=1
      @anchors.each{|name,level,uniq|
        html << create_uls(level)
        @c_level=level
        html << "<li><a href='##{uniq}'>#{name}</a></li>"
      }
      html << create_uls(1)
      # The index <div> is already closed right after its title, so only
      # the list is closed here (a second </div> was unmatched).
      html << "</ul>"
    end
    html << "<div class='section'>" << @partials.join("</div><div class='section'>") << "</div>"
    html << "</body></html>"
    html
  end

  # Writes the document returned by #parse to +filename+.
  def save(filename)
    File.open(filename,"w") {|fp| fp.write(parse) }
  end

  private

  # Renders one non-empty cell of the correlation matrix.
  # r::    correlation coefficient
  # prob:: probability of the coefficient, or nil when not available
  def correlation_cell(r, prob)
    sig=""
    prob_out=""
    unless prob.nil?
      prob_out=sprintf("%0.3f",prob)
      if prob<0.01
        sig="**"
      elsif prob<0.05
        sig="*"
      end
    end
    content="#{sprintf("%0.3f",r)} #{sig}<br /> #{prob_out}"
    sig=="" ? "<td>#{content}</td>" : "<td><strong>#{content}</strong></td>"
  end
end
232
+ end
@@ -0,0 +1,281 @@
1
+ module Statsample
2
# Multiset joins multiple datasets with the same fields and vectors
# but with different number of cases.
# This is the base class for stratified and cluster sampling estimation.
class Multiset
  # fields::   list of fields every dataset must have
  # datasets:: Hash of datasets, indexed by the key given to #add_dataset
  attr_reader :fields, :datasets

  # To create a multiset
  # * Multiset.new(%w{f1 f2 f3}) # define only fields
  def initialize(fields)
    @fields=fields
    @datasets={}
  end

  # Creates a Multiset with one Dataset.new(fields) for every name in
  # +ds_names+.
  def self.new_empty_vectors(fields,ds_names)
    ms=Multiset.new(fields)
    ds_names.each{|d| ms.add_dataset(d,Dataset.new(fields)) }
    ms
  end

  # Sorted keys of the stored datasets.
  def datasets_names
    @datasets.keys.sort
  end

  # Number of stored datasets.
  def n_datasets
    @datasets.size
  end

  # Stores dataset +ds+ under +key+, replacing any dataset previously
  # stored with that key.
  #
  # Raises ArgumentError when the fields of +ds+ differ from the fields
  # of the multiset.
  def add_dataset(key,ds)
    if ds.fields!=@fields
      # inspect keeps both lists readable and formatted the same way.
      raise ArgumentError, "Dataset (#{ds.fields.inspect}) must have the same fields as the Multiset (#{@fields.inspect})"
    end
    @datasets[key]=ds
  end

  # Yields the key of every dataset and its vector for +field+, and
  # returns the sum of the values returned by the block.
  def sum_field(field)
    @datasets.inject(0) {|acc,(stratum_name,ds)|
      acc+yield(stratum_name,ds[field])
    }
  end

  # Yields the key of every dataset and its vector for +field+, and
  # returns an array with the values returned by the block.
  def collect_vector(field)
    @datasets.collect {|k,v| yield k, v[field] }
  end

  # Dataset stored under key +i+, or nil when there is none.
  def[](i)
    @datasets[i]
  end
end
50
# Estimation for a stratified sample.
#
# An instance joins a Multiset (one dataset per stratum) with the
# population size of every stratum. The class methods work on +es+, an
# array with one Hash per stratum, using these keys:
# 'N':: population size of the stratum
# 'n':: sample size of the stratum
# 's':: standard deviation of the stratum (mean estimators)
# 'p':: proportion of the stratum (proportion estimators)
#
# Suffixes of the class methods: +wr+ / +wor+ stand for with / without
# replacement and +esd+ for estimated population variance (see
# #standard_error_wor). +ksd+ presumably stands for known standard
# deviation -- TODO confirm.
class StratifiedSample
  class << self
    # Mean of all the cases of an array of vectors.
    def mean(*v)
      n_total=0
      total=v.inject(0){|acc,vec|
        n_total+=vec.size
        acc+vec.sum
      }
      total.to_f/n_total
    end

    # (1/N) * sqrt( sum( N_h^2 * s_h^2 / n_h ) )
    def standard_error_ksd_wr(es)
      sum=es.inject(0){|a,h|
        a+((h['N']**2 * h['s']**2) / h['n'].to_f)
      }
      (1.to_f / total_n(es))*Math.sqrt(sum)
    end

    # Square of standard_error_ksd_wr.
    def variance_ksd_wr(es)
      standard_error_ksd_wr(es)**2
    end

    # sum( (N_h/N)^2 * (s_h^2 / n_h) * (1 - n_h/N_h) )
    # Source : Cochran (1972)
    def variance_ksd_wor(es)
      n_total=total_n(es)
      es.inject(0){|a,h|
        a+((h['N'].to_f / n_total)**2) * (h['s']**2 / h['n'].to_f) * (1 - (h['n'].to_f / h['N']))
      }
    end

    # Square root of variance_ksd_wor.
    def standard_error_ksd_wor(es)
      Math.sqrt(variance_ksd_wor(es))
    end

    # (1/N^2) * sum( N_h * (N_h - n_h) * s_h^2 / n_h )
    def variance_esd_wor(es)
      sum=es.inject(0){|a,h|
        a+h['N']*(h['N']-h['n'])*(h['s']**2 / h['n'].to_f)
      }
      (1.0/(total_n(es)**2))*sum
    end

    # Square root of variance_esd_wor. That formula is algebraically the
    # same as variance_ksd_wor, which this method used to call.
    def standard_error_esd_wor(es)
      Math.sqrt(variance_esd_wor(es))
    end

    # (1/N^2) * sum( s_h^2 * N_h^2 / n_h )
    # Based on http://stattrek.com/Lesson6/STRAnalysis.aspx
    def variance_esd_wr(es)
      sum=es.inject(0){|a,h|
        a+((h['s']**2 * h['N']**2) / h['n'].to_f)
      }
      (1.0/(total_n(es)**2))*sum
    end

    # Square root of variance_esd_wr.
    def standard_error_esd_wr(es)
      Math.sqrt(variance_esd_wr(es))
    end

    # sum( (N_h/N)^2 * p_h*(1-p_h) / n_h * (1 - n_h/N_h) )
    def proportion_variance_ksd_wor(es)
      n_total=total_n(es)
      es.inject(0){|a,h|
        a+(((h['N'].to_f / n_total)**2 * h['p']*(1-h['p'])) / (h['n'])) * (1- (h['n'].to_f / h['N']))
      }
    end

    # Square root of proportion_variance_ksd_wor.
    def proportion_sd_ksd_wor(es)
      Math.sqrt(proportion_variance_ksd_wor(es))
    end

    # (1/N) * sqrt( sum( N_h^2 * p_h*(1-p_h) / n_h ) )
    def proportion_sd_ksd_wr(es)
      sum=es.inject(0){|a,h|
        a+(h['N']**2 * h['p']*(1-h['p'])) / h['n'].to_f
      }
      Math.sqrt(sum) * (1.0/total_n(es))
    end

    # Square of proportion_sd_ksd_wr, in the same way variance_ksd_wr is
    # the square of standard_error_ksd_wr. (It used to square the
    # variance *without* replacement.)
    def proportion_variance_ksd_wr(es)
      proportion_sd_ksd_wr(es)**2
    end

    # (1/N^2) * sum( N_h^2 * (N_h-n_h)/(N_h-1) * p_h*(1-p_h)/(n_h-1) )
    #
    # Same expression as StratifiedSample#variance_pst. The previous
    # body referenced an undefined local and always raised NameError; it
    # also applied a square root, which does not belong in a variance.
    def proportion_variance_esd_wor(es)
      sum=es.inject(0){|a,h|
        a+(h['N']**2 * (h['N']-h['n']) * h['p']*(1.0-h['p'])) / ((h['n']-1)*(h['N']-1))
      }
      (1.0/total_n(es)**2)*sum
    end

    # Square root of proportion_variance_ksd_wor.
    # NOTE(review): the name suggests proportion_variance_esd_wor should
    # be used here; kept as it was because switching changes the values
    # callers currently get -- confirm the intended estimator.
    def proportion_sd_esd_wor(es)
      Math.sqrt(proportion_variance_ksd_wor(es))
    end

    private

    # Population size: sum of the 'N' of every stratum.
    def total_n(es)
      es.inject(0) {|a,h| a+h['N'] }
    end
  end

  # ms::           Multiset with one dataset per stratum
  # strata_sizes:: Hash with the population size of every stratum,
  #                indexed by the same keys used on the Multiset
  #
  # Raises TypeError when +ms+ is not a Multiset and ArgumentError when
  # the keys of +strata_sizes+ differ from the names of the datasets.
  def initialize(ms,strata_sizes)
    raise TypeError,"ms should be a Multiset" unless ms.is_a? Statsample::Multiset
    @ms=ms
    raise ArgumentError,"You should put a strata size for each dataset" if strata_sizes.keys.sort!=ms.datasets_names
    @strata_sizes=strata_sizes
    @population_size=@strata_sizes.inject(0) {|a,x| a+x[1]}
    @strata_number=@ms.n_datasets
    @sample_size=@ms.datasets.inject(0) {|a,x| a+x[1].cases}
  end

  # Number of strata
  def strata_number
    @strata_number
  end

  # Population size. Equal to sum of strata sizes
  # Symbol: N
  def population_size
    @population_size
  end

  # Sample size. Equal to sum of sample of each stratum
  def sample_size
    @sample_size
  end

  # Size of stratum h
  # Symbol: N<sub>h</sub>
  def stratum_size(h)
    @strata_sizes[h]
  end

  # Array with the vector of +field+ on every stratum.
  def vectors_by_field(field)
    @ms.datasets.collect{|k,ds| ds[field] }
  end

  # Population proportion based on strata: sum of the proportion of
  # value +v+ on every stratum, weighted by its ponderation.
  def proportion(field, v=1)
    @ms.sum_field(field) {|s_name,vector|
      stratum_ponderation(s_name)*vector.proportion(v)
    }
  end

  # Stratum ponderation: size of stratum +h+ divided by population size.
  # Symbol: W\<sub>h\</sub>
  def stratum_ponderation(h)
    @strata_sizes[h].to_f / @population_size
  end
  alias_method :wh, :stratum_ponderation

  # Population mean based on strata
  def mean(field)
    @ms.sum_field(field) {|s_name,vector|
      stratum_ponderation(s_name)*vector.mean
    }
  end

  # Standard error with estimated population variance and without replacement.
  # Source: Cochran (1972)
  def standard_error_wor(field)
    es=@ms.collect_vector(field) {|s_n, vector|
      {'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
    }
    StratifiedSample.standard_error_esd_wor(es)
  end

  # Standard error with estimated population variance and without replacement.
  # Source: http://stattrek.com/Lesson6/STRAnalysis.aspx
  def standard_error_wor_2(field)
    sum=@ms.sum_field(field) {|s_name,vector|
      s_size=@strata_sizes[s_name]
      (s_size**2 * (1-(vector.size.to_f / s_size)) * vector.variance_sample / vector.size.to_f)
    }
    (1/@population_size.to_f)*Math.sqrt(sum)
  end

  # Standard error with replacement, computed with
  # StratifiedSample.standard_error_esd_wr.
  def standard_error_wr(field)
    es=@ms.collect_vector(field) {|s_n, vector|
      {'N'=>@strata_sizes[s_n],'n'=>vector.size, 's'=>vector.sds}
    }
    StratifiedSample.standard_error_esd_wr(es)
  end

  # Standard deviation of the proportion of value +v+, computed with
  # StratifiedSample.proportion_sd_esd_wor.
  def proportion_sd_esd_wor(field,v=1)
    es=@ms.collect_vector(field) {|s_n, vector|
      {'N'=>@strata_sizes[s_n],'n'=>vector.size, 'p'=>vector.proportion(v)}
    }
    StratifiedSample.proportion_sd_esd_wor(es)
  end

  # (1/N) * sqrt( sum( N_h^2 * (1 - n_h/N_h) * p*(1-p) / (n_h-1) ) ),
  # where p is the population proportion returned by #proportion.
  def proportion_standard_error(field,v=1)
    prop=proportion(field,v)
    sum=@ms.sum_field(field) {|s_name,vector|
      nh=vector.size
      s_size=@strata_sizes[s_name]
      # to_f is required: with integer sizes nh/s_size is 0 and the
      # finite population correction would be lost.
      (s_size**2 * (1-(nh.to_f/s_size)) * prop * (1-prop) / (nh -1 ))
    }
    (1/@population_size.to_f) * Math.sqrt(sum)
  end

  # (1/N^2) * sum( N_h^2 * (N_h-n_h)/(N_h-1) * p_h*(1-p_h)/(n_h-1) )
  # Cochran(1971), p. 150
  def variance_pst(field,v=1)
    sum=@ms.datasets.inject(0) {|a,(stratum_name,ds)|
      nh=ds.cases.to_f
      s_size=@strata_sizes[stratum_name]
      prop=ds[field].proportion(v)
      a + (((s_size**2 * (s_size-nh)) / (s_size-1))*(prop*(1-prop) / (nh-1)))
    }
    (1/@population_size.to_f ** 2)*sum
  end
end
+ end
281
+ end