statsample 0.3.0

Files changed (59)
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
data/lib/statsample/anova.rb
@@ -0,0 +1,74 @@
+ module Statsample
+   module Anova
+     # One-way ANOVA.
+     # Example:
+     #   v1=[2,3,4,5,6].to_vector(:scale)
+     #   v2=[3,3,4,5,6].to_vector(:scale)
+     #   v3=[5,3,1,5,6].to_vector(:scale)
+     #   anova=Statsample::Anova::OneWay.new([v1,v2,v3])
+     #   puts anova.f
+     #   puts anova.significance
+     class OneWay
+       def initialize(vectors)
+         @vectors=vectors
+       end
+       # Total sum
+       def sum
+         @vectors.inject(0){|a,v| a+v.sum}
+       end
+       # Total mean
+       def mean
+         sum.quo(n)
+       end
+       # Total sum of squares
+       def sst
+         m=mean.to_f
+         @vectors.inject(0) {|total,vector|
+           total+vector.sum_of_squares(m)
+         }
+       end
+       # Sum of squares within groups
+       def sswg
+         @vectors.inject(0) {|total,vector|
+           total+vector.sum_of_squares
+         }
+       end
+       # Sum of squares between groups
+       def ssbg
+         m=mean
+         @vectors.inject(0) {|total,vector|
+           total+(vector.mean-m).square*vector.size
+         }
+       end
+       # Degrees of freedom within groups
+       def df_wg
+         @vectors.inject(0) {|a,v| a+(v.size-1)}
+       end
+       # Degrees of freedom between groups
+       def df_bg
+         @vectors.size-1
+       end
+       # Total degrees of freedom
+       def df_total
+         n-1
+       end
+       # Total number of cases
+       def n
+         @vectors.inject(0){|a,v| a+v.size}
+       end
+       # F statistic: (ssbg/df_bg) / (sswg/df_wg)
+       def f
+         k=@vectors.size
+         (ssbg*(n-k)) / (sswg*(k-1))
+       end
+       # Significance (p value) of the F statistic
+       def significance
+         if HAS_GSL
+           GSL::Cdf.fdist_Q(f,df_bg,df_wg)
+         else
+           raise "Need Ruby/GSL"
+         end
+       end
+     end
+   end
+ end
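A quick check of the class above, as a sketch assuming statsample 0.3.0 and Ruby/GSL are installed; the expected values are computed by hand from the formulas above (grand mean 61/15, sswg = 10 + 6.8 + 16):

  require 'statsample'

  v1=[2,3,4,5,6].to_vector(:scale)
  v2=[3,3,4,5,6].to_vector(:scale)
  v3=[5,3,1,5,6].to_vector(:scale)
  anova=Statsample::Anova::OneWay.new([v1,v2,v3])
  anova.df_bg        # => 2  (k-1, with k=3 groups)
  anova.df_wg        # => 12 (n-k, with n=15 cases)
  anova.f            # => ~0.0244 = (0.1333/2) / (32.8/12)
  anova.significance # => ~0.976, via GSL::Cdf.fdist_Q

The group means here (4, 4.2, 4) are nearly identical, so the tiny F and large p are exactly what the formulas should produce.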
data/lib/statsample/bivariate.rb
@@ -0,0 +1,255 @@
+ module Statsample
+   # Diverse correlation methods
+   module Bivariate
+     class << self
+       # Covariance between two vectors
+       def covariance(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         return nil if v1a.size==0
+         if HAS_GSL
+           GSL::Stats::covariance(v1a.gsl, v2a.gsl)
+         else
+           covariance_slow(v1a,v2a)
+         end
+       end
+       # Covariance. The denominator is n-1
+       def covariance_slow(v1a,v2a)
+         t=0
+         m1=v1a.mean
+         m2=v2a.mean
+         (0...v1a.size).each {|i|
+           t+=((v1a[i]-m1)*(v2a[i]-m2))
+         }
+         t.to_f / (v1a.size-1)
+       end
+       # Calculate the Pearson correlation coefficient between 2 vectors
+       def pearson(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         return nil if v1a.size==0
+         if HAS_GSL
+           GSL::Stats::correlation(v1a.gsl, v2a.gsl)
+         else
+           pearson_slow(v1a,v2a)
+         end
+       end
+       #:nodoc:
+       def pearson_slow(v1a,v2a)
+         v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
+         t=0
+         siz=v1s.size
+         (0...siz).each {|i| t+=(v1s[i]*v2s[i]) }
+         t.to_f/siz
+       end
+       # Retrieves the value of the t test for a Pearson correlation
+       # between two vectors, testing the null hypothesis of r=0
+       def t_pearson(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         r=pearson(v1a,v2a)
+         if(r==1.0)
+           0
+         else
+           t_r(r,v1a.size)
+         end
+       end
+       # Retrieves the value of the t test for a Pearson correlation,
+       # given r and the vector size
+       def t_r(r,size)
+         r*Math::sqrt((size-2).to_f / (1 - r**2))
+       end
+       # Retrieves the probability value (a la SPSS)
+       # for a given t, size and number of tails
+       def prop_pearson(t,size, tails=2)
+         if HAS_GSL
+           t=-t if t>0
+           cdf=GSL::Cdf::tdist_P(t,size-2)
+           cdf*tails
+         else
+           raise "Needs ruby-gsl"
+         end
+       end
+       # Returns residual scores after removing the variance
+       # explained by another variable
+       #
+       def residuals(from,del)
+         r=Statsample::Bivariate.pearson(from,del)
+         froms, dels = from.vector_standarized, del.vector_standarized
+         nv=[]
+         froms.data_with_nils.each_index{|i|
+           if froms[i].nil? or dels[i].nil?
+             nv.push(nil)
+           else
+             nv.push(froms[i]-r*dels[i])
+           end
+         }
+         nv.to_vector(:scale)
+       end
+       def partial_correlation(v1,v2,control)
+         v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
+         rv1v2=pearson(v1a,v2a)
+         rv1con=pearson(v1a,cona)
+         rv2con=pearson(v2a,cona)
+
+         (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
+
+       end
+       # Covariance matrix
+       def covariance_matrix(ds)
+         ds.collect_matrix do |row,col|
+           if (ds[row].type!=:scale or ds[col].type!=:scale)
+             nil
+           else
+             covariance(ds[row],ds[col])
+           end
+         end
+       end
+
+       # The classic correlation matrix for all fields of a dataset
+
+       def correlation_matrix(ds)
+         ds.collect_matrix {|row,col|
+           if row==col
+             1.0
+           elsif (ds[row].type!=:scale or ds[col].type!=:scale)
+             nil
+           else
+             pearson(ds[row],ds[col])
+           end
+         }
+       end
+       # Retrieves the matrix of valid pairwise n
+       def n_valid_matrix(ds)
+         ds.collect_matrix {|row,col|
+           if row==col
+             ds[row].valid_data.size
+           else
+             rowa,rowb=Statsample.only_valid(ds[row],ds[col])
+             rowa.size
+           end
+         }
+       end
+       def correlation_probability_matrix(ds)
+         rows=ds.fields.collect{|row|
+           ds.fields.collect{|col|
+             v1a,v2a=Statsample.only_valid(ds[row],ds[col])
+             (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size)
+           }
+         }
+         Matrix.rows(rows)
+       end
+       # Calculate the Spearman correlation coefficient between 2 vectors
+       def spearman(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+         pearson(v1r,v2r)
+       end
+       # Calculate the point-biserial correlation.
+       # Equal to the Pearson correlation, with one dichotomous value replaced
+       # by "0" and the other by "1"
+       def point_biserial(dichotomous,continous)
+         ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
+         raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
+         raise(TypeError, "Second vector should be continuous") if ds['c'].type!=:scale
+         f0=ds['d'].factors.sort[0]
+         m0=ds.filter_field('c') {|c| c['d']==f0}
+         m1=ds.filter_field('c') {|c| c['d']!=f0}
+         ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
+       end
+       # Kendall rank correlation coefficient (tau a).
+       #
+       # Based on Hervé Abdi's article
+       def tau_a(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         n=v1a.size
+         v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+         o1=ordered_pairs(v1r)
+         o2=ordered_pairs(v2r)
+         delta=o1.size*2-(o2 & o1).size*2
+         1-(delta * 2 / (n*(n-1)).to_f)
+       end
+       # Calculates tau-b correlation.
+       #
+       # Tau-b defines perfect association as strict monotonicity.
+       # Although it requires strict monotonicity to reach 1.0,
+       # it does not penalize ties as much as some other measures.
+       #
+       # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+       def tau_b(matrix)
+         v=pairs(matrix)
+         ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
+       end
+       # Calculates Goodman and Kruskal's gamma.
+       #
+       # Gamma is the surplus of concordant pairs over discordant pairs,
+       # as a percentage of all pairs, ignoring ties.
+       #
+       # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+       def gamma(matrix)
+         v=pairs(matrix)
+         (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
+       end
+       # Calculates pair counts (concordant, discordant and ties) for a matrix.
+       # The rows and cols have to be ordered
+       def pairs(matrix)
+         # concordant pairs: cells strictly below and to the right,
+         # weighted by cell frequencies
+         rs=matrix.row_size
+         cs=matrix.column_size
+         conc=disc=ties_x=ties_y=0
+         (0...(rs-1)).each {|x|
+           (0...(cs-1)).each{|y|
+             ((x+1)...rs).each{|x2|
+               ((y+1)...cs).each{|y2|
+                 # (x2,y2) is down-right of (x,y): concordant
+                 conc+=matrix[x,y]*matrix[x2,y2]
+               }
+             }
+           }
+         }
+         (0...(rs-1)).each {|x|
+           (1...cs).each{|y|
+             ((x+1)...rs).each{|x2|
+               (0...y).each{|y2|
+                 # (x2,y2) is down-left of (x,y): discordant
+                 disc+=matrix[x,y]*matrix[x2,y2]
+               }
+             }
+           }
+         }
+         (0...(rs-1)).each {|x|
+           (0...cs).each{|y|
+             ((x+1)...rs).each{|x2|
+               ties_x+=matrix[x,y]*matrix[x2,y]
+             }
+           }
+         }
+         (0...rs).each {|x|
+           (0...(cs-1)).each{|y|
+             ((y+1)...cs).each{|y2|
+               ties_y+=matrix[x,y]*matrix[x,y2]
+             }
+           }
+         }
+         {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
+       end
+       def ordered_pairs(vector)
+         d=vector.data
+         a=[]
+         (0...(d.size-1)).each{|i|
+           ((i+1)...d.size).each {|j|
+             a.push([d[i],d[j]])
+           }
+         }
+         a
+       end
+       def sum_of_codeviated(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         sum=0
+         (0...v1a.size).each{|i|
+           sum+=v1a[i]*v2a[i]
+         }
+         sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
+       end
+     end
+   end
+ end
+
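A brief sanity check of the core methods above; a sketch assuming statsample 0.3.0 is loaded (the fallbacks covariance_slow and pearson_slow return the same values when Ruby/GSL is absent):

  require 'statsample'

  x=[1,2,3,4,5].to_vector(:scale)
  y=[2,4,6,8,10].to_vector(:scale)
  # means are 3 and 6; the sum of cross-deviations is 20, so cov = 20/(5-1)
  Statsample::Bivariate.covariance(x,y)  # => 5.0
  Statsample::Bivariate.pearson(x,y)     # => 1.0 (y is an exact linear function of x)
  Statsample::Bivariate.spearman(x,y)    # => 1.0 (the rank vectors are identical)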
data/lib/statsample/chidistribution.rb
@@ -0,0 +1,39 @@
+ module Statsample
+   # Based on Babatunde, Iyiola & Eni ():
+   # "A Numerical Procedure for Computing Chi-Square Percentage Points"
+   #
+   module ChiDistribution
+     class << self
+       def steps(av, bv, itv)
+         ((bv.to_f - av.to_f) / itv.to_f).to_i
+       end
+       def loggamma(k)
+         c1 = 76.18009173
+         c2 = -86.50532033
+         c3 = 24.01409822
+         c4 = -1.231739516
+         c5 = 0.00120858
+         c6 = -0.000005364
+         c7 = 2.506628275
+         x1 = k - 1
+         ws = x1 + 5.5
+         ws = (x1 + 0.5) * Math::log(ws) - ws
+         s = 1 + c1 / (x1 + 1) + c2 / (x1 + 2) + c3 / (x1 + 3) + c4 / (x1 + 4) + c5 / (x1 + 5) + c6 / (x1 + 6)
+         ws + Math::log(c7 * s)
+       end
+       def f(x, k)
+         Math::exp(0.5 * k * Math::log(0.5 * x) - Math::log(x) - loggamma(0.5 * k) - 0.5 * x)
+       end
+       def cdf(b,k)
+         a = 0.001
+         b = b.to_f
+         if k==2
+           1 - Math::exp(-b / 2)
+         else
+           w = (b - a) / 28.to_f
+           2 * w / 45 * (7 * (f(a, k) + f(a + 28 * w, k)) + 12 * (f(a + 2 * w, k) + f(a + 6 * w, k) + f(a + 10 * w, k) + f(a + 14 * w, k) + f(a + 18 * w, k) + f(a + 22 * w, k) + f(a + 26 * w, k)) + 14 * (f(a + 4 * w, k) + f(a + 8 * w, k) + f(a + 12 * w, k) + f(a + 16 * w, k) + f(a + 20 * w, k) + f(a + 24 * w, k)) + 32 * (f(a + w, k) + f(a + 3 * w, k) + f(a + 5 * w, k) + f(a + 7 * w, k) + f(a + 9 * w, k) + f(a + 11 * w, k) + f(a + 13 * w, k) + f(a + 15 * w, k) + f(a + 17 * w, k) + f(a + 19 * w, k) + f(a + 21 * w, k) + f(a + 23 * w, k) + f(a + 25 * w, k) + f(a + 27 * w, k)))
+         end
+       end
+     end
+   end
+ end
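The long expression in cdf is a composite Boole's rule over 28 subintervals (weights 7, 32, 12, 32, 14, ... across seven four-interval panels), integrating the density f from a=0.001 up to b; k=2 short-circuits to the exact closed form 1 - e^(-b/2). A quick sketch checking it against two well-known chi-square critical values:

  require 'statsample'

  Statsample::ChiDistribution.cdf(3.84, 1)  # => ~0.95 (3.84 is the usual 5% critical value for 1 df)
  Statsample::ChiDistribution.cdf(5.99, 2)  # => ~0.95, via the closed form 1 - e^(-5.99/2)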
data/lib/statsample/codification.rb
@@ -0,0 +1,120 @@
+ require 'yaml'
+
+ module Statsample
+   # Codification
+   #
+   # This tool helps you code open-ended questions:
+   # * Load one or more vectors into the workflow to create a YAML file of values. If the data contain Statsample::SPLIT_TOKEN, each value will be split into two or more values
+   # * Edit the YAML file and replace the values with your codes. If you need two or more codes for an answer, use the separator (default Statsample::SPLIT_TOKEN)
+   # * Recode the vectors, loading the YAML file:
+   #   * The new vectors have the same name as the original, plus "_recoded"
+   #   * Alternatively, create as many vectors as there are values, as with add_vectors_by_split
+   #
+   # Usage:
+   #   recode_file="recodification.yaml"
+   #   phase=:first # flag
+   #   if phase==:first
+   #     File.open(recode_file,"w") {|fp|
+   #       Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
+   #     } # Edit the file recodification.yaml
+   #   elsif phase==:second
+   #     File.open(recode_file,"r") {|fp|
+   #       Statsample::Codification.verify(fp,['vector1'])
+   #     }
+   #   elsif phase==:third
+   #     File.open(recode_file,"r") {|fp|
+   #       Statsample::Codification.recode_dataset_split!(ds,fp,"*")
+   #     }
+   #   end
+   #
+   module Codification
+     class << self
+       # Create a YAML dump for a hash, based on vectors.
+       # The keys are the vector names on the dataset, and the values
+       # are hashes with keys == values, ready for recodification:
+       #
+       #   v1=%w{a,b b,c d}.to_vector
+       #   ds={"v1"=>v1}.to_dataset
+       #   Statsample::Codification.create_yaml(ds,['v1'])
+       #   => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
+       def create_yaml(dataset,vectors,sep=Statsample::SPLIT_TOKEN,io=nil)
+         raise ArgumentError,"Array shouldn't be empty" if vectors.size==0
+         pro_hash=vectors.inject({}){|h,v_name|
+           raise Exception, "Vector #{v_name} doesn't exist on Dataset" if !dataset.fields.include? v_name
+           v=dataset[v_name]
+           split_data=v.splitted(sep)
+           factors=split_data.flatten.uniq.compact.sort.inject({}) {|a,val| a[val]=val;a}
+           h[v_name]=factors
+           h
+         }
+         YAML.dump(pro_hash,io)
+       end
+       def inverse_hash(h,sep=Statsample::SPLIT_TOKEN)
+         h.inject({}) {|a,v|
+           v[1].split(sep).each {|val|
+             a[val]||=[]
+             a[val].push(v[0])
+           }
+           a
+         }
+       end
+       def dictionary(h,sep=Statsample::SPLIT_TOKEN)
+         h.inject({}) {|a,v|
+           a[v[0]]=v[1].split(sep)
+           a
+         }
+       end
+       def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
+         dict=dictionary(h,sep)
+         new_data=v.splitted(sep)
+         new_data.collect{|c|
+           if c.nil?
+             nil
+           else
+             c.collect{|value|
+               dict[value]
+             }.flatten.uniq
+           end
+         }
+       end
+       def recode_dataset_simple!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
+         _recode_dataset(dataset,yaml,sep,false)
+       end
+       def recode_dataset_split!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
+         _recode_dataset(dataset,yaml,sep,true)
+       end
+
+       def _recode_dataset(dataset,yaml,sep=Statsample::SPLIT_TOKEN,split=false)
+         h=YAML::load(yaml)
+         v_names=h.keys
+         v_names.each do |v_name|
+           raise Exception, "Vector #{v_name} doesn't exist on Dataset" if !dataset.fields.include? v_name
+           recoded=recode_vector(dataset[v_name],h[v_name],sep).collect { |c|
+             if c.nil?
+               nil
+             else
+               c.join(sep)
+             end
+           }.to_vector
+           if(split)
+             recoded.split_by_separator(sep).each {|k,v|
+               dataset[v_name+"_"+k]=v
+             }
+           else
+             dataset[v_name+"_recoded"]=recoded
+           end
+         end
+       end
+       def verify(yaml,v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
+         require 'pp'
+         h=YAML::load(yaml)
+         v_names||=h.keys
+         v_names.each{|v_name|
+           inverse=inverse_hash(h[v_name],sep)
+           io.puts "Vector: #{v_name}"
+           YAML.dump(inverse.sort,io)
+         }
+       end
+     end
+   end
+ end
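To see what the recoding helpers do with a hand-written hash, here is a small sketch; it assumes the default Statsample::SPLIT_TOKEN is a comma, as the create_yaml example above suggests (hash ordering may vary on Ruby 1.8):

  require 'statsample'

  h = {"dog"=>"animal,pet", "cat"=>"animal,pet", "rose"=>"plant"}
  Statsample::Codification.dictionary(h)
  # => {"dog"=>["animal","pet"], "cat"=>["animal","pet"], "rose"=>["plant"]}
  Statsample::Codification.inverse_hash(h)
  # => {"animal"=>["dog","cat"], "pet"=>["dog","cat"], "plant"=>["rose"]}

dictionary is what recode_vector consults when mapping each split answer to its codes; inverse_hash is what verify prints, so you can review which original answers fall under each code before recoding the dataset.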