statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
data/lib/statsample/anova.rb
@@ -0,0 +1,74 @@
+ module Statsample
+   module Anova
+     # One-way ANOVA.
+     # Example:
+     #   v1=[2,3,4,5,6].to_vector(:scale)
+     #   v2=[3,3,4,5,6].to_vector(:scale)
+     #   v3=[5,3,1,5,6].to_vector(:scale)
+     #   anova=Statsample::Anova::OneWay.new([v1,v2,v3])
+     #   puts anova.f
+     #   puts anova.significance
+     class OneWay
+       def initialize(vectors)
+         @vectors=vectors
+       end
+       # Total sum
+       def sum
+         @vectors.inject(0){|a,v| a+v.sum}
+       end
+       # Total mean
+       def mean
+         sum.quo(n)
+       end
+       # Total sum of squares
+       def sst
+         m=mean.to_f
+         @vectors.inject(0) {|total,vector|
+           total+vector.sum_of_squares(m)
+         }
+       end
+       # Sum of squares within groups
+       def sswg
+         @vectors.inject(0) {|total,vector|
+           total+vector.sum_of_squares
+         }
+       end
+       # Sum of squares between groups
+       def ssbg
+         m=mean
+         @vectors.inject(0) {|total,vector|
+           total+(vector.mean-m).square*vector.size
+         }
+       end
+       # Degrees of freedom within groups
+       def df_wg
+         @vectors.inject(0) {|a,v| a+(v.size-1)}
+       end
+       # Degrees of freedom between groups
+       def df_bg
+         @vectors.size-1
+       end
+       # Total degrees of freedom
+       def df_total
+         n-1
+       end
+       # Total number of cases
+       def n
+         @vectors.inject(0){|a,v| a+v.size}
+       end
+       # Fisher's F statistic: (ssbg/df_bg) / (sswg/df_wg)
+       def f
+         k=@vectors.size
+         (ssbg*(n-k)) / (sswg*(k-1))
+       end
+       # Significance (p value) of Fisher's F
+       def significance
+         if HAS_GSL
+           GSL::Cdf.fdist_Q(f,df_bg,df_wg)
+         else
+           raise "Need Ruby/GSL"
+         end
+       end
+     end
+   end
+ end
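
A minimal usage sketch for the OneWay class above (assuming the statsample gem is installed and, for #significance, Ruby/GSL is available; the data values come from the class's own example):

    require 'statsample'

    v1=[2,3,4,5,6].to_vector(:scale)
    v2=[3,3,4,5,6].to_vector(:scale)
    v3=[5,3,1,5,6].to_vector(:scale)
    anova=Statsample::Anova::OneWay.new([v1,v2,v3])
    # F is the ratio of between-group to within-group variance:
    # (ssbg/df_bg) / (sswg/df_wg) == ssbg*(n-k) / (sswg*(k-1))
    puts anova.f
    puts anova.significance   # raises "Need Ruby/GSL" without GSL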
data/lib/statsample/bivariate.rb
@@ -0,0 +1,255 @@
+ module Statsample
+   # Diverse correlation methods
+   module Bivariate
+     class << self
+       # Covariance between two vectors
+       def covariance(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         return nil if v1a.size==0
+         if HAS_GSL
+           GSL::Stats::covariance(v1a.gsl, v2a.gsl)
+         else
+           covariance_slow(v1a,v2a)
+         end
+       end
+       # Covariance. The denominator is n-1
+       def covariance_slow(v1a,v2a)
+         t=0
+         m1=v1a.mean
+         m2=v2a.mean
+         (0...v1a.size).each {|i|
+           t+=((v1a[i]-m1)*(v2a[i]-m2))
+         }
+         t.to_f / (v1a.size-1)
+       end
+       # Calculate Pearson correlation coefficient between 2 vectors
+       def pearson(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         return nil if v1a.size==0
+         if HAS_GSL
+           GSL::Stats::correlation(v1a.gsl, v2a.gsl)
+         else
+           pearson_slow(v1a,v2a)
+         end
+       end
+       #:nodoc:
+       def pearson_slow(v1a,v2a)
+         v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
+         t=0
+         (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
+         t.to_f/v2s.size
+       end
+       # Retrieves the value of the t test for a Pearson correlation
+       # between two vectors, testing the null hypothesis r=0
+       def t_pearson(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         r=pearson(v1a,v2a)
+         if(r==1.0)
+           0
+         else
+           t_r(r,v1a.size)
+         end
+       end
+       # Retrieves the value of the t test for a Pearson correlation,
+       # given r and the vector size
+       def t_r(r,size)
+         r*Math::sqrt((size-2).to_f / (1 - r**2))
+       end
+       # Retrieves the probability value (a la SPSS)
+       # for a given t, size and number of tails
+       def prop_pearson(t,size,tails=2)
+         if HAS_GSL
+           t=-t if t>0
+           cdf=GSL::Cdf::tdist_P(t,size-2)
+           cdf*tails
+         else
+           raise "Needs ruby-gsl"
+         end
+       end
+       # Returns residual scores after removing the variance
+       # explained by another variable
+       def residuals(from,del)
+         r=Statsample::Bivariate.pearson(from,del)
+         froms, dels = from.vector_standarized, del.vector_standarized
+         nv=[]
+         froms.data_with_nils.each_index{|i|
+           if froms[i].nil? or dels[i].nil?
+             nv.push(nil)
+           else
+             nv.push(froms[i]-r*dels[i])
+           end
+         }
+         nv.to_vector(:scale)
+       end
+       # Correlation between v1 and v2, controlling for a third variable
+       def partial_correlation(v1,v2,control)
+         v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
+         rv1v2=pearson(v1a,v2a)
+         rv1con=pearson(v1a,cona)
+         rv2con=pearson(v2a,cona)
+         (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
+       end
+       # Covariance matrix
+       def covariance_matrix(ds)
+         ds.collect_matrix do |row,col|
+           if (ds[row].type!=:scale or ds[col].type!=:scale)
+             nil
+           else
+             covariance(ds[row],ds[col])
+           end
+         end
+       end
+
+       # The classic correlation matrix for all fields of a dataset
+       def correlation_matrix(ds)
+         ds.collect_matrix {|row,col|
+           if row==col
+             1.0
+           elsif (ds[row].type!=:scale or ds[col].type!=:scale)
+             nil
+           else
+             pearson(ds[row],ds[col])
+           end
+         }
+       end
+       # Retrieves the number of valid (pairwise) cases for each cell
+       def n_valid_matrix(ds)
+         ds.collect_matrix {|row,col|
+           if row==col
+             ds[row].valid_data.size
+           else
+             rowa,rowb=Statsample.only_valid(ds[row],ds[col])
+             rowa.size
+           end
+         }
+       end
+       # Matrix of probability values for the correlation matrix
+       def correlation_probability_matrix(ds)
+         rows=ds.fields.collect{|row|
+           ds.fields.collect{|col|
+             v1a,v2a=Statsample.only_valid(ds[row],ds[col])
+             (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size)
+           }
+         }
+         Matrix.rows(rows)
+       end
+       # Calculate Spearman correlation coefficient between 2 vectors
+       def spearman(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+         pearson(v1r,v2r)
+       end
+       # Calculate point-biserial correlation.
+       # Equal to the Pearson correlation, with one dichotomous value
+       # replaced by "0" and the other by "1"
+       def point_biserial(dichotomous,continous)
+         ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
+         raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
+         raise(TypeError, "Second vector should be continuous") if ds['c'].type!=:scale
+         f0=ds['d'].factors.sort[0]
+         m0=ds.filter_field('c') {|c| c['d']==f0}
+         m1=ds.filter_field('c') {|c| c['d']!=f0}
+         ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
+       end
+       # Kendall rank correlation coefficient (tau a).
+       #
+       # Based on an article by Hervé Abdi
+       def tau_a(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         n=v1a.size
+         v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+         o1=ordered_pairs(v1r)
+         o2=ordered_pairs(v2r)
+         delta=o1.size*2-(o2 & o1).size*2
+         1-(delta * 2 / (n*(n-1)).to_f)
+       end
+       # Calculates tau-b correlation.
+       #
+       # Tau-b defines perfect association as strict monotonicity.
+       # Although it requires strict monotonicity to reach 1.0,
+       # it does not penalize ties as much as some other measures.
+       #
+       # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+       def tau_b(matrix)
+         v=pairs(matrix)
+         ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
+       end
+       # Calculates Goodman and Kruskal's gamma.
+       #
+       # Gamma is the surplus of concordant pairs over discordant pairs,
+       # as a percentage of all pairs, ignoring ties.
+       #
+       # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
+       def gamma(matrix)
+         v=pairs(matrix)
+         (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
+       end
+       # Calculates concordant pairs (P), discordant pairs (Q) and
+       # ties (X, Y) for a matrix. Rows and columns have to be ordered.
+       def pairs(matrix)
+         rs=matrix.row_size
+         cs=matrix.column_size
+         conc=disc=ties_x=ties_y=0
+         # concordant pairs
+         (0...(rs-1)).each {|x|
+           (0...(cs-1)).each{|y|
+             ((x+1)...rs).each{|x2|
+               ((y+1)...cs).each{|y2|
+                 conc+=matrix[x,y]*matrix[x2,y2]
+               }
+             }
+           }
+         }
+         # discordant pairs
+         (0...(rs-1)).each {|x|
+           (1...cs).each{|y|
+             ((x+1)...rs).each{|x2|
+               (0...y).each{|y2|
+                 disc+=matrix[x,y]*matrix[x2,y2]
+               }
+             }
+           }
+         }
+         # ties on x
+         (0...(rs-1)).each {|x|
+           (0...cs).each{|y|
+             ((x+1)...rs).each{|x2|
+               ties_x+=matrix[x,y]*matrix[x2,y]
+             }
+           }
+         }
+         # ties on y
+         (0...rs).each {|x|
+           (0...(cs-1)).each{|y|
+             ((y+1)...cs).each{|y2|
+               ties_y+=matrix[x,y]*matrix[x,y2]
+             }
+           }
+         }
+         {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
+       end
+       # All ordered pairs (i<j) of values of a vector
+       def ordered_pairs(vector)
+         d=vector.data
+         a=[]
+         (0...(d.size-1)).each{|i|
+           ((i+1)...d.size).each {|j|
+             a.push([d[i],d[j]])
+           }
+         }
+         a
+       end
+       # Sum of codeviates (cross-products of deviations) of two vectors
+       def sum_of_codeviated(v1,v2)
+         v1a,v2a=Statsample.only_valid(v1,v2)
+         sum=0
+         (0...v1a.size).each{|i|
+           sum+=v1a[i]*v2a[i]
+         }
+         sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
+       end
+     end
+   end
+ end
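
A short sketch of the correlation helpers above (assuming the statsample gem is installed, plus Ruby/GSL for prop_pearson; the vectors are illustrative). The t value follows t = r*sqrt((n-2)/(1-r**2)), as implemented in #t_r:

    require 'statsample'

    a=[1,2,3,4,5].to_vector(:scale)
    b=[2,4,5,4,5].to_vector(:scale)
    r=Statsample::Bivariate.pearson(a,b)
    t=Statsample::Bivariate.t_pearson(a,b)          # r*sqrt((n-2)/(1-r**2))
    p=Statsample::Bivariate.prop_pearson(t,a.size)  # two-tailed, needs Ruby/GSL
    rho=Statsample::Bivariate.spearman(a,b)         # Pearson on ranked vectors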
data/lib/statsample/chidistribution.rb
@@ -0,0 +1,39 @@
+ module Statsample
+   # Based on Babatunde, Iyiola & Eni:
+   # "A Numerical Procedure for Computing Chi-Square Percentage Points"
+   module ChiDistribution
+     class << self
+       def steps(av, bv, itv)
+         ((bv.to_f - av.to_f) / itv.to_f).to_i
+       end
+       # Lanczos-style approximation of log(gamma(k))
+       def loggamma(k)
+         c1 = 76.18009173
+         c2 = -86.50532033
+         c3 = 24.01409822
+         c4 = -1.231739516
+         c5 = 0.00120858
+         c6 = -0.000005364
+         c7 = 2.506628275
+         x1 = k - 1
+         ws = x1 + 5.5
+         ws = (x1 + 0.5) * Math::log(ws) - ws
+         s = 1 + c1 / (x1 + 1) + c2 / (x1 + 2) + c3 / (x1 + 3) + c4 / (x1 + 4) + c5 / (x1 + 5) + c6 / (x1 + 6)
+         ws + Math::log(c7 * s)
+       end
+       # Chi-square density with k degrees of freedom
+       def f(x, k)
+         Math::exp(0.5 * k * Math::log(0.5 * x) - Math::log(x) - loggamma(0.5 * k) - 0.5 * x)
+       end
+       # Chi-square CDF: closed form for k=2; otherwise the density is
+       # integrated over [a, b] with a 28-interval composite Boole's rule
+       def cdf(b, k)
+         a = 0.001
+         b = b.to_f
+         if k == 2
+           1 - Math::exp(-b / 2)
+         else
+           w = (b - a) / 28.to_f
+           2 * w / 45 * (7 * (f(a, k) + f(a + 28 * w, k)) +
+             12 * (f(a + 2 * w, k) + f(a + 6 * w, k) + f(a + 10 * w, k) + f(a + 14 * w, k) + f(a + 18 * w, k) + f(a + 22 * w, k) + f(a + 26 * w, k)) +
+             14 * (f(a + 4 * w, k) + f(a + 8 * w, k) + f(a + 12 * w, k) + f(a + 16 * w, k) + f(a + 20 * w, k) + f(a + 24 * w, k)) +
+             32 * (f(a + w, k) + f(a + 3 * w, k) + f(a + 5 * w, k) + f(a + 7 * w, k) + f(a + 9 * w, k) + f(a + 11 * w, k) + f(a + 13 * w, k) + f(a + 15 * w, k) + f(a + 17 * w, k) + f(a + 19 * w, k) + f(a + 21 * w, k) + f(a + 23 * w, k) + f(a + 25 * w, k) + f(a + 27 * w, k)))
+         end
+       end
+     end
+   end
+ end
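
A minimal check of ChiDistribution.cdf (assuming the statsample gem is installed; the percentile values are standard chi-square table entries). For k=2 the CDF has the closed form 1-exp(-x/2), which #cdf returns directly; other degrees of freedom go through the quadrature:

    require 'statsample'

    x=3.84
    puts Statsample::ChiDistribution.cdf(x,2)    # closed form for 2 df
    puts 1-Math::exp(-x/2.0)                     # same value, computed directly
    puts Statsample::ChiDistribution.cdf(7.81,3) # ~0.95: 7.81 is the 95th percentile for 3 df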
data/lib/statsample/codification.rb
@@ -0,0 +1,120 @@
+ require 'yaml'
+
+ module Statsample
+   # Codification
+   #
+   # This tool helps to code open questions:
+   # * Load one or more vectors into the workflow to create a YAML file of values. If data contain Statsample::SPLIT_TOKEN, the value will be split into two or more values
+   # * Edit the YAML file and replace the values with your codes. If you need to create two or more codes for an answer, use the separator (default Statsample::SPLIT_TOKEN)
+   # * Recode the vectors, loading the YAML file:
+   #   * The new vectors have the same name as the original, plus "_recoded"
+   #   * Instead of loading new recoded vectors, create as many vectors as values, as with add_vectors_by_split
+   #
+   # Usage:
+   #   recode_file="recodification.yaml"
+   #   phase=:first # flag
+   #   if phase==:first
+   #     File.open(recode_file,"w") {|fp|
+   #       Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
+   #     } # Edit the file recodification.yaml
+   #   elsif phase==:second
+   #     File.open(recode_file,"r") {|fp|
+   #       Statsample::Codification.verify(fp,['vector1'])
+   #     }
+   #   elsif phase==:third
+   #     File.open(recode_file,"r") {|fp|
+   #       Statsample::Codification.recode_dataset_split!(ds,fp,"*")
+   #     }
+   #   end
+   #
+   module Codification
+     class << self
+       # Create a YAML dump for a hash, based on vectors.
+       # The keys are vector names on the dataset and the values
+       # are hashes, with keys == values, for recodification
+       #
+       #   v1=%w{a,b b,c d}.to_vector
+       #   ds={"v1"=>v1}.to_dataset
+       #   Statsample::Codification.create_yaml(ds,['v1'])
+       #   => "--- \nv1: \n  a: a\n  b: b\n  c: c\n  d: d\n"
+       def create_yaml(dataset,vectors,sep=Statsample::SPLIT_TOKEN,io=nil)
+         raise ArgumentError,"Array shouldn't be empty" if vectors.size==0
+         pro_hash=vectors.inject({}){|h,v_name|
+           raise Exception, "Vector #{v_name} doesn't exist on Dataset" if !dataset.fields.include? v_name
+           v=dataset[v_name]
+           split_data=v.splitted(sep)
+           factors=split_data.flatten.uniq.compact.sort.inject({}) {|a,val| a[val]=val;a}
+           h[v_name]=factors
+           h
+         }
+         YAML.dump(pro_hash,io)
+       end
+       # Inverts a recodification hash: maps each code back to the
+       # array of original values that produce it
+       def inverse_hash(h,sep=Statsample::SPLIT_TOKEN)
+         h.inject({}) {|a,v|
+           v[1].split(sep).each {|val|
+             a[val]||=[]
+             a[val].push(v[0])
+           }
+           a
+         }
+       end
+       # Maps each original value to its array of codes
+       def dictionary(h,sep=Statsample::SPLIT_TOKEN)
+         h.inject({}) {|a,v|
+           a[v[0]]=v[1].split(sep)
+           a
+         }
+       end
+       def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
+         dict=dictionary(h,sep)
+         new_data=v.splitted(sep)
+         new_data.collect{|c|
+           if c.nil?
+             nil
+           else
+             c.collect{|value|
+               dict[value]
+             }.flatten.uniq
+           end
+         }
+       end
+       def recode_dataset_simple!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
+         _recode_dataset(dataset,yaml,sep,false)
+       end
+       def recode_dataset_split!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
+         _recode_dataset(dataset,yaml,sep,true)
+       end
+
+       def _recode_dataset(dataset,yaml,sep=Statsample::SPLIT_TOKEN,split=false)
+         h=YAML::load(yaml)
+         v_names=h.keys
+         v_names.each do |v_name|
+           raise Exception, "Vector #{v_name} doesn't exist on Dataset" if !dataset.fields.include? v_name
+           recoded=recode_vector(dataset[v_name],h[v_name],sep).collect { |c|
+             if c.nil?
+               nil
+             else
+               c.join(sep)
+             end
+           }.to_vector
+           if(split)
+             recoded.split_by_separator(sep).each {|k,v|
+               dataset[v_name+"_"+k]=v
+             }
+           else
+             dataset[v_name+"_recoded"]=recoded
+           end
+         end
+       end
+       # Prints each vector's inverse codification, for verification
+       def verify(yaml,v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
+         require 'pp'
+         h=YAML::load(yaml)
+         v_names||=h.keys
+         v_names.each{|v_name|
+           inverse=inverse_hash(h[v_name],sep)
+           io.puts "Vector: #{v_name}"
+           YAML.dump(inverse.sort,io)
+         }
+       end
+     end
+   end
+ end
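
A minimal round trip for the Codification helpers above (assuming the statsample gem is installed; the data reuse the create_yaml doc example). The raw answers contain the default separator (Statsample::SPLIT_TOKEN, a comma), so each one is split before coding:

    require 'statsample'

    v1=%w{a,b b,c d}.to_vector
    ds={'v1'=>v1}.to_dataset
    yaml=Statsample::Codification.create_yaml(ds,['v1'])
    # Normally you would edit the dump here, mapping raw answers to codes.
    Statsample::Codification.recode_dataset_simple!(ds,yaml)
    p ds['v1_recoded']   # new vector named after the original plus "_recoded"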