statsample-ekatena 2.0.2

Files changed (156)
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.travis.yml +23 -0
  4. data/CONTRIBUTING.md +17 -0
  5. data/Gemfile +2 -0
  6. data/History.txt +457 -0
  7. data/LICENSE.txt +12 -0
  8. data/README.md +175 -0
  9. data/Rakefile +44 -0
  10. data/benchmarks/correlation_matrix_15_variables.rb +32 -0
  11. data/benchmarks/correlation_matrix_5_variables.rb +33 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  13. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  14. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
  15. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  16. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  17. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  18. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  19. data/benchmarks/factor_map.rb +37 -0
  20. data/benchmarks/helpers_benchmark.rb +5 -0
  21. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  22. data/doc_latex/manual/equations.tex +78 -0
  23. data/examples/boxplot.rb +28 -0
  24. data/examples/chisquare_test.rb +23 -0
  25. data/examples/correlation_matrix.rb +32 -0
  26. data/examples/dataset.rb +30 -0
  27. data/examples/dominance_analysis.rb +33 -0
  28. data/examples/dominance_analysis_bootstrap.rb +32 -0
  29. data/examples/histogram.rb +26 -0
  30. data/examples/icc.rb +24 -0
  31. data/examples/levene.rb +29 -0
  32. data/examples/multiple_regression.rb +20 -0
  33. data/examples/multivariate_correlation.rb +33 -0
  34. data/examples/parallel_analysis.rb +40 -0
  35. data/examples/polychoric.rb +40 -0
  36. data/examples/principal_axis.rb +26 -0
  37. data/examples/reliability.rb +31 -0
  38. data/examples/scatterplot.rb +25 -0
  39. data/examples/t_test.rb +27 -0
  40. data/examples/tetrachoric.rb +17 -0
  41. data/examples/u_test.rb +24 -0
  42. data/examples/vector.rb +20 -0
  43. data/examples/velicer_map_test.rb +46 -0
  44. data/grab_references.rb +29 -0
  45. data/lib/spss.rb +134 -0
  46. data/lib/statsample-ekatena/analysis.rb +100 -0
  47. data/lib/statsample-ekatena/analysis/suite.rb +89 -0
  48. data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
  49. data/lib/statsample-ekatena/anova.rb +24 -0
  50. data/lib/statsample-ekatena/anova/contrast.rb +79 -0
  51. data/lib/statsample-ekatena/anova/oneway.rb +187 -0
  52. data/lib/statsample-ekatena/anova/twoway.rb +207 -0
  53. data/lib/statsample-ekatena/bivariate.rb +406 -0
  54. data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
  55. data/lib/statsample-ekatena/codification.rb +182 -0
  56. data/lib/statsample-ekatena/converter/csv.rb +28 -0
  57. data/lib/statsample-ekatena/converter/spss.rb +48 -0
  58. data/lib/statsample-ekatena/converters.rb +211 -0
  59. data/lib/statsample-ekatena/crosstab.rb +188 -0
  60. data/lib/statsample-ekatena/daru.rb +115 -0
  61. data/lib/statsample-ekatena/dataset.rb +10 -0
  62. data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
  63. data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
  64. data/lib/statsample-ekatena/factor.rb +104 -0
  65. data/lib/statsample-ekatena/factor/map.rb +124 -0
  66. data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
  67. data/lib/statsample-ekatena/factor/pca.rb +242 -0
  68. data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
  69. data/lib/statsample-ekatena/factor/rotation.rb +198 -0
  70. data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
  71. data/lib/statsample-ekatena/formula/formula.rb +306 -0
  72. data/lib/statsample-ekatena/graph.rb +11 -0
  73. data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
  74. data/lib/statsample-ekatena/graph/histogram.rb +198 -0
  75. data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
  76. data/lib/statsample-ekatena/histogram.rb +180 -0
  77. data/lib/statsample-ekatena/matrix.rb +329 -0
  78. data/lib/statsample-ekatena/multiset.rb +310 -0
  79. data/lib/statsample-ekatena/regression.rb +65 -0
  80. data/lib/statsample-ekatena/regression/multiple.rb +89 -0
  81. data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
  82. data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
  83. data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
  84. data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
  85. data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
  86. data/lib/statsample-ekatena/regression/simple.rb +121 -0
  87. data/lib/statsample-ekatena/reliability.rb +150 -0
  88. data/lib/statsample-ekatena/reliability/icc.rb +415 -0
  89. data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
  90. data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
  91. data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
  92. data/lib/statsample-ekatena/resample.rb +15 -0
  93. data/lib/statsample-ekatena/shorthand.rb +125 -0
  94. data/lib/statsample-ekatena/srs.rb +169 -0
  95. data/lib/statsample-ekatena/test.rb +82 -0
  96. data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
  97. data/lib/statsample-ekatena/test/chisquare.rb +73 -0
  98. data/lib/statsample-ekatena/test/f.rb +52 -0
  99. data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
  100. data/lib/statsample-ekatena/test/levene.rb +88 -0
  101. data/lib/statsample-ekatena/test/t.rb +309 -0
  102. data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
  103. data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
  104. data/lib/statsample-ekatena/vector.rb +19 -0
  105. data/lib/statsample-ekatena/version.rb +3 -0
  106. data/lib/statsample.rb +282 -0
  107. data/po/es/statsample.mo +0 -0
  108. data/po/es/statsample.po +959 -0
  109. data/po/statsample.pot +947 -0
  110. data/references.txt +24 -0
  111. data/statsample-ekatena.gemspec +49 -0
  112. data/test/fixtures/bank2.dat +200 -0
  113. data/test/fixtures/correlation_matrix.rb +17 -0
  114. data/test/fixtures/df.csv +15 -0
  115. data/test/fixtures/hartman_23.matrix +9 -0
  116. data/test/fixtures/stock_data.csv +500 -0
  117. data/test/fixtures/tetmat_matrix.txt +5 -0
  118. data/test/fixtures/tetmat_test.txt +1001 -0
  119. data/test/helpers_tests.rb +83 -0
  120. data/test/test_analysis.rb +176 -0
  121. data/test/test_anova_contrast.rb +36 -0
  122. data/test/test_anovaoneway.rb +26 -0
  123. data/test/test_anovatwoway.rb +37 -0
  124. data/test/test_anovatwowaywithdataset.rb +47 -0
  125. data/test/test_anovawithvectors.rb +102 -0
  126. data/test/test_awesome_print_bug.rb +16 -0
  127. data/test/test_bartlettsphericity.rb +25 -0
  128. data/test/test_bivariate.rb +164 -0
  129. data/test/test_codification.rb +78 -0
  130. data/test/test_crosstab.rb +67 -0
  131. data/test/test_dominance_analysis.rb +39 -0
  132. data/test/test_factor.rb +228 -0
  133. data/test/test_factor_map.rb +38 -0
  134. data/test/test_factor_pa.rb +56 -0
  135. data/test/test_fit_model.rb +88 -0
  136. data/test/test_ggobi.rb +35 -0
  137. data/test/test_gsl.rb +15 -0
  138. data/test/test_histogram.rb +109 -0
  139. data/test/test_matrix.rb +48 -0
  140. data/test/test_multiset.rb +176 -0
  141. data/test/test_regression.rb +231 -0
  142. data/test/test_reliability.rb +223 -0
  143. data/test/test_reliability_icc.rb +198 -0
  144. data/test/test_reliability_skillscale.rb +57 -0
  145. data/test/test_resample.rb +24 -0
  146. data/test/test_srs.rb +9 -0
  147. data/test/test_statistics.rb +69 -0
  148. data/test/test_stest.rb +69 -0
  149. data/test/test_stratified.rb +17 -0
  150. data/test/test_test_f.rb +33 -0
  151. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  152. data/test/test_test_t.rb +62 -0
  153. data/test/test_umannwhitney.rb +27 -0
  154. data/test/test_vector.rb +12 -0
  155. data/test/test_wilcoxonsignedrank.rb +64 -0
  156. metadata +570 -0

data/lib/statsample-ekatena/test/levene.rb
@@ -0,0 +1,88 @@
+ module Statsample
+   module Test
+     # = Levene Test for Equality of Variances
+     # From NIST/SEMATECH:
+     # <blockquote>Levene's test (Levene, 1960) is used to test if k samples have equal variances. Equal variances across samples is called homogeneity of variance. Some statistical tests, for example the analysis of variance, assume that variances are equal across groups or samples. The Levene test can be used to verify that assumption.</blockquote>
+     # Use:
+     #   require 'statsample'
+     #   a = Daru::Vector.new([1,2,3,4,5,6,7,8,100,10])
+     #   b = Daru::Vector.new([30,40,50,60,70,80,90,100,110,120])
+     #
+     #   levene=Statsample::Test::Levene.new([a,b])
+     #   puts levene.summary
+     #
+     # Output:
+     #   Levene Test
+     #   F: 0.778121319848449
+     #   p: 0.389344552595791
+     #
+     # Reference:
+     # * NIST/SEMATECH e-Handbook of Statistical Methods. Available on http://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
+     class Levene
+       include Statsample::Test
+       include Summarizable
+       # Degrees of freedom 1 (k-1)
+       attr_reader :d1
+       # Degrees of freedom 2 (n-k)
+       attr_reader :d2
+       # Name of test
+       attr_accessor :name
+       # Input can be an array of Daru::Vectors or a Daru::DataFrame
+       def initialize(input, opts=Hash.new())
+         if input.is_a? Daru::DataFrame
+           @vectors = input.to_hash.values
+         else
+           @vectors = input
+         end
+         @name=_("Levene Test")
+         opts.each{|k,v|
+           self.send("#{k}=",v) if self.respond_to? k
+         }
+         compute
+       end
+       # Value of the test
+       def f
+         @w
+       end
+       def report_building(builder) # :nodoc:
+         builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability]
+       end
+       def compute
+         n=@vectors.inject(0) { |ac,v| ac + v.reject_values(*Daru::MISSING_VALUES).size }
+
+         zi=@vectors.collect do |vector|
+           mean=vector.mean
+           Daru::Vector.new(vector.collect { |v| (v - mean).abs })
+         end
+
+         total_mean = Daru::Vector.new(
+           zi.inject([]) do |ac,vector|
+             ac + vector.reject_values(*Daru::MISSING_VALUES).to_a
+           end
+         ).mean
+
+         k = @vectors.size
+         sum_num = zi.inject(0) do |ac,vector|
+           ac + (vector.size * (vector.mean - total_mean)**2)
+         end
+
+         sum_den = zi.inject(0) do |ac,vector|
+           z_mean = vector.mean
+           ac + vector.reject_values(*Daru::MISSING_VALUES).to_a.inject(0) do |acp,zij|
+             acp + (zij - z_mean)**2
+           end
+         end
+
+         @w = ((n - k) * sum_num).quo((k - 1) * sum_den)
+         @d1 = k - 1
+         @d2 = n - k
+       end
+       private :compute
+       # Probability.
+       # Under H_0 (equal variances across groups), the probability of obtaining a value of the test statistic greater than or equal to the one observed in the sample.
+       def probability
+         p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right)
+       end
+     end
+   end
+ end
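
For reference, compute above implements the usual Levene statistic on the absolute deviations from each group mean, W = ((N - k) * sum_i n_i*(Zbar_i - Zbar)^2) / ((k - 1) * sum_i sum_j (Z_ij - Zbar_i)^2), compared against an F(k - 1, N - k) distribution. A minimal sketch of exercising the class added above; the group values are made-up illustration data:

    require 'statsample'

    # Three illustrative groups (arbitrary example values)
    g1 = Daru::Vector.new([12, 15, 14, 10, 13, 18])
    g2 = Daru::Vector.new([22, 25, 24, 28, 21, 30])
    g3 = Daru::Vector.new([9, 40, 12, 35, 11, 38])

    levene = Statsample::Test::Levene.new([g1, g2, g3])
    puts levene.f            # W statistic, reported as F
    puts levene.d1           # k - 1
    puts levene.d2           # N - k
    puts levene.probability  # right-tail p-value against F(d1, d2)
    puts levene.summary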

data/lib/statsample-ekatena/test/t.rb
@@ -0,0 +1,309 @@
+ module Statsample
+   module Test
+     # A t-test is any statistical hypothesis test in which the test
+     # statistic follows a Student's t distribution under the null
+     # hypothesis.
+     class T
+
+       class << self
+         include Math
+         # Statistic for testing the null hypothesis that the population mean is equal to a specified value u.
+         # The same formula is used for the paired-sample t-test.
+         # * <tt>x</tt>: sample/differences mean
+         # * <tt>u</tt>: population mean
+         # * <tt>s</tt>: sample/differences standard deviation
+         # * <tt>n</tt>: sample size
+         def one_sample(x,u,s,n)
+           (x-u)*Math::sqrt(n).quo(s)
+         end
+         # Test if means of two samples are different.
+         # * <tt>x1</tt>: sample 1 mean
+         # * <tt>x2</tt>: sample 2 mean
+         # * <tt>s1</tt>: sample 1 standard deviation
+         # * <tt>s2</tt>: sample 2 standard deviation
+         # * <tt>n1</tt>: sample 1 size
+         # * <tt>n2</tt>: sample 2 size
+         # * <tt>equal_variance</tt>: true if equal variance is assumed
+         #
+         def two_sample_independent(x1, x2, s1, s2, n1, n2, equal_variance = false)
+           num=x1-x2
+           if equal_variance
+             sx1x2 = sqrt(((n1-1)*s1**2 + (n2-1)*s2**2).quo(n1+n2-2))
+             den = sx1x2*sqrt(1.quo(n1)+1.quo(n2))
+           else
+             den=sqrt((s1**2).quo(n1) + (s2**2).quo(n2))
+           end
+           num.quo(den)
+         end
+         # Degrees of freedom for the equal-variance t test
+         def df_equal_variance(n1,n2)
+           n1+n2-2
+         end
+         # Degrees of freedom for unequal variance (Welch-Satterthwaite)
+         # * <tt>s1</tt>: sample 1 standard deviation
+         # * <tt>s2</tt>: sample 2 standard deviation
+         # * <tt>n1</tt>: sample 1 size
+         # * <tt>n2</tt>: sample 2 size
+         # == Reference
+         # * http://en.wikipedia.org/wiki/Welch-Satterthwaite_equation
+         def df_not_equal_variance(s1,s2,n1,n2)
+           s2_1=s1**2
+           s2_2=s2**2
+           num=(s2_1.quo(n1)+s2_2.quo(n2))**2
+           den=(s2_1.quo(n1)**2).quo(n1-1) + (s2_2.quo(n2)**2).quo(n2-1)
+           num.quo(den)
+         end
+       end
+
+       include Statsample::Test
+       include Summarizable
+       attr_reader :standard_error, :estimate, :df
+       # Tails for p-value (:both, :left or :right). Default :both
+       attr_accessor :tails
+       # Name of the analysis
+       attr_accessor :name
+       attr_accessor :confidence_level
+       attr_reader :t
+       attr_accessor :estimate_name, :standard_error_name
+       # Creates a generic t test. Use OneSample or TwoSamplesIndependent
+       # classes for better summaries.
+       # Parameters:
+       # * estimate: estimate
+       # * standard_error: standard error of estimate
+       # * df: degrees of freedom
+       def initialize(estimate, standard_error, df, opts=Hash.new)
+         @estimate=estimate
+         @standard_error=standard_error
+         @df=df
+         @t = @estimate / @standard_error.to_f
+         opts_default={ :tails=>:both,
+                        :name=>_("T Test"),
+                        :estimate_name=>_("Estimate"),
+                        :standard_error_name=>_("Std.Err.of Estimate"),
+                        :confidence_level=>0.95}
+         @opts = opts_default.merge(opts)
+
+         @opts.keys.each {|k|
+           send("#{k}=", @opts[k]) if respond_to? k
+         }
+       end
+
+       alias :se :standard_error
+
+       def to_f
+         t
+       end
+
+       # Probability (p-value) for the configured tails
+       def probability
+         p_using_cdf(Distribution::T.cdf(t, df), tails)
+       end
+
+       def confidence_interval(cl=nil)
+         cl||=confidence_level
+         t_crit = t_critical(cl, df)
+         [estimate - se*t_crit, estimate + se*t_crit]
+       end
+       alias :ci :confidence_interval
+
+
+       def report_building(builder) #:nodoc:
+         builder.section(:name=>@name) do |section|
+           section.text _("%s: %0.4f | %s: %0.4f") % [@estimate_name, @estimate, @standard_error_name, se]
+           report_building_t(section)
+         end
+       end
+       def report_building_t(s)
+         df_f=@df.is_a?(Integer) ? "%d" : "%0.4f"
+         s.text _("t(#{df_f}) = %0.4f, p=%0.4f (%s tails)") % [df, t, probability, tails]
+         s.text _("CI(%d%%): %0.4f - %0.4f") % [confidence_level*100, ci[0], ci[1]]
+
+       end
+
+
+       # One Sample t-test
+       # == Usage
+       #   a = Daru::Vector.new(1000.times.map {rand(100)})
+       #   t_1=Statsample::Test::T::OneSample.new(a, {:u=>50})
+       #   t_1.summary
+       #
+       # === Output
+       #
+       #   = One Sample T Test
+       #   Sample mean: 48.954
+       #   Population mean: 50
+       #   Tails: both
+       #   t = -1.1573, p=0.2474, d.f=999
+
+       class OneSample
+         include Math
+         include Statsample::Test
+         include Summarizable
+         # Options
+         attr_accessor :opts
+         # Name of test
+         attr_accessor :name
+         # Population mean to contrast
+         attr_accessor :u
+         # Degrees of freedom
+         attr_reader :df
+         # Tails for probability (:both, :left or :right)
+         attr_accessor :tails
+
+         # Create a One Sample T Test
+         # Options:
+         # * :u = Mean to compare. Default = 0
+         # * :name = Name of the analysis
+         # * :tails = Tail for probability. Can be :both, :left or :right
+         def initialize(vector, opts=Hash.new)
+           @vector=vector
+           default={:u=>0, :name=>"One Sample T Test", :tails=>:both}
+           @opts=default.merge(opts)
+           @name=@opts[:name]
+           @u=@opts[:u]
+           @tails=@opts[:tails]
+           @confidence_level=@opts[:confidence_level] || 0.95
+           @df= @vector.reject_values(*Daru::MISSING_VALUES).size-1
+           @t=nil
+         end
+         def t_object
+           T.new(@vector.mean-u, @vector.se, @vector.reject_values(*Daru::MISSING_VALUES).size-1, opts)
+         end
+         def t
+           t_object.t
+         end
+         def probability
+           t_object.probability
+         end
+         def standard_error
+           t_object.standard_error
+         end
+         alias :se :standard_error
+         def confidence_interval(cl=nil)
+           t_object.confidence_interval(cl)
+         end
+         alias :ci :confidence_interval
+         def report_building(b) # :nodoc:
+           b.section(:name=>@name) {|s|
+             s.text _("Sample mean: %0.4f | Sample sd: %0.4f | se : %0.4f") % [@vector.mean, @vector.sd, se]
+             s.text _("Population mean: %0.4f") % u if u!=0
+             t_object.report_building_t(s)
+           }
+         end
+       end
+       # Two Sample t-test.
+       #
+       # == Usage
+       #   a = Daru::Vector.new(1000.times.map {rand(100)})
+       #   b = Daru::Vector.new(1000.times.map {rand(100)})
+       #   t_2=Statsample::Test::T::TwoSamplesIndependent.new(a,b)
+       #   t_2.summary
+       # === Output
+       #   = Two Sample T Test
+       #   Mean and standard deviation
+       #   +----------+---------+---------+------+
+       #   | Variable | m       | sd      | n    |
+       #   +----------+---------+---------+------+
+       #   | 1        | 49.3310 | 29.3042 | 1000 |
+       #   | 2        | 47.8180 | 28.8640 | 1000 |
+       #   +----------+---------+---------+------+
+       #
+       #   == Levene Test
+       #   Levene Test
+       #   F: 0.3596
+       #   p: 0.5488
+       #   T statistics
+       #   +--------------------+--------+-----------+----------------+
+       #   | Type               | t      | df        | p (both tails) |
+       #   +--------------------+--------+-----------+----------------+
+       #   | Equal variance     | 1.1632 | 1998      | 0.2449         |
+       #   | Non equal variance | 1.1632 | 1997.5424 | 0.1362         |
+       #   +--------------------+--------+-----------+----------------+
+
+       class TwoSamplesIndependent
+         include Math
+         include Statsample::Test
+
+         include DirtyMemoize
+         include Summarizable
+         # Options
+         attr_accessor :opts
+         # Name of test
+         attr_accessor :name
+         # Degrees of freedom (equal variance)
+         attr_reader :df_equal_variance
+         # Degrees of freedom (not equal variance)
+         attr_reader :df_not_equal_variance
+         # Value of t for equal variance
+         attr_reader :t_equal_variance
+         # Value of t for non-equal variance
+         attr_reader :t_not_equal_variance
+         # Probability (equal variance)
+         attr_reader :probability_equal_variance
+         # Probability (unequal variance)
+         attr_reader :probability_not_equal_variance
+         # Tails for probability (:both, :left or :right)
+         attr_accessor :tails
+         # Create the object
+
+         dirty_writer :tails
+         dirty_memoize :t_equal_variance, :t_not_equal_variance, :probability_equal_variance, :probability_not_equal_variance, :df_equal_variance, :df_not_equal_variance
+
+         # Create a Two Samples Independent T Test
+         # Options:
+         # * :name = Name of the analysis
+         # * :tails = Tail for probability. Can be :both, :left or :right
+         def initialize(v1, v2, opts=Hash.new)
+           @v1=v1
+           @v2=v2
+           default={:u=>0, :name=>"Two Sample T Test", :tails=>:both}
+           @opts=default.merge(opts)
+           @name=@opts[:name]
+           @tails=@opts[:tails]
+         end
+
+         # Compute t values, degrees of freedom and probabilities
+         def compute
+           @t_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size, true)
+
+           @t_not_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size, false)
+
+           @df_equal_variance=T.df_equal_variance(@v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size)
+           @df_not_equal_variance=T.df_not_equal_variance(@v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size)
+
+           @probability_equal_variance = p_using_cdf(Distribution::T.cdf(@t_equal_variance, @df_equal_variance), tails)
+
+           @probability_not_equal_variance = p_using_cdf(Distribution::T.cdf(@t_not_equal_variance, @df_not_equal_variance), tails)
+
+         end
+         # Cohen's d is a measure of effect size. It is defined as the difference between the two means divided by a pooled standard deviation for the data.
+         def d
+           n1=@v1.reject_values(*Daru::MISSING_VALUES).size
+           n2=@v2.reject_values(*Daru::MISSING_VALUES).size
+           num=@v1.mean-@v2.mean
+           den=Math::sqrt( ((n1-1)*@v1.sd+(n2-1)*@v2.sd).quo(n1+n2))
+           num.quo(den)
+         end
+
+         def report_building(b) # :nodoc:
+           b.section(:name=>@name) {|g|
+             g.table(:name=>_("Mean and standard deviation"), :header=>[_("Variable"), _("mean"), _("sd"), _("n")]) {|t|
+               t.row([@v1.name, "%0.4f" % @v1.mean, "%0.4f" % @v1.sd, @v1.reject_values(*Daru::MISSING_VALUES).size])
+               t.row([@v2.name, "%0.4f" % @v2.mean, "%0.4f" % @v2.sd, @v2.reject_values(*Daru::MISSING_VALUES).size])
+             }
+             g.parse_element(Statsample::Test.levene([@v1,@v2], :name=>_("Levene test for equality of variances")))
+
+             g.table(:name=>_("T statistics"), :header=>["Type","t","df", "p (#{tails} tails)"].map{|v| _(v)}) {|t|
+               t.row([_("Equal variance"), "%0.4f" % t_equal_variance, df_equal_variance, "%0.4f" % probability_equal_variance])
+               t.row([_("Non equal variance"), "%0.4f" % t_not_equal_variance, "%0.4f" % df_not_equal_variance, "%0.4f" % probability_not_equal_variance])
+             }
+             g.table(:name=>_("Effect size")) do |t|
+               t.row ['x1-x2', "%0.4f" % (@v1.mean-@v2.mean)]
+               t.row ['d', "%0.4f" % d]
+             end
+           }
+         end
+       end
+     end
+   end
+ end
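
As a quick cross-reference between the two APIs in this file: the class-level helpers (one_sample, two_sample_independent, df_equal_variance, df_not_equal_variance) work from summary statistics, while OneSample and TwoSamplesIndependent wrap Daru vectors and build full reports. A minimal sketch, using randomly generated illustration data:

    require 'statsample'

    a = Daru::Vector.new(100.times.map { rand(100) })
    b = Daru::Vector.new(100.times.map { rand(100) })

    # Object API: computes equal/unequal variance t, df, p and a Levene test
    test = Statsample::Test::T::TwoSamplesIndependent.new(a, b)
    puts test.summary

    # Class-level helpers: the same statistics from summary values
    t_welch  = Statsample::Test::T.two_sample_independent(a.mean, b.mean, a.sd, b.sd, a.size, b.size, false)
    df_welch = Statsample::Test::T.df_not_equal_variance(a.sd, b.sd, a.size, b.size)
    puts "Welch t = #{t_welch}, df = #{df_welch}"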

data/lib/statsample-ekatena/test/umannwhitney.rb
@@ -0,0 +1,208 @@
+ module Statsample
+   module Test
+     #
+     # = U Mann-Whitney test
+     #
+     # Non-parametric test for assessing whether two independent samples
+     # of observations come from the same distribution.
+     #
+     # == Assumptions
+     #
+     # * The two samples under investigation in the test are independent of each other and the observations within each sample are independent.
+     # * The observations are comparable (i.e., for any two observations, one can assess whether they are equal or, if not, which one is greater).
+     # * The variances in the two groups are approximately equal.
+     #
+     # Larger differences between the two distributions correspond
+     # to lower values of U.
+     #
+     class UMannWhitney
+       # Maximum m*n allowed for exact calculation of probability
+       MAX_MN_EXACT=10000
+
+       # U sampling distribution, based on the Dinneen & Blakesley (1973) algorithm.
+       # This is the algorithm used by SPSS.
+       #
+       # Parameters:
+       # * <tt>n1</tt>: group 1 size
+       # * <tt>n2</tt>: group 2 size
+       # == Reference:
+       # * Dinneen, L., & Blakesley, B. (1973). Algorithm AS 62: A Generator for the Sampling Distribution of the Mann-Whitney U Statistic. <em>Journal of the Royal Statistical Society, 22</em>(2), 269-273
+       #
+       def self.u_sampling_distribution_as62(n1,n2)
+
+         freq=[]
+         work=[]
+         mn1=n1*n2+1
+         max_u=n1*n2
+         minmn=n1<n2 ? n1 : n2
+         maxmn=n1>n2 ? n1 : n2
+         n1=maxmn+1
+         (1..n1).each{|i| freq[i]=1}
+         n1+=1
+         (n1..mn1).each{|i| freq[i]=0}
+         work[1]=0
+         xin=maxmn
+         (2..minmn).each do |i|
+           work[i]=0
+           xin=xin+maxmn
+           n1=xin+2
+           l=1+xin.quo(2)
+           k=i
+           (1..l).each do |j|
+             k=k+1
+             n1=n1-1
+             sum=freq[j]+work[j]
+             freq[j]=sum
+             work[k]=sum-freq[n1]
+             freq[n1]=sum
+           end
+         end
+
+         # Generate percentages for normal U
+         dist=(1+max_u/2).to_i
+         freq.shift
+         total=freq.inject(0) {|a,v| a+v }
+         (0...dist).collect {|i|
+           if i!=max_u-i
+             ues=freq[i]*2
+           else
+             ues=freq[i]
+           end
+           ues.quo(total)
+         }
+       end
+
+       # Generate distribution for permutations.
+       # Very expensive, but useful for demonstrations
+
+       def self.distribution_permutations(n1,n2)
+         base=[0]*n1+[1]*n2
+         po=Statsample::Permutation.new(base)
+
+         total=n1*n2
+         req={}
+         po.each do |perm|
+           r0,s0=0,0
+           perm.each_index {|c_i|
+             if perm[c_i]==0
+               r0+=c_i+1
+               s0+=1
+             end
+           }
+           u1=r0-((s0*(s0+1)).quo(2))
+           u2=total-u1
+           temp_u= (u1 <= u2) ? u1 : u2
+           req[perm]=temp_u
+         end
+         req
+       end
+       # Sample 1 rank sum
+       attr_reader :r1
+       # Sample 2 rank sum
+       attr_reader :r2
+       # Sample 1 U (useful for demonstration)
+       attr_reader :u1
+       # Sample 2 U (useful for demonstration)
+       attr_reader :u2
+       # U value
+       attr_reader :u
+       # Value of the compensation for ties (useful for demonstration)
+       attr_reader :t
+       # Name of test
+       attr_accessor :name
+       include Summarizable
+       #
+       # Create a new U Mann-Whitney test
+       # Params: Two Daru::Vectors
+       #
+       def initialize(v1,v2, opts=Hash.new)
+         @v1 = v1
+         @v2 = v2
+         v1_valid = v1.reject_values(*Daru::MISSING_VALUES).reset_index!
+         v2_valid = v2.reject_values(*Daru::MISSING_VALUES).reset_index!
+         @n1 = v1_valid.size
+         @n2 = v2_valid.size
+         data = Daru::Vector.new(v1_valid.to_a + v2_valid.to_a)
+         groups = Daru::Vector.new(([0] * @n1) + ([1] * @n2))
+         ds = Daru::DataFrame.new({:g => groups, :data => data})
+         @t = nil
+         @ties = data.to_a.size != data.to_a.uniq.size
+         if @ties
+           adjust_for_ties(ds[:data])
+         end
+         ds[:ranked] = ds[:data].ranked
+         @n = ds.nrows
+
+         @r1 = ds.filter_rows { |r| r[:g] == 0 }[:ranked].sum
+         @r2 = ((ds.nrows * (ds.nrows + 1)).quo(2)) - r1
+         @u1 = r1 - ((@n1 * (@n1 + 1)).quo(2))
+         @u2 = r2 - ((@n2 * (@n2 + 1)).quo(2))
+         @u = (u1 < u2) ? u1 : u2
+         opts_default = { :name=>_("Mann-Whitney's U") }
+         @opts = opts_default.merge(opts)
+         opts_default.keys.each {|k|
+           send("#{k}=", @opts[k])
+         }
+       end
+       def report_building(generator) # :nodoc:
+         generator.section(:name=>@name) do |s|
+           s.table(:name=>_("%s results") % @name) do |t|
+             t.row([_("Sum of ranks %s") % @v1.name, "%0.3f" % @r1])
+             t.row([_("Sum of ranks %s") % @v2.name, "%0.3f" % @r2])
+             t.row([_("U Value"), "%0.3f" % @u])
+             t.row([_("Z"), "%0.3f (p: %0.3f)" % [z, probability_z]])
+             if @n1*@n2<MAX_MN_EXACT
+               t.row([_("Exact p (Dinneen & Blakesley, 1973):"), "%0.3f" % probability_exact])
+             end
+           end
+         end
+       end
+       # Exact probability of obtaining a value of U lower than or equal to the sample value, under the U sampling distribution. Use with caution when m*n > 100000.
+       # Uses u_sampling_distribution_as62
+       def probability_exact
+         dist = UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
+         sum = 0
+         (0..@u.to_i).each {|i|
+           sum+=dist[i]
+         }
+         sum
+       end
+       # Adjust for ties.
+       #
+       # == Reference:
+       # * http://europe.isixsigma.com/library/content/c080806a.asp
+       def adjust_for_ties(data)
+         @t = data.frequencies.find_all { |k,v| v > 1 }.inject(0) { |a,v|
+           a + (v[1]**3 - v[1]).quo(12)
+         }
+       end
+
+       private :adjust_for_ties
+
+       # Z value for U, with adjustment for ties.
+       # For large samples, U is approximately normally distributed.
+       # In that case, z can be used to obtain the probability for U.
+       # == Reference:
+       # * SPSS Manual
+       def z
+         mu=(@n1*@n2).quo(2)
+         if(!@ties)
+           ou=Math::sqrt(((@n1*@n2)*(@n1+@n2+1)).quo(12))
+         else
+           n=@n1+@n2
+           first=(@n1*@n2).quo(n*(n-1))
+           second=((n**3-n).quo(12))-@t
+           ou=Math::sqrt(first*second)
+         end
+         (@u-mu).quo(ou)
+       end
+       # Assuming H_0, the probability of obtaining a U as extreme as the
+       # sample value, using the normal approximation (two-tailed).
+       # Use with more than 30 cases per group.
+       def probability_z
+         (1-Distribution::Normal.cdf(z.abs()))*2
+       end
+     end
+
+   end
+ end
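
In the constructor above, the two samples are pooled and ranked, U_i = R_i - n_i*(n_i + 1)/2 is computed for each sample, and U = min(U1, U2); z then standardizes U against its mean n1*n2/2, with a variance correction when ties are present. A minimal usage sketch of the class; the two vectors are made-up illustration data:

    require 'statsample'

    a = Daru::Vector.new([1, 2, 3, 4, 7, 8, 9, 10, 14, 15])
    b = Daru::Vector.new([5, 6, 11, 12, 13, 16, 17, 18, 19, 20])

    u_test = Statsample::Test::UMannWhitney.new(a, b)
    puts u_test.r1                 # rank sum of the first sample
    puts u_test.u                  # U = min(u1, u2)
    puts u_test.z                  # normal approximation with tie correction
    puts u_test.probability_z      # two-tailed p from the normal approximation
    puts u_test.probability_exact  # exact p via AS 62 (n1*n2 = 100 here, well under MAX_MN_EXACT)
    puts u_test.summary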