statsample-ekatena 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.travis.yml +23 -0
  4. data/CONTRIBUTING.md +17 -0
  5. data/Gemfile +2 -0
  6. data/History.txt +457 -0
  7. data/LICENSE.txt +12 -0
  8. data/README.md +175 -0
  9. data/Rakefile +44 -0
  10. data/benchmarks/correlation_matrix_15_variables.rb +32 -0
  11. data/benchmarks/correlation_matrix_5_variables.rb +33 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  13. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  14. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
  15. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  16. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  17. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  18. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  19. data/benchmarks/factor_map.rb +37 -0
  20. data/benchmarks/helpers_benchmark.rb +5 -0
  21. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  22. data/doc_latex/manual/equations.tex +78 -0
  23. data/examples/boxplot.rb +28 -0
  24. data/examples/chisquare_test.rb +23 -0
  25. data/examples/correlation_matrix.rb +32 -0
  26. data/examples/dataset.rb +30 -0
  27. data/examples/dominance_analysis.rb +33 -0
  28. data/examples/dominance_analysis_bootstrap.rb +32 -0
  29. data/examples/histogram.rb +26 -0
  30. data/examples/icc.rb +24 -0
  31. data/examples/levene.rb +29 -0
  32. data/examples/multiple_regression.rb +20 -0
  33. data/examples/multivariate_correlation.rb +33 -0
  34. data/examples/parallel_analysis.rb +40 -0
  35. data/examples/polychoric.rb +40 -0
  36. data/examples/principal_axis.rb +26 -0
  37. data/examples/reliability.rb +31 -0
  38. data/examples/scatterplot.rb +25 -0
  39. data/examples/t_test.rb +27 -0
  40. data/examples/tetrachoric.rb +17 -0
  41. data/examples/u_test.rb +24 -0
  42. data/examples/vector.rb +20 -0
  43. data/examples/velicer_map_test.rb +46 -0
  44. data/grab_references.rb +29 -0
  45. data/lib/spss.rb +134 -0
  46. data/lib/statsample-ekatena/analysis.rb +100 -0
  47. data/lib/statsample-ekatena/analysis/suite.rb +89 -0
  48. data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
  49. data/lib/statsample-ekatena/anova.rb +24 -0
  50. data/lib/statsample-ekatena/anova/contrast.rb +79 -0
  51. data/lib/statsample-ekatena/anova/oneway.rb +187 -0
  52. data/lib/statsample-ekatena/anova/twoway.rb +207 -0
  53. data/lib/statsample-ekatena/bivariate.rb +406 -0
  54. data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
  55. data/lib/statsample-ekatena/codification.rb +182 -0
  56. data/lib/statsample-ekatena/converter/csv.rb +28 -0
  57. data/lib/statsample-ekatena/converter/spss.rb +48 -0
  58. data/lib/statsample-ekatena/converters.rb +211 -0
  59. data/lib/statsample-ekatena/crosstab.rb +188 -0
  60. data/lib/statsample-ekatena/daru.rb +115 -0
  61. data/lib/statsample-ekatena/dataset.rb +10 -0
  62. data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
  63. data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
  64. data/lib/statsample-ekatena/factor.rb +104 -0
  65. data/lib/statsample-ekatena/factor/map.rb +124 -0
  66. data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
  67. data/lib/statsample-ekatena/factor/pca.rb +242 -0
  68. data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
  69. data/lib/statsample-ekatena/factor/rotation.rb +198 -0
  70. data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
  71. data/lib/statsample-ekatena/formula/formula.rb +306 -0
  72. data/lib/statsample-ekatena/graph.rb +11 -0
  73. data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
  74. data/lib/statsample-ekatena/graph/histogram.rb +198 -0
  75. data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
  76. data/lib/statsample-ekatena/histogram.rb +180 -0
  77. data/lib/statsample-ekatena/matrix.rb +329 -0
  78. data/lib/statsample-ekatena/multiset.rb +310 -0
  79. data/lib/statsample-ekatena/regression.rb +65 -0
  80. data/lib/statsample-ekatena/regression/multiple.rb +89 -0
  81. data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
  82. data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
  83. data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
  84. data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
  85. data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
  86. data/lib/statsample-ekatena/regression/simple.rb +121 -0
  87. data/lib/statsample-ekatena/reliability.rb +150 -0
  88. data/lib/statsample-ekatena/reliability/icc.rb +415 -0
  89. data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
  90. data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
  91. data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
  92. data/lib/statsample-ekatena/resample.rb +15 -0
  93. data/lib/statsample-ekatena/shorthand.rb +125 -0
  94. data/lib/statsample-ekatena/srs.rb +169 -0
  95. data/lib/statsample-ekatena/test.rb +82 -0
  96. data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
  97. data/lib/statsample-ekatena/test/chisquare.rb +73 -0
  98. data/lib/statsample-ekatena/test/f.rb +52 -0
  99. data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
  100. data/lib/statsample-ekatena/test/levene.rb +88 -0
  101. data/lib/statsample-ekatena/test/t.rb +309 -0
  102. data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
  103. data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
  104. data/lib/statsample-ekatena/vector.rb +19 -0
  105. data/lib/statsample-ekatena/version.rb +3 -0
  106. data/lib/statsample.rb +282 -0
  107. data/po/es/statsample.mo +0 -0
  108. data/po/es/statsample.po +959 -0
  109. data/po/statsample.pot +947 -0
  110. data/references.txt +24 -0
  111. data/statsample-ekatena.gemspec +49 -0
  112. data/test/fixtures/bank2.dat +200 -0
  113. data/test/fixtures/correlation_matrix.rb +17 -0
  114. data/test/fixtures/df.csv +15 -0
  115. data/test/fixtures/hartman_23.matrix +9 -0
  116. data/test/fixtures/stock_data.csv +500 -0
  117. data/test/fixtures/tetmat_matrix.txt +5 -0
  118. data/test/fixtures/tetmat_test.txt +1001 -0
  119. data/test/helpers_tests.rb +83 -0
  120. data/test/test_analysis.rb +176 -0
  121. data/test/test_anova_contrast.rb +36 -0
  122. data/test/test_anovaoneway.rb +26 -0
  123. data/test/test_anovatwoway.rb +37 -0
  124. data/test/test_anovatwowaywithdataset.rb +47 -0
  125. data/test/test_anovawithvectors.rb +102 -0
  126. data/test/test_awesome_print_bug.rb +16 -0
  127. data/test/test_bartlettsphericity.rb +25 -0
  128. data/test/test_bivariate.rb +164 -0
  129. data/test/test_codification.rb +78 -0
  130. data/test/test_crosstab.rb +67 -0
  131. data/test/test_dominance_analysis.rb +39 -0
  132. data/test/test_factor.rb +228 -0
  133. data/test/test_factor_map.rb +38 -0
  134. data/test/test_factor_pa.rb +56 -0
  135. data/test/test_fit_model.rb +88 -0
  136. data/test/test_ggobi.rb +35 -0
  137. data/test/test_gsl.rb +15 -0
  138. data/test/test_histogram.rb +109 -0
  139. data/test/test_matrix.rb +48 -0
  140. data/test/test_multiset.rb +176 -0
  141. data/test/test_regression.rb +231 -0
  142. data/test/test_reliability.rb +223 -0
  143. data/test/test_reliability_icc.rb +198 -0
  144. data/test/test_reliability_skillscale.rb +57 -0
  145. data/test/test_resample.rb +24 -0
  146. data/test/test_srs.rb +9 -0
  147. data/test/test_statistics.rb +69 -0
  148. data/test/test_stest.rb +69 -0
  149. data/test/test_stratified.rb +17 -0
  150. data/test/test_test_f.rb +33 -0
  151. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  152. data/test/test_test_t.rb +62 -0
  153. data/test/test_umannwhitney.rb +27 -0
  154. data/test/test_vector.rb +12 -0
  155. data/test/test_wilcoxonsignedrank.rb +64 -0
  156. metadata +570 -0
@@ -0,0 +1,207 @@
1
+ module Statsample
2
+ module Anova
3
+ # = Generic Anova two-way.
4
+ # You could enter the sum of squares or the mean squares for a, b, axb and within.
5
+ # You should enter the degrees of freedom for a,b and within, because df_axb=df_a*df_b
6
+ # == Usage
7
+ # anova=Statsample::Anova::TwoWay.new(:ss_a=>10, :ss_b=>20, :ss_axb=>10, :ss_within=>20, :df_a=>2, :df_b=>3, :df_within=>100, :name=>"ANOVA for....")
8
# Generic two-way ANOVA built from already-computed summary values.
#
# The constructor takes an options hash with:
# * :df_a, :df_b, :df_within (all mandatory); df_axb is derived as df_a*df_b
# * either all of :ss_a, :ss_b, :ss_axb, :ss_within
#   or all of :ms_a, :ms_b, :ms_axb, :ms_within
# * optional :name, :name_a, :name_b, :name_within labels
#
# Raises ArgumentError when a degree of freedom is missing and RuntimeError
# when neither a full set of sums of squares nor of mean squares is given.
class TwoWay
  include Summarizable
  attr_reader :df_a, :df_b, :df_axb, :df_within, :df_total
  attr_reader :ss_a, :ss_b, :ss_axb, :ss_within, :ss_total
  attr_reader :ms_a, :ms_b, :ms_axb, :ms_within, :ms_total
  # Name of ANOVA Analysis
  attr_accessor :name
  # Name of a factor
  attr_accessor :name_a
  # Name of b factor
  attr_accessor :name_b
  # Name of within factor
  attr_accessor :name_within

  # F test objects for factor a, factor b and the a x b interaction
  attr_reader :f_a_object, :f_b_object, :f_axb_object

  def initialize(opts=Hash.new)
    # Degrees of freedom are always required
    raise ArgumentError, "You should set all d.f." unless [:df_a, :df_b, :df_within].all? {|v| opts.has_key? v}

    # Work on a shallow copy: the deletes below consume keys and must not
    # strip them from the hash owned by the caller.
    opts = opts.dup

    @df_a=opts.delete :df_a
    @df_b=opts.delete :df_b
    @df_axb=@df_a*@df_b
    @df_within=opts.delete :df_within
    @df_total=@df_a+@df_b+@df_axb+@df_within

    if [:ss_a, :ss_b, :ss_axb, :ss_within].all? {|v| opts.has_key? v}
      # Sums of squares given: derive the mean squares
      @ss_a = opts.delete :ss_a
      @ss_b = opts.delete :ss_b
      @ss_axb = opts.delete :ss_axb
      @ss_within = opts.delete :ss_within

      @ms_a =@ss_a.quo(@df_a)
      @ms_b =@ss_b.quo(@df_b)
      @ms_axb =@ss_axb.quo(@df_axb)
      @ms_within =@ss_within.quo(@df_within)

    elsif [:ms_a, :ms_b, :ms_axb, :ms_within].all? {|v| opts.has_key? v}
      # Mean squares given: derive the sums of squares
      @ms_a = opts.delete :ms_a
      @ms_b = opts.delete :ms_b
      @ms_axb = opts.delete :ms_axb
      @ms_within = opts.delete :ms_within

      @ss_a =@ms_a*@df_a
      @ss_b =@ms_b*@df_b
      @ss_axb =@ms_axb*@df_axb
      @ss_within =@ms_within*@df_within
    else
      raise "You should set all ss or ms"
    end
    @ss_total=@ss_a+@ss_b+@ss_axb+@ss_within
    # NOTE(review): ms_total is the plain sum of the four mean squares, not
    # ss_total/df_total -- confirm this is the intended definition.
    @ms_total=@ms_a+@ms_b+@ms_axb+@ms_within
    opts_default={:name=>_("ANOVA Two-Way"),
                  :name_a=>_("A"),
                  :name_b=>_("B"),
                  :name_within=>_("Within")
    }
    @opts=opts_default.merge(opts)
    # Only the label options are assigned through their writers
    opts_default.keys.each {|k|
      send("#{k}=", @opts[k])
    }
    # Each effect is tested against the within mean square
    @f_a_object=Statsample::Test::F.new(@ms_a,@ms_within,@df_a,@df_within)
    @f_b_object=Statsample::Test::F.new(@ms_b,@ms_within,@df_b,@df_within)
    @f_axb_object=Statsample::Test::F.new(@ms_axb,@ms_within,@df_axb,@df_within)
  end

  # F value for factor a
  def f_a
    @f_a_object.f
  end

  # F value for factor b
  def f_b
    @f_b_object.f
  end

  # F value for the a x b interaction
  def f_axb
    @f_axb_object.f
  end

  # Probability reported by the F test of factor a
  def f_a_probability
    @f_a_object.probability
  end

  # Probability reported by the F test of factor b
  def f_b_probability
    @f_b_object.probability
  end

  # Probability reported by the F test of the a x b interaction
  def f_axb_probability
    @f_axb_object.probability
  end

  def report_building(builder) #:nodoc:
    builder.section(:name=>@name) do |b|
      report_building_table(b)
    end
  end

  # Adds the ANOVA source table: one row per effect, then within and total
  def report_building_table(builder) #:nodoc:
    builder.table(:name=>_("%s Table") % @name, :header=>%w{source ss df ms f p}.map {|v| _(v)}) do |t|
      t.row([@name_a, "%0.3f" % @ss_a, @df_a, "%0.3f" % @ms_a , "%0.3f" % f_a, "%0.4f" % f_a_probability] )
      t.row([@name_b, "%0.3f" % @ss_b, @df_b, "%0.3f" % @ms_b , "%0.3f" % f_b, "%0.4f" % f_b_probability] )
      t.row(["%s X %s" % [@name_a, @name_b], "%0.3f" % @ss_axb, @df_axb, "%0.3f" % @ms_axb , "%0.3f" % f_axb, "%0.4f" % f_axb_probability] )
      t.row([@name_within, "%0.3f" % @ss_within, @df_within, nil,nil,nil] )
      t.row([_("Total"), "%0.3f" % @ss_total, @df_total, nil,nil,nil] )
    end
  end
end
107
+
108
+ # Two Way Anova with vectors
109
+ # Example:
110
+ # v1 = Daru::Vector.new([1,1,2,2])
111
+ # v2 = Daru::Vector.new([1,2,1,2])
112
+ # v3 = Daru::Vector.new([5,3,1,5])
113
+ # anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3)
114
+ #
115
# Two-way ANOVA computed from three vectors (factor a, factor b and the
# dependent variable). Sums of squares and degrees of freedom are derived
# here and handed to TwoWay#initialize.
#
# Options: :a, :b and :dependent are mandatory; :name, :name_a, :name_b,
# :summary_descriptives (default true) and :summary_levene (default false)
# are optional.
class TwoWayWithVectors < TwoWay
  # Show summary Levene test
  attr_accessor :summary_levene
  # Show summary descriptives for variables (means)
  attr_accessor :summary_descriptives
  attr_reader :a_var, :b_var, :dep_var

  # For now, only equal sample cells allowed.
  # Raises RuntimeError when a mandatory vector is missing or when the
  # a x b cells differ in size.
  def initialize(opts=Hash.new)
    raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v}
    @a_var = :a
    @b_var = :b
    @dep_var = :dependent
    @a_vector, @b_vector, @dep_vector =
      Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]

    ds = Daru::DataFrame.new({@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector})
    @ds = ds.clone_only_valid
    _p = @a_vector.factors.size   # number of levels of a
    _q = @b_vector.factors.size   # number of levels of b
    @x_general = @dep_vector.mean # grand mean
    @axb_means = {}
    @axb_sd = {}
    @vectors = []
    n=nil
    # Per-cell statistics; n is the common cell size
    @ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v|
      @axb_means[k] = v.mean
      @axb_sd[k] = v.sd
      @vectors << v
      n ||= v.size
      raise "All cell sizes should be equal" if n!=v.size
    }

    @a_means={}
    @ds.to_multiset_by_split(a_var).each_vector(dep_var) {|k,v|
      @a_means[k]=v.mean
    }
    @b_means={}
    @ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v|
      @b_means[k]=v.mean
    }
    ss_a = n*_q*@ds[a_var].factors.inject(0) {|ac,v|
      ac + (@a_means[v]-@x_general)**2
    }
    ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v|
      ac+(@b_means[v]-@x_general)**2
    }
    ss_within = @ds.collect(:row) { |row|
      (row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2
    }.sum
    ss_axb = n*@axb_means.inject(0) {|ac,v|
      j,k=v[0]
      xjk=v[1]
      ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2
    }

    df_a=_p-1
    df_b=_q-1
    df_within=(_p*_q)*(n-1)

    opts_default={:name=>_("Anova Two-Way on %s") % @ds[dep_var].name,
                  :name_a=>@ds[a_var].name,
                  :name_b=>@ds[b_var].name,
                  :summary_descriptives=>true,
                  :summary_levene=>false}

    merged = opts_default.merge(opts)
    @opts=merged.merge({:ss_a=>ss_a,:ss_b=>ss_b, :ss_axb=>ss_axb, :ss_within=>ss_within, :df_a=>df_a, :df_b=>df_b, :df_within=>df_within})

    super(@opts)

    # The parent constructor only assigns the name options, so the two
    # summary flags are applied here; otherwise the defaults above and any
    # value passed by the caller would be ignored.
    @summary_descriptives = merged[:summary_descriptives]
    @summary_levene = merged[:summary_levene]
  end

  # Levene test over the dependent values of every a x b cell
  def levene
    Statsample::Test.levene(@vectors, :name=>_("Test of Homogeneity of variances (Levene)"))
  end

  # Builds the report section: optional table of cell/marginal means,
  # optional Levene test, then the ANOVA table.
  def report_building(builder) #:nodoc:#
    builder.section(:name=>@name) do |s|
      if summary_descriptives
        s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].index_of(a)}+[_("%s Mean") % @name_b]) do |t|
          @ds[b_var].factors.each do |b|
            t.row([@ds[b_var].index_of(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
          end
          t.row([_("%s Mean") % @name_a]+@ds[a_var].factors.map {|a| "%0.3f" % @a_means[a]}+ ["%0.3f" % @x_general])
        end
      end
      if summary_levene
        s.parse_element(levene)
      end
      report_building_table(s)

    end
  end
end
206
+ end
207
+ end
@@ -0,0 +1,406 @@
1
+ require 'statsample/bivariate/pearson'
2
+ module Statsample
3
+ # Diverse methods and classes to calculate bivariate relations
4
+ # Specific classes:
5
+ # * Statsample::Bivariate::Pearson : Pearson correlation coefficient (r)
6
+ # * Statsample::Bivariate::Tetrachoric : Tetrachoric correlation
7
+ # * Statsample::Bivariate::Polychoric : Polychoric correlation (using joint, two-step and polychoric series)
8
+ module Bivariate
9
+ autoload(:Polychoric, 'statsample/bivariate/polychoric')
10
+ autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric')
11
+ class << self
12
# Covariance between two vectors.
# Both vectors first go through Statsample.only_valid_clone; nil is
# returned when nothing is left. GSL is used when Statsample.has_gsl? is
# true, otherwise the pure-Ruby covariance_slow.
def covariance(v1,v2)
  xs, ys = Statsample.only_valid_clone(v1, v2)

  return nil if xs.size == 0
  return covariance_slow(xs, ys) unless Statsample.has_gsl?

  GSL::Stats::covariance(xs.to_gsl, ys.to_gsl)
end
23
# Estimate the ML between two dichotomic vectors.
# Adds real*log(pred) + (1-real)*log(1-pred) for every index of the
# filtered prediction vector and returns the total.
def maximum_likehood_dichotomic(pred,real)
  predicted, observed = Statsample.only_valid_clone(pred, real)
  total = 0
  predicted.each_index do |i|
    total += (observed[i] * Math.log(predicted[i])) + ((1 - observed[i]) * Math.log(1 - predicted[i]))
  end
  total
end
32
+
33
# Pure-Ruby covariance: sum of cross products divided by (n - 1).
def covariance_slow(v1,v2) # :nodoc:
  xs, ys = Statsample.only_valid(v1, v2)
  cross_products = sum_of_squares(xs, ys)
  cross_products / (xs.size - 1)
end
37
# Sum over all cases of (x - mean_x) * (y - mean_y), computed on the
# vectors returned by Statsample.only_valid_clone after re-indexing them.
def sum_of_squares(v1,v2)
  xs, ys = Statsample.only_valid_clone(v1, v2)
  xs.reset_index!
  ys.reset_index!
  mean_x = xs.mean
  mean_y = ys.mean
  total = 0
  xs.size.times do |i|
    total = total + (xs[i] - mean_x) * (ys[i] - mean_y)
  end
  total
end
45
# Calculate Pearson correlation coefficient (r) between 2 vectors.
# Returns nil when Statsample.only_valid_clone leaves no cases; uses GSL
# when Statsample.has_gsl? is true and pearson_slow otherwise.
def pearson(v1,v2)
  xs, ys = Statsample.only_valid_clone(v1, v2)
  return nil if xs.size == 0
  return pearson_slow(xs, ys) unless Statsample.has_gsl?

  GSL::Stats::correlation(xs.to_gsl, ys.to_gsl)
end
55
# Pure-Ruby Pearson r: cross-product sum divided by the product of the
# square roots of each vector's own sum_of_squares.
def pearson_slow(v1,v2) # :nodoc:
  xs, ys = Statsample.only_valid_clone(v1, v2)

  cross_products = sum_of_squares(xs, ys)
  scale = Math.sqrt(xs.sum_of_squares) * Math.sqrt(ys.sum_of_squares)
  cross_products.quo(scale)
end
62
+ alias :correlation :pearson
63
# Retrieves the value for t test for a pearson correlation
# between two vectors to test the null hypothesis of r=0.
# Returns 0 when r equals 1.0; otherwise delegates to t_r with the number
# of cases left by Statsample.only_valid_clone.
def t_pearson(v1,v2)
  xs, ys = Statsample.only_valid_clone(v1, v2)
  r = pearson(xs, ys)
  return 0 if r == 1.0

  t_r(r, xs.size)
end
74
# Retrieves the value for t test for a pearson correlation
# giving r and vector size: r * sqrt((size - 2) / (1 - r**2)).
# Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
def t_r(r,size)
  degrees = (size - 2).to_f
  r * Math.sqrt(degrees / (1 - r**2))
end
80
# Retrieves the probability value (a la SPSS)
# for a given t, size and number of tails.
# The third parameter selects the tails:
# * :both or 2 : for r!=0 (default)
# * :right, :positive or 1 : for r > 0
# * :left, :negative : for r < 0
# The cdf comes from Distribution::T.cdf with size-2 degrees of freedom.
def prop_pearson(t, size, tails=:both)
  # Normalize the accepted aliases to :both / :right / :left
  if tails == 2
    tails = :both
  elsif tails == 1 || tails == :positive
    tails = :right
  elsif tails == :negative
    tails = :left
  end

  two_sided = (tails == :both)
  n_tails = two_sided ? 2 : 1
  # Two-sided: always evaluate the lower tail, doubled below
  t = -t if t > 0 && two_sided
  cdf = Distribution::T.cdf(t, size - 2)
  return 1.0 - (cdf * n_tails) if tails == :right

  cdf * n_tails
end
104
+
105
+
106
# Predicted time for pairwise correlation matrix, in milliseconds.
# See benchmarks/correlation_matrix.rb to see mode of calculation.
def prediction_pairwise(vars,cases)
  fitted = -0.518111-0.000746*cases+1.235608*vars+0.000740*cases*vars
  (fitted**2) / 100
end
112
# Predicted time for optimized correlation matrix, in milliseconds.
# See benchmarks/correlation_matrix.rb to see mode of calculation.
def prediction_optimized(vars,cases)
  fitted = 4+0.018128*cases+0.246871*vars+0.001169*vars*cases
  (fitted**2) / 100
end
118
# Returns residual score after delete variance
# from another variable: for every case, standardized +from+ minus
# r * standardized +del+, or nil when either standardized value is nil.
def residuals(from,del)
  r = Statsample::Bivariate.pearson(from, del)
  z_from = from.vector_standarized
  z_del = del.vector_standarized
  z_from.reset_index!
  z_del.reset_index!
  scores = []
  z_from.each_index do |i|
    score = if z_from[i].nil? || z_del[i].nil?
              nil
            else
              z_from[i] - r * z_del[i]
            end
    scores.push(score)
  end
  Daru::Vector.new(scores)
end
136
# Correlation between v1 and v2, controlling the effect of
# control on both.
def partial_correlation(v1,v2,control)
  xs, ys, zs = Statsample.only_valid_clone(v1, v2, control)
  r_xy = pearson(xs, ys)
  r_xz = pearson(xs, zs)
  r_yz = pearson(ys, zs)
  numerator = r_xy - (r_xz * r_yz)
  denominator = Math.sqrt(1 - r_xz**2) * Math.sqrt(1 - r_yz**2)
  numerator.quo(denominator)
end
145
+
146
# Covariance matrix computed with GSL matrix algebra: the data are
# centered on their column means, then X'X is divided by (n - 1).
def covariance_matrix_optimized(ds)
  data = ds.to_gsl
  rows = data.row_size
  cols = data.column_size
  column_means = ((1/rows.to_f)*GSL::Matrix.ones(1,rows)*data).row(0)
  deviations = data-(GSL::Matrix.ones(rows,cols)*GSL::Matrix.diag(column_means))
  cross_products = deviations.transpose*deviations
  ((1/(rows-1).to_f))*cross_products
end
156
+
157
# Covariance matrix.
# Order of rows and columns follows ds.vectors.
# The GSL path is taken only when the dataset holds no missing values, GSL
# is available and its predicted time beats the pairwise prediction.
def covariance_matrix(ds)
  vars, cases = ds.ncols, ds.nrows
  use_optimized = !ds.include_values?(*Daru::MISSING_VALUES) &&
                  Statsample.has_gsl? &&
                  prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
  cm = use_optimized ? covariance_matrix_optimized(ds) : covariance_matrix_pairwise(ds)
  cm.extend(Statsample::CovariateMatrix)
  cm.fields = ds.vectors.to_a
  cm
end
171
+
172
+
173
# Covariance matrix built cell by cell. Cells involving a non-numeric
# vector are nil, the diagonal holds each vector's variance, and each
# off-diagonal covariance is computed once and reused for its mirror cell.
def covariance_matrix_pairwise(ds)
  memo = {}
  names = ds.vectors.to_a
  mat_rows = names.map do |row|
    names.map do |col|
      next nil if ds[row].type != :numeric || ds[col].type != :numeric
      next ds[row].variance if row == col
      next memo[[col,row]] unless memo[[col,row]].nil?

      memo[[row,col]] = covariance(ds[row], ds[col])
    end
  end

  Matrix.rows mat_rows
end
196
+
197
# Correlation matrix.
# Order of rows and columns follows ds.vectors.
# The GSL path is taken only when the dataset holds no missing values, GSL
# is available and its predicted time beats the pairwise prediction.
def correlation_matrix(ds)
  vars, cases = ds.ncols, ds.nrows
  use_optimized = !ds.include_values?(*Daru::MISSING_VALUES) &&
                  Statsample.has_gsl? &&
                  prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
  cm = use_optimized ? correlation_matrix_optimized(ds) : correlation_matrix_pairwise(ds)
  cm.extend(Statsample::CovariateMatrix)
  cm.fields = ds.vectors.to_a
  cm
end
210
+
211
# Correlation matrix derived from the GSL covariance matrix by scaling
# rows and columns with 1/sd; the diagonal is then forced to exactly 1.0.
def correlation_matrix_optimized(ds)
  cov = covariance_matrix_optimized(ds)
  inverse_sd = GSL::Matrix.diagonal(cov.diagonal.sqrt.pow(-1))
  cm = inverse_sd*cov*inverse_sd
  # Fix diagonal
  (0...cov.row_size).each do |i|
    cm[i,i] = 1.0
  end
  cm
end
221
# Correlation matrix built cell by cell. The diagonal is 1.0, cells
# involving a non-numeric vector are nil, and each off-diagonal r is
# computed once and reused for its mirror cell.
def correlation_matrix_pairwise(ds)
  memo = {}
  names = ds.vectors.to_a
  cm = names.map do |row|
    names.map do |col|
      next 1.0 if row == col
      next nil if ds[row].type != :numeric || ds[col].type != :numeric
      next memo[[col,row]] unless memo[[col,row]].nil?

      memo[[row,col]] = pearson(ds[row], ds[col])
    end
  end

  Matrix.rows cm
end
244
+
245
# Retrieves the n valid pairwise.
# Diagonal cells count the vector's own values left after rejecting
# Daru::MISSING_VALUES; other cells count the cases returned by
# Statsample.only_valid_clone for the pair.
def n_valid_matrix(ds)
  names = ds.vectors.to_a
  counts = names.map do |row|
    names.map do |col|
      next ds[row].reject_values(*Daru::MISSING_VALUES).size if row == col

      valid_row, _valid_col = Statsample.only_valid_clone(ds[row], ds[col])
      valid_row.size
    end
  end

  Matrix.rows counts
end
261
+
262
# Matrix of correlation probabilities.
# Order of rows and columns follows ds.vectors.
# Diagonal cells and cells involving a non-numeric vector are nil; every
# other cell is prop_pearson of the pair's t value, using the number of
# cases returned by Statsample.only_valid_clone for that pair.
#
# +tails+ is forwarded unchanged to prop_pearson.
def correlation_probability_matrix(ds, tails=:both)
  # Use the same accessor as the other matrix builders in this module
  # (ds.vectors) instead of the legacy Dataset#fields.
  names = ds.vectors.to_a
  rows = names.collect do |row|
    names.collect do |col|
      next nil if row == col || ds[row].type != :numeric || ds[col].type != :numeric

      # Only filter the pair when a probability is actually computed
      valid_row, _valid_col = Statsample.only_valid_clone(ds[row], ds[col])
      prop_pearson(t_pearson(ds[row], ds[col]), valid_row.size, tails)
    end
  end
  Matrix.rows(rows)
end
274
+
275
# Spearman ranked correlation coefficient (rho) between 2 vectors:
# the Pearson correlation of the two ranked vectors.
def spearman(v1,v2)
  xs, ys = Statsample.only_valid_clone(v1, v2)
  x_ranks = xs.ranked
  y_ranks = ys.ranked
  pearson(x_ranks, y_ranks)
end
281
# Calculate Point biserial correlation. Equal to Pearson correlation, with
# one dichotomous value replaced by "0" and the other by "1".
# Raises TypeError when the first vector does not have exactly two factors
# or the second is not numeric.
def point_biserial(dichotomous,continous)
  ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES)
  raise(TypeError, "First vector should be dichotomous") unless ds[:d].factors.size == 2
  raise(TypeError, "Second vector should be continous") unless ds[:c].type == :numeric

  # The smallest factor plays the role of group "0"
  base = ds[:d].factors.sort.to_a[0]
  group0 = ds.filter_vector(:c) { |r| r[:d] == base }
  group1 = ds.filter_vector(:c) { |r| r[:d] != base }
  mean_gap = (group1.mean - group0.mean).to_f / ds[:c].sdp
  mean_gap * Math.sqrt(group0.size*group1.size.to_f / ds.nrows**2)
end
292
# Kendall Rank Correlation Coefficient (Tau a)
# Based on Hervé Abdi article.
#
# Both vectors go through Statsample.only_valid_clone and are ranked; the
# coefficient is 1 - 2*delta / (n*(n-1)), where delta counts the ordered
# pairs not shared by the two rankings.
def tau_a(v1,v2)
  v1a,v2a=Statsample.only_valid_clone(v1,v2)
  # n must be the size of the filtered vector: the pairs below are built
  # from v1a/v2a, so using the raw v1 would inflate the denominator
  # whenever only_valid_clone removed cases.
  n=v1a.size
  v1r,v2r=v1a.ranked,v2a.ranked
  o1=ordered_pairs(v1r)
  o2=ordered_pairs(v2r)
  delta= o1.size*2-(o2 & o1).size*2
  1-(delta * 2 / (n*(n-1)).to_f)
end
303
# Calculates Goodman and Kruskal’s Tau b correlation.
# Tb is an asymmetric P-R-E measure of association for nominal scales
# (Mielke, X)
#
# Tau-b defines perfect association as strict monotonicity. Although it
# requires strict monotonicity to reach 1.0, it does not penalize ties as
# much as some other measures.
#
# Computed as (P - Q) / sqrt((P + Q + Y) * (P + Q + X)) from the counts
# returned by pairs.
# == Reference
# Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA.
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
def tau_b(matrix)
  counts = pairs(matrix)
  surplus = (counts['P']-counts['Q']).to_f
  scale = Math.sqrt((counts['P']+counts['Q']+counts['Y'])*(counts['P']+counts['Q']+counts['X'])).to_f
  surplus / scale
end
317
# Calculates Goodman and Kruskal's gamma.
#
# Gamma is the surplus of concordant pairs over discordant pairs, as a
# percentage of all pairs ignoring ties: (P - Q) / (P + Q).
#
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
def gamma(matrix)
  counts = pairs(matrix)
  surplus = (counts['P']-counts['Q']).to_f
  surplus / (counts['P']+counts['Q']).to_f
end
327
# Calculate pair counts for a contingency matrix; the rows and cols have
# to be ordered. Returns a hash with
# 'P' (concordant), 'Q' (discordant), 'Y' (tied within a row) and
# 'X' (tied within a column).
def pairs(matrix)
  rs = matrix.row_size
  cs = matrix.column_size
  conc = disc = ties_x = ties_y = 0
  (0...rs).each do |x|
    rows_below = (x+1)...rs
    (0...cs).each do |y|
      cols_right = (y+1)...cs
      cols_left = 0...y
      rows_below.each do |x2|
        # concordant: cell below and to the right
        cols_right.each { |y2| conc += matrix[x,y]*matrix[x2,y2] }
        # discordant: cell below and to the left
        cols_left.each { |y2| disc += matrix[x,y]*matrix[x2,y2] }
        # same column, lower row
        ties_x += matrix[x,y]*matrix[x2,y]
      end
      # same row, column to the right
      cols_right.each { |y2| ties_y += matrix[x,y]*matrix[x,y2] }
    end
  end
  {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
end
369
+
370
# Every pair [d[i], d[j]] with i < j taken from vector.to_a, listed in
# index order.
def ordered_pairs(vector)
  vector.to_a.combination(2).to_a
end
380
+ =begin
381
+ def sum_of_codeviated(v1,v2)
382
+ v1a,v2a=Statsample.only_valid(v1,v2)
383
+ sum=0
384
+ (0...v1a.size).each{|i|
385
+ sum+=v1a[i]*v2a[i]
386
+ }
387
+ sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
388
+ end
389
+ =end
390
# Report the minimum number of cases valid of a covariate matrix
# based on a dataset: the smallest cell of n_valid_matrix, capped by the
# dataset's row count.
def min_n_valid(ds)
  smallest = ds.nrows
  counts = n_valid_matrix(ds)
  counts.row_size.times do |x|
    counts.column_size.times do |y|
      smallest = counts[x,y] if counts[x,y] < smallest
    end
  end
  smallest
end
402
+ end
403
+ end
404
+ end
405
+
406
+