alglib 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/History.txt +7 -0
  2. data/Manifest.txt +253 -0
  3. data/README.txt +33 -0
  4. data/Rakefile +27 -0
  5. data/ext/Rakefile +24 -0
  6. data/ext/alglib.i +24 -0
  7. data/ext/alglib/Makefile +157 -0
  8. data/ext/alglib/airyf.cpp +372 -0
  9. data/ext/alglib/airyf.h +81 -0
  10. data/ext/alglib/alglib.cpp +8558 -0
  11. data/ext/alglib/alglib_util.cpp +19 -0
  12. data/ext/alglib/alglib_util.h +14 -0
  13. data/ext/alglib/ap.cpp +877 -0
  14. data/ext/alglib/ap.english.html +364 -0
  15. data/ext/alglib/ap.h +666 -0
  16. data/ext/alglib/ap.russian.html +442 -0
  17. data/ext/alglib/apvt.h +754 -0
  18. data/ext/alglib/bdss.cpp +1500 -0
  19. data/ext/alglib/bdss.h +251 -0
  20. data/ext/alglib/bdsvd.cpp +1339 -0
  21. data/ext/alglib/bdsvd.h +164 -0
  22. data/ext/alglib/bessel.cpp +1226 -0
  23. data/ext/alglib/bessel.h +331 -0
  24. data/ext/alglib/betaf.cpp +105 -0
  25. data/ext/alglib/betaf.h +74 -0
  26. data/ext/alglib/bidiagonal.cpp +1328 -0
  27. data/ext/alglib/bidiagonal.h +350 -0
  28. data/ext/alglib/binomialdistr.cpp +247 -0
  29. data/ext/alglib/binomialdistr.h +153 -0
  30. data/ext/alglib/blas.cpp +576 -0
  31. data/ext/alglib/blas.h +132 -0
  32. data/ext/alglib/cblas.cpp +226 -0
  33. data/ext/alglib/cblas.h +57 -0
  34. data/ext/alglib/cdet.cpp +138 -0
  35. data/ext/alglib/cdet.h +92 -0
  36. data/ext/alglib/chebyshev.cpp +216 -0
  37. data/ext/alglib/chebyshev.h +76 -0
  38. data/ext/alglib/chisquaredistr.cpp +157 -0
  39. data/ext/alglib/chisquaredistr.h +144 -0
  40. data/ext/alglib/cholesky.cpp +285 -0
  41. data/ext/alglib/cholesky.h +86 -0
  42. data/ext/alglib/cinverse.cpp +298 -0
  43. data/ext/alglib/cinverse.h +111 -0
  44. data/ext/alglib/clu.cpp +337 -0
  45. data/ext/alglib/clu.h +120 -0
  46. data/ext/alglib/correlation.cpp +280 -0
  47. data/ext/alglib/correlation.h +77 -0
  48. data/ext/alglib/correlationtests.cpp +726 -0
  49. data/ext/alglib/correlationtests.h +134 -0
  50. data/ext/alglib/crcond.cpp +826 -0
  51. data/ext/alglib/crcond.h +148 -0
  52. data/ext/alglib/creflections.cpp +310 -0
  53. data/ext/alglib/creflections.h +165 -0
  54. data/ext/alglib/csolve.cpp +312 -0
  55. data/ext/alglib/csolve.h +99 -0
  56. data/ext/alglib/ctrinverse.cpp +387 -0
  57. data/ext/alglib/ctrinverse.h +98 -0
  58. data/ext/alglib/ctrlinsolve.cpp +297 -0
  59. data/ext/alglib/ctrlinsolve.h +81 -0
  60. data/ext/alglib/dawson.cpp +234 -0
  61. data/ext/alglib/dawson.h +74 -0
  62. data/ext/alglib/descriptivestatistics.cpp +436 -0
  63. data/ext/alglib/descriptivestatistics.h +112 -0
  64. data/ext/alglib/det.cpp +140 -0
  65. data/ext/alglib/det.h +94 -0
  66. data/ext/alglib/dforest.cpp +1819 -0
  67. data/ext/alglib/dforest.h +316 -0
  68. data/ext/alglib/elliptic.cpp +497 -0
  69. data/ext/alglib/elliptic.h +217 -0
  70. data/ext/alglib/estnorm.cpp +429 -0
  71. data/ext/alglib/estnorm.h +107 -0
  72. data/ext/alglib/expintegrals.cpp +422 -0
  73. data/ext/alglib/expintegrals.h +108 -0
  74. data/ext/alglib/faq.english.html +258 -0
  75. data/ext/alglib/faq.russian.html +272 -0
  76. data/ext/alglib/fdistr.cpp +202 -0
  77. data/ext/alglib/fdistr.h +163 -0
  78. data/ext/alglib/fresnel.cpp +211 -0
  79. data/ext/alglib/fresnel.h +91 -0
  80. data/ext/alglib/gammaf.cpp +338 -0
  81. data/ext/alglib/gammaf.h +104 -0
  82. data/ext/alglib/gqgengauss.cpp +235 -0
  83. data/ext/alglib/gqgengauss.h +92 -0
  84. data/ext/alglib/gqgenhermite.cpp +268 -0
  85. data/ext/alglib/gqgenhermite.h +63 -0
  86. data/ext/alglib/gqgenjacobi.cpp +297 -0
  87. data/ext/alglib/gqgenjacobi.h +72 -0
  88. data/ext/alglib/gqgenlaguerre.cpp +265 -0
  89. data/ext/alglib/gqgenlaguerre.h +69 -0
  90. data/ext/alglib/gqgenlegendre.cpp +300 -0
  91. data/ext/alglib/gqgenlegendre.h +62 -0
  92. data/ext/alglib/gqgenlobatto.cpp +305 -0
  93. data/ext/alglib/gqgenlobatto.h +97 -0
  94. data/ext/alglib/gqgenradau.cpp +232 -0
  95. data/ext/alglib/gqgenradau.h +95 -0
  96. data/ext/alglib/hbisinv.cpp +480 -0
  97. data/ext/alglib/hbisinv.h +183 -0
  98. data/ext/alglib/hblas.cpp +228 -0
  99. data/ext/alglib/hblas.h +64 -0
  100. data/ext/alglib/hcholesky.cpp +339 -0
  101. data/ext/alglib/hcholesky.h +91 -0
  102. data/ext/alglib/hermite.cpp +114 -0
  103. data/ext/alglib/hermite.h +49 -0
  104. data/ext/alglib/hessenberg.cpp +370 -0
  105. data/ext/alglib/hessenberg.h +152 -0
  106. data/ext/alglib/hevd.cpp +247 -0
  107. data/ext/alglib/hevd.h +107 -0
  108. data/ext/alglib/hsschur.cpp +1316 -0
  109. data/ext/alglib/hsschur.h +108 -0
  110. data/ext/alglib/htridiagonal.cpp +734 -0
  111. data/ext/alglib/htridiagonal.h +180 -0
  112. data/ext/alglib/ialglib.cpp +6 -0
  113. data/ext/alglib/ialglib.h +9 -0
  114. data/ext/alglib/ibetaf.cpp +960 -0
  115. data/ext/alglib/ibetaf.h +125 -0
  116. data/ext/alglib/igammaf.cpp +430 -0
  117. data/ext/alglib/igammaf.h +157 -0
  118. data/ext/alglib/inv.cpp +274 -0
  119. data/ext/alglib/inv.h +115 -0
  120. data/ext/alglib/inverseupdate.cpp +480 -0
  121. data/ext/alglib/inverseupdate.h +185 -0
  122. data/ext/alglib/jacobianelliptic.cpp +164 -0
  123. data/ext/alglib/jacobianelliptic.h +94 -0
  124. data/ext/alglib/jarquebera.cpp +2271 -0
  125. data/ext/alglib/jarquebera.h +80 -0
  126. data/ext/alglib/kmeans.cpp +356 -0
  127. data/ext/alglib/kmeans.h +76 -0
  128. data/ext/alglib/laguerre.cpp +94 -0
  129. data/ext/alglib/laguerre.h +48 -0
  130. data/ext/alglib/lbfgs.cpp +1167 -0
  131. data/ext/alglib/lbfgs.h +218 -0
  132. data/ext/alglib/lda.cpp +434 -0
  133. data/ext/alglib/lda.h +133 -0
  134. data/ext/alglib/ldlt.cpp +1130 -0
  135. data/ext/alglib/ldlt.h +124 -0
  136. data/ext/alglib/leastsquares.cpp +1252 -0
  137. data/ext/alglib/leastsquares.h +290 -0
  138. data/ext/alglib/legendre.cpp +107 -0
  139. data/ext/alglib/legendre.h +49 -0
  140. data/ext/alglib/linreg.cpp +1185 -0
  141. data/ext/alglib/linreg.h +380 -0
  142. data/ext/alglib/logit.cpp +1523 -0
  143. data/ext/alglib/logit.h +333 -0
  144. data/ext/alglib/lq.cpp +399 -0
  145. data/ext/alglib/lq.h +160 -0
  146. data/ext/alglib/lu.cpp +462 -0
  147. data/ext/alglib/lu.h +119 -0
  148. data/ext/alglib/mannwhitneyu.cpp +4490 -0
  149. data/ext/alglib/mannwhitneyu.h +115 -0
  150. data/ext/alglib/minlm.cpp +918 -0
  151. data/ext/alglib/minlm.h +312 -0
  152. data/ext/alglib/mlpbase.cpp +3375 -0
  153. data/ext/alglib/mlpbase.h +589 -0
  154. data/ext/alglib/mlpe.cpp +1369 -0
  155. data/ext/alglib/mlpe.h +552 -0
  156. data/ext/alglib/mlptrain.cpp +1056 -0
  157. data/ext/alglib/mlptrain.h +283 -0
  158. data/ext/alglib/nearunityunit.cpp +91 -0
  159. data/ext/alglib/nearunityunit.h +17 -0
  160. data/ext/alglib/normaldistr.cpp +377 -0
  161. data/ext/alglib/normaldistr.h +175 -0
  162. data/ext/alglib/nsevd.cpp +1869 -0
  163. data/ext/alglib/nsevd.h +140 -0
  164. data/ext/alglib/pca.cpp +168 -0
  165. data/ext/alglib/pca.h +87 -0
  166. data/ext/alglib/poissondistr.cpp +143 -0
  167. data/ext/alglib/poissondistr.h +130 -0
  168. data/ext/alglib/polinterpolation.cpp +685 -0
  169. data/ext/alglib/polinterpolation.h +206 -0
  170. data/ext/alglib/psif.cpp +173 -0
  171. data/ext/alglib/psif.h +88 -0
  172. data/ext/alglib/qr.cpp +414 -0
  173. data/ext/alglib/qr.h +168 -0
  174. data/ext/alglib/ratinterpolation.cpp +134 -0
  175. data/ext/alglib/ratinterpolation.h +72 -0
  176. data/ext/alglib/rcond.cpp +705 -0
  177. data/ext/alglib/rcond.h +140 -0
  178. data/ext/alglib/reflections.cpp +504 -0
  179. data/ext/alglib/reflections.h +165 -0
  180. data/ext/alglib/rotations.cpp +473 -0
  181. data/ext/alglib/rotations.h +128 -0
  182. data/ext/alglib/rsolve.cpp +221 -0
  183. data/ext/alglib/rsolve.h +99 -0
  184. data/ext/alglib/sbisinv.cpp +217 -0
  185. data/ext/alglib/sbisinv.h +171 -0
  186. data/ext/alglib/sblas.cpp +185 -0
  187. data/ext/alglib/sblas.h +64 -0
  188. data/ext/alglib/schur.cpp +156 -0
  189. data/ext/alglib/schur.h +102 -0
  190. data/ext/alglib/sdet.cpp +193 -0
  191. data/ext/alglib/sdet.h +101 -0
  192. data/ext/alglib/sevd.cpp +116 -0
  193. data/ext/alglib/sevd.h +99 -0
  194. data/ext/alglib/sinverse.cpp +672 -0
  195. data/ext/alglib/sinverse.h +138 -0
  196. data/ext/alglib/spddet.cpp +138 -0
  197. data/ext/alglib/spddet.h +96 -0
  198. data/ext/alglib/spdgevd.cpp +842 -0
  199. data/ext/alglib/spdgevd.h +200 -0
  200. data/ext/alglib/spdinverse.cpp +509 -0
  201. data/ext/alglib/spdinverse.h +122 -0
  202. data/ext/alglib/spdrcond.cpp +421 -0
  203. data/ext/alglib/spdrcond.h +118 -0
  204. data/ext/alglib/spdsolve.cpp +275 -0
  205. data/ext/alglib/spdsolve.h +105 -0
  206. data/ext/alglib/spline2d.cpp +1192 -0
  207. data/ext/alglib/spline2d.h +301 -0
  208. data/ext/alglib/spline3.cpp +1264 -0
  209. data/ext/alglib/spline3.h +290 -0
  210. data/ext/alglib/srcond.cpp +595 -0
  211. data/ext/alglib/srcond.h +127 -0
  212. data/ext/alglib/ssolve.cpp +895 -0
  213. data/ext/alglib/ssolve.h +139 -0
  214. data/ext/alglib/stdafx.h +0 -0
  215. data/ext/alglib/stest.cpp +131 -0
  216. data/ext/alglib/stest.h +94 -0
  217. data/ext/alglib/studenttdistr.cpp +222 -0
  218. data/ext/alglib/studenttdistr.h +115 -0
  219. data/ext/alglib/studentttests.cpp +377 -0
  220. data/ext/alglib/studentttests.h +178 -0
  221. data/ext/alglib/svd.cpp +620 -0
  222. data/ext/alglib/svd.h +126 -0
  223. data/ext/alglib/tdbisinv.cpp +2608 -0
  224. data/ext/alglib/tdbisinv.h +228 -0
  225. data/ext/alglib/tdevd.cpp +1229 -0
  226. data/ext/alglib/tdevd.h +115 -0
  227. data/ext/alglib/tridiagonal.cpp +594 -0
  228. data/ext/alglib/tridiagonal.h +171 -0
  229. data/ext/alglib/trigintegrals.cpp +490 -0
  230. data/ext/alglib/trigintegrals.h +131 -0
  231. data/ext/alglib/trinverse.cpp +345 -0
  232. data/ext/alglib/trinverse.h +98 -0
  233. data/ext/alglib/trlinsolve.cpp +926 -0
  234. data/ext/alglib/trlinsolve.h +73 -0
  235. data/ext/alglib/tsort.cpp +405 -0
  236. data/ext/alglib/tsort.h +54 -0
  237. data/ext/alglib/variancetests.cpp +245 -0
  238. data/ext/alglib/variancetests.h +134 -0
  239. data/ext/alglib/wsr.cpp +6285 -0
  240. data/ext/alglib/wsr.h +96 -0
  241. data/ext/ap.i +97 -0
  242. data/ext/correlation.i +24 -0
  243. data/ext/extconf.rb +6 -0
  244. data/ext/logit.i +89 -0
  245. data/lib/alglib.rb +71 -0
  246. data/lib/alglib/correlation.rb +26 -0
  247. data/lib/alglib/linearregression.rb +63 -0
  248. data/lib/alglib/logit.rb +42 -0
  249. data/test/test_alglib.rb +52 -0
  250. data/test/test_correlation.rb +44 -0
  251. data/test/test_correlationtest.rb +45 -0
  252. data/test/test_linreg.rb +35 -0
  253. data/test/test_logit.rb +43 -0
  254. data/test/test_pca.rb +27 -0
  255. metadata +326 -0
@@ -0,0 +1,1500 @@
1
+ /*************************************************************************
2
+ Copyright 2008 by Sergey Bochkanov (ALGLIB project).
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ - Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ - Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer listed
13
+ in this license in the documentation and/or other materials
14
+ provided with the distribution.
15
+
16
+ - Neither the name of the copyright holders nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ *************************************************************************/
32
+
33
+ #include <stdafx.h>
34
+ #include "bdss.h"
35
+
36
//
// Forward declarations of file-local helpers. Their definitions are not
// visible in this part of the file, so the notes below are based on names
// and call sites only -- NOTE(review): confirm against the definitions.
//

// presumably distributes NPoints rows of XY over FoldsCount folds and
// writes the fold index of each row to Folds -- TODO confirm
static void dskfoldsplit(const ap::real_2d_array& xy,
     int npoints,
     int nclasses,
     int foldscount,
     bool stratifiedsplits,
     ap::integer_1d_array& folds);

// presumably x*ln(y) with special handling of x=0 -- TODO confirm;
// used by dsoptimalsplit2 to accumulate cross-entropy terms
static double xlny(double x, double y);

// returns a cross-validation estimate computed from per-class counters
// Cnt[0..NC-1] (exact formula is in the definition) -- TODO confirm
static double getcv(const ap::integer_1d_array& cnt, int nc);

// tieaddc / tiesubc / tiegetc take class labels C, tie boundaries Ties,
// a tie index NTie, the class count NC and a counter array Cnt.
// presumably they add / subtract / load the class counts of tie #NTie
// into Cnt -- TODO confirm
static void tieaddc(const ap::integer_1d_array& c,
     const ap::integer_1d_array& ties,
     int ntie,
     int nc,
     ap::integer_1d_array& cnt);
static void tiesubc(const ap::integer_1d_array& c,
     const ap::integer_1d_array& ties,
     int ntie,
     int nc,
     ap::integer_1d_array& cnt);
static void tiegetc(const ap::integer_1d_array& c,
     const ap::integer_1d_array& ties,
     int ntie,
     int nc,
     ap::integer_1d_array& cnt);
59
+
60
+ /*************************************************************************
61
+ This set of routines (DSErrAllocate, DSErrAccumulate, DSErrFinish)
62
+ calculates different error functions (classification error, cross-entropy,
63
+ rms, avg, avg.rel errors).
64
+
65
+ 1. DSErrAllocate prepares buffer.
66
+ 2. DSErrAccumulate accumulates individual errors:
67
+ * Y contains predicted output (posterior probabilities for classification)
68
+ * DesiredY contains desired output (class number for classification)
69
+ 3. DSErrFinish outputs results:
70
+ * Buf[0] contains relative classification error (zero for regression tasks)
71
+ * Buf[1] contains avg. cross-entropy (zero for regression tasks)
72
+ * Buf[2] contains rms error (regression, classification)
73
+ * Buf[3] contains average error (regression, classification)
74
+ * Buf[4] contains average relative error (regression, classification)
75
+
76
+ NOTES(1):
77
+ "NClasses>0" means that we have classification task.
78
+ "NClasses<0" means regression task with -NClasses real outputs.
79
+
80
+ NOTES(2):
81
+ rms. avg, avg.rel errors for classification tasks are interpreted as
82
+ errors in posterior probabilities with respect to probabilities given
83
+ by training/test set.
84
+
85
+ -- ALGLIB --
86
+ Copyright 11.01.2009 by Bochkanov Sergey
87
+ *************************************************************************/
88
+ void dserrallocate(int nclasses, ap::real_1d_array& buf)
89
+ {
90
+
91
+ buf.setbounds(0, 7);
92
+ buf(0) = 0;
93
+ buf(1) = 0;
94
+ buf(2) = 0;
95
+ buf(3) = 0;
96
+ buf(4) = 0;
97
+ buf(5) = nclasses;
98
+ buf(6) = 0;
99
+ buf(7) = 0;
100
+ }
101
+
102
+
103
+ /*************************************************************************
104
+ See DSErrAllocate for comments on this routine.
105
+
106
+ -- ALGLIB --
107
+ Copyright 11.01.2009 by Bochkanov Sergey
108
+ *************************************************************************/
109
+ void dserraccumulate(ap::real_1d_array& buf,
110
+ const ap::real_1d_array& y,
111
+ const ap::real_1d_array& desiredy)
112
+ {
113
+ int nclasses;
114
+ int nout;
115
+ int offs;
116
+ int mmax;
117
+ int rmax;
118
+ int j;
119
+ double v;
120
+ double ev;
121
+
122
+ offs = 5;
123
+ nclasses = ap::round(buf(offs));
124
+ if( nclasses>0 )
125
+ {
126
+
127
+ //
128
+ // Classification
129
+ //
130
+ rmax = ap::round(desiredy(0));
131
+ mmax = 0;
132
+ for(j = 1; j <= nclasses-1; j++)
133
+ {
134
+ if( y(j)>y(mmax) )
135
+ {
136
+ mmax = j;
137
+ }
138
+ }
139
+ if( mmax!=rmax )
140
+ {
141
+ buf(0) = buf(0)+1;
142
+ }
143
+ if( y(rmax)>0 )
144
+ {
145
+ buf(1) = buf(1)-log(y(rmax));
146
+ }
147
+ else
148
+ {
149
+ buf(1) = buf(1)+log(ap::maxrealnumber);
150
+ }
151
+ for(j = 0; j <= nclasses-1; j++)
152
+ {
153
+ v = y(j);
154
+ if( j==rmax )
155
+ {
156
+ ev = 1;
157
+ }
158
+ else
159
+ {
160
+ ev = 0;
161
+ }
162
+ buf(2) = buf(2)+ap::sqr(v-ev);
163
+ buf(3) = buf(3)+fabs(v-ev);
164
+ if( ev!=0 )
165
+ {
166
+ buf(4) = buf(4)+fabs((v-ev)/ev);
167
+ buf(offs+2) = buf(offs+2)+1;
168
+ }
169
+ }
170
+ buf(offs+1) = buf(offs+1)+1;
171
+ }
172
+ else
173
+ {
174
+
175
+ //
176
+ // Regression
177
+ //
178
+ nout = -nclasses;
179
+ rmax = 0;
180
+ for(j = 1; j <= nout-1; j++)
181
+ {
182
+ if( desiredy(j)>desiredy(rmax) )
183
+ {
184
+ rmax = j;
185
+ }
186
+ }
187
+ mmax = 0;
188
+ for(j = 1; j <= nout-1; j++)
189
+ {
190
+ if( y(j)>y(mmax) )
191
+ {
192
+ mmax = j;
193
+ }
194
+ }
195
+ if( mmax!=rmax )
196
+ {
197
+ buf(0) = buf(0)+1;
198
+ }
199
+ for(j = 0; j <= nout-1; j++)
200
+ {
201
+ v = y(j);
202
+ ev = desiredy(j);
203
+ buf(2) = buf(2)+ap::sqr(v-ev);
204
+ buf(3) = buf(3)+fabs(v-ev);
205
+ if( ev!=0 )
206
+ {
207
+ buf(4) = buf(4)+fabs((v-ev)/ev);
208
+ buf(offs+2) = buf(offs+2)+1;
209
+ }
210
+ }
211
+ buf(offs+1) = buf(offs+1)+1;
212
+ }
213
+ }
214
+
215
+
216
+ /*************************************************************************
217
+ See DSErrAllocate for comments on this routine.
218
+
219
+ -- ALGLIB --
220
+ Copyright 11.01.2009 by Bochkanov Sergey
221
+ *************************************************************************/
222
+ void dserrfinish(ap::real_1d_array& buf)
223
+ {
224
+ int nout;
225
+ int offs;
226
+
227
+ offs = 5;
228
+ nout = abs(ap::round(buf(offs)));
229
+ if( buf(offs+1)!=0 )
230
+ {
231
+ buf(0) = buf(0)/buf(offs+1);
232
+ buf(1) = buf(1)/buf(offs+1);
233
+ buf(2) = sqrt(buf(2)/(nout*buf(offs+1)));
234
+ buf(3) = buf(3)/(nout*buf(offs+1));
235
+ }
236
+ if( buf(offs+2)!=0 )
237
+ {
238
+ buf(4) = buf(4)/buf(offs+2);
239
+ }
240
+ }
241
+
242
+
243
+ /*************************************************************************
244
+
245
+ -- ALGLIB --
246
+ Copyright 19.05.2008 by Bochkanov Sergey
247
+ *************************************************************************/
248
+ void dsnormalize(ap::real_2d_array& xy,
249
+ int npoints,
250
+ int nvars,
251
+ int& info,
252
+ ap::real_1d_array& means,
253
+ ap::real_1d_array& sigmas)
254
+ {
255
+ int i;
256
+ int j;
257
+ ap::real_1d_array tmp;
258
+ double mean;
259
+ double variance;
260
+ double skewness;
261
+ double kurtosis;
262
+
263
+
264
+ //
265
+ // Test parameters
266
+ //
267
+ if( npoints<=0||nvars<1 )
268
+ {
269
+ info = -1;
270
+ return;
271
+ }
272
+ info = 1;
273
+
274
+ //
275
+ // Standartization
276
+ //
277
+ means.setbounds(0, nvars-1);
278
+ sigmas.setbounds(0, nvars-1);
279
+ tmp.setbounds(0, npoints-1);
280
+ for(j = 0; j <= nvars-1; j++)
281
+ {
282
+ ap::vmove(tmp.getvector(0, npoints-1), xy.getcolumn(j, 0, npoints-1));
283
+ calculatemoments(tmp, npoints, mean, variance, skewness, kurtosis);
284
+ means(j) = mean;
285
+ sigmas(j) = sqrt(variance);
286
+ if( sigmas(j)==0 )
287
+ {
288
+ sigmas(j) = 1;
289
+ }
290
+ for(i = 0; i <= npoints-1; i++)
291
+ {
292
+ xy(i,j) = (xy(i,j)-means(j))/sigmas(j);
293
+ }
294
+ }
295
+ }
296
+
297
+
298
+ /*************************************************************************
299
+
300
+ -- ALGLIB --
301
+ Copyright 19.05.2008 by Bochkanov Sergey
302
+ *************************************************************************/
303
+ void dsnormalizec(const ap::real_2d_array& xy,
304
+ int npoints,
305
+ int nvars,
306
+ int& info,
307
+ ap::real_1d_array& means,
308
+ ap::real_1d_array& sigmas)
309
+ {
310
+ int i;
311
+ int j;
312
+ ap::real_1d_array tmp;
313
+ double mean;
314
+ double variance;
315
+ double skewness;
316
+ double kurtosis;
317
+
318
+
319
+ //
320
+ // Test parameters
321
+ //
322
+ if( npoints<=0||nvars<1 )
323
+ {
324
+ info = -1;
325
+ return;
326
+ }
327
+ info = 1;
328
+
329
+ //
330
+ // Standartization
331
+ //
332
+ means.setbounds(0, nvars-1);
333
+ sigmas.setbounds(0, nvars-1);
334
+ tmp.setbounds(0, npoints-1);
335
+ for(j = 0; j <= nvars-1; j++)
336
+ {
337
+ ap::vmove(tmp.getvector(0, npoints-1), xy.getcolumn(j, 0, npoints-1));
338
+ calculatemoments(tmp, npoints, mean, variance, skewness, kurtosis);
339
+ means(j) = mean;
340
+ sigmas(j) = sqrt(variance);
341
+ if( sigmas(j)==0 )
342
+ {
343
+ sigmas(j) = 1;
344
+ }
345
+ }
346
+ }
347
+
348
+
349
+ /*************************************************************************
350
+
351
+ -- ALGLIB --
352
+ Copyright 19.05.2008 by Bochkanov Sergey
353
+ *************************************************************************/
354
+ double dsgetmeanmindistance(const ap::real_2d_array& xy,
355
+ int npoints,
356
+ int nvars)
357
+ {
358
+ double result;
359
+ int i;
360
+ int j;
361
+ ap::real_1d_array tmp;
362
+ ap::real_1d_array tmp2;
363
+ double v;
364
+
365
+
366
+ //
367
+ // Test parameters
368
+ //
369
+ if( npoints<=0||nvars<1 )
370
+ {
371
+ result = 0;
372
+ return result;
373
+ }
374
+
375
+ //
376
+ // Process
377
+ //
378
+ tmp.setbounds(0, npoints-1);
379
+ for(i = 0; i <= npoints-1; i++)
380
+ {
381
+ tmp(i) = ap::maxrealnumber;
382
+ }
383
+ tmp2.setbounds(0, nvars-1);
384
+ for(i = 0; i <= npoints-1; i++)
385
+ {
386
+ for(j = i+1; j <= npoints-1; j++)
387
+ {
388
+ ap::vmove(&tmp2(0), &xy(i, 0), ap::vlen(0,nvars-1));
389
+ ap::vsub(&tmp2(0), &xy(j, 0), ap::vlen(0,nvars-1));
390
+ v = ap::vdotproduct(&tmp2(0), &tmp2(0), ap::vlen(0,nvars-1));
391
+ v = sqrt(v);
392
+ tmp(i) = ap::minreal(tmp(i), v);
393
+ tmp(j) = ap::minreal(tmp(j), v);
394
+ }
395
+ }
396
+ result = 0;
397
+ for(i = 0; i <= npoints-1; i++)
398
+ {
399
+ result = result+tmp(i)/npoints;
400
+ }
401
+ return result;
402
+ }
403
+
404
+
405
+ /*************************************************************************
406
+
407
+ -- ALGLIB --
408
+ Copyright 19.05.2008 by Bochkanov Sergey
409
+ *************************************************************************/
410
+ void dstie(ap::real_1d_array& a,
411
+ int n,
412
+ ap::integer_1d_array& ties,
413
+ int& tiecount,
414
+ ap::integer_1d_array& p1,
415
+ ap::integer_1d_array& p2)
416
+ {
417
+ int i;
418
+ int k;
419
+ ap::integer_1d_array tmp;
420
+
421
+
422
+ //
423
+ // Special case
424
+ //
425
+ if( n<=0 )
426
+ {
427
+ tiecount = 0;
428
+ return;
429
+ }
430
+
431
+ //
432
+ // Sort A
433
+ //
434
+ tagsort(a, n, p1, p2);
435
+
436
+ //
437
+ // Process ties
438
+ //
439
+ tiecount = 1;
440
+ for(i = 1; i <= n-1; i++)
441
+ {
442
+ if( a(i)!=a(i-1) )
443
+ {
444
+ tiecount = tiecount+1;
445
+ }
446
+ }
447
+ ties.setbounds(0, tiecount);
448
+ ties(0) = 0;
449
+ k = 1;
450
+ for(i = 1; i <= n-1; i++)
451
+ {
452
+ if( a(i)!=a(i-1) )
453
+ {
454
+ ties(k) = i;
455
+ k = k+1;
456
+ }
457
+ }
458
+ ties(tiecount) = n;
459
+ }
460
+
461
+
462
+ /*************************************************************************
463
+
464
+ -- ALGLIB --
465
+ Copyright 11.12.2008 by Bochkanov Sergey
466
+ *************************************************************************/
467
+ void dstiefasti(ap::real_1d_array& a,
468
+ ap::integer_1d_array& b,
469
+ int n,
470
+ ap::integer_1d_array& ties,
471
+ int& tiecount)
472
+ {
473
+ int i;
474
+ int k;
475
+ ap::integer_1d_array tmp;
476
+
477
+
478
+ //
479
+ // Special case
480
+ //
481
+ if( n<=0 )
482
+ {
483
+ tiecount = 0;
484
+ return;
485
+ }
486
+
487
+ //
488
+ // Sort A
489
+ //
490
+ tagsortfasti(a, b, n);
491
+
492
+ //
493
+ // Process ties
494
+ //
495
+ ties(0) = 0;
496
+ k = 1;
497
+ for(i = 1; i <= n-1; i++)
498
+ {
499
+ if( a(i)!=a(i-1) )
500
+ {
501
+ ties(k) = i;
502
+ k = k+1;
503
+ }
504
+ }
505
+ ties(k) = n;
506
+ tiecount = k;
507
+ }
508
+
509
+
510
+ /*************************************************************************
511
+ Optimal partition, internal subroutine.
512
+
513
+ -- ALGLIB --
514
+ Copyright 22.05.2008 by Bochkanov Sergey
515
+ *************************************************************************/
516
+ void dsoptimalsplit2(ap::real_1d_array a,
517
+ ap::integer_1d_array c,
518
+ int n,
519
+ int& info,
520
+ double& threshold,
521
+ double& pal,
522
+ double& pbl,
523
+ double& par,
524
+ double& pbr,
525
+ double& cve)
526
+ {
527
+ int i;
528
+ int t;
529
+ double s;
530
+ double pea;
531
+ double peb;
532
+ ap::integer_1d_array ties;
533
+ int tiecount;
534
+ ap::integer_1d_array p1;
535
+ ap::integer_1d_array p2;
536
+ double v1;
537
+ double v2;
538
+ int k;
539
+ int koptimal;
540
+ double pak;
541
+ double pbk;
542
+ double cvoptimal;
543
+ double cv;
544
+
545
+
546
+ //
547
+ // Test for errors in inputs
548
+ //
549
+ if( n<=0 )
550
+ {
551
+ info = -1;
552
+ return;
553
+ }
554
+ for(i = 0; i <= n-1; i++)
555
+ {
556
+ if( c(i)!=0&&c(i)!=1 )
557
+ {
558
+ info = -2;
559
+ return;
560
+ }
561
+ }
562
+ info = 1;
563
+
564
+ //
565
+ // Tie
566
+ //
567
+ dstie(a, n, ties, tiecount, p1, p2);
568
+ for(i = 0; i <= n-1; i++)
569
+ {
570
+ if( p2(i)!=i )
571
+ {
572
+ t = c(i);
573
+ c(i) = c(p2(i));
574
+ c(p2(i)) = t;
575
+ }
576
+ }
577
+
578
+ //
579
+ // Special case: number of ties is 1.
580
+ //
581
+ // NOTE: we assume that P[i,j] equals to 0 or 1,
582
+ // intermediate values are not allowed.
583
+ //
584
+ if( tiecount==1 )
585
+ {
586
+ info = -3;
587
+ return;
588
+ }
589
+
590
+ //
591
+ // General case, number of ties > 1
592
+ //
593
+ // NOTE: we assume that P[i,j] equals to 0 or 1,
594
+ // intermediate values are not allowed.
595
+ //
596
+ pal = 0;
597
+ pbl = 0;
598
+ par = 0;
599
+ pbr = 0;
600
+ for(i = 0; i <= n-1; i++)
601
+ {
602
+ if( c(i)==0 )
603
+ {
604
+ par = par+1;
605
+ }
606
+ if( c(i)==1 )
607
+ {
608
+ pbr = pbr+1;
609
+ }
610
+ }
611
+ koptimal = -1;
612
+ cvoptimal = ap::maxrealnumber;
613
+ for(k = 0; k <= tiecount-2; k++)
614
+ {
615
+
616
+ //
617
+ // first, obtain information about K-th tie which is
618
+ // moved from R-part to L-part
619
+ //
620
+ pak = 0;
621
+ pbk = 0;
622
+ for(i = ties(k); i <= ties(k+1)-1; i++)
623
+ {
624
+ if( c(i)==0 )
625
+ {
626
+ pak = pak+1;
627
+ }
628
+ if( c(i)==1 )
629
+ {
630
+ pbk = pbk+1;
631
+ }
632
+ }
633
+
634
+ //
635
+ // Calculate cross-validation CE
636
+ //
637
+ cv = 0;
638
+ cv = cv-xlny(pal+pak, (pal+pak)/(pal+pak+pbl+pbk+1));
639
+ cv = cv-xlny(pbl+pbk, (pbl+pbk)/(pal+pak+1+pbl+pbk));
640
+ cv = cv-xlny(par-pak, (par-pak)/(par-pak+pbr-pbk+1));
641
+ cv = cv-xlny(pbr-pbk, (pbr-pbk)/(par-pak+1+pbr-pbk));
642
+
643
+ //
644
+ // Compare with best
645
+ //
646
+ if( cv<cvoptimal )
647
+ {
648
+ cvoptimal = cv;
649
+ koptimal = k;
650
+ }
651
+
652
+ //
653
+ // update
654
+ //
655
+ pal = pal+pak;
656
+ pbl = pbl+pbk;
657
+ par = par-pak;
658
+ pbr = pbr-pbk;
659
+ }
660
+ cve = cvoptimal;
661
+ threshold = 0.5*(a(ties(koptimal))+a(ties(koptimal+1)));
662
+ pal = 0;
663
+ pbl = 0;
664
+ par = 0;
665
+ pbr = 0;
666
+ for(i = 0; i <= n-1; i++)
667
+ {
668
+ if( a(i)<threshold )
669
+ {
670
+ if( c(i)==0 )
671
+ {
672
+ pal = pal+1;
673
+ }
674
+ else
675
+ {
676
+ pbl = pbl+1;
677
+ }
678
+ }
679
+ else
680
+ {
681
+ if( c(i)==0 )
682
+ {
683
+ par = par+1;
684
+ }
685
+ else
686
+ {
687
+ pbr = pbr+1;
688
+ }
689
+ }
690
+ }
691
+ s = pal+pbl;
692
+ pal = pal/s;
693
+ pbl = pbl/s;
694
+ s = par+pbr;
695
+ par = par/s;
696
+ pbr = pbr/s;
697
+ }
698
+
699
+
700
+ /*************************************************************************
701
+ Optimal partition, internal subroutine. Fast version.
702
+
703
+ Accepts:
704
+ A array[0..N-1] array of attributes array[0..N-1]
705
+ C array[0..N-1] array of class labels
706
+ TiesBuf array[0..N] temporaries (ties)
707
+ CntBuf array[0..2*NC-1] temporaries (counts)
708
+ Alpha centering factor (0<=alpha<=1, recommended value - 0.05)
709
+
710
+ Output:
711
+ Info error code (">0"=OK, "<0"=bad)
712
+ RMS training set RMS error
713
+ CVRMS leave-one-out RMS error
714
+
715
+ Note:
716
+ content of all arrays is changed by subroutine
717
+
718
+ -- ALGLIB --
719
+ Copyright 11.12.2008 by Bochkanov Sergey
720
+ *************************************************************************/
721
+ void dsoptimalsplit2fast(ap::real_1d_array& a,
722
+ ap::integer_1d_array& c,
723
+ ap::integer_1d_array& tiesbuf,
724
+ ap::integer_1d_array& cntbuf,
725
+ int n,
726
+ int nc,
727
+ double alpha,
728
+ int& info,
729
+ double& threshold,
730
+ double& rms,
731
+ double& cvrms)
732
+ {
733
+ int i;
734
+ int k;
735
+ int cl;
736
+ int tiecount;
737
+ double cbest;
738
+ double cc;
739
+ int koptimal;
740
+ int sl;
741
+ int sr;
742
+ double v;
743
+ double w;
744
+ double x;
745
+
746
+
747
+ //
748
+ // Test for errors in inputs
749
+ //
750
+ if( n<=0||nc<2 )
751
+ {
752
+ info = -1;
753
+ return;
754
+ }
755
+ for(i = 0; i <= n-1; i++)
756
+ {
757
+ if( c(i)<0||c(i)>=nc )
758
+ {
759
+ info = -2;
760
+ return;
761
+ }
762
+ }
763
+ info = 1;
764
+
765
+ //
766
+ // Tie
767
+ //
768
+ dstiefasti(a, c, n, tiesbuf, tiecount);
769
+
770
+ //
771
+ // Special case: number of ties is 1.
772
+ //
773
+ if( tiecount==1 )
774
+ {
775
+ info = -3;
776
+ return;
777
+ }
778
+
779
+ //
780
+ // General case, number of ties > 1
781
+ //
782
+ for(i = 0; i <= 2*nc-1; i++)
783
+ {
784
+ cntbuf(i) = 0;
785
+ }
786
+ for(i = 0; i <= n-1; i++)
787
+ {
788
+ cntbuf(nc+c(i)) = cntbuf(nc+c(i))+1;
789
+ }
790
+ koptimal = -1;
791
+ threshold = a(n-1);
792
+ cbest = ap::maxrealnumber;
793
+ sl = 0;
794
+ sr = n;
795
+ for(k = 0; k <= tiecount-2; k++)
796
+ {
797
+
798
+ //
799
+ // first, move Kth tie from right to left
800
+ //
801
+ for(i = tiesbuf(k); i <= tiesbuf(k+1)-1; i++)
802
+ {
803
+ cl = c(i);
804
+ cntbuf(cl) = cntbuf(cl)+1;
805
+ cntbuf(nc+cl) = cntbuf(nc+cl)-1;
806
+ }
807
+ sl = sl+(tiesbuf(k+1)-tiesbuf(k));
808
+ sr = sr-(tiesbuf(k+1)-tiesbuf(k));
809
+
810
+ //
811
+ // Calculate RMS error
812
+ //
813
+ v = 0;
814
+ for(i = 0; i <= nc-1; i++)
815
+ {
816
+ w = cntbuf(i);
817
+ v = v+w*ap::sqr(w/sl-1);
818
+ v = v+(sl-w)*ap::sqr(w/sl);
819
+ w = cntbuf(nc+i);
820
+ v = v+w*ap::sqr(w/sr-1);
821
+ v = v+(sr-w)*ap::sqr(w/sr);
822
+ }
823
+ v = sqrt(v/(nc*n));
824
+
825
+ //
826
+ // Compare with best
827
+ //
828
+ x = double(2*sl)/double(sl+sr)-1;
829
+ cc = v*(1-alpha+alpha*ap::sqr(x));
830
+ if( cc<cbest )
831
+ {
832
+
833
+ //
834
+ // store split
835
+ //
836
+ rms = v;
837
+ koptimal = k;
838
+ cbest = cc;
839
+
840
+ //
841
+ // calculate CVRMS error
842
+ //
843
+ cvrms = 0;
844
+ for(i = 0; i <= nc-1; i++)
845
+ {
846
+ if( sl>1 )
847
+ {
848
+ w = cntbuf(i);
849
+ cvrms = cvrms+w*ap::sqr((w-1)/(sl-1)-1);
850
+ cvrms = cvrms+(sl-w)*ap::sqr(w/(sl-1));
851
+ }
852
+ else
853
+ {
854
+ w = cntbuf(i);
855
+ cvrms = cvrms+w*ap::sqr(double(1)/double(nc)-1);
856
+ cvrms = cvrms+(sl-w)*ap::sqr(double(1)/double(nc));
857
+ }
858
+ if( sr>1 )
859
+ {
860
+ w = cntbuf(nc+i);
861
+ cvrms = cvrms+w*ap::sqr((w-1)/(sr-1)-1);
862
+ cvrms = cvrms+(sr-w)*ap::sqr(w/(sr-1));
863
+ }
864
+ else
865
+ {
866
+ w = cntbuf(nc+i);
867
+ cvrms = cvrms+w*ap::sqr(double(1)/double(nc)-1);
868
+ cvrms = cvrms+(sr-w)*ap::sqr(double(1)/double(nc));
869
+ }
870
+ }
871
+ cvrms = sqrt(cvrms/(nc*n));
872
+ }
873
+ }
874
+
875
+ //
876
+ // Calculate threshold.
877
+ // Code is a bit complicated because there can be such
878
+ // numbers that 0.5(A+B) equals to A or B (if A-B=epsilon)
879
+ //
880
+ threshold = 0.5*(a(tiesbuf(koptimal))+a(tiesbuf(koptimal+1)));
881
+ if( threshold<=a(tiesbuf(koptimal)) )
882
+ {
883
+ threshold = a(tiesbuf(koptimal+1));
884
+ }
885
+ }
886
+
887
+
888
+ /*************************************************************************
889
+ Automatic non-optimal discretization, internal subroutine.
890
+
891
+ -- ALGLIB --
892
+ Copyright 22.05.2008 by Bochkanov Sergey
893
+ *************************************************************************/
894
+ void dssplitk(ap::real_1d_array a,
895
+ ap::integer_1d_array c,
896
+ int n,
897
+ int nc,
898
+ int kmax,
899
+ int& info,
900
+ ap::real_1d_array& thresholds,
901
+ int& ni,
902
+ double& cve)
903
+ {
904
+ int i;
905
+ int j;
906
+ int j1;
907
+ int k;
908
+ ap::integer_1d_array ties;
909
+ int tiecount;
910
+ ap::integer_1d_array p1;
911
+ ap::integer_1d_array p2;
912
+ ap::integer_1d_array cnt;
913
+ double v2;
914
+ int bestk;
915
+ double bestcve;
916
+ ap::integer_1d_array bestsizes;
917
+ double curcve;
918
+ ap::integer_1d_array cursizes;
919
+
920
+
921
+ //
922
+ // Test for errors in inputs
923
+ //
924
+ if( n<=0||nc<2||kmax<2 )
925
+ {
926
+ info = -1;
927
+ return;
928
+ }
929
+ for(i = 0; i <= n-1; i++)
930
+ {
931
+ if( c(i)<0||c(i)>=nc )
932
+ {
933
+ info = -2;
934
+ return;
935
+ }
936
+ }
937
+ info = 1;
938
+
939
+ //
940
+ // Tie
941
+ //
942
+ dstie(a, n, ties, tiecount, p1, p2);
943
+ for(i = 0; i <= n-1; i++)
944
+ {
945
+ if( p2(i)!=i )
946
+ {
947
+ k = c(i);
948
+ c(i) = c(p2(i));
949
+ c(p2(i)) = k;
950
+ }
951
+ }
952
+
953
+ //
954
+ // Special cases
955
+ //
956
+ if( tiecount==1 )
957
+ {
958
+ info = -3;
959
+ return;
960
+ }
961
+
962
+ //
963
+ // General case:
964
+ // 0. allocate arrays
965
+ //
966
+ kmax = ap::minint(kmax, tiecount);
967
+ bestsizes.setbounds(0, kmax-1);
968
+ cursizes.setbounds(0, kmax-1);
969
+ cnt.setbounds(0, nc-1);
970
+
971
+ //
972
+ // General case:
973
+ // 1. prepare "weak" solution (two subintervals, divided at median)
974
+ //
975
+ v2 = ap::maxrealnumber;
976
+ j = -1;
977
+ for(i = 1; i <= tiecount-1; i++)
978
+ {
979
+ if( fabs(ties(i)-0.5*(n-1))<v2 )
980
+ {
981
+ v2 = fabs(ties(i)-0.5*n);
982
+ j = i;
983
+ }
984
+ }
985
+ ap::ap_error::make_assertion(j>0, "DSSplitK: internal error #1!");
986
+ bestk = 2;
987
+ bestsizes(0) = ties(j);
988
+ bestsizes(1) = n-j;
989
+ bestcve = 0;
990
+ for(i = 0; i <= nc-1; i++)
991
+ {
992
+ cnt(i) = 0;
993
+ }
994
+ for(i = 0; i <= j-1; i++)
995
+ {
996
+ tieaddc(c, ties, i, nc, cnt);
997
+ }
998
+ bestcve = bestcve+getcv(cnt, nc);
999
+ for(i = 0; i <= nc-1; i++)
1000
+ {
1001
+ cnt(i) = 0;
1002
+ }
1003
+ for(i = j; i <= tiecount-1; i++)
1004
+ {
1005
+ tieaddc(c, ties, i, nc, cnt);
1006
+ }
1007
+ bestcve = bestcve+getcv(cnt, nc);
1008
+
1009
+ //
1010
+ // General case:
1011
+ // 2. Use greedy algorithm to find sub-optimal split in O(KMax*N) time
1012
+ //
1013
+ for(k = 2; k <= kmax; k++)
1014
+ {
1015
+
1016
+ //
1017
+ // Prepare greedy K-interval split
1018
+ //
1019
+ for(i = 0; i <= k-1; i++)
1020
+ {
1021
+ cursizes(i) = 0;
1022
+ }
1023
+ i = 0;
1024
+ j = 0;
1025
+ while(j<=tiecount-1&&i<=k-1)
1026
+ {
1027
+
1028
+ //
1029
+ // Rule: I-th bin is empty, fill it
1030
+ //
1031
+ if( cursizes(i)==0 )
1032
+ {
1033
+ cursizes(i) = ties(j+1)-ties(j);
1034
+ j = j+1;
1035
+ continue;
1036
+ }
1037
+
1038
+ //
1039
+ // Rule: (K-1-I) bins left, (K-1-I) ties left (1 tie per bin); next bin
1040
+ //
1041
+ if( tiecount-j==k-1-i )
1042
+ {
1043
+ i = i+1;
1044
+ continue;
1045
+ }
1046
+
1047
+ //
1048
+ // Rule: last bin, always place in current
1049
+ //
1050
+ if( i==k-1 )
1051
+ {
1052
+ cursizes(i) = cursizes(i)+ties(j+1)-ties(j);
1053
+ j = j+1;
1054
+ continue;
1055
+ }
1056
+
1057
+ //
1058
+ // Place J-th tie in I-th bin, or leave for I+1-th bin.
1059
+ //
1060
+ if( fabs(cursizes(i)+ties(j+1)-ties(j)-double(n)/double(k))<fabs(cursizes(i)-double(n)/double(k)) )
1061
+ {
1062
+ cursizes(i) = cursizes(i)+ties(j+1)-ties(j);
1063
+ j = j+1;
1064
+ }
1065
+ else
1066
+ {
1067
+ i = i+1;
1068
+ }
1069
+ }
1070
+ ap::ap_error::make_assertion(cursizes(k-1)!=0&&j==tiecount, "DSSplitK: internal error #1");
1071
+
1072
+ //
1073
+ // Calculate CVE
1074
+ //
1075
+ curcve = 0;
1076
+ j = 0;
1077
+ for(i = 0; i <= k-1; i++)
1078
+ {
1079
+ for(j1 = 0; j1 <= nc-1; j1++)
1080
+ {
1081
+ cnt(j1) = 0;
1082
+ }
1083
+ for(j1 = j; j1 <= j+cursizes(i)-1; j1++)
1084
+ {
1085
+ cnt(c(j1)) = cnt(c(j1))+1;
1086
+ }
1087
+ curcve = curcve+getcv(cnt, nc);
1088
+ j = j+cursizes(i);
1089
+ }
1090
+
1091
+ //
1092
+ // Choose best variant
1093
+ //
1094
+ if( curcve<bestcve )
1095
+ {
1096
+ for(i = 0; i <= k-1; i++)
1097
+ {
1098
+ bestsizes(i) = cursizes(i);
1099
+ }
1100
+ bestcve = curcve;
1101
+ bestk = k;
1102
+ }
1103
+ }
1104
+
1105
+ //
1106
+ // Transform from sizes to thresholds
1107
+ //
1108
+ cve = bestcve;
1109
+ ni = bestk;
1110
+ thresholds.setbounds(0, ni-2);
1111
+ j = bestsizes(0);
1112
+ for(i = 1; i <= bestk-1; i++)
1113
+ {
1114
+ thresholds(i-1) = 0.5*(a(j-1)+a(j));
1115
+ j = j+bestsizes(i);
1116
+ }
1117
+ }
1118
+
1119
+
1120
+ /*************************************************************************
1121
+ Automatic optimal discretization, internal subroutine.
1122
+
1123
+ -- ALGLIB --
1124
+ Copyright 22.05.2008 by Bochkanov Sergey
1125
+ *************************************************************************/
1126
+ void dsoptimalsplitk(ap::real_1d_array a,
1127
+ ap::integer_1d_array c,
1128
+ int n,
1129
+ int nc,
1130
+ int kmax,
1131
+ int& info,
1132
+ ap::real_1d_array& thresholds,
1133
+ int& ni,
1134
+ double& cve)
1135
+ {
1136
+ int i;
1137
+ int j;
1138
+ int s;
1139
+ int jl;
1140
+ int jr;
1141
+ double v1;
1142
+ double v2;
1143
+ double v3;
1144
+ double v4;
1145
+ ap::integer_1d_array ties;
1146
+ int tiecount;
1147
+ ap::integer_1d_array p1;
1148
+ ap::integer_1d_array p2;
1149
+ double cvtemp;
1150
+ ap::integer_1d_array cnt;
1151
+ ap::integer_1d_array cnt2;
1152
+ ap::real_2d_array cv;
1153
+ ap::integer_2d_array splits;
1154
+ int k;
1155
+ int koptimal;
1156
+ double cvoptimal;
1157
+
1158
+
1159
+ //
1160
+ // Test for errors in inputs
1161
+ //
1162
+ if( n<=0||nc<2||kmax<2 )
1163
+ {
1164
+ info = -1;
1165
+ return;
1166
+ }
1167
+ for(i = 0; i <= n-1; i++)
1168
+ {
1169
+ if( c(i)<0||c(i)>=nc )
1170
+ {
1171
+ info = -2;
1172
+ return;
1173
+ }
1174
+ }
1175
+ info = 1;
1176
+
1177
+ //
1178
+ // Tie
1179
+ //
1180
+ dstie(a, n, ties, tiecount, p1, p2);
1181
+ for(i = 0; i <= n-1; i++)
1182
+ {
1183
+ if( p2(i)!=i )
1184
+ {
1185
+ k = c(i);
1186
+ c(i) = c(p2(i));
1187
+ c(p2(i)) = k;
1188
+ }
1189
+ }
1190
+
1191
+ //
1192
+ // Special cases
1193
+ //
1194
+ if( tiecount==1 )
1195
+ {
1196
+ info = -3;
1197
+ return;
1198
+ }
1199
+
1200
+ //
1201
+ // General case
1202
+ // Use dynamic programming to find best split in O(KMax*NC*TieCount^2) time
1203
+ //
1204
+ kmax = ap::minint(kmax, tiecount);
1205
+ cv.setbounds(0, kmax-1, 0, tiecount-1);
1206
+ splits.setbounds(0, kmax-1, 0, tiecount-1);
1207
+ cnt.setbounds(0, nc-1);
1208
+ cnt2.setbounds(0, nc-1);
1209
+ for(j = 0; j <= nc-1; j++)
1210
+ {
1211
+ cnt(j) = 0;
1212
+ }
1213
+ for(j = 0; j <= tiecount-1; j++)
1214
+ {
1215
+ tieaddc(c, ties, j, nc, cnt);
1216
+ splits(0,j) = 0;
1217
+ cv(0,j) = getcv(cnt, nc);
1218
+ }
1219
+ for(k = 1; k <= kmax-1; k++)
1220
+ {
1221
+ for(j = 0; j <= nc-1; j++)
1222
+ {
1223
+ cnt(j) = 0;
1224
+ }
1225
+
1226
+ //
1227
+ // Subtask size J in [K..TieCount-1]:
1228
+ // optimal K-splitting on ties from 0-th to J-th.
1229
+ //
1230
+ for(j = k; j <= tiecount-1; j++)
1231
+ {
1232
+
1233
+ //
1234
+ // Update Cnt - let it contain classes of ties from K-th to J-th
1235
+ //
1236
+ tieaddc(c, ties, j, nc, cnt);
1237
+
1238
+ //
1239
+ // Search for optimal split point S in [K..J]
1240
+ //
1241
+ for(i = 0; i <= nc-1; i++)
1242
+ {
1243
+ cnt2(i) = cnt(i);
1244
+ }
1245
+ cv(k,j) = cv(k-1,j-1)+getcv(cnt2, nc);
1246
+ splits(k,j) = j;
1247
+ for(s = k+1; s <= j; s++)
1248
+ {
1249
+
1250
+ //
1251
+ // Update Cnt2 - let it contain classes of ties from S-th to J-th
1252
+ //
1253
+ tiesubc(c, ties, s-1, nc, cnt2);
1254
+
1255
+ //
1256
+ // Calculate CVE
1257
+ //
1258
+ cvtemp = cv(k-1,s-1)+getcv(cnt2, nc);
1259
+ if( cvtemp<cv(k,j) )
1260
+ {
1261
+ cv(k,j) = cvtemp;
1262
+ splits(k,j) = s;
1263
+ }
1264
+ }
1265
+ }
1266
+ }
1267
+
1268
+ //
1269
+ // Choose best partition, output result
1270
+ //
1271
+ koptimal = -1;
1272
+ cvoptimal = ap::maxrealnumber;
1273
+ for(k = 0; k <= kmax-1; k++)
1274
+ {
1275
+ if( cv(k,tiecount-1)<cvoptimal )
1276
+ {
1277
+ cvoptimal = cv(k,tiecount-1);
1278
+ koptimal = k;
1279
+ }
1280
+ }
1281
+ ap::ap_error::make_assertion(koptimal>=0, "DSOptimalSplitK: internal error #1!");
1282
+ if( koptimal==0 )
1283
+ {
1284
+
1285
+ //
1286
+ // Special case: best partition is one big interval.
1287
+ // Even 2-partition is not better.
1288
+ // This is possible when dealing with "weak" predictor variables.
1289
+ //
1290
+ // Make binary split as close to the median as possible.
1291
+ //
1292
+ v2 = ap::maxrealnumber;
1293
+ j = -1;
1294
+ for(i = 1; i <= tiecount-1; i++)
1295
+ {
1296
+ if( fabs(ties(i)-0.5*(n-1))<v2 )
1297
+ {
1298
+ v2 = fabs(ties(i)-0.5*(n-1));
1299
+ j = i;
1300
+ }
1301
+ }
1302
+ ap::ap_error::make_assertion(j>0, "DSOptimalSplitK: internal error #2!");
1303
+ thresholds.setbounds(0, 0);
1304
+ thresholds(0) = 0.5*(a(ties(j-1))+a(ties(j)));
1305
+ ni = 2;
1306
+ cve = 0;
1307
+ for(i = 0; i <= nc-1; i++)
1308
+ {
1309
+ cnt(i) = 0;
1310
+ }
1311
+ for(i = 0; i <= j-1; i++)
1312
+ {
1313
+ tieaddc(c, ties, i, nc, cnt);
1314
+ }
1315
+ cve = cve+getcv(cnt, nc);
1316
+ for(i = 0; i <= nc-1; i++)
1317
+ {
1318
+ cnt(i) = 0;
1319
+ }
1320
+ for(i = j; i <= tiecount-1; i++)
1321
+ {
1322
+ tieaddc(c, ties, i, nc, cnt);
1323
+ }
1324
+ cve = cve+getcv(cnt, nc);
1325
+ }
1326
+ else
1327
+ {
1328
+
1329
+ //
1330
+ // General case: 2 or more intervals
1331
+ //
1332
+ thresholds.setbounds(0, koptimal-1);
1333
+ ni = koptimal+1;
1334
+ cve = cv(koptimal,tiecount-1);
1335
+ jl = splits(koptimal,tiecount-1);
1336
+ jr = tiecount-1;
1337
+ for(k = koptimal; k >= 1; k--)
1338
+ {
1339
+ thresholds(k-1) = 0.5*(a(ties(jl-1))+a(ties(jl)));
1340
+ jr = jl-1;
1341
+ jl = splits(k-1,jl-1);
1342
+ }
1343
+ }
1344
+ }
1345
+
1346
+
1347
+ /*************************************************************************
1348
+ Subroutine prepares K-fold split of the training set.
1349
+
1350
+ NOTES:
1351
+ "NClasses>0" means that we have classification task.
1352
+ "NClasses<0" means regression task with -NClasses real outputs.
1353
+
1354
+ -- ALGLIB --
1355
+ Copyright 11.01.2009 by Bochkanov Sergey
1356
+ *************************************************************************/
1357
+ static void dskfoldsplit(const ap::real_2d_array& xy,
1358
+ int npoints,
1359
+ int nclasses,
1360
+ int foldscount,
1361
+ bool stratifiedsplits,
1362
+ ap::integer_1d_array& folds)
1363
+ {
1364
+ int i;
1365
+ int j;
1366
+ int k;
1367
+
1368
+
1369
+ //
1370
+ // test parameters
1371
+ //
1372
+ ap::ap_error::make_assertion(npoints>0, "DSKFoldSplit: wrong NPoints!");
1373
+ ap::ap_error::make_assertion(nclasses>1||nclasses<0, "DSKFoldSplit: wrong NClasses!");
1374
+ ap::ap_error::make_assertion(foldscount>=2&&foldscount<=npoints, "DSKFoldSplit: wrong FoldsCount!");
1375
+ ap::ap_error::make_assertion(!stratifiedsplits, "DSKFoldSplit: stratified splits are not supported!");
1376
+
1377
+ //
1378
+ // Folds
1379
+ //
1380
+ folds.setbounds(0, npoints-1);
1381
+ for(i = 0; i <= npoints-1; i++)
1382
+ {
1383
+ folds(i) = i*foldscount/npoints;
1384
+ }
1385
+ for(i = 0; i <= npoints-2; i++)
1386
+ {
1387
+ j = i+ap::randominteger(npoints-i);
1388
+ if( j!=i )
1389
+ {
1390
+ k = folds(i);
1391
+ folds(i) = folds(j);
1392
+ folds(j) = k;
1393
+ }
1394
+ }
1395
+ }
1396
+
1397
+
1398
/*************************************************************************
Internal function: returns X*ln(Y), with the result defined as 0 whenever
X is exactly 0 (ln(Y) is not evaluated in that case).
*************************************************************************/
static double xlny(double x, double y)
{
    if( x==0 )
    {
        return 0;
    }
    return x*log(y);
}
1415
+
1416
+
1417
+ /*************************************************************************
1418
+ Internal function,
1419
+ returns number of samples of class I in Cnt[I]
1420
+ *************************************************************************/
1421
+ static double getcv(const ap::integer_1d_array& cnt, int nc)
1422
+ {
1423
+ double result;
1424
+ int i;
1425
+ double s;
1426
+
1427
+ s = 0;
1428
+ for(i = 0; i <= nc-1; i++)
1429
+ {
1430
+ s = s+cnt(i);
1431
+ }
1432
+ result = 0;
1433
+ for(i = 0; i <= nc-1; i++)
1434
+ {
1435
+ result = result-xlny(double(cnt(i)), cnt(i)/(s+nc-1));
1436
+ }
1437
+ return result;
1438
+ }
1439
+
1440
+
1441
+ /*************************************************************************
1442
+ Internal function, adds number of samples of class I in tie NTie to Cnt[I]
1443
+ *************************************************************************/
1444
+ static void tieaddc(const ap::integer_1d_array& c,
1445
+ const ap::integer_1d_array& ties,
1446
+ int ntie,
1447
+ int nc,
1448
+ ap::integer_1d_array& cnt)
1449
+ {
1450
+ int i;
1451
+
1452
+ for(i = ties(ntie); i <= ties(ntie+1)-1; i++)
1453
+ {
1454
+ cnt(c(i)) = cnt(c(i))+1;
1455
+ }
1456
+ }
1457
+
1458
+
1459
+ /*************************************************************************
1460
+ Internal function, subtracts number of samples of class I in tie NTie to Cnt[I]
1461
+ *************************************************************************/
1462
+ static void tiesubc(const ap::integer_1d_array& c,
1463
+ const ap::integer_1d_array& ties,
1464
+ int ntie,
1465
+ int nc,
1466
+ ap::integer_1d_array& cnt)
1467
+ {
1468
+ int i;
1469
+
1470
+ for(i = ties(ntie); i <= ties(ntie+1)-1; i++)
1471
+ {
1472
+ cnt(c(i)) = cnt(c(i))-1;
1473
+ }
1474
+ }
1475
+
1476
+
1477
+ /*************************************************************************
1478
+ Internal function,
1479
+ returns number of samples of class I in Cnt[I]
1480
+ *************************************************************************/
1481
+ static void tiegetc(const ap::integer_1d_array& c,
1482
+ const ap::integer_1d_array& ties,
1483
+ int ntie,
1484
+ int nc,
1485
+ ap::integer_1d_array& cnt)
1486
+ {
1487
+ int i;
1488
+
1489
+ for(i = 0; i <= nc-1; i++)
1490
+ {
1491
+ cnt(i) = 0;
1492
+ }
1493
+ for(i = ties(ntie); i <= ties(ntie+1)-1; i++)
1494
+ {
1495
+ cnt(c(i)) = cnt(c(i))+1;
1496
+ }
1497
+ }
1498
+
1499
+
1500
+