alglib 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/History.txt +7 -0
  2. data/Manifest.txt +253 -0
  3. data/README.txt +33 -0
  4. data/Rakefile +27 -0
  5. data/ext/Rakefile +24 -0
  6. data/ext/alglib.i +24 -0
  7. data/ext/alglib/Makefile +157 -0
  8. data/ext/alglib/airyf.cpp +372 -0
  9. data/ext/alglib/airyf.h +81 -0
  10. data/ext/alglib/alglib.cpp +8558 -0
  11. data/ext/alglib/alglib_util.cpp +19 -0
  12. data/ext/alglib/alglib_util.h +14 -0
  13. data/ext/alglib/ap.cpp +877 -0
  14. data/ext/alglib/ap.english.html +364 -0
  15. data/ext/alglib/ap.h +666 -0
  16. data/ext/alglib/ap.russian.html +442 -0
  17. data/ext/alglib/apvt.h +754 -0
  18. data/ext/alglib/bdss.cpp +1500 -0
  19. data/ext/alglib/bdss.h +251 -0
  20. data/ext/alglib/bdsvd.cpp +1339 -0
  21. data/ext/alglib/bdsvd.h +164 -0
  22. data/ext/alglib/bessel.cpp +1226 -0
  23. data/ext/alglib/bessel.h +331 -0
  24. data/ext/alglib/betaf.cpp +105 -0
  25. data/ext/alglib/betaf.h +74 -0
  26. data/ext/alglib/bidiagonal.cpp +1328 -0
  27. data/ext/alglib/bidiagonal.h +350 -0
  28. data/ext/alglib/binomialdistr.cpp +247 -0
  29. data/ext/alglib/binomialdistr.h +153 -0
  30. data/ext/alglib/blas.cpp +576 -0
  31. data/ext/alglib/blas.h +132 -0
  32. data/ext/alglib/cblas.cpp +226 -0
  33. data/ext/alglib/cblas.h +57 -0
  34. data/ext/alglib/cdet.cpp +138 -0
  35. data/ext/alglib/cdet.h +92 -0
  36. data/ext/alglib/chebyshev.cpp +216 -0
  37. data/ext/alglib/chebyshev.h +76 -0
  38. data/ext/alglib/chisquaredistr.cpp +157 -0
  39. data/ext/alglib/chisquaredistr.h +144 -0
  40. data/ext/alglib/cholesky.cpp +285 -0
  41. data/ext/alglib/cholesky.h +86 -0
  42. data/ext/alglib/cinverse.cpp +298 -0
  43. data/ext/alglib/cinverse.h +111 -0
  44. data/ext/alglib/clu.cpp +337 -0
  45. data/ext/alglib/clu.h +120 -0
  46. data/ext/alglib/correlation.cpp +280 -0
  47. data/ext/alglib/correlation.h +77 -0
  48. data/ext/alglib/correlationtests.cpp +726 -0
  49. data/ext/alglib/correlationtests.h +134 -0
  50. data/ext/alglib/crcond.cpp +826 -0
  51. data/ext/alglib/crcond.h +148 -0
  52. data/ext/alglib/creflections.cpp +310 -0
  53. data/ext/alglib/creflections.h +165 -0
  54. data/ext/alglib/csolve.cpp +312 -0
  55. data/ext/alglib/csolve.h +99 -0
  56. data/ext/alglib/ctrinverse.cpp +387 -0
  57. data/ext/alglib/ctrinverse.h +98 -0
  58. data/ext/alglib/ctrlinsolve.cpp +297 -0
  59. data/ext/alglib/ctrlinsolve.h +81 -0
  60. data/ext/alglib/dawson.cpp +234 -0
  61. data/ext/alglib/dawson.h +74 -0
  62. data/ext/alglib/descriptivestatistics.cpp +436 -0
  63. data/ext/alglib/descriptivestatistics.h +112 -0
  64. data/ext/alglib/det.cpp +140 -0
  65. data/ext/alglib/det.h +94 -0
  66. data/ext/alglib/dforest.cpp +1819 -0
  67. data/ext/alglib/dforest.h +316 -0
  68. data/ext/alglib/elliptic.cpp +497 -0
  69. data/ext/alglib/elliptic.h +217 -0
  70. data/ext/alglib/estnorm.cpp +429 -0
  71. data/ext/alglib/estnorm.h +107 -0
  72. data/ext/alglib/expintegrals.cpp +422 -0
  73. data/ext/alglib/expintegrals.h +108 -0
  74. data/ext/alglib/faq.english.html +258 -0
  75. data/ext/alglib/faq.russian.html +272 -0
  76. data/ext/alglib/fdistr.cpp +202 -0
  77. data/ext/alglib/fdistr.h +163 -0
  78. data/ext/alglib/fresnel.cpp +211 -0
  79. data/ext/alglib/fresnel.h +91 -0
  80. data/ext/alglib/gammaf.cpp +338 -0
  81. data/ext/alglib/gammaf.h +104 -0
  82. data/ext/alglib/gqgengauss.cpp +235 -0
  83. data/ext/alglib/gqgengauss.h +92 -0
  84. data/ext/alglib/gqgenhermite.cpp +268 -0
  85. data/ext/alglib/gqgenhermite.h +63 -0
  86. data/ext/alglib/gqgenjacobi.cpp +297 -0
  87. data/ext/alglib/gqgenjacobi.h +72 -0
  88. data/ext/alglib/gqgenlaguerre.cpp +265 -0
  89. data/ext/alglib/gqgenlaguerre.h +69 -0
  90. data/ext/alglib/gqgenlegendre.cpp +300 -0
  91. data/ext/alglib/gqgenlegendre.h +62 -0
  92. data/ext/alglib/gqgenlobatto.cpp +305 -0
  93. data/ext/alglib/gqgenlobatto.h +97 -0
  94. data/ext/alglib/gqgenradau.cpp +232 -0
  95. data/ext/alglib/gqgenradau.h +95 -0
  96. data/ext/alglib/hbisinv.cpp +480 -0
  97. data/ext/alglib/hbisinv.h +183 -0
  98. data/ext/alglib/hblas.cpp +228 -0
  99. data/ext/alglib/hblas.h +64 -0
  100. data/ext/alglib/hcholesky.cpp +339 -0
  101. data/ext/alglib/hcholesky.h +91 -0
  102. data/ext/alglib/hermite.cpp +114 -0
  103. data/ext/alglib/hermite.h +49 -0
  104. data/ext/alglib/hessenberg.cpp +370 -0
  105. data/ext/alglib/hessenberg.h +152 -0
  106. data/ext/alglib/hevd.cpp +247 -0
  107. data/ext/alglib/hevd.h +107 -0
  108. data/ext/alglib/hsschur.cpp +1316 -0
  109. data/ext/alglib/hsschur.h +108 -0
  110. data/ext/alglib/htridiagonal.cpp +734 -0
  111. data/ext/alglib/htridiagonal.h +180 -0
  112. data/ext/alglib/ialglib.cpp +6 -0
  113. data/ext/alglib/ialglib.h +9 -0
  114. data/ext/alglib/ibetaf.cpp +960 -0
  115. data/ext/alglib/ibetaf.h +125 -0
  116. data/ext/alglib/igammaf.cpp +430 -0
  117. data/ext/alglib/igammaf.h +157 -0
  118. data/ext/alglib/inv.cpp +274 -0
  119. data/ext/alglib/inv.h +115 -0
  120. data/ext/alglib/inverseupdate.cpp +480 -0
  121. data/ext/alglib/inverseupdate.h +185 -0
  122. data/ext/alglib/jacobianelliptic.cpp +164 -0
  123. data/ext/alglib/jacobianelliptic.h +94 -0
  124. data/ext/alglib/jarquebera.cpp +2271 -0
  125. data/ext/alglib/jarquebera.h +80 -0
  126. data/ext/alglib/kmeans.cpp +356 -0
  127. data/ext/alglib/kmeans.h +76 -0
  128. data/ext/alglib/laguerre.cpp +94 -0
  129. data/ext/alglib/laguerre.h +48 -0
  130. data/ext/alglib/lbfgs.cpp +1167 -0
  131. data/ext/alglib/lbfgs.h +218 -0
  132. data/ext/alglib/lda.cpp +434 -0
  133. data/ext/alglib/lda.h +133 -0
  134. data/ext/alglib/ldlt.cpp +1130 -0
  135. data/ext/alglib/ldlt.h +124 -0
  136. data/ext/alglib/leastsquares.cpp +1252 -0
  137. data/ext/alglib/leastsquares.h +290 -0
  138. data/ext/alglib/legendre.cpp +107 -0
  139. data/ext/alglib/legendre.h +49 -0
  140. data/ext/alglib/linreg.cpp +1185 -0
  141. data/ext/alglib/linreg.h +380 -0
  142. data/ext/alglib/logit.cpp +1523 -0
  143. data/ext/alglib/logit.h +333 -0
  144. data/ext/alglib/lq.cpp +399 -0
  145. data/ext/alglib/lq.h +160 -0
  146. data/ext/alglib/lu.cpp +462 -0
  147. data/ext/alglib/lu.h +119 -0
  148. data/ext/alglib/mannwhitneyu.cpp +4490 -0
  149. data/ext/alglib/mannwhitneyu.h +115 -0
  150. data/ext/alglib/minlm.cpp +918 -0
  151. data/ext/alglib/minlm.h +312 -0
  152. data/ext/alglib/mlpbase.cpp +3375 -0
  153. data/ext/alglib/mlpbase.h +589 -0
  154. data/ext/alglib/mlpe.cpp +1369 -0
  155. data/ext/alglib/mlpe.h +552 -0
  156. data/ext/alglib/mlptrain.cpp +1056 -0
  157. data/ext/alglib/mlptrain.h +283 -0
  158. data/ext/alglib/nearunityunit.cpp +91 -0
  159. data/ext/alglib/nearunityunit.h +17 -0
  160. data/ext/alglib/normaldistr.cpp +377 -0
  161. data/ext/alglib/normaldistr.h +175 -0
  162. data/ext/alglib/nsevd.cpp +1869 -0
  163. data/ext/alglib/nsevd.h +140 -0
  164. data/ext/alglib/pca.cpp +168 -0
  165. data/ext/alglib/pca.h +87 -0
  166. data/ext/alglib/poissondistr.cpp +143 -0
  167. data/ext/alglib/poissondistr.h +130 -0
  168. data/ext/alglib/polinterpolation.cpp +685 -0
  169. data/ext/alglib/polinterpolation.h +206 -0
  170. data/ext/alglib/psif.cpp +173 -0
  171. data/ext/alglib/psif.h +88 -0
  172. data/ext/alglib/qr.cpp +414 -0
  173. data/ext/alglib/qr.h +168 -0
  174. data/ext/alglib/ratinterpolation.cpp +134 -0
  175. data/ext/alglib/ratinterpolation.h +72 -0
  176. data/ext/alglib/rcond.cpp +705 -0
  177. data/ext/alglib/rcond.h +140 -0
  178. data/ext/alglib/reflections.cpp +504 -0
  179. data/ext/alglib/reflections.h +165 -0
  180. data/ext/alglib/rotations.cpp +473 -0
  181. data/ext/alglib/rotations.h +128 -0
  182. data/ext/alglib/rsolve.cpp +221 -0
  183. data/ext/alglib/rsolve.h +99 -0
  184. data/ext/alglib/sbisinv.cpp +217 -0
  185. data/ext/alglib/sbisinv.h +171 -0
  186. data/ext/alglib/sblas.cpp +185 -0
  187. data/ext/alglib/sblas.h +64 -0
  188. data/ext/alglib/schur.cpp +156 -0
  189. data/ext/alglib/schur.h +102 -0
  190. data/ext/alglib/sdet.cpp +193 -0
  191. data/ext/alglib/sdet.h +101 -0
  192. data/ext/alglib/sevd.cpp +116 -0
  193. data/ext/alglib/sevd.h +99 -0
  194. data/ext/alglib/sinverse.cpp +672 -0
  195. data/ext/alglib/sinverse.h +138 -0
  196. data/ext/alglib/spddet.cpp +138 -0
  197. data/ext/alglib/spddet.h +96 -0
  198. data/ext/alglib/spdgevd.cpp +842 -0
  199. data/ext/alglib/spdgevd.h +200 -0
  200. data/ext/alglib/spdinverse.cpp +509 -0
  201. data/ext/alglib/spdinverse.h +122 -0
  202. data/ext/alglib/spdrcond.cpp +421 -0
  203. data/ext/alglib/spdrcond.h +118 -0
  204. data/ext/alglib/spdsolve.cpp +275 -0
  205. data/ext/alglib/spdsolve.h +105 -0
  206. data/ext/alglib/spline2d.cpp +1192 -0
  207. data/ext/alglib/spline2d.h +301 -0
  208. data/ext/alglib/spline3.cpp +1264 -0
  209. data/ext/alglib/spline3.h +290 -0
  210. data/ext/alglib/srcond.cpp +595 -0
  211. data/ext/alglib/srcond.h +127 -0
  212. data/ext/alglib/ssolve.cpp +895 -0
  213. data/ext/alglib/ssolve.h +139 -0
  214. data/ext/alglib/stdafx.h +0 -0
  215. data/ext/alglib/stest.cpp +131 -0
  216. data/ext/alglib/stest.h +94 -0
  217. data/ext/alglib/studenttdistr.cpp +222 -0
  218. data/ext/alglib/studenttdistr.h +115 -0
  219. data/ext/alglib/studentttests.cpp +377 -0
  220. data/ext/alglib/studentttests.h +178 -0
  221. data/ext/alglib/svd.cpp +620 -0
  222. data/ext/alglib/svd.h +126 -0
  223. data/ext/alglib/tdbisinv.cpp +2608 -0
  224. data/ext/alglib/tdbisinv.h +228 -0
  225. data/ext/alglib/tdevd.cpp +1229 -0
  226. data/ext/alglib/tdevd.h +115 -0
  227. data/ext/alglib/tridiagonal.cpp +594 -0
  228. data/ext/alglib/tridiagonal.h +171 -0
  229. data/ext/alglib/trigintegrals.cpp +490 -0
  230. data/ext/alglib/trigintegrals.h +131 -0
  231. data/ext/alglib/trinverse.cpp +345 -0
  232. data/ext/alglib/trinverse.h +98 -0
  233. data/ext/alglib/trlinsolve.cpp +926 -0
  234. data/ext/alglib/trlinsolve.h +73 -0
  235. data/ext/alglib/tsort.cpp +405 -0
  236. data/ext/alglib/tsort.h +54 -0
  237. data/ext/alglib/variancetests.cpp +245 -0
  238. data/ext/alglib/variancetests.h +134 -0
  239. data/ext/alglib/wsr.cpp +6285 -0
  240. data/ext/alglib/wsr.h +96 -0
  241. data/ext/ap.i +97 -0
  242. data/ext/correlation.i +24 -0
  243. data/ext/extconf.rb +6 -0
  244. data/ext/logit.i +89 -0
  245. data/lib/alglib.rb +71 -0
  246. data/lib/alglib/correlation.rb +26 -0
  247. data/lib/alglib/linearregression.rb +63 -0
  248. data/lib/alglib/logit.rb +42 -0
  249. data/test/test_alglib.rb +52 -0
  250. data/test/test_correlation.rb +44 -0
  251. data/test/test_correlationtest.rb +45 -0
  252. data/test/test_linreg.rb +35 -0
  253. data/test/test_logit.rb +43 -0
  254. data/test/test_pca.rb +27 -0
  255. metadata +326 -0
@@ -0,0 +1,112 @@
1
+ /*************************************************************************
2
+ Copyright (c) 2007, Sergey Bochkanov (ALGLIB project).
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ - Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ - Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer listed
13
+ in this license in the documentation and/or other materials
14
+ provided with the distribution.
15
+
16
+ - Neither the name of the copyright holders nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ *************************************************************************/
32
+
33
+ #ifndef _descriptivestatistics_h
34
+ #define _descriptivestatistics_h
35
+
36
+ #include "ap.h"
37
+ #include "ialglib.h"
38
+
39
+ /*************************************************************************
40
+ Calculation of the distribution moments: mean, variance, skewness, kurtosis.
41
+
42
+ Input parameters:
43
+ X - sample. Array whose indexes range within [0..N-1]
44
+ N - sample size.
45
+
46
+ Output parameters:
47
+ Mean - mean.
48
+ Variance- variance.
49
+ Skewness- skewness (if variance<>0; zero otherwise).
50
+ Kurtosis- kurtosis (if variance<>0; zero otherwise).
51
+
52
+ -- ALGLIB --
53
+ Copyright 06.09.2006 by Bochkanov Sergey
54
+ *************************************************************************/
55
+ void calculatemoments(const ap::real_1d_array& x,
56
+ int n,
57
+ double& mean,
58
+ double& variance,
59
+ double& skewness,
60
+ double& kurtosis);
61
+
62
+
63
+ /*************************************************************************
64
+ ADev
65
+
66
+ Input parameters:
67
+ X - sample (array indexes: [0..N-1])
68
+ N - sample size
69
+
70
+ Output parameters:
71
+ ADev- ADev
72
+
73
+ -- ALGLIB --
74
+ Copyright 06.09.2006 by Bochkanov Sergey
75
+ *************************************************************************/
76
+ void calculateadev(const ap::real_1d_array& x, int n, double& adev);
77
+
78
+
79
+ /*************************************************************************
80
+ Median calculation.
81
+
82
+ Input parameters:
83
+ X - sample (array indexes: [0..N-1])
84
+ N - sample size
85
+
86
+ Output parameters:
87
+ Median
88
+
89
+ -- ALGLIB --
90
+ Copyright 06.09.2006 by Bochkanov Sergey
91
+ *************************************************************************/
92
+ void calculatemedian(ap::real_1d_array x, int n, double& median);
93
+
94
+
95
+ /*************************************************************************
96
+ Percentile calculation.
97
+
98
+ Input parameters:
99
+ X - sample (array indexes: [0..N-1])
100
+ N - sample size, N>1
101
+ P - percentile (0<=P<=1)
102
+
103
+ Output parameters:
104
+ V - percentile
105
+
106
+ -- ALGLIB --
107
+ Copyright 01.03.2008 by Bochkanov Sergey
108
+ *************************************************************************/
109
+ void calculatepercentile(ap::real_1d_array x, int n, double p, double& v);
110
+
111
+
112
+ #endif
@@ -0,0 +1,140 @@
1
+ /*************************************************************************
2
+ Copyright (c) 2005-2007, Sergey Bochkanov (ALGLIB project).
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ - Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ - Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer listed
13
+ in this license in the documentation and/or other materials
14
+ provided with the distribution.
15
+
16
+ - Neither the name of the copyright holders nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ *************************************************************************/
32
+
33
+ #include <stdafx.h>
34
+ #include "det.h"
35
+
36
+ /*************************************************************************
37
+ Determinant calculation of the matrix given by its LU decomposition.
38
+
39
+ Input parameters:
40
+ A - LU decomposition of the matrix (output of
41
+ RMatrixLU subroutine).
42
+ Pivots - table of permutations which were made during
43
+ the LU decomposition.
44
+ Output of RMatrixLU subroutine.
45
+ N - size of matrix A.
46
+
47
+ Result: matrix determinant.
48
+
49
+ -- ALGLIB --
50
+ Copyright 2005 by Bochkanov Sergey
51
+ *************************************************************************/
52
+ double rmatrixludet(const ap::real_2d_array& a,
53
+ const ap::integer_1d_array& pivots,
54
+ int n)
55
+ {
56
+ double result;
57
+ int i;
58
+ int s;
59
+
60
+ result = 1;
61
+ s = 1;
62
+ for(i = 0; i <= n-1; i++)
63
+ {
64
+ result = result*a(i,i);
65
+ if( pivots(i)!=i )
66
+ {
67
+ s = -s;
68
+ }
69
+ }
70
+ result = result*s;
71
+ return result;
72
+ }
73
+
74
+
75
+ /*************************************************************************
76
+ Calculation of the determinant of a general matrix
77
+
78
+ Input parameters:
79
+ A - matrix, array[0..N-1, 0..N-1]
80
+ N - size of matrix A.
81
+
82
+ Result: determinant of matrix A.
83
+
84
+ -- ALGLIB --
85
+ Copyright 2005 by Bochkanov Sergey
86
+ *************************************************************************/
87
+ double rmatrixdet(ap::real_2d_array a, int n)
88
+ {
89
+ double result;
90
+ ap::integer_1d_array pivots;
91
+
92
+ rmatrixlu(a, n, n, pivots);
93
+ result = rmatrixludet(a, pivots, n);
94
+ return result;
95
+ }
96
+
97
+
98
+ /*************************************************************************
99
+ Obsolete 1-based subroutine.
100
+ See RMatrixDetLU for 0-based replacement.
101
+ *************************************************************************/
102
+ double determinantlu(const ap::real_2d_array& a,
103
+ const ap::integer_1d_array& pivots,
104
+ int n)
105
+ {
106
+ double result;
107
+ int i;
108
+ int s;
109
+
110
+ result = 1;
111
+ s = 1;
112
+ for(i = 1; i <= n; i++)
113
+ {
114
+ result = result*a(i,i);
115
+ if( pivots(i)!=i )
116
+ {
117
+ s = -s;
118
+ }
119
+ }
120
+ result = result*s;
121
+ return result;
122
+ }
123
+
124
+
125
+ /*************************************************************************
126
+ Obsolete 1-based subroutine.
127
+ See RMatrixDet for 0-based replacement.
128
+ *************************************************************************/
129
+ double determinant(ap::real_2d_array a, int n)
130
+ {
131
+ double result;
132
+ ap::integer_1d_array pivots;
133
+
134
+ ludecomposition(a, n, n, pivots);
135
+ result = determinantlu(a, pivots, n);
136
+ return result;
137
+ }
138
+
139
+
140
+
@@ -0,0 +1,94 @@
1
+ /*************************************************************************
2
+ Copyright (c) 2005-2007, Sergey Bochkanov (ALGLIB project).
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ - Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ - Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer listed
13
+ in this license in the documentation and/or other materials
14
+ provided with the distribution.
15
+
16
+ - Neither the name of the copyright holders nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ *************************************************************************/
32
+
33
+ #ifndef _det_h
34
+ #define _det_h
35
+
36
+ #include "ap.h"
37
+ #include "ialglib.h"
38
+
39
+ #include "lu.h"
40
+
41
+
42
+ /*************************************************************************
43
+ Determinant calculation of the matrix given by its LU decomposition.
44
+
45
+ Input parameters:
46
+ A - LU decomposition of the matrix (output of
47
+ RMatrixLU subroutine).
48
+ Pivots - table of permutations which were made during
49
+ the LU decomposition.
50
+ Output of RMatrixLU subroutine.
51
+ N - size of matrix A.
52
+
53
+ Result: matrix determinant.
54
+
55
+ -- ALGLIB --
56
+ Copyright 2005 by Bochkanov Sergey
57
+ *************************************************************************/
58
+ double rmatrixludet(const ap::real_2d_array& a,
59
+ const ap::integer_1d_array& pivots,
60
+ int n);
61
+
62
+
63
+ /*************************************************************************
64
+ Calculation of the determinant of a general matrix
65
+
66
+ Input parameters:
67
+ A - matrix, array[0..N-1, 0..N-1]
68
+ N - size of matrix A.
69
+
70
+ Result: determinant of matrix A.
71
+
72
+ -- ALGLIB --
73
+ Copyright 2005 by Bochkanov Sergey
74
+ *************************************************************************/
75
+ double rmatrixdet(ap::real_2d_array a, int n);
76
+
77
+
78
+ /*************************************************************************
79
+ Obsolete 1-based subroutine.
80
+ See RMatrixLUDet for 0-based replacement.
81
+ *************************************************************************/
82
+ double determinantlu(const ap::real_2d_array& a,
83
+ const ap::integer_1d_array& pivots,
84
+ int n);
85
+
86
+
87
+ /*************************************************************************
88
+ Obsolete 1-based subroutine.
89
+ See RMatrixDet for 0-based replacement.
90
+ *************************************************************************/
91
+ double determinant(ap::real_2d_array a, int n);
92
+
93
+
94
+ #endif
@@ -0,0 +1,1819 @@
1
+ /*************************************************************************
2
+ Copyright (c) 2009, Sergey Bochkanov (ALGLIB project).
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ - Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ - Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer listed
13
+ in this license in the documentation and/or other materials
14
+ provided with the distribution.
15
+
16
+ - Neither the name of the copyright holders nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ *************************************************************************/
32
+
33
+ #include <stdafx.h>
34
+ #include "dforest.h"
35
+
36
+ static const int dfvnum = 8;
37
+ static const int innernodewidth = 3;
38
+ static const int leafnodewidth = 2;
39
+ static const int dfusestrongsplits = 1;
40
+ static const int dfuseevs = 2;
41
+
42
+ static int dfclserror(const decisionforest& df,
43
+ const ap::real_2d_array& xy,
44
+ int npoints);
45
+ static void dfprocessinternal(const decisionforest& df,
46
+ int offs,
47
+ const ap::real_1d_array& x,
48
+ ap::real_1d_array& y);
49
+ static void dfbuildtree(const ap::real_2d_array& xy,
50
+ int npoints,
51
+ int nvars,
52
+ int nclasses,
53
+ int nfeatures,
54
+ int nvarsinpool,
55
+ int flags,
56
+ dfinternalbuffers& bufs);
57
+ static void dfbuildtreerec(const ap::real_2d_array& xy,
58
+ int npoints,
59
+ int nvars,
60
+ int nclasses,
61
+ int nfeatures,
62
+ int nvarsinpool,
63
+ int flags,
64
+ int& numprocessed,
65
+ int idx1,
66
+ int idx2,
67
+ dfinternalbuffers& bufs);
68
+ static void dfweakspliti(ap::real_1d_array& x,
69
+ ap::integer_1d_array& y,
70
+ int n,
71
+ int nclasses,
72
+ int& info,
73
+ double& threshold,
74
+ double& e);
75
+ static void dfsplitc(ap::real_1d_array& x,
76
+ ap::integer_1d_array& c,
77
+ ap::integer_1d_array& cntbuf,
78
+ int n,
79
+ int nc,
80
+ int flags,
81
+ int& info,
82
+ double& threshold,
83
+ double& e);
84
+ static void dfsplitr(ap::real_1d_array& x,
85
+ ap::real_1d_array& y,
86
+ int n,
87
+ int flags,
88
+ int& info,
89
+ double& threshold,
90
+ double& e);
91
+
92
+ /*************************************************************************
93
+ This subroutine builds random decision forest.
94
+
95
+ INPUT PARAMETERS:
96
+ XY - training set
97
+ NPoints - training set size, NPoints>=1
98
+ NVars - number of independent variables, NVars>=1
99
+ NClasses - task type:
100
+ * NClasses=1 - regression task with one
101
+ dependent variable
102
+ * NClasses>1 - classification task with
103
+ NClasses classes.
104
+ NTrees - number of trees in a forest, NTrees>=1.
105
+ recommended values: 50-100.
106
+ R - percent of a training set used to build
107
+ individual trees. 0<R<=1.
108
+ recommended values: 0.1 <= R <= 0.66.
109
+
110
+ OUTPUT PARAMETERS:
111
+ Info - return code:
112
+ * -2, if there is a point with class number
113
+ outside of [0..NClasses-1].
114
+ * -1, if incorrect parameters was passed
115
+ (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
116
+ or R>1).
117
+ * 1, if task has been solved
118
+ DF - model built
119
+ Rep - training report, contains error on a training set
120
+ and out-of-bag estimates of generalization error.
121
+
122
+ -- ALGLIB --
123
+ Copyright 19.02.2009 by Bochkanov Sergey
124
+ *************************************************************************/
125
+ void dfbuildrandomdecisionforest(const ap::real_2d_array& xy,
126
+ int npoints,
127
+ int nvars,
128
+ int nclasses,
129
+ int ntrees,
130
+ double r,
131
+ int& info,
132
+ decisionforest& df,
133
+ dfreport& rep)
134
+ {
135
+ int samplesize;
136
+
137
+ if( r<=0||r>1 )
138
+ {
139
+ info = -1;
140
+ return;
141
+ }
142
+ samplesize = ap::maxint(ap::round(r*npoints), 1);
143
+ dfbuildinternal(xy, npoints, nvars, nclasses, ntrees, samplesize, ap::maxint(nvars/2, 1), dfusestrongsplits+dfuseevs, info, df, rep);
144
+ }
145
+
146
+
147
+ /*************************************************************************
148
+ Internal decision forest building subroutine,
149
+ should not be called by user.
150
+
151
+ -- ALGLIB --
152
+ Copyright 19.02.2009 by Bochkanov Sergey
153
+ *************************************************************************/
154
+ void dfbuildinternal(const ap::real_2d_array& xy,
155
+ int npoints,
156
+ int nvars,
157
+ int nclasses,
158
+ int ntrees,
159
+ int samplesize,
160
+ int nfeatures,
161
+ int flags,
162
+ int& info,
163
+ decisionforest& df,
164
+ dfreport& rep)
165
+ {
166
+ int i;
167
+ int j;
168
+ int k;
169
+ int tmpi;
170
+ int lasttreeoffs;
171
+ int offs;
172
+ int ooboffs;
173
+ int treesize;
174
+ int nvarsinpool;
175
+ bool useevs;
176
+ dfinternalbuffers bufs;
177
+ ap::integer_1d_array permbuf;
178
+ ap::real_1d_array oobbuf;
179
+ ap::integer_1d_array oobcntbuf;
180
+ ap::real_2d_array xys;
181
+ ap::real_1d_array x;
182
+ ap::real_1d_array y;
183
+ int oobcnt;
184
+ int oobrelcnt;
185
+ double v;
186
+ double vmin;
187
+ double vmax;
188
+ bool bflag;
189
+
190
+
191
+ //
192
+ // Test for inputs
193
+ //
194
+ if( npoints<1||samplesize<1||samplesize>npoints||nvars<1||nclasses<1||ntrees<1||nfeatures<1 )
195
+ {
196
+ info = -1;
197
+ return;
198
+ }
199
+ if( nclasses>1 )
200
+ {
201
+ for(i = 0; i <= npoints-1; i++)
202
+ {
203
+ if( ap::round(xy(i,nvars))<0||ap::round(xy(i,nvars))>=nclasses )
204
+ {
205
+ info = -2;
206
+ return;
207
+ }
208
+ }
209
+ }
210
+ info = 1;
211
+
212
+ //
213
+ // Flags
214
+ //
215
+ useevs = flags/dfuseevs%2!=0;
216
+
217
+ //
218
+ // Allocate data, prepare header
219
+ //
220
+ treesize = 1+innernodewidth*(samplesize-1)+leafnodewidth*samplesize;
221
+ permbuf.setbounds(0, npoints-1);
222
+ bufs.treebuf.setbounds(0, treesize-1);
223
+ bufs.idxbuf.setbounds(0, npoints-1);
224
+ bufs.tmpbufr.setbounds(0, npoints-1);
225
+ bufs.tmpbufr2.setbounds(0, npoints-1);
226
+ bufs.tmpbufi.setbounds(0, npoints-1);
227
+ bufs.varpool.setbounds(0, nvars-1);
228
+ bufs.evsbin.setbounds(0, nvars-1);
229
+ bufs.evssplits.setbounds(0, nvars-1);
230
+ bufs.classibuf.setbounds(0, 2*nclasses-1);
231
+ oobbuf.setbounds(0, nclasses*npoints-1);
232
+ oobcntbuf.setbounds(0, npoints-1);
233
+ df.trees.setbounds(0, ntrees*treesize-1);
234
+ xys.setbounds(0, samplesize-1, 0, nvars);
235
+ x.setbounds(0, nvars-1);
236
+ y.setbounds(0, nclasses-1);
237
+ for(i = 0; i <= npoints-1; i++)
238
+ {
239
+ permbuf(i) = i;
240
+ }
241
+ for(i = 0; i <= npoints*nclasses-1; i++)
242
+ {
243
+ oobbuf(i) = 0;
244
+ }
245
+ for(i = 0; i <= npoints-1; i++)
246
+ {
247
+ oobcntbuf(i) = 0;
248
+ }
249
+
250
+ //
251
+ // Prepare variable pool and EVS (extended variable selection/splitting) buffers
252
+ // (whether EVS is turned on or not):
253
+ // 1. detect binary variables and pre-calculate splits for them
254
+ // 2. detect variables with non-distinct values and exclude them from pool
255
+ //
256
+ for(i = 0; i <= nvars-1; i++)
257
+ {
258
+ bufs.varpool(i) = i;
259
+ }
260
+ nvarsinpool = nvars;
261
+ if( useevs )
262
+ {
263
+ for(j = 0; j <= nvars-1; j++)
264
+ {
265
+ vmin = xy(0,j);
266
+ vmax = vmin;
267
+ for(i = 0; i <= npoints-1; i++)
268
+ {
269
+ v = xy(i,j);
270
+ vmin = ap::minreal(vmin, v);
271
+ vmax = ap::maxreal(vmax, v);
272
+ }
273
+ if( vmin==vmax )
274
+ {
275
+
276
+ //
277
+ // exclude variable from pool
278
+ //
279
+ bufs.varpool(j) = bufs.varpool(nvarsinpool-1);
280
+ bufs.varpool(nvarsinpool-1) = -1;
281
+ nvarsinpool = nvarsinpool-1;
282
+ continue;
283
+ }
284
+ bflag = false;
285
+ for(i = 0; i <= npoints-1; i++)
286
+ {
287
+ v = xy(i,j);
288
+ if( v!=vmin&&v!=vmax )
289
+ {
290
+ bflag = true;
291
+ break;
292
+ }
293
+ }
294
+ if( bflag )
295
+ {
296
+
297
+ //
298
+ // non-binary variable
299
+ //
300
+ bufs.evsbin(j) = false;
301
+ }
302
+ else
303
+ {
304
+
305
+ //
306
+ // Prepare
307
+ //
308
+ bufs.evsbin(j) = true;
309
+ bufs.evssplits(j) = 0.5*(vmin+vmax);
310
+ if( bufs.evssplits(j)<=vmin )
311
+ {
312
+ bufs.evssplits(j) = vmax;
313
+ }
314
+ }
315
+ }
316
+ }
317
+
318
+ //
319
+ // RANDOM FOREST FORMAT
320
+ // W[0] - size of array
321
+ // W[1] - version number
322
+ // W[2] - NVars
323
+ // W[3] - NClasses (1 for regression)
324
+ // W[4] - NTrees
325
+ // W[5] - trees offset
326
+ //
327
+ //
328
+ // TREE FORMAT
329
+ // W[Offs] - size of sub-array
330
+ // node info:
331
+ // W[K+0] - variable number (-1 for leaf node)
332
+ // W[K+1] - threshold (class/value for leaf node)
333
+ // W[K+2] - ">=" branch index (absent for leaf node)
334
+ //
335
+ //
336
+ df.nvars = nvars;
337
+ df.nclasses = nclasses;
338
+ df.ntrees = ntrees;
339
+
340
+ //
341
+ // Build forest
342
+ //
343
+ offs = 0;
344
+ for(i = 0; i <= ntrees-1; i++)
345
+ {
346
+
347
+ //
348
+ // Prepare sample
349
+ //
350
+ for(k = 0; k <= samplesize-1; k++)
351
+ {
352
+ j = k+ap::randominteger(npoints-k);
353
+ tmpi = permbuf(k);
354
+ permbuf(k) = permbuf(j);
355
+ permbuf(j) = tmpi;
356
+ j = permbuf(k);
357
+ ap::vmove(&xys(k, 0), &xy(j, 0), ap::vlen(0,nvars));
358
+ }
359
+
360
+ //
361
+ // build tree, copy
362
+ //
363
+ dfbuildtree(xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, bufs);
364
+ j = ap::round(bufs.treebuf(0));
365
+ ap::vmove(&df.trees(offs), &bufs.treebuf(0), ap::vlen(offs,offs+j-1));
366
+ lasttreeoffs = offs;
367
+ offs = offs+j;
368
+
369
+ //
370
+ // OOB estimates
371
+ //
372
+ for(k = samplesize; k <= npoints-1; k++)
373
+ {
374
+ for(j = 0; j <= nclasses-1; j++)
375
+ {
376
+ y(j) = 0;
377
+ }
378
+ j = permbuf(k);
379
+ ap::vmove(&x(0), &xy(j, 0), ap::vlen(0,nvars-1));
380
+ dfprocessinternal(df, lasttreeoffs, x, y);
381
+ ap::vadd(&oobbuf(j*nclasses), &y(0), ap::vlen(j*nclasses,(j+1)*nclasses-1));
382
+ oobcntbuf(j) = oobcntbuf(j)+1;
383
+ }
384
+ }
385
+ df.bufsize = offs;
386
+
387
+ //
388
+ // Normalize OOB results
389
+ //
390
+ for(i = 0; i <= npoints-1; i++)
391
+ {
392
+ if( oobcntbuf(i)!=0 )
393
+ {
394
+ v = double(1)/double(oobcntbuf(i));
395
+ ap::vmul(&oobbuf(i*nclasses), ap::vlen(i*nclasses,i*nclasses+nclasses-1), v);
396
+ }
397
+ }
398
+
399
+ //
400
+ // Calculate training set estimates
401
+ //
402
+ rep.relclserror = dfrelclserror(df, xy, npoints);
403
+ rep.avgce = dfavgce(df, xy, npoints);
404
+ rep.rmserror = dfrmserror(df, xy, npoints);
405
+ rep.avgerror = dfavgerror(df, xy, npoints);
406
+ rep.avgrelerror = dfavgrelerror(df, xy, npoints);
407
+
408
+ //
409
+ // Calculate OOB estimates.
410
+ //
411
+ rep.oobrelclserror = 0;
412
+ rep.oobavgce = 0;
413
+ rep.oobrmserror = 0;
414
+ rep.oobavgerror = 0;
415
+ rep.oobavgrelerror = 0;
416
+ oobcnt = 0;
417
+ oobrelcnt = 0;
418
+ for(i = 0; i <= npoints-1; i++)
419
+ {
420
+ if( oobcntbuf(i)!=0 )
421
+ {
422
+ ooboffs = i*nclasses;
423
+ if( nclasses>1 )
424
+ {
425
+
426
+ //
427
+ // classification-specific code
428
+ //
429
+ k = ap::round(xy(i,nvars));
430
+ tmpi = 0;
431
+ for(j = 1; j <= nclasses-1; j++)
432
+ {
433
+ if( oobbuf(ooboffs+j)>oobbuf(ooboffs+tmpi) )
434
+ {
435
+ tmpi = j;
436
+ }
437
+ }
438
+ if( tmpi!=k )
439
+ {
440
+ rep.oobrelclserror = rep.oobrelclserror+1;
441
+ }
442
+ if( oobbuf(ooboffs+k)!=0 )
443
+ {
444
+ rep.oobavgce = rep.oobavgce-log(oobbuf(ooboffs+k));
445
+ }
446
+ else
447
+ {
448
+ rep.oobavgce = rep.oobavgce-log(ap::minrealnumber);
449
+ }
450
+ for(j = 0; j <= nclasses-1; j++)
451
+ {
452
+ if( j==k )
453
+ {
454
+ rep.oobrmserror = rep.oobrmserror+ap::sqr(oobbuf(ooboffs+j)-1);
455
+ rep.oobavgerror = rep.oobavgerror+fabs(oobbuf(ooboffs+j)-1);
456
+ rep.oobavgrelerror = rep.oobavgrelerror+fabs(oobbuf(ooboffs+j)-1);
457
+ oobrelcnt = oobrelcnt+1;
458
+ }
459
+ else
460
+ {
461
+ rep.oobrmserror = rep.oobrmserror+ap::sqr(oobbuf(ooboffs+j));
462
+ rep.oobavgerror = rep.oobavgerror+fabs(oobbuf(ooboffs+j));
463
+ }
464
+ }
465
+ }
466
+ else
467
+ {
468
+
469
+ //
470
+ // regression-specific code
471
+ //
472
+ rep.oobrmserror = rep.oobrmserror+ap::sqr(oobbuf(ooboffs)-xy(i,nvars));
473
+ rep.oobavgerror = rep.oobavgerror+fabs(oobbuf(ooboffs)-xy(i,nvars));
474
+ if( xy(i,nvars)!=0 )
475
+ {
476
+ rep.oobavgrelerror = rep.oobavgrelerror+fabs((oobbuf(ooboffs)-xy(i,nvars))/xy(i,nvars));
477
+ oobrelcnt = oobrelcnt+1;
478
+ }
479
+ }
480
+
481
+ //
482
+ // update OOB estimates count.
483
+ //
484
+ oobcnt = oobcnt+1;
485
+ }
486
+ }
487
+ if( oobcnt>0 )
488
+ {
489
+ rep.oobrelclserror = rep.oobrelclserror/oobcnt;
490
+ rep.oobavgce = rep.oobavgce/oobcnt;
491
+ rep.oobrmserror = sqrt(rep.oobrmserror/(oobcnt*nclasses));
492
+ rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses);
493
+ if( oobrelcnt>0 )
494
+ {
495
+ rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt;
496
+ }
497
+ }
498
+ }
499
+
500
+
501
+ /*************************************************************************
502
+ Procesing
503
+
504
+ INPUT PARAMETERS:
505
+ DF - decision forest model
506
+ X - input vector, array[0..NVars-1].
507
+
508
+ OUTPUT PARAMETERS:
509
+ Y - result. Regression estimate when solving regression task,
510
+ vector of posterior probabilities for classification task.
511
+ Subroutine does not allocate memory for this vector, it is
512
+ responsibility of a caller to allocate it. Array must be
513
+ at least [0..NClasses-1].
514
+
515
+ -- ALGLIB --
516
+ Copyright 16.02.2009 by Bochkanov Sergey
517
+ *************************************************************************/
518
+ void dfprocess(const decisionforest& df,
519
+ const ap::real_1d_array& x,
520
+ ap::real_1d_array& y)
521
+ {
522
+ int offs;
523
+ int i;
524
+ double v;
525
+
526
+
527
+ //
528
+ // Proceed
529
+ //
530
+ offs = 0;
531
+ for(i = 0; i <= df.nclasses-1; i++)
532
+ {
533
+ y(i) = 0;
534
+ }
535
+ for(i = 0; i <= df.ntrees-1; i++)
536
+ {
537
+
538
+ //
539
+ // Process basic tree
540
+ //
541
+ dfprocessinternal(df, offs, x, y);
542
+
543
+ //
544
+ // Next tree
545
+ //
546
+ offs = offs+ap::round(df.trees(offs));
547
+ }
548
+ v = double(1)/double(df.ntrees);
549
+ ap::vmul(&y(0), ap::vlen(0,df.nclasses-1), v);
550
+ }
551
+
552
+
553
+ /*************************************************************************
554
+ Relative classification error on the test set
555
+
556
+ INPUT PARAMETERS:
557
+ DF - decision forest model
558
+ XY - test set
559
+ NPoints - test set size
560
+
561
+ RESULT:
562
+ percent of incorrectly classified cases.
563
+ Zero if model solves regression task.
564
+
565
+ -- ALGLIB --
566
+ Copyright 16.02.2009 by Bochkanov Sergey
567
+ *************************************************************************/
568
+ double dfrelclserror(const decisionforest& df,
569
+ const ap::real_2d_array& xy,
570
+ int npoints)
571
+ {
572
+ double result;
573
+
574
+ result = double(dfclserror(df, xy, npoints))/double(npoints);
575
+ return result;
576
+ }
577
+
578
+
579
+ /*************************************************************************
580
+ Average cross-entropy (in bits per element) on the test set
581
+
582
+ INPUT PARAMETERS:
583
+ DF - decision forest model
584
+ XY - test set
585
+ NPoints - test set size
586
+
587
+ RESULT:
588
+ CrossEntropy/(NPoints*LN(2)).
589
+ Zero if model solves regression task.
590
+
591
+ -- ALGLIB --
592
+ Copyright 16.02.2009 by Bochkanov Sergey
593
+ *************************************************************************/
594
+ double dfavgce(const decisionforest& df,
595
+ const ap::real_2d_array& xy,
596
+ int npoints)
597
+ {
598
+ double result;
599
+ ap::real_1d_array x;
600
+ ap::real_1d_array y;
601
+ int i;
602
+ int j;
603
+ int k;
604
+ int tmpi;
605
+
606
+ x.setbounds(0, df.nvars-1);
607
+ y.setbounds(0, df.nclasses-1);
608
+ result = 0;
609
+ for(i = 0; i <= npoints-1; i++)
610
+ {
611
+ ap::vmove(&x(0), &xy(i, 0), ap::vlen(0,df.nvars-1));
612
+ dfprocess(df, x, y);
613
+ if( df.nclasses>1 )
614
+ {
615
+
616
+ //
617
+ // classification-specific code
618
+ //
619
+ k = ap::round(xy(i,df.nvars));
620
+ tmpi = 0;
621
+ for(j = 1; j <= df.nclasses-1; j++)
622
+ {
623
+ if( y(j)>y(tmpi) )
624
+ {
625
+ tmpi = j;
626
+ }
627
+ }
628
+ if( y(k)!=0 )
629
+ {
630
+ result = result-log(y(k));
631
+ }
632
+ else
633
+ {
634
+ result = result-log(ap::minrealnumber);
635
+ }
636
+ }
637
+ }
638
+ result = result/npoints;
639
+ return result;
640
+ }
641
+
642
+
643
+ /*************************************************************************
644
+ RMS error on the test set
645
+
646
+ INPUT PARAMETERS:
647
+ DF - decision forest model
648
+ XY - test set
649
+ NPoints - test set size
650
+
651
+ RESULT:
652
+ root mean square error.
653
+ Its meaning for regression task is obvious. As for
654
+ classification task, RMS error means error when estimating posterior
655
+ probabilities.
656
+
657
+ -- ALGLIB --
658
+ Copyright 16.02.2009 by Bochkanov Sergey
659
+ *************************************************************************/
660
+ double dfrmserror(const decisionforest& df,
661
+ const ap::real_2d_array& xy,
662
+ int npoints)
663
+ {
664
+ double result;
665
+ ap::real_1d_array x;
666
+ ap::real_1d_array y;
667
+ int i;
668
+ int j;
669
+ int k;
670
+ int tmpi;
671
+
672
+ x.setbounds(0, df.nvars-1);
673
+ y.setbounds(0, df.nclasses-1);
674
+ result = 0;
675
+ for(i = 0; i <= npoints-1; i++)
676
+ {
677
+ ap::vmove(&x(0), &xy(i, 0), ap::vlen(0,df.nvars-1));
678
+ dfprocess(df, x, y);
679
+ if( df.nclasses>1 )
680
+ {
681
+
682
+ //
683
+ // classification-specific code
684
+ //
685
+ k = ap::round(xy(i,df.nvars));
686
+ tmpi = 0;
687
+ for(j = 1; j <= df.nclasses-1; j++)
688
+ {
689
+ if( y(j)>y(tmpi) )
690
+ {
691
+ tmpi = j;
692
+ }
693
+ }
694
+ for(j = 0; j <= df.nclasses-1; j++)
695
+ {
696
+ if( j==k )
697
+ {
698
+ result = result+ap::sqr(y(j)-1);
699
+ }
700
+ else
701
+ {
702
+ result = result+ap::sqr(y(j));
703
+ }
704
+ }
705
+ }
706
+ else
707
+ {
708
+
709
+ //
710
+ // regression-specific code
711
+ //
712
+ result = result+ap::sqr(y(0)-xy(i,df.nvars));
713
+ }
714
+ }
715
+ result = sqrt(result/(npoints*df.nclasses));
716
+ return result;
717
+ }
718
+
719
+
720
+ /*************************************************************************
721
+ Average error on the test set
722
+
723
+ INPUT PARAMETERS:
724
+ DF - decision forest model
725
+ XY - test set
726
+ NPoints - test set size
727
+
728
+ RESULT:
729
+ Its meaning for regression task is obvious. As for
730
+ classification task, it means average error when estimating posterior
731
+ probabilities.
732
+
733
+ -- ALGLIB --
734
+ Copyright 16.02.2009 by Bochkanov Sergey
735
+ *************************************************************************/
736
+ double dfavgerror(const decisionforest& df,
737
+ const ap::real_2d_array& xy,
738
+ int npoints)
739
+ {
740
+ double result;
741
+ ap::real_1d_array x;
742
+ ap::real_1d_array y;
743
+ int i;
744
+ int j;
745
+ int k;
746
+
747
+ x.setbounds(0, df.nvars-1);
748
+ y.setbounds(0, df.nclasses-1);
749
+ result = 0;
750
+ for(i = 0; i <= npoints-1; i++)
751
+ {
752
+ ap::vmove(&x(0), &xy(i, 0), ap::vlen(0,df.nvars-1));
753
+ dfprocess(df, x, y);
754
+ if( df.nclasses>1 )
755
+ {
756
+
757
+ //
758
+ // classification-specific code
759
+ //
760
+ k = ap::round(xy(i,df.nvars));
761
+ for(j = 0; j <= df.nclasses-1; j++)
762
+ {
763
+ if( j==k )
764
+ {
765
+ result = result+fabs(y(j)-1);
766
+ }
767
+ else
768
+ {
769
+ result = result+fabs(y(j));
770
+ }
771
+ }
772
+ }
773
+ else
774
+ {
775
+
776
+ //
777
+ // regression-specific code
778
+ //
779
+ result = result+fabs(y(0)-xy(i,df.nvars));
780
+ }
781
+ }
782
+ result = result/(npoints*df.nclasses);
783
+ return result;
784
+ }
785
+
786
+
787
+ /*************************************************************************
788
+ Average relative error on the test set
789
+
790
+ INPUT PARAMETERS:
791
+ DF - decision forest model
792
+ XY - test set
793
+ NPoints - test set size
794
+
795
+ RESULT:
796
+ Its meaning for regression task is obvious. As for
797
+ classification task, it means average relative error when estimating
798
+ posterior probability of belonging to the correct class.
799
+
800
+ -- ALGLIB --
801
+ Copyright 16.02.2009 by Bochkanov Sergey
802
+ *************************************************************************/
803
+ double dfavgrelerror(const decisionforest& df,
804
+ const ap::real_2d_array& xy,
805
+ int npoints)
806
+ {
807
+ double result;
808
+ ap::real_1d_array x;
809
+ ap::real_1d_array y;
810
+ int relcnt;
811
+ int i;
812
+ int j;
813
+ int k;
814
+
815
+ x.setbounds(0, df.nvars-1);
816
+ y.setbounds(0, df.nclasses-1);
817
+ result = 0;
818
+ relcnt = 0;
819
+ for(i = 0; i <= npoints-1; i++)
820
+ {
821
+ ap::vmove(&x(0), &xy(i, 0), ap::vlen(0,df.nvars-1));
822
+ dfprocess(df, x, y);
823
+ if( df.nclasses>1 )
824
+ {
825
+
826
+ //
827
+ // classification-specific code
828
+ //
829
+ k = ap::round(xy(i,df.nvars));
830
+ for(j = 0; j <= df.nclasses-1; j++)
831
+ {
832
+ if( j==k )
833
+ {
834
+ result = result+fabs(y(j)-1);
835
+ relcnt = relcnt+1;
836
+ }
837
+ }
838
+ }
839
+ else
840
+ {
841
+
842
+ //
843
+ // regression-specific code
844
+ //
845
+ if( xy(i,df.nvars)!=0 )
846
+ {
847
+ result = result+fabs((y(0)-xy(i,df.nvars))/xy(i,df.nvars));
848
+ relcnt = relcnt+1;
849
+ }
850
+ }
851
+ }
852
+ if( relcnt>0 )
853
+ {
854
+ result = result/relcnt;
855
+ }
856
+ return result;
857
+ }
858
+
859
+
860
+ /*************************************************************************
861
+ Copying of DecisionForest strucure
862
+
863
+ INPUT PARAMETERS:
864
+ DF1 - original
865
+
866
+ OUTPUT PARAMETERS:
867
+ DF2 - copy
868
+
869
+ -- ALGLIB --
870
+ Copyright 13.02.2009 by Bochkanov Sergey
871
+ *************************************************************************/
872
+ void dfcopy(const decisionforest& df1, decisionforest& df2)
873
+ {
874
+
875
+ df2.nvars = df1.nvars;
876
+ df2.nclasses = df1.nclasses;
877
+ df2.ntrees = df1.ntrees;
878
+ df2.bufsize = df1.bufsize;
879
+ df2.trees.setbounds(0, df1.bufsize-1);
880
+ ap::vmove(&df2.trees(0), &df1.trees(0), ap::vlen(0,df1.bufsize-1));
881
+ }
882
+
883
+
884
+ /*************************************************************************
885
+ Serialization of DecisionForest strucure
886
+
887
+ INPUT PARAMETERS:
888
+ DF - original
889
+
890
+ OUTPUT PARAMETERS:
891
+ RA - array of real numbers which stores decision forest,
892
+ array[0..RLen-1]
893
+ RLen - RA lenght
894
+
895
+ -- ALGLIB --
896
+ Copyright 13.02.2009 by Bochkanov Sergey
897
+ *************************************************************************/
898
+ void dfserialize(const decisionforest& df, ap::real_1d_array& ra, int& rlen)
899
+ {
900
+
901
+ ra.setbounds(0, df.bufsize+5-1);
902
+ ra(0) = dfvnum;
903
+ ra(1) = df.nvars;
904
+ ra(2) = df.nclasses;
905
+ ra(3) = df.ntrees;
906
+ ra(4) = df.bufsize;
907
+ ap::vmove(&ra(5), &df.trees(0), ap::vlen(5,5+df.bufsize-1));
908
+ rlen = 5+df.bufsize;
909
+ }
910
+
911
+
912
+ /*************************************************************************
913
+ Unserialization of DecisionForest strucure
914
+
915
+ INPUT PARAMETERS:
916
+ RA - real array which stores decision forest
917
+
918
+ OUTPUT PARAMETERS:
919
+ DF - restored structure
920
+
921
+ -- ALGLIB --
922
+ Copyright 13.02.2009 by Bochkanov Sergey
923
+ *************************************************************************/
924
+ void dfunserialize(const ap::real_1d_array& ra, decisionforest& df)
925
+ {
926
+
927
+ ap::ap_error::make_assertion(ap::round(ra(0))==dfvnum, "DFUnserialize: incorrect array!");
928
+ df.nvars = ap::round(ra(1));
929
+ df.nclasses = ap::round(ra(2));
930
+ df.ntrees = ap::round(ra(3));
931
+ df.bufsize = ap::round(ra(4));
932
+ df.trees.setbounds(0, df.bufsize-1);
933
+ ap::vmove(&df.trees(0), &ra(5), ap::vlen(0,df.bufsize-1));
934
+ }
935
+
936
+
937
+ /*************************************************************************
938
+ Classification error
939
+ *************************************************************************/
940
+ static int dfclserror(const decisionforest& df,
941
+ const ap::real_2d_array& xy,
942
+ int npoints)
943
+ {
944
+ int result;
945
+ ap::real_1d_array x;
946
+ ap::real_1d_array y;
947
+ int i;
948
+ int j;
949
+ int k;
950
+ int tmpi;
951
+
952
+ if( df.nclasses<=1 )
953
+ {
954
+ result = 0;
955
+ return result;
956
+ }
957
+ x.setbounds(0, df.nvars-1);
958
+ y.setbounds(0, df.nclasses-1);
959
+ result = 0;
960
+ for(i = 0; i <= npoints-1; i++)
961
+ {
962
+ ap::vmove(&x(0), &xy(i, 0), ap::vlen(0,df.nvars-1));
963
+ dfprocess(df, x, y);
964
+ k = ap::round(xy(i,df.nvars));
965
+ tmpi = 0;
966
+ for(j = 1; j <= df.nclasses-1; j++)
967
+ {
968
+ if( y(j)>y(tmpi) )
969
+ {
970
+ tmpi = j;
971
+ }
972
+ }
973
+ if( tmpi!=k )
974
+ {
975
+ result = result+1;
976
+ }
977
+ }
978
+ return result;
979
+ }
980
+
981
+
982
+ /*************************************************************************
983
+ Internal subroutine for processing one decision tree starting at Offs
984
+ *************************************************************************/
985
+ static void dfprocessinternal(const decisionforest& df,
986
+ int offs,
987
+ const ap::real_1d_array& x,
988
+ ap::real_1d_array& y)
989
+ {
990
+ int i;
991
+ int k;
992
+ int idx;
993
+
994
+
995
+ //
996
+ // Set pointer to the root
997
+ //
998
+ k = offs+1;
999
+
1000
+ //
1001
+ // Navigate through the tree
1002
+ //
1003
+ while(true)
1004
+ {
1005
+ if( df.trees(k)==-1 )
1006
+ {
1007
+ if( df.nclasses==1 )
1008
+ {
1009
+ y(0) = y(0)+df.trees(k+1);
1010
+ }
1011
+ else
1012
+ {
1013
+ idx = ap::round(df.trees(k+1));
1014
+ y(idx) = y(idx)+1;
1015
+ }
1016
+ break;
1017
+ }
1018
+ if( x(ap::round(df.trees(k)))<df.trees(k+1) )
1019
+ {
1020
+ k = k+innernodewidth;
1021
+ }
1022
+ else
1023
+ {
1024
+ k = offs+ap::round(df.trees(k+2));
1025
+ }
1026
+ }
1027
+ }
1028
+
1029
+
1030
+ /*************************************************************************
1031
+ Builds one decision tree. Just a wrapper for the DFBuildTreeRec.
1032
+ *************************************************************************/
1033
+ static void dfbuildtree(const ap::real_2d_array& xy,
1034
+ int npoints,
1035
+ int nvars,
1036
+ int nclasses,
1037
+ int nfeatures,
1038
+ int nvarsinpool,
1039
+ int flags,
1040
+ dfinternalbuffers& bufs)
1041
+ {
1042
+ int numprocessed;
1043
+ int i;
1044
+
1045
+ ap::ap_error::make_assertion(npoints>0, "");
1046
+
1047
+ //
1048
+ // Prepare IdxBuf. It stores indices of the training set elements.
1049
+ // When training set is being split, contents of IdxBuf is
1050
+ // correspondingly reordered so we can know which elements belong
1051
+ // to which branch of decision tree.
1052
+ //
1053
+ for(i = 0; i <= npoints-1; i++)
1054
+ {
1055
+ bufs.idxbuf(i) = i;
1056
+ }
1057
+
1058
+ //
1059
+ // Recursive procedure
1060
+ //
1061
+ numprocessed = 1;
1062
+ dfbuildtreerec(xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, numprocessed, 0, npoints-1, bufs);
1063
+ bufs.treebuf(0) = numprocessed;
1064
+ }
1065
+
1066
+
1067
+ /*************************************************************************
1068
+ Builds one decision tree (internal recursive subroutine)
1069
+
1070
+ Parameters:
1071
+ TreeBuf - large enough array, at least TreeSize
1072
+ IdxBuf - at least NPoints elements
1073
+ TmpBufR - at least NPoints
1074
+ TmpBufR2 - at least NPoints
1075
+ TmpBufI - at least NPoints
1076
+ TmpBufI2 - at least NPoints+1
1077
+ *************************************************************************/
1078
+ static void dfbuildtreerec(const ap::real_2d_array& xy,
1079
+ int npoints,
1080
+ int nvars,
1081
+ int nclasses,
1082
+ int nfeatures,
1083
+ int nvarsinpool,
1084
+ int flags,
1085
+ int& numprocessed,
1086
+ int idx1,
1087
+ int idx2,
1088
+ dfinternalbuffers& bufs)
1089
+ {
1090
+ int i;
1091
+ int j;
1092
+ int k;
1093
+ bool bflag;
1094
+ int offs;
1095
+ int i1;
1096
+ int i2;
1097
+ int lsize;
1098
+ int info;
1099
+ double sl;
1100
+ double sr;
1101
+ double w;
1102
+ int idxbest;
1103
+ double ebest;
1104
+ double tbest;
1105
+ int varcur;
1106
+ double s;
1107
+ double v;
1108
+ double v1;
1109
+ double v2;
1110
+ int nbuf;
1111
+ double threshold;
1112
+ int oldnp;
1113
+ double e;
1114
+ double currms;
1115
+ double curcvrms;
1116
+ bool useevs;
1117
+
1118
+ ap::ap_error::make_assertion(npoints>0, "");
1119
+ ap::ap_error::make_assertion(idx2>=idx1, "");
1120
+ useevs = flags/dfuseevs%2!=0;
1121
+
1122
+ //
1123
+ // Leaf node
1124
+ //
1125
+ if( idx2==idx1 )
1126
+ {
1127
+ bufs.treebuf(numprocessed) = -1;
1128
+ bufs.treebuf(numprocessed+1) = xy(bufs.idxbuf(idx1),nvars);
1129
+ numprocessed = numprocessed+leafnodewidth;
1130
+ return;
1131
+ }
1132
+
1133
+ //
1134
+ // Non-leaf node.
1135
+ // Select random variable, prepare split:
1136
+ // 1. prepare default solution - no splitting, class at random
1137
+ // 2. investigate possible splits, compare with default/best
1138
+ //
1139
+ idxbest = -1;
1140
+ if( nclasses>1 )
1141
+ {
1142
+
1143
+ //
1144
+ // default solution for classification
1145
+ //
1146
+ for(i = 0; i <= nclasses-1; i++)
1147
+ {
1148
+ bufs.classibuf(i) = 0;
1149
+ }
1150
+ s = idx2-idx1+1;
1151
+ for(i = idx1; i <= idx2; i++)
1152
+ {
1153
+ j = ap::round(xy(bufs.idxbuf(i),nvars));
1154
+ bufs.classibuf(j) = bufs.classibuf(j)+1;
1155
+ }
1156
+ ebest = 0;
1157
+ for(i = 0; i <= nclasses-1; i++)
1158
+ {
1159
+ ebest = ebest+bufs.classibuf(i)*ap::sqr(1-bufs.classibuf(i)/s)+(s-bufs.classibuf(i))*ap::sqr(bufs.classibuf(i)/s);
1160
+ }
1161
+ ebest = sqrt(ebest/(nclasses*(idx2-idx1+1)));
1162
+ }
1163
+ else
1164
+ {
1165
+
1166
+ //
1167
+ // default solution for regression
1168
+ //
1169
+ v = 0;
1170
+ for(i = idx1; i <= idx2; i++)
1171
+ {
1172
+ v = v+xy(bufs.idxbuf(i),nvars);
1173
+ }
1174
+ v = v/(idx2-idx1+1);
1175
+ ebest = 0;
1176
+ for(i = idx1; i <= idx2; i++)
1177
+ {
1178
+ ebest = ebest+ap::sqr(xy(bufs.idxbuf(i),nvars)-v);
1179
+ }
1180
+ ebest = sqrt(ebest/(idx2-idx1+1));
1181
+ }
1182
+ i = 0;
1183
+ while(i<=ap::minint(nfeatures, nvarsinpool)-1)
1184
+ {
1185
+
1186
+ //
1187
+ // select variables from pool
1188
+ //
1189
+ j = i+ap::randominteger(nvarsinpool-i);
1190
+ k = bufs.varpool(i);
1191
+ bufs.varpool(i) = bufs.varpool(j);
1192
+ bufs.varpool(j) = k;
1193
+ varcur = bufs.varpool(i);
1194
+
1195
+ //
1196
+ // load variable values to working array
1197
+ //
1198
+ // apply EVS preprocessing: if all variable values are same,
1199
+ // variable is excluded from pool.
1200
+ //
1201
+ // This is necessary for binary pre-splits (see later) to work.
1202
+ //
1203
+ for(j = idx1; j <= idx2; j++)
1204
+ {
1205
+ bufs.tmpbufr(j-idx1) = xy(bufs.idxbuf(j),varcur);
1206
+ }
1207
+ if( useevs )
1208
+ {
1209
+ bflag = false;
1210
+ v = bufs.tmpbufr(0);
1211
+ for(j = 0; j <= idx2-idx1; j++)
1212
+ {
1213
+ if( bufs.tmpbufr(j)!=v )
1214
+ {
1215
+ bflag = true;
1216
+ break;
1217
+ }
1218
+ }
1219
+ if( !bflag )
1220
+ {
1221
+
1222
+ //
1223
+ // exclude variable from pool,
1224
+ // go to the next iteration.
1225
+ // I is not increased.
1226
+ //
1227
+ k = bufs.varpool(i);
1228
+ bufs.varpool(i) = bufs.varpool(nvarsinpool-1);
1229
+ bufs.varpool(nvarsinpool-1) = k;
1230
+ nvarsinpool = nvarsinpool-1;
1231
+ continue;
1232
+ }
1233
+ }
1234
+
1235
+ //
1236
+ // load labels to working array
1237
+ //
1238
+ if( nclasses>1 )
1239
+ {
1240
+ for(j = idx1; j <= idx2; j++)
1241
+ {
1242
+ bufs.tmpbufi(j-idx1) = ap::round(xy(bufs.idxbuf(j),nvars));
1243
+ }
1244
+ }
1245
+ else
1246
+ {
1247
+ for(j = idx1; j <= idx2; j++)
1248
+ {
1249
+ bufs.tmpbufr2(j-idx1) = xy(bufs.idxbuf(j),nvars);
1250
+ }
1251
+ }
1252
+
1253
+ //
1254
+ // calculate split
1255
+ //
1256
+ if( useevs&&bufs.evsbin(varcur) )
1257
+ {
1258
+
1259
+ //
1260
+ // Pre-calculated splits for binary variables.
1261
+ // Threshold is already known, just calculate RMS error
1262
+ //
1263
+ threshold = bufs.evssplits(varcur);
1264
+ if( nclasses>1 )
1265
+ {
1266
+
1267
+ //
1268
+ // classification-specific code
1269
+ //
1270
+ for(j = 0; j <= 2*nclasses-1; j++)
1271
+ {
1272
+ bufs.classibuf(j) = 0;
1273
+ }
1274
+ sl = 0;
1275
+ sr = 0;
1276
+ for(j = 0; j <= idx2-idx1; j++)
1277
+ {
1278
+ k = bufs.tmpbufi(j);
1279
+ if( bufs.tmpbufr(j)<threshold )
1280
+ {
1281
+ bufs.classibuf(k) = bufs.classibuf(k)+1;
1282
+ sl = sl+1;
1283
+ }
1284
+ else
1285
+ {
1286
+ bufs.classibuf(k+nclasses) = bufs.classibuf(k+nclasses)+1;
1287
+ sr = sr+1;
1288
+ }
1289
+ }
1290
+ ap::ap_error::make_assertion(sl!=0&&sr!=0, "DFBuildTreeRec: something strange!");
1291
+ currms = 0;
1292
+ for(j = 0; j <= nclasses-1; j++)
1293
+ {
1294
+ w = bufs.classibuf(j);
1295
+ currms = currms+w*ap::sqr(w/sl-1);
1296
+ currms = currms+(sl-w)*ap::sqr(w/sl);
1297
+ w = bufs.classibuf(nclasses+j);
1298
+ currms = currms+w*ap::sqr(w/sr-1);
1299
+ currms = currms+(sr-w)*ap::sqr(w/sr);
1300
+ }
1301
+ currms = sqrt(currms/(nclasses*(idx2-idx1+1)));
1302
+ }
1303
+ else
1304
+ {
1305
+
1306
+ //
1307
+ // regression-specific code
1308
+ //
1309
+ sl = 0;
1310
+ sr = 0;
1311
+ v1 = 0;
1312
+ v2 = 0;
1313
+ for(j = 0; j <= idx2-idx1; j++)
1314
+ {
1315
+ if( bufs.tmpbufr(j)<threshold )
1316
+ {
1317
+ v1 = v1+bufs.tmpbufr2(j);
1318
+ sl = sl+1;
1319
+ }
1320
+ else
1321
+ {
1322
+ v2 = v2+bufs.tmpbufr2(j);
1323
+ sr = sr+1;
1324
+ }
1325
+ }
1326
+ ap::ap_error::make_assertion(sl!=0&&sr!=0, "DFBuildTreeRec: something strange!");
1327
+ v1 = v1/sl;
1328
+ v2 = v2/sr;
1329
+ currms = 0;
1330
+ for(j = 0; j <= idx2-idx1; j++)
1331
+ {
1332
+ if( bufs.tmpbufr(j)<threshold )
1333
+ {
1334
+ currms = currms+ap::sqr(v1-bufs.tmpbufr2(j));
1335
+ }
1336
+ else
1337
+ {
1338
+ currms = currms+ap::sqr(v2-bufs.tmpbufr2(j));
1339
+ }
1340
+ }
1341
+ currms = sqrt(currms/(idx2-idx1+1));
1342
+ }
1343
+ info = 1;
1344
+ }
1345
+ else
1346
+ {
1347
+
1348
+ //
1349
+ // Generic splits
1350
+ //
1351
+ if( nclasses>1 )
1352
+ {
1353
+ dfsplitc(bufs.tmpbufr, bufs.tmpbufi, bufs.classibuf, idx2-idx1+1, nclasses, dfusestrongsplits, info, threshold, currms);
1354
+ }
1355
+ else
1356
+ {
1357
+ dfsplitr(bufs.tmpbufr, bufs.tmpbufr2, idx2-idx1+1, dfusestrongsplits, info, threshold, currms);
1358
+ }
1359
+ }
1360
+ if( info>0 )
1361
+ {
1362
+ if( currms<=ebest )
1363
+ {
1364
+ ebest = currms;
1365
+ idxbest = varcur;
1366
+ tbest = threshold;
1367
+ }
1368
+ }
1369
+
1370
+ //
1371
+ // Next iteration
1372
+ //
1373
+ i = i+1;
1374
+ }
1375
+
1376
+ //
1377
+ // to split or not to split
1378
+ //
1379
+ if( idxbest<0 )
1380
+ {
1381
+
1382
+ //
1383
+ // All values are same, cannot split.
1384
+ //
1385
+ bufs.treebuf(numprocessed) = -1;
1386
+ if( nclasses>1 )
1387
+ {
1388
+
1389
+ //
1390
+ // Select random class label (randomness allows us to
1391
+ // approximate distribution of the classes)
1392
+ //
1393
+ bufs.treebuf(numprocessed+1) = ap::round(xy(bufs.idxbuf(idx1+ap::randominteger(idx2-idx1+1)),nvars));
1394
+ }
1395
+ else
1396
+ {
1397
+
1398
+ //
1399
+ // Select average (for regression task).
1400
+ //
1401
+ v = 0;
1402
+ for(i = idx1; i <= idx2; i++)
1403
+ {
1404
+ v = v+xy(bufs.idxbuf(i),nvars)/(idx2-idx1+1);
1405
+ }
1406
+ bufs.treebuf(numprocessed+1) = v;
1407
+ }
1408
+ numprocessed = numprocessed+leafnodewidth;
1409
+ }
1410
+ else
1411
+ {
1412
+
1413
+ //
1414
+ // we can split
1415
+ //
1416
+ bufs.treebuf(numprocessed) = idxbest;
1417
+ bufs.treebuf(numprocessed+1) = tbest;
1418
+ i1 = idx1;
1419
+ i2 = idx2;
1420
+ while(i1<=i2)
1421
+ {
1422
+
1423
+ //
1424
+ // Reorder indices so that left partition is in [Idx1..I1-1],
1425
+ // and right partition is in [I2+1..Idx2]
1426
+ //
1427
+ if( xy(bufs.idxbuf(i1),idxbest)<tbest )
1428
+ {
1429
+ i1 = i1+1;
1430
+ continue;
1431
+ }
1432
+ if( xy(bufs.idxbuf(i2),idxbest)>=tbest )
1433
+ {
1434
+ i2 = i2-1;
1435
+ continue;
1436
+ }
1437
+ j = bufs.idxbuf(i1);
1438
+ bufs.idxbuf(i1) = bufs.idxbuf(i2);
1439
+ bufs.idxbuf(i2) = j;
1440
+ i1 = i1+1;
1441
+ i2 = i2-1;
1442
+ }
1443
+ oldnp = numprocessed;
1444
+ numprocessed = numprocessed+innernodewidth;
1445
+ dfbuildtreerec(xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, numprocessed, idx1, i1-1, bufs);
1446
+ bufs.treebuf(oldnp+2) = numprocessed;
1447
+ dfbuildtreerec(xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, numprocessed, i2+1, idx2, bufs);
1448
+ }
1449
+ }
1450
+
1451
+
1452
+ /*************************************************************************
1453
+ Makes weak split on attribute
1454
+ *************************************************************************/
1455
+ static void dfweakspliti(ap::real_1d_array& x,
1456
+ ap::integer_1d_array& y,
1457
+ int n,
1458
+ int nclasses,
1459
+ int& info,
1460
+ double& threshold,
1461
+ double& e)
1462
+ {
1463
+ int i;
1464
+ int neq;
1465
+ int nless;
1466
+ int ngreater;
1467
+
1468
+ tagsortfasti(x, y, n);
1469
+ if( n%2==1 )
1470
+ {
1471
+
1472
+ //
1473
+ // odd number of elements
1474
+ //
1475
+ threshold = x(n/2);
1476
+ }
1477
+ else
1478
+ {
1479
+
1480
+ //
1481
+ // even number of elements.
1482
+ //
1483
+ // if two closest to the middle of the array are equal,
1484
+ // we will select one of them (to avoid possible problems with
1485
+ // floating point errors).
1486
+ // we will select halfsum otherwise.
1487
+ //
1488
+ if( x(n/2-1)==x(n/2) )
1489
+ {
1490
+ threshold = x(n/2-1);
1491
+ }
1492
+ else
1493
+ {
1494
+ threshold = 0.5*(x(n/2-1)+x(n/2));
1495
+ }
1496
+ }
1497
+ neq = 0;
1498
+ nless = 0;
1499
+ ngreater = 0;
1500
+ for(i = 0; i <= n-1; i++)
1501
+ {
1502
+ if( x(i)<threshold )
1503
+ {
1504
+ nless = nless+1;
1505
+ }
1506
+ if( x(i)==threshold )
1507
+ {
1508
+ neq = neq+1;
1509
+ }
1510
+ if( x(i)>threshold )
1511
+ {
1512
+ ngreater = ngreater+1;
1513
+ }
1514
+ }
1515
+ if( nless==0&&ngreater==0 )
1516
+ {
1517
+ info = -3;
1518
+ }
1519
+ else
1520
+ {
1521
+ if( neq!=0 )
1522
+ {
1523
+ if( nless<ngreater )
1524
+ {
1525
+ threshold = 0.5*(x(nless+neq-1)+x(nless+neq));
1526
+ }
1527
+ else
1528
+ {
1529
+ threshold = 0.5*(x(nless-1)+x(nless));
1530
+ }
1531
+ }
1532
+ info = 1;
1533
+ e = 0;
1534
+ }
1535
+ }
1536
+
1537
+
1538
+ /*************************************************************************
1539
+ Makes split on attribute
1540
+ *************************************************************************/
1541
+ static void dfsplitc(ap::real_1d_array& x,
1542
+ ap::integer_1d_array& c,
1543
+ ap::integer_1d_array& cntbuf,
1544
+ int n,
1545
+ int nc,
1546
+ int flags,
1547
+ int& info,
1548
+ double& threshold,
1549
+ double& e)
1550
+ {
1551
+ int i;
1552
+ int neq;
1553
+ int nless;
1554
+ int ngreater;
1555
+ int q;
1556
+ int qmin;
1557
+ int qmax;
1558
+ int qcnt;
1559
+ double cursplit;
1560
+ int nleft;
1561
+ double v;
1562
+ double cure;
1563
+ double w;
1564
+ double sl;
1565
+ double sr;
1566
+
1567
+ tagsortfasti(x, c, n);
1568
+ e = ap::maxrealnumber;
1569
+ threshold = 0.5*(x(0)+x(n-1));
1570
+ info = -3;
1571
+ if( flags/dfusestrongsplits%2==0 )
1572
+ {
1573
+
1574
+ //
1575
+ // weak splits, split at half
1576
+ //
1577
+ qcnt = 2;
1578
+ qmin = 1;
1579
+ qmax = 1;
1580
+ }
1581
+ else
1582
+ {
1583
+
1584
+ //
1585
+ // strong splits: choose best quartile
1586
+ //
1587
+ qcnt = 4;
1588
+ qmin = 1;
1589
+ qmax = 3;
1590
+ }
1591
+ for(q = qmin; q <= qmax; q++)
1592
+ {
1593
+ cursplit = x(n*q/qcnt);
1594
+ neq = 0;
1595
+ nless = 0;
1596
+ ngreater = 0;
1597
+ for(i = 0; i <= n-1; i++)
1598
+ {
1599
+ if( x(i)<cursplit )
1600
+ {
1601
+ nless = nless+1;
1602
+ }
1603
+ if( x(i)==cursplit )
1604
+ {
1605
+ neq = neq+1;
1606
+ }
1607
+ if( x(i)>cursplit )
1608
+ {
1609
+ ngreater = ngreater+1;
1610
+ }
1611
+ }
1612
+ ap::ap_error::make_assertion(neq!=0, "DFSplitR: NEq=0, something strange!!!");
1613
+ if( nless!=0||ngreater!=0 )
1614
+ {
1615
+
1616
+ //
1617
+ // set threshold between two partitions, with
1618
+ // some tweaking to avoid problems with floating point
1619
+ // arithmetics.
1620
+ //
1621
+ // The problem is that when you calculates C = 0.5*(A+B) there
1622
+ // can be no C which lies strictly between A and B (for example,
1623
+ // there is no floating point number which is
1624
+ // greater than 1 and less than 1+eps). In such situations
1625
+ // we choose right side as theshold (remember that
1626
+ // points which lie on threshold falls to the right side).
1627
+ //
1628
+ if( nless<ngreater )
1629
+ {
1630
+ cursplit = 0.5*(x(nless+neq-1)+x(nless+neq));
1631
+ nleft = nless+neq;
1632
+ if( cursplit<=x(nless+neq-1) )
1633
+ {
1634
+ cursplit = x(nless+neq);
1635
+ }
1636
+ }
1637
+ else
1638
+ {
1639
+ cursplit = 0.5*(x(nless-1)+x(nless));
1640
+ nleft = nless;
1641
+ if( cursplit<=x(nless-1) )
1642
+ {
1643
+ cursplit = x(nless);
1644
+ }
1645
+ }
1646
+ info = 1;
1647
+ cure = 0;
1648
+ for(i = 0; i <= 2*nc-1; i++)
1649
+ {
1650
+ cntbuf(i) = 0;
1651
+ }
1652
+ for(i = 0; i <= nleft-1; i++)
1653
+ {
1654
+ cntbuf(c(i)) = cntbuf(c(i))+1;
1655
+ }
1656
+ for(i = nleft; i <= n-1; i++)
1657
+ {
1658
+ cntbuf(nc+c(i)) = cntbuf(nc+c(i))+1;
1659
+ }
1660
+ sl = nleft;
1661
+ sr = n-nleft;
1662
+ v = 0;
1663
+ for(i = 0; i <= nc-1; i++)
1664
+ {
1665
+ w = cntbuf(i);
1666
+ v = v+w*ap::sqr(w/sl-1);
1667
+ v = v+(sl-w)*ap::sqr(w/sl);
1668
+ w = cntbuf(nc+i);
1669
+ v = v+w*ap::sqr(w/sr-1);
1670
+ v = v+(sr-w)*ap::sqr(w/sr);
1671
+ }
1672
+ cure = sqrt(v/(nc*n));
1673
+ if( cure<e )
1674
+ {
1675
+ threshold = cursplit;
1676
+ e = cure;
1677
+ }
1678
+ }
1679
+ }
1680
+ }
1681
+
1682
+
1683
+ /*************************************************************************
1684
+ Makes split on attribute
1685
+ *************************************************************************/
1686
+ static void dfsplitr(ap::real_1d_array& x,
1687
+ ap::real_1d_array& y,
1688
+ int n,
1689
+ int flags,
1690
+ int& info,
1691
+ double& threshold,
1692
+ double& e)
1693
+ {
1694
+ int i;
1695
+ int neq;
1696
+ int nless;
1697
+ int ngreater;
1698
+ int q;
1699
+ int qmin;
1700
+ int qmax;
1701
+ int qcnt;
1702
+ double cursplit;
1703
+ int nleft;
1704
+ double v;
1705
+ double cure;
1706
+
1707
+ tagsortfastr(x, y, n);
1708
+ e = ap::maxrealnumber;
1709
+ threshold = 0.5*(x(0)+x(n-1));
1710
+ info = -3;
1711
+ if( flags/dfusestrongsplits%2==0 )
1712
+ {
1713
+
1714
+ //
1715
+ // weak splits, split at half
1716
+ //
1717
+ qcnt = 2;
1718
+ qmin = 1;
1719
+ qmax = 1;
1720
+ }
1721
+ else
1722
+ {
1723
+
1724
+ //
1725
+ // strong splits: choose best quartile
1726
+ //
1727
+ qcnt = 4;
1728
+ qmin = 1;
1729
+ qmax = 3;
1730
+ }
1731
+ for(q = qmin; q <= qmax; q++)
1732
+ {
1733
+ cursplit = x(n*q/qcnt);
1734
+ neq = 0;
1735
+ nless = 0;
1736
+ ngreater = 0;
1737
+ for(i = 0; i <= n-1; i++)
1738
+ {
1739
+ if( x(i)<cursplit )
1740
+ {
1741
+ nless = nless+1;
1742
+ }
1743
+ if( x(i)==cursplit )
1744
+ {
1745
+ neq = neq+1;
1746
+ }
1747
+ if( x(i)>cursplit )
1748
+ {
1749
+ ngreater = ngreater+1;
1750
+ }
1751
+ }
1752
+ ap::ap_error::make_assertion(neq!=0, "DFSplitR: NEq=0, something strange!!!");
1753
+ if( nless!=0||ngreater!=0 )
1754
+ {
1755
+
1756
+ //
1757
+ // set threshold between two partitions, with
1758
+ // some tweaking to avoid problems with floating point
1759
+ // arithmetics.
1760
+ //
1761
+ // The problem is that when you calculates C = 0.5*(A+B) there
1762
+ // can be no C which lies strictly between A and B (for example,
1763
+ // there is no floating point number which is
1764
+ // greater than 1 and less than 1+eps). In such situations
1765
+ // we choose right side as theshold (remember that
1766
+ // points which lie on threshold falls to the right side).
1767
+ //
1768
+ if( nless<ngreater )
1769
+ {
1770
+ cursplit = 0.5*(x(nless+neq-1)+x(nless+neq));
1771
+ nleft = nless+neq;
1772
+ if( cursplit<=x(nless+neq-1) )
1773
+ {
1774
+ cursplit = x(nless+neq);
1775
+ }
1776
+ }
1777
+ else
1778
+ {
1779
+ cursplit = 0.5*(x(nless-1)+x(nless));
1780
+ nleft = nless;
1781
+ if( cursplit<=x(nless-1) )
1782
+ {
1783
+ cursplit = x(nless);
1784
+ }
1785
+ }
1786
+ info = 1;
1787
+ cure = 0;
1788
+ v = 0;
1789
+ for(i = 0; i <= nleft-1; i++)
1790
+ {
1791
+ v = v+y(i);
1792
+ }
1793
+ v = v/nleft;
1794
+ for(i = 0; i <= nleft-1; i++)
1795
+ {
1796
+ cure = cure+ap::sqr(y(i)-v);
1797
+ }
1798
+ v = 0;
1799
+ for(i = nleft; i <= n-1; i++)
1800
+ {
1801
+ v = v+y(i);
1802
+ }
1803
+ v = v/(n-nleft);
1804
+ for(i = nleft; i <= n-1; i++)
1805
+ {
1806
+ cure = cure+ap::sqr(y(i)-v);
1807
+ }
1808
+ cure = sqrt(cure/n);
1809
+ if( cure<e )
1810
+ {
1811
+ threshold = cursplit;
1812
+ e = cure;
1813
+ }
1814
+ }
1815
+ }
1816
+ }
1817
+
1818
+
1819
+