isotree 0.2.2 → 0.3.0

Files changed (151)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
data/vendor/isotree/src/predict.hpp
@@ -0,0 +1,1932 @@
/* Isolation forests and variations thereof, with adjustments for incorporation
* of categorical variables and missing values.
* Written for the C++11 standard and aimed at being used in R and Python.
*
* This library is based on the following works:
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
* "Isolation forest."
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
* "Isolation-based anomaly detection."
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
* "Extended Isolation Forest."
* arXiv preprint arXiv:1811.02141 (2018).
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
* "On detecting clustered anomalies using SCiForest."
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
* [5] https://sourceforge.net/projects/iforest/
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
* [8] Cortes, David.
* "Distance approximation using Isolation Forests."
* arXiv preprint arXiv:1910.12362 (2019).
* [9] Cortes, David.
* "Imputing missing values with unsupervised random trees."
* arXiv preprint arXiv:1911.06646 (2019).
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
* [11] Cortes, David.
* "Revisiting randomized choices in isolation forests."
* arXiv preprint arXiv:2110.13402 (2021).
* [12] Guha, Sudipto, et al.
* "Robust random cut forest based anomaly detection on streams."
* International conference on machine learning. PMLR, 2016.
* [13] Cortes, David.
* "Isolation forests: looking beyond tree depth."
* arXiv preprint arXiv:2111.11639 (2021).
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
* "Isolation kernel and its effect on SVM"
* Proceedings of the 24th ACM SIGKDD
* International Conference on Knowledge Discovery & Data Mining. 2018.
*
* BSD 2-Clause License
* Copyright (c) 2019-2022, David Cortes
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "isotree.hpp"

/* TODO: should create versions of these functions that would work on the
serialized raw bytes instead, as it will likely be faster due to better
cache utilizations and those objects use less memory. */

/* TODO: these trees are all created in a depth-first fashion, which will
not be cache-friendly when predictions are sent to a right-side branch. In
order to make predictions faster, could re-arrange the trees after-the-fact
so that they contain batches of consecutive nodes (parent and children and
grandchildren) up to some depth - that way these prediction functions would
run faster. After that, could also do a manual tree leaves unroll within each
batch with stack-assigned variables for an even faster prediction function. */


/* Predict outlier score, average depth, or terminal node numbers
*
* Parameters
* ==========
* - numeric_data[nrows * ncols_numeric]
* Pointer to numeric data for which to make predictions. May be ordered by rows
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
* (see parameter 'is_col_major').
* Pass NULL if there are no dense numeric columns.
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
* - categ_data[nrows * ncols_categ]
* Pointer to categorical data for which to make predictions. May be ordered by rows
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
* (see parameter 'is_col_major').
* Pass NULL if there are no categorical columns.
* Each category should be represented as an integer, and these integers must start at zero and
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
* present when the model was fit (note that they are not treated as being ordinal, this is just
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
* must be the same as was used in the data to which the model was fit.
* - is_col_major
* Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
* model was fit. If passing 'false', will assume they are in row-major order. Note that most of
* the functions in this library work only with column-major order, but here both are suitable
* and row-major is preferred. Both arrays must have the same orientation (row/column major).
* If there is numeric sparse data in combination with categorical dense data and there are many
* rows, it is recommended to pass the categorical data in column major order, as it will take
* a faster route.
* If passing 'is_col_major=false', must also provide 'ld_numeric' and/or 'ld_categ'.
* - ld_numeric
* Leading dimension of the array 'numeric_data', if it is passed in row-major format.
* Typically, this corresponds to the number of columns, but may be larger (the array will
* be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
* 'numeric_data' in column-major order, this is ignored and it will be assumed that the
* leading dimension corresponds to the number of rows. This is ignored when passing numeric
* data in sparse format.
* - ld_categ
* Leading dimension of the array 'categ_data', if it is passed in row-major format.
* Typically, this corresponds to the number of columns, but may be larger (the array will
* be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
* 'categ_data' in column-major order, this is ignored and it will be assumed that the
* leading dimension corresponds to the number of rows.
* - Xc[nnz]
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
* Pass NULL if there are no sparse numeric columns.
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
* - Xc_ind[nnz]
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
* Must be in sorted order, otherwise results will be incorrect.
* Pass NULL if there are no sparse numeric columns in CSC format.
* - Xc_indptr[ncols_numeric + 1]
* Pointer to column index pointers that indicate at entry [col] where column 'col'
* starts and at entry [col + 1] where it ends.
* Pass NULL if there are no sparse numeric columns in CSC format.
* - Xr[nnz]
* Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
* Pass NULL if there are no sparse numeric columns.
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
* - Xr_ind[nnz]
* Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
* Must be in sorted order, otherwise results will be incorrect.
* Pass NULL if there are no sparse numeric columns in CSR format.
* - Xr_indptr[nrows + 1]
* Pointer to row index pointers that indicate at entry [row] where row 'row'
* starts and at entry [row + 1] where it ends.
* Pass NULL if there are no sparse numeric columns in CSR format.
* - nrows
* Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
* - nthreads
* Number of parallel threads to use. Note that the more threads there are, the more memory
* will be allocated, even if a thread does not end up being used. Ignored when not building with
* OpenMP support.
* - standardize
* Whether to standardize the average depths for each row according to their relative magnitude
* compared to the expected average, in order to obtain an outlier score. If passing 'false',
* will output the average depth instead.
* Ignored when not passing 'output_depths'.
* - model_outputs
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
* if the predictions are to be made from an extended model. Can only pass one of
* 'model_outputs' and 'model_outputs_ext'.
* - model_outputs_ext
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
* if the predictions are to be made from a single-variable model. Can only pass one of
* 'model_outputs' and 'model_outputs_ext'.
* - output_depths[nrows] (out)
* Pointer to array where the output average depths or outlier scores will be written into
* (the return type is controlled according to parameter 'standardize').
* Should always be passed when calling this function (it is not optional).
* - tree_num[nrows * ntrees] (out)
* Pointer to array where the output terminal node numbers will be written into.
* Note that the mapping between tree node and terminal tree node is not stored in
* the model object for efficiency reasons, so this mapping will be determined on-the-fly
* when passing this parameter, and as such, there will be some overhead regardless of
* the actual number of rows. Output will be in column-major order ([nrows, ntrees]).
* This will not be calculable when using 'ndim==1' alongside with either
* 'missing_action==Divide' or 'new_categ_action=Weighted'.
* Pass NULL if this type of output is not needed.
* - per_tree_depths[nrows * ntrees] (out)
* Pointer to array where to output per-tree depths or expected depths for each row.
* Note that these will not include range penalties ('penalize_range=true').
* Output will be in row-major order ([nrows, ntrees]).
* This will not be calculable when using 'ndim==1' alongside with either
* 'missing_action==Divide' or 'new_categ_action=Weighted'.
* Pass NULL if this type of output is not needed.
* - indexer
* Pointer to associated tree indexer for the model being used, if it was constructed,
* which can be used to speed up tree numbers/indices predictions.
* This is ignored when not passing 'tree_num'.
* Pass NULL if the indexer has not been constructed.
*/
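/* Editor's sketch (not part of the file): a minimal call with dense row-major
   numeric data and a single-variable model, following the documentation above.
   The names 'model', 'X', 'nrows' and 'ncols' are hypothetical, and every unused
   input format is passed as NULL.

   std::vector<double> scores(nrows);
   predict_iforest<double, int64_t>(
       X.data(), NULL,             // dense numeric data only, no categoricals
       false, ncols, 0,            // row-major; leading dimension = number of columns
       NULL, NULL, NULL,           // no CSC inputs
       NULL, NULL, NULL,           // no CSR inputs
       nrows, 1, true,             // one thread; standardize -> outlier scores
       &model, NULL,               // IsoForest model, no extended model
       scores.data(), NULL, NULL,  // request only depths/scores
       NULL);                      // no tree indexer

   If categorical columns had been used when fitting, they would be passed as a
   row-major int array with the same zero-based encoding as at fit time, and
   missing values encoded as -1. */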
template <class real_t, class sparse_ix>
void predict_iforest(real_t *restrict numeric_data, int *restrict categ_data,
bool is_col_major, size_t ld_numeric, size_t ld_categ,
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
size_t nrows, int nthreads, bool standardize,
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
double *restrict output_depths, sparse_ix *restrict tree_num,
double *restrict per_tree_depths,
TreesIndexer *indexer)
{
if (unlikely(!nrows)) return;

/* put data in a struct for passing it in fewer lines */
PredictionData<real_t, sparse_ix>
prediction_data = {numeric_data, categ_data, nrows,
is_col_major, ld_numeric, ld_categ,
Xc, Xc_ind, Xc_indptr,
Xr, Xr_ind, Xr_indptr};

int nthreads_orig = nthreads;
if ((size_t)nthreads > nrows)
nthreads = nrows;

/* For batch predictions of sparse CSC, will take a specialized route */
if (prediction_data.Xc_indptr != NULL && (prediction_data.categ_data == NULL || prediction_data.is_col_major))
{
batched_csc_predict(prediction_data, nthreads_orig,
model_outputs, model_outputs_ext,
output_depths, tree_num,
per_tree_depths);
}

/* Regular case (no specialized CSC route) */
else if (model_outputs != NULL)
{
if (
model_outputs->missing_action == Fail &&
(model_outputs->new_cat_action != Weighted || model_outputs->cat_split_type == SingleCateg || prediction_data.categ_data == NULL) &&
prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL &&
!model_outputs->has_range_penalty
)
{
if (prediction_data.categ_data == NULL && (nrows == 1 || !prediction_data.is_col_major))
{
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
{
double score = 0;
for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
{
traverse_itree_fast(model_outputs->trees[tree],
*model_outputs,
prediction_data.numeric_data + row * prediction_data.ncols_numeric,
score,
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
(per_tree_depths == NULL)?
NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
(size_t) row);
}
output_depths[row] = score;
}
}

else
{
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
{
double score = 0;
for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
{
traverse_itree_no_recurse(model_outputs->trees[tree],
*model_outputs,
prediction_data,
score,
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
(per_tree_depths == NULL)?
NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
(size_t) row);
}
output_depths[row] = score;
}
}
}

else
{
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
{
double score = 0;
for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
{
score += traverse_itree(model_outputs->trees[tree],
*model_outputs,
prediction_data,
(std::vector<ImputeNode>*)NULL,
(ImputedData<sparse_ix, double>*)NULL,
(double)0,
(size_t) row,
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
(per_tree_depths == NULL)?
NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
(size_t) 0);
}
output_depths[row] = score;
}
}
}


else
{
if (
model_outputs_ext->missing_action == Fail &&
prediction_data.categ_data == NULL &&
prediction_data.Xc_indptr == NULL &&
prediction_data.Xr_indptr == NULL &&
!model_outputs_ext->has_range_penalty
)
{
if (prediction_data.is_col_major && nrows > 1)
{
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
{
double score = 0;
for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
{
traverse_hplane_fast_colmajor(model_outputs_ext->hplanes[tree],
*model_outputs_ext,
prediction_data,
score,
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
(per_tree_depths == NULL)?
NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
(size_t) row);
}
output_depths[row] = score;
}
}

else
{
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
{
double score = 0;
for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
{
traverse_hplane_fast_rowmajor(model_outputs_ext->hplanes[tree],
*model_outputs_ext,
prediction_data.numeric_data + row * prediction_data.ncols_numeric,
score,
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
(per_tree_depths == NULL)?
NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
(size_t) row);
}
output_depths[row] = score;
}
}
}

else
{
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
{
double score = 0;
for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
{
traverse_hplane(model_outputs_ext->hplanes[tree],
*model_outputs_ext,
prediction_data,
score,
(std::vector<ImputeNode>*)NULL,
(ImputedData<sparse_ix, double>*)NULL,
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
(per_tree_depths == NULL)?
NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
(size_t) row);
}
output_depths[row] = score;
}
}
}

/* translate sum-of-depths to outlier score */
double ntrees, depth_divisor;
if (model_outputs != NULL)
{
ntrees = (double) model_outputs->trees.size();
depth_divisor = ntrees * (model_outputs->exp_avg_depth);
}

else
{
ntrees = (double) model_outputs_ext->hplanes.size();
depth_divisor = ntrees * (model_outputs_ext->exp_avg_depth);
}


/* for density and boxed_ratio, each tree will have 'log(d)' instead of 'd' */
bool is_density = (model_outputs != NULL && model_outputs->scoring_metric == Density) ||
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == Density);
bool is_bratio = (model_outputs != NULL && model_outputs->scoring_metric == BoxedRatio) ||
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedRatio);
bool is_bdens = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) ||
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity);
bool is_bdens2 = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity2) ||
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity2);

if (standardize)
{
if (is_density || is_bdens2)
{
ntrees = -ntrees;
for (size_t row = 0; row < nrows; row++)
output_depths[row] /= ntrees;
}

else if (is_bdens)
{
#ifndef _WIN32
#pragma omp simd
#endif
for (size_t row = 0; row < nrows; row++)
output_depths[row] = -std::exp(output_depths[row] / ntrees);
}

else if (is_bratio)
{
for (size_t row = 0; row < nrows; row++)
output_depths[row] = output_depths[row] / ntrees;
}

else
{
#ifndef _WIN32
#pragma omp simd
#endif
for (size_t row = 0; row < nrows; row++)
output_depths[row] = std::exp2( - output_depths[row] / depth_divisor );
}
}

else
{
if (is_density || is_bdens || is_bdens2)
{
#ifndef _WIN32
#pragma omp simd
#endif
for (size_t row = 0; row < nrows; row++)
output_depths[row] = std::exp(output_depths[row] / ntrees);
}

else if (is_bratio)
{
ntrees = -ntrees;
for (size_t row = 0; row < nrows; row++)
output_depths[row] /= ntrees;
}

else
{
for (size_t row = 0; row < nrows; row++)
output_depths[row] /= ntrees;
}
}
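/* Editor's note: in the default depth-based branch above, the standardized score
   is the classic isolation-forest formula s = 2^(-avg_depth / c), where
   c = exp_avg_depth. For example, with 100 trees, c = 8, and a summed depth of
   800 (average 8), s = 2^-1 = 0.5, a typical point; an average depth of 4 gives
   s = 2^-0.5 ~ 0.71, and s approaches 1 as the average depth goes to zero
   (easily isolated = more anomalous). */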

if (per_tree_depths != NULL && (is_density || is_bdens || is_bdens2))
{
size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
#ifndef _WIN32
#pragma omp simd
#endif
for (size_t ix = 0; ix < nrows*ntrees; ix++)
per_tree_depths[ix] = std::exp(per_tree_depths[ix]);
}


/* re-map tree numbers to start at zero (if predicting tree numbers) */
/* Note: usually this type of 'prediction' is not required,
thus this mapping is not stored in the model objects so as to
save memory */
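/* Editor's illustration (hypothetical numbers): if a tree's terminal nodes sit at
   positions {3, 5, 6} of its flattened node array, 'terminal_node_mappings' (or the
   manual re-mapping below) sends them to {0, 1, 2}, so 'tree_num' reports
   consecutive per-tree leaf indices rather than raw node positions. */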
if (tree_num != NULL)
{
if (indexer != NULL && !indexer->indices.empty())
{
size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
if (model_outputs != NULL)
{
if (model_outputs->missing_action == Divide)
goto manual_remap;
if (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && categ_data != NULL)
goto manual_remap;
}

for (size_t tree = 0; tree < ntrees; tree++)
{
size_t *restrict mapping = indexer->indices[tree].terminal_node_mappings.data();
for (size_t row = 0; row < nrows; row++)
{
tree_num[row + tree*nrows] = mapping[tree_num[row + tree*nrows]];
}
}
}

else
{
manual_remap:
remap_terminal_trees(model_outputs, model_outputs_ext,
prediction_data, tree_num, nthreads);
}
}
}

template <class real_t, class sparse_ix>
void traverse_itree_fast(std::vector<IsoTree> &tree,
IsoForest &model_outputs,
real_t *restrict row_numeric_data,
double &restrict output_depth,
sparse_ix *restrict tree_num,
double *restrict tree_depth,
size_t row) noexcept
{
size_t curr_lev = 0;
double xval;
while (true)
{
if (unlikely(tree[curr_lev].tree_left == 0))
{
output_depth += tree[curr_lev].score;
if (unlikely(tree_num != NULL))
tree_num[row] = curr_lev;
if (unlikely(tree_depth != NULL))
*tree_depth = tree[curr_lev].score;
break;
}

else
{
xval = row_numeric_data[tree[curr_lev].col_num];
curr_lev = (xval <= tree[curr_lev].num_split)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}
}
}

template <class PredictionData, class sparse_ix>
void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
IsoForest &model_outputs,
PredictionData &prediction_data,
double &restrict output_depth,
sparse_ix *restrict tree_num,
double *restrict tree_depth,
size_t row) noexcept
{
size_t curr_lev = 0;
double xval;
int cval;
while (true)
{
// if (tree[curr_lev].score > 0)
if (unlikely(tree[curr_lev].tree_left == 0))
{
output_depth += tree[curr_lev].score;
if (unlikely(tree_num != NULL))
tree_num[row] = curr_lev;
if (unlikely(tree_depth != NULL))
*tree_depth = tree[curr_lev].score;
break;
}

else
{
switch (tree[curr_lev].col_type)
{
case Numeric:
{
xval = prediction_data.numeric_data[
prediction_data.is_col_major?
(row + tree[curr_lev].col_num * prediction_data.nrows)
:
(tree[curr_lev].col_num + row * prediction_data.ncols_numeric)
];
curr_lev = (xval <= tree[curr_lev].num_split)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
break;
}

case Categorical:
{
cval = prediction_data.categ_data[
prediction_data.is_col_major?
(row + tree[curr_lev].col_num * prediction_data.nrows)
:
(tree[curr_lev].col_num + row * prediction_data.ncols_categ)
];
switch (model_outputs.cat_split_type)
{
case SubSet:
{

if (tree[curr_lev].cat_split.empty()) /* this is for binary columns */
{
if (cval <= 1)
{
curr_lev = (cval == 0)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}

else /* can only work with 'Smallest' + no NAs if reaching this point */
{
curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}
}

else
{

switch (model_outputs.new_cat_action)
{
case Random:
{
cval = (cval >= (int)tree[curr_lev].cat_split.size())?
(cval % (int)tree[curr_lev].cat_split.size()) : cval;
curr_lev = (tree[curr_lev].cat_split[cval])?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
break;
}

case Smallest:
{
if (unlikely(cval >= (int)tree[curr_lev].cat_split.size()))
{
curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}

else
{
curr_lev = (tree[curr_lev].cat_split[cval])?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}
break;
}

default:
{
assert(0);
break;
}
}
}
break;
}

case SingleCateg:
{
curr_lev = (cval == tree[curr_lev].chosen_cat)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
break;
}
}
break;
}

default:
{
assert(0);
break;
}
}
}
}
}

enum NumericConfig {DenseRowMajor, DenseColMajor, SparseCSR, SparseCSC};

template <class PredictionData, class sparse_ix, class ImputedData>
double traverse_itree(std::vector<IsoTree> &tree,
IsoForest &model_outputs,
PredictionData &prediction_data,
std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
ImputedData *imputed_data, /* only when imputing missing */
double curr_weight, /* only when imputing missing */
size_t row,
sparse_ix *restrict tree_num,
double *restrict tree_depth,
size_t curr_lev) noexcept
{
double xval;
int cval;
double range_penalty = 0;

NumericConfig numeric_config;
if (prediction_data.Xr_indptr != NULL)
numeric_config = SparseCSR;
else if (prediction_data.Xc_indptr != NULL)
numeric_config = SparseCSC;
else if (prediction_data.is_col_major)
numeric_config = DenseColMajor;
else
numeric_config = DenseRowMajor;

sparse_ix *row_st = NULL, *row_end = NULL;
if (numeric_config == SparseCSR)
{
row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
}

while (true)
{
// if (tree[curr_lev].score >= 0.)
if (unlikely(tree[curr_lev].tree_left == 0))
{
if (unlikely(tree_num != NULL))
tree_num[row] = curr_lev;
if (unlikely(tree_depth != NULL))
*tree_depth = tree[curr_lev].score;
if (unlikely(imputed_data != NULL))
add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, curr_weight);

return tree[curr_lev].score - range_penalty;
}

else
{
switch(tree[curr_lev].col_type)
{
case Numeric:
{
switch(numeric_config)
{
case DenseRowMajor:
{
xval = prediction_data.numeric_data[tree[curr_lev].col_num + row * prediction_data.ncols_numeric];
break;
}

case DenseColMajor:
{
xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
break;
}

case SparseCSR:
{
xval = extract_spR(prediction_data, row_st, row_end, tree[curr_lev].col_num);
break;
}

case SparseCSC:
{
xval = extract_spC(prediction_data, row, tree[curr_lev].col_num);
break;
}
}

if (unlikely(std::isnan(xval)))
{
switch(model_outputs.missing_action)
{
case Divide:
{
return
tree[curr_lev].pct_tree_left
* traverse_itree(tree, model_outputs, prediction_data,
impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
+ (1. - tree[curr_lev].pct_tree_left)
* traverse_itree(tree, model_outputs, prediction_data,
impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
- range_penalty;
}

case Impute:
{
curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
break;
}

case Fail:
{
return NAN;
}
}
}

else
{
range_penalty += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
curr_lev = (xval <= tree[curr_lev].num_split)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}
break;
}

case Categorical:
{
cval = prediction_data.categ_data[
prediction_data.is_col_major?
(row + tree[curr_lev].col_num * prediction_data.nrows)
:
(tree[curr_lev].col_num + row * prediction_data.ncols_categ)
];
if (unlikely(cval < 0))
{
switch(model_outputs.missing_action)
{
case Divide:
{
return
tree[curr_lev].pct_tree_left
* traverse_itree(tree, model_outputs, prediction_data,
impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
+ (1. - tree[curr_lev].pct_tree_left)
* traverse_itree(tree, model_outputs, prediction_data,
impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
- range_penalty;
}

case Impute:
{
curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
break;
}

case Fail:
{
return NAN;
}
}
}

else
{
switch(model_outputs.cat_split_type)
{
case SingleCateg:
{
curr_lev = (cval == tree[curr_lev].chosen_cat)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
break;
}

case SubSet:
{

if (tree[curr_lev].cat_split.empty())
{
if (cval <= 1)
{
curr_lev = (cval == 0)?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}

else
{
switch(model_outputs.new_cat_action)
{
case Smallest:
{
curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
break;
}

case Weighted:
{
return
tree[curr_lev].pct_tree_left
* traverse_itree(tree, model_outputs, prediction_data,
impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
+ (1. - tree[curr_lev].pct_tree_left)
* traverse_itree(tree, model_outputs, prediction_data,
impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
- range_penalty;
}

default:
{
assert(0);
break;
}
}
}
}

else
{
switch(model_outputs.new_cat_action)
{
case Random:
{
cval = (cval >= (int)tree[curr_lev].cat_split.size())?
(cval % (int)tree[curr_lev].cat_split.size()) : cval;
curr_lev = (tree[curr_lev].cat_split[cval])?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
break;
}

case Smallest:
{
if (unlikely(cval >= (int)tree[curr_lev].cat_split.size()))
{
curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}

else
{
curr_lev = (tree[curr_lev].cat_split[cval])?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}
break;
}

case Weighted:
{
if (cval >= (int)tree[curr_lev].cat_split.size()
||
tree[curr_lev].cat_split[cval] == (-1))
{
return
tree[curr_lev].pct_tree_left
* traverse_itree(tree, model_outputs, prediction_data,
impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
+ (1. - tree[curr_lev].pct_tree_left)
* traverse_itree(tree, model_outputs, prediction_data,
impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
- range_penalty;
}

else
{
curr_lev = (tree[curr_lev].cat_split[cval])?
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
}
break;
}
}
}
break;
}
}
}
break;
}

default:
{
assert(0);
break;
}
}
}
}
}

/* this is a simpler version for situations in which there is
only numeric data in dense arrays, no missing values, no range penalty */
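/* Editor's illustration of the hyperplane test below (made-up numbers): with
   coef = {0.7, -0.7}, mean = {0, 0} and a row with values {1, 2}, the projection
   is hval = (1-0)*0.7 + (2-0)*(-0.7) = -0.7, and the traversal goes left iff
   hval <= split_point. */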
970
+ template <class PredictionData, class sparse_ix>
971
+ void traverse_hplane_fast_colmajor(std::vector<IsoHPlane> &hplane,
972
+ ExtIsoForest &model_outputs,
973
+ PredictionData &prediction_data,
974
+ double &restrict output_depth,
975
+ sparse_ix *restrict tree_num,
976
+ double *restrict tree_depth,
977
+ size_t row) noexcept
978
+ {
979
+ size_t curr_lev = 0;
980
+ double hval;
981
+
982
+ while(true)
983
+ {
984
+ // if (hplane[curr_lev].score > 0)
985
+ if (unlikely(hplane[curr_lev].hplane_left == 0))
986
+ {
987
+ output_depth += hplane[curr_lev].score;
988
+ if (unlikely(tree_num != NULL))
989
+ tree_num[row] = curr_lev;
990
+ if (unlikely(tree_depth != NULL))
991
+ *tree_depth = hplane[curr_lev].score;
992
+ return;
993
+ }
994
+
995
+ else
996
+ {
997
+ hval = 0;
998
+ for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
999
+ hval += (prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows]
1000
+ - hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
1001
+
1002
+ curr_lev = (hval <= hplane[curr_lev].split_point)?
1003
+ hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
1004
+
1005
+ }
1006
+ }
1007
+ }
1008
+
1009
+ template <class real_t, class sparse_ix>
1010
+ void traverse_hplane_fast_rowmajor(std::vector<IsoHPlane> &hplane,
1011
+ ExtIsoForest &model_outputs,
1012
+ real_t *restrict row_numeric_data,
1013
+ double &restrict output_depth,
1014
+ sparse_ix *restrict tree_num,
1015
+ double *restrict tree_depth,
1016
+ size_t row) noexcept
1017
+ {
1018
+ size_t curr_lev = 0;
1019
+ double hval;
1020
+
1021
+ while(true)
1022
+ {
1023
+ // if (hplane[curr_lev].score > 0)
1024
+ if (unlikely(hplane[curr_lev].hplane_left == 0))
1025
+ {
1026
+ output_depth += hplane[curr_lev].score;
1027
+ if (unlikely(tree_num != NULL))
1028
+ tree_num[row] = curr_lev;
1029
+ if (unlikely(tree_depth != NULL))
1030
+ *tree_depth = hplane[curr_lev].score;
1031
+ return;
1032
+ }
1033
+
1034
+ else
1035
+ {
1036
+ hval = 0;
1037
+ for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
1038
+ hval += (row_numeric_data[hplane[curr_lev].col_num[col]]
1039
+ - hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
1040
+
1041
+ curr_lev = (hval <= hplane[curr_lev].split_point)?
1042
+ hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
1043
+
1044
+ }
1045
+ }
1046
+ }
1047
+
1048
+ /* this is the full version that works with potentially missing values, sparse matrices, and categoricals */
1049
+ template <class PredictionData, class sparse_ix, class ImputedData>
1050
+ void traverse_hplane(std::vector<IsoHPlane> &hplane,
1051
+ ExtIsoForest &model_outputs,
1052
+ PredictionData &prediction_data,
1053
+ double &restrict output_depth,
1054
+ std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
1055
+ ImputedData *imputed_data, /* only when imputing missing */
1056
+ sparse_ix *restrict tree_num,
1057
+ double *restrict tree_depth,
1058
+ size_t row) noexcept
1059
+ {
1060
+ size_t curr_lev = 0;
1061
+ double xval;
1062
+ int cval;
1063
+ double hval;
1064
+
1065
+ size_t ncols_numeric, ncols_categ;
1066
+
1067
+ NumericConfig numeric_config;
1068
+ if (prediction_data.Xr_indptr != NULL)
1069
+ numeric_config = SparseCSR;
1070
+ else if (prediction_data.Xc_indptr != NULL)
1071
+ numeric_config = SparseCSC;
1072
+ else if (prediction_data.is_col_major)
1073
+ numeric_config = DenseColMajor;
1074
+ else
1075
+ numeric_config = DenseRowMajor;
1076
+
1077
+ sparse_ix *row_st = NULL, *row_end = NULL;
1078
+ size_t lb, ub;
1079
+ if (numeric_config == SparseCSR)
1080
+ {
1081
+ row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
1082
+ row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
1083
+ lb = *row_st;
1084
+ ub = *(row_end-1);
1085
+ }
1086
+
1087
+ while (true)
1088
+ {
1089
+ // if (hplane[curr_lev].score > 0)
1090
+ if (unlikely(hplane[curr_lev].hplane_left == 0))
1091
+ {
1092
+ output_depth += hplane[curr_lev].score;
1093
+ if (unlikely(tree_num != NULL))
1094
+ tree_num[row] = curr_lev;
1095
+ if (unlikely(tree_depth != NULL))
1096
+ *tree_depth = hplane[curr_lev].score;
1097
+ if (unlikely(imputed_data != NULL))
1098
+ {
1099
+ add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, (double)1);
1100
+ }
1101
+ return;
1102
+ }
1103
+
1104
+ else
1105
+ {
1106
+ hval = 0;
1107
+ ncols_numeric = 0; ncols_categ = 0;
1108
+ for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
1109
+ {
1110
+ switch(hplane[curr_lev].col_type[col])
1111
+ {
1112
+ case Numeric:
1113
+ {
1114
+ switch(numeric_config)
1115
+ {
1116
+ case DenseRowMajor:
1117
+ {
1118
+ xval = prediction_data.numeric_data[hplane[curr_lev].col_num[col] + row * prediction_data.ncols_numeric];
1119
+ break;
1120
+ }
1121
+
1122
+ case DenseColMajor:
1123
+ {
1124
+ xval = prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
1125
+ break;
1126
+ }
1127
+
1128
+ case SparseCSR:
1129
+ {
1130
+ xval = extract_spR(prediction_data, row_st, row_end, hplane[curr_lev].col_num[col], lb, ub);
1131
+ break;
1132
+ }
1133
+
1134
+ case SparseCSC:
1135
+ {
1136
+ xval = extract_spC(prediction_data, row, hplane[curr_lev].col_num[col]);
1137
+ break;
1138
+ }
1139
+ }
1140
+
1141
+ if (unlikely(is_na_or_inf(xval)))
1142
+ {
1143
+ if (model_outputs.missing_action != Fail)
1144
+ {
1145
+ hval += hplane[curr_lev].fill_val[col];
1146
+ }
1147
+
1148
+ else
1149
+ {
1150
+ output_depth = NAN;
1151
+ return;
1152
+ }
1153
+ }
1154
+
1155
+ else
1156
+ {
1157
+ hval += (xval - hplane[curr_lev].mean[ncols_numeric]) * hplane[curr_lev].coef[ncols_numeric];
1158
+ }
1159
+
1160
+ ncols_numeric++;
1161
+ break;
1162
+ }
1163
+
1164
+ case Categorical:
1165
+ {
1166
+ cval = prediction_data.categ_data[
1167
+ prediction_data.is_col_major?
1168
+ (row + hplane[curr_lev].col_num[col] * prediction_data.nrows)
1169
+ :
1170
+ (hplane[curr_lev].col_num[col] + row * prediction_data.ncols_categ)
1171
+ ];
1172
+ if (unlikely(cval < 0))
1173
+ {
1174
+ if (model_outputs.missing_action != Fail)
1175
+ {
1176
+ hval += hplane[curr_lev].fill_val[col];
1177
+ }
1178
+
1179
+ else
1180
+ {
1181
+ output_depth = NAN;
1182
+ return;
1183
+ }
1184
+ }
1185
+
1186
+ else
1187
+ {
1188
+ switch(model_outputs.cat_split_type)
1189
+ {
1190
+ case SingleCateg:
1191
+ {
1192
+ hval += (cval == hplane[curr_lev].chosen_cat[ncols_categ])? hplane[curr_lev].fill_new[ncols_categ] : 0;
1193
+ break;
1194
+ }
1195
+
1196
+ case SubSet:
1197
+ {
1198
+ if (unlikely(cval >= (int)hplane[curr_lev].cat_coef[ncols_categ].size()))
1199
+ {
1200
+ if (model_outputs.new_cat_action == Random) {
1201
+ cval = cval % (int)hplane[curr_lev].cat_coef[ncols_categ].size();
1202
+ hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
1203
+ }
1204
+
1205
+ else {
1206
+ hval += hplane[curr_lev].fill_new[ncols_categ];
1207
+ }
1208
+ }
1209
+
1210
+ else
1211
+ {
1212
+ hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
1213
+ }
1214
+
1215
+ break;
1216
+ }
1217
+ }
1218
+ }
1219
+
1220
+ ncols_categ++;
1221
+ break;
1222
+ }
1223
+
1224
+ default:
1225
+ {
1226
+ assert(0);
1227
+ break;
1228
+ }
1229
+ }
1230
+
1231
+ }
1232
+
1233
+ output_depth -= (hval < hplane[curr_lev].range_low) ||
1234
+ (hval > hplane[curr_lev].range_high);
1235
+ curr_lev = (hval <= hplane[curr_lev].split_point)?
1236
+ hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
1237
+ }
1238
+ }
1239
+ }
1240
+
1241
+ template <class real_t, class sparse_ix>
1242
+ void batched_csc_predict(PredictionData<real_t, sparse_ix> &prediction_data, int nthreads,
+                          IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+                          double *restrict output_depths, sparse_ix *restrict tree_num,
+                          double *restrict per_tree_depths)
+ {
+     #ifdef _OPENMP
+     size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
+     if ((size_t)nthreads > ntrees)
+         nthreads = (int)ntrees;
+     #else
+     nthreads = 1;
+     #endif
+     std::vector<WorkerForPredictCSC> worker_memory(nthreads);
+
+     bool threw_exception = false;
+     std::exception_ptr ex = NULL;
+
+     if (model_outputs != NULL)
+     {
+         #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
+                 shared(worker_memory, model_outputs, prediction_data, tree_num, per_tree_depths, threw_exception, ex)
+         for (size_t_for tree = 0; tree < (decltype(tree))model_outputs->trees.size(); tree++)
+         {
+             if (threw_exception) continue;
+             try
+             {
+                 WorkerForPredictCSC *ptr_worker = &worker_memory[omp_get_thread_num()];
+                 if (!ptr_worker->depths.size())
+                 {
+                     ptr_worker->depths.resize(prediction_data.nrows);
+                     ptr_worker->ix_arr.resize(prediction_data.nrows);
+                     std::iota(ptr_worker->ix_arr.begin(),
+                               ptr_worker->ix_arr.end(),
+                               (size_t)0);
+
+                     if (model_outputs->missing_action == Divide ||
+                         (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && prediction_data.categ_data != NULL))
+                     {
+                         ptr_worker->weights_arr.resize(prediction_data.nrows);
+                     }
+                 }
+
+                 ptr_worker->st = 0;
+                 ptr_worker->end = prediction_data.nrows - 1;
+                 /* weights must be reset to 1 before each tree whenever they are in use:
+                    both 'Divide' missing-value handling and 'Weighted' new-category
+                    handling multiply them during traversal */
+                 if (ptr_worker->weights_arr.size())
+                     std::fill(ptr_worker->weights_arr.begin(),
+                               ptr_worker->weights_arr.end(),
+                               (double)1);
+
+                 traverse_itree_csc(*ptr_worker,
+                                    model_outputs->trees[tree],
+                                    *model_outputs,
+                                    prediction_data,
+                                    (tree_num == NULL)?
+                                        ((sparse_ix*)NULL) : (tree_num + tree*prediction_data.nrows),
+                                    (per_tree_depths == NULL)? /* assumed layout: one 'nrows' block per tree, mirroring 'tree_num' */
+                                        ((double*)NULL) : (per_tree_depths + tree*prediction_data.nrows),
+                                    (size_t)0,
+                                    model_outputs->has_range_penalty);
+             }
+
+             catch (...)
+             {
+                 #pragma omp critical
+                 {
+                     if (!threw_exception)
+                     {
+                         threw_exception = true;
+                         ex = std::current_exception();
+                     }
+                 }
+             }
+         }
+     }
+
+     else
+     {
+         #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
+                 shared(worker_memory, model_outputs_ext, prediction_data, tree_num, per_tree_depths, threw_exception, ex)
+         for (size_t_for tree = 0; tree < (decltype(tree))model_outputs_ext->hplanes.size(); tree++)
+         {
+             if (threw_exception) continue;
+             try
+             {
+                 WorkerForPredictCSC *ptr_worker = &worker_memory[omp_get_thread_num()];
+                 if (!ptr_worker->depths.size())
+                 {
+                     ptr_worker->depths.resize(prediction_data.nrows);
+                     ptr_worker->comb_val.resize(prediction_data.nrows);
+                     ptr_worker->ix_arr.resize(prediction_data.nrows);
+                     std::iota(ptr_worker->ix_arr.begin(),
+                               ptr_worker->ix_arr.end(),
+                               (size_t)0);
+                 }
+
+                 ptr_worker->st = 0;
+                 ptr_worker->end = prediction_data.nrows - 1;
+
+                 traverse_hplane_csc(*ptr_worker,
+                                     model_outputs_ext->hplanes[tree],
+                                     *model_outputs_ext,
+                                     prediction_data,
+                                     (tree_num == NULL)?
+                                         ((sparse_ix*)NULL) : (tree_num + tree*prediction_data.nrows),
+                                     (per_tree_depths == NULL)?
+                                         ((double*)NULL) : (per_tree_depths + tree*prediction_data.nrows),
+                                     (size_t)0,
+                                     model_outputs_ext->has_range_penalty);
+             }
+
+             catch (...)
+             {
+                 #pragma omp critical
+                 {
+                     if (!threw_exception)
+                     {
+                         threw_exception = true;
+                         ex = std::current_exception();
+                     }
+                 }
+             }
+         }
+     }
+
+     /* exceptions captured in either parallel region are rethrown here, on the calling thread */
+     if (threw_exception)
+         std::rethrow_exception(ex);
+
+     #ifdef _OPENMP
+     if (nthreads <= 1)
+     #endif
+     {
+         std::copy(worker_memory.front().depths.begin(), worker_memory.front().depths.end(), output_depths);
+     }
+
+     #ifdef _OPENMP
+     else
+     {
+         /* reduce the per-thread depth accumulators into the final output */
+         std::fill(output_depths, output_depths + prediction_data.nrows, (double)0);
+         for (auto &workspace : worker_memory)
+             if (workspace.depths.size())
+                 #if !defined(_MSC_VER) && !defined(_WIN32)
+                 #pragma omp simd
+                 #endif
+                 for (size_t row = 0; row < prediction_data.nrows; row++)
+                     output_depths[row] += workspace.depths[row];
+     }
+     #endif
+ }
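/* Aside (editorial sketch, not part of this diff): exceptions may not escape an
   OpenMP parallel region, which is why the loops above record the first
   std::exception_ptr under a critical section and rethrow only after the region
   ends. The same pattern in isolation, with a stand-in workload: */
#include <exception>
#include <stdexcept>

void parallel_work_sketch(int n)
{
    bool threw_exception = false;
    std::exception_ptr ex = nullptr;

    #pragma omp parallel for shared(threw_exception, ex)
    for (int i = 0; i < n; i++)
    {
        if (threw_exception) continue;   /* best-effort early exit for remaining iterations */
        try {
            if (i == 13) throw std::runtime_error("boom");   /* stand-in for real work */
        }
        catch (...) {
            #pragma omp critical
            {
                if (!threw_exception) {  /* keep only the first exception */
                    threw_exception = true;
                    ex = std::current_exception();
                }
            }
        }
    }

    if (threw_exception)
        std::rethrow_exception(ex);      /* rethrown on the calling thread */
}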
1388
+
1389
+ template <class PredictionData, class sparse_ix>
1390
+ void traverse_itree_csc(WorkerForPredictCSC &workspace,
1391
+ std::vector<IsoTree> &trees,
1392
+ IsoForest &model_outputs,
1393
+ PredictionData &prediction_data,
1394
+ sparse_ix *restrict tree_num,
1395
+ double *restrict per_tree_depths,
1396
+ size_t curr_tree,
1397
+ bool has_range_penalty)
1398
+ {
1399
+ // if (trees[curr_tree].score >= 0)
1400
+ if (unlikely(trees[curr_tree].tree_left == 0))
1401
+ {
1402
+ if (model_outputs.missing_action != Divide)
1403
+ for (size_t row = workspace.st; row <= workspace.end; row++)
1404
+ workspace.depths[workspace.ix_arr[row]] += trees[curr_tree].score;
1405
+ else
1406
+ for (size_t row = workspace.st; row <= workspace.end; row++)
1407
+ workspace.depths[workspace.ix_arr[row]] += workspace.weights_arr[workspace.ix_arr[row]] * trees[curr_tree].score;
1408
+ if (unlikely(tree_num != NULL))
1409
+ for (size_t row = workspace.st; row <= workspace.end; row++)
1410
+ tree_num[workspace.ix_arr[row]] = curr_tree;
1411
+ if (unlikely(per_tree_depths != NULL))
1412
+ for (size_t row = workspace.st; row <= workspace.end; row++)
1413
+ per_tree_depths[workspace.ix_arr[row]] = trees[curr_tree].score;
1414
+ return;
1415
+ }
1416
+
1417
+ /* in this case, the indices are sorted in the csc penalty function */
1418
+ if (!(has_range_penalty && model_outputs.missing_action != Divide && curr_tree > 0) && trees[curr_tree].col_type == Numeric)
1419
+ std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
1420
+
1421
+ /* TODO: should mix the splitting function with the range penalty */
1422
+
1423
+ /* divide according to tree */
1424
+ size_t orig_end = workspace.end;
1425
+ size_t st_NA, end_NA, split_ix;
1426
+ switch (trees[curr_tree].col_type)
1427
+ {
1428
+ case Numeric:
1429
+ {
1430
+ divide_subset_split(workspace.ix_arr.data(), workspace.st, workspace.end, trees[curr_tree].col_num,
1431
+ prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
1432
+ trees[curr_tree].num_split, model_outputs.missing_action,
1433
+ st_NA, end_NA, split_ix);
1434
+ break;
1435
+ }
1436
+
1437
+ case Categorical:
1438
+ {
1439
+ switch (model_outputs.cat_split_type)
1440
+ {
1441
+ case SingleCateg:
1442
+ {
1443
+ divide_subset_split(workspace.ix_arr.data(),
1444
+ prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
1445
+ workspace.st, workspace.end, trees[curr_tree].chosen_cat,
1446
+ model_outputs.missing_action, st_NA, end_NA, split_ix);
1447
+ break;
1448
+ }
1449
+
1450
+ case SubSet:
1451
+ {
1452
+ if (!trees[curr_tree].cat_split.size())
1453
+ divide_subset_split(workspace.ix_arr.data(),
1454
+ prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
1455
+ workspace.st, workspace.end,
1456
+ model_outputs.missing_action, model_outputs.new_cat_action,
1457
+ trees[curr_tree].pct_tree_left < .5, st_NA, end_NA, split_ix);
1458
+ else
1459
+ divide_subset_split(workspace.ix_arr.data(),
1460
+ prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
1461
+ workspace.st, workspace.end, trees[curr_tree].cat_split.data(),
1462
+ (int) trees[curr_tree].cat_split.size(),
1463
+ model_outputs.missing_action, model_outputs.new_cat_action,
1464
+ (bool)(trees[curr_tree].pct_tree_left < .5), st_NA, end_NA, split_ix);
1465
+ break;
1466
+ }
1467
+ }
1468
+ break;
1469
+ }
1470
+
1471
+ default:
1472
+ {
1473
+ assert(0);
1474
+ break;
1475
+ }
1476
+ }
1477
+
1478
+ /* continue splitting recursively */
1479
+ if (unlikely(model_outputs.new_cat_action == Weighted && model_outputs.cat_split_type == SubSet && prediction_data.categ_data != NULL))
1480
+ goto missing_action_divide;
1481
+ switch (model_outputs.missing_action)
1482
+ {
1483
+ case Impute:
1484
+ {
1485
+ split_ix = (trees[curr_tree].pct_tree_left >= .5)? end_NA : st_NA;
1486
+ }
1487
+
1488
+ case Fail:
1489
+ {
1490
+ if (split_ix > workspace.st)
1491
+ {
1492
+ workspace.end = split_ix - 1;
1493
+
1494
+ if (has_range_penalty && trees[curr_tree].col_type == Numeric)
1495
+ add_csc_range_penalty(workspace,
1496
+ prediction_data,
1497
+ (double*)NULL,
1498
+ trees[curr_tree].col_num,
1499
+ trees[curr_tree].range_low,
1500
+ trees[curr_tree].range_high);
1501
+
1502
+ traverse_itree_csc(workspace,
1503
+ trees,
1504
+ model_outputs,
1505
+ prediction_data,
1506
+ tree_num,
1507
+ per_tree_depths,
1508
+ trees[curr_tree].tree_left,
1509
+ has_range_penalty);
1510
+ }
1511
+
1512
+
1513
+ if (split_ix <= orig_end)
1514
+ {
1515
+ workspace.st = split_ix;
1516
+ workspace.end = orig_end;
1517
+
1518
+ if (has_range_penalty && trees[curr_tree].col_type == Numeric)
1519
+ add_csc_range_penalty(workspace,
1520
+ prediction_data,
1521
+ (double*)NULL,
1522
+ trees[curr_tree].col_num,
1523
+ trees[curr_tree].range_low,
1524
+ trees[curr_tree].range_high);
1525
+
1526
+ traverse_itree_csc(workspace,
1527
+ trees,
1528
+ model_outputs,
1529
+ prediction_data,
1530
+ tree_num,
1531
+ per_tree_depths,
1532
+ trees[curr_tree].tree_right,
1533
+ has_range_penalty);
1534
+ }
1535
+ break;
1536
+ }
1537
+
1538
+ case Divide:
1539
+ {
1540
+ missing_action_divide:
1541
+ /* TODO: maybe here it shouldn't copy the whole ix_arr,
1542
+ but then it'd need to re-generate it from outside too */
1543
+ std::vector<double> weights_arr;
1544
+ std::vector<size_t> ix_arr;
1545
+ if (end_NA > workspace.st)
1546
+ {
1547
+ weights_arr.assign(workspace.weights_arr.begin(),
1548
+ workspace.weights_arr.begin() + end_NA);
1549
+ ix_arr.assign(workspace.ix_arr.data(),
1550
+ workspace.ix_arr.data() + end_NA);
1551
+ }
1552
+
1553
+ if (has_range_penalty && trees[curr_tree].col_type == Numeric)
1554
+ {
1555
+ size_t st = workspace.st;
1556
+ size_t end = workspace.end;
1557
+
1558
+ if (workspace.st < st_NA)
1559
+ {
1560
+ workspace.end = st_NA - 1;
1561
+ add_csc_range_penalty(workspace,
1562
+ prediction_data,
1563
+ workspace.weights_arr.data(),
1564
+ trees[curr_tree].col_num,
1565
+ trees[curr_tree].range_low,
1566
+ trees[curr_tree].range_high);
1567
+ }
1568
+
1569
+ if (workspace.end >= end_NA)
1570
+ {
1571
+ workspace.st = end_NA;
1572
+ workspace.end = end;
1573
+ add_csc_range_penalty(workspace,
1574
+ prediction_data,
1575
+ workspace.weights_arr.data(),
1576
+ trees[curr_tree].col_num,
1577
+ trees[curr_tree].range_low,
1578
+ trees[curr_tree].range_high);
1579
+ }
1580
+
1581
+ workspace.st = st;
1582
+ workspace.end = end;
1583
+ }
1584
+
1585
+ if (end_NA > workspace.st)
1586
+ {
1587
+ workspace.end = end_NA - 1;
1588
+ for (size_t row = st_NA; row < end_NA; row++)
1589
+ workspace.weights_arr[workspace.ix_arr[row]] *= trees[curr_tree].pct_tree_left;
1590
+ traverse_itree_csc(workspace,
1591
+ trees,
1592
+ model_outputs,
1593
+ prediction_data,
1594
+ tree_num,
1595
+ per_tree_depths,
1596
+ trees[curr_tree].tree_left,
1597
+ has_range_penalty);
1598
+ }
1599
+
1600
+ if (st_NA <= orig_end)
1601
+ {
1602
+ workspace.st = st_NA;
1603
+ workspace.end = orig_end;
1604
+ if (weights_arr.size())
1605
+ {
1606
+ std::copy(weights_arr.begin(),
1607
+ weights_arr.end(),
1608
+ workspace.weights_arr.begin());
1609
+ std::copy(ix_arr.begin(),
1610
+ ix_arr.end(),
1611
+ workspace.ix_arr.begin());
1612
+ weights_arr.clear();
1613
+ weights_arr.shrink_to_fit();
1614
+ ix_arr.clear();
1615
+ ix_arr.shrink_to_fit();
1616
+ }
1617
+
1618
+ for (size_t row = st_NA; row < end_NA; row++)
1619
+ workspace.weights_arr[workspace.ix_arr[row]] *= (1. - trees[curr_tree].pct_tree_left);
1620
+ traverse_itree_csc(workspace,
1621
+ trees,
1622
+ model_outputs,
1623
+ prediction_data,
1624
+ tree_num,
1625
+ per_tree_depths,
1626
+ trees[curr_tree].tree_right,
1627
+ has_range_penalty);
1628
+ }
1629
+ break;
1630
+ }
1631
+ }
1632
+ }
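/* Aside (editorial sketch): under missing_action=Divide, a row with a missing
   split value is sent down *both* children, carrying weight pct_tree_left on the
   left and (1 - pct_tree_left) on the right; its final depth is the weighted sum
   over all leaves it reaches. A toy version of that recursion for a single
   observation, on a hypothetical 'Node' struct (not isotree's): */
#include <cmath>

struct Node {
    double split;      /* threshold on one feature */
    double pct_left;   /* fraction of training points that went left */
    double score;      /* depth/score at terminal nodes */
    const Node *left, *right;   /* both NULL at a leaf */
};

double weighted_depth(const Node *node, double x, double w = 1.0)
{
    if (!node->left)                 /* terminal node */
        return w * node->score;
    if (std::isnan(x))               /* missing: divide between both branches */
        return weighted_depth(node->left,  x, w * node->pct_left)
             + weighted_depth(node->right, x, w * (1.0 - node->pct_left));
    return (x <= node->split)? weighted_depth(node->left,  x, w)
                             : weighted_depth(node->right, x, w);
}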
+
+ template <class PredictionData, class sparse_ix>
+ void traverse_hplane_csc(WorkerForPredictCSC &workspace,
+                          std::vector<IsoHPlane> &hplanes,
+                          ExtIsoForest &model_outputs,
+                          PredictionData &prediction_data,
+                          sparse_ix *restrict tree_num,
+                          double *restrict per_tree_depths,
+                          size_t curr_tree,
+                          bool has_range_penalty)
+ {
+     /* terminal node (equivalent check: hplanes[curr_tree].score >= 0) */
+     if (unlikely(hplanes[curr_tree].hplane_left == 0))
+     {
+         for (size_t row = workspace.st; row <= workspace.end; row++)
+             workspace.depths[workspace.ix_arr[row]] += hplanes[curr_tree].score;
+         if (unlikely(tree_num != NULL))
+             for (size_t row = workspace.st; row <= workspace.end; row++)
+                 tree_num[workspace.ix_arr[row]] = curr_tree;
+         if (unlikely(per_tree_depths != NULL))
+             for (size_t row = workspace.st; row <= workspace.end; row++)
+                 per_tree_depths[workspace.ix_arr[row]] = hplanes[curr_tree].score;
+         return;
+     }
+
+     std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
+     std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0.);
+     double unused;
+
+     if (likely(prediction_data.categ_data == NULL))
+     {
+         for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
+             add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
+                             hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
+                             prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
+                             hplanes[curr_tree].coef[col], (double)0, hplanes[curr_tree].mean[col],
+                             (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
+                             model_outputs.missing_action, NULL, NULL, false);
+     }
+
+     else
+     {
+         size_t ncols_numeric = 0;
+         size_t ncols_categ = 0;
+         for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
+         {
+             switch (hplanes[curr_tree].col_type[col])
+             {
+                 case Numeric:
+                 {
+                     add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                     hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
+                                     prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
+                                     hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
+                                     (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
+                                     model_outputs.missing_action, NULL, NULL, false);
+                     ncols_numeric++;
+                     break;
+                 }
+
+                 case Categorical:
+                 {
+                     add_linear_comb<double>(
+                         workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
+                         prediction_data.categ_data + hplanes[curr_tree].col_num[col] * prediction_data.nrows,
+                         (model_outputs.cat_split_type == SubSet)? (int)hplanes[curr_tree].cat_coef[ncols_categ].size() : 0,
+                         (model_outputs.cat_split_type == SubSet)? hplanes[curr_tree].cat_coef[ncols_categ].data() : NULL,
+                         (model_outputs.cat_split_type == SingleCateg)? hplanes[curr_tree].fill_new[ncols_categ] : 0.,
+                         (model_outputs.cat_split_type == SingleCateg)? hplanes[curr_tree].chosen_cat[ncols_categ] : 0,
+                         hplanes[curr_tree].fill_val[col], hplanes[curr_tree].fill_new[ncols_categ], NULL, NULL,
+                         model_outputs.new_cat_action, model_outputs.missing_action, model_outputs.cat_split_type, false);
+                     ncols_categ++;
+                     break;
+                 }
+
+                 default:
+                 {
+                     assert(0);
+                     break;
+                 }
+             }
+         }
+     }
+
+     if (has_range_penalty)
+     {
+         for (size_t row = workspace.st; row <= workspace.end; row++)
+             workspace.depths[workspace.ix_arr[row]] -=
+                 (workspace.comb_val[row - workspace.st] < hplanes[curr_tree].range_low) ||
+                 (workspace.comb_val[row - workspace.st] > hplanes[curr_tree].range_high);
+     }
+
+     /* divide data */
+     size_t split_ix = divide_subset_split(workspace.ix_arr.data(), workspace.comb_val.data(),
+                                           workspace.st, workspace.end, hplanes[curr_tree].split_point);
+
+     /* continue splitting recursively */
+     size_t orig_end = workspace.end;
+     if (split_ix > workspace.st)
+     {
+         workspace.end = split_ix - 1;
+         traverse_hplane_csc(workspace,
+                             hplanes,
+                             model_outputs,
+                             prediction_data,
+                             tree_num,
+                             per_tree_depths,
+                             hplanes[curr_tree].hplane_left,
+                             has_range_penalty);
+     }
+
+     if (split_ix <= orig_end)
+     {
+         workspace.st = split_ix;
+         workspace.end = orig_end;
+         traverse_hplane_csc(workspace,
+                             hplanes,
+                             model_outputs,
+                             prediction_data,
+                             tree_num,
+                             per_tree_depths,
+                             hplanes[curr_tree].hplane_right,
+                             has_range_penalty);
+     }
+ }
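/* Aside (editorial sketch): each node of the extended ("hplane") model projects
   the row onto a linear combination of columns and compares against a single
   split point, i.e. it splits on sum_j coef[j] * (x[col[j]] - mean[j]). A dense
   toy version of that projection-and-split decision (hypothetical names, not
   the library's API): */
#include <cstddef>
#include <vector>

bool goes_left(const std::vector<double> &x,         /* one dense row */
               const std::vector<size_t> &col_num,   /* columns used by this node */
               const std::vector<double> &coef,
               const std::vector<double> &mean,      /* centering, as in the node above */
               double split_point)
{
    double comb_val = 0.;
    for (size_t j = 0; j < col_num.size(); j++)
        comb_val += coef[j] * (x[col_num[j]] - mean[j]);
    return comb_val <= split_point;
}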
+
+ template <class PredictionData>
+ void add_csc_range_penalty(WorkerForPredictCSC &workspace,
+                            PredictionData &prediction_data,
+                            double *restrict weights_arr,
+                            size_t col_num,
+                            double range_low,
+                            double range_high)
+ {
+     std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
+
+     size_t st_col = prediction_data.Xc_indptr[col_num];
+     size_t end_col = prediction_data.Xc_indptr[col_num + 1] - 1;
+     size_t curr_pos = st_col;
+     size_t ind_end_col = prediction_data.Xc_ind[end_col];
+     size_t *ptr_st = std::lower_bound(workspace.ix_arr.data() + workspace.st,
+                                       workspace.ix_arr.data() + workspace.end + 1,
+                                       prediction_data.Xc_ind[st_col]);
+
+     if (range_low <= 0 && range_high >= 0)
+     {
+         /* if zero lies inside the range, only the stored (non-zero) entries
+            can fall outside it, so penalize just those */
+         for (size_t *row = ptr_st;
+              row != workspace.ix_arr.data() + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
+             )
+         {
+             if (prediction_data.Xc_ind[curr_pos] == (decltype(*prediction_data.Xc_ind))(*row))
+             {
+                 if (likely(!std::isnan(prediction_data.Xc[curr_pos])
+                              &&
+                            (   prediction_data.Xc[curr_pos] < range_low ||
+                                prediction_data.Xc[curr_pos] > range_high   )))
+                 {
+                     workspace.depths[*row] -= (weights_arr == NULL)? 1. : weights_arr[*row];
+                 }
+
+                 if (row == workspace.ix_arr.data() + workspace.end || curr_pos == end_col) break;
+                 curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
+                                             prediction_data.Xc_ind + end_col + 1,
+                                             *(++row))
+                            - prediction_data.Xc_ind;
+             }
+
+             else
+             {
+                 if (prediction_data.Xc_ind[curr_pos] > (decltype(*prediction_data.Xc_ind))(*row))
+                     row = std::lower_bound(row + 1,
+                                            workspace.ix_arr.data() + workspace.end + 1,
+                                            prediction_data.Xc_ind[curr_pos]);
+                 else
+                     curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
+                                                 prediction_data.Xc_ind + end_col + 1,
+                                                 *row)
+                                - prediction_data.Xc_ind;
+             }
+         }
+     }
+
+     else
+     {
+         /* if zero lies outside the range, penalize every row upfront, then undo
+            the penalty for stored entries that fall inside the range (or are missing) */
+         if (likely(weights_arr == NULL))
+             for (size_t row = workspace.st; row <= workspace.end; row++)
+                 workspace.depths[workspace.ix_arr[row]]--;
+         else
+             for (size_t row = workspace.st; row <= workspace.end; row++)
+                 workspace.depths[workspace.ix_arr[row]] -= weights_arr[workspace.ix_arr[row]];
+
+         for (size_t *row = ptr_st;
+              row != workspace.ix_arr.data() + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
+             )
+         {
+             if (prediction_data.Xc_ind[curr_pos] == (decltype(*prediction_data.Xc_ind))(*row))
+             {
+                 if (likely(std::isnan(prediction_data.Xc[curr_pos])
+                              ||
+                            (   prediction_data.Xc[curr_pos] >= range_low &&
+                                prediction_data.Xc[curr_pos] <= range_high   )))
+                 {
+                     workspace.depths[*row] += (weights_arr == NULL)? 1. : weights_arr[*row];
+                 }
+
+                 if (row == workspace.ix_arr.data() + workspace.end || curr_pos == end_col) break;
+                 curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
+                                             prediction_data.Xc_ind + end_col + 1,
+                                             *(++row))
+                            - prediction_data.Xc_ind;
+             }
+
+             else
+             {
+                 if (prediction_data.Xc_ind[curr_pos] > (decltype(*prediction_data.Xc_ind))(*row))
+                     row = std::lower_bound(row + 1,
+                                            workspace.ix_arr.data() + workspace.end + 1,
+                                            prediction_data.Xc_ind[curr_pos]);
+                 else
+                     curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
+                                                 prediction_data.Xc_ind + end_col + 1,
+                                                 *row)
+                                - prediction_data.Xc_ind;
+             }
+         }
+     }
+ }
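/* Aside (editorial sketch): the loops above intersect two sorted sequences --
   the selected row indices and a CSC column's stored row indices -- by
   leapfrogging with std::lower_bound from whichever side is behind. The bare
   technique, stripped of the penalty bookkeeping: */
#include <algorithm>
#include <cstddef>
#include <vector>

/* visits every value present in both sorted vectors */
template <class Visit>
void intersect_sorted(const std::vector<size_t> &a, const std::vector<size_t> &b, Visit visit)
{
    const size_t *pa = a.data(), *ea = a.data() + a.size();
    const size_t *pb = b.data(), *eb = b.data() + b.size();
    while (pa != ea && pb != eb)
    {
        if (*pa == *pb)     { visit(*pa); pa++; pb++; }
        else if (*pa < *pb) pa = std::lower_bound(pa + 1, ea, *pb);   /* gallop forward */
        else                pb = std::lower_bound(pb + 1, eb, *pa);
    }
}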
+
+ template <class PredictionData>
+ double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num) noexcept
+ {
+     decltype(prediction_data.Xc_indptr)
+         search_res = std::lower_bound(prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num],
+                                       prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1],
+                                       row);
+     if (
+         search_res == (prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1])
+             ||
+         (*search_res) != static_cast<typename std::remove_pointer<decltype(search_res)>::type>(row)
+     )
+         return 0.;
+     else
+         return prediction_data.Xc[search_res - prediction_data.Xc_ind];
+ }
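/* Aside (editorial sketch): extract_spC is a single-element lookup in CSC
   format -- binary-search the row index within the column's slice
   Xc_ind[Xc_indptr[col] .. Xc_indptr[col+1]); a miss means an implicit zero.
   The same lookup on free-standing arrays:

   For the matrix [[0,1,0],[2,0,0],[0,0,3]] stored column-major:
   Xc = {2,1,3}, Xc_ind = {1,0,2}, Xc_indptr = {0,1,2,3};
   csc_at(Xc, Xc_ind, Xc_indptr, 0, 1) == 1. */
#include <algorithm>
#include <cstddef>

double csc_at(const double *Xc, const size_t *Xc_ind, const size_t *Xc_indptr,
              size_t row, size_t col)
{
    const size_t *st  = Xc_ind + Xc_indptr[col];
    const size_t *end = Xc_ind + Xc_indptr[col + 1];
    const size_t *res = std::lower_bound(st, end, row);   /* binary search for the row */
    return (res == end || *res != row)? 0. : Xc[res - Xc_ind];
}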
+
+ template <class PredictionData, class sparse_ix>
+ static inline double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num, size_t lb, size_t ub) noexcept
+ {
+     if (row_end == row_st || col_num < lb || col_num > ub)
+         return 0.;
+     sparse_ix *search_res = std::lower_bound(row_st, row_end, (sparse_ix) col_num);
+     if (search_res == row_end || *search_res != (sparse_ix)col_num)
+         return 0.;
+     else
+         return prediction_data.Xr[search_res - prediction_data.Xr_ind];
+ }
+
+ template <class PredictionData, class sparse_ix>
+ double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num) noexcept
+ {
+     if (row_end == row_st)
+         return 0.;
+     sparse_ix *search_res = std::lower_bound(row_st, row_end, (sparse_ix) col_num);
+     if (search_res == row_end || *search_res != (sparse_ix)col_num)
+         return 0.;
+     else
+         return prediction_data.Xr[search_res - prediction_data.Xr_ind];
+ }
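/* Aside (editorial sketch): the CSR counterpart searches a *column* index
   within one row's slice; the (lb, ub) overload first rejects columns outside
   the row's stored index range so it can skip the search entirely. Assuming the
   PredictionData struct exposes an Xr_indptr alongside the Xr/Xr_ind used
   above, a caller would carve out the row like this: */
#include <cstddef>

template <class PredictionData, class sparse_ix>
double row_value_sketch(PredictionData &prediction_data, size_t row, size_t col)
{
    /* slice of stored column indices belonging to this row */
    sparse_ix *row_st  = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
    sparse_ix *row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
    return extract_spR(prediction_data, row_st, row_end, col);
}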
+
+ template <class sparse_ix>
+ void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept
+ {
+     std::fill(n_terminal, n_terminal + model_outputs.trees.size(), 0);
+     #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
+     for (size_t_for tree = 0; tree < (decltype(tree))model_outputs.trees.size(); tree++)
+     {
+         n_nodes[tree] = model_outputs.trees[tree].size();
+         for (IsoTree &node : model_outputs.trees[tree])
+         {
+             n_terminal[tree] += (node.tree_left == 0);
+         }
+     }
+ }
+
+ template <class sparse_ix>
+ void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept
+ {
+     std::fill(n_terminal, n_terminal + model_outputs.hplanes.size(), 0);
+     #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
+     for (size_t_for hplane = 0; hplane < (decltype(hplane))model_outputs.hplanes.size(); hplane++)
+     {
+         n_nodes[hplane] = model_outputs.hplanes[hplane].size();
+         for (IsoHPlane &node : model_outputs.hplanes[hplane])
+         {
+             n_terminal[hplane] += (node.hplane_left == 0);
+         }
+     }
+ }
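/* Aside (editorial sketch): a typical use of get_num_nodes is sizing output
   buffers before exporting trees. Since every non-terminal node here has
   exactly two children, n_terminal should come out as (n_nodes + 1) / 2 for
   each tree. Hypothetical caller: */
#include <cstddef>
#include <vector>

void count_nodes_sketch(IsoForest &model)
{
    std::vector<size_t> n_nodes(model.trees.size()), n_terminal(model.trees.size());
    get_num_nodes(model, n_nodes.data(), n_terminal.data(), 1 /* nthreads */);
    /* expected for each tree t: n_terminal[t] == (n_nodes[t] + 1) / 2 */
}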