isotree 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +25 -0
- data/README.md +111 -0
- data/ext/isotree/ext.cpp +178 -0
- data/ext/isotree/extconf.rb +21 -0
- data/lib/isotree.rb +9 -0
- data/lib/isotree/isolation_forest.rb +94 -0
- data/lib/isotree/version.rb +3 -0
- data/vendor/isotree/LICENSE +25 -0
- data/vendor/isotree/README.md +167 -0
- data/vendor/isotree/src/Makevars +4 -0
- data/vendor/isotree/src/RcppExports.cpp +267 -0
- data/vendor/isotree/src/Rwrapper.cpp +762 -0
- data/vendor/isotree/src/crit.cpp +912 -0
- data/vendor/isotree/src/dealloc.cpp +66 -0
- data/vendor/isotree/src/dist.cpp +749 -0
- data/vendor/isotree/src/extended.cpp +790 -0
- data/vendor/isotree/src/fit_model.cpp +1068 -0
- data/vendor/isotree/src/helpers_iforest.cpp +309 -0
- data/vendor/isotree/src/impute.cpp +1205 -0
- data/vendor/isotree/src/isoforest.cpp +771 -0
- data/vendor/isotree/src/isotree.hpp +929 -0
- data/vendor/isotree/src/merge_models.cpp +116 -0
- data/vendor/isotree/src/mult.cpp +607 -0
- data/vendor/isotree/src/predict.cpp +849 -0
- data/vendor/isotree/src/serialize.cpp +262 -0
- data/vendor/isotree/src/utils.cpp +1574 -0
- metadata +154 -0
data/vendor/isotree/src/predict.cpp
@@ -0,0 +1,849 @@
/* Isolation forests and variations thereof, with adjustments for incorporation
*     of categorical variables and missing values.
*     Written for C++11 standard and aimed at being used in R and Python.
*
*     This library is based on the following works:
*     [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*         "Isolation forest."
*         2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
*     [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*         "Isolation-based anomaly detection."
*         ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
*     [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
*         "Extended Isolation Forest."
*         arXiv preprint arXiv:1811.02141 (2018).
*     [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*         "On detecting clustered anomalies using SCiForest."
*         Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
*     [5] https://sourceforge.net/projects/iforest/
*     [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
*     [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
*     [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
*     [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
*
*     BSD 2-Clause License
*     Copyright (c) 2019, David Cortes
*     All rights reserved.
*     Redistribution and use in source and binary forms, with or without
*     modification, are permitted provided that the following conditions are met:
*     * Redistributions of source code must retain the above copyright notice, this
*       list of conditions and the following disclaimer.
*     * Redistributions in binary form must reproduce the above copyright notice,
*       this list of conditions and the following disclaimer in the documentation
*       and/or other materials provided with the distribution.
*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
*     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
*     FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*     DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*     SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*     OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "isotree.hpp"

/* Predict outlier score, average depth, or terminal node numbers
*
* Parameters
* ==========
* - numeric_data[nrows * ncols_numeric]
*       Pointer to numeric data for which to make predictions. Must be ordered by columns like Fortran,
*       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
*       and the column order must be the same as in the data that was used to fit the model.
*       Pass NULL if there are no dense numeric columns.
*       Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
* - categ_data[nrows * ncols_categ]
*       Pointer to categorical data for which to make predictions. Must be ordered by columns like Fortran,
*       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
*       and the column order must be the same as in the data that was used to fit the model.
*       Pass NULL if there are no categorical columns.
*       Each category should be represented as an integer, and these integers must start at zero and
*       be in consecutive order - i.e. if category '3' is present, category '2' must have also been
*       present when the model was fit (note that they are not treated as being ordinal, this is just
*       an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
*       must be the same as was used in the data to which the model was fit.
* - Xc[nnz]
*       Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
*       Pass NULL if there are no sparse numeric columns.
*       Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
* - Xc_ind[nnz]
*       Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
*       Pass NULL if there are no sparse numeric columns in CSC format.
* - Xc_indptr[ncols_numeric + 1]
*       Pointer to column index pointers that tell at entry [col] where column 'col'
*       starts and at entry [col + 1] where column 'col' ends.
*       Pass NULL if there are no sparse numeric columns in CSC format.
* - Xr[nnz]
*       Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
*       Pass NULL if there are no sparse numeric columns.
*       Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
* - Xr_ind[nnz]
*       Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
*       Pass NULL if there are no sparse numeric columns in CSR format.
* - Xr_indptr[nrows + 1]
*       Pointer to row index pointers that tell at entry [row] where row 'row'
*       starts and at entry [row + 1] where row 'row' ends.
*       Pass NULL if there are no sparse numeric columns in CSR format.
* - nrows
*       Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
* - nthreads
*       Number of parallel threads to use. Note that the more threads used, the more memory will be
*       allocated, even if a thread does not end up being used. Ignored when not building with
*       OpenMP support.
* - standardize
*       Whether to standardize the average depths for each row according to their relative magnitude
*       compared to the expected average, in order to obtain an outlier score. If passing 'false',
*       will output the average depth instead.
*       Ignored when not passing 'output_depths'.
* - model_outputs
*       Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
*       if the predictions are to be made from an extended model. Can only pass one of
*       'model_outputs' and 'model_outputs_ext'.
* - model_outputs_ext
*       Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
*       if the predictions are to be made from a single-variable model. Can only pass one of
*       'model_outputs' and 'model_outputs_ext'.
* - output_depths[nrows] (out)
*       Pointer to array where the output average depths or outlier scores will be written into
*       (the return type is controlled according to parameter 'standardize').
*       Must already be initialized to zeros. Must also be passed when the desired output
*       is terminal node numbers.
* - tree_num[nrows * ntrees] (out)
*       Pointer to array where the output terminal node numbers will be written into.
*       Note that the mapping between tree node and terminal tree node is not stored in
*       the model object for efficiency reasons, so this mapping will be determined on-the-fly
*       when passing this parameter, and as such, there will be some overhead regardless of
*       the actual number of rows. Pass NULL if only average depths or outlier scores are desired.
*/
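
For orientation, a minimal sketch (not part of the package) of a call to the function defined below, under the simplest configuration: dense column-major numeric data only, fetching standardized outlier scores from a single-variable model previously fit through 'fit_iforest'. The wrapper name 'score_rows' and the variable 'model' are illustrative only.

#include <cstddef>
#include <vector>

std::vector<double> score_rows(IsoForest &model, double numeric_data[], size_t nrows)
{
    std::vector<double> scores(nrows, 0.);  /* must start initialized to zeros */
    predict_iforest(numeric_data, NULL,     /* no categorical columns */
                    NULL, NULL, NULL,       /* no CSC inputs */
                    NULL, NULL, NULL,       /* no CSR inputs */
                    nrows, 1, true,         /* one thread, standardized scores */
                    &model, NULL,           /* single-variable model, no extended model */
                    scores.data(), NULL);   /* no terminal node numbers */
    return scores;
}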
void predict_iforest(double numeric_data[], int categ_data[],
                     double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
                     double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
                     size_t nrows, int nthreads, bool standardize,
                     IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
                     double output_depths[], sparse_ix tree_num[])
{
    /* put data in a struct for passing it in fewer lines */
    PredictionData prediction_data = {numeric_data, categ_data, nrows,
                                      Xc, Xc_ind, Xc_indptr,
                                      Xr, Xr_ind, Xr_indptr};

    if ((size_t)nthreads > nrows)
        nthreads = nrows;

    if (model_outputs != NULL)
    {
        if (
            model_outputs->missing_action == Fail &&
            (model_outputs->new_cat_action != Weighted || prediction_data.categ_data == NULL) &&
            prediction_data.Xc == NULL && prediction_data.Xr == NULL
            )
        {
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs, prediction_data, output_depths, tree_num)
            for (size_t_for row = 0; row < nrows; row++)
            {
                for (std::vector<IsoTree> &tree : model_outputs->trees)
                {
                    traverse_itree_no_recurse(tree,
                                              *model_outputs,
                                              prediction_data,
                                              output_depths[row],
                                              (tree_num == NULL)? NULL : tree_num + nrows * (&tree - &(model_outputs->trees[0])),
                                              (size_t) row);
                }
            }
        }

        else
        {
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs, prediction_data, output_depths, tree_num)
            for (size_t_for row = 0; row < nrows; row++)
            {
                for (std::vector<IsoTree> &tree : model_outputs->trees)
                {
                    output_depths[row] += traverse_itree(tree,
                                                         *model_outputs,
                                                         prediction_data,
                                                         NULL, NULL, 0,
                                                         (size_t) row,
                                                         (tree_num == NULL)? NULL : tree_num + nrows * (&tree - &(model_outputs->trees[0])),
                                                         (size_t) 0);
                }
            }
        }
    }


    else
    {
        if (
            model_outputs_ext->missing_action == Fail &&
            prediction_data.categ_data == NULL &&
            prediction_data.Xc == NULL &&
            prediction_data.Xr == NULL
            )
        {
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num)
            for (size_t_for row = 0; row < nrows; row++)
            {
                for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
                {
                    traverse_hplane_fast(hplane,
                                         *model_outputs_ext,
                                         prediction_data,
                                         output_depths[row],
                                         (tree_num == NULL)? NULL : tree_num + nrows * (&hplane - &(model_outputs_ext->hplanes[0])),
                                         (size_t) row);
                }
            }
        }

        else
        {
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num)
            for (size_t_for row = 0; row < nrows; row++)
            {
                for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
                {
                    traverse_hplane(hplane,
                                    *model_outputs_ext,
                                    prediction_data,
                                    output_depths[row],
                                    NULL, NULL,
                                    (tree_num == NULL)? NULL : tree_num + nrows * (&hplane - &(model_outputs_ext->hplanes[0])),
                                    (size_t) row);
                }
            }
        }
    }

    /* translate sum-of-depths to outlier score */
    double ntrees, depth_divisor;
    if (model_outputs != NULL)
    {
        ntrees = (double) model_outputs->trees.size();
        depth_divisor = ntrees * (model_outputs->exp_avg_depth);
    }

    else
    {
        ntrees = (double) model_outputs_ext->hplanes.size();
        depth_divisor = ntrees * (model_outputs_ext->exp_avg_depth);
    }

    if (standardize)
        #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, output_depths, depth_divisor)
        for (size_t_for row = 0; row < nrows; row++)
            output_depths[row] = exp2( - output_depths[row] / depth_divisor );
    else
        #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, output_depths, ntrees)
        for (size_t_for row = 0; row < nrows; row++)
            output_depths[row] /= ntrees;


    /* re-map tree numbers to start at zero (if predicting tree numbers) */
    /* Note: usually this type of 'prediction' is not required,
       thus this mapping is not stored in the model objects so as to
       save memory */
    if (tree_num != NULL)
        remap_terminal_trees(model_outputs, model_outputs_ext,
                             prediction_data, tree_num, nthreads);
}
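
At the point where the sum-of-depths translation runs, output_depths[row] holds the accumulated isolation depth over all trees, so the standardized branch computes 2^(-(sum of depths) / (ntrees * exp_avg_depth)), which is the anomaly score s(x, n) = 2^(-E[h(x)] / c(n)) of reference [2], with exp_avg_depth playing the role of c(n), the expected average depth for the sample size used at fitting time. A row whose average depth equals that expectation scores exactly 0.5; rows isolated unusually early push the score toward 1, and rows isolated unusually late push it toward 0.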


void traverse_itree_no_recurse(std::vector<IsoTree>  &tree,
                               IsoForest             &model_outputs,
                               PredictionData        &prediction_data,
                               double                &output_depth,
                               sparse_ix *restrict   tree_num,
                               size_t                row)
{
    size_t curr_lev = 0;
    double xval;
    while (true)
    {
        if (tree[curr_lev].score > 0)
        {
            output_depth += tree[curr_lev].score;
            if (tree_num != NULL)
                tree_num[row] = curr_lev;
            break;
        }

        else
        {
            switch(tree[curr_lev].col_type)
            {
                case Numeric:
                {
                    xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
                    /* the range penalty refers to the node at which the split happens,
                       so it must be added before descending to the child node */
                    output_depth += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
                    curr_lev = (xval <= tree[curr_lev].num_split)?
                                tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                    break;
                }

                case Categorical:
                {
                    switch(model_outputs.cat_split_type)
                    {
                        case SubSet:
                        {

                            if (!tree[curr_lev].cat_split.size()) /* this is for binary columns */
                            {
                                if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] <= 1)
                                {
                                    curr_lev = (
                                                prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                    == 0
                                                )?
                                               tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                }

                                else /* can only work with 'Smallest' + no NAs if reaching this point */
                                {
                                    curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                }
                            }

                            else
                            {

                                switch(model_outputs.new_cat_action)
                                {
                                    case Random:
                                    {
                                        curr_lev = (tree[curr_lev].cat_split[
                                                        prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                                            ]
                                                    )?
                                                   tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                        break;
                                    }

                                    case Smallest:
                                    {
                                        if (
                                            prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                >= (int)tree[curr_lev].cat_split.size()
                                            )
                                        {
                                            curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                        }

                                        else
                                        {
                                            curr_lev = (tree[curr_lev].cat_split[
                                                            prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                                                ]
                                                        )?
                                                       tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                        }
                                        break;
                                    }
                                }
                            }
                            break;
                        }

                        case SingleCateg:
                        {
                            curr_lev = (
                                        prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                            ==
                                        tree[curr_lev].chosen_cat
                                        )?
                                       tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                            break;
                        }
                    }
                    break;
                }
            }
        }
    }
}
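
The function above walks a tree stored as a flat vector: a positive 'score' marks a terminal node (and carries its depth estimate), internal nodes store the indices of their children, and the +1 added when a value falls outside [range_low, range_high] is the SCiForest-style range penalty of reference [4]. Stripped of the categorical and penalty logic, the traversal pattern reduces to the following sketch (toy types and names, not part of the package):

#include <cstddef>
#include <vector>

struct ToyNode { double score; size_t tree_left, tree_right; size_t col; double split; };

double toy_depth(const std::vector<ToyNode> &tree, const double x[])
{
    size_t curr_lev = 0;
    while (tree[curr_lev].score <= 0)                               /* internal node */
        curr_lev = (x[tree[curr_lev].col] <= tree[curr_lev].split)? /* descend one level */
                    tree[curr_lev].tree_left : tree[curr_lev].tree_right;
    return tree[curr_lev].score;                                    /* terminal node */
}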


double traverse_itree(std::vector<IsoTree>     &tree,
                      IsoForest                &model_outputs,
                      PredictionData           &prediction_data,
                      std::vector<ImputeNode>  *impute_nodes,     /* only when imputing missing */
                      ImputedData              *imputed_data,     /* only when imputing missing */
                      double                   curr_weight,       /* only when imputing missing */
                      size_t                   row,
                      sparse_ix *restrict      tree_num,
                      size_t                   curr_lev)
{
    double xval;
    double range_penalty = 0;

    sparse_ix *row_st = NULL, *row_end = NULL;
    if (prediction_data.Xr != NULL)
    {
        row_st  = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
        row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
    }

    while (true)
    {
        if (tree[curr_lev].score >= 0.)
        {
            if (tree_num != NULL)
                tree_num[row] = curr_lev;
            if (imputed_data != NULL)
                add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, curr_weight);

            return tree[curr_lev].score + range_penalty;
        }

        else
        {
            switch(tree[curr_lev].col_type)
            {
                case Numeric:
                {

                    if (prediction_data.Xc == NULL && prediction_data.Xr == NULL)
                        xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
                    else if (row_st != NULL)
                        xval = extract_spR(prediction_data, row_st, row_end, tree[curr_lev].col_num);
                    else
                        xval = extract_spC(prediction_data, row, tree[curr_lev].col_num);

                    if (isnan(xval))
                    {
                        switch(model_outputs.missing_action)
                        {
                            case Divide:
                            {
                                return
                                    tree[curr_lev].pct_tree_left
                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                         impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
                                                         row, NULL, tree[curr_lev].tree_left)
                                    + (1 - tree[curr_lev].pct_tree_left)
                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                         impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
                                                         row, NULL, tree[curr_lev].tree_right)
                                    + range_penalty;
                            }

                            case Impute:
                            {
                                curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
                                            tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                break;
                            }

                            case Fail:
                            {
                                return NAN;
                            }
                        }
                    }

                    else
                    {
                        /* as in the non-recursive version, the range penalty belongs to the
                           node at which the split happens, so add it before moving down */
                        range_penalty += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
                        curr_lev = (xval <= tree[curr_lev].num_split)?
                                    tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                    }
                    break;
                }

                case Categorical:
                {

                    if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] < 0)
                    {
                        switch(model_outputs.missing_action)
                        {
                            case Divide:
                            {
                                return
                                    tree[curr_lev].pct_tree_left
                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                         impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
                                                         row, NULL, tree[curr_lev].tree_left)
                                    + (1 - tree[curr_lev].pct_tree_left)
                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                         impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
                                                         row, NULL, tree[curr_lev].tree_right)
                                    + range_penalty;
                            }

                            case Impute:
                            {
                                curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
                                            tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                break;
                            }

                            case Fail:
                            {
                                return NAN;
                            }
                        }
                    }

                    else
                    {
                        switch(model_outputs.cat_split_type)
                        {
                            case SingleCateg:
                            {
                                curr_lev = (
                                            prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                ==
                                            tree[curr_lev].chosen_cat
                                            )?
                                           tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                break;
                            }

                            case SubSet:
                            {

                                if (!tree[curr_lev].cat_split.size())
                                {
                                    if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] <= 1)
                                    {
                                        curr_lev = (
                                                    prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                        == 0
                                                    )?
                                                   tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                    }

                                    else
                                    {
                                        switch(model_outputs.new_cat_action)
                                        {
                                            case Smallest:
                                            {
                                                curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                                break;
                                            }

                                            case Weighted:
                                            {
                                                return
                                                    tree[curr_lev].pct_tree_left
                                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                                         impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
                                                                         row, NULL, tree[curr_lev].tree_left)
                                                    + (1 - tree[curr_lev].pct_tree_left)
                                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                                         impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
                                                                         row, NULL, tree[curr_lev].tree_right)
                                                    + range_penalty;
                                            }
                                        }
                                    }
                                }

                                else
                                {
                                    switch(model_outputs.new_cat_action)
                                    {
                                        case Random:
                                        {
                                            curr_lev = (tree[curr_lev].cat_split[
                                                            prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                                                ]
                                                        )?
                                                       tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                            break;
                                        }

                                        case Smallest:
                                        {
                                            if (
                                                prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                    >= (int)tree[curr_lev].cat_split.size()
                                                )
                                            {
                                                curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                            }

                                            else
                                            {
                                                curr_lev = (tree[curr_lev].cat_split[
                                                                prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                                                    ]
                                                            )?
                                                           tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                            }
                                            break;
                                        }

                                        case Weighted:
                                        {
                                            if (
                                                prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                    >= (int)tree[curr_lev].cat_split.size()
                                                ||
                                                tree[curr_lev].cat_split[
                                                    prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                                        ]
                                                    == (-1)
                                                )
                                            {
                                                return
                                                    tree[curr_lev].pct_tree_left
                                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                                         impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
                                                                         row, NULL, tree[curr_lev].tree_left)
                                                    + (1 - tree[curr_lev].pct_tree_left)
                                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                                         impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
                                                                         row, NULL, tree[curr_lev].tree_right)
                                                    + range_penalty;
                                            }

                                            else
                                            {
                                                curr_lev = (tree[curr_lev].cat_split[
                                                                prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
                                                                                    ]
                                                            )?
                                                           tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                            }
                                            break;
                                        }
                                    }
                                }
                                break;
                            }
                        }
                    }
                    break;
                }
            }
        }
    }
}
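
Worth noting in the function above is the 'Divide' handling of missing values: instead of committing to one branch, it recurses into both children and returns the weighted average pct_tree_left * depth(left) + (1 - pct_tree_left) * depth(right), i.e. the expected isolation depth under the split proportions observed during fitting. The recursive calls pass NULL for tree_num, since a row that flows down both branches has no single terminal node to report.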

/* this is a simpler version for situations in which there is
   only numeric data in dense arrays and no missing values */
void traverse_hplane_fast(std::vector<IsoHPlane>  &hplane,
                          ExtIsoForest            &model_outputs,
                          PredictionData          &prediction_data,
                          double                  &output_depth,
                          sparse_ix *restrict     tree_num,
                          size_t                  row)
{
    size_t curr_lev = 0;
    double hval;

    while(true)
    {
        if (hplane[curr_lev].score > 0)
        {
            output_depth += hplane[curr_lev].score;
            if (tree_num != NULL)
                tree_num[row] = curr_lev;
            return;
        }

        else
        {
            hval = 0;
            for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
                hval += (prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows]
                         - hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
        }

        output_depth += (hval < hplane[curr_lev].range_low) ||
                        (hval > hplane[curr_lev].range_high);
        curr_lev = (hval <= hplane[curr_lev].split_point)?
                    hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
    }
}
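
Each node of the extended ('hplane') model reduces a row to a single scalar through a centered random linear combination, then routes it left or right by comparing that scalar against the node's split point. In isolation, the projection looks like the following sketch (toy signature; in the real structs the coefficients live in hplane[curr_lev].coef and hplane[curr_lev].mean):

#include <cstddef>
#include <vector>

double toy_project(const double x[], const std::vector<double> &coef,
                   const std::vector<double> &mean)
{
    double hval = 0;
    for (size_t col = 0; col < coef.size(); col++)
        hval += (x[col] - mean[col]) * coef[col];  /* centered linear combination */
    return hval;  /* goes left if <= split_point, right otherwise */
}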

/* this is the full version that works with potentially missing values, sparse matrices, and categoricals */
void traverse_hplane(std::vector<IsoHPlane>   &hplane,
                     ExtIsoForest             &model_outputs,
                     PredictionData           &prediction_data,
                     double                   &output_depth,
                     std::vector<ImputeNode>  *impute_nodes,     /* only when imputing missing */
                     ImputedData              *imputed_data,     /* only when imputing missing */
                     sparse_ix *restrict      tree_num,
                     size_t                   row)
{
    size_t curr_lev = 0;
    double xval;
    int    cval;
    double hval;

    size_t ncols_numeric, ncols_categ;

    sparse_ix *row_st = NULL, *row_end = NULL;
    if (prediction_data.Xr != NULL)
    {
        row_st  = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
        row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
    }

    while(true)
    {
        if (hplane[curr_lev].score > 0)
        {
            output_depth += hplane[curr_lev].score;
            if (tree_num != NULL)
                tree_num[row] = curr_lev;
            if (imputed_data != NULL)
            {
                add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, (double)1);
            }
            return;
        }

        else
        {
            hval = 0;
            ncols_numeric = 0; ncols_categ = 0;
            for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
            {
                switch(hplane[curr_lev].col_type[col])
                {
                    case Numeric:
                    {
                        if (prediction_data.Xc == NULL && prediction_data.Xr == NULL)
                            xval = prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
                        else if (row_st != NULL)
                            xval = extract_spR(prediction_data, row_st, row_end, hplane[curr_lev].col_num[col]);
                        else
                            xval = extract_spC(prediction_data, row, hplane[curr_lev].col_num[col]);

                        if (is_na_or_inf(xval))
                        {
                            if (model_outputs.missing_action != Fail)
                            {
                                hval += hplane[curr_lev].fill_val[col];
                            }

                            else
                            {
                                output_depth = NAN;
                                return;
                            }
                        }

                        else
                        {
                            hval += (xval - hplane[curr_lev].mean[ncols_numeric]) * hplane[curr_lev].coef[ncols_numeric];
                        }

                        ncols_numeric++;
                        break;
                    }

                    case Categorical:
                    {
                        cval = prediction_data.categ_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
                        if (cval < 0)
                        {
                            if (model_outputs.missing_action != Fail)
                            {
                                hval += hplane[curr_lev].fill_val[col];
                            }

                            else
                            {
                                output_depth = NAN;
                                return;
                            }
                        }

                        else
                        {
                            switch(model_outputs.cat_split_type)
                            {
                                case SingleCateg:
                                {
                                    hval += (cval == hplane[curr_lev].chosen_cat[ncols_categ])? hplane[curr_lev].fill_new[ncols_categ] : 0;
                                    break;
                                }

                                case SubSet:
                                {
                                    if (cval >= (int)hplane[curr_lev].cat_coef[ncols_categ].size())
                                        hval += hplane[curr_lev].fill_new[ncols_categ];
                                    else
                                        hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
                                    break;
                                }
                            }
                        }

                        ncols_categ++;
                        break;
                    }
                }

            }

            output_depth += (hval < hplane[curr_lev].range_low) ||
                            (hval > hplane[curr_lev].range_high);
            curr_lev = (hval <= hplane[curr_lev].split_point)?
                        hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
        }
    }
}

double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num)
{
    sparse_ix *search_res = std::lower_bound(prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num],
                                             prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1],
                                             (sparse_ix) row);
    if (
        search_res == (prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1])
            ||
        *search_res != row
        )
        return 0;
    else
        return prediction_data.Xc[search_res - prediction_data.Xc_ind];
}

double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num)
{
    sparse_ix *search_res = std::lower_bound(row_st, row_end, (sparse_ix) col_num);
    if (search_res == row_end || *search_res != (sparse_ix)col_num)
        return 0;
    else
        return prediction_data.Xr[search_res - prediction_data.Xr_ind];
}
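
Both extractors depend on the sparse index arrays being sorted within each column (CSC) or row (CSR), which is what lets std::lower_bound locate an entry in logarithmic time; an index absent from the slice is an implicit zero. A worked example of the CSC layout assumed by extract_spC, for the hypothetical 3x2 matrix {{1, 0}, {0, 2}, {3, 4}}:

double    Xc[]        = {1., 3., 2., 4.};  /* non-zero values, column by column */
sparse_ix Xc_ind[]    = {0, 2, 1, 2};      /* row index of each non-zero */
sparse_ix Xc_indptr[] = {0, 2, 4};         /* column j spans [Xc_indptr[j], Xc_indptr[j+1]) */

Looking up (row 1, column 0) searches Xc_ind[0..2) = {0, 2}, finds no exact match, and returns the implicit 0; looking up (row 2, column 1) searches Xc_ind[2..4) = {1, 2}, matches at position 3, and returns Xc[3] = 4.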

void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads)
{
    std::fill(n_terminal, n_terminal + model_outputs.trees.size(), 0);
    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
    for (size_t_for tree = 0; tree < model_outputs.trees.size(); tree++)
    {
        n_nodes[tree] = model_outputs.trees[tree].size();
        for (IsoTree &node : model_outputs.trees[tree])
        {
            n_terminal[tree] += (node.score > 0);
        }
    }
}

void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads)
{
    std::fill(n_terminal, n_terminal + model_outputs.hplanes.size(), 0);
    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
    for (size_t_for hplane = 0; hplane < model_outputs.hplanes.size(); hplane++)
    {
        n_nodes[hplane] = model_outputs.hplanes[hplane].size();
        for (IsoHPlane &node : model_outputs.hplanes[hplane])
        {
            n_terminal[hplane] += (node.score > 0);
        }
    }
}
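
A possible use of these counters, sketched under the assumption of a fitted single-variable model held in an illustrative variable named 'model':

#include <vector>

std::vector<sparse_ix> n_nodes(model.trees.size());
std::vector<sparse_ix> n_terminal(model.trees.size());
get_num_nodes(model, n_nodes.data(), n_terminal.data(), 1 /* nthreads */);

Since isolation trees here are strictly binary, n_terminal[t] should come out as (n_nodes[t] + 1) / 2 for every tree t.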