isotree 0.1.0

@@ -0,0 +1,849 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+ /* Predict outlier score, average depth, or terminal node numbers
48
+ *
49
+ * Parameters
50
+ * ==========
51
+ * - numeric_data[nrows * ncols_numeric]
52
+ * Pointer to numeric data for which to make predictions. Must be ordered by columns like Fortran,
53
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
54
+ * and the column order must be the same as in the data that was used to fit the model (see the layout example after this comment block),
55
+ * Pass NULL if there are no dense numeric columns.
56
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
57
+ * - categ_data[nrows * ncols_categ]
58
+ * Pointer to categorical data for which to make predictions. Must be ordered by columns like Fortran,
59
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
60
+ * and the column order must be the same as in the data that was used to fit the model.
61
+ * Pass NULL if there are no categorical columns.
62
+ * Each category should be represented as an integer, and these integers must start at zero and
63
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
64
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
65
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
66
+ * must be the same as was used in the data to which the model was fit.
67
+ * - Xc[nnz]
68
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
69
+ * Pass NULL if there are no sparse numeric columns.
70
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
71
+ * - Xc_ind[nnz]
72
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
73
+ * Pass NULL if there are no sparse numeric columns in CSC format.
74
+ * - Xc_indptr[ncols_numeric + 1]
75
+ * Pointer to column index pointers indicating at entry [col] where column 'col'
76
+ * starts and at entry [col + 1] where it ends.
77
+ * Pass NULL if there are no sparse numeric columns in CSC format.
78
+ * - Xr[nnz]
79
+ * Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
80
+ * Pass NULL if there are no sparse numeric columns.
81
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
82
+ * - Xr_ind[nnz]
83
+ * Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
84
+ * Pass NULL if there are no sparse numeric columns in CSR format.
85
+ * - Xr_indptr[nrows + 1]
86
+ * Pointer to row index pointers indicating at entry [row] where row 'row'
87
+ * starts and at entry [row + 1] where it ends.
88
+ * Pass NULL if there are no sparse numeric columns in CSR format.
89
+ * - nrows
90
+ * Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
91
+ * - nthreads
92
+ * Number of parallel threads to use. Note that the more threads that are used, the more memory will be
93
+ * allocated, even if some threads do not end up being used. Ignored when not building with
94
+ * OpenMP support.
95
+ * - standardize
96
+ * Whether to standardize the average depths for each row according to their relative magnitude
97
+ * compared to the expected average, in order to obtain an outlier score. If passing 'false',
98
+ * will output the average depth instead.
99
+ * Ignored when not passing 'output_depths'.
100
+ * - model_outputs
101
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
102
+ * if the predictions are to be made from an extended model. Can only pass one of
103
+ * 'model_outputs' and 'model_outputs_ext'.
104
+ * - model_outputs_ext
105
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
106
+ * if the predictions are to be made from a single-variable model. Can only pass one of
107
+ * 'model_outputs' and 'model_outputs_ext'.
108
+ * - output_depths[nrows] (out)
109
+ * Pointer to array where the output average depths or outlier scores will be written into
110
+ * (which of the two is written is controlled by parameter 'standardize').
111
+ * Must already be initialized to zeros. Must also be passed even when the desired output
112
+ * is terminal node numbers.
113
+ * - tree_num[nrows * ntrees] (out)
114
+ * Pointer to array where the output terminal node numbers will be written into.
115
+ * Note that the mapping between tree nodes and terminal node numbers is not stored in
116
+ * the model object for efficiency reasons, so this mapping will be determined on-the-fly
117
+ * when passing this parameter, and as such, there will be some overhead regardless of
118
+ * the actual number of rows. Pass NULL if only average depths or outlier scores are desired.
119
+ */
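/* Editor's note: a small illustration (not part of the original file) of the input layouts
   described in the parameter list above, using made-up values.

   Column-major (Fortran-order) layout for a 3-row, 2-column numeric matrix:

       logical matrix            numeric_data[] in memory
       row 0:  1.0   4.0         {1.0, 2.0, 3.0,   4.0, 5.0, 6.0}
       row 1:  2.0   5.0           \_ column 0 _/   \_ column 1 _/
       row 2:  3.0   6.0

   so element (row, col) is read as numeric_data[row + col * nrows], which is the indexing
   used throughout this file.

   Categorical columns use consecutive integer codes starting at zero, e.g.
   {"red", "green", "blue"} -> {0, 1, 2}, with missing values encoded as a negative
   number such as -1, matching the encoding used when the model was fit.

   The same 3x2 matrix, if it had zeros at (1,0) and (2,1), would look like this in CSC form:
       Xc        = {1.0, 3.0, 4.0, 5.0}   non-zero values, column by column
       Xc_ind    = {0,   2,   0,   1  }   row index of each stored value
       Xc_indptr = {0, 2, 4}              column 'col' spans Xc[Xc_indptr[col] .. Xc_indptr[col+1]) */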
120
+ void predict_iforest(double numeric_data[], int categ_data[],
121
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
122
+ double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
123
+ size_t nrows, int nthreads, bool standardize,
124
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
125
+ double output_depths[], sparse_ix tree_num[])
126
+ {
127
+ /* put data in a struct for passing it in fewer lines */
128
+ PredictionData prediction_data = {numeric_data, categ_data, nrows,
129
+ Xc, Xc_ind, Xc_indptr,
130
+ Xr, Xr_ind, Xr_indptr};
131
+
132
+ if ((size_t)nthreads > nrows)
133
+ nthreads = nrows;
134
+
135
+ if (model_outputs != NULL)
136
+ {
137
+ if (
138
+ model_outputs->missing_action == Fail &&
139
+ (model_outputs->new_cat_action != Weighted || prediction_data.categ_data == NULL) &&
140
+ prediction_data.Xc == NULL && prediction_data.Xr == NULL
141
+ )
142
+ {
143
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs, prediction_data, output_depths, tree_num)
144
+ for (size_t_for row = 0; row < nrows; row++)
145
+ {
146
+ for (std::vector<IsoTree> &tree : model_outputs->trees)
147
+ {
148
+ traverse_itree_no_recurse(tree,
149
+ *model_outputs,
150
+ prediction_data,
151
+ output_depths[row],
152
+ (tree_num == NULL)? NULL : tree_num + nrows * (&tree - &(model_outputs->trees[0])),
153
+ (size_t) row);
154
+ }
155
+ }
156
+ }
157
+
158
+ else
159
+ {
160
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs, prediction_data, output_depths, tree_num)
161
+ for (size_t_for row = 0; row < nrows; row++)
162
+ {
163
+ for (std::vector<IsoTree> &tree : model_outputs->trees)
164
+ {
165
+ output_depths[row] += traverse_itree(tree,
166
+ *model_outputs,
167
+ prediction_data,
168
+ NULL, NULL, 0,
169
+ (size_t) row,
170
+ (tree_num == NULL)? NULL : tree_num + nrows * (&tree - &(model_outputs->trees[0])),
171
+ (size_t) 0);
172
+ }
173
+ }
174
+ }
175
+ }
176
+
177
+
178
+ else
179
+ {
180
+ if (
181
+ model_outputs_ext->missing_action == Fail &&
182
+ prediction_data.categ_data == NULL &&
183
+ prediction_data.Xc == NULL &&
184
+ prediction_data.Xr == NULL
185
+ )
186
+ {
187
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num)
188
+ for (size_t_for row = 0; row < nrows; row++)
189
+ {
190
+ for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
191
+ {
192
+ traverse_hplane_fast(hplane,
193
+ *model_outputs_ext,
194
+ prediction_data,
195
+ output_depths[row],
196
+ (tree_num == NULL)? NULL : tree_num + nrows * (&hplane - &(model_outputs_ext->hplanes[0])),
197
+ (size_t) row);
198
+ }
199
+ }
200
+ }
201
+
202
+ else
203
+ {
204
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num)
205
+ for (size_t_for row = 0; row < nrows; row++)
206
+ {
207
+ for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
208
+ {
209
+ traverse_hplane(hplane,
210
+ *model_outputs_ext,
211
+ prediction_data,
212
+ output_depths[row],
213
+ NULL, NULL,
214
+ (tree_num == NULL)? NULL : tree_num + nrows * (&hplane - &(model_outputs_ext->hplanes[0])),
215
+ (size_t) row);
216
+ }
217
+ }
218
+ }
219
+ }
220
+
221
+ /* translate sum-of-depths to outlier score */
222
+ double ntrees, depth_divisor;
223
+ if (model_outputs != NULL)
224
+ {
225
+ ntrees = (double) model_outputs->trees.size();
226
+ depth_divisor = ntrees * (model_outputs->exp_avg_depth);
227
+ }
228
+
229
+ else
230
+ {
231
+ ntrees = (double) model_outputs_ext->hplanes.size();
232
+ depth_divisor = ntrees * (model_outputs_ext->exp_avg_depth);
233
+ }
234
+
235
+ if (standardize)
236
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, output_depths, depth_divisor)
237
+ for (size_t_for row = 0; row < nrows; row++)
238
+ output_depths[row] = exp2( - output_depths[row] / depth_divisor );
239
+ else
240
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, output_depths, ntrees)
241
+ for (size_t_for row = 0; row < nrows; row++)
242
+ output_depths[row] /= ntrees;
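/* Editor's note on the standardized score computed above (see reference [1]): with E[h(x)] the
   average isolation depth of row x across the 'ntrees' trees and c(n) = exp_avg_depth the expected
   average depth for the sample size used when fitting, the outlier score is

       s(x) = 2^( -E[h(x)] / c(n) ) = exp2( -sum_of_depths / (ntrees * exp_avg_depth) ),

   so scores close to 1 indicate outliers, while scores well below 0.5 indicate typical points. */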
243
+
244
+
245
+ /* re-map tree numbers to start at zero (if predicting tree numbers) */
246
+ /* Note: usually this type of 'prediction' is not required,
247
+ thus this mapping is not stored in the model objects so as to
248
+ save memory */
249
+ if (tree_num != NULL)
250
+ remap_terminal_trees(model_outputs, model_outputs_ext,
251
+ prediction_data, tree_num, nthreads);
252
+ }
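/* Editor's note: the block below is an illustrative usage sketch added for this write-up, not part
   of the original library. It assumes a single-variable model 'model' that was already fit elsewhere
   through 'fit_iforest' on two dense numeric columns; only the argument wiring of 'predict_iforest'
   is shown. */
#include <cstdio>
#include <vector>
#include "isotree.hpp"

void example_predict_sketch(IsoForest &model)
{
    const size_t nrows = 3, ncols_numeric = 2;
    /* column-major (Fortran order): all of column 0, then all of column 1 */
    double numeric_data[nrows * ncols_numeric] = {1.0, 2.0, 3.0,   4.0, 5.0, 6.0};

    size_t ntrees = model.trees.size();
    std::vector<double>    scores(nrows, 0.);          /* must start initialized to zeros */
    std::vector<sparse_ix> terminal(nrows * ntrees);   /* optional terminal-node numbers */

    predict_iforest(numeric_data, NULL,      /* dense numeric data only, no categoricals */
                    NULL, NULL, NULL,        /* no CSC inputs */
                    NULL, NULL, NULL,        /* no CSR inputs */
                    nrows, 1, true,          /* one thread, standardized outlier scores */
                    &model, NULL,            /* single-variable model, no extended model */
                    scores.data(), terminal.data());

    for (size_t row = 0; row < nrows; row++)
        std::printf("row %zu: outlier score %.4f\n", row, scores[row]);
}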
253
+
254
+
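/* Editor's note (descriptive comment): iterative traversal of one isolation tree for the fast path
   taken in 'predict_iforest' above, i.e. dense inputs, missing_action == Fail, and no 'Weighted'
   handling of new categories. It walks from the root to a terminal node and adds that node's score
   (isolation depth) to 'output_depth', optionally recording the terminal node in 'tree_num'. */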
255
+ void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
256
+ IsoForest &model_outputs,
257
+ PredictionData &prediction_data,
258
+ double &output_depth,
259
+ sparse_ix *restrict tree_num,
260
+ size_t row)
261
+ {
262
+ size_t curr_lev = 0;
263
+ double xval;
264
+ while (true)
265
+ {
266
+ if (tree[curr_lev].score > 0)
267
+ {
268
+ output_depth += tree[curr_lev].score;
269
+ if (tree_num != NULL)
270
+ tree_num[row] = curr_lev;
271
+ break;
272
+ }
273
+
274
+ else
275
+ {
276
+ switch(tree[curr_lev].col_type)
277
+ {
278
+ case Numeric:
279
+ {
280
+ xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
281
+ output_depth += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
283
+ curr_lev = (xval <= tree[curr_lev].num_split)?
284
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
284
+ break;
285
+ }
286
+
287
+ case Categorical:
288
+ {
289
+ switch(model_outputs.cat_split_type)
290
+ {
291
+ case SubSet:
292
+ {
293
+
294
+ if (!tree[curr_lev].cat_split.size()) /* this is for binary columns */
295
+ {
296
+ if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] <= 1)
297
+ {
298
+ curr_lev = (
299
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
300
+ == 0
301
+ )?
302
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
303
+ }
304
+
305
+ else /* can only work with 'Smallest' + no NAs if reaching this point */
306
+ {
307
+ curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
308
+ }
309
+ }
310
+
311
+ else
312
+ {
313
+
314
+ switch(model_outputs.new_cat_action)
315
+ {
316
+ case Random:
317
+ {
318
+ curr_lev = (tree[curr_lev].cat_split[
319
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
320
+ ]
321
+ )?
322
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
323
+ break;
324
+ }
325
+
326
+ case Smallest:
327
+ {
328
+ if (
329
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
330
+ >= (int)tree[curr_lev].cat_split.size()
331
+ )
332
+ {
333
+ curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
334
+ }
335
+
336
+ else
337
+ {
338
+ curr_lev = (tree[curr_lev].cat_split[
339
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
340
+ ]
341
+ )?
342
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
343
+ }
344
+ break;
345
+ }
346
+ }
347
+ }
348
+ break;
349
+ }
350
+
351
+ case SingleCateg:
352
+ {
353
+ curr_lev = (
354
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
355
+ ==
356
+ tree[curr_lev].chosen_cat
357
+ )?
358
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
359
+ break;
360
+ }
361
+ }
362
+ break;
363
+ }
364
+ }
365
+ }
366
+ }
367
+ }
368
+
369
+
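/* Editor's note (descriptive comment): full single-tree traversal used on the general path. It reads
   values from dense, CSC or CSR inputs, applies 'missing_action' (Divide recurses down both branches
   weighted by 'pct_tree_left' and its complement, Impute follows the larger branch, Fail returns NAN),
   handles categories not seen when fitting according to 'new_cat_action', and returns the terminal
   node's score plus the accumulated range penalty. */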
370
+ double traverse_itree(std::vector<IsoTree> &tree,
371
+ IsoForest &model_outputs,
372
+ PredictionData &prediction_data,
373
+ std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
374
+ ImputedData *imputed_data, /* only when imputing missing */
375
+ double curr_weight, /* only when imputing missing */
376
+ size_t row,
377
+ sparse_ix *restrict tree_num,
378
+ size_t curr_lev)
379
+ {
380
+ double xval;
381
+ double range_penalty = 0;
382
+
383
+ sparse_ix *row_st = NULL, *row_end = NULL;
384
+ if (prediction_data.Xr != NULL)
385
+ {
386
+ row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
387
+ row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
388
+ }
389
+
390
+ while (true)
391
+ {
392
+ if (tree[curr_lev].score >= 0.)
393
+ {
394
+ if (tree_num != NULL)
395
+ tree_num[row] = curr_lev;
396
+ if (imputed_data != NULL)
397
+ add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, curr_weight);
398
+
399
+ return tree[curr_lev].score + range_penalty;
400
+ }
401
+
402
+ else
403
+ {
404
+ switch(tree[curr_lev].col_type)
405
+ {
406
+ case Numeric:
407
+ {
408
+
409
+ if (prediction_data.Xc == NULL && prediction_data.Xr == NULL)
410
+ xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
411
+ else if (row_st != NULL)
412
+ xval = extract_spR(prediction_data, row_st, row_end, tree[curr_lev].col_num);
413
+ else
414
+ xval = extract_spC(prediction_data, row, tree[curr_lev].col_num);
415
+
416
+ if (isnan(xval))
417
+ {
418
+ switch(model_outputs.missing_action)
419
+ {
420
+ case Divide:
421
+ {
422
+ return
423
+ tree[curr_lev].pct_tree_left
424
+ * traverse_itree(tree, model_outputs, prediction_data,
425
+ impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
426
+ row, NULL, tree[curr_lev].tree_left)
427
+ + (1 - tree[curr_lev].pct_tree_left)
428
+ * traverse_itree(tree, model_outputs, prediction_data,
429
+ impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
430
+ row, NULL, tree[curr_lev].tree_right)
431
+ + range_penalty;
432
+ }
433
+
434
+ case Impute:
435
+ {
436
+ curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
437
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
438
+ break;
439
+ }
440
+
441
+ case Fail:
442
+ {
443
+ return NAN;
444
+ }
445
+ }
446
+ }
447
+
448
+ else
449
+ {
450
+ range_penalty += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
451
+ curr_lev = (xval <= tree[curr_lev].num_split)?
452
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
453
+ }
454
+ break;
455
+ }
456
+
457
+ case Categorical:
458
+ {
459
+
460
+ if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] < 0)
461
+ {
462
+ switch(model_outputs.missing_action)
463
+ {
464
+ case Divide:
465
+ {
466
+ return
467
+ tree[curr_lev].pct_tree_left
468
+ * traverse_itree(tree, model_outputs, prediction_data,
469
+ impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
470
+ row, NULL, tree[curr_lev].tree_left)
471
+ + (1 - tree[curr_lev].pct_tree_left)
472
+ * traverse_itree(tree, model_outputs, prediction_data,
473
+ impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
474
+ row, NULL, tree[curr_lev].tree_right)
475
+ + range_penalty;
476
+ }
477
+
478
+ case Impute:
479
+ {
480
+ curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
481
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
482
+ break;
483
+ }
484
+
485
+ case Fail:
486
+ {
487
+ return NAN;
488
+ }
489
+ }
490
+ }
491
+
492
+ else
493
+ {
494
+ switch(model_outputs.cat_split_type)
495
+ {
496
+ case SingleCateg:
497
+ {
498
+ curr_lev = (
499
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
500
+ ==
501
+ tree[curr_lev].chosen_cat
502
+ )?
503
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
504
+ break;
505
+ }
506
+
507
+ case SubSet:
508
+ {
509
+
510
+ if (!tree[curr_lev].cat_split.size())
511
+ {
512
+ if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] <= 1)
513
+ {
514
+ curr_lev = (
515
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
516
+ == 0
517
+ )?
518
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
519
+ }
520
+
521
+ else
522
+ {
523
+ switch(model_outputs.new_cat_action)
524
+ {
525
+ case Smallest:
526
+ {
527
+ curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
528
+ break;
529
+ }
530
+
531
+ case Weighted:
532
+ {
533
+ return
534
+ tree[curr_lev].pct_tree_left
535
+ * traverse_itree(tree, model_outputs, prediction_data,
536
+ impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
537
+ row, NULL, tree[curr_lev].tree_left)
538
+ + (1 - tree[curr_lev].pct_tree_left)
539
+ * traverse_itree(tree, model_outputs, prediction_data,
540
+ impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
541
+ row, NULL, tree[curr_lev].tree_right)
542
+ + range_penalty;
543
+ }
544
+ }
545
+ }
546
+ }
547
+
548
+ else
549
+ {
550
+ switch(model_outputs.new_cat_action)
551
+ {
552
+ case Random:
553
+ {
554
+ curr_lev = (tree[curr_lev].cat_split[
555
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
556
+ ]
557
+ )?
558
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
559
+ break;
560
+ }
561
+
562
+ case Smallest:
563
+ {
564
+ if (
565
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
566
+ >= (int)tree[curr_lev].cat_split.size()
567
+ )
568
+ {
569
+ curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
570
+ }
571
+
572
+ else
573
+ {
574
+ curr_lev = (tree[curr_lev].cat_split[
575
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
576
+ ]
577
+ )?
578
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
579
+ }
580
+ break;
581
+ }
582
+
583
+ case Weighted:
584
+ {
585
+ if (
586
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
587
+ >= (int)tree[curr_lev].cat_split.size()
588
+ ||
589
+ tree[curr_lev].cat_split[
590
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
591
+ ]
592
+ == (-1)
593
+ )
594
+ {
595
+ return
596
+ tree[curr_lev].pct_tree_left
597
+ * traverse_itree(tree, model_outputs, prediction_data,
598
+ impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
599
+ row, NULL, tree[curr_lev].tree_left)
600
+ + (1 - tree[curr_lev].pct_tree_left)
601
+ * traverse_itree(tree, model_outputs, prediction_data,
602
+ impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
603
+ row, NULL, tree[curr_lev].tree_right)
604
+ + range_penalty;
605
+ }
606
+
607
+ else
608
+ {
609
+ curr_lev = (tree[curr_lev].cat_split[
610
+ prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
611
+ ]
612
+ )?
613
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
614
+ }
615
+ break;
616
+ }
617
+ }
618
+ }
619
+ break;
620
+ }
621
+ }
622
+ }
623
+ break;
624
+ }
625
+ }
626
+ }
627
+ }
628
+ }
629
+
630
+ /* this is a simpler version for situations in which there is
631
+ only numeric data in dense arrays and no missing values */
632
+ void traverse_hplane_fast(std::vector<IsoHPlane> &hplane,
633
+ ExtIsoForest &model_outputs,
634
+ PredictionData &prediction_data,
635
+ double &output_depth,
636
+ sparse_ix *restrict tree_num,
637
+ size_t row)
638
+ {
639
+ size_t curr_lev = 0;
640
+ double hval;
641
+
642
+ while(true)
643
+ {
644
+ if (hplane[curr_lev].score > 0)
645
+ {
646
+ output_depth += hplane[curr_lev].score;
647
+ if (tree_num != NULL)
648
+ tree_num[row] = curr_lev;
649
+ return;
650
+ }
651
+
652
+ else
653
+ {
654
+ hval = 0;
655
+ for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
656
+ hval += (prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows]
657
+ - hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
658
+ }
659
+
660
+ output_depth += (hval < hplane[curr_lev].range_low) ||
661
+ (hval > hplane[curr_lev].range_high);
662
+ curr_lev = (hval <= hplane[curr_lev].split_point)?
663
+ hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
664
+ }
665
+ }
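/* Editor's note on the split used above (extended model, reference [3]): each node projects the row
   onto a random hyperplane,

       hval = sum_j coef[j] * (x[col_num[j]] - mean[j]),

   then descends left when hval <= split_point and right otherwise, adding 1 to the reported depth
   whenever hval falls outside the node's [range_low, range_high] interval. */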
666
+
667
+ /* this is the full version that works with potentially missing values, sparse matrices, and categoricals */
668
+ void traverse_hplane(std::vector<IsoHPlane> &hplane,
669
+ ExtIsoForest &model_outputs,
670
+ PredictionData &prediction_data,
671
+ double &output_depth,
672
+ std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
673
+ ImputedData *imputed_data, /* only when imputing missing */
674
+ sparse_ix *restrict tree_num,
675
+ size_t row)
676
+ {
677
+ size_t curr_lev = 0;
678
+ double xval;
679
+ int cval;
680
+ double hval;
681
+
682
+ size_t ncols_numeric, ncols_categ;
683
+
684
+ sparse_ix *row_st = NULL, *row_end = NULL;
685
+ if (prediction_data.Xr != NULL)
686
+ {
687
+ row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
688
+ row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
689
+ }
690
+
691
+ while(true)
692
+ {
693
+ if (hplane[curr_lev].score > 0)
694
+ {
695
+ output_depth += hplane[curr_lev].score;
696
+ if (tree_num != NULL)
697
+ tree_num[row] = curr_lev;
698
+ if (imputed_data != NULL)
699
+ {
700
+ add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, (double)1);
701
+ }
702
+ return;
703
+ }
704
+
705
+ else
706
+ {
707
+ hval = 0;
708
+ ncols_numeric = 0; ncols_categ = 0;
709
+ for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
710
+ {
711
+ switch(hplane[curr_lev].col_type[col])
712
+ {
713
+ case Numeric:
714
+ {
715
+ if (prediction_data.Xc == NULL && prediction_data.Xr == NULL)
716
+ xval = prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
717
+ else if (row_st != NULL)
718
+ xval = extract_spR(prediction_data, row_st, row_end, hplane[curr_lev].col_num[col]);
719
+ else
720
+ xval = extract_spC(prediction_data, row, hplane[curr_lev].col_num[col]);
721
+
722
+ if (is_na_or_inf(xval))
723
+ {
724
+ if (model_outputs.missing_action != Fail)
725
+ {
726
+ hval += hplane[curr_lev].fill_val[col];
727
+ }
728
+
729
+ else
730
+ {
731
+ output_depth = NAN;
732
+ return;
733
+ }
734
+ }
735
+
736
+ else
737
+ {
738
+ hval += (xval - hplane[curr_lev].mean[ncols_numeric]) * hplane[curr_lev].coef[ncols_numeric];
739
+ }
740
+
741
+ ncols_numeric++;
742
+ break;
743
+ }
744
+
745
+ case Categorical:
746
+ {
747
+ cval = prediction_data.categ_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
748
+ if (cval < 0)
749
+ {
750
+ if (model_outputs.missing_action != Fail)
751
+ {
752
+ hval += hplane[curr_lev].fill_val[col];
753
+ }
754
+
755
+ else
756
+ {
757
+ output_depth = NAN;
758
+ return;
759
+ }
760
+ }
761
+
762
+ else
763
+ {
764
+ switch(model_outputs.cat_split_type)
765
+ {
766
+ case SingleCateg:
767
+ {
768
+ hval += (cval == hplane[curr_lev].chosen_cat[ncols_categ])? hplane[curr_lev].fill_new[ncols_categ] : 0;
769
+ break;
770
+ }
771
+
772
+ case SubSet:
773
+ {
774
+ if (cval >= (int)hplane[curr_lev].cat_coef[ncols_categ].size())
775
+ hval += hplane[curr_lev].fill_new[ncols_categ];
776
+ else
777
+ hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
778
+ break;
779
+ }
780
+ }
781
+ }
782
+
783
+ ncols_categ++;
784
+ break;
785
+ }
786
+ }
787
+
788
+ }
789
+
790
+ output_depth += (hval < hplane[curr_lev].range_low) ||
791
+ (hval > hplane[curr_lev].range_high);
792
+ curr_lev = (hval <= hplane[curr_lev].split_point)?
793
+ hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
794
+ }
795
+ }
796
+ }
797
+
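/* Editor's note (descriptive comment): the two helpers below look up a single entry of the sparse
   inputs. 'extract_spC' binary-searches the row indices of column 'col_num' in the CSC arrays, while
   'extract_spR' binary-searches the column indices of one row's slice of the CSR arrays; both return
   0 when the entry is not stored. For example, with Xc = {1.0, 3.0}, Xc_ind = {0, 2}, Xc_indptr = {0, 2}
   (a single column), row 1 would return 0 and row 2 would return 3.0. */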
798
+ double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num)
799
+ {
800
+ sparse_ix *search_res = std::lower_bound(prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num],
801
+ prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1],
802
+ (sparse_ix) row);
803
+ if (
804
+ search_res == (prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1])
805
+ ||
806
+ *search_res != row
807
+ )
808
+ return 0;
809
+ else
810
+ return prediction_data.Xc[search_res - prediction_data.Xc_ind];
811
+ }
812
+
813
+ double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num)
814
+ {
815
+ sparse_ix *search_res = std::lower_bound(row_st, row_end, (sparse_ix) col_num);
816
+ if (search_res == row_end || *search_res != (sparse_ix)col_num)
817
+ return 0;
818
+ else
819
+ return prediction_data.Xr[search_res - prediction_data.Xr_ind];
820
+ }
821
+
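/* Editor's note (descriptive comment): the two overloads below report, for each tree or hyperplane in
   the model, its total number of nodes and its number of terminal nodes (nodes with a positive score),
   writing the counts into the caller-provided 'n_nodes' and 'n_terminal' arrays. */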
822
+ void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads)
823
+ {
824
+ std::fill(n_terminal, n_terminal + model_outputs.trees.size(), 0);
825
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
826
+ for (size_t_for tree = 0; tree < model_outputs.trees.size(); tree++)
827
+ {
828
+ n_nodes[tree] = model_outputs.trees[tree].size();
829
+ for (IsoTree &node : model_outputs.trees[tree])
830
+ {
831
+ n_terminal[tree] += (node.score > 0);
832
+ }
833
+ }
834
+ }
835
+
836
+ void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads)
837
+ {
838
+ std::fill(n_terminal, n_terminal + model_outputs.hplanes.size(), 0);
839
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
840
+ for (size_t_for hplane = 0; hplane < model_outputs.hplanes.size(); hplane++)
841
+ {
842
+ n_nodes[hplane] = model_outputs.hplanes[hplane].size();
843
+ for (IsoHPlane &node : model_outputs.hplanes[hplane])
844
+ {
845
+ n_terminal[hplane] += (node.score > 0);
846
+ }
847
+ }
848
+ }
849
+