isotree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
+ /* Isolation forests and variations thereof, with adjustments for incorporation
+  * of categorical variables and missing values.
+  * Written for the C++11 standard and aimed at being used in R and Python.
+  *
+  * This library is based on the following works:
+  * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+  *     "Isolation forest."
+  *     2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+  * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+  *     "Isolation-based anomaly detection."
+  *     ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+  * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+  *     "Extended Isolation Forest."
+  *     arXiv preprint arXiv:1811.02141 (2018).
+  * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+  *     "On detecting clustered anomalies using SCiForest."
+  *     Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+  * [5] https://sourceforge.net/projects/iforest/
+  * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+  * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
+  * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
+  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
+  *
+  * BSD 2-Clause License
+  * Copyright (c) 2019, David Cortes
+  * All rights reserved.
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  * * Redistributions of source code must retain the above copyright notice, this
+  *   list of conditions and the following disclaimer.
+  * * Redistributions in binary form must reproduce the above copyright notice,
+  *   this list of conditions and the following disclaimer in the documentation
+  *   and/or other materials provided with the distribution.
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include "isotree.hpp"
+ 
+ /* Reason behind this file: Cython (as of v0.29) will not auto-deallocate
+    structs which are part of a cdef'd class, which produces a memory leak,
+    but they can be force-destructed. Unfortunately, Cython itself doesn't even
+    allow calling destructors for structs, so it has to be done externally.
+    These functions otherwise have no reason to exist. */
+ 
+ void dealloc_IsoForest(IsoForest &model_outputs)
+ {
+     model_outputs.~IsoForest();
+ }
+ 
+ void dealloc_IsoExtForest(ExtIsoForest &model_outputs_ext)
+ {
+     model_outputs_ext.~ExtIsoForest();
+ }
+ 
+ void dealloc_Imputer(Imputer &imputer)
+ {
+     imputer.~Imputer();
+ }
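
A note on the pattern above: C++ allows invoking a destructor explicitly without freeing the object's own storage, which is exactly what these helpers expose to Cython. A minimal, self-contained sketch of that mechanism (the 'Holder' type is illustrative, not part of isotree):

#include <cstdio>
#include <new>
#include <vector>

struct Holder {
    std::vector<int> data;
    ~Holder() { std::puts("destructor ran"); }
};

int main() {
    Holder h;
    h.data.assign(1000, 0);
    h.~Holder();        /* releases the vector's heap buffer; 'h' itself stays allocated */
    new (&h) Holder();  /* placement-new restores a valid object so the scope-exit destructor is safe */
    return 0;
}
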
@@ -0,0 +1,749 @@
+ /* Isolation forests and variations thereof, with adjustments for incorporation
+  * of categorical variables and missing values.
+  * Written for the C++11 standard and aimed at being used in R and Python.
+  *
+  * This library is based on the following works:
+  * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+  *     "Isolation forest."
+  *     2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+  * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+  *     "Isolation-based anomaly detection."
+  *     ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+  * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+  *     "Extended Isolation Forest."
+  *     arXiv preprint arXiv:1811.02141 (2018).
+  * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+  *     "On detecting clustered anomalies using SCiForest."
+  *     Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+  * [5] https://sourceforge.net/projects/iforest/
+  * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+  * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
+  * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
+  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
+  *
+  * BSD 2-Clause License
+  * Copyright (c) 2019, David Cortes
+  * All rights reserved.
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  * * Redistributions of source code must retain the above copyright notice, this
+  *   list of conditions and the following disclaimer.
+  * * Redistributions in binary form must reproduce the above copyright notice,
+  *   this list of conditions and the following disclaimer in the documentation
+  *   and/or other materials provided with the distribution.
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ #include "isotree.hpp"
+ 
+ 
+ /* Calculate distance or similarity between data points
+  *
+  * Parameters
+  * ==========
+  * - numeric_data[nrows * ncols_numeric]
+  *       Pointer to numeric data for which to make calculations. Must be ordered by columns like Fortran,
+  *       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
+  *       and the column order must be the same as in the data that was used to fit the model.
+  *       If making calculations between two sets of observations/rows (see documentation for 'rmat'),
+  *       the first group is assumed to be the earlier rows here.
+  *       Pass NULL if there are no dense numeric columns.
+  *       Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+  * - categ_data[nrows * ncols_categ]
+  *       Pointer to categorical data for which to make calculations. Must be ordered by columns like Fortran,
+  *       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
+  *       and the column order must be the same as in the data that was used to fit the model.
+  *       Pass NULL if there are no categorical columns.
+  *       Each category should be represented as an integer, and these integers must start at zero and
+  *       be in consecutive order - i.e. if category '3' is present, category '2' must have also been
+  *       present when the model was fit (note that they are not treated as being ordinal, this is just
+  *       an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
+  *       must be the same as was used in the data to which the model was fit.
+  *       If making calculations between two sets of observations/rows (see documentation for 'rmat'),
+  *       the first group is assumed to be the earlier rows here.
+  * - Xc[nnz]
+  *       Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
+  *       Pass NULL if there are no sparse numeric columns.
+  *       Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+  * - Xc_ind[nnz]
+  *       Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
+  *       Pass NULL if there are no sparse numeric columns in CSC format.
+  * - Xc_indptr[ncols_numeric + 1]
+  *       Pointer to column index pointers that tell at entry [col] where column 'col'
+  *       starts and at entry [col + 1] where it ends.
+  *       Pass NULL if there are no sparse numeric columns in CSC format.
+  *       If making calculations between two sets of observations/rows (see documentation for 'rmat'),
+  *       the first group is assumed to be the earlier rows here.
+  * - nrows
+  *       Number of rows in 'numeric_data', 'Xc', 'categ_data'.
+  * - nthreads
+  *       Number of parallel threads to use. Note that the more threads are used, the more memory will
+  *       be allocated, even if a thread does not end up being used. Ignored when not building with
+  *       OpenMP support.
+  * - assume_full_distr
+  *       Whether to assume that the fitted model represents a full population distribution (will use a
+  *       standardizing criterion assuming an infinite sample, and the results of the similarity between two points
+  *       at prediction time will not depend on the presence of any third point that is similar to them, but will
+  *       differ more compared to the pairwise distances between points from which the model was fit). If passing
+  *       'false', will calculate pairwise distances as if the new observations at prediction time were added to
+  *       the sample to which each tree was fit, which will make the distances between two points potentially vary
+  *       according to other newly introduced points.
+  * - standardize_dist
+  *       Whether to standardize the resulting average separation depths between rows according
+  *       to the expected average separation depth, in a similar way as when predicting outlierness,
+  *       in order to obtain a standardized distance. If passing 'false', will output the average
+  *       separation depth instead.
+  * - model_outputs
+  *       Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
+  *       if the calculations are to be made from an extended model. Can only pass one of
+  *       'model_outputs' and 'model_outputs_ext'.
+  * - model_outputs_ext
+  *       Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
+  *       if the calculations are to be made from a single-variable model. Can only pass one of
+  *       'model_outputs' and 'model_outputs_ext'.
+  * - tmat[nrows * (nrows - 1) / 2] (out)
+  *       Pointer to array where the resulting pairwise distances or average separation depths will
+  *       be written into. As the output is a symmetric matrix, this function will only fill in the
+  *       upper-triangular part, in which entry 0 <= i < j < n will be located at position
+  *       p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
+  *       Can be converted to a dense square matrix through function 'tmat_to_dense'.
+  *       The array must already be initialized to zeros.
+  *       If calculating distance/separation from a group of points to another group of points,
+  *       pass NULL here and use 'rmat' instead.
+  * - rmat[nrows1 * nrows2] (out)
+  *       Pointer to array where to write the distances or separation depths between each row in
+  *       one set of observations and each row in a different set of observations. If doing these
+  *       calculations for all pairs of observations/rows, pass 'tmat' instead.
+  *       Will take the first group of observations as the rows in this matrix, and the second
+  *       group as the columns. The groups are assumed to be in the same data arrays, with the
+  *       first group corresponding to the earlier rows there.
+  *       This matrix will be used in row-major order (i.e. entries 1..nrows2 contain the first row).
+  *       Must be already initialized to zeros.
+  *       Ignored when 'tmat' is passed.
+  * - n_from
+  *       When calculating distances between two groups of points, this indicates the number of
+  *       observations/rows belonging to the first group (the rows in 'rmat'), which will be
+  *       assumed to be the first 'n_from' rows.
+  *       Ignored when 'tmat' is passed.
+  */
+ void calc_similarity(double numeric_data[], int categ_data[],
+                      double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+                      size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
+                      IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+                      double tmat[], double rmat[], size_t n_from)
+ {
+     PredictionData prediction_data = {numeric_data, categ_data, nrows,
+                                       Xc, Xc_ind, Xc_indptr,
+                                       NULL, NULL, NULL};
+ 
+     size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
+ 
+     if (tmat != NULL) n_from = 0;
+ 
+     if ((size_t)nthreads > ntrees)
+         nthreads = (int)ntrees;
+     #ifdef _OPENMP
+         std::vector<WorkerForSimilarity> worker_memory(nthreads);
+     #else
+         std::vector<WorkerForSimilarity> worker_memory(1);
+     #endif
+ 
+     if (model_outputs != NULL)
+     {
+         #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(ntrees, worker_memory, prediction_data, model_outputs)
+         for (size_t_for tree = 0; tree < ntrees; tree++)
+         {
+             initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
+                                       model_outputs, NULL, n_from, assume_full_distr);
+             traverse_tree_sim(worker_memory[omp_get_thread_num()],
+                               prediction_data,
+                               *model_outputs,
+                               model_outputs->trees[tree],
+                               (size_t)0);
+         }
+     }
+ 
+     else
+     {
+         #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(ntrees, worker_memory, prediction_data, model_outputs_ext)
+         for (size_t_for hplane = 0; hplane < ntrees; hplane++)
+         {
+             initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
+                                       NULL, model_outputs_ext, n_from, assume_full_distr);
+             traverse_hplane_sim(worker_memory[omp_get_thread_num()],
+                                 prediction_data,
+                                 *model_outputs_ext,
+                                 model_outputs_ext->hplanes[hplane],
+                                 (size_t)0);
+         }
+     }
+ 
+     /* gather and transform the results */
+     gather_sim_result(&worker_memory, NULL,
+                       &prediction_data, NULL,
+                       model_outputs, model_outputs_ext,
+                       tmat, rmat, n_from,
+                       ntrees, assume_full_distr,
+                       standardize_dist, nthreads);
+ }
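
The 'tmat' output documented above uses a condensed upper-triangular layout, which the library's 'tmat_to_dense' expands into a square matrix. As a sketch of what that expansion involves (the helpers below are hypothetical stand-ins, not the library's own functions):

#include <cstddef>

/* Index of pair (i, j), i < j, in the condensed array of length n * (n - 1) / 2.
   Equivalent to the documented p(i,j) = i * (n - (i+1)/2) + j - i - 1,
   with the (i+1)/2 division taken exactly rather than truncated. */
static size_t condensed_ix(size_t i, size_t j, size_t n) {
    return i * n - (i * (i + 1)) / 2 + j - i - 1;
}

/* Expand the condensed distances into a full n-by-n row-major matrix. */
void condensed_to_dense(const double tmat[], double dmat[], size_t n) {
    for (size_t i = 0; i < n; i++) {
        dmat[i * n + i] = 0.;  /* self-distance, zero by convention */
        for (size_t j = i + 1; j < n; j++) {
            double d = tmat[condensed_ix(i, j, n)];
            dmat[i * n + j] = d;
            dmat[j * n + i] = d;
        }
    }
}
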
+ 
+ void traverse_tree_sim(WorkerForSimilarity &workspace,
+                        PredictionData &prediction_data,
+                        IsoForest &model_outputs,
+                        std::vector<IsoTree> &trees,
+                        size_t curr_tree)
+ {
+     if (workspace.st == workspace.end)
+         return;
+ 
+     if (!workspace.tmat_sep.size())
+     {
+         std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
+         if (workspace.ix_arr[workspace.st] >= workspace.n_from)
+             return;
+         if (workspace.ix_arr[workspace.end] < workspace.n_from)
+             return;
+     }
+ 
+     /* Note: the first separation step will not be added here, as it simply consists of adding +1
+        to every combination regardless. It has to be added at the end in 'gather_sim_result' to
+        obtain the average separation depth. */
+     if (trees[curr_tree].score >= 0.)
+     {
+         long double rem = (long double) trees[curr_tree].remainder;
+         if (!workspace.weights_arr.size())
+         {
+             rem += (long double)(workspace.end - workspace.st + 1);
+             if (workspace.tmat_sep.size())
+                 increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                       prediction_data.nrows, workspace.tmat_sep.data(),
+                                       workspace.assume_full_distr? 3. : expected_separation_depth(rem));
+             else
+                 increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                                 workspace.n_from, prediction_data.nrows, workspace.rmat.data(),
+                                                 workspace.assume_full_distr? 3. : expected_separation_depth(rem));
+         }
+ 
+         else
+         {
+             if (!workspace.assume_full_distr)
+             {
+                 rem += std::accumulate(workspace.ix_arr.begin() + workspace.st,
+                                        workspace.ix_arr.begin() + workspace.end + 1,
+                                        (long double) 0.,
+                                        [&workspace](long double curr, size_t ix)
+                                        {return curr + (long double)workspace.weights_arr[ix];}
+                                        );
+             }
+ 
+             if (workspace.tmat_sep.size())
+                 increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                       prediction_data.nrows, workspace.tmat_sep.data(),
+                                       workspace.weights_arr.data(),
+                                       workspace.assume_full_distr? 3. : expected_separation_depth(rem));
+             else
+                 increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                                 workspace.n_from, prediction_data.nrows,
+                                                 workspace.rmat.data(), workspace.weights_arr.data(),
+                                                 workspace.assume_full_distr? 3. : expected_separation_depth(rem));
+         }
+         return;
+     }
+ 
+     else if (curr_tree > 0)
+     {
+         if (workspace.tmat_sep.size())
+             if (!workspace.weights_arr.size())
+                 increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                       prediction_data.nrows, workspace.tmat_sep.data(), -1.);
+             else
+                 increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                       prediction_data.nrows, workspace.tmat_sep.data(),
+                                       workspace.weights_arr.data(), -1.);
+         else
+             if (!workspace.weights_arr.size())
+                 increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                                 workspace.n_from, prediction_data.nrows, workspace.rmat.data(), -1.);
+             else
+                 increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                                 workspace.n_from, prediction_data.nrows,
+                                                 workspace.rmat.data(), workspace.weights_arr.data(), -1.);
+     }
+ 
+ 
+     /* divide according to tree */
+     if (prediction_data.Xc != NULL && !workspace.tmat_sep.size())
+         std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
+     size_t st_NA, end_NA, split_ix;
+     switch(trees[curr_tree].col_type)
+     {
+         case Numeric:
+         {
+             if (prediction_data.Xc == NULL)
+                 divide_subset_split(workspace.ix_arr.data(),
+                                     prediction_data.numeric_data + prediction_data.nrows * trees[curr_tree].col_num,
+                                     workspace.st, workspace.end, trees[curr_tree].num_split,
+                                     model_outputs.missing_action, st_NA, end_NA, split_ix);
+             else
+                 divide_subset_split(workspace.ix_arr.data(), workspace.st, workspace.end, trees[curr_tree].col_num,
+                                     prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
+                                     trees[curr_tree].num_split, model_outputs.missing_action,
+                                     st_NA, end_NA, split_ix);
+ 
+             break;
+         }
+ 
+         case Categorical:
+         {
+             switch(model_outputs.cat_split_type)
+             {
+                 case SingleCateg:
+                 {
+                     divide_subset_split(workspace.ix_arr.data(),
+                                         prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
+                                         workspace.st, workspace.end, trees[curr_tree].chosen_cat,
+                                         model_outputs.missing_action, st_NA, end_NA, split_ix);
+                     break;
+                 }
+ 
+                 case SubSet:
+                 {
+                     if (!trees[curr_tree].cat_split.size())
+                         divide_subset_split(workspace.ix_arr.data(),
+                                             prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
+                                             workspace.st, workspace.end,
+                                             model_outputs.missing_action, model_outputs.new_cat_action,
+                                             trees[curr_tree].pct_tree_left < .5, st_NA, end_NA, split_ix);
+                     else
+                         divide_subset_split(workspace.ix_arr.data(),
+                                             prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
+                                             workspace.st, workspace.end, trees[curr_tree].cat_split.data(),
+                                             (int) trees[curr_tree].cat_split.size(),
+                                             model_outputs.missing_action, model_outputs.new_cat_action,
+                                             (bool)(trees[curr_tree].pct_tree_left < .5), st_NA, end_NA, split_ix);
+                     break;
+                 }
+             }
+             break;
+         }
+     }
+ 
+ 
+     /* continue splitting recursively */
+     size_t orig_end = workspace.end;
+     switch(model_outputs.missing_action)
+     {
+         case Impute:
+         {
+             split_ix = (trees[curr_tree].pct_tree_left >= .5)? end_NA : st_NA;
+         }
+         /* fall through to 'Fail' */
+ 
+         case Fail:
+         {
+             if (split_ix > workspace.st)
+             {
+                 workspace.end = split_ix - 1;
+                 traverse_tree_sim(workspace,
+                                   prediction_data,
+                                   model_outputs,
+                                   trees,
+                                   trees[curr_tree].tree_left);
+             }
+ 
+             if (split_ix < orig_end)
+             {
+                 workspace.st = split_ix;
+                 workspace.end = orig_end;
+                 traverse_tree_sim(workspace,
+                                   prediction_data,
+                                   model_outputs,
+                                   trees,
+                                   trees[curr_tree].tree_right);
+             }
+             break;
+         }
+ 
+         case Divide: /* new_cat_action = 'Weighted' will also fall here */
+         {
+             std::vector<double> weights_arr;
+             std::vector<size_t> ix_arr;
+             if (end_NA > workspace.st)
+             {
+                 weights_arr.assign(workspace.weights_arr.begin(),
+                                    workspace.weights_arr.begin() + end_NA);
+                 ix_arr.assign(workspace.ix_arr.begin(),
+                               workspace.ix_arr.begin() + end_NA);
+             }
+ 
+             if (end_NA > workspace.st)
+             {
+                 workspace.end = end_NA - 1;
+                 for (size_t row = st_NA; row < end_NA; row++)
+                     workspace.weights_arr[workspace.ix_arr[row]] *= trees[curr_tree].pct_tree_left;
+                 traverse_tree_sim(workspace,
+                                   prediction_data,
+                                   model_outputs,
+                                   trees,
+                                   trees[curr_tree].tree_left);
+             }
+ 
+             if (st_NA < orig_end)
+             {
+                 workspace.st = st_NA;
+                 workspace.end = orig_end;
+                 if (weights_arr.size())
+                 {
+                     std::copy(weights_arr.begin(),
+                               weights_arr.end(),
+                               workspace.weights_arr.begin());
+                     std::copy(ix_arr.begin(),
+                               ix_arr.end(),
+                               workspace.ix_arr.begin());
+                     weights_arr.clear();
+                     weights_arr.shrink_to_fit();
+                     ix_arr.clear();
+                     ix_arr.shrink_to_fit();
+                 }
+ 
+                 for (size_t row = st_NA; row < end_NA; row++)
+                     workspace.weights_arr[workspace.ix_arr[row]] *= (1 - trees[curr_tree].pct_tree_left);
+                 traverse_tree_sim(workspace,
+                                   prediction_data,
+                                   model_outputs,
+                                   trees,
+                                   trees[curr_tree].tree_right);
+             }
+             break;
+         }
+     }
+ }
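
For the 'Divide' branch above: a row whose split value is missing is sent down both sub-trees, carrying a fractional weight in 'weights_arr'. A toy sketch of that bookkeeping (names are illustrative, not the library's):

#include <cstddef>

/* Illustrative only: the weight a missing-value row carries into each branch,
   mirroring the multiplications by pct_tree_left and (1 - pct_tree_left) above. */
void split_missing_weight(double weight_in, double pct_tree_left,
                          double &weight_left, double &weight_right) {
    weight_left  = weight_in * pct_tree_left;         /* left recursion */
    weight_right = weight_in * (1. - pct_tree_left);  /* right recursion */
}
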
+ 
+ void traverse_hplane_sim(WorkerForSimilarity &workspace,
+                          PredictionData &prediction_data,
+                          ExtIsoForest &model_outputs,
+                          std::vector<IsoHPlane> &hplanes,
+                          size_t curr_tree)
+ {
+     if (workspace.st == workspace.end)
+         return;
+ 
+     if (!workspace.tmat_sep.size())
+     {
+         std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
+         if (workspace.ix_arr[workspace.st] >= workspace.n_from)
+             return;
+         if (workspace.ix_arr[workspace.end] < workspace.n_from)
+             return;
+     }
+ 
+     /* Note: the first separation step will not be added here, as it simply consists of adding +1
+        to every combination regardless. It has to be added at the end in 'gather_sim_result' to
+        obtain the average separation depth. */
+     if (hplanes[curr_tree].score >= 0)
+     {
+         if (workspace.tmat_sep.size())
+             increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                   prediction_data.nrows, workspace.tmat_sep.data(),
+                                   workspace.assume_full_distr? 3. :
+                                   expected_separation_depth((long double) hplanes[curr_tree].remainder
+                                                               + (long double)(workspace.end - workspace.st + 1))
+                                   );
+         else
+             increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
+                                             prediction_data.nrows, workspace.rmat.data(),
+                                             workspace.assume_full_distr? 3. :
+                                             expected_separation_depth((long double) hplanes[curr_tree].remainder
+                                                                         + (long double)(workspace.end - workspace.st + 1))
+                                             );
+         return;
+     }
+ 
+     else if (curr_tree > 0)
+     {
+         if (workspace.tmat_sep.size())
+             increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                   prediction_data.nrows, workspace.tmat_sep.data(), -1.);
+         else
+             increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
+                                             prediction_data.nrows, workspace.rmat.data(), -1.);
+     }
+ 
+     if (prediction_data.Xc != NULL && !workspace.tmat_sep.size())
+         std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
+ 
+     /* reconstruct linear combination */
+     size_t ncols_numeric = 0;
+     size_t ncols_categ = 0;
+     std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0);
+     if (prediction_data.categ_data != NULL || prediction_data.Xc != NULL)
+     {
+         for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
+         {
+             switch(hplanes[curr_tree].col_type[col])
+             {
+                 case Numeric:
+                 {
+                     if (prediction_data.Xc == NULL)
+                         add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
+                                         prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
+                                         hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
+                                         (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
+                                         model_outputs.missing_action, NULL, NULL, false);
+                     else
+                         add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                         hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
+                                         prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
+                                         hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
+                                         (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
+                                         model_outputs.missing_action, NULL, NULL, false);
+                     ncols_numeric++;
+                     break;
+                 }
+ 
+                 case Categorical:
+                 {
+                     switch(model_outputs.cat_split_type)
+                     {
+                         case SingleCateg:
+                         {
+                             add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
+                                             prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
+                                             (int)0, NULL, hplanes[curr_tree].fill_new[ncols_categ],
+                                             hplanes[curr_tree].chosen_cat[ncols_categ],
+                                             (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
+                                             workspace.comb_val[0], NULL, NULL, model_outputs.new_cat_action,
+                                             model_outputs.missing_action, SingleCateg, false);
+                             break;
+                         }
+ 
+                         case SubSet:
+                         {
+                             add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
+                                             prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
+                                             (int) hplanes[curr_tree].cat_coef[ncols_categ].size(),
+                                             hplanes[curr_tree].cat_coef[ncols_categ].data(), (double) 0, (int) 0,
+                                             (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
+                                             hplanes[curr_tree].fill_new[ncols_categ], NULL, NULL,
+                                             model_outputs.new_cat_action, model_outputs.missing_action, SubSet, false);
+                             break;
+                         }
+                     }
+                     ncols_categ++;
+                     break;
+                 }
+             }
+         }
+     }
+ 
+     else /* faster version for numerical-only */
+     {
+         for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
+             add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
+                             prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
+                             hplanes[curr_tree].coef[col], (double)0, hplanes[curr_tree].mean[col],
+                             (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
+                             model_outputs.missing_action, NULL, NULL, false);
+     }
+ 
+     /* divide data */
+     size_t split_ix = divide_subset_split(workspace.ix_arr.data(), workspace.comb_val.data(),
+                                           workspace.st, workspace.end, hplanes[curr_tree].split_point);
+ 
+     /* continue splitting recursively */
+     size_t orig_end = workspace.end;
+     if (split_ix > workspace.st)
+     {
+         workspace.end = split_ix - 1;
+         traverse_hplane_sim(workspace,
+                             prediction_data,
+                             model_outputs,
+                             hplanes,
+                             hplanes[curr_tree].hplane_left);
+     }
+ 
+     if (split_ix < orig_end)
+     {
+         workspace.st = split_ix;
+         workspace.end = orig_end;
+         traverse_hplane_sim(workspace,
+                             prediction_data,
+                             model_outputs,
+                             hplanes,
+                             hplanes[curr_tree].hplane_right);
+     }
+ }
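
The 'reconstruct linear combination' step above is what distinguishes the extended model: each node projects the rows onto a random linear combination of columns and compares the result against 'split_point'. A simplified, dense, numeric-only sketch of that projection (standalone, not the library's 'add_linear_comb'):

#include <cstddef>
#include <vector>

/* comb_val[row] = sum_k coef[k] * (x[row, col_num[k]] - mean[k]),
   with 'numeric_data' stored column-major as elsewhere in this file. */
std::vector<double> project_rows(const double *numeric_data, size_t nrows,
                                 const std::vector<size_t> &col_num,
                                 const std::vector<double> &coef,
                                 const std::vector<double> &mean) {
    std::vector<double> comb_val(nrows, 0.);
    for (size_t k = 0; k < col_num.size(); k++) {
        const double *col = numeric_data + nrows * col_num[k];  /* start of column col_num[k] */
        for (size_t row = 0; row < nrows; row++)
            comb_val[row] += coef[k] * (col[row] - mean[k]);
    }
    return comb_val;  /* rows then go left or right of 'split_point' */
}
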
+ 
+ void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
+                        std::vector<WorkerMemory> *worker_memory_m,
+                        PredictionData *prediction_data, InputData *input_data,
+                        IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+                        double *restrict tmat, double *restrict rmat, size_t n_from,
+                        size_t ntrees, bool assume_full_distr,
+                        bool standardize_dist, int nthreads)
+ {
+     size_t ncomb = (prediction_data != NULL)?
+                     (prediction_data->nrows * (prediction_data->nrows - 1)) / 2
+                         :
+                     (input_data->nrows * (input_data->nrows - 1)) / 2;
+     size_t n_to = (prediction_data != NULL)? (prediction_data->nrows - n_from) : 0;
+ 
+     #ifdef _OPENMP
+     if (nthreads > 1)
+     {
+         if (worker_memory != NULL)
+         {
+             for (WorkerForSimilarity &w : *worker_memory)
+             {
+                 if (w.tmat_sep.size())
+                 {
+                     #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory)
+                     for (size_t_for ix = 0; ix < ncomb; ix++)
+                         tmat[ix] += w.tmat_sep[ix];
+                 }
+                 else if (w.rmat.size())
+                 {
+                     #pragma omp parallel for schedule(static) num_threads(nthreads) shared(rmat, w, worker_memory)
+                     for (size_t_for ix = 0; ix < w.rmat.size(); ix++)
+                         rmat[ix] += w.rmat[ix];
+                 }
+             }
+         }
+ 
+         else
+         {
+             for (WorkerMemory &w : *worker_memory_m)
+             {
+                 if (w.tmat_sep.size())
+                 {
+                     #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory_m)
+                     for (size_t_for ix = 0; ix < ncomb; ix++)
+                         tmat[ix] += w.tmat_sep[ix];
+                 }
+             }
+         }
+     }
+ 
+     else
+     #endif
+     {
+         if (worker_memory != NULL)
+         {
+             if ((*worker_memory)[0].tmat_sep.size())
+                 std::copy((*worker_memory)[0].tmat_sep.begin(), (*worker_memory)[0].tmat_sep.end(), tmat);
+             else
+                 std::copy((*worker_memory)[0].rmat.begin(), (*worker_memory)[0].rmat.end(), rmat);
+         }
+ 
+         else
+         {
+             std::copy((*worker_memory_m)[0].tmat_sep.begin(), (*worker_memory_m)[0].tmat_sep.end(), tmat);
+         }
+     }
+ 
+     double ntrees_dbl = (double) ntrees;
+     if (standardize_dist)
+     {
+         /* Note: the separation distances up to this point are missing the first hop, which is always
+            a +1 to every combination. Thus, it needs to be added back for the average separation depth.
+            For the standardized metric, the expected divisor is taken as 2 (= 3 - 1) instead of 3, given
+            that every combination will always get a +1 at the beginning. Since what's obtained here
+            is a sum across all trees, adding this +1 means adding the number of trees. */
+         double div_trees = ntrees_dbl;
+         if (assume_full_distr)
+         {
+             div_trees *= 2;
+         }
+ 
+         else if (input_data != NULL)
+         {
+             div_trees *= (expected_separation_depth(input_data->nrows) - 1);
+         }
+ 
+         else
+         {
+             div_trees *= ((
+                 (model_outputs != NULL)?
+                     expected_separation_depth_hotstart(model_outputs->exp_avg_sep,
+                                                        model_outputs->orig_sample_size,
+                                                        model_outputs->orig_sample_size + prediction_data->nrows)
+                         :
+                     expected_separation_depth_hotstart(model_outputs_ext->exp_avg_sep,
+                                                        model_outputs_ext->orig_sample_size,
+                                                        model_outputs_ext->orig_sample_size + prediction_data->nrows)
+             ) - 1);
+         }
+ 
+         if (tmat != NULL)
+             #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, ntrees_dbl, div_trees)
+             for (size_t_for ix = 0; ix < ncomb; ix++)
+                 tmat[ix] = exp2( - tmat[ix] / div_trees);
+         else
+             #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, rmat, ntrees_dbl, div_trees)
+             for (size_t_for ix = 0; ix < n_from * n_to; ix++)
+                 rmat[ix] = exp2( - rmat[ix] / div_trees);
+     }
+ 
+     else
+     {
+         if (tmat != NULL)
+             #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, ntrees_dbl)
+             for (size_t_for ix = 0; ix < ncomb; ix++)
+                 tmat[ix] = (tmat[ix] + ntrees) / ntrees_dbl;
+         else
+             #pragma omp parallel for schedule(static) num_threads(nthreads) shared(n_from, rmat, ntrees_dbl)
+             for (size_t_for ix = 0; ix < n_from * n_to; ix++)
+                 rmat[ix] = (rmat[ix] + ntrees) / ntrees_dbl;
+     }
+ }
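
To make the standardization arithmetic above concrete: 'tmat'/'rmat' hold separation depths summed over trees with the first hop excluded, so under 'assume_full_distr' the expected remaining depth per tree is 3 - 1 = 2, and the standardized distance is 2 raised to minus the normalized depth. A compact recap (illustrative helper, not part of the library):

#include <cmath>
#include <cstddef>

/* 'sum_sep' = separation depth summed across 'ntrees' trees, first hop excluded,
   as accumulated by the traversal functions above (assume_full_distr case). */
double standardized_distance(double sum_sep, size_t ntrees) {
    double div_trees = (double)ntrees * (3. - 1.);  /* expected depth 3, minus the +1 first hop */
    return std::exp2(-sum_sep / div_trees);         /* in (0, 1]: near 0 for hard-to-separate
                                                       (similar) points, near 1 for points that
                                                       separate immediately */
}

/* The non-standardized output instead restores the first hop and averages:
   avg_sep = (sum_sep + ntrees) / ntrees, as in the 'else' branch above. */
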
+ 
+ void initialize_worker_for_sim(WorkerForSimilarity &workspace,
+                                PredictionData &prediction_data,
+                                IsoForest *model_outputs,
+                                ExtIsoForest *model_outputs_ext,
+                                size_t n_from,
+                                bool assume_full_distr)
+ {
+     workspace.st = 0;
+     workspace.end = prediction_data.nrows - 1;
+     workspace.n_from = n_from;
+     workspace.assume_full_distr = assume_full_distr; /* doesn't need to have one copy per worker */
+ 
+     if (!workspace.ix_arr.size())
+     {
+         workspace.ix_arr.resize(prediction_data.nrows);
+         std::iota(workspace.ix_arr.begin(), workspace.ix_arr.end(), (size_t)0);
+         if (!n_from)
+             workspace.tmat_sep.resize((prediction_data.nrows * (prediction_data.nrows - 1)) / 2, 0);
+         else
+             workspace.rmat.resize((prediction_data.nrows - n_from) * n_from, 0);
+     }
+ 
+     if (model_outputs != NULL && (model_outputs->missing_action == Divide || model_outputs->new_cat_action == Weighted))
+     {
+         if (!workspace.weights_arr.size())
+             workspace.weights_arr.resize(prediction_data.nrows, 1);
+         else
+             std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), 1);
+     }
+ 
+     if (model_outputs_ext != NULL)
+     {
+         if (!workspace.comb_val.size())
+             workspace.comb_val.resize(prediction_data.nrows, 0);
+         else
+             std::fill(workspace.comb_val.begin(), workspace.comb_val.end(), 0);
+     }
+ }
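
Taken together, a hedged usage sketch for 'calc_similarity' as declared in this file, assuming an 'IsoForest' already fit through 'fit_iforest' (not shown in this diff) on numeric, column-major data with the same column order:

#include <cstddef>
#include <vector>
#include "isotree.hpp"

void pairwise_distances_example(IsoForest &model, double *X_colmajor, size_t nrows, int nthreads) {
    /* condensed upper-triangular output; must start zeroed, as documented */
    std::vector<double> tmat((nrows * (nrows - 1)) / 2, 0.);
    calc_similarity(X_colmajor, NULL,    /* dense numeric data, no categorical columns */
                    NULL, NULL, NULL,    /* no sparse (CSC) inputs */
                    nrows, nthreads,
                    true,                /* assume_full_distr */
                    true,                /* standardize_dist: outputs standardized distances */
                    &model, NULL,        /* single-variable model, no extended model */
                    tmat.data(), NULL, (size_t)0);  /* all-pairs mode: rmat unused, n_from = 0 */
}
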