isotree 0.1.0

@@ -0,0 +1,66 @@
/* Isolation forests and variations thereof, with adjustments for incorporation
*  of categorical variables and missing values.
*  Written for the C++11 standard and aimed at being used in R and Python.
*
*  This library is based on the following works:
*      [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*          "Isolation forest."
*          2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
*      [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*          "Isolation-based anomaly detection."
*          ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
*      [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
*          "Extended Isolation Forest."
*          arXiv preprint arXiv:1811.02141 (2018).
*      [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*          "On detecting clustered anomalies using SCiForest."
*          Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
*      [5] https://sourceforge.net/projects/iforest/
*      [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
*      [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
*      [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
*      [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
*
*  BSD 2-Clause License
*  Copyright (c) 2019, David Cortes
*  All rights reserved.
*  Redistribution and use in source and binary forms, with or without
*  modification, are permitted provided that the following conditions are met:
*  * Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
*  * Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
*  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
*  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "isotree.hpp"

/* Reason behind this file: Cython (as of v0.29) will not auto-deallocate
   structs which are part of a cdef'd class, which produces a memory leak,
   but they can be force-destructed. Unfortunately, Cython itself doesn't even
   allow calling destructors for structs, so it has to be done externally.
   These functions should otherwise have no reason to exist. */

void dealloc_IsoForest(IsoForest &model_outputs)
{
    model_outputs.~IsoForest();
}

void dealloc_IsoExtForest(ExtIsoForest &model_outputs_ext)
{
    model_outputs_ext.~ExtIsoForest();
}

void dealloc_Imputer(Imputer &imputer)
{
    imputer.~Imputer();
}
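
/* Usage sketch (illustrative only, not part of the library): these helpers simply run the
   destructor of an object whose storage is owned elsewhere, as happens with structs held
   inside a Cython cdef class. A rough C++ equivalent of that situation: */
#include <cstdlib>
#include <new>

void example_external_lifetime()
{
    /* storage owned by some wrapper, e.g. allocated on the Cython/Python side */
    void *buffer = std::malloc(sizeof(IsoForest));
    IsoForest *model = new (buffer) IsoForest();  /* placement-construct into that storage */
    /* ... model gets filled by 'fit_iforest' and used ... */
    dealloc_IsoForest(*model);                    /* run ~IsoForest() explicitly */
    std::free(buffer);                            /* then release the raw storage separately */
}
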
@@ -0,0 +1,749 @@
/* Isolation forests and variations thereof, with adjustments for incorporation
*  of categorical variables and missing values.
*  Written for the C++11 standard and aimed at being used in R and Python.
*
*  This library is based on the following works:
*      [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*          "Isolation forest."
*          2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
*      [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*          "Isolation-based anomaly detection."
*          ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
*      [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
*          "Extended Isolation Forest."
*          arXiv preprint arXiv:1811.02141 (2018).
*      [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
*          "On detecting clustered anomalies using SCiForest."
*          Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
*      [5] https://sourceforge.net/projects/iforest/
*      [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
*      [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
*      [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
*      [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
*
*  BSD 2-Clause License
*  Copyright (c) 2019, David Cortes
*  All rights reserved.
*  Redistribution and use in source and binary forms, with or without
*  modification, are permitted provided that the following conditions are met:
*  * Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
*  * Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
*  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
*  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "isotree.hpp"

/* Calculate distance or similarity between data points
*
* Parameters
* ==========
* - numeric_data[nrows * ncols_numeric]
*       Pointer to numeric data for which to make calculations. Must be ordered by columns like Fortran,
*       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
*       and the column order must be the same as in the data that was used to fit the model.
*       If making calculations between two sets of observations/rows (see documentation for 'rmat'),
*       the first group is assumed to be the earlier rows here.
*       Pass NULL if there are no dense numeric columns.
*       Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
* - categ_data[nrows * ncols_categ]
*       Pointer to categorical data for which to make calculations. Must be ordered by columns like Fortran,
*       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
*       and the column order must be the same as in the data that was used to fit the model.
*       Pass NULL if there are no categorical columns.
*       Each category should be represented as an integer, and these integers must start at zero and
*       be in consecutive order - i.e. if category '3' is present, category '2' must also have been
*       present when the model was fit (note that they are not treated as being ordinal, this is just
*       an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
*       must be the same as was used in the data to which the model was fit.
*       If making calculations between two sets of observations/rows (see documentation for 'rmat'),
*       the first group is assumed to be the earlier rows here.
* - Xc[nnz]
*       Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
*       Pass NULL if there are no sparse numeric columns.
*       Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
* - Xc_ind[nnz]
*       Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
*       Pass NULL if there are no sparse numeric columns in CSC format.
* - Xc_indptr[ncols_numeric + 1]
*       Pointer to column index pointers that tell at entry [col] where column 'col'
*       starts and at entry [col + 1] where it ends.
*       Pass NULL if there are no sparse numeric columns in CSC format.
*       If making calculations between two sets of observations/rows (see documentation for 'rmat'),
*       the first group is assumed to be the earlier rows here.
* - nrows
*       Number of rows in 'numeric_data', 'Xc', or 'categ_data'.
* - nthreads
*       Number of parallel threads to use. Note that the more threads that are requested, the more
*       memory will be allocated, even if a thread does not end up being used.
*       Ignored when not building with OpenMP support.
* - assume_full_distr
*       Whether to assume that the fitted model represents a full population distribution (will use a
*       standardizing criterion assuming an infinite sample, and the results of the similarity between two points
*       at prediction time will not depend on the presence of any third point that is similar to them, but will
*       differ more compared to the pairwise distances between points from which the model was fit). If passing
*       'false', will calculate pairwise distances as if the new observations at prediction time were added to
*       the sample to which each tree was fit, which will make the distances between two points potentially vary
*       according to other newly introduced points.
* - standardize_dist
*       Whether to standardize the resulting average separation depths between rows according
*       to the expected average separation depth in a similar way as when predicting outlierness,
*       in order to obtain a standardized distance. If passing 'false', will output the average
*       separation depth instead.
* - model_outputs
*       Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
*       if the calculations are to be made from an extended model. Can only pass one of
*       'model_outputs' and 'model_outputs_ext'.
* - model_outputs_ext
*       Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
*       if the calculations are to be made from a single-variable model. Can only pass one of
*       'model_outputs' and 'model_outputs_ext'.
* - tmat[nrows * (nrows - 1) / 2] (out)
*       Pointer to array where the resulting pairwise distances or average separation depths will
*       be written. As the output is a symmetric matrix, this function will only fill in the
*       upper-triangular part, in which entry 0 <= i < j < n will be located at position
*       p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
*       Can be converted to a dense square matrix through function 'tmat_to_dense'.
*       The array must already be initialized to zeros.
*       If calculating distance/separation from a group of points to another group of points,
*       pass NULL here and use 'rmat' instead.
* - rmat[nrows1 * nrows2] (out)
*       Pointer to array in which to write the distances or separation depths between each row in
*       one set of observations and each row in a different set of observations. If doing these
*       calculations for all pairs of observations/rows, pass 'tmat' instead.
*       Will take the first group of observations as the rows in this matrix, and the second
*       group as the columns. The groups are assumed to be in the same data arrays, with the
*       first group corresponding to the earlier rows there.
*       This matrix will be used in row-major order (i.e. the entries for the first row come first,
*       followed by those for the second row, and so on).
*       Must be already initialized to zeros.
*       Ignored when 'tmat' is passed.
* - n_from
*       When calculating distances between two groups of points, this indicates the number of
*       observations/rows belonging to the first group (the rows in 'rmat'), which will be
*       assumed to be the first 'n_from' rows.
*       Ignored when 'tmat' is passed.
*/
void calc_similarity(double numeric_data[], int categ_data[],
                     double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
                     size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
                     IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
                     double tmat[], double rmat[], size_t n_from)
{
    PredictionData prediction_data = {numeric_data, categ_data, nrows,
                                      Xc, Xc_ind, Xc_indptr,
                                      NULL, NULL, NULL};

    size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();

    if (tmat != NULL) n_from = 0;

    if ((size_t)nthreads > ntrees)
        nthreads = (int)ntrees;
    #ifdef _OPENMP
        std::vector<WorkerForSimilarity> worker_memory(nthreads);
    #else
        std::vector<WorkerForSimilarity> worker_memory(1);
    #endif

    if (model_outputs != NULL)
    {
        #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(ntrees, worker_memory, prediction_data, model_outputs)
        for (size_t_for tree = 0; tree < ntrees; tree++)
        {
            initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
                                      model_outputs, NULL, n_from, assume_full_distr);
            traverse_tree_sim(worker_memory[omp_get_thread_num()],
                              prediction_data,
                              *model_outputs,
                              model_outputs->trees[tree],
                              (size_t)0);
        }
    }

    else
    {
        #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(ntrees, worker_memory, prediction_data, model_outputs_ext)
        for (size_t_for hplane = 0; hplane < ntrees; hplane++)
        {
            initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
                                      NULL, model_outputs_ext, n_from, assume_full_distr);
            traverse_hplane_sim(worker_memory[omp_get_thread_num()],
                                prediction_data,
                                *model_outputs_ext,
                                model_outputs_ext->hplanes[hplane],
                                (size_t)0);
        }
    }

    /* gather and transform the results */
    gather_sim_result(&worker_memory, NULL,
                      &prediction_data, NULL,
                      model_outputs, model_outputs_ext,
                      tmat, rmat, n_from,
                      ntrees, assume_full_distr,
                      standardize_dist, nthreads);
}
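
/* Usage sketch (illustrative only, not part of the library): computing standardized pairwise
   distances for a small dense numeric matrix stored in column-major order, assuming 'model'
   is an IsoForest previously fitted through 'fit_iforest' on data with the same column order.
   The condensed 'tmat' output can then be read with the indexing formula documented above
   (p(i,j) = i*(n - (i+1)/2) + j - i - 1), or expanded through 'tmat_to_dense'. */
double example_pairwise_distance(IsoForest &model, double X_col_major[],
                                 size_t nrows, size_t i, size_t j)
{
    std::vector<double> tmat((nrows * (nrows - 1)) / 2, 0.);  /* must start zeroed */
    calc_similarity(X_col_major, NULL,           /* dense numeric data only, no categoricals */
                    NULL, NULL, NULL,            /* no sparse (CSC) inputs */
                    nrows, 1,                    /* nrows, single thread */
                    true, true,                  /* assume_full_distr, standardize_dist */
                    &model, NULL,                /* single-variable model */
                    tmat.data(), NULL, 0);       /* all-pairs mode: no 'rmat' / 'n_from' */

    /* read back the distance for a pair of rows i < j (integer form of p(i,j) above) */
    return tmat[i * nrows - (i * (i + 1)) / 2 + j - i - 1];
}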

void traverse_tree_sim(WorkerForSimilarity &workspace,
                       PredictionData &prediction_data,
                       IsoForest &model_outputs,
                       std::vector<IsoTree> &trees,
                       size_t curr_tree)
{
    if (workspace.st == workspace.end)
        return;

    if (!workspace.tmat_sep.size())
    {
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
        if (workspace.ix_arr[workspace.st] >= workspace.n_from)
            return;
        if (workspace.ix_arr[workspace.end] < workspace.n_from)
            return;
    }

    /* Note: the first separation step will not be added here, as it simply consists of adding +1
       to every combination regardless. It has to be added at the end in 'gather_sim_result' to
       obtain the average separation depth. */
    if (trees[curr_tree].score >= 0.)
    {
        long double rem = (long double) trees[curr_tree].remainder;
        if (!workspace.weights_arr.size())
        {
            rem += (long double)(workspace.end - workspace.st + 1);
            if (workspace.tmat_sep.size())
                increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                      prediction_data.nrows, workspace.tmat_sep.data(),
                                      workspace.assume_full_distr? 3. : expected_separation_depth(rem));
            else
                increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                workspace.n_from, prediction_data.nrows, workspace.rmat.data(),
                                                workspace.assume_full_distr? 3. : expected_separation_depth(rem));
        }

        else
        {
            if (!workspace.assume_full_distr)
            {
                rem += std::accumulate(workspace.ix_arr.begin() + workspace.st,
                                       workspace.ix_arr.begin() + workspace.end,
                                       (long double) 0.,
                                       [&workspace](long double curr, size_t ix)
                                       {return curr + (long double)workspace.weights_arr[ix];}
                                       );
            }

            if (workspace.tmat_sep.size())
                increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                      prediction_data.nrows, workspace.tmat_sep.data(),
                                      workspace.weights_arr.data(),
                                      workspace.assume_full_distr? 3. : expected_separation_depth(rem));
            else
                increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                workspace.n_from, prediction_data.nrows,
                                                workspace.rmat.data(), workspace.weights_arr.data(),
                                                workspace.assume_full_distr? 3. : expected_separation_depth(rem));
        }
        return;
    }

    else if (curr_tree > 0)
    {
        if (workspace.tmat_sep.size())
            if (!workspace.weights_arr.size())
                increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                      prediction_data.nrows, workspace.tmat_sep.data(), -1.);
            else
                increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                      prediction_data.nrows, workspace.tmat_sep.data(),
                                      workspace.weights_arr.data(), -1.);
        else
            if (!workspace.weights_arr.size())
                increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                workspace.n_from, prediction_data.nrows, workspace.rmat.data(), -1.);
            else
                increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                workspace.n_from, prediction_data.nrows,
                                                workspace.rmat.data(), workspace.weights_arr.data(), -1.);
    }


    /* divide according to tree */
    if (prediction_data.Xc != NULL && !workspace.tmat_sep.size())
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
    size_t st_NA, end_NA, split_ix;
    switch(trees[curr_tree].col_type)
    {
        case Numeric:
        {
            if (prediction_data.Xc == NULL)
                divide_subset_split(workspace.ix_arr.data(),
                                    prediction_data.numeric_data + prediction_data.nrows * trees[curr_tree].col_num,
                                    workspace.st, workspace.end, trees[curr_tree].num_split,
                                    model_outputs.missing_action, st_NA, end_NA, split_ix);
            else
                divide_subset_split(workspace.ix_arr.data(), workspace.st, workspace.end, trees[curr_tree].col_num,
                                    prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
                                    trees[curr_tree].num_split, model_outputs.missing_action,
                                    st_NA, end_NA, split_ix);

            break;
        }

        case Categorical:
        {
            switch(model_outputs.cat_split_type)
            {
                case SingleCateg:
                {
                    divide_subset_split(workspace.ix_arr.data(),
                                        prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                        workspace.st, workspace.end, trees[curr_tree].chosen_cat,
                                        model_outputs.missing_action, st_NA, end_NA, split_ix);
                    break;
                }

                case SubSet:
                {
                    if (!trees[curr_tree].cat_split.size())
                        divide_subset_split(workspace.ix_arr.data(),
                                            prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                            workspace.st, workspace.end,
                                            model_outputs.missing_action, model_outputs.new_cat_action,
                                            trees[curr_tree].pct_tree_left < .5, st_NA, end_NA, split_ix);
                    else
                        divide_subset_split(workspace.ix_arr.data(),
                                            prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                            workspace.st, workspace.end, trees[curr_tree].cat_split.data(),
                                            (int) trees[curr_tree].cat_split.size(),
                                            model_outputs.missing_action, model_outputs.new_cat_action,
                                            (bool)(trees[curr_tree].pct_tree_left < .5), st_NA, end_NA, split_ix);
                    break;
                }
            }
            break;
        }
    }


    /* continue splitting recursively */
    size_t orig_end = workspace.end;
    switch(model_outputs.missing_action)
    {
        case Impute:
        {
            split_ix = (trees[curr_tree].pct_tree_left >= .5)? end_NA : st_NA;
        }
        /* fall-through into 'Fail' is intentional */

        case Fail:
        {
            if (split_ix > workspace.st)
            {
                workspace.end = split_ix - 1;
                traverse_tree_sim(workspace,
                                  prediction_data,
                                  model_outputs,
                                  trees,
                                  trees[curr_tree].tree_left);
            }


            if (split_ix < orig_end)
            {
                workspace.st  = split_ix;
                workspace.end = orig_end;
                traverse_tree_sim(workspace,
                                  prediction_data,
                                  model_outputs,
                                  trees,
                                  trees[curr_tree].tree_right);
            }
            break;
        }

        case Divide: /* new_cat_action = 'Weighted' will also fall here */
        {
            std::vector<double> weights_arr;
            std::vector<size_t> ix_arr;
            if (end_NA > workspace.st)
            {
                weights_arr.assign(workspace.weights_arr.begin(),
                                   workspace.weights_arr.begin() + end_NA);
                ix_arr.assign(workspace.ix_arr.begin(),
                              workspace.ix_arr.begin() + end_NA);
            }

            if (end_NA > workspace.st)
            {
                workspace.end = end_NA - 1;
                for (size_t row = st_NA; row < end_NA; row++)
                    workspace.weights_arr[workspace.ix_arr[row]] *= trees[curr_tree].pct_tree_left;
                traverse_tree_sim(workspace,
                                  prediction_data,
                                  model_outputs,
                                  trees,
                                  trees[curr_tree].tree_left);
            }

            if (st_NA < orig_end)
            {
                workspace.st  = st_NA;
                workspace.end = orig_end;
                if (weights_arr.size())
                {
                    std::copy(weights_arr.begin(),
                              weights_arr.end(),
                              workspace.weights_arr.begin());
                    std::copy(ix_arr.begin(),
                              ix_arr.end(),
                              workspace.ix_arr.begin());
                    weights_arr.clear();
                    weights_arr.shrink_to_fit();
                    ix_arr.clear();
                    ix_arr.shrink_to_fit();
                }

                for (size_t row = st_NA; row < end_NA; row++)
                    workspace.weights_arr[workspace.ix_arr[row]] *= (1 - trees[curr_tree].pct_tree_left);
                traverse_tree_sim(workspace,
                                  prediction_data,
                                  model_outputs,
                                  trees,
                                  trees[curr_tree].tree_right);
            }
            break;
        }
    }
}

void traverse_hplane_sim(WorkerForSimilarity &workspace,
                         PredictionData &prediction_data,
                         ExtIsoForest &model_outputs,
                         std::vector<IsoHPlane> &hplanes,
                         size_t curr_tree)
{
    if (workspace.st == workspace.end)
        return;

    if (!workspace.tmat_sep.size())
    {
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
        if (workspace.ix_arr[workspace.st] >= workspace.n_from)
            return;
        if (workspace.ix_arr[workspace.end] < workspace.n_from)
            return;
    }

    /* Note: the first separation step will not be added here, as it simply consists of adding +1
       to every combination regardless. It has to be added at the end in 'gather_sim_result' to
       obtain the average separation depth. */
    if (hplanes[curr_tree].score >= 0)
    {
        if (workspace.tmat_sep.size())
            increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                  prediction_data.nrows, workspace.tmat_sep.data(),
                                  workspace.assume_full_distr? 3. :
                                  expected_separation_depth((long double) hplanes[curr_tree].remainder
                                                              + (long double)(workspace.end - workspace.st + 1))
                                  );
        else
            increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
                                            prediction_data.nrows, workspace.rmat.data(),
                                            workspace.assume_full_distr? 3. :
                                            expected_separation_depth((long double) hplanes[curr_tree].remainder
                                                                        + (long double)(workspace.end - workspace.st + 1))
                                            );
        return;
    }

    else if (curr_tree > 0)
    {
        if (workspace.tmat_sep.size())
            increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                  prediction_data.nrows, workspace.tmat_sep.data(), -1.);
        else
            increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
                                            prediction_data.nrows, workspace.rmat.data(), -1.);
    }

    if (prediction_data.Xc != NULL && !workspace.tmat_sep.size())
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);

    /* reconstruct linear combination */
    size_t ncols_numeric = 0;
    size_t ncols_categ   = 0;
    std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0);
    if (prediction_data.categ_data != NULL || prediction_data.Xc != NULL)
    {
        for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
        {
            switch(hplanes[curr_tree].col_type[col])
            {
                case Numeric:
                {
                    if (prediction_data.Xc == NULL)
                        add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                                        prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
                                        hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
                                        (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
                                        model_outputs.missing_action, NULL, NULL, false);
                    else
                        add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
                                        hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
                                        prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
                                        hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
                                        (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
                                        model_outputs.missing_action, NULL, NULL, false);
                    ncols_numeric++;
                    break;
                }

                case Categorical:
                {
                    switch(model_outputs.cat_split_type)
                    {
                        case SingleCateg:
                        {
                            add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                                            prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
                                            (int)0, NULL, hplanes[curr_tree].fill_new[ncols_categ],
                                            hplanes[curr_tree].chosen_cat[ncols_categ],
                                            (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
                                            workspace.comb_val[0], NULL, NULL, model_outputs.new_cat_action,
                                            model_outputs.missing_action, SingleCateg, false);
                            break;
                        }

                        case SubSet:
                        {
                            add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                                            prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
                                            (int) hplanes[curr_tree].cat_coef[ncols_categ].size(),
                                            hplanes[curr_tree].cat_coef[ncols_categ].data(), (double) 0, (int) 0,
                                            (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
                                            hplanes[curr_tree].fill_new[ncols_categ], NULL, NULL,
                                            model_outputs.new_cat_action, model_outputs.missing_action, SubSet, false);
                            break;
                        }
                    }
                    ncols_categ++;
                    break;
                }
            }
        }
    }


    else /* faster version for numerical-only */
    {
        for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
            add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                            prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
                            hplanes[curr_tree].coef[col], (double)0, hplanes[curr_tree].mean[col],
                            (model_outputs.missing_action == Fail)? workspace.comb_val[0] : hplanes[curr_tree].fill_val[col],
                            model_outputs.missing_action, NULL, NULL, false);
    }

    /* divide data */
    size_t split_ix = divide_subset_split(workspace.ix_arr.data(), workspace.comb_val.data(),
                                          workspace.st, workspace.end, hplanes[curr_tree].split_point);

    /* continue splitting recursively */
    size_t orig_end = workspace.end;
    if (split_ix > workspace.st)
    {
        workspace.end = split_ix - 1;
        traverse_hplane_sim(workspace,
                            prediction_data,
                            model_outputs,
                            hplanes,
                            hplanes[curr_tree].hplane_left);
    }

    if (split_ix < orig_end)
    {
        workspace.st  = split_ix;
        workspace.end = orig_end;
        traverse_hplane_sim(workspace,
                            prediction_data,
                            model_outputs,
                            hplanes,
                            hplanes[curr_tree].hplane_right);
    }

}
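
/* Editorial note (illustrative, not part of the library): in the extended model, each node
   projects the rows it receives onto the hyperplane stored in the node, roughly
       comb_val[row] = sum over the node's numeric columns     of coef * (value - mean)
                     + sum over the node's categorical columns of the per-category coefficient,
   with missing or unseen values substituted through 'fill_val' / 'fill_new', and then sends
   each row to 'hplane_left' or 'hplane_right' by comparing comb_val[row] against 'split_point'. */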

void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
                       std::vector<WorkerMemory> *worker_memory_m,
                       PredictionData *prediction_data, InputData *input_data,
                       IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
                       double *restrict tmat, double *restrict rmat, size_t n_from,
                       size_t ntrees, bool assume_full_distr,
                       bool standardize_dist, int nthreads)
{
    size_t ncomb = (prediction_data != NULL)?
                    (prediction_data->nrows * (prediction_data->nrows - 1)) / 2
                        :
                    (input_data->nrows * (input_data->nrows - 1)) / 2;
    size_t n_to  = (prediction_data != NULL)? (prediction_data->nrows - n_from) : 0;

    #ifdef _OPENMP
    if (nthreads > 1)
    {
        if (worker_memory != NULL)
        {
            for (WorkerForSimilarity &w : *worker_memory)
            {
                if (w.tmat_sep.size())
                {
                    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory)
                    for (size_t_for ix = 0; ix < ncomb; ix++)
                        tmat[ix] += w.tmat_sep[ix];
                }
                else if (w.rmat.size())
                {
                    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(rmat, w, worker_memory)
                    for (size_t_for ix = 0; ix < w.rmat.size(); ix++)
                        rmat[ix] += w.rmat[ix];
                }
            }
        }

        else
        {
            for (WorkerMemory &w : *worker_memory_m)
            {
                if (w.tmat_sep.size())
                {
                    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory_m)
                    for (size_t_for ix = 0; ix < ncomb; ix++)
                        tmat[ix] += w.tmat_sep[ix];
                }
            }
        }
    }

    else
    #endif
    {
        if (worker_memory != NULL)
        {
            if ((*worker_memory)[0].tmat_sep.size())
                std::copy((*worker_memory)[0].tmat_sep.begin(), (*worker_memory)[0].tmat_sep.end(), tmat);
            else
                std::copy((*worker_memory)[0].rmat.begin(), (*worker_memory)[0].rmat.end(), rmat);
        }

        else
        {
            std::copy((*worker_memory_m)[0].tmat_sep.begin(), (*worker_memory_m)[0].tmat_sep.end(), tmat);
        }
    }

    double ntrees_dbl = (double) ntrees;
    if (standardize_dist)
    {
        /* Note: the separation distances up to this point are missing the first hop, which is always
           a +1 to every combination. Thus, it needs to be added back for the average separation depth.
           For the standardized metric, it takes the expected divisor as 2(=3-1) instead of 3, given
           that every combination will always get a +1 at the beginning. Since what's obtained here
           is a sum across all trees, adding this +1 means adding the number of trees. */
        double div_trees = ntrees_dbl;
        if (assume_full_distr)
        {
            div_trees *= 2;
        }

        else if (input_data != NULL)
        {
            div_trees *= (expected_separation_depth(input_data->nrows) - 1);
        }

        else
        {
            div_trees *= ((
                           (model_outputs != NULL)?
                            expected_separation_depth_hotstart(model_outputs->exp_avg_sep,
                                                               model_outputs->orig_sample_size,
                                                               model_outputs->orig_sample_size + prediction_data->nrows)
                                :
                            expected_separation_depth_hotstart(model_outputs_ext->exp_avg_sep,
                                                               model_outputs_ext->orig_sample_size,
                                                               model_outputs_ext->orig_sample_size + prediction_data->nrows)
                          ) - 1);
        }


        if (tmat != NULL)
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, ntrees_dbl, div_trees)
            for (size_t_for ix = 0; ix < ncomb; ix++)
                tmat[ix] = exp2( - tmat[ix] / div_trees);
        else
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, rmat, ntrees_dbl, div_trees)
            for (size_t_for ix = 0; ix < n_from * n_to; ix++)
                rmat[ix] = exp2( - rmat[ix] / div_trees);
    }

    else
    {
        if (tmat != NULL)
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, ntrees_dbl)
            for (size_t_for ix = 0; ix < ncomb; ix++)
                tmat[ix] = (tmat[ix] + ntrees) / ntrees_dbl;
        else
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(n_from, rmat, ntrees_dbl)
            for (size_t_for ix = 0; ix < n_from * n_to; ix++)
                rmat[ix] = (rmat[ix] + ntrees) / ntrees_dbl;
    }
}
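
/* Editorial note (illustrative, not part of the library): calling S(i,j) the quantity accumulated
   in 'tmat'/'rmat' by the traversal functions above, summed over all T trees (the first +1 hop of
   every tree is deliberately left out, as noted there), the two output conventions work out to
       average separation depth : (S(i,j) + T) / T
       standardized distance    : 2^( -S(i,j) / (T * (E - 1)) )
   where E is the reference expected separation depth, with E - 1 = 2 under 'assume_full_distr',
   matching the divisor of 2 described in the comment above. */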

void initialize_worker_for_sim(WorkerForSimilarity &workspace,
                               PredictionData &prediction_data,
                               IsoForest *model_outputs,
                               ExtIsoForest *model_outputs_ext,
                               size_t n_from,
                               bool assume_full_distr)
{
    workspace.st  = 0;
    workspace.end = prediction_data.nrows - 1;
    workspace.n_from = n_from;
    workspace.assume_full_distr = assume_full_distr; /* doesn't need to have one copy per worker */

    if (!workspace.ix_arr.size())
    {
        workspace.ix_arr.resize(prediction_data.nrows);
        std::iota(workspace.ix_arr.begin(), workspace.ix_arr.end(), (size_t)0);
        if (!n_from)
            workspace.tmat_sep.resize((prediction_data.nrows * (prediction_data.nrows - 1)) / 2, 0);
        else
            workspace.rmat.resize(prediction_data.nrows * n_from, 0);
    }

    if (model_outputs != NULL && (model_outputs->missing_action == Divide || model_outputs->new_cat_action == Weighted))
    {
        if (!workspace.weights_arr.size())
            workspace.weights_arr.resize(prediction_data.nrows, 1);
        else
            std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), 1);
    }

    if (model_outputs_ext != NULL)
    {
        if (!workspace.comb_val.size())
            workspace.comb_val.resize(prediction_data.nrows, 0);
        else
            std::fill(workspace.comb_val.begin(), workspace.comb_val.end(), 0);
    }
}
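
/* Usage sketch (illustrative only, not part of the library): distances between two groups of rows
   stacked in the same column-major array. The first 'n_from' rows form one group and the remaining
   rows the other; results are written row-major into 'rmat', one row per observation of the first
   group. The buffer below is sized as nrows * n_from to match the per-worker buffer allocated in
   'initialize_worker_for_sim' above; the meaningful entries are the first n_from * (nrows - n_from). */
void example_group_distances(ExtIsoForest &model_ext, double X_col_major[],
                             size_t nrows, size_t n_from, int nthreads)
{
    std::vector<double> rmat(nrows * n_from, 0.);   /* must start zeroed */
    calc_similarity(X_col_major, NULL,              /* dense numeric data only */
                    NULL, NULL, NULL,               /* no sparse (CSC) inputs */
                    nrows, nthreads,
                    true, true,                     /* assume_full_distr, standardize_dist */
                    NULL, &model_ext,               /* extended (multi-variable) model */
                    NULL, rmat.data(), n_from);     /* group mode: 'rmat' + 'n_from', no 'tmat' */
}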