isotree 0.1.0

@@ -0,0 +1,1068 @@
+ /* Isolation forests and variations thereof, with adjustments for incorporation
+ * of categorical variables and missing values.
+ * Written for the C++11 standard and aimed at being used in R and Python.
+ *
+ * This library is based on the following works:
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation forest."
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation-based anomaly detection."
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+ * "Extended Isolation Forest."
+ * arXiv preprint arXiv:1811.02141 (2018).
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "On detecting clustered anomalies using SCiForest."
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+ * [5] https://sourceforge.net/projects/iforest/
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+ * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
+ *
+ * BSD 2-Clause License
+ * Copyright (c) 2019, David Cortes
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ #include "isotree.hpp"
+
+ bool interrupt_switch;
+
+ /* Fit Isolation Forest model, or a variant of it such as SCiForest
+ *
+ * Parameters:
+ * ===========
+ * - model_outputs (out)
+ * Pointer to already allocated isolation forest model object for single-variable splits.
+ * If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
+ * additional trees through function 'add_tree'.
+ * - model_outputs_ext (out)
+ * Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
+ * Note that if 'ndim' = 1, must use instead the single-variable model object.
+ * If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
+ * additional trees through function 'add_tree'.
+ * - numeric_data[nrows * ncols_numeric]
+ * Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.;
+ * see the layout sketch after this comment).
+ * Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
+ * no sparse numeric data either).
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * - ncols_numeric
+ * Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
+ * - categ_data[nrows * ncols_categ]
+ * Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
+ * Each category should be represented as an integer, and these integers must start at zero and
+ * be in consecutive order - i.e. if category '3' is present, category '2' must also be present
+ * (note that they are not treated as being ordinal, this is just an encoding). Missing values
+ * should be encoded as negative numbers such as (-1).
+ * - ncols_categ
+ * Number of categorical columns in the data.
+ * - ncat[ncols_categ]
+ * Number of categories in each categorical column. E.g. if the highest code for a column is '4',
+ * the number of categories for that column is '5' (zero is one category).
+ * - Xc[nnz]
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed;
+ * see the layout sketch after this comment).
+ * Pass NULL if there are no sparse numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * - Xc_ind[nnz]
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
+ * Pass NULL if there are no sparse numeric columns.
+ * - Xc_indptr[ncols_numeric + 1]
+ * Pointer to column index pointers indicating, at entry [col], where column 'col'
+ * starts, and at entry [col + 1], where column 'col' ends.
+ * Pass NULL if there are no sparse numeric columns.
+ * - ndim
+ * How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
+ * the single-variable model. Note that the model object pointer passed must also
+ * agree with the value passed to 'ndim'.
+ * - ntry
+ * In the split-criterion extended model, how many random hyperplanes to evaluate in
+ * order to decide which one is best to take. Ignored for the single-variable case
+ * and for random splits.
+ * - coef_type
+ * For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
+ * (as proposed in [3]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [4]. Ignored for the
+ * single-variable model.
+ * - sample_weights[nrows]
+ * Weights for the rows when building a tree, either as sampling importances when using
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
+ * in a random sub-sample), or as density measurements (i.e. passing weight '2' is the same as if
+ * the row appeared twice, thus it's less of an outlier) - which of the two is determined
+ * through parameter 'weight_as_sample'.
+ * Pass NULL if the rows all have uniform weights.
+ * - with_replacement
+ * Whether to produce sub-samples with replacement or not.
+ * - weight_as_sample
+ * If passing 'sample_weights', whether to consider those weights as row sampling weights (i.e. the higher
+ * the weights, the more likely the observation will end up included in each tree sub-sample), or as distribution
+ * density weights (i.e. putting a weight of two is the same as if the row appeared twice, thus higher weight makes it
+ * less of an outlier). Note that sampling weight is only used when sub-sampling data for each tree.
+ * - nrows
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
+ * - sample_size
+ * Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
+ * 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
+ * random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
+ * will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
+ * in [5] is 'nrows' here.
+ * - ntrees
+ * Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
+ * author's code in [5] is 10.
+ * - max_depth
+ * Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
+ * - limit_depth
+ * Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
+ * terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
+ * will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
+ * tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass 'false' (with a higher
+ * 'max_depth') if using the model for purposes other than outlier detection.
+ * - penalize_range
+ * Whether to penalize (add +1 to the terminal depth) observations at prediction time that have a value
+ * of the chosen split variable (linear combination in the extended model) that falls outside of a pre-determined
+ * reasonable range in the data being split (given by 2 * range in data and centered around the split point),
+ * as proposed in [4] and implemented in the authors' original code in [5]. Not used in the single-variable model
+ * when splitting by categorical variables.
+ * - standardize_dist
+ * If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
+ * depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
+ * - tmat[nrows * (nrows - 1) / 2]
+ * Array in which to calculate average separation depths or standardized distance metric (see documentation
+ * for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
+ * of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
+ * output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
+ * entry 0 <= i < j < n will be located at position
+ * p(i,j) = (i * (n - (i+1)/2) + j - i - 1)
+ * (see the indexing sketch after this comment).
+ * Can be converted to a dense square matrix through function 'tmat_to_dense'.
+ * - output_depths[nrows]
+ * Array in which to calculate average path depths or standardized outlierness metric (see documentation
+ * for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
+ * of rows. If not NULL, must already be initialized to zeros.
+ * - standardize_depth
+ * If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
+ * a metric in which the more of an outlier an observation is, the closer this standardized metric will be to 1,
+ * with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
+ * the average depth of each row across all trees.
+ * - col_weights[ncols_numeric + ncols_categ]
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
+ * Ignored when picking columns by deterministic criterion.
+ * If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
+ * - weigh_by_kurt
+ * Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
+ * for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
+ * sample, so if not using sub-samples, it's better to pass column weights calculated externally. For
+ * categorical columns, will calculate the expected kurtosis if the column were converted to numerical by
+ * assigning to each category a random number ~ Unif(0, 1).
+ * - prob_pick_by_gain_avg
+ * Probability of making each split in the single-variable model by choosing a column and split point in that
+ * same column as both the column and split point that gives the largest averaged gain (as proposed in [4]) across
+ * all available columns and possible splits in each column. Note that this implies evaluating every single column
+ * in the sample data when this type of split happens, which will potentially make the model fitting much slower,
+ * but has no impact on prediction time. For categorical variables, will take the expected standard deviation that
+ * would be obtained if the column were converted to numerical by assigning to each category a random number ~ Unif(0, 1)
+ * and calculate gain with those assumed standard deviations. For the extended model, this parameter indicates the probability that the
+ * split point in the chosen linear combination of variables will be decided by this averaged gain criterion. Compared to
+ * a pooled average, this tends to result in more cases in which a single observation or very few of them are put into
+ * one branch. Recommended to use sub-samples (parameter 'sample_size') when passing this parameter. When splits are
+ * not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl',
+ * both the column and the split point are decided at random.
+ * Default setting for [1], [2], [3] is zero, and default for [4] is 1. This is the randomization parameter that can
+ * be passed to the author's original code in [5]. Note that, if passing value 1 (100%) with no sub-sampling and using the
+ * single-variable model, every single tree will have the exact same splits.
+ * - prob_split_by_gain_avg
+ * Probability of making each split by selecting a column at random and determining the split point as
+ * that which gives the highest averaged gain. Not supported for the extended model as the splits are on
+ * linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_avg' for more details.
+ * - prob_pick_by_gain_pl
+ * Probability of making each split in the single-variable model by choosing a column and split point in that
+ * same column as both the column and split point that gives the largest pooled gain (as used in decision tree
+ * classifiers such as C4.5 in [7]) across all available columns and possible splits in each column. Note
+ * that this implies evaluating every single column in the sample data when this type of split happens, which
+ * will potentially make the model fitting much slower, but has no impact on prediction time. For categorical
+ * variables, will use Shannon entropy instead (like in [7]). For the extended model, this parameter indicates the probability
+ * that the split point in the chosen linear combination of variables will be decided by this pooled gain
+ * criterion. Compared to a simple average, this tends to result in more evenly-divided splits and more clustered
+ * groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
+ * When used for outlier detection, higher values of this parameter result in models that are able to better flag
+ * outliers in the training data, but generalize poorly to outliers in new data and to values of variables
+ * outside of the ranges from the training data. Passing small 'sample_size' and high values of this parameter will
+ * tend to flag too many outliers. When splits are not made according to any of 'prob_pick_by_gain_avg',
+ * 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl', both the column and the split point
+ * are decided at random. Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
+ * every single tree will have the exact same splits.
+ * - prob_split_by_gain_pl
+ * Probability of making each split by selecting a column at random and determining the split point as
+ * that which gives the highest pooled gain. Not supported for the extended model as the splits are on
+ * linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_pl' for more details.
+ * - min_gain
+ * Minimum gain that a split threshold needs to produce in order to proceed with a split. Only used when the splits
+ * are decided by a gain criterion (either pooled or averaged). If the highest possible gain in the evaluated
+ * splits at a node is below this threshold, that node becomes a terminal node.
+ * - missing_action
+ * How to handle missing data at both fitting and prediction time. Options are a) "Divide" (for the single-variable
+ * model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
+ * the data that went to each branch when fitting the model, b) "Impute", which will assign observations to the
+ * branch with the most observations in the single-variable model, or fill in missing values with the median
+ * of each column of the sample from which the split was made in the extended model (recommended), c) "Fail", which will assume
+ * there are no missing values and will trigger undefined behavior if it encounters any. In the extended model, infinite
+ * values will be treated as missing. Note that passing "Fail" might crash the process if there turn out to be
+ * missing values, but will otherwise produce faster fitting and prediction times along with decreased model object sizes.
+ * Models from [1], [2], [3], [4] correspond to "Fail" here.
+ * - cat_split_type
+ * Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
+ * a single category to a branch and the rest to the other branch. For the extended model, whether to
+ * give each category a coefficient, or only one while the rest get zero.
+ * - new_cat_action
+ * What to do after splitting a categorical feature when new data that reaches that split has categories that
+ * the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
+ * in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
+ * data that went to each branch when fitting the model, and in the extended model will assign
+ * them the median value for that column that was added to the linear combination of features, b) "Smallest", which will
+ * assign all observations with unseen categories in the split to the branch that had fewer observations when
+ * fitting the model, c) "Random", which will assign a branch (coefficient in the extended model) at random for
+ * each category beforehand, even if no observations had that category when fitting the model. Ignored when
+ * passing 'cat_split_type' = 'SingleCateg'.
+ * - all_perm
+ * When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
+ * whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
+ * will sort the categories by their frequency and make a grouping in this sorted order. Note that the
+ * number of combinations evaluated (if 'true') is the factorial of the number of present categories in
+ * a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
+ * category in a separate branch, so not evaluating all permutations (passing 'false') will make it
+ * possible to select other splits that respect the sorted frequency order.
+ * The total number of combinations must be a number that can fit into a 'size_t' variable - for x86-64
+ * systems, this means no column can have more than 20 different categories if using 'all_perm=true',
+ * but note that this is not checked within the function.
+ * Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
+ * - coef_by_prop
+ * In the extended model, whether to sort the randomly-generated coefficients for categories
+ * according to their relative frequency in the tree node. This might provide better results when using
+ * categorical variables with too many categories, but is not recommended, and not reflective of
+ * real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
+ * variables.
+ * - imputer (out)
+ * Pointer to already-allocated imputer object, which can be used to produce missing value imputations
+ * in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
+ * 'missing_action' as missing values inside the model are treated differently and follow their own imputation
+ * or division strategy.
+ * - min_imp_obs
+ * Minimum number of observations with which an imputation value can be produced. Ignored if passing
+ * 'build_imputer' = 'false'.
+ * - depth_imp
+ * How to weigh observations according to their depth when used for imputing missing values. Passing
+ * "Higher" will weigh observations higher the further down the tree (away from the root node) the
+ * terminal node is, while "Lower" will do the opposite, and "Same" will not modify the weights according
+ * to node depth in the tree. Implemented for testing purposes and not recommended to change
+ * from the default. Ignored when not passing 'impute_nodes'.
+ * - weigh_imp_rows
+ * How to weigh node sizes when used for imputing missing values. Passing "Inverse" will weigh
+ * a node inversely proportional to the number of observations that end up there, while "Proportional"
+ * will weigh them heavier the more observations there are, and "Flat" will weigh all nodes the same
+ * in this regard regardless of how many observations end up there. Implemented for testing purposes
+ * and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
+ * - impute_at_fit
+ * Whether to impute missing values in the input data as the model is being built. If passing 'true',
+ * then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
+ * 'categ_data', and 'Xc', will get overwritten with the imputations produced.
+ * - random_seed
+ * Seed that will be used to generate random numbers used by the model.
+ * - nthreads
+ * Number of parallel threads to use. Note that the more threads, the more memory will be
+ * allocated, even if a thread does not end up being used. Ignored when not building with
+ * OpenMP support.
+ *
+ * Returns
+ * =======
+ * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
+ * If the process receives an interrupt signal, will return instead
+ * 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
+ * what these values correspond to, you can use the functions
+ * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
+ * as integers.
+ *
+ * References
+ * ==========
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation forest."
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation-based anomaly detection."
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+ * "Extended Isolation Forest."
+ * arXiv preprint arXiv:1811.02141 (2018).
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "On detecting clustered anomalies using SCiForest."
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+ * [5] https://sourceforge.net/projects/iforest/
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+ * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
+ */
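+
+ /* Layout sketch (illustrative only, not part of the library): how the dense column-major
+ * layout of 'numeric_data' and the CSC arrays 'Xc' + 'Xc_ind' + 'Xc_indptr' described
+ * above line up for a small matrix. The values below are hypothetical example data.
+ *
+ * // 3 rows x 2 numeric columns, stored column-by-column (Fortran order):
+ * double numeric_data[] = {1.0, 2.0, 3.0,    // column 0
+ *                          4.0, 5.0, 6.0};   // column 1
+ * // entry (row, col) sits at numeric_data[row + col * nrows]
+ *
+ * // The same matrix as CSC, keeping only non-zero entries (all six here):
+ * double    Xc[]        = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};  // values
+ * sparse_ix Xc_ind[]    = {0, 1, 2, 0, 1, 2};              // row of each value
+ * sparse_ix Xc_indptr[] = {0, 3, 6};  // column 'col' spans Xc[Xc_indptr[col]..Xc_indptr[col+1])
+ */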
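+
+ /* Indexing sketch for 'tmat' (hypothetical helper, not part of the library): computes the
+ * packed upper-triangular position given in the 'tmat' documentation above, written with
+ * the multiplication first so that integer division stays exact.
+ *
+ * static inline size_t tmat_pos(size_t i, size_t j, size_t n)  // requires 0 <= i < j < n
+ * {
+ *     // equals i * (n - (i+1)/2) + j - i - 1 in exact arithmetic
+ *     return (i * (2 * n - i - 1)) / 2 + j - i - 1;
+ * }
+ *
+ * For n = 4 this enumerates (0,1)->0, (0,2)->1, (0,3)->2, (1,2)->3, (1,3)->4, (2,3)->5,
+ * i.e. exactly n * (n - 1) / 2 = 6 entries.
+ */
+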
+ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+                 double numeric_data[], size_t ncols_numeric,
+                 int categ_data[], size_t ncols_categ, int ncat[],
+                 double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+                 size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
+                 double sample_weights[], bool with_replacement, bool weight_as_sample,
+                 size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
+                 bool limit_depth, bool penalize_range,
+                 bool standardize_dist, double tmat[],
+                 double output_depths[], bool standardize_depth,
+                 double col_weights[], bool weigh_by_kurt,
+                 double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
+                 double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
+                 double min_gain, MissingAction missing_action,
+                 CategSplit cat_split_type, NewCategAction new_cat_action,
+                 bool all_perm, Imputer *imputer, size_t min_imp_obs,
+                 UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
+                 uint64_t random_seed, int nthreads)
+ {
+     /* calculate maximum number of categories to use later */
+     int max_categ = 0;
+     for (size_t col = 0; col < ncols_categ; col++)
+         max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
+
+     bool calc_dist = tmat != NULL;
+
+     if (calc_dist || sample_size == 0)
+         sample_size = nrows;
+
+     /* put data in structs to shorten function calls */
+     InputData input_data     = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
+                                 nrows, ncols_numeric + ncols_categ, sample_weights,
+                                 weight_as_sample, col_weights,
+                                 Xc, Xc_ind, Xc_indptr,
+                                 0, 0, std::vector<double>(),
+                                 std::vector<char>(), 0};
+     ModelParams model_params = {with_replacement, sample_size, ntrees,
+                                 limit_depth? log2ceil(sample_size) : max_depth? max_depth : (sample_size - 1),
+                                 penalize_range, random_seed, weigh_by_kurt,
+                                 prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
+                                 prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
+                                 min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
+                                 (model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
+                                 coef_type, coef_by_prop, calc_dist, (bool)(output_depths != NULL), impute_at_fit,
+                                 depth_imp, weigh_imp_rows, min_imp_obs};
+
+     /* if using weights as sampling probability, build a binary tree for faster sampling */
+     if (input_data.weight_as_sample && input_data.sample_weights != NULL)
+     {
+         build_btree_sampler(input_data.btree_weights_init, input_data.sample_weights,
+                             input_data.nrows, input_data.log2_n, input_data.btree_offset);
+     }
+
+     /* if imputing missing values on-the-fly, need to determine which are missing */
+     std::vector<ImputedData> impute_vec;
+     std::unordered_map<size_t, ImputedData> impute_map;
+     if (model_params.impute_at_fit)
+         check_for_missing(input_data, impute_vec, impute_map, nthreads);
+
+     /* store model data */
+     if (model_outputs != NULL)
+     {
+         model_outputs->trees.resize(ntrees);
+         model_outputs->trees.shrink_to_fit();
+         model_outputs->new_cat_action = new_cat_action;
+         model_outputs->cat_split_type = cat_split_type;
+         model_outputs->missing_action = missing_action;
+         model_outputs->exp_avg_depth = expected_avg_depth(sample_size);
+         model_outputs->exp_avg_sep = expected_separation_depth(model_params.sample_size);
+         model_outputs->orig_sample_size = input_data.nrows;
+     }
+
+     else
+     {
+         model_outputs_ext->hplanes.resize(ntrees);
+         model_outputs_ext->hplanes.shrink_to_fit();
+         model_outputs_ext->new_cat_action = new_cat_action;
+         model_outputs_ext->cat_split_type = cat_split_type;
+         model_outputs_ext->missing_action = missing_action;
+         model_outputs_ext->exp_avg_depth = expected_avg_depth(sample_size);
+         model_outputs_ext->exp_avg_sep = expected_separation_depth(model_params.sample_size);
+         model_outputs_ext->orig_sample_size = input_data.nrows;
+     }
+
+     if (imputer != NULL)
+         initialize_imputer(*imputer, input_data, ntrees, nthreads);
+
+     /* initialize thread-private memory */
+     if ((size_t)nthreads > ntrees)
+         nthreads = (int)ntrees;
+     #ifdef _OPENMP
+         std::vector<WorkerMemory> worker_memory(nthreads);
+     #else
+         std::vector<WorkerMemory> worker_memory(1);
+     #endif
+
+     /* Global variable that determines if the procedure receives a stop signal */
+     interrupt_switch = false;
+
+     /* grow trees */
+     #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
+     for (size_t_for tree = 0; tree < ntrees; tree++)
+     {
+         if (interrupt_switch)
+             continue; /* Cannot break with OpenMP==2.0 (MSVC) */
+
+         if (
+             model_params.impute_at_fit &&
+             input_data.n_missing &&
+             !worker_memory[omp_get_thread_num()].impute_vec.size() &&
+             !worker_memory[omp_get_thread_num()].impute_map.size()
+             )
+         {
+             #ifdef _OPENMP
+             if (nthreads > 1)
+             {
+                 worker_memory[omp_get_thread_num()].impute_vec = impute_vec;
+                 worker_memory[omp_get_thread_num()].impute_map = impute_map;
+             }
+
+             else
+             #endif
+             {
+                 worker_memory[0].impute_vec = std::move(impute_vec);
+                 worker_memory[0].impute_map = std::move(impute_map);
+             }
+         }
+
+         fit_itree((model_outputs != NULL)? &model_outputs->trees[tree] : NULL,
+                   (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : NULL,
+                   worker_memory[omp_get_thread_num()],
+                   input_data,
+                   model_params,
+                   (imputer != NULL)? &(imputer->imputer_tree[tree]) : NULL,
+                   tree);
+
+         if ((model_outputs != NULL))
+             model_outputs->trees[tree].shrink_to_fit();
+         else
+             model_outputs_ext->hplanes[tree].shrink_to_fit();
+
+         signal(SIGINT, set_interrup_global_variable);
+     }
+
+     /* check if the procedure got interrupted */
+     if (interrupt_switch) return EXIT_FAILURE;
+     interrupt_switch = false;
+
+     if ((model_outputs != NULL))
+         model_outputs->trees.shrink_to_fit();
+     else
+         model_outputs_ext->hplanes.shrink_to_fit();
+
+     /* if calculating similarity/distance, now need to reduce and average */
+     if (calc_dist)
+         gather_sim_result(NULL, &worker_memory,
+                           NULL, &input_data,
+                           model_outputs, model_outputs_ext,
+                           tmat, NULL, 0,
+                           model_params.ntrees, false,
+                           standardize_dist, nthreads);
+
+     /* same for depths */
+     if (output_depths != NULL)
+     {
+         #ifdef _OPENMP
+         if (nthreads > 1)
+         {
+             for (WorkerMemory &w : worker_memory)
+             {
+                 if (w.row_depths.size())
+                 {
+                     #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data, output_depths, w, worker_memory)
+                     for (size_t_for row = 0; row < input_data.nrows; row++)
+                         output_depths[row] += w.row_depths[row];
+                 }
+             }
+         }
+         else
+         #endif
+         {
+             std::copy(worker_memory[0].row_depths.begin(), worker_memory[0].row_depths.end(), output_depths);
+         }
+
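+         /* Note: the standardization below follows [1]:
+            score = 2^(-average_depth / expected_avg_depth),
+            so outliers approach 1 and average observations fall around 0.5. */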
+         if (standardize_depth)
+         {
+             double depth_divisor = (double)ntrees * ((model_outputs != NULL)?
+                                                      model_outputs->exp_avg_depth : model_outputs_ext->exp_avg_depth);
+             for (size_t_for row = 0; row < nrows; row++)
+                 output_depths[row] = exp2( - output_depths[row] / depth_divisor );
+         }
+
+         else
+         {
+             double ntrees_dbl = (double) ntrees;
+             for (size_t_for row = 0; row < nrows; row++)
+                 output_depths[row] /= ntrees_dbl;
+         }
+     }
+
+     /* if imputing missing values, now need to reduce and write final values */
+     if (model_params.impute_at_fit)
+     {
+         #ifdef _OPENMP
+         if (nthreads > 1)
+         {
+             for (WorkerMemory &w : worker_memory)
+                 combine_tree_imputations(w, impute_vec, impute_map, input_data.has_missing, nthreads);
+         }
+
+         else
+         #endif
+         {
+             impute_vec = std::move(worker_memory[0].impute_vec);
+             impute_map = std::move(worker_memory[0].impute_map);
+         }
+
+         apply_imputation_results(impute_vec, impute_map, *imputer, input_data, nthreads);
+     }
+
+     return EXIT_SUCCESS;
+ }
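+
+ /* Minimal usage sketch (illustrative only): fitting a single-variable model on dense
+ * numeric data. 'X', 'nrows', 'ncols_numeric', and the enum values chosen here are
+ * assumptions made for the example, not prescribed defaults of the library.
+ *
+ * std::vector<double> X(nrows * ncols_numeric);  // column-major numeric data
+ * IsoForest model;                               // single-variable model => ndim = 1
+ * int ret = fit_iforest(&model, NULL,
+ *                       X.data(), ncols_numeric,
+ *                       NULL, 0, NULL,           // no categorical data
+ *                       NULL, NULL, NULL,        // no sparse data
+ *                       1, 3, Normal, false,     // ndim, ntry, coef_type, coef_by_prop
+ *                       NULL, false, true,       // uniform row weights
+ *                       nrows, 256, 100, 0,      // sample_size = 256, ntrees = 100
+ *                       true, false,             // limit_depth, penalize_range
+ *                       false, NULL,             // no distance output
+ *                       NULL, true,              // no depth output
+ *                       NULL, false,             // uniform column weights
+ *                       0., 0., 0., 0.,          // fully random splits
+ *                       0., Fail,                // min_gain, missing_action
+ *                       SubSet, Weighted,        // cat_split_type, new_cat_action
+ *                       false, NULL, 3,          // all_perm, imputer, min_imp_obs
+ *                       Higher, Inverse, false,  // imputation knobs, impute_at_fit
+ *                       1234, 1);                // random_seed, nthreads
+ */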
+
+
+ /* Add additional trees to an already-fitted isolation forest model
+ *
+ * Parameters
+ * ==========
+ * - model_outputs
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
+ * if the trees are to be added to an extended model. Can only pass one of
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
+ * so it cannot be run in parallel for the same model object.
+ * - model_outputs_ext
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
+ * if the trees are to be added to a single-variable model. Can only pass one of
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
+ * so it cannot be run in parallel for the same model object.
+ * - numeric_data
+ * Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no dense numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
+ * of columns, either as dense or as sparse arrays.
+ * - ncols_numeric
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - categ_data
+ * Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no categorical columns.
+ * Each category should be represented as an integer, and these integers must start at zero and
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
+ * must be the same as was used in the data to which the model was fit.
+ * If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
+ * of columns and the same category encoding.
+ * - ncols_categ
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - ncat
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - Xc[nnz]
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
+ * Pass NULL if there are no sparse numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * - Xc_ind[nnz]
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
+ * Pass NULL if there are no sparse numeric columns.
+ * - Xc_indptr[ncols_numeric + 1]
+ * Pointer to column index pointers indicating, at entry [col], where column 'col'
+ * starts, and at entry [col + 1], where column 'col' ends.
+ * Pass NULL if there are no sparse numeric columns.
+ * - ndim
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - ntry
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - coef_type
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - sample_weights
+ * Weights for the rows when adding this tree, either as sampling importances when using
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
+ * in a random sub-sample), or as density measurements (i.e. passing weight '2' is the same as if
+ * the row appeared twice, thus it's less of an outlier) - which of the two is determined
+ * through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
+ * Pass NULL if the rows all have uniform weights.
+ * - nrows
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
+ * - max_depth
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - limit_depth
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - penalize_range
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - col_weights
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
+ * Ignored when picking columns by deterministic criterion.
+ * If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
+ * - weigh_by_kurt
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_by_gain_avg
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_split_by_gain_avg
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_by_gain_pl
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_split_by_gain_pl
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - min_gain
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - missing_action
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - cat_split_type
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - new_cat_action
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - depth_imp
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - weigh_imp_rows
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - all_perm
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - coef_by_prop
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - impute_nodes
+ * Pointer to already-allocated imputation nodes for the tree that will be built. Note that the number of
+ * entries in the imputation object must match the number of fitted trees when it is used. Pass
+ * NULL if no imputation node is required.
+ * - min_imp_obs
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - random_seed
+ * Seed that will be used to generate random numbers used by the model.
+ */
+ int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+              double numeric_data[], size_t ncols_numeric,
+              int categ_data[], size_t ncols_categ, int ncat[],
+              double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+              size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
+              double sample_weights[], size_t nrows, size_t max_depth,
+              bool limit_depth, bool penalize_range,
+              double col_weights[], bool weigh_by_kurt,
+              double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
+              double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
+              double min_gain, MissingAction missing_action,
+              CategSplit cat_split_type, NewCategAction new_cat_action,
+              UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
+              bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
+              uint64_t random_seed)
+ {
+     int max_categ = 0;
+     for (size_t col = 0; col < ncols_categ; col++)
+         max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
+
+     InputData input_data     = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
+                                 nrows, ncols_numeric + ncols_categ, sample_weights,
+                                 false, col_weights,
+                                 Xc, Xc_ind, Xc_indptr,
+                                 0, 0, std::vector<double>(),
+                                 std::vector<char>(), 0};
+     ModelParams model_params = {false, nrows, (size_t)1,
+                                 max_depth? max_depth : (nrows - 1),
+                                 penalize_range, random_seed, weigh_by_kurt,
+                                 prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
+                                 prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
+                                 min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
+                                 (model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
+                                 coef_type, coef_by_prop, false, false, false, depth_imp, weigh_imp_rows, min_imp_obs};
+
+     std::unique_ptr<WorkerMemory> workspace = std::unique_ptr<WorkerMemory>(new WorkerMemory);
+
+     size_t last_tree;
+     if (model_outputs != NULL)
+     {
+         last_tree = model_outputs->trees.size();
+         model_outputs->trees.emplace_back();
+     }
+
+     else
+     {
+         last_tree = model_outputs_ext->hplanes.size();
+         model_outputs_ext->hplanes.emplace_back();
+     }
+
+     fit_itree((model_outputs != NULL)? &model_outputs->trees.back() : NULL,
+               (model_outputs_ext != NULL)? &model_outputs_ext->hplanes.back() : NULL,
+               *workspace,
+               input_data,
+               model_params,
+               impute_nodes,
+               last_tree);
+
+     if ((model_outputs != NULL))
+         model_outputs->trees.back().shrink_to_fit();
+     else
+         model_outputs_ext->hplanes.back().shrink_to_fit();
+
+     return EXIT_SUCCESS;
+ }
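+
+ /* Illustrative sketch continuing the hypothetical fit_iforest example above: growing
+ * one additional tree on the same data, keeping the parameters that cannot change.
+ *
+ * add_tree(&model, NULL,
+ *          X.data(), ncols_numeric,
+ *          NULL, 0, NULL,
+ *          NULL, NULL, NULL,
+ *          1, 3, Normal, false,
+ *          NULL, nrows, 0,          // uniform weights, max_depth = 0
+ *          true, false,
+ *          NULL, false,
+ *          0., 0., 0., 0.,
+ *          0., Fail,
+ *          SubSet, Weighted,
+ *          Higher, Inverse,
+ *          false, NULL, 3,          // all_perm, impute_nodes, min_imp_obs
+ *          4321);                   // random_seed
+ */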
+
+ void fit_itree(std::vector<IsoTree>    *tree_root,
+                std::vector<IsoHPlane>  *hplane_root,
+                WorkerMemory            &workspace,
+                InputData               &input_data,
+                ModelParams             &model_params,
+                std::vector<ImputeNode> *impute_nodes,
+                size_t                   tree_num)
+ {
+     /* initialize array for depths if called for */
+     if (!workspace.ix_arr.size() && model_params.calc_depth)
+         workspace.row_depths.resize(input_data.nrows, 0);
+
+     /* choose random sample of rows */
+     if (!workspace.ix_arr.size()) workspace.ix_arr.resize(model_params.sample_size);
+     if (input_data.log2_n > 0)
+         workspace.btree_weights.assign(input_data.btree_weights_init.begin(),
+                                        input_data.btree_weights_init.end());
+     workspace.rnd_generator.seed(model_params.random_seed + tree_num);
+     if (input_data.col_weights != NULL)
+         workspace.col_sampler = std::discrete_distribution<size_t>(input_data.col_weights,
+                                                                    input_data.col_weights + input_data.ncols_numeric + input_data.ncols_categ);
+     workspace.runif = std::uniform_int_distribution<size_t>(0, input_data.ncols_tot - 1);
+     workspace.rbin  = std::uniform_real_distribution<double>(0, 1);
+     sample_random_rows(workspace.ix_arr, input_data.nrows, model_params.with_replacement,
+                        workspace.rnd_generator, workspace.ix_all,
+                        (input_data.weight_as_sample)? input_data.sample_weights : NULL,
+                        workspace.btree_weights, input_data.log2_n, input_data.btree_offset,
+                        workspace.is_repeated);
+     workspace.st  = 0;
+     workspace.end = model_params.sample_size - 1;
+     if (!workspace.cols_possible.size())
+         workspace.cols_possible.resize(input_data.ncols_tot, true);
+     else
+         workspace.cols_possible.assign(workspace.cols_possible.size(), true);
+
+     /* set expected tree size and add root node */
+     {
+         size_t exp_nodes = 2 * model_params.sample_size;
+         if (model_params.sample_size >= (SIZE_MAX / (size_t)2))
+             exp_nodes = SIZE_MAX;
+         if (model_params.max_depth <= (size_t)30)
+             exp_nodes = std::min(exp_nodes, pow2(model_params.max_depth));
+         if (tree_root != NULL)
+         {
+             tree_root->reserve(exp_nodes);
+             tree_root->emplace_back();
+         }
+         else
+         {
+             hplane_root->reserve(exp_nodes);
+             hplane_root->emplace_back();
+         }
+         if (impute_nodes != NULL)
+         {
+             impute_nodes->reserve(exp_nodes);
+             impute_nodes->emplace_back((size_t) 0);
+         }
+     }
+
+     /* initialize array with candidate categories if not already done */
+     if (!workspace.categs.size())
+         workspace.categs.resize(input_data.max_categ);
+
+     /* for the extended model, initialize extra vectors and objects */
+     if (hplane_root != NULL && !workspace.comb_val.size())
+     {
+         workspace.coef_norm = std::normal_distribution<double>(0, 1);
+         if (model_params.coef_type == Uniform)
+             workspace.coef_unif = std::uniform_real_distribution<double>(-1, 1);
+
+         workspace.cols_shuffled.resize(input_data.ncols_tot);
+         workspace.comb_val.resize(model_params.sample_size);
+         workspace.col_take.resize(model_params.ndim);
+         workspace.col_take_type.resize(model_params.ndim);
+
+         if (input_data.ncols_numeric)
+         {
+             workspace.ext_offset.resize(input_data.ncols_tot);
+             workspace.ext_coef.resize(input_data.ncols_tot);
+             workspace.ext_mean.resize(input_data.ncols_tot);
+         }
+
+         if (input_data.ncols_categ)
+         {
+             workspace.ext_fill_new.resize(input_data.max_categ);
+             switch(model_params.cat_split_type)
+             {
+                 case SingleCateg:
+                 {
+                     workspace.chosen_cat.resize(input_data.max_categ);
+                     break;
+                 }
+
+                 case SubSet:
+                 {
+                     workspace.ext_cat_coef.resize(input_data.ncols_tot);
+                     for (std::vector<double> &v : workspace.ext_cat_coef)
+                         v.resize(input_data.max_categ);
+                     break;
+                 }
+             }
+         }
+
+         workspace.ext_fill_val.resize(input_data.ncols_tot);
+
+     }
+
+     /* if the data contains missing values, also need to set an array of weights,
+        which will be modified during iterations when there are NAs.
+        If there are already density weights, need to standardize them to sum up to
+        the sample size here */
+     long double weight_scaling = 0;
+     if (model_params.missing_action == Divide || (input_data.sample_weights != NULL && !input_data.weight_as_sample))
+     {
+         workspace.weights_map.clear();
+
+         /* if the sub-sample size is small relative to the full sample size, use a mapping */
+         if (model_params.sample_size < input_data.nrows / 4)
+         {
+             if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
+             {
+                 for (const size_t ix : workspace.ix_arr)
+                 {
+                     weight_scaling += input_data.sample_weights[ix];
+                     workspace.weights_map[ix] = input_data.sample_weights[ix];
+                 }
+                 weight_scaling = (long double)model_params.sample_size / weight_scaling;
+                 for (auto &w : workspace.weights_map)
+                     w.second *= weight_scaling;
+
+             }
+
+             else
+             {
+                 for (const size_t ix : workspace.ix_arr)
+                     workspace.weights_map[ix] = 1;
+             }
+
+         }
+
+         /* if the sub-sample size is large, fill a full array matching to the sample size */
+         else
+         {
+             if (!workspace.weights_arr.size())
+             {
+                 if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
+                 {
+                     workspace.weights_arr.assign(input_data.sample_weights, input_data.sample_weights + input_data.nrows);
+                     weight_scaling = std::accumulate(workspace.ix_arr.begin(),
+                                                      workspace.ix_arr.end(),
+                                                      (long double)0,
+                                                      [&input_data](const long double a, const size_t b){return a + (long double)input_data.sample_weights[b];}
+                                                      );
+                     weight_scaling = (long double)model_params.sample_size / weight_scaling;
+                     for (double &w : workspace.weights_arr)
+                         w *= weight_scaling;
+                 }
+
+                 else
+                 {
+                     workspace.weights_arr.resize(input_data.nrows, (double)1);
+                 }
+
+             }
+
+             else
+             {
+                 if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
+                 {
+                     for (const size_t ix : workspace.ix_arr)
+                     {
+                         weight_scaling += input_data.sample_weights[ix];
+                         workspace.weights_arr[ix] = input_data.sample_weights[ix];
+                     }
+                     weight_scaling = (long double)model_params.sample_size / weight_scaling;
+                     for (double &w : workspace.weights_arr)
+                         w *= weight_scaling;
+
+                 }
+
+                 else
+                 {
+                     /* Note: while not all of them need to be overwritten, this is faster
+                        (sub-sample size was already determined to be at least 1/4 of the sample size) */
+                     std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), (double)1);
+                 }
+             }
+         }
+     }
+
+     /* if producing distance/similarity, also need to initialize the triangular matrix */
+     if (model_params.calc_dist && !workspace.tmat_sep.size())
+         workspace.tmat_sep.resize((input_data.nrows * (input_data.nrows - 1)) / 2, 0);
+
+     /* make space for buffers if not already allocated */
+     if (
+         (model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
+          model_params.prob_split_by_gain_pl  || model_params.prob_pick_by_gain_pl  ||
+          model_params.weigh_by_kurt || hplane_root != NULL)
+         &&
+         (!workspace.buffer_dbl.size() && !workspace.buffer_szt.size() && !workspace.buffer_chr.size())
+         )
+     {
+         size_t min_size_dbl = 0;
+         size_t min_size_szt = 0;
+         size_t min_size_chr = 0;
+
+         bool gain = model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
+                     model_params.prob_split_by_gain_pl  || model_params.prob_pick_by_gain_pl;
+
+         if (input_data.ncols_categ)
+         {
+             min_size_szt = 2 * input_data.max_categ;
+             min_size_dbl = input_data.max_categ + 1;
+             if (gain && model_params.cat_split_type == SubSet)
+                 min_size_chr = input_data.max_categ;
+         }
+
+         if (input_data.Xc != NULL && gain)
+         {
+             min_size_szt = std::max(min_size_szt, model_params.sample_size);
+             min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
+         }
+
+         /* for the extended model */
+         if (hplane_root != NULL)
+         {
+             min_size_dbl = std::max(min_size_dbl, pow2(log2ceil(input_data.ncols_tot) + 1));
+             if (model_params.missing_action != Fail)
+             {
+                 min_size_szt = std::max(min_size_szt, model_params.sample_size);
+                 min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
+             }
+
+             if (input_data.ncols_categ && model_params.cat_split_type == SubSet)
+             {
+                 min_size_szt = std::max(min_size_szt, 2 * (size_t)input_data.max_categ + 1);
+                 min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
+             }
+
+             if (model_params.weigh_by_kurt)
+                 min_size_szt = std::max(min_size_szt, input_data.ncols_tot);
+         }
+
+         /* now resize */
+         if (workspace.buffer_dbl.size() < min_size_dbl)
+             workspace.buffer_dbl.resize(min_size_dbl);
+
+         if (workspace.buffer_szt.size() < min_size_szt)
+             workspace.buffer_szt.resize(min_size_szt);
+
+         if (workspace.buffer_chr.size() < min_size_chr)
+             workspace.buffer_chr.resize(min_size_chr);
+
+         /* for guided column choice, need to also remember the best split so far */
+         if (
+             model_params.cat_split_type == SubSet &&
+             (
+                 model_params.prob_pick_by_gain_avg ||
+                 model_params.prob_pick_by_gain_pl
+             )
+             )
+         {
+             workspace.this_split_categ.resize(input_data.max_categ);
+         }
+
+     }
+
+     /* weigh columns by kurtosis in the sample if required */
+     if (model_params.weigh_by_kurt)
+     {
+         std::vector<double> kurt_weights(input_data.ncols_numeric + input_data.ncols_categ);
+
+         if (input_data.Xc == NULL)
+         {
+             for (size_t col = 0; col < input_data.ncols_numeric; col++)
+                 kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
+                                                   input_data.numeric_data + col * input_data.nrows,
+                                                   model_params.missing_action);
+         }
+
+         else
+         {
+             std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
+             for (size_t col = 0; col < input_data.ncols_numeric; col++)
+                 kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end, col,
+                                                   input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                                                   model_params.missing_action);
+         }
+
+         for (size_t col = 0; col < input_data.ncols_categ; col++)
+             kurt_weights[col + input_data.ncols_numeric] =
+                 calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
+                               input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
+                               workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
+                               model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
+
+         for (size_t col = 0; col < input_data.ncols_tot; col++)
+             if (kurt_weights[col] <= 0 || is_na_or_inf(kurt_weights[col]))
+                 workspace.cols_possible[col] = false;
+
+         workspace.col_sampler = std::discrete_distribution<size_t>(kurt_weights.begin(), kurt_weights.end());
+     }
+
+     if (tree_root != NULL)
+         split_itree_recursive(*tree_root,
+                               workspace,
+                               input_data,
+                               model_params,
+                               impute_nodes,
+                               0);
+     else
+         split_hplane_recursive(*hplane_root,
+                                workspace,
+                                input_data,
+                                model_params,
+                                impute_nodes,
+                                0);
+
+     /* if producing imputation structs, only need to keep the ones for terminal nodes */
+     if (impute_nodes != NULL)
+         drop_nonterminal_imp_node(*impute_nodes, tree_root, hplane_root);
+ }