isotree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1068 @@
+ /* Isolation forests and variations thereof, with adjustments for incorporation
+ * of categorical variables and missing values.
+ * Written for the C++11 standard and aimed at being used in R and Python.
+ *
+ * This library is based on the following works:
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation forest."
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation-based anomaly detection."
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+ * "Extended Isolation Forest."
+ * arXiv preprint arXiv:1811.02141 (2018).
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "On detecting clustered anomalies using SCiForest."
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+ * [5] https://sourceforge.net/projects/iforest/
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+ * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
+ *
+ * BSD 2-Clause License
+ * Copyright (c) 2019, David Cortes
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ #include "isotree.hpp"
+
+ bool interrupt_switch;
+
+ /* Fit Isolation Forest model, or variant of it such as SCiForest
+ *
+ * Parameters:
+ * ===========
+ * - model_outputs (out)
+ * Pointer to already allocated isolation forest model object for single-variable splits.
+ * If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
+ * additional trees through function 'add_tree'.
+ * - model_outputs_ext (out)
+ * Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
+ * Note that if 'ndim' = 1, must use instead the single-variable model object.
+ * If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
+ * additional trees through function 'add_tree'.
+ * - numeric_data[nrows * ncols_numeric]
+ * Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
+ * no sparse numeric data either).
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * - ncols_numeric
+ * Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
+ * - categ_data[nrows * ncols_categ]
+ * Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
+ * Each category should be represented as an integer, and these integers must start at zero and
+ * be in consecutive order - i.e. if category '3' is present, category '2' must also be present
+ * (note that they are not treated as being ordinal, this is just an encoding). Missing values
+ * should be encoded as negative numbers such as (-1).
+ * - ncols_categ
+ * Number of categorical columns in the data.
+ * - ncat[ncols_categ]
+ * Number of categories in each categorical column. E.g. if the highest code for a column is '4',
+ * the number of categories for that column is '5' (zero is one category).
+ * - Xc[nnz]
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
+ * Pass NULL if there are no sparse numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * - Xc_ind[nnz]
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
+ * Pass NULL if there are no sparse numeric columns.
+ * - Xc_indptr[ncols_numeric + 1]
+ * Pointer to column index pointers that tell, at entry [col], where column 'col'
+ * starts, and at entry [col + 1], where it ends.
+ * Pass NULL if there are no sparse numeric columns.
+ * - ndim
+ * How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
+ * the single-variable model. Note that the model object pointer passed must also
+ * agree with the value passed to 'ndim'.
+ * - ntry
+ * In the split-criterion extended model, how many random hyperplanes to evaluate in
+ * order to decide which one is best to take. Ignored for the single-variable case
+ * and for random splits.
+ * - coef_type
+ * For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
+ * (as proposed in [3]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [4]. Ignored for the
+ * single-variable model.
+ * - sample_weights[nrows]
+ * Weights for the rows when building a tree, either as sampling importances when using
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
+ * in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
+ * the row appeared twice, thus it's less of an outlier) - how they are interpreted is determined
+ * through parameter 'weight_as_sample'.
+ * Pass NULL if the rows all have uniform weights.
+ * - with_replacement
+ * Whether to produce sub-samples with replacement or not.
+ * - weight_as_sample
+ * If passing 'sample_weights', whether to consider those weights as row sampling weights (i.e. the higher
+ * the weights, the more likely the observation will end up included in each tree sub-sample), or as distribution
+ * density weights (i.e. putting a weight of two is the same as if the row appeared twice, thus higher weight makes it
+ * less of an outlier). Note that sampling weight is only used when sub-sampling data for each tree.
+ * - nrows
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
+ * - sample_size
+ * Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
+ * 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
+ * random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
+ * will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
+ * in [5] is 'nrows' here.
+ * - ntrees
+ * Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
+ * author's code in [5] is 10.
+ * - max_depth
+ * Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
+ * - limit_depth
+ * Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
+ * terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
+ * will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
+ * tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass higher values if
+ * using the model for purposes other than outlier detection.
+ * - penalize_range
+ * Whether to penalize (add +1 to the terminal depth) observations at prediction time that have a value
+ * of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
+ * reasonable range in the data being split (given by 2 * range in data and centered around the split point),
+ * as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
+ * when splitting by categorical variables.
+ * - standardize_dist
+ * If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
+ * depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
+ * - tmat[nrows * (nrows - 1) / 2]
+ * Array in which to calculate average separation depths or standardized distance metric (see documentation
+ * for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
+ * of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
+ * output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
+ * entry 0 <= i < j < n will be located at position
+ * p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
+ * Can be converted to a dense square matrix through function 'tmat_to_dense'.
+ * - output_depths[nrows]
+ * Array in which to calculate average path depths or standardized outlierness metric (see documentation
+ * for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
+ * of rows. If not NULL, must already be initialized to zeros.
+ * - standardize_depth
+ * If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
+ * a metric in which the more outlying an observation is, the closer this standardized metric will be to 1,
+ * with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
+ * the average depth of each row across all trees.
+ * - col_weights[ncols_numeric + ncols_categ]
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
+ * Ignored when picking columns by deterministic criterion.
+ * If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
+ * - weigh_by_kurt
+ * Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
+ * for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
+ * sample, so if not using sub-samples, it's better to pass column weights calculated externally. For
+ * categorical columns, will calculate expected kurtosis if the column was converted to numerical by
+ * assigning to each category a random number ~ Unif(0, 1).
+ * - prob_pick_by_gain_avg
+ * Probability of making each split in the single-variable model by choosing a column and split point in that
+ * same column as both the column and split point that gives the largest averaged gain (as proposed in [4]) across
+ * all available columns and possible splits in each column. Note that this implies evaluating every single column
+ * in the sample data when this type of split happens, which will potentially make the model fitting much slower,
+ * but has no impact on prediction time. For categorical variables, will take the expected standard deviation that
+ * would be gotten if the column were converted to numerical by assigning to each category a random number ~ Unif(0, 1)
+ * and calculate gain with those assumed standard deviations. For the extended model, this parameter indicates the probability that the
+ * split point in the chosen linear combination of variables will be decided by this averaged gain criterion. Compared to
+ * a pooled average, this tends to result in more cases in which a single observation or very few of them are put into
+ * one branch. Recommended to use sub-samples (parameter 'sample_size') when passing this parameter. When splits are
+ * not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl',
+ * both the column and the split point are decided at random.
+ * Default setting for [1], [2], [3] is zero, and default for [4] is 1. This is the randomization parameter that can
+ * be passed to the author's original code in [5]. Note that, if passing value 1 (100%) with no sub-sampling and using the
+ * single-variable model, every single tree will have the exact same splits.
+ * - prob_split_by_gain_avg
+ * Probability of making each split by selecting a column at random and determining the split point as
+ * that which gives the highest averaged gain. Not supported for the extended model as the splits are on
+ * linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_avg' for more details.
+ * - prob_pick_by_gain_pl
+ * Probability of making each split in the single-variable model by choosing a column and split point in that
+ * same column as both the column and split point that gives the largest pooled gain (as used in decision tree
+ * classifiers such as C4.5 in [7]) across all available columns and possible splits in each column. Note
+ * that this implies evaluating every single column in the sample data when this type of split happens, which
+ * will potentially make the model fitting much slower, but has no impact on prediction time. For categorical
+ * variables, will use Shannon entropy instead (like in [7]). For the extended model, this parameter indicates the probability
+ * that the split point in the chosen linear combination of variables will be decided by this pooled gain
+ * criterion. Compared to a simple average, this tends to result in more evenly-divided splits and more clustered
+ * groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
+ * When used for outlier detection, higher values of this parameter result in models that are able to better flag
+ * outliers in the training data, but generalize poorly to outliers in new data and to values of variables
+ * outside of the ranges from the training data. Passing small 'sample_size' and high values of this parameter will
+ * tend to flag too many outliers. When splits are not made according to any of 'prob_pick_by_gain_avg',
+ * 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl', both the column and the split point
+ * are decided at random. Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
+ * every single tree will have the exact same splits.
+ * - prob_split_by_gain_pl
+ * Probability of making each split by selecting a column at random and determining the split point as
+ * that which gives the highest pooled gain. Not supported for the extended model as the splits are on
+ * linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_pl' for more details.
+ * - min_gain
+ * Minimum gain that a split threshold needs to produce in order to proceed with a split. Only used when the splits
+ * are decided by a gain criterion (either pooled or averaged). If the highest possible gain in the evaluated
+ * splits at a node is below this threshold, that node becomes a terminal node.
+ * - missing_action
+ * How to handle missing data at both fitting and prediction time. Options are a) "Divide" (for the single-variable
+ * model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
+ * the data that went to each branch when fitting the model, b) "Impute", which will assign observations to the
+ * branch with the most observations in the single-variable model, or fill in missing values with the median
+ * of each column of the sample from which the split was made in the extended model (recommended), c) "Fail" which will assume
+ * there are no missing values and will trigger undefined behavior if it encounters any. In the extended model, infinite
+ * values will be treated as missing. Note that passing "Fail" might crash the process if there turn out to be
+ * missing values, but will otherwise produce faster fitting and prediction times along with decreased model object sizes.
+ * Models from [1], [2], [3], [4] correspond to "Fail" here.
+ * - cat_split_type
+ * Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
+ * a single category to a branch and the rest to the other branch. For the extended model, whether to
+ * give each category a coefficient, or only one while the rest get zero.
+ * - new_cat_action
+ * What to do after splitting a categorical feature when new data that reaches that split has categories that
+ * the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
+ * in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
+ * data that went to each branch when fitting the model, and in the extended model will assign
+ * them the median value for that column that was added to the linear combination of features, b) "Smallest", which will
+ * assign all observations with unseen categories in the split to the branch that had fewer observations when
+ * fitting the model, c) "Random", which will assign a branch (coefficient in the extended model) at random for
+ * each category beforehand, even if no observations had that category when fitting the model. Ignored when
+ * passing 'cat_split_type' = 'SingleCateg'.
+ * - all_perm
+ * When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
+ * whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
+ * will sort the categories by their frequency and make a grouping in this sorted order. Note that the
+ * number of combinations evaluated (if 'true') is the factorial of the number of present categories in
+ * a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
+ * category in a separate branch, so not evaluating all permutations (passing 'false') will make it
+ * possible to select other splits that respect the sorted frequency order.
+ * The total number of combinations must be a number that can fit into a 'size_t' variable - for 64-bit
+ * systems, this means no column can have more than 20 different categories if using 'all_perm=true',
+ * but note that this is not checked within the function.
+ * Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
+ * - coef_by_prop
+ * In the extended model, whether to sort the randomly-generated coefficients for categories
+ * according to their relative frequency in the tree node. This might provide better results when using
+ * categorical variables with too many categories, but is not recommended, and not reflective of
+ * real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
+ * variables.
+ * - imputer (out)
+ * Pointer to already-allocated imputer object, which can be used to produce missing value imputations
+ * in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
+ * 'missing_action' as missing values inside the model are treated differently and follow their own imputation
+ * or division strategy.
+ * - min_imp_obs
+ * Minimum number of observations with which an imputation value can be produced. Ignored if passing
+ * 'build_imputer' = 'false'.
+ * - depth_imp
+ * How to weight observations according to their depth when used for imputing missing values. Passing
+ * "Higher" will weigh observations higher the further down the tree (away from the root node) the
+ * terminal node is, while "Lower" will do the opposite, and "Same" will not modify the weights according
+ * to node depth in the tree. Implemented for testing purposes and not recommended to change
+ * from the default. Ignored when not passing 'impute_nodes'.
+ * - weigh_imp_rows
+ * How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
+ * a node inversely proportional to the number of observations that end up there, while "Proportional"
+ * will weight them heavier the more observations there are, and "Flat" will weigh all nodes the same
+ * in this regard regardless of how many observations end up there. Implemented for testing purposes
+ * and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
+ * - impute_at_fit
+ * Whether to impute missing values in the input data as the model is being built. If passing 'true',
+ * then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
+ * 'categ_data', and 'Xc', will get overwritten with the imputations produced.
+ * - random_seed
+ * Seed that will be used to generate random numbers used by the model.
+ * - nthreads
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
+ * allocated, even if the thread does not end up being used. Ignored when not building with
+ * OpenMP support.
+ *
+ * Returns
+ * =======
+ * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
+ * If the process receives an interrupt signal, will return instead
+ * 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
+ * what these values correspond to, you can use the functions
+ * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
+ * as integers.
+ *
+ * References
+ * ==========
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation forest."
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation-based anomaly detection."
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+ * "Extended Isolation Forest."
+ * arXiv preprint arXiv:1811.02141 (2018).
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "On detecting clustered anomalies using SCiForest."
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+ * [5] https://sourceforge.net/projects/iforest/
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+ * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
+ */
+ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+ double numeric_data[], size_t ncols_numeric,
+ int categ_data[], size_t ncols_categ, int ncat[],
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
+ double sample_weights[], bool with_replacement, bool weight_as_sample,
+ size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
+ bool limit_depth, bool penalize_range,
+ bool standardize_dist, double tmat[],
+ double output_depths[], bool standardize_depth,
+ double col_weights[], bool weigh_by_kurt,
+ double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
+ double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
+ double min_gain, MissingAction missing_action,
+ CategSplit cat_split_type, NewCategAction new_cat_action,
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
+ uint64_t random_seed, int nthreads)
+ {
+ /* calculate maximum number of categories to use later */
+ int max_categ = 0;
+ for (size_t col = 0; col < ncols_categ; col++)
+ max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
+
+ bool calc_dist = tmat != NULL;
+
+ if (calc_dist || sample_size == 0)
+ sample_size = nrows;
+
+ /* put data in structs to shorten function calls */
+ InputData input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
+ nrows, ncols_numeric + ncols_categ, sample_weights,
+ weight_as_sample, col_weights,
+ Xc, Xc_ind, Xc_indptr,
+ 0, 0, std::vector<double>(),
+ std::vector<char>(), 0};
+ ModelParams model_params = {with_replacement, sample_size, ntrees,
+ limit_depth? log2ceil(sample_size) : max_depth? max_depth : (sample_size - 1),
+ penalize_range, random_seed, weigh_by_kurt,
+ prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
+ prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
+ min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
+ (model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
+ coef_type, coef_by_prop, calc_dist, (bool)(output_depths != NULL), impute_at_fit,
+ depth_imp, weigh_imp_rows, min_imp_obs};
+
+ /* if using weights as sampling probability, build a binary tree for faster sampling */
+ if (input_data.weight_as_sample && input_data.sample_weights != NULL)
+ {
+ build_btree_sampler(input_data.btree_weights_init, input_data.sample_weights,
+ input_data.nrows, input_data.log2_n, input_data.btree_offset);
+ }
+
+ /* if imputing missing values on-the-fly, need to determine which are missing */
+ std::vector<ImputedData> impute_vec;
+ std::unordered_map<size_t, ImputedData> impute_map;
+ if (model_params.impute_at_fit)
+ check_for_missing(input_data, impute_vec, impute_map, nthreads);
+
+ /* store model data */
+ if (model_outputs != NULL)
+ {
+ model_outputs->trees.resize(ntrees);
+ model_outputs->trees.shrink_to_fit();
+ model_outputs->new_cat_action = new_cat_action;
+ model_outputs->cat_split_type = cat_split_type;
+ model_outputs->missing_action = missing_action;
+ model_outputs->exp_avg_depth = expected_avg_depth(sample_size);
+ model_outputs->exp_avg_sep = expected_separation_depth(model_params.sample_size);
+ model_outputs->orig_sample_size = input_data.nrows;
+ }
+
+ else
+ {
+ model_outputs_ext->hplanes.resize(ntrees);
+ model_outputs_ext->hplanes.shrink_to_fit();
+ model_outputs_ext->new_cat_action = new_cat_action;
+ model_outputs_ext->cat_split_type = cat_split_type;
+ model_outputs_ext->missing_action = missing_action;
+ model_outputs_ext->exp_avg_depth = expected_avg_depth(sample_size);
+ model_outputs_ext->exp_avg_sep = expected_separation_depth(model_params.sample_size);
+ model_outputs_ext->orig_sample_size = input_data.nrows;
+ }
+
+ if (imputer != NULL)
+ initialize_imputer(*imputer, input_data, ntrees, nthreads);
+
+ /* initialize thread-private memory */
+ if ((size_t)nthreads > ntrees)
+ nthreads = (int)ntrees;
+ #ifdef _OPENMP
+ std::vector<WorkerMemory> worker_memory(nthreads);
+ #else
+ std::vector<WorkerMemory> worker_memory(1);
+ #endif
+
+ /* Global variable that determines if the procedure receives a stop signal */
+ interrupt_switch = false;
+
+ /* grow trees */
+ #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
+ for (size_t_for tree = 0; tree < ntrees; tree++)
+ {
+ if (interrupt_switch)
+ continue; /* Cannot break with OpenMP==2.0 (MSVC) */
+
+ if (
+ model_params.impute_at_fit &&
+ input_data.n_missing &&
+ !worker_memory[omp_get_thread_num()].impute_vec.size() &&
+ !worker_memory[omp_get_thread_num()].impute_map.size()
+ )
+ {
+ #ifdef _OPENMP
+ if (nthreads > 1)
+ {
+ worker_memory[omp_get_thread_num()].impute_vec = impute_vec;
+ worker_memory[omp_get_thread_num()].impute_map = impute_map;
+ }
+
+ else
+ #endif
+ {
+ worker_memory[0].impute_vec = std::move(impute_vec);
+ worker_memory[0].impute_map = std::move(impute_map);
+ }
+ }
+
+ fit_itree((model_outputs != NULL)? &model_outputs->trees[tree] : NULL,
+ (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : NULL,
+ worker_memory[omp_get_thread_num()],
+ input_data,
+ model_params,
+ (imputer != NULL)? &(imputer->imputer_tree[tree]) : NULL,
+ tree);
+
+ if ((model_outputs != NULL))
+ model_outputs->trees[tree].shrink_to_fit();
+ else
+ model_outputs_ext->hplanes[tree].shrink_to_fit();
+
+ signal(SIGINT, set_interrup_global_variable);
+ }
+
+ /* check if the procedure got interrupted */
+ if (interrupt_switch) return EXIT_FAILURE;
+ interrupt_switch = false;
+
+ if ((model_outputs != NULL))
+ model_outputs->trees.shrink_to_fit();
+ else
+ model_outputs_ext->hplanes.shrink_to_fit();
+
+ /* if calculating similarity/distance, now need to reduce and average */
+ if (calc_dist)
+ gather_sim_result(NULL, &worker_memory,
+ NULL, &input_data,
+ model_outputs, model_outputs_ext,
+ tmat, NULL, 0,
+ model_params.ntrees, false,
+ standardize_dist, nthreads);
+
+ /* same for depths */
+ if (output_depths != NULL)
+ {
+ #ifdef _OPENMP
+ if (nthreads > 1)
+ {
+ for (WorkerMemory &w : worker_memory)
+ {
+ if (w.row_depths.size())
+ {
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data, output_depths, w, worker_memory)
+ for (size_t_for row = 0; row < input_data.nrows; row++)
+ output_depths[row] += w.row_depths[row];
+ }
+ }
+ }
+ else
+ #endif
+ {
+ std::copy(worker_memory[0].row_depths.begin(), worker_memory[0].row_depths.end(), output_depths);
+ }
+
+ if (standardize_depth)
+ {
+ double depth_divisor = (double)ntrees * ((model_outputs != NULL)?
+ model_outputs->exp_avg_depth : model_outputs_ext->exp_avg_depth);
+ for (size_t_for row = 0; row < nrows; row++)
+ output_depths[row] = exp2( - output_depths[row] / depth_divisor );
+ }
+
+ else
+ {
+ double ntrees_dbl = (double) ntrees;
+ for (size_t_for row = 0; row < nrows; row++)
+ output_depths[row] /= ntrees_dbl;
+ }
+ }
+
+ /* if imputing missing values, now need to reduce and write final values */
+ if (model_params.impute_at_fit)
+ {
+ #ifdef _OPENMP
+ if (nthreads > 1)
+ {
+ for (WorkerMemory &w : worker_memory)
+ combine_tree_imputations(w, impute_vec, impute_map, input_data.has_missing, nthreads);
+ }
+
+ else
+ #endif
+ {
+ impute_vec = std::move(worker_memory[0].impute_vec);
+ impute_map = std::move(worker_memory[0].impute_map);
+ }
+
+ apply_imputation_results(impute_vec, impute_map, *imputer, input_data, nthreads);
+ }
+
+ return EXIT_SUCCESS;
+ }
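
For illustration, here is a minimal standalone sketch of two conventions used by 'fit_iforest' above: the packed upper-triangular indexing documented for 'tmat', and the depth standardization applied when 'standardize_depth' is passed. This is an editor's sketch, not part of the package source; the helper names 'tri_index' and 'standardized_score' are hypothetical.

#include <cstddef>
#include <cmath>
#include <cassert>

/* Position of entry (i,j), 0 <= i < j < n, in the packed upper-triangular array.
   Algebraically equal to the documented p(i,j) = i*(n - (i+1)/2) + j - i - 1,
   rewritten as i*n - i*(i+1)/2 + j - i - 1 so that integer division is exact. */
static size_t tri_index(size_t i, size_t j, size_t n)
{
    assert(i < j && j < n);
    return i * n - (i * (i + 1)) / 2 + j - i - 1;
}

/* Standardized outlier score as computed at the end of 'fit_iforest': 'depth_sum'
   is the sum of a row's terminal depths across all trees, and 'exp_avg_depth' the
   expected average isolation depth for the sample size. The score approaches 1
   for outliers and is around 0.5 for average observations. */
static double standardized_score(double depth_sum, size_t ntrees, double exp_avg_depth)
{
    return std::exp2(-depth_sum / ((double)ntrees * exp_avg_depth));
}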
+
+
+ /* Add additional trees to already-fitted isolation forest model
+ *
+ * Parameters
+ * ==========
+ * - model_outputs
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
+ * if the trees are to be added to an extended model. Can only pass one of
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
+ * so it cannot be run in parallel for the same model object.
+ * - model_outputs_ext
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
+ * if the trees are to be added to a single-variable model. Can only pass one of
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
+ * so it cannot be run in parallel for the same model object.
+ * - numeric_data
+ * Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no dense numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
+ * of columns, either as dense or as sparse arrays.
+ * - ncols_numeric
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - categ_data
+ * Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no categorical columns. The encoding must be the same as was used
+ * in the data to which the model was fit.
+ * Each category should be represented as an integer, and these integers must start at zero and
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
+ * must be the same as was used in the data to which the model was fit.
+ * If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
+ * of columns and the same category encoding.
+ * - ncols_categ
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - ncat
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - Xc[nnz]
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
+ * Pass NULL if there are no sparse numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * - Xc_ind[nnz]
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
+ * Pass NULL if there are no sparse numeric columns.
+ * - Xc_indptr[ncols_numeric + 1]
+ * Pointer to column index pointers that tell, at entry [col], where column 'col'
+ * starts, and at entry [col + 1], where it ends.
+ * Pass NULL if there are no sparse numeric columns.
+ * - ndim
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - ntry
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - coef_type
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - sample_weights
+ * Weights for the rows when adding this tree, either as sampling importances when using
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
+ * in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
+ * the row appeared twice, thus it's less of an outlier) - how they are interpreted is determined
+ * through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
+ * Pass NULL if the rows all have uniform weights.
+ * - nrows
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
+ * - max_depth
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - limit_depth
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - penalize_range
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - col_weights
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
+ * Ignored when picking columns by deterministic criterion.
+ * If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
+ * - weigh_by_kurt
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_by_gain_avg
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_split_by_gain_avg
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_by_gain_pl
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_split_by_gain_pl
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - min_gain
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - missing_action
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - cat_split_type
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - new_cat_action
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - depth_imp
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - weigh_imp_rows
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - all_perm
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - coef_by_prop
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - impute_nodes
+ * Pointer to already-allocated imputation nodes for the tree that will be built. Note that the number of
+ * entries in the imputation object must match the number of fitted trees when it is used. Pass
+ * NULL if no imputation node is required.
+ * - min_imp_obs
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - random_seed
+ * Seed that will be used to generate random numbers used by the model.
+ */
+ int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+ double numeric_data[], size_t ncols_numeric,
+ int categ_data[], size_t ncols_categ, int ncat[],
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
+ double sample_weights[], size_t nrows, size_t max_depth,
+ bool limit_depth, bool penalize_range,
+ double col_weights[], bool weigh_by_kurt,
+ double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
+ double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
+ double min_gain, MissingAction missing_action,
+ CategSplit cat_split_type, NewCategAction new_cat_action,
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
+ bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
+ uint64_t random_seed)
+ {
+ int max_categ = 0;
+ for (size_t col = 0; col < ncols_categ; col++)
+ max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
+
+ InputData input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
+ nrows, ncols_numeric + ncols_categ, sample_weights,
+ false, col_weights,
+ Xc, Xc_ind, Xc_indptr,
+ 0, 0, std::vector<double>(),
+ std::vector<char>(), 0};
+ ModelParams model_params = {false, nrows, (size_t)1,
+ max_depth? max_depth : (nrows - 1),
+ penalize_range, random_seed, weigh_by_kurt,
+ prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
+ prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
+ min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
+ (model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
+ coef_type, coef_by_prop, false, false, false, depth_imp, weigh_imp_rows, min_imp_obs};
+
+ std::unique_ptr<WorkerMemory> workspace = std::unique_ptr<WorkerMemory>(new WorkerMemory);
+
+ size_t last_tree;
+ if (model_outputs != NULL)
+ {
+ last_tree = model_outputs->trees.size();
+ model_outputs->trees.emplace_back();
+ }
+
+ else
+ {
+ last_tree = model_outputs_ext->hplanes.size();
+ model_outputs_ext->hplanes.emplace_back();
+ }
+
+ fit_itree((model_outputs != NULL)? &model_outputs->trees.back() : NULL,
+ (model_outputs_ext != NULL)? &model_outputs_ext->hplanes.back() : NULL,
+ *workspace,
+ input_data,
+ model_params,
+ impute_nodes,
+ last_tree);
+
+ if ((model_outputs != NULL))
+ model_outputs->trees.back().shrink_to_fit();
+ else
+ model_outputs_ext->hplanes.back().shrink_to_fit();
+
+ return EXIT_SUCCESS;
+ }
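
As a concrete illustration of the CSC inputs described above (an editor's example, not from the package; 'sparse_ix' is assumed here to stand for the library's integer index typedef), the dense column-major matrix with nrows = 3 and ncols_numeric = 2 whose columns are {1, 0, 3} and {0, 2, 0} would be passed as:

typedef long sparse_ix;  /* assumed stand-in for the library's index typedef */

double    Xc[]        = {1.0, 3.0, 2.0}; /* non-zero values, column by column        */
sparse_ix Xc_ind[]    = {0, 2, 1};       /* row index of each non-zero value         */
sparse_ix Xc_indptr[] = {0, 2, 3};       /* column 'col' spans entries               */
                                         /* Xc_indptr[col] .. Xc_indptr[col + 1] - 1 */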
+
+ void fit_itree(std::vector<IsoTree> *tree_root,
+ std::vector<IsoHPlane> *hplane_root,
+ WorkerMemory &workspace,
+ InputData &input_data,
+ ModelParams &model_params,
+ std::vector<ImputeNode> *impute_nodes,
+ size_t tree_num)
+ {
+ /* initialize array for depths if called for */
+ if (!workspace.ix_arr.size() && model_params.calc_depth)
+ workspace.row_depths.resize(input_data.nrows, 0);
+
+ /* choose random sample of rows */
+ if (!workspace.ix_arr.size()) workspace.ix_arr.resize(model_params.sample_size);
+ if (input_data.log2_n > 0)
+ workspace.btree_weights.assign(input_data.btree_weights_init.begin(),
+ input_data.btree_weights_init.end());
+ workspace.rnd_generator.seed(model_params.random_seed + tree_num);
+ if (input_data.col_weights != NULL)
+ workspace.col_sampler = std::discrete_distribution<size_t>(input_data.col_weights,
+ input_data.col_weights + input_data.ncols_numeric + input_data.ncols_categ);
+ workspace.runif = std::uniform_int_distribution<size_t>(0, input_data.ncols_tot - 1);
+ workspace.rbin = std::uniform_real_distribution<double>(0, 1);
+ sample_random_rows(workspace.ix_arr, input_data.nrows, model_params.with_replacement,
+ workspace.rnd_generator, workspace.ix_all,
+ (input_data.weight_as_sample)? input_data.sample_weights : NULL,
+ workspace.btree_weights, input_data.log2_n, input_data.btree_offset,
+ workspace.is_repeated);
+ workspace.st = 0;
+ workspace.end = model_params.sample_size - 1;
+ if (!workspace.cols_possible.size())
+ workspace.cols_possible.resize(input_data.ncols_tot, true);
+ else
+ workspace.cols_possible.assign(workspace.cols_possible.size(), true);
+
+ /* set expected tree size and add root node */
+ {
+ size_t exp_nodes = 2 * model_params.sample_size;
+ if (model_params.sample_size >= (SIZE_MAX / (size_t)2))
+ exp_nodes = SIZE_MAX;
+ if (model_params.max_depth <= (size_t)30)
+ exp_nodes = std::min(exp_nodes, pow2(model_params.max_depth));
+ if (tree_root != NULL)
+ {
+ tree_root->reserve(exp_nodes);
+ tree_root->emplace_back();
+ }
+ else
+ {
+ hplane_root->reserve(exp_nodes);
+ hplane_root->emplace_back();
+ }
+ if (impute_nodes != NULL)
+ {
+ impute_nodes->reserve(exp_nodes);
+ impute_nodes->emplace_back((size_t) 0);
+ }
+ }
+
+ /* initialize array with candidate categories if not already done */
+ if (!workspace.categs.size())
+ workspace.categs.resize(input_data.max_categ);
+
+ /* for the extended model, initialize extra vectors and objects */
+ if (hplane_root != NULL && !workspace.comb_val.size())
+ {
+ workspace.coef_norm = std::normal_distribution<double>(0, 1);
+ if (model_params.coef_type == Uniform)
+ workspace.coef_unif = std::uniform_real_distribution<double>(-1, 1);
+
+ workspace.cols_shuffled.resize(input_data.ncols_tot);
+ workspace.comb_val.resize(model_params.sample_size);
+ workspace.col_take.resize(model_params.ndim);
+ workspace.col_take_type.resize(model_params.ndim);
+
+ if (input_data.ncols_numeric)
+ {
+ workspace.ext_offset.resize(input_data.ncols_tot);
+ workspace.ext_coef.resize(input_data.ncols_tot);
+ workspace.ext_mean.resize(input_data.ncols_tot);
+ }
+
+ if (input_data.ncols_categ)
+ {
+ workspace.ext_fill_new.resize(input_data.max_categ);
+ switch(model_params.cat_split_type)
+ {
+ case SingleCateg:
+ {
+ workspace.chosen_cat.resize(input_data.max_categ);
+ break;
+ }
+
+ case SubSet:
+ {
+ workspace.ext_cat_coef.resize(input_data.ncols_tot);
+ for (std::vector<double> &v : workspace.ext_cat_coef)
+ v.resize(input_data.max_categ);
+ break;
+ }
+ }
+ }
+
+ workspace.ext_fill_val.resize(input_data.ncols_tot);
+
+ }
+
+ /* if it contains missing values, also have to set an array of weights,
+ which will be modified during iterations when there are NAs.
+ If there are already density weights, need to standardize them to sum up to
+ the sample size here */
+ long double weight_scaling = 0;
+ if (model_params.missing_action == Divide || (input_data.sample_weights != NULL && !input_data.weight_as_sample))
+ {
+ workspace.weights_map.clear();
+
+ /* if the sub-sample size is small relative to the full sample size, use a mapping */
+ if (model_params.sample_size < input_data.nrows / 4)
+ {
+ if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
+ {
+ for (const size_t ix : workspace.ix_arr)
+ {
+ weight_scaling += input_data.sample_weights[ix];
+ workspace.weights_map[ix] = input_data.sample_weights[ix];
+ }
+ weight_scaling = (long double)model_params.sample_size / weight_scaling;
+ for (auto &w : workspace.weights_map)
+ w.second *= weight_scaling;
+
+ }
+
+ else
+ {
+ for (const size_t ix : workspace.ix_arr)
+ workspace.weights_map[ix] = 1;
+ }
+
+ }
+
+ /* if the sub-sample size is large, fill a full array matching to the sample size */
+ else
+ {
+ if (!workspace.weights_arr.size())
+ {
+ if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
+ {
+ workspace.weights_arr.assign(input_data.sample_weights, input_data.sample_weights + input_data.nrows);
+ weight_scaling = std::accumulate(workspace.ix_arr.begin(),
+ workspace.ix_arr.end(),
+ (long double)0,
+ [&input_data](const long double a, const size_t b){return a + (long double)input_data.sample_weights[b];}
+ );
+ weight_scaling = (long double)model_params.sample_size / weight_scaling;
+ for (double &w : workspace.weights_arr)
+ w *= weight_scaling;
+ }
+
+ else
+ {
+ workspace.weights_arr.resize(input_data.nrows, (double)1);
+ }
+
+ }
+
+ else
+ {
+ if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
+ {
+ for (const size_t ix : workspace.ix_arr)
+ {
+ weight_scaling += input_data.sample_weights[ix];
+ workspace.weights_arr[ix] = input_data.sample_weights[ix];
+ }
+ weight_scaling = (long double)model_params.sample_size / weight_scaling;
+ for (double &w : workspace.weights_arr)
+ w *= weight_scaling;
+
+ }
+
+ else
+ {
+ /* Note: while not all of them need to be overwritten, this is faster
+ (sub-sample size was already determined to be at least 1/4 of the sample size) */
+ std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), (double)1);
+ }
+ }
+ }
+ }
+
+ /* if producing distance/similarity, also need to initialize the triangular matrix */
+ if (model_params.calc_dist && !workspace.tmat_sep.size())
+ workspace.tmat_sep.resize((input_data.nrows * (input_data.nrows - 1)) / 2, 0);
+
+ /* make space for buffers if not already allocated */
+ if (
+ (model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
+ model_params.prob_split_by_gain_pl || model_params.prob_pick_by_gain_pl ||
+ model_params.weigh_by_kurt || hplane_root != NULL)
+ &&
+ (!workspace.buffer_dbl.size() && !workspace.buffer_szt.size() && !workspace.buffer_chr.size())
+ )
+ {
+ size_t min_size_dbl = 0;
+ size_t min_size_szt = 0;
+ size_t min_size_chr = 0;
+
+ bool gain = model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
+ model_params.prob_split_by_gain_pl || model_params.prob_pick_by_gain_pl;
+
+ if (input_data.ncols_categ)
+ {
+ min_size_szt = 2 * input_data.max_categ;
+ min_size_dbl = input_data.max_categ + 1;
+ if (gain && model_params.cat_split_type == SubSet)
+ min_size_chr = input_data.max_categ;
+ }
+
+ if (input_data.Xc != NULL && gain)
+ {
+ min_size_szt = std::max(min_size_szt, model_params.sample_size);
+ min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
+ }
+
+ /* for the extended model */
+ if (hplane_root != NULL)
+ {
+ min_size_dbl = std::max(min_size_dbl, pow2(log2ceil(input_data.ncols_tot) + 1));
+ if (model_params.missing_action != Fail)
+ {
+ min_size_szt = std::max(min_size_szt, model_params.sample_size);
+ min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
+ }
+
+ if (input_data.ncols_categ && model_params.cat_split_type == SubSet)
+ {
+ min_size_szt = std::max(min_size_szt, 2 * (size_t)input_data.max_categ + 1);
+ min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
+ }
+
+ if (model_params.weigh_by_kurt)
+ min_size_szt = std::max(min_size_szt, input_data.ncols_tot);
+ }
+
+ /* now resize */
+ if (workspace.buffer_dbl.size() < min_size_dbl)
+ workspace.buffer_dbl.resize(min_size_dbl);
+
+ if (workspace.buffer_szt.size() < min_size_szt)
+ workspace.buffer_szt.resize(min_size_szt);
+
+ if (workspace.buffer_chr.size() < min_size_chr)
+ workspace.buffer_chr.resize(min_size_chr);
+
+ /* for guided column choice, need to also remember the best split so far */
+ if (
+ model_params.cat_split_type == SubSet &&
+ (
+ model_params.prob_pick_by_gain_avg ||
+ model_params.prob_pick_by_gain_pl
+ )
+ )
+ {
+ workspace.this_split_categ.resize(input_data.max_categ);
+ }
+
+ }
+
+ /* weigh columns by kurtosis in the sample if required */
+ if (model_params.weigh_by_kurt)
+ {
+ std::vector<double> kurt_weights(input_data.ncols_numeric + input_data.ncols_categ);
+
+ if (input_data.Xc == NULL)
+ {
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
+ kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
+ input_data.numeric_data + col * input_data.nrows,
+ model_params.missing_action);
+ }
+
+ else
+ {
+ std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
+ kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end, col,
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+ model_params.missing_action);
+ }
+
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
+ kurt_weights[col + input_data.ncols_numeric] =
+ calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
+ input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
+ workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
+ model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
+
+ for (size_t col = 0; col < input_data.ncols_tot; col++)
+ if (kurt_weights[col] <= 0 || is_na_or_inf(kurt_weights[col]))
+ workspace.cols_possible[col] = false;
+
+ workspace.col_sampler = std::discrete_distribution<size_t>(kurt_weights.begin(), kurt_weights.end());
+ }
+
+ if (tree_root != NULL)
+ split_itree_recursive(*tree_root,
+ workspace,
+ input_data,
+ model_params,
+ impute_nodes,
+ 0);
+ else
+ split_hplane_recursive(*hplane_root,
+ workspace,
+ input_data,
+ model_params,
+ impute_nodes,
+ 0);
+
+ /* if producing imputation structs, only need to keep the ones for terminal nodes */
+ if (impute_nodes != NULL)
+ drop_nonterminal_imp_node(*impute_nodes, tree_root, hplane_root);
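
The density-weight handling in 'fit_itree' above standardizes the weights so that they sum to the sample size before any splits are made, which preserves the relative weights while keeping depths comparable to the unweighted case. A minimal sketch of that rescaling step (an editor's illustration; 'rescale_density_weights' is not a function of the package):

#include <cstddef>
#include <vector>
#include <numeric>

static void rescale_density_weights(std::vector<double> &w, size_t sample_size)
{
    /* accumulate in long double, as the code above does, to limit rounding error */
    long double total = std::accumulate(w.begin(), w.end(), (long double)0);
    long double scaling = (long double)sample_size / total;
    for (double &wi : w)
        wi = (double)(wi * scaling);  /* weights now sum to 'sample_size' */
}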
+ }