isotree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +25 -0
- data/README.md +111 -0
- data/ext/isotree/ext.cpp +178 -0
- data/ext/isotree/extconf.rb +21 -0
- data/lib/isotree.rb +9 -0
- data/lib/isotree/isolation_forest.rb +94 -0
- data/lib/isotree/version.rb +3 -0
- data/vendor/isotree/LICENSE +25 -0
- data/vendor/isotree/README.md +167 -0
- data/vendor/isotree/src/Makevars +4 -0
- data/vendor/isotree/src/RcppExports.cpp +267 -0
- data/vendor/isotree/src/Rwrapper.cpp +762 -0
- data/vendor/isotree/src/crit.cpp +912 -0
- data/vendor/isotree/src/dealloc.cpp +66 -0
- data/vendor/isotree/src/dist.cpp +749 -0
- data/vendor/isotree/src/extended.cpp +790 -0
- data/vendor/isotree/src/fit_model.cpp +1068 -0
- data/vendor/isotree/src/helpers_iforest.cpp +309 -0
- data/vendor/isotree/src/impute.cpp +1205 -0
- data/vendor/isotree/src/isoforest.cpp +771 -0
- data/vendor/isotree/src/isotree.hpp +929 -0
- data/vendor/isotree/src/merge_models.cpp +116 -0
- data/vendor/isotree/src/mult.cpp +607 -0
- data/vendor/isotree/src/predict.cpp +849 -0
- data/vendor/isotree/src/serialize.cpp +262 -0
- data/vendor/isotree/src/utils.cpp +1574 -0
- metadata +154 -0
@@ -0,0 +1,1068 @@
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
2
|
+
* of categorical variables and missing values.
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
4
|
+
*
|
5
|
+
* This library is based on the following works:
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
7
|
+
* "Isolation forest."
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
10
|
+
* "Isolation-based anomaly detection."
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
13
|
+
* "Extended Isolation Forest."
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
20
|
+
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
21
|
+
* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
|
22
|
+
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
|
+
*
|
24
|
+
* BSD 2-Clause License
|
25
|
+
* Copyright (c) 2019, David Cortes
|
26
|
+
* All rights reserved.
|
27
|
+
* Redistribution and use in source and binary forms, with or without
|
28
|
+
* modification, are permitted provided that the following conditions are met:
|
29
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
30
|
+
* list of conditions and the following disclaimer.
|
31
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
32
|
+
* this list of conditions and the following disclaimer in the documentation
|
33
|
+
* and/or other materials provided with the distribution.
|
34
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
35
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
36
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
37
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
38
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
39
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
40
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
41
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
42
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
43
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
44
|
+
*/
|
45
|
+
#include "isotree.hpp"
|
46
|
+
|
47
|
+
bool interrupt_switch;
|
48
|
+
|
49
|
+
/* Fit Isolation Forest model, or variant of it such as SCiForest
|
50
|
+
*
|
51
|
+
* Parameters:
|
52
|
+
* ===========
|
53
|
+
* - model_outputs (out)
|
54
|
+
* Pointer to already allocated isolation forest model object for single-variable splits.
|
55
|
+
* If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
|
56
|
+
* additional trees through function 'add_tree'.
|
57
|
+
* - model_outputs_ext (out)
|
58
|
+
* Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
|
59
|
+
* Note that if 'ndim' = 1, must use instead the single-variable model object.
|
60
|
+
* If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
|
61
|
+
* additional trees through function 'add_tree'.
|
62
|
+
* - numeric_data[nrows * ncols_numeric]
|
63
|
+
* Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
|
64
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
65
|
+
* Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
|
66
|
+
* no sparse numeric data either).
|
67
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
68
|
+
* - ncols_numeric
|
69
|
+
* Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
|
70
|
+
* - categ_data[nrows * ncols_categ]
|
71
|
+
* Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
|
72
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
73
|
+
* Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
|
74
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
75
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must also be present
|
76
|
+
* (note that they are not treated as being ordinal, this is just an encoding). Missing values
|
77
|
+
* should be encoded as negative numbers such as (-1).
|
78
|
+
* - ncols_categ
|
79
|
+
* Number of categorical columns in the data.
|
80
|
+
* - ncat[ncols_categ]
|
81
|
+
* Number of categories in each categorical column. E.g. if the highest code for a column is '4',
|
82
|
+
* the number of categories for that column is '5' (zero is one category).
|
83
|
+
* - Xc[nnz]
|
84
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
85
|
+
* Pass NULL if there are no sparse numeric columns.
|
86
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
87
|
+
* - Xc_ind[nnz]
|
88
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
89
|
+
* Pass NULL if there are no sparse numeric columns.
|
90
|
+
* - Xc_indptr[ncols_numeric + 1]
|
91
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
92
|
+
* start and at entry [col + 1] where does column 'col' end.
|
93
|
+
* Pass NULL if there are no sparse numeric columns.
|
94
|
+
* - ndim
|
95
|
+
* How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
|
96
|
+
* the single-variable model. Note that the model object pointer passed must also
|
97
|
+
* agree with the value passed to 'ndim'.
|
98
|
+
* - ntry
|
99
|
+
* In the split-criterion extended model, how many random hyperplanes to evaluate in
|
100
|
+
* order to decide which one is best to take. Ignored for the single-variable case
|
101
|
+
* and for random splits.
|
102
|
+
* - coef_type
|
103
|
+
* For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
|
104
|
+
* (as proposed in [3]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [4]. Ignored for the
|
105
|
+
* single-variable model.
|
106
|
+
* - sample_weights[nrows]
|
107
|
+
* Weights for the rows when building a tree, either as sampling importances when using
|
108
|
+
* sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
|
109
|
+
* in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
|
110
|
+
* the row appeared twice, thus it's less of an outlier) - how this is taken is determined
|
111
|
+
* through parameter 'weight_as_sample'.
|
112
|
+
* Pass NULL if the rows all have uniform weights.
|
113
|
+
* - with_replacement
|
114
|
+
* Whether to produce sub-samples with replacement or not.
|
115
|
+
* - weight_as_sample
|
116
|
+
* If passing 'sample_weights', whether to consider those weights as row sampling weights (i.e. the higher
|
117
|
+
* the weights, the more likely the observation will end up included in each tree sub-sample), or as distribution
|
118
|
+
* density weights (i.e. putting a weight of two is the same as if the row appeared twice, thus higher weight makes it
|
119
|
+
* less of an outlier). Note that sampling weight is only used when sub-sampling data for each tree.
|
120
|
+
* - nrows
|
121
|
+
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
122
|
+
* - sample_size
|
123
|
+
* Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
|
124
|
+
* 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
|
125
|
+
* random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
|
126
|
+
* will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
|
127
|
+
* in [5] is 'nrows' here.
|
128
|
+
* - ntrees
|
129
|
+
* Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
|
130
|
+
* author's code in [5] is 10.
|
131
|
+
* - max_depth
|
132
|
+
* Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
|
133
|
+
* - limit_depth
|
134
|
+
* Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
|
135
|
+
* terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
|
136
|
+
* will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
|
137
|
+
* tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass higher values if
|
138
|
+
* using the model for purposes other than outlier detection.
|
139
|
+
* - penalize_range
|
140
|
+
* Whether to penalize (add +1 to the terminal depth) observations at prediction time that have a value
|
141
|
+
* of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
|
142
|
+
* reasonable range in the data being split (given by 2 * range in data and centered around the split point),
|
143
|
+
* as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
|
144
|
+
* when splitting by categorical variables.
|
145
|
+
* - standardize_dist
|
146
|
+
* If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
|
147
|
+
* depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
|
148
|
+
* - tmat[nrows * (nrows - 1) / 2]
|
149
|
+
* Array in which to calculate average separation depths or standardized distance metric (see documentation
|
150
|
+
* for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
|
151
|
+
* the regular model process. If passing this output argument, the sample size must be the same as the number
|
152
|
+
* of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
|
153
|
+
* output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
|
154
|
+
* entry 0 <= i < j < n will be located at position
|
155
|
+
* p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
|
156
|
+
* Can be converted to a dense square matrix through function 'tmat_to_dense'.
|
157
|
+
* - output_depths[nrows]
|
158
|
+
* Array in which to calculate average path depths or standardized outlierness metric (see documentation
|
159
|
+
* for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
|
160
|
+
* the regular model process. If passing this output argument, the sample size must be the same as the number
|
161
|
+
* of rows. If not NULL, must already be initialized to zeros.
|
162
|
+
* - standardize_depth
|
163
|
+
* If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
|
164
|
+
* a metric in which the more outlier is an observation, the closer this standardized metric will be to 1,
|
165
|
+
* with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
|
166
|
+
* the average depth of each row across all trees.
|
167
|
+
* - col_weights[ncols_numeric + ncols_categ]
|
168
|
+
* Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
|
169
|
+
* Ignored when picking columns by deterministic criterion.
|
170
|
+
* If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
|
171
|
+
* - weigh_by_kurt
|
172
|
+
* Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
|
173
|
+
* for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
|
174
|
+
* sample, so if not using sub-samples, it's better to pass column weights calculated externally. For
|
175
|
+
* categorical columns, will calculate expected kurtosis if the column was converted to numerical by
|
176
|
+
* assigning to each category a random number ~ Unif(0, 1).
|
177
|
+
* - prob_pick_by_gain_avg
|
178
|
+
* Probability of making each split in the single-variable model by choosing a column and split point in that
|
179
|
+
* same column as both the column and split point that gives the largest averaged gain (as proposed in [4]) across
|
180
|
+
* all available columns and possible splits in each column. Note that this implies evaluating every single column
|
181
|
+
* in the sample data when this type of split happens, which will potentially make the model fitting much slower,
|
182
|
+
* but has no impact on prediction time. For categorical variables, will take the expected standard deviation that
|
183
|
+
* would be gotten if the column were converted to numerical by assigning to each category a random number ~ Unif(0, 1)
|
184
|
+
* and calculate gain with those assumed standard deviations. For the extended model, this parameter indicates the probability that the
|
185
|
+
* split point in the chosen linear combination of variables will be decided by this averaged gain criterion. Compared to
|
186
|
+
* a pooled average, this tends to result in more cases in which a single observation or very few of them are put into
|
187
|
+
* one branch. Recommended to use sub-samples (parameter `sample_size`) when passing this parameter. When splits are
|
188
|
+
* not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl',
|
189
|
+
* both the column and the split point are decided at random.
|
190
|
+
* Default setting for [1], [2], [3] is zero, and default for [4] is 1. This is the randomization parameter that can
|
191
|
+
* be passed to the author's original code in [5]. Note that, if passing value 1 (100%) with no sub-sampling and using the
|
192
|
+
* single-variable model, every single tree will have the exact same splits.
|
193
|
+
* - prob_split_by_gain_avg
|
194
|
+
* Probability of making each split by selecting a column at random and determining the split point as
|
195
|
+
* that which gives the highest averaged gain. Not supported for the extended model as the splits are on
|
196
|
+
* linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_avg' for more details.
|
197
|
+
* - prob_pick_by_gain_pl
|
198
|
+
* Probability of making each split in the single-variable model by choosing a column and split point in that
|
199
|
+
* same column as both the column and split point that gives the largest pooled gain (as used in decision tree
|
200
|
+
* classifiers such as C4.5 in [7]) across all available columns and possible splits in each column. Note
|
201
|
+
* that this implies evaluating every single column in the sample data when this type of split happens, which
|
202
|
+
* will potentially make the model fitting much slower, but has no impact on prediction time. For categorical
|
203
|
+
* variables, will use shannon entropy instead (like in [7]). For the extended model, this parameter indicates the probability
|
204
|
+
* that the split point in the chosen linear combination of variables will be decided by this pooled gain
|
205
|
+
* criterion. Compared to a simple average, this tends to result in more evenly-divided splits and more clustered
|
206
|
+
* groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
|
207
|
+
* When used for outlier detection, higher values of this parameter result in models that are able to better flag
|
208
|
+
* outliers in the training data, but generalize poorly to outliers in new data and to values of variables
|
209
|
+
* outside of the ranges from the training data. Passing small 'sample_size' and high values of this parameter will
|
210
|
+
* tend to flag too many outliers. When splits are not made according to any of 'prob_pick_by_gain_avg',
|
211
|
+
* 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl', both the column and the split point
|
212
|
+
* are decided at random. Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
|
213
|
+
* every single tree will have the exact same splits.
|
214
|
+
* - prob_split_by_gain_pl
|
215
|
+
* Probability of making each split by selecting a column at random and determining the split point as
|
216
|
+
* that which gives the highest pooled gain. Not supported for the extended model as the splits are on
|
217
|
+
* linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_pl' for more details.
|
218
|
+
* - min_gain
|
219
|
+
* Minimum gain that a split threshold needs to produce in order to proceed with a split. Only used when the splits
|
220
|
+
* are decided by a gain criterion (either pooled or averaged). If the highest possible gain in the evaluated
|
221
|
+
* splits at a node is below this threshold, that node becomes a terminal node.
|
222
|
+
* - missing_action
|
223
|
+
* How to handle missing data at both fitting and prediction time. Options are a) "Divide" (for the single-variable
|
224
|
+
* model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
|
225
|
+
* the data that went to each branch when fitting the model, b) "Impute", which will assign observations to the
|
226
|
+
* branch with the most observations in the single-variable model, or fill in missing values with the median
|
227
|
+
* of each column of the sample from which the split was made in the extended model (recommended), c) "Fail" which will assume
|
228
|
+
* there are no missing values and will trigger undefined behavior if it encounters any. In the extended model, infinite
|
229
|
+
* values will be treated as missing. Note that passing "fail" might crash the process if there turn out to be
|
230
|
+
* missing values, but will otherwise produce faster fitting and prediction times along with decreased model object sizes.
|
231
|
+
* Models from [1], [2], [3], [4] correspond to "Fail" here.
|
232
|
+
* - cat_split_type
|
233
|
+
* Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
|
234
|
+
* a single category to a branch and the rest to the other branch. For the extended model, whether to
|
235
|
+
* give each category a coefficient, or only one while the rest get zero.
|
236
|
+
* - new_cat_action
|
237
|
+
* What to do after splitting a categorical feature when new data that reaches that split has categories that
|
238
|
+
* the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
|
239
|
+
* in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
|
240
|
+
* data that went to each branch when fitting the model, and in the extended model will assign
|
241
|
+
* them the median value for that column that was added to the linear combination of features, b) "Smallest", which will
|
242
|
+
* assign all observations with unseen categories in the split to the branch that had fewer observations when
|
243
|
+
* fitting the model, c) "Random", which will assign a branch (coefficient in the extended model) at random for
|
244
|
+
* each category beforehand, even if no observations had that category when fitting the model. Ignored when
|
245
|
+
* passing 'cat_split_type' = 'SingleCateg'.
|
246
|
+
* - all_perm
|
247
|
+
* When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
|
248
|
+
* whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
|
249
|
+
* will sort the categories by their frequency and make a grouping in this sorted order. Note that the
|
250
|
+
* number of combinations evaluated (if 'true') is the factorial of the number of present categories in
|
251
|
+
* a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
|
252
|
+
* category in a separate branch, so not evaluating all permutations (passing 'false') will make it
|
253
|
+
* possible to select other splits that respect the sorted frequency order.
|
254
|
+
* The total number of combinations must be a number that can fit into a 'size_t' variable - for x86-64
|
255
|
+
* systems, this means no column can have more than 20 different categories if using 'all_perm=true',
|
256
|
+
* but note that this is not checked within the function.
|
257
|
+
* Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
|
258
|
+
* - coef_by_prop
|
259
|
+
* In the extended model, whether to sort the randomly-generated coefficients for categories
|
260
|
+
* according to their relative frequency in the tree node. This might provide better results when using
|
261
|
+
* categorical variables with too many categories, but is not recommended, and not reflective of
|
262
|
+
* real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
|
263
|
+
* variables.
|
264
|
+
* - imputer (out)
|
265
|
+
* Pointer to already-allocated imputer object, which can be used to produce missing value imputations
|
266
|
+
* in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
|
267
|
+
* 'missing_action' as missing values inside the model are treated differently and follow their own imputation
|
268
|
+
* or division strategy.
|
269
|
+
* - min_imp_obs
|
270
|
+
* Minimum number of observations with which an imputation value can be produced. Ignored if passing
|
271
|
+
* 'build_imputer' = 'false'.
|
272
|
+
* - depth_imp
|
273
|
+
* How to weight observations according to their depth when used for imputing missing values. Passing
|
274
|
+
* "Higher" will weigh observations higher the further down the tree (away from the root node) the
|
275
|
+
* terminal node is, while "lower" will do the opposite, and "Sane" will not modify the weights according
|
276
|
+
* to node depth in the tree. Implemented for testing purposes and not recommended to change
|
277
|
+
* from the default. Ignored when not passing 'impute_nodes'.
|
278
|
+
* - weigh_imp_rows
|
279
|
+
* How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
|
280
|
+
* a node inversely proportional to the number of observations that end up there, while "Proportional"
|
281
|
+
* will weight them heavier the more observations there are, and "Flat" will weigh all nodes the same
|
282
|
+
* in this regard regardless of how many observations end up there. Implemented for testing purposes
|
283
|
+
* and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
|
284
|
+
* - impute_at_fit
|
285
|
+
* Whether to impute missing values in the input data as the model is being built. If passing 'true',
|
286
|
+
* then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
|
287
|
+
* 'categ_data', and 'Xc', will get overwritten with the imputations produced.
|
288
|
+
* - random_seed
|
289
|
+
* Seed that will be used to generate random numbers used by the model.
|
290
|
+
* - nthreads
|
291
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
292
|
+
* allocated, even if the thread does not end up being used. Ignored when not building with
|
293
|
+
* OpenMP support.
|
294
|
+
*
|
295
|
+
* Returns
|
296
|
+
* =======
|
297
|
+
* Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
|
298
|
+
* If the process receives an interrupt signal, will return instead
|
299
|
+
* 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
|
300
|
+
* what these values correspond to, you can use the functions
|
301
|
+
* 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
|
302
|
+
* as integers.
|
303
|
+
*
|
304
|
+
* References
|
305
|
+
* ==========
|
306
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
307
|
+
* "Isolation forest."
|
308
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
309
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
310
|
+
* "Isolation-based anomaly detection."
|
311
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
312
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
313
|
+
* "Extended Isolation Forest."
|
314
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
315
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
316
|
+
* "On detecting clustered anomalies using SCiForest."
|
317
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
318
|
+
* [5] https://sourceforge.net/projects/iforest/
|
319
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
320
|
+
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
321
|
+
* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
|
322
|
+
*/
|
323
|
+
int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
324
|
+
double numeric_data[], size_t ncols_numeric,
|
325
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
326
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
327
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
328
|
+
double sample_weights[], bool with_replacement, bool weight_as_sample,
|
329
|
+
size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
|
330
|
+
bool limit_depth, bool penalize_range,
|
331
|
+
bool standardize_dist, double tmat[],
|
332
|
+
double output_depths[], bool standardize_depth,
|
333
|
+
double col_weights[], bool weigh_by_kurt,
|
334
|
+
double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
|
335
|
+
double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
|
336
|
+
double min_gain, MissingAction missing_action,
|
337
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
338
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
339
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
340
|
+
uint64_t random_seed, int nthreads)
|
341
|
+
{
|
342
|
+
/* calculate maximum number of categories to use later */
|
343
|
+
int max_categ = 0;
|
344
|
+
for (size_t col = 0; col < ncols_categ; col++)
|
345
|
+
max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
|
346
|
+
|
347
|
+
bool calc_dist = tmat != NULL;
|
348
|
+
|
349
|
+
if (calc_dist || sample_size == 0)
|
350
|
+
sample_size = nrows;
|
351
|
+
|
352
|
+
/* put data in structs to shorten function calls */
|
353
|
+
InputData input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
|
354
|
+
nrows, ncols_numeric + ncols_categ, sample_weights,
|
355
|
+
weight_as_sample, col_weights,
|
356
|
+
Xc, Xc_ind, Xc_indptr,
|
357
|
+
0, 0, std::vector<double>(),
|
358
|
+
std::vector<char>(), 0};
|
359
|
+
ModelParams model_params = {with_replacement, sample_size, ntrees,
|
360
|
+
limit_depth? log2ceil(sample_size) : max_depth? max_depth : (sample_size - 1),
|
361
|
+
penalize_range, random_seed, weigh_by_kurt,
|
362
|
+
prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
|
363
|
+
prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
|
364
|
+
min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
|
365
|
+
(model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
|
366
|
+
coef_type, coef_by_prop, calc_dist, (bool)(output_depths != NULL), impute_at_fit,
|
367
|
+
depth_imp, weigh_imp_rows, min_imp_obs};
|
368
|
+
|
369
|
+
/* if using weights as sampling probability, build a binary tree for faster sampling */
|
370
|
+
if (input_data.weight_as_sample && input_data.sample_weights != NULL)
|
371
|
+
{
|
372
|
+
build_btree_sampler(input_data.btree_weights_init, input_data.sample_weights,
|
373
|
+
input_data.nrows, input_data.log2_n, input_data.btree_offset);
|
374
|
+
}
|
375
|
+
|
376
|
+
/* if imputing missing values on-the-fly, need to determine which are missing */
|
377
|
+
std::vector<ImputedData> impute_vec;
|
378
|
+
std::unordered_map<size_t, ImputedData> impute_map;
|
379
|
+
if (model_params.impute_at_fit)
|
380
|
+
check_for_missing(input_data, impute_vec, impute_map, nthreads);
|
381
|
+
|
382
|
+
/* store model data */
|
383
|
+
if (model_outputs != NULL)
|
384
|
+
{
|
385
|
+
model_outputs->trees.resize(ntrees);
|
386
|
+
model_outputs->trees.shrink_to_fit();
|
387
|
+
model_outputs->new_cat_action = new_cat_action;
|
388
|
+
model_outputs->cat_split_type = cat_split_type;
|
389
|
+
model_outputs->missing_action = missing_action;
|
390
|
+
model_outputs->exp_avg_depth = expected_avg_depth(sample_size);
|
391
|
+
model_outputs->exp_avg_sep = expected_separation_depth(model_params.sample_size);
|
392
|
+
model_outputs->orig_sample_size = input_data.nrows;
|
393
|
+
}
|
394
|
+
|
395
|
+
else
|
396
|
+
{
|
397
|
+
model_outputs_ext->hplanes.resize(ntrees);
|
398
|
+
model_outputs_ext->hplanes.shrink_to_fit();
|
399
|
+
model_outputs_ext->new_cat_action = new_cat_action;
|
400
|
+
model_outputs_ext->cat_split_type = cat_split_type;
|
401
|
+
model_outputs_ext->missing_action = missing_action;
|
402
|
+
model_outputs_ext->exp_avg_depth = expected_avg_depth(sample_size);
|
403
|
+
model_outputs_ext->exp_avg_sep = expected_separation_depth(model_params.sample_size);
|
404
|
+
model_outputs_ext->orig_sample_size = input_data.nrows;
|
405
|
+
}
|
406
|
+
|
407
|
+
if (imputer != NULL)
|
408
|
+
initialize_imputer(*imputer, input_data, ntrees, nthreads);
|
409
|
+
|
410
|
+
/* initialize thread-private memory */
|
411
|
+
if ((size_t)nthreads > ntrees)
|
412
|
+
nthreads = (int)ntrees;
|
413
|
+
#ifdef _OPENMP
|
414
|
+
std::vector<WorkerMemory> worker_memory(nthreads);
|
415
|
+
#else
|
416
|
+
std::vector<WorkerMemory> worker_memory(1);
|
417
|
+
#endif
|
418
|
+
|
419
|
+
/* Global variable that determines if the procedure receives a stop signal */
|
420
|
+
interrupt_switch = false;
|
421
|
+
|
422
|
+
/* grow trees */
|
423
|
+
#pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
|
424
|
+
for (size_t_for tree = 0; tree < ntrees; tree++)
|
425
|
+
{
|
426
|
+
if (interrupt_switch)
|
427
|
+
continue; /* Cannot break with OpenMP==2.0 (MSVC) */
|
428
|
+
|
429
|
+
if (
|
430
|
+
model_params.impute_at_fit &&
|
431
|
+
input_data.n_missing &&
|
432
|
+
!worker_memory[omp_get_thread_num()].impute_vec.size() &&
|
433
|
+
!worker_memory[omp_get_thread_num()].impute_map.size()
|
434
|
+
)
|
435
|
+
{
|
436
|
+
#ifdef _OPENMP
|
437
|
+
if (nthreads > 1)
|
438
|
+
{
|
439
|
+
worker_memory[omp_get_thread_num()].impute_vec = impute_vec;
|
440
|
+
worker_memory[omp_get_thread_num()].impute_map = impute_map;
|
441
|
+
}
|
442
|
+
|
443
|
+
else
|
444
|
+
#endif
|
445
|
+
{
|
446
|
+
worker_memory[0].impute_vec = std::move(impute_vec);
|
447
|
+
worker_memory[0].impute_map = std::move(impute_map);
|
448
|
+
}
|
449
|
+
}
|
450
|
+
|
451
|
+
fit_itree((model_outputs != NULL)? &model_outputs->trees[tree] : NULL,
|
452
|
+
(model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : NULL,
|
453
|
+
worker_memory[omp_get_thread_num()],
|
454
|
+
input_data,
|
455
|
+
model_params,
|
456
|
+
(imputer != NULL)? &(imputer->imputer_tree[tree]) : NULL,
|
457
|
+
tree);
|
458
|
+
|
459
|
+
if ((model_outputs != NULL))
|
460
|
+
model_outputs->trees[tree].shrink_to_fit();
|
461
|
+
else
|
462
|
+
model_outputs_ext->hplanes[tree].shrink_to_fit();
|
463
|
+
|
464
|
+
signal(SIGINT, set_interrup_global_variable);
|
465
|
+
}
|
466
|
+
|
467
|
+
/* check if the procedure got interrupted */
|
468
|
+
if (interrupt_switch) return EXIT_FAILURE;
|
469
|
+
interrupt_switch = false;
|
470
|
+
|
471
|
+
if ((model_outputs != NULL))
|
472
|
+
model_outputs->trees.shrink_to_fit();
|
473
|
+
else
|
474
|
+
model_outputs_ext->hplanes.shrink_to_fit();
|
475
|
+
|
476
|
+
/* if calculating similarity/distance, now need to reduce and average */
|
477
|
+
if (calc_dist)
|
478
|
+
gather_sim_result(NULL, &worker_memory,
|
479
|
+
NULL, &input_data,
|
480
|
+
model_outputs, model_outputs_ext,
|
481
|
+
tmat, NULL, 0,
|
482
|
+
model_params.ntrees, false,
|
483
|
+
standardize_dist, nthreads);
|
484
|
+
|
485
|
+
/* same for depths */
|
486
|
+
if (output_depths != NULL)
|
487
|
+
{
|
488
|
+
#ifdef _OPENMP
|
489
|
+
if (nthreads > 1)
|
490
|
+
{
|
491
|
+
for (WorkerMemory &w : worker_memory)
|
492
|
+
{
|
493
|
+
if (w.row_depths.size())
|
494
|
+
{
|
495
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data, output_depths, w, worker_memory)
|
496
|
+
for (size_t_for row = 0; row < input_data.nrows; row++)
|
497
|
+
output_depths[row] += w.row_depths[row];
|
498
|
+
}
|
499
|
+
}
|
500
|
+
}
|
501
|
+
else
|
502
|
+
#endif
|
503
|
+
{
|
504
|
+
std::copy(worker_memory[0].row_depths.begin(), worker_memory[0].row_depths.end(), output_depths);
|
505
|
+
}
|
506
|
+
|
507
|
+
if (standardize_depth)
|
508
|
+
{
|
509
|
+
double depth_divisor = (double)ntrees * ((model_outputs != NULL)?
|
510
|
+
model_outputs->exp_avg_depth : model_outputs_ext->exp_avg_depth);
|
511
|
+
for (size_t_for row = 0; row < nrows; row++)
|
512
|
+
output_depths[row] = exp2( - output_depths[row] / depth_divisor );
|
513
|
+
}
|
514
|
+
|
515
|
+
else
|
516
|
+
{
|
517
|
+
double ntrees_dbl = (double) ntrees;
|
518
|
+
for (size_t_for row = 0; row < nrows; row++)
|
519
|
+
output_depths[row] /= ntrees_dbl;
|
520
|
+
}
|
521
|
+
}
|
522
|
+
|
523
|
+
/* if imputing missing values, now need to reduce and write final values */
|
524
|
+
if (model_params.impute_at_fit)
|
525
|
+
{
|
526
|
+
#ifdef _OPENMP
|
527
|
+
if (nthreads > 1)
|
528
|
+
{
|
529
|
+
for (WorkerMemory &w : worker_memory)
|
530
|
+
combine_tree_imputations(w, impute_vec, impute_map, input_data.has_missing, nthreads);
|
531
|
+
}
|
532
|
+
|
533
|
+
else
|
534
|
+
#endif
|
535
|
+
{
|
536
|
+
impute_vec = std::move(worker_memory[0].impute_vec);
|
537
|
+
impute_map = std::move(worker_memory[0].impute_map);
|
538
|
+
}
|
539
|
+
|
540
|
+
apply_imputation_results(impute_vec, impute_map, *imputer, input_data, nthreads);
|
541
|
+
}
|
542
|
+
|
543
|
+
return EXIT_SUCCESS;
|
544
|
+
}
|
545
|
+
|
546
|
+
|
547
|
+
/* Add additional trees to already-fitted isolation forest model
|
548
|
+
*
|
549
|
+
* Parameters
|
550
|
+
* ==========
|
551
|
+
* - model_outputs
|
552
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
553
|
+
* if the trees are to be added to an extended model. Can only pass one of
|
554
|
+
* 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
|
555
|
+
* so it cannot be run in parallel for the same model object.
|
556
|
+
* - model_outputs_ext
|
557
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
558
|
+
* if the trees are to be added to a single-variable model. Can only pass one of
|
559
|
+
* 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
|
560
|
+
* so it cannot be run in parallel for the same model object.
|
561
|
+
* - numeric_data
|
562
|
+
* Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
|
563
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
564
|
+
* Pass NULL if there are no dense numeric columns.
|
565
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
566
|
+
* If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
|
567
|
+
* of columns, either as dense or as sparse arrays.
|
568
|
+
* - ncols_numeric
|
569
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
570
|
+
* what was originally passed to 'fit_iforest'.
|
571
|
+
* - categ_data
|
572
|
+
* Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
|
573
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
574
|
+
* Pass NULL if there are no categorical columns. The encoding must be the same as was used
|
575
|
+
* in the data to which the model was fit.
|
576
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
577
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
578
|
+
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
579
|
+
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
580
|
+
* must be the same as was used in the data to which the model was fit.
|
581
|
+
* If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
|
582
|
+
* of columns and the same category encoding.
|
583
|
+
* - ncols_categ
|
584
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
585
|
+
* what was originally passed to 'fit_iforest'.
|
586
|
+
* - ncat
|
587
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
588
|
+
* what was originally passed to 'fit_iforest'.
|
589
|
+
* - Xc[nnz]
|
590
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
591
|
+
* Pass NULL if there are no sparse numeric columns.
|
592
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
593
|
+
* - Xc_ind[nnz]
|
594
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
595
|
+
* Pass NULL if there are no sparse numeric columns.
|
596
|
+
* - Xc_indptr[ncols_numeric + 1]
|
597
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
598
|
+
* start and at entry [col + 1] where does column 'col' end.
|
599
|
+
* Pass NULL if there are no sparse numeric columns.
|
600
|
+
* - ndim
|
601
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
602
|
+
* what was originally passed to 'fit_iforest'.
|
603
|
+
* - ntry
|
604
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
605
|
+
* what was originally passed to 'fit_iforest'.
|
606
|
+
* - coef_type
|
607
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
608
|
+
* what was originally passed to 'fit_iforest'.
|
609
|
+
* - sample_weights
|
610
|
+
* Weights for the rows when adding this tree, either as sampling importances when using
|
611
|
+
* sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
|
612
|
+
* in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
|
613
|
+
* the row appeared twice, thus it's less of an outlier) - how this is taken is determined
|
614
|
+
* through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
|
615
|
+
* Pass NULL if the rows all have uniform weights.
|
616
|
+
* - nrows
|
617
|
+
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
618
|
+
* - max_depth
|
619
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
620
|
+
* what was originally passed to 'fit_iforest'.
|
621
|
+
* - limit_depth
|
622
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
623
|
+
* what was originally passed to 'fit_iforest'.
|
624
|
+
* - penalize_range
|
625
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
626
|
+
* what was originally passed to 'fit_iforest'.
|
627
|
+
* - col_weights
|
628
|
+
* Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
|
629
|
+
* Ignored when picking columns by deterministic criterion.
|
630
|
+
* If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
|
631
|
+
* - weigh_by_kurt
|
632
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
633
|
+
* what was originally passed to 'fit_iforest'.
|
634
|
+
* - prob_pick_by_gain_avg
|
635
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
636
|
+
* what was originally passed to 'fit_iforest'.
|
637
|
+
* - prob_split_by_gain_avg
|
638
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
639
|
+
* what was originally passed to 'fit_iforest'.
|
640
|
+
* - prob_pick_by_gain_pl
|
641
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
642
|
+
* what was originally passed to 'fit_iforest'.
|
643
|
+
* - prob_split_by_gain_pl
|
644
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
645
|
+
* what was originally passed to 'fit_iforest'.
|
646
|
+
* - min_gain
|
647
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
648
|
+
* what was originally passed to 'fit_iforest'.
|
649
|
+
* - missing_action
|
650
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
651
|
+
* what was originally passed to 'fit_iforest'.
|
652
|
+
* - cat_split_type
|
653
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
654
|
+
* what was originally passed to 'fit_iforest'.
|
655
|
+
* - new_cat_action
|
656
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
657
|
+
* what was originally passed to 'fit_iforest'.
|
658
|
+
* - depth_imp
|
659
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
660
|
+
* what was originally passed to 'fit_iforest'.
|
661
|
+
* - weigh_imp_rows
|
662
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
663
|
+
* what was originally passed to 'fit_iforest'.
|
664
|
+
* - all_perm
|
665
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
666
|
+
* what was originally passed to 'fit_iforest'.
|
667
|
+
* - coef_by_prop
|
668
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
669
|
+
* what was originally passed to 'fit_iforest'.
|
670
|
+
* - impute_nodes
|
671
|
+
* Pointer to already-allocated imputation nodes for the tree that will be built. Note that the number of
|
672
|
+
* entries in the imputation object must match the number of fitted trees when it is used. Pass
|
673
|
+
* NULL if no imputation node is required.
|
674
|
+
* - min_imp_obs
|
675
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
676
|
+
* what was originally passed to 'fit_iforest'.
|
677
|
+
* - random_seed
|
678
|
+
* Seed that will be used to generate random numbers used by the model.
|
679
|
+
*/
|
680
|
+
int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
681
|
+
double numeric_data[], size_t ncols_numeric,
|
682
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
683
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
684
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
685
|
+
double sample_weights[], size_t nrows, size_t max_depth,
|
686
|
+
bool limit_depth, bool penalize_range,
|
687
|
+
double col_weights[], bool weigh_by_kurt,
|
688
|
+
double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
|
689
|
+
double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
|
690
|
+
double min_gain, MissingAction missing_action,
|
691
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
692
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
693
|
+
bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
|
694
|
+
uint64_t random_seed)
|
695
|
+
{
|
696
|
+
int max_categ = 0;
|
697
|
+
for (size_t col = 0; col < ncols_categ; col++)
|
698
|
+
max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
|
699
|
+
|
700
|
+
InputData input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
|
701
|
+
nrows, ncols_numeric + ncols_categ, sample_weights,
|
702
|
+
false, col_weights,
|
703
|
+
Xc, Xc_ind, Xc_indptr,
|
704
|
+
0, 0, std::vector<double>(),
|
705
|
+
std::vector<char>(), 0};
|
706
|
+
ModelParams model_params = {false, nrows, (size_t)1,
|
707
|
+
max_depth? max_depth : (nrows - 1),
|
708
|
+
penalize_range, random_seed, weigh_by_kurt,
|
709
|
+
prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
|
710
|
+
prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
|
711
|
+
min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
|
712
|
+
(model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
|
713
|
+
coef_type, coef_by_prop, false, false, false, depth_imp, weigh_imp_rows, min_imp_obs};
|
714
|
+
|
715
|
+
std::unique_ptr<WorkerMemory> workspace = std::unique_ptr<WorkerMemory>(new WorkerMemory);
|
716
|
+
|
717
|
+
size_t last_tree;
|
718
|
+
if (model_outputs != NULL)
|
719
|
+
{
|
720
|
+
last_tree = model_outputs->trees.size();
|
721
|
+
model_outputs->trees.emplace_back();
|
722
|
+
}
|
723
|
+
|
724
|
+
else
|
725
|
+
{
|
726
|
+
last_tree = model_outputs_ext->hplanes.size();
|
727
|
+
model_outputs_ext->hplanes.emplace_back();
|
728
|
+
}
|
729
|
+
|
730
|
+
fit_itree((model_outputs != NULL)? &model_outputs->trees.back() : NULL,
|
731
|
+
(model_outputs_ext != NULL)? &model_outputs_ext->hplanes.back() : NULL,
|
732
|
+
*workspace,
|
733
|
+
input_data,
|
734
|
+
model_params,
|
735
|
+
impute_nodes,
|
736
|
+
last_tree);
|
737
|
+
|
738
|
+
if ((model_outputs != NULL))
|
739
|
+
model_outputs->trees.back().shrink_to_fit();
|
740
|
+
else
|
741
|
+
model_outputs_ext->hplanes.back().shrink_to_fit();
|
742
|
+
|
743
|
+
return EXIT_SUCCESS;
|
744
|
+
}
|
745
|
+
|
746
|
+
/* Fit a single isolation tree (or extended-model hyperplane tree) into the
 * pre-allocated slot 'tree_num', reusing the thread-private 'workspace'
 * buffers across calls.
 *
 * Parameters
 * ==========
 * - tree_root
 *       Destination node vector for a single-variable tree. Pass NULL when
 *       fitting an extended (hyperplane) tree instead; exactly one of
 *       'tree_root' / 'hplane_root' must be non-NULL.
 * - hplane_root
 *       Destination node vector for an extended-model tree, or NULL.
 * - workspace
 *       Thread-private scratch memory. Buffers are lazily allocated on first
 *       use and reused on subsequent trees fitted by the same worker.
 * - input_data
 *       The data being fitted (dense/sparse numeric, categorical, weights).
 * - model_params
 *       Fitting hyperparameters shared by all trees of the model.
 * - impute_nodes
 *       Imputation nodes to fill for this tree, or NULL if not imputing.
 * - tree_num
 *       Index of this tree; added to the model seed so each tree gets a
 *       distinct, reproducible RNG stream.
 */
void fit_itree(std::vector<IsoTree>    *tree_root,
               std::vector<IsoHPlane>  *hplane_root,
               WorkerMemory            &workspace,
               InputData               &input_data,
               ModelParams             &model_params,
               std::vector<ImputeNode> *impute_nodes,
               size_t                  tree_num)
{
    /* initialize array for depths if called for
       (ix_arr empty means this is the worker's first tree, so the
       once-per-worker allocations below happen together with this one) */
    if (!workspace.ix_arr.size() && model_params.calc_depth)
        workspace.row_depths.resize(input_data.nrows, 0);

    /* choose random sample of rows */
    if (!workspace.ix_arr.size()) workspace.ix_arr.resize(model_params.sample_size);
    /* log2_n > 0 signals weighted sampling through a binary tree of
       cumulative weights; reset it to the precomputed initial state */
    if (input_data.log2_n > 0)
        workspace.btree_weights.assign(input_data.btree_weights_init.begin(),
                                       input_data.btree_weights_init.end());
    /* per-tree deterministic seed: model seed offset by the tree index */
    workspace.rnd_generator.seed(model_params.random_seed + tree_num);
    if (input_data.col_weights != NULL)
        workspace.col_sampler = std::discrete_distribution<size_t>(input_data.col_weights,
                                                                   input_data.col_weights + input_data.ncols_numeric + input_data.ncols_categ);
    workspace.runif = std::uniform_int_distribution<size_t>(0, input_data.ncols_tot - 1);
    workspace.rbin  = std::uniform_real_distribution<double>(0, 1);
    sample_random_rows(workspace.ix_arr, input_data.nrows, model_params.with_replacement,
                       workspace.rnd_generator, workspace.ix_all,
                       (input_data.weight_as_sample)? input_data.sample_weights : NULL,
                       workspace.btree_weights, input_data.log2_n, input_data.btree_offset,
                       workspace.is_repeated);
    /* the recursion works on the inclusive index window [st, end] of ix_arr */
    workspace.st  = 0;
    workspace.end = model_params.sample_size - 1;
    /* mark every column as splittable again (may be narrowed by kurtosis below) */
    if (!workspace.cols_possible.size())
        workspace.cols_possible.resize(input_data.ncols_tot, true);
    else
        workspace.cols_possible.assign(workspace.cols_possible.size(), true);

    /* set expected tree size and add root node */
    {
        /* heuristic upper bound on node count: 2 * sample_size, guarded
           against size_t overflow, and capped at 2^max_depth when the
           depth limit is small enough for that cap to matter */
        size_t exp_nodes = 2 * model_params.sample_size;
        if (model_params.sample_size >= (SIZE_MAX / (size_t)2))
            exp_nodes = SIZE_MAX;
        if (model_params.max_depth <= (size_t)30)
            exp_nodes = std::min(exp_nodes, pow2(model_params.max_depth));
        if (tree_root != NULL)
        {
            tree_root->reserve(exp_nodes);
            tree_root->emplace_back();
        }
        else
        {
            hplane_root->reserve(exp_nodes);
            hplane_root->emplace_back();
        }
        if (impute_nodes != NULL)
        {
            impute_nodes->reserve(exp_nodes);
            impute_nodes->emplace_back((size_t) 0);
        }
    }

    /* initialize array with candidate categories if not already done */
    if (!workspace.categs.size())
        workspace.categs.resize(input_data.max_categ);

    /* for the extended model, initialize extra vectors and objects
       (comb_val empty means they were not set up by an earlier tree) */
    if (hplane_root != NULL && !workspace.comb_val.size())
    {
        workspace.coef_norm = std::normal_distribution<double>(0, 1);
        if (model_params.coef_type == Uniform)
            workspace.coef_unif = std::uniform_real_distribution<double>(-1, 1);

        workspace.cols_shuffled.resize(input_data.ncols_tot);
        workspace.comb_val.resize(model_params.sample_size);
        workspace.col_take.resize(model_params.ndim);
        workspace.col_take_type.resize(model_params.ndim);

        if (input_data.ncols_numeric)
        {
            workspace.ext_offset.resize(input_data.ncols_tot);
            workspace.ext_coef.resize(input_data.ncols_tot);
            workspace.ext_mean.resize(input_data.ncols_tot);
        }

        if (input_data.ncols_categ)
        {
            workspace.ext_fill_new.resize(input_data.max_categ);
            switch(model_params.cat_split_type)
            {
                case SingleCateg:
                {
                    workspace.chosen_cat.resize(input_data.max_categ);
                    break;
                }

                case SubSet:
                {
                    /* one coefficient vector per column, each sized for the
                       widest categorical column */
                    workspace.ext_cat_coef.resize(input_data.ncols_tot);
                    for (std::vector<double> &v : workspace.ext_cat_coef)
                        v.resize(input_data.max_categ);
                    break;
                }
            }
        }

        workspace.ext_fill_val.resize(input_data.ncols_tot);

    }

    /* if it contains missing values, also have to set an array of weights,
       which will be modified during iterations when there are NAs.
       If there are already density weights, need to standardize them to sum up to
       the sample size here */
    long double weight_scaling = 0;
    if (model_params.missing_action == Divide || (input_data.sample_weights != NULL && !input_data.weight_as_sample))
    {
        workspace.weights_map.clear();

        /* if the sub-sample size is small relative to the full sample size, use a mapping
           (hash map keyed by row index avoids an O(nrows) dense array) */
        if (model_params.sample_size < input_data.nrows / 4)
        {
            if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
            {
                /* copy the sampled rows' density weights, then rescale them
                   so that they sum to sample_size */
                for (const size_t ix : workspace.ix_arr)
                {
                    weight_scaling += input_data.sample_weights[ix];
                    workspace.weights_map[ix] = input_data.sample_weights[ix];
                }
                weight_scaling = (long double)model_params.sample_size / weight_scaling;
                for (auto &w : workspace.weights_map)
                    w.second *= weight_scaling;

            }

            else
            {
                /* no density weights: every sampled row starts at weight 1 */
                for (const size_t ix : workspace.ix_arr)
                    workspace.weights_map[ix] = 1;
            }

        }

        /* if the sub-sample size is large, fill a full array matching to the sample size */
        else
        {
            /* first tree on this worker: allocate the dense array */
            if (!workspace.weights_arr.size())
            {
                if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
                {
                    workspace.weights_arr.assign(input_data.sample_weights, input_data.sample_weights + input_data.nrows);
                    /* total weight is accumulated over the SAMPLED rows only,
                       in long double to limit rounding error */
                    weight_scaling = std::accumulate(workspace.ix_arr.begin(),
                                                     workspace.ix_arr.end(),
                                                     (long double)0,
                                                     [&input_data](const long double a, const size_t b){return a + (long double)input_data.sample_weights[b];}
                                                     );
                    weight_scaling = (long double)model_params.sample_size / weight_scaling;
                    for (double &w : workspace.weights_arr)
                        w *= weight_scaling;
                }

                else
                {
                    workspace.weights_arr.resize(input_data.nrows, (double)1);
                }

            }

            /* array already exists from a previous tree: overwrite in place */
            else
            {
                if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
                {
                    for (const size_t ix : workspace.ix_arr)
                    {
                        weight_scaling += input_data.sample_weights[ix];
                        workspace.weights_arr[ix] = input_data.sample_weights[ix];
                    }
                    weight_scaling = (long double)model_params.sample_size / weight_scaling;
                    for (double &w : workspace.weights_arr)
                        w *= weight_scaling;

                }

                else
                {
                    /* Note: while not all of them need to be overwritten, this is faster
                       (sub-sample size was already determined to be at least 1/4 of the sample size) */
                    std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), (double)1);
                }
            }
        }
    }

    /* if producing distance/similarity, also need to initialize the triangular matrix
       (upper-triangle of an nrows x nrows matrix stored flat) */
    if (model_params.calc_dist && !workspace.tmat_sep.size())
        workspace.tmat_sep.resize((input_data.nrows * (input_data.nrows - 1)) / 2, 0);

    /* make space for buffers if not already allocated
       (needed only when split gain, kurtosis weighting, or the extended
        model will be computed; sized to the worst case of all of them) */
    if (
        (model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
         model_params.prob_split_by_gain_pl || model_params.prob_pick_by_gain_pl ||
         model_params.weigh_by_kurt || hplane_root != NULL)
            &&
        (!workspace.buffer_dbl.size() && !workspace.buffer_szt.size() && !workspace.buffer_chr.size())
        )
    {
        size_t min_size_dbl = 0;
        size_t min_size_szt = 0;
        size_t min_size_chr = 0;

        bool gain = model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
                    model_params.prob_split_by_gain_pl || model_params.prob_pick_by_gain_pl;

        if (input_data.ncols_categ)
        {
            min_size_szt = 2 * input_data.max_categ;
            min_size_dbl = input_data.max_categ + 1;
            if (gain && model_params.cat_split_type == SubSet)
                min_size_chr = input_data.max_categ;
        }

        /* sparse numeric data with gain criterion needs per-row scratch */
        if (input_data.Xc != NULL && gain)
        {
            min_size_szt = std::max(min_size_szt, model_params.sample_size);
            min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
        }

        /* for the extended model */
        if (hplane_root != NULL)
        {
            /* presumably sized for a power-of-two recursion over columns
               (pow2/log2ceil are project helpers) — confirm against utils */
            min_size_dbl = std::max(min_size_dbl, pow2(log2ceil(input_data.ncols_tot) + 1));
            if (model_params.missing_action != Fail)
            {
                min_size_szt = std::max(min_size_szt, model_params.sample_size);
                min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
            }

            if (input_data.ncols_categ && model_params.cat_split_type == SubSet)
            {
                min_size_szt = std::max(min_size_szt, 2 * (size_t)input_data.max_categ + 1);
                min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
            }

            if (model_params.weigh_by_kurt)
                min_size_szt = std::max(min_size_szt, input_data.ncols_tot);
        }

        /* now resize */
        if (workspace.buffer_dbl.size() < min_size_dbl)
            workspace.buffer_dbl.resize(min_size_dbl);

        if (workspace.buffer_szt.size() < min_size_szt)
            workspace.buffer_szt.resize(min_size_szt);

        if (workspace.buffer_chr.size() < min_size_chr)
            workspace.buffer_chr.resize(min_size_chr);

        /* for guided column choice, need to also remember the best split so far */
        if (
            model_params.cat_split_type == SubSet &&
            (
                model_params.prob_pick_by_gain_avg ||
                model_params.prob_pick_by_gain_pl
            )
            )
        {
            workspace.this_split_categ.resize(input_data.max_categ);
        }

    }

    /* weigh columns by kurtosis in the sample if required */
    if (model_params.weigh_by_kurt)
    {
        std::vector<double> kurt_weights(input_data.ncols_numeric + input_data.ncols_categ);

        if (input_data.Xc == NULL)
        {
            /* dense numeric columns: column-major layout, so column 'col'
               starts at offset col * nrows */
            for (size_t col = 0; col < input_data.ncols_numeric; col++)
                kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                  input_data.numeric_data + col * input_data.nrows,
                                                  model_params.missing_action);
        }

        else
        {
            /* sparse (CSC) path: the kurtosis helper requires sorted row indices */
            std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
            for (size_t col = 0; col < input_data.ncols_numeric; col++)
                kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end, col,
                                                  input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
                                                  model_params.missing_action);
        }

        for (size_t col = 0; col < input_data.ncols_categ; col++)
            kurt_weights[col + input_data.ncols_numeric] =
                calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
                              input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
                              workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
                              model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);

        /* columns with non-positive or non-finite kurtosis cannot be split on */
        for (size_t col = 0; col < input_data.ncols_tot; col++)
            if (kurt_weights[col] <= 0 || is_na_or_inf(kurt_weights[col]))
                workspace.cols_possible[col] = false;

        /* replace the column sampler with one weighted by kurtosis
           (overrides any user-supplied col_weights set above) */
        workspace.col_sampler = std::discrete_distribution<size_t>(kurt_weights.begin(), kurt_weights.end());
    }

    /* grow the tree recursively, starting from the root node added above */
    if (tree_root != NULL)
        split_itree_recursive(*tree_root,
                              workspace,
                              input_data,
                              model_params,
                              impute_nodes,
                              0);
    else
        split_hplane_recursive(*hplane_root,
                               workspace,
                               input_data,
                               model_params,
                               impute_nodes,
                               0);

    /* if producing imputation structs, only need to keep the ones for terminal nodes */
    if (impute_nodes != NULL)
        drop_nonterminal_imp_node(*impute_nodes, tree_root, hplane_root);
}
|