isotree 0.2.0 → 0.3.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -1
- data/LICENSE.txt +2 -2
- data/README.md +41 -23
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/dataset.rb +0 -1
- data/lib/isotree/isolation_forest.rb +114 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +40 -106
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
data/vendor/isotree/src/fit_model.cpp
@@ -1,1090 +0,0 @@
|
|
1
|
-
/* Isolation forests and variations thereof, with adjustments for incorporation
|
2
|
-
* of categorical variables and missing values.
|
3
|
-
* Writen for C++11 standard and aimed at being used in R and Python.
|
4
|
-
*
|
5
|
-
* This library is based on the following works:
|
6
|
-
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
7
|
-
* "Isolation forest."
|
8
|
-
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
9
|
-
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
10
|
-
* "Isolation-based anomaly detection."
|
11
|
-
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
12
|
-
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
13
|
-
* "Extended Isolation Forest."
|
14
|
-
* arXiv preprint arXiv:1811.02141 (2018).
|
15
|
-
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
16
|
-
* "On detecting clustered anomalies using SCiForest."
|
17
|
-
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
18
|
-
* [5] https://sourceforge.net/projects/iforest/
|
19
|
-
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
20
|
-
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
21
|
-
* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
|
22
|
-
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
|
-
*
|
24
|
-
* BSD 2-Clause License
|
25
|
-
* Copyright (c) 2020, David Cortes
|
26
|
-
* All rights reserved.
|
27
|
-
* Redistribution and use in source and binary forms, with or without
|
28
|
-
* modification, are permitted provided that the following conditions are met:
|
29
|
-
* * Redistributions of source code must retain the above copyright notice, this
|
30
|
-
* list of conditions and the following disclaimer.
|
31
|
-
* * Redistributions in binary form must reproduce the above copyright notice,
|
32
|
-
* this list of conditions and the following disclaimer in the documentation
|
33
|
-
* and/or other materials provided with the distribution.
|
34
|
-
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
35
|
-
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
36
|
-
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
37
|
-
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
38
|
-
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
39
|
-
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
40
|
-
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
41
|
-
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
42
|
-
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
43
|
-
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
44
|
-
*/
|
45
|
-
#include "isotree.hpp"
|
46
|
-
|
47
|
-
bool interrupt_switch;
|
48
|
-
|
49
|
-
/* Fit Isolation Forest model, or variant of it such as SCiForest
|
50
|
-
*
|
51
|
-
* Parameters:
|
52
|
-
* ===========
|
53
|
-
* - model_outputs (out)
|
54
|
-
* Pointer to already allocated isolation forest model object for single-variable splits.
|
55
|
-
* If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
|
56
|
-
* additional trees through function 'add_tree'.
|
57
|
-
* - model_outputs_ext (out)
|
58
|
-
* Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
|
59
|
-
* Note that if 'ndim' = 1, must use instead the single-variable model object.
|
60
|
-
* If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
|
61
|
-
* additional trees through function 'add_tree'.
|
62
|
-
* - numeric_data[nrows * ncols_numeric]
|
63
|
-
* Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
|
64
|
-
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
65
|
-
* Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
|
66
|
-
* no sparse numeric data either).
|
67
|
-
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
68
|
-
* - ncols_numeric
|
69
|
-
* Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
|
70
|
-
* - categ_data[nrows * ncols_categ]
|
71
|
-
* Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
|
72
|
-
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
73
|
-
* Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
|
74
|
-
* Each category should be represented as an integer, and these integers must start at zero and
|
75
|
-
* be in consecutive order - i.e. if category '3' is present, category '2' must also be present
|
76
|
-
* (note that they are not treated as being ordinal, this is just an encoding). Missing values
|
77
|
-
* should be encoded as negative numbers such as (-1).
|
78
|
-
* - ncols_categ
|
79
|
-
* Number of categorical columns in the data.
|
80
|
-
* - ncat[ncols_categ]
|
81
|
-
* Number of categories in each categorical column. E.g. if the highest code for a column is '4',
|
82
|
-
* the number of categories for that column is '5' (zero is one category).
|
83
|
-
* - Xc[nnz]
|
84
|
-
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
85
|
-
* Pass NULL if there are no sparse numeric columns.
|
86
|
-
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
87
|
-
* - Xc_ind[nnz]
|
88
|
-
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
89
|
-
* Pass NULL if there are no sparse numeric columns.
|
90
|
-
* - Xc_indptr[ncols_numeric + 1]
|
91
|
-
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
92
|
-
* start and at entry [col + 1] where does column 'col' end.
|
93
|
-
* Pass NULL if there are no sparse numeric columns.
|
94
|
-
* - ndim
|
95
|
-
* How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
|
96
|
-
* the single-variable model. Note that the model object pointer passed must also
|
97
|
-
* agree with the value passed to 'ndim'.
|
98
|
-
* - ntry
|
99
|
-
* In the split-criterion extended model, how many random hyperplanes to evaluate in
|
100
|
-
* order to decide which one is best to take. Ignored for the single-variable case
|
101
|
-
* and for random splits.
|
102
|
-
* - coef_type
|
103
|
-
* For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
|
104
|
-
* (as proposed in [3]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [4]. Ignored for the
|
105
|
-
* single-variable model.
|
106
|
-
* - sample_weights[nrows]
|
107
|
-
* Weights for the rows when building a tree, either as sampling importances when using
|
108
|
-
* sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
|
109
|
-
* in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
|
110
|
-
* the row appeared twice, thus it's less of an outlier) - how this is taken is determined
|
111
|
-
* through parameter 'weight_as_sample'.
|
112
|
-
* Pass NULL if the rows all have uniform weights.
|
113
|
-
* - with_replacement
|
114
|
-
* Whether to produce sub-samples with replacement or not.
|
115
|
-
* - weight_as_sample
|
116
|
-
* If passing 'sample_weights', whether to consider those weights as row sampling weights (i.e. the higher
|
117
|
-
* the weights, the more likely the observation will end up included in each tree sub-sample), or as distribution
|
118
|
-
* density weights (i.e. putting a weight of two is the same as if the row appeared twice, thus higher weight makes it
|
119
|
-
* less of an outlier). Note that sampling weight is only used when sub-sampling data for each tree.
|
120
|
-
* - nrows
|
121
|
-
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
122
|
-
* - sample_size
|
123
|
-
* Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
|
124
|
-
* 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
|
125
|
-
* random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
|
126
|
-
* will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
|
127
|
-
* in [5] is 'nrows' here.
|
128
|
-
* - ntrees
|
129
|
-
* Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
|
130
|
-
* author's code in [5] is 10.
|
131
|
-
* - max_depth
|
132
|
-
* Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
|
133
|
-
* - limit_depth
|
134
|
-
* Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
|
135
|
-
* terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
|
136
|
-
* will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
|
137
|
-
* tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass higher values if
|
138
|
-
* using the model for purposes other than outlier detection.
|
139
|
-
* - penalize_range
|
140
|
-
* Whether to penalize (add +1 to the terminal depth) observations at prediction time that have a value
|
141
|
-
* of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
|
142
|
-
* reasonable range in the data being split (given by 2 * range in data and centered around the split point),
|
143
|
-
* as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
|
144
|
-
* when splitting by categorical variables.
|
145
|
-
* - standardize_dist
|
146
|
-
* If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
|
147
|
-
* depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
|
148
|
-
* - tmat[nrows * (nrows - 1) / 2]
|
149
|
-
* Array in which to calculate average separation depths or standardized distance metric (see documentation
|
150
|
-
* for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
|
151
|
-
* the regular model process. If passing this output argument, the sample size must be the same as the number
|
152
|
-
* of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
|
153
|
-
* output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
|
154
|
-
* entry 0 <= i < j < n will be located at position
|
155
|
-
* p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
|
156
|
-
* Can be converted to a dense square matrix through function 'tmat_to_dense'.
|
157
|
-
* - output_depths[nrows]
|
158
|
-
* Array in which to calculate average path depths or standardized outlierness metric (see documentation
|
159
|
-
* for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
|
160
|
-
* the regular model process. If passing this output argument, the sample size must be the same as the number
|
161
|
-
* of rows. If not NULL, must already be initialized to zeros.
|
162
|
-
* - standardize_depth
|
163
|
-
* If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
|
164
|
-
* a metric in which the more outlier is an observation, the closer this standardized metric will be to 1,
|
165
|
-
* with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
|
166
|
-
* the average depth of each row across all trees.
|
167
|
-
* - col_weights[ncols_numeric + ncols_categ]
|
168
|
-
* Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
|
169
|
-
* Ignored when picking columns by deterministic criterion.
|
170
|
-
* If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
|
171
|
-
* - weigh_by_kurt
|
172
|
-
* Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
|
173
|
-
* for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
|
174
|
-
* sample, so if not using sub-samples, it's better to pass column weights calculated externally. For
|
175
|
-
* categorical columns, will calculate expected kurtosis if the column was converted to numerical by
|
176
|
-
* assigning to each category a random number ~ Unif(0, 1).
|
177
|
-
* - prob_pick_by_gain_avg
|
178
|
-
* Probability of making each split in the single-variable model by choosing a column and split point in that
|
179
|
-
* same column as both the column and split point that gives the largest averaged gain (as proposed in [4]) across
|
180
|
-
* all available columns and possible splits in each column. Note that this implies evaluating every single column
|
181
|
-
* in the sample data when this type of split happens, which will potentially make the model fitting much slower,
|
182
|
-
* but has no impact on prediction time. For categorical variables, will take the expected standard deviation that
|
183
|
-
* would be gotten if the column were converted to numerical by assigning to each category a random number ~ Unif(0, 1)
|
184
|
-
* and calculate gain with those assumed standard deviations. For the extended model, this parameter indicates the probability that the
|
185
|
-
* split point in the chosen linear combination of variables will be decided by this averaged gain criterion. Compared to
|
186
|
-
* a pooled average, this tends to result in more cases in which a single observation or very few of them are put into
|
187
|
-
* one branch. Recommended to use sub-samples (parameter `sample_size`) when passing this parameter. When splits are
|
188
|
-
* not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl',
|
189
|
-
* both the column and the split point are decided at random.
|
190
|
-
* Default setting for [1], [2], [3] is zero, and default for [4] is 1. This is the randomization parameter that can
|
191
|
-
* be passed to the author's original code in [5]. Note that, if passing value 1 (100%) with no sub-sampling and using the
|
192
|
-
* single-variable model, every single tree will have the exact same splits.
|
193
|
-
* - prob_split_by_gain_avg
|
194
|
-
* Probability of making each split by selecting a column at random and determining the split point as
|
195
|
-
* that which gives the highest averaged gain. Not supported for the extended model as the splits are on
|
196
|
-
* linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_avg' for more details.
|
197
|
-
* - prob_pick_by_gain_pl
|
198
|
-
* Probability of making each split in the single-variable model by choosing a column and split point in that
|
199
|
-
* same column as both the column and split point that gives the largest pooled gain (as used in decision tree
|
200
|
-
* classifiers such as C4.5 in [7]) across all available columns and possible splits in each column. Note
|
201
|
-
* that this implies evaluating every single column in the sample data when this type of split happens, which
|
202
|
-
* will potentially make the model fitting much slower, but has no impact on prediction time. For categorical
|
203
|
-
* variables, will use shannon entropy instead (like in [7]). For the extended model, this parameter indicates the probability
|
204
|
-
* that the split point in the chosen linear combination of variables will be decided by this pooled gain
|
205
|
-
* criterion. Compared to a simple average, this tends to result in more evenly-divided splits and more clustered
|
206
|
-
* groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
|
207
|
-
* When used for outlier detection, higher values of this parameter result in models that are able to better flag
|
208
|
-
* outliers in the training data, but generalize poorly to outliers in new data and to values of variables
|
209
|
-
* outside of the ranges from the training data. Passing small 'sample_size' and high values of this parameter will
|
210
|
-
* tend to flag too many outliers. When splits are not made according to any of 'prob_pick_by_gain_avg',
|
211
|
-
* 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl', both the column and the split point
|
212
|
-
* are decided at random. Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
|
213
|
-
* every single tree will have the exact same splits.
|
214
|
-
* - prob_split_by_gain_pl
|
215
|
-
* Probability of making each split by selecting a column at random and determining the split point as
|
216
|
-
* that which gives the highest pooled gain. Not supported for the extended model as the splits are on
|
217
|
-
* linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_pl' for more details.
|
218
|
-
* - min_gain
|
219
|
-
* Minimum gain that a split threshold needs to produce in order to proceed with a split. Only used when the splits
|
220
|
-
* are decided by a gain criterion (either pooled or averaged). If the highest possible gain in the evaluated
|
221
|
-
* splits at a node is below this threshold, that node becomes a terminal node.
|
222
|
-
* - missing_action
|
223
|
-
* How to handle missing data at both fitting and prediction time. Options are a) "Divide" (for the single-variable
|
224
|
-
* model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
|
225
|
-
* the data that went to each branch when fitting the model, b) "Impute", which will assign observations to the
|
226
|
-
* branch with the most observations in the single-variable model, or fill in missing values with the median
|
227
|
-
* of each column of the sample from which the split was made in the extended model (recommended), c) "Fail" which will assume
|
228
|
-
* there are no missing values and will trigger undefined behavior if it encounters any. In the extended model, infinite
|
229
|
-
* values will be treated as missing. Note that passing "fail" might crash the process if there turn out to be
|
230
|
-
* missing values, but will otherwise produce faster fitting and prediction times along with decreased model object sizes.
|
231
|
-
* Models from [1], [2], [3], [4] correspond to "Fail" here.
|
232
|
-
* - cat_split_type
|
233
|
-
* Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
|
234
|
-
* a single category to a branch and the rest to the other branch. For the extended model, whether to
|
235
|
-
* give each category a coefficient, or only one while the rest get zero.
|
236
|
-
* - new_cat_action
|
237
|
-
* What to do after splitting a categorical feature when new data that reaches that split has categories that
|
238
|
-
* the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
|
239
|
-
* in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
|
240
|
-
* data that went to each branch when fitting the model, and in the extended model will assign
|
241
|
-
* them the median value for that column that was added to the linear combination of features, b) "Smallest", which will
|
242
|
-
* assign all observations with unseen categories in the split to the branch that had fewer observations when
|
243
|
-
* fitting the model, c) "Random", which will assing a branch (coefficient in the extended model) at random for
|
244
|
-
* each category beforehand, even if no observations had that category when fitting the model. Ignored when
|
245
|
-
* passing 'cat_split_type' = 'SingleCateg'.
|
246
|
-
* - all_perm
|
247
|
-
* When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
|
248
|
-
* whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
|
249
|
-
* will sort the categories by their frequency and make a grouping in this sorted order. Note that the
|
250
|
-
* number of combinations evaluated (if 'true') is the factorial of the number of present categories in
|
251
|
-
* a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
|
252
|
-
* category in a separate branch, so not evaluating all permutations (passing 'false') will make it
|
253
|
-
* possible to select other splits that respect the sorted frequency order.
|
254
|
-
* The total number of combinations must be a number that can fit into a 'size_t' variable - for x64-64
|
255
|
-
* systems, this means no column can have more than 20 different categories if using 'all_perm=true',
|
256
|
-
* but note that this is not checked within the function.
|
257
|
-
* Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
|
258
|
-
* - coef_by_prop
|
259
|
-
* In the extended model, whether to sort the randomly-generated coefficients for categories
|
260
|
-
* according to their relative frequency in the tree node. This might provide better results when using
|
261
|
-
* categorical variables with too many categories, but is not recommended, and not reflective of
|
262
|
-
* real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
|
263
|
-
* variables.
|
264
|
-
* - imputer (out)
|
265
|
-
* Pointer to already-allocated imputer object, which can be used to produce missing value imputations
|
266
|
-
* in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
|
267
|
-
* 'missing_action' as missing values inside the model are treated differently and follow their own imputation
|
268
|
-
* or division strategy.
|
269
|
-
* - min_imp_obs
|
270
|
-
* Minimum number of observations with which an imputation value can be produced. Ignored if passing
|
271
|
-
* 'build_imputer' = 'false'.
|
272
|
-
* - depth_imp
|
273
|
-
* How to weight observations according to their depth when used for imputing missing values. Passing
|
274
|
-
* "Higher" will weigh observations higher the further down the tree (away from the root node) the
|
275
|
-
* terminal node is, while "lower" will do the opposite, and "Sane" will not modify the weights according
|
276
|
-
* to node depth in the tree. Implemented for testing purposes and not recommended to change
|
277
|
-
* from the default. Ignored when not passing 'impute_nodes'.
|
278
|
-
* - weigh_imp_rows
|
279
|
-
* How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
|
280
|
-
* a node inversely proportional to the number of observations that end up there, while "Proportional"
|
281
|
-
* will weight them heavier the more observations there are, and "Flat" will weigh all nodes the same
|
282
|
-
* in this regard regardless of how many observations end up there. Implemented for testing purposes
|
283
|
-
* and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
|
284
|
-
* - impute_at_fit
|
285
|
-
* Whether to impute missing values in the input data as the model is being built. If passing 'true',
|
286
|
-
* then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
|
287
|
-
* 'categ_data', and 'Xc', will get overwritten with the imputations produced.
|
288
|
-
* - random_seed
|
289
|
-
* Seed that will be used to generate random numbers used by the model.
|
290
|
-
* - handle_interrupt
|
291
|
-
* Whether to handle interrupt signals while the process is running. Note that this will
|
292
|
-
* interfere with interrupt handles when the procedure is called from interpreted languages
|
293
|
-
* such as Python or R.
|
294
|
-
* - nthreads
|
295
|
-
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
296
|
-
* allocated, even if the thread does not end up being used. Ignored when not building with
|
297
|
-
* OpenMP support.
|
298
|
-
*
|
299
|
-
* Returns
|
300
|
-
* =======
|
301
|
-
* Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
|
302
|
-
* If the process receives an interrupt signal, will return instead
|
303
|
-
* 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
|
304
|
-
* what these values correspond to, you can use the functions
|
305
|
-
* 'return_EXIT_SUCESS' and 'return_EXIT_FAILURE', which will return them
|
306
|
-
* as integers.
|
307
|
-
*
|
308
|
-
* References
|
309
|
-
* ==========
|
310
|
-
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
311
|
-
* "Isolation forest."
|
312
|
-
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
313
|
-
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
314
|
-
* "Isolation-based anomaly detection."
|
315
|
-
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
316
|
-
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
317
|
-
* "Extended Isolation Forest."
|
318
|
-
* arXiv preprint arXiv:1811.02141 (2018).
|
319
|
-
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
320
|
-
* "On detecting clustered anomalies using SCiForest."
|
321
|
-
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
322
|
-
* [5] https://sourceforge.net/projects/iforest/
|
323
|
-
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
324
|
-
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
325
|
-
* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
|
326
|
-
*/
|
327
|
-
int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
328
|
-
double numeric_data[], size_t ncols_numeric,
|
329
|
-
int categ_data[], size_t ncols_categ, int ncat[],
|
330
|
-
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
331
|
-
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
332
|
-
double sample_weights[], bool with_replacement, bool weight_as_sample,
|
333
|
-
size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
|
334
|
-
bool limit_depth, bool penalize_range,
|
335
|
-
bool standardize_dist, double tmat[],
|
336
|
-
double output_depths[], bool standardize_depth,
|
337
|
-
double col_weights[], bool weigh_by_kurt,
|
338
|
-
double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
|
339
|
-
double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
|
340
|
-
double min_gain, MissingAction missing_action,
|
341
|
-
CategSplit cat_split_type, NewCategAction new_cat_action,
|
342
|
-
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
343
|
-
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
344
|
-
uint64_t random_seed, bool handle_interrupt, int nthreads)
|
345
|
-
{
|
346
|
-
/* calculate maximum number of categories to use later */
|
347
|
-
int max_categ = 0;
|
348
|
-
for (size_t col = 0; col < ncols_categ; col++)
|
349
|
-
max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
|
350
|
-
|
351
|
-
bool calc_dist = tmat != NULL;
|
352
|
-
|
353
|
-
if (calc_dist || sample_size == 0)
|
354
|
-
sample_size = nrows;
|
355
|
-
|
356
|
-
/* put data in structs to shorten function calls */
|
357
|
-
InputData input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
|
358
|
-
nrows, ncols_numeric + ncols_categ, sample_weights,
|
359
|
-
weight_as_sample, col_weights,
|
360
|
-
Xc, Xc_ind, Xc_indptr,
|
361
|
-
0, 0, std::vector<double>(),
|
362
|
-
std::vector<char>(), 0};
|
363
|
-
ModelParams model_params = {with_replacement, sample_size, ntrees,
|
364
|
-
limit_depth? log2ceil(sample_size) : max_depth? max_depth : (sample_size - 1),
|
365
|
-
penalize_range, random_seed, weigh_by_kurt,
|
366
|
-
prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
|
367
|
-
prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
|
368
|
-
min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
|
369
|
-
(model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
|
370
|
-
coef_type, coef_by_prop, calc_dist, (bool)(output_depths != NULL), impute_at_fit,
|
371
|
-
depth_imp, weigh_imp_rows, min_imp_obs};
|
372
|
-
|
373
|
-
/* if using weights as sampling probability, build a binary tree for faster sampling */
|
374
|
-
if (input_data.weight_as_sample && input_data.sample_weights != NULL)
|
375
|
-
{
|
376
|
-
build_btree_sampler(input_data.btree_weights_init, input_data.sample_weights,
|
377
|
-
input_data.nrows, input_data.log2_n, input_data.btree_offset);
|
378
|
-
}
|
379
|
-
|
380
|
-
/* if imputing missing values on-the-fly, need to determine which are missing */
|
381
|
-
std::vector<ImputedData> impute_vec;
|
382
|
-
std::unordered_map<size_t, ImputedData> impute_map;
|
383
|
-
if (model_params.impute_at_fit)
|
384
|
-
check_for_missing(input_data, impute_vec, impute_map, nthreads);
|
385
|
-
|
386
|
-
/* store model data */
|
387
|
-
if (model_outputs != NULL)
|
388
|
-
{
|
389
|
-
model_outputs->trees.resize(ntrees);
|
390
|
-
model_outputs->trees.shrink_to_fit();
|
391
|
-
model_outputs->new_cat_action = new_cat_action;
|
392
|
-
model_outputs->cat_split_type = cat_split_type;
|
393
|
-
model_outputs->missing_action = missing_action;
|
394
|
-
model_outputs->exp_avg_depth = expected_avg_depth(sample_size);
|
395
|
-
model_outputs->exp_avg_sep = expected_separation_depth(model_params.sample_size);
|
396
|
-
model_outputs->orig_sample_size = input_data.nrows;
|
397
|
-
}
|
398
|
-
|
399
|
-
else
|
400
|
-
{
|
401
|
-
model_outputs_ext->hplanes.resize(ntrees);
|
402
|
-
model_outputs_ext->hplanes.shrink_to_fit();
|
403
|
-
model_outputs_ext->new_cat_action = new_cat_action;
|
404
|
-
model_outputs_ext->cat_split_type = cat_split_type;
|
405
|
-
model_outputs_ext->missing_action = missing_action;
|
406
|
-
model_outputs_ext->exp_avg_depth = expected_avg_depth(sample_size);
|
407
|
-
model_outputs_ext->exp_avg_sep = expected_separation_depth(model_params.sample_size);
|
408
|
-
model_outputs_ext->orig_sample_size = input_data.nrows;
|
409
|
-
}
|
410
|
-
|
411
|
-
if (imputer != NULL)
|
412
|
-
initialize_imputer(*imputer, input_data, ntrees, nthreads);
|
413
|
-
|
414
|
-
/* initialize thread-private memory */
|
415
|
-
if ((size_t)nthreads > ntrees)
|
416
|
-
nthreads = (int)ntrees;
|
417
|
-
#ifdef _OPENMP
|
418
|
-
std::vector<WorkerMemory> worker_memory(nthreads);
|
419
|
-
#else
|
420
|
-
std::vector<WorkerMemory> worker_memory(1);
|
421
|
-
#endif
|
422
|
-
|
423
|
-
/* Global variable that determines if the procedure receives a stop signal */
|
424
|
-
interrupt_switch = false;
|
425
|
-
/* TODO: find a better way of handling interrupt signals when calling in Python/R.
|
426
|
-
The following will still change the behavior of interrupts when called through e.g. Flask */
|
427
|
-
#if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
|
428
|
-
struct sigaction sig_handle = {};
|
429
|
-
if (handle_interrupt)
|
430
|
-
{
|
431
|
-
sig_handle.sa_flags = SA_RESETHAND;
|
432
|
-
sig_handle.sa_handler = set_interrup_global_variable;
|
433
|
-
sigemptyset(&sig_handle.sa_mask);
|
434
|
-
}
|
435
|
-
#endif
|
436
|
-
|
437
|
-
/* grow trees */
|
438
|
-
#pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
|
439
|
-
for (size_t_for tree = 0; tree < ntrees; tree++)
|
440
|
-
{
|
441
|
-
if (interrupt_switch)
|
442
|
-
continue; /* Cannot break with OpenMP==2.0 (MSVC) */
|
443
|
-
|
444
|
-
if (
|
445
|
-
model_params.impute_at_fit &&
|
446
|
-
input_data.n_missing &&
|
447
|
-
!worker_memory[omp_get_thread_num()].impute_vec.size() &&
|
448
|
-
!worker_memory[omp_get_thread_num()].impute_map.size()
|
449
|
-
)
|
450
|
-
{
|
451
|
-
#ifdef _OPENMP
|
452
|
-
if (nthreads > 1)
|
453
|
-
{
|
454
|
-
worker_memory[omp_get_thread_num()].impute_vec = impute_vec;
|
455
|
-
worker_memory[omp_get_thread_num()].impute_map = impute_map;
|
456
|
-
}
|
457
|
-
|
458
|
-
else
|
459
|
-
#endif
|
460
|
-
{
|
461
|
-
worker_memory[0].impute_vec = std::move(impute_vec);
|
462
|
-
worker_memory[0].impute_map = std::move(impute_map);
|
463
|
-
}
|
464
|
-
}
|
465
|
-
|
466
|
-
fit_itree((model_outputs != NULL)? &model_outputs->trees[tree] : NULL,
|
467
|
-
(model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : NULL,
|
468
|
-
worker_memory[omp_get_thread_num()],
|
469
|
-
input_data,
|
470
|
-
model_params,
|
471
|
-
(imputer != NULL)? &(imputer->imputer_tree[tree]) : NULL,
|
472
|
-
tree);
|
473
|
-
|
474
|
-
if ((model_outputs != NULL))
|
475
|
-
model_outputs->trees[tree].shrink_to_fit();
|
476
|
-
else
|
477
|
-
model_outputs_ext->hplanes[tree].shrink_to_fit();
|
478
|
-
|
479
|
-
if (handle_interrupt)
|
480
|
-
{
|
481
|
-
#if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
|
482
|
-
sigaction(SIGINT, &sig_handle, NULL);
|
483
|
-
#else
|
484
|
-
signal(SIGINT, set_interrup_global_variable);
|
485
|
-
#endif
|
486
|
-
}
|
487
|
-
}
|
488
|
-
|
489
|
-
/* check if the procedure got interrupted */
|
490
|
-
if (interrupt_switch) return EXIT_FAILURE;
|
491
|
-
interrupt_switch = false;
|
492
|
-
|
493
|
-
if ((model_outputs != NULL))
|
494
|
-
model_outputs->trees.shrink_to_fit();
|
495
|
-
else
|
496
|
-
model_outputs_ext->hplanes.shrink_to_fit();
|
497
|
-
|
498
|
-
/* if calculating similarity/distance, now need to reduce and average */
|
499
|
-
if (calc_dist)
|
500
|
-
gather_sim_result(NULL, &worker_memory,
|
501
|
-
NULL, &input_data,
|
502
|
-
model_outputs, model_outputs_ext,
|
503
|
-
tmat, NULL, 0,
|
504
|
-
model_params.ntrees, false,
|
505
|
-
standardize_dist, nthreads);
|
506
|
-
|
507
|
-
/* same for depths */
|
508
|
-
if (output_depths != NULL)
|
509
|
-
{
|
510
|
-
#ifdef _OPENMP
|
511
|
-
if (nthreads > 1)
|
512
|
-
{
|
513
|
-
for (WorkerMemory &w : worker_memory)
|
514
|
-
{
|
515
|
-
if (w.row_depths.size())
|
516
|
-
{
|
517
|
-
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data, output_depths, w, worker_memory)
|
518
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
519
|
-
output_depths[row] += w.row_depths[row];
|
520
|
-
}
|
521
|
-
}
|
522
|
-
}
|
523
|
-
else
|
524
|
-
#endif
|
525
|
-
{
|
526
|
-
std::copy(worker_memory[0].row_depths.begin(), worker_memory[0].row_depths.end(), output_depths);
|
527
|
-
}
|
528
|
-
|
529
|
-
if (standardize_depth)
|
530
|
-
{
|
531
|
-
double depth_divisor = (double)ntrees * ((model_outputs != NULL)?
|
532
|
-
model_outputs->exp_avg_depth : model_outputs_ext->exp_avg_depth);
|
533
|
-
for (size_t_for row = 0; row < nrows; row++)
|
534
|
-
output_depths[row] = exp2( - output_depths[row] / depth_divisor );
|
535
|
-
}
|
536
|
-
|
537
|
-
else
|
538
|
-
{
|
539
|
-
double ntrees_dbl = (double) ntrees;
|
540
|
-
for (size_t_for row = 0; row < nrows; row++)
|
541
|
-
output_depths[row] /= ntrees_dbl;
|
542
|
-
}
|
543
|
-
}
|
544
|
-
|
545
|
-
/* if imputing missing values, now need to reduce and write final values */
|
546
|
-
if (model_params.impute_at_fit)
|
547
|
-
{
|
548
|
-
#ifdef _OPENMP
|
549
|
-
if (nthreads > 1)
|
550
|
-
{
|
551
|
-
for (WorkerMemory &w : worker_memory)
|
552
|
-
combine_tree_imputations(w, impute_vec, impute_map, input_data.has_missing, nthreads);
|
553
|
-
}
|
554
|
-
|
555
|
-
else
|
556
|
-
#endif
|
557
|
-
{
|
558
|
-
impute_vec = std::move(worker_memory[0].impute_vec);
|
559
|
-
impute_map = std::move(worker_memory[0].impute_map);
|
560
|
-
}
|
561
|
-
|
562
|
-
apply_imputation_results(impute_vec, impute_map, *imputer, input_data, nthreads);
|
563
|
-
}
|
564
|
-
|
565
|
-
return EXIT_SUCCESS;
|
566
|
-
}
|
567
|
-
|
568
|
-
|
569
|
-
/* Add additional trees to already-fitted isolation forest model
|
570
|
-
*
|
571
|
-
* Parameters
|
572
|
-
* ==========
|
573
|
-
* - model_outputs
|
574
|
-
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
575
|
-
* if the trees are are to be added to an extended model. Can only pass one of
|
576
|
-
* 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
|
577
|
-
* so it cannot be run in parallel for the same model object.
|
578
|
-
* - model_outputs_ext
|
579
|
-
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
580
|
-
* if the trees are are to be added to an single-variable model. Can only pass one of
|
581
|
-
* 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
|
582
|
-
* so it cannot be run in parallel for the same model object.
|
583
|
-
* - numeric_data
|
584
|
-
* Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
|
585
|
-
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
586
|
-
* Pass NULL if there are no dense numeric columns.
|
587
|
-
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
588
|
-
* If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
|
589
|
-
* of columns, either as dense or as sparse arrays.
|
590
|
-
* - ncols_numeric
|
591
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
592
|
-
* what was originally passed to 'fit_iforest'.
|
593
|
-
* - categ_data
|
594
|
-
* Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
|
595
|
-
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
596
|
-
* Pass NULL if there are no categorical columns. The encoding must be the same as was used
|
597
|
-
* in the data to which the model was fit.
|
598
|
-
* Each category should be represented as an integer, and these integers must start at zero and
|
599
|
-
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
600
|
-
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
601
|
-
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
602
|
-
* must be the same as was used in the data to which the model was fit.
|
603
|
-
* If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
|
604
|
-
* of columns and the same category encoding.
|
605
|
-
* - ncols_categ
|
606
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
607
|
-
* what was originally passed to 'fit_iforest'.
|
608
|
-
* - ncat
|
609
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
610
|
-
* what was originally passed to 'fit_iforest'.
|
611
|
-
* - Xc[nnz]
|
612
|
-
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
613
|
-
* Pass NULL if there are no sparse numeric columns.
|
614
|
-
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
615
|
-
* - Xc_ind[nnz]
|
616
|
-
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
617
|
-
* Pass NULL if there are no sparse numeric columns.
|
618
|
-
* - Xc_indptr[ncols_numeric + 1]
|
619
|
-
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
620
|
-
* start and at entry [col + 1] where does column 'col' end.
|
621
|
-
* Pass NULL if there are no sparse numeric columns.
|
622
|
-
* - ndim
|
623
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
624
|
-
* what was originally passed to 'fit_iforest'.
|
625
|
-
* - ntry
|
626
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
627
|
-
* what was originally passed to 'fit_iforest'.
|
628
|
-
* - coef_type
|
629
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
630
|
-
* what was originally passed to 'fit_iforest'.
|
631
|
-
* - sample_weights
|
632
|
-
* Weights for the rows when adding this tree, either as sampling importances when using
|
633
|
-
* sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
|
634
|
-
* in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
|
635
|
-
* the row appeared twice, thus it's less of an outlier) - how this is taken is determined
|
636
|
-
* through parameter 'weight_as_sample' that was passed to 'fit_iforest.
|
637
|
-
* Pass NULL if the rows all have uniform weights.
|
638
|
-
* - nrows
|
639
|
-
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
640
|
-
* - max_depth
|
641
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
642
|
-
* what was originally passed to 'fit_iforest'.
|
643
|
-
* - limit_depth
|
644
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
645
|
-
* what was originally passed to 'fit_iforest'.
|
646
|
-
* - penalize_range
|
647
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
648
|
-
* what was originally passed to 'fit_iforest'.
|
649
|
-
* - col_weights
|
650
|
-
* Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
|
651
|
-
* Ignored when picking columns by deterministic criterion.
|
652
|
-
* If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
|
653
|
-
* - weigh_by_kurt
|
654
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
655
|
-
* what was originally passed to 'fit_iforest'.
|
656
|
-
* - prob_pick_by_gain_avg
|
657
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
658
|
-
* what was originally passed to 'fit_iforest'.
|
659
|
-
* - prob_split_by_gain_avg
|
660
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
661
|
-
* what was originally passed to 'fit_iforest'.
|
662
|
-
* - prob_pick_by_gain_pl
|
663
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
664
|
-
* what was originally passed to 'fit_iforest'.
|
665
|
-
* - prob_split_by_gain_pl
|
666
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
667
|
-
* what was originally passed to 'fit_iforest'.
|
668
|
-
* - min_gain
|
669
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
670
|
-
* what was originally passed to 'fit_iforest'.
|
671
|
-
* - missing_action
|
672
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
673
|
-
* what was originally passed to 'fit_iforest'.
|
674
|
-
* - cat_split_type
|
675
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
676
|
-
* what was originally passed to 'fit_iforest'.
|
677
|
-
* - new_cat_action
|
678
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
679
|
-
* what was originally passed to 'fit_iforest'.
|
680
|
-
* - depth_imp
|
681
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
682
|
-
* what was originally passed to 'fit_iforest'.
|
683
|
-
* - weigh_imp_rows
|
684
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
685
|
-
* what was originally passed to 'fit_iforest'.
|
686
|
-
* - all_perm
|
687
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
688
|
-
* what was originally passed to 'fit_iforest'.
|
689
|
-
* - coef_by_prop
|
690
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
691
|
-
* what was originally passed to 'fit_iforest'.
|
692
|
-
* - impute_nodes
|
693
|
-
* Pointer to already-allocated imputation nodes for the tree that will be built. Note that the number of
|
694
|
-
* entries in the imputation object must match the number of fitted trees when it is used. Pass
|
695
|
-
* NULL if no imputation node is required.
|
696
|
-
* - min_imp_obs
|
697
|
-
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
698
|
-
* what was originally passed to 'fit_iforest'.
|
699
|
-
* - random_seed
|
700
|
-
* Seed that will be used to generate random numbers used by the model.
|
701
|
-
*/
int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
             double numeric_data[], size_t ncols_numeric,
             int categ_data[], size_t ncols_categ, int ncat[],
             double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
             size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
             double sample_weights[], size_t nrows, size_t max_depth,
             bool limit_depth, bool penalize_range,
             double col_weights[], bool weigh_by_kurt,
             double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
             double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
             double min_gain, MissingAction missing_action,
             CategSplit cat_split_type, NewCategAction new_cat_action,
             UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
             bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
             uint64_t random_seed)
{
    int max_categ = 0;
    for (size_t col = 0; col < ncols_categ; col++)
        max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;

    InputData input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
                            nrows, ncols_numeric + ncols_categ, sample_weights,
                            false, col_weights,
                            Xc, Xc_ind, Xc_indptr,
                            0, 0, std::vector<double>(),
                            std::vector<char>(), 0};
    ModelParams model_params = {false, nrows, (size_t)1,
                                max_depth? max_depth : (nrows - 1),
                                penalize_range, random_seed, weigh_by_kurt,
                                prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
                                prob_pick_by_gain_pl,  (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
                                min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
                                (model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
                                coef_type, coef_by_prop, false, false, false, depth_imp, weigh_imp_rows, min_imp_obs};
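    /* note on the initializers above: the split-by-gain probabilities are zeroed when
       fitting the extended model (model_outputs == NULL), while 'ndim' and 'ntry' are
       zeroed for the single-variable model, reflecting which of the two models uses
       each parameter */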

    std::unique_ptr<WorkerMemory> workspace = std::unique_ptr<WorkerMemory>(new WorkerMemory);

    size_t last_tree;
    if (model_outputs != NULL)
    {
        last_tree = model_outputs->trees.size();
        model_outputs->trees.emplace_back();
    }

    else
    {
        last_tree = model_outputs_ext->hplanes.size();
        model_outputs_ext->hplanes.emplace_back();
    }

    fit_itree((model_outputs != NULL)? &model_outputs->trees.back() : NULL,
              (model_outputs_ext != NULL)? &model_outputs_ext->hplanes.back() : NULL,
              *workspace,
              input_data,
              model_params,
              impute_nodes,
              last_tree);

    if (model_outputs != NULL)
        model_outputs->trees.back().shrink_to_fit();
    else
        model_outputs_ext->hplanes.back().shrink_to_fit();

    return EXIT_SUCCESS;
}
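
/* Illustrative call shape (a sketch, not code from the library): exactly one of
   'model_outputs' (single-variable model) or 'model_outputs_ext' (extended model)
   should be non-NULL, and only the parameters marked "Can be changed" above may
   differ from the original fit. All names besides 'add_tree' are placeholders the
   caller would supply; this example assumes dense numeric data with no categorical
   or sparse inputs and no imputation nodes:

       // IsoForest model;   <- previously fitted through 'fit_iforest'
       // add_tree(&model, NULL,
       //          numeric_data, ncols_numeric,
       //          NULL, 0, NULL,                  <- no categorical columns
       //          NULL, NULL, NULL,               <- no CSC sparse inputs
       //          ndim, ntry, coef_type, coef_by_prop,
       //          NULL, nrows, max_depth,
       //          limit_depth, penalize_range,
       //          NULL, weigh_by_kurt,
       //          prob_pick_by_gain_avg, prob_split_by_gain_avg,
       //          prob_pick_by_gain_pl, prob_split_by_gain_pl,
       //          min_gain, missing_action,
       //          cat_split_type, new_cat_action,
       //          depth_imp, weigh_imp_rows,
       //          all_perm, NULL, min_imp_obs,
       //          random_seed);
*/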

void fit_itree(std::vector<IsoTree> *tree_root,
               std::vector<IsoHPlane> *hplane_root,
               WorkerMemory &workspace,
               InputData &input_data,
               ModelParams &model_params,
               std::vector<ImputeNode> *impute_nodes,
               size_t tree_num)
{
    /* initialize array for depths if called for */
    if (!workspace.ix_arr.size() && model_params.calc_depth)
        workspace.row_depths.resize(input_data.nrows, 0);

    /* choose random sample of rows */
    if (!workspace.ix_arr.size()) workspace.ix_arr.resize(model_params.sample_size);
    if (input_data.log2_n > 0)
        workspace.btree_weights.assign(input_data.btree_weights_init.begin(),
                                       input_data.btree_weights_init.end());
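    /* each tree gets its own deterministic random stream: the generator is re-seeded
       with 'random_seed' plus the tree number, so repeated fits (and trees added later
       through 'add_tree') are reproducible */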
    workspace.rnd_generator.seed(model_params.random_seed + tree_num);
    if (input_data.col_weights != NULL)
        workspace.col_sampler = std::discrete_distribution<size_t>(input_data.col_weights,
                                                                   input_data.col_weights + input_data.ncols_numeric + input_data.ncols_categ);
    workspace.runif = std::uniform_int_distribution<size_t>(0, input_data.ncols_tot - 1);
    workspace.rbin  = std::uniform_real_distribution<double>(0, 1);
    sample_random_rows(workspace.ix_arr, input_data.nrows, model_params.with_replacement,
                       workspace.rnd_generator, workspace.ix_all,
                       (input_data.weight_as_sample)? input_data.sample_weights : NULL,
                       workspace.btree_weights, input_data.log2_n, input_data.btree_offset,
                       workspace.is_repeated);
    workspace.st  = 0;
    workspace.end = model_params.sample_size - 1;
    if (!workspace.cols_possible.size())
        workspace.cols_possible.resize(input_data.ncols_tot, true);
    else
        workspace.cols_possible.assign(workspace.cols_possible.size(), true);

    /* set expected tree size and add root node */
    {
        size_t exp_nodes = 2 * model_params.sample_size;
        if (model_params.sample_size >= (SIZE_MAX / (size_t)2))
            exp_nodes = SIZE_MAX;
        if (model_params.max_depth <= (size_t)30)
            exp_nodes = std::min(exp_nodes, pow2(model_params.max_depth));
        if (tree_root != NULL)
        {
            tree_root->reserve(exp_nodes);
            tree_root->emplace_back();
        }
        else
        {
            hplane_root->reserve(exp_nodes);
            hplane_root->emplace_back();
        }
        if (impute_nodes != NULL)
        {
            impute_nodes->reserve(exp_nodes);
            impute_nodes->emplace_back((size_t) 0);
        }
    }
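    /* note on the reservation above: a fully-grown isolation tree on 'sample_size' points
       has at most 2 * sample_size - 1 nodes, and with a small depth limit the count is on
       the order of 2^max_depth; since the vectors are only reserve()'d, a misestimate
       costs at most a reallocation later on */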

    /* initialize array with candidate categories if not already done */
    if (!workspace.categs.size())
        workspace.categs.resize(input_data.max_categ);

    /* for the extended model, initialize extra vectors and objects */
    if (hplane_root != NULL && !workspace.comb_val.size())
    {
        workspace.coef_norm = std::normal_distribution<double>(0, 1);
        if (model_params.coef_type == Uniform)
            workspace.coef_unif = std::uniform_real_distribution<double>(-1, 1);

        workspace.cols_shuffled.resize(input_data.ncols_tot);
        workspace.comb_val.resize(model_params.sample_size);
        workspace.col_take.resize(model_params.ndim);
        workspace.col_take_type.resize(model_params.ndim);

        if (input_data.ncols_numeric)
        {
            workspace.ext_offset.resize(input_data.ncols_tot);
            workspace.ext_coef.resize(input_data.ncols_tot);
            workspace.ext_mean.resize(input_data.ncols_tot);
        }

        if (input_data.ncols_categ)
        {
            workspace.ext_fill_new.resize(input_data.max_categ);
            switch (model_params.cat_split_type)
            {
                case SingleCateg:
                {
                    workspace.chosen_cat.resize(input_data.max_categ);
                    break;
                }

                case SubSet:
                {
                    workspace.ext_cat_coef.resize(input_data.ncols_tot);
                    for (std::vector<double> &v : workspace.ext_cat_coef)
                        v.resize(input_data.max_categ);
                    break;
                }
            }
        }

        workspace.ext_fill_val.resize(input_data.ncols_tot);
    }

    /* If the data contains missing values, an array of weights also has to be set up;
       it will be modified during the iterations whenever NAs are encountered.
       If density weights were already supplied, they have to be standardized here
       so that they sum up to the sample size. */
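    /* e.g. with density weights {1, 3, 4} over a sub-sample of size 3, the scaling
       factor is 3 / 8 = 0.375, giving standardized weights {0.375, 1.125, 1.5},
       which sum up to 3 */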
    long double weight_scaling = 0;
    if (model_params.missing_action == Divide || (input_data.sample_weights != NULL && !input_data.weight_as_sample))
    {
        workspace.weights_map.clear();

        /* if the sub-sample size is small relative to the full sample size, use a mapping */
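        /* (the mapping only stores one entry per sampled row, whereas the dense-array
           branch further below allocates one entry per row of the full data) */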
        if (model_params.sample_size < input_data.nrows / 4)
        {
            if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
            {
                for (const size_t ix : workspace.ix_arr)
                {
                    weight_scaling += input_data.sample_weights[ix];
                    workspace.weights_map[ix] = input_data.sample_weights[ix];
                }
                weight_scaling = (long double)model_params.sample_size / weight_scaling;
                for (auto &w : workspace.weights_map)
                    w.second *= weight_scaling;
            }

            else
            {
                for (const size_t ix : workspace.ix_arr)
                    workspace.weights_map[ix] = 1;
            }
        }

        /* if the sub-sample size is large, fill a full array matching to the sample size */
        else
        {
            if (!workspace.weights_arr.size())
            {
                if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
                {
                    workspace.weights_arr.assign(input_data.sample_weights, input_data.sample_weights + input_data.nrows);
                    weight_scaling = std::accumulate(workspace.ix_arr.begin(),
                                                     workspace.ix_arr.end(),
                                                     (long double)0,
                                                     [&input_data](const long double a, const size_t b){return a + (long double)input_data.sample_weights[b];}
                                                     );
                    weight_scaling = (long double)model_params.sample_size / weight_scaling;
                    for (double &w : workspace.weights_arr)
                        w *= weight_scaling;
                }

                else
                {
                    workspace.weights_arr.resize(input_data.nrows, (double)1);
                }
            }

            else
            {
                if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
                {
                    for (const size_t ix : workspace.ix_arr)
                    {
                        weight_scaling += input_data.sample_weights[ix];
                        workspace.weights_arr[ix] = input_data.sample_weights[ix];
                    }
                    weight_scaling = (long double)model_params.sample_size / weight_scaling;
                    for (double &w : workspace.weights_arr)
                        w *= weight_scaling;
                }

                else
                {
                    /* Note: while not all of them need to be overwritten, this is faster
                       (the sub-sample size was already determined to be at least 1/4 of the full number of rows) */
                    std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), (double)1);
                }
            }
        }
    }

    /* if producing distance/similarity, also need to initialize the triangular matrix */
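    /* (one entry per unordered pair of rows: nrows * (nrows - 1) / 2) */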
    if (model_params.calc_dist && !workspace.tmat_sep.size())
        workspace.tmat_sep.resize((input_data.nrows * (input_data.nrows - 1)) / 2, 0);

    /* make space for buffers if not already allocated */
    if (
        (model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
         model_params.prob_split_by_gain_pl  || model_params.prob_pick_by_gain_pl  ||
         model_params.weigh_by_kurt || hplane_root != NULL)
            &&
        (!workspace.buffer_dbl.size() && !workspace.buffer_szt.size() && !workspace.buffer_chr.size())
        )
    {
        size_t min_size_dbl = 0;
        size_t min_size_szt = 0;
        size_t min_size_chr = 0;

        bool gain = model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
                    model_params.prob_split_by_gain_pl  || model_params.prob_pick_by_gain_pl;

        if (input_data.ncols_categ)
        {
            min_size_szt = 2 * input_data.max_categ;
            min_size_dbl = input_data.max_categ + 1;
            if (gain && model_params.cat_split_type == SubSet)
                min_size_chr = input_data.max_categ;
        }

        if (input_data.Xc_indptr != NULL && gain)
        {
            min_size_szt = std::max(min_size_szt, model_params.sample_size);
            min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
        }

        /* for the extended model */
        if (hplane_root != NULL)
        {
            min_size_dbl = std::max(min_size_dbl, pow2(log2ceil(input_data.ncols_tot) + 1));
            if (model_params.missing_action != Fail)
            {
                min_size_szt = std::max(min_size_szt, model_params.sample_size);
                min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
            }

            if (input_data.ncols_categ && model_params.cat_split_type == SubSet)
            {
                min_size_szt = std::max(min_size_szt, 2 * (size_t)input_data.max_categ + 1);
                min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
            }

            if (model_params.weigh_by_kurt)
                min_size_szt = std::max(min_size_szt, input_data.ncols_tot);
        }

        /* now resize */
        if (workspace.buffer_dbl.size() < min_size_dbl)
            workspace.buffer_dbl.resize(min_size_dbl);

        if (workspace.buffer_szt.size() < min_size_szt)
            workspace.buffer_szt.resize(min_size_szt);

        if (workspace.buffer_chr.size() < min_size_chr)
            workspace.buffer_chr.resize(min_size_chr);

        /* for guided column choice, need to also remember the best split so far */
        if (
            model_params.cat_split_type == SubSet &&
            (
                model_params.prob_pick_by_gain_avg ||
                model_params.prob_pick_by_gain_pl
            )
            )
        {
            workspace.this_split_categ.resize(input_data.max_categ);
        }
    }

    /* weigh columns by kurtosis in the sample if required */
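    /* note: when this is enabled, the column sampler that may have been set up above from
       'col_weights' is rebuilt below from the per-column kurtosis estimates, and columns
       with non-positive or NA/infinite kurtosis are excluded from splitting */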
    if (model_params.weigh_by_kurt)
    {
        std::vector<double> kurt_weights(input_data.ncols_numeric + input_data.ncols_categ);

        if (input_data.Xc_indptr == NULL)
        {
            for (size_t col = 0; col < input_data.ncols_numeric; col++)
                kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                  input_data.numeric_data + col * input_data.nrows,
                                                  model_params.missing_action);
        }

        else
        {
            std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
            for (size_t col = 0; col < input_data.ncols_numeric; col++)
                kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end, col,
                                                  input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
                                                  model_params.missing_action);
        }

        for (size_t col = 0; col < input_data.ncols_categ; col++)
            kurt_weights[col + input_data.ncols_numeric] =
                calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
                              input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
                              workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
                              model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);

        for (size_t col = 0; col < input_data.ncols_tot; col++)
            if (kurt_weights[col] <= 0 || is_na_or_inf(kurt_weights[col]))
                workspace.cols_possible[col] = false;

        workspace.col_sampler = std::discrete_distribution<size_t>(kurt_weights.begin(), kurt_weights.end());
    }

    if (tree_root != NULL)
        split_itree_recursive(*tree_root,
                              workspace,
                              input_data,
                              model_params,
                              impute_nodes,
                              0);
    else
        split_hplane_recursive(*hplane_root,
                               workspace,
                               input_data,
                               model_params,
                               impute_nodes,
                               0);

    /* if producing imputation structs, only need to keep the ones for terminal nodes */
    if (impute_nodes != NULL)
        drop_nonterminal_imp_node(*impute_nodes, tree_root, hplane_root);
}
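
/* Note: 'fit_itree' assumes its output slot ('tree_root' or 'hplane_root') has already been
   emplaced by the caller; 'add_tree' above appends an empty tree/hyperplane vector before
   calling it and shrink_to_fit()s the result afterwards. */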