isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
|
@@ -0,0 +1,2401 @@
|
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
|
2
|
+
* of categorical variables and missing values.
|
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
|
4
|
+
*
|
|
5
|
+
* This library is based on the following works:
|
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
7
|
+
* "Isolation forest."
|
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
10
|
+
* "Isolation-based anomaly detection."
|
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
|
13
|
+
* "Extended Isolation Forest."
|
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
|
+
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
41
|
+
*
|
|
42
|
+
* BSD 2-Clause License
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
44
|
+
* All rights reserved.
|
|
45
|
+
* Redistribution and use in source and binary forms, with or without
|
|
46
|
+
* modification, are permitted provided that the following conditions are met:
|
|
47
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
|
48
|
+
* list of conditions and the following disclaimer.
|
|
49
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
50
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
51
|
+
* and/or other materials provided with the distribution.
|
|
52
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
53
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
54
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
55
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
56
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
57
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
58
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
59
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
60
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
61
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
62
|
+
*/
|
|
63
|
+
#include "isotree.hpp"
|
|
64
|
+
|
|
65
|
+
/* Fit Isolation Forest model, or variant of it such as SCiForest
|
|
66
|
+
*
|
|
67
|
+
* Parameters:
|
|
68
|
+
* ===========
|
|
69
|
+
* - model_outputs (out)
|
|
70
|
+
* Pointer to already allocated isolation forest model object for single-variable splits.
|
|
71
|
+
* If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
|
|
72
|
+
* additional trees through function 'add_tree'.
|
|
73
|
+
* - model_outputs_ext (out)
|
|
74
|
+
* Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
|
|
75
|
+
* Note that if 'ndim' = 1, must use instead the single-variable model object.
|
|
76
|
+
* If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
|
|
77
|
+
* additional trees through function 'add_tree'.
|
|
78
|
+
* - numeric_data[nrows * ncols_numeric]
|
|
79
|
+
* Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
|
|
80
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
|
81
|
+
* Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
|
|
82
|
+
* no sparse numeric data either).
|
|
83
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
84
|
+
* - ncols_numeric
|
|
85
|
+
* Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
|
|
86
|
+
* - categ_data[nrows * ncols_categ]
|
|
87
|
+
* Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
|
|
88
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
|
89
|
+
* Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
|
|
90
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
91
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must also be present
|
|
92
|
+
* (note that they are not treated as being ordinal, this is just an encoding). Missing values
|
|
93
|
+
* should be encoded as negative numbers such as (-1).
|
|
94
|
+
* - ncols_categ
|
|
95
|
+
* Number of categorical columns in the data.
|
|
96
|
+
* - ncat[ncols_categ]
|
|
97
|
+
* Number of categories in each categorical column. E.g. if the highest code for a column is '4',
|
|
98
|
+
* the number of categories for that column is '5' (zero is one category).
|
|
99
|
+
* - Xc[nnz]
|
|
100
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
|
101
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
102
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
103
|
+
* - Xc_ind[nnz]
|
|
104
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
|
105
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
106
|
+
* The largest value here should be smaller than the largest possible value of 'size_t'.
|
|
107
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
108
|
+
* - Xc_indptr[ncols_numeric + 1]
|
|
109
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
110
|
+
* start and at entry [col + 1] where does column 'col' end.
|
|
111
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
112
|
+
* - ndim
|
|
113
|
+
* How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
|
|
114
|
+
* the single-variable model. Note that the model object pointer passed must also
|
|
115
|
+
* agree with the value passed to 'ndim'.
|
|
116
|
+
* - ntry
|
|
117
|
+
* When using any of 'prob_pick_by_gain_pl', 'prob_pick_by_gain_avg', 'prob_pick_by_full_gain', 'prob_pick_by_dens', how many variables (with 'ndim=1')
|
|
118
|
+
* or linear combinations (with 'ndim>1') to try for determining the best one according to gain.
|
|
119
|
+
* Recommended value in reference [4] is 10 (with 'prob_pick_by_gain_avg', for outlier detection), while the
|
|
120
|
+
* recommended value in reference [11] is 1 (with 'prob_pick_by_gain_pl', for outlier detection), and the
|
|
121
|
+
* recommended value in reference [9] is 10 to 20 (with 'prob_pick_by_gain_pl', for missing value imputations).
|
|
122
|
+
* - coef_type
|
|
123
|
+
* For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
|
|
124
|
+
* (as proposed in [4]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [3]. Ignored for the
|
|
125
|
+
* single-variable model.
|
|
126
|
+
* - sample_weights[nrows]
|
|
127
|
+
* Weights for the rows when building a tree, either as sampling importances when using
|
|
128
|
+
* sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
|
|
129
|
+
* in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
|
|
130
|
+
* the row appeared twice, thus it's less of an outlier) - how this is taken is determined
|
|
131
|
+
* through parameter 'weight_as_sample'.
|
|
132
|
+
* Pass NULL if the rows all have uniform weights.
|
|
133
|
+
* - with_replacement
|
|
134
|
+
* Whether to sample rows with replacement or not (not recommended). Note that distance calculations,
|
|
135
|
+
* if desired, don't work well with duplicate rows.
|
|
136
|
+
* - weight_as_sample
|
|
137
|
+
* If passing sample (row) weights when fitting the model, whether to consider those weights as row
|
|
138
|
+
* sampling weights (i.e. the higher the weights, the more likely the observation will end up included
|
|
139
|
+
* in each tree sub-sample), or as distribution density weights (i.e. putting a weight of two is the same
|
|
140
|
+
* as if the row appeared twice, thus higher weight makes it less of an outlier, but does not give it a
|
|
141
|
+
* higher chance of being sampled if the data uses sub-sampling).
|
|
142
|
+
* - nrows
|
|
143
|
+
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
|
144
|
+
* - sample_size
|
|
145
|
+
* Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
|
|
146
|
+
* 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
|
|
147
|
+
* random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
|
|
148
|
+
* will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
|
|
149
|
+
* in [5] is 'nrows' here.
|
|
150
|
+
* - ntrees
|
|
151
|
+
* Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
|
|
152
|
+
* author's code in [5] is 10.
|
|
153
|
+
* - max_depth
|
|
154
|
+
* Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
|
|
155
|
+
* Models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
|
|
156
|
+
* deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
|
|
157
|
+
* predictions.
|
|
158
|
+
* Note that models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
|
|
159
|
+
* deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
|
|
160
|
+
* predictions.
|
|
161
|
+
* If using pooled gain, one might want to substitute 'max_depth' with 'min_gain'.
|
|
162
|
+
* - ncols_per_tree
|
|
163
|
+
* Number of columns to use (have as potential candidates for splitting at each iteration) in each tree,
|
|
164
|
+
* similar to the 'mtry' parameter of random forests.
|
|
165
|
+
* In general, this is only relevant when using non-random splits and/or weighted column choices.
|
|
166
|
+
* If passing zero, will use the full number of available columns.
|
|
167
|
+
* Recommended value: 0.
|
|
168
|
+
* - limit_depth
|
|
169
|
+
* Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
|
|
170
|
+
* terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
|
|
171
|
+
* will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
|
|
172
|
+
* tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass 'false' here
|
|
173
|
+
* and higher values for 'max_depth' if using the model for purposes other than outlier detection.
|
|
174
|
+
* Note that, if passing 'limit_depth=true', then 'max_depth' is ignored.
|
|
175
|
+
* - penalize_range
|
|
176
|
+
* Whether to penalize (add -1 to the terminal depth) observations at prediction time that have a value
|
|
177
|
+
* of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
|
|
178
|
+
* reasonable range in the data being split (given by 2 * range in data and centered around the split point),
|
|
179
|
+
* as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
|
|
180
|
+
* when splitting by categorical variables. Note that this can make a very large difference in the results
|
|
181
|
+
* when using 'prob_pick_by_gain_pl'.
|
|
182
|
+
* This option is not supported when using density-based outlier scoring metrics.
|
|
183
|
+
* - standardize_data
|
|
184
|
+
* Whether to standardize the features at each node before creating a linear combination of them as suggested
|
|
185
|
+
* in [4]. This is ignored when using 'ndim=1'.
|
|
186
|
+
* - scoring_metric
|
|
187
|
+
* Metric to use for determining outlier scores (see reference [13]).
|
|
188
|
+
* If passing 'Depth', will use isolation depth as proposed in reference [1]. This is typically the safest choice
|
|
189
|
+
* and plays well with all model types offered by this library.
|
|
190
|
+
* If passing 'Density', will set scores for each terminal node as the ratio between the fraction of points in the sub-sample
|
|
191
|
+
* that end up in that node and the fraction of the volume in the feature space which defines
|
|
192
|
+
* the node according to the splits that lead to it.
|
|
193
|
+
* If using 'ndim=1', for categorical variables, 'Density' is defined in terms
|
|
194
|
+
* of number of categories that go towards each side of the split divided by number of categories
|
|
195
|
+
* in the observations that reached that node.
|
|
196
|
+
* The standardized outlier score from 'Density' for a given observation is calculated as the
|
|
197
|
+
* negative of the logarithm of the geometric mean from the per-tree densities, which unlike
|
|
198
|
+
* the standardized score produced from 'Depth', is unbounded, but just like the standardized
|
|
199
|
+
* score from 'Depth', has a natural threshold for defining outlierness, which in this case
|
|
200
|
+
* is zero instead of 0.5. The non-standardized outlier score for 'Density' is calculated as the
|
|
201
|
+
* geometric mean, while the per-tree scores are calculated as the density values.
|
|
202
|
+
* 'Density' might lead to better predictions when using 'ndim=1', particularly in the presence
|
|
203
|
+
* of categorical variables. Note however that using 'Density' requires more trees for convergence
|
|
204
|
+
* of scores (i.e. good results) compared to isolation-based metrics.
|
|
205
|
+
* 'Density' is incompatible with 'penalize_range=true'.
|
|
206
|
+
* If passing 'AdjDepth', will use an adjusted isolation depth that takes into account the number of points that
|
|
207
|
+
* go to each side of a given split vs. the fraction of the range of that feature that each
|
|
208
|
+
* side of the split occupies, by a metric as follows: 'd = 2/ (1 + 1/(2*p))'
|
|
209
|
+
* where 'p' is defined as 'p = (n_s / n_t) / (r_s / r_t)'
|
|
210
|
+
* with 'n_t' being the number of points that reach a given node, 'n_s' the
|
|
211
|
+
* number of points that are sent to a given side of the split/branch at that node,
|
|
212
|
+
* 'r_t' being the range (maximum minus minimum) of the splitting feature or
|
|
213
|
+
* linear combination among the points that reached the node, and 'r_s' being the
|
|
214
|
+
* range of the same feature or linear combination among the points that are sent to this
|
|
215
|
+
* same side of the split/branch. This makes each split add a number between zero and two
|
|
216
|
+
* to the isolation depth, with this number's probabilistic distribution being centered
|
|
217
|
+
* around 1 and thus the expected isolation depth remaining the same as in the original
|
|
218
|
+
* 'Depth' metric, but having more variability around the extremes.
|
|
219
|
+
* Scores (standardized, non-standardized, per-tree) for 'AdjDepth' are aggregated in the same way
|
|
220
|
+
* as for 'Depth'.
|
|
221
|
+
* 'AdjDepth' might lead to better predictions when using 'ndim=1', particularly in the presence
|
|
222
|
+
* of categorical variables and for smaller datasets; for smaller datasets, it might also make
|
|
223
|
+
* sense to combine it with 'penalize_range=true'.
|
|
224
|
+
* If passing 'AdjDensity', will use the same metric from 'AdjDepth', but applied multiplicatively instead
|
|
225
|
+
* of additively. The expected value for 'AdjDensity' is not strictly the same
|
|
226
|
+
* as for isolation, but using the expected isolation depth as standardizing criterion
|
|
227
|
+
* tends to produce similar standardized score distributions (centered around 0.5).
|
|
228
|
+
* Scores (standardized, non-standardized, per-tree) from 'AdjDensity' are aggregated in the same way
|
|
229
|
+
* as for 'Depth'.
|
|
230
|
+
* 'AdjDensity' is incompatible with 'penalize_range=true'.
|
|
231
|
+
* If passing 'BoxedRatio', will set the scores for each terminal node as the ratio between the volume of the boxed
|
|
232
|
+
* feature space for the node as defined by the smallest and largest values from the split
|
|
233
|
+
* conditions for each column (bounded by the variable ranges in the sample) and the
|
|
234
|
+
* variable ranges in the tree sample.
|
|
235
|
+
* If using 'ndim=1', for categorical variables 'BoxedRatio' is defined in terms of number of categories.
|
|
236
|
+
* If using 'ndim>1', 'BoxedRatio' is defined in terms of the maximum achievable value for the
|
|
237
|
+
* splitting linear combination determined from the minimum and maximum values for each
|
|
238
|
+
* variable among the points in the sample, and as such, it has a rather different meaning
|
|
239
|
+
* compared to the score obtained with 'ndim=1' - 'BoxedRatio' scores with 'ndim>1'
|
|
240
|
+
* typically provide very poor quality results and this metric is thus not recommended to
|
|
241
|
+
* use in the extended model. With 'ndim>1', 'BoxedRatio' also has a tendency of producing too small
|
|
242
|
+
* values which round to zero.
|
|
243
|
+
* The standardized outlier score from 'BoxedRatio' for a given observation is calculated
|
|
244
|
+
* simply as the average from the per-tree boxed ratios. 'BoxedRatio' metric
|
|
245
|
+
* has a lower bound of zero and a theoretical upper bound of one, but in practice the scores
|
|
246
|
+
* tend to be very small numbers close to zero, and its distribution across
|
|
247
|
+
* different datasets is rather unpredictable. In order to keep rankings comparable with
|
|
248
|
+
* the rest of the metrics, the non-standardized outlier scores for 'BoxedRatio' are calculated as the
|
|
249
|
+
* negative of the average instead. The per-tree 'BoxedRatio' scores are calculated as the ratios.
|
|
250
|
+
* 'BoxedRatio' can be calculated in a fast-but-not-so-precise way, and in a slow-but-precise
|
|
251
|
+
* way, which is controlled by parameter 'fast_bratio'. Usually, both should give the
|
|
252
|
+
* same results, but in some datasets, the fast way can lead to numerical inaccuracies
|
|
253
|
+
* due to roundoffs very close to zero.
|
|
254
|
+
* 'BoxedRatio' might lead to better predictions in datasets with many rows when using 'ndim=1'
|
|
255
|
+
* and a relatively small 'sample_size'. Note that more trees are required for convergence
|
|
256
|
+
* of scores when using 'BoxedRatio'. In some datasets, 'BoxedRatio' metric might result in very bad
|
|
257
|
+
* predictions, to the point that taking its inverse produces a much better ranking of outliers.
|
|
258
|
+
* 'BoxedRatio' option is incompatible with 'penalize_range'.
|
|
259
|
+
* If passing 'BoxedDensity2', will set the score as the ratio between the fraction of points within the sample that
|
|
260
|
+
* end up in a given terminal node and the 'BoxedRatio' metric.
|
|
261
|
+
* Aggregation of scores (standardized, non-standardized, per-tree) for 'BoxedDensity2' is done in the same
|
|
262
|
+
* way as for 'Density', and it also has a natural threshold at zero for determining
|
|
263
|
+
* outliers and inliers.
|
|
264
|
+
* 'BoxedDensity2' is typically usable with 'ndim>1', but tends to produce much bigger values
|
|
265
|
+
* compared to 'ndim=1'.
|
|
266
|
+
* Albeit unintuitively, in many datasets, one can usually get better results with metric
|
|
267
|
+
* 'BoxedDensity' instead.
|
|
268
|
+
* The calculation of 'BoxedDensity2' is also controlled by 'fast_bratio'.
|
|
269
|
+
* 'BoxedDensity2' is incompatible with 'penalize_range'.
|
|
270
|
+
* If passing 'BoxedDensity', will set the score as the ratio between the fraction of points within the sample that
|
|
271
|
+
* end up in a given terminal node and the ratio between the boxed volume of the feature
|
|
272
|
+
* space in the sample and the boxed volume of a node given by the split conditions (inverse
|
|
273
|
+
* as in 'BoxedDensity2'). This metric does not have any theoretical or intuitive
|
|
274
|
+
* justification behind its existence, and it is perhaps illogical to use it as a
|
|
275
|
+
* scoring metric, but tends to produce good results in some datasets.
|
|
276
|
+
* The standardized outlier scores for 'BoxedDensity' are defined as the negative of the geometric mean,
|
|
277
|
+
* while the non-standardized scores are the geometric mean, and the per-tree scores are simply the 'density' values.
|
|
278
|
+
* The calculation of 'BoxedDensity' is also controlled by 'fast_bratio'.
|
|
279
|
+
* 'BoxedDensity' option is incompatible with 'penalize_range'.
|
|
280
|
+
* - fast_bratio
|
|
281
|
+
* When using "boxed" metrics for scoring, whether to calculate them in a fast way through
|
|
282
|
+
* cumulative sum of logarithms of ratios after each split, or in a slower way as sum of
|
|
283
|
+
* logarithms of a single ratio per column for each terminal node.
|
|
284
|
+
* Usually, both methods should give the same results, but in some datasets, particularly
|
|
285
|
+
* when variables have too small or too large ranges, the first method can be prone to
|
|
286
|
+
* numerical inaccuracies due to roundoff close to zero.
|
|
287
|
+
* Note that this does not affect calculations for models with 'ndim>1', since given the
|
|
288
|
+
* split types, the calculation for them is different.
|
|
289
|
+
* - standardize_dist
|
|
290
|
+
* If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
|
|
291
|
+
* depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
|
|
292
|
+
* - tmat[nrows * (nrows - 1) / 2]
|
|
293
|
+
* Array in which to calculate average separation depths or standardized distance metric (see documentation
|
|
294
|
+
* for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
|
|
295
|
+
* the regular model process. If passing this output argument, the sample size must be the same as the number
|
|
296
|
+
* of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
|
|
297
|
+
* output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
|
|
298
|
+
* entry 0 <= i < j < n will be located at position
|
|
299
|
+
* p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
|
|
300
|
+
* Can be converted to a dense square matrix through function 'tmat_to_dense'.
|
|
301
|
+
* - output_depths[nrows]
|
|
302
|
+
* Array in which to calculate average path depths or standardized outlierness metric (see documentation
|
|
303
|
+
* for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
|
|
304
|
+
* the regular model process. If passing this output argument, the sample size must be the same as the number
|
|
305
|
+
* of rows. If not NULL, must already be initialized to zeros.
|
|
306
|
+
* - standardize_depth
|
|
307
|
+
* If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
|
|
308
|
+
* a metric in which the more outlier is an observation, the closer this standardized metric will be to 1,
|
|
309
|
+
* with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
|
|
310
|
+
* the average depth of each row across all trees.
|
|
311
|
+
* - col_weights[ncols_numeric + ncols_categ]
|
|
312
|
+
* Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
|
|
313
|
+
* Ignored when picking columns by deterministic criterion.
|
|
314
|
+
* If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
|
|
315
|
+
* effect is multiplicative.
|
|
316
|
+
* - weigh_by_kurt
|
|
317
|
+
* Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
|
|
318
|
+
* for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
|
|
319
|
+
* sample. For categorical columns, will calculate expected kurtosis if the column were converted to
|
|
320
|
+
* numerical by assigning to each category a random number ~ Unif(0, 1).
|
|
321
|
+
* This is intended as a cheap feature selector, while the parameter 'prob_pick_col_by_kurt'
|
|
322
|
+
* provides the option to do this at each node in the tree for a different overall type of model.
|
|
323
|
+
* If passing column weights or weighted column choices ('prob_pick_col_by_range', 'prob_pick_col_by_var'),
|
|
324
|
+
* the effect will be multiplicative. This option is not compatible with 'prob_pick_col_by_kurt'.
|
|
325
|
+
* If passing 'missing_action=fail' and the data has infinite values, columns with rows
|
|
326
|
+
* having infinite values will get a weight of zero. If passing a different value for missing
|
|
327
|
+
* action, infinite values will be ignored in the kurtosis calculation.
|
|
328
|
+
* If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
|
|
329
|
+
* in order not to favor columns with missing values (which would increase kurtosis by all having
|
|
330
|
+
* the same central value).
|
|
331
|
+
* - prob_pick_by_gain_pl
|
|
332
|
+
* This parameter indicates the probability of choosing the threshold on which to split a variable
|
|
333
|
+
* (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
|
|
334
|
+
* that maximizes a pooled standard deviation gain criterion (see references [9] and [11]) on the
|
|
335
|
+
* same variable or linear combination, similarly to regression trees such as CART.
|
|
336
|
+
* If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
|
|
337
|
+
* in which the largest standardized gain can be achieved.
|
|
338
|
+
* For categorical variables with 'ndim=1', will use shannon entropy instead (like in [7]).
|
|
339
|
+
* Compared to a simple averaged gain, this tends to result in more evenly-divided splits and more clustered
|
|
340
|
+
* groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
|
|
341
|
+
* When used for outlier detection, datasets with multimodal distributions usually see better performance
|
|
342
|
+
* under this type of splits.
|
|
343
|
+
* Note that, since this makes the trees more even and thus it takes more steps to produce isolated nodes,
|
|
344
|
+
* the resulting object will be heavier. When splits are not made according to any of 'prob_pick_by_gain_avg',
|
|
345
|
+
* 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
|
|
346
|
+
* Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
|
|
347
|
+
* every single tree will have the exact same splits.
|
|
348
|
+
* Be aware that 'penalize_range' can also have a large impact when using 'prob_pick_by_gain_pl'.
|
|
349
|
+
* Be aware also that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable
|
|
350
|
+
* model, every single tree will have the exact same splits.
|
|
351
|
+
* Under this option, models are likely to produce better results when increasing 'max_depth'.
|
|
352
|
+
* Alternatively, one can also control the depth through 'min_gain' (for which one might want to
|
|
353
|
+
* set 'max_depth=0').
|
|
354
|
+
* Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain',
|
|
355
|
+
* 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
|
|
356
|
+
* - prob_pick_by_gain_avg
|
|
357
|
+
* This parameter indicates the probability of choosing the threshold on which to split a variable
|
|
358
|
+
* (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
|
|
359
|
+
* that maximizes an averaged standard deviation gain criterion (see references [4] and [11]) on the
|
|
360
|
+
* same variable or linear combination.
|
|
361
|
+
* If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
|
|
362
|
+
* in which the largest standardized gain can be achieved.
|
|
363
|
+
* For categorical variables with 'ndim=1', will take the expected standard deviation that would be
|
|
364
|
+
* gotten if the column were converted to numerical by assigning to each category a random
|
|
365
|
+
* number ~ Unif(0, 1) and calculate gain with those assumed standard deviations.
|
|
366
|
+
* Compared to a pooled gain, this tends to result in more cases in which a single observation or very
|
|
367
|
+
* few of them are put into one branch. Typically, datasets with outliers defined by extreme values in
|
|
368
|
+
* some column more or less independently of the rest, usually see better performance under this type
|
|
369
|
+
* of split. Recommended to use sub-samples (parameter 'sample_size') when
|
|
370
|
+
* passing this parameter. Note that, since this will create isolated nodes faster, the resulting object
|
|
371
|
+
* will be lighter (use less memory).
|
|
372
|
+
* When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
|
|
373
|
+
* 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
|
|
374
|
+
* Default setting for [1], [2], [3] is zero, and default for [4] is 1.
|
|
375
|
+
* This is the randomization parameter that can be passed to the author's original code in [5],
|
|
376
|
+
* but note that the code in [5] suffers from a mathematical error in the calculation of running standard deviations,
|
|
377
|
+
* so the results from it might not match with this library's.
|
|
378
|
+
* Be aware that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable model,
|
|
379
|
+
* every single tree will have the exact same splits.
|
|
380
|
+
* Under this option, models are likely to produce better results when increasing 'max_depth'.
|
|
381
|
+
* Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
|
|
382
|
+
* 'prob_pick_by_full_gain', 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
|
|
383
|
+
* - prob_pick_by_full_gain
|
|
384
|
+
* This parameter indicates the probability of choosing the threshold on which to split a variable
|
|
385
|
+
* (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
|
|
386
|
+
* that minimizes the pooled sums of variances of all columns (or a subset of them if using
|
|
387
|
+
* 'ncols_per_tree').
|
|
388
|
+
* In general, 'prob_pick_by_full_gain' is much slower to evaluate than the other gain types, and does not tend to
|
|
389
|
+
* lead to better results. When using 'prob_pick_by_full_gain', one might want to use a different scoring
|
|
390
|
+
* metric (particularly 'Density', 'BoxedDensity2' or 'BoxedRatio'). Note that
|
|
391
|
+
* the variance calculations are all done through the (exact) sorted-indices approach, which is much
|
|
392
|
+
* slower than the (approximate) histogram approach used by other decision tree software.
|
|
393
|
+
* Be aware that the data is not standardized in any way for the variance calculations, thus the scales
|
|
394
|
+
* of features will make a large difference under 'prob_pick_by_full_gain', which might not make it suitable for
|
|
395
|
+
* all types of data.
|
|
396
|
+
* 'prob_pick_by_full_gain' is not compatible with categorical data, and 'min_gain' does not apply to it.
|
|
397
|
+
* When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
|
|
398
|
+
* 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
|
|
399
|
+
* Default setting for [1], [2], [3], [4] is zero.
|
|
400
|
+
* - prob_pick_by_dens
|
|
401
|
+
* This parameter indicates the probability of choosing the threshold on which to split a variable
|
|
402
|
+
* (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
|
|
403
|
+
* that maximizes the pooled densities of the branch distributions.
|
|
404
|
+
* The 'min_gain' option does not apply to this type of splits.
|
|
405
|
+
* When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
|
|
406
|
+
* 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
|
|
407
|
+
* Default setting for [1], [2], [3], [4] is zero.
|
|
408
|
+
* - prob_pick_col_by_range
|
|
409
|
+
* When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
|
|
410
|
+
* proportional to the range spanned by each column within a node as proposed in reference [12].
|
|
411
|
+
* When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
|
|
412
|
+
* probability proportional to the range spanned by each column within a node.
|
|
413
|
+
* This option is not compatible with categorical data. If passing column weights, the
|
|
414
|
+
* effect will be multiplicative.
|
|
415
|
+
* Be aware that the data is not standardized in any way for the range calculations, thus the scales
|
|
416
|
+
* of features will make a large difference under this option, which might not make it suitable for
|
|
417
|
+
* all types of data.
|
|
418
|
+
* Note that the proposed RRCF model from [12] uses a different scoring metric for producing anomaly
|
|
419
|
+
* scores, while this library uses isolation depth regardless of how columns are chosen, thus results
|
|
420
|
+
* are likely to be different from those of other software implementations. Nevertheless, as explored
|
|
421
|
+
* in [11], isolation depth as a scoring metric typically provides better results than the
|
|
422
|
+
* "co-displacement" metric from [12] under these split types.
|
|
423
|
+
* - prob_pick_col_by_var
|
|
424
|
+
* When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
|
|
425
|
+
* proportional to the variance of each column within a node.
|
|
426
|
+
* When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
|
|
427
|
+
* probability proportional to the variance of each column within a node.
|
|
428
|
+
* For categorical data, it will calculate the expected variance if the column were converted to
|
|
429
|
+
* numerical by assigning to each category a random number ~ Unif(0, 1), which depending on the number of
|
|
430
|
+
* categories and their distribution, produces numbers typically a bit smaller than standardized numerical
|
|
431
|
+
* variables.
|
|
432
|
+
* Note that when using sparse matrices, the calculation of variance will rely on a procedure that
|
|
433
|
+
* uses sums of squares, which has less numerical precision than the
|
|
434
|
+
* calculation used for dense inputs, and as such, the results might differ slightly.
|
|
435
|
+
* Be aware that this calculated variance is not standardized in any way, so the scales of
|
|
436
|
+
* features will make a large difference under this option.
|
|
437
|
+
* If there are infinite values, all columns having infinite values will be treated as having the
|
|
438
|
+
* same weight, and will be chosen before every other column with non-infinite values.
|
|
439
|
+
* If passing column weights, the effect will be multiplicative.
|
|
440
|
+
* If passing a 'missing_action' different than 'fail', infinite values will be ignored for the
|
|
441
|
+
* variance calculation. Otherwise, all columns with infinite values will have the same probability
|
|
442
|
+
* and will be chosen before columns with non-infinite values.
|
|
443
|
+
* - prob_pick_col_by_kurt
|
|
444
|
+
* When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
|
|
445
|
+
* proportional to the kurtosis of each column **within a node** (unlike the option 'weigh_by_kurt'
|
|
446
|
+
* which calculates this metric only at the root).
|
|
447
|
+
* When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
|
|
448
|
+
* probability proportional to the kurtosis of each column within a node.
|
|
449
|
+
* For categorical data, it will calculate the expected kurtosis if the column were converted to
|
|
450
|
+
* numerical by assigning to each category a random number ~ Unif(0, 1).
|
|
451
|
+
* Note that when using sparse matrices, the calculation of kurtosis will rely on a procedure that
|
|
452
|
+
* uses sums of squares and higher-power numbers, which has less numerical precision than the
|
|
453
|
+
* calculation used for dense inputs, and as such, the results might differ slightly.
|
|
454
|
+
* If passing column weights, the effect will be multiplicative. This option is not compatible
|
|
455
|
+
* with 'weigh_by_kurt'.
|
|
456
|
+
* If passing a 'missing_action' different than 'fail', infinite values will be ignored for the
|
|
457
|
+
* kurtosis calculation. Otherwise, all columns with infinite values will have the same probability
|
|
458
|
+
* and will be chosen before columns with non-infinite values.
|
|
459
|
+
* If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
|
|
460
|
+
* in order not to favor columns with missing values (which would increase kurtosis by all having
|
|
461
|
+
* the same central value).
|
|
462
|
+
* Be aware that kurtosis can be a rather slow metric to calculate.
|
|
463
|
+
* - min_gain
|
|
464
|
+
* Minimum gain that a split threshold needs to produce in order to proceed with a split.
|
|
465
|
+
* Only used when the splits are decided by a variance gain criterion ('prob_pick_by_gain_pl' or
|
|
466
|
+
* 'prob_pick_by_gain_avg', but not 'prob_pick_by_full_gain' nor 'prob_pick_by_dens').
|
|
467
|
+
* If the highest possible gain in the evaluated splits at a node is below this threshold,
|
|
468
|
+
* that node becomes a terminal node.
|
|
469
|
+
* This can be used as a more sophisticated depth control when using pooled gain (note that 'max_depth'
|
|
470
|
+
* still applies on top of this heuristic).
|
|
471
|
+
* - missing_action
|
|
472
|
+
* How to handle missing data at both fitting and prediction time. Options are a) 'Divide' (for the single-variable
|
|
473
|
+
* model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
|
|
474
|
+
* the data that went to each branch when fitting the model, b) 'Impute', which will assign observations to the
|
|
475
|
+
* branch with the most observations in the single-variable model (but imputed values will also be used for
|
|
476
|
+
* gain calculations), or fill in missing values with the median of each column of the sample from which the
|
|
477
|
+
* split was made in the extended model (recommended) (but note that the calculation of medians does not take
|
|
478
|
+
* into account sample weights when using 'weights_as_sample_prob=false', and note that when using a gain
|
|
479
|
+
* criterion for splits with 'ndim=1', it will use the imputed values in the calculation), c) 'Fail' which will
|
|
480
|
+
* assume that there are no missing values and will trigger undefined behavior if it encounters any.
|
|
481
|
+
* In the extended model, infinite values will be treated as missing.
|
|
482
|
+
* Note that passing 'Fail' might crash the process if there turn out to be missing values, but will otherwise
|
|
483
|
+
* produce faster fitting and prediction times along with decreased model object sizes.
|
|
484
|
+
* Models from [1], [2], [3], [4] correspond to 'Fail' here.
|
|
485
|
+
* - cat_split_type
|
|
486
|
+
* Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
|
|
487
|
+
* a single category to a branch and the rest to the other branch. For the extended model, whether to
|
|
488
|
+
* give each category a coefficient, or only one while the rest get zero.
|
|
489
|
+
* - new_cat_action
|
|
490
|
+
* What to do after splitting a categorical feature when new data that reaches that split has categories that
|
|
491
|
+
* the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
|
|
492
|
+
* in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
|
|
493
|
+
* data that went to each branch when fitting the model, and in the extended model will assign
|
|
494
|
+
* them the median value for that column that was added to the linear combination of features (but note that
|
|
495
|
+
* this median calculation does not use sample weights when using 'weights_as_sample_prob=false'),
|
|
496
|
+
* b) "Smallest", which will assign all observations with unseen categories in the split to the branch that
|
|
497
|
+
* had fewer observations when fitting the model, c) "Random", which will assign a branch (coefficient in the
|
|
498
|
+
* extended model) at random for each category beforehand, even if no observations had that category when
|
|
499
|
+
* fitting the model. Ignored when passing 'cat_split_type' = 'SingleCateg'.
|
|
500
|
+
* - all_perm
|
|
501
|
+
* When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
|
|
502
|
+
* whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
|
|
503
|
+
* will sort the categories by their frequency and make a grouping in this sorted order. Note that the
|
|
504
|
+
* number of combinations evaluated (if 'true') is the factorial of the number of present categories in
|
|
505
|
+
* a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
|
|
506
|
+
* category in a separate branch, so not evaluating all permutations (passing 'false') will make it
|
|
507
|
+
* possible to select other splits that respect the sorted frequency order.
|
|
508
|
+
* The total number of combinations must be a number that can fit into a 'size_t' variable - for x86-64
|
|
509
|
+
* systems, this means no column can have more than 20 different categories if using 'all_perm=true',
|
|
510
|
+
* but note that this is not checked within the function.
|
|
511
|
+
* Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
|
|
512
|
+
* - coef_by_prop
|
|
513
|
+
* In the extended model, whether to sort the randomly-generated coefficients for categories
|
|
514
|
+
* according to their relative frequency in the tree node. This might provide better results when using
|
|
515
|
+
* categorical variables with too many categories, but is not recommended, and not reflective of
|
|
516
|
+
* real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
|
|
517
|
+
* variables.
|
|
518
|
+
* - imputer (out)
|
|
519
|
+
* Pointer to already-allocated imputer object, which can be used to produce missing value imputations
|
|
520
|
+
* in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
|
|
521
|
+
* 'missing_action' as missing values inside the model are treated differently and follow their own imputation
|
|
522
|
+
* or division strategy.
|
|
523
|
+
* - min_imp_obs
|
|
524
|
+
* Minimum number of observations with which an imputation value can be produced. Ignored if passing
|
|
525
|
+
* 'build_imputer' = 'false'.
|
|
526
|
+
* - depth_imp
|
|
527
|
+
* How to weight observations according to their depth when used for imputing missing values. Passing
|
|
528
|
+
* "Higher" will weigh observations higher the further down the tree (away from the root node) the
|
|
529
|
+
* terminal node is, while "Lower" will do the opposite, and "Same" will not modify the weights according
|
|
530
|
+
* to node depth in the tree. Implemented for testing purposes and not recommended to change
|
|
531
|
+
* from the default. Ignored when not passing 'impute_nodes'.
|
|
532
|
+
* - weigh_imp_rows
|
|
533
|
+
* How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
|
|
534
|
+
* a node inversely proportional to the number of observations that end up there, while "Proportional"
|
|
535
|
+
* will weight them heavier the more observations there are, and "Flat" will weigh all nodes the same
|
|
536
|
+
* in this regard regardless of how many observations end up there. Implemented for testing purposes
|
|
537
|
+
* and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
|
|
538
|
+
* - impute_at_fit
|
|
539
|
+
* Whether to impute missing values in the input data as the model is being built. If passing 'true',
|
|
540
|
+
* then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
|
|
541
|
+
* 'categ_data', and 'Xc', will get overwritten with the imputations produced.
|
|
542
|
+
* - random_seed
|
|
543
|
+
* Seed that will be used to generate random numbers used by the model.
|
|
544
|
+
* - use_long_double
|
|
545
|
+
* Whether to use 'long double' (extended precision) type for more precise calculations about
|
|
546
|
+
* standard deviations, means, ratios, weights, gain, and other potential aggregates. This makes
|
|
547
|
+
* such calculations accurate to a larger number of decimals (provided that the compiler used has
|
|
548
|
+
* wider long doubles than doubles) and it is highly recommended to use when the input data has
|
|
549
|
+
* a number of rows or columns exceeding 2^53 (an unlikely scenario), and also highly recommended
|
|
550
|
+
* to use when the input data has problematic scales (e.g. numbers that differ from each other by
|
|
551
|
+
* something like 10^-100 or columns that include values like 10^100 and 10^-100 and still need to
|
|
552
|
+
* be sensitive to a difference of 10^-100), but will make the calculations slower, the more so in
|
|
553
|
+
* platforms in which 'long double' is a software-emulated type (e.g. Power8 platforms).
|
|
554
|
+
* Note that some platforms (most notably windows with the msvc compiler) do not make any difference
|
|
555
|
+
* between 'double' and 'long double'.
|
|
556
|
+
* - nthreads
|
|
557
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
558
|
+
* allocated, even if the thread does not end up being used.
|
|
559
|
+
* Be aware that most of the operations are bound by memory bandwidth, which means that
|
|
560
|
+
* adding more threads will not result in a linear speed-up. For some types of data
|
|
561
|
+
* (e.g. large sparse matrices with small sample sizes), adding more threads might result
|
|
562
|
+
* in only a very modest speed up (e.g. 1.5x faster with 4x more threads),
|
|
563
|
+
* even if all threads look fully utilized.
|
|
564
|
+
* Ignored when not building with OpenMP support.
|
|
565
|
+
*
|
|
566
|
+
* Returns
|
|
567
|
+
* =======
|
|
568
|
+
* Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
|
|
569
|
+
* If the process receives an interrupt signal, will return instead
|
|
570
|
+
* 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
|
|
571
|
+
* what these values correspond to, you can use the functions
|
|
572
|
+
* 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
|
|
573
|
+
* as integers.
|
|
574
|
+
*/
|
|
575
|
+
template <class real_t, class sparse_ix>
|
|
576
|
+
int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
577
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
578
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
579
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
580
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
581
|
+
real_t sample_weights[], bool with_replacement, bool weight_as_sample,
|
|
582
|
+
size_t nrows, size_t sample_size, size_t ntrees,
|
|
583
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
584
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
585
|
+
ScoringMetric scoring_metric, bool fast_bratio,
|
|
586
|
+
bool standardize_dist, double tmat[],
|
|
587
|
+
double output_depths[], bool standardize_depth,
|
|
588
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
589
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
590
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
591
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
592
|
+
double prob_pick_col_by_kurt,
|
|
593
|
+
double min_gain, MissingAction missing_action,
|
|
594
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
595
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
596
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
|
597
|
+
uint64_t random_seed, bool use_long_double, int nthreads)
|
|
598
|
+
{
|
|
599
|
+
if (use_long_double && !has_long_double()) {
|
|
600
|
+
use_long_double = false;
|
|
601
|
+
fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
|
|
602
|
+
}
|
|
603
|
+
#ifndef NO_LONG_DOUBLE
|
|
604
|
+
if (likely(!use_long_double))
|
|
605
|
+
#endif
|
|
606
|
+
return fit_iforest_internal<real_t, sparse_ix, double>(
|
|
607
|
+
model_outputs, model_outputs_ext,
|
|
608
|
+
numeric_data, ncols_numeric,
|
|
609
|
+
categ_data, ncols_categ, ncat,
|
|
610
|
+
Xc, Xc_ind, Xc_indptr,
|
|
611
|
+
ndim, ntry, coef_type, coef_by_prop,
|
|
612
|
+
sample_weights, with_replacement, weight_as_sample,
|
|
613
|
+
nrows, sample_size, ntrees,
|
|
614
|
+
max_depth, ncols_per_tree,
|
|
615
|
+
limit_depth, penalize_range, standardize_data,
|
|
616
|
+
scoring_metric, fast_bratio,
|
|
617
|
+
standardize_dist, tmat,
|
|
618
|
+
output_depths, standardize_depth,
|
|
619
|
+
col_weights, weigh_by_kurt,
|
|
620
|
+
prob_pick_by_gain_pl, prob_pick_by_gain_avg,
|
|
621
|
+
prob_pick_by_full_gain, prob_pick_by_dens,
|
|
622
|
+
prob_pick_col_by_range, prob_pick_col_by_var,
|
|
623
|
+
prob_pick_col_by_kurt,
|
|
624
|
+
min_gain, missing_action,
|
|
625
|
+
cat_split_type, new_cat_action,
|
|
626
|
+
all_perm, imputer, min_imp_obs,
|
|
627
|
+
depth_imp, weigh_imp_rows, impute_at_fit,
|
|
628
|
+
random_seed, nthreads
|
|
629
|
+
);
|
|
630
|
+
#ifndef NO_LONG_DOUBLE
|
|
631
|
+
else
|
|
632
|
+
return fit_iforest_internal<real_t, sparse_ix, long double>(
|
|
633
|
+
model_outputs, model_outputs_ext,
|
|
634
|
+
numeric_data, ncols_numeric,
|
|
635
|
+
categ_data, ncols_categ, ncat,
|
|
636
|
+
Xc, Xc_ind, Xc_indptr,
|
|
637
|
+
ndim, ntry, coef_type, coef_by_prop,
|
|
638
|
+
sample_weights, with_replacement, weight_as_sample,
|
|
639
|
+
nrows, sample_size, ntrees,
|
|
640
|
+
max_depth, ncols_per_tree,
|
|
641
|
+
limit_depth, penalize_range, standardize_data,
|
|
642
|
+
scoring_metric, fast_bratio,
|
|
643
|
+
standardize_dist, tmat,
|
|
644
|
+
output_depths, standardize_depth,
|
|
645
|
+
col_weights, weigh_by_kurt,
|
|
646
|
+
prob_pick_by_gain_pl, prob_pick_by_gain_avg,
|
|
647
|
+
prob_pick_by_full_gain, prob_pick_by_dens,
|
|
648
|
+
prob_pick_col_by_range, prob_pick_col_by_var,
|
|
649
|
+
prob_pick_col_by_kurt,
|
|
650
|
+
min_gain, missing_action,
|
|
651
|
+
cat_split_type, new_cat_action,
|
|
652
|
+
all_perm, imputer, min_imp_obs,
|
|
653
|
+
depth_imp, weigh_imp_rows, impute_at_fit,
|
|
654
|
+
random_seed, nthreads
|
|
655
|
+
);
|
|
656
|
+
#endif
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
660
|
+
int fit_iforest_internal(
|
|
661
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
662
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
663
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
664
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
665
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
666
|
+
real_t sample_weights[], bool with_replacement, bool weight_as_sample,
|
|
667
|
+
size_t nrows, size_t sample_size, size_t ntrees,
|
|
668
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
669
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
670
|
+
ScoringMetric scoring_metric, bool fast_bratio,
|
|
671
|
+
bool standardize_dist, double tmat[],
|
|
672
|
+
double output_depths[], bool standardize_depth,
|
|
673
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
674
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
675
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
676
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
677
|
+
double prob_pick_col_by_kurt,
|
|
678
|
+
double min_gain, MissingAction missing_action,
|
|
679
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
680
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
681
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
|
682
|
+
uint64_t random_seed, int nthreads)
|
|
683
|
+
{
|
|
684
|
+
if (
|
|
685
|
+
prob_pick_by_gain_avg < 0 || prob_pick_by_gain_pl < 0 ||
|
|
686
|
+
prob_pick_by_full_gain < 0 || prob_pick_by_dens < 0 ||
|
|
687
|
+
prob_pick_col_by_range < 0 ||
|
|
688
|
+
prob_pick_col_by_var < 0 || prob_pick_col_by_kurt < 0
|
|
689
|
+
) {
|
|
690
|
+
throw std::runtime_error("Cannot pass negative probabilities.\n");
|
|
691
|
+
}
|
|
692
|
+
if (prob_pick_col_by_range && ncols_categ)
|
|
693
|
+
throw std::runtime_error("'prob_pick_col_by_range' is not compatible with categorical data.\n");
|
|
694
|
+
if (prob_pick_by_full_gain && ncols_categ)
|
|
695
|
+
throw std::runtime_error("'prob_pick_by_full_gain' is not compatible with categorical data.\n");
|
|
696
|
+
if (prob_pick_col_by_kurt && weigh_by_kurt)
|
|
697
|
+
throw std::runtime_error("'weigh_by_kurt' and 'prob_pick_col_by_kurt' cannot be used together.\n");
|
|
698
|
+
if (ndim == 0 && model_outputs == NULL)
|
|
699
|
+
throw std::runtime_error("Must pass 'ndim>0' in the extended model.\n");
|
|
700
|
+
if (penalize_range &&
|
|
701
|
+
(scoring_metric == Density ||
|
|
702
|
+
scoring_metric == AdjDensity ||
|
|
703
|
+
is_boxed_metric(scoring_metric))
|
|
704
|
+
)
|
|
705
|
+
throw std::runtime_error("'penalize_range' is incompatible with density scoring.\n");
|
|
706
|
+
if (with_replacement) {
|
|
707
|
+
if (tmat != NULL)
|
|
708
|
+
throw std::runtime_error("Cannot calculate distance while sampling with replacement.\n");
|
|
709
|
+
if (output_depths != NULL)
|
|
710
|
+
throw std::runtime_error("Cannot make predictions at fit time when sampling with replacement.\n");
|
|
711
|
+
if (impute_at_fit)
|
|
712
|
+
throw std::runtime_error("Cannot impute at fit time when sampling with replacement.\n");
|
|
713
|
+
}
|
|
714
|
+
if (sample_size != 0 && sample_size < nrows) {
|
|
715
|
+
if (output_depths != NULL)
|
|
716
|
+
throw std::runtime_error("Cannot produce outlier scores at fit time when using sub-sampling.\n");
|
|
717
|
+
if (tmat != NULL)
|
|
718
|
+
throw std::runtime_error("Cannot calculate distances at fit time when using sub-sampling.\n");
|
|
719
|
+
if (impute_at_fit)
|
|
720
|
+
throw std::runtime_error("Cannot produce missing data imputations at fit time when using sub-sampling.\n");
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
/* TODO: this function should also accept the array as a memoryview with a
|
|
725
|
+
leading dimension that might not correspond to the number of columns,
|
|
726
|
+
so as to avoid having to make deep copies of memoryviews in python and to
|
|
727
|
+
allow using pointers to columns of dataframes in R and Python. */
|
|
728
|
+
|
|
729
|
+
/* calculate maximum number of categories to use later */
|
|
730
|
+
int max_categ = 0;
|
|
731
|
+
for (size_t col = 0; col < ncols_categ; col++)
|
|
732
|
+
max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
|
|
733
|
+
|
|
734
|
+
bool calc_dist = tmat != NULL;
|
|
735
|
+
|
|
736
|
+
if (sample_size == 0)
|
|
737
|
+
sample_size = nrows;
|
|
738
|
+
|
|
739
|
+
if (model_outputs != NULL)
|
|
740
|
+
ntry = std::min(ntry, ncols_numeric + ncols_categ);
|
|
741
|
+
|
|
742
|
+
if (ncols_per_tree == 0)
|
|
743
|
+
ncols_per_tree = ncols_numeric + ncols_categ;
|
|
744
|
+
|
|
745
|
+
/* put data in structs to shorten function calls */
|
|
746
|
+
InputData<real_t, sparse_ix>
|
|
747
|
+
input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
|
|
748
|
+
nrows, ncols_numeric + ncols_categ, sample_weights,
|
|
749
|
+
weight_as_sample, col_weights,
|
|
750
|
+
Xc, Xc_ind, Xc_indptr,
|
|
751
|
+
0, 0, std::vector<double>(),
|
|
752
|
+
std::vector<char>(), 0, NULL,
|
|
753
|
+
(double*)NULL, (double*)NULL, (int*)NULL, std::vector<double>(),
|
|
754
|
+
std::vector<double>(), std::vector<double>(),
|
|
755
|
+
std::vector<size_t>(), std::vector<size_t>()};
|
|
756
|
+
ModelParams model_params = {with_replacement, sample_size, ntrees, ncols_per_tree,
|
|
757
|
+
limit_depth? log2ceil(sample_size) : max_depth? max_depth : (sample_size - 1),
|
|
758
|
+
penalize_range, standardize_data, random_seed, weigh_by_kurt,
|
|
759
|
+
prob_pick_by_gain_avg, prob_pick_by_gain_pl,
|
|
760
|
+
prob_pick_by_full_gain, prob_pick_by_dens,
|
|
761
|
+
prob_pick_col_by_range, prob_pick_col_by_var,
|
|
762
|
+
prob_pick_col_by_kurt,
|
|
763
|
+
min_gain, cat_split_type, new_cat_action, missing_action,
|
|
764
|
+
scoring_metric, fast_bratio, all_perm,
|
|
765
|
+
(model_outputs != NULL)? 0 : ndim, ntry,
|
|
766
|
+
coef_type, coef_by_prop, calc_dist, (bool)(output_depths != NULL), impute_at_fit,
|
|
767
|
+
depth_imp, weigh_imp_rows, min_imp_obs};
|
|
768
|
+
|
|
769
|
+
/* if calculating full gain, need to produce copies of the data in row-major order */
|
|
770
|
+
if (prob_pick_by_full_gain)
|
|
771
|
+
{
|
|
772
|
+
if (input_data.Xc_indptr == NULL)
|
|
773
|
+
colmajor_to_rowmajor(input_data.numeric_data, input_data.nrows, input_data.ncols_numeric, input_data.X_row_major);
|
|
774
|
+
else
|
|
775
|
+
colmajor_to_rowmajor(input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
776
|
+
input_data.nrows, input_data.ncols_numeric,
|
|
777
|
+
input_data.Xr, input_data.Xr_ind, input_data.Xr_indptr);
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
/* if using weights as sampling probability, build a binary tree for faster sampling */
|
|
781
|
+
if (input_data.weight_as_sample && input_data.sample_weights != NULL)
|
|
782
|
+
{
|
|
783
|
+
build_btree_sampler(input_data.btree_weights_init, input_data.sample_weights,
|
|
784
|
+
input_data.nrows, input_data.log2_n, input_data.btree_offset);
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
/* same for column weights */
|
|
788
|
+
/* TODO: this should also save the kurtoses when using 'prob_pick_col_by_kurt' */
|
|
789
|
+
ColumnSampler<ldouble_safe> base_col_sampler;
|
|
790
|
+
if (
|
|
791
|
+
col_weights != NULL ||
|
|
792
|
+
(model_params.weigh_by_kurt && model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
|
|
793
|
+
(model_params.ncols_per_tree >= input_data.ncols_tot / (model_params.ntrees * 2)))
|
|
794
|
+
)
|
|
795
|
+
{
|
|
796
|
+
bool avoid_col_weights = (model_outputs != NULL && model_params.ntry >= model_params.ncols_per_tree &&
|
|
797
|
+
model_params.prob_pick_by_gain_avg + model_params.prob_pick_by_gain_pl +
|
|
798
|
+
model_params.prob_pick_by_full_gain + model_params.prob_pick_by_dens >= 1)
|
|
799
|
+
||
|
|
800
|
+
(model_outputs == NULL && model_params.ndim >= model_params.ncols_per_tree)
|
|
801
|
+
||
|
|
802
|
+
(model_params.ncols_per_tree == 1);
|
|
803
|
+
if (!avoid_col_weights)
|
|
804
|
+
{
|
|
805
|
+
if (model_params.weigh_by_kurt && model_params.sample_size == input_data.nrows && !model_params.with_replacement)
|
|
806
|
+
{
|
|
807
|
+
RNG_engine rnd_generator(random_seed);
|
|
808
|
+
std::vector<double> kurt_weights = calc_kurtosis_all_data<InputData<real_t, sparse_ix>, ldouble_safe>(input_data, model_params, rnd_generator);
|
|
809
|
+
if (col_weights != NULL)
|
|
810
|
+
{
|
|
811
|
+
for (size_t col = 0; col < input_data.ncols_tot; col++)
|
|
812
|
+
{
|
|
813
|
+
if (kurt_weights[col] <= 0) continue;
|
|
814
|
+
kurt_weights[col] *= col_weights[col];
|
|
815
|
+
kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
|
|
816
|
+
}
|
|
817
|
+
}
|
|
818
|
+
base_col_sampler.initialize(kurt_weights.data(), input_data.ncols_tot);
|
|
819
|
+
|
|
820
|
+
if (model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var)
|
|
821
|
+
{
|
|
822
|
+
input_data.all_kurtoses = std::move(kurt_weights);
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
|
|
826
|
+
else
|
|
827
|
+
{
|
|
828
|
+
base_col_sampler.initialize(input_data.col_weights, input_data.ncols_tot);
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
input_data.preinitialized_col_sampler = &base_col_sampler;
|
|
832
|
+
}
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
/* in some cases, all trees will need to calculate variable ranges for all columns */
|
|
836
|
+
/* TODO: the model might use 'leave_m_cols', or have 'prob_pick_col_by_range<1', in which
|
|
837
|
+
case it might not be beneficial to do this beforehand. Find out when the expected gain
|
|
838
|
+
from doing this here is not beneficial. */
|
|
839
|
+
/* TODO: move this to a different file, it doesn't belong here */
|
|
840
|
+
std::vector<double> variable_ranges_low;
|
|
841
|
+
std::vector<double> variable_ranges_high;
|
|
842
|
+
std::vector<int> variable_ncats;
|
|
843
|
+
if (
|
|
844
|
+
model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
|
|
845
|
+
(model_params.ncols_per_tree >= input_data.ncols_numeric) &&
|
|
846
|
+
((model_params.prob_pick_col_by_range && input_data.ncols_numeric)
|
|
847
|
+
||
|
|
848
|
+
is_boxed_metric(model_params.scoring_metric))
|
|
849
|
+
)
|
|
850
|
+
{
|
|
851
|
+
variable_ranges_low.resize(input_data.ncols_numeric);
|
|
852
|
+
variable_ranges_high.resize(input_data.ncols_numeric);
|
|
853
|
+
|
|
854
|
+
std::unique_ptr<unsigned char[]> buffer_cats;
|
|
855
|
+
size_t adj_col;
|
|
856
|
+
if (is_boxed_metric(model_params.scoring_metric))
|
|
857
|
+
{
|
|
858
|
+
variable_ncats.resize(input_data.ncols_categ);
|
|
859
|
+
buffer_cats = std::unique_ptr<unsigned char[]>(new unsigned char[input_data.max_categ]);
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
if (base_col_sampler.col_indices.empty())
|
|
863
|
+
base_col_sampler.initialize(input_data.ncols_tot);
|
|
864
|
+
|
|
865
|
+
bool unsplittable;
|
|
866
|
+
size_t n_tried_numeric = 0;
|
|
867
|
+
size_t col;
|
|
868
|
+
base_col_sampler.prepare_full_pass();
|
|
869
|
+
while (base_col_sampler.sample_col(col))
|
|
870
|
+
{
|
|
871
|
+
if (col < input_data.ncols_numeric)
|
|
872
|
+
{
|
|
873
|
+
if (input_data.Xc_indptr == NULL)
|
|
874
|
+
{
|
|
875
|
+
get_range(input_data.numeric_data + nrows*col,
|
|
876
|
+
input_data.nrows,
|
|
877
|
+
model_params.missing_action,
|
|
878
|
+
variable_ranges_low[col],
|
|
879
|
+
variable_ranges_high[col],
|
|
880
|
+
unsplittable);
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
else
|
|
884
|
+
{
|
|
885
|
+
get_range(col, input_data.nrows,
|
|
886
|
+
input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
887
|
+
model_params.missing_action,
|
|
888
|
+
variable_ranges_low[col],
|
|
889
|
+
variable_ranges_high[col],
|
|
890
|
+
unsplittable);
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
n_tried_numeric++;
|
|
894
|
+
|
|
895
|
+
if (unsplittable)
|
|
896
|
+
{
|
|
897
|
+
variable_ranges_low[col] = 0;
|
|
898
|
+
variable_ranges_high[col] = 0;
|
|
899
|
+
base_col_sampler.drop_col(col);
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
else
|
|
904
|
+
{
|
|
905
|
+
if (!is_boxed_metric(model_params.scoring_metric))
|
|
906
|
+
{
|
|
907
|
+
if (n_tried_numeric >= input_data.ncols_numeric)
|
|
908
|
+
break;
|
|
909
|
+
else
|
|
910
|
+
continue;
|
|
911
|
+
}
|
|
912
|
+
adj_col = col - input_data.ncols_numeric;
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
variable_ncats[adj_col] = count_ncateg_in_col(input_data.categ_data + input_data.nrows*adj_col,
|
|
916
|
+
input_data.nrows, input_data.ncat[adj_col],
|
|
917
|
+
buffer_cats.get());
|
|
918
|
+
if (variable_ncats[adj_col] <= 1)
|
|
919
|
+
base_col_sampler.drop_col(col);
|
|
920
|
+
}
|
|
921
|
+
}
|
|
922
|
+
|
|
923
|
+
input_data.preinitialized_col_sampler = &base_col_sampler;
|
|
924
|
+
if (input_data.ncols_numeric) {
|
|
925
|
+
input_data.range_low = variable_ranges_low.data();
|
|
926
|
+
input_data.range_high = variable_ranges_high.data();
|
|
927
|
+
}
|
|
928
|
+
if (input_data.ncols_categ) {
|
|
929
|
+
input_data.ncat_ = variable_ncats.data();
|
|
930
|
+
}
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
/* if imputing missing values on-the-fly, need to determine which are missing */
|
|
934
|
+
std::vector<ImputedData<sparse_ix, ldouble_safe>> impute_vec;
|
|
935
|
+
hashed_map<size_t, ImputedData<sparse_ix, ldouble_safe>> impute_map;
|
|
936
|
+
if (model_params.impute_at_fit)
|
|
937
|
+
check_for_missing(input_data, impute_vec, impute_map, nthreads);
|
|
938
|
+
|
|
939
|
+
/* store model data */
|
|
940
|
+
if (model_outputs != NULL)
|
|
941
|
+
{
|
|
942
|
+
model_outputs->trees.resize(ntrees);
|
|
943
|
+
model_outputs->trees.shrink_to_fit();
|
|
944
|
+
model_outputs->new_cat_action = new_cat_action;
|
|
945
|
+
model_outputs->cat_split_type = cat_split_type;
|
|
946
|
+
model_outputs->missing_action = missing_action;
|
|
947
|
+
model_outputs->scoring_metric = scoring_metric;
|
|
948
|
+
if (
|
|
949
|
+
model_outputs->scoring_metric != Density &&
|
|
950
|
+
model_outputs->scoring_metric != BoxedDensity &&
|
|
951
|
+
model_outputs->scoring_metric != BoxedDensity2 &&
|
|
952
|
+
model_outputs->scoring_metric != BoxedRatio
|
|
953
|
+
)
|
|
954
|
+
model_outputs->exp_avg_depth = expected_avg_depth<ldouble_safe>(sample_size);
|
|
955
|
+
else
|
|
956
|
+
model_outputs->exp_avg_depth = 1;
|
|
957
|
+
model_outputs->exp_avg_sep = expected_separation_depth<ldouble_safe>(model_params.sample_size);
|
|
958
|
+
model_outputs->orig_sample_size = input_data.nrows;
|
|
959
|
+
model_outputs->has_range_penalty = penalize_range;
|
|
960
|
+
}
|
|
961
|
+
|
|
962
|
+
else
|
|
963
|
+
{
|
|
964
|
+
model_outputs_ext->hplanes.resize(ntrees);
|
|
965
|
+
model_outputs_ext->hplanes.shrink_to_fit();
|
|
966
|
+
model_outputs_ext->new_cat_action = new_cat_action;
|
|
967
|
+
model_outputs_ext->cat_split_type = cat_split_type;
|
|
968
|
+
model_outputs_ext->missing_action = missing_action;
|
|
969
|
+
model_outputs_ext->scoring_metric = scoring_metric;
|
|
970
|
+
if (
|
|
971
|
+
model_outputs_ext->scoring_metric != Density &&
|
|
972
|
+
model_outputs_ext->scoring_metric != BoxedDensity &&
|
|
973
|
+
model_outputs_ext->scoring_metric != BoxedDensity2 &&
|
|
974
|
+
model_outputs_ext->scoring_metric != BoxedRatio
|
|
975
|
+
)
|
|
976
|
+
model_outputs_ext->exp_avg_depth = expected_avg_depth<ldouble_safe>(sample_size);
|
|
977
|
+
else
|
|
978
|
+
model_outputs_ext->exp_avg_depth = 1;
|
|
979
|
+
model_outputs_ext->exp_avg_sep = expected_separation_depth<ldouble_safe>(model_params.sample_size);
|
|
980
|
+
model_outputs_ext->orig_sample_size = input_data.nrows;
|
|
981
|
+
model_outputs_ext->has_range_penalty = penalize_range;
|
|
982
|
+
}
|
|
983
|
+
|
|
984
|
+
if (imputer != NULL)
|
|
985
|
+
initialize_imputer<decltype(input_data), ldouble_safe>(
|
|
986
|
+
*imputer, input_data, ntrees, nthreads
|
|
987
|
+
);
|
|
988
|
+
|
|
989
|
+
/* initialize thread-private memory */
|
|
990
|
+
if ((size_t)nthreads > ntrees)
|
|
991
|
+
nthreads = (int)ntrees;
|
|
992
|
+
#ifdef _OPENMP
|
|
993
|
+
std::vector<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> worker_memory(nthreads);
|
|
994
|
+
#else
|
|
995
|
+
std::vector<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> worker_memory(1);
|
|
996
|
+
#endif
|
|
997
|
+
|
|
998
|
+
/* Global variable that determines if the procedure receives a stop signal */
|
|
999
|
+
SignalSwitcher ss = SignalSwitcher();
|
|
1000
|
+
|
|
1001
|
+
/* For exception handling */
|
|
1002
|
+
bool threw_exception = false;
|
|
1003
|
+
std::exception_ptr ex = NULL;
|
|
1004
|
+
|
|
1005
|
+
/* grow trees */
|
|
1006
|
+
#pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params, threw_exception, ex)
|
|
1007
|
+
for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
|
|
1008
|
+
{
|
|
1009
|
+
if (interrupt_switch || threw_exception)
|
|
1010
|
+
continue; /* Cannot break with OpenMP==2.0 (MSVC) */
|
|
1011
|
+
|
|
1012
|
+
try
|
|
1013
|
+
{
|
|
1014
|
+
if (
|
|
1015
|
+
model_params.impute_at_fit &&
|
|
1016
|
+
input_data.n_missing &&
|
|
1017
|
+
!worker_memory[omp_get_thread_num()].impute_vec.size() &&
|
|
1018
|
+
!worker_memory[omp_get_thread_num()].impute_map.size()
|
|
1019
|
+
)
|
|
1020
|
+
{
|
|
1021
|
+
#ifdef _OPENMP
|
|
1022
|
+
if (nthreads > 1)
|
|
1023
|
+
{
|
|
1024
|
+
worker_memory[omp_get_thread_num()].impute_vec = impute_vec;
|
|
1025
|
+
worker_memory[omp_get_thread_num()].impute_map = impute_map;
|
|
1026
|
+
}
|
|
1027
|
+
|
|
1028
|
+
else
|
|
1029
|
+
#endif
|
|
1030
|
+
{
|
|
1031
|
+
worker_memory[0].impute_vec = std::move(impute_vec);
|
|
1032
|
+
worker_memory[0].impute_map = std::move(impute_map);
|
|
1033
|
+
}
|
|
1034
|
+
}
|
|
1035
|
+
|
|
1036
|
+
fit_itree<decltype(input_data), typename std::remove_pointer<decltype(worker_memory.data())>::type, ldouble_safe>(
|
|
1037
|
+
(model_outputs != NULL)? &model_outputs->trees[tree] : NULL,
|
|
1038
|
+
(model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : NULL,
|
|
1039
|
+
worker_memory[omp_get_thread_num()],
|
|
1040
|
+
input_data,
|
|
1041
|
+
model_params,
|
|
1042
|
+
(imputer != NULL)? &(imputer->imputer_tree[tree]) : NULL,
|
|
1043
|
+
tree);
|
|
1044
|
+
|
|
1045
|
+
if ((model_outputs != NULL))
|
|
1046
|
+
model_outputs->trees[tree].shrink_to_fit();
|
|
1047
|
+
else
|
|
1048
|
+
model_outputs_ext->hplanes[tree].shrink_to_fit();
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
catch (...)
|
|
1052
|
+
{
|
|
1053
|
+
#pragma omp critical
|
|
1054
|
+
{
|
|
1055
|
+
if (!threw_exception)
|
|
1056
|
+
{
|
|
1057
|
+
threw_exception = true;
|
|
1058
|
+
ex = std::current_exception();
|
|
1059
|
+
}
|
|
1060
|
+
}
|
|
1061
|
+
}
|
|
1062
|
+
}
|
|
1063
|
+
|
|
1064
|
+
/* check if the procedure got interrupted */
|
|
1065
|
+
check_interrupt_switch(ss);
|
|
1066
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
|
1067
|
+
if (interrupt_switch) return EXIT_FAILURE;
|
|
1068
|
+
#endif
|
|
1069
|
+
|
|
1070
|
+
/* check if some exception was thrown */
|
|
1071
|
+
if (threw_exception)
|
|
1072
|
+
std::rethrow_exception(ex);
|
|
1073
|
+
|
|
1074
|
+
if ((model_outputs != NULL))
|
|
1075
|
+
model_outputs->trees.shrink_to_fit();
|
|
1076
|
+
else
|
|
1077
|
+
model_outputs_ext->hplanes.shrink_to_fit();
|
|
1078
|
+
|
|
1079
|
+
/* if calculating similarity/distance, now need to reduce and average */
|
|
1080
|
+
if (calc_dist)
|
|
1081
|
+
gather_sim_result< PredictionData<real_t, sparse_ix>, InputData<real_t, sparse_ix> >
|
|
1082
|
+
(NULL, &worker_memory,
|
|
1083
|
+
NULL, &input_data,
|
|
1084
|
+
model_outputs, model_outputs_ext,
|
|
1085
|
+
tmat, NULL, 0,
|
|
1086
|
+
model_params.ntrees, false,
|
|
1087
|
+
standardize_dist, false, nthreads);
|
|
1088
|
+
|
|
1089
|
+
check_interrupt_switch(ss);
|
|
1090
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
|
1091
|
+
if (interrupt_switch) return EXIT_FAILURE;
|
|
1092
|
+
#endif
|
|
1093
|
+
|
|
1094
|
+
/* same for depths */
|
|
1095
|
+
if (output_depths != NULL)
|
|
1096
|
+
{
|
|
1097
|
+
#ifdef _OPENMP
|
|
1098
|
+
if (nthreads > 1)
|
|
1099
|
+
{
|
|
1100
|
+
for (auto &w : worker_memory)
|
|
1101
|
+
{
|
|
1102
|
+
if (w.row_depths.size())
|
|
1103
|
+
{
|
|
1104
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data, output_depths, w, worker_memory)
|
|
1105
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
|
1106
|
+
output_depths[row] += w.row_depths[row];
|
|
1107
|
+
}
|
|
1108
|
+
}
|
|
1109
|
+
}
|
|
1110
|
+
else
|
|
1111
|
+
#endif
|
|
1112
|
+
{
|
|
1113
|
+
std::copy(worker_memory[0].row_depths.begin(), worker_memory[0].row_depths.end(), output_depths);
|
|
1114
|
+
}
|
|
1115
|
+
|
|
1116
|
+
if (standardize_depth)
|
|
1117
|
+
{
|
|
1118
|
+
double depth_divisor = (double)ntrees * ((model_outputs != NULL)?
|
|
1119
|
+
model_outputs->exp_avg_depth : model_outputs_ext->exp_avg_depth);
|
|
1120
|
+
for (size_t row = 0; row < nrows; row++)
|
|
1121
|
+
output_depths[row] = std::exp2( - output_depths[row] / depth_divisor );
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
else
|
|
1125
|
+
{
|
|
1126
|
+
double ntrees_dbl = (double) ntrees;
|
|
1127
|
+
for (size_t row = 0; row < nrows; row++)
|
|
1128
|
+
output_depths[row] /= ntrees_dbl;
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1131
|
+
|
|
1132
|
+
check_interrupt_switch(ss);
|
|
1133
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
|
1134
|
+
if (interrupt_switch) return EXIT_FAILURE;
|
|
1135
|
+
#endif
|
|
1136
|
+
|
|
1137
|
+
/* if imputing missing values, now need to reduce and write final values */
|
|
1138
|
+
if (model_params.impute_at_fit)
|
|
1139
|
+
{
|
|
1140
|
+
#ifdef _OPENMP
|
|
1141
|
+
if (nthreads > 1)
|
|
1142
|
+
{
|
|
1143
|
+
for (auto &w : worker_memory)
|
|
1144
|
+
combine_tree_imputations(w, impute_vec, impute_map, input_data.has_missing, nthreads);
|
|
1145
|
+
}
|
|
1146
|
+
|
|
1147
|
+
else
|
|
1148
|
+
#endif
|
|
1149
|
+
{
|
|
1150
|
+
impute_vec = std::move(worker_memory[0].impute_vec);
|
|
1151
|
+
impute_map = std::move(worker_memory[0].impute_map);
|
|
1152
|
+
}
|
|
1153
|
+
|
|
1154
|
+
apply_imputation_results(impute_vec, impute_map, *imputer, input_data, nthreads);
|
|
1155
|
+
}
|
|
1156
|
+
|
|
1157
|
+
check_interrupt_switch(ss);
|
|
1158
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
|
1159
|
+
if (interrupt_switch) return EXIT_FAILURE;
|
|
1160
|
+
#endif
|
|
1161
|
+
|
|
1162
|
+
return EXIT_SUCCESS;
|
|
1163
|
+
}
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
/* Add additional trees to already-fitted isolation forest model
|
|
1167
|
+
*
|
|
1168
|
+
* Parameters
|
|
1169
|
+
* ==========
|
|
1170
|
+
* - model_outputs
|
|
1171
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
1172
|
+
* if the trees are to be added to an extended model. Can only pass one of
|
|
1173
|
+
* 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
|
|
1174
|
+
* so it cannot be run in parallel for the same model object.
|
|
1175
|
+
* - model_outputs_ext
|
|
1176
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
1177
|
+
* if the trees are to be added to a single-variable model. Can only pass one of
|
|
1178
|
+
* 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
|
|
1179
|
+
* so it cannot be run in parallel for the same model object.
|
|
1180
|
+
* - numeric_data[nrows * ncols_numeric]
|
|
1181
|
+
* Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
|
|
1182
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
|
1183
|
+
* Pass NULL if there are no dense numeric columns.
|
|
1184
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
1185
|
+
* If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
|
|
1186
|
+
* of columns, either as dense or as sparse arrays.
|
|
1187
|
+
* - ncols_numeric
|
|
1188
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
1189
|
+
* what was originally passed to 'fit_iforest'.
|
|
1190
|
+
* - categ_data[nrows * ncols_categ]
|
|
1191
|
+
* Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
|
|
1192
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
|
1193
|
+
* Pass NULL if there are no categorical columns. The encoding must be the same as was used
|
|
1194
|
+
* in the data to which the model was fit.
|
|
1195
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
1196
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
|
1197
|
+
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
|
1198
|
+
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
|
1199
|
+
* must be the same as was used in the data to which the model was fit.
|
|
1200
|
+
* If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
|
|
1201
|
+
* of columns and the same category encoding.
|
|
1202
|
+
* - ncols_categ
|
|
1203
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
1204
|
+
* what was originally passed to 'fit_iforest'.
|
|
1205
|
+
* - ncat[ncols_categ]
|
|
1206
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). May contain new categories,
|
|
1207
|
+
* but should keep the same encodings that were used for previous categories.
|
|
1208
|
+
* - Xc[nnz]
|
|
1209
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
|
1210
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
1211
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
1212
|
+
* - Xc_ind[nnz]
|
|
1213
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
|
1214
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
1215
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
1216
|
+
* - Xc_indptr[ncols_numeric + 1]
|
|
1217
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
1218
|
+
* start and at entry [col + 1] where does column 'col' end.
|
|
1219
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
1220
|
+
* - ndim
|
|
1221
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
1222
|
+
* what was originally passed to 'fit_iforest'.
|
|
1223
|
+
* - ntry
|
|
1224
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1225
|
+
* what was originally passed to 'fit_iforest'.
|
|
1226
|
+
* - coef_type
|
|
1227
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1228
|
+
* what was originally passed to 'fit_iforest'.
|
|
1229
|
+
* - sample_weights
|
|
1230
|
+
* Weights for the rows when adding this tree, either as sampling importances when using
|
|
1231
|
+
* sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
|
|
1232
|
+
* in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
|
|
1233
|
+
* the row appeared twice, thus it's less of an outlier) - how this is taken is determined
|
|
1234
|
+
* through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
|
|
1235
|
+
* Pass NULL if the rows all have uniform weights.
|
|
1236
|
+
* - nrows
|
|
1237
|
+
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
|
1238
|
+
* - max_depth
|
|
1239
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1240
|
+
* what was originally passed to 'fit_iforest'.
|
|
1241
|
+
* - ncols_per_tree
|
|
1242
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1243
|
+
* what was originally passed to 'fit_iforest'.
|
|
1244
|
+
* - limit_depth
|
|
1245
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1246
|
+
* what was originally passed to 'fit_iforest'.
|
|
1247
|
+
* - penalize_range
|
|
1248
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1249
|
+
* what was originally passed to 'fit_iforest'.
|
|
1250
|
+
* - standardize_data
|
|
1251
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1252
|
+
* what was originally passed to 'fit_iforest'.
|
|
1253
|
+
* - fast_bratio
|
|
1254
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1255
|
+
* what was originally passed to 'fit_iforest'.
|
|
1256
|
+
* - col_weights
|
|
1257
|
+
* Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
|
|
1258
|
+
* Ignored when picking columns by deterministic criterion.
|
|
1259
|
+
* If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
|
|
1260
|
+
* effect is multiplicative.
|
|
1261
|
+
* - weigh_by_kurt
|
|
1262
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1263
|
+
* what was originally passed to 'fit_iforest'.
|
|
1264
|
+
* - prob_pick_by_gain_pl
|
|
1265
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1266
|
+
* what was originally passed to 'fit_iforest'.
|
|
1267
|
+
* - prob_pick_by_gain_avg
|
|
1268
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1269
|
+
* what was originally passed to 'fit_iforest'.
|
|
1270
|
+
* - prob_pick_by_full_gain
|
|
1271
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1272
|
+
* what was originally passed to 'fit_iforest'.
|
|
1273
|
+
* - prob_pick_by_dens
|
|
1274
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1275
|
+
* what was originally passed to 'fit_iforest'.
|
|
1276
|
+
* - prob_pick_col_by_range
|
|
1277
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1278
|
+
* what was originally passed to 'fit_iforest'.
|
|
1279
|
+
* - prob_pick_col_by_var
|
|
1280
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1281
|
+
* what was originally passed to 'fit_iforest'.
|
|
1282
|
+
* - prob_pick_col_by_kurt
|
|
1283
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1284
|
+
* what was originally passed to 'fit_iforest'.
|
|
1285
|
+
* - min_gain
|
|
1286
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1287
|
+
* what was originally passed to 'fit_iforest'.
|
|
1288
|
+
* - missing_action
|
|
1289
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
1290
|
+
* what was originally passed to 'fit_iforest'.
|
|
1291
|
+
* - cat_split_type
|
|
1292
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
1293
|
+
* what was originally passed to 'fit_iforest'.
|
|
1294
|
+
* - new_cat_action
|
|
1295
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
1296
|
+
* what was originally passed to 'fit_iforest'.
|
|
1297
|
+
* - depth_imp
|
|
1298
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
1299
|
+
* what was originally passed to 'fit_iforest'.
|
|
1300
|
+
* - weigh_imp_rows
|
|
1301
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
1302
|
+
* what was originally passed to 'fit_iforest'.
|
|
1303
|
+
* - all_perm
|
|
1304
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1305
|
+
* what was originally passed to 'fit_iforest'.
|
|
1306
|
+
* - coef_by_prop
|
|
1307
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1308
|
+
* what was originally passed to 'fit_iforest'.
|
|
1309
|
+
* - imputer
|
|
1310
|
+
* Pointer to already-allocated imputer object, as it was output from function 'fit_model' while
|
|
1311
|
+
* producing either 'model_outputs' or 'model_outputs_ext'.
|
|
1312
|
+
* Pass NULL if the model was built without imputer.
|
|
1313
|
+
* - min_imp_obs
|
|
1314
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1315
|
+
* what was originally passed to 'fit_iforest'.
|
|
1316
|
+
* - indexer
|
|
1317
|
+
* Indexer object associated to the model object ('model_outputs' or 'model_outputs_ext'), which will
|
|
1318
|
+
* be updated with the new tree to add.
|
|
1319
|
+
* If 'indexer' has reference points, these must be passed again here in order to index them.
|
|
1320
|
+
* Pass NULL if the model has no associated indexer.
|
|
1321
|
+
* - ref_numeric_data[nref * ncols_numeric]
|
|
1322
|
+
* Pointer to numeric data for reference points. May be ordered by rows
|
|
1323
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
1324
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
1325
|
+
* (see parameter 'ref_is_col_major').
|
|
1326
|
+
* Pass NULL if there are no dense numeric columns or no reference points.
|
|
1327
|
+
* Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
|
|
1328
|
+
* If 'indexer' is passed, it has reference points, and the data to which the model was fit had
|
|
1329
|
+
* numeric columns, then numeric data for reference points must be passed (in either dense or sparse format).
|
|
1330
|
+
* - ref_categ_data[nref * ncols_categ]
|
|
1331
|
+
* Pointer to categorical data for reference points. May be ordered by rows
|
|
1332
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
1333
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
1334
|
+
* (see parameter 'ref_is_col_major').
|
|
1335
|
+
* Pass NULL if there are no categorical columns or no reference points.
|
|
1336
|
+
* If 'indexer' is passed, it has reference points, and the data to which the model was fit had
|
|
1337
|
+
* categorical columns, then 'ref_categ_data' must be passed.
|
|
1338
|
+
* - ref_is_col_major
|
|
1339
|
+
* Whether 'ref_numeric_data' and/or 'ref_categ_data' are in column-major order. If numeric data is
|
|
1340
|
+
* passed in sparse format, categorical data must be passed in column-major format. If passing dense
|
|
1341
|
+
* data, row-major format is preferred as it will be faster. If the data is passed in row-major format,
|
|
1342
|
+
* must also pass 'ref_ld_numeric' and/or 'ref_ld_categ'.
|
|
1343
|
+
* If both 'ref_numeric_data' and 'ref_categ_data' are passed, they must have the same orientation
|
|
1344
|
+
* (row-major or column-major).
|
|
1345
|
+
* - ref_ld_numeric
|
|
1346
|
+
* Leading dimension of the array 'ref_numeric_data', if it is passed in row-major format.
|
|
1347
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
1348
|
+
* be accessed assuming that row 'n' starts at 'ref_numeric_data + n*ref_ld_numeric'). If passing
|
|
1349
|
+
* 'ref_numeric_data' in column-major order, this is ignored and it will be assumed that the
|
|
1350
|
+
* leading dimension corresponds to the number of rows. This is ignored when passing numeric
|
|
1351
|
+
* data in sparse format.
|
|
1352
|
+
* - ref_ld_categ
|
|
1353
|
+
* Leading dimension of the array 'ref_categ_data', if it is passed in row-major format.
|
|
1354
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
1355
|
+
* be accessed assuming that row 'n' starts at 'ref_categ_data + n*ref_ld_categ'). If passing
|
|
1356
|
+
* 'ref_categ_data' in column-major order, this is ignored and it will be assumed that the
|
|
1357
|
+
* leading dimension corresponds to the number of rows.
|
|
1358
|
+
* - ref_Xc[ref_nnz]
|
|
1359
|
+
* Pointer to numeric data for reference points in sparse numeric matrix in CSC format (column-compressed).
|
|
1360
|
+
* Pass NULL if there are no sparse numeric columns for reference points or no reference points.
|
|
1361
|
+
* Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
|
|
1362
|
+
* - ref_Xc_ind[ref_nnz]
|
|
1363
|
+
* Pointer to row indices to which each non-zero entry in 'ref_Xc' corresponds.
|
|
1364
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
1365
|
+
* Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
|
|
1366
|
+
* - ref_Xc_indptr[ncols_numeric + 1]
|
|
1367
|
+
* Pointer to column index pointers that tell at entry [col] where column 'col'
|
|
1368
|
+
* starts and at entry [col + 1] where column 'col' ends.
|
|
1369
|
+
* Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
|
|
1370
|
+
* - random_seed
|
|
1371
|
+
* Seed that will be used to generate random numbers used by the model.
|
|
1372
|
+
* - use_long_double
|
|
1373
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
1374
|
+
* what was originally passed to 'fit_iforest'.
|
|
1375
|
+
*/
|
|
1376
|
+
template <class real_t, class sparse_ix>
|
|
1377
|
+
int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1378
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
1379
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
1380
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
1381
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
1382
|
+
real_t sample_weights[], size_t nrows,
|
|
1383
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
1384
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
1385
|
+
bool fast_bratio,
|
|
1386
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
1387
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
1388
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
1389
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
1390
|
+
double prob_pick_col_by_kurt,
|
|
1391
|
+
double min_gain, MissingAction missing_action,
|
|
1392
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
1393
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
|
1394
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
1395
|
+
TreesIndexer *indexer,
|
|
1396
|
+
real_t ref_numeric_data[], int ref_categ_data[],
|
|
1397
|
+
bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
|
|
1398
|
+
real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
|
|
1399
|
+
uint64_t random_seed, bool use_long_double)
|
|
1400
|
+
{
|
|
1401
|
+
if (use_long_double && !has_long_double()) {
|
|
1402
|
+
use_long_double = false;
|
|
1403
|
+
fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
|
|
1404
|
+
}
|
|
1405
|
+
#ifndef NO_LONG_DOUBLE
|
|
1406
|
+
if (likely(!use_long_double))
|
|
1407
|
+
#endif
|
|
1408
|
+
return add_tree_internal<real_t, sparse_ix, double>(
|
|
1409
|
+
model_outputs, model_outputs_ext,
|
|
1410
|
+
numeric_data, ncols_numeric,
|
|
1411
|
+
categ_data, ncols_categ, ncat,
|
|
1412
|
+
Xc, Xc_ind, Xc_indptr,
|
|
1413
|
+
ndim, ntry, coef_type, coef_by_prop,
|
|
1414
|
+
sample_weights, nrows,
|
|
1415
|
+
max_depth, ncols_per_tree,
|
|
1416
|
+
limit_depth, penalize_range, standardize_data,
|
|
1417
|
+
fast_bratio,
|
|
1418
|
+
col_weights, weigh_by_kurt,
|
|
1419
|
+
prob_pick_by_gain_pl, prob_pick_by_gain_avg,
|
|
1420
|
+
prob_pick_by_full_gain, prob_pick_by_dens,
|
|
1421
|
+
prob_pick_col_by_range, prob_pick_col_by_var,
|
|
1422
|
+
prob_pick_col_by_kurt,
|
|
1423
|
+
min_gain, missing_action,
|
|
1424
|
+
cat_split_type, new_cat_action,
|
|
1425
|
+
depth_imp, weigh_imp_rows,
|
|
1426
|
+
all_perm, imputer, min_imp_obs,
|
|
1427
|
+
indexer,
|
|
1428
|
+
ref_numeric_data, ref_categ_data,
|
|
1429
|
+
ref_is_col_major, ref_ld_numeric, ref_ld_categ,
|
|
1430
|
+
ref_Xc, ref_Xc_ind, ref_Xc_indptr,
|
|
1431
|
+
random_seed
|
|
1432
|
+
);
|
|
1433
|
+
#ifndef NO_LONG_DOUBLE
|
|
1434
|
+
else
|
|
1435
|
+
return add_tree_internal<real_t, sparse_ix, long double>(
|
|
1436
|
+
model_outputs, model_outputs_ext,
|
|
1437
|
+
numeric_data, ncols_numeric,
|
|
1438
|
+
categ_data, ncols_categ, ncat,
|
|
1439
|
+
Xc, Xc_ind, Xc_indptr,
|
|
1440
|
+
ndim, ntry, coef_type, coef_by_prop,
|
|
1441
|
+
sample_weights, nrows,
|
|
1442
|
+
max_depth, ncols_per_tree,
|
|
1443
|
+
limit_depth, penalize_range, standardize_data,
|
|
1444
|
+
fast_bratio,
|
|
1445
|
+
col_weights, weigh_by_kurt,
|
|
1446
|
+
prob_pick_by_gain_pl, prob_pick_by_gain_avg,
|
|
1447
|
+
prob_pick_by_full_gain, prob_pick_by_dens,
|
|
1448
|
+
prob_pick_col_by_range, prob_pick_col_by_var,
|
|
1449
|
+
prob_pick_col_by_kurt,
|
|
1450
|
+
min_gain, missing_action,
|
|
1451
|
+
cat_split_type, new_cat_action,
|
|
1452
|
+
depth_imp, weigh_imp_rows,
|
|
1453
|
+
all_perm, imputer, min_imp_obs,
|
|
1454
|
+
indexer,
|
|
1455
|
+
ref_numeric_data, ref_categ_data,
|
|
1456
|
+
ref_is_col_major, ref_ld_numeric, ref_ld_categ,
|
|
1457
|
+
ref_Xc, ref_Xc_ind, ref_Xc_indptr,
|
|
1458
|
+
random_seed
|
|
1459
|
+
);
|
|
1460
|
+
#endif
|
|
1461
|
+
}
|
|
1462
|
+
|
|
1463
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
1464
|
+
int add_tree_internal(
|
|
1465
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1466
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
1467
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
1468
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
1469
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
1470
|
+
real_t sample_weights[], size_t nrows,
|
|
1471
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
1472
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
1473
|
+
bool fast_bratio,
|
|
1474
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
1475
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
1476
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
1477
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
1478
|
+
double prob_pick_col_by_kurt,
|
|
1479
|
+
double min_gain, MissingAction missing_action,
|
|
1480
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
1481
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
|
1482
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
1483
|
+
TreesIndexer *indexer,
|
|
1484
|
+
real_t ref_numeric_data[], int ref_categ_data[],
|
|
1485
|
+
bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
|
|
1486
|
+
real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
|
|
1487
|
+
uint64_t random_seed)
|
|
1488
|
+
{
|
|
1489
|
+
if (
|
|
1490
|
+
prob_pick_by_gain_avg < 0 || prob_pick_by_gain_pl < 0 ||
|
|
1491
|
+
prob_pick_by_full_gain < 0 || prob_pick_by_dens < 0 ||
|
|
1492
|
+
prob_pick_col_by_range < 0 ||
|
|
1493
|
+
prob_pick_col_by_var < 0 || prob_pick_col_by_kurt < 0
|
|
1494
|
+
) {
|
|
1495
|
+
throw std::runtime_error("Cannot pass negative probabilities.\n");
|
|
1496
|
+
}
|
|
1497
|
+
if (prob_pick_col_by_range && ncols_categ)
|
|
1498
|
+
throw std::runtime_error("'prob_pick_col_by_range' is not compatible with categorical data.\n");
|
|
1499
|
+
if (prob_pick_by_full_gain && ncols_categ)
|
|
1500
|
+
throw std::runtime_error("'prob_pick_by_full_gain' is not compatible with categorical data.\n");
|
|
1501
|
+
if (prob_pick_col_by_kurt && weigh_by_kurt)
|
|
1502
|
+
throw std::runtime_error("'weigh_by_kurt' and 'prob_pick_col_by_kurt' cannot be used together.\n");
|
|
1503
|
+
if (ndim == 0 && model_outputs == NULL)
|
|
1504
|
+
throw std::runtime_error("Must pass 'ndim>0' in the extended model.\n");
|
|
1505
|
+
if (indexer != NULL && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty()) {
|
|
1506
|
+
if (ref_numeric_data == NULL && ref_categ_data == NULL && ref_Xc_indptr == NULL)
|
|
1507
|
+
throw std::runtime_error("'indexer' has reference points. Those points must be passed to index them in the new tree to add.\n");
|
|
1508
|
+
}
|
|
1509
|
+
|
|
1510
|
+
std::vector<ImputeNode> *impute_nodes = NULL;
|
|
1511
|
+
|
|
1512
|
+
int max_categ = 0;
|
|
1513
|
+
for (size_t col = 0; col < ncols_categ; col++)
|
|
1514
|
+
max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
|
|
1515
|
+
|
|
1516
|
+
if (model_outputs != NULL)
|
|
1517
|
+
ntry = std::min(ntry, ncols_numeric + ncols_categ);
|
|
1518
|
+
|
|
1519
|
+
if (ncols_per_tree == 0)
|
|
1520
|
+
ncols_per_tree = ncols_numeric + ncols_categ;
|
|
1521
|
+
|
|
1522
|
+
if (indexer != NULL && indexer->indices.empty())
|
|
1523
|
+
indexer = NULL;
|
|
1524
|
+
|
|
1525
|
+
InputData<real_t, sparse_ix>
|
|
1526
|
+
input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
|
|
1527
|
+
nrows, ncols_numeric + ncols_categ, sample_weights,
|
|
1528
|
+
false, col_weights,
|
|
1529
|
+
Xc, Xc_ind, Xc_indptr,
|
|
1530
|
+
0, 0, std::vector<double>(),
|
|
1531
|
+
std::vector<char>(), 0, NULL,
|
|
1532
|
+
(double*)NULL, (double*)NULL, (int*)NULL, std::vector<double>(),
|
|
1533
|
+
std::vector<double>(), std::vector<double>(),
|
|
1534
|
+
std::vector<size_t>(), std::vector<size_t>()};
|
|
1535
|
+
ModelParams model_params = {false, nrows, (size_t)1, ncols_per_tree,
|
|
1536
|
+
max_depth? max_depth : (nrows - 1),
|
|
1537
|
+
penalize_range, standardize_data, random_seed, weigh_by_kurt,
|
|
1538
|
+
prob_pick_by_gain_avg, prob_pick_by_gain_pl,
|
|
1539
|
+
prob_pick_by_full_gain, prob_pick_by_dens,
|
|
1540
|
+
prob_pick_col_by_range, prob_pick_col_by_var,
|
|
1541
|
+
prob_pick_col_by_kurt,
|
|
1542
|
+
min_gain, cat_split_type, new_cat_action, missing_action,
|
|
1543
|
+
(model_outputs != NULL)? model_outputs->scoring_metric : model_outputs_ext->scoring_metric,
|
|
1544
|
+
fast_bratio, all_perm,
|
|
1545
|
+
(model_outputs != NULL)? 0 : ndim, ntry,
|
|
1546
|
+
coef_type, coef_by_prop, false, false, false, depth_imp, weigh_imp_rows, min_imp_obs};
|
|
1547
|
+
|
|
1548
|
+
if (prob_pick_by_full_gain)
|
|
1549
|
+
{
|
|
1550
|
+
if (input_data.Xc_indptr == NULL)
|
|
1551
|
+
colmajor_to_rowmajor(input_data.numeric_data, input_data.nrows, input_data.ncols_numeric, input_data.X_row_major);
|
|
1552
|
+
else
|
|
1553
|
+
colmajor_to_rowmajor(input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
1554
|
+
input_data.nrows, input_data.ncols_numeric,
|
|
1555
|
+
input_data.Xr, input_data.Xr_ind, input_data.Xr_indptr);
|
|
1556
|
+
}
|
|
1557
|
+
|
|
1558
|
+
std::unique_ptr<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> workspace(
|
|
1559
|
+
new WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>()
|
|
1560
|
+
);
|
|
1561
|
+
|
|
1562
|
+
size_t last_tree;
|
|
1563
|
+
bool added_tree = false;
|
|
1564
|
+
try
|
|
1565
|
+
{
|
|
1566
|
+
if (model_outputs != NULL)
|
|
1567
|
+
{
|
|
1568
|
+
last_tree = model_outputs->trees.size();
|
|
1569
|
+
model_outputs->trees.emplace_back();
|
|
1570
|
+
added_tree = true;
|
|
1571
|
+
}
|
|
1572
|
+
|
|
1573
|
+
else
|
|
1574
|
+
{
|
|
1575
|
+
last_tree = model_outputs_ext->hplanes.size();
|
|
1576
|
+
model_outputs_ext->hplanes.emplace_back();
|
|
1577
|
+
added_tree = true;
|
|
1578
|
+
}
|
|
1579
|
+
|
|
1580
|
+
if (imputer != NULL)
|
|
1581
|
+
{
|
|
1582
|
+
imputer->imputer_tree.emplace_back();
|
|
1583
|
+
impute_nodes = &(imputer->imputer_tree.back());
|
|
1584
|
+
}
|
|
1585
|
+
|
|
1586
|
+
if (indexer != NULL)
|
|
1587
|
+
{
|
|
1588
|
+
indexer->indices.emplace_back();
|
|
1589
|
+
}
|
|
1590
|
+
|
|
1591
|
+
SignalSwitcher ss = SignalSwitcher();
|
|
1592
|
+
check_interrupt_switch(ss);
|
|
1593
|
+
|
|
1594
|
+
fit_itree<decltype(input_data), typename std::remove_pointer<decltype(workspace.get())>::type, ldouble_safe>(
|
|
1595
|
+
(model_outputs != NULL)? &model_outputs->trees.back() : NULL,
|
|
1596
|
+
(model_outputs_ext != NULL)? &model_outputs_ext->hplanes.back() : NULL,
|
|
1597
|
+
*workspace,
|
|
1598
|
+
input_data,
|
|
1599
|
+
model_params,
|
|
1600
|
+
impute_nodes,
|
|
1601
|
+
last_tree);
|
|
1602
|
+
|
|
1603
|
+
check_interrupt_switch(ss);
|
|
1604
|
+
|
|
1605
|
+
if (model_outputs != NULL) {
|
|
1606
|
+
model_outputs->trees.back().shrink_to_fit();
|
|
1607
|
+
model_outputs->has_range_penalty = model_outputs->has_range_penalty || penalize_range;
|
|
1608
|
+
}
|
|
1609
|
+
else {
|
|
1610
|
+
model_outputs_ext->hplanes.back().shrink_to_fit();
|
|
1611
|
+
model_outputs_ext->has_range_penalty = model_outputs_ext->has_range_penalty || penalize_range;
|
|
1612
|
+
}
|
|
1613
|
+
|
|
1614
|
+
if (imputer != NULL)
|
|
1615
|
+
imputer->imputer_tree.back().shrink_to_fit();
|
|
1616
|
+
|
|
1617
|
+
if (indexer != NULL)
|
|
1618
|
+
{
|
|
1619
|
+
if (model_outputs != NULL)
|
|
1620
|
+
build_terminal_node_mappings_single_tree(indexer->indices.back().terminal_node_mappings,
|
|
1621
|
+
indexer->indices.back().n_terminal,
|
|
1622
|
+
model_outputs->trees.back());
|
|
1623
|
+
else
|
|
1624
|
+
build_terminal_node_mappings_single_tree(indexer->indices.back().terminal_node_mappings,
|
|
1625
|
+
indexer->indices.back().n_terminal,
|
|
1626
|
+
model_outputs_ext->hplanes.back());
|
|
1627
|
+
|
|
1628
|
+
check_interrupt_switch(ss);
|
|
1629
|
+
|
|
1630
|
+
|
|
1631
|
+
if (!indexer->indices.front().node_distances.empty())
|
|
1632
|
+
{
|
|
1633
|
+
std::vector<size_t> temp;
|
|
1634
|
+
temp.reserve(indexer->indices.back().n_terminal);
|
|
1635
|
+
if (model_outputs != NULL) {
|
|
1636
|
+
build_dindex(
|
|
1637
|
+
temp,
|
|
1638
|
+
indexer->indices.back().terminal_node_mappings,
|
|
1639
|
+
indexer->indices.back().node_distances,
|
|
1640
|
+
indexer->indices.back().node_depths,
|
|
1641
|
+
indexer->indices.back().n_terminal,
|
|
1642
|
+
model_outputs->trees.back()
|
|
1643
|
+
);
|
|
1644
|
+
}
|
|
1645
|
+
else {
|
|
1646
|
+
build_dindex(
|
|
1647
|
+
temp,
|
|
1648
|
+
indexer->indices.back().terminal_node_mappings,
|
|
1649
|
+
indexer->indices.back().node_distances,
|
|
1650
|
+
indexer->indices.back().node_depths,
|
|
1651
|
+
indexer->indices.back().n_terminal,
|
|
1652
|
+
model_outputs_ext->hplanes.back()
|
|
1653
|
+
);
|
|
1654
|
+
}
|
|
1655
|
+
}
|
|
1656
|
+
|
|
1657
|
+
check_interrupt_switch(ss);
|
|
1658
|
+
if (!indexer->indices.front().reference_points.empty())
|
|
1659
|
+
{
|
|
1660
|
+
size_t n_ref = indexer->indices.front().reference_points.size();
|
|
1661
|
+
std::vector<sparse_ix> terminal_indices(n_ref);
|
|
1662
|
+
std::unique_ptr<double[]> ignored(new double[n_ref]);
|
|
1663
|
+
if (model_outputs != NULL)
|
|
1664
|
+
{
|
|
1665
|
+
IsoForest single_tree_model;
|
|
1666
|
+
single_tree_model.new_cat_action = model_outputs->new_cat_action;
|
|
1667
|
+
single_tree_model.cat_split_type = model_outputs->cat_split_type;
|
|
1668
|
+
single_tree_model.missing_action = model_outputs->missing_action;
|
|
1669
|
+
single_tree_model.trees.push_back(model_outputs->trees.back());
|
|
1670
|
+
|
|
1671
|
+
predict_iforest(ref_numeric_data, ref_categ_data,
|
|
1672
|
+
ref_is_col_major, ref_ld_numeric, ref_ld_categ,
|
|
1673
|
+
ref_Xc, ref_Xc_ind, ref_Xc_indptr,
|
|
1674
|
+
(real_t*)NULL, (sparse_ix*)NULL, (sparse_ix*)NULL,
|
|
1675
|
+
n_ref, 1, false,
|
|
1676
|
+
&single_tree_model, (ExtIsoForest*)NULL,
|
|
1677
|
+
ignored.get(), terminal_indices.data(),
|
|
1678
|
+
(double*)NULL,
|
|
1679
|
+
indexer);
|
|
1680
|
+
}
|
|
1681
|
+
|
|
1682
|
+
else
|
|
1683
|
+
{
|
|
1684
|
+
ExtIsoForest single_tree_model;
|
|
1685
|
+
single_tree_model.new_cat_action = model_outputs_ext->new_cat_action;
|
|
1686
|
+
single_tree_model.cat_split_type = model_outputs_ext->cat_split_type;
|
|
1687
|
+
single_tree_model.missing_action = model_outputs_ext->missing_action;
|
|
1688
|
+
single_tree_model.hplanes.push_back(model_outputs_ext->hplanes.back());
|
|
1689
|
+
|
|
1690
|
+
predict_iforest(ref_numeric_data, ref_categ_data,
|
|
1691
|
+
ref_is_col_major, ref_ld_numeric, ref_ld_categ,
|
|
1692
|
+
ref_Xc, ref_Xc_ind, ref_Xc_indptr,
|
|
1693
|
+
(real_t*)NULL, (sparse_ix*)NULL, (sparse_ix*)NULL,
|
|
1694
|
+
n_ref, 1, false,
|
|
1695
|
+
(IsoForest*)NULL, &single_tree_model,
|
|
1696
|
+
ignored.get(), terminal_indices.data(),
|
|
1697
|
+
(double*)NULL,
|
|
1698
|
+
indexer);
|
|
1699
|
+
}
|
|
1700
|
+
|
|
1701
|
+
ignored.reset();
|
|
1702
|
+
indexer->indices.back().reference_points.assign(terminal_indices.begin(), terminal_indices.end());
|
|
1703
|
+
indexer->indices.back().reference_points.shrink_to_fit();
|
|
1704
|
+
build_ref_node(indexer->indices.back());
|
|
1705
|
+
}
|
|
1706
|
+
|
|
1707
|
+
check_interrupt_switch(ss);
|
|
1708
|
+
}
|
|
1709
|
+
}
|
|
1710
|
+
|
|
1711
|
+
catch (...)
|
|
1712
|
+
{
|
|
1713
|
+
if (added_tree)
|
|
1714
|
+
{
|
|
1715
|
+
if (model_outputs != NULL)
|
|
1716
|
+
model_outputs->trees.pop_back();
|
|
1717
|
+
else
|
|
1718
|
+
model_outputs_ext->hplanes.pop_back();
|
|
1719
|
+
if (imputer != NULL) {
|
|
1720
|
+
if (model_outputs != NULL)
|
|
1721
|
+
imputer->imputer_tree.resize(model_outputs->trees.size());
|
|
1722
|
+
else
|
|
1723
|
+
imputer->imputer_tree.resize(model_outputs_ext->hplanes.size());
|
|
1724
|
+
}
|
|
1725
|
+
if (indexer != NULL) {
|
|
1726
|
+
if (model_outputs != NULL)
|
|
1727
|
+
indexer->indices.resize(model_outputs->trees.size());
|
|
1728
|
+
else
|
|
1729
|
+
indexer->indices.resize(model_outputs_ext->hplanes.size());
|
|
1730
|
+
}
|
|
1731
|
+
}
|
|
1732
|
+
throw;
|
|
1733
|
+
}
|
|
1734
|
+
|
|
1735
|
+
return EXIT_SUCCESS;
|
|
1736
|
+
}
|
|
1737
|
+
|
|
1738
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
1739
|
+
void fit_itree(std::vector<IsoTree> *tree_root,
|
|
1740
|
+
std::vector<IsoHPlane> *hplane_root,
|
|
1741
|
+
WorkerMemory &workspace,
|
|
1742
|
+
InputData &input_data,
|
|
1743
|
+
ModelParams &model_params,
|
|
1744
|
+
std::vector<ImputeNode> *impute_nodes,
|
|
1745
|
+
size_t tree_num)
|
|
1746
|
+
{
|
|
1747
|
+
/* initialize array for depths if called for */
|
|
1748
|
+
if (workspace.ix_arr.empty() && model_params.calc_depth)
|
|
1749
|
+
workspace.row_depths.resize(input_data.nrows, 0);
|
|
1750
|
+
|
|
1751
|
+
/* choose random sample of rows */
|
|
1752
|
+
if (workspace.ix_arr.empty()) workspace.ix_arr.resize(model_params.sample_size);
|
|
1753
|
+
if (input_data.log2_n > 0)
|
|
1754
|
+
workspace.btree_weights.assign(input_data.btree_weights_init.begin(),
|
|
1755
|
+
input_data.btree_weights_init.end());
|
|
1756
|
+
workspace.rnd_generator.seed(model_params.random_seed + tree_num);
|
|
1757
|
+
workspace.rbin = UniformUnitInterval(0, 1);
|
|
1758
|
+
sample_random_rows<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
|
|
1759
|
+
workspace.ix_arr, input_data.nrows, model_params.with_replacement,
|
|
1760
|
+
workspace.rnd_generator, workspace.ix_all,
|
|
1761
|
+
(input_data.weight_as_sample)? input_data.sample_weights : NULL,
|
|
1762
|
+
workspace.btree_weights, input_data.log2_n, input_data.btree_offset,
|
|
1763
|
+
workspace.is_repeated);
|
|
1764
|
+
workspace.st = 0;
|
|
1765
|
+
workspace.end = model_params.sample_size - 1;
|
|
1766
|
+
|
|
1767
|
+
/* in some cases, it's not possible to use column weights even if they are given,
|
|
1768
|
+
because every single column will always need to be checked or end up being used. */
|
|
1769
|
+
bool avoid_col_weights = (tree_root != NULL && model_params.ntry >= model_params.ncols_per_tree &&
|
|
1770
|
+
model_params.prob_pick_by_gain_avg + model_params.prob_pick_by_gain_pl +
|
|
1771
|
+
model_params.prob_pick_by_full_gain + model_params.prob_pick_by_dens >= 1)
|
|
1772
|
+
||
|
|
1773
|
+
(tree_root == NULL && model_params.ndim >= model_params.ncols_per_tree)
|
|
1774
|
+
||
|
|
1775
|
+
(model_params.ncols_per_tree == 1);
|
|
1776
|
+
if (input_data.preinitialized_col_sampler == NULL)
|
|
1777
|
+
{
|
|
1778
|
+
if (input_data.col_weights != NULL && !avoid_col_weights && !model_params.weigh_by_kurt)
|
|
1779
|
+
workspace.col_sampler.initialize(input_data.col_weights, input_data.ncols_tot);
|
|
1780
|
+
}
|
|
1781
|
+
|
|
1782
|
+
|
|
1783
|
+
/* set expected tree size and add root node */
|
|
1784
|
+
{
|
|
1785
|
+
size_t exp_nodes = mult2(model_params.sample_size);
|
|
1786
|
+
if (model_params.sample_size >= div2(SIZE_MAX))
|
|
1787
|
+
exp_nodes = SIZE_MAX;
|
|
1788
|
+
else if (model_params.max_depth <= (size_t)30)
|
|
1789
|
+
exp_nodes = std::min(exp_nodes, pow2(model_params.max_depth));
|
|
1790
|
+
if (tree_root != NULL)
|
|
1791
|
+
{
|
|
1792
|
+
tree_root->reserve(exp_nodes);
|
|
1793
|
+
tree_root->emplace_back();
|
|
1794
|
+
}
|
|
1795
|
+
else
|
|
1796
|
+
{
|
|
1797
|
+
hplane_root->reserve(exp_nodes);
|
|
1798
|
+
hplane_root->emplace_back();
|
|
1799
|
+
}
|
|
1800
|
+
if (impute_nodes != NULL)
|
|
1801
|
+
{
|
|
1802
|
+
impute_nodes->reserve(exp_nodes);
|
|
1803
|
+
impute_nodes->emplace_back((size_t) 0);
|
|
1804
|
+
}
|
|
1805
|
+
}
|
|
1806
|
+
|
|
1807
|
+
/* initialize array with candidate categories if not already done */
|
|
1808
|
+
if (workspace.categs.empty())
|
|
1809
|
+
workspace.categs.resize(input_data.max_categ);
|
|
1810
|
+
|
|
1811
|
+
/* initialize array with per-node column weights if needed */
|
|
1812
|
+
if ((model_params.prob_pick_col_by_range ||
|
|
1813
|
+
model_params.prob_pick_col_by_var ||
|
|
1814
|
+
model_params.prob_pick_col_by_kurt) && workspace.node_col_weights.empty())
|
|
1815
|
+
{
|
|
1816
|
+
workspace.node_col_weights.resize(input_data.ncols_tot);
|
|
1817
|
+
if (tree_root != NULL || model_params.standardize_data || model_params.missing_action != Fail)
|
|
1818
|
+
{
|
|
1819
|
+
workspace.saved_stat1.resize(input_data.ncols_numeric);
|
|
1820
|
+
workspace.saved_stat2.resize(input_data.ncols_numeric);
|
|
1821
|
+
}
|
|
1822
|
+
}
|
|
1823
|
+
|
|
1824
|
+
/* IMPORTANT!!!!!
|
|
1825
|
+
The standard library implementation is likely going to use the Box-Muller method
|
|
1826
|
+
for normal sampling, which has some state memory in the **distribution object itself**
|
|
1827
|
+
in addition to the state memory from the RNG engine. DO NOT avoid re-generating this
|
|
1828
|
+
object on each tree, despite being inefficient, because then it can cause seed
|
|
1829
|
+
irreproducibility when the number of splitting dimensions is odd and the number
|
|
1830
|
+
of threads is more than 1. This is a very hard issue to debug since everything
|
|
1831
|
+
works fine depending on the order in which trees are assigned to threads.
|
|
1832
|
+
DO NOT PUT THESE LINES BELOW THE NEXT IF. */
|
|
1833
|
+
if (hplane_root != NULL)
|
|
1834
|
+
{
|
|
1835
|
+
if (input_data.ncols_categ || model_params.coef_type == Normal)
|
|
1836
|
+
workspace.coef_norm = StandardNormalDistr(0, 1);
|
|
1837
|
+
if (model_params.coef_type == Uniform)
|
|
1838
|
+
workspace.coef_unif = UniformMinusOneToOne(-1, 1);
|
|
1839
|
+
}
|
|
1840
|
+
|
|
1841
|
+
/* for the extended model, initialize extra vectors and objects */
|
|
1842
|
+
if (hplane_root != NULL && workspace.comb_val.empty())
|
|
1843
|
+
{
|
|
1844
|
+
workspace.comb_val.resize(model_params.sample_size);
|
|
1845
|
+
workspace.col_take.resize(model_params.ndim);
|
|
1846
|
+
workspace.col_take_type.resize(model_params.ndim);
|
|
1847
|
+
|
|
1848
|
+
if (input_data.ncols_numeric)
|
|
1849
|
+
{
|
|
1850
|
+
workspace.ext_offset.resize(input_data.ncols_tot);
|
|
1851
|
+
workspace.ext_coef.resize(input_data.ncols_tot);
|
|
1852
|
+
workspace.ext_mean.resize(input_data.ncols_tot);
|
|
1853
|
+
}
|
|
1854
|
+
|
|
1855
|
+
if (input_data.ncols_categ)
|
|
1856
|
+
{
|
|
1857
|
+
workspace.ext_fill_new.resize(input_data.max_categ);
|
|
1858
|
+
switch(model_params.cat_split_type)
|
|
1859
|
+
{
|
|
1860
|
+
case SingleCateg:
|
|
1861
|
+
{
|
|
1862
|
+
workspace.chosen_cat.resize(input_data.max_categ);
|
|
1863
|
+
break;
|
|
1864
|
+
}
|
|
1865
|
+
|
|
1866
|
+
case SubSet:
|
|
1867
|
+
{
|
|
1868
|
+
workspace.ext_cat_coef.resize(input_data.ncols_tot);
|
|
1869
|
+
for (std::vector<double> &v : workspace.ext_cat_coef)
|
|
1870
|
+
v.resize(input_data.max_categ);
|
|
1871
|
+
break;
|
|
1872
|
+
}
|
|
1873
|
+
}
|
|
1874
|
+
}
|
|
1875
|
+
|
|
1876
|
+
workspace.ext_fill_val.resize(input_data.ncols_tot);
|
|
1877
|
+
|
|
1878
|
+
}
|
|
1879
|
+
|
|
1880
|
+
/* If there are density weights, need to standardize them to sum up to
|
|
1881
|
+
the sample size here. Note that weights for missing values with 'Divide'
|
|
1882
|
+
are only initialized on-demand later on. */
|
|
1883
|
+
workspace.changed_weights = false;
|
|
1884
|
+
if (hplane_root == NULL) workspace.weights_map.clear();
|
|
1885
|
+
|
|
1886
|
+
ldouble_safe weight_scaling = 0;
|
|
1887
|
+
if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
|
|
1888
|
+
{
|
|
1889
|
+
workspace.changed_weights = true;
|
|
1890
|
+
|
|
1891
|
+
/* For the extended model, if there is no sub-sampling, these weights will remain
|
|
1892
|
+
constant throughout and do not need to be re-generated. */
|
|
1893
|
+
if (!( hplane_root != NULL &&
|
|
1894
|
+
(!workspace.weights_map.empty() || !workspace.weights_arr.empty()) &&
|
|
1895
|
+
model_params.sample_size == input_data.nrows && !model_params.with_replacement
|
|
1896
|
+
)
|
|
1897
|
+
)
|
|
1898
|
+
{
|
|
1899
|
+
workspace.weights_map.clear();
|
|
1900
|
+
|
|
1901
|
+
/* if the sub-sample size is small relative to the full sample size, use a mapping */
|
|
1902
|
+
if (input_data.Xc_indptr != NULL && model_params.sample_size < input_data.nrows / 50)
|
|
1903
|
+
{
|
|
1904
|
+
for (const size_t ix : workspace.ix_arr)
|
|
1905
|
+
weight_scaling += input_data.sample_weights[ix];
|
|
1906
|
+
weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
|
|
1907
|
+
workspace.weights_map.reserve(workspace.ix_arr.size());
|
|
1908
|
+
for (const size_t ix : workspace.ix_arr)
|
|
1909
|
+
workspace.weights_map[ix] = input_data.sample_weights[ix] * weight_scaling;
|
|
1910
|
+
}
|
|
1911
|
+
|
|
1912
|
+
/* if the sub-sample size is large, fill a full array matching to the sample size */
|
|
1913
|
+
else
|
|
1914
|
+
{
|
|
1915
|
+
if (workspace.weights_arr.empty())
|
|
1916
|
+
{
|
|
1917
|
+
workspace.weights_arr.assign(input_data.sample_weights, input_data.sample_weights + input_data.nrows);
|
|
1918
|
+
weight_scaling = std::accumulate(workspace.ix_arr.begin(),
|
|
1919
|
+
workspace.ix_arr.end(),
|
|
1920
|
+
(ldouble_safe)0,
|
|
1921
|
+
[&input_data](const ldouble_safe a, const size_t b){return a + (ldouble_safe)input_data.sample_weights[b];}
|
|
1922
|
+
);
|
|
1923
|
+
weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
|
|
1924
|
+
for (double &w : workspace.weights_arr)
|
|
1925
|
+
w *= weight_scaling;
|
|
1926
|
+
}
|
|
1927
|
+
|
|
1928
|
+
else
|
|
1929
|
+
{
|
|
1930
|
+
for (const size_t ix : workspace.ix_arr)
|
|
1931
|
+
{
|
|
1932
|
+
weight_scaling += input_data.sample_weights[ix];
|
|
1933
|
+
workspace.weights_arr[ix] = input_data.sample_weights[ix];
|
|
1934
|
+
}
|
|
1935
|
+
weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
|
|
1936
|
+
for (double &w : workspace.weights_arr)
|
|
1937
|
+
w *= weight_scaling;
|
|
1938
|
+
}
|
|
1939
|
+
}
|
|
1940
|
+
}
|
|
1941
|
+
}
|
|
1942
|
+
|
|
1943
|
+
/* if producing distance/similarity, also need to initialize the triangular matrix */
|
|
1944
|
+
if (model_params.calc_dist && workspace.tmat_sep.empty())
|
|
1945
|
+
workspace.tmat_sep.resize((input_data.nrows * (input_data.nrows - 1)) / 2, 0);
|
|
1946
|
+
|
|
1947
|
+
/* make space for buffers if not already allocated */
|
|
1948
|
+
if (
|
|
1949
|
+
(model_params.prob_pick_by_gain_avg > 0 ||
|
|
1950
|
+
model_params.prob_pick_by_gain_pl > 0 ||
|
|
1951
|
+
model_params.prob_pick_by_full_gain > 0 ||
|
|
1952
|
+
model_params.prob_pick_by_dens > 0 ||
|
|
1953
|
+
model_params.prob_pick_col_by_range > 0 ||
|
|
1954
|
+
model_params.prob_pick_col_by_var > 0 ||
|
|
1955
|
+
model_params.prob_pick_col_by_kurt > 0 ||
|
|
1956
|
+
model_params.weigh_by_kurt || hplane_root != NULL)
|
|
1957
|
+
&&
|
|
1958
|
+
(workspace.buffer_dbl.empty() && workspace.buffer_szt.empty() && workspace.buffer_chr.empty())
|
|
1959
|
+
)
|
|
1960
|
+
{
|
|
1961
|
+
size_t min_size_dbl = 0;
|
|
1962
|
+
size_t min_size_szt = 0;
|
|
1963
|
+
size_t min_size_chr = 0;
|
|
1964
|
+
|
|
1965
|
+
bool gain = model_params.prob_pick_by_gain_avg > 0 ||
|
|
1966
|
+
model_params.prob_pick_by_gain_pl > 0 ||
|
|
1967
|
+
model_params.prob_pick_by_full_gain > 0 ||
|
|
1968
|
+
model_params.prob_pick_by_dens > 0;
|
|
1969
|
+
|
|
1970
|
+
if (input_data.ncols_categ)
|
|
1971
|
+
{
|
|
1972
|
+
min_size_szt = (size_t)2 * (size_t)input_data.max_categ;
|
|
1973
|
+
min_size_dbl = input_data.max_categ + 1;
|
|
1974
|
+
if (gain && model_params.cat_split_type == SubSet)
|
|
1975
|
+
min_size_chr = input_data.max_categ;
|
|
1976
|
+
}
|
|
1977
|
+
|
|
1978
|
+
if (input_data.Xc_indptr != NULL && gain)
|
|
1979
|
+
{
|
|
1980
|
+
min_size_szt = std::max(min_size_szt, model_params.sample_size);
|
|
1981
|
+
min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
|
|
1982
|
+
}
|
|
1983
|
+
|
|
1984
|
+
/* TODO: revisit if this covers all the cases */
|
|
1985
|
+
if (model_params.ntry > 1 || gain)
|
|
1986
|
+
{
|
|
1987
|
+
min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
|
|
1988
|
+
if (model_params.ndim < 2 && input_data.Xc_indptr != NULL)
|
|
1989
|
+
min_size_dbl = std::max(min_size_dbl, (size_t)2*model_params.sample_size);
|
|
1990
|
+
}
|
|
1991
|
+
|
|
1992
|
+
/* for sampled column choices */
|
|
1993
|
+
if (model_params.prob_pick_col_by_var)
|
|
1994
|
+
{
|
|
1995
|
+
if (input_data.ncols_categ) {
|
|
1996
|
+
min_size_szt = std::max(min_size_szt, (size_t)input_data.max_categ + 1);
|
|
1997
|
+
min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ + 1);
|
|
1998
|
+
}
|
|
1999
|
+
}
|
|
2000
|
+
|
|
2001
|
+
if (model_params.prob_pick_col_by_kurt)
|
|
2002
|
+
{
|
|
2003
|
+
if (input_data.ncols_categ) {
|
|
2004
|
+
min_size_szt = std::max(min_size_szt, (size_t)input_data.max_categ + 1);
|
|
2005
|
+
min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
|
|
2006
|
+
}
|
|
2007
|
+
|
|
2008
|
+
}
|
|
2009
|
+
|
|
2010
|
+
/* for the extended model */
|
|
2011
|
+
if (hplane_root != NULL)
|
|
2012
|
+
{
|
|
2013
|
+
min_size_dbl = std::max(min_size_dbl, pow2(log2ceil(input_data.ncols_tot) + 1));
|
|
2014
|
+
if (model_params.missing_action != Fail)
|
|
2015
|
+
{
|
|
2016
|
+
min_size_szt = std::max(min_size_szt, model_params.sample_size);
|
|
2017
|
+
min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
|
|
2018
|
+
}
|
|
2019
|
+
|
|
2020
|
+
if (input_data.ncols_categ && model_params.cat_split_type == SubSet)
|
|
2021
|
+
{
|
|
2022
|
+
min_size_szt = std::max(min_size_szt, (size_t)2 * (size_t)input_data.max_categ + (size_t)1);
|
|
2023
|
+
min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
|
|
2024
|
+
}
|
|
2025
|
+
|
|
2026
|
+
if (model_params.weigh_by_kurt)
|
|
2027
|
+
min_size_szt = std::max(min_size_szt, input_data.ncols_tot);
|
|
2028
|
+
|
|
2029
|
+
if (gain && (!workspace.weights_arr.empty() || !workspace.weights_map.empty()))
|
|
2030
|
+
{
|
|
2031
|
+
workspace.sample_weights.resize(model_params.sample_size);
|
|
2032
|
+
min_size_szt = std::max(min_size_szt, model_params.sample_size);
|
|
2033
|
+
}
|
|
2034
|
+
}
|
|
2035
|
+
|
|
2036
|
+
/* now resize */
|
|
2037
|
+
if (workspace.buffer_dbl.size() < min_size_dbl)
|
|
2038
|
+
workspace.buffer_dbl.resize(min_size_dbl);
|
|
2039
|
+
|
|
2040
|
+
if (workspace.buffer_szt.size() < min_size_szt)
|
|
2041
|
+
workspace.buffer_szt.resize(min_size_szt);
|
|
2042
|
+
|
|
2043
|
+
if (workspace.buffer_chr.size() < min_size_chr)
|
|
2044
|
+
workspace.buffer_chr.resize(min_size_chr);
|
|
2045
|
+
|
|
2046
|
+
/* for guided column choice, need to also remember the best split so far */
|
|
2047
|
+
if (
|
|
2048
|
+
model_params.cat_split_type == SubSet &&
|
|
2049
|
+
(
|
|
2050
|
+
model_params.prob_pick_by_gain_avg ||
|
|
2051
|
+
model_params.prob_pick_by_gain_pl ||
|
|
2052
|
+
model_params.prob_pick_by_full_gain ||
|
|
2053
|
+
model_params.prob_pick_by_dens
|
|
2054
|
+
)
|
|
2055
|
+
)
|
|
2056
|
+
{
|
|
2057
|
+
workspace.this_split_categ.resize(input_data.max_categ);
|
|
2058
|
+
}
|
|
2059
|
+
|
|
2060
|
+
}
|
|
2061
|
+
|
|
2062
|
+
/* Other potentially necessary buffers */
|
|
2063
|
+
if (
|
|
2064
|
+
tree_root != NULL && model_params.missing_action == Impute &&
|
|
2065
|
+
(model_params.prob_pick_by_gain_avg || model_params.prob_pick_by_gain_pl ||
|
|
2066
|
+
model_params.prob_pick_by_full_gain || model_params.prob_pick_by_dens) &&
|
|
2067
|
+
input_data.Xc_indptr == NULL && input_data.ncols_numeric && workspace.imputed_x_buffer.empty()
|
|
2068
|
+
)
|
|
2069
|
+
{
|
|
2070
|
+
workspace.imputed_x_buffer.resize(input_data.nrows);
|
|
2071
|
+
}
|
|
2072
|
+
|
|
2073
|
+
if (model_params.prob_pick_by_full_gain && workspace.col_indices.empty())
|
|
2074
|
+
workspace.col_indices.resize(model_params.ncols_per_tree);
|
|
2075
|
+
|
|
2076
|
+
if (
|
|
2077
|
+
(model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var) &&
|
|
2078
|
+
model_params.weigh_by_kurt &&
|
|
2079
|
+
model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
|
|
2080
|
+
(model_params.ncols_per_tree == input_data.ncols_tot) &&
|
|
2081
|
+
!input_data.all_kurtoses.empty()
|
|
2082
|
+
) {
|
|
2083
|
+
workspace.tree_kurtoses = input_data.all_kurtoses.data();
|
|
2084
|
+
}
|
|
2085
|
+
else {
|
|
2086
|
+
workspace.tree_kurtoses = NULL;
|
|
2087
|
+
}
|
|
2088
|
+
|
|
2089
|
+
/* weigh columns by kurtosis in the sample if required */
|
|
2090
|
+
/* TODO: this one could probably be refactored to use the function in the helpers */
|
|
2091
|
+
std::vector<double> kurt_weights;
|
|
2092
|
+
bool avoid_leave_m_cols = false;
|
|
2093
|
+
if (
|
|
2094
|
+
model_params.weigh_by_kurt &&
|
|
2095
|
+
!avoid_col_weights &&
|
|
2096
|
+
(input_data.preinitialized_col_sampler == NULL
|
|
2097
|
+
||
|
|
2098
|
+
((model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var) && workspace.tree_kurtoses == NULL))
|
|
2099
|
+
)
|
|
2100
|
+
{
|
|
2101
|
+
kurt_weights.resize(input_data.ncols_numeric + input_data.ncols_categ, 0.);
|
|
2102
|
+
|
|
2103
|
+
if (model_params.ncols_per_tree >= input_data.ncols_tot)
|
|
2104
|
+
{
|
|
2105
|
+
|
|
2106
|
+
if (input_data.Xc_indptr == NULL)
|
|
2107
|
+
{
|
|
2108
|
+
|
|
2109
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
|
2110
|
+
{
|
|
2111
|
+
if (workspace.weights_arr.empty() && workspace.weights_map.empty())
|
|
2112
|
+
kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
|
|
2113
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2114
|
+
input_data.numeric_data + col * input_data.nrows,
|
|
2115
|
+
model_params.missing_action);
|
|
2116
|
+
else if (!workspace.weights_arr.empty())
|
|
2117
|
+
kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, decltype(workspace.weights_arr), ldouble_safe>(
|
|
2118
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2119
|
+
input_data.numeric_data + col * input_data.nrows,
|
|
2120
|
+
model_params.missing_action, workspace.weights_arr);
|
|
2121
|
+
else
|
|
2122
|
+
kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
|
|
2123
|
+
decltype(workspace.weights_map), ldouble_safe>(
|
|
2124
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2125
|
+
input_data.numeric_data + col * input_data.nrows,
|
|
2126
|
+
model_params.missing_action, workspace.weights_map);
|
|
2127
|
+
}
|
|
2128
|
+
}
|
|
2129
|
+
|
|
2130
|
+
else
|
|
2131
|
+
{
|
|
2132
|
+
std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
|
|
2133
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
|
2134
|
+
{
|
|
2135
|
+
if (workspace.weights_arr.empty() && workspace.weights_map.empty())
|
|
2136
|
+
kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.Xc)>::type,
|
|
2137
|
+
typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
|
|
2138
|
+
ldouble_safe>(
|
|
2139
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, col,
|
|
2140
|
+
input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
2141
|
+
model_params.missing_action);
|
|
2142
|
+
else if (!workspace.weights_arr.empty())
|
|
2143
|
+
kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
|
|
2144
|
+
typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
|
|
2145
|
+
decltype(workspace.weights_arr), ldouble_safe>(
|
|
2146
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, col,
|
|
2147
|
+
input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
2148
|
+
model_params.missing_action, workspace.weights_arr);
|
|
2149
|
+
else
|
|
2150
|
+
kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
|
|
2151
|
+
typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
|
|
2152
|
+
decltype(workspace.weights_map), ldouble_safe>(
|
|
2153
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, col,
|
|
2154
|
+
input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
2155
|
+
model_params.missing_action, workspace.weights_map);
|
|
2156
|
+
}
|
|
2157
|
+
}
|
|
2158
|
+
|
|
2159
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++)
|
|
2160
|
+
{
|
|
2161
|
+
if (workspace.weights_arr.empty() && workspace.weights_map.empty())
|
|
2162
|
+
kurt_weights[col + input_data.ncols_numeric] =
|
|
2163
|
+
calc_kurtosis<ldouble_safe>(
|
|
2164
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2165
|
+
input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
|
|
2166
|
+
workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
|
|
2167
|
+
model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
|
|
2168
|
+
else if (!workspace.weights_arr.empty())
|
|
2169
|
+
kurt_weights[col + input_data.ncols_numeric] =
|
|
2170
|
+
calc_kurtosis_weighted<decltype(workspace.weights_arr), ldouble_safe>(
|
|
2171
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2172
|
+
input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
|
|
2173
|
+
workspace.buffer_dbl.data(),
|
|
2174
|
+
model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
|
|
2175
|
+
workspace.weights_arr);
|
|
2176
|
+
else
|
|
2177
|
+
kurt_weights[col + input_data.ncols_numeric] =
|
|
2178
|
+
calc_kurtosis_weighted<decltype(workspace.weights_map), ldouble_safe>(
|
|
2179
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2180
|
+
input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
|
|
2181
|
+
workspace.buffer_dbl.data(),
|
|
2182
|
+
model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
|
|
2183
|
+
workspace.weights_map);
|
|
2184
|
+
}
|
|
2185
|
+
|
|
2186
|
+
for (auto &w : kurt_weights) w = (w == -HUGE_VAL)? 0. : std::fmax(1e-8, -1. + w);
|
|
2187
|
+
if (input_data.col_weights != NULL)
|
|
2188
|
+
{
|
|
2189
|
+
for (size_t col = 0; col < input_data.ncols_tot; col++)
|
|
2190
|
+
{
|
|
2191
|
+
if (kurt_weights[col] <= 0) continue;
|
|
2192
|
+
kurt_weights[col] *= input_data.col_weights[col];
|
|
2193
|
+
kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
|
|
2194
|
+
}
|
|
2195
|
+
}
|
|
2196
|
+
workspace.col_sampler.initialize(kurt_weights.data(), kurt_weights.size());
|
|
2197
|
+
}
|
|
2198
|
+
|
|
2199
|
+
|
|
2200
|
+
|
|
2201
|
+
else
|
|
2202
|
+
{
|
|
2203
|
+
std::vector<size_t> cols_take(model_params.ncols_per_tree);
|
|
2204
|
+
std::vector<size_t> buffer1;
|
|
2205
|
+
std::vector<bool> buffer2;
|
|
2206
|
+
sample_random_rows<double, double>(
|
|
2207
|
+
cols_take, input_data.ncols_tot, false,
|
|
2208
|
+
workspace.rnd_generator, buffer1,
|
|
2209
|
+
(double*)NULL, kurt_weights, /* <- will not get used */
|
|
2210
|
+
(size_t)0, (size_t)0, buffer2);
|
|
2211
|
+
|
|
2212
|
+
if (
|
|
2213
|
+
model_params.sample_size == input_data.nrows &&
|
|
2214
|
+
!model_params.with_replacement &&
|
|
2215
|
+
!input_data.all_kurtoses.empty()
|
|
2216
|
+
)
|
|
2217
|
+
{
|
|
2218
|
+
for (size_t col : cols_take)
|
|
2219
|
+
kurt_weights[col] = input_data.all_kurtoses[col];
|
|
2220
|
+
goto skip_kurt_calculations;
|
|
2221
|
+
}
|
|
2222
|
+
|
|
2223
|
+
if (input_data.Xc_indptr != NULL)
|
|
2224
|
+
std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
|
|
2225
|
+
|
|
2226
|
+
for (size_t col : cols_take)
|
|
2227
|
+
{
|
|
2228
|
+
if (col < input_data.ncols_numeric)
|
|
2229
|
+
{
|
|
2230
|
+
if (input_data.Xc_indptr == NULL)
|
|
2231
|
+
{
|
|
2232
|
+
if (workspace.weights_arr.empty() && workspace.weights_map.empty())
|
|
2233
|
+
kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
|
|
2234
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2235
|
+
input_data.numeric_data + col * input_data.nrows,
|
|
2236
|
+
model_params.missing_action);
|
|
2237
|
+
else if (!workspace.weights_arr.empty())
|
|
2238
|
+
kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
|
|
2239
|
+
decltype(workspace.weights_arr), ldouble_safe>(
|
|
2240
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2241
|
+
input_data.numeric_data + col * input_data.nrows,
|
|
2242
|
+
model_params.missing_action, workspace.weights_arr);
|
|
2243
|
+
else
|
|
2244
|
+
kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
|
|
2245
|
+
decltype(workspace.weights_map), ldouble_safe>(
|
|
2246
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2247
|
+
input_data.numeric_data + col * input_data.nrows,
|
|
2248
|
+
model_params.missing_action, workspace.weights_map);
|
|
2249
|
+
}
|
|
2250
|
+
|
|
2251
|
+
else
|
|
2252
|
+
{
|
|
2253
|
+
if (workspace.weights_arr.empty() && workspace.weights_map.empty())
|
|
2254
|
+
kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.Xc)>::type,
|
|
2255
|
+
typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
|
|
2256
|
+
ldouble_safe>(
|
|
2257
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, col,
|
|
2258
|
+
input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
2259
|
+
model_params.missing_action);
|
|
2260
|
+
else if (!workspace.weights_arr.empty())
|
|
2261
|
+
kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
|
|
2262
|
+
typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
|
|
2263
|
+
decltype(workspace.weights_arr), ldouble_safe>(
|
|
2264
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, col,
|
|
2265
|
+
input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
2266
|
+
model_params.missing_action, workspace.weights_arr);
|
|
2267
|
+
else
|
|
2268
|
+
kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
|
|
2269
|
+
typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
|
|
2270
|
+
decltype(workspace.weights_map), ldouble_safe>(
|
|
2271
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, col,
|
|
2272
|
+
input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
|
|
2273
|
+
model_params.missing_action, workspace.weights_map);
|
|
2274
|
+
}
|
|
2275
|
+
}
|
|
2276
|
+
|
|
2277
|
+
else
|
|
2278
|
+
{
|
|
2279
|
+
if (workspace.weights_arr.empty() && workspace.weights_map.empty())
|
|
2280
|
+
kurt_weights[col] =
|
|
2281
|
+
calc_kurtosis<ldouble_safe>(
|
|
2282
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2283
|
+
input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
|
|
2284
|
+
input_data.ncat[col - input_data.ncols_numeric],
|
|
2285
|
+
workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
|
|
2286
|
+
model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
|
|
2287
|
+
else if (!workspace.weights_arr.empty())
|
|
2288
|
+
kurt_weights[col] =
|
|
2289
|
+
calc_kurtosis_weighted<decltype(workspace.weights_arr), ldouble_safe>(
|
|
2290
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2291
|
+
input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
|
|
2292
|
+
input_data.ncat[col - input_data.ncols_numeric],
|
|
2293
|
+
workspace.buffer_dbl.data(),
|
|
2294
|
+
model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
|
|
2295
|
+
workspace.weights_arr);
|
|
2296
|
+
else
|
|
2297
|
+
kurt_weights[col] =
|
|
2298
|
+
calc_kurtosis_weighted<decltype(workspace.weights_map), ldouble_safe>(
|
|
2299
|
+
workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
2300
|
+
input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
|
|
2301
|
+
input_data.ncat[col - input_data.ncols_numeric],
|
|
2302
|
+
workspace.buffer_dbl.data(),
|
|
2303
|
+
model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
|
|
2304
|
+
workspace.weights_map);
|
|
2305
|
+
}
|
|
2306
|
+
|
|
2307
|
+
/* Note to self: don't move this to outside of the braces, as it needs to assign a weight
|
|
2308
|
+
of zero to the columns that were not selected, thus it should only do this clipping
|
|
2309
|
+
for columns that are chosen. */
|
|
2310
|
+
if (kurt_weights[col] == -HUGE_VAL)
|
|
2311
|
+
{
|
|
2312
|
+
kurt_weights[col] = 0;
|
|
2313
|
+
}
|
|
2314
|
+
|
|
2315
|
+
else
|
|
2316
|
+
{
|
|
2317
|
+
kurt_weights[col] = std::fmax(1e-8, -1. + kurt_weights[col]);
|
|
2318
|
+
if (input_data.col_weights != NULL)
|
|
2319
|
+
{
|
|
2320
|
+
kurt_weights[col] *= input_data.col_weights[col];
|
|
2321
|
+
kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
|
|
2322
|
+
}
|
|
2323
|
+
}
|
|
2324
|
+
}
|
|
2325
|
+
|
|
2326
|
+
skip_kurt_calculations:
|
|
2327
|
+
workspace.col_sampler.initialize(kurt_weights.data(), kurt_weights.size());
|
|
2328
|
+
avoid_leave_m_cols = true;
|
|
2329
|
+
}
|
|
2330
|
+
|
|
2331
|
+
if (model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var)
|
|
2332
|
+
{
|
|
2333
|
+
workspace.tree_kurtoses = kurt_weights.data();
|
|
2334
|
+
}
|
|
2335
|
+
}
|
|
2336
|
+
|
|
2337
|
+
bool col_sampler_is_fresh = true;
|
|
2338
|
+
if (input_data.preinitialized_col_sampler == NULL) {
|
|
2339
|
+
workspace.col_sampler.initialize(input_data.ncols_tot);
|
|
2340
|
+
}
|
|
2341
|
+
else {
|
|
2342
|
+
workspace.col_sampler = *((ColumnSampler<ldouble_safe>*)input_data.preinitialized_col_sampler);
|
|
2343
|
+
col_sampler_is_fresh = false;
|
|
2344
|
+
}
|
|
2345
|
+
/* TODO: this can be done more efficiently when sub-sampling columns */
|
|
2346
|
+
if (!avoid_leave_m_cols)
|
|
2347
|
+
workspace.col_sampler.leave_m_cols(model_params.ncols_per_tree, workspace.rnd_generator);
|
|
2348
|
+
if (model_params.ncols_per_tree < input_data.ncols_tot) col_sampler_is_fresh = false;
|
|
2349
|
+
workspace.try_all = false;
|
|
2350
|
+
if (hplane_root != NULL && model_params.ndim >= input_data.ncols_tot)
|
|
2351
|
+
workspace.try_all = true;
|
|
2352
|
+
|
|
2353
|
+
if (model_params.scoring_metric != Depth && !is_boxed_metric(model_params.scoring_metric))
|
|
2354
|
+
{
|
|
2355
|
+
workspace.density_calculator.initialize(model_params.max_depth,
|
|
2356
|
+
input_data.ncols_categ? input_data.max_categ : 0,
|
|
2357
|
+
tree_root != NULL && input_data.ncols_categ,
|
|
2358
|
+
model_params.scoring_metric);
|
|
2359
|
+
}
|
|
2360
|
+
|
|
2361
|
+
else if (is_boxed_metric(model_params.scoring_metric))
|
|
2362
|
+
{
|
|
2363
|
+
if (tree_root != NULL)
|
|
2364
|
+
workspace.density_calculator.initialize_bdens(input_data,
|
|
2365
|
+
model_params,
|
|
2366
|
+
workspace.ix_arr,
|
|
2367
|
+
workspace.col_sampler);
|
|
2368
|
+
else
|
|
2369
|
+
workspace.density_calculator.initialize_bdens_ext(input_data,
|
|
2370
|
+
model_params,
|
|
2371
|
+
workspace.ix_arr,
|
|
2372
|
+
workspace.col_sampler,
|
|
2373
|
+
col_sampler_is_fresh);
|
|
2374
|
+
}
|
|
2375
|
+
|
|
2376
|
+
if (tree_root != NULL)
|
|
2377
|
+
{
|
|
2378
|
+
split_itree_recursive<InputData, WorkerMemory, ldouble_safe>(
|
|
2379
|
+
*tree_root,
|
|
2380
|
+
workspace,
|
|
2381
|
+
input_data,
|
|
2382
|
+
model_params,
|
|
2383
|
+
impute_nodes,
|
|
2384
|
+
0);
|
|
2385
|
+
}
|
|
2386
|
+
|
|
2387
|
+
else
|
|
2388
|
+
{
|
|
2389
|
+
split_hplane_recursive<InputData, WorkerMemory, ldouble_safe>(
|
|
2390
|
+
*hplane_root,
|
|
2391
|
+
workspace,
|
|
2392
|
+
input_data,
|
|
2393
|
+
model_params,
|
|
2394
|
+
impute_nodes,
|
|
2395
|
+
0);
|
|
2396
|
+
}
|
|
2397
|
+
|
|
2398
|
+
/* if producing imputation structs, only need to keep the ones for terminal nodes */
|
|
2399
|
+
if (impute_nodes != NULL)
|
|
2400
|
+
drop_nonterminal_imp_node(*impute_nodes, tree_root, hplane_root);
|
|
2401
|
+
}
|