isotree 0.2.2 → 0.3.0

Files changed (151)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -0,0 +1,2401 @@
+ /* Isolation forests and variations thereof, with adjustments for incorporation
+ * of categorical variables and missing values.
+ * Written for the C++11 standard and aimed at being used in R and Python.
+ *
+ * This library is based on the following works:
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation forest."
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "Isolation-based anomaly detection."
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+ * "Extended Isolation Forest."
+ * arXiv preprint arXiv:1811.02141 (2018).
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ * "On detecting clustered anomalies using SCiForest."
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+ * [5] https://sourceforge.net/projects/iforest/
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+ * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
+ * [8] Cortes, David.
+ * "Distance approximation using Isolation Forests."
+ * arXiv preprint arXiv:1910.12362 (2019).
+ * [9] Cortes, David.
+ * "Imputing missing values with unsupervised random trees."
+ * arXiv preprint arXiv:1911.06646 (2019).
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
+ * [11] Cortes, David.
+ * "Revisiting randomized choices in isolation forests."
+ * arXiv preprint arXiv:2110.13402 (2021).
+ * [12] Guha, Sudipto, et al.
+ * "Robust random cut forest based anomaly detection on streams."
+ * International conference on machine learning. PMLR, 2016.
+ * [13] Cortes, David.
+ * "Isolation forests: looking beyond tree depth."
+ * arXiv preprint arXiv:2111.11639 (2021).
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
+ * "Isolation kernel and its effect on SVM."
+ * Proceedings of the 24th ACM SIGKDD
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
+ *
+ * BSD 2-Clause License
+ * Copyright (c) 2019-2022, David Cortes
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ #include "isotree.hpp"
+
+ /* Fit Isolation Forest model, or variant of it such as SCiForest
+ *
+ * Parameters:
+ * ===========
+ * - model_outputs (out)
+ * Pointer to already allocated isolation forest model object for single-variable splits.
+ * If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
+ * additional trees through function 'add_tree'.
+ * - model_outputs_ext (out)
+ * Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
+ * Note that if 'ndim' = 1, the single-variable model object must be used instead.
+ * If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
+ * additional trees through function 'add_tree'.
+ * - numeric_data[nrows * ncols_numeric]
+ * Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
+ * no sparse numeric data either).
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * - ncols_numeric
+ * Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
+ * - categ_data[nrows * ncols_categ]
+ * Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
+ * Each category should be represented as an integer, and these integers must start at zero and
+ * be in consecutive order - i.e. if category '3' is present, category '2' must also be present
+ * (note that they are not treated as being ordinal, this is just an encoding). Missing values
+ * should be encoded as negative numbers such as (-1).
+ * - ncols_categ
+ * Number of categorical columns in the data.
+ * - ncat[ncols_categ]
+ * Number of categories in each categorical column. E.g. if the highest code for a column is '4',
+ * the number of categories for that column is '5' (zero is itself a category).
+ * - Xc[nnz]
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed); a small
+ * layout sketch is shown after this comment block.
+ * Pass NULL if there are no sparse numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * - Xc_ind[nnz]
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
+ * Must be in sorted order, otherwise results will be incorrect.
+ * The largest value here should be smaller than the largest possible value of 'size_t'.
+ * Pass NULL if there are no sparse numeric columns.
+ * - Xc_indptr[ncols_numeric + 1]
+ * Pointer to column index pointers indicating at entry [col] where column 'col'
+ * starts and at entry [col + 1] where it ends.
+ * Pass NULL if there are no sparse numeric columns.
+ * - ndim
+ * How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
+ * the single-variable model. Note that the model object pointer passed must also
+ * agree with the value passed to 'ndim'.
+ * - ntry
+ * When using any of 'prob_pick_by_gain_pl', 'prob_pick_by_gain_avg', 'prob_pick_by_full_gain', 'prob_pick_by_dens', how many variables (with 'ndim=1')
+ * or linear combinations (with 'ndim>1') to try for determining the best one according to gain.
+ * Recommended value in reference [4] is 10 (with 'prob_pick_by_gain_avg', for outlier detection), while the
+ * recommended value in reference [11] is 1 (with 'prob_pick_by_gain_pl', for outlier detection), and the
+ * recommended value in reference [9] is 10 to 20 (with 'prob_pick_by_gain_pl', for missing value imputations).
+ * - coef_type
+ * For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
+ * (as proposed in [4]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [3]. Ignored for the
+ * single-variable model.
+ * - sample_weights[nrows]
+ * Weights for the rows when building a tree, either as sampling importances when using
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
+ * in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
+ * the row appeared twice, thus it's less of an outlier) - how this is taken is determined
+ * through parameter 'weight_as_sample'.
+ * Pass NULL if the rows all have uniform weights.
+ * - with_replacement
+ * Whether to sample rows with replacement or not (not recommended). Note that distance calculations,
+ * if desired, don't work well with duplicate rows.
+ * - weight_as_sample
+ * If passing sample (row) weights when fitting the model, whether to consider those weights as row
+ * sampling weights (i.e. the higher the weights, the more likely the observation will end up included
+ * in each tree sub-sample), or as distribution density weights (i.e. putting a weight of two is the same
+ * as if the row appeared twice, thus higher weight makes it less of an outlier, but does not give it a
+ * higher chance of being sampled if the data uses sub-sampling).
+ * - nrows
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
+ * - sample_size
+ * Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
+ * 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
+ * random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
+ * will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
+ * in [5] is 'nrows' here.
+ * - ntrees
+ * Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
+ * author's code in [5] is 10.
+ * - max_depth
+ * Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
+ * Models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
+ * deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
+ * predictions.
+ * If using pooled gain, one might want to substitute 'max_depth' with 'min_gain'.
+ * - ncols_per_tree
+ * Number of columns to use (have as potential candidates for splitting at each iteration) in each tree,
+ * similar to the 'mtry' parameter of random forests.
+ * In general, this is only relevant when using non-random splits and/or weighted column choices.
+ * If passing zero, will use the full number of available columns.
+ * Recommended value: 0.
+ * - limit_depth
+ * Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
+ * terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
+ * will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
+ * tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass 'false' here
+ * and higher values for 'max_depth' if using the model for purposes other than outlier detection.
+ * Note that, if passing 'limit_depth=true', then 'max_depth' is ignored.
+ * - penalize_range
+ * Whether to penalize (add -1 to the terminal depth) observations at prediction time that have a value
+ * of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
+ * reasonable range in the data being split (given by 2 * range in data and centered around the split point),
+ * as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
+ * when splitting by categorical variables. Note that this can make a very large difference in the results
+ * when using 'prob_pick_by_gain_pl'.
+ * This option is not supported when using density-based outlier scoring metrics.
+ * - standardize_data
+ * Whether to standardize the features at each node before creating a linear combination of them as suggested
+ * in [4]. This is ignored when using 'ndim=1'.
+ * - scoring_metric
+ * Metric to use for determining outlier scores (see reference [13]).
+ * If passing 'Depth', will use isolation depth as proposed in reference [1]. This is typically the safest choice
+ * and plays well with all model types offered by this library.
+ * If passing 'Density', will set scores for each terminal node as the ratio between the fraction of points in the sub-sample
+ * that end up in that node and the fraction of the volume in the feature space which defines
+ * the node according to the splits that lead to it.
+ * If using 'ndim=1', for categorical variables, 'Density' is defined in terms
+ * of number of categories that go towards each side of the split divided by number of categories
+ * in the observations that reached that node.
+ * The standardized outlier score from 'Density' for a given observation is calculated as the
+ * negative of the logarithm of the geometric mean from the per-tree densities, which unlike
+ * the standardized score produced from 'Depth', is unbounded, but just like the standardized
+ * score from 'Depth', has a natural threshold for defining outlierness, which in this case
+ * is zero instead of 0.5. The non-standardized outlier score for 'Density' is calculated as the
+ * geometric mean, while the per-tree scores are calculated as the density values.
+ * 'Density' might lead to better predictions when using 'ndim=1', particularly in the presence
+ * of categorical variables. Note however that using 'Density' requires more trees for convergence
+ * of scores (i.e. good results) compared to isolation-based metrics.
+ * 'Density' is incompatible with 'penalize_range=true'.
+ * If passing 'AdjDepth', will use an adjusted isolation depth that takes into account the number of points that
+ * go to each side of a given split vs. the fraction of the range of that feature that each
+ * side of the split occupies, by a metric as follows: 'd = 2 / (1 + 1/(2*p))'
+ * where 'p' is defined as 'p = (n_s / n_t) / (r_s / r_t)',
+ * with 'n_t' being the number of points that reach a given node, 'n_s' the
+ * number of points that are sent to a given side of the split/branch at that node,
+ * 'r_t' being the range (maximum minus minimum) of the splitting feature or
+ * linear combination among the points that reached the node, and 'r_s' being the
+ * range of the same feature or linear combination among the points that are sent to this
+ * same side of the split/branch. This makes each split add a number between zero and two
+ * to the isolation depth, with this number's probabilistic distribution being centered
+ * around 1 and thus the expected isolation depth remaining the same as in the original
+ * 'Depth' metric, but having more variability around the extremes.
+ * Scores (standardized, non-standardized, per-tree) for 'AdjDepth' are aggregated in the same way
+ * as for 'Depth'.
+ * 'AdjDepth' might lead to better predictions when using 'ndim=1', particularly in the presence
+ * of categorical variables and for smaller datasets; for smaller datasets, it might also make
+ * sense to combine it with 'penalize_range=true'.
+ * If passing 'AdjDensity', will use the same metric from 'AdjDepth', but applied multiplicatively instead
+ * of additively. The expected value for 'AdjDensity' is not strictly the same
+ * as for isolation, but using the expected isolation depth as standardizing criterion
+ * tends to produce similar standardized score distributions (centered around 0.5).
+ * Scores (standardized, non-standardized, per-tree) from 'AdjDensity' are aggregated in the same way
+ * as for 'Depth'.
+ * 'AdjDensity' is incompatible with 'penalize_range=true'.
+ * If passing 'BoxedRatio', will set the scores for each terminal node as the ratio between the volume of the boxed
+ * feature space for the node as defined by the smallest and largest values from the split
+ * conditions for each column (bounded by the variable ranges in the sample) and the
+ * variable ranges in the tree sample.
+ * If using 'ndim=1', for categorical variables 'BoxedRatio' is defined in terms of number of categories.
+ * If using 'ndim>1', 'BoxedRatio' is defined in terms of the maximum achievable value for the
+ * splitting linear combination determined from the minimum and maximum values for each
+ * variable among the points in the sample, and as such, it has a rather different meaning
+ * compared to the score obtained with 'ndim=1' - 'BoxedRatio' scores with 'ndim>1'
+ * typically provide very poor quality results and this metric is thus not recommended to
+ * use in the extended model. With 'ndim>1', 'BoxedRatio' also has a tendency to produce values
+ * so small that they round to zero.
+ * The standardized outlier score from 'BoxedRatio' for a given observation is calculated
+ * simply as the average from the per-tree boxed ratios. The 'BoxedRatio' metric
+ * has a lower bound of zero and a theoretical upper bound of one, but in practice the scores
+ * tend to be very small numbers close to zero, and its distribution across
+ * different datasets is rather unpredictable. In order to keep rankings comparable with
+ * the rest of the metrics, the non-standardized outlier scores for 'BoxedRatio' are calculated as the
+ * negative of the average instead. The per-tree 'BoxedRatio' scores are calculated as the ratios.
+ * 'BoxedRatio' can be calculated in a fast-but-not-so-precise way, and in a slow-but-precise
+ * way, which is controlled by parameter 'fast_bratio'. Usually, both should give the
+ * same results, but in some datasets, the fast way can lead to numerical inaccuracies
+ * due to roundoffs very close to zero.
+ * 'BoxedRatio' might lead to better predictions in datasets with many rows when using 'ndim=1'
+ * and a relatively small 'sample_size'. Note that more trees are required for convergence
+ * of scores when using 'BoxedRatio'. In some datasets, the 'BoxedRatio' metric might result in very bad
+ * predictions, to the point that taking its inverse produces a much better ranking of outliers.
+ * The 'BoxedRatio' option is incompatible with 'penalize_range'.
+ * If passing 'BoxedDensity2', will set the score as the ratio between the fraction of points within the sample that
+ * end up in a given terminal node and the 'BoxedRatio' metric.
+ * Aggregation of scores (standardized, non-standardized, per-tree) for 'BoxedDensity2' is done in the same
+ * way as for 'Density', and it also has a natural threshold at zero for determining
+ * outliers and inliers.
+ * 'BoxedDensity2' is typically usable with 'ndim>1', but tends to produce much bigger values
+ * compared to 'ndim=1'.
+ * Counterintuitively, in many datasets one can get better results with the metric
+ * 'BoxedDensity' instead.
+ * The calculation of 'BoxedDensity2' is also controlled by 'fast_bratio'.
+ * 'BoxedDensity2' is incompatible with 'penalize_range'.
+ * If passing 'BoxedDensity', will set the score as the ratio between the fraction of points within the sample that
+ * end up in a given terminal node and the ratio between the boxed volume of the feature
+ * space in the sample and the boxed volume of a node given by the split conditions (inverse
+ * as in 'BoxedDensity2'). This metric does not have any theoretical or intuitive
+ * justification behind its existence, and it is perhaps illogical to use it as a
+ * scoring metric, but tends to produce good results in some datasets.
+ * The standardized outlier scores for 'BoxedDensity' are defined as the negative of the geometric mean,
+ * while the non-standardized scores are the geometric mean, and the per-tree scores are simply the 'density' values.
+ * The calculation of 'BoxedDensity' is also controlled by 'fast_bratio'.
+ * The 'BoxedDensity' option is incompatible with 'penalize_range'.
+ * - fast_bratio
+ * When using "boxed" metrics for scoring, whether to calculate them in a fast way through
+ * cumulative sum of logarithms of ratios after each split, or in a slower way as sum of
+ * logarithms of a single ratio per column for each terminal node.
+ * Usually, both methods should give the same results, but in some datasets, particularly
+ * when variables have too small or too large ranges, the first method can be prone to
+ * numerical inaccuracies due to roundoff close to zero.
+ * Note that this does not affect calculations for models with 'ndim>1', since given the
+ * split types, the calculation for them is different.
+ * - standardize_dist
+ * If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
+ * depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
+ * - tmat[nrows * (nrows - 1) / 2]
+ * Array in which to calculate average separation depths or standardized distance metric (see documentation
+ * for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
+ * of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
+ * output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
+ * the entry for a pair (i,j), with 0 <= i < j < n, will be located at position
+ * p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
+ * Can be converted to a dense square matrix through function 'tmat_to_dense'
+ * (see also the indexing sketch after the function below).
+ * - output_depths[nrows]
+ * Array in which to calculate average path depths or standardized outlierness metric (see documentation
+ * for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
+ * of rows. If not NULL, must already be initialized to zeros.
+ * - standardize_depth
+ * If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
+ * a metric in which the more of an outlier an observation is, the closer this standardized metric will be to 1,
+ * with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
+ * the average depth of each row across all trees.
+ * - col_weights[ncols_numeric + ncols_categ]
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
+ * Ignored when picking columns by deterministic criterion.
+ * If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
+ * effect is multiplicative.
+ * - weigh_by_kurt
+ * Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
+ * for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
+ * sample. For categorical columns, will calculate expected kurtosis if the column were converted to
+ * numerical by assigning to each category a random number ~ Unif(0, 1).
+ * This is intended as a cheap feature selector, while the parameter 'prob_pick_col_by_kurt'
+ * provides the option to do this at each node in the tree for a different overall type of model.
+ * If passing column weights or weighted column choices ('prob_pick_col_by_range', 'prob_pick_col_by_var'),
+ * the effect will be multiplicative. This option is not compatible with 'prob_pick_col_by_kurt'.
+ * If passing 'missing_action=Fail' and the data has infinite values, columns with rows
+ * having infinite values will get a weight of zero. If passing a different value for missing
+ * action, infinite values will be ignored in the kurtosis calculation.
+ * If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
+ * in order not to favor columns with missing values (which would increase kurtosis by all having
+ * the same central value).
+ * - prob_pick_by_gain_pl
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
+ * that maximizes a pooled standard deviation gain criterion (see references [9] and [11]) on the
+ * same variable or linear combination, similarly to regression trees such as CART.
+ * If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
+ * in which the largest standardized gain can be achieved.
+ * For categorical variables with 'ndim=1', will use Shannon entropy instead (like in [7]).
+ * Compared to a simple averaged gain, this tends to result in more evenly-divided splits and more clustered
+ * groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
+ * When used for outlier detection, datasets with multimodal distributions usually see better performance
+ * under this type of split.
+ * Note that, since this makes the trees more even and thus it takes more steps to produce isolated nodes,
+ * the resulting object will be heavier. When splits are not made according to any of 'prob_pick_by_gain_avg',
+ * 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
+ * Note that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable model,
+ * every single tree will have the exact same splits.
+ * Be aware that 'penalize_range' can also have a large impact when using 'prob_pick_by_gain_pl'.
+ * Under this option, models are likely to produce better results when increasing 'max_depth'.
+ * Alternatively, one can also control the depth through 'min_gain' (for which one might want to
+ * set 'max_depth=0').
+ * Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain',
+ * 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
+ * - prob_pick_by_gain_avg
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
+ * that maximizes an averaged standard deviation gain criterion (see references [4] and [11]) on the
+ * same variable or linear combination.
+ * If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
+ * in which the largest standardized gain can be achieved.
+ * For categorical variables with 'ndim=1', will take the expected standard deviation that would be
+ * obtained if the column were converted to numerical by assigning to each category a random
+ * number ~ Unif(0, 1) and calculate gain with those assumed standard deviations.
+ * Compared to a pooled gain, this tends to result in more cases in which a single observation or very
+ * few of them are put into one branch. Typically, datasets with outliers defined by extreme values in
+ * some column more or less independently of the rest, usually see better performance under this type
+ * of split. Recommended to use sub-samples (parameter 'sample_size') when
+ * passing this parameter. Note that, since this will create isolated nodes faster, the resulting object
+ * will be lighter (use less memory).
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
+ * Default setting for [1], [2], [3] is zero, and default for [4] is 1.
+ * This is the randomization parameter that can be passed to the author's original code in [5],
+ * but note that the code in [5] suffers from a mathematical error in the calculation of running standard deviations,
+ * so the results from it might not match with this library's.
+ * Be aware that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable model,
+ * every single tree will have the exact same splits.
+ * Under this option, models are likely to produce better results when increasing 'max_depth'.
+ * Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
+ * - prob_pick_by_full_gain
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
+ * that minimizes the pooled sums of variances of all columns (or a subset of them if using
+ * 'ncols_per_tree').
+ * In general, 'prob_pick_by_full_gain' is much slower to evaluate than the other gain types, and does not tend to
+ * lead to better results. When using 'prob_pick_by_full_gain', one might want to use a different scoring
+ * metric (particularly 'Density', 'BoxedDensity2' or 'BoxedRatio'). Note that
+ * the variance calculations are all done through the (exact) sorted-indices approach, which is much
+ * slower than the (approximate) histogram approach used by other decision tree software.
+ * Be aware that the data is not standardized in any way for the variance calculations, thus the scales
+ * of features will make a large difference under 'prob_pick_by_full_gain', which might not make it suitable for
+ * all types of data.
+ * 'prob_pick_by_full_gain' is not compatible with categorical data, and 'min_gain' does not apply to it.
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
+ * Default setting for [1], [2], [3], [4] is zero.
+ * - prob_pick_by_dens
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
+ * that maximizes the pooled densities of the branch distributions.
+ * The 'min_gain' option does not apply to this type of splits.
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
+ * Default setting for [1], [2], [3], [4] is zero.
+ * - prob_pick_col_by_range
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
+ * proportional to the range spanned by each column within a node as proposed in reference [12].
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
+ * probability proportional to the range spanned by each column within a node.
+ * This option is not compatible with categorical data. If passing column weights, the
+ * effect will be multiplicative.
+ * Be aware that the data is not standardized in any way for the range calculations, thus the scales
+ * of features will make a large difference under this option, which might not make it suitable for
+ * all types of data.
+ * Note that the proposed RRCF model from [12] uses a different scoring metric for producing anomaly
+ * scores, while this library uses isolation depth regardless of how columns are chosen, thus results
+ * are likely to be different from those of other software implementations. Nevertheless, as explored
+ * in [11], isolation depth as a scoring metric typically provides better results than the
+ * "co-displacement" metric from [12] under these split types.
+ * - prob_pick_col_by_var
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
+ * proportional to the variance of each column within a node.
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
+ * probability proportional to the variance of each column within a node.
+ * For categorical data, it will calculate the expected variance if the column were converted to
+ * numerical by assigning to each category a random number ~ Unif(0, 1), which depending on the number of
+ * categories and their distribution, produces numbers typically a bit smaller than standardized numerical
+ * variables.
+ * Note that when using sparse matrices, the calculation of variance will rely on a procedure that
+ * uses sums of squares, which has less numerical precision than the
+ * calculation used for dense inputs, and as such, the results might differ slightly.
+ * Be aware that this calculated variance is not standardized in any way, so the scales of
+ * features will make a large difference under this option.
+ * If passing column weights, the effect will be multiplicative.
+ * If passing a 'missing_action' different than 'Fail', infinite values will be ignored for the
+ * variance calculation. Otherwise, all columns with infinite values will be treated as having the
+ * same weight and will be chosen before columns with non-infinite values.
+ * - prob_pick_col_by_kurt
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
+ * proportional to the kurtosis of each column **within a node** (unlike the option 'weigh_by_kurt'
+ * which calculates this metric only at the root).
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
+ * probability proportional to the kurtosis of each column within a node.
+ * For categorical data, it will calculate the expected kurtosis if the column were converted to
+ * numerical by assigning to each category a random number ~ Unif(0, 1).
+ * Note that when using sparse matrices, the calculation of kurtosis will rely on a procedure that
+ * uses sums of squares and higher-power numbers, which has less numerical precision than the
+ * calculation used for dense inputs, and as such, the results might differ slightly.
+ * If passing column weights, the effect will be multiplicative. This option is not compatible
+ * with 'weigh_by_kurt'.
+ * If passing a 'missing_action' different than 'Fail', infinite values will be ignored for the
+ * kurtosis calculation. Otherwise, all columns with infinite values will have the same probability
+ * and will be chosen before columns with non-infinite values.
+ * If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
+ * in order not to favor columns with missing values (which would increase kurtosis by all having
+ * the same central value).
+ * Be aware that kurtosis can be a rather slow metric to calculate.
+ * - min_gain
+ * Minimum gain that a split threshold needs to produce in order to proceed with a split.
+ * Only used when the splits are decided by a variance gain criterion ('prob_pick_by_gain_pl' or
+ * 'prob_pick_by_gain_avg', but not 'prob_pick_by_full_gain' nor 'prob_pick_by_dens').
+ * If the highest possible gain in the evaluated splits at a node is below this threshold,
+ * that node becomes a terminal node.
+ * This can be used as a more sophisticated depth control when using pooled gain (note that 'max_depth'
+ * still applies on top of this heuristic).
+ * - missing_action
+ * How to handle missing data at both fitting and prediction time. Options are a) 'Divide' (for the single-variable
+ * model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
+ * the data that went to each branch when fitting the model, b) 'Impute', which will assign observations to the
+ * branch with the most observations in the single-variable model (but imputed values will also be used for
+ * gain calculations), or fill in missing values with the median of each column of the sample from which the
+ * split was made in the extended model (recommended) (but note that the calculation of medians does not take
+ * into account sample weights when using 'weights_as_sample_prob=false', and note that when using a gain
+ * criterion for splits with 'ndim=1', it will use the imputed values in the calculation), c) 'Fail', which will
+ * assume that there are no missing values and will trigger undefined behavior if it encounters any.
+ * In the extended model, infinite values will be treated as missing.
+ * Note that passing 'Fail' might crash the process if there turn out to be missing values, but will otherwise
+ * produce faster fitting and prediction times along with decreased model object sizes.
+ * Models from [1], [2], [3], [4] correspond to 'Fail' here.
+ * - cat_split_type
+ * Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
+ * a single category to a branch and the rest to the other branch. For the extended model, whether to
+ * give each category a coefficient, or only one while the rest get zero.
+ * - new_cat_action
+ * What to do after splitting a categorical feature when new data that reaches that split has categories that
+ * the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
+ * in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
+ * data that went to each branch when fitting the model, and in the extended model will assign
+ * them the median value for that column that was added to the linear combination of features (but note that
+ * this median calculation does not use sample weights when using 'weights_as_sample_prob=false'),
+ * b) "Smallest", which will assign all observations with unseen categories in the split to the branch that
+ * had fewer observations when fitting the model, c) "Random", which will assign a branch (coefficient in the
+ * extended model) at random for each category beforehand, even if no observations had that category when
+ * fitting the model. Ignored when passing 'cat_split_type' = 'SingleCateg'.
+ * - all_perm
+ * When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
+ * whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
+ * will sort the categories by their frequency and make a grouping in this sorted order. Note that the
+ * number of combinations evaluated (if 'true') is the factorial of the number of present categories in
+ * a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
+ * category in a separate branch, so not evaluating all permutations (passing 'false') will make it
+ * possible to select other splits that respect the sorted frequency order.
+ * The total number of combinations must be a number that can fit into a 'size_t' variable - for x86-64
+ * systems, this means no column can have more than 20 different categories if using 'all_perm=true',
+ * but note that this is not checked within the function.
+ * Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
+ * - coef_by_prop
+ * In the extended model, whether to sort the randomly-generated coefficients for categories
+ * according to their relative frequency in the tree node. This might provide better results when using
+ * categorical variables with too many categories, but is not recommended, and not reflective of
+ * real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
+ * variables.
+ * - imputer (out)
+ * Pointer to already-allocated imputer object, which can be used to produce missing value imputations
+ * in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
+ * 'missing_action' as missing values inside the model are treated differently and follow their own imputation
+ * or division strategy.
+ * - min_imp_obs
+ * Minimum number of observations with which an imputation value can be produced. Ignored if passing
+ * 'build_imputer' = 'false'.
+ * - depth_imp
+ * How to weight observations according to their depth when used for imputing missing values. Passing
+ * "Higher" will weigh observations higher the further down the tree (away from the root node) the
+ * terminal node is, while "Lower" will do the opposite, and "Same" will not modify the weights according
+ * to node depth in the tree. Implemented for testing purposes and not recommended to change
+ * from the default. Ignored when not passing 'impute_nodes'.
+ * - weigh_imp_rows
+ * How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
+ * a node inversely proportional to the number of observations that end up there, while "Proportional"
+ * will weigh them heavier the more observations there are, and "Flat" will weigh all nodes the same
+ * in this regard regardless of how many observations end up there. Implemented for testing purposes
+ * and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
+ * - impute_at_fit
+ * Whether to impute missing values in the input data as the model is being built. If passing 'true',
+ * then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
+ * 'categ_data', and 'Xc', will get overwritten with the imputations produced.
+ * - random_seed
+ * Seed that will be used to generate random numbers used by the model.
+ * - use_long_double
+ * Whether to use the 'long double' (extended precision) type for more precise calculations of
+ * standard deviations, means, ratios, weights, gain, and other potential aggregates. This makes
+ * such calculations accurate to a larger number of decimals (provided that the compiler used has
+ * wider long doubles than doubles) and it is highly recommended to use when the input data has
+ * a number of rows or columns exceeding 2^53 (an unlikely scenario), and also highly recommended
+ * to use when the input data has problematic scales (e.g. numbers that differ from each other by
+ * something like 10^-100 or columns that include values like 10^100 and 10^-100 and still need to
+ * be sensitive to a difference of 10^-100), but will make the calculations slower, the more so on
+ * platforms in which 'long double' is a software-emulated type (e.g. Power8 platforms).
+ * Note that some platforms (most notably Windows with the MSVC compiler) do not make any distinction
+ * between 'double' and 'long double'.
+ * - nthreads
+ * Number of parallel threads to use. Note that the more threads that are used, the more memory will be
+ * allocated, even if a thread does not end up being used.
+ * Be aware that most of the operations are bound by memory bandwidth, which means that
+ * adding more threads will not result in a linear speed-up. For some types of data
+ * (e.g. large sparse matrices with small sample sizes), adding more threads might result
+ * in only a very modest speed-up (e.g. 1.5x faster with 4x more threads),
+ * even if all threads look fully utilized.
+ * Ignored when not building with OpenMP support.
+ *
+ * Returns
+ * =======
+ * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
+ * If the process receives an interrupt signal, will return instead
+ * 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
+ * what these values correspond to, you can use the functions
+ * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
+ * as integers.
+ */
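
For illustration, the two accepted input layouts described above are easy to mix up when calling from C++ directly. Below is a minimal sketch (not part of the diff; the matrix values are made up, only the layout rules come from the documentation above) showing the same 3x2 numeric matrix both as a dense column-major array and as the 'Xc' + 'Xc_ind' + 'Xc_indptr' CSC triplet:

    #include <cstddef>

    /* Matrix with 3 rows and 2 numeric columns:
           col 0   col 1
            1.0     0.0
            0.0     5.0
            2.0     0.0                                              */

    /* Dense, column-major ("Fortran order"): all of column 0 first. */
    double numeric_data[] = {1.0, 0.0, 2.0,   /* column 0 */
                             0.0, 5.0, 0.0};  /* column 1 */

    /* The same matrix in CSC format: only the non-zeros are stored. */
    double Xc[]        = {1.0, 2.0, 5.0}; /* non-zero values, column by column */
    int    Xc_ind[]    = {0,   2,   1};   /* row of each value, sorted within each column */
    int    Xc_indptr[] = {0, 2, 3};       /* column 'col' spans Xc[Xc_indptr[col]..Xc_indptr[col+1]) */

Only one of the two representations would be passed to 'fit_iforest'; the pointers for the other would be NULL.
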
+ template <class real_t, class sparse_ix>
+ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+ real_t numeric_data[], size_t ncols_numeric,
+ int categ_data[], size_t ncols_categ, int ncat[],
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
+ size_t nrows, size_t sample_size, size_t ntrees,
+ size_t max_depth, size_t ncols_per_tree,
+ bool limit_depth, bool penalize_range, bool standardize_data,
+ ScoringMetric scoring_metric, bool fast_bratio,
+ bool standardize_dist, double tmat[],
+ double output_depths[], bool standardize_depth,
+ real_t col_weights[], bool weigh_by_kurt,
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
+ double prob_pick_col_by_kurt,
+ double min_gain, MissingAction missing_action,
+ CategSplit cat_split_type, NewCategAction new_cat_action,
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
+ uint64_t random_seed, bool use_long_double, int nthreads)
+ {
+ if (use_long_double && !has_long_double()) {
+ use_long_double = false;
+ fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
+ }
+ #ifndef NO_LONG_DOUBLE
+ if (likely(!use_long_double))
+ #endif
+ return fit_iforest_internal<real_t, sparse_ix, double>(
+ model_outputs, model_outputs_ext,
+ numeric_data, ncols_numeric,
+ categ_data, ncols_categ, ncat,
+ Xc, Xc_ind, Xc_indptr,
+ ndim, ntry, coef_type, coef_by_prop,
+ sample_weights, with_replacement, weight_as_sample,
+ nrows, sample_size, ntrees,
+ max_depth, ncols_per_tree,
+ limit_depth, penalize_range, standardize_data,
+ scoring_metric, fast_bratio,
+ standardize_dist, tmat,
+ output_depths, standardize_depth,
+ col_weights, weigh_by_kurt,
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
+ prob_pick_by_full_gain, prob_pick_by_dens,
+ prob_pick_col_by_range, prob_pick_col_by_var,
+ prob_pick_col_by_kurt,
+ min_gain, missing_action,
+ cat_split_type, new_cat_action,
+ all_perm, imputer, min_imp_obs,
+ depth_imp, weigh_imp_rows, impute_at_fit,
+ random_seed, nthreads
+ );
+ #ifndef NO_LONG_DOUBLE
+ else
+ return fit_iforest_internal<real_t, sparse_ix, long double>(
+ model_outputs, model_outputs_ext,
+ numeric_data, ncols_numeric,
+ categ_data, ncols_categ, ncat,
+ Xc, Xc_ind, Xc_indptr,
+ ndim, ntry, coef_type, coef_by_prop,
+ sample_weights, with_replacement, weight_as_sample,
+ nrows, sample_size, ntrees,
+ max_depth, ncols_per_tree,
+ limit_depth, penalize_range, standardize_data,
+ scoring_metric, fast_bratio,
+ standardize_dist, tmat,
+ output_depths, standardize_depth,
+ col_weights, weigh_by_kurt,
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
+ prob_pick_by_full_gain, prob_pick_by_dens,
+ prob_pick_col_by_range, prob_pick_col_by_var,
+ prob_pick_col_by_kurt,
+ min_gain, missing_action,
+ cat_split_type, new_cat_action,
+ all_perm, imputer, min_imp_obs,
+ depth_imp, weigh_imp_rows, impute_at_fit,
+ random_seed, nthreads
+ );
+ #endif
+ }
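
The wrapper above resolves the runtime flag 'use_long_double' into a compile-time template parameter ('ldouble_safe') once, so none of the code below it has to branch on precision again. A minimal, self-contained sketch of the same pattern (illustrative only, not library code):

```cpp
#include <cstdio>
#include <cstddef>

// The "internal" function receives the accumulator type as a template
// parameter, mirroring how 'fit_iforest_internal' receives 'ldouble_safe'.
template <class real_t, class ldouble_safe>
double mean_internal(const real_t x[], size_t n)
{
    ldouble_safe acc = 0;
    for (size_t i = 0; i < n; i++) acc += x[i];
    return (double)(acc / (ldouble_safe)n);
}

// The public wrapper turns a runtime flag into a compile-time type,
// just like 'fit_iforest' above.
template <class real_t>
double mean_dispatch(const real_t x[], size_t n, bool use_long_double)
{
    if (!use_long_double)
        return mean_internal<real_t, double>(x, n);
    else
        return mean_internal<real_t, long double>(x, n);
}

int main()
{
    float x[] = {1e8f, 1.f, -1e8f};
    std::printf("%g vs %g\n",
                mean_dispatch(x, 3, false),
                mean_dispatch(x, 3, true));
}
```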
+
+ template <class real_t, class sparse_ix, class ldouble_safe>
+ int fit_iforest_internal(
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+ real_t numeric_data[], size_t ncols_numeric,
+ int categ_data[], size_t ncols_categ, int ncat[],
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
+ size_t nrows, size_t sample_size, size_t ntrees,
+ size_t max_depth, size_t ncols_per_tree,
+ bool limit_depth, bool penalize_range, bool standardize_data,
+ ScoringMetric scoring_metric, bool fast_bratio,
+ bool standardize_dist, double tmat[],
+ double output_depths[], bool standardize_depth,
+ real_t col_weights[], bool weigh_by_kurt,
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
+ double prob_pick_col_by_kurt,
+ double min_gain, MissingAction missing_action,
+ CategSplit cat_split_type, NewCategAction new_cat_action,
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
+ uint64_t random_seed, int nthreads)
+ {
+ if (
+ prob_pick_by_gain_avg < 0 || prob_pick_by_gain_pl < 0 ||
+ prob_pick_by_full_gain < 0 || prob_pick_by_dens < 0 ||
+ prob_pick_col_by_range < 0 ||
+ prob_pick_col_by_var < 0 || prob_pick_col_by_kurt < 0
+ ) {
+ throw std::runtime_error("Cannot pass negative probabilities.\n");
+ }
+ if (prob_pick_col_by_range && ncols_categ)
+ throw std::runtime_error("'prob_pick_col_by_range' is not compatible with categorical data.\n");
+ if (prob_pick_by_full_gain && ncols_categ)
+ throw std::runtime_error("'prob_pick_by_full_gain' is not compatible with categorical data.\n");
+ if (prob_pick_col_by_kurt && weigh_by_kurt)
+ throw std::runtime_error("'weigh_by_kurt' and 'prob_pick_col_by_kurt' cannot be used together.\n");
+ if (ndim == 0 && model_outputs == NULL)
+ throw std::runtime_error("Must pass 'ndim>0' in the extended model.\n");
+ if (penalize_range &&
+ (scoring_metric == Density ||
+ scoring_metric == AdjDensity ||
+ is_boxed_metric(scoring_metric))
+ )
+ throw std::runtime_error("'penalize_range' is incompatible with density scoring.\n");
+ if (with_replacement) {
+ if (tmat != NULL)
+ throw std::runtime_error("Cannot calculate distance while sampling with replacement.\n");
+ if (output_depths != NULL)
+ throw std::runtime_error("Cannot make predictions at fit time when sampling with replacement.\n");
+ if (impute_at_fit)
+ throw std::runtime_error("Cannot impute at fit time when sampling with replacement.\n");
+ }
+ if (sample_size != 0 && sample_size < nrows) {
+ if (output_depths != NULL)
+ throw std::runtime_error("Cannot produce outlier scores at fit time when using sub-sampling.\n");
+ if (tmat != NULL)
+ throw std::runtime_error("Cannot calculate distances at fit time when using sub-sampling.\n");
+ if (impute_at_fit)
+ throw std::runtime_error("Cannot produce missing data imputations at fit time when using sub-sampling.\n");
+ }
+
+
+ /* TODO: this function should also accept the array as a memoryview with a
+ leading dimension that might not correspond to the number of columns,
+ so as to avoid having to make deep copies of memoryviews in python and to
+ allow using pointers to columns of dataframes in R and Python. */
+
+ /* calculate maximum number of categories to use later */
+ int max_categ = 0;
+ for (size_t col = 0; col < ncols_categ; col++)
+ max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
+
+ bool calc_dist = tmat != NULL;
+
+ if (sample_size == 0)
+ sample_size = nrows;
+
+ if (model_outputs != NULL)
+ ntry = std::min(ntry, ncols_numeric + ncols_categ);
+
+ if (ncols_per_tree == 0)
+ ncols_per_tree = ncols_numeric + ncols_categ;
+
+ /* put data in structs to shorten function calls */
+ InputData<real_t, sparse_ix>
+ input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
+ nrows, ncols_numeric + ncols_categ, sample_weights,
+ weight_as_sample, col_weights,
+ Xc, Xc_ind, Xc_indptr,
+ 0, 0, std::vector<double>(),
+ std::vector<char>(), 0, NULL,
+ (double*)NULL, (double*)NULL, (int*)NULL, std::vector<double>(),
+ std::vector<double>(), std::vector<double>(),
+ std::vector<size_t>(), std::vector<size_t>()};
+ ModelParams model_params = {with_replacement, sample_size, ntrees, ncols_per_tree,
+ limit_depth? log2ceil(sample_size) : max_depth? max_depth : (sample_size - 1),
+ penalize_range, standardize_data, random_seed, weigh_by_kurt,
+ prob_pick_by_gain_avg, prob_pick_by_gain_pl,
+ prob_pick_by_full_gain, prob_pick_by_dens,
+ prob_pick_col_by_range, prob_pick_col_by_var,
+ prob_pick_col_by_kurt,
+ min_gain, cat_split_type, new_cat_action, missing_action,
+ scoring_metric, fast_bratio, all_perm,
+ (model_outputs != NULL)? 0 : ndim, ntry,
+ coef_type, coef_by_prop, calc_dist, (bool)(output_depths != NULL), impute_at_fit,
+ depth_imp, weigh_imp_rows, min_imp_obs};
+
+ /* if calculating full gain, need to produce copies of the data in row-major order */
+ if (prob_pick_by_full_gain)
+ {
+ if (input_data.Xc_indptr == NULL)
+ colmajor_to_rowmajor(input_data.numeric_data, input_data.nrows, input_data.ncols_numeric, input_data.X_row_major);
+ else
+ colmajor_to_rowmajor(input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+ input_data.nrows, input_data.ncols_numeric,
+ input_data.Xr, input_data.Xr_ind, input_data.Xr_indptr);
+ }
+
+ /* if using weights as sampling probability, build a binary tree for faster sampling */
+ if (input_data.weight_as_sample && input_data.sample_weights != NULL)
+ {
+ build_btree_sampler(input_data.btree_weights_init, input_data.sample_weights,
+ input_data.nrows, input_data.log2_n, input_data.btree_offset);
+ }
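
'build_btree_sampler' above precomputes a flattened binary tree of cumulative weights so that each weighted row draw later costs O(log n) rather than O(n). A rough sketch of the underlying idea (hypothetical 'BTreeSampler', not the library's actual implementation):

```cpp
#include <vector>
#include <random>
#include <cstdio>
#include <cstddef>

// Draw indices with probability proportional to w[i] by walking a complete
// binary tree whose internal nodes store the weight sums of their subtrees.
struct BTreeSampler {
    std::vector<double> tree;   // implicit heap; leaves hold the weights
    size_t n_leaves, offset;

    explicit BTreeSampler(const std::vector<double> &w) {
        n_leaves = 1;
        while (n_leaves < w.size()) n_leaves *= 2;
        offset = n_leaves - 1;
        tree.assign(offset + n_leaves, 0.);
        for (size_t i = 0; i < w.size(); i++) tree[offset + i] = w[i];
        for (size_t i = offset; i-- > 0; )          // build sums bottom-up
            tree[i] = tree[2*i + 1] + tree[2*i + 2];
    }

    template <class RNG>
    size_t sample(RNG &rng) {
        double u = std::uniform_real_distribution<double>(0., tree[0])(rng);
        size_t ix = 0;
        while (ix < offset) {                        // descend to a leaf
            size_t left = 2*ix + 1;
            if (u < tree[left]) ix = left;
            else { u -= tree[left]; ix = left + 1; }
        }
        return ix - offset;
    }
};

int main()
{
    std::mt19937 rng(123);
    BTreeSampler sampler({0.1, 0.7, 0.2});
    for (int i = 0; i < 5; i++)
        std::printf("%zu ", sampler.sample(rng));   // mostly prints '1'
}
```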
+
+ /* same for column weights */
+ /* TODO: this should also save the kurtoses when using 'prob_pick_col_by_kurt' */
+ ColumnSampler<ldouble_safe> base_col_sampler;
+ if (
+ col_weights != NULL ||
+ (model_params.weigh_by_kurt && model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
+ (model_params.ncols_per_tree >= input_data.ncols_tot / (model_params.ntrees * 2)))
+ )
+ {
+ bool avoid_col_weights = (model_outputs != NULL && model_params.ntry >= model_params.ncols_per_tree &&
+ model_params.prob_pick_by_gain_avg + model_params.prob_pick_by_gain_pl +
+ model_params.prob_pick_by_full_gain + model_params.prob_pick_by_dens >= 1)
+ ||
+ (model_outputs == NULL && model_params.ndim >= model_params.ncols_per_tree)
+ ||
+ (model_params.ncols_per_tree == 1);
+ if (!avoid_col_weights)
+ {
+ if (model_params.weigh_by_kurt && model_params.sample_size == input_data.nrows && !model_params.with_replacement)
+ {
+ RNG_engine rnd_generator(random_seed);
+ std::vector<double> kurt_weights = calc_kurtosis_all_data<InputData<real_t, sparse_ix>, ldouble_safe>(input_data, model_params, rnd_generator);
+ if (col_weights != NULL)
+ {
+ for (size_t col = 0; col < input_data.ncols_tot; col++)
+ {
+ if (kurt_weights[col] <= 0) continue;
+ kurt_weights[col] *= col_weights[col];
+ kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
+ }
+ }
+ base_col_sampler.initialize(kurt_weights.data(), input_data.ncols_tot);
+
+ if (model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var)
+ {
+ input_data.all_kurtoses = std::move(kurt_weights);
+ }
+ }
+
+ else
+ {
+ base_col_sampler.initialize(input_data.col_weights, input_data.ncols_tot);
+ }
+
+ input_data.preinitialized_col_sampler = &base_col_sampler;
+ }
+ }
+
+ /* in some cases, all trees will need to calculate variable ranges for all columns */
+ /* TODO: the model might use 'leave_m_cols', or have 'prob_pick_col_by_range<1', in which
+ case it might not be beneficial to do this beforehand. Find out when the expected gain
+ from doing this here is not beneficial. */
+ /* TODO: move this to a different file, it doesn't belong here */
+ std::vector<double> variable_ranges_low;
+ std::vector<double> variable_ranges_high;
+ std::vector<int> variable_ncats;
+ if (
+ model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
+ (model_params.ncols_per_tree >= input_data.ncols_numeric) &&
+ ((model_params.prob_pick_col_by_range && input_data.ncols_numeric)
+ ||
+ is_boxed_metric(model_params.scoring_metric))
+ )
+ {
+ variable_ranges_low.resize(input_data.ncols_numeric);
+ variable_ranges_high.resize(input_data.ncols_numeric);
+
+ std::unique_ptr<unsigned char[]> buffer_cats;
+ size_t adj_col;
+ if (is_boxed_metric(model_params.scoring_metric))
+ {
+ variable_ncats.resize(input_data.ncols_categ);
+ buffer_cats = std::unique_ptr<unsigned char[]>(new unsigned char[input_data.max_categ]);
+ }
+
+ if (base_col_sampler.col_indices.empty())
+ base_col_sampler.initialize(input_data.ncols_tot);
+
+ bool unsplittable;
+ size_t n_tried_numeric = 0;
+ size_t col;
+ base_col_sampler.prepare_full_pass();
+ while (base_col_sampler.sample_col(col))
+ {
+ if (col < input_data.ncols_numeric)
+ {
+ if (input_data.Xc_indptr == NULL)
+ {
+ get_range(input_data.numeric_data + nrows*col,
+ input_data.nrows,
+ model_params.missing_action,
+ variable_ranges_low[col],
+ variable_ranges_high[col],
+ unsplittable);
+ }
+
+ else
+ {
+ get_range(col, input_data.nrows,
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+ model_params.missing_action,
+ variable_ranges_low[col],
+ variable_ranges_high[col],
+ unsplittable);
+ }
+
+ n_tried_numeric++;
+
+ if (unsplittable)
+ {
+ variable_ranges_low[col] = 0;
+ variable_ranges_high[col] = 0;
+ base_col_sampler.drop_col(col);
+ }
+ }
+
+ else
+ {
+ if (!is_boxed_metric(model_params.scoring_metric))
+ {
+ if (n_tried_numeric >= input_data.ncols_numeric)
+ break;
+ else
+ continue;
+ }
+ adj_col = col - input_data.ncols_numeric;
+
+
+ variable_ncats[adj_col] = count_ncateg_in_col(input_data.categ_data + input_data.nrows*adj_col,
+ input_data.nrows, input_data.ncat[adj_col],
+ buffer_cats.get());
+ if (variable_ncats[adj_col] <= 1)
+ base_col_sampler.drop_col(col);
+ }
+ }
+
+ input_data.preinitialized_col_sampler = &base_col_sampler;
+ if (input_data.ncols_numeric) {
+ input_data.range_low = variable_ranges_low.data();
+ input_data.range_high = variable_ranges_high.data();
+ }
+ if (input_data.ncols_categ) {
+ input_data.ncat_ = variable_ncats.data();
+ }
+ }
+
+ /* if imputing missing values on-the-fly, need to determine which are missing */
+ std::vector<ImputedData<sparse_ix, ldouble_safe>> impute_vec;
+ hashed_map<size_t, ImputedData<sparse_ix, ldouble_safe>> impute_map;
+ if (model_params.impute_at_fit)
+ check_for_missing(input_data, impute_vec, impute_map, nthreads);
+
+ /* store model data */
+ if (model_outputs != NULL)
+ {
+ model_outputs->trees.resize(ntrees);
+ model_outputs->trees.shrink_to_fit();
+ model_outputs->new_cat_action = new_cat_action;
+ model_outputs->cat_split_type = cat_split_type;
+ model_outputs->missing_action = missing_action;
+ model_outputs->scoring_metric = scoring_metric;
+ if (
+ model_outputs->scoring_metric != Density &&
+ model_outputs->scoring_metric != BoxedDensity &&
+ model_outputs->scoring_metric != BoxedDensity2 &&
+ model_outputs->scoring_metric != BoxedRatio
+ )
+ model_outputs->exp_avg_depth = expected_avg_depth<ldouble_safe>(sample_size);
+ else
+ model_outputs->exp_avg_depth = 1;
+ model_outputs->exp_avg_sep = expected_separation_depth<ldouble_safe>(model_params.sample_size);
+ model_outputs->orig_sample_size = input_data.nrows;
+ model_outputs->has_range_penalty = penalize_range;
+ }
+
+ else
+ {
+ model_outputs_ext->hplanes.resize(ntrees);
+ model_outputs_ext->hplanes.shrink_to_fit();
+ model_outputs_ext->new_cat_action = new_cat_action;
+ model_outputs_ext->cat_split_type = cat_split_type;
+ model_outputs_ext->missing_action = missing_action;
+ model_outputs_ext->scoring_metric = scoring_metric;
+ if (
+ model_outputs_ext->scoring_metric != Density &&
+ model_outputs_ext->scoring_metric != BoxedDensity &&
+ model_outputs_ext->scoring_metric != BoxedDensity2 &&
+ model_outputs_ext->scoring_metric != BoxedRatio
+ )
+ model_outputs_ext->exp_avg_depth = expected_avg_depth<ldouble_safe>(sample_size);
+ else
+ model_outputs_ext->exp_avg_depth = 1;
+ model_outputs_ext->exp_avg_sep = expected_separation_depth<ldouble_safe>(model_params.sample_size);
+ model_outputs_ext->orig_sample_size = input_data.nrows;
+ model_outputs_ext->has_range_penalty = penalize_range;
+ }
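
'expected_avg_depth' supplies the normalizing constant from the original isolation-forest paper: the expected depth of an unsuccessful binary-search-tree lookup over n points, c(n) = 2·H(n−1) − 2·(n−1)/n, where H(k) is the k-th harmonic number, H(k) ≈ ln(k) + γ. A quick approximation (sketch only; the library's own routine may use exact values for small n):

```cpp
#include <cmath>

// c(n): expected isolation depth for a sample of size n (Liu et al., 2008).
// Harmonic number approximated as H(k) ~ ln(k) + Euler-Mascheroni constant.
double expected_avg_depth_approx(double n)
{
    if (n <= 1.) return 0.;
    const double euler_gamma = 0.5772156649015329;
    double harmonic = std::log(n - 1.) + euler_gamma;
    return 2. * harmonic - 2. * (n - 1.) / n;
}
```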
+
+ if (imputer != NULL)
+ initialize_imputer<decltype(input_data), ldouble_safe>(
+ *imputer, input_data, ntrees, nthreads
+ );
+
+ /* initialize thread-private memory */
+ if ((size_t)nthreads > ntrees)
+ nthreads = (int)ntrees;
+ #ifdef _OPENMP
+ std::vector<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> worker_memory(nthreads);
+ #else
+ std::vector<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> worker_memory(1);
+ #endif
+
+ /* Global variable that determines if the procedure receives a stop signal */
+ SignalSwitcher ss = SignalSwitcher();
+
+ /* For exception handling */
+ bool threw_exception = false;
+ std::exception_ptr ex = NULL;
+
+ /* grow trees */
+ #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params, threw_exception, ex)
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
+ {
+ if (interrupt_switch || threw_exception)
+ continue; /* Cannot break with OpenMP==2.0 (MSVC) */
+
+ try
+ {
+ if (
+ model_params.impute_at_fit &&
+ input_data.n_missing &&
+ !worker_memory[omp_get_thread_num()].impute_vec.size() &&
+ !worker_memory[omp_get_thread_num()].impute_map.size()
+ )
+ {
+ #ifdef _OPENMP
+ if (nthreads > 1)
+ {
+ worker_memory[omp_get_thread_num()].impute_vec = impute_vec;
+ worker_memory[omp_get_thread_num()].impute_map = impute_map;
+ }
+
+ else
+ #endif
+ {
+ worker_memory[0].impute_vec = std::move(impute_vec);
+ worker_memory[0].impute_map = std::move(impute_map);
+ }
+ }
+
+ fit_itree<decltype(input_data), typename std::remove_pointer<decltype(worker_memory.data())>::type, ldouble_safe>(
+ (model_outputs != NULL)? &model_outputs->trees[tree] : NULL,
+ (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : NULL,
+ worker_memory[omp_get_thread_num()],
+ input_data,
+ model_params,
+ (imputer != NULL)? &(imputer->imputer_tree[tree]) : NULL,
+ tree);
+
+ if ((model_outputs != NULL))
+ model_outputs->trees[tree].shrink_to_fit();
+ else
+ model_outputs_ext->hplanes[tree].shrink_to_fit();
+ }
+
+ catch (...)
+ {
+ #pragma omp critical
+ {
+ if (!threw_exception)
+ {
+ threw_exception = true;
+ ex = std::current_exception();
+ }
+ }
+ }
+ }
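
Exceptions cannot propagate out of an OpenMP parallel region, so the loop above records the first one in an 'std::exception_ptr' under a critical section, lets the remaining iterations fall through, and rethrows after the region ends. The same pattern in isolation:

```cpp
#include <exception>
#include <stdexcept>
#include <cstdio>

int main()
{
    bool threw_exception = false;
    std::exception_ptr ex = nullptr;

    #pragma omp parallel for shared(threw_exception, ex)
    for (int i = 0; i < 100; i++)
    {
        if (threw_exception) continue;      // cannot 'break' an OpenMP loop
        try {
            if (i == 42) throw std::runtime_error("failed on iteration 42");
        }
        catch (...) {
            #pragma omp critical
            {
                if (!threw_exception) {     // keep only the first exception
                    threw_exception = true;
                    ex = std::current_exception();
                }
            }
        }
    }

    if (threw_exception) {
        try { std::rethrow_exception(ex); }
        catch (const std::exception &e) { std::printf("%s\n", e.what()); }
    }
}
```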
+
+ /* check if the procedure got interrupted */
+ check_interrupt_switch(ss);
+ #if defined(DONT_THROW_ON_INTERRUPT)
+ if (interrupt_switch) return EXIT_FAILURE;
+ #endif
+
+ /* check if some exception was thrown */
+ if (threw_exception)
+ std::rethrow_exception(ex);
+
+ if ((model_outputs != NULL))
+ model_outputs->trees.shrink_to_fit();
+ else
+ model_outputs_ext->hplanes.shrink_to_fit();
+
+ /* if calculating similarity/distance, now need to reduce and average */
+ if (calc_dist)
+ gather_sim_result< PredictionData<real_t, sparse_ix>, InputData<real_t, sparse_ix> >
+ (NULL, &worker_memory,
+ NULL, &input_data,
+ model_outputs, model_outputs_ext,
+ tmat, NULL, 0,
+ model_params.ntrees, false,
+ standardize_dist, false, nthreads);
+
+ check_interrupt_switch(ss);
+ #if defined(DONT_THROW_ON_INTERRUPT)
+ if (interrupt_switch) return EXIT_FAILURE;
+ #endif
+
+ /* same for depths */
+ if (output_depths != NULL)
+ {
+ #ifdef _OPENMP
+ if (nthreads > 1)
+ {
+ for (auto &w : worker_memory)
+ {
+ if (w.row_depths.size())
+ {
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data, output_depths, w, worker_memory)
+ for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
+ output_depths[row] += w.row_depths[row];
+ }
+ }
+ }
+ else
+ #endif
+ {
+ std::copy(worker_memory[0].row_depths.begin(), worker_memory[0].row_depths.end(), output_depths);
+ }
+
+ if (standardize_depth)
+ {
+ double depth_divisor = (double)ntrees * ((model_outputs != NULL)?
+ model_outputs->exp_avg_depth : model_outputs_ext->exp_avg_depth);
+ for (size_t row = 0; row < nrows; row++)
+ output_depths[row] = std::exp2( - output_depths[row] / depth_divisor );
+ }
+
+ else
+ {
+ double ntrees_dbl = (double) ntrees;
+ for (size_t row = 0; row < nrows; row++)
+ output_depths[row] /= ntrees_dbl;
+ }
+ }
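
The standardized output is the usual isolation-forest anomaly score s(x) = 2^(−E[h(x)] / c(n)): depths were accumulated across trees, so dividing by ntrees·c(n) both averages them and normalizes by the expected depth. Scores near 1 indicate outliers; scores well below 0.5 indicate typical points. The same step in isolation (sketch):

```cpp
#include <cmath>
#include <cstddef>

// Turn per-row depth sums (accumulated over 'ntrees' trees) into
// standardized outlier scores in (0, 1].
void standardize_depths(double depths[], size_t nrows,
                        size_t ntrees, double exp_avg_depth)
{
    double divisor = (double)ntrees * exp_avg_depth;
    for (size_t row = 0; row < nrows; row++)
        depths[row] = std::exp2(-depths[row] / divisor);
}
```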
+
+ check_interrupt_switch(ss);
+ #if defined(DONT_THROW_ON_INTERRUPT)
+ if (interrupt_switch) return EXIT_FAILURE;
+ #endif
+
+ /* if imputing missing values, now need to reduce and write final values */
+ if (model_params.impute_at_fit)
+ {
+ #ifdef _OPENMP
+ if (nthreads > 1)
+ {
+ for (auto &w : worker_memory)
+ combine_tree_imputations(w, impute_vec, impute_map, input_data.has_missing, nthreads);
+ }
+
+ else
+ #endif
+ {
+ impute_vec = std::move(worker_memory[0].impute_vec);
+ impute_map = std::move(worker_memory[0].impute_map);
+ }
+
+ apply_imputation_results(impute_vec, impute_map, *imputer, input_data, nthreads);
+ }
+
+ check_interrupt_switch(ss);
+ #if defined(DONT_THROW_ON_INTERRUPT)
+ if (interrupt_switch) return EXIT_FAILURE;
+ #endif
+
+ return EXIT_SUCCESS;
+ }
+
+
+ /* Add additional trees to already-fitted isolation forest model
+ *
+ * Parameters
+ * ==========
+ * - model_outputs
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
+ * if the trees are to be added to an extended model. Can only pass one of
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
+ * so it cannot be run in parallel for the same model object.
+ * - model_outputs_ext
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
+ * if the trees are to be added to a single-variable model. Can only pass one of
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
+ * so it cannot be run in parallel for the same model object.
+ * - numeric_data[nrows * ncols_numeric]
+ * Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no dense numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
+ * If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
+ * of columns, either as dense or as sparse arrays.
+ * - ncols_numeric
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - categ_data[nrows * ncols_categ]
+ * Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
+ * Pass NULL if there are no categorical columns.
+ * Each category should be represented as an integer, and these integers must start at zero and
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
+ * must be the same as was used in the data to which the model was fit.
+ * If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
+ * of columns and the same category encoding.
+ * - ncols_categ
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - ncat[ncols_categ]
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). May contain new categories,
+ * but should keep the same encodings that were used for previous categories.
+ * - Xc[nnz]
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
+ * Pass NULL if there are no sparse numeric columns.
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr' (a small CSC example
+ * is sketched after the function below).
+ * - Xc_ind[nnz]
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
+ * Must be in sorted order, otherwise results will be incorrect.
+ * Pass NULL if there are no sparse numeric columns.
+ * - Xc_indptr[ncols_numeric + 1]
+ * Pointer to column index pointers that indicate at entry [col] where column 'col'
+ * starts and at entry [col + 1] where it ends.
+ * Pass NULL if there are no sparse numeric columns.
+ * - ndim
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - ntry
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - coef_type
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - sample_weights
+ * Weights for the rows when adding this tree, either as sampling importances when using
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
+ * in a random sub-sample), or as a density measurement (i.e. passing weight '2' is the same as if
+ * the row appeared twice, thus it's less of an outlier) - how this is interpreted is determined
+ * through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
+ * Pass NULL if the rows all have uniform weights.
+ * - nrows
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
+ * - max_depth
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - ncols_per_tree
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - limit_depth
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - penalize_range
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - standardize_data
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - fast_bratio
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - col_weights
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
+ * Ignored when picking columns by a deterministic criterion.
+ * If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
+ * effect is multiplicative.
+ * - weigh_by_kurt
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_by_gain_pl
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_by_gain_avg
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_by_full_gain
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_by_dens
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_col_by_range
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_col_by_var
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - prob_pick_col_by_kurt
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - min_gain
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - missing_action
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - cat_split_type
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - new_cat_action
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - depth_imp
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - weigh_imp_rows
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Cannot be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - all_perm
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - coef_by_prop
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - imputer
+ * Pointer to already-allocated imputer object, as it was output from function 'fit_model' while
+ * producing either 'model_outputs' or 'model_outputs_ext'.
+ * Pass NULL if the model was built without an imputer.
+ * - min_imp_obs
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ * - indexer
+ * Indexer object associated with the model object ('model_outputs' or 'model_outputs_ext'), which will
+ * be updated with the new tree to add.
+ * If 'indexer' has reference points, these must be passed again here in order to index them.
+ * Pass NULL if the model has no associated indexer.
+ * - ref_numeric_data[nref * ncols_numeric]
+ * Pointer to numeric data for reference points. May be ordered by rows
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
+ * (see parameter 'ref_is_col_major').
+ * Pass NULL if there are no dense numeric columns or no reference points.
+ * Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
+ * If 'indexer' is passed and has reference points, and the data to which the model was fit had
+ * numeric columns, then numeric data for reference points must be passed (in either dense or sparse format).
+ * - ref_categ_data[nref * ncols_categ]
+ * Pointer to categorical data for reference points. May be ordered by rows
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
+ * (see parameter 'ref_is_col_major').
+ * Pass NULL if there are no categorical columns or no reference points.
+ * If 'indexer' is passed and has reference points, and the data to which the model was fit had
+ * categorical columns, then 'ref_categ_data' must be passed.
+ * - ref_is_col_major
+ * Whether 'ref_numeric_data' and/or 'ref_categ_data' are in column-major order. If numeric data is
+ * passed in sparse format, categorical data must be passed in column-major format. If passing dense
+ * data, row-major format is preferred as it will be faster. If the data is passed in row-major format,
+ * must also pass 'ref_ld_numeric' and/or 'ref_ld_categ'.
+ * If both 'ref_numeric_data' and 'ref_categ_data' are passed, they must have the same orientation
+ * (row-major or column-major).
+ * - ref_ld_numeric
+ * Leading dimension of the array 'ref_numeric_data', if it is passed in row-major format.
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
+ * be accessed assuming that row 'n' starts at 'ref_numeric_data + n*ref_ld_numeric'). If passing
+ * 'ref_numeric_data' in column-major order, this is ignored and it will be assumed that the
+ * leading dimension corresponds to the number of rows. This is ignored when passing numeric
+ * data in sparse format.
+ * - ref_ld_categ
+ * Leading dimension of the array 'ref_categ_data', if it is passed in row-major format.
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
+ * be accessed assuming that row 'n' starts at 'ref_categ_data + n*ref_ld_categ'). If passing
+ * 'ref_categ_data' in column-major order, this is ignored and it will be assumed that the
+ * leading dimension corresponds to the number of rows.
+ * - ref_Xc[ref_nnz]
+ * Pointer to numeric data for reference points in sparse numeric matrix in CSC format (column-compressed).
+ * Pass NULL if there are no sparse numeric columns for reference points or no reference points.
+ * Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
+ * - ref_Xc_ind[ref_nnz]
+ * Pointer to row indices to which each non-zero entry in 'ref_Xc' corresponds.
+ * Must be in sorted order, otherwise results will be incorrect.
+ * Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
+ * - ref_Xc_indptr[ncols_numeric + 1]
+ * Pointer to column index pointers that indicate at entry [col] where column 'col'
+ * starts and at entry [col + 1] where it ends.
+ * Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
+ * - random_seed
+ * Seed that will be used to generate random numbers used by the model.
+ * - use_long_double
+ * Same parameter as for 'fit_iforest' (see the documentation there for details). Can be changed from
+ * what was originally passed to 'fit_iforest'.
+ */
+ template <class real_t, class sparse_ix>
+ int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+ real_t numeric_data[], size_t ncols_numeric,
+ int categ_data[], size_t ncols_categ, int ncat[],
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
+ real_t sample_weights[], size_t nrows,
+ size_t max_depth, size_t ncols_per_tree,
+ bool limit_depth, bool penalize_range, bool standardize_data,
+ bool fast_bratio,
+ real_t col_weights[], bool weigh_by_kurt,
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
+ double prob_pick_col_by_kurt,
+ double min_gain, MissingAction missing_action,
+ CategSplit cat_split_type, NewCategAction new_cat_action,
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
+ TreesIndexer *indexer,
+ real_t ref_numeric_data[], int ref_categ_data[],
+ bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
+ real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
+ uint64_t random_seed, bool use_long_double)
+ {
+ if (use_long_double && !has_long_double()) {
+ use_long_double = false;
+ fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
+ }
+ #ifndef NO_LONG_DOUBLE
+ if (likely(!use_long_double))
+ #endif
+ return add_tree_internal<real_t, sparse_ix, double>(
+ model_outputs, model_outputs_ext,
+ numeric_data, ncols_numeric,
+ categ_data, ncols_categ, ncat,
+ Xc, Xc_ind, Xc_indptr,
+ ndim, ntry, coef_type, coef_by_prop,
+ sample_weights, nrows,
+ max_depth, ncols_per_tree,
+ limit_depth, penalize_range, standardize_data,
+ fast_bratio,
+ col_weights, weigh_by_kurt,
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
+ prob_pick_by_full_gain, prob_pick_by_dens,
+ prob_pick_col_by_range, prob_pick_col_by_var,
+ prob_pick_col_by_kurt,
+ min_gain, missing_action,
+ cat_split_type, new_cat_action,
+ depth_imp, weigh_imp_rows,
+ all_perm, imputer, min_imp_obs,
+ indexer,
+ ref_numeric_data, ref_categ_data,
+ ref_is_col_major, ref_ld_numeric, ref_ld_categ,
+ ref_Xc, ref_Xc_ind, ref_Xc_indptr,
+ random_seed
+ );
+ #ifndef NO_LONG_DOUBLE
+ else
+ return add_tree_internal<real_t, sparse_ix, long double>(
+ model_outputs, model_outputs_ext,
+ numeric_data, ncols_numeric,
+ categ_data, ncols_categ, ncat,
+ Xc, Xc_ind, Xc_indptr,
+ ndim, ntry, coef_type, coef_by_prop,
+ sample_weights, nrows,
+ max_depth, ncols_per_tree,
+ limit_depth, penalize_range, standardize_data,
+ fast_bratio,
+ col_weights, weigh_by_kurt,
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
+ prob_pick_by_full_gain, prob_pick_by_dens,
+ prob_pick_col_by_range, prob_pick_col_by_var,
+ prob_pick_col_by_kurt,
+ min_gain, missing_action,
+ cat_split_type, new_cat_action,
+ depth_imp, weigh_imp_rows,
+ all_perm, imputer, min_imp_obs,
+ indexer,
+ ref_numeric_data, ref_categ_data,
+ ref_is_col_major, ref_ld_numeric, ref_ld_categ,
+ ref_Xc, ref_Xc_ind, ref_Xc_indptr,
+ random_seed
+ );
+ #endif
+ }
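
For reference, the 'Xc' + 'Xc_ind' + 'Xc_indptr' triplet described in the doc comment above is ordinary CSC: the non-zero values, their row indices, and per-column start offsets. A small worked example (illustrative data only):

```cpp
#include <cstddef>

// A 4x3 sparse matrix in CSC format:
//
//         col0  col1  col2
//   row0   1.0    .     .
//   row1    .     .    3.0
//   row2   2.0    .     .
//   row3    .     .    4.0
//
// Column 'col' occupies entries [Xc_indptr[col], Xc_indptr[col+1]).
double Xc[]        = {1.0, 2.0, 3.0, 4.0};  // non-zero values
int    Xc_ind[]    = {0,   2,   1,   3};    // row of each value, sorted within each column
int    Xc_indptr[] = {0, 2, 2, 4};          // 'ncols_numeric + 1' entries; col1 is empty
size_t nrows = 4, ncols_numeric = 3;
```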
+
+ template <class real_t, class sparse_ix, class ldouble_safe>
+ int add_tree_internal(
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+ real_t numeric_data[], size_t ncols_numeric,
+ int categ_data[], size_t ncols_categ, int ncat[],
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
+ real_t sample_weights[], size_t nrows,
+ size_t max_depth, size_t ncols_per_tree,
+ bool limit_depth, bool penalize_range, bool standardize_data,
+ bool fast_bratio,
+ real_t col_weights[], bool weigh_by_kurt,
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
+ double prob_pick_col_by_kurt,
+ double min_gain, MissingAction missing_action,
+ CategSplit cat_split_type, NewCategAction new_cat_action,
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
+ TreesIndexer *indexer,
+ real_t ref_numeric_data[], int ref_categ_data[],
+ bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
+ real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
+ uint64_t random_seed)
+ {
+ if (
+ prob_pick_by_gain_avg < 0 || prob_pick_by_gain_pl < 0 ||
+ prob_pick_by_full_gain < 0 || prob_pick_by_dens < 0 ||
+ prob_pick_col_by_range < 0 ||
+ prob_pick_col_by_var < 0 || prob_pick_col_by_kurt < 0
+ ) {
+ throw std::runtime_error("Cannot pass negative probabilities.\n");
+ }
+ if (prob_pick_col_by_range && ncols_categ)
+ throw std::runtime_error("'prob_pick_col_by_range' is not compatible with categorical data.\n");
+ if (prob_pick_by_full_gain && ncols_categ)
+ throw std::runtime_error("'prob_pick_by_full_gain' is not compatible with categorical data.\n");
+ if (prob_pick_col_by_kurt && weigh_by_kurt)
+ throw std::runtime_error("'weigh_by_kurt' and 'prob_pick_col_by_kurt' cannot be used together.\n");
+ if (ndim == 0 && model_outputs == NULL)
+ throw std::runtime_error("Must pass 'ndim>0' in the extended model.\n");
+ if (indexer != NULL && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty()) {
+ if (ref_numeric_data == NULL && ref_categ_data == NULL && ref_Xc_indptr == NULL)
+ throw std::runtime_error("'indexer' has reference points. Those points must be passed to index them in the new tree to add.\n");
+ }
+
+ std::vector<ImputeNode> *impute_nodes = NULL;
+
+ int max_categ = 0;
+ for (size_t col = 0; col < ncols_categ; col++)
+ max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
+
+ if (model_outputs != NULL)
+ ntry = std::min(ntry, ncols_numeric + ncols_categ);
+
+ if (ncols_per_tree == 0)
+ ncols_per_tree = ncols_numeric + ncols_categ;
+
+ if (indexer != NULL && indexer->indices.empty())
+ indexer = NULL;
+
+ InputData<real_t, sparse_ix>
+ input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
+ nrows, ncols_numeric + ncols_categ, sample_weights,
+ false, col_weights,
+ Xc, Xc_ind, Xc_indptr,
+ 0, 0, std::vector<double>(),
+ std::vector<char>(), 0, NULL,
+ (double*)NULL, (double*)NULL, (int*)NULL, std::vector<double>(),
+ std::vector<double>(), std::vector<double>(),
+ std::vector<size_t>(), std::vector<size_t>()};
+ ModelParams model_params = {false, nrows, (size_t)1, ncols_per_tree,
+ max_depth? max_depth : (nrows - 1),
+ penalize_range, standardize_data, random_seed, weigh_by_kurt,
+ prob_pick_by_gain_avg, prob_pick_by_gain_pl,
+ prob_pick_by_full_gain, prob_pick_by_dens,
+ prob_pick_col_by_range, prob_pick_col_by_var,
+ prob_pick_col_by_kurt,
+ min_gain, cat_split_type, new_cat_action, missing_action,
+ (model_outputs != NULL)? model_outputs->scoring_metric : model_outputs_ext->scoring_metric,
+ fast_bratio, all_perm,
+ (model_outputs != NULL)? 0 : ndim, ntry,
+ coef_type, coef_by_prop, false, false, false, depth_imp, weigh_imp_rows, min_imp_obs};
+
+ if (prob_pick_by_full_gain)
+ {
+ if (input_data.Xc_indptr == NULL)
+ colmajor_to_rowmajor(input_data.numeric_data, input_data.nrows, input_data.ncols_numeric, input_data.X_row_major);
+ else
+ colmajor_to_rowmajor(input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+ input_data.nrows, input_data.ncols_numeric,
+ input_data.Xr, input_data.Xr_ind, input_data.Xr_indptr);
+ }
+
+ std::unique_ptr<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> workspace(
+ new WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>()
+ );
+
+ size_t last_tree;
+ bool added_tree = false;
+ try
+ {
+ if (model_outputs != NULL)
+ {
+ last_tree = model_outputs->trees.size();
+ model_outputs->trees.emplace_back();
+ added_tree = true;
+ }
+
+ else
+ {
+ last_tree = model_outputs_ext->hplanes.size();
+ model_outputs_ext->hplanes.emplace_back();
+ added_tree = true;
+ }
+
+ if (imputer != NULL)
+ {
+ imputer->imputer_tree.emplace_back();
+ impute_nodes = &(imputer->imputer_tree.back());
+ }
+
+ if (indexer != NULL)
+ {
+ indexer->indices.emplace_back();
+ }
+
+ SignalSwitcher ss = SignalSwitcher();
+ check_interrupt_switch(ss);
+
+ fit_itree<decltype(input_data), typename std::remove_pointer<decltype(workspace.get())>::type, ldouble_safe>(
+ (model_outputs != NULL)? &model_outputs->trees.back() : NULL,
+ (model_outputs_ext != NULL)? &model_outputs_ext->hplanes.back() : NULL,
+ *workspace,
+ input_data,
+ model_params,
+ impute_nodes,
+ last_tree);
+
+ check_interrupt_switch(ss);
+
+ if (model_outputs != NULL) {
+ model_outputs->trees.back().shrink_to_fit();
+ model_outputs->has_range_penalty = model_outputs->has_range_penalty || penalize_range;
+ }
+ else {
+ model_outputs_ext->hplanes.back().shrink_to_fit();
+ model_outputs_ext->has_range_penalty = model_outputs_ext->has_range_penalty || penalize_range;
+ }
+
+ if (imputer != NULL)
+ imputer->imputer_tree.back().shrink_to_fit();
+
+ if (indexer != NULL)
+ {
+ if (model_outputs != NULL)
+ build_terminal_node_mappings_single_tree(indexer->indices.back().terminal_node_mappings,
+ indexer->indices.back().n_terminal,
+ model_outputs->trees.back());
+ else
+ build_terminal_node_mappings_single_tree(indexer->indices.back().terminal_node_mappings,
+ indexer->indices.back().n_terminal,
+ model_outputs_ext->hplanes.back());
+
+ check_interrupt_switch(ss);
+
+
+ if (!indexer->indices.front().node_distances.empty())
+ {
+ std::vector<size_t> temp;
+ temp.reserve(indexer->indices.back().n_terminal);
+ if (model_outputs != NULL) {
+ build_dindex(
+ temp,
+ indexer->indices.back().terminal_node_mappings,
+ indexer->indices.back().node_distances,
+ indexer->indices.back().node_depths,
+ indexer->indices.back().n_terminal,
+ model_outputs->trees.back()
+ );
+ }
+ else {
+ build_dindex(
+ temp,
+ indexer->indices.back().terminal_node_mappings,
+ indexer->indices.back().node_distances,
+ indexer->indices.back().node_depths,
+ indexer->indices.back().n_terminal,
+ model_outputs_ext->hplanes.back()
+ );
+ }
+ }
+
+ check_interrupt_switch(ss);
+ if (!indexer->indices.front().reference_points.empty())
+ {
+ size_t n_ref = indexer->indices.front().reference_points.size();
+ std::vector<sparse_ix> terminal_indices(n_ref);
+ std::unique_ptr<double[]> ignored(new double[n_ref]);
+ if (model_outputs != NULL)
+ {
+ IsoForest single_tree_model;
+ single_tree_model.new_cat_action = model_outputs->new_cat_action;
+ single_tree_model.cat_split_type = model_outputs->cat_split_type;
+ single_tree_model.missing_action = model_outputs->missing_action;
+ single_tree_model.trees.push_back(model_outputs->trees.back());
+
+ predict_iforest(ref_numeric_data, ref_categ_data,
+ ref_is_col_major, ref_ld_numeric, ref_ld_categ,
+ ref_Xc, ref_Xc_ind, ref_Xc_indptr,
+ (real_t*)NULL, (sparse_ix*)NULL, (sparse_ix*)NULL,
+ n_ref, 1, false,
+ &single_tree_model, (ExtIsoForest*)NULL,
+ ignored.get(), terminal_indices.data(),
+ (double*)NULL,
+ indexer);
+ }
+
+ else
+ {
+ ExtIsoForest single_tree_model;
+ single_tree_model.new_cat_action = model_outputs_ext->new_cat_action;
+ single_tree_model.cat_split_type = model_outputs_ext->cat_split_type;
+ single_tree_model.missing_action = model_outputs_ext->missing_action;
+ single_tree_model.hplanes.push_back(model_outputs_ext->hplanes.back());
+
+ predict_iforest(ref_numeric_data, ref_categ_data,
+ ref_is_col_major, ref_ld_numeric, ref_ld_categ,
+ ref_Xc, ref_Xc_ind, ref_Xc_indptr,
+ (real_t*)NULL, (sparse_ix*)NULL, (sparse_ix*)NULL,
+ n_ref, 1, false,
+ (IsoForest*)NULL, &single_tree_model,
+ ignored.get(), terminal_indices.data(),
+ (double*)NULL,
+ indexer);
+ }
+
+ ignored.reset();
+ indexer->indices.back().reference_points.assign(terminal_indices.begin(), terminal_indices.end());
+ indexer->indices.back().reference_points.shrink_to_fit();
+ build_ref_node(indexer->indices.back());
+ }
+
+ check_interrupt_switch(ss);
+ }
+ }
+
+ catch (...)
+ {
+ if (added_tree)
+ {
+ if (model_outputs != NULL)
+ model_outputs->trees.pop_back();
+ else
+ model_outputs_ext->hplanes.pop_back();
+ if (imputer != NULL) {
+ if (model_outputs != NULL)
+ imputer->imputer_tree.resize(model_outputs->trees.size());
+ else
+ imputer->imputer_tree.resize(model_outputs_ext->hplanes.size());
+ }
+ if (indexer != NULL) {
+ if (model_outputs != NULL)
+ indexer->indices.resize(model_outputs->trees.size());
+ else
+ indexer->indices.resize(model_outputs_ext->hplanes.size());
+ }
+ }
+ throw;
+ }
+
+ return EXIT_SUCCESS;
+ }
+
+ template <class InputData, class WorkerMemory, class ldouble_safe>
+ void fit_itree(std::vector<IsoTree> *tree_root,
+ std::vector<IsoHPlane> *hplane_root,
+ WorkerMemory &workspace,
+ InputData &input_data,
+ ModelParams &model_params,
+ std::vector<ImputeNode> *impute_nodes,
+ size_t tree_num)
+ {
+ /* initialize array for depths if called for */
+ if (workspace.ix_arr.empty() && model_params.calc_depth)
+ workspace.row_depths.resize(input_data.nrows, 0);
+
+ /* choose random sample of rows */
+ if (workspace.ix_arr.empty()) workspace.ix_arr.resize(model_params.sample_size);
+ if (input_data.log2_n > 0)
+ workspace.btree_weights.assign(input_data.btree_weights_init.begin(),
+ input_data.btree_weights_init.end());
+ workspace.rnd_generator.seed(model_params.random_seed + tree_num);
+ workspace.rbin = UniformUnitInterval(0, 1);
+ sample_random_rows<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
+ workspace.ix_arr, input_data.nrows, model_params.with_replacement,
+ workspace.rnd_generator, workspace.ix_all,
+ (input_data.weight_as_sample)? input_data.sample_weights : NULL,
+ workspace.btree_weights, input_data.log2_n, input_data.btree_offset,
+ workspace.is_repeated);
+ workspace.st = 0;
+ workspace.end = model_params.sample_size - 1;
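
Seeding the per-tree generator as 'random_seed + tree_num' gives every tree its own deterministic stream, so fitted models do not depend on which thread happens to grow which tree. The idea in miniature:

```cpp
#include <random>
#include <cstdio>
#include <cstdint>
#include <cstddef>

int main()
{
    uint64_t random_seed = 123;
    // Whatever order the trees are processed in, each one re-derives
    // the same stream from (base seed + tree index).
    for (size_t tree = 0; tree < 3; tree++) {
        std::mt19937_64 rng(random_seed + tree);
        std::printf("tree %zu first draw: %llu\n",
                    tree, (unsigned long long)rng());
    }
}
```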
1766
+
1767
+ /* in some cases, it's not possible to use column weights even if they are given,
1768
+ because every single column will always need to be checked or end up being used. */
1769
+ bool avoid_col_weights = (tree_root != NULL && model_params.ntry >= model_params.ncols_per_tree &&
1770
+ model_params.prob_pick_by_gain_avg + model_params.prob_pick_by_gain_pl +
1771
+ model_params.prob_pick_by_full_gain + model_params.prob_pick_by_dens >= 1)
1772
+ ||
1773
+ (tree_root == NULL && model_params.ndim >= model_params.ncols_per_tree)
1774
+ ||
1775
+ (model_params.ncols_per_tree == 1);
1776
+ if (input_data.preinitialized_col_sampler == NULL)
1777
+ {
1778
+ if (input_data.col_weights != NULL && !avoid_col_weights && !model_params.weigh_by_kurt)
1779
+ workspace.col_sampler.initialize(input_data.col_weights, input_data.ncols_tot);
1780
+ }
1781
+
1782
+
1783
+ /* set expected tree size and add root node */
1784
+ {
1785
+ size_t exp_nodes = mult2(model_params.sample_size);
1786
+ if (model_params.sample_size >= div2(SIZE_MAX))
1787
+ exp_nodes = SIZE_MAX;
1788
+ else if (model_params.max_depth <= (size_t)30)
1789
+ exp_nodes = std::min(exp_nodes, pow2(model_params.max_depth));
1790
+ if (tree_root != NULL)
1791
+ {
1792
+ tree_root->reserve(exp_nodes);
1793
+ tree_root->emplace_back();
1794
+ }
1795
+ else
1796
+ {
1797
+ hplane_root->reserve(exp_nodes);
1798
+ hplane_root->emplace_back();
1799
+ }
1800
+ if (impute_nodes != NULL)
1801
+ {
1802
+ impute_nodes->reserve(exp_nodes);
1803
+ impute_nodes->emplace_back((size_t) 0);
1804
+ }
1805
+ }
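
A binary tree grown on m samples has at most 2m − 1 nodes, and a depth-limited one at most 2^(depth+1) − 1, so the reservation above uses min(2m, 2^max_depth) as a cheap estimate, with guards against size_t overflow. Condensed into one sketch ('mult2'/'div2'/'pow2' being the library's helpers):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Heuristic reservation size for a tree grown on 'sample_size' points
// with depth limit 'max_depth', mirroring the guards above.
size_t expected_nodes(size_t sample_size, size_t max_depth)
{
    size_t exp_nodes = 2 * sample_size;                // mult2(...)
    if (sample_size >= SIZE_MAX / 2)                   // div2(SIZE_MAX) guard:
        exp_nodes = SIZE_MAX;                          // discard wrapped result
    else if (max_depth <= (size_t)30)
        exp_nodes = std::min(exp_nodes, (size_t)1 << max_depth);  // pow2(...)
    return exp_nodes;
}
```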
+
+ /* initialize array with candidate categories if not already done */
+ if (workspace.categs.empty())
+ workspace.categs.resize(input_data.max_categ);
+
+ /* initialize array with per-node column weights if needed */
+ if ((model_params.prob_pick_col_by_range ||
+ model_params.prob_pick_col_by_var ||
+ model_params.prob_pick_col_by_kurt) && workspace.node_col_weights.empty())
+ {
+ workspace.node_col_weights.resize(input_data.ncols_tot);
+ if (tree_root != NULL || model_params.standardize_data || model_params.missing_action != Fail)
+ {
+ workspace.saved_stat1.resize(input_data.ncols_numeric);
+ workspace.saved_stat2.resize(input_data.ncols_numeric);
+ }
+ }
+
+ /* IMPORTANT!!!!!
+ The standard library implementation is likely going to use the Box-Muller method
+ for normal sampling, which has some state memory in the **distribution object itself**
+ in addition to the state memory from the RNG engine. DO NOT avoid re-generating this
+ object on each tree, despite being inefficient, because then it can cause seed
+ irreproducibility when the number of splitting dimensions is odd and the number
+ of threads is more than 1. This is a very hard issue to debug, since whether results
+ reproduce depends on the order in which trees are assigned to threads.
+ DO NOT PUT THESE LINES BELOW THE NEXT IF. */
+ if (hplane_root != NULL)
+ {
+ if (input_data.ncols_categ || model_params.coef_type == Normal)
+ workspace.coef_norm = StandardNormalDistr(0, 1);
+ if (model_params.coef_type == Uniform)
+ workspace.coef_unif = UniformMinusOneToOne(-1, 1);
+ }
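
The warning above concerns state kept inside the distribution object itself: common std::normal_distribution implementations produce deviates in pairs (Box–Muller / Marsaglia polar) and cache the spare one. Re-seeding the engine alone therefore does not reproduce a stream; the distribution must be reset or rebuilt, which is what the assignments above do on every tree. A small demonstration (the behavior shown holds on typical implementations):

```cpp
#include <random>
#include <cstdio>

int main()
{
    std::mt19937_64 engine(123);
    std::normal_distribution<double> dist(0., 1.);

    double a = dist(engine);   // may cache a second deviate internally

    engine.seed(123);          // re-seed the engine only...
    double b = dist(engine);   // ...the cached spare can still come out: b != a

    engine.seed(123);
    dist.reset();              // discard the cached state
    double c = dist(engine);   // now c == a

    std::printf("%f %f %f\n", a, b, c);
}
```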
1840
+
+     /* for the extended model, initialize extra vectors and objects */
+     if (hplane_root != NULL && workspace.comb_val.empty())
+     {
+         workspace.comb_val.resize(model_params.sample_size);
+         workspace.col_take.resize(model_params.ndim);
+         workspace.col_take_type.resize(model_params.ndim);
+
+         if (input_data.ncols_numeric)
+         {
+             workspace.ext_offset.resize(input_data.ncols_tot);
+             workspace.ext_coef.resize(input_data.ncols_tot);
+             workspace.ext_mean.resize(input_data.ncols_tot);
+         }
+
+         if (input_data.ncols_categ)
+         {
+             workspace.ext_fill_new.resize(input_data.max_categ);
+             switch(model_params.cat_split_type)
+             {
+                 case SingleCateg:
+                 {
+                     workspace.chosen_cat.resize(input_data.max_categ);
+                     break;
+                 }
+
+                 case SubSet:
+                 {
+                     workspace.ext_cat_coef.resize(input_data.ncols_tot);
+                     for (std::vector<double> &v : workspace.ext_cat_coef)
+                         v.resize(input_data.max_categ);
+                     break;
+                 }
+             }
+         }
+
+         workspace.ext_fill_val.resize(input_data.ncols_tot);
+
+     }
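/* A rough sketch of what these buffers support (hypothetical names; the exact
   centering and coefficient draw are isotree's own): each split in the
   extended model projects every sampled row onto a random hyperplane spanning
   ndim columns, and the split then thresholds that projection, which is what
   comb_val holds per row. */
#include <cstddef>

static double project_row(const double *x_row, const size_t *cols,
                          const double *coef, const double *offset, size_t ndim)
{
    double proj = 0;
    for (size_t d = 0; d < ndim; d++)
        proj += coef[d] * (x_row[cols[d]] - offset[d]);  /* signed distance along the hyperplane normal */
    return proj;  /* compared against a split point, like comb_val */
}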
1879
+
+     /* If there are density weights, need to standardize them to sum up to
+        the sample size here. Note that weights for missing values with 'Divide'
+        are only initialized on-demand later on. */
+     workspace.changed_weights = false;
+     if (hplane_root == NULL) workspace.weights_map.clear();
+
+     ldouble_safe weight_scaling = 0;
+     if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
+     {
+         workspace.changed_weights = true;
+
+         /* For the extended model, if there is no sub-sampling, these weights will remain
+            constant throughout and do not need to be re-generated. */
+         if (!( hplane_root != NULL &&
+                (!workspace.weights_map.empty() || !workspace.weights_arr.empty()) &&
+                model_params.sample_size == input_data.nrows && !model_params.with_replacement
+              )
+            )
+         {
+             workspace.weights_map.clear();
+
+             /* if the sub-sample size is small relative to the full sample size, use a mapping */
+             if (input_data.Xc_indptr != NULL && model_params.sample_size < input_data.nrows / 50)
+             {
+                 for (const size_t ix : workspace.ix_arr)
+                     weight_scaling += input_data.sample_weights[ix];
+                 weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
+                 workspace.weights_map.reserve(workspace.ix_arr.size());
+                 for (const size_t ix : workspace.ix_arr)
+                     workspace.weights_map[ix] = input_data.sample_weights[ix] * weight_scaling;
+             }
+
+             /* if the sub-sample size is large, fill a full array matching to the sample size */
+             else
+             {
+                 if (workspace.weights_arr.empty())
+                 {
+                     workspace.weights_arr.assign(input_data.sample_weights, input_data.sample_weights + input_data.nrows);
+                     weight_scaling = std::accumulate(workspace.ix_arr.begin(),
+                                                      workspace.ix_arr.end(),
+                                                      (ldouble_safe)0,
+                                                      [&input_data](const ldouble_safe a, const size_t b){return a + (ldouble_safe)input_data.sample_weights[b];}
+                                                      );
+                     weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
+                     for (double &w : workspace.weights_arr)
+                         w *= weight_scaling;
+                 }
+
+                 else
+                 {
+                     for (const size_t ix : workspace.ix_arr)
+                     {
+                         weight_scaling += input_data.sample_weights[ix];
+                         workspace.weights_arr[ix] = input_data.sample_weights[ix];
+                     }
+                     weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
+                     for (double &w : workspace.weights_arr)
+                         w *= weight_scaling;
+                 }
+             }
+         }
+     }
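/* Both branches above implement the same normalization: rescale the selected
   rows' weights so they sum to the sample size, i.e. w_i <- w_i * s / sum_j(w_j).
   A minimal standalone sketch, assuming the total is strictly positive: */
#include <numeric>
#include <vector>

static void standardize_weights(std::vector<double> &w, size_t sample_size)
{
    long double total = std::accumulate(w.begin(), w.end(), (long double)0);
    long double scaling = (long double)sample_size / total;
    for (double &wi : w)
        wi = (double)(wi * scaling);   /* weights now sum to sample_size */
}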
1942
+
+     /* if producing distance/similarity, also need to initialize the triangular matrix */
+     if (model_params.calc_dist && workspace.tmat_sep.empty())
+         workspace.tmat_sep.resize((input_data.nrows * (input_data.nrows - 1)) / 2, 0);
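/* The resize above matches the usual "condensed" storage for a symmetric
   matrix with a zero diagonal: only the n*(n-1)/2 entries above the diagonal
   are kept. Under the common row-major convention (isotree's exact layout may
   differ), the pair (i, j) with i < j maps to a flat index like so: */
#include <cstddef>

static size_t condensed_index(size_t i, size_t j, size_t n)
{
    /* rows 0..i-1 contribute (n-1) + (n-2) + ... + (n-i) = i*n - i*(i+1)/2 entries */
    return i * n - (i * (i + 1)) / 2 + (j - i - 1);
}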
1946
+
+     /* make space for buffers if not already allocated */
+     if (
+         (model_params.prob_pick_by_gain_avg > 0 ||
+          model_params.prob_pick_by_gain_pl > 0 ||
+          model_params.prob_pick_by_full_gain > 0 ||
+          model_params.prob_pick_by_dens > 0 ||
+          model_params.prob_pick_col_by_range > 0 ||
+          model_params.prob_pick_col_by_var > 0 ||
+          model_params.prob_pick_col_by_kurt > 0 ||
+          model_params.weigh_by_kurt || hplane_root != NULL)
+         &&
+         (workspace.buffer_dbl.empty() && workspace.buffer_szt.empty() && workspace.buffer_chr.empty())
+     )
+     {
+         size_t min_size_dbl = 0;
+         size_t min_size_szt = 0;
+         size_t min_size_chr = 0;
+
+         bool gain = model_params.prob_pick_by_gain_avg > 0 ||
+                     model_params.prob_pick_by_gain_pl > 0 ||
+                     model_params.prob_pick_by_full_gain > 0 ||
+                     model_params.prob_pick_by_dens > 0;
+
+         if (input_data.ncols_categ)
+         {
+             min_size_szt = (size_t)2 * (size_t)input_data.max_categ;
+             min_size_dbl = input_data.max_categ + 1;
+             if (gain && model_params.cat_split_type == SubSet)
+                 min_size_chr = input_data.max_categ;
+         }
+
+         if (input_data.Xc_indptr != NULL && gain)
+         {
+             min_size_szt = std::max(min_size_szt, model_params.sample_size);
+             min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
+         }
+
+         /* TODO: revisit if this covers all the cases */
+         if (model_params.ntry > 1 || gain)
+         {
+             min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
+             if (model_params.ndim < 2 && input_data.Xc_indptr != NULL)
+                 min_size_dbl = std::max(min_size_dbl, (size_t)2*model_params.sample_size);
+         }
+
+         /* for sampled column choices */
+         if (model_params.prob_pick_col_by_var)
+         {
+             if (input_data.ncols_categ) {
+                 min_size_szt = std::max(min_size_szt, (size_t)input_data.max_categ + 1);
+                 min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ + 1);
+             }
+         }
+
+         if (model_params.prob_pick_col_by_kurt)
+         {
+             if (input_data.ncols_categ) {
+                 min_size_szt = std::max(min_size_szt, (size_t)input_data.max_categ + 1);
+                 min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
+             }
+
+         }
+
+         /* for the extended model */
+         if (hplane_root != NULL)
+         {
+             min_size_dbl = std::max(min_size_dbl, pow2(log2ceil(input_data.ncols_tot) + 1));
+             if (model_params.missing_action != Fail)
+             {
+                 min_size_szt = std::max(min_size_szt, model_params.sample_size);
+                 min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
+             }
+
+             if (input_data.ncols_categ && model_params.cat_split_type == SubSet)
+             {
+                 min_size_szt = std::max(min_size_szt, (size_t)2 * (size_t)input_data.max_categ + (size_t)1);
+                 min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
+             }
+
+             if (model_params.weigh_by_kurt)
+                 min_size_szt = std::max(min_size_szt, input_data.ncols_tot);
+
+             if (gain && (!workspace.weights_arr.empty() || !workspace.weights_map.empty()))
+             {
+                 workspace.sample_weights.resize(model_params.sample_size);
+                 min_size_szt = std::max(min_size_szt, model_params.sample_size);
+             }
+         }
+
+         /* now resize */
+         if (workspace.buffer_dbl.size() < min_size_dbl)
+             workspace.buffer_dbl.resize(min_size_dbl);
+
+         if (workspace.buffer_szt.size() < min_size_szt)
+             workspace.buffer_szt.resize(min_size_szt);
+
+         if (workspace.buffer_chr.size() < min_size_chr)
+             workspace.buffer_chr.resize(min_size_chr);
+
+         /* for guided column choice, need to also remember the best split so far */
+         if (
+             model_params.cat_split_type == SubSet &&
+             (
+                 model_params.prob_pick_by_gain_avg ||
+                 model_params.prob_pick_by_gain_pl ||
+                 model_params.prob_pick_by_full_gain ||
+                 model_params.prob_pick_by_dens
+             )
+         )
+         {
+             workspace.this_split_categ.resize(input_data.max_categ);
+         }
+
+     }
+
+     /* Other potentially necessary buffers */
+     if (
+         tree_root != NULL && model_params.missing_action == Impute &&
+         (model_params.prob_pick_by_gain_avg || model_params.prob_pick_by_gain_pl ||
+          model_params.prob_pick_by_full_gain || model_params.prob_pick_by_dens) &&
+         input_data.Xc_indptr == NULL && input_data.ncols_numeric && workspace.imputed_x_buffer.empty()
+     )
+     {
+         workspace.imputed_x_buffer.resize(input_data.nrows);
+     }
+
+     if (model_params.prob_pick_by_full_gain && workspace.col_indices.empty())
+         workspace.col_indices.resize(model_params.ncols_per_tree);
+
+     if (
+         (model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var) &&
+         model_params.weigh_by_kurt &&
+         model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
+         (model_params.ncols_per_tree == input_data.ncols_tot) &&
+         !input_data.all_kurtoses.empty()
+     ) {
+         workspace.tree_kurtoses = input_data.all_kurtoses.data();
+     }
+     else {
+         workspace.tree_kurtoses = NULL;
+     }
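/* Note on the condition above: the precomputed kurtoses (all_kurtoses) appear
   to be reusable only when the tree sees exactly the full data -- every row
   (full sample, no replacement) and every column -- since any sub-sampling
   would make the per-tree kurtosis differ from the global one. */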
2088
+
+     /* weigh columns by kurtosis in the sample if required */
+     /* TODO: this one could probably be refactored to use the function in the helpers */
+     std::vector<double> kurt_weights;
+     bool avoid_leave_m_cols = false;
+     if (
+         model_params.weigh_by_kurt &&
+         !avoid_col_weights &&
+         (input_data.preinitialized_col_sampler == NULL
+             ||
+          ((model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var) && workspace.tree_kurtoses == NULL))
+     )
+     {
+         kurt_weights.resize(input_data.ncols_numeric + input_data.ncols_categ, 0.);
+
+         if (model_params.ncols_per_tree >= input_data.ncols_tot)
+         {
+
+             if (input_data.Xc_indptr == NULL)
+             {
+
+                 for (size_t col = 0; col < input_data.ncols_numeric; col++)
+                 {
+                     if (workspace.weights_arr.empty() && workspace.weights_map.empty())
+                         kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end,
+                             input_data.numeric_data + col * input_data.nrows,
+                             model_params.missing_action);
+                     else if (!workspace.weights_arr.empty())
+                         kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, decltype(workspace.weights_arr), ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end,
+                             input_data.numeric_data + col * input_data.nrows,
+                             model_params.missing_action, workspace.weights_arr);
+                     else
+                         kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
+                                                                    decltype(workspace.weights_map), ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end,
+                             input_data.numeric_data + col * input_data.nrows,
+                             model_params.missing_action, workspace.weights_map);
+                 }
+             }
+
+             else
+             {
+                 std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
+                 for (size_t col = 0; col < input_data.ncols_numeric; col++)
+                 {
+                     if (workspace.weights_arr.empty() && workspace.weights_map.empty())
+                         kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.Xc)>::type,
+                                                           typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
+                                                           ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end, col,
+                             input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                             model_params.missing_action);
+                     else if (!workspace.weights_arr.empty())
+                         kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
+                                                                    typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
+                                                                    decltype(workspace.weights_arr), ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end, col,
+                             input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                             model_params.missing_action, workspace.weights_arr);
+                     else
+                         kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
+                                                                    typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
+                                                                    decltype(workspace.weights_map), ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end, col,
+                             input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                             model_params.missing_action, workspace.weights_map);
+                 }
+             }
+
+             for (size_t col = 0; col < input_data.ncols_categ; col++)
+             {
+                 if (workspace.weights_arr.empty() && workspace.weights_map.empty())
+                     kurt_weights[col + input_data.ncols_numeric] =
+                         calc_kurtosis<ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end,
+                             input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
+                             workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
+                             model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
+                 else if (!workspace.weights_arr.empty())
+                     kurt_weights[col + input_data.ncols_numeric] =
+                         calc_kurtosis_weighted<decltype(workspace.weights_arr), ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end,
+                             input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
+                             workspace.buffer_dbl.data(),
+                             model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
+                             workspace.weights_arr);
+                 else
+                     kurt_weights[col + input_data.ncols_numeric] =
+                         calc_kurtosis_weighted<decltype(workspace.weights_map), ldouble_safe>(
+                             workspace.ix_arr.data(), workspace.st, workspace.end,
+                             input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
+                             workspace.buffer_dbl.data(),
+                             model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
+                             workspace.weights_map);
+             }
+
+             for (auto &w : kurt_weights) w = (w == -HUGE_VAL)? 0. : std::fmax(1e-8, -1. + w);
+             if (input_data.col_weights != NULL)
+             {
+                 for (size_t col = 0; col < input_data.ncols_tot; col++)
+                 {
+                     if (kurt_weights[col] <= 0) continue;
+                     kurt_weights[col] *= input_data.col_weights[col];
+                     kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
+                 }
+             }
+             workspace.col_sampler.initialize(kurt_weights.data(), kurt_weights.size());
+         }
+
+
+
+         else
+         {
+             std::vector<size_t> cols_take(model_params.ncols_per_tree);
+             std::vector<size_t> buffer1;
+             std::vector<bool> buffer2;
+             sample_random_rows<double, double>(
+                 cols_take, input_data.ncols_tot, false,
+                 workspace.rnd_generator, buffer1,
+                 (double*)NULL, kurt_weights, /* <- will not get used */
+                 (size_t)0, (size_t)0, buffer2);
+
+             if (
+                 model_params.sample_size == input_data.nrows &&
+                 !model_params.with_replacement &&
+                 !input_data.all_kurtoses.empty()
+             )
+             {
+                 for (size_t col : cols_take)
+                     kurt_weights[col] = input_data.all_kurtoses[col];
+                 goto skip_kurt_calculations;
+             }
+
+             if (input_data.Xc_indptr != NULL)
+                 std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
+
+             for (size_t col : cols_take)
+             {
+                 if (col < input_data.ncols_numeric)
+                 {
+                     if (input_data.Xc_indptr == NULL)
+                     {
+                         if (workspace.weights_arr.empty() && workspace.weights_map.empty())
+                             kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end,
+                                 input_data.numeric_data + col * input_data.nrows,
+                                 model_params.missing_action);
+                         else if (!workspace.weights_arr.empty())
+                             kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
+                                                                        decltype(workspace.weights_arr), ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end,
+                                 input_data.numeric_data + col * input_data.nrows,
+                                 model_params.missing_action, workspace.weights_arr);
+                         else
+                             kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
+                                                                        decltype(workspace.weights_map), ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end,
+                                 input_data.numeric_data + col * input_data.nrows,
+                                 model_params.missing_action, workspace.weights_map);
+                     }
+
+                     else
+                     {
+                         if (workspace.weights_arr.empty() && workspace.weights_map.empty())
+                             kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.Xc)>::type,
+                                                               typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
+                                                               ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end, col,
+                                 input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                                 model_params.missing_action);
+                         else if (!workspace.weights_arr.empty())
+                             kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
+                                                                        typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
+                                                                        decltype(workspace.weights_arr), ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end, col,
+                                 input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                                 model_params.missing_action, workspace.weights_arr);
+                         else
+                             kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
+                                                                        typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
+                                                                        decltype(workspace.weights_map), ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end, col,
+                                 input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                                 model_params.missing_action, workspace.weights_map);
+                     }
+                 }
+
+                 else
+                 {
+                     if (workspace.weights_arr.empty() && workspace.weights_map.empty())
+                         kurt_weights[col] =
+                             calc_kurtosis<ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end,
+                                 input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
+                                 input_data.ncat[col - input_data.ncols_numeric],
+                                 workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
+                                 model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
+                     else if (!workspace.weights_arr.empty())
+                         kurt_weights[col] =
+                             calc_kurtosis_weighted<decltype(workspace.weights_arr), ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end,
+                                 input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
+                                 input_data.ncat[col - input_data.ncols_numeric],
+                                 workspace.buffer_dbl.data(),
+                                 model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
+                                 workspace.weights_arr);
+                     else
+                         kurt_weights[col] =
+                             calc_kurtosis_weighted<decltype(workspace.weights_map), ldouble_safe>(
+                                 workspace.ix_arr.data(), workspace.st, workspace.end,
+                                 input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
+                                 input_data.ncat[col - input_data.ncols_numeric],
+                                 workspace.buffer_dbl.data(),
+                                 model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
+                                 workspace.weights_map);
+                 }
+
+                 /* Note to self: don't move this to outside of the braces, as it needs to assign a weight
+                    of zero to the columns that were not selected, thus it should only do this clipping
+                    for columns that are chosen. */
+                 if (kurt_weights[col] == -HUGE_VAL)
+                 {
+                     kurt_weights[col] = 0;
+                 }
+
+                 else
+                 {
+                     kurt_weights[col] = std::fmax(1e-8, -1. + kurt_weights[col]);
+                     if (input_data.col_weights != NULL)
+                     {
+                         kurt_weights[col] *= input_data.col_weights[col];
+                         kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
+                     }
+                 }
+             }
+
+             skip_kurt_calculations:
+             workspace.col_sampler.initialize(kurt_weights.data(), kurt_weights.size());
+             avoid_leave_m_cols = true;
+         }
+
+         if (model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var)
+         {
+             workspace.tree_kurtoses = kurt_weights.data();
+         }
+     }
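/* Both branches above apply the same kurtosis-to-weight mapping: columns whose
   kurtosis came out as -HUGE_VAL (e.g. it could not be computed) get weight
   zero and are never drawn, while the rest get max(1e-8, kurtosis - 1),
   optionally scaled by the user's column weight and floored at 1e-100 to stay
   positive. A standalone sketch (pass user_weight = 1 when no column weights
   are given): */
#include <cmath>

static double kurtosis_to_weight(double kurt, double user_weight)
{
    if (kurt == -HUGE_VAL)
        return 0.;                                /* undefined kurtosis: never sample this column */
    double w = std::fmax(1e-8, kurt - 1.);
    return std::fmax(w * user_weight, 1e-100);    /* keep strictly positive */
}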
2336
+
+     bool col_sampler_is_fresh = true;
+     if (input_data.preinitialized_col_sampler == NULL) {
+         workspace.col_sampler.initialize(input_data.ncols_tot);
+     }
+     else {
+         workspace.col_sampler = *((ColumnSampler<ldouble_safe>*)input_data.preinitialized_col_sampler);
+         col_sampler_is_fresh = false;
+     }
+     /* TODO: this can be done more efficiently when sub-sampling columns */
+     if (!avoid_leave_m_cols)
+         workspace.col_sampler.leave_m_cols(model_params.ncols_per_tree, workspace.rnd_generator);
+     if (model_params.ncols_per_tree < input_data.ncols_tot) col_sampler_is_fresh = false;
+     workspace.try_all = false;
+     if (hplane_root != NULL && model_params.ndim >= input_data.ncols_tot)
+         workspace.try_all = true;
+
+     if (model_params.scoring_metric != Depth && !is_boxed_metric(model_params.scoring_metric))
+     {
+         workspace.density_calculator.initialize(model_params.max_depth,
+                                                 input_data.ncols_categ? input_data.max_categ : 0,
+                                                 tree_root != NULL && input_data.ncols_categ,
+                                                 model_params.scoring_metric);
+     }
+
+     else if (is_boxed_metric(model_params.scoring_metric))
+     {
+         if (tree_root != NULL)
+             workspace.density_calculator.initialize_bdens(input_data,
+                                                           model_params,
+                                                           workspace.ix_arr,
+                                                           workspace.col_sampler);
+         else
+             workspace.density_calculator.initialize_bdens_ext(input_data,
+                                                               model_params,
+                                                               workspace.ix_arr,
+                                                               workspace.col_sampler,
+                                                               col_sampler_is_fresh);
+     }
+
+     if (tree_root != NULL)
+     {
+         split_itree_recursive<InputData, WorkerMemory, ldouble_safe>(
+             *tree_root,
+             workspace,
+             input_data,
+             model_params,
+             impute_nodes,
+             0);
+     }
+
+     else
+     {
+         split_hplane_recursive<InputData, WorkerMemory, ldouble_safe>(
+             *hplane_root,
+             workspace,
+             input_data,
+             model_params,
+             impute_nodes,
+             0);
+     }
+
+     /* if producing imputation structs, only need to keep the ones for terminal nodes */
+     if (impute_nodes != NULL)
+         drop_nonterminal_imp_node(*impute_nodes, tree_root, hplane_root);
+ }