isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -0,0 +1,2401 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+ #include "isotree.hpp"
64
+
65
+ /* Fit Isolation Forest model, or variant of it such as SCiForest
66
+ *
67
+ * Parameters:
68
+ * ===========
69
+ * - model_outputs (out)
70
+ * Pointer to already allocated isolation forest model object for single-variable splits.
71
+ * If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
72
+ * additional trees through function 'add_tree'.
73
+ * - model_outputs_ext (out)
74
+ * Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
75
+ * Note that if 'ndim' = 1, must use instead the single-variable model object.
76
+ * If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
77
+ * additional trees through function 'add_tree'.
78
+ * - numeric_data[nrows * ncols_numeric]
79
+ * Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
80
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
81
+ * Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
82
+ * no sparse numeric data either).
83
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
84
+ * - ncols_numeric
85
+ * Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
86
+ * - categ_data[nrows * ncols_categ]
87
+ * Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
88
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
89
+ * Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
90
+ * Each category should be represented as an integer, and these integers must start at zero and
91
+ * be in consecutive order - i.e. if category '3' is present, category '2' must also be present
92
+ * (note that they are not treated as being ordinal, this is just an encoding). Missing values
93
+ * should be encoded as negative numbers such as (-1).
94
+ * - ncols_categ
95
+ * Number of categorical columns in the data.
96
+ * - ncat[ncols_categ]
97
+ * Number of categories in each categorical column. E.g. if the highest code for a column is '4',
98
+ * the number of categories for that column is '5' (zero is one category).
99
+ * - Xc[nnz]
100
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
101
+ * Pass NULL if there are no sparse numeric columns.
102
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
103
+ * - Xc_ind[nnz]
104
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
105
+ * Must be in sorted order, otherwise results will be incorrect.
106
+ * The largest value here should be smaller than the largest possible value of 'size_t'.
107
+ * Pass NULL if there are no sparse numeric columns.
108
+ * - Xc_indptr[ncols_numeric + 1]
109
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
110
+ * start and at entry [col + 1] where does column 'col' end.
111
+ * Pass NULL if there are no sparse numeric columns.
112
+ * - ndim
113
+ * How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
114
+ * the single-variable model. Note that the model object pointer passed must also
115
+ * agree with the value passed to 'ndim'.
116
+ * - ntry
117
+ * When using any of 'prob_pick_by_gain_pl', 'prob_pick_by_gain_avg', 'prob_pick_by_full_gain', 'prob_pick_by_dens', how many variables (with 'ndim=1')
118
+ * or linear combinations (with 'ndim>1') to try for determining the best one according to gain.
119
+ * Recommended value in reference [4] is 10 (with 'prob_pick_by_gain_avg', for outlier detection), while the
120
+ * recommended value in reference [11] is 1 (with 'prob_pick_by_gain_pl', for outlier detection), and the
121
+ * recommended value in reference [9] is 10 to 20 (with 'prob_pick_by_gain_pl', for missing value imputations).
122
+ * - coef_type
123
+ * For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
124
+ * (as proposed in [4]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [3]. Ignored for the
125
+ * single-variable model.
126
+ * - sample_weights[nrows]
127
+ * Weights for the rows when building a tree, either as sampling importances when using
128
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
129
+ * in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
130
+ * the row appeared twice, thus it's less of an outlier) - how this is taken is determined
131
+ * through parameter 'weight_as_sample'.
132
+ * Pass NULL if the rows all have uniform weights.
133
+ * - with_replacement
134
+ * Whether to sample rows with replacement or not (not recommended). Note that distance calculations,
135
+ * if desired, don't work well with duplicate rows.
136
+ * - weight_as_sample
137
+ * If passing sample (row) weights when fitting the model, whether to consider those weights as row
138
+ * sampling weights (i.e. the higher the weights, the more likely the observation will end up included
139
+ * in each tree sub-sample), or as distribution density weights (i.e. putting a weight of two is the same
140
+ * as if the row appeared twice, thus higher weight makes it less of an outlier, but does not give it a
141
+ * higher chance of being sampled if the data uses sub-sampling).
142
+ * - nrows
143
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
144
+ * - sample_size
145
+ * Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
146
+ * 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
147
+ * random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
148
+ * will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
149
+ * in [5] is 'nrows' here.
150
+ * - ntrees
151
+ * Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
152
+ * author's code in [5] is 10.
153
+ * - max_depth
154
+ * Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
155
+ * Models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
156
+ * deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
157
+ * predictions.
158
+ * Note that models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
159
+ * deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
160
+ * predictions.
161
+ * If using pooled gain, one might want to substitute 'max_depth' with 'min_gain'.
162
+ * - ncols_per_tree
163
+ * Number of columns to use (have as potential candidates for splitting at each iteration) in each tree,
164
+ * similar to the 'mtry' parameter of random forests.
165
+ * In general, this is only relevant when using non-random splits and/or weighted column choices.
166
+ * If passing zero, will use the full number of available columns.
167
+ * Recommended value: 0.
168
+ * - limit_depth
169
+ * Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
170
+ * terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
171
+ * will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
172
+ * tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass 'false' here
173
+ * and higher values for 'max_depth' if using the model for purposes other than outlier detection.
174
+ * Note that, if passing 'limit_depth=true', then 'max_depth' is ignored.
175
+ * - penalize_range
176
+ * Whether to penalize (add -1 to the terminal depth) observations at prediction time that have a value
177
+ * of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
178
+ * reasonable range in the data being split (given by 2 * range in data and centered around the split point),
179
+ * as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
180
+ * when splitting by categorical variables. Note that this can make a very large difference in the results
181
+ * when using 'prob_pick_by_gain_pl'.
182
+ * This option is not supported when using density-based outlier scoring metrics.
183
+ * - standardize_data
184
+ * Whether to standardize the features at each node before creating a linear combination of them as suggested
185
+ * in [4]. This is ignored when using 'ndim=1'.
186
+ * - scoring_metric
187
+ * Metric to use for determining outlier scores (see reference [13]).
188
+ * If passing 'Depth', will use isolation depth as proposed in reference [1]. This is typically the safest choice
189
+ * and plays well with all model types offered by this library.
190
+ * If passing 'Density', will set scores for each terminal node as the ratio between the fraction of points in the sub-sample
191
+ * that end up in that node and the fraction of the volume in the feature space which defines
192
+ * the node according to the splits that lead to it.
193
+ * If using 'ndim=1', for categorical variables, 'Density' is defined in terms
194
+ * of number of categories that go towards each side of the split divided by number of categories
195
+ * in the observations that reached that node.
196
+ * The standardized outlier score from 'Density' for a given observation is calculated as the
197
+ * negative of the logarithm of the geometric mean from the per-tree densities, which unlike
198
+ * the standardized score produced from 'Depth', is unbounded, but just like the standardized
199
+ * score from 'Depth', has a natural threshold for defining outlierness, which in this case
200
+ * is zero instead of 0.5. The non-standardized outlier score for 'Density' is calculated as the
201
+ * geometric mean, while the per-tree scores are calculated as the density values.
202
+ * 'Density' might lead to better predictions when using 'ndim=1', particularly in the presence
203
+ * of categorical variables. Note however that using 'Density' requires more trees for convergence
204
+ * of scores (i.e. good results) compared to isolation-based metrics.
205
+ * 'Density' is incompatible with 'penalize_range=true'.
206
+ * If passing 'AdjDepth', will use an adjusted isolation depth that takes into account the number of points that
207
+ * go to each side of a given split vs. the fraction of the range of that feature that each
208
+ * side of the split occupies, by a metric as follows: 'd = 2/ (1 + 1/(2*p))'
209
+ * where 'p' is defined as 'p = (n_s / n_t) / (r_s / r_t)'
210
+ * with 'n_t' being the number of points that reach a given node, 'n_s' the
211
+ * number of points that are sent to a given side of the split/branch at that node,
212
+ * 'r_t' being the range (maximum minus minimum) of the splitting feature or
213
+ * linear combination among the points that reached the node, and 'r_s' being the
214
+ * range of the same feature or linear combination among the points that are sent to this
215
+ * same side of the split/branch. This makes each split add a number between zero and two
216
+ * to the isolation depth, with this number's probabilistic distribution being centered
217
+ * around 1 and thus the expected isolation depth remaining the same as in the original
218
+ * 'Depth' metric, but having more variability around the extremes.
219
+ * Scores (standardized, non-standardized, per-tree) for 'AdjDepth' are aggregated in the same way
220
+ * as for 'Depth'.
221
+ * 'AdjDepth' might lead to better predictions when using 'ndim=1', particularly in the presence
222
+ * of categorical variables and for smaller datasets, and for smaller datasets, might make
223
+ * sense to combine it with 'penalize_range=true'.
224
+ * If passing 'AdjDensity', will use the same metric from 'AdjDepth', but applied multiplicatively instead
225
+ * of additively. The expected value for 'AdjDepth' is not strictly the same
226
+ * as for isolation, but using the expected isolation depth as standardizing criterion
227
+ * tends to produce similar standardized score distributions (centered around 0.5).
228
+ * Scores (standardized, non-standardized, per-tree) from 'AdjDensity' are aggregated in the same way
229
+ * as for 'Depth'.
230
+ * 'AdjDensity' is incompatible with 'penalize_range=true'.
231
+ * If passing 'BoxedRatio', will set the scores for each terminal node as the ratio between the volume of the boxed
232
+ * feature space for the node as defined by the smallest and largest values from the split
233
+ * conditions for each column (bounded by the variable ranges in the sample) and the
234
+ * variable ranges in the tree sample.
235
+ * If using 'ndim=1', for categorical variables 'BoxedRatio' is defined in terms of number of categories.
236
+ * If using 'ndim=>1', 'BoxedRatio' is defined in terms of the maximum achievable value for the
237
+ * splitting linear combination determined from the minimum and maximum values for each
238
+ * variable among the points in the sample, and as such, it has a rather different meaning
239
+ * compared to the score obtained with 'ndim=1' - 'BoxedRatio' scores with 'ndim>1'
240
+ * typically provide very poor quality results and this metric is thus not recommended to
241
+ * use in the extended model. With 'ndim>1', 'BoxedRatio' also has a tendency of producing too small
242
+ * values which round to zero.
243
+ * The standardized outlier score from 'BoxedRatio' for a given observation is calculated
244
+ * simply as the average from the per-tree boxed ratios. 'BoxedRatio' metric
245
+ * has a lower bound of zero and a theoretical upper bound of one, but in practice the scores
246
+ * tend to be very small numbers close to zero, and its distribution across
247
+ * different datasets is rather unpredictable. In order to keep rankings comparable with
248
+ * the rest of the metrics, the non-standardized outlier scores for 'BoxedRatio' are calculated as the
249
+ * negative of the average instead. The per-tree 'BoxedRatio' scores are calculated as the ratios.
250
+ * 'BoxedRatio' can be calculated in a fast-but-not-so-precise way, and in a slow-but-precise
251
+ * way, which is controlled by parameter 'fast_bratio'. Usually, both should give the
252
+ * same results, but in some datasets, the fast way can lead to numerical inaccuracies
253
+ * due to roundoffs very close to zero.
254
+ * 'BoxedRatio' might lead to better predictions in datasets with many rows when using 'ndim=1'
255
+ * and a relatively small 'sample_size'. Note that more trees are required for convergence
256
+ * of scores when using 'BoxedRatio'. In some datasets, 'BoxedRatio' metric might result in very bad
257
+ * predictions, to the point that taking its inverse produces a much better ranking of outliers.
258
+ * 'BoxedRatio' option is incompatible with 'penalize_range'.
259
+ * If passing 'BoxedDensity2', will set the score as the ratio between the fraction of points within the sample that
260
+ * end up in a given terminal node and the 'BoxedRatio' metric.
261
+ * Aggregation of scores (standardized, non-standardized, per-tree) for 'BoxedDensity2' is done in the same
262
+ * way as for 'Density', and it also has a natural threshold at zero for determining
263
+ * outliers and inliers.
264
+ * 'BoxedDensity2' is typically usable with 'ndim>1', but tends to produce much bigger values
265
+ * compared to 'ndim=1'.
266
+ * Albeit unintuitively, in many datasets, one can usually get better results with metric
267
+ * 'BoxedDensity' instead.
268
+ * The calculation of 'BoxedDensity2' is also controlled by 'fast_bratio'.
269
+ * 'BoxedDensity2' incompatible with 'penalize_range'.
270
+ * If passing 'BoxedDensity', will set the score as the ratio between the fraction of points within the sample that
271
+ * end up in a given terminal node and the ratio between the boxed volume of the feature
272
+ * space in the sample and the boxed volume of a node given by the split conditions (inverse
273
+ * as in 'BoxedDensity2'). This metric does not have any theoretical or intuitive
274
+ * justification behind its existence, and it is perhaps illogical to use it as a
275
+ * scoring metric, but tends to produce good results in some datasets.
276
+ * The standardized outlier scores for 'BoxedDensity' are defined as the negative of the geometric mean,
277
+ * while the non-standardized scores are the geometric mean, and the per-tree scores are simply the 'density' values.
278
+ * The calculation of 'BoxedDensity' is also controlled by 'fast_bratio'.
279
+ * 'BoxedDensity' option is incompatible with 'penalize_range'.
280
+ * - fast_bratio
281
+ * When using "boxed" metrics for scoring, whether to calculate them in a fast way through
282
+ * cumulative sum of logarithms of ratios after each split, or in a slower way as sum of
283
+ * logarithms of a single ratio per column for each terminal node.
284
+ * Usually, both methods should give the same results, but in some datasets, particularly
285
+ * when variables have too small or too large ranges, the first method can be prone to
286
+ * numerical inaccuracies due to roundoff close to zero.
287
+ * Note that this does not affect calculations for models with 'ndim>1', since given the
288
+ * split types, the calculation for them is different.
289
+ * - standardize_dist
290
+ * If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
291
+ * depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
292
+ * - tmat[nrows * (nrows - 1) / 2]
293
+ * Array in which to calculate average separation depths or standardized distance metric (see documentation
294
+ * for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
295
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
296
+ * of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
297
+ * output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
298
+ * entry 0 <= i < j < n will be located at position
299
+ * p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
300
+ * Can be converted to a dense square matrix through function 'tmat_to_dense'.
301
+ * - output_depths[nrows]
302
+ * Array in which to calculate average path depths or standardized outlierness metric (see documentation
303
+ * for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
304
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
305
+ * of rows. If not NULL, must already be initialized to zeros.
306
+ * - standardize_depth
307
+ * If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
308
+ * a metric in which the more outlier is an observation, the closer this standardized metric will be to 1,
309
+ * with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
310
+ * the average depth of each row across all trees.
311
+ * - col_weights[ncols_numeric + ncols_categ]
312
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
313
+ * Ignored when picking columns by deterministic criterion.
314
+ * If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
315
+ * effect is multiplicative.
316
+ * - weigh_by_kurt
317
+ * Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
318
+ * for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
319
+ * sample. For categorical columns, will calculate expected kurtosis if the column were converted to
320
+ * numerical by assigning to each category a random number ~ Unif(0, 1).
321
+ * This is intended as a cheap feature selector, while the parameter 'prob_pick_col_by_kurt'
322
+ * provides the option to do this at each node in the tree for a different overall type of model.
323
+ * If passing column weights or weighted column choices ('prob_pick_col_by_range', 'prob_pick_col_by_var'),
324
+ * the effect will be multiplicative. This option is not compatible with 'prob_pick_col_by_kurt'.
325
+ * If passing 'missing_action=fail' and the data has infinite values, columns with rows
326
+ * having infinite values will get a weight of zero. If passing a different value for missing
327
+ * action, infinite values will be ignored in the kurtosis calculation.
328
+ * If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
329
+ * in order not to favor columns with missing values (which would increase kurtosis by all having
330
+ * the same central value).
331
+ * - prob_pick_by_gain_pl
332
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
333
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
334
+ * that maximizes a pooled standard deviation gain criterion (see references [9] and [11]) on the
335
+ * same variable or linear combination, similarly to regression trees such as CART.
336
+ * If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
337
+ * in which the largest standardized gain can be achieved.
338
+ * For categorical variables with 'ndim=1', will use shannon entropy instead (like in [7]).
339
+ * Compared to a simple averaged gain, this tends to result in more evenly-divided splits and more clustered
340
+ * groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
341
+ * When used for outlier detection, datasets with multimodal distributions usually see better performance
342
+ * under this type of splits.
343
+ * Note that, since this makes the trees more even and thus it takes more steps to produce isolated nodes,
344
+ * the resulting object will be heavier. When splits are not made according to any of 'prob_pick_by_gain_avg',
345
+ * 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
346
+ * Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
347
+ * every single tree will have the exact same splits.
348
+ * Be aware that 'penalize_range' can also have a large impact when using 'prob_pick_by_gain_pl'.
349
+ * Be aware also that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable
350
+ * model, every single tree will have the exact same splits.
351
+ * Under this option, models are likely to produce better results when increasing 'max_depth'.
352
+ * Alternatively, one can also control the depth through 'min_gain' (for which one might want to
353
+ * set 'max_depth=0').
354
+ * Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain',
355
+ * 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
356
+ * - prob_pick_by_gain_avg
357
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
358
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
359
+ * that maximizes an averaged standard deviation gain criterion (see references [4] and [11]) on the
360
+ * same variable or linear combination.
361
+ * If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
362
+ * in which the largest standardized gain can be achieved.
363
+ * For categorical variables with 'ndim=1', will take the expected standard deviation that would be
364
+ * gotten if the column were converted to numerical by assigning to each category a random
365
+ * number ~ Unif(0, 1) and calculate gain with those assumed standard deviations.
366
+ * Compared to a pooled gain, this tends to result in more cases in which a single observation or very
367
+ * few of them are put into one branch. Typically, datasets with outliers defined by extreme values in
368
+ * some column more or less independently of the rest, usually see better performance under this type
369
+ * of split. Recommended to use sub-samples (parameter 'sample_size') when
370
+ * passing this parameter. Note that, since this will create isolated nodes faster, the resulting object
371
+ * will be lighter (use less memory).
372
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
373
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
374
+ * Default setting for [1], [2], [3] is zero, and default for [4] is 1.
375
+ * This is the randomization parameter that can be passed to the author's original code in [5],
376
+ * but note that the code in [5] suffers from a mathematical error in the calculation of running standard deviations,
377
+ * so the results from it might not match with this library's.
378
+ * Be aware that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable model,
379
+ * every single tree will have the exact same splits.
380
+ * Under this option, models are likely to produce better results when increasing 'max_depth'.
381
+ * Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
382
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
383
+ * - prob_pick_by_full_gain
384
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
385
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
386
+ * that minimizes the pooled sums of variances of all columns (or a subset of them if using
387
+ * 'ncols_per_tree').
388
+ * In general, 'prob_pick_by_full_gain' is much slower to evaluate than the other gain types, and does not tend to
389
+ * lead to better results. When using 'prob_pick_by_full_gain', one might want to use a different scoring
390
+ * metric (particularly 'Density', 'BoxedDensity2' or 'BoxedRatio'). Note that
391
+ * the variance calculations are all done through the (exact) sorted-indices approach, which is much
392
+ * slower than the (approximate) histogram approach used by other decision tree software.
393
+ * Be aware that the data is not standardized in any way for the range calculations, thus the scales
394
+ * of features will make a large difference under 'prob_pick_by_full_gain', which might not make it suitable for
395
+ * all types of data.
396
+ * 'prob_pick_by_full_gain' is not compatible with categorical data, and 'min_gain' does not apply to it.
397
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
398
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
399
+ * Default setting for [1], [2], [3], [4] is zero.
400
+ * - prob_pick_by_dens
401
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
402
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
403
+ * that maximizes the pooled densities of the branch distributions.
404
+ * The 'min_gain' option does not apply to this type of splits.
405
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
406
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
407
+ * Default setting for [1], [2], [3], [4] is zero.
408
+ * - prob_pick_col_by_range
409
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
410
+ * proportional to the range spanned by each column within a node as proposed in reference [12].
411
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
412
+ * probability proportional to the range spanned by each column within a node.
413
+ * This option is not compatible with categorical data. If passing column weights, the
414
+ * effect will be multiplicative.
415
+ * Be aware that the data is not standardized in any way for the range calculations, thus the scales
416
+ * of features will make a large difference under this option, which might not make it suitable for
417
+ * all types of data.
418
+ * Note that the proposed RRCF model from [12] uses a different scoring metric for producing anomaly
419
+ * scores, while this library uses isolation depth regardless of how columns are chosen, thus results
420
+ * are likely to be different from those of other software implementations. Nevertheless, as explored
421
+ * in [11], isolation depth as a scoring metric typically provides better results than the
422
+ * "co-displacement" metric from [12] under these split types.
423
+ * - prob_pick_col_by_var
424
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
425
+ * proportional to the variance of each column within a node.
426
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
427
+ * probability proportional to the variance of each column within a node.
428
+ * For categorical data, it will calculate the expected variance if the column were converted to
429
+ * numerical by assigning to each category a random number ~ Unif(0, 1), which depending on the number of
430
+ * categories and their distribution, produces numbers typically a bit smaller than standardized numerical
431
+ * variables.
432
+ * Note that when using sparse matrices, the calculation of variance will rely on a procedure that
433
+ * uses sums of squares, which has less numerical precision than the
434
+ * calculation used for dense inputs, and as such, the results might differ slightly.
435
+ * Be aware that this calculated variance is not standardized in any way, so the scales of
436
+ * features will make a large difference under this option.
437
+ * If there are infinite values, all columns having infinite values will be treated as having the
438
+ * same weight, and will be chosen before every other column with non-infinite values.
439
+ * If passing column weights, the effect will be multiplicative.
440
+ * If passing a 'missing_action' different than 'fail', infinite values will be ignored for the
441
+ * variance calculation. Otherwise, all columns with infinite values will have the same probability
442
+ * and will be chosen before columns with non-infinite values.
443
+ * - prob_pick_col_by_kurt
444
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
445
+ * proportional to the kurtosis of each column **within a node** (unlike the option 'weigh_by_kurt'
446
+ * which calculates this metric only at the root).
447
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
448
+ * probability proportional to the kurtosis of each column within a node.
449
+ * For categorical data, it will calculate the expected kurtosis if the column were converted to
450
+ * numerical by assigning to each category a random number ~ Unif(0, 1).
451
+ * Note that when using sparse matrices, the calculation of kurtosis will rely on a procedure that
452
+ * uses sums of squares and higher-power numbers, which has less numerical precision than the
453
+ * calculation used for dense inputs, and as such, the results might differ slightly.
454
+ * If passing column weights, the effect will be multiplicative. This option is not compatible
455
+ * with 'weigh_by_kurt'.
456
+ * If passing a 'missing_action' different than 'fail', infinite values will be ignored for the
457
+ * variance calculation. Otherwise, all columns with infinite values will have the same probability
458
+ * and will be chosen before columns with non-infinite values.
459
+ * If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
460
+ * in order not to favor columns with missing values (which would increase kurtosis by all having
461
+ * the same central value).
462
+ * Be aware that kurtosis can be a rather slow metric to calculate.
463
+ * - min_gain
464
+ * Minimum gain that a split threshold needs to produce in order to proceed with a split.
465
+ * Only used when the splits are decided by a variance gain criterion ('prob_pick_by_gain_pl' or
466
+ * 'prob_pick_by_gain_avg', but not 'prob_pick_by_full_gain' nor 'prob_pick_by_dens').
467
+ * If the highest possible gain in the evaluated splits at a node is below this threshold,
468
+ * that node becomes a terminal node.
469
+ * This can be used as a more sophisticated depth control when using pooled gain (note that 'max_depth'
470
+ * still applies on top of this heuristic).
471
+ * - missing_action
472
+ * How to handle missing data at both fitting and prediction time. Options are a) 'Divide' (for the single-variable
473
+ * model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
474
+ * the data that went to each branch when fitting the model, b) 'Impute', which will assign observations to the
475
+ * branch with the most observations in the single-variable model (but imputed values will also be used for
476
+ * gain calculations), or fill in missing values with the median of each column of the sample from which the
477
+ * split was made in the extended model (recommended) (but note that the calculation of medians does not take
478
+ * into account sample weights when using 'weights_as_sample_prob=false', and note that when using a gain
479
+ * criterion for splits with 'ndim=1', it will use the imputed values in the calculation), c) 'Fail' which will
480
+ * assume that there are no missing values and will trigger undefined behavior if it encounters any.
481
+ * In the extended model, infinite values will be treated as missing.
482
+ * Note that passing 'Fail' might crash the process if there turn out to be missing values, but will otherwise
483
+ * produce faster fitting and prediction times along with decreased model object sizes.
484
+ * Models from [1], [2], [3], [4] correspond to 'Fail' here.
485
+ * - cat_split_type
486
+ * Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
487
+ * a single category to a branch and the rest to the other branch. For the extended model, whether to
488
+ * give each category a coefficient, or only one while the rest get zero.
489
+ * - new_cat_action
490
+ * What to do after splitting a categorical feature when new data that reaches that split has categories that
491
+ * the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
492
+ * in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
493
+ * data that went to each branch when fitting the model, and in the extended model will assign
494
+ * them the median value for that column that was added to the linear combination of features (but note that
495
+ * this median calculation does not use sample weights when using 'weights_as_sample_prob=false'),
496
+ * b) "Smallest", which will assign all observations with unseen categories in the split to the branch that
497
+ * had fewer observations when fitting the model, c) "Random", which will assign a branch (coefficient in the
498
+ * extended model) at random for each category beforehand, even if no observations had that category when
499
+ * fitting the model. Ignored when passing 'cat_split_type' = 'SingleCateg'.
500
+ * - all_perm
501
+ * When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
502
+ * whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
503
+ * will sort the categories by their frequency and make a grouping in this sorted order. Note that the
504
+ * number of combinations evaluated (if 'true') is the factorial of the number of present categories in
505
+ * a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
506
+ * category in a separate branch, so not evaluating all permutations (passing 'false') will make it
507
+ * possible to select other splits that respect the sorted frequency order.
508
+ * The total number of combinations must be a number that can fit into a 'size_t' variable - for x86-64
509
+ * systems, this means no column can have more than 20 different categories if using 'all_perm=true',
510
+ * but note that this is not checked within the function.
511
+ * Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
512
+ * - coef_by_prop
513
+ * In the extended model, whether to sort the randomly-generated coefficients for categories
514
+ * according to their relative frequency in the tree node. This might provide better results when using
515
+ * categorical variables with too many categories, but is not recommended, and not reflective of
516
+ * real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
517
+ * variables.
518
+ * - imputer (out)
519
+ * Pointer to already-allocated imputer object, which can be used to produce missing value imputations
520
+ * in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
521
+ * 'missing_action' as missing values inside the model are treated differently and follow their own imputation
522
+ * or division strategy.
523
+ * - min_imp_obs
524
+ * Minimum number of observations with which an imputation value can be produced. Ignored if passing
525
+ * 'build_imputer' = 'false'.
526
+ * - depth_imp
527
+ * How to weight observations according to their depth when used for imputing missing values. Passing
528
+ * "Higher" will weigh observations higher the further down the tree (away from the root node) the
529
+ * terminal node is, while "lower" will do the opposite, and "Sane" will not modify the weights according
530
+ * to node depth in the tree. Implemented for testing purposes and not recommended to change
531
+ * from the default. Ignored when not passing 'impute_nodes'.
532
+ * - weigh_imp_rows
533
+ * How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
534
+ * a node inversely proportional to the number of observations that end up there, while "Proportional"
535
+ * will weight them heavier the more observations there are, and "Flat" will weigh all nodes the same
536
+ * in this regard regardless of how many observations end up there. Implemented for testing purposes
537
+ * and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
538
+ * - impute_at_fit
539
+ * Whether to impute missing values in the input data as the model is being built. If passing 'true',
540
+ * then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
541
+ * 'categ_data', and 'Xc', will get overwritten with the imputations produced.
542
+ * - random_seed
543
+ * Seed that will be used to generate random numbers used by the model.
544
+ * - use_long_double
545
+ * Whether to use 'long double' (extended precision) type for more precise calculations about
546
+ * standard deviations, means, ratios, weights, gain, and other potential aggregates. This makes
547
+ * such calculations accurate to a larger number of decimals (provided that the compiler used has
548
+ * wider long doubles than doubles) and it is highly recommended to use when the input data has
549
+ * a number of rows or columns exceeding 2^53 (an unlikely scenario), and also highly recommended
550
+ * to use when the input data has problematic scales (e.g. numbers that differ from each other by
551
+ * something like 10^-100 or columns that include values like 10^100 and 10^-100 and still need to
552
+ * be sensitive to a difference of 10^-100), but will make the calculations slower, the more so in
553
+ * platforms in which 'long double' is a software-emulated type (e.g. Power8 platforms).
554
+ * Note that some platforms (most notably windows with the msvc compiler) do not make any difference
555
+ * between 'double' and 'long double'.
556
+ * - nthreads
557
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
558
+ * allocated, even if the thread does not end up being used.
559
+ * Be aware that most of the operations are bound by memory bandwidth, which means that
560
+ * adding more threads will not result in a linear speed-up. For some types of data
561
+ * (e.g. large sparse matrices with small sample sizes), adding more threads might result
562
+ * in only a very modest speed up (e.g. 1.5x faster with 4x more threads),
563
+ * even if all threads look fully utilized.
564
+ * Ignored when not building with OpenMP support.
565
+ *
566
+ * Returns
567
+ * =======
568
+ * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
569
+ * If the process receives an interrupt signal, will return instead
570
+ * 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
571
+ * what these values correspond to, you can use the functions
572
+ * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
573
+ * as integers.
574
+ */
575
+ /* Dispatcher for 'fit_iforest_internal': selects the floating-point type used
+    for internal running aggregates (plain 'double', or 'long double' for extra
+    precision) according to 'use_long_double', then forwards every argument
+    unchanged. See the large documentation comment above for the meaning of
+    each parameter. */
+ template <class real_t, class sparse_ix>
576
+ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
577
+ real_t numeric_data[], size_t ncols_numeric,
578
+ int categ_data[], size_t ncols_categ, int ncat[],
579
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
580
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
581
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
582
+ size_t nrows, size_t sample_size, size_t ntrees,
583
+ size_t max_depth, size_t ncols_per_tree,
584
+ bool limit_depth, bool penalize_range, bool standardize_data,
585
+ ScoringMetric scoring_metric, bool fast_bratio,
586
+ bool standardize_dist, double tmat[],
587
+ double output_depths[], bool standardize_depth,
588
+ real_t col_weights[], bool weigh_by_kurt,
589
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
590
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
591
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
592
+ double prob_pick_col_by_kurt,
593
+ double min_gain, MissingAction missing_action,
594
+ CategSplit cat_split_type, NewCategAction new_cat_action,
595
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
596
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
597
+ uint64_t random_seed, bool use_long_double, int nthreads)
598
+ {
599
+ /* If the library was built without long double support, degrade gracefully
+    to 'double' and warn on stderr instead of failing. */
+ if (use_long_double && !has_long_double()) {
600
+ use_long_double = false;
601
+ fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
602
+ }
603
+ /* When 'NO_LONG_DOUBLE' is defined at compile time, only the 'double'
+    instantiation below exists and is taken unconditionally. */
+ #ifndef NO_LONG_DOUBLE
604
+ if (likely(!use_long_double))
605
+ #endif
606
+ return fit_iforest_internal<real_t, sparse_ix, double>(
607
+ model_outputs, model_outputs_ext,
608
+ numeric_data, ncols_numeric,
609
+ categ_data, ncols_categ, ncat,
610
+ Xc, Xc_ind, Xc_indptr,
611
+ ndim, ntry, coef_type, coef_by_prop,
612
+ sample_weights, with_replacement, weight_as_sample,
613
+ nrows, sample_size, ntrees,
614
+ max_depth, ncols_per_tree,
615
+ limit_depth, penalize_range, standardize_data,
616
+ scoring_metric, fast_bratio,
617
+ standardize_dist, tmat,
618
+ output_depths, standardize_depth,
619
+ col_weights, weigh_by_kurt,
620
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
621
+ prob_pick_by_full_gain, prob_pick_by_dens,
622
+ prob_pick_col_by_range, prob_pick_col_by_var,
623
+ prob_pick_col_by_kurt,
624
+ min_gain, missing_action,
625
+ cat_split_type, new_cat_action,
626
+ all_perm, imputer, min_imp_obs,
627
+ depth_imp, weigh_imp_rows, impute_at_fit,
628
+ random_seed, nthreads
629
+ );
630
+ #ifndef NO_LONG_DOUBLE
631
+ else
632
+ /* Higher-precision path: identical arguments, 'long double' accumulators. */
+ return fit_iforest_internal<real_t, sparse_ix, long double>(
633
+ model_outputs, model_outputs_ext,
634
+ numeric_data, ncols_numeric,
635
+ categ_data, ncols_categ, ncat,
636
+ Xc, Xc_ind, Xc_indptr,
637
+ ndim, ntry, coef_type, coef_by_prop,
638
+ sample_weights, with_replacement, weight_as_sample,
639
+ nrows, sample_size, ntrees,
640
+ max_depth, ncols_per_tree,
641
+ limit_depth, penalize_range, standardize_data,
642
+ scoring_metric, fast_bratio,
643
+ standardize_dist, tmat,
644
+ output_depths, standardize_depth,
645
+ col_weights, weigh_by_kurt,
646
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
647
+ prob_pick_by_full_gain, prob_pick_by_dens,
648
+ prob_pick_col_by_range, prob_pick_col_by_var,
649
+ prob_pick_col_by_kurt,
650
+ min_gain, missing_action,
651
+ cat_split_type, new_cat_action,
652
+ all_perm, imputer, min_imp_obs,
653
+ depth_imp, weigh_imp_rows, impute_at_fit,
654
+ random_seed, nthreads
655
+ );
656
+ #endif
657
+ }
658
+
659
+ template <class real_t, class sparse_ix, class ldouble_safe>
660
+ int fit_iforest_internal(
661
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
662
+ real_t numeric_data[], size_t ncols_numeric,
663
+ int categ_data[], size_t ncols_categ, int ncat[],
664
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
665
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
666
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
667
+ size_t nrows, size_t sample_size, size_t ntrees,
668
+ size_t max_depth, size_t ncols_per_tree,
669
+ bool limit_depth, bool penalize_range, bool standardize_data,
670
+ ScoringMetric scoring_metric, bool fast_bratio,
671
+ bool standardize_dist, double tmat[],
672
+ double output_depths[], bool standardize_depth,
673
+ real_t col_weights[], bool weigh_by_kurt,
674
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
675
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
676
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
677
+ double prob_pick_col_by_kurt,
678
+ double min_gain, MissingAction missing_action,
679
+ CategSplit cat_split_type, NewCategAction new_cat_action,
680
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
681
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
682
+ uint64_t random_seed, int nthreads)
683
+ {
684
+ if (
685
+ prob_pick_by_gain_avg < 0 || prob_pick_by_gain_pl < 0 ||
686
+ prob_pick_by_full_gain < 0 || prob_pick_by_dens < 0 ||
687
+ prob_pick_col_by_range < 0 ||
688
+ prob_pick_col_by_var < 0 || prob_pick_col_by_kurt < 0
689
+ ) {
690
+ throw std::runtime_error("Cannot pass negative probabilities.\n");
691
+ }
692
+ if (prob_pick_col_by_range && ncols_categ)
693
+ throw std::runtime_error("'prob_pick_col_by_range' is not compatible with categorical data.\n");
694
+ if (prob_pick_by_full_gain && ncols_categ)
695
+ throw std::runtime_error("'prob_pick_by_full_gain' is not compatible with categorical data.\n");
696
+ if (prob_pick_col_by_kurt && weigh_by_kurt)
697
+ throw std::runtime_error("'weigh_by_kurt' and 'prob_pick_col_by_kurt' cannot be used together.\n");
698
+ if (ndim == 0 && model_outputs == NULL)
699
+ throw std::runtime_error("Must pass 'ndim>0' in the extended model.\n");
700
+ if (penalize_range &&
701
+ (scoring_metric == Density ||
702
+ scoring_metric == AdjDensity ||
703
+ is_boxed_metric(scoring_metric))
704
+ )
705
+ throw std::runtime_error("'penalize_range' is incompatible with density scoring.\n");
706
+ if (with_replacement) {
707
+ if (tmat != NULL)
708
+ throw std::runtime_error("Cannot calculate distance while sampling with replacement.\n");
709
+ if (output_depths != NULL)
710
+ throw std::runtime_error("Cannot make predictions at fit time when sampling with replacement.\n");
711
+ if (impute_at_fit)
712
+ throw std::runtime_error("Cannot impute at fit time when sampling with replacement.\n");
713
+ }
714
+ if (sample_size != 0 && sample_size < nrows) {
715
+ if (output_depths != NULL)
716
+ throw std::runtime_error("Cannot produce outlier scores at fit time when using sub-sampling.\n");
717
+ if (tmat != NULL)
718
+ throw std::runtime_error("Cannot calculate distances at fit time when using sub-sampling.\n");
719
+ if (impute_at_fit)
720
+ throw std::runtime_error("Cannot produce missing data imputations at fit time when using sub-sampling.\n");
721
+ }
722
+
723
+
724
+ /* TODO: this function should also accept the array as a memoryview with a
725
+ leading dimension that might not correspond to the number of columns,
726
+ so as to avoid having to make deep copies of memoryviews in python and to
727
+ allow using pointers to columns of dataframes in R and Python. */
728
+
729
+ /* calculate maximum number of categories to use later */
730
+ int max_categ = 0;
731
+ for (size_t col = 0; col < ncols_categ; col++)
732
+ max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
733
+
734
+ bool calc_dist = tmat != NULL;
735
+
736
+ if (sample_size == 0)
737
+ sample_size = nrows;
738
+
739
+ if (model_outputs != NULL)
740
+ ntry = std::min(ntry, ncols_numeric + ncols_categ);
741
+
742
+ if (ncols_per_tree == 0)
743
+ ncols_per_tree = ncols_numeric + ncols_categ;
744
+
745
+ /* put data in structs to shorten function calls */
746
+ InputData<real_t, sparse_ix>
747
+ input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
748
+ nrows, ncols_numeric + ncols_categ, sample_weights,
749
+ weight_as_sample, col_weights,
750
+ Xc, Xc_ind, Xc_indptr,
751
+ 0, 0, std::vector<double>(),
752
+ std::vector<char>(), 0, NULL,
753
+ (double*)NULL, (double*)NULL, (int*)NULL, std::vector<double>(),
754
+ std::vector<double>(), std::vector<double>(),
755
+ std::vector<size_t>(), std::vector<size_t>()};
756
+ ModelParams model_params = {with_replacement, sample_size, ntrees, ncols_per_tree,
757
+ limit_depth? log2ceil(sample_size) : max_depth? max_depth : (sample_size - 1),
758
+ penalize_range, standardize_data, random_seed, weigh_by_kurt,
759
+ prob_pick_by_gain_avg, prob_pick_by_gain_pl,
760
+ prob_pick_by_full_gain, prob_pick_by_dens,
761
+ prob_pick_col_by_range, prob_pick_col_by_var,
762
+ prob_pick_col_by_kurt,
763
+ min_gain, cat_split_type, new_cat_action, missing_action,
764
+ scoring_metric, fast_bratio, all_perm,
765
+ (model_outputs != NULL)? 0 : ndim, ntry,
766
+ coef_type, coef_by_prop, calc_dist, (bool)(output_depths != NULL), impute_at_fit,
767
+ depth_imp, weigh_imp_rows, min_imp_obs};
768
+
769
+ /* if calculating full gain, need to produce copies of the data in row-major order */
770
+ if (prob_pick_by_full_gain)
771
+ {
772
+ if (input_data.Xc_indptr == NULL)
773
+ colmajor_to_rowmajor(input_data.numeric_data, input_data.nrows, input_data.ncols_numeric, input_data.X_row_major);
774
+ else
775
+ colmajor_to_rowmajor(input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
776
+ input_data.nrows, input_data.ncols_numeric,
777
+ input_data.Xr, input_data.Xr_ind, input_data.Xr_indptr);
778
+ }
779
+
780
+ /* if using weights as sampling probability, build a binary tree for faster sampling */
781
+ if (input_data.weight_as_sample && input_data.sample_weights != NULL)
782
+ {
783
+ build_btree_sampler(input_data.btree_weights_init, input_data.sample_weights,
784
+ input_data.nrows, input_data.log2_n, input_data.btree_offset);
785
+ }
786
+
787
+ /* same for column weights */
788
+ /* TODO: this should also save the kurtoses when using 'prob_pick_col_by_kurt' */
789
+ ColumnSampler<ldouble_safe> base_col_sampler;
790
+ if (
791
+ col_weights != NULL ||
792
+ (model_params.weigh_by_kurt && model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
793
+ (model_params.ncols_per_tree >= input_data.ncols_tot / (model_params.ntrees * 2)))
794
+ )
795
+ {
796
+ bool avoid_col_weights = (model_outputs != NULL && model_params.ntry >= model_params.ncols_per_tree &&
797
+ model_params.prob_pick_by_gain_avg + model_params.prob_pick_by_gain_pl +
798
+ model_params.prob_pick_by_full_gain + model_params.prob_pick_by_dens >= 1)
799
+ ||
800
+ (model_outputs == NULL && model_params.ndim >= model_params.ncols_per_tree)
801
+ ||
802
+ (model_params.ncols_per_tree == 1);
803
+ if (!avoid_col_weights)
804
+ {
805
+ if (model_params.weigh_by_kurt && model_params.sample_size == input_data.nrows && !model_params.with_replacement)
806
+ {
807
+ RNG_engine rnd_generator(random_seed);
808
+ std::vector<double> kurt_weights = calc_kurtosis_all_data<InputData<real_t, sparse_ix>, ldouble_safe>(input_data, model_params, rnd_generator);
809
+ if (col_weights != NULL)
810
+ {
811
+ for (size_t col = 0; col < input_data.ncols_tot; col++)
812
+ {
813
+ if (kurt_weights[col] <= 0) continue;
814
+ kurt_weights[col] *= col_weights[col];
815
+ kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
816
+ }
817
+ }
818
+ base_col_sampler.initialize(kurt_weights.data(), input_data.ncols_tot);
819
+
820
+ if (model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var)
821
+ {
822
+ input_data.all_kurtoses = std::move(kurt_weights);
823
+ }
824
+ }
825
+
826
+ else
827
+ {
828
+ base_col_sampler.initialize(input_data.col_weights, input_data.ncols_tot);
829
+ }
830
+
831
+ input_data.preinitialized_col_sampler = &base_col_sampler;
832
+ }
833
+ }
834
+
835
+ /* in some cases, all trees will need to calculate variable ranges for all columns */
836
+ /* TODO: the model might use 'leave_m_cols', or have 'prob_pick_col_by_range<1', in which
837
+ case it might not be beneficial to do this beforehand. Find out when the expected gain
838
+ from doing this here is not beneficial. */
839
+ /* TODO: move this to a different file, it doesn't belong here */
840
+ std::vector<double> variable_ranges_low;
841
+ std::vector<double> variable_ranges_high;
842
+ std::vector<int> variable_ncats;
843
+ if (
844
+ model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
845
+ (model_params.ncols_per_tree >= input_data.ncols_numeric) &&
846
+ ((model_params.prob_pick_col_by_range && input_data.ncols_numeric)
847
+ ||
848
+ is_boxed_metric(model_params.scoring_metric))
849
+ )
850
+ {
851
+ variable_ranges_low.resize(input_data.ncols_numeric);
852
+ variable_ranges_high.resize(input_data.ncols_numeric);
853
+
854
+ std::unique_ptr<unsigned char[]> buffer_cats;
855
+ size_t adj_col;
856
+ if (is_boxed_metric(model_params.scoring_metric))
857
+ {
858
+ variable_ncats.resize(input_data.ncols_categ);
859
+ buffer_cats = std::unique_ptr<unsigned char[]>(new unsigned char[input_data.max_categ]);
860
+ }
861
+
862
+ if (base_col_sampler.col_indices.empty())
863
+ base_col_sampler.initialize(input_data.ncols_tot);
864
+
865
+ bool unsplittable;
866
+ size_t n_tried_numeric = 0;
867
+ size_t col;
868
+ base_col_sampler.prepare_full_pass();
869
+ while (base_col_sampler.sample_col(col))
870
+ {
871
+ if (col < input_data.ncols_numeric)
872
+ {
873
+ if (input_data.Xc_indptr == NULL)
874
+ {
875
+ get_range(input_data.numeric_data + nrows*col,
876
+ input_data.nrows,
877
+ model_params.missing_action,
878
+ variable_ranges_low[col],
879
+ variable_ranges_high[col],
880
+ unsplittable);
881
+ }
882
+
883
+ else
884
+ {
885
+ get_range(col, input_data.nrows,
886
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
887
+ model_params.missing_action,
888
+ variable_ranges_low[col],
889
+ variable_ranges_high[col],
890
+ unsplittable);
891
+ }
892
+
893
+ n_tried_numeric++;
894
+
895
+ if (unsplittable)
896
+ {
897
+ variable_ranges_low[col] = 0;
898
+ variable_ranges_high[col] = 0;
899
+ base_col_sampler.drop_col(col);
900
+ }
901
+ }
902
+
903
+ else
904
+ {
905
+ if (!is_boxed_metric(model_params.scoring_metric))
906
+ {
907
+ if (n_tried_numeric >= input_data.ncols_numeric)
908
+ break;
909
+ else
910
+ continue;
911
+ }
912
+ adj_col = col - input_data.ncols_numeric;
913
+
914
+
915
+ variable_ncats[adj_col] = count_ncateg_in_col(input_data.categ_data + input_data.nrows*adj_col,
916
+ input_data.nrows, input_data.ncat[adj_col],
917
+ buffer_cats.get());
918
+ if (variable_ncats[adj_col] <= 1)
919
+ base_col_sampler.drop_col(col);
920
+ }
921
+ }
922
+
923
+ input_data.preinitialized_col_sampler = &base_col_sampler;
924
+ if (input_data.ncols_numeric) {
925
+ input_data.range_low = variable_ranges_low.data();
926
+ input_data.range_high = variable_ranges_high.data();
927
+ }
928
+ if (input_data.ncols_categ) {
929
+ input_data.ncat_ = variable_ncats.data();
930
+ }
931
+ }
932
+
933
+ /* if imputing missing values on-the-fly, need to determine which are missing */
934
+ std::vector<ImputedData<sparse_ix, ldouble_safe>> impute_vec;
935
+ hashed_map<size_t, ImputedData<sparse_ix, ldouble_safe>> impute_map;
936
+ if (model_params.impute_at_fit)
937
+ check_for_missing(input_data, impute_vec, impute_map, nthreads);
938
+
939
+ /* store model data */
940
+ if (model_outputs != NULL)
941
+ {
942
+ model_outputs->trees.resize(ntrees);
943
+ model_outputs->trees.shrink_to_fit();
944
+ model_outputs->new_cat_action = new_cat_action;
945
+ model_outputs->cat_split_type = cat_split_type;
946
+ model_outputs->missing_action = missing_action;
947
+ model_outputs->scoring_metric = scoring_metric;
948
+ if (
949
+ model_outputs->scoring_metric != Density &&
950
+ model_outputs->scoring_metric != BoxedDensity &&
951
+ model_outputs->scoring_metric != BoxedDensity2 &&
952
+ model_outputs->scoring_metric != BoxedRatio
953
+ )
954
+ model_outputs->exp_avg_depth = expected_avg_depth<ldouble_safe>(sample_size);
955
+ else
956
+ model_outputs->exp_avg_depth = 1;
957
+ model_outputs->exp_avg_sep = expected_separation_depth<ldouble_safe>(model_params.sample_size);
958
+ model_outputs->orig_sample_size = input_data.nrows;
959
+ model_outputs->has_range_penalty = penalize_range;
960
+ }
961
+
962
+ else
963
+ {
964
+ model_outputs_ext->hplanes.resize(ntrees);
965
+ model_outputs_ext->hplanes.shrink_to_fit();
966
+ model_outputs_ext->new_cat_action = new_cat_action;
967
+ model_outputs_ext->cat_split_type = cat_split_type;
968
+ model_outputs_ext->missing_action = missing_action;
969
+ model_outputs_ext->scoring_metric = scoring_metric;
970
+ if (
971
+ model_outputs_ext->scoring_metric != Density &&
972
+ model_outputs_ext->scoring_metric != BoxedDensity &&
973
+ model_outputs_ext->scoring_metric != BoxedDensity2 &&
974
+ model_outputs_ext->scoring_metric != BoxedRatio
975
+ )
976
+ model_outputs_ext->exp_avg_depth = expected_avg_depth<ldouble_safe>(sample_size);
977
+ else
978
+ model_outputs_ext->exp_avg_depth = 1;
979
+ model_outputs_ext->exp_avg_sep = expected_separation_depth<ldouble_safe>(model_params.sample_size);
980
+ model_outputs_ext->orig_sample_size = input_data.nrows;
981
+ model_outputs_ext->has_range_penalty = penalize_range;
982
+ }
983
+
984
+ if (imputer != NULL)
985
+ initialize_imputer<decltype(input_data), ldouble_safe>(
986
+ *imputer, input_data, ntrees, nthreads
987
+ );
988
+
989
+ /* initialize thread-private memory */
990
+ if ((size_t)nthreads > ntrees)
991
+ nthreads = (int)ntrees;
992
+ #ifdef _OPENMP
993
+ std::vector<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> worker_memory(nthreads);
994
+ #else
995
+ std::vector<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> worker_memory(1);
996
+ #endif
997
+
998
+ /* Global variable that determines if the procedure receives a stop signal */
999
+ SignalSwitcher ss = SignalSwitcher();
1000
+
1001
+ /* For exception handling */
1002
+ bool threw_exception = false;
1003
+ std::exception_ptr ex = NULL;
1004
+
1005
+ /* grow trees */
1006
+ #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params, threw_exception, ex)
1007
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
1008
+ {
1009
+ if (interrupt_switch || threw_exception)
1010
+ continue; /* Cannot break with OpenMP==2.0 (MSVC) */
1011
+
1012
+ try
1013
+ {
1014
+ if (
1015
+ model_params.impute_at_fit &&
1016
+ input_data.n_missing &&
1017
+ !worker_memory[omp_get_thread_num()].impute_vec.size() &&
1018
+ !worker_memory[omp_get_thread_num()].impute_map.size()
1019
+ )
1020
+ {
1021
+ #ifdef _OPENMP
1022
+ if (nthreads > 1)
1023
+ {
1024
+ worker_memory[omp_get_thread_num()].impute_vec = impute_vec;
1025
+ worker_memory[omp_get_thread_num()].impute_map = impute_map;
1026
+ }
1027
+
1028
+ else
1029
+ #endif
1030
+ {
1031
+ worker_memory[0].impute_vec = std::move(impute_vec);
1032
+ worker_memory[0].impute_map = std::move(impute_map);
1033
+ }
1034
+ }
1035
+
1036
+ fit_itree<decltype(input_data), typename std::remove_pointer<decltype(worker_memory.data())>::type, ldouble_safe>(
1037
+ (model_outputs != NULL)? &model_outputs->trees[tree] : NULL,
1038
+ (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : NULL,
1039
+ worker_memory[omp_get_thread_num()],
1040
+ input_data,
1041
+ model_params,
1042
+ (imputer != NULL)? &(imputer->imputer_tree[tree]) : NULL,
1043
+ tree);
1044
+
1045
+ if ((model_outputs != NULL))
1046
+ model_outputs->trees[tree].shrink_to_fit();
1047
+ else
1048
+ model_outputs_ext->hplanes[tree].shrink_to_fit();
1049
+ }
1050
+
1051
+ catch (...)
1052
+ {
1053
+ #pragma omp critical
1054
+ {
1055
+ if (!threw_exception)
1056
+ {
1057
+ threw_exception = true;
1058
+ ex = std::current_exception();
1059
+ }
1060
+ }
1061
+ }
1062
+ }
1063
+
1064
+ /* check if the procedure got interrupted */
1065
+ check_interrupt_switch(ss);
1066
+ #if defined(DONT_THROW_ON_INTERRUPT)
1067
+ if (interrupt_switch) return EXIT_FAILURE;
1068
+ #endif
1069
+
1070
+ /* check if some exception was thrown */
1071
+ if (threw_exception)
1072
+ std::rethrow_exception(ex);
1073
+
1074
+ if ((model_outputs != NULL))
1075
+ model_outputs->trees.shrink_to_fit();
1076
+ else
1077
+ model_outputs_ext->hplanes.shrink_to_fit();
1078
+
1079
+ /* if calculating similarity/distance, now need to reduce and average */
1080
+ if (calc_dist)
1081
+ gather_sim_result< PredictionData<real_t, sparse_ix>, InputData<real_t, sparse_ix> >
1082
+ (NULL, &worker_memory,
1083
+ NULL, &input_data,
1084
+ model_outputs, model_outputs_ext,
1085
+ tmat, NULL, 0,
1086
+ model_params.ntrees, false,
1087
+ standardize_dist, false, nthreads);
1088
+
1089
+ check_interrupt_switch(ss);
1090
+ #if defined(DONT_THROW_ON_INTERRUPT)
1091
+ if (interrupt_switch) return EXIT_FAILURE;
1092
+ #endif
1093
+
1094
+ /* same for depths */
1095
+ if (output_depths != NULL)
1096
+ {
1097
+ #ifdef _OPENMP
1098
+ if (nthreads > 1)
1099
+ {
1100
+ for (auto &w : worker_memory)
1101
+ {
1102
+ if (w.row_depths.size())
1103
+ {
1104
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data, output_depths, w, worker_memory)
1105
+ for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
1106
+ output_depths[row] += w.row_depths[row];
1107
+ }
1108
+ }
1109
+ }
1110
+ else
1111
+ #endif
1112
+ {
1113
+ std::copy(worker_memory[0].row_depths.begin(), worker_memory[0].row_depths.end(), output_depths);
1114
+ }
1115
+
1116
+ if (standardize_depth)
1117
+ {
1118
+ double depth_divisor = (double)ntrees * ((model_outputs != NULL)?
1119
+ model_outputs->exp_avg_depth : model_outputs_ext->exp_avg_depth);
1120
+ for (size_t row = 0; row < nrows; row++)
1121
+ output_depths[row] = std::exp2( - output_depths[row] / depth_divisor );
1122
+ }
1123
+
1124
+ else
1125
+ {
1126
+ double ntrees_dbl = (double) ntrees;
1127
+ for (size_t row = 0; row < nrows; row++)
1128
+ output_depths[row] /= ntrees_dbl;
1129
+ }
1130
+ }
1131
+
1132
+ check_interrupt_switch(ss);
1133
+ #if defined(DONT_THROW_ON_INTERRUPT)
1134
+ if (interrupt_switch) return EXIT_FAILURE;
1135
+ #endif
1136
+
1137
+ /* if imputing missing values, now need to reduce and write final values */
1138
+ if (model_params.impute_at_fit)
1139
+ {
1140
+ #ifdef _OPENMP
1141
+ if (nthreads > 1)
1142
+ {
1143
+ for (auto &w : worker_memory)
1144
+ combine_tree_imputations(w, impute_vec, impute_map, input_data.has_missing, nthreads);
1145
+ }
1146
+
1147
+ else
1148
+ #endif
1149
+ {
1150
+ impute_vec = std::move(worker_memory[0].impute_vec);
1151
+ impute_map = std::move(worker_memory[0].impute_map);
1152
+ }
1153
+
1154
+ apply_imputation_results(impute_vec, impute_map, *imputer, input_data, nthreads);
1155
+ }
1156
+
1157
+ check_interrupt_switch(ss);
1158
+ #if defined(DONT_THROW_ON_INTERRUPT)
1159
+ if (interrupt_switch) return EXIT_FAILURE;
1160
+ #endif
1161
+
1162
+ return EXIT_SUCCESS;
1163
+ }
1164
+
1165
+
1166
+ /* Add additional trees to already-fitted isolation forest model
1167
+ *
1168
+ * Parameters
1169
+ * ==========
1170
+ * - model_outputs
1171
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
1172
+ * if the trees are to be added to an extended model. Can only pass one of
1173
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
1174
+ * so it cannot be run in parallel for the same model object.
1175
+ * - model_outputs_ext
1176
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
1177
+ * if the trees are to be added to a single-variable model. Can only pass one of
1178
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
1179
+ * so it cannot be run in parallel for the same model object.
1180
+ * - numeric_data[nrows * ncols_numeric]
1181
+ * Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
1182
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
1183
+ * Pass NULL if there are no dense numeric columns.
1184
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
1185
+ * If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
1186
+ * of columns, either as dense or as sparse arrays.
1187
+ * - ncols_numeric
1188
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
1189
+ * what was originally passed to 'fit_iforest'.
1190
+ * - categ_data[nrows * ncols_categ]
1191
+ * Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
1192
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
1193
+ * Pass NULL if there are no categorical columns. The encoding must be the same as was used
1194
+ * in the data to which the model was fit.
1195
+ * Each category should be represented as an integer, and these integers must start at zero and
1196
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
1197
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
1198
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
1199
+ * must be the same as was used in the data to which the model was fit.
1200
+ * If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
1201
+ * of columns and the same category encoding.
1202
+ * - ncols_categ
1203
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
1204
+ * what was originally passed to 'fit_iforest'.
1205
+ * - ncat[ncols_categ]
1206
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). May contain new categories,
1207
+ * but should keep the same encodings that were used for previous categories.
1208
+ * - Xc[nnz]
1209
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
1210
+ * Pass NULL if there are no sparse numeric columns.
1211
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
1212
+ * - Xc_ind[nnz]
1213
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
1214
+ * Must be in sorted order, otherwise results will be incorrect.
1215
+ * Pass NULL if there are no sparse numeric columns.
1216
+ * - Xc_indptr[ncols_numeric + 1]
1217
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
1218
+ * start and at entry [col + 1] where does column 'col' end.
1219
+ * Pass NULL if there are no sparse numeric columns.
1220
+ * - ndim
1221
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
1222
+ * what was originally passed to 'fit_iforest'.
1223
+ * - ntry
1224
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1225
+ * what was originally passed to 'fit_iforest'.
1226
+ * - coef_type
1227
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1228
+ * what was originally passed to 'fit_iforest'.
1229
+ * - sample_weights
1230
+ * Weights for the rows when adding this tree, either as sampling importances when using
1231
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
1232
+ * in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
1233
+ * the row appeared twice, thus it's less of an outlier) - how this is taken is determined
1234
+ * through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
1235
+ * Pass NULL if the rows all have uniform weights.
1236
+ * - nrows
1237
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
1238
+ * - max_depth
1239
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1240
+ * what was originally passed to 'fit_iforest'.
1241
+ * - ncols_per_tree
1242
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1243
+ * what was originally passed to 'fit_iforest'.
1244
+ * - limit_depth
1245
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1246
+ * what was originally passed to 'fit_iforest'.
1247
+ * - penalize_range
1248
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1249
+ * what was originally passed to 'fit_iforest'.
1250
+ * - standardize_data
1251
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1252
+ * what was originally passed to 'fit_iforest'.
1253
+ * - fast_bratio
1254
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1255
+ * what was originally passed to 'fit_iforest'.
1256
+ * - col_weights
1257
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
1258
+ * Ignored when picking columns by deterministic criterion.
1259
+ * If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
1260
+ * effect is multiplicative.
1261
+ * - weigh_by_kurt
1262
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1263
+ * what was originally passed to 'fit_iforest'.
1264
+ * - prob_pick_by_gain_pl
1265
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1266
+ * what was originally passed to 'fit_iforest'.
1267
+ * - prob_pick_by_gain_avg
1268
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1269
+ * what was originally passed to 'fit_iforest'.
1270
+ * - prob_pick_by_full_gain
1271
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1272
+ * what was originally passed to 'fit_iforest'.
1273
+ * - prob_pick_by_dens
1274
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1275
+ * what was originally passed to 'fit_iforest'.
1276
+ * - prob_pick_col_by_range
1277
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1278
+ * what was originally passed to 'fit_iforest'.
1279
+ * - prob_pick_col_by_var
1280
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1281
+ * what was originally passed to 'fit_iforest'.
1282
+ * - prob_pick_col_by_kurt
1283
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1284
+ * what was originally passed to 'fit_iforest'.
1285
+ * - min_gain
1286
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1287
+ * what was originally passed to 'fit_iforest'.
1288
+ * - missing_action
1289
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
1290
+ * what was originally passed to 'fit_iforest'.
1291
+ * - cat_split_type
1292
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
1293
+ * what was originally passed to 'fit_iforest'.
1294
+ * - new_cat_action
1295
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
1296
+ * what was originally passed to 'fit_iforest'.
1297
+ * - depth_imp
1298
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
1299
+ * what was originally passed to 'fit_iforest'.
1300
+ * - weigh_imp_rows
1301
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
1302
+ * what was originally passed to 'fit_iforest'.
1303
+ * - all_perm
1304
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1305
+ * what was originally passed to 'fit_iforest'.
1306
+ * - coef_by_prop
1307
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1308
+ * what was originally passed to 'fit_iforest'.
1309
+ * - imputer
1310
+ * Pointer to already-allocated imputer object, as it was output from function 'fit_model' while
1311
+ * producing either 'model_outputs' or 'model_outputs_ext'.
1312
+ * Pass NULL if the model was built without imputer.
1313
+ * - min_imp_obs
1314
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1315
+ * what was originally passed to 'fit_iforest'.
1316
+ * - indexer
1317
+ * Indexer object associated to the model object ('model_outputs' or 'model_outputs_ext'), which will
1318
+ * be updated with the new tree to add.
1319
+ * If 'indexer' has reference points, these must be passed again here in order to index them.
1320
+ * Pass NULL if the model has no associated indexer.
1321
+ * - ref_numeric_data[nref * ncols_numeric]
1322
+ * Pointer to numeric data for reference points. May be ordered by rows
1323
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
1324
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
1325
+ * (see parameter 'ref_is_col_major').
1326
+ * Pass NULL if there are no dense numeric columns or no reference points.
1327
+ * Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
1328
+ * If 'indexer' is passed, it has reference points, and the data to which the model was fit had
1329
+ * numeric columns, then numeric data for reference points must be passed (in either dense or sparse format).
1330
+ * - ref_categ_data[nref * ncols_categ]
1331
+ * Pointer to categorical data for reference points. May be ordered by rows
1332
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
1333
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
1334
+ * (see parameter 'ref_is_col_major').
1335
+ * Pass NULL if there are no categorical columns or no reference points.
1336
+ * If 'indexer' is passed, it has reference points, and the data to which the model was fit had
1337
+ * categorical columns, then 'ref_categ_data' must be passed.
1338
+ * - ref_is_col_major
1339
+ * Whether 'ref_numeric_data' and/or 'ref_categ_data' are in column-major order. If numeric data is
1340
+ * passed in sparse format, categorical data must be passed in column-major format. If passing dense
1341
+ * data, row-major format is preferred as it will be faster. If the data is passed in row-major format,
1342
+ * must also pass 'ref_ld_numeric' and/or 'ref_ld_categ'.
1343
+ * If both 'ref_numeric_data' and 'ref_categ_data' are passed, they must have the same orientation
1344
+ * (row-major or column-major).
1345
+ * - ref_ld_numeric
1346
+ * Leading dimension of the array 'ref_numeric_data', if it is passed in row-major format.
1347
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
1348
+ * be accessed assuming that row 'n' starts at 'ref_numeric_data + n*ref_ld_numeric'). If passing
1349
+ * 'ref_numeric_data' in column-major order, this is ignored and will be assumed that the
1350
+ * leading dimension corresponds to the number of rows. This is ignored when passing numeric
1351
+ * data in sparse format.
1352
+ * - ref_ld_categ
1353
+ * Leading dimension of the array 'ref_categ_data', if it is passed in row-major format.
1354
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
1355
+ * be accessed assuming that row 'n' starts at 'ref_categ_data + n*ref_ld_categ'). If passing
1356
+ * 'ref_categ_data' in column-major order, this is ignored and will be assumed that the
1357
+ * leading dimension corresponds to the number of rows.
1358
+ * - ref_Xc[ref_nnz]
1359
+ * Pointer to numeric data for reference points in sparse numeric matrix in CSC format (column-compressed).
1360
+ * Pass NULL if there are no sparse numeric columns for reference points or no reference points.
1361
+ * Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
1362
+ * - ref_Xc_ind[ref_nnz]
1363
+ * Pointer to row indices to which each non-zero entry in 'ref_Xc' corresponds.
1364
+ * Must be in sorted order, otherwise results will be incorrect.
1365
+ * Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
1366
+ * - ref_Xc_indptr[ncols_numeric + 1]
1367
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
1368
+ * start and at entry [col + 1] where does column 'col' end.
1369
+ * Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
1370
+ * - random_seed
1371
+ * Seed that will be used to generate random numbers used by the model.
1372
+ * - use_long_double
1373
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
1374
+ * what was originally passed to 'fit_iforest'.
1375
+ */
1376
+ template <class real_t, class sparse_ix>
1377
+ int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1378
+ real_t numeric_data[], size_t ncols_numeric,
1379
+ int categ_data[], size_t ncols_categ, int ncat[],
1380
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1381
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
1382
+ real_t sample_weights[], size_t nrows,
1383
+ size_t max_depth, size_t ncols_per_tree,
1384
+ bool limit_depth, bool penalize_range, bool standardize_data,
1385
+ bool fast_bratio,
1386
+ real_t col_weights[], bool weigh_by_kurt,
1387
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
1388
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
1389
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
1390
+ double prob_pick_col_by_kurt,
1391
+ double min_gain, MissingAction missing_action,
1392
+ CategSplit cat_split_type, NewCategAction new_cat_action,
1393
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
1394
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
1395
+ TreesIndexer *indexer,
1396
+ real_t ref_numeric_data[], int ref_categ_data[],
1397
+ bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
1398
+ real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
1399
+ uint64_t random_seed, bool use_long_double)
1400
+ {
1401
+ if (use_long_double && !has_long_double()) {
1402
+ use_long_double = false;
1403
+ fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
1404
+ }
1405
+ #ifndef NO_LONG_DOUBLE
1406
+ if (likely(!use_long_double))
1407
+ #endif
1408
+ return add_tree_internal<real_t, sparse_ix, double>(
1409
+ model_outputs, model_outputs_ext,
1410
+ numeric_data, ncols_numeric,
1411
+ categ_data, ncols_categ, ncat,
1412
+ Xc, Xc_ind, Xc_indptr,
1413
+ ndim, ntry, coef_type, coef_by_prop,
1414
+ sample_weights, nrows,
1415
+ max_depth, ncols_per_tree,
1416
+ limit_depth, penalize_range, standardize_data,
1417
+ fast_bratio,
1418
+ col_weights, weigh_by_kurt,
1419
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
1420
+ prob_pick_by_full_gain, prob_pick_by_dens,
1421
+ prob_pick_col_by_range, prob_pick_col_by_var,
1422
+ prob_pick_col_by_kurt,
1423
+ min_gain, missing_action,
1424
+ cat_split_type, new_cat_action,
1425
+ depth_imp, weigh_imp_rows,
1426
+ all_perm, imputer, min_imp_obs,
1427
+ indexer,
1428
+ ref_numeric_data, ref_categ_data,
1429
+ ref_is_col_major, ref_ld_numeric, ref_ld_categ,
1430
+ ref_Xc, ref_Xc_ind, ref_Xc_indptr,
1431
+ random_seed
1432
+ );
1433
+ #ifndef NO_LONG_DOUBLE
1434
+ else
1435
+ return add_tree_internal<real_t, sparse_ix, long double>(
1436
+ model_outputs, model_outputs_ext,
1437
+ numeric_data, ncols_numeric,
1438
+ categ_data, ncols_categ, ncat,
1439
+ Xc, Xc_ind, Xc_indptr,
1440
+ ndim, ntry, coef_type, coef_by_prop,
1441
+ sample_weights, nrows,
1442
+ max_depth, ncols_per_tree,
1443
+ limit_depth, penalize_range, standardize_data,
1444
+ fast_bratio,
1445
+ col_weights, weigh_by_kurt,
1446
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
1447
+ prob_pick_by_full_gain, prob_pick_by_dens,
1448
+ prob_pick_col_by_range, prob_pick_col_by_var,
1449
+ prob_pick_col_by_kurt,
1450
+ min_gain, missing_action,
1451
+ cat_split_type, new_cat_action,
1452
+ depth_imp, weigh_imp_rows,
1453
+ all_perm, imputer, min_imp_obs,
1454
+ indexer,
1455
+ ref_numeric_data, ref_categ_data,
1456
+ ref_is_col_major, ref_ld_numeric, ref_ld_categ,
1457
+ ref_Xc, ref_Xc_ind, ref_Xc_indptr,
1458
+ random_seed
1459
+ );
1460
+ #endif
1461
+ }
1462
+
1463
/* Worker that fits a single new tree and appends it to an existing model.
   Exactly one of 'model_outputs' (single-variable model) / 'model_outputs_ext'
   (extended model) receives the tree; 'imputer' and 'indexer', when non-NULL,
   each get a matching new entry kept in lockstep with the tree list.

   Throws std::runtime_error on invalid argument combinations; any exception
   raised after the new slots were appended triggers a rollback that removes
   them again before re-throwing. Returns EXIT_SUCCESS otherwise. */
template <class real_t, class sparse_ix, class ldouble_safe>
int add_tree_internal(
             IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
             real_t numeric_data[], size_t ncols_numeric,
             int categ_data[], size_t ncols_categ, int ncat[],
             real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
             size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
             real_t sample_weights[], size_t nrows,
             size_t max_depth, size_t ncols_per_tree,
             bool limit_depth, bool penalize_range, bool standardize_data,
             bool fast_bratio,
             real_t col_weights[], bool weigh_by_kurt,
             double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
             double prob_pick_by_full_gain, double prob_pick_by_dens,
             double prob_pick_col_by_range, double prob_pick_col_by_var,
             double prob_pick_col_by_kurt,
             double min_gain, MissingAction missing_action,
             CategSplit cat_split_type, NewCategAction new_cat_action,
             UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
             bool all_perm, Imputer *imputer, size_t min_imp_obs,
             TreesIndexer *indexer,
             real_t ref_numeric_data[], int ref_categ_data[],
             bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
             real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
             uint64_t random_seed)
{
    /* --- argument validation: reject combinations the fitting code cannot honor --- */
    if (
        prob_pick_by_gain_avg  < 0 || prob_pick_by_gain_pl  < 0 ||
        prob_pick_by_full_gain < 0 || prob_pick_by_dens     < 0 ||
        prob_pick_col_by_range < 0 ||
        prob_pick_col_by_var   < 0 || prob_pick_col_by_kurt < 0
    ) {
        throw std::runtime_error("Cannot pass negative probabilities.\n");
    }
    if (prob_pick_col_by_range && ncols_categ)
        throw std::runtime_error("'prob_pick_col_by_range' is not compatible with categorical data.\n");
    if (prob_pick_by_full_gain && ncols_categ)
        throw std::runtime_error("'prob_pick_by_full_gain' is not compatible with categorical data.\n");
    if (prob_pick_col_by_kurt && weigh_by_kurt)
        throw std::runtime_error("'weigh_by_kurt' and 'prob_pick_col_by_kurt' cannot be used together.\n");
    if (ndim == 0 && model_outputs == NULL)
        throw std::runtime_error("Must pass 'ndim>0' in the extended model.\n");
    /* If the indexer tracks reference points, the caller must re-supply those
       points so they can be indexed under the new tree as well. */
    if (indexer != NULL && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty()) {
        if (ref_numeric_data == NULL && ref_categ_data == NULL && ref_Xc_indptr == NULL)
            throw std::runtime_error("'indexer' has reference points. Those points must be passed to index them in the new tree to add.\n");
    }

    /* Set only when 'imputer' is non-NULL; points into the imputer's new slot. */
    std::vector<ImputeNode> *impute_nodes = NULL;

    /* Largest category count across categorical columns (buffer sizing downstream). */
    int max_categ = 0;
    for (size_t col = 0; col < ncols_categ; col++)
        max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;

    /* Single-variable model: no point trying more columns than exist. */
    if (model_outputs != NULL)
        ntry = std::min(ntry, ncols_numeric + ncols_categ);

    /* Zero means "use all columns". */
    if (ncols_per_tree == 0)
        ncols_per_tree = ncols_numeric + ncols_categ;

    /* An indexer with no per-tree entries is treated as absent from here on. */
    if (indexer != NULL && indexer->indices.empty())
        indexer = NULL;

    /* NOTE(review): both structs below use positional aggregate initialization —
       field order must match the struct declarations exactly; do not reorder. */
    InputData<real_t, sparse_ix>
              input_data     = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
                                nrows, ncols_numeric + ncols_categ, sample_weights,
                                false, col_weights,
                                Xc, Xc_ind, Xc_indptr,
                                0, 0, std::vector<double>(),
                                std::vector<char>(), 0, NULL,
                                (double*)NULL, (double*)NULL, (int*)NULL, std::vector<double>(),
                                std::vector<double>(), std::vector<double>(),
                                std::vector<size_t>(), std::vector<size_t>()};
    /* Sample size equals 'nrows' (no sub-sampling) and tree count is 1;
       the scoring metric is inherited from whichever model is being extended.
       For the single-variable model, 'ndim' is forced to 0. */
    ModelParams model_params = {false, nrows, (size_t)1, ncols_per_tree,
                                max_depth? max_depth : (nrows - 1),
                                penalize_range, standardize_data, random_seed, weigh_by_kurt,
                                prob_pick_by_gain_avg, prob_pick_by_gain_pl,
                                prob_pick_by_full_gain, prob_pick_by_dens,
                                prob_pick_col_by_range, prob_pick_col_by_var,
                                prob_pick_col_by_kurt,
                                min_gain, cat_split_type, new_cat_action, missing_action,
                                (model_outputs != NULL)? model_outputs->scoring_metric : model_outputs_ext->scoring_metric,
                                fast_bratio, all_perm,
                                (model_outputs != NULL)? 0 : ndim, ntry,
                                coef_type, coef_by_prop, false, false, false, depth_imp, weigh_imp_rows, min_imp_obs};

    /* Full-gain criterion needs a row-major copy of the numeric data
       (dense or CSR, depending on what was passed). */
    if (prob_pick_by_full_gain)
    {
        if (input_data.Xc_indptr == NULL)
            colmajor_to_rowmajor(input_data.numeric_data, input_data.nrows, input_data.ncols_numeric, input_data.X_row_major);
        else
            colmajor_to_rowmajor(input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
                                 input_data.nrows, input_data.ncols_numeric,
                                 input_data.Xr, input_data.Xr_ind, input_data.Xr_indptr);
    }

    /* Heap-allocated scratch space for the fitting routine (freed automatically). */
    std::unique_ptr<WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>> workspace(
        new WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t>()
    );

    size_t last_tree;        /* index the new tree will occupy */
    bool added_tree = false; /* rollback flag: slots were appended */
    try
    {
        /* Append an empty slot for the new tree and remember its index. */
        if (model_outputs != NULL)
        {
            last_tree = model_outputs->trees.size();
            model_outputs->trees.emplace_back();
            added_tree = true;
        }

        else
        {
            last_tree = model_outputs_ext->hplanes.size();
            model_outputs_ext->hplanes.emplace_back();
            added_tree = true;
        }

        /* Matching slots in the imputer and indexer, when present. */
        if (imputer != NULL)
        {
            imputer->imputer_tree.emplace_back();
            impute_nodes = &(imputer->imputer_tree.back());
        }

        if (indexer != NULL)
        {
            indexer->indices.emplace_back();
        }

        /* Enable cooperative interruption; checked at several points below. */
        SignalSwitcher ss = SignalSwitcher();
        check_interrupt_switch(ss);

        /* Fit the single tree into the freshly-appended slot. */
        fit_itree<decltype(input_data), typename std::remove_pointer<decltype(workspace.get())>::type, ldouble_safe>(
                  (model_outputs != NULL)? &model_outputs->trees.back() : NULL,
                  (model_outputs_ext != NULL)? &model_outputs_ext->hplanes.back() : NULL,
                  *workspace,
                  input_data,
                  model_params,
                  impute_nodes,
                  last_tree);

        check_interrupt_switch(ss);

        /* Trim over-reserved node storage; propagate the range-penalty flag. */
        if (model_outputs != NULL) {
            model_outputs->trees.back().shrink_to_fit();
            model_outputs->has_range_penalty = model_outputs->has_range_penalty || penalize_range;
        }
        else {
            model_outputs_ext->hplanes.back().shrink_to_fit();
            model_outputs_ext->has_range_penalty = model_outputs_ext->has_range_penalty || penalize_range;
        }

        if (imputer != NULL)
            imputer->imputer_tree.back().shrink_to_fit();

        /* Populate the indexer entry for the new tree. */
        if (indexer != NULL)
        {
            /* Terminal-node mapping for whichever model type got the tree. */
            if (model_outputs != NULL)
                build_terminal_node_mappings_single_tree(indexer->indices.back().terminal_node_mappings,
                                                         indexer->indices.back().n_terminal,
                                                         model_outputs->trees.back());
            else
                build_terminal_node_mappings_single_tree(indexer->indices.back().terminal_node_mappings,
                                                         indexer->indices.back().n_terminal,
                                                         model_outputs_ext->hplanes.back());

            check_interrupt_switch(ss);


            /* Distance index is only built if existing entries carry one
               (checked on the FIRST entry: all entries are kept uniform). */
            if (!indexer->indices.front().node_distances.empty())
            {
                std::vector<size_t> temp;
                temp.reserve(indexer->indices.back().n_terminal);
                if (model_outputs != NULL) {
                    build_dindex(
                        temp,
                        indexer->indices.back().terminal_node_mappings,
                        indexer->indices.back().node_distances,
                        indexer->indices.back().node_depths,
                        indexer->indices.back().n_terminal,
                        model_outputs->trees.back()
                    );
                }
                else {
                    build_dindex(
                        temp,
                        indexer->indices.back().terminal_node_mappings,
                        indexer->indices.back().node_distances,
                        indexer->indices.back().node_depths,
                        indexer->indices.back().n_terminal,
                        model_outputs_ext->hplanes.back()
                    );
                }
            }

            check_interrupt_switch(ss);
            /* Re-index the reference points under the new tree: run them
               through a temporary one-tree model to obtain their terminal
               node assignments. */
            if (!indexer->indices.front().reference_points.empty())
            {
                size_t n_ref = indexer->indices.front().reference_points.size();
                std::vector<sparse_ix> terminal_indices(n_ref);
                /* prediction outputs are required by the API but discarded here */
                std::unique_ptr<double[]> ignored(new double[n_ref]);
                if (model_outputs != NULL)
                {
                    /* Wrap only the new tree, copying the settings that
                       affect tree traversal. */
                    IsoForest single_tree_model;
                    single_tree_model.new_cat_action = model_outputs->new_cat_action;
                    single_tree_model.cat_split_type = model_outputs->cat_split_type;
                    single_tree_model.missing_action = model_outputs->missing_action;
                    single_tree_model.trees.push_back(model_outputs->trees.back());

                    predict_iforest(ref_numeric_data, ref_categ_data,
                                    ref_is_col_major, ref_ld_numeric, ref_ld_categ,
                                    ref_Xc, ref_Xc_ind, ref_Xc_indptr,
                                    (real_t*)NULL, (sparse_ix*)NULL, (sparse_ix*)NULL,
                                    n_ref, 1, false,
                                    &single_tree_model, (ExtIsoForest*)NULL,
                                    ignored.get(), terminal_indices.data(),
                                    (double*)NULL,
                                    indexer);
                }

                else
                {
                    ExtIsoForest single_tree_model;
                    single_tree_model.new_cat_action = model_outputs_ext->new_cat_action;
                    single_tree_model.cat_split_type = model_outputs_ext->cat_split_type;
                    single_tree_model.missing_action = model_outputs_ext->missing_action;
                    single_tree_model.hplanes.push_back(model_outputs_ext->hplanes.back());

                    predict_iforest(ref_numeric_data, ref_categ_data,
                                    ref_is_col_major, ref_ld_numeric, ref_ld_categ,
                                    ref_Xc, ref_Xc_ind, ref_Xc_indptr,
                                    (real_t*)NULL, (sparse_ix*)NULL, (sparse_ix*)NULL,
                                    n_ref, 1, false,
                                    (IsoForest*)NULL, &single_tree_model,
                                    ignored.get(), terminal_indices.data(),
                                    (double*)NULL,
                                    indexer);
                }

                /* free the scratch buffer before the copies below */
                ignored.reset();
                indexer->indices.back().reference_points.assign(terminal_indices.begin(), terminal_indices.end());
                indexer->indices.back().reference_points.shrink_to_fit();
                build_ref_node(indexer->indices.back());
            }

            check_interrupt_switch(ss);
        }
    }

    catch (...)
    {
        /* Rollback: remove the partially-built slots so the model, imputer,
           and indexer stay mutually consistent, then re-throw. */
        if (added_tree)
        {
            if (model_outputs != NULL)
                model_outputs->trees.pop_back();
            else
                model_outputs_ext->hplanes.pop_back();
            if (imputer != NULL) {
                if (model_outputs != NULL)
                    imputer->imputer_tree.resize(model_outputs->trees.size());
                else
                    imputer->imputer_tree.resize(model_outputs_ext->hplanes.size());
            }
            if (indexer != NULL) {
                if (model_outputs != NULL)
                    indexer->indices.resize(model_outputs->trees.size());
                else
                    indexer->indices.resize(model_outputs_ext->hplanes.size());
            }
        }
        throw;
    }

    return EXIT_SUCCESS;
}
1737
+
1738
+ template <class InputData, class WorkerMemory, class ldouble_safe>
1739
+ void fit_itree(std::vector<IsoTree> *tree_root,
1740
+ std::vector<IsoHPlane> *hplane_root,
1741
+ WorkerMemory &workspace,
1742
+ InputData &input_data,
1743
+ ModelParams &model_params,
1744
+ std::vector<ImputeNode> *impute_nodes,
1745
+ size_t tree_num)
1746
+ {
1747
+ /* initialize array for depths if called for */
1748
+ if (workspace.ix_arr.empty() && model_params.calc_depth)
1749
+ workspace.row_depths.resize(input_data.nrows, 0);
1750
+
1751
+ /* choose random sample of rows */
1752
+ if (workspace.ix_arr.empty()) workspace.ix_arr.resize(model_params.sample_size);
1753
+ if (input_data.log2_n > 0)
1754
+ workspace.btree_weights.assign(input_data.btree_weights_init.begin(),
1755
+ input_data.btree_weights_init.end());
1756
+ workspace.rnd_generator.seed(model_params.random_seed + tree_num);
1757
+ workspace.rbin = UniformUnitInterval(0, 1);
1758
+ sample_random_rows<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
1759
+ workspace.ix_arr, input_data.nrows, model_params.with_replacement,
1760
+ workspace.rnd_generator, workspace.ix_all,
1761
+ (input_data.weight_as_sample)? input_data.sample_weights : NULL,
1762
+ workspace.btree_weights, input_data.log2_n, input_data.btree_offset,
1763
+ workspace.is_repeated);
1764
+ workspace.st = 0;
1765
+ workspace.end = model_params.sample_size - 1;
1766
+
1767
+ /* in some cases, it's not possible to use column weights even if they are given,
1768
+ because every single column will always need to be checked or end up being used. */
1769
+ bool avoid_col_weights = (tree_root != NULL && model_params.ntry >= model_params.ncols_per_tree &&
1770
+ model_params.prob_pick_by_gain_avg + model_params.prob_pick_by_gain_pl +
1771
+ model_params.prob_pick_by_full_gain + model_params.prob_pick_by_dens >= 1)
1772
+ ||
1773
+ (tree_root == NULL && model_params.ndim >= model_params.ncols_per_tree)
1774
+ ||
1775
+ (model_params.ncols_per_tree == 1);
1776
+ if (input_data.preinitialized_col_sampler == NULL)
1777
+ {
1778
+ if (input_data.col_weights != NULL && !avoid_col_weights && !model_params.weigh_by_kurt)
1779
+ workspace.col_sampler.initialize(input_data.col_weights, input_data.ncols_tot);
1780
+ }
1781
+
1782
+
1783
+ /* set expected tree size and add root node */
1784
+ {
1785
+ size_t exp_nodes = mult2(model_params.sample_size);
1786
+ if (model_params.sample_size >= div2(SIZE_MAX))
1787
+ exp_nodes = SIZE_MAX;
1788
+ else if (model_params.max_depth <= (size_t)30)
1789
+ exp_nodes = std::min(exp_nodes, pow2(model_params.max_depth));
1790
+ if (tree_root != NULL)
1791
+ {
1792
+ tree_root->reserve(exp_nodes);
1793
+ tree_root->emplace_back();
1794
+ }
1795
+ else
1796
+ {
1797
+ hplane_root->reserve(exp_nodes);
1798
+ hplane_root->emplace_back();
1799
+ }
1800
+ if (impute_nodes != NULL)
1801
+ {
1802
+ impute_nodes->reserve(exp_nodes);
1803
+ impute_nodes->emplace_back((size_t) 0);
1804
+ }
1805
+ }
1806
+
1807
+ /* initialize array with candidate categories if not already done */
1808
+ if (workspace.categs.empty())
1809
+ workspace.categs.resize(input_data.max_categ);
1810
+
1811
+ /* initialize array with per-node column weights if needed */
1812
+ if ((model_params.prob_pick_col_by_range ||
1813
+ model_params.prob_pick_col_by_var ||
1814
+ model_params.prob_pick_col_by_kurt) && workspace.node_col_weights.empty())
1815
+ {
1816
+ workspace.node_col_weights.resize(input_data.ncols_tot);
1817
+ if (tree_root != NULL || model_params.standardize_data || model_params.missing_action != Fail)
1818
+ {
1819
+ workspace.saved_stat1.resize(input_data.ncols_numeric);
1820
+ workspace.saved_stat2.resize(input_data.ncols_numeric);
1821
+ }
1822
+ }
1823
+
1824
+ /* IMPORTANT!!!!!
1825
+ The standard library implementation is likely going to use the Box-Muller method
1826
+ for normal sampling, which has some state memory in the **distribution object itself**
1827
+ in addition to the state memory from the RNG engine. DO NOT avoid re-generating this
1828
+ object on each tree, despite being inefficient, because then it can cause seed
1829
+ irreproducibility when the number of splitting dimensions is odd and the number
1830
+ of threads is more than 1. This is a very hard issue to debug since everything
1831
+ works fine depending on the order in which trees are assigned to threads.
1832
+ DO NOT PUT THESE LINES BELOW THE NEXT IF. */
1833
+ if (hplane_root != NULL)
1834
+ {
1835
+ if (input_data.ncols_categ || model_params.coef_type == Normal)
1836
+ workspace.coef_norm = StandardNormalDistr(0, 1);
1837
+ if (model_params.coef_type == Uniform)
1838
+ workspace.coef_unif = UniformMinusOneToOne(-1, 1);
1839
+ }
1840
+
1841
+ /* for the extended model, initialize extra vectors and objects */
1842
+ if (hplane_root != NULL && workspace.comb_val.empty())
1843
+ {
1844
+ workspace.comb_val.resize(model_params.sample_size);
1845
+ workspace.col_take.resize(model_params.ndim);
1846
+ workspace.col_take_type.resize(model_params.ndim);
1847
+
1848
+ if (input_data.ncols_numeric)
1849
+ {
1850
+ workspace.ext_offset.resize(input_data.ncols_tot);
1851
+ workspace.ext_coef.resize(input_data.ncols_tot);
1852
+ workspace.ext_mean.resize(input_data.ncols_tot);
1853
+ }
1854
+
1855
+ if (input_data.ncols_categ)
1856
+ {
1857
+ workspace.ext_fill_new.resize(input_data.max_categ);
1858
+ switch(model_params.cat_split_type)
1859
+ {
1860
+ case SingleCateg:
1861
+ {
1862
+ workspace.chosen_cat.resize(input_data.max_categ);
1863
+ break;
1864
+ }
1865
+
1866
+ case SubSet:
1867
+ {
1868
+ workspace.ext_cat_coef.resize(input_data.ncols_tot);
1869
+ for (std::vector<double> &v : workspace.ext_cat_coef)
1870
+ v.resize(input_data.max_categ);
1871
+ break;
1872
+ }
1873
+ }
1874
+ }
1875
+
1876
+ workspace.ext_fill_val.resize(input_data.ncols_tot);
1877
+
1878
+ }
1879
+
1880
+ /* If there are density weights, need to standardize them to sum up to
1881
+ the sample size here. Note that weights for missing values with 'Divide'
1882
+ are only initialized on-demand later on. */
1883
+ workspace.changed_weights = false;
1884
+ if (hplane_root == NULL) workspace.weights_map.clear();
1885
+
1886
+ ldouble_safe weight_scaling = 0;
1887
+ if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
1888
+ {
1889
+ workspace.changed_weights = true;
1890
+
1891
+ /* For the extended model, if there is no sub-sampling, these weights will remain
1892
+ constant throughout and do not need to be re-generated. */
1893
+ if (!( hplane_root != NULL &&
1894
+ (!workspace.weights_map.empty() || !workspace.weights_arr.empty()) &&
1895
+ model_params.sample_size == input_data.nrows && !model_params.with_replacement
1896
+ )
1897
+ )
1898
+ {
1899
+ workspace.weights_map.clear();
1900
+
1901
+ /* if the sub-sample size is small relative to the full sample size, use a mapping */
1902
+ if (input_data.Xc_indptr != NULL && model_params.sample_size < input_data.nrows / 50)
1903
+ {
1904
+ for (const size_t ix : workspace.ix_arr)
1905
+ weight_scaling += input_data.sample_weights[ix];
1906
+ weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
1907
+ workspace.weights_map.reserve(workspace.ix_arr.size());
1908
+ for (const size_t ix : workspace.ix_arr)
1909
+ workspace.weights_map[ix] = input_data.sample_weights[ix] * weight_scaling;
1910
+ }
1911
+
1912
+ /* if the sub-sample size is large, fill a full array matching to the sample size */
1913
+ else
1914
+ {
1915
+ if (workspace.weights_arr.empty())
1916
+ {
1917
+ workspace.weights_arr.assign(input_data.sample_weights, input_data.sample_weights + input_data.nrows);
1918
+ weight_scaling = std::accumulate(workspace.ix_arr.begin(),
1919
+ workspace.ix_arr.end(),
1920
+ (ldouble_safe)0,
1921
+ [&input_data](const ldouble_safe a, const size_t b){return a + (ldouble_safe)input_data.sample_weights[b];}
1922
+ );
1923
+ weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
1924
+ for (double &w : workspace.weights_arr)
1925
+ w *= weight_scaling;
1926
+ }
1927
+
1928
+ else
1929
+ {
1930
+ for (const size_t ix : workspace.ix_arr)
1931
+ {
1932
+ weight_scaling += input_data.sample_weights[ix];
1933
+ workspace.weights_arr[ix] = input_data.sample_weights[ix];
1934
+ }
1935
+ weight_scaling = (ldouble_safe)model_params.sample_size / weight_scaling;
1936
+ for (double &w : workspace.weights_arr)
1937
+ w *= weight_scaling;
1938
+ }
1939
+ }
1940
+ }
1941
+ }
1942
+
1943
+ /* if producing distance/similarity, also need to initialize the triangular matrix */
1944
+ if (model_params.calc_dist && workspace.tmat_sep.empty())
1945
+ workspace.tmat_sep.resize((input_data.nrows * (input_data.nrows - 1)) / 2, 0);
1946
+
1947
+ /* make space for buffers if not already allocated */
1948
+ if (
1949
+ (model_params.prob_pick_by_gain_avg > 0 ||
1950
+ model_params.prob_pick_by_gain_pl > 0 ||
1951
+ model_params.prob_pick_by_full_gain > 0 ||
1952
+ model_params.prob_pick_by_dens > 0 ||
1953
+ model_params.prob_pick_col_by_range > 0 ||
1954
+ model_params.prob_pick_col_by_var > 0 ||
1955
+ model_params.prob_pick_col_by_kurt > 0 ||
1956
+ model_params.weigh_by_kurt || hplane_root != NULL)
1957
+ &&
1958
+ (workspace.buffer_dbl.empty() && workspace.buffer_szt.empty() && workspace.buffer_chr.empty())
1959
+ )
1960
+ {
1961
+ size_t min_size_dbl = 0;
1962
+ size_t min_size_szt = 0;
1963
+ size_t min_size_chr = 0;
1964
+
1965
+ bool gain = model_params.prob_pick_by_gain_avg > 0 ||
1966
+ model_params.prob_pick_by_gain_pl > 0 ||
1967
+ model_params.prob_pick_by_full_gain > 0 ||
1968
+ model_params.prob_pick_by_dens > 0;
1969
+
1970
+ if (input_data.ncols_categ)
1971
+ {
1972
+ min_size_szt = (size_t)2 * (size_t)input_data.max_categ;
1973
+ min_size_dbl = input_data.max_categ + 1;
1974
+ if (gain && model_params.cat_split_type == SubSet)
1975
+ min_size_chr = input_data.max_categ;
1976
+ }
1977
+
1978
+ if (input_data.Xc_indptr != NULL && gain)
1979
+ {
1980
+ min_size_szt = std::max(min_size_szt, model_params.sample_size);
1981
+ min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
1982
+ }
1983
+
1984
+ /* TODO: revisit if this covers all the cases */
1985
+ if (model_params.ntry > 1 || gain)
1986
+ {
1987
+ min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
1988
+ if (model_params.ndim < 2 && input_data.Xc_indptr != NULL)
1989
+ min_size_dbl = std::max(min_size_dbl, (size_t)2*model_params.sample_size);
1990
+ }
1991
+
1992
+ /* for sampled column choices */
1993
+ if (model_params.prob_pick_col_by_var)
1994
+ {
1995
+ if (input_data.ncols_categ) {
1996
+ min_size_szt = std::max(min_size_szt, (size_t)input_data.max_categ + 1);
1997
+ min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ + 1);
1998
+ }
1999
+ }
2000
+
2001
+ if (model_params.prob_pick_col_by_kurt)
2002
+ {
2003
+ if (input_data.ncols_categ) {
2004
+ min_size_szt = std::max(min_size_szt, (size_t)input_data.max_categ + 1);
2005
+ min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
2006
+ }
2007
+
2008
+ }
2009
+
2010
+ /* for the extended model */
2011
+ if (hplane_root != NULL)
2012
+ {
2013
+ min_size_dbl = std::max(min_size_dbl, pow2(log2ceil(input_data.ncols_tot) + 1));
2014
+ if (model_params.missing_action != Fail)
2015
+ {
2016
+ min_size_szt = std::max(min_size_szt, model_params.sample_size);
2017
+ min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
2018
+ }
2019
+
2020
+ if (input_data.ncols_categ && model_params.cat_split_type == SubSet)
2021
+ {
2022
+ min_size_szt = std::max(min_size_szt, (size_t)2 * (size_t)input_data.max_categ + (size_t)1);
2023
+ min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
2024
+ }
2025
+
2026
+ if (model_params.weigh_by_kurt)
2027
+ min_size_szt = std::max(min_size_szt, input_data.ncols_tot);
2028
+
2029
+ if (gain && (!workspace.weights_arr.empty() || !workspace.weights_map.empty()))
2030
+ {
2031
+ workspace.sample_weights.resize(model_params.sample_size);
2032
+ min_size_szt = std::max(min_size_szt, model_params.sample_size);
2033
+ }
2034
+ }
2035
+
2036
+ /* now resize */
2037
+ if (workspace.buffer_dbl.size() < min_size_dbl)
2038
+ workspace.buffer_dbl.resize(min_size_dbl);
2039
+
2040
+ if (workspace.buffer_szt.size() < min_size_szt)
2041
+ workspace.buffer_szt.resize(min_size_szt);
2042
+
2043
+ if (workspace.buffer_chr.size() < min_size_chr)
2044
+ workspace.buffer_chr.resize(min_size_chr);
2045
+
2046
+ /* for guided column choice, need to also remember the best split so far */
2047
+ if (
2048
+ model_params.cat_split_type == SubSet &&
2049
+ (
2050
+ model_params.prob_pick_by_gain_avg ||
2051
+ model_params.prob_pick_by_gain_pl ||
2052
+ model_params.prob_pick_by_full_gain ||
2053
+ model_params.prob_pick_by_dens
2054
+ )
2055
+ )
2056
+ {
2057
+ workspace.this_split_categ.resize(input_data.max_categ);
2058
+ }
2059
+
2060
+ }
2061
+
2062
+ /* Other potentially necessary buffers */
2063
+ if (
2064
+ tree_root != NULL && model_params.missing_action == Impute &&
2065
+ (model_params.prob_pick_by_gain_avg || model_params.prob_pick_by_gain_pl ||
2066
+ model_params.prob_pick_by_full_gain || model_params.prob_pick_by_dens) &&
2067
+ input_data.Xc_indptr == NULL && input_data.ncols_numeric && workspace.imputed_x_buffer.empty()
2068
+ )
2069
+ {
2070
+ workspace.imputed_x_buffer.resize(input_data.nrows);
2071
+ }
2072
+
2073
+ if (model_params.prob_pick_by_full_gain && workspace.col_indices.empty())
2074
+ workspace.col_indices.resize(model_params.ncols_per_tree);
2075
+
2076
+ if (
2077
+ (model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var) &&
2078
+ model_params.weigh_by_kurt &&
2079
+ model_params.sample_size == input_data.nrows && !model_params.with_replacement &&
2080
+ (model_params.ncols_per_tree == input_data.ncols_tot) &&
2081
+ !input_data.all_kurtoses.empty()
2082
+ ) {
2083
+ workspace.tree_kurtoses = input_data.all_kurtoses.data();
2084
+ }
2085
+ else {
2086
+ workspace.tree_kurtoses = NULL;
2087
+ }
2088
+
2089
+ /* weigh columns by kurtosis in the sample if required */
2090
+ /* TODO: this one could probably be refactored to use the function in the helpers */
2091
+ std::vector<double> kurt_weights;
2092
+ bool avoid_leave_m_cols = false;
2093
+ if (
2094
+ model_params.weigh_by_kurt &&
2095
+ !avoid_col_weights &&
2096
+ (input_data.preinitialized_col_sampler == NULL
2097
+ ||
2098
+ ((model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var) && workspace.tree_kurtoses == NULL))
2099
+ )
2100
+ {
2101
+ kurt_weights.resize(input_data.ncols_numeric + input_data.ncols_categ, 0.);
2102
+
2103
+ if (model_params.ncols_per_tree >= input_data.ncols_tot)
2104
+ {
2105
+
2106
+ if (input_data.Xc_indptr == NULL)
2107
+ {
2108
+
2109
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
2110
+ {
2111
+ if (workspace.weights_arr.empty() && workspace.weights_map.empty())
2112
+ kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
2113
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2114
+ input_data.numeric_data + col * input_data.nrows,
2115
+ model_params.missing_action);
2116
+ else if (!workspace.weights_arr.empty())
2117
+ kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, decltype(workspace.weights_arr), ldouble_safe>(
2118
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2119
+ input_data.numeric_data + col * input_data.nrows,
2120
+ model_params.missing_action, workspace.weights_arr);
2121
+ else
2122
+ kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
2123
+ decltype(workspace.weights_map), ldouble_safe>(
2124
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2125
+ input_data.numeric_data + col * input_data.nrows,
2126
+ model_params.missing_action, workspace.weights_map);
2127
+ }
2128
+ }
2129
+
2130
+ else
2131
+ {
2132
+ std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
2133
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
2134
+ {
2135
+ if (workspace.weights_arr.empty() && workspace.weights_map.empty())
2136
+ kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.Xc)>::type,
2137
+ typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
2138
+ ldouble_safe>(
2139
+ workspace.ix_arr.data(), workspace.st, workspace.end, col,
2140
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
2141
+ model_params.missing_action);
2142
+ else if (!workspace.weights_arr.empty())
2143
+ kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
2144
+ typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
2145
+ decltype(workspace.weights_arr), ldouble_safe>(
2146
+ workspace.ix_arr.data(), workspace.st, workspace.end, col,
2147
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
2148
+ model_params.missing_action, workspace.weights_arr);
2149
+ else
2150
+ kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
2151
+ typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
2152
+ decltype(workspace.weights_map), ldouble_safe>(
2153
+ workspace.ix_arr.data(), workspace.st, workspace.end, col,
2154
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
2155
+ model_params.missing_action, workspace.weights_map);
2156
+ }
2157
+ }
2158
+
2159
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
2160
+ {
2161
+ if (workspace.weights_arr.empty() && workspace.weights_map.empty())
2162
+ kurt_weights[col + input_data.ncols_numeric] =
2163
+ calc_kurtosis<ldouble_safe>(
2164
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2165
+ input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
2166
+ workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
2167
+ model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
2168
+ else if (!workspace.weights_arr.empty())
2169
+ kurt_weights[col + input_data.ncols_numeric] =
2170
+ calc_kurtosis_weighted<decltype(workspace.weights_arr), ldouble_safe>(
2171
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2172
+ input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
2173
+ workspace.buffer_dbl.data(),
2174
+ model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
2175
+ workspace.weights_arr);
2176
+ else
2177
+ kurt_weights[col + input_data.ncols_numeric] =
2178
+ calc_kurtosis_weighted<decltype(workspace.weights_map), ldouble_safe>(
2179
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2180
+ input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
2181
+ workspace.buffer_dbl.data(),
2182
+ model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
2183
+ workspace.weights_map);
2184
+ }
2185
+
2186
+ for (auto &w : kurt_weights) w = (w == -HUGE_VAL)? 0. : std::fmax(1e-8, -1. + w);
2187
+ if (input_data.col_weights != NULL)
2188
+ {
2189
+ for (size_t col = 0; col < input_data.ncols_tot; col++)
2190
+ {
2191
+ if (kurt_weights[col] <= 0) continue;
2192
+ kurt_weights[col] *= input_data.col_weights[col];
2193
+ kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
2194
+ }
2195
+ }
2196
+ workspace.col_sampler.initialize(kurt_weights.data(), kurt_weights.size());
2197
+ }
2198
+
2199
+
2200
+
2201
+ else
2202
+ {
2203
+ std::vector<size_t> cols_take(model_params.ncols_per_tree);
2204
+ std::vector<size_t> buffer1;
2205
+ std::vector<bool> buffer2;
2206
+ sample_random_rows<double, double>(
2207
+ cols_take, input_data.ncols_tot, false,
2208
+ workspace.rnd_generator, buffer1,
2209
+ (double*)NULL, kurt_weights, /* <- will not get used */
2210
+ (size_t)0, (size_t)0, buffer2);
2211
+
2212
+ if (
2213
+ model_params.sample_size == input_data.nrows &&
2214
+ !model_params.with_replacement &&
2215
+ !input_data.all_kurtoses.empty()
2216
+ )
2217
+ {
2218
+ for (size_t col : cols_take)
2219
+ kurt_weights[col] = input_data.all_kurtoses[col];
2220
+ goto skip_kurt_calculations;
2221
+ }
2222
+
2223
+ if (input_data.Xc_indptr != NULL)
2224
+ std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
2225
+
2226
+ for (size_t col : cols_take)
2227
+ {
2228
+ if (col < input_data.ncols_numeric)
2229
+ {
2230
+ if (input_data.Xc_indptr == NULL)
2231
+ {
2232
+ if (workspace.weights_arr.empty() && workspace.weights_map.empty())
2233
+ kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.numeric_data)>::type, ldouble_safe>(
2234
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2235
+ input_data.numeric_data + col * input_data.nrows,
2236
+ model_params.missing_action);
2237
+ else if (!workspace.weights_arr.empty())
2238
+ kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
2239
+ decltype(workspace.weights_arr), ldouble_safe>(
2240
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2241
+ input_data.numeric_data + col * input_data.nrows,
2242
+ model_params.missing_action, workspace.weights_arr);
2243
+ else
2244
+ kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.numeric_data)>::type,
2245
+ decltype(workspace.weights_map), ldouble_safe>(
2246
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2247
+ input_data.numeric_data + col * input_data.nrows,
2248
+ model_params.missing_action, workspace.weights_map);
2249
+ }
2250
+
2251
+ else
2252
+ {
2253
+ if (workspace.weights_arr.empty() && workspace.weights_map.empty())
2254
+ kurt_weights[col] = calc_kurtosis<typename std::remove_pointer<decltype(input_data.Xc)>::type,
2255
+ typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
2256
+ ldouble_safe>(
2257
+ workspace.ix_arr.data(), workspace.st, workspace.end, col,
2258
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
2259
+ model_params.missing_action);
2260
+ else if (!workspace.weights_arr.empty())
2261
+ kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
2262
+ typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
2263
+ decltype(workspace.weights_arr), ldouble_safe>(
2264
+ workspace.ix_arr.data(), workspace.st, workspace.end, col,
2265
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
2266
+ model_params.missing_action, workspace.weights_arr);
2267
+ else
2268
+ kurt_weights[col] = calc_kurtosis_weighted<typename std::remove_pointer<decltype(input_data.Xc)>::type,
2269
+ typename std::remove_pointer<decltype(input_data.Xc_indptr)>::type,
2270
+ decltype(workspace.weights_map), ldouble_safe>(
2271
+ workspace.ix_arr.data(), workspace.st, workspace.end, col,
2272
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
2273
+ model_params.missing_action, workspace.weights_map);
2274
+ }
2275
+ }
2276
+
2277
+ else
2278
+ {
2279
+ if (workspace.weights_arr.empty() && workspace.weights_map.empty())
2280
+ kurt_weights[col] =
2281
+ calc_kurtosis<ldouble_safe>(
2282
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2283
+ input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
2284
+ input_data.ncat[col - input_data.ncols_numeric],
2285
+ workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
2286
+ model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
2287
+ else if (!workspace.weights_arr.empty())
2288
+ kurt_weights[col] =
2289
+ calc_kurtosis_weighted<decltype(workspace.weights_arr), ldouble_safe>(
2290
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2291
+ input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
2292
+ input_data.ncat[col - input_data.ncols_numeric],
2293
+ workspace.buffer_dbl.data(),
2294
+ model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
2295
+ workspace.weights_arr);
2296
+ else
2297
+ kurt_weights[col] =
2298
+ calc_kurtosis_weighted<decltype(workspace.weights_map), ldouble_safe>(
2299
+ workspace.ix_arr.data(), workspace.st, workspace.end,
2300
+ input_data.categ_data + (col - input_data.ncols_numeric) * input_data.nrows,
2301
+ input_data.ncat[col - input_data.ncols_numeric],
2302
+ workspace.buffer_dbl.data(),
2303
+ model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator,
2304
+ workspace.weights_map);
2305
+ }
2306
+
2307
+ /* Note to self: don't move this to outside of the braces, as it needs to assign a weight
2308
+ of zero to the columns that were not selected, thus it should only do this clipping
2309
+ for columns that are chosen. */
2310
+ if (kurt_weights[col] == -HUGE_VAL)
2311
+ {
2312
+ kurt_weights[col] = 0;
2313
+ }
2314
+
2315
+ else
2316
+ {
2317
+ kurt_weights[col] = std::fmax(1e-8, -1. + kurt_weights[col]);
2318
+ if (input_data.col_weights != NULL)
2319
+ {
2320
+ kurt_weights[col] *= input_data.col_weights[col];
2321
+ kurt_weights[col] = std::fmax(kurt_weights[col], 1e-100);
2322
+ }
2323
+ }
2324
+ }
2325
+
2326
+ skip_kurt_calculations:
2327
+ workspace.col_sampler.initialize(kurt_weights.data(), kurt_weights.size());
2328
+ avoid_leave_m_cols = true;
2329
+ }
2330
+
2331
+ if (model_params.prob_pick_col_by_range || model_params.prob_pick_col_by_var)
2332
+ {
2333
+ workspace.tree_kurtoses = kurt_weights.data();
2334
+ }
2335
+ }
2336
+
2337
+ bool col_sampler_is_fresh = true;
2338
+ if (input_data.preinitialized_col_sampler == NULL) {
2339
+ workspace.col_sampler.initialize(input_data.ncols_tot);
2340
+ }
2341
+ else {
2342
+ workspace.col_sampler = *((ColumnSampler<ldouble_safe>*)input_data.preinitialized_col_sampler);
2343
+ col_sampler_is_fresh = false;
2344
+ }
2345
+ /* TODO: this can be done more efficiently when sub-sampling columns */
2346
+ if (!avoid_leave_m_cols)
2347
+ workspace.col_sampler.leave_m_cols(model_params.ncols_per_tree, workspace.rnd_generator);
2348
+ if (model_params.ncols_per_tree < input_data.ncols_tot) col_sampler_is_fresh = false;
2349
+ workspace.try_all = false;
2350
+ if (hplane_root != NULL && model_params.ndim >= input_data.ncols_tot)
2351
+ workspace.try_all = true;
2352
+
2353
+ if (model_params.scoring_metric != Depth && !is_boxed_metric(model_params.scoring_metric))
2354
+ {
2355
+ workspace.density_calculator.initialize(model_params.max_depth,
2356
+ input_data.ncols_categ? input_data.max_categ : 0,
2357
+ tree_root != NULL && input_data.ncols_categ,
2358
+ model_params.scoring_metric);
2359
+ }
2360
+
2361
+ else if (is_boxed_metric(model_params.scoring_metric))
2362
+ {
2363
+ if (tree_root != NULL)
2364
+ workspace.density_calculator.initialize_bdens(input_data,
2365
+ model_params,
2366
+ workspace.ix_arr,
2367
+ workspace.col_sampler);
2368
+ else
2369
+ workspace.density_calculator.initialize_bdens_ext(input_data,
2370
+ model_params,
2371
+ workspace.ix_arr,
2372
+ workspace.col_sampler,
2373
+ col_sampler_is_fresh);
2374
+ }
2375
+
2376
+ if (tree_root != NULL)
2377
+ {
2378
+ split_itree_recursive<InputData, WorkerMemory, ldouble_safe>(
2379
+ *tree_root,
2380
+ workspace,
2381
+ input_data,
2382
+ model_params,
2383
+ impute_nodes,
2384
+ 0);
2385
+ }
2386
+
2387
+ else
2388
+ {
2389
+ split_hplane_recursive<InputData, WorkerMemory, ldouble_safe>(
2390
+ *hplane_root,
2391
+ workspace,
2392
+ input_data,
2393
+ model_params,
2394
+ impute_nodes,
2395
+ 0);
2396
+ }
2397
+
2398
+ /* if producing imputation structs, only need to keep the ones for terminal nodes */
2399
+ if (impute_nodes != NULL)
2400
+ drop_nonterminal_imp_node(*impute_nodes, tree_root, hplane_root);
2401
+ }