isotree 0.2.2 → 0.3.0

Files changed (151)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
data/vendor/isotree/src/fit_model.cpp (deleted; first 880 of 1,090 lines shown)
@@ -1,1090 +0,0 @@
- /* Isolation forests and variations thereof, with adjustments for incorporation
- * of categorical variables and missing values.
- * Written for the C++11 standard and aimed at being used in R and Python.
- *
- * This library is based on the following works:
- * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- *     "Isolation forest."
- *     2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
- * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- *     "Isolation-based anomaly detection."
- *     ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
- * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
- *     "Extended Isolation Forest."
- *     arXiv preprint arXiv:1811.02141 (2018).
- * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- *     "On detecting clustered anomalies using SCiForest."
- *     Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
- * [5] https://sourceforge.net/projects/iforest/
- * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
- * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
- *
- * BSD 2-Clause License
- * Copyright (c) 2020, David Cortes
- * All rights reserved.
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright notice, this
- *   list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright notice,
- *   this list of conditions and the following disclaimer in the documentation
- *   and/or other materials provided with the distribution.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
- #include "isotree.hpp"
-
- bool interrupt_switch;
-
- /* Fit Isolation Forest model, or variant of it such as SCiForest
- *
- * Parameters:
- * ===========
- * - model_outputs (out)
- *       Pointer to already allocated isolation forest model object for single-variable splits.
- *       If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
- *       additional trees through function 'add_tree'.
- * - model_outputs_ext (out)
- *       Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
- *       Note that if 'ndim' = 1, must instead use the single-variable model object.
- *       If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
- *       additional trees through function 'add_tree'.
- * - numeric_data[nrows * ncols_numeric]
- *       Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
- *       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
- *       Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
- *       no sparse numeric data either).
- *       Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
- * - ncols_numeric
- *       Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
- * - categ_data[nrows * ncols_categ]
- *       Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
- *       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
- *       Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
- *       Each category should be represented as an integer, and these integers must start at zero and
- *       be in consecutive order - i.e. if category '3' is present, category '2' must also be present
- *       (note that they are not treated as being ordinal, this is just an encoding). Missing values
- *       should be encoded as negative numbers such as (-1).
- * - ncols_categ
- *       Number of categorical columns in the data.
- * - ncat[ncols_categ]
- *       Number of categories in each categorical column. E.g. if the highest code for a column is '4',
- *       the number of categories for that column is '5' (zero is one category).
- * - Xc[nnz]
- *       Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
- *       Pass NULL if there are no sparse numeric columns.
- *       Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
- * - Xc_ind[nnz]
- *       Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
- *       Pass NULL if there are no sparse numeric columns.
- * - Xc_indptr[ncols_numeric + 1]
- *       Pointer to column index pointers that tell at entry [col] where column 'col'
- *       starts and at entry [col + 1] where it ends.
- *       Pass NULL if there are no sparse numeric columns.
- * - ndim
- *       How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
- *       the single-variable model. Note that the model object pointer passed must also
- *       agree with the value passed to 'ndim'.
- * - ntry
- *       In the split-criterion extended model, how many random hyperplanes to evaluate in
- *       order to decide which one is best to take. Ignored for the single-variable case
- *       and for random splits.
- * - coef_type
- *       For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
- *       (as proposed in [3]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [4]. Ignored for the
- *       single-variable model.
- * - sample_weights[nrows]
- *       Weights for the rows when building a tree, either as sampling importances when using
- *       sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
- *       in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
- *       the row appeared twice, thus it's less of an outlier) - how the weights are interpreted is determined
- *       through parameter 'weight_as_sample'.
- *       Pass NULL if the rows all have uniform weights.
- * - with_replacement
- *       Whether to produce sub-samples with replacement or not.
- * - weight_as_sample
- *       If passing 'sample_weights', whether to consider those weights as row sampling weights (i.e. the higher
- *       the weights, the more likely the observation will end up included in each tree sub-sample), or as distribution
- *       density weights (i.e. putting a weight of two is the same as if the row appeared twice, thus higher weight makes it
- *       less of an outlier). Note that sampling weight is only used when sub-sampling data for each tree.
- * - nrows
- *       Number of rows in 'numeric_data', 'Xc', 'categ_data'.
- * - sample_size
- *       Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
- *       1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
- *       random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
- *       will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
- *       in [5] is 'nrows' here.
- * - ntrees
- *       Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
- *       author's code in [5] is 10.
- * - max_depth
- *       Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
- * - limit_depth
- *       Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
- *       terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
- *       will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
- *       tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass higher values if
- *       using the model for purposes other than outlier detection.
- * - penalize_range
- *       Whether to penalize (add +1 to the terminal depth) observations at prediction time that have a value
- *       of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
- *       reasonable range in the data being split (given by 2 * range in data and centered around the split point),
- *       as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
- *       when splitting by categorical variables.
- * - standardize_dist
- *       If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
- *       depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
- * - tmat[nrows * (nrows - 1) / 2]
- *       Array in which to calculate average separation depths or standardized distance metric (see documentation
- *       for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
- *       the regular model process. If passing this output argument, the sample size must be the same as the number
- *       of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
- *       output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
- *       entry 0 <= i < j < n will be located at position
- *       p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
- *       Can be converted to a dense square matrix through function 'tmat_to_dense' (see the indexing sketch
- *       after this comment block).
- * - output_depths[nrows]
- *       Array in which to calculate average path depths or standardized outlierness metric (see documentation
- *       for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
- *       the regular model process. If passing this output argument, the sample size must be the same as the number
- *       of rows. If not NULL, must already be initialized to zeros.
- * - standardize_depth
- *       If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
- *       a metric in which the more of an outlier an observation is, the closer this standardized metric will be to 1,
- *       with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
- *       the average depth of each row across all trees.
- * - col_weights[ncols_numeric + ncols_categ]
- *       Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
- *       Ignored when picking columns by deterministic criterion.
- *       If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
- * - weigh_by_kurt
- *       Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
- *       for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
- *       sample, so if not using sub-samples, it's better to pass column weights calculated externally. For
- *       categorical columns, will calculate expected kurtosis if the column was converted to numerical by
- *       assigning to each category a random number ~ Unif(0, 1).
- * - prob_pick_by_gain_avg
- *       Probability of making each split in the single-variable model by choosing a column and split point in that
- *       same column as both the column and split point that gives the largest averaged gain (as proposed in [4]) across
- *       all available columns and possible splits in each column. Note that this implies evaluating every single column
- *       in the sample data when this type of split happens, which will potentially make the model fitting much slower,
- *       but has no impact on prediction time. For categorical variables, will take the expected standard deviation that
- *       would be obtained if the column were converted to numerical by assigning to each category a random number ~ Unif(0, 1)
- *       and calculate gain with those assumed standard deviations. For the extended model, this parameter indicates the probability that the
- *       split point in the chosen linear combination of variables will be decided by this averaged gain criterion. Compared to
- *       a pooled average, this tends to result in more cases in which a single observation or very few of them are put into
- *       one branch. Recommended to use sub-samples (parameter 'sample_size') when passing this parameter. When splits are
- *       not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl',
- *       both the column and the split point are decided at random.
- *       Default setting for [1], [2], [3] is zero, and default for [4] is 1. This is the randomization parameter that can
- *       be passed to the author's original code in [5]. Note that, if passing value 1 (100%) with no sub-sampling and using the
- *       single-variable model, every single tree will have the exact same splits.
- * - prob_split_by_gain_avg
- *       Probability of making each split by selecting a column at random and determining the split point as
- *       that which gives the highest averaged gain. Not supported for the extended model as the splits are on
- *       linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_avg' for more details.
- * - prob_pick_by_gain_pl
- *       Probability of making each split in the single-variable model by choosing a column and split point in that
- *       same column as both the column and split point that gives the largest pooled gain (as used in decision tree
- *       classifiers such as C4.5 in [7]) across all available columns and possible splits in each column. Note
- *       that this implies evaluating every single column in the sample data when this type of split happens, which
- *       will potentially make the model fitting much slower, but has no impact on prediction time. For categorical
- *       variables, will use Shannon entropy instead (like in [7]). For the extended model, this parameter indicates the probability
- *       that the split point in the chosen linear combination of variables will be decided by this pooled gain
- *       criterion. Compared to a simple average, this tends to result in more evenly-divided splits and more clustered
- *       groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
- *       When used for outlier detection, higher values of this parameter result in models that are able to better flag
- *       outliers in the training data, but generalize poorly to outliers in new data and to values of variables
- *       outside of the ranges from the training data. Passing small 'sample_size' and high values of this parameter will
- *       tend to flag too many outliers. When splits are not made according to any of 'prob_pick_by_gain_avg',
- *       'prob_pick_by_gain_pl', 'prob_split_by_gain_avg', 'prob_split_by_gain_pl', both the column and the split point
- *       are decided at random. Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
- *       every single tree will have the exact same splits.
- * - prob_split_by_gain_pl
- *       Probability of making each split by selecting a column at random and determining the split point as
- *       that which gives the highest pooled gain. Not supported for the extended model as the splits are on
- *       linear combinations of variables. See the documentation for parameter 'prob_pick_by_gain_pl' for more details.
- * - min_gain
- *       Minimum gain that a split threshold needs to produce in order to proceed with a split. Only used when the splits
- *       are decided by a gain criterion (either pooled or averaged). If the highest possible gain in the evaluated
- *       splits at a node is below this threshold, that node becomes a terminal node.
- * - missing_action
- *       How to handle missing data at both fitting and prediction time. Options are a) "Divide" (for the single-variable
- *       model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
- *       the data that went to each branch when fitting the model, b) "Impute", which will assign observations to the
- *       branch with the most observations in the single-variable model, or fill in missing values with the median
- *       of each column of the sample from which the split was made in the extended model (recommended), c) "Fail", which will assume
- *       there are no missing values and will trigger undefined behavior if it encounters any. In the extended model, infinite
- *       values will be treated as missing. Note that passing "Fail" might crash the process if there turn out to be
- *       missing values, but will otherwise produce faster fitting and prediction times along with decreased model object sizes.
- *       Models from [1], [2], [3], [4] correspond to "Fail" here.
- * - cat_split_type
- *       Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
- *       a single category to a branch and the rest to the other branch. For the extended model, whether to
- *       give each category a coefficient, or only one while the rest get zero.
- * - new_cat_action
- *       What to do after splitting a categorical feature when new data that reaches that split has categories that
- *       the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
- *       in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
- *       data that went to each branch when fitting the model, and in the extended model will assign
- *       them the median value for that column that was added to the linear combination of features, b) "Smallest", which will
- *       assign all observations with unseen categories in the split to the branch that had fewer observations when
- *       fitting the model, c) "Random", which will assign a branch (coefficient in the extended model) at random for
- *       each category beforehand, even if no observations had that category when fitting the model. Ignored when
- *       passing 'cat_split_type' = 'SingleCateg'.
- * - all_perm
- *       When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
- *       whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
- *       will sort the categories by their frequency and make a grouping in this sorted order. Note that the
- *       number of combinations evaluated (if 'true') is the factorial of the number of present categories in
- *       a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
- *       category in a separate branch, so not evaluating all permutations (passing 'false') will make it
- *       possible to select other splits that respect the sorted frequency order.
- *       The total number of combinations must be a number that can fit into a 'size_t' variable - for x86-64
- *       systems, this means no column can have more than 20 different categories if using 'all_perm=true',
- *       but note that this is not checked within the function.
- *       Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
- * - coef_by_prop
- *       In the extended model, whether to sort the randomly-generated coefficients for categories
- *       according to their relative frequency in the tree node. This might provide better results when using
- *       categorical variables with too many categories, but is not recommended, and not reflective of
- *       real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
- *       variables.
- * - imputer (out)
- *       Pointer to already-allocated imputer object, which can be used to produce missing value imputations
- *       in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
- *       'missing_action' as missing values inside the model are treated differently and follow their own imputation
- *       or division strategy.
- * - min_imp_obs
- *       Minimum number of observations with which an imputation value can be produced. Ignored if passing
- *       'build_imputer' = 'false'.
- * - depth_imp
- *       How to weight observations according to their depth when used for imputing missing values. Passing
- *       "Higher" will weigh observations higher the further down the tree (away from the root node) the
- *       terminal node is, while "Lower" will do the opposite, and "Same" will not modify the weights according
- *       to node depth in the tree. Implemented for testing purposes and not recommended to change
- *       from the default. Ignored when not passing 'impute_nodes'.
- * - weigh_imp_rows
- *       How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
- *       a node inversely proportional to the number of observations that end up there, while "Proportional"
- *       will weight them heavier the more observations there are, and "Flat" will weigh all nodes the same
- *       in this regard regardless of how many observations end up there. Implemented for testing purposes
- *       and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
- * - impute_at_fit
- *       Whether to impute missing values in the input data as the model is being built. If passing 'true',
- *       then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
- *       'categ_data', and 'Xc', will get overwritten with the imputations produced.
- * - random_seed
- *       Seed that will be used to generate random numbers used by the model.
- * - handle_interrupt
- *       Whether to handle interrupt signals while the process is running. Note that this will
- *       interfere with interrupt handlers when the procedure is called from interpreted languages
- *       such as Python or R.
- * - nthreads
- *       Number of parallel threads to use. Note that the more threads, the more memory will be
- *       allocated, even if the thread does not end up being used. Ignored when not building with
- *       OpenMP support.
- *
- * Returns
- * =======
- * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
- * If the process receives an interrupt signal, will return instead
- * 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
- * what these values correspond to, you can use the functions
- * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
- * as integers.
- *
- * References
- * ==========
- * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- *     "Isolation forest."
- *     2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
- * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- *     "Isolation-based anomaly detection."
- *     ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
- * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
- *     "Extended Isolation Forest."
- *     arXiv preprint arXiv:1811.02141 (2018).
- * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- *     "On detecting clustered anomalies using SCiForest."
- *     Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
- * [5] https://sourceforge.net/projects/iforest/
- * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
- * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
- */
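
As a quick illustration of the 'tmat' packing described above, the sketch below computes the position p(i,j) and expands the packed array into a dense symmetric matrix, roughly what 'tmat_to_dense' does. This is illustrative code, not part of the deleted file; the formula is rewritten as i*n - i*(i+1)/2 + j - i - 1 so that every division is exact in integer arithmetic (i*(i+1) is always even).

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    /* Position of entry (i,j), with 0 <= i < j < n, in the packed upper
       triangle: p(i,j) = i*(n - (i+1)/2) + j - i - 1. */
    static size_t tri_pos(size_t i, size_t j, size_t n)
    {
        return i * n - (i * (i + 1)) / 2 + j - i - 1;
    }

    int main()
    {
        const size_t n = 4;  /* 4 rows -> n*(n-1)/2 = 6 packed entries */
        std::vector<double> tmat = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6};

        /* expand into a dense symmetric matrix with a zero diagonal,
           a simplified stand-in for the library's 'tmat_to_dense' */
        std::vector<double> dense(n * n, 0.0);
        for (size_t i = 0; i < n - 1; i++)
            for (size_t j = i + 1; j < n; j++)
                dense[i * n + j] = dense[j * n + i] = tmat[tri_pos(i, j, n)];

        std::printf("d(1,3) = %.1f\n", dense[1 * n + 3]);  /* prints 0.5 */
        return 0;
    }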
- int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
-                 double numeric_data[], size_t ncols_numeric,
-                 int categ_data[], size_t ncols_categ, int ncat[],
-                 double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
-                 size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
-                 double sample_weights[], bool with_replacement, bool weight_as_sample,
-                 size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
-                 bool limit_depth, bool penalize_range,
-                 bool standardize_dist, double tmat[],
-                 double output_depths[], bool standardize_depth,
-                 double col_weights[], bool weigh_by_kurt,
-                 double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
-                 double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
-                 double min_gain, MissingAction missing_action,
-                 CategSplit cat_split_type, NewCategAction new_cat_action,
-                 bool all_perm, Imputer *imputer, size_t min_imp_obs,
-                 UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
-                 uint64_t random_seed, bool handle_interrupt, int nthreads)
- {
-     /* calculate maximum number of categories to use later */
-     int max_categ = 0;
-     for (size_t col = 0; col < ncols_categ; col++)
-         max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
-
-     bool calc_dist = tmat != NULL;
-
-     if (calc_dist || sample_size == 0)
-         sample_size = nrows;
-
-     /* put data in structs to shorten function calls */
-     InputData input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
-                             nrows, ncols_numeric + ncols_categ, sample_weights,
-                             weight_as_sample, col_weights,
-                             Xc, Xc_ind, Xc_indptr,
-                             0, 0, std::vector<double>(),
-                             std::vector<char>(), 0};
-     ModelParams model_params = {with_replacement, sample_size, ntrees,
-                                 limit_depth? log2ceil(sample_size) : max_depth? max_depth : (sample_size - 1),
-                                 penalize_range, random_seed, weigh_by_kurt,
-                                 prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
-                                 prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
-                                 min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
-                                 (model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
-                                 coef_type, coef_by_prop, calc_dist, (bool)(output_depths != NULL), impute_at_fit,
-                                 depth_imp, weigh_imp_rows, min_imp_obs};
-
-     /* if using weights as sampling probability, build a binary tree for faster sampling */
-     if (input_data.weight_as_sample && input_data.sample_weights != NULL)
-     {
-         build_btree_sampler(input_data.btree_weights_init, input_data.sample_weights,
-                             input_data.nrows, input_data.log2_n, input_data.btree_offset);
-     }
-
-     /* if imputing missing values on-the-fly, need to determine which are missing */
-     std::vector<ImputedData> impute_vec;
-     std::unordered_map<size_t, ImputedData> impute_map;
-     if (model_params.impute_at_fit)
-         check_for_missing(input_data, impute_vec, impute_map, nthreads);
-
-     /* store model data */
-     if (model_outputs != NULL)
-     {
-         model_outputs->trees.resize(ntrees);
-         model_outputs->trees.shrink_to_fit();
-         model_outputs->new_cat_action = new_cat_action;
-         model_outputs->cat_split_type = cat_split_type;
-         model_outputs->missing_action = missing_action;
-         model_outputs->exp_avg_depth  = expected_avg_depth(sample_size);
-         model_outputs->exp_avg_sep = expected_separation_depth(model_params.sample_size);
-         model_outputs->orig_sample_size = input_data.nrows;
-     }
-
-     else
-     {
-         model_outputs_ext->hplanes.resize(ntrees);
-         model_outputs_ext->hplanes.shrink_to_fit();
-         model_outputs_ext->new_cat_action = new_cat_action;
-         model_outputs_ext->cat_split_type = cat_split_type;
-         model_outputs_ext->missing_action = missing_action;
-         model_outputs_ext->exp_avg_depth  = expected_avg_depth(sample_size);
-         model_outputs_ext->exp_avg_sep = expected_separation_depth(model_params.sample_size);
-         model_outputs_ext->orig_sample_size = input_data.nrows;
-     }
-
-     if (imputer != NULL)
-         initialize_imputer(*imputer, input_data, ntrees, nthreads);
-
-     /* initialize thread-private memory */
-     if ((size_t)nthreads > ntrees)
-         nthreads = (int)ntrees;
-     #ifdef _OPENMP
-         std::vector<WorkerMemory> worker_memory(nthreads);
-     #else
-         std::vector<WorkerMemory> worker_memory(1);
-     #endif
-
-     /* Global variable that determines if the procedure receives a stop signal */
-     interrupt_switch = false;
-     /* TODO: find a better way of handling interrupt signals when calling in Python/R.
-        The following will still change the behavior of interrupts when called through e.g. Flask */
-     #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
-     struct sigaction sig_handle = {};
-     if (handle_interrupt)
-     {
-         sig_handle.sa_flags = SA_RESETHAND;
-         sig_handle.sa_handler = set_interrup_global_variable;
-         sigemptyset(&sig_handle.sa_mask);
-     }
-     #endif
-
-     /* grow trees */
-     #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
-     for (size_t_for tree = 0; tree < ntrees; tree++)
-     {
-         if (interrupt_switch)
-             continue; /* Cannot break with OpenMP==2.0 (MSVC) */
-
-         if (
-             model_params.impute_at_fit &&
-             input_data.n_missing &&
-             !worker_memory[omp_get_thread_num()].impute_vec.size() &&
-             !worker_memory[omp_get_thread_num()].impute_map.size()
-             )
-         {
-             #ifdef _OPENMP
-             if (nthreads > 1)
-             {
-                 worker_memory[omp_get_thread_num()].impute_vec = impute_vec;
-                 worker_memory[omp_get_thread_num()].impute_map = impute_map;
-             }
-
-             else
-             #endif
-             {
-                 worker_memory[0].impute_vec = std::move(impute_vec);
-                 worker_memory[0].impute_map = std::move(impute_map);
-             }
-         }
-
-         fit_itree((model_outputs != NULL)? &model_outputs->trees[tree] : NULL,
-                   (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : NULL,
-                   worker_memory[omp_get_thread_num()],
-                   input_data,
-                   model_params,
-                   (imputer != NULL)? &(imputer->imputer_tree[tree]) : NULL,
-                   tree);
-
-         if ((model_outputs != NULL))
-             model_outputs->trees[tree].shrink_to_fit();
-         else
-             model_outputs_ext->hplanes[tree].shrink_to_fit();
-
-         if (handle_interrupt)
-         {
-             #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
-             sigaction(SIGINT, &sig_handle, NULL);
-             #else
-             signal(SIGINT, set_interrup_global_variable);
-             #endif
-         }
-     }
-
-     /* check if the procedure got interrupted */
-     if (interrupt_switch) return EXIT_FAILURE;
-     interrupt_switch = false;
-
-     if ((model_outputs != NULL))
-         model_outputs->trees.shrink_to_fit();
-     else
-         model_outputs_ext->hplanes.shrink_to_fit();
-
-     /* if calculating similarity/distance, now need to reduce and average */
-     if (calc_dist)
-         gather_sim_result(NULL, &worker_memory,
-                           NULL, &input_data,
-                           model_outputs, model_outputs_ext,
-                           tmat, NULL, 0,
-                           model_params.ntrees, false,
-                           standardize_dist, nthreads);
-
-     /* same for depths */
-     if (output_depths != NULL)
-     {
-         #ifdef _OPENMP
-         if (nthreads > 1)
-         {
-             for (WorkerMemory &w : worker_memory)
-             {
-                 if (w.row_depths.size())
-                 {
-                     #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data, output_depths, w, worker_memory)
-                     for (size_t_for row = 0; row < input_data.nrows; row++)
-                         output_depths[row] += w.row_depths[row];
-                 }
-             }
-         }
-         else
-         #endif
-         {
-             std::copy(worker_memory[0].row_depths.begin(), worker_memory[0].row_depths.end(), output_depths);
-         }
-
-         if (standardize_depth)
-         {
-             double depth_divisor = (double)ntrees * ((model_outputs != NULL)?
-                                                      model_outputs->exp_avg_depth : model_outputs_ext->exp_avg_depth);
-             for (size_t_for row = 0; row < nrows; row++)
-                 output_depths[row] = exp2( - output_depths[row] / depth_divisor );
-         }
-
-         else
-         {
-             double ntrees_dbl = (double) ntrees;
-             for (size_t_for row = 0; row < nrows; row++)
-                 output_depths[row] /= ntrees_dbl;
-         }
-     }
-
-     /* if imputing missing values, now need to reduce and write final values */
-     if (model_params.impute_at_fit)
-     {
-         #ifdef _OPENMP
-         if (nthreads > 1)
-         {
-             for (WorkerMemory &w : worker_memory)
-                 combine_tree_imputations(w, impute_vec, impute_map, input_data.has_missing, nthreads);
-         }
-
-         else
-         #endif
-         {
-             impute_vec = std::move(worker_memory[0].impute_vec);
-             impute_map = std::move(worker_memory[0].impute_map);
-         }
-
-         apply_imputation_results(impute_vec, impute_map, *imputer, input_data, nthreads);
-     }
-
-     return EXIT_SUCCESS;
- }
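
To make the signature above concrete, here is a hypothetical minimal invocation fitting the single-variable model to dense numeric data only. The argument values are illustrative defaults, not recommendations; the enum constants ('Normal', 'Fail', 'SubSet', 'Weighted', 'Higher', 'Inverse') are assumed from the option names in the documentation above, and only 'Uniform', 'Divide', 'SingleCateg' and 'SubSet' appear verbatim in this file.

    #include <cstddef>
    #include "isotree.hpp"   /* declares fit_iforest and the enums used below */

    /* X is column-major: element (row, col) sits at X[row + col*nrows] */
    int example_fit(double X[], size_t nrows, size_t ncols)
    {
        IsoForest model;                   /* single-variable model, filled by fit_iforest */
        return fit_iforest(
            &model, NULL,                  /* single-variable model only */
            X, ncols,                      /* dense numeric data */
            NULL, 0, NULL,                 /* no categorical columns */
            NULL, NULL, NULL,              /* no sparse (CSC) inputs */
            1, 3, Normal, false,           /* ndim=1; ntry/coef_type unused here */
            NULL, false, true,             /* uniform row weights */
            nrows, 256, 100, 0,            /* sample_size=256, ntrees=100 as in [1] */
            true, false,                   /* limit_depth on, no range penalty */
            false, NULL,                   /* no distance (tmat) output */
            NULL, false,                   /* no depth output */
            NULL, false,                   /* uniform column weights */
            0., 0., 0., 0.,                /* fully random columns and split points */
            0., Fail,                      /* min_gain; data assumed free of NAs */
            SubSet, Weighted,              /* categorical options (unused here) */
            false, NULL, 3,                /* all_perm, no imputer, min_imp_obs */
            Higher, Inverse, false,        /* imputation weighting (unused) */
            1, true, 1);                   /* random_seed, handle_interrupt, nthreads */
    }

The call returns 'EXIT_SUCCESS' on completion or 'EXIT_FAILURE' if interrupted, as documented above; passing a non-NULL 'output_depths' with 'standardize_depth' = true would additionally yield the standardized outlier score computed by the 'exp2(-depth/divisor)' step in the body above.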
-
-
- /* Add additional trees to already-fitted isolation forest model
- *
- * Parameters
- * ==========
- * - model_outputs
- *       Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
- *       if the trees are to be added to an extended model. Can only pass one of
- *       'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
- *       so it cannot be run in parallel for the same model object.
- * - model_outputs_ext
- *       Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
- *       if the trees are to be added to a single-variable model. Can only pass one of
- *       'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
- *       so it cannot be run in parallel for the same model object.
- * - numeric_data
- *       Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
- *       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
- *       Pass NULL if there are no dense numeric columns.
- *       Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
- *       If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
- *       of columns, either as dense or as sparse arrays.
- * - ncols_numeric
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - categ_data
- *       Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
- *       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
- *       Pass NULL if there are no categorical columns. The encoding must be the same as was used
- *       in the data to which the model was fit.
- *       Each category should be represented as an integer, and these integers must start at zero and
- *       be in consecutive order - i.e. if category '3' is present, category '2' must have also been
- *       present when the model was fit (note that they are not treated as being ordinal, this is just
- *       an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
- *       must be the same as was used in the data to which the model was fit.
- *       If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
- *       of columns and the same category encoding.
- * - ncols_categ
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - ncat
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - Xc[nnz]
- *       Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
- *       Pass NULL if there are no sparse numeric columns.
- *       Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
- * - Xc_ind[nnz]
- *       Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
- *       Pass NULL if there are no sparse numeric columns.
- * - Xc_indptr[ncols_numeric + 1]
- *       Pointer to column index pointers that tell at entry [col] where column 'col'
- *       starts and at entry [col + 1] where it ends.
- *       Pass NULL if there are no sparse numeric columns.
- * - ndim
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - ntry
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - coef_type
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - sample_weights
- *       Weights for the rows when adding this tree, either as sampling importances when using
- *       sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
- *       in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
- *       the row appeared twice, thus it's less of an outlier) - how the weights are interpreted is determined
- *       through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
- *       Pass NULL if the rows all have uniform weights.
- * - nrows
- *       Number of rows in 'numeric_data', 'Xc', 'categ_data'.
- * - max_depth
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - limit_depth
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - penalize_range
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - col_weights
- *       Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
- *       Ignored when picking columns by deterministic criterion.
- *       If passing NULL, each column will have a uniform weight. Cannot be used when weighting by kurtosis.
- * - weigh_by_kurt
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - prob_pick_by_gain_avg
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - prob_split_by_gain_avg
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - prob_pick_by_gain_pl
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - prob_split_by_gain_pl
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - min_gain
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - missing_action
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - cat_split_type
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - new_cat_action
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - depth_imp
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - weigh_imp_rows
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
- *       what was originally passed to 'fit_iforest'.
- * - all_perm
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - coef_by_prop
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - impute_nodes
- *       Pointer to already-allocated imputation nodes for the tree that will be built. Note that the number of
- *       entries in the imputation object must match the number of fitted trees when it is used. Pass
- *       NULL if no imputation node is required.
- * - min_imp_obs
- *       Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
- *       what was originally passed to 'fit_iforest'.
- * - random_seed
- *       Seed that will be used to generate random numbers used by the model.
- */
- int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
-              double numeric_data[], size_t ncols_numeric,
-              int categ_data[], size_t ncols_categ, int ncat[],
-              double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
-              size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
-              double sample_weights[], size_t nrows, size_t max_depth,
-              bool limit_depth, bool penalize_range,
-              double col_weights[], bool weigh_by_kurt,
-              double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
-              double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
-              double min_gain, MissingAction missing_action,
-              CategSplit cat_split_type, NewCategAction new_cat_action,
-              UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
-              bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
-              uint64_t random_seed)
- {
-     int max_categ = 0;
-     for (size_t col = 0; col < ncols_categ; col++)
-         max_categ = (ncat[col] > max_categ)? ncat[col] : max_categ;
-
-     InputData input_data = {numeric_data, ncols_numeric, categ_data, ncat, max_categ, ncols_categ,
-                             nrows, ncols_numeric + ncols_categ, sample_weights,
-                             false, col_weights,
-                             Xc, Xc_ind, Xc_indptr,
-                             0, 0, std::vector<double>(),
-                             std::vector<char>(), 0};
-     ModelParams model_params = {false, nrows, (size_t)1,
-                                 max_depth? max_depth : (nrows - 1),
-                                 penalize_range, random_seed, weigh_by_kurt,
-                                 prob_pick_by_gain_avg, (model_outputs == NULL)? 0 : prob_split_by_gain_avg,
-                                 prob_pick_by_gain_pl, (model_outputs == NULL)? 0 : prob_split_by_gain_pl,
-                                 min_gain, cat_split_type, new_cat_action, missing_action, all_perm,
-                                 (model_outputs != NULL)? 0 : ndim, (model_outputs != NULL)? 0 : ntry,
-                                 coef_type, coef_by_prop, false, false, false, depth_imp, weigh_imp_rows, min_imp_obs};
-
-     std::unique_ptr<WorkerMemory> workspace = std::unique_ptr<WorkerMemory>(new WorkerMemory);
-
-     size_t last_tree;
-     if (model_outputs != NULL)
-     {
-         last_tree = model_outputs->trees.size();
-         model_outputs->trees.emplace_back();
-     }
-
-     else
-     {
-         last_tree = model_outputs_ext->hplanes.size();
-         model_outputs_ext->hplanes.emplace_back();
-     }
-
-     fit_itree((model_outputs != NULL)? &model_outputs->trees.back() : NULL,
-               (model_outputs_ext != NULL)? &model_outputs_ext->hplanes.back() : NULL,
-               *workspace,
-               input_data,
-               model_params,
-               impute_nodes,
-               last_tree);
-
-     if ((model_outputs != NULL))
-         model_outputs->trees.back().shrink_to_fit();
-     else
-         model_outputs_ext->hplanes.back().shrink_to_fit();
-
-     return EXIT_SUCCESS;
- }
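
For reference, the 'Xc' + 'Xc_ind' + 'Xc_indptr' triplet that both functions accept is ordinary CSC (compressed sparse column) storage. A small hand-built example (illustrative, not from the deleted file):

    /* 3x2 matrix, stored column by column:
           col 0: (0, 5, 0)
           col 1: (7, 0, 8)                                          */
    double    Xc[]        = {5., 7., 8.};  /* non-zero values        */
    sparse_ix Xc_ind[]    = {1, 0, 2};     /* row index of each value */
    sparse_ix Xc_indptr[] = {0, 1, 3};     /* column j spans entries
                                              [Xc_indptr[j], Xc_indptr[j+1]) */

Column 0 thus holds a single non-zero (value 5 at row 1) and column 1 holds two (7 at row 0, 8 at row 2).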
767
-
768
- void fit_itree(std::vector<IsoTree> *tree_root,
769
- std::vector<IsoHPlane> *hplane_root,
770
- WorkerMemory &workspace,
771
- InputData &input_data,
772
- ModelParams &model_params,
773
- std::vector<ImputeNode> *impute_nodes,
774
- size_t tree_num)
775
- {
776
- /* initialize array for depths if called for */
777
- if (!workspace.ix_arr.size() && model_params.calc_depth)
778
- workspace.row_depths.resize(input_data.nrows, 0);
779
-
780
- /* choose random sample of rows */
781
- if (!workspace.ix_arr.size()) workspace.ix_arr.resize(model_params.sample_size);
782
- if (input_data.log2_n > 0)
783
- workspace.btree_weights.assign(input_data.btree_weights_init.begin(),
784
- input_data.btree_weights_init.end());
785
- workspace.rnd_generator.seed(model_params.random_seed + tree_num);
786
-     if (input_data.col_weights != NULL)
-         workspace.col_sampler = std::discrete_distribution<size_t>(input_data.col_weights,
-                                                                    input_data.col_weights + input_data.ncols_numeric + input_data.ncols_categ);
-     workspace.runif = std::uniform_int_distribution<size_t>(0, input_data.ncols_tot - 1);
-     workspace.rbin = std::uniform_real_distribution<double>(0, 1);
-     sample_random_rows(workspace.ix_arr, input_data.nrows, model_params.with_replacement,
-                        workspace.rnd_generator, workspace.ix_all,
-                        (input_data.weight_as_sample)? input_data.sample_weights : NULL,
-                        workspace.btree_weights, input_data.log2_n, input_data.btree_offset,
-                        workspace.is_repeated);
-     workspace.st = 0;
-     workspace.end = model_params.sample_size - 1;
-     if (!workspace.cols_possible.size())
-         workspace.cols_possible.resize(input_data.ncols_tot, true);
-     else
-         workspace.cols_possible.assign(workspace.cols_possible.size(), true);
- 
-     /* set expected tree size and add root node */
-     {
-         size_t exp_nodes = 2 * model_params.sample_size;
-         if (model_params.sample_size >= (SIZE_MAX / (size_t)2))
-             exp_nodes = SIZE_MAX;
-         if (model_params.max_depth <= (size_t)30)
-             exp_nodes = std::min(exp_nodes, pow2(model_params.max_depth));
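-         /* note: a binary tree grown on n sampled points has at most 2*n - 1
-            nodes, so ~2*n (further capped when there is a low depth limit)
-            is a cheap upper estimate that avoids reallocations during the
-            recursive splitting below */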
-         if (tree_root != NULL)
-         {
-             tree_root->reserve(exp_nodes);
-             tree_root->emplace_back();
-         }
-         else
-         {
-             hplane_root->reserve(exp_nodes);
-             hplane_root->emplace_back();
-         }
-         if (impute_nodes != NULL)
-         {
-             impute_nodes->reserve(exp_nodes);
-             impute_nodes->emplace_back((size_t) 0);
-         }
-     }
- 
-     /* initialize array with candidate categories if not already done */
-     if (!workspace.categs.size())
-         workspace.categs.resize(input_data.max_categ);
- 
-     /* for the extended model, initialize extra vectors and objects */
-     if (hplane_root != NULL && !workspace.comb_val.size())
-     {
-         workspace.coef_norm = std::normal_distribution<double>(0, 1);
-         if (model_params.coef_type == Uniform)
-             workspace.coef_unif = std::uniform_real_distribution<double>(-1, 1);
- 
-         workspace.cols_shuffled.resize(input_data.ncols_tot);
-         workspace.comb_val.resize(model_params.sample_size);
-         workspace.col_take.resize(model_params.ndim);
-         workspace.col_take_type.resize(model_params.ndim);
- 
-         if (input_data.ncols_numeric)
-         {
-             workspace.ext_offset.resize(input_data.ncols_tot);
-             workspace.ext_coef.resize(input_data.ncols_tot);
-             workspace.ext_mean.resize(input_data.ncols_tot);
-         }
- 
-         if (input_data.ncols_categ)
-         {
-             workspace.ext_fill_new.resize(input_data.max_categ);
-             switch (model_params.cat_split_type)
-             {
-                 case SingleCateg:
-                 {
-                     workspace.chosen_cat.resize(input_data.max_categ);
-                     break;
-                 }
- 
-                 case SubSet:
-                 {
-                     workspace.ext_cat_coef.resize(input_data.ncols_tot);
-                     for (std::vector<double> &v : workspace.ext_cat_coef)
-                         v.resize(input_data.max_categ);
-                     break;
-                 }
-             }
-         }
- 
-         workspace.ext_fill_val.resize(input_data.ncols_tot);
-     }
- 
-     /* if the data contain missing values, also need to set up an array of weights,
-        which will get modified during the iterations whenever NAs are encountered.
-        If density weights were passed, they are standardized here so that they sum
-        up to the sample size */
-     long double weight_scaling = 0;
-     if (model_params.missing_action == Divide || (input_data.sample_weights != NULL && !input_data.weight_as_sample))
-     {
-         workspace.weights_map.clear();
- 
-         /* if the sub-sample size is small relative to the full sample size, use a mapping */
-         if (model_params.sample_size < input_data.nrows / 4)
-         {
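-             /* (with a small sub-sample, a hash map keyed by row index costs
-                far less memory than a dense array of length nrows) */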
-             if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
-             {
-                 for (const size_t ix : workspace.ix_arr)
-                 {
-                     weight_scaling += input_data.sample_weights[ix];
-                     workspace.weights_map[ix] = input_data.sample_weights[ix];
-                 }
-                 weight_scaling = (long double)model_params.sample_size / weight_scaling;
-                 for (auto &w : workspace.weights_map)
-                     w.second *= weight_scaling;
-             }
-             else
-             {
-                 for (const size_t ix : workspace.ix_arr)
-                     workspace.weights_map[ix] = 1;
-             }
-         }
- 
-         /* if the sub-sample size is large, fill a full array covering all rows */
-         else
-         {
-             if (!workspace.weights_arr.size())
-             {
-                 if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
-                 {
-                     workspace.weights_arr.assign(input_data.sample_weights, input_data.sample_weights + input_data.nrows);
-                     weight_scaling = std::accumulate(workspace.ix_arr.begin(),
-                                                      workspace.ix_arr.end(),
-                                                      (long double)0,
-                                                      [&input_data](const long double a, const size_t b){return a + (long double)input_data.sample_weights[b];}
-                                                      );
-                     weight_scaling = (long double)model_params.sample_size / weight_scaling;
-                     for (double &w : workspace.weights_arr)
-                         w *= weight_scaling;
-                 }
-                 else
-                 {
-                     workspace.weights_arr.resize(input_data.nrows, (double)1);
-                 }
-             }
-             else
-             {
-                 if (input_data.sample_weights != NULL && !input_data.weight_as_sample)
-                 {
-                     for (const size_t ix : workspace.ix_arr)
-                     {
-                         weight_scaling += input_data.sample_weights[ix];
-                         workspace.weights_arr[ix] = input_data.sample_weights[ix];
-                     }
-                     weight_scaling = (long double)model_params.sample_size / weight_scaling;
-                     for (double &w : workspace.weights_arr)
-                         w *= weight_scaling;
-                 }
-                 else
-                 {
-                     /* Note: while not all of them need to be overwritten, this is faster
-                        (the sub-sample size was already determined to be at least 1/4 of the sample size) */
-                     std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), (double)1);
-                 }
-             }
-         }
-     }
- 
-     /* if producing distances/similarities, also need to initialize the triangular matrix */
-     if (model_params.calc_dist && !workspace.tmat_sep.size())
-         workspace.tmat_sep.resize((input_data.nrows * (input_data.nrows - 1)) / 2, 0);
- 
-     /* make space for buffers if not already allocated */
-     if (
-         (model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
-          model_params.prob_split_by_gain_pl || model_params.prob_pick_by_gain_pl ||
-          model_params.weigh_by_kurt || hplane_root != NULL)
-         &&
-         (!workspace.buffer_dbl.size() && !workspace.buffer_szt.size() && !workspace.buffer_chr.size())
-        )
-     {
-         size_t min_size_dbl = 0;
-         size_t min_size_szt = 0;
-         size_t min_size_chr = 0;
- 
-         bool gain = model_params.prob_split_by_gain_avg || model_params.prob_pick_by_gain_avg ||
-                     model_params.prob_split_by_gain_pl || model_params.prob_pick_by_gain_pl;
- 
-         if (input_data.ncols_categ)
-         {
-             min_size_szt = 2 * input_data.max_categ;
-             min_size_dbl = input_data.max_categ + 1;
-             if (gain && model_params.cat_split_type == SubSet)
-                 min_size_chr = input_data.max_categ;
-         }
- 
-         if (input_data.Xc_indptr != NULL && gain)
-         {
-             min_size_szt = std::max(min_size_szt, model_params.sample_size);
-             min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
-         }
- 
-         /* for the extended model */
-         if (hplane_root != NULL)
-         {
-             min_size_dbl = std::max(min_size_dbl, pow2(log2ceil(input_data.ncols_tot) + 1));
-             if (model_params.missing_action != Fail)
-             {
-                 min_size_szt = std::max(min_size_szt, model_params.sample_size);
-                 min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
-             }
- 
-             if (input_data.ncols_categ && model_params.cat_split_type == SubSet)
-             {
-                 min_size_szt = std::max(min_size_szt, 2 * (size_t)input_data.max_categ + 1);
-                 min_size_dbl = std::max(min_size_dbl, (size_t)input_data.max_categ);
-             }
- 
-             if (model_params.weigh_by_kurt)
-                 min_size_szt = std::max(min_size_szt, input_data.ncols_tot);
-         }
- 
-         /* now resize */
-         if (workspace.buffer_dbl.size() < min_size_dbl)
-             workspace.buffer_dbl.resize(min_size_dbl);
- 
-         if (workspace.buffer_szt.size() < min_size_szt)
-             workspace.buffer_szt.resize(min_size_szt);
- 
-         if (workspace.buffer_chr.size() < min_size_chr)
-             workspace.buffer_chr.resize(min_size_chr);
- 
-         /* for guided column choice, need to also remember the best split so far */
-         if (
-             model_params.cat_split_type == SubSet &&
-             (
-                 model_params.prob_pick_by_gain_avg ||
-                 model_params.prob_pick_by_gain_pl
-             )
-            )
-         {
-             workspace.this_split_categ.resize(input_data.max_categ);
-         }
-     }
- 
-     /* weigh columns by kurtosis in the sample if required */
-     if (model_params.weigh_by_kurt)
-     {
-         std::vector<double> kurt_weights(input_data.ncols_numeric + input_data.ncols_categ);
- 
-         if (input_data.Xc_indptr == NULL)
-         {
-             for (size_t col = 0; col < input_data.ncols_numeric; col++)
-                 kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
-                                                   input_data.numeric_data + col * input_data.nrows,
-                                                   model_params.missing_action);
-         }
-         else
-         {
-             std::sort(workspace.ix_arr.begin(), workspace.ix_arr.end());
-             for (size_t col = 0; col < input_data.ncols_numeric; col++)
-                 kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end, col,
-                                                   input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
-                                                   model_params.missing_action);
-         }
- 
-         for (size_t col = 0; col < input_data.ncols_categ; col++)
-             kurt_weights[col + input_data.ncols_numeric] =
-                 calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
-                               input_data.categ_data + col * input_data.nrows, input_data.ncat[col],
-                               workspace.buffer_szt.data(), workspace.buffer_dbl.data(),
-                               model_params.missing_action, model_params.cat_split_type, workspace.rnd_generator);
- 
-         for (size_t col = 0; col < input_data.ncols_tot; col++)
-             if (kurt_weights[col] <= 0 || is_na_or_inf(kurt_weights[col]))
-                 workspace.cols_possible[col] = false;
- 
-         workspace.col_sampler = std::discrete_distribution<size_t>(kurt_weights.begin(), kurt_weights.end());
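-         /* columns will now be drawn with probability proportional to their
-            in-sample kurtosis (std::discrete_distribution normalizes the
-            weights internally) */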
-     }
- 
-     if (tree_root != NULL)
-         split_itree_recursive(*tree_root,
-                               workspace,
-                               input_data,
-                               model_params,
-                               impute_nodes,
-                               0);
-     else
-         split_hplane_recursive(*hplane_root,
-                                workspace,
-                                input_data,
-                                model_params,
-                                impute_nodes,
-                                0);
- 
-     /* if producing imputation structs, only need to keep the ones for terminal nodes */
-     if (impute_nodes != NULL)
-         drop_nonterminal_imp_node(*impute_nodes, tree_root, hplane_root);
- }
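The weight-standardization logic above appears in three variants (hash map, freshly filled array, reused array), but it always reduces to the same rescaling: w_i <- w_i * n / sum_j(w_j), so the weights end up summing to the sample size. A minimal self-contained sketch of just that step (the function name is made up; this is not the library's API):

```cpp
// Minimal sketch of the rescaling used in fit_itree: after this call the
// weights sum to sample_size, with the accumulation done in long double
// as in the code above to limit rounding error over many rows.
#include <cstddef>
#include <numeric>
#include <vector>

void standardize_density_weights(std::vector<double> &weights,
                                 std::size_t sample_size)
{
    long double total = std::accumulate(weights.begin(), weights.end(),
                                        (long double)0);
    long double scaling = (long double)sample_size / total;
    for (double &w : weights)
        w *= (double)scaling;
}
```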