isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -0,0 +1,2111 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4.5: Programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2021, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+
64
+ /* Standard headers */
65
+ #include <cstddef>
66
+ #include <cstdint>
67
+ #include <vector>
68
+ using std::size_t;
69
+
70
+ /* The library has overloaded functions supporting different input types.
71
+ Note that, while 'float' type is supported, it will
72
+ be slower to fit models to them as the models internally use
73
+ 'double' and 'long double', and it's not recommended to use.
74
+
75
+ In order to use the library with different types than the ones
76
+ suggested here, add something like this before including the
77
+ library header:
78
+ #define real_t float
79
+ #define sparse_ix int
80
+ #include "isotree.hpp"
81
+ The header may be included multiple times if required. */
82
+ #ifndef real_t
83
+ #define real_t double /* supported: float, double */
84
+ #endif
85
+ #ifndef sparse_ix
86
+ #define sparse_ix int /* supported: int, int64_t, size_t */
87
+ #endif
88
+
89
+ #ifndef ISOTREE_H
90
+ #define ISOTREE_H
91
+
92
+ #ifdef _WIN32
93
+ #define ISOTREE_EXPORTED __declspec(dllimport)
94
+ #else
95
+ #define ISOTREE_EXPORTED
96
+ #endif
97
+
98
+
99
+ /* Types used through the package - zero is the suggested value (when appropriate) */
100
+ typedef enum NewCategAction {Weighted=0, Smallest=11, Random=12} NewCategAction; /* Weighted means Impute in the extended model */
101
+ typedef enum MissingAction {Divide=21, Impute=22, Fail=0} MissingAction; /* Divide is only for non-extended model */
102
+ typedef enum ColType {Numeric=31, Categorical=32, NotUsed=0} ColType;
103
+ typedef enum CategSplit {SubSet=0, SingleCateg=41} CategSplit;
104
+ typedef enum CoefType {Uniform=61, Normal=0} CoefType; /* For extended model */
105
+ typedef enum UseDepthImp {Lower=71, Higher=0, Same=72} UseDepthImp; /* For NA imputation */
106
+ typedef enum WeighImpRows {Inverse=0, Prop=81, Flat=82} WeighImpRows; /* For NA imputation */
107
+ typedef enum ScoringMetric {Depth=0, Density=92, BoxedDensity=94, BoxedDensity2=96, BoxedRatio=95,
108
+ AdjDepth=91, AdjDensity=93} ScoringMetric;
109
+
110
+ /* Notes about new categorical action:
111
+ * - For single-variable case, if using 'Smallest', can then pass data at prediction time
112
+ * having categories that were never in the training data (as an integer higher than 'ncat'
113
+ * for that column), but if using 'Random' or 'Weighted', these must be passed as NA (int < 0)
114
+ * - For extended case, 'Weighted' becomes a weighted imputation instead, and if using either
115
+ * 'Weighted' or 'Smallest', can pass newer, unseen categories at prediction time too.
116
+ * - If using 'Random', cannot pass new categories at prediction time.
117
+ * - If using 'Weighted' for single-variable case, cannot predict similarity with a value
118
+ * for MissingAction other than 'Divide'. */
119
+
120
+
121
+
122
+ /* Structs that are output (modified) from the main function */
123
+ typedef struct IsoTree {
124
+ ColType col_type = NotUsed;
125
+ size_t col_num;
126
+ double num_split;
127
+ std::vector<char> cat_split;
128
+ int chosen_cat;
129
+ size_t tree_left;
130
+ size_t tree_right;
131
+ double pct_tree_left;
132
+ double score; /* will not be integer when there are weights or early stop */
133
+ double range_low = -HUGE_VAL;
134
+ double range_high = HUGE_VAL;
135
+ double remainder; /* only used for distance/similarity */
136
+
137
+ IsoTree() = default;
138
+
139
+ } IsoTree;
140
+
141
+ typedef struct IsoHPlane {
142
+ std::vector<size_t> col_num;
143
+ std::vector<ColType> col_type;
144
+ std::vector<double> coef;
145
+ std::vector<double> mean;
146
+ std::vector<std::vector<double>> cat_coef;
147
+ std::vector<int> chosen_cat;
148
+ std::vector<double> fill_val;
149
+ std::vector<double> fill_new;
150
+
151
+ double split_point;
152
+ size_t hplane_left;
153
+ size_t hplane_right;
154
+ double score; /* will not be integer when there are weights or early stop */
155
+ double range_low = -HUGE_VAL;
156
+ double range_high = HUGE_VAL;
157
+ double remainder; /* only used for distance/similarity */
158
+
159
+ IsoHPlane() = default;
160
+ } IsoHPlane;
161
+
162
+ typedef struct IsoForest {
163
+ std::vector< std::vector<IsoTree> > trees;
164
+ NewCategAction new_cat_action;
165
+ CategSplit cat_split_type;
166
+ MissingAction missing_action;
167
+ double exp_avg_depth;
168
+ double exp_avg_sep;
169
+ size_t orig_sample_size;
170
+ bool has_range_penalty;
171
+ IsoForest() = default;
172
+ } IsoForest;
173
+
174
+ typedef struct ExtIsoForest {
175
+ std::vector< std::vector<IsoHPlane> > hplanes;
176
+ NewCategAction new_cat_action;
177
+ CategSplit cat_split_type;
178
+ MissingAction missing_action;
179
+ double exp_avg_depth;
180
+ double exp_avg_sep;
181
+ size_t orig_sample_size;
182
+ bool has_range_penalty;
183
+ ExtIsoForest() = default;
184
+ } ExtIsoForest;
185
+
186
+ typedef struct ImputeNode {
187
+ std::vector<double> num_sum;
188
+ std::vector<double> num_weight;
189
+ std::vector<std::vector<double>> cat_sum;
190
+ std::vector<double> cat_weight;
191
+ size_t parent;
192
+ ImputeNode() = default;
193
+ } ImputeNode; /* this is for each tree node */
194
+
195
+ typedef struct Imputer {
196
+ size_t ncols_numeric;
197
+ size_t ncols_categ;
198
+ std::vector<int> ncat;
199
+ std::vector<std::vector<ImputeNode>> imputer_tree;
200
+ std::vector<double> col_means;
201
+ std::vector<int> col_modes;
202
+ Imputer() = default;
203
+ } Imputer;
204
+
205
+ typedef struct SingleTreeIndex {
206
+ std::vector<size_t> terminal_node_mappings;
207
+ std::vector<double> node_distances;
208
+ std::vector<double> node_depths;
209
+ std::vector<size_t> reference_points;
210
+ std::vector<size_t> reference_indptr;
211
+ std::vector<size_t> reference_mapping;
212
+ size_t n_terminal;
213
+ } TreeNodeIndex;
214
+
215
+ typedef struct TreesIndexer {
216
+ std::vector<SingleTreeIndex> indices;
217
+ TreesIndexer() = default;
218
+ } TreesIndexer;
219
+
220
+ #endif /* ISOTREE_H */
221
+
222
+ /* Fit Isolation Forest model, or variant of it such as SCiForest
223
+ *
224
+ * Parameters:
225
+ * ===========
226
+ * - model_outputs (out)
227
+ * Pointer to already allocated isolation forest model object for single-variable splits.
228
+ * If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
229
+ * additional trees through function 'add_tree'.
230
+ * - model_outputs_ext (out)
231
+ * Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
232
+ * Note that if 'ndim' = 1, must use instead the single-variable model object.
233
+ * If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
234
+ * additional trees through function 'add_tree'.
235
+ * - numeric_data[nrows * ncols_numeric]
236
+ * Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
237
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
238
+ * Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
239
+ * no sparse numeric data either).
240
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
241
+ * - ncols_numeric
242
+ * Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
243
+ * - categ_data[nrows * ncols_categ]
244
+ * Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
245
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
246
+ * Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
247
+ * Each category should be represented as an integer, and these integers must start at zero and
248
+ * be in consecutive order - i.e. if category '3' is present, category '2' must also be present
249
+ * (note that they are not treated as being ordinal, this is just an encoding). Missing values
250
+ * should be encoded as negative numbers such as (-1).
251
+ * - ncols_categ
252
+ * Number of categorical columns in the data.
253
+ * - ncat[ncols_categ]
254
+ * Number of categories in each categorical column. E.g. if the highest code for a column is '4',
255
+ * the number of categories for that column is '5' (zero is one category).
256
+ * - Xc[nnz]
257
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
258
+ * Pass NULL if there are no sparse numeric columns.
259
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
260
+ * - Xc_ind[nnz]
261
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
262
+ * Must be in sorted order, otherwise results will be incorrect.
263
+ * The largest value here should be smaller than the largest possible value of 'size_t'.
264
+ * Pass NULL if there are no sparse numeric columns.
265
+ * - Xc_indptr[ncols_numeric + 1]
266
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
267
+ * start and at entry [col + 1] where does column 'col' end.
268
+ * Pass NULL if there are no sparse numeric columns.
269
+ * - ndim
270
+ * How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
271
+ * the single-variable model. Note that the model object pointer passed must also
272
+ * agree with the value passed to 'ndim'.
273
+ * - ntry
274
+ * When using any of 'prob_pick_by_gain_pl', 'prob_pick_by_gain_avg', 'prob_pick_by_full_gain', 'prob_pick_by_dens', how many variables (with 'ndim=1')
275
+ * or linear combinations (with 'ndim>1') to try for determining the best one according to gain.
276
+ * Recommended value in reference [4] is 10 (with 'prob_pick_by_gain_avg', for outlier detection), while the
277
+ * recommended value in reference [11] is 1 (with 'prob_pick_by_gain_pl', for outlier detection), and the
278
+ * recommended value in reference [9] is 10 to 20 (with 'prob_pick_by_gain_pl', for missing value imputations).
279
+ * - coef_type
280
+ * For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
281
+ * (as proposed in [4]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [3]. Ignored for the
282
+ * single-variable model.
283
+ * - sample_weights[nrows]
284
+ * Weights for the rows when building a tree, either as sampling importances when using
285
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
286
+ * in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
287
+ * the row appeared twice, thus it's less of an outlier) - how this is taken is determined
288
+ * through parameter 'weight_as_sample'.
289
+ * Pass NULL if the rows all have uniform weights.
290
+ * - with_replacement
291
+ * Whether to sample rows with replacement or not (not recommended). Note that distance calculations,
292
+ * if desired, don't work well with duplicate rows.
293
+ * - weight_as_sample
294
+ * If passing sample (row) weights when fitting the model, whether to consider those weights as row
295
+ * sampling weights (i.e. the higher the weights, the more likely the observation will end up included
296
+ * in each tree sub-sample), or as distribution density weights (i.e. putting a weight of two is the same
297
+ * as if the row appeared twice, thus higher weight makes it less of an outlier, but does not give it a
298
+ * higher chance of being sampled if the data uses sub-sampling).
299
+ * - nrows
300
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
301
+ * - sample_size
302
+ * Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
303
+ * 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
304
+ * random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
305
+ * will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
306
+ * in [5] is 'nrows' here.
307
+ * - ntrees
308
+ * Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
309
+ * author's code in [5] is 10.
310
+ * - max_depth
311
+ * Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
312
+ * Models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
313
+ * deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
314
+ * predictions.
315
+ * Note that models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
316
+ * deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
317
+ * predictions.
318
+ * If using pooled gain, one might want to substitute 'max_depth' with 'min_gain'.
319
+ * - ncols_per_tree
320
+ * Number of columns to use (have as potential candidates for splitting at each iteration) in each tree,
321
+ * similar to the 'mtry' parameter of random forests.
322
+ * In general, this is only relevant when using non-random splits and/or weighted column choices.
323
+ * If passing zero, will use the full number of available columns.
324
+ * Recommended value: 0.
325
+ * - limit_depth
326
+ * Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
327
+ * terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
328
+ * will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
329
+ * tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass 'false' here
330
+ * and higher values for 'max_depth' if using the model for purposes other than outlier detection.
331
+ * Note that, if passing 'limit_depth=true', then 'max_depth' is ignored.
332
+ * - penalize_range
333
+ * Whether to penalize (add -1 to the terminal depth) observations at prediction time that have a value
334
+ * of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
335
+ * reasonable range in the data being split (given by 2 * range in data and centered around the split point),
336
+ * as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
337
+ * when splitting by categorical variables. Note that this can make a very large difference in the results
338
+ * when using 'prob_pick_by_gain_pl'.
339
+ * This option is not supported when using density-based outlier scoring metrics.
340
+ * - standardize_data
341
+ * Whether to standardize the features at each node before creating a linear combination of them as suggested
342
+ * in [4]. This is ignored when using 'ndim=1'.
343
+ * - scoring_metric
344
+ * Metric to use for determining outlier scores (see reference [13]).
345
+ * If passing 'Depth', will use isolation depth as proposed in reference [1]. This is typically the safest choice
346
+ * and plays well with all model types offered by this library.
347
+ * If passing 'Density', will set scores for each terminal node as the ratio between the fraction of points in the sub-sample
348
+ * that end up in that node and the fraction of the volume in the feature space which defines
349
+ * the node according to the splits that lead to it.
350
+ * If using 'ndim=1', for categorical variables, 'Density' is defined in terms
351
+ * of number of categories that go towards each side of the split divided by number of categories
352
+ * in the observations that reached that node.
353
+ * The standardized outlier score from 'Density' for a given observation is calculated as the
354
+ * negative of the logarithm of the geometric mean from the per-tree densities, which unlike
355
+ * the standardized score produced from 'Depth', is unbounded, but just like the standardized
356
+ * score from 'Depth', has a natural threshold for defining outlierness, which in this case
357
+ * is zero instead of 0.5. The non-standardized outlier score for 'Density' is calculated as the
358
+ * geometric mean, while the per-tree scores are calculated as the density values.
359
+ * 'Density' might lead to better predictions when using 'ndim=1', particularly in the presence
360
+ * of categorical variables. Note however that using 'Density' requires more trees for convergence
361
+ * of scores (i.e. good results) compared to isolation-based metrics.
362
+ * 'Density' is incompatible with 'penalize_range=true'.
363
+ * If passing 'AdjDepth', will use an adjusted isolation depth that takes into account the number of points that
364
+ * go to each side of a given split vs. the fraction of the range of that feature that each
365
+ * side of the split occupies, by a metric as follows: 'd = 2/ (1 + 1/(2*p))'
366
+ * where 'p' is defined as 'p = (n_s / n_t) / (r_s / r_t)'
367
+ * with 'n_t' being the number of points that reach a given node, 'n_s' the
368
+ * number of points that are sent to a given side of the split/branch at that node,
369
+ * 'r_t' being the range (maximum minus minimum) of the splitting feature or
370
+ * linear combination among the points that reached the node, and 'r_s' being the
371
+ * range of the same feature or linear combination among the points that are sent to this
372
+ * same side of the split/branch. This makes each split add a number between zero and two
373
+ * to the isolation depth, with this number's probabilistic distribution being centered
374
+ * around 1 and thus the expected isolation depth remaining the same as in the original
375
+ * 'Depth' metric, but having more variability around the extremes.
376
+ * Scores (standardized, non-standardized, per-tree) for 'AdjDepth' are aggregated in the same way
377
+ * as for 'Depth'.
378
+ * 'AdjDepth' might lead to better predictions when using 'ndim=1', particularly in the presence
379
+ * of categorical variables and for smaller datasets, and for smaller datasets, might make
380
+ * sense to combine it with 'penalize_range=true'.
381
+ * If passing 'AdjDensity', will use the same metric from 'AdjDepth', but applied multiplicatively instead
382
+ * of additively. The expected value for 'AdjDepth' is not strictly the same
383
+ * as for isolation, but using the expected isolation depth as standardizing criterion
384
+ * tends to produce similar standardized score distributions (centered around 0.5).
385
+ * Scores (standardized, non-standardized, per-tree) from 'AdjDensity' are aggregated in the same way
386
+ * as for 'Depth'.
387
+ * 'AdjDensity' is incompatible with 'penalize_range=true'.
388
+ * If passing 'BoxedRatio', will set the scores for each terminal node as the ratio between the volume of the boxed
389
+ * feature space for the node as defined by the smallest and largest values from the split
390
+ * conditions for each column (bounded by the variable ranges in the sample) and the
391
+ * variable ranges in the tree sample.
392
+ * If using 'ndim=1', for categorical variables 'BoxedRatio' is defined in terms of number of categories.
393
+ * If using 'ndim>1', 'BoxedRatio' is defined in terms of the maximum achievable value for the
394
+ * splitting linear combination determined from the minimum and maximum values for each
395
+ * variable among the points in the sample, and as such, it has a rather different meaning
396
+ * compared to the score obtained with 'ndim=1' - 'BoxedRatio' scores with 'ndim>1'
397
+ * typically provide very poor quality results and this metric is thus not recommended to
398
+ * use in the extended model. With 'ndim>1', 'BoxedRatio' also has a tendency of producing too small
399
+ * values which round to zero.
400
+ * The standardized outlier score from 'BoxedRatio' for a given observation is calculated
401
+ * simply as the average from the per-tree boxed ratios. 'BoxedRatio' metric
402
+ * has a lower bound of zero and a theoretical upper bound of one, but in practice the scores
403
+ * tend to be very small numbers close to zero, and its distribution across
404
+ * different datasets is rather unpredictable. In order to keep rankings comparable with
405
+ * the rest of the metrics, the non-standardized outlier scores for 'BoxedRatio' are calculated as the
406
+ * negative of the average instead. The per-tree 'BoxedRatio' scores are calculated as the ratios.
407
+ * 'BoxedRatio' can be calculated in a fast-but-not-so-precise way, and in a slow-but-precise
408
+ * way, which is controlled by parameter 'fast_bratio'. Usually, both should give the
409
+ * same results, but in some datasets, the fast way can lead to numerical inaccuracies
410
+ * due to roundoffs very close to zero.
411
+ * 'BoxedRatio' might lead to better predictions in datasets with many rows when using 'ndim=1'
412
+ * and a relatively small 'sample_size'. Note that more trees are required for convergence
413
+ * of scores when using 'BoxedRatio'. In some datasets, 'BoxedRatio' metric might result in very bad
414
+ * predictions, to the point that taking its inverse produces a much better ranking of outliers.
415
+ * 'BoxedRatio' option is incompatible with 'penalize_range'.
416
+ * If passing 'BoxedDensity2', will set the score as the ratio between the fraction of points within the sample that
417
+ * end up in a given terminal node and the 'BoxedRatio' metric.
418
+ * Aggregation of scores (standardized, non-standardized, per-tree) for 'BoxedDensity2' is done in the same
419
+ * way as for 'Density', and it also has a natural threshold at zero for determining
420
+ * outliers and inliers.
421
+ * 'BoxedDensity2' is typically usable with 'ndim>1', but tends to produce much bigger values
422
+ * compared to 'ndim=1'.
423
+ * Albeit unintuitively, in many datasets, one can usually get better results with metric
424
+ * 'BoxedDensity' instead.
425
+ * The calculation of 'BoxedDensity2' is also controlled by 'fast_bratio'.
426
+ * 'BoxedDensity2' is incompatible with 'penalize_range'.
427
+ * If passing 'BoxedDensity', will set the score as the ratio between the fraction of points within the sample that
428
+ * end up in a given terminal node and the ratio between the boxed volume of the feature
429
+ * space in the sample and the boxed volume of a node given by the split conditions (inverse
430
+ * as in 'BoxedDensity2'). This metric does not have any theoretical or intuitive
431
+ * justification behind its existence, and it is perhaps illogical to use it as a
432
+ * scoring metric, but tends to produce good results in some datasets.
433
+ * The standardized outlier scores for 'BoxedDensity' are defined as the negative of the geometric mean,
434
+ * while the non-standardized scores are the geometric mean, and the per-tree scores are simply the 'density' values.
435
+ * The calculation of 'BoxedDensity' is also controlled by 'fast_bratio'.
436
+ * 'BoxedDensity' option is incompatible with 'penalize_range'.
437
+ * - fast_bratio
438
+ * When using "boxed" metrics for scoring, whether to calculate them in a fast way through
439
+ * cumulative sum of logarithms of ratios after each split, or in a slower way as sum of
440
+ * logarithms of a single ratio per column for each terminal node.
441
+ * Usually, both methods should give the same results, but in some datasets, particularly
442
+ * when variables have too small or too large ranges, the first method can be prone to
443
+ * numerical inaccuracies due to roundoff close to zero.
444
+ * Note that this does not affect calculations for models with 'ndim>1', since given the
445
+ * split types, the calculation for them is different.
446
+ * - standardize_dist
447
+ * If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
448
+ * depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
449
+ * - tmat[nrows * (nrows - 1) / 2]
450
+ * Array in which to calculate average separation depths or standardized distance metric (see documentation
451
+ * for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
452
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
453
+ * of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
454
+ * output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
455
+ * entry 0 <= i < j < n will be located at position
456
+ * p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
457
+ * Can be converted to a dense square matrix through function 'tmat_to_dense'.
458
+ * - output_depths[nrows]
459
+ * Array in which to calculate average path depths or standardized outlierness metric (see documentation
460
+ * for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
461
+ * the regular model process. If passing this output argument, the sample size must be the same as the number
462
+ * of rows. If not NULL, must already be initialized to zeros.
463
+ * - standardize_depth
464
+ * If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
465
+ * a metric in which the more outlier is an observation, the closer this standardized metric will be to 1,
466
+ * with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
467
+ * the average depth of each row across all trees.
468
+ * - col_weights[ncols_numeric + ncols_categ]
469
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
470
+ * Ignored when picking columns by deterministic criterion.
471
+ * If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
472
+ * effect is multiplicative.
473
+ * - weigh_by_kurt
474
+ * Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
475
+ * for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
476
+ * sample. For categorical columns, will calculate expected kurtosis if the column were converted to
477
+ * numerical by assigning to each category a random number ~ Unif(0, 1).
478
+ * This is intended as a cheap feature selector, while the parameter 'prob_pick_col_by_kurt'
479
+ * provides the option to do this at each node in the tree for a different overall type of model.
480
+ * If passing column weights or weighted column choices ('prob_pick_col_by_range', 'prob_pick_col_by_var'),
481
+ * the effect will be multiplicative. This option is not compatible with 'prob_pick_col_by_kurt'.
482
+ * If passing 'missing_action=fail' and the data has infinite values, columns with rows
483
+ * having infinite values will get a weight of zero. If passing a different value for missing
484
+ * action, infinite values will be ignored in the kurtosis calculation.
485
+ * If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
486
+ * in order not to favor columns with missing values (which would increase kurtosis by all having
487
+ * the same central value).
488
+ * - prob_pick_by_gain_pl
489
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
490
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
491
+ * that maximizes a pooled standard deviation gain criterion (see references [9] and [11]) on the
492
+ * same variable or linear combination, similarly to regression trees such as CART.
493
+ * If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
494
+ * in which the largest standardized gain can be achieved.
495
+ * For categorical variables with 'ndim=1', will use Shannon entropy instead (like in [7]).
496
+ * Compared to a simple averaged gain, this tends to result in more evenly-divided splits and more clustered
497
+ * groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
498
+ * When used for outlier detection, datasets with multimodal distributions usually see better performance
499
+ * under this type of splits.
500
+ * Note that, since this makes the trees more even and thus it takes more steps to produce isolated nodes,
501
+ * the resulting object will be heavier. When splits are not made according to any of 'prob_pick_by_gain_avg',
502
+ * 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
503
+ * Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
504
+ * every single tree will have the exact same splits.
505
+ * Be aware that 'penalize_range' can also have a large impact when using 'prob_pick_by_gain_pl'.
506
+ * Be aware also that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable
507
+ * model, every single tree will have the exact same splits.
508
+ * Under this option, models are likely to produce better results when increasing 'max_depth'.
509
+ * Alternatively, one can also control the depth through 'min_gain' (for which one might want to
510
+ * set 'max_depth=0').
511
+ * Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain',
512
+ * 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
513
+ * - prob_pick_by_gain_avg
514
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
515
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
516
+ * that maximizes an averaged standard deviation gain criterion (see references [4] and [11]) on the
517
+ * same variable or linear combination.
518
+ * If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
519
+ * in which the largest standardized gain can be achieved.
520
+ * For categorical variables with 'ndim=1', will take the expected standard deviation that would be
521
+ * gotten if the column were converted to numerical by assigning to each category a random
522
+ * number ~ Unif(0, 1) and calculate gain with those assumed standard deviations.
523
+ * Compared to a pooled gain, this tends to result in more cases in which a single observation or very
524
+ * few of them are put into one branch. Typically, datasets with outliers defined by extreme values in
525
+ * some column more or less independently of the rest, usually see better performance under this type
526
+ * of split. Recommended to use sub-samples (parameter 'sample_size') when
527
+ * passing this parameter. Note that, since this will create isolated nodes faster, the resulting object
528
+ * will be lighter (use less memory).
529
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
530
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
531
+ * Default setting for [1], [2], [3] is zero, and default for [4] is 1.
532
+ * This is the randomization parameter that can be passed to the author's original code in [5],
533
+ * but note that the code in [5] suffers from a mathematical error in the calculation of running standard deviations,
534
+ * so the results from it might not match with this library's.
535
+ * Be aware that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable model,
536
+ * every single tree will have the exact same splits.
537
+ * Under this option, models are likely to produce better results when increasing 'max_depth'.
538
+ * Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
539
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
540
+ * - prob_pick_by_full_gain
541
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
542
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
543
+ * that minimizes the pooled sums of variances of all columns (or a subset of them if using
544
+ * 'ncols_per_tree').
545
+ * In general, 'prob_pick_by_full_gain' is much slower to evaluate than the other gain types, and does not tend to
546
+ * lead to better results. When using 'prob_pick_by_full_gain', one might want to use a different scoring
547
+ * metric (particularly 'Density', 'BoxedDensity2' or 'BoxedRatio'). Note that
548
+ * the variance calculations are all done through the (exact) sorted-indices approach, which is much
549
+ * slower than the (approximate) histogram approach used by other decision tree software.
550
+ * Be aware that the data is not standardized in any way for the range calculations, thus the scales
551
+ * of features will make a large difference under 'prob_pick_by_full_gain', which might not make it suitable for
552
+ * all types of data.
553
+ * 'prob_pick_by_full_gain' is not compatible with categorical data, and 'min_gain' does not apply to it.
554
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
555
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
556
+ * Default setting for [1], [2], [3], [4] is zero.
557
+ * - prob_pick_by_dens
558
+ * This parameter indicates the probability of choosing the threshold on which to split a variable
559
+ * (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
560
+ * that maximizes the pooled densities of the branch distributions.
561
+ * The 'min_gain' option does not apply to this type of splits.
562
+ * When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
563
+ * 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
564
+ * Default setting for [1], [2], [3], [4] is zero.
565
+ * - prob_pick_col_by_range
566
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
567
+ * proportional to the range spanned by each column within a node as proposed in reference [12].
568
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
569
+ * probability proportional to the range spanned by each column within a node.
570
+ * This option is not compatible with categorical data. If passing column weights, the
571
+ * effect will be multiplicative.
572
+ * Be aware that the data is not standardized in any way for the range calculations, thus the scales
573
+ * of features will make a large difference under this option, which might not make it suitable for
574
+ * all types of data.
575
+ * Note that the proposed RRCF model from [12] uses a different scoring metric for producing anomaly
576
+ * scores, while this library uses isolation depth regardless of how columns are chosen, thus results
577
+ * are likely to be different from those of other software implementations. Nevertheless, as explored
578
+ * in [11], isolation depth as a scoring metric typically provides better results than the
579
+ * "co-displacement" metric from [12] under these split types.
580
+ * - prob_pick_col_by_var
581
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
582
+ * proportional to the variance of each column within a node.
583
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
584
+ * probability proportional to the variance of each column within a node.
585
+ * For categorical data, it will calculate the expected variance if the column were converted to
586
+ * numerical by assigning to each category a random number ~ Unif(0, 1), which depending on the number of
587
+ * categories and their distribution, produces numbers typically a bit smaller than standardized numerical
588
+ * variables.
589
+ * Note that when using sparse matrices, the calculation of variance will rely on a procedure that
590
+ * uses sums of squares, which has less numerical precision than the
591
+ * calculation used for dense inputs, and as such, the results might differ slightly.
592
+ * Be aware that this calculated variance is not standardized in any way, so the scales of
593
+ * features will make a large difference under this option.
594
+ * If there are infinite values, all columns having infinite values will be treated as having the
595
+ * same weight, and will be chosen before every other column with non-infinite values.
596
+ * If passing column weights, the effect will be multiplicative.
597
+ * If passing a 'missing_action' different than 'fail', infinite values will be ignored for the
598
+ * variance calculation. Otherwise, all columns with infinite values will have the same probability
599
+ * and will be chosen before columns with non-infinite values.
600
+ * - prob_pick_col_by_kurt
601
+ * When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
602
+ * proportional to the kurtosis of each column **within a node** (unlike the option 'weigh_by_kurt'
603
+ * which calculates this metric only at the root).
604
+ * When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
605
+ * probability proportional to the kurtosis of each column within a node.
606
+ * For categorical data, it will calculate the expected kurtosis if the column were converted to
607
+ * numerical by assigning to each category a random number ~ Unif(0, 1).
608
+ * Note that when using sparse matrices, the calculation of kurtosis will rely on a procedure that
609
+ * uses sums of squares and higher-power numbers, which has less numerical precision than the
610
+ * calculation used for dense inputs, and as such, the results might differ slightly.
611
+ * If passing column weights, the effect will be multiplicative. This option is not compatible
612
+ * with 'weigh_by_kurt'.
613
+ * If passing a 'missing_action' different than 'fail', infinite values will be ignored for the
614
+ * variance calculation. Otherwise, all columns with infinite values will have the same probability
615
+ * and will be chosen before columns with non-infinite values.
616
+ * If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
617
+ * in order not to favor columns with missing values (which would increase kurtosis by all having
618
+ * the same central value).
619
+ * Be aware that kurtosis can be a rather slow metric to calculate.
620
+ * - min_gain
621
+ * Minimum gain that a split threshold needs to produce in order to proceed with a split.
622
+ * Only used when the splits are decided by a variance gain criterion ('prob_pick_by_gain_pl' or
623
+ * 'prob_pick_by_gain_avg', but not 'prob_pick_by_full_gain' nor 'prob_pick_by_dens').
624
+ * If the highest possible gain in the evaluated splits at a node is below this threshold,
625
+ * that node becomes a terminal node.
626
+ * This can be used as a more sophisticated depth control when using pooled gain (note that 'max_depth'
627
+ * still applies on top of this heuristic).
628
+ * - missing_action
629
+ * How to handle missing data at both fitting and prediction time. Options are a) 'Divide' (for the single-variable
630
+ * model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
631
+ * the data that went to each branch when fitting the model, b) 'Impute', which will assign observations to the
632
+ * branch with the most observations in the single-variable model (but imputed values will also be used for
633
+ * gain calculations), or fill in missing values with the median of each column of the sample from which the
634
+ * split was made in the extended model (recommended) (but note that the calculation of medians does not take
635
+ * into account sample weights when using 'weights_as_sample_prob=false', and note that when using a gain
636
+ * criterion for splits with 'ndim=1', it will use the imputed values in the calculation), c) 'Fail' which will
637
+ * assume that there are no missing values and will trigger undefined behavior if it encounters any.
638
+ * In the extended model, infinite values will be treated as missing.
639
+ * Note that passing 'Fail' might crash the process if there turn out to be missing values, but will otherwise
640
+ * produce faster fitting and prediction times along with decreased model object sizes.
641
+ * Models from [1], [2], [3], [4] correspond to 'Fail' here.
642
+ * - cat_split_type
643
+ * Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
644
+ * a single category to a branch and the rest to the other branch. For the extended model, whether to
645
+ * give each category a coefficient, or only one while the rest get zero.
646
+ * - new_cat_action
647
+ * What to do after splitting a categorical feature when new data that reaches that split has categories that
648
+ * the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
649
+ * in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
650
+ * data that went to each branch when fitting the model, and in the extended model will assign
651
+ * them the median value for that column that was added to the linear combination of features (but note that
652
+ * this median calculation does not use sample weights when using 'weights_as_sample_prob=false'),
653
+ * b) "Smallest", which will assign all observations with unseen categories in the split to the branch that
654
+ * had fewer observations when fitting the model, c) "Random", which will assign a branch (coefficient in the
655
+ * extended model) at random for each category beforehand, even if no observations had that category when
656
+ * fitting the model. Ignored when passing 'cat_split_type' = 'SingleCateg'.
657
+ * - all_perm
658
+ * When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
659
+ * whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
660
+ * will sort the categories by their frequency and make a grouping in this sorted order. Note that the
661
+ * number of combinations evaluated (if 'true') is the factorial of the number of present categories in
662
+ * a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
663
+ * category in a separate branch, so not evaluating all permutations (passing 'false') will make it
664
+ * possible to select other splits that respect the sorted frequency order.
665
+ * The total number of combinations must be a number that can fit into a 'size_t' variable - for x86-64
666
+ * systems, this means no column can have more than 20 different categories if using 'all_perm=true',
667
+ * but note that this is not checked within the function.
668
+ * Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
669
+ * - coef_by_prop
670
+ * In the extended model, whether to sort the randomly-generated coefficients for categories
671
+ * according to their relative frequency in the tree node. This might provide better results when using
672
+ * categorical variables with too many categories, but is not recommended, and not reflective of
673
+ * real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
674
+ * variables.
675
+ * - imputer (out)
676
+ * Pointer to already-allocated imputer object, which can be used to produce missing value imputations
677
+ * in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
678
+ * 'missing_action' as missing values inside the model are treated differently and follow their own imputation
679
+ * or division strategy.
680
+ * - min_imp_obs
681
+ * Minimum number of observations with which an imputation value can be produced. Ignored if passing
682
+ * 'build_imputer' = 'false'.
683
+ * - depth_imp
684
+ * How to weight observations according to their depth when used for imputing missing values. Passing
685
+ * "Higher" will weigh observations higher the further down the tree (away from the root node) the
686
+ * terminal node is, while "lower" will do the opposite, and "Same" will not modify the weights according
687
+ * to node depth in the tree. Implemented for testing purposes and not recommended to change
688
+ * from the default. Ignored when not passing 'impute_nodes'.
689
+ * - weigh_imp_rows
690
+ * How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
691
+ * a node inversely proportional to the number of observations that end up there, while "Proportional"
692
+ * will weight them heavier the more observations there are, and "Flat" will weigh all nodes the same
693
+ * in this regard regardless of how many observations end up there. Implemented for testing purposes
694
+ * and not recommended to change from the default. Ignored when not passing 'impute_nodes'.
695
+ * - impute_at_fit
696
+ * Whether to impute missing values in the input data as the model is being built. If passing 'true',
697
+ * then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
698
+ * 'categ_data', and 'Xc', will get overwritten with the imputations produced.
699
+ * - random_seed
700
+ * Seed that will be used to generate random numbers used by the model.
701
+ * - use_long_double
702
+ * Whether to use 'long double' (extended precision) type for more precise calculations about
703
+ * standard deviations, means, ratios, weights, gain, and other potential aggregates. This makes
704
+ * such calculations accurate to a larger number of decimals (provided that the compiler used has
705
+ * wider long doubles than doubles) and it is highly recommended to use when the input data has
706
+ * a number of rows or columns exceeding 2^53 (an unlikely scenario), and also highly recommended
707
+ * to use when the input data has problematic scales (e.g. numbers that differ from each other by
708
+ * something like 10^-100 or columns that include values like 10^100 and 10^-100 and still need to
709
+ * be sensitive to a difference of 10^-100), but will make the calculations slower, the more so in
710
+ * platforms in which 'long double' is a software-emulated type (e.g. Power8 platforms).
711
+ * Note that some platforms (most notably windows with the msvc compiler) do not make any difference
712
+ * between 'double' and 'long double'.
713
+ * - nthreads
714
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
715
+ * allocated, even if the thread does not end up being used.
716
+ * Be aware that most of the operations are bound by memory bandwidth, which means that
717
+ * adding more threads will not result in a linear speed-up. For some types of data
718
+ * (e.g. large sparse matrices with small sample sizes), adding more threads might result
719
+ * in only a very modest speed up (e.g. 1.5x faster with 4x more threads),
720
+ * even if all threads look fully utilized.
721
+ * Ignored when not building with OpenMP support.
722
+ *
723
+ * Returns
724
+ * =======
725
+ * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
726
+ * If the process receives an interrupt signal, will return instead
727
+ * 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
728
+ * what these values correspond to, you can use the functions
729
+ * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
730
+ * as integers.
731
+ */
732
+ ISOTREE_EXPORTED
733
+ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
734
+ real_t numeric_data[], size_t ncols_numeric,
735
+ int categ_data[], size_t ncols_categ, int ncat[],
736
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
737
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
738
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
739
+ size_t nrows, size_t sample_size, size_t ntrees,
740
+ size_t max_depth, size_t ncols_per_tree,
741
+ bool limit_depth, bool penalize_range, bool standardize_data,
742
+ ScoringMetric scoring_metric, bool fast_bratio,
743
+ bool standardize_dist, double tmat[],
744
+ double output_depths[], bool standardize_depth,
745
+ real_t col_weights[], bool weigh_by_kurt,
746
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
747
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
748
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
749
+ double prob_pick_col_by_kurt,
750
+ double min_gain, MissingAction missing_action,
751
+ CategSplit cat_split_type, NewCategAction new_cat_action,
752
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
753
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
754
+ uint64_t random_seed, bool use_long_double, int nthreads);
755
+
756
+
757
+
758
+ /* Add additional trees to already-fitted isolation forest model
759
+ *
760
+ * Parameters
761
+ * ==========
762
+ * - model_outputs
763
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
764
+ * if the trees are to be added to an extended model. Can only pass one of
765
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
766
+ * so it cannot be run in parallel for the same model object.
767
+ * - model_outputs_ext
768
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
769
+ * if the trees are to be added to a single-variable model. Can only pass one of
770
+ * 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
771
+ * so it cannot be run in parallel for the same model object.
772
+ * - numeric_data[nrows * ncols_numeric]
773
+ * Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
774
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
775
+ * Pass NULL if there are no dense numeric columns.
776
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
777
+ * If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
778
+ * of columns, either as dense or as sparse arrays.
779
+ * - ncols_numeric
780
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
781
+ * what was originally passed to 'fit_iforest'.
782
+ * - categ_data[nrows * ncols_categ]
783
+ * Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
784
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
785
+ * Pass NULL if there are no categorical columns. The encoding must be the same as was used
786
+ * in the data to which the model was fit.
787
+ * Each category should be represented as an integer, and these integers must start at zero and
788
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
789
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
790
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
791
+ * must be the same as was used in the data to which the model was fit.
792
+ * If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
793
+ * of columns and the same category encoding.
794
+ * - ncols_categ
795
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
796
+ * what was originally passed to 'fit_iforest'.
797
+ * - ncat[ncols_categ]
798
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). May contain new categories,
799
+ * but should keep the same encodings that were used for previous categories.
800
+ * - Xc[nnz]
801
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
802
+ * Pass NULL if there are no sparse numeric columns.
803
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
804
+ * - Xc_ind[nnz]
805
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
806
+ * Must be in sorted order, otherwise results will be incorrect.
807
+ * Pass NULL if there are no sparse numeric columns.
808
+ * - Xc_indptr[ncols_numeric + 1]
809
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
810
+ * start and at entry [col + 1] where does column 'col' end.
811
+ * Pass NULL if there are no sparse numeric columns.
812
+ * - ndim
813
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
814
+ * what was originally passed to 'fit_iforest'.
815
+ * - ntry
816
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
817
+ * what was originally passed to 'fit_iforest'.
818
+ * - coef_type
819
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
820
+ * what was originally passed to 'fit_iforest'.
821
+ * - sample_weights
822
+ * Weights for the rows when adding this tree, either as sampling importances when using
823
+ * sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
824
+ * in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
825
+ * the row appeared twice, thus it's less of an outlier) - how this is taken is determined
826
+ * through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
827
+ * Pass NULL if the rows all have uniform weights.
828
+ * - nrows
829
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
830
+ * - max_depth
831
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
832
+ * what was originally passed to 'fit_iforest'.
833
+ * - ncols_per_tree
834
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
835
+ * what was originally passed to 'fit_iforest'.
836
+ * - limit_depth
837
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
838
+ * what was originally passed to 'fit_iforest'.
839
+ * - penalize_range
840
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
841
+ * what was originally passed to 'fit_iforest'.
842
+ * - standardize_data
843
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
844
+ * what was originally passed to 'fit_iforest'.
845
+ * - fast_bratio
846
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
847
+ * what was originally passed to 'fit_iforest'.
848
+ * - col_weights
849
+ * Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
850
+ * Ignored when picking columns by deterministic criterion.
851
+ * If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
852
+ * effect is multiplicative.
853
+ * - weigh_by_kurt
854
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
855
+ * what was originally passed to 'fit_iforest'.
856
+ * - prob_pick_by_gain_pl
857
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
858
+ * what was originally passed to 'fit_iforest'.
859
+ * - prob_pick_by_gain_avg
860
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
861
+ * what was originally passed to 'fit_iforest'.
862
+ * - prob_pick_by_full_gain
863
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
864
+ * what was originally passed to 'fit_iforest'.
865
+ * - prob_pick_by_dens
866
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
867
+ * what was originally passed to 'fit_iforest'.
868
+ * - prob_pick_col_by_range
869
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
870
+ * what was originally passed to 'fit_iforest'.
871
+ * - prob_pick_col_by_var
872
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
873
+ * what was originally passed to 'fit_iforest'.
874
+ * - prob_pick_col_by_kurt
875
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
876
+ * what was originally passed to 'fit_iforest'.
877
+ * - min_gain
878
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
879
+ * what was originally passed to 'fit_iforest'.
880
+ * - missing_action
881
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
882
+ * what was originally passed to 'fit_iforest'.
883
+ * - cat_split_type
884
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
885
+ * what was originally passed to 'fit_iforest'.
886
+ * - new_cat_action
887
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
888
+ * what was originally passed to 'fit_iforest'.
889
+ * - depth_imp
890
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
891
+ * what was originally passed to 'fit_iforest'.
892
+ * - weigh_imp_rows
893
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
894
+ * what was originally passed to 'fit_iforest'.
895
+ * - all_perm
896
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
897
+ * what was originally passed to 'fit_iforest'.
898
+ * - coef_by_prop
899
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
900
+ * what was originally passed to 'fit_iforest'.
901
+ * - imputer
902
+ * Pointer to already-allocated imputer object, as it was output from function 'fit_model' while
903
+ * producing either 'model_outputs' or 'model_outputs_ext'.
904
+ * Pass NULL if the model was built without imputer.
905
+ * - min_imp_obs
906
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
907
+ * what was originally passed to 'fit_iforest'.
908
+ * - indexer
909
+ * Indexer object associated to the model object ('model_outputs' or 'model_outputs_ext'), which will
910
+ * be updated with the new tree to add.
911
+ * If 'indexer' has reference points, these must be passed again here in order to index them.
912
+ * Pass NULL if the model has no associated indexer.
913
+ * - ref_numeric_data[nref * ncols_numeric]
914
+ * Pointer to numeric data for reference points. May be ordered by rows
915
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
916
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
917
+ * (see parameter 'ref_is_col_major').
918
+ * Pass NULL if there are no dense numeric columns or no reference points.
919
+ * Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
920
+ * If 'indexer' is passed, it has reference points, and the data to which the model was fit had
921
+ * numeric columns, then numeric data for reference points must be passed (in either dense or sparse format).
922
+ * - ref_categ_data[nref * ncols_categ]
923
+ * Pointer to categorical data for reference points. May be ordered by rows
924
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
925
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
926
+ * (see parameter 'ref_is_col_major').
927
+ * Pass NULL if there are no categorical columns or no reference points.
928
+ * If 'indexer' is passed, it has reference points, and the data to which the model was fit had
929
+ * categorical columns, then 'ref_categ_data' must be passed.
930
+ * - ref_is_col_major
931
+ * Whether 'ref_numeric_data' and/or 'ref_categ_data' are in column-major order. If numeric data is
932
+ * passed in sparse format, categorical data must be passed in column-major format. If passing dense
933
+ * data, row-major format is preferred as it will be faster. If the data is passed in row-major format,
934
+ * must also pass 'ref_ld_numeric' and/or 'ref_ld_categ'.
935
+ * If both 'ref_numeric_data' and 'ref_categ_data' are passed, they must have the same orientation
936
+ * (row-major or column-major).
937
+ * - ref_ld_numeric
938
+ * Leading dimension of the array 'ref_numeric_data', if it is passed in row-major format.
939
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
940
+ * be accessed assuming that row 'n' starts at 'ref_numeric_data + n*ref_ld_numeric'). If passing
941
+ * 'ref_numeric_data' in column-major order, this is ignored and will be assumed that the
942
+ * leading dimension corresponds to the number of rows. This is ignored when passing numeric
943
+ * data in sparse format.
944
+ * - ref_ld_categ
945
+ * Leading dimension of the array 'ref_categ_data', if it is passed in row-major format.
946
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
947
+ * be accessed assuming that row 'n' starts at 'ref_categ_data + n*ref_ld_categ'). If passing
948
+ * 'ref_categ_data' in column-major order, this is ignored and will be assumed that the
949
+ * leading dimension corresponds to the number of rows.
950
+ * - ref_Xc[ref_nnz]
951
+ * Pointer to numeric data for reference points in sparse numeric matrix in CSC format (column-compressed).
952
+ * Pass NULL if there are no sparse numeric columns for reference points or no reference points.
953
+ * Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
954
+ * - ref_Xc_ind[ref_nnz]
955
+ * Pointer to row indices to which each non-zero entry in 'ref_Xc' corresponds.
956
+ * Must be in sorted order, otherwise results will be incorrect.
957
+ * Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
958
+ * - ref_Xc_indptr[ncols_numeric + 1]
959
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
960
+ * start and at entry [col + 1] where does column 'col' end.
961
+ * Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
962
+ * - random_seed
963
+ * Seed that will be used to generate random numbers used by the model.
964
+ * - use_long_double
965
+ * Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
966
+ * what was originally passed to 'fit_iforest'.
967
+ */
968
+ ISOTREE_EXPORTED
969
+ int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
970
+ real_t numeric_data[], size_t ncols_numeric,
971
+ int categ_data[], size_t ncols_categ, int ncat[],
972
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
973
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
974
+ real_t sample_weights[], size_t nrows,
975
+ size_t max_depth, size_t ncols_per_tree,
976
+ bool limit_depth, bool penalize_range, bool standardize_data,
977
+ bool fast_bratio,
978
+ real_t col_weights[], bool weigh_by_kurt,
979
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
980
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
981
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
982
+ double prob_pick_col_by_kurt,
983
+ double min_gain, MissingAction missing_action,
984
+ CategSplit cat_split_type, NewCategAction new_cat_action,
985
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
986
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
987
+ TreesIndexer *indexer,
988
+ real_t ref_numeric_data[], int ref_categ_data[],
989
+ bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
990
+ real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
991
+ uint64_t random_seed, bool use_long_double);
992
+
993
+
994
+ /* Predict outlier score, average depth, or terminal node numbers
995
+ *
996
+ * Parameters
997
+ * ==========
998
+ * - numeric_data[nrows * ncols_numeric]
999
+ * Pointer to numeric data for which to make predictions. May be ordered by rows
1000
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
1001
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
1002
+ * (see parameter 'is_col_major').
1003
+ * Pass NULL if there are no dense numeric columns.
1004
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
1005
+ * - categ_data[nrows * ncols_categ]
1006
+ * Pointer to categorical data for which to make predictions. May be ordered by rows
1007
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
1008
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
1009
+ * (see parameter 'is_col_major').
1010
+ * Pass NULL if there are no categorical columns.
1011
+ * Each category should be represented as an integer, and these integers must start at zero and
1012
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
1013
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
1014
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
1015
+ * must be the same as was used in the data to which the model was fit.
1016
+ * - is_col_major
1017
+ * Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
1018
+ * model was fit. If passing 'false', will assume they are in row-major order. Note that most of
1019
+ * the functions in this library work only with column-major order, but here both are suitable
1020
+ * and row-major is preferred. Both arrays must have the same orientation (row/column major).
1021
+ * If there is numeric sparse data in combination with categorical dense data and there are many
1022
+ * rows, it is recommended to pass the categorical data in column major order, as it will take
1023
+ * a faster route.
1024
+ * If passing 'is_col_major=false', must also provide 'ld_numeric' and/or 'ld_categ'.
1025
+ * - ld_numeric
1026
+ * Leading dimension of the array 'numeric_data', if it is passed in row-major format.
1027
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
1028
+ * be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
1029
+ * 'numeric_data' in column-major order, this is ignored and will be assumed that the
1030
+ * leading dimension corresponds to the number of rows. This is ignored when passing numeric
1031
+ * data in sparse format.
1032
+ * - ld_categ
1033
+ * Leading dimension of the array 'categ_data', if it is passed in row-major format.
1034
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
1035
+ * be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
1036
+ * 'categ_data' in column-major order, this is ignored and will be assumed that the
1037
+ * leading dimension corresponds to the number of rows.
1038
+ * - Xc[nnz]
1039
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
1040
+ * Pass NULL if there are no sparse numeric columns.
1041
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
1042
+ * - Xc_ind[nnz]
1043
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
1044
+ * Must be in sorted order, otherwise results will be incorrect.
1045
+ * Pass NULL if there are no sparse numeric columns in CSC format.
1046
+ * - Xc_indptr[ncols_numeric + 1]
1047
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
1048
+ * start and at entry [col + 1] where does column 'col' end.
1049
+ * Pass NULL if there are no sparse numeric columns in CSC format.
1050
+ * - Xr[nnz]
1051
+ * Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
1052
+ * Pass NULL if there are no sparse numeric columns.
1053
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
1054
+ * - Xr_ind[nnz]
1055
+ * Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
1056
+ * Must be in sorted order, otherwise results will be incorrect.
1057
+ * Pass NULL if there are no sparse numeric columns in CSR format.
1058
+ * - Xr_indptr[nrows + 1]
1059
+ * Pointer to row index pointers that tell at entry [row] where does row 'row'
1060
+ * start and at entry [row + 1] where does row 'row' end.
1061
+ * Pass NULL if there are no sparse numeric columns in CSR format.
1062
+ * - nrows
1063
+ * Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
1064
+ * - nthreads
1065
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
1066
+ * allocated, even if the thread does not end up being used. Ignored when not building with
1067
+ * OpenMP support.
1068
+ * - standardize
1069
+ * Whether to standardize the average depths for each row according to their relative magnitude
1070
+ * compared to the expected average, in order to obtain an outlier score. If passing 'false',
1071
+ * will output the average depth instead.
1072
+ * Ignored when not passing 'output_depths'.
1073
+ * - model_outputs
1074
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
1075
+ * if the predictions are to be made from an extended model. Can only pass one of
1076
+ * 'model_outputs' and 'model_outputs_ext'.
1077
+ * - model_outputs_ext
1078
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
1079
+ * if the predictions are to be made from a single-variable model. Can only pass one of
1080
+ * 'model_outputs' and 'model_outputs_ext'.
1081
+ * - output_depths[nrows] (out)
1082
+ * Pointer to array where the output average depths or outlier scores will be written into
1083
+ * (the return type is controlled according to parameter 'standardize').
1084
+ * Should always be passed when calling this function (it is not optional).
1085
+ * - tree_num[nrows * ntrees] (out)
1086
+ * Pointer to array where the output terminal node numbers will be written into.
1087
+ * Note that the mapping between tree node and terminal tree node is not stored in
1088
+ * the model object for efficiency reasons, so this mapping will be determined on-the-fly
1089
+ * when passing this parameter, and as such, there will be some overhead regardless of
1090
+ * the actual number of rows. Output will be in column-major order ([nrows, ntrees]).
1091
+ * This will not be calculable when using 'ndim==1' alongside with either
1092
+ * 'missing_action==Divide' or 'new_categ_action==Weighted'.
1093
+ * Pass NULL if this type of output is not needed.
1094
+ * - per_tree_depths[nrows * ntrees] (out)
1095
+ * Pointer to array where to output per-tree depths or expected depths for each row.
1096
+ * Note that these will not include range penalties ('penalize_range=true').
1097
+ * Output will be in row-major order ([nrows, ntrees]).
1098
+ * This will not be calculable when using 'ndim==1' alongside with either
1099
+ * 'missing_action==Divide' or 'new_categ_action==Weighted'.
1100
+ * Pass NULL if this type of output is not needed.
1101
+ * - indexer
1102
+ * Pointer to associated tree indexer for the model being used, if it was constructed,
1103
+ * which can be used to speed up tree numbers/indices predictions.
1104
+ * This is ignored when not passing 'tree_num'.
1105
+ * Pass NULL if the indexer has not been constructed.
1106
+ */
1107
+ ISOTREE_EXPORTED
1108
+ void predict_iforest(real_t numeric_data[], int categ_data[],
1109
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1110
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1111
+ real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
1112
+ size_t nrows, int nthreads, bool standardize,
1113
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1114
+ double output_depths[], sparse_ix tree_num[],
1115
+ double per_tree_depths[],
1116
+ TreesIndexer *indexer);
1117
+
1118
+
1119
+
1120
+ /* Get the number of nodes present in a given model, per tree
1121
+ *
1122
+ * Parameters
1123
+ * ==========
1124
+ * - model_outputs
1125
+ * Pointer to fitted single-variable model object from function 'fit_iforest'.
1126
+ * - model_outputs_ext
1127
+ * Pointer to fitted extended model object from function 'fit_iforest'.
1128
+ * - n_nodes[ntrees] (out)
1129
+ * Number of nodes in tree of the model, including non-terminal nodes.
1130
+ * - n_terminal[ntrees] (out)
1131
+ * Number of terminal nodes in each tree of the model.
1132
+ * - nthreads
1133
+ * Number of parallel threads to use.
1134
+ */
1135
+ ISOTREE_EXPORTED void get_num_nodes(IsoForest &model_outputs, sparse_ix *n_nodes, sparse_ix *n_terminal, int nthreads) noexcept;
1136
+ ISOTREE_EXPORTED void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *n_nodes, sparse_ix *n_terminal, int nthreads) noexcept;
1137
+
1138
+
1139
+
1140
+ /* Calculate distance or similarity or kernel/proximity between data points
1141
+ *
1142
+ * Parameters
1143
+ * ==========
1144
+ * - numeric_data[nrows * ncols_numeric]
1145
+ * Pointer to numeric data for which to make calculations. If not using 'indexer', must be
1146
+ * ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
1147
+ * column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
1148
+ * row-major or column-major format (with row-major being faster).
1149
+ * If categorical data is passed, must be in the same storage order (row-major / column-major)
1150
+ * as numerical data (whether dense or sparse).
1151
+ * The column order must be the same as in the data that was used to fit the model.
1152
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
1153
+ * the first group is assumed to be the earlier rows here.
1154
+ * Pass NULL if there are no dense numeric columns.
1155
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
1156
+ * - categ_data[nrows * ncols_categ]
1157
+ * Pointer to categorical data for which to make calculations. If not using 'indexer', must be
1158
+ * ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
1159
+ * column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
1160
+ * row-major or column-major format (with row-major being faster).
1161
+ * If numerical data is passed, must be in the same storage order (row-major / column-major)
1162
+ * as categorical data (whether the numerical data is dense or sparse).
1163
+ * Each category should be represented as an integer, and these integers must start at zero and
1164
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
1165
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
1166
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
1167
+ * must be the same as was used in the data to which the model was fit.
1168
+ * Pass NULL if there are no categorical columns.
1169
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
1170
+ * the first group is assumed to be the earlier rows here.
1171
+ * - Xc[nnz]
1172
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed),
1173
+ * or optionally in CSR format (row-compressed) if using 'indexer' and passing 'is_col_major=false'
1174
+ * (not recommended as the calculations will be slower if sparse data is passed as CSR).
1175
+ * If categorical data is passed, must be in the same storage order (row-major or CSR / column-major or CSC)
1176
+ * as numerical data (whether dense or sparse).
1177
+ * Pass NULL if there are no sparse numeric columns.
1178
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
1179
+ * - Xc_ind[nnz]
1180
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds
1181
+ * (column indices if 'Xc' is in CSR format).
1182
+ * Must be in sorted order, otherwise results will be incorrect.
1183
+ * Pass NULL if there are no sparse numeric columns in CSC or CSR format.
1184
+ * - Xc_indptr[ncols_numeric + 1]
1185
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
1186
+ * start and at entry [col + 1] where does column 'col' end
1187
+ * (row index pointers if 'Xc' is passed in CSR format).
1188
+ * Pass NULL if there are no sparse numeric columns in CSC or CSR format.
1189
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
1190
+ * the first group is assumed to be the earlier rows here.
1191
+ * - nrows
1192
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
1193
+ * - use_long_double
1194
+ * Whether to use 'long double' (extended precision) type for the calculations. This makes them
1195
+ * more accurate (provided that the compiler used has wider long doubles than doubles), but
1196
+ * slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
1197
+ * Power8 platforms).
1198
+ * - nthreads
1199
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
1200
+ * allocated, even if the thread does not end up being used (with one exception being kernel calculations
1201
+ * with respect to reference points in an indexer). Ignored when not building with OpenMP support.
1202
+ * - assume_full_distr
1203
+ * Whether to assume that the fitted model represents a full population distribution (will use a
1204
+ * standardizing criterion assuming infinite sample, and the results of the similarity between two points
1205
+ * at prediction time will not depend on the presence of any third point that is similar to them, but will
1206
+ * differ more compared to the pairwise distances between points from which the model was fit). If passing
1207
+ * 'false', will calculate pairwise distances as if the new observations at prediction time were added to
1208
+ * the sample to which each tree was fit, which will make the distances between two points potentially vary
1209
+ * according to other newly introduced points.
1210
+ * This was added for experimentation purposes only and it's not recommended to pass 'false'.
1211
+ * Note that when calculating distances using 'indexer', there
1212
+ * might be slight discrepancies between the numbers produced with or without the indexer due to what
1213
+ * are considered "additional" observations in this calculation.
1214
+ * This is ignored when passing 'as_kernel=true'.
1215
+ * - standardize_dist
1216
+ * Whether to standardize the resulting average separation depths between rows according
1217
+ * to the expected average separation depth in a similar way as when predicting outlierness,
1218
+ * in order to obtain a standardized distance. If passing 'false', will output the average
1219
+ * separation depth instead.
1220
+ * If passing 'as_kernel=true', this indicates whether to output a fraction (if 'true') or
1221
+ * the raw number of matching trees (if 'false').
1222
+ * - as_kernel
1223
+ * Whether to calculate the "similarities" as isolation kernel or proximity matrix, which counts
1224
+ * the proportion of trees in which two observations end up in the same terminal node. This is
1225
+ * typically much faster than separation-based distance, but is typically not as good quality.
1226
+ * Note that, for kernel calculations, the indexer is only used if it has reference points stored on it.
1227
+ * - model_outputs
1228
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
1229
+ * if the calculations are to be made from an extended model. Can only pass one of
1230
+ * 'model_outputs' and 'model_outputs_ext'.
1231
+ * - model_outputs_ext
1232
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
1233
+ * if the calculations are to be made from a single-variable model. Can only pass one of
1234
+ * 'model_outputs' and 'model_outputs_ext'.
1235
+ * - tmat[nrows * (nrows - 1) / 2] (out)
1236
+ * Pointer to array where the resulting pairwise distances or average separation depths or kernels will
1237
+ * be written into. As the output is a symmetric matrix, this function will only fill in the
1238
+ * upper-triangular part, in which entry 0 <= i < j < n will be located at position
1239
+ * p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
1240
+ * Can be converted to a dense square matrix through function 'tmat_to_dense'.
1241
+ * The array must already be initialized to zeros.
1242
+ * If calculating distance/separation from a group of points to another group of points,
1243
+ * pass NULL here and use 'rmat' instead.
1244
+ * - rmat[nrows1 * nrows2] (out)
1245
+ * Pointer to array where to write the distances or separation depths or kernels between each row in
1246
+ * one set of observations and each row in a different set of observations. If doing these
1247
+ * calculations for all pairs of observations/rows, pass 'tmat' instead.
1248
+ * Will take the first group of observations as the rows in this matrix, and the second
1249
+ * group as the columns. The groups are assumed to be in the same data arrays, with the
1250
+ * first group corresponding to the earlier rows there.
1251
+ * This matrix will be used in row-major order (i.e. entries 1..nrows2 contain the first row from nrows1).
1252
+ * Must be already initialized to zeros.
1253
+ * If passing 'use_indexed_references=true' plus an indexer object with reference points, this
1254
+ * array should have dimension [nrows, n_references].
1255
+ * Ignored when 'tmat' is passed.
1256
+ * - n_from
1257
+ * When calculating distances between two groups of points, this indicates the number of
1258
+ * observations/rows belonging to the first group (the rows in 'rmat'), which will be
1259
+ * assumed to be the first 'n_from' rows.
1260
+ * Ignored when 'tmat' is passed or when 'use_indexed_references=true' plus an indexer with
1261
+ * references are passed.
1262
+ * - use_indexed_references
1263
+ * Whether to calculate distances with respect to reference points stored in the indexer
1264
+ * object, if it has any. This is only supported with 'assume_full_distr=true' or with 'as_kernel=true'.
1265
+ * If passing 'use_indexed_references=true', then 'tmat' must be NULL, and 'rmat' must
1266
+ * be of dimension [nrows, n_references].
1267
+ * - indexer
1268
+ * Pointer to associated tree indexer for the model being used, if it was constructed,
1269
+ * which can be used to speed up distance calculations, assuming that it was built with
1270
+ * option 'with_distances=true'. If it does not contain node distances, it will not be used.
1271
+ * Pass NULL if the indexer has not been constructed or was constructed with 'with_distances=false'.
1272
+ * If it contains reference points and passing 'use_indexed_references=true', distances will be
1273
+ * calculated between the input data passed here and the reference points stored in this object.
1274
+ * If passing 'as_kernel=true', the indexer can only be used for calculating kernels with respect to
1275
+ * reference points in the indexer, otherwise it will not be used (which also means that the data must be
1276
+ * passed in column-major order for all kernel calculations that are not with respect to reference points
1277
+ * from an indexer).
1278
+ * - is_col_major
1279
+ * Whether the data comes in column-major order. If using 'indexer', predictions are also possible
1280
+ * (and are even faster for the case of dense-only data) if passing the data in row-major format.
1281
+ * Without 'indexer' (and with 'as_kernel=true' but without reference points in the indexer), data
1282
+ * may only be passed in column-major format.
1283
+ * If there is sparse numeric data, it is highly suggested to pass it in CSC/column-major format.
1284
+ * - ld_numeric
1285
+ * If passing 'is_col_major=false', this indicates the leading dimension of the array 'numeric_data'.
1286
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
1287
+ * be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
1288
+ * 'numeric_data' in column-major order, this is ignored and will be assumed that the
1289
+ * leading dimension corresponds to the number of rows. This is ignored when passing numeric
1290
+ * data in sparse format.
1291
+ * Note that data in row-major order is only accepted when using 'indexer'.
1292
+ * - ld_categ
1293
+ * If passing 'is_col_major=false', this indicates the leading dimension of the array 'categ_data'.
1294
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
1295
+ * be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
1296
+ * 'categ_data' in column-major order, this is ignored and will be assumed that the
1297
+ * leading dimension corresponds to the number of rows.
1298
+ * Note that data in row-major order is only accepted when using 'indexer'.
1299
+ */
1300
+ ISOTREE_EXPORTED
1301
+ void calc_similarity(real_t numeric_data[], int categ_data[],
1302
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1303
+ size_t nrows, bool use_long_double, int nthreads,
1304
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
1305
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1306
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
1307
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ);
1308
+
1309
+ /* Impute missing values in new data
1310
+ *
1311
+ * Parameters
1312
+ * ==========
1313
+ * - numeric_data[nrows * ncols_numeric] (in, out)
1314
+ * Pointer to numeric data in which missing values will be imputed. May be ordered by rows
1315
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
1316
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
1317
+ * (see parameter 'is_col_major').
1318
+ * Pass NULL if there are no dense numeric columns.
1319
+ * Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
1320
+ * Imputations will overwrite values in this same array.
1321
+ * - categ_data[nrows * ncols_categ]
1322
+ * Pointer to categorical data in which missing values will be imputed. May be ordered by rows
1323
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
1324
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
1325
+ * (see parameter 'is_col_major').
1326
+ * Pass NULL if there are no categorical columns.
1327
+ * Each category should be represented as an integer, and these integers must start at zero and
1328
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
1329
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
1330
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
1331
+ * must be the same as was used in the data to which the model was fit.
1332
+ * Imputations will overwrite values in this same array.
1333
+ * - is_col_major
1334
+ * Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
1335
+ * model was fit. If passing 'false', will assume they are in row-major order. Note that most of
1336
+ * the functions in this library work only with column-major order, but here both are suitable
1337
+ * and row-major is preferred. Both arrays must have the same orientation (row/column major).
1338
+ * - ncols_categ
1339
+ * Number of categorical columns in the data.
1340
+ * - ncat[ncols_categ]
1341
+ * Number of categories in each categorical column. E.g. if the highest code for a column is '4',
1342
+ * the number of categories for that column is '5' (zero is one category).
1343
+ * Must be the same as was passed to 'fit_iforest'.
1344
+ * - Xr[nnz] (in, out)
1345
+ * Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
1346
+ * Pass NULL if there are no sparse numeric columns.
1347
+ * Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
1348
+ * Imputations will overwrite values in this same array.
1349
+ * - Xr_ind[nnz]
1350
+ * Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
1351
+ * Must be in sorted order, otherwise results will be incorrect.
1352
+ * Pass NULL if there are no sparse numeric columns in CSR format.
1353
+ * - Xr_indptr[nrows + 1]
1354
+ * Pointer to row index pointers that tell at entry [row] where does row 'row'
1355
+ * start and at entry [row + 1] where does row 'row' end.
1356
+ * Pass NULL if there are no sparse numeric columns in CSR format.
1357
+ * - nrows
1358
+ * Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
1359
+ * - use_long_double
1360
+ * Whether to use 'long double' (extended precision) type for the calculations. This makes them
1361
+ * more accurate (provided that the compiler used has wider long doubles than doubles), but
1362
+ * slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
1363
+ * Power8 platforms).
1364
+ * - nthreads
1365
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
1366
+ * allocated, even if the thread does not end up being used. Ignored when not building with
1367
+ * OpenMP support.
1368
+ * - model_outputs
1369
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
1370
+ * if the predictions are to be made from an extended model. Can only pass one of
1371
+ * 'model_outputs' and 'model_outputs_ext'.
1372
+ * - model_outputs_ext
1373
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
1374
+ * if the predictions are to be made from a single-variable model. Can only pass one of
1375
+ * 'model_outputs' and 'model_outputs_ext'.
1376
+ * - impute_nodes
1377
+ * Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
1378
+ * as produced from function 'fit_iforest'.
1379
+ */
1380
+ ISOTREE_EXPORTED
1381
+ void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
1382
+ real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
1383
+ size_t nrows, bool use_long_double, int nthreads,
1384
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1385
+ Imputer &imputer);
1386
+
1387
+
1388
+ /* Append trees from one model into another
1389
+ *
1390
+ * Parameters
1391
+ * ==========
1392
+ * - model (in, out)
1393
+ * Pointer to isolation forest model which has already been fit through 'fit_iforest'.
1394
+ * The trees from 'other' will be merged into this (will be at the end of vector member 'trees').
1395
+ * Both 'model' and 'other' must have been fit with the same hyperparameters
1396
+ * in order for this merge to work correctly - at the very least, should have
1397
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
1398
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
1399
+ * Pass NULL if this is not to be used.
1400
+ * - other
1401
+ * Pointer to isolation forest model which has already been fit through 'fit_iforest'.
1402
+ * The trees from this object will be added into 'model' (this object will not be modified).
1403
+ * Both 'model' and 'other' must have been fit with the same hyperparameters
1404
+ * in order for this merge to work correctly - at the very least, should have
1405
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
1406
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
1407
+ * Pass NULL if this is not to be used.
1408
+ * - ext_model (in, out)
1409
+ * Pointer to extended isolation forest model which has already been fit through 'fit_iforest'.
1410
+ * The trees/hyperplanes from 'ext_other' will be merged into this (will be at the end of vector member 'hplanes').
1411
+ * Both 'ext_model' and 'ext_other' must have been fit with the same hyperparameters
1412
+ * in order for this merge to work correctly - at the very least, should have
1413
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
1414
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
1415
+ * Pass NULL if this is not to be used.
1416
+ * - ext_other
1417
+ * Pointer to extended isolation forest model which has already been fit through 'fit_iforest'.
1418
+ * The trees/hyperplanes from this object will be added into 'ext_model' (this object will not be modified).
1419
+ * Both 'ext_model' and 'ext_other' must have been fit with the same hyperparameters
1420
+ * in order for this merge to work correctly - at the very least, should have
1421
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
1422
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
1423
+ * Pass NULL if this is not to be used.
1424
+ * - imputer (in, out)
1425
+ * Pointer to imputation object which has already been fit through 'fit_iforest' along with
1426
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest'.
1427
+ * The imputation nodes from 'iother' will be merged into this (will be at the end of vector member 'imputer_tree').
1428
+ * Hyperparameters related to imputation might differ between 'imputer' and 'iother' ('imputer' will preserve its
1429
+ * hyperparameters after the merge).
1430
+ * Pass NULL if this is not to be used.
1431
+ * - iother
1432
+ * Pointer to imputation object which has already been fit through 'fit_iforest' along with
1433
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest'.
1434
+ * The imputation nodes from this object will be added into 'imputer' (this object will not be modified).
1435
+ * Hyperparameters related to imputation might differ between 'imputer' and 'iother' ('imputer' will preserve its
1436
+ * hyperparameters after the merge).
1437
+ * Pass NULL if this is not to be used.
1438
+ * - indexer (in, out)
1439
+ * Pointer to indexer object which has already been fit through 'fit_iforest' along with
1440
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest' or through another specialized function.
1441
+ * The node indices from 'ind_other' will be merged into this (will be at the end of vector member 'indices').
1442
+ * Reference points should not differ between 'indexer' and 'ind_other'.
1443
+ * Pass NULL if this is not to be used.
1444
+ * - ind_other
1445
+ * Pointer to indexer object which has already been fit through 'fit_iforest' along with
1446
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest' or through another specialized function.
1447
+ * The node indices from this object will be added into 'indexer' (this object will not be modified).
1448
+ * Reference points should not differ between 'indexer' and 'ind_other'.
1449
+ * Pass NULL if this is not to be used.
1450
+ */
1451
+ ISOTREE_EXPORTED
1452
+ void merge_models(IsoForest* model, IsoForest* other,
1453
+ ExtIsoForest* ext_model, ExtIsoForest* ext_other,
1454
+ Imputer* imputer, Imputer* iother,
1455
+ TreesIndexer* indexer, TreesIndexer* ind_other);
1456
+
1457
+ /* Create a model containing a sub-set of the trees from another model
1458
+ *
1459
+ * Parameters
1460
+ * ==========
1461
+ * - model (in)
1462
+ * Pointer to isolation forest model which has already been fit through 'fit_iforest',
1463
+ * from which the desired trees will be copied into a new model object.
1464
+ * Pass NULL if using the extended model.
1465
+ * - ext_model (in)
1466
+ * Pointer to extended isolation forest model which has already been fit through 'fit_iforest',
1467
+ * from which the desired trees will be copied into a new model object.
1468
+ * Pass NULL if using the single-variable model.
1469
+ * - imputer (in)
1470
+ * Pointer to imputation object which has already been fit through 'fit_iforest' along with
1471
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest'.
1472
+ * Pass NULL if the model was built without an imputer.
1473
+ * - indexer (in)
1474
+ * Pointer to indexer object which has already been fit through 'fit_iforest' along with
1475
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest' or through another specialized function.
1476
+ * Pass NULL if the model was built without an indexer.
1477
+ * - model_new (out)
1478
+ * Pointer to already-allocated isolation forest model, which will be reset and to
1479
+ * which the selected trees from 'model' will be copied.
1480
+ * Pass NULL if using the extended model.
1481
+ * - ext_model_new (out)
1482
+ * Pointer to already-allocated extended isolation forest model, which will be reset and to
1483
+ * which the selected hyperplanes from 'ext_model' will be copied.
1484
+ * Pass NULL if using the single-variable model.
1485
+ * - imputer_new (out)
1486
+ * Pointer to already-allocated imputation object, which will be reset and to
1487
+ * which the selected nodes from 'imputer' (matching to those of either 'model'
1488
+ * or 'ext_model') will be copied.
1489
+ * Pass NULL if the model was built without an imputer.
1490
+ * - indexer_new (out)
1491
+ * Pointer to already-allocated indexer object, which will be reset and to
1492
+ * which the selected nodes from 'indexer' (matching to those of either 'model'
1493
+ * or 'ext_model') will be copied.
1494
+ * Pass NULL if the model was built without an indexer.
1495
+ */
1496
+ ISOTREE_EXPORTED
1497
+ void subset_model(IsoForest* model, IsoForest* model_new,
1498
+ ExtIsoForest* ext_model, ExtIsoForest* ext_model_new,
1499
+ Imputer* imputer, Imputer* imputer_new,
1500
+ TreesIndexer* indexer, TreesIndexer* indexer_new,
1501
+ size_t *trees_take, size_t ntrees_take);
1502
+
1503
+ /* Build indexer for faster terminal node predictions and/or distance calculations
1504
+ *
1505
+ * Parameters
1506
+ * ==========
1507
+ * - indexer
1508
+ * Pointer or reference to an indexer object which will be associated to a fitted model and in
1509
+ * which indices for terminal nodes and potentially node distances will be stored.
1510
+ * - model / model_outputs / model_outputs_ext
1511
+ * Pointer or reference to a fitted model object for which an indexer will be built.
1512
+ * - nthreads
1513
+ * Number of parallel threads to use. This operation will only be multi-threaded when passing
1514
+ * 'with_distances=true'.
1515
+ * - with_distances
1516
+ * Whether to also pre-calculate node distances in order to speed up 'calc_similarity' (distances).
1517
+ * Note that this will consume a lot more memory and make the resulting object significantly
1518
+ * heavier.
1519
+ */
1520
+ ISOTREE_EXPORTED
1521
+ void build_tree_indices(TreesIndexer &indexer, const IsoForest &model, int nthreads, const bool with_distances);
1522
+ ISOTREE_EXPORTED
1523
+ void build_tree_indices(TreesIndexer &indexer, const ExtIsoForest &model, int nthreads, const bool with_distances);
1524
+ ISOTREE_EXPORTED
1525
+ void build_tree_indices
1526
+ (
1527
+ TreesIndexer *indexer,
1528
+ const IsoForest *model_outputs,
1529
+ const ExtIsoForest *model_outputs_ext,
1530
+ int nthreads,
1531
+ const bool with_distances
1532
+ );
1533
+ /* Gets the number of reference points stored in an indexer object */
1534
+ ISOTREE_EXPORTED
1535
+ size_t get_number_of_reference_points(const TreesIndexer &indexer) noexcept;
1536
+
1537
+
1538
+ /* Functions to inspect serialized objects
1539
+ *
1540
+ * Parameters
1541
+ * ==========
1542
+ * - serialized_bytes (in)
1543
+ * A model from this library, serialized through the functions available since
1544
+ * version 0.3.0, in any of the varieties offered by the library (as separate
1545
+ * objects or as combined objects with metadata).
1546
+ * - is_isotree_model (out)
1547
+ * Whether the input 'serialized_bytes' is a serialized model from this library.
1548
+ * - is_compatible (out)
1549
+ * Whether the serialized model is compatible (i.e. can be de-serialized) with the
1550
+ * current setup.
1551
+ * Serialized models are compatible between:
1552
+ * - Different operating systems.
1553
+ * - Different compilers.
1554
+ * - Systems with different 'size_t' width (e.g. 32-bit and 64-bit),
1555
+ * as long as the file was produced on a system that was either 32-bit or 64-bit,
1556
+ * and as long as each saved value fits within the range of the machine's 'size_t' type.
1557
+ * - Systems with different 'int' width,
1558
+ * as long as the file was produced on a system that was 16-bit, 32-bit, or 64-bit,
1559
+ * and as long as each saved value fits within the range of the machine's int type.
1560
+ * - Systems with different bit endianness (e.g. x86 and PPC64 in non-le mode).
1561
+ * - Versions of this package from 0.3.0 onwards.
1562
+ * But are not compatible between:
1563
+ * - Systems with different floating point numeric representations
1564
+ * (e.g. standard IEEE754 vs. a base-10 system).
1565
+ * - Versions of this package earlier than 0.3.0.
1566
+ * This pretty much guarantees that a given file can be serialized and de-serialized
1567
+ * in the same machine in which it was built, regardless of how the library was compiled.
1568
+ * Reading a serialized model that was produced in a platform with different
1569
+ * characteristics (e.g. 32-bit vs. 64-bit) will be much slower however.
1570
+ * - has_combined_objects (out)
1571
+ * Whether the serialized model is in the format of combined objects (as produced by the
1572
+ * functions named 'serialized_combined') or in the format of separate objects (as produced
1573
+ * by the functions named 'serialized_<model>').
1574
+ * If it is in the format of combined objects, must be de-serialized through the functions
1575
+ * named 'deserialize_combined'; otherwise, must be de-serialized through the functions
1576
+ * named 'deserialize_<model>'.
1577
+ * Note that the Python and R interfaces of this library use the combined objects format
1578
+ * when serializing to files.
1579
+ * - has_IsoForest (out)
1580
+ * Whether the serialized bytes include an 'IsoForest' object. If it has 'has_combined_objects=true',
1581
+ * might include additional objects.
1582
+ * - has_ExtIsoForest (out)
1583
+ * Whether the serialized bytes include an 'ExtIsoForest' object. If it has 'has_combined_objects=true',
1584
+ * might include additional objects.
1585
+ * - has_Imputer (out)
1586
+ * Whether the serialized bytes include an 'Imputer' object. If it has 'has_combined_objects=true',
1587
+ * might include additional objects.
1588
+ * - has_metadata (out)
1589
+ * Whether the serialized bytes include additional metadata in the form of a 'char' array.
1590
+ * This can only be present when having 'has_combined_objects=true'.
1591
+ * - size_metadata (out)
1592
+ * When the serialized bytes contain metadata, this denotes the size of the metadata (number
1593
+ * of bytes that it contains).
1594
+ */
1595
+ ISOTREE_EXPORTED
1596
+ void inspect_serialized_object
1597
+ (
1598
+ const char *serialized_bytes,
1599
+ bool &is_isotree_model,
1600
+ bool &is_compatible,
1601
+ bool &has_combined_objects,
1602
+ bool &has_IsoForest,
1603
+ bool &has_ExtIsoForest,
1604
+ bool &has_Imputer,
1605
+ bool &has_Indexer,
1606
+ bool &has_metadata,
1607
+ size_t &size_metadata
1608
+ );
1609
+ ISOTREE_EXPORTED
1610
+ void inspect_serialized_object
1611
+ (
1612
+ FILE *serialized_bytes,
1613
+ bool &is_isotree_model,
1614
+ bool &is_compatible,
1615
+ bool &has_combined_objects,
1616
+ bool &has_IsoForest,
1617
+ bool &has_ExtIsoForest,
1618
+ bool &has_Imputer,
1619
+ bool &has_Indexer,
1620
+ bool &has_metadata,
1621
+ size_t &size_metadata
1622
+ );
1623
+ ISOTREE_EXPORTED
1624
+ void inspect_serialized_object
1625
+ (
1626
+ std::istream &serialized_bytes,
1627
+ bool &is_isotree_model,
1628
+ bool &is_compatible,
1629
+ bool &has_combined_objects,
1630
+ bool &has_IsoForest,
1631
+ bool &has_ExtIsoForest,
1632
+ bool &has_Imputer,
1633
+ bool &has_Indexer,
1634
+ bool &has_metadata,
1635
+ size_t &size_metadata
1636
+ );
1637
+ ISOTREE_EXPORTED
1638
+ void inspect_serialized_object
1639
+ (
1640
+ const std::string &serialized_bytes,
1641
+ bool &is_isotree_model,
1642
+ bool &is_compatible,
1643
+ bool &has_combined_objects,
1644
+ bool &has_IsoForest,
1645
+ bool &has_ExtIsoForest,
1646
+ bool &has_Imputer,
1647
+ bool &has_Indexer,
1648
+ bool &has_metadata,
1649
+ size_t &size_metadata
1650
+ );
1651
+
1652
+ /* Serialization and de-serialization functions (individual objects)
1653
+ *
1654
+ * Parameters
1655
+ * ==========
1656
+ * - model (in or out depending on function)
1657
+ * A model object to serialize (when it has 'const' qualifier), after being fitted through
1658
+ * function 'fit_iforest'; or an already-allocated object (should be initialized through
1659
+ * the default constructor) into which a serialized object of the same class will be
1660
+ * de-serialized. In the latter case, the contents of this object will be overwritten.
1661
+ * Note that this will only be able to load models generated with isotree version 0.3.0
1662
+ * and later, and that these serialized models are forwards compatible but not backwards
1663
+ * compatible (that is, a model saved with 0.3.0 can be loaded with 0.3.6, but not the other
1664
+ * way around).
1665
+ * - output (out)
1666
+ * A writable object or stream in which to save/persist/serialize the
1667
+ * model or imputer object. In the functions that do not take this as a parameter,
1668
+ * it will be returned as a string containing the raw bytes.
1669
+ * Should be opened in binary mode.
1670
+ * Note: on Windows, if compiling this library with a compiler other than MSVC or MINGW,
1671
+ * there might be issues writing models to FILE pointers if the models are larger than 2GB.
1672
+ * - in (in)
1673
+ * A readable object or stream which contains the serialized/persisted model or
1674
+ * imputer object which will be de-serialized. Should be opened in binary mode.
1675
+ *
1676
+ * Returns
1677
+ * =======
1678
+ * (Only for functions 'determine_serialized_size')
1679
+ * Size that the model or imputer object will use when serialized, intended to be
1680
+ * used for allocating arrays beforehand when serializing to 'char'.
1681
+ */
1682
+ ISOTREE_EXPORTED
1683
+ size_t determine_serialized_size(const IsoForest &model) noexcept;
1684
+ ISOTREE_EXPORTED
1685
+ size_t determine_serialized_size(const ExtIsoForest &model) noexcept;
1686
+ ISOTREE_EXPORTED
1687
+ size_t determine_serialized_size(const Imputer &model) noexcept;
1688
+ ISOTREE_EXPORTED
1689
+ size_t determine_serialized_size(const TreesIndexer &model) noexcept;
1690
+ ISOTREE_EXPORTED
1691
+ void serialize_IsoForest(const IsoForest &model, char *out);
1692
+ ISOTREE_EXPORTED
1693
+ void serialize_IsoForest(const IsoForest &model, FILE *out);
1694
+ ISOTREE_EXPORTED
1695
+ void serialize_IsoForest(const IsoForest &model, std::ostream &out);
1696
+ ISOTREE_EXPORTED
1697
+ std::string serialize_IsoForest(const IsoForest &model);
1698
+ ISOTREE_EXPORTED
1699
+ void deserialize_IsoForest(IsoForest &model, const char *in);
1700
+ ISOTREE_EXPORTED
1701
+ void deserialize_IsoForest(IsoForest &model, FILE *in);
1702
+ ISOTREE_EXPORTED
1703
+ void deserialize_IsoForest(IsoForest &model, std::istream &in);
1704
+ ISOTREE_EXPORTED
1705
+ void deserialize_IsoForest(IsoForest &model, const std::string &in);
1706
+ ISOTREE_EXPORTED
1707
+ void serialize_ExtIsoForest(const ExtIsoForest &model, char *out);
1708
+ ISOTREE_EXPORTED
1709
+ void serialize_ExtIsoForest(const ExtIsoForest &model, FILE *out);
1710
+ ISOTREE_EXPORTED
1711
+ void serialize_ExtIsoForest(const ExtIsoForest &model, std::ostream &out);
1712
+ ISOTREE_EXPORTED
1713
+ std::string serialize_ExtIsoForest(const ExtIsoForest &model);
1714
+ ISOTREE_EXPORTED
1715
+ void deserialize_ExtIsoForest(ExtIsoForest &model, const char *in);
1716
+ ISOTREE_EXPORTED
1717
+ void deserialize_ExtIsoForest(ExtIsoForest &model, FILE *in);
1718
+ ISOTREE_EXPORTED
1719
+ void deserialize_ExtIsoForest(ExtIsoForest &model, std::istream &in);
1720
+ ISOTREE_EXPORTED
1721
+ void deserialize_ExtIsoForest(ExtIsoForest &model, const std::string &in);
1722
+ ISOTREE_EXPORTED
1723
+ void serialize_Imputer(const Imputer &model, char *out);
1724
+ ISOTREE_EXPORTED
1725
+ void serialize_Imputer(const Imputer &model, FILE *out);
1726
+ ISOTREE_EXPORTED
1727
+ void serialize_Imputer(const Imputer &model, std::ostream &out);
1728
+ ISOTREE_EXPORTED
1729
+ std::string serialize_Imputer(const Imputer &model);
1730
+ ISOTREE_EXPORTED
1731
+ void deserialize_Imputer(Imputer &model, const char *in);
1732
+ ISOTREE_EXPORTED
1733
+ void deserialize_Imputer(Imputer &model, FILE *in);
1734
+ ISOTREE_EXPORTED
1735
+ void deserialize_Imputer(Imputer &model, std::istream &in);
1736
+ ISOTREE_EXPORTED
1737
+ void deserialize_Imputer(Imputer &model, const std::string &in);
1738
+ ISOTREE_EXPORTED
1739
+ void serialize_Indexer(const TreesIndexer &model, char *out);
1740
+ ISOTREE_EXPORTED
1741
+ void serialize_Indexer(const TreesIndexer &model, FILE *out);
1742
+ ISOTREE_EXPORTED
1743
+ void serialize_Indexer(const TreesIndexer &model, std::ostream &out);
1744
+ ISOTREE_EXPORTED
1745
+ std::string serialize_Indexer(const TreesIndexer &model);
1746
+ ISOTREE_EXPORTED
1747
+ void deserialize_Indexer(TreesIndexer &model, const char *in);
1748
+ ISOTREE_EXPORTED
1749
+ void deserialize_Indexer(TreesIndexer &model, FILE *in);
1750
+ ISOTREE_EXPORTED
1751
+ void deserialize_Indexer(TreesIndexer &model, std::istream &in);
1752
+ ISOTREE_EXPORTED
1753
+ void deserialize_Indexer(TreesIndexer &model, const std::string &in);
1754
+
1755
+
1756
+ /* Serialization and de-serialization functions (combined objects)
1757
+ *
1758
+ * Parameters
1759
+ * ==========
1760
+ * - model (in or out depending on function)
1761
+ * A single-variable model object to serialize or de-serialize.
1762
+ * If the serialized object contains this type of object, it must be
1763
+ * passed, as an already-allocated object (initialized through the default
1764
+ * constructor function).
1765
+ * When de-serializing, can check if it needs to be passed through function
1766
+ * 'inspect_serialized_object'.
1767
+ * If using the extended model, should pass NULL.
1768
+ * Must pass one of 'model' or 'model_ext'.
1769
+ * - model_ext (in or out depending on function)
1770
+ * An extended model object to serialize or de-serialize.
1771
+ * If using the single-variable model, should pass NULL.
1772
+ * Must pass one of 'model' or 'model_ext'.
1773
+ * - imputer (in or out depending on function)
1774
+ * An imputer object to serialize or de-serialize.
1775
+ * Like 'model' and 'model_ext', must also be passed when de-serializing
1776
+ * if the serialized bytes contain such object.
1777
+ * - optional_metadata (in or out depending on function)
1778
+ * Optional metadata to write at the end of the file, which will be written
1779
+ * unformatted (it is assumed files are in binary mode).
1780
+ * Pass NULL if there is no metadata.
1781
+ * - size_optional_metadata (in or out depending on function)
1782
+ * Size of the optional metadata, if passed. Pass zero if there is no metadata.
1783
+ * - serialized_model (in)
1784
+ * A single-variable model which was serialized to raw bytes in the separate-objects
1785
+ * format, using function 'serialize_IsoForest'.
1786
+ * Pass NULL if using the extended model.
1787
+ * Must pass one of 'serialized_model' or 'serialized_model_ext'.
1788
+ * Note that if it was produced on a platform with different characteristics than
1789
+ * the one in which this function is being called (e.g. different 'size_t' width or
1790
+ * different endianness), it will be re-serialized during the function call, which
1791
+ * can be slow and use a lot of memory.
1792
+ * - serialized_model_ext (in)
1793
+ * An extended model which was serialized to raw bytes in the separate-objects
1794
+ * format, using function 'serialize_ExtIsoForest'.
1795
+ * Pass NULL if using the single-variable model.
1796
+ * Must pass one of 'serialized_model' or 'serialized_model_ext'.
1797
+ * - serialized_imputer (in)
1798
+ * An imputer object which was serialized to raw bytes in the separate-objects
1799
+ * format, using function 'serialize_Imputer'.
1800
+ * - output (out)
1801
+ * A writable object or stream in which to save/persist/serialize the
1802
+ * model objects. In the functions that do not take this as a parameter,
1803
+ * it will be returned as a string containing the raw bytes.
1804
+ * Should be opened in binary mode.
1805
+ * - in (in)
1806
+ * A readable object or stream which contains the serialized/persisted model
1807
+ * objects which will be de-serialized. Should be opened in binary mode.
1808
+ *
1809
+ * Returns
1810
+ * =======
1811
+ * (Only for functions 'determine_serialized_size')
1812
+ * Size that the objects will use when serialized, intended to be
1813
+ * used for allocating arrays beforehand when serializing to 'char'.
1814
+ */
1815
+ ISOTREE_EXPORTED
1816
+ size_t determine_serialized_size_combined
1817
+ (
1818
+ const IsoForest *model,
1819
+ const ExtIsoForest *model_ext,
1820
+ const Imputer *imputer,
1821
+ const TreesIndexer *indexer,
1822
+ const size_t size_optional_metadata
1823
+ ) noexcept;
1824
+ ISOTREE_EXPORTED
1825
+ size_t determine_serialized_size_combined
1826
+ (
1827
+ const char *serialized_model,
1828
+ const char *serialized_model_ext,
1829
+ const char *serialized_imputer,
1830
+ const char *serialized_indexer,
1831
+ const size_t size_optional_metadata
1832
+ ) noexcept;
1833
+ ISOTREE_EXPORTED
1834
+ void serialize_combined
1835
+ (
1836
+ const IsoForest *model,
1837
+ const ExtIsoForest *model_ext,
1838
+ const Imputer *imputer,
1839
+ const TreesIndexer *indexer,
1840
+ const char *optional_metadata,
1841
+ const size_t size_optional_metadata,
1842
+ char *out
1843
+ );
1844
+ ISOTREE_EXPORTED
1845
+ void serialize_combined
1846
+ (
1847
+ const IsoForest *model,
1848
+ const ExtIsoForest *model_ext,
1849
+ const Imputer *imputer,
1850
+ const TreesIndexer *indexer,
1851
+ const char *optional_metadata,
1852
+ const size_t size_optional_metadata,
1853
+ FILE *out
1854
+ );
1855
+ ISOTREE_EXPORTED
1856
+ void serialize_combined
1857
+ (
1858
+ const IsoForest *model,
1859
+ const ExtIsoForest *model_ext,
1860
+ const Imputer *imputer,
1861
+ const TreesIndexer *indexer,
1862
+ const char *optional_metadata,
1863
+ const size_t size_optional_metadata,
1864
+ std::ostream &out
1865
+ );
1866
+ ISOTREE_EXPORTED
1867
+ std::string serialize_combined
1868
+ (
1869
+ const IsoForest *model,
1870
+ const ExtIsoForest *model_ext,
1871
+ const Imputer *imputer,
1872
+ const TreesIndexer *indexer,
1873
+ const char *optional_metadata,
1874
+ const size_t size_optional_metadata
1875
+ );
1876
+ ISOTREE_EXPORTED
1877
+ void serialize_combined
1878
+ (
1879
+ const char *serialized_model,
1880
+ const char *serialized_model_ext,
1881
+ const char *serialized_imputer,
1882
+ const char *serialized_indexer,
1883
+ const char *optional_metadata,
1884
+ const size_t size_optional_metadata,
1885
+ FILE *out
1886
+ );
1887
+ ISOTREE_EXPORTED
1888
+ void serialize_combined
1889
+ (
1890
+ const char *serialized_model,
1891
+ const char *serialized_model_ext,
1892
+ const char *serialized_imputer,
1893
+ const char *serialized_indexer,
1894
+ const char *optional_metadata,
1895
+ const size_t size_optional_metadata,
1896
+ std::ostream &out
1897
+ );
1898
+ ISOTREE_EXPORTED
1899
+ std::string serialize_combined
1900
+ (
1901
+ const char *serialized_model,
1902
+ const char *serialized_model_ext,
1903
+ const char *serialized_imputer,
1904
+ const char *serialized_indexer,
1905
+ const char *optional_metadata,
1906
+ const size_t size_optional_metadata
1907
+ );
1908
+ ISOTREE_EXPORTED
1909
+ void deserialize_combined
1910
+ (
1911
+ const char* in,
1912
+ IsoForest *model,
1913
+ ExtIsoForest *model_ext,
1914
+ Imputer *imputer,
1915
+ TreesIndexer *indexer,
1916
+ char *optional_metadata
1917
+ );
1918
+ ISOTREE_EXPORTED
1919
+ void deserialize_combined
1920
+ (
1921
+ FILE* in,
1922
+ IsoForest *model,
1923
+ ExtIsoForest *model_ext,
1924
+ Imputer *imputer,
1925
+ TreesIndexer *indexer,
1926
+ char *optional_metadata
1927
+ );
1928
+ ISOTREE_EXPORTED
1929
+ void deserialize_combined
1930
+ (
1931
+ std::istream &in,
1932
+ IsoForest *model,
1933
+ ExtIsoForest *model_ext,
1934
+ Imputer *imputer,
1935
+ TreesIndexer *indexer,
1936
+ char *optional_metadata
1937
+ );
1938
+ ISOTREE_EXPORTED
1939
+ void deserialize_combined
1940
+ (
1941
+ const std::string &in,
1942
+ IsoForest *model,
1943
+ ExtIsoForest *model_ext,
1944
+ Imputer *imputer,
1945
+ TreesIndexer *indexer,
1946
+ char *optional_metadata
1947
+ );
1948
+
1949
+
1950
+ /* Serialize additional trees into previous serialized bytes
1951
+ *
1952
+ * Parameters
1953
+ * ==========
1954
+ * - model (in)
1955
+ * A model object to re-serialize, which had already been serialized into
1956
+ * 'serialized_bytes' with fewer trees than it currently has, and then
1957
+ * additional trees added through functions such as 'add_tree' or 'merge_models'.
1958
+ * - serialized_bytes (in) / old_bytes (out)
1959
+ * Serialized version of 'model', which had previously been produced with
1960
+ * fewer trees than it currently has and then additional trees added through
1961
+ * functions such as 'add_tree' or 'merge_models'.
1962
+ * Must have been produced in a setup with the same characteristics (e.g. width
1963
+ * of 'int' and 'size_t', endianness, etc.).
1964
+ * - old_ntrees
1965
+ * Number of trees which were serialized from 'model' into 'serialized_bytes'
1966
+ * before. Trees that come after this index are assumed to be the additional
1967
+ * trees to serialize.
1968
+ *
1969
+ * Returns
1970
+ * =======
1971
+ * - For functions 'check_can_undergo_incremental_serialization', whether the serialized
1972
+ * object can be incrementally serialized.
1973
+ * - For functions 'determine_serialized_size_additional_trees', additional size (in addition
1974
+ * to current size) that the new serialized objects will have if they undergo incremental
1975
+ * serialization.
1976
+ */
1977
+ ISOTREE_EXPORTED
1978
+ bool check_can_undergo_incremental_serialization(const IsoForest &model, const char *serialized_bytes);
1979
+ ISOTREE_EXPORTED
1980
+ bool check_can_undergo_incremental_serialization(const ExtIsoForest &model, const char *serialized_bytes);
1981
+ ISOTREE_EXPORTED
1982
+ size_t determine_serialized_size_additional_trees(const IsoForest &model, size_t old_ntrees);
1983
+ ISOTREE_EXPORTED
1984
+ size_t determine_serialized_size_additional_trees(const ExtIsoForest &model, size_t old_ntrees);
1985
+ ISOTREE_EXPORTED
1986
+ size_t determine_serialized_size_additional_trees(const Imputer &model, size_t old_ntrees);
1987
+ ISOTREE_EXPORTED
1988
+ size_t determine_serialized_size_additional_trees(const TreesIndexer &model, size_t old_ntrees);
1989
+ ISOTREE_EXPORTED
1990
+ void incremental_serialize_IsoForest(const IsoForest &model, char *old_bytes_reallocated);
1991
+ ISOTREE_EXPORTED
1992
+ void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, char *old_bytes_reallocated);
1993
+ ISOTREE_EXPORTED
1994
+ void incremental_serialize_Imputer(const Imputer &model, char *old_bytes_reallocated);
1995
+ ISOTREE_EXPORTED
1996
+ void incremental_serialize_Indexer(const TreesIndexer &model, char *old_bytes_reallocated);
1997
+ ISOTREE_EXPORTED
1998
+ void incremental_serialize_IsoForest(const IsoForest &model, std::string &old_bytes);
1999
+ ISOTREE_EXPORTED
2000
+ void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, std::string &old_bytes);
2001
+ ISOTREE_EXPORTED
2002
+ void incremental_serialize_Imputer(const Imputer &model, std::string &old_bytes);
2003
+ ISOTREE_EXPORTED
2004
+ void incremental_serialize_Indexer(const TreesIndexer &model, std::string &old_bytes);
2005
+
2006
+
2007
+ /* Translate isolation forest model into a single SQL select statement
2008
+ *
2009
+ * Parameters
2010
+ * ==========
2011
+ * - model_outputs
2012
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
2013
+ * if the predictions are to be made from an extended model. Can only pass one of
2014
+ * 'model_outputs' and 'model_outputs_ext'.
2015
+ * - model_outputs_ext
2016
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
2017
+ * if the predictions are to be made from a single-variable model. Can only pass one of
2018
+ * 'model_outputs' and 'model_outputs_ext'.
2019
+ * - table_from
2020
+ * Table name from where the columns used in the model will be selected.
2021
+ * - select_as
2022
+ * Alias to give to the outlier score in the select statement.
2023
+ * - numeric_colnames
2024
+ * Names to use for the numerical columns.
2025
+ * - categ_colnames
2026
+ * Names to use for the categorical columns.
2027
+ * - categ_levels
2028
+ * Names to use for the levels/categories of each categorical column. These will be enclosed
2029
+ * in single quotes.
2030
+ * - index1
2031
+ * Whether to make the node numbers start their numeration at 1 instead of 0 in the
2032
+ * resulting statement. If passing 'output_tree_num=false', this will only affect the
2033
+ * commented lines which act as delimiters. If passing 'output_tree_num=true', will also
2034
+ * affect the results (which will also start at 1).
2035
+ * - nthreads
2036
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
2037
+ * allocated, even if the thread does not end up being used. Ignored when not building with
2038
+ * OpenMP support.
2039
+ *
2040
+ * Returns
2041
+ * =======
2042
+ * A string with the corresponding SQL statement that will calculate the outlier score
2043
+ * from the model.
2044
+ */
2045
+ ISOTREE_EXPORTED
2046
+ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
2047
+ std::string &table_from, std::string &select_as,
2048
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
2049
+ std::vector<std::vector<std::string>> &categ_levels,
2050
+ bool index1, int nthreads);
2051
+
2052
+
2053
+ /* Translate model trees into SQL select statements
2054
+ *
2055
+ * Parameters
2056
+ * ==========
2057
+ * - model_outputs
2058
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
2059
+ * if the predictions are to be made from an extended model. Can only pass one of
2060
+ * 'model_outputs' and 'model_outputs_ext'.
2061
+ * - model_outputs_ext
2062
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
2063
+ * if the predictions are to be made from a single-variable model. Can only pass one of
2064
+ * 'model_outputs' and 'model_outputs_ext'.
2065
+ * - numeric_colnames
2066
+ * Names to use for the numerical columns.
2067
+ * - categ_colnames
2068
+ * Names to use for the categorical columns.
2069
+ * - categ_levels
2070
+ * Names to use for the levels/categories of each categorical column. These will be enclosed
2071
+ * in single quotes.
2072
+ * - output_tree_num
2073
+ * Whether to output the terminal node number instead of the separation depth at each node.
2074
+ * - index1
2075
+ * Whether to make the node numbers start their numeration at 1 instead of 0 in the
2076
+ * resulting statement. If passing 'output_tree_num=false', this will only affect the
2077
+ * commented lines which act as delimiters. If passing 'output_tree_num=true', will also
2078
+ * affect the results (which will also start at 1).
2079
+ * - single_tree
2080
+ * Whether to generate the select statement for a single tree of the model instead of for
2081
+ * all. The tree number to generate is to be passed under 'tree_num'.
2082
+ * - tree_num
2083
+ * Tree number for which to generate an SQL select statement, if passing 'single_tree=true'.
2084
+ * - nthreads
2085
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
2086
+ * allocated, even if the thread does not end up being used. Ignored when not building with
2087
+ * OpenMP support.
2088
+ *
2089
+ * Returns
2090
+ * =======
2091
+ * A vector containing at each element the SQL statement for the corresponding tree in the model.
2092
+ * If passing 'single_tree=true', will contain only one element, corresponding to the tree given
2093
+ * in 'tree_num'. The statements will be node-by-node, with commented-out separators using '---'
2094
+ * as delimiters and including the node number as part of the comment.
2095
+ */
2096
+ ISOTREE_EXPORTED
2097
+ std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
2098
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
2099
+ std::vector<std::vector<std::string>> &categ_levels,
2100
+ bool output_tree_num, bool index1, bool single_tree, size_t tree_num,
2101
+ int nthreads);
2102
+
2103
+
2104
+ ISOTREE_EXPORTED
2105
+ void set_reference_points(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext, TreesIndexer *indexer,
2106
+ const bool with_distances,
2107
+ real_t *numeric_data, int *categ_data,
2108
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
2109
+ real_t *Xc, sparse_ix *Xc_ind, sparse_ix *Xc_indptr,
2110
+ real_t *Xr, sparse_ix *Xr_ind, sparse_ix *Xr_indptr,
2111
+ size_t nrows, int nthreads);