isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
|
@@ -0,0 +1,2111 @@
|
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
|
2
|
+
* of categorical variables and missing values.
|
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
|
4
|
+
*
|
|
5
|
+
* This library is based on the following works:
|
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
7
|
+
* "Isolation forest."
|
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
10
|
+
* "Isolation-based anomaly detection."
|
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
|
13
|
+
* "Extended Isolation Forest."
|
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
|
+
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
41
|
+
*
|
|
42
|
+
* BSD 2-Clause License
|
|
43
|
+
* Copyright (c) 2019-2021, David Cortes
|
|
44
|
+
* All rights reserved.
|
|
45
|
+
* Redistribution and use in source and binary forms, with or without
|
|
46
|
+
* modification, are permitted provided that the following conditions are met:
|
|
47
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
|
48
|
+
* list of conditions and the following disclaimer.
|
|
49
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
50
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
51
|
+
* and/or other materials provided with the distribution.
|
|
52
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
53
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
54
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
55
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
56
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
57
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
58
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
59
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
60
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
61
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
62
|
+
*/
|
|
63
|
+
|
|
64
|
+
/* Standard headers */
|
|
65
|
+
#include <cstddef>
|
|
66
|
+
#include <cstdint>
|
|
67
|
+
#include <vector>
|
|
68
|
+
using std::size_t;
|
|
69
|
+
|
|
70
|
+
/* The library has overloaded functions supporting different input types.
|
|
71
|
+
Note that, while 'float' type is supported, it will
|
|
72
|
+
be slower to fit models to them as the models internally use
|
|
73
|
+
'double' and 'long double', and it's not recommended to use.
|
|
74
|
+
|
|
75
|
+
In order to use the library with different types than the ones
|
|
76
|
+
suggested here, add something like this before including the
|
|
77
|
+
library header:
|
|
78
|
+
#define real_t float
|
|
79
|
+
#define sparse_ix int
|
|
80
|
+
#include "isotree.hpp"
|
|
81
|
+
The header may be included multiple times if required. */
|
|
82
|
+
#ifndef real_t
|
|
83
|
+
#define real_t double /* supported: float, double */
|
|
84
|
+
#endif
|
|
85
|
+
#ifndef sparse_ix
|
|
86
|
+
#define sparse_ix int /* supported: int, int64_t, size_t */
|
|
87
|
+
#endif
|
|
88
|
+
|
|
89
|
+
#ifndef ISOTREE_H
|
|
90
|
+
#define ISOTREE_H
|
|
91
|
+
|
|
92
|
+
#ifdef _WIN32
|
|
93
|
+
#define ISOTREE_EXPORTED __declspec(dllimport)
|
|
94
|
+
#else
|
|
95
|
+
#define ISOTREE_EXPORTED
|
|
96
|
+
#endif
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
/* Types used throughout the package - zero is the suggested value (when appropriate) */
|
|
100
|
+
typedef enum NewCategAction {Weighted=0, Smallest=11, Random=12} NewCategAction; /* Weighted means Impute in the extended model */
|
|
101
|
+
typedef enum MissingAction {Divide=21, Impute=22, Fail=0} MissingAction; /* Divide is only for non-extended model */
|
|
102
|
+
typedef enum ColType {Numeric=31, Categorical=32, NotUsed=0} ColType;
|
|
103
|
+
typedef enum CategSplit {SubSet=0, SingleCateg=41} CategSplit;
|
|
104
|
+
typedef enum CoefType {Uniform=61, Normal=0} CoefType; /* For extended model */
|
|
105
|
+
typedef enum UseDepthImp {Lower=71, Higher=0, Same=72} UseDepthImp; /* For NA imputation */
|
|
106
|
+
typedef enum WeighImpRows {Inverse=0, Prop=81, Flat=82} WeighImpRows; /* For NA imputation */
|
|
107
|
+
typedef enum ScoringMetric {Depth=0, Density=92, BoxedDensity=94, BoxedDensity2=96, BoxedRatio=95,
|
|
108
|
+
AdjDepth=91, AdjDensity=93} ScoringMetric;
|
|
109
|
+
|
|
110
|
+
/* Notes about new categorical action:
|
|
111
|
+
* - For single-variable case, if using 'Smallest', can then pass data at prediction time
|
|
112
|
+
* having categories that were never in the training data (as an integer higher than 'ncat'
|
|
113
|
+
* for that column), but if using 'Random' or 'Weighted', these must be passed as NA (int < 0)
|
|
114
|
+
* - For extended case, 'Weighted' becomes a weighted imputation instead, and if using either
|
|
115
|
+
* 'Weighted' or 'Smallest', can pass newer, unseen categories at prediction time too.
|
|
116
|
+
* - If using 'Random', cannot pass new categories at prediction time.
|
|
117
|
+
* - If using 'Weighted' for single-variable case, cannot predict similarity with a value
|
|
118
|
+
* for MissingAction other than 'Divide'. */
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
/* Structs that are output (modified) from the main function */
|
|
123
|
+
typedef struct IsoTree {
|
|
124
|
+
ColType col_type = NotUsed;
|
|
125
|
+
size_t col_num;
|
|
126
|
+
double num_split;
|
|
127
|
+
std::vector<char> cat_split;
|
|
128
|
+
int chosen_cat;
|
|
129
|
+
size_t tree_left;
|
|
130
|
+
size_t tree_right;
|
|
131
|
+
double pct_tree_left;
|
|
132
|
+
double score; /* will not be integer when there are weights or early stop */
|
|
133
|
+
double range_low = -HUGE_VAL;
|
|
134
|
+
double range_high = HUGE_VAL;
|
|
135
|
+
double remainder; /* only used for distance/similarity */
|
|
136
|
+
|
|
137
|
+
IsoTree() = default;
|
|
138
|
+
|
|
139
|
+
} IsoTree;
|
|
140
|
+
|
|
141
|
+
typedef struct IsoHPlane {
|
|
142
|
+
std::vector<size_t> col_num;
|
|
143
|
+
std::vector<ColType> col_type;
|
|
144
|
+
std::vector<double> coef;
|
|
145
|
+
std::vector<double> mean;
|
|
146
|
+
std::vector<std::vector<double>> cat_coef;
|
|
147
|
+
std::vector<int> chosen_cat;
|
|
148
|
+
std::vector<double> fill_val;
|
|
149
|
+
std::vector<double> fill_new;
|
|
150
|
+
|
|
151
|
+
double split_point;
|
|
152
|
+
size_t hplane_left;
|
|
153
|
+
size_t hplane_right;
|
|
154
|
+
double score; /* will not be integer when there are weights or early stop */
|
|
155
|
+
double range_low = -HUGE_VAL;
|
|
156
|
+
double range_high = HUGE_VAL;
|
|
157
|
+
double remainder; /* only used for distance/similarity */
|
|
158
|
+
|
|
159
|
+
IsoHPlane() = default;
|
|
160
|
+
} IsoHPlane;
|
|
161
|
+
|
|
162
|
+
typedef struct IsoForest {
|
|
163
|
+
std::vector< std::vector<IsoTree> > trees;
|
|
164
|
+
NewCategAction new_cat_action;
|
|
165
|
+
CategSplit cat_split_type;
|
|
166
|
+
MissingAction missing_action;
|
|
167
|
+
double exp_avg_depth;
|
|
168
|
+
double exp_avg_sep;
|
|
169
|
+
size_t orig_sample_size;
|
|
170
|
+
bool has_range_penalty;
|
|
171
|
+
IsoForest() = default;
|
|
172
|
+
} IsoForest;
|
|
173
|
+
|
|
174
|
+
typedef struct ExtIsoForest {
|
|
175
|
+
std::vector< std::vector<IsoHPlane> > hplanes;
|
|
176
|
+
NewCategAction new_cat_action;
|
|
177
|
+
CategSplit cat_split_type;
|
|
178
|
+
MissingAction missing_action;
|
|
179
|
+
double exp_avg_depth;
|
|
180
|
+
double exp_avg_sep;
|
|
181
|
+
size_t orig_sample_size;
|
|
182
|
+
bool has_range_penalty;
|
|
183
|
+
ExtIsoForest() = default;
|
|
184
|
+
} ExtIsoForest;
|
|
185
|
+
|
|
186
|
+
typedef struct ImputeNode {
|
|
187
|
+
std::vector<double> num_sum;
|
|
188
|
+
std::vector<double> num_weight;
|
|
189
|
+
std::vector<std::vector<double>> cat_sum;
|
|
190
|
+
std::vector<double> cat_weight;
|
|
191
|
+
size_t parent;
|
|
192
|
+
ImputeNode() = default;
|
|
193
|
+
} ImputeNode; /* this is for each tree node */
|
|
194
|
+
|
|
195
|
+
typedef struct Imputer {
|
|
196
|
+
size_t ncols_numeric;
|
|
197
|
+
size_t ncols_categ;
|
|
198
|
+
std::vector<int> ncat;
|
|
199
|
+
std::vector<std::vector<ImputeNode>> imputer_tree;
|
|
200
|
+
std::vector<double> col_means;
|
|
201
|
+
std::vector<int> col_modes;
|
|
202
|
+
Imputer() = default;
|
|
203
|
+
} Imputer;
|
|
204
|
+
|
|
205
|
+
typedef struct SingleTreeIndex {
|
|
206
|
+
std::vector<size_t> terminal_node_mappings;
|
|
207
|
+
std::vector<double> node_distances;
|
|
208
|
+
std::vector<double> node_depths;
|
|
209
|
+
std::vector<size_t> reference_points;
|
|
210
|
+
std::vector<size_t> reference_indptr;
|
|
211
|
+
std::vector<size_t> reference_mapping;
|
|
212
|
+
size_t n_terminal;
|
|
213
|
+
} TreeNodeIndex;
|
|
214
|
+
|
|
215
|
+
typedef struct TreesIndexer {
|
|
216
|
+
std::vector<SingleTreeIndex> indices;
|
|
217
|
+
TreesIndexer() = default;
|
|
218
|
+
} TreesIndexer;
|
|
219
|
+
|
|
220
|
+
#endif /* ISOTREE_H */
|
|
221
|
+
|
|
222
|
+
/* Fit Isolation Forest model, or variant of it such as SCiForest
|
|
223
|
+
*
|
|
224
|
+
* Parameters:
|
|
225
|
+
* ===========
|
|
226
|
+
* - model_outputs (out)
|
|
227
|
+
* Pointer to already allocated isolation forest model object for single-variable splits.
|
|
228
|
+
* If fitting the extended model, pass NULL (must pass 'model_outputs_ext'). Can later add
|
|
229
|
+
* additional trees through function 'add_tree'.
|
|
230
|
+
* - model_outputs_ext (out)
|
|
231
|
+
* Pointer to already allocated extended isolation forest model object (for multiple-variable splits).
|
|
232
|
+
* Note that if 'ndim' = 1, must use instead the single-variable model object.
|
|
233
|
+
* If fitting the single-variable model, pass NULL (must pass 'model_outputs'). Can later add
|
|
234
|
+
* additional trees through function 'add_tree'.
|
|
235
|
+
* - numeric_data[nrows * ncols_numeric]
|
|
236
|
+
* Pointer to numeric data to which to fit the model. Must be ordered by columns like Fortran,
|
|
237
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
|
238
|
+
* Pass NULL if there are no dense numeric columns (must also pass 'ncols_numeric' = 0 if there's
|
|
239
|
+
* no sparse numeric data either).
|
|
240
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
241
|
+
* - ncols_numeric
|
|
242
|
+
* Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
|
|
243
|
+
* - categ_data[nrows * ncols_categ]
|
|
244
|
+
* Pointer to categorical data to which to fit the model. Must be ordered by columns like Fortran,
|
|
245
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
|
246
|
+
* Pass NULL if there are no categorical columns (must also pass 'ncols_categ' = 0).
|
|
247
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
248
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must also be present
|
|
249
|
+
* (note that they are not treated as being ordinal, this is just an encoding). Missing values
|
|
250
|
+
* should be encoded as negative numbers such as (-1).
|
|
251
|
+
* - ncols_categ
|
|
252
|
+
* Number of categorical columns in the data.
|
|
253
|
+
* - ncat[ncols_categ]
|
|
254
|
+
* Number of categories in each categorical column. E.g. if the highest code for a column is '4',
|
|
255
|
+
* the number of categories for that column is '5' (zero is one category).
|
|
256
|
+
* - Xc[nnz]
|
|
257
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
|
258
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
259
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
260
|
+
* - Xc_ind[nnz]
|
|
261
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
|
262
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
263
|
+
* The largest value here should be smaller than the largest possible value of 'size_t'.
|
|
264
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
265
|
+
* - Xc_indptr[ncols_numeric + 1]
|
|
266
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
267
|
+
* start and at entry [col + 1] where does column 'col' end.
|
|
268
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
269
|
+
* - ndim
|
|
270
|
+
* How many dimensions (columns) to use for making a split. Must pass 'ndim' = 1 for
|
|
271
|
+
* the single-variable model. Note that the model object pointer passed must also
|
|
272
|
+
* agree with the value passed to 'ndim'.
|
|
273
|
+
* - ntry
|
|
274
|
+
* When using any of 'prob_pick_by_gain_pl', 'prob_pick_by_gain_avg', 'prob_pick_by_full_gain', 'prob_pick_by_dens', how many variables (with 'ndim=1')
|
|
275
|
+
* or linear combinations (with 'ndim>1') to try for determining the best one according to gain.
|
|
276
|
+
* Recommended value in reference [4] is 10 (with 'prob_pick_by_gain_avg', for outlier detection), while the
|
|
277
|
+
* recommended value in reference [11] is 1 (with 'prob_pick_by_gain_pl', for outlier detection), and the
|
|
278
|
+
* recommended value in reference [9] is 10 to 20 (with 'prob_pick_by_gain_pl', for missing value imputations).
|
|
279
|
+
* - coef_type
|
|
280
|
+
* For the extended model, whether to sample random coefficients according to a normal distribution ~ N(0, 1)
|
|
281
|
+
* (as proposed in [4]) or according to a uniform distribution ~ Unif(-1, +1) as proposed in [3]. Ignored for the
|
|
282
|
+
* single-variable model.
|
|
283
|
+
* - sample_weights[nrows]
|
|
284
|
+
* Weights for the rows when building a tree, either as sampling importances when using
|
|
285
|
+
* sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
|
|
286
|
+
* in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
|
|
287
|
+
* the row appeared twice, thus it's less of an outlier) - how this is taken is determined
|
|
288
|
+
* through parameter 'weight_as_sample'.
|
|
289
|
+
* Pass NULL if the rows all have uniform weights.
|
|
290
|
+
* - with_replacement
|
|
291
|
+
* Whether to sample rows with replacement or not (not recommended). Note that distance calculations,
|
|
292
|
+
* if desired, don't work well with duplicate rows.
|
|
293
|
+
* - weight_as_sample
|
|
294
|
+
* If passing sample (row) weights when fitting the model, whether to consider those weights as row
|
|
295
|
+
* sampling weights (i.e. the higher the weights, the more likely the observation will end up included
|
|
296
|
+
* in each tree sub-sample), or as distribution density weights (i.e. putting a weight of two is the same
|
|
297
|
+
* as if the row appeared twice, thus higher weight makes it less of an outlier, but does not give it a
|
|
298
|
+
* higher chance of being sampled if the data uses sub-sampling).
|
|
299
|
+
* - nrows
|
|
300
|
+
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
|
301
|
+
* - sample_size
|
|
302
|
+
* Sample size of the data sub-samples with which each binary tree will be built. When a terminal node has more than
|
|
303
|
+
* 1 observation, the remaining isolation depth for them is estimated assuming the data and splits are both uniformly
|
|
304
|
+
* random (separation depth follows a similar process with expected value calculated as in [6]). If passing zero,
|
|
305
|
+
* will set it to 'nrows'. Recommended value in [1], [2], [3] is 256, while the default value in the author's code
|
|
306
|
+
* in [5] is 'nrows' here.
|
|
307
|
+
* - ntrees
|
|
308
|
+
* Number of binary trees to build for the model. Recommended value in [1] is 100, while the default value in the
|
|
309
|
+
* author's code in [5] is 10.
|
|
310
|
+
* - max_depth
|
|
311
|
+
* Maximum depth of the binary trees to grow. Will get overwritten if passing 'limit_depth' = 'true'.
|
|
312
|
+
* Models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
|
|
313
|
+
* deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
|
|
314
|
+
* predictions.
|
|
315
|
+
* Note that models that use 'prob_pick_by_gain_pl' or 'prob_pick_by_gain_avg' are likely to benefit from
|
|
316
|
+
* deeper trees (larger 'max_depth'), but deeper trees can result in much slower model fitting and
|
|
317
|
+
* predictions.
|
|
318
|
+
* If using pooled gain, one might want to substitute 'max_depth' with 'min_gain'.
|
|
319
|
+
* - ncols_per_tree
|
|
320
|
+
* Number of columns to use (have as potential candidates for splitting at each iteration) in each tree,
|
|
321
|
+
* similar to the 'mtry' parameter of random forests.
|
|
322
|
+
* In general, this is only relevant when using non-random splits and/or weighted column choices.
|
|
323
|
+
* If passing zero, will use the full number of available columns.
|
|
324
|
+
* Recommended value: 0.
|
|
325
|
+
* - limit_depth
|
|
326
|
+
* Whether to automatically set the maximum depth to the corresponding depth of a balanced binary tree with number of
|
|
327
|
+
* terminal nodes corresponding to the sub-sample size (the reason being that, if trying to detect outliers, an outlier
|
|
328
|
+
* will only be so if it turns out to be isolated with shorter average depth than usual, which corresponds to a balanced
|
|
329
|
+
* tree depth). Default setting for [1], [2], [3], [4] is 'true', but it's recommended to pass 'false' here
|
|
330
|
+
* and higher values for 'max_depth' if using the model for purposes other than outlier detection.
|
|
331
|
+
* Note that, if passing 'limit_depth=true', then 'max_depth' is ignored.
|
|
332
|
+
* - penalize_range
|
|
333
|
+
* Whether to penalize (add -1 to the terminal depth) observations at prediction time that have a value
|
|
334
|
+
* of the chosen split variable (linear combination in extended model) that falls outside of a pre-determined
|
|
335
|
+
* reasonable range in the data being split (given by 2 * range in data and centered around the split point),
|
|
336
|
+
* as proposed in [4] and implemented in the authors' original code in [5]. Not used in single-variable model
|
|
337
|
+
* when splitting by categorical variables. Note that this can make a very large difference in the results
|
|
338
|
+
* when using 'prob_pick_by_gain_pl'.
|
|
339
|
+
* This option is not supported when using density-based outlier scoring metrics.
|
|
340
|
+
* - standardize_data
|
|
341
|
+
* Whether to standardize the features at each node before creating a linear combination of them as suggested
|
|
342
|
+
* in [4]. This is ignored when using 'ndim=1'.
|
|
343
|
+
* - scoring_metric
|
|
344
|
+
* Metric to use for determining outlier scores (see reference [13]).
|
|
345
|
+
* If passing 'Depth', will use isolation depth as proposed in reference [1]. This is typically the safest choice
|
|
346
|
+
* and plays well with all model types offered by this library.
|
|
347
|
+
* If passing 'Density', will set scores for each terminal node as the ratio between the fraction of points in the sub-sample
|
|
348
|
+
* that end up in that node and the fraction of the volume in the feature space which defines
|
|
349
|
+
* the node according to the splits that lead to it.
|
|
350
|
+
* If using 'ndim=1', for categorical variables, 'Density' is defined in terms
|
|
351
|
+
* of number of categories that go towards each side of the split divided by number of categories
|
|
352
|
+
* in the observations that reached that node.
|
|
353
|
+
* The standardized outlier score from 'Density' for a given observation is calculated as the
|
|
354
|
+
* negative of the logarithm of the geometric mean from the per-tree densities, which unlike
|
|
355
|
+
* the standardized score produced from 'Depth', is unbounded, but just like the standardized
|
|
356
|
+
* score from 'Depth', has a natural threshold for defining outlierness, which in this case
|
|
357
|
+
* is zero instead of 0.5. The non-standardized outlier score for 'Density' is calculated as the
|
|
358
|
+
* geometric mean, while the per-tree scores are calculated as the density values.
|
|
359
|
+
* 'Density' might lead to better predictions when using 'ndim=1', particularly in the presence
|
|
360
|
+
* of categorical variables. Note however that using 'Density' requires more trees for convergence
|
|
361
|
+
* of scores (i.e. good results) compared to isolation-based metrics.
|
|
362
|
+
* 'Density' is incompatible with 'penalize_range=true'.
|
|
363
|
+
* If passing 'AdjDepth', will use an adjusted isolation depth that takes into account the number of points that
|
|
364
|
+
* go to each side of a given split vs. the fraction of the range of that feature that each
|
|
365
|
+
* side of the split occupies, by a metric as follows: 'd = 2/ (1 + 1/(2*p))'
|
|
366
|
+
* where 'p' is defined as 'p = (n_s / n_t) / (r_s / r_t)'
|
|
367
|
+
* with 'n_t' being the number of points that reach a given node, 'n_s' the
|
|
368
|
+
* number of points that are sent to a given side of the split/branch at that node,
|
|
369
|
+
* 'r_t' being the range (maximum minus minimum) of the splitting feature or
|
|
370
|
+
* linear combination among the points that reached the node, and 'r_s' being the
|
|
371
|
+
* range of the same feature or linear combination among the points that are sent to this
|
|
372
|
+
* same side of the split/branch. This makes each split add a number between zero and two
|
|
373
|
+
* to the isolation depth, with this number's probabilistic distribution being centered
|
|
374
|
+
* around 1 and thus the expected isolation depth remaining the same as in the original
|
|
375
|
+
* 'Depth' metric, but having more variability around the extremes.
|
|
376
|
+
* Scores (standardized, non-standardized, per-tree) for 'AdjDepth' are aggregated in the same way
|
|
377
|
+
* as for 'Depth'.
|
|
378
|
+
* 'AdjDepth' might lead to better predictions when using 'ndim=1', particularly in the presence
|
|
379
|
+
* of categorical variables and for smaller datasets. For smaller datasets, it might also make
|
|
380
|
+
* sense to combine it with 'penalize_range=true'.
|
|
381
|
+
* If passing 'AdjDensity', will use the same metric from 'AdjDepth', but applied multiplicatively instead
|
|
382
|
+
* of additively. The expected value for 'AdjDensity' is not strictly the same
|
|
383
|
+
* as for isolation, but using the expected isolation depth as standardizing criterion
|
|
384
|
+
* tends to produce similar standardized score distributions (centered around 0.5).
|
|
385
|
+
* Scores (standardized, non-standardized, per-tree) from 'AdjDensity' are aggregated in the same way
|
|
386
|
+
* as for 'Depth'.
|
|
387
|
+
* 'AdjDensity' is incompatible with 'penalize_range=true'.
|
|
388
|
+
* If passing 'BoxedRatio', will set the scores for each terminal node as the ratio between the volume of the boxed
|
|
389
|
+
* feature space for the node as defined by the smallest and largest values from the split
|
|
390
|
+
* conditions for each column (bounded by the variable ranges in the sample) and the
|
|
391
|
+
* variable ranges in the tree sample.
|
|
392
|
+
* If using 'ndim=1', for categorical variables 'BoxedRatio' is defined in terms of number of categories.
|
|
393
|
+
* If using 'ndim>1', 'BoxedRatio' is defined in terms of the maximum achievable value for the
|
|
394
|
+
* splitting linear combination determined from the minimum and maximum values for each
|
|
395
|
+
* variable among the points in the sample, and as such, it has a rather different meaning
|
|
396
|
+
* compared to the score obtained with 'ndim=1' - 'BoxedRatio' scores with 'ndim>1'
|
|
397
|
+
* typically provide very poor quality results and this metric is thus not recommended to
|
|
398
|
+
* use in the extended model. With 'ndim>1', 'BoxedRatio' also has a tendency of producing too small
|
|
399
|
+
* values which round to zero.
|
|
400
|
+
* The standardized outlier score from 'BoxedRatio' for a given observation is calculated
|
|
401
|
+
* simply as the average from the per-tree boxed ratios. 'BoxedRatio' metric
|
|
402
|
+
* has a lower bound of zero and a theoretical upper bound of one, but in practice the scores
|
|
403
|
+
* tend to be very small numbers close to zero, and its distribution across
|
|
404
|
+
* different datasets is rather unpredictable. In order to keep rankings comparable with
|
|
405
|
+
* the rest of the metrics, the non-standardized outlier scores for 'BoxedRatio' are calculated as the
|
|
406
|
+
* negative of the average instead. The per-tree 'BoxedRatio' scores are calculated as the ratios.
|
|
407
|
+
* 'BoxedRatio' can be calculated in a fast-but-not-so-precise way, and in a slow-but-precise
|
|
408
|
+
* way, which is controlled by parameter 'fast_bratio'. Usually, both should give the
|
|
409
|
+
* same results, but in some datasets, the fast way can lead to numerical inaccuracies
|
|
410
|
+
* due to roundoffs very close to zero.
|
|
411
|
+
* 'BoxedRatio' might lead to better predictions in datasets with many rows when using 'ndim=1'
|
|
412
|
+
* and a relatively small 'sample_size'. Note that more trees are required for convergence
|
|
413
|
+
* of scores when using 'BoxedRatio'. In some datasets, 'BoxedRatio' metric might result in very bad
|
|
414
|
+
* predictions, to the point that taking its inverse produces a much better ranking of outliers.
|
|
415
|
+
* 'BoxedRatio' option is incompatible with 'penalize_range'.
|
|
416
|
+
* If passing 'BoxedDensity2', will set the score as the ratio between the fraction of points within the sample that
|
|
417
|
+
* end up in a given terminal node and the 'BoxedRatio' metric.
|
|
418
|
+
* Aggregation of scores (standardized, non-standardized, per-tree) for 'BoxedDensity2' is done in the same
|
|
419
|
+
* way as for 'Density', and it also has a natural threshold at zero for determining
|
|
420
|
+
* outliers and inliers.
|
|
421
|
+
* 'BoxedDensity2' is typically usable with 'ndim>1', but tends to produce much bigger values
|
|
422
|
+
* compared to 'ndim=1'.
|
|
423
|
+
* Albeit unintuitively, in many datasets, one can usually get better results with metric
|
|
424
|
+
* 'BoxedDensity' instead.
|
|
425
|
+
* The calculation of 'BoxedDensity2' is also controlled by 'fast_bratio'.
|
|
426
|
+
* 'BoxedDensity2' is incompatible with 'penalize_range'.
|
|
427
|
+
* If passing 'BoxedDensity', will set the score as the ratio between the fraction of points within the sample that
|
|
428
|
+
* end up in a given terminal node and the ratio between the boxed volume of the feature
|
|
429
|
+
* space in the sample and the boxed volume of a node given by the split conditions (inverse
|
|
430
|
+
* as in 'BoxedDensity2'). This metric does not have any theoretical or intuitive
|
|
431
|
+
* justification behind its existence, and it is perhaps illogical to use it as a
|
|
432
|
+
* scoring metric, but tends to produce good results in some datasets.
|
|
433
|
+
* The standardized outlier scores for 'BoxedDensity' are defined as the negative of the geometric mean,
|
|
434
|
+
* while the non-standardized scores are the geometric mean, and the per-tree scores are simply the 'density' values.
|
|
435
|
+
* The calculation of 'BoxedDensity' is also controlled by 'fast_bratio'.
|
|
436
|
+
* 'BoxedDensity' option is incompatible with 'penalize_range'.
|
|
437
|
+
* - fast_bratio
|
|
438
|
+
* When using "boxed" metrics for scoring, whether to calculate them in a fast way through
|
|
439
|
+
* cumulative sum of logarithms of ratios after each split, or in a slower way as sum of
|
|
440
|
+
* logarithms of a single ratio per column for each terminal node.
|
|
441
|
+
* Usually, both methods should give the same results, but in some datasets, particularly
|
|
442
|
+
* when variables have too small or too large ranges, the first method can be prone to
|
|
443
|
+
* numerical inaccuracies due to roundoff close to zero.
|
|
444
|
+
* Note that this does not affect calculations for models with 'ndim>1', since given the
|
|
445
|
+
* split types, the calculation for them is different.
|
|
446
|
+
* - standardize_dist
|
|
447
|
+
* If passing 'tmat' (see documentation for it), whether to standardize the resulting average separation
|
|
448
|
+
* depths in order to produce a distance metric or not, in the same way this is done for the outlier score.
|
|
449
|
+
* - tmat[nrows * (nrows - 1) / 2]
|
|
450
|
+
* Array in which to calculate average separation depths or standardized distance metric (see documentation
|
|
451
|
+
* for 'standardize_dist') as the model is being fit. Pass NULL to avoid doing these calculations alongside
|
|
452
|
+
* the regular model process. If passing this output argument, the sample size must be the same as the number
|
|
453
|
+
* of rows, and there cannot be sample weights. If not NULL, must already be initialized to zeros. As the
|
|
454
|
+
* output is a symmetric matrix, this function will only fill in the upper-triangular part, in which
|
|
455
|
+
* entry 0 <= i < j < n will be located at position
|
|
456
|
+
* p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
|
|
457
|
+
* Can be converted to a dense square matrix through function 'tmat_to_dense'.
|
|
458
|
+
* - output_depths[nrows]
|
|
459
|
+
* Array in which to calculate average path depths or standardized outlierness metric (see documentation
|
|
460
|
+
* for 'standardize_depth') as the model is being fit. Pass NULL to avoid doing these calculations alongside
|
|
461
|
+
* the regular model process. If passing this output argument, the sample size must be the same as the number
|
|
462
|
+
* of rows. If not NULL, must already be initialized to zeros.
|
|
463
|
+
* - standardize_depth
|
|
464
|
+
* If passing 'output_depths', whether to standardize the results as proposed in [1], in order to obtain
|
|
465
|
+
* a metric in which the more outlier is an observation, the closer this standardized metric will be to 1,
|
|
466
|
+
* with average observations obtaining 0.5. If passing 'false' here, the numbers in 'output_depths' will be
|
|
467
|
+
* the average depth of each row across all trees.
|
|
468
|
+
* - col_weights[ncols_numeric + ncols_categ]
|
|
469
|
+
* Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
|
|
470
|
+
* Ignored when picking columns by deterministic criterion.
|
|
471
|
+
* If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
|
|
472
|
+
* effect is multiplicative.
|
|
473
|
+
* - weigh_by_kurt
|
|
474
|
+
* Whether to weigh each column according to the kurtosis obtained in the sub-sample that is selected
|
|
475
|
+
* for each tree as briefly proposed in [1]. Note that this is only done at the beginning of each tree
|
|
476
|
+
* sample. For categorical columns, will calculate expected kurtosis if the column were converted to
|
|
477
|
+
* numerical by assigning to each category a random number ~ Unif(0, 1).
|
|
478
|
+
* This is intended as a cheap feature selector, while the parameter 'prob_pick_col_by_kurt'
|
|
479
|
+
* provides the option to do this at each node in the tree for a different overall type of model.
|
|
480
|
+
* If passing column weights or weighted column choices ('prob_pick_col_by_range', 'prob_pick_col_by_var'),
|
|
481
|
+
* the effect will be multiplicative. This option is not compatible with 'prob_pick_col_by_kurt'.
|
|
482
|
+
* If passing 'missing_action=fail' and the data has infinite values, columns with rows
|
|
483
|
+
* having infinite values will get a weight of zero. If passing a different value for missing
|
|
484
|
+
* action, infinite values will be ignored in the kurtosis calculation.
|
|
485
|
+
* If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
|
|
486
|
+
* in order not to favor columns with missing values (which would increase kurtosis by all having
|
|
487
|
+
* the same central value).
|
|
488
|
+
* - prob_pick_by_gain_pl
|
|
489
|
+
* This parameter indicates the probability of choosing the threshold on which to split a variable
|
|
490
|
+
* (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
|
|
491
|
+
* that maximizes a pooled standard deviation gain criterion (see references [9] and [11]) on the
|
|
492
|
+
* same variable or linear combination, similarly to regression trees such as CART.
|
|
493
|
+
* If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
|
|
494
|
+
* in which the largest standardized gain can be achieved.
|
|
495
|
+
* For categorical variables with 'ndim=1', will use shannon entropy instead (like in [7]).
|
|
496
|
+
* Compared to a simple averaged gain, this tends to result in more evenly-divided splits and more clustered
|
|
497
|
+
* groups when they are smaller. Recommended to pass higher values when used for imputation of missing values.
|
|
498
|
+
* When used for outlier detection, datasets with multimodal distributions usually see better performance
|
|
499
|
+
* under this type of splits.
|
|
500
|
+
* Note that, since this makes the trees more even and thus it takes more steps to produce isolated nodes,
|
|
501
|
+
* the resulting object will be heavier. When splits are not made according to any of 'prob_pick_by_gain_avg',
|
|
502
|
+
* 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
|
|
503
|
+
* Note that, if passing value 1 (100%) with no sub-sampling and using the single-variable model,
|
|
504
|
+
* every single tree will have the exact same splits.
|
|
505
|
+
* Be aware that 'penalize_range' can also have a large impact when using 'prob_pick_by_gain_pl'.
|
|
506
|
+
* Be aware also that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable
|
|
507
|
+
* model, every single tree will have the exact same splits.
|
|
508
|
+
* Under this option, models are likely to produce better results when increasing 'max_depth'.
|
|
509
|
+
* Alternatively, one can also control the depth through 'min_gain' (for which one might want to
|
|
510
|
+
* set 'max_depth=0').
|
|
511
|
+
* Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl', 'prob_pick_by_full_gain',
|
|
512
|
+
* 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
|
|
513
|
+
* - prob_pick_by_gain_avg
|
|
514
|
+
* This parameter indicates the probability of choosing the threshold on which to split a variable
|
|
515
|
+
* (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
|
|
516
|
+
* that maximizes an averaged standard deviation gain criterion (see references [4] and [11]) on the
|
|
517
|
+
* same variable or linear combination.
|
|
518
|
+
* If using 'ntry>1', will try several variables or linear combinations thereof and choose the one
|
|
519
|
+
* in which the largest standardized gain can be achieved.
|
|
520
|
+
* For categorical variables with 'ndim=1', will take the expected standard deviation that would be
|
|
521
|
+
* gotten if the column were converted to numerical by assigning to each category a random
|
|
522
|
+
* number ~ Unif(0, 1) and calculate gain with those assumed standard deviations.
|
|
523
|
+
* Compared to a pooled gain, this tends to result in more cases in which a single observation or very
|
|
524
|
+
* few of them are put into one branch. Typically, datasets with outliers defined by extreme values in
|
|
525
|
+
* some column more or less independently of the rest, usually see better performance under this type
|
|
526
|
+
* of split. Recommended to use sub-samples (parameter 'sample_size') when
|
|
527
|
+
* passing this parameter. Note that, since this will create isolated nodes faster, the resulting object
|
|
528
|
+
* will be lighter (use less memory).
|
|
529
|
+
* When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
|
|
530
|
+
* 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
|
|
531
|
+
* Default setting for [1], [2], [3] is zero, and default for [4] is 1.
|
|
532
|
+
* This is the randomization parameter that can be passed to the author's original code in [5],
|
|
533
|
+
* but note that the code in [5] suffers from a mathematical error in the calculation of running standard deviations,
|
|
534
|
+
* so the results from it might not match with this library's.
|
|
535
|
+
* Be aware that, if passing a value of 1 (100%) with no sub-sampling and using the single-variable model,
|
|
536
|
+
* every single tree will have the exact same splits.
|
|
537
|
+
* Under this option, models are likely to produce better results when increasing 'max_depth'.
|
|
538
|
+
* Important detail: if using any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
|
|
539
|
+
* 'prob_pick_by_full_gain', 'prob_pick_by_dens', the distribution of outlier scores is unlikely to be centered around 0.5.
|
|
540
|
+
* - prob_pick_by_full_gain
|
|
541
|
+
* This parameter indicates the probability of choosing the threshold on which to split a variable
|
|
542
|
+
* (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
|
|
543
|
+
* that minimizes the pooled sums of variances of all columns (or a subset of them if using
|
|
544
|
+
* 'ncols_per_tree').
|
|
545
|
+
* In general, 'prob_pick_by_full_gain' is much slower to evaluate than the other gain types, and does not tend to
|
|
546
|
+
* lead to better results. When using 'prob_pick_by_full_gain', one might want to use a different scoring
|
|
547
|
+
* metric (particularly 'Density', 'BoxedDensity2' or 'BoxedRatio'). Note that
|
|
548
|
+
* the variance calculations are all done through the (exact) sorted-indices approach, which is much
|
|
549
|
+
* slower than the (approximate) histogram approach used by other decision tree software.
|
|
550
|
+
* Be aware that the data is not standardized in any way for the variance calculations, thus the scales
|
|
551
|
+
* of features will make a large difference under 'prob_pick_by_full_gain', which might not make it suitable for
|
|
552
|
+
* all types of data.
|
|
553
|
+
* 'prob_pick_by_full_gain' is not compatible with categorical data, and 'min_gain' does not apply to it.
|
|
554
|
+
* When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
|
|
555
|
+
* 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
|
|
556
|
+
* Default setting for [1], [2], [3], [4] is zero.
|
|
557
|
+
* - prob_pick_by_dens
|
|
558
|
+
* This parameter indicates the probability of choosing the threshold on which to split a variable
|
|
559
|
+
* (with 'ndim=1') or a linear combination of variables (when using 'ndim>1') as the threshold
|
|
560
|
+
* that maximizes the pooled densities of the branch distributions.
|
|
561
|
+
* The 'min_gain' option does not apply to this type of splits.
|
|
562
|
+
* When splits are not made according to any of 'prob_pick_by_gain_avg', 'prob_pick_by_gain_pl',
|
|
563
|
+
* 'prob_pick_by_full_gain', 'prob_pick_by_dens', both the column and the split point are decided at random.
|
|
564
|
+
* Default setting for [1], [2], [3], [4] is zero.
|
|
565
|
+
* - prob_pick_col_by_range
|
|
566
|
+
* When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
|
|
567
|
+
* proportional to the range spanned by each column within a node as proposed in reference [12].
|
|
568
|
+
* When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
|
|
569
|
+
* probability proportional to the range spanned by each column within a node.
|
|
570
|
+
* This option is not compatible with categorical data. If passing column weights, the
|
|
571
|
+
* effect will be multiplicative.
|
|
572
|
+
* Be aware that the data is not standardized in any way for the range calculations, thus the scales
|
|
573
|
+
* of features will make a large difference under this option, which might not make it suitable for
|
|
574
|
+
* all types of data.
|
|
575
|
+
* Note that the proposed RRCF model from [12] uses a different scoring metric for producing anomaly
|
|
576
|
+
* scores, while this library uses isolation depth regardless of how columns are chosen, thus results
|
|
577
|
+
* are likely to be different from those of other software implementations. Nevertheless, as explored
|
|
578
|
+
* in [11], isolation depth as a scoring metric typically provides better results than the
|
|
579
|
+
* "co-displacement" metric from [12] under these split types.
|
|
580
|
+
* - prob_pick_col_by_var
|
|
581
|
+
* When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
|
|
582
|
+
* proportional to the variance of each column within a node.
|
|
583
|
+
* When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
|
|
584
|
+
* probability proportional to the variance of each column within a node.
|
|
585
|
+
* For categorical data, it will calculate the expected variance if the column were converted to
|
|
586
|
+
* numerical by assigning to each category a random number ~ Unif(0, 1), which depending on the number of
|
|
587
|
+
* categories and their distribution, produces numbers typically a bit smaller than standardized numerical
|
|
588
|
+
* variables.
|
|
589
|
+
* Note that when using sparse matrices, the calculation of variance will rely on a procedure that
|
|
590
|
+
* uses sums of squares, which has less numerical precision than the
|
|
591
|
+
* calculation used for dense inputs, and as such, the results might differ slightly.
|
|
592
|
+
* Be aware that this calculated variance is not standardized in any way, so the scales of
|
|
593
|
+
* features will make a large difference under this option.
|
|
594
|
+
* If there are infinite values, all columns having infinite values will be treated as having the
|
|
595
|
+
* same weight, and will be chosen before every other column with non-infinite values.
|
|
596
|
+
* If passing column weights, the effect will be multiplicative.
|
|
597
|
+
* If passing a 'missing_action' different than 'fail', infinite values will be ignored for the
|
|
598
|
+
* variance calculation. Otherwise, all columns with infinite values will have the same probability
|
|
599
|
+
* and will be chosen before columns with non-infinite values.
|
|
600
|
+
* - prob_pick_col_by_kurt
|
|
601
|
+
* When using 'ndim=1', this denotes the probability of choosing the column to split with a probability
|
|
602
|
+
* proportional to the kurtosis of each column **within a node** (unlike the option 'weigh_by_kurt'
|
|
603
|
+
* which calculates this metric only at the root).
|
|
604
|
+
* When using 'ndim>1', this denotes the probability of choosing columns to create a hyperplane with a
|
|
605
|
+
* probability proportional to the kurtosis of each column within a node.
|
|
606
|
+
* For categorical data, it will calculate the expected kurtosis if the column were converted to
|
|
607
|
+
* numerical by assigning to each category a random number ~ Unif(0, 1).
|
|
608
|
+
* Note that when using sparse matrices, the calculation of kurtosis will rely on a procedure that
|
|
609
|
+
* uses sums of squares and higher-power numbers, which has less numerical precision than the
|
|
610
|
+
* calculation used for dense inputs, and as such, the results might differ slightly.
|
|
611
|
+
* If passing column weights, the effect will be multiplicative. This option is not compatible
|
|
612
|
+
* with 'weigh_by_kurt'.
|
|
613
|
+
* If passing a 'missing_action' different than 'fail', infinite values will be ignored for the
|
|
614
|
+
* kurtosis calculation. Otherwise, all columns with infinite values will have the same probability
|
|
615
|
+
* and will be chosen before columns with non-infinite values.
|
|
616
|
+
* If using 'missing_action=Impute', the calculation of kurtosis will not use imputed values
|
|
617
|
+
* in order not to favor columns with missing values (which would increase kurtosis by all having
|
|
618
|
+
* the same central value).
|
|
619
|
+
* Be aware that kurtosis can be a rather slow metric to calculate.
|
|
620
|
+
* - min_gain
|
|
621
|
+
* Minimum gain that a split threshold needs to produce in order to proceed with a split.
|
|
622
|
+
* Only used when the splits are decided by a variance gain criterion ('prob_pick_by_gain_pl' or
|
|
623
|
+
* 'prob_pick_by_gain_avg', but not 'prob_pick_by_full_gain' nor 'prob_pick_by_dens').
|
|
624
|
+
* If the highest possible gain in the evaluated splits at a node is below this threshold,
|
|
625
|
+
* that node becomes a terminal node.
|
|
626
|
+
* This can be used as a more sophisticated depth control when using pooled gain (note that 'max_depth'
|
|
627
|
+
* still applies on top of this heuristic).
|
|
628
|
+
* - missing_action
|
|
629
|
+
* How to handle missing data at both fitting and prediction time. Options are a) 'Divide' (for the single-variable
|
|
630
|
+
* model only, recommended), which will follow both branches and combine the result with the weight given by the fraction of
|
|
631
|
+
* the data that went to each branch when fitting the model, b) 'Impute', which will assign observations to the
|
|
632
|
+
* branch with the most observations in the single-variable model (but imputed values will also be used for
|
|
633
|
+
* gain calculations), or fill in missing values with the median of each column of the sample from which the
|
|
634
|
+
* split was made in the extended model (recommended) (but note that the calculation of medians does not take
|
|
635
|
+
* into account sample weights when using 'weights_as_sample_prob=false', and note that when using a gain
|
|
636
|
+
* criterion for splits with 'ndim=1', it will use the imputed values in the calculation), c) 'Fail' which will
|
|
637
|
+
* assume that there are no missing values and will trigger undefined behavior if it encounters any.
|
|
638
|
+
* In the extended model, infinite values will be treated as missing.
|
|
639
|
+
* Note that passing 'Fail' might crash the process if there turn out to be missing values, but will otherwise
|
|
640
|
+
* produce faster fitting and prediction times along with decreased model object sizes.
|
|
641
|
+
* Models from [1], [2], [3], [4] correspond to 'Fail' here.
|
|
642
|
+
* - cat_split_type
|
|
643
|
+
* Whether to split categorical features by assigning sub-sets of them to each branch, or by assigning
|
|
644
|
+
* a single category to a branch and the rest to the other branch. For the extended model, whether to
|
|
645
|
+
* give each category a coefficient, or only one while the rest get zero.
|
|
646
|
+
* - new_cat_action
|
|
647
|
+
* What to do after splitting a categorical feature when new data that reaches that split has categories that
|
|
648
|
+
* the sub-sample from which the split was done did not have. Options are a) "Weighted" (recommended), which
|
|
649
|
+
* in the single-variable model will follow both branches and combine the result with weight given by the fraction of the
|
|
650
|
+
* data that went to each branch when fitting the model, and in the extended model will assign
|
|
651
|
+
* them the median value for that column that was added to the linear combination of features (but note that
|
|
652
|
+
* this median calculation does not use sample weights when using 'weights_as_sample_prob=false'),
|
|
653
|
+
* b) "Smallest", which will assign all observations with unseen categories in the split to the branch that
|
|
654
|
+
* had fewer observations when fitting the model, c) "Random", which will assign a branch (coefficient in the
|
|
655
|
+
* extended model) at random for each category beforehand, even if no observations had that category when
|
|
656
|
+
* fitting the model. Ignored when passing 'cat_split_type' = 'SingleCateg'.
|
|
657
|
+
* - all_perm
|
|
658
|
+
* When doing categorical variable splits by pooled gain with 'ndim=1' (regular model),
|
|
659
|
+
* whether to consider all possible permutations of variables to assign to each branch or not. If 'false',
|
|
660
|
+
* will sort the categories by their frequency and make a grouping in this sorted order. Note that the
|
|
661
|
+
* number of combinations evaluated (if 'true') is the factorial of the number of present categories in
|
|
662
|
+
* a given column (minus 2). For averaged gain, the best split is always to put the second most-frequent
|
|
663
|
+
* category in a separate branch, so not evaluating all permutations (passing 'false') will make it
|
|
664
|
+
* possible to select other splits that respect the sorted frequency order.
|
|
665
|
+
* The total number of combinations must be a number that can fit into a 'size_t' variable - for x86-64
|
|
666
|
+
* systems, this means no column can have more than 20 different categories if using 'all_perm=true',
|
|
667
|
+
* but note that this is not checked within the function.
|
|
668
|
+
* Ignored when not using categorical variables or not doing splits by pooled gain or using 'ndim>1'.
|
|
669
|
+
* - coef_by_prop
|
|
670
|
+
* In the extended model, whether to sort the randomly-generated coefficients for categories
|
|
671
|
+
* according to their relative frequency in the tree node. This might provide better results when using
|
|
672
|
+
* categorical variables with too many categories, but is not recommended, and not reflective of
|
|
673
|
+
* real "categorical-ness". Ignored for the regular model ('ndim=1') and/or when not using categorical
|
|
674
|
+
* variables.
|
|
675
|
+
* - imputer (out)
|
|
676
|
+
* Pointer to already-allocated imputer object, which can be used to produce missing value imputations
|
|
677
|
+
* in new data. Pass NULL if no missing value imputations are required. Note that this is not related to
|
|
678
|
+
* 'missing_action' as missing values inside the model are treated differently and follow their own imputation
|
|
679
|
+
* or division strategy.
|
|
680
|
+
* - min_imp_obs
|
|
681
|
+
* Minimum number of observations with which an imputation value can be produced. Ignored if passing
|
|
682
|
+
* NULL for 'imputer'.
|
|
683
|
+
* - depth_imp
|
|
684
|
+
* How to weight observations according to their depth when used for imputing missing values. Passing
|
|
685
|
+
* "Higher" will weigh observations higher the further down the tree (away from the root node) the
|
|
686
|
+
* terminal node is, while "Lower" will do the opposite, and "Same" will not modify the weights according
|
|
687
|
+
* to node depth in the tree. Implemented for testing purposes and not recommended to change
|
|
688
|
+
* from the default. Ignored when not passing 'imputer'.
|
|
689
|
+
* - weigh_imp_rows
|
|
690
|
+
* How to weight node sizes when used for imputing missing values. Passing "Inverse" will weigh
|
|
691
|
+
* a node inversely proportional to the number of observations that end up there, while "Proportional"
|
|
692
|
+
* will weight them heavier the more observations there are, and "Flat" will weigh all nodes the same
|
|
693
|
+
* in this regard regardless of how many observations end up there. Implemented for testing purposes
|
|
694
|
+
* and not recommended to change from the default. Ignored when not passing 'imputer'.
|
|
695
|
+
* - impute_at_fit
|
|
696
|
+
* Whether to impute missing values in the input data as the model is being built. If passing 'true',
|
|
697
|
+
* then 'sample_size' must be equal to 'nrows'. Values in the arrays passed to 'numeric_data',
|
|
698
|
+
* 'categ_data', and 'Xc', will get overwritten with the imputations produced.
|
|
699
|
+
* - random_seed
|
|
700
|
+
* Seed that will be used to generate random numbers used by the model.
|
|
701
|
+
* - use_long_double
|
|
702
|
+
* Whether to use 'long double' (extended precision) type for more precise calculations about
|
|
703
|
+
* standard deviations, means, ratios, weights, gain, and other potential aggregates. This makes
|
|
704
|
+
* such calculations accurate to a larger number of decimals (provided that the compiler used has
|
|
705
|
+
* wider long doubles than doubles) and it is highly recommended to use when the input data has
|
|
706
|
+
* a number of rows or columns exceeding 2^53 (an unlikely scenario), and also highly recommended
|
|
707
|
+
* to use when the input data has problematic scales (e.g. numbers that differ from each other by
|
|
708
|
+
* something like 10^-100 or columns that include values like 10^100 and 10^-100 and still need to
|
|
709
|
+
* be sensitive to a difference of 10^-100), but will make the calculations slower, the more so in
|
|
710
|
+
* platforms in which 'long double' is a software-emulated type (e.g. Power8 platforms).
|
|
711
|
+
* Note that some platforms (most notably windows with the msvc compiler) do not make any difference
|
|
712
|
+
* between 'double' and 'long double'.
|
|
713
|
+
* - nthreads
|
|
714
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
715
|
+
* allocated, even if the thread does not end up being used.
|
|
716
|
+
* Be aware that most of the operations are bound by memory bandwidth, which means that
|
|
717
|
+
* adding more threads will not result in a linear speed-up. For some types of data
|
|
718
|
+
* (e.g. large sparse matrices with small sample sizes), adding more threads might result
|
|
719
|
+
* in only a very modest speed up (e.g. 1.5x faster with 4x more threads),
|
|
720
|
+
* even if all threads look fully utilized.
|
|
721
|
+
* Ignored when not building with OpenMP support.
|
|
722
|
+
*
|
|
723
|
+
* Returns
|
|
724
|
+
* =======
|
|
725
|
+
* Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
|
|
726
|
+
* If the process receives an interrupt signal, will return instead
|
|
727
|
+
* 'EXIT_FAILURE' (typically =1). If you do not have any way of determining
|
|
728
|
+
* what these values correspond to, you can use the functions
|
|
729
|
+
* 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
|
|
730
|
+
* as integers.
|
|
731
|
+
*/
|
|
732
|
+
ISOTREE_EXPORTED
|
|
733
|
+
int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
734
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
735
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
736
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
737
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
738
|
+
real_t sample_weights[], bool with_replacement, bool weight_as_sample,
|
|
739
|
+
size_t nrows, size_t sample_size, size_t ntrees,
|
|
740
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
741
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
742
|
+
ScoringMetric scoring_metric, bool fast_bratio,
|
|
743
|
+
bool standardize_dist, double tmat[],
|
|
744
|
+
double output_depths[], bool standardize_depth,
|
|
745
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
746
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
747
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
748
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
749
|
+
double prob_pick_col_by_kurt,
|
|
750
|
+
double min_gain, MissingAction missing_action,
|
|
751
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
752
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
753
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
|
754
|
+
uint64_t random_seed, bool use_long_double, int nthreads);
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
/* Add additional trees to already-fitted isolation forest model
|
|
759
|
+
*
|
|
760
|
+
* Parameters
|
|
761
|
+
* ==========
|
|
762
|
+
* - model_outputs
|
|
763
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
764
|
+
* if the trees are to be added to an extended model. Can only pass one of
|
|
765
|
+
* 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
|
|
766
|
+
* so it cannot be run in parallel for the same model object.
|
|
767
|
+
* - model_outputs_ext
|
|
768
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
769
|
+
* if the trees are to be added to a single-variable model. Can only pass one of
|
|
770
|
+
* 'model_outputs' and 'model_outputs_ext'. Note that this function is not thread-safe,
|
|
771
|
+
* so it cannot be run in parallel for the same model object.
|
|
772
|
+
* - numeric_data[nrows * ncols_numeric]
|
|
773
|
+
* Pointer to numeric data to which to fit this additional tree. Must be ordered by columns like Fortran,
|
|
774
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
|
775
|
+
* Pass NULL if there are no dense numeric columns.
|
|
776
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
777
|
+
* If the model from 'fit_iforest' was fit to numeric data, must pass numeric data with the same number
|
|
778
|
+
* of columns, either as dense or as sparse arrays.
|
|
779
|
+
* - ncols_numeric
|
|
780
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
781
|
+
* what was originally passed to 'fit_iforest'.
|
|
782
|
+
* - categ_data[nrows * ncols_categ]
|
|
783
|
+
* Pointer to categorical data to which to fit this additional tree. Must be ordered by columns like Fortran,
|
|
784
|
+
* not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.).
|
|
785
|
+
* Pass NULL if there are no categorical columns. The encoding must be the same as was used
|
|
786
|
+
* in the data to which the model was fit.
|
|
787
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
788
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
|
789
|
+
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
|
790
|
+
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
|
791
|
+
* must be the same as was used in the data to which the model was fit.
|
|
792
|
+
* If the model from 'fit_iforest' was fit to categorical data, must pass categorical data with the same number
|
|
793
|
+
* of columns and the same category encoding.
|
|
794
|
+
* - ncols_categ
|
|
795
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
796
|
+
* what was originally passed to 'fit_iforest'.
|
|
797
|
+
* - ncat[ncols_categ]
|
|
798
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). May contain new categories,
|
|
799
|
+
* but should keep the same encodings that were used for previous categories.
|
|
800
|
+
* - Xc[nnz]
|
|
801
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
|
802
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
803
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
804
|
+
* - Xc_ind[nnz]
|
|
805
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
|
806
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
807
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
808
|
+
* - Xc_indptr[ncols_numeric + 1]
|
|
809
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
810
|
+
* start and at entry [col + 1] where does column 'col' end.
|
|
811
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
812
|
+
* - ndim
|
|
813
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
814
|
+
* what was originally passed to 'fit_iforest'.
|
|
815
|
+
* - ntry
|
|
816
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
817
|
+
* what was originally passed to 'fit_iforest'.
|
|
818
|
+
* - coef_type
|
|
819
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
820
|
+
* what was originally passed to 'fit_iforest'.
|
|
821
|
+
* - sample_weights
|
|
822
|
+
* Weights for the rows when adding this tree, either as sampling importances when using
|
|
823
|
+
* sub-samples for each tree (i.e. passing weight '2' makes a row twice as likely to be included
|
|
824
|
+
* in a random sub-sample), or as density measurement (i.e. passing weight '2' is the same as if
|
|
825
|
+
* the row appeared twice, thus it's less of an outlier) - how this is taken is determined
|
|
826
|
+
* through parameter 'weight_as_sample' that was passed to 'fit_iforest'.
|
|
827
|
+
* Pass NULL if the rows all have uniform weights.
|
|
828
|
+
* - nrows
|
|
829
|
+
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
|
830
|
+
* - max_depth
|
|
831
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
832
|
+
* what was originally passed to 'fit_iforest'.
|
|
833
|
+
* - ncols_per_tree
|
|
834
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
835
|
+
* what was originally passed to 'fit_iforest'.
|
|
836
|
+
* - limit_depth
|
|
837
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
838
|
+
* what was originally passed to 'fit_iforest'.
|
|
839
|
+
* - penalize_range
|
|
840
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
841
|
+
* what was originally passed to 'fit_iforest'.
|
|
842
|
+
* - standardize_data
|
|
843
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
844
|
+
* what was originally passed to 'fit_iforest'.
|
|
845
|
+
* - fast_bratio
|
|
846
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
847
|
+
* what was originally passed to 'fit_iforest'.
|
|
848
|
+
* - col_weights
|
|
849
|
+
* Sampling weights for each column, assuming all the numeric columns come before the categorical columns.
|
|
850
|
+
* Ignored when picking columns by deterministic criterion.
|
|
851
|
+
* If passing NULL, each column will have a uniform weight. If used along with kurtosis weights, the
|
|
852
|
+
* effect is multiplicative.
|
|
853
|
+
* - weigh_by_kurt
|
|
854
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
855
|
+
* what was originally passed to 'fit_iforest'.
|
|
856
|
+
* - prob_pick_by_gain_pl
|
|
857
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
858
|
+
* what was originally passed to 'fit_iforest'.
|
|
859
|
+
* - prob_pick_by_gain_avg
|
|
860
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
861
|
+
* what was originally passed to 'fit_iforest'.
|
|
862
|
+
* - prob_pick_by_full_gain
|
|
863
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
864
|
+
* what was originally passed to 'fit_iforest'.
|
|
865
|
+
* - prob_pick_by_dens
|
|
866
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
867
|
+
* what was originally passed to 'fit_iforest'.
|
|
868
|
+
* - prob_pick_col_by_range
|
|
869
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
870
|
+
* what was originally passed to 'fit_iforest'.
|
|
871
|
+
* - prob_pick_col_by_var
|
|
872
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
873
|
+
* what was originally passed to 'fit_iforest'.
|
|
874
|
+
* - prob_pick_col_by_kurt
|
|
875
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
876
|
+
* what was originally passed to 'fit_iforest'.
|
|
877
|
+
* - min_gain
|
|
878
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
879
|
+
* what was originally passed to 'fit_iforest'.
|
|
880
|
+
* - missing_action
|
|
881
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
882
|
+
* what was originally passed to 'fit_iforest'.
|
|
883
|
+
* - cat_split_type
|
|
884
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
885
|
+
* what was originally passed to 'fit_iforest'.
|
|
886
|
+
* - new_cat_action
|
|
887
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
888
|
+
* what was originally passed to 'fit_iforest'.
|
|
889
|
+
* - depth_imp
|
|
890
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
891
|
+
* what was originally passed to 'fit_iforest'.
|
|
892
|
+
* - weigh_imp_rows
|
|
893
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Cannot be changed from
|
|
894
|
+
* what was originally passed to 'fit_iforest'.
|
|
895
|
+
* - all_perm
|
|
896
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
897
|
+
* what was originally passed to 'fit_iforest'.
|
|
898
|
+
* - coef_by_prop
|
|
899
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
900
|
+
* what was originally passed to 'fit_iforest'.
|
|
901
|
+
* - imputer
|
|
902
|
+
* Pointer to already-allocated imputer object, as it was output from function 'fit_iforest' while
|
|
903
|
+
* producing either 'model_outputs' or 'model_outputs_ext'.
|
|
904
|
+
* Pass NULL if the model was built without imputer.
|
|
905
|
+
* - min_imp_obs
|
|
906
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
907
|
+
* what was originally passed to 'fit_iforest'.
|
|
908
|
+
* - indexer
|
|
909
|
+
* Indexer object associated to the model object ('model_outputs' or 'model_outputs_ext'), which will
|
|
910
|
+
* be updated with the new tree to add.
|
|
911
|
+
* If 'indexer' has reference points, these must be passed again here in order to index them.
|
|
912
|
+
* Pass NULL if the model has no associated indexer.
|
|
913
|
+
* - ref_numeric_data[nref * ncols_numeric]
|
|
914
|
+
* Pointer to numeric data for reference points. May be ordered by rows
|
|
915
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
916
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
917
|
+
* (see parameter 'ref_is_col_major').
|
|
918
|
+
* Pass NULL if there are no dense numeric columns or no reference points.
|
|
919
|
+
* Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
|
|
920
|
+
* If 'indexer' is passed, it has reference points, and the data to which the model was fit had
|
|
921
|
+
* numeric columns, then numeric data for reference points must be passed (in either dense or sparse format).
|
|
922
|
+
* - ref_categ_data[nref * ncols_categ]
|
|
923
|
+
* Pointer to categorical data for reference points. May be ordered by rows
|
|
924
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
925
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
926
|
+
* (see parameter 'ref_is_col_major').
|
|
927
|
+
* Pass NULL if there are no categorical columns or no reference points.
|
|
928
|
+
* If 'indexer' is passed, it has reference points, and the data to which the model was fit had
|
|
929
|
+
* categorical columns, then 'ref_categ_data' must be passed.
|
|
930
|
+
* - ref_is_col_major
|
|
931
|
+
* Whether 'ref_numeric_data' and/or 'ref_categ_data' are in column-major order. If numeric data is
|
|
932
|
+
* passed in sparse format, categorical data must be passed in column-major format. If passing dense
|
|
933
|
+
* data, row-major format is preferred as it will be faster. If the data is passed in row-major format,
|
|
934
|
+
* must also pass 'ref_ld_numeric' and/or 'ref_ld_categ'.
|
|
935
|
+
* If both 'ref_numeric_data' and 'ref_categ_data' are passed, they must have the same orientation
|
|
936
|
+
* (row-major or column-major).
|
|
937
|
+
* - ref_ld_numeric
|
|
938
|
+
* Leading dimension of the array 'ref_numeric_data', if it is passed in row-major format.
|
|
939
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
940
|
+
* be accessed assuming that row 'n' starts at 'ref_numeric_data + n*ref_ld_numeric'). If passing
|
|
941
|
+
* 'ref_numeric_data' in column-major order, this is ignored and will be assumed that the
|
|
942
|
+
* leading dimension corresponds to the number of rows. This is ignored when passing numeric
|
|
943
|
+
* data in sparse format.
|
|
944
|
+
* - ref_ld_categ
|
|
945
|
+
* Leading dimension of the array 'ref_categ_data', if it is passed in row-major format.
|
|
946
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
947
|
+
* be accessed assuming that row 'n' starts at 'ref_categ_data + n*ref_ld_categ'). If passing
|
|
948
|
+
* 'ref_categ_data' in column-major order, this is ignored and will be assumed that the
|
|
949
|
+
* leading dimension corresponds to the number of rows.
|
|
950
|
+
* - ref_Xc[ref_nnz]
|
|
951
|
+
* Pointer to numeric data for reference points in sparse numeric matrix in CSC format (column-compressed).
|
|
952
|
+
* Pass NULL if there are no sparse numeric columns for reference points or no reference points.
|
|
953
|
+
* Can only pass one of 'ref_numeric_data' or 'ref_Xc' + 'ref_Xc_ind' + 'ref_Xc_indptr'.
|
|
954
|
+
* - ref_Xc_ind[ref_nnz]
|
|
955
|
+
* Pointer to row indices to which each non-zero entry in 'ref_Xc' corresponds.
|
|
956
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
957
|
+
* Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
|
|
958
|
+
* - ref_Xc_indptr[ncols_numeric + 1]
|
|
959
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
960
|
+
* start and at entry [col + 1] where does column 'col' end.
|
|
961
|
+
* Pass NULL if there are no sparse numeric columns in CSC format for reference points or no reference points.
|
|
962
|
+
* - random_seed
|
|
963
|
+
* Seed that will be used to generate random numbers used by the model.
|
|
964
|
+
* - use_long_double
|
|
965
|
+
* Same parameter as for 'fit_iforest' (see the documentation in there for details). Can be changed from
|
|
966
|
+
* what was originally passed to 'fit_iforest'.
|
|
967
|
+
*/
|
|
968
|
+
ISOTREE_EXPORTED
|
|
969
|
+
int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
970
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
971
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
972
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
973
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
974
|
+
real_t sample_weights[], size_t nrows,
|
|
975
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
976
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
977
|
+
bool fast_bratio,
|
|
978
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
979
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
980
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
981
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
982
|
+
double prob_pick_col_by_kurt,
|
|
983
|
+
double min_gain, MissingAction missing_action,
|
|
984
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
985
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
|
986
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
987
|
+
TreesIndexer *indexer,
|
|
988
|
+
real_t ref_numeric_data[], int ref_categ_data[],
|
|
989
|
+
bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
|
|
990
|
+
real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
|
|
991
|
+
uint64_t random_seed, bool use_long_double);
|
|
992
|
+
|
|
993
|
+
|
|
994
|
+
/* Predict outlier score, average depth, or terminal node numbers
|
|
995
|
+
*
|
|
996
|
+
* Parameters
|
|
997
|
+
* ==========
|
|
998
|
+
* - numeric_data[nrows * ncols_numeric]
|
|
999
|
+
* Pointer to numeric data for which to make predictions. May be ordered by rows
|
|
1000
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
1001
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
1002
|
+
* (see parameter 'is_col_major').
|
|
1003
|
+
* Pass NULL if there are no dense numeric columns.
|
|
1004
|
+
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
1005
|
+
* - categ_data[nrows * ncols_categ]
|
|
1006
|
+
* Pointer to categorical data for which to make predictions. May be ordered by rows
|
|
1007
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
1008
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
1009
|
+
* (see parameter 'is_col_major').
|
|
1010
|
+
* Pass NULL if there are no categorical columns.
|
|
1011
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
1012
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
|
1013
|
+
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
|
1014
|
+
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
|
1015
|
+
* must be the same as was used in the data to which the model was fit.
|
|
1016
|
+
* - is_col_major
|
|
1017
|
+
* Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
|
|
1018
|
+
* model was fit. If passing 'false', will assume they are in row-major order. Note that most of
|
|
1019
|
+
* the functions in this library work only with column-major order, but here both are suitable
|
|
1020
|
+
* and row-major is preferred. Both arrays must have the same orientation (row/column major).
|
|
1021
|
+
* If there is numeric sparse data in combination with categorical dense data and there are many
|
|
1022
|
+
* rows, it is recommended to pass the categorical data in column major order, as it will take
|
|
1023
|
+
* a faster route.
|
|
1024
|
+
* If passing 'is_col_major=false', must also provide 'ld_numeric' and/or 'ld_categ'.
|
|
1025
|
+
* - ld_numeric
|
|
1026
|
+
* Leading dimension of the array 'numeric_data', if it is passed in row-major format.
|
|
1027
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
1028
|
+
* be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
|
|
1029
|
+
* 'numeric_data' in column-major order, this is ignored and will be assumed that the
|
|
1030
|
+
* leading dimension corresponds to the number of rows. This is ignored when passing numeric
|
|
1031
|
+
* data in sparse format.
|
|
1032
|
+
* - ld_categ
|
|
1033
|
+
* Leading dimension of the array 'categ_data', if it is passed in row-major format.
|
|
1034
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
1035
|
+
* be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
|
|
1036
|
+
* 'categ_data' in column-major order, this is ignored and will be assumed that the
|
|
1037
|
+
* leading dimension corresponds to the number of rows.
|
|
1038
|
+
* - Xc[nnz]
|
|
1039
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
|
1040
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
1041
|
+
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
1042
|
+
* - Xc_ind[nnz]
|
|
1043
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
|
1044
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
1045
|
+
* Pass NULL if there are no sparse numeric columns in CSC format.
|
|
1046
|
+
* - Xc_indptr[ncols_numeric + 1]
|
|
1047
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
1048
|
+
* start and at entry [col + 1] where does column 'col' end.
|
|
1049
|
+
* Pass NULL if there are no sparse numeric columns in CSC format.
|
|
1050
|
+
* - Xr[nnz]
|
|
1051
|
+
* Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
|
|
1052
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
1053
|
+
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
1054
|
+
* - Xr_ind[nnz]
|
|
1055
|
+
* Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
|
|
1056
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
1057
|
+
* Pass NULL if there are no sparse numeric columns in CSR format.
|
|
1058
|
+
* - Xr_indptr[nrows + 1]
|
|
1059
|
+
* Pointer to row index pointers that tell at entry [row] where does row 'row'
|
|
1060
|
+
* start and at entry [row + 1] where does row 'row' end.
|
|
1061
|
+
* Pass NULL if there are no sparse numeric columns in CSR format.
|
|
1062
|
+
* - nrows
|
|
1063
|
+
* Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
|
|
1064
|
+
* - nthreads
|
|
1065
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
1066
|
+
* allocated, even if the thread does not end up being used. Ignored when not building with
|
|
1067
|
+
* OpenMP support.
|
|
1068
|
+
* - standardize
|
|
1069
|
+
* Whether to standardize the average depths for each row according to their relative magnitude
|
|
1070
|
+
* compared to the expected average, in order to obtain an outlier score. If passing 'false',
|
|
1071
|
+
* will output the average depth instead.
|
|
1072
|
+
* Ignored when not passing 'output_depths'.
|
|
1073
|
+
* - model_outputs
|
|
1074
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
1075
|
+
* if the predictions are to be made from an extended model. Can only pass one of
|
|
1076
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
1077
|
+
* - model_outputs_ext
|
|
1078
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
1079
|
+
* if the predictions are to be made from a single-variable model. Can only pass one of
|
|
1080
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
1081
|
+
* - output_depths[nrows] (out)
|
|
1082
|
+
* Pointer to array where the output average depths or outlier scores will be written into
|
|
1083
|
+
* (the return type is controlled according to parameter 'standardize').
|
|
1084
|
+
* Should always be passed when calling this function (it is not optional).
|
|
1085
|
+
* - tree_num[nrows * ntrees] (out)
|
|
1086
|
+
* Pointer to array where the output terminal node numbers will be written into.
|
|
1087
|
+
* Note that the mapping between tree node and terminal tree node is not stored in
|
|
1088
|
+
* the model object for efficiency reasons, so this mapping will be determined on-the-fly
|
|
1089
|
+
* when passing this parameter, and as such, there will be some overhead regardless of
|
|
1090
|
+
* the actual number of rows. Output will be in column-major order ([nrows, ntrees]).
|
|
1091
|
+
* This will not be calculable when using 'ndim==1' alongside with either
|
|
1092
|
+
* 'missing_action==Divide' or 'new_cat_action==Weighted'.
|
|
1093
|
+
* Pass NULL if this type of output is not needed.
|
|
1094
|
+
* - per_tree_depths[nrows * ntrees] (out)
|
|
1095
|
+
* Pointer to array where to output per-tree depths or expected depths for each row.
|
|
1096
|
+
* Note that these will not include range penalties ('penalize_range=true').
|
|
1097
|
+
* Output will be in row-major order ([nrows, ntrees]).
|
|
1098
|
+
* This will not be calculable when using 'ndim==1' alongside with either
|
|
1099
|
+
* 'missing_action==Divide' or 'new_cat_action==Weighted'.
|
|
1100
|
+
* Pass NULL if this type of output is not needed.
|
|
1101
|
+
* - indexer
|
|
1102
|
+
* Pointer to associated tree indexer for the model being used, if it was constructed,
|
|
1103
|
+
* which can be used to speed up tree numbers/indices predictions.
|
|
1104
|
+
* This is ignored when not passing 'tree_num'.
|
|
1105
|
+
* Pass NULL if the indexer has not been constructed.
|
|
1106
|
+
*/
|
|
1107
|
+
ISOTREE_EXPORTED
|
|
1108
|
+
void predict_iforest(real_t numeric_data[], int categ_data[],
|
|
1109
|
+
bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
1110
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
1111
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
|
1112
|
+
size_t nrows, int nthreads, bool standardize,
|
|
1113
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1114
|
+
double output_depths[], sparse_ix tree_num[],
|
|
1115
|
+
double per_tree_depths[],
|
|
1116
|
+
TreesIndexer *indexer);
|
|
1117
|
+
|
|
1118
|
+
|
|
1119
|
+
|
|
1120
|
+
/* Get the number of nodes present in a given model, per tree
|
|
1121
|
+
*
|
|
1122
|
+
* Parameters
|
|
1123
|
+
* ==========
|
|
1124
|
+
* - model_outputs
|
|
1125
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'.
|
|
1126
|
+
* - model_outputs_ext
|
|
1127
|
+
* Pointer to fitted extended model object from function 'fit_iforest'.
|
|
1128
|
+
* - n_nodes[ntrees] (out)
|
|
1129
|
+
* Number of nodes in each tree of the model, including non-terminal nodes.
|
|
1130
|
+
* - n_terminal[ntrees] (out)
|
|
1131
|
+
* Number of terminal nodes in each tree of the model.
|
|
1132
|
+
* - nthreads
|
|
1133
|
+
* Number of parallel threads to use.
|
|
1134
|
+
*/
|
|
1135
|
+
ISOTREE_EXPORTED void get_num_nodes(IsoForest &model_outputs, sparse_ix *n_nodes, sparse_ix *n_terminal, int nthreads) noexcept;
|
|
1136
|
+
ISOTREE_EXPORTED void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *n_nodes, sparse_ix *n_terminal, int nthreads) noexcept;
|
|
1137
|
+
|
|
1138
|
+
|
|
1139
|
+
|
|
1140
|
+
/* Calculate distance or similarity or kernel/proximity between data points
|
|
1141
|
+
*
|
|
1142
|
+
* Parameters
|
|
1143
|
+
* ==========
|
|
1144
|
+
* - numeric_data[nrows * ncols_numeric]
|
|
1145
|
+
* Pointer to numeric data for which to make calculations. If not using 'indexer', must be
|
|
1146
|
+
* ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
|
|
1147
|
+
* column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
|
|
1148
|
+
* row-major or column-major format (with row-major being faster).
|
|
1149
|
+
* If categorical data is passed, must be in the same storage order (row-major / column-major)
|
|
1150
|
+
* as numerical data (whether dense or sparse).
|
|
1151
|
+
* The column order must be the same as in the data that was used to fit the model.
|
|
1152
|
+
* If making calculations between two sets of observations/rows (see documentation for 'rmat'),
|
|
1153
|
+
* the first group is assumed to be the earlier rows here.
|
|
1154
|
+
* Pass NULL if there are no dense numeric columns.
|
|
1155
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
1156
|
+
* - categ_data[nrows * ncols_categ]
|
|
1157
|
+
* Pointer to categorical data for which to make calculations. If not using 'indexer', must be
|
|
1158
|
+
* ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
|
|
1159
|
+
* column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
|
|
1160
|
+
* row-major or column-major format (with row-major being faster).
|
|
1161
|
+
* If numerical data is passed, must be in the same storage order (row-major / column-major)
|
|
1162
|
+
* as categorical data (whether the numerical data is dense or sparse).
|
|
1163
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
1164
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
|
1165
|
+
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
|
1166
|
+
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
|
1167
|
+
* must be the same as was used in the data to which the model was fit.
|
|
1168
|
+
* Pass NULL if there are no categorical columns.
|
|
1169
|
+
* If making calculations between two sets of observations/rows (see documentation for 'rmat'),
|
|
1170
|
+
* the first group is assumed to be the earlier rows here.
|
|
1171
|
+
* - Xc[nnz]
|
|
1172
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed),
|
|
1173
|
+
* or optionally in CSR format (row-compressed) if using 'indexer' and passing 'is_col_major=false'
|
|
1174
|
+
* (not recommended as the calculations will be slower if sparse data is passed as CSR).
|
|
1175
|
+
* If categorical data is passed, must be in the same storage order (row-major or CSR / column-major or CSC)
|
|
1176
|
+
* as numerical data (whether dense or sparse).
|
|
1177
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
1178
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
1179
|
+
* - Xc_ind[nnz]
|
|
1180
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds
|
|
1181
|
+
* (column indices if 'Xc' is in CSR format).
|
|
1182
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
1183
|
+
* Pass NULL if there are no sparse numeric columns in CSC or CSR format.
|
|
1184
|
+
* - Xc_indptr[ncols_numeric + 1]
|
|
1185
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
1186
|
+
* start and at entry [col + 1] where does column 'col' end
|
|
1187
|
+
* (row index pointers if 'Xc' is passed in CSR format).
|
|
1188
|
+
* Pass NULL if there are no sparse numeric columns in CSC or CSR format.
|
|
1189
|
+
* If making calculations between two sets of observations/rows (see documentation for 'rmat'),
|
|
1190
|
+
* the first group is assumed to be the earlier rows here.
|
|
1191
|
+
* - nrows
|
|
1192
|
+
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
|
1193
|
+
* - use_long_double
|
|
1194
|
+
* Whether to use 'long double' (extended precision) type for the calculations. This makes them
|
|
1195
|
+
* more accurate (provided that the compiler used has wider long doubles than doubles), but
|
|
1196
|
+
* slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
|
|
1197
|
+
* Power8 platforms).
|
|
1198
|
+
* - nthreads
|
|
1199
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
1200
|
+
* allocated, even if the thread does not end up being used (with one exception being kernel calculations
|
|
1201
|
+
* with respect to reference points in an indexer). Ignored when not building with OpenMP support.
|
|
1202
|
+
* - assume_full_distr
|
|
1203
|
+
* Whether to assume that the fitted model represents a full population distribution (will use a
|
|
1204
|
+
* standardizing criterion assuming infinite sample, and the results of the similarity between two points
|
|
1205
|
+
* at prediction time will not depend on the presence of any third point that is similar to them, but will
|
|
1206
|
+
* differ more compared to the pairwise distances between points from which the model was fit). If passing
|
|
1207
|
+
* 'false', will calculate pairwise distances as if the new observations at prediction time were added to
|
|
1208
|
+
* the sample to which each tree was fit, which will make the distances between two points potentially vary
|
|
1209
|
+
* according to other newly introduced points.
|
|
1210
|
+
* This was added for experimentation purposes only and it's not recommended to pass 'false'.
|
|
1211
|
+
* Note that when calculating distances using 'indexer', there
|
|
1212
|
+
* might be slight discrepancies between the numbers produced with or without the indexer due to what
|
|
1213
|
+
* are considered "additional" observations in this calculation.
|
|
1214
|
+
* This is ignored when passing 'as_kernel=true'.
|
|
1215
|
+
* - standardize_dist
|
|
1216
|
+
* Whether to standardize the resulting average separation depths between rows according
|
|
1217
|
+
* to the expected average separation depth in a similar way as when predicting outlierness,
|
|
1218
|
+
* in order to obtain a standardized distance. If passing 'false', will output the average
|
|
1219
|
+
* separation depth instead.
|
|
1220
|
+
* If passing 'as_kernel=true', this indicates whether to output a fraction (if 'true') or
|
|
1221
|
+
* the raw number of matching trees (if 'false').
|
|
1222
|
+
* - as_kernel
|
|
1223
|
+
* Whether to calculate the "similarities" as isolation kernel or proximity matrix, which counts
|
|
1224
|
+
* the proportion of trees in which two observations end up in the same terminal node. This is
|
|
1225
|
+
* typically much faster than separation-based distance, but is typically not as good quality.
|
|
1226
|
+
* Note that, for kernel calculations, the indexer is only used if it has reference points stored on it.
|
|
1227
|
+
* - model_outputs
|
|
1228
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
1229
|
+
* if the calculations are to be made from an extended model. Can only pass one of
|
|
1230
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
1231
|
+
* - model_outputs_ext
|
|
1232
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
1233
|
+
* if the calculations are to be made from a single-variable model. Can only pass one of
|
|
1234
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
1235
|
+
* - tmat[nrows * (nrows - 1) / 2] (out)
|
|
1236
|
+
* Pointer to array where the resulting pairwise distances or average separation depths or kernels will
|
|
1237
|
+
* be written into. As the output is a symmetric matrix, this function will only fill in the
|
|
1238
|
+
* upper-triangular part, in which entry 0 <= i < j < n will be located at position
|
|
1239
|
+
* p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
|
|
1240
|
+
* Can be converted to a dense square matrix through function 'tmat_to_dense'.
|
|
1241
|
+
* The array must already be initialized to zeros.
|
|
1242
|
+
* If calculating distance/separation from a group of points to another group of points,
|
|
1243
|
+
* pass NULL here and use 'rmat' instead.
|
|
1244
|
+
* - rmat[nrows1 * nrows2] (out)
|
|
1245
|
+
* Pointer to array where to write the distances or separation depths or kernels between each row in
|
|
1246
|
+
* one set of observations and each row in a different set of observations. If doing these
|
|
1247
|
+
* calculations for all pairs of observations/rows, pass 'tmat' instead.
|
|
1248
|
+
* Will take the first group of observations as the rows in this matrix, and the second
|
|
1249
|
+
* group as the columns. The groups are assumed to be in the same data arrays, with the
|
|
1250
|
+
* first group corresponding to the earlier rows there.
|
|
1251
|
+
* This matrix will be used in row-major order (i.e. entries 1..nrows2 contain the first row from nrows1).
|
|
1252
|
+
* Must be already initialized to zeros.
|
|
1253
|
+
* If passing 'use_indexed_references=true' plus an indexer object with reference points, this
|
|
1254
|
+
* array should have dimension [nrows, n_references].
|
|
1255
|
+
* Ignored when 'tmat' is passed.
|
|
1256
|
+
* - n_from
|
|
1257
|
+
* When calculating distances between two groups of points, this indicates the number of
|
|
1258
|
+
* observations/rows belonging to the first group (the rows in 'rmat'), which will be
|
|
1259
|
+
* assumed to be the first 'n_from' rows.
|
|
1260
|
+
* Ignored when 'tmat' is passed or when 'use_indexed_references=true' plus an indexer with
|
|
1261
|
+
* references are passed.
|
|
1262
|
+
* - use_indexed_references
|
|
1263
|
+
* Whether to calculate distances with respect to reference points stored in the indexer
|
|
1264
|
+
* object, if it has any. This is only supported with 'assume_full_distr=true' or with 'as_kernel=true'.
|
|
1265
|
+
* If passing 'use_indexed_references=true', then 'tmat' must be NULL, and 'rmat' must
|
|
1266
|
+
* be of dimension [nrows, n_references].
|
|
1267
|
+
* - indexer
|
|
1268
|
+
* Pointer to associated tree indexer for the model being used, if it was constructed,
|
|
1269
|
+
* which can be used to speed up distance calculations, assuming that it was built with
|
|
1270
|
+
* option 'with_distances=true'. If it does not contain node distances, it will not be used.
|
|
1271
|
+
* Pass NULL if the indexer has not been constructed or was constructed with 'with_distances=false'.
|
|
1272
|
+
* If it contains reference points and passing 'use_indexed_references=true', distances will be
|
|
1273
|
+
* calculated between the input data passed here and the reference points stored in this object.
|
|
1274
|
+
* If passing 'as_kernel=true', the indexer can only be used for calculating kernels with respect to
|
|
1275
|
+
* reference points in the indexer, otherwise it will not be used (which also means that the data must be
|
|
1276
|
+
* passed in column-major order for all kernel calculations that are not with respect to reference points
|
|
1277
|
+
* from an indexer).
|
|
1278
|
+
* - is_col_major
|
|
1279
|
+
* Whether the data comes in column-major order. If using 'indexer', predictions are also possible
|
|
1280
|
+
* (and are even faster for the case of dense-only data) if passing the data in row-major format.
|
|
1281
|
+
* Without 'indexer' (and with 'as_kernel=true' but without reference points in the indexer), data
|
|
1282
|
+
* may only be passed in column-major format.
|
|
1283
|
+
* If there is sparse numeric data, it is highly suggested to pass it in CSC/column-major format.
|
|
1284
|
+
* - ld_numeric
|
|
1285
|
+
* If passing 'is_col_major=false', this indicates the leading dimension of the array 'numeric_data'.
|
|
1286
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
1287
|
+
* be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
|
|
1288
|
+
* 'numeric_data' in column-major order, this is ignored and will be assumed that the
|
|
1289
|
+
* leading dimension corresponds to the number of rows. This is ignored when passing numeric
|
|
1290
|
+
* data in sparse format.
|
|
1291
|
+
* Note that data in row-major order is only accepted when using 'indexer'.
|
|
1292
|
+
* - ld_categ
|
|
1293
|
+
* If passing 'is_col_major=false', this indicates the leading dimension of the array 'categ_data'.
|
|
1294
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
1295
|
+
* be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
|
|
1296
|
+
* 'categ_data' in column-major order, this is ignored and will be assumed that the
|
|
1297
|
+
* leading dimension corresponds to the number of rows.
|
|
1298
|
+
* Note that data in row-major order is only accepted when using 'indexer'.
|
|
1299
|
+
*/
|
|
1300
|
+
ISOTREE_EXPORTED
|
|
1301
|
+
void calc_similarity(real_t numeric_data[], int categ_data[],
|
|
1302
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
1303
|
+
size_t nrows, bool use_long_double, int nthreads,
|
|
1304
|
+
bool assume_full_distr, bool standardize_dist, bool as_kernel,
|
|
1305
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1306
|
+
double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
|
|
1307
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ);
|
|
1308
|
+
|
|
1309
|
+
/* Impute missing values in new data
|
|
1310
|
+
*
|
|
1311
|
+
* Parameters
|
|
1312
|
+
* ==========
|
|
1313
|
+
* - numeric_data[nrows * ncols_numeric] (in, out)
|
|
1314
|
+
* Pointer to numeric data in which missing values will be imputed. May be ordered by rows
|
|
1315
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
1316
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
1317
|
+
* (see parameter 'is_col_major').
|
|
1318
|
+
* Pass NULL if there are no dense numeric columns.
|
|
1319
|
+
* Can only pass one of 'numeric_data' or 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
1320
|
+
* Imputations will overwrite values in this same array.
|
|
1321
|
+
* - categ_data[nrows * ncols_categ]
|
|
1322
|
+
* Pointer to categorical data in which missing values will be imputed. May be ordered by rows
|
|
1323
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
1324
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
1325
|
+
* (see parameter 'is_col_major').
|
|
1326
|
+
* Pass NULL if there are no categorical columns.
|
|
1327
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
1328
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
|
1329
|
+
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
|
1330
|
+
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
|
1331
|
+
* must be the same as was used in the data to which the model was fit.
|
|
1332
|
+
* Imputations will overwrite values in this same array.
|
|
1333
|
+
* - is_col_major
|
|
1334
|
+
* Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
|
|
1335
|
+
* model was fit. If passing 'false', will assume they are in row-major order. Note that most of
|
|
1336
|
+
* the functions in this library work only with column-major order, but here both are suitable
|
|
1337
|
+
* and row-major is preferred. Both arrays must have the same orientation (row/column major).
|
|
1338
|
+
* - ncols_categ
|
|
1339
|
+
* Number of categorical columns in the data.
|
|
1340
|
+
* - ncat[ncols_categ]
|
|
1341
|
+
* Number of categories in each categorical column. E.g. if the highest code for a column is '4',
|
|
1342
|
+
* the number of categories for that column is '5' (zero is one category).
|
|
1343
|
+
* Must be the same as was passed to 'fit_iforest'.
|
|
1344
|
+
* - Xr[nnz] (in, out)
|
|
1345
|
+
* Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
|
|
1346
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
1347
|
+
* Can only pass one of 'numeric_data' or 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
1348
|
+
* Imputations will overwrite values in this same array.
|
|
1349
|
+
* - Xr_ind[nnz]
|
|
1350
|
+
* Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
|
|
1351
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
1352
|
+
* Pass NULL if there are no sparse numeric columns in CSR format.
|
|
1353
|
+
* - Xr_indptr[nrows + 1]
|
|
1354
|
+
* Pointer to row index pointers that tell at entry [row] where does row 'row'
|
|
1355
|
+
* start and at entry [row + 1] where does row 'row' end.
|
|
1356
|
+
* Pass NULL if there are no sparse numeric columns in CSR format.
|
|
1357
|
+
* - nrows
|
|
1358
|
+
* Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
|
|
1359
|
+
* - use_long_double
|
|
1360
|
+
* Whether to use 'long double' (extended precision) type for the calculations. This makes them
|
|
1361
|
+
* more accurate (provided that the compiler used has wider long doubles than doubles), but
|
|
1362
|
+
* slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
|
|
1363
|
+
* Power8 platforms).
|
|
1364
|
+
* - nthreads
|
|
1365
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
1366
|
+
* allocated, even if the thread does not end up being used. Ignored when not building with
|
|
1367
|
+
* OpenMP support.
|
|
1368
|
+
* - model_outputs
|
|
1369
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
1370
|
+
* if the predictions are to be made from an extended model. Can only pass one of
|
|
1371
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
1372
|
+
* - model_outputs_ext
|
|
1373
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
1374
|
+
* if the predictions are to be made from a single-variable model. Can only pass one of
|
|
1375
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
1376
|
+
* - imputer
|
|
1377
|
+
* Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
|
|
1378
|
+
* as produced from function 'fit_iforest'.
|
|
1379
|
+
*/
|
|
1380
|
+
ISOTREE_EXPORTED
|
|
1381
|
+
void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
|
|
1382
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
|
1383
|
+
size_t nrows, bool use_long_double, int nthreads,
|
|
1384
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1385
|
+
Imputer &imputer);
|
|
1386
|
+
|
|
1387
|
+
|
|
1388
|
+
/* Append trees from one model into another
|
|
1389
|
+
*
|
|
1390
|
+
* Parameters
|
|
1391
|
+
* ==========
|
|
1392
|
+
* - model (in, out)
|
|
1393
|
+
* Pointer to isolation forest model which has already been fit through 'fit_iforest'.
|
|
1394
|
+
* The trees from 'other' will be merged into this (will be at the end of vector member 'trees').
|
|
1395
|
+
* Both 'model' and 'other' must have been fit with the same hyperparameters
|
|
1396
|
+
* in order for this merge to work correctly - at the very least, should have
|
|
1397
|
+
* the same 'missing_action', 'cat_split_type', 'new_cat_action'.
|
|
1398
|
+
* Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
|
|
1399
|
+
* Pass NULL if this is not to be used.
|
|
1400
|
+
* - other
|
|
1401
|
+
* Pointer to isolation forest model which has already been fit through 'fit_iforest'.
|
|
1402
|
+
* The trees from this object will be added into 'model' (this object will not be modified).
|
|
1403
|
+
* Both 'model' and 'other' must have been fit with the same hyperparameters
|
|
1404
|
+
* in order for this merge to work correctly - at the very least, should have
|
|
1405
|
+
* the same 'missing_action', 'cat_split_type', 'new_cat_action'.
|
|
1406
|
+
* Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
|
|
1407
|
+
* Pass NULL if this is not to be used.
|
|
1408
|
+
* - ext_model (in, out)
|
|
1409
|
+
* Pointer to extended isolation forest model which has already been fit through 'fit_iforest'.
|
|
1410
|
+
* The trees/hyperplanes from 'ext_other' will be merged into this (will be at the end of vector member 'hplanes').
|
|
1411
|
+
* Both 'ext_model' and 'ext_other' must have been fit with the same hyperparameters
|
|
1412
|
+
* in order for this merge to work correctly - at the very least, should have
|
|
1413
|
+
* the same 'missing_action', 'cat_split_type', 'new_cat_action'.
|
|
1414
|
+
* Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
|
|
1415
|
+
* Pass NULL if this is not to be used.
|
|
1416
|
+
* - ext_other
|
|
1417
|
+
* Pointer to extended isolation forest model which has already been fit through 'fit_iforest'.
|
|
1418
|
+
* The trees/hyperplanes from this object will be added into 'ext_model' (this object will not be modified).
|
|
1419
|
+
* Both 'ext_model' and 'ext_other' must have been fit with the same hyperparameters
|
|
1420
|
+
* in order for this merge to work correctly - at the very least, should have
|
|
1421
|
+
* the same 'missing_action', 'cat_split_type', 'new_cat_action'.
|
|
1422
|
+
* Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
|
|
1423
|
+
* Pass NULL if this is not to be used.
|
|
1424
|
+
* - imputer (in, out)
|
|
1425
|
+
* Pointer to imputation object which has already been fit through 'fit_iforest' along with
|
|
1426
|
+
* either 'model' or 'ext_model' in the same call to 'fit_iforest'.
|
|
1427
|
+
* The imputation nodes from 'iother' will be merged into this (will be at the end of vector member 'imputer_tree').
|
|
1428
|
+
* Hyperparameters related to imputation might differ between 'imputer' and 'iother' ('imputer' will preserve its
|
|
1429
|
+
* hyperparameters after the merge).
|
|
1430
|
+
* Pass NULL if this is not to be used.
|
|
1431
|
+
* - iother
|
|
1432
|
+
* Pointer to imputation object which has already been fit through 'fit_iforest' along with
|
|
1433
|
+
* either 'model' or 'ext_model' in the same call to 'fit_iforest'.
|
|
1434
|
+
* The imputation nodes from this object will be added into 'imputer' (this object will not be modified).
|
|
1435
|
+
* Hyperparameters related to imputation might differ between 'imputer' and 'iother' ('imputer' will preserve its
|
|
1436
|
+
* hyperparameters after the merge).
|
|
1437
|
+
* Pass NULL if this is not to be used.
|
|
1438
|
+
* - indexer (in, out)
|
|
1439
|
+
* Pointer to indexer object which has already been fit through 'fit_iforest' along with
|
|
1440
|
+
* either 'model' or 'ext_model' in the same call to 'fit_iforest' or through another specialized function.
|
|
1441
|
+
* The tree indices from 'ind_other' will be merged into this (will be at the end of vector member 'indices').
|
|
1442
|
+
* Reference points should not differ between 'indexer' and 'ind_other'.
|
|
1443
|
+
* Pass NULL if this is not to be used.
|
|
1444
|
+
* - ind_other
|
|
1445
|
+
* Pointer to indexer object which has already been fit through 'fit_iforest' along with
|
|
1446
|
+
* either 'model' or 'ext_model' in the same call to 'fit_iforest' or through another specialized function.
|
|
1447
|
+
* The tree indices from this object will be added into 'indexer' (this object will not be modified).
|
|
1448
|
+
* Reference points should not differ between 'indexer' and 'ind_other'.
|
|
1449
|
+
* Pass NULL if this is not to be used.
|
|
1450
|
+
*/
|
|
1451
|
+
ISOTREE_EXPORTED
|
|
1452
|
+
void merge_models(IsoForest* model, IsoForest* other,
|
|
1453
|
+
ExtIsoForest* ext_model, ExtIsoForest* ext_other,
|
|
1454
|
+
Imputer* imputer, Imputer* iother,
|
|
1455
|
+
TreesIndexer* indexer, TreesIndexer* ind_other);
|
|
1456
|
+
|
|
1457
|
+
/* Create a model containing a sub-set of the trees from another model
|
|
1458
|
+
*
|
|
1459
|
+
* Parameters
|
|
1460
|
+
* ==========
|
|
1461
|
+
* - model (in)
|
|
1462
|
+
* Pointer to isolation forest model which has already been fit through 'fit_iforest',
|
|
1463
|
+
* from which the desired trees will be copied into a new model object.
|
|
1464
|
+
* Pass NULL if using the extended model.
|
|
1465
|
+
* - ext_model (in)
|
|
1466
|
+
* Pointer to extended isolation forest model which has already been fit through 'fit_iforest',
|
|
1467
|
+
* from which the desired trees will be copied into a new model object.
|
|
1468
|
+
* Pass NULL if using the single-variable model.
|
|
1469
|
+
* - imputer (in)
|
|
1470
|
+
* Pointer to imputation object which has already been fit through 'fit_iforest' along with
|
|
1471
|
+
* either 'model' or 'ext_model' in the same call to 'fit_iforest'.
|
|
1472
|
+
* Pass NULL if the model was built without an imputer.
|
|
1473
|
+
* - indexer (in)
|
|
1474
|
+
* Pointer to indexer object which has already been fit through 'fit_iforest' along with
|
|
1475
|
+
* either 'model' or 'ext_model' in the same call to 'fit_iforest' or through another specialized function.
|
|
1476
|
+
* Pass NULL if the model was built without an indexer.
|
|
1477
|
+
* - model_new (out)
|
|
1478
|
+
* Pointer to already-allocated isolation forest model, which will be reset and to
|
|
1479
|
+
* which the selected trees from 'model' will be copied.
|
|
1480
|
+
* Pass NULL if using the extended model.
|
|
1481
|
+
* - ext_model_new (out)
|
|
1482
|
+
* Pointer to already-allocated extended isolation forest model, which will be reset and to
|
|
1483
|
+
* which the selected hyperplanes from 'ext_model' will be copied.
|
|
1484
|
+
* Pass NULL if using the single-variable model.
|
|
1485
|
+
* - imputer_new (out)
|
|
1486
|
+
* Pointer to already-allocated imputation object, which will be reset and to
|
|
1487
|
+
* which the selected nodes from 'imputer' (matching to those of either 'model'
|
|
1488
|
+
* or 'ext_model') will be copied.
|
|
1489
|
+
* Pass NULL if the model was built without an imputer.
|
|
1490
|
+
* - indexer_new (out)
|
|
1491
|
+
* Pointer to already-allocated indexer object, which will be reset and to
|
|
1492
|
+
* which the selected nodes from 'indexer' (matching to those of either 'model'
|
|
1493
|
+
* or 'ext_model') will be copied.
|
|
1494
|
+
* Pass NULL if the model was built without an indexer.
|
|
1495
|
+
*/
|
|
1496
|
+
ISOTREE_EXPORTED
|
|
1497
|
+
void subset_model(IsoForest* model, IsoForest* model_new,
|
|
1498
|
+
ExtIsoForest* ext_model, ExtIsoForest* ext_model_new,
|
|
1499
|
+
Imputer* imputer, Imputer* imputer_new,
|
|
1500
|
+
TreesIndexer* indexer, TreesIndexer* indexer_new,
|
|
1501
|
+
size_t *trees_take, size_t ntrees_take);
|
|
1502
|
+
|
|
1503
|
+
/* Build indexer for faster terminal node predictions and/or distance calculations
|
|
1504
|
+
*
|
|
1505
|
+
* Parameters
|
|
1506
|
+
* ==========
|
|
1507
|
+
* - indexer
|
|
1508
|
+
* Pointer or reference to an indexer object which will be associated to a fitted model and in
|
|
1509
|
+
* which indices for terminal nodes and potentially node distances will be stored.
|
|
1510
|
+
* - model / model_outputs / model_outputs_ext
|
|
1511
|
+
* Pointer or reference to a fitted model object for which an indexer will be built.
|
|
1512
|
+
* - nthreads
|
|
1513
|
+
* Number of parallel threads to use. This operation will only be multi-threaded when passing
|
|
1514
|
+
* 'with_distances=true'.
|
|
1515
|
+
* - with_distances
|
|
1516
|
+
* Whether to also pre-calculate node distances in order to speed up 'calc_similarity' (distances).
|
|
1517
|
+
* Note that this will consume a lot more memory and make the resulting object significantly
|
|
1518
|
+
* heavier.
|
|
1519
|
+
*/
|
|
1520
|
+
ISOTREE_EXPORTED
|
|
1521
|
+
void build_tree_indices(TreesIndexer &indexer, const IsoForest &model, int nthreads, const bool with_distances);
|
|
1522
|
+
ISOTREE_EXPORTED
|
|
1523
|
+
void build_tree_indices(TreesIndexer &indexer, const ExtIsoForest &model, int nthreads, const bool with_distances);
|
|
1524
|
+
ISOTREE_EXPORTED
|
|
1525
|
+
void build_tree_indices
|
|
1526
|
+
(
|
|
1527
|
+
TreesIndexer *indexer,
|
|
1528
|
+
const IsoForest *model_outputs,
|
|
1529
|
+
const ExtIsoForest *model_outputs_ext,
|
|
1530
|
+
int nthreads,
|
|
1531
|
+
const bool with_distances
|
|
1532
|
+
);
|
|
1533
|
+
/* Gets the number of reference points stored in an indexer object */
|
|
1534
|
+
ISOTREE_EXPORTED
|
|
1535
|
+
size_t get_number_of_reference_points(const TreesIndexer &indexer) noexcept;
|
|
1536
|
+
|
|
1537
|
+
|
|
1538
|
+
/* Functions to inspect serialized objects
|
|
1539
|
+
*
|
|
1540
|
+
* Parameters
|
|
1541
|
+
* ==========
|
|
1542
|
+
* - serialized_bytes (in)
|
|
1543
|
+
* A model from this library, serialized through the functions available since
|
|
1544
|
+
* version 0.3.0, in any of the varieties offered by the library (as separate
|
|
1545
|
+
* objects or as combined objects with metadata).
|
|
1546
|
+
* - is_isotree_model (out)
|
|
1547
|
+
* Whether the input 'serialized_bytes' is a serialized model from this library.
|
|
1548
|
+
* - is_compatible (out)
|
|
1549
|
+
* Whether the serialized model is compatible (i.e. can be de-serialized) with the
|
|
1550
|
+
* current setup.
|
|
1551
|
+
* Serialized models are compatible between:
|
|
1552
|
+
* - Different operating systems.
|
|
1553
|
+
* - Different compilers.
|
|
1554
|
+
* - Systems with different 'size_t' width (e.g. 32-bit and 64-bit),
|
|
1555
|
+
* as long as the file was produced on a system that was either 32-bit or 64-bit,
|
|
1556
|
+
* and as long as each saved value fits within the range of the machine's 'size_t' type.
|
|
1557
|
+
* - Systems with different 'int' width,
|
|
1558
|
+
* as long as the file was produced on a system that was 16-bit, 32-bit, or 64-bit,
|
|
1559
|
+
* and as long as each saved value fits within the range of the machine's int type.
|
|
1560
|
+
* - Systems with different bit endianness (e.g. x86 and PPC64 in non-le mode).
|
|
1561
|
+
* - Versions of this package from 0.3.0 onwards.
|
|
1562
|
+
* But are not compatible between:
|
|
1563
|
+
* - Systems with different floating point numeric representations
|
|
1564
|
+
* (e.g. standard IEEE754 vs. a base-10 system).
|
|
1565
|
+
* - Versions of this package earlier than 0.3.0.
|
|
1566
|
+
* This pretty much guarantees that a given file can be serialized and de-serialized
|
|
1567
|
+
* in the same machine in which it was built, regardless of how the library was compiled.
|
|
1568
|
+
* Reading a serialized model that was produced in a platform with different
|
|
1569
|
+
* characteristics (e.g. 32-bit vs. 64-bit) will be much slower however.
|
|
1570
|
+
* - has_combined_objects (out)
|
|
1571
|
+
* Whether the serialized model is in the format of combined objects (as produced by the
|
|
1572
|
+
* functions named 'serialized_combined') or in the format of separate objects (as produced
|
|
1573
|
+
* by the functions named 'serialized_<model>').
|
|
1574
|
+
* If it is in the format of combined objects, must be de-serialized through the functions
|
|
1575
|
+
* named 'deserialize_combined'; otherwise, must be de-serialized through the functions
|
|
1576
|
+
* named 'deserialize_<model>'.
|
|
1577
|
+
* Note that the Python and R interfaces of this library use the combined objects format
|
|
1578
|
+
* when serializing to files.
|
|
1579
|
+
* - has_IsoForest (out)
|
|
1580
|
+
* Whether the serialized bytes include an 'IsoForest' object. If it has 'has_combined_objects=true',
|
|
1581
|
+
* might include additional objects.
|
|
1582
|
+
* - has_ExtIsoForest (out)
|
|
1583
|
+
* Whether the serialized bytes include an 'ExtIsoForest' object. If it has 'has_combined_objects=true',
|
|
1584
|
+
* might include additional objects.
|
|
1585
|
+
* - has_Imputer (out)
|
|
1586
|
+
* Whether the serialized bytes include an 'Imputer' object. If it has 'has_combined_objects=true',
|
|
1587
|
+
* might include additional objects.
|
|
1588
|
+
* - has_metadata (out)
|
|
1589
|
+
* Whether the serialized bytes include additional metadata in the form of a 'char' array.
|
|
1590
|
+
* This can only be present when having 'has_combined_objects=true'.
|
|
1591
|
+
* - size_metadata (out)
|
|
1592
|
+
* When the serialized bytes contain metadata, this denotes the size of the metadata (number
|
|
1593
|
+
* of bytes that it contains).
|
|
1594
|
+
*/
|
|
1595
|
+
ISOTREE_EXPORTED
|
|
1596
|
+
void inspect_serialized_object
|
|
1597
|
+
(
|
|
1598
|
+
const char *serialized_bytes,
|
|
1599
|
+
bool &is_isotree_model,
|
|
1600
|
+
bool &is_compatible,
|
|
1601
|
+
bool &has_combined_objects,
|
|
1602
|
+
bool &has_IsoForest,
|
|
1603
|
+
bool &has_ExtIsoForest,
|
|
1604
|
+
bool &has_Imputer,
|
|
1605
|
+
bool &has_Indexer,
|
|
1606
|
+
bool &has_metadata,
|
|
1607
|
+
size_t &size_metadata
|
|
1608
|
+
);
|
|
1609
|
+
ISOTREE_EXPORTED
|
|
1610
|
+
void inspect_serialized_object
|
|
1611
|
+
(
|
|
1612
|
+
FILE *serialized_bytes,
|
|
1613
|
+
bool &is_isotree_model,
|
|
1614
|
+
bool &is_compatible,
|
|
1615
|
+
bool &has_combined_objects,
|
|
1616
|
+
bool &has_IsoForest,
|
|
1617
|
+
bool &has_ExtIsoForest,
|
|
1618
|
+
bool &has_Imputer,
|
|
1619
|
+
bool &has_Indexer,
|
|
1620
|
+
bool &has_metadata,
|
|
1621
|
+
size_t &size_metadata
|
|
1622
|
+
);
|
|
1623
|
+
ISOTREE_EXPORTED
|
|
1624
|
+
void inspect_serialized_object
|
|
1625
|
+
(
|
|
1626
|
+
std::istream &serialized_bytes,
|
|
1627
|
+
bool &is_isotree_model,
|
|
1628
|
+
bool &is_compatible,
|
|
1629
|
+
bool &has_combined_objects,
|
|
1630
|
+
bool &has_IsoForest,
|
|
1631
|
+
bool &has_ExtIsoForest,
|
|
1632
|
+
bool &has_Imputer,
|
|
1633
|
+
bool &has_Indexer,
|
|
1634
|
+
bool &has_metadata,
|
|
1635
|
+
size_t &size_metadata
|
|
1636
|
+
);
|
|
1637
|
+
ISOTREE_EXPORTED
|
|
1638
|
+
void inspect_serialized_object
|
|
1639
|
+
(
|
|
1640
|
+
const std::string &serialized_bytes,
|
|
1641
|
+
bool &is_isotree_model,
|
|
1642
|
+
bool &is_compatible,
|
|
1643
|
+
bool &has_combined_objects,
|
|
1644
|
+
bool &has_IsoForest,
|
|
1645
|
+
bool &has_ExtIsoForest,
|
|
1646
|
+
bool &has_Imputer,
|
|
1647
|
+
bool &has_Indexer,
|
|
1648
|
+
bool &has_metadata,
|
|
1649
|
+
size_t &size_metadata
|
|
1650
|
+
);
|
|
1651
|
+
|
|
1652
|
+
/* Serialization and de-serialization functions (individual objects)
|
|
1653
|
+
*
|
|
1654
|
+
* Parameters
|
|
1655
|
+
* ==========
|
|
1656
|
+
* - model (in or out depending on function)
|
|
1657
|
+
* A model object to serialize (when it has 'const' qualifier), after being fitted through
|
|
1658
|
+
* function 'fit_iforest'; or an already-allocated object (should be initialized through
|
|
1659
|
+
* the default constructor) into which a serialized object of the same class will be
|
|
1660
|
+
* de-serialized. In the latter case, the contents of this object will be overwritten.
|
|
1661
|
+
* Note that this will only be able to load models generated with isotree version 0.3.0
|
|
1662
|
+
* and later, and that these serialized models are forwards compatible but not backwards
|
|
1663
|
+
* compatible (that is, a model saved with 0.3.0 can be loaded with 0.3.6, but not the other
|
|
1664
|
+
* way around).
|
|
1665
|
+
* - output (out)
|
|
1666
|
+
* A writable object or stream in which to save/persist/serialize the
|
|
1667
|
+
* model or imputer object. In the functions that do not take this as a parameter,
|
|
1668
|
+
* it will be returned as a string containing the raw bytes.
|
|
1669
|
+
* Should be opened in binary mode.
|
|
1670
|
+
* Note: on Windows, if compiling this library with a compiler other than MSVC or MINGW,
|
|
1671
|
+
* there might be issues writing models to FILE pointers if the models are larger than 2GB.
|
|
1672
|
+
* - in (in)
|
|
1673
|
+
* A readable object or stream which contains the serialized/persisted model or
|
|
1674
|
+
* imputer object which will be de-serialized. Should be opened in binary mode.
|
|
1675
|
+
*
|
|
1676
|
+
* Returns
|
|
1677
|
+
* =======
|
|
1678
|
+
* (Only for functions 'determine_serialized_size')
|
|
1679
|
+
* Size that the model or imputer object will use when serialized, intended to be
|
|
1680
|
+
* used for allocating arrays beforehand when serializing to 'char'.
|
|
1681
|
+
*/
|
|
1682
|
+
ISOTREE_EXPORTED
|
|
1683
|
+
size_t determine_serialized_size(const IsoForest &model) noexcept;
|
|
1684
|
+
ISOTREE_EXPORTED
|
|
1685
|
+
size_t determine_serialized_size(const ExtIsoForest &model) noexcept;
|
|
1686
|
+
ISOTREE_EXPORTED
|
|
1687
|
+
size_t determine_serialized_size(const Imputer &model) noexcept;
|
|
1688
|
+
ISOTREE_EXPORTED
|
|
1689
|
+
size_t determine_serialized_size(const TreesIndexer &model) noexcept;
|
|
1690
|
+
ISOTREE_EXPORTED
|
|
1691
|
+
void serialize_IsoForest(const IsoForest &model, char *out);
|
|
1692
|
+
ISOTREE_EXPORTED
|
|
1693
|
+
void serialize_IsoForest(const IsoForest &model, FILE *out);
|
|
1694
|
+
ISOTREE_EXPORTED
|
|
1695
|
+
void serialize_IsoForest(const IsoForest &model, std::ostream &out);
|
|
1696
|
+
ISOTREE_EXPORTED
|
|
1697
|
+
std::string serialize_IsoForest(const IsoForest &model);
|
|
1698
|
+
ISOTREE_EXPORTED
|
|
1699
|
+
void deserialize_IsoForest(IsoForest &model, const char *in);
|
|
1700
|
+
ISOTREE_EXPORTED
|
|
1701
|
+
void deserialize_IsoForest(IsoForest &model, FILE *in);
|
|
1702
|
+
ISOTREE_EXPORTED
|
|
1703
|
+
void deserialize_IsoForest(IsoForest &model, std::istream &in);
|
|
1704
|
+
ISOTREE_EXPORTED
|
|
1705
|
+
void deserialize_IsoForest(IsoForest &model, const std::string &in);
|
|
1706
|
+
ISOTREE_EXPORTED
|
|
1707
|
+
void serialize_ExtIsoForest(const ExtIsoForest &model, char *out);
|
|
1708
|
+
ISOTREE_EXPORTED
|
|
1709
|
+
void serialize_ExtIsoForest(const ExtIsoForest &model, FILE *out);
|
|
1710
|
+
ISOTREE_EXPORTED
|
|
1711
|
+
void serialize_ExtIsoForest(const ExtIsoForest &model, std::ostream &out);
|
|
1712
|
+
ISOTREE_EXPORTED
|
|
1713
|
+
std::string serialize_ExtIsoForest(const ExtIsoForest &model);
|
|
1714
|
+
ISOTREE_EXPORTED
|
|
1715
|
+
void deserialize_ExtIsoForest(ExtIsoForest &model, const char *in);
|
|
1716
|
+
ISOTREE_EXPORTED
|
|
1717
|
+
void deserialize_ExtIsoForest(ExtIsoForest &model, FILE *in);
|
|
1718
|
+
ISOTREE_EXPORTED
|
|
1719
|
+
void deserialize_ExtIsoForest(ExtIsoForest &model, std::istream &in);
|
|
1720
|
+
ISOTREE_EXPORTED
|
|
1721
|
+
void deserialize_ExtIsoForest(ExtIsoForest &model, const std::string &in);
|
|
1722
|
+
ISOTREE_EXPORTED
|
|
1723
|
+
void serialize_Imputer(const Imputer &model, char *out);
|
|
1724
|
+
ISOTREE_EXPORTED
|
|
1725
|
+
void serialize_Imputer(const Imputer &model, FILE *out);
|
|
1726
|
+
ISOTREE_EXPORTED
|
|
1727
|
+
void serialize_Imputer(const Imputer &model, std::ostream &out);
|
|
1728
|
+
ISOTREE_EXPORTED
|
|
1729
|
+
std::string serialize_Imputer(const Imputer &model);
|
|
1730
|
+
ISOTREE_EXPORTED
|
|
1731
|
+
void deserialize_Imputer(Imputer &model, const char *in);
|
|
1732
|
+
ISOTREE_EXPORTED
|
|
1733
|
+
void deserialize_Imputer(Imputer &model, FILE *in);
|
|
1734
|
+
ISOTREE_EXPORTED
|
|
1735
|
+
void deserialize_Imputer(Imputer &model, std::istream &in);
|
|
1736
|
+
ISOTREE_EXPORTED
|
|
1737
|
+
void deserialize_Imputer(Imputer &model, const std::string &in);
|
|
1738
|
+
ISOTREE_EXPORTED
|
|
1739
|
+
void serialize_Indexer(const TreesIndexer &model, char *out);
|
|
1740
|
+
ISOTREE_EXPORTED
|
|
1741
|
+
void serialize_Indexer(const TreesIndexer &model, FILE *out);
|
|
1742
|
+
ISOTREE_EXPORTED
|
|
1743
|
+
void serialize_Indexer(const TreesIndexer &model, std::ostream &out);
|
|
1744
|
+
ISOTREE_EXPORTED
|
|
1745
|
+
std::string serialize_Indexer(const TreesIndexer &model);
|
|
1746
|
+
ISOTREE_EXPORTED
|
|
1747
|
+
void deserialize_Indexer(TreesIndexer &model, const char *in);
|
|
1748
|
+
ISOTREE_EXPORTED
|
|
1749
|
+
void deserialize_Indexer(TreesIndexer &model, FILE *in);
|
|
1750
|
+
ISOTREE_EXPORTED
|
|
1751
|
+
void deserialize_Indexer(TreesIndexer &model, std::istream &in);
|
|
1752
|
+
ISOTREE_EXPORTED
|
|
1753
|
+
void deserialize_Indexer(TreesIndexer &model, const std::string &in);
|
|
1754
|
+
|
|
1755
|
+
|
|
1756
|
+
/* Serialization and de-serialization functions (combined objects)
|
|
1757
|
+
*
|
|
1758
|
+
* Parameters
|
|
1759
|
+
* ==========
|
|
1760
|
+
* - model (in or out depending on function)
|
|
1761
|
+
* A single-variable model object to serialize or de-serialize.
|
|
1762
|
+
* If the serialized object contains this type of object, it must be
|
|
1763
|
+
* passed, as an already-allocated object (initialized through the default
|
|
1764
|
+
* constructor function).
|
|
1765
|
+
* When de-serializing, can check if it needs to be passed through function
|
|
1766
|
+
* 'inspect_serialized_object'.
|
|
1767
|
+
* If using the extended model, should pass NULL.
|
|
1768
|
+
* Must pass one of 'model' or 'model_ext'.
|
|
1769
|
+
* - model_ext (in or out depending on function)
|
|
1770
|
+
* An extended model object to serialize or de-serialize.
|
|
1771
|
+
* If using the single-variable model, should pass NULL.
|
|
1772
|
+
* Must pass one of 'model' or 'model_ext'.
|
|
1773
|
+
* - imputer (in or out depending on function)
|
|
1774
|
+
* An imputer object to serialize or de-serialize.
|
|
1775
|
+
* Like 'model' and 'model_ext', must also be passed when de-serializing
|
|
1776
|
+
* if the serialized bytes contain such object.
|
|
1777
|
+
* - optional_metadata (in or out depending on function)
|
|
1778
|
+
* Optional metadata to write at the end of the file, which will be written
|
|
1779
|
+
* unformatted (it is assumed files are in binary mode).
|
|
1780
|
+
* Pass NULL if there is no metadata.
|
|
1781
|
+
* - size_optional_metadata (in or out depending on function)
|
|
1782
|
+
* Size of the optional metadata, if passed. Pass zero if there is no metadata.
|
|
1783
|
+
* - serialized_model (in)
|
|
1784
|
+
* A single-variable model which was serialized to raw bytes in the separate-objects
|
|
1785
|
+
* format, using function 'serialize_IsoForest'.
|
|
1786
|
+
* Pass NULL if using the extended model.
|
|
1787
|
+
* Must pass one of 'serialized_model' or 'serialized_model_ext'.
|
|
1788
|
+
* Note that if it was produced on a platform with different characteristics than
|
|
1789
|
+
* the one in which this function is being called (e.g. different 'size_t' width or
|
|
1790
|
+
* different endianness), it will be re-serialized during the function call, which
|
|
1791
|
+
* can be slow and use a lot of memory.
|
|
1792
|
+
* - serialized_model_ext (in)
|
|
1793
|
+
* An extended model which was serialized to raw bytes in the separate-objects
|
|
1794
|
+
* format, using function 'serialize_ExtIsoForest'.
|
|
1795
|
+
* Pass NULL if using the single-variable model.
|
|
1796
|
+
* Must pass one of 'serialized_model' or 'serialized_model_ext'.
|
|
1797
|
+
* - serialized_imputer (in)
|
|
1798
|
+
* An imputer object which was serialized to raw bytes in the separate-objects
|
|
1799
|
+
* format, using function 'serialize_Imputer'.
|
|
1800
|
+
* - output (out)
|
|
1801
|
+
* A writable object or stream in which to save/persist/serialize the
|
|
1802
|
+
* model objects. In the functions that do not take this as a parameter,
|
|
1803
|
+
* it will be returned as a string containing the raw bytes.
|
|
1804
|
+
* Should be opened in binary mode.
|
|
1805
|
+
* - in (in)
|
|
1806
|
+
* A readable object or stream which contains the serialized/persisted model
|
|
1807
|
+
* objects which will be de-serialized. Should be opened in binary mode.
|
|
1808
|
+
*
|
|
1809
|
+
* Returns
|
|
1810
|
+
* =======
|
|
1811
|
+
* (Only for functions 'determine_serialized_size')
|
|
1812
|
+
* Size that the objects will use when serialized, intended to be
|
|
1813
|
+
* used for allocating arrays beforehand when serializing to 'char'.
|
|
1814
|
+
*/
|
|
1815
|
+
ISOTREE_EXPORTED
|
|
1816
|
+
size_t determine_serialized_size_combined
|
|
1817
|
+
(
|
|
1818
|
+
const IsoForest *model,
|
|
1819
|
+
const ExtIsoForest *model_ext,
|
|
1820
|
+
const Imputer *imputer,
|
|
1821
|
+
const TreesIndexer *indexer,
|
|
1822
|
+
const size_t size_optional_metadata
|
|
1823
|
+
) noexcept;
|
|
1824
|
+
ISOTREE_EXPORTED
|
|
1825
|
+
size_t determine_serialized_size_combined
|
|
1826
|
+
(
|
|
1827
|
+
const char *serialized_model,
|
|
1828
|
+
const char *serialized_model_ext,
|
|
1829
|
+
const char *serialized_imputer,
|
|
1830
|
+
const char *serialized_indexer,
|
|
1831
|
+
const size_t size_optional_metadata
|
|
1832
|
+
) noexcept;
|
|
1833
|
+
ISOTREE_EXPORTED
|
|
1834
|
+
void serialize_combined
|
|
1835
|
+
(
|
|
1836
|
+
const IsoForest *model,
|
|
1837
|
+
const ExtIsoForest *model_ext,
|
|
1838
|
+
const Imputer *imputer,
|
|
1839
|
+
const TreesIndexer *indexer,
|
|
1840
|
+
const char *optional_metadata,
|
|
1841
|
+
const size_t size_optional_metadata,
|
|
1842
|
+
char *out
|
|
1843
|
+
);
|
|
1844
|
+
ISOTREE_EXPORTED
|
|
1845
|
+
void serialize_combined
|
|
1846
|
+
(
|
|
1847
|
+
const IsoForest *model,
|
|
1848
|
+
const ExtIsoForest *model_ext,
|
|
1849
|
+
const Imputer *imputer,
|
|
1850
|
+
const TreesIndexer *indexer,
|
|
1851
|
+
const char *optional_metadata,
|
|
1852
|
+
const size_t size_optional_metadata,
|
|
1853
|
+
FILE *out
|
|
1854
|
+
);
|
|
1855
|
+
ISOTREE_EXPORTED
|
|
1856
|
+
void serialize_combined
|
|
1857
|
+
(
|
|
1858
|
+
const IsoForest *model,
|
|
1859
|
+
const ExtIsoForest *model_ext,
|
|
1860
|
+
const Imputer *imputer,
|
|
1861
|
+
const TreesIndexer *indexer,
|
|
1862
|
+
const char *optional_metadata,
|
|
1863
|
+
const size_t size_optional_metadata,
|
|
1864
|
+
std::ostream &out
|
|
1865
|
+
);
|
|
1866
|
+
ISOTREE_EXPORTED
|
|
1867
|
+
std::string serialize_combined
|
|
1868
|
+
(
|
|
1869
|
+
const IsoForest *model,
|
|
1870
|
+
const ExtIsoForest *model_ext,
|
|
1871
|
+
const Imputer *imputer,
|
|
1872
|
+
const TreesIndexer *indexer,
|
|
1873
|
+
const char *optional_metadata,
|
|
1874
|
+
const size_t size_optional_metadata
|
|
1875
|
+
);
|
|
1876
|
+
ISOTREE_EXPORTED
|
|
1877
|
+
void serialize_combined
|
|
1878
|
+
(
|
|
1879
|
+
const char *serialized_model,
|
|
1880
|
+
const char *serialized_model_ext,
|
|
1881
|
+
const char *serialized_imputer,
|
|
1882
|
+
const char *serialized_indexer,
|
|
1883
|
+
const char *optional_metadata,
|
|
1884
|
+
const size_t size_optional_metadata,
|
|
1885
|
+
FILE *out
|
|
1886
|
+
);
|
|
1887
|
+
ISOTREE_EXPORTED
|
|
1888
|
+
void serialize_combined
|
|
1889
|
+
(
|
|
1890
|
+
const char *serialized_model,
|
|
1891
|
+
const char *serialized_model_ext,
|
|
1892
|
+
const char *serialized_imputer,
|
|
1893
|
+
const char *serialized_indexer,
|
|
1894
|
+
const char *optional_metadata,
|
|
1895
|
+
const size_t size_optional_metadata,
|
|
1896
|
+
std::ostream &out
|
|
1897
|
+
);
|
|
1898
|
+
ISOTREE_EXPORTED
|
|
1899
|
+
std::string serialize_combined
|
|
1900
|
+
(
|
|
1901
|
+
const char *serialized_model,
|
|
1902
|
+
const char *serialized_model_ext,
|
|
1903
|
+
const char *serialized_imputer,
|
|
1904
|
+
const char *serialized_indexer,
|
|
1905
|
+
const char *optional_metadata,
|
|
1906
|
+
const size_t size_optional_metadata
|
|
1907
|
+
);
|
|
1908
|
+
ISOTREE_EXPORTED
|
|
1909
|
+
void deserialize_combined
|
|
1910
|
+
(
|
|
1911
|
+
const char* in,
|
|
1912
|
+
IsoForest *model,
|
|
1913
|
+
ExtIsoForest *model_ext,
|
|
1914
|
+
Imputer *imputer,
|
|
1915
|
+
TreesIndexer *indexer,
|
|
1916
|
+
char *optional_metadata
|
|
1917
|
+
);
|
|
1918
|
+
ISOTREE_EXPORTED
|
|
1919
|
+
void deserialize_combined
|
|
1920
|
+
(
|
|
1921
|
+
FILE* in,
|
|
1922
|
+
IsoForest *model,
|
|
1923
|
+
ExtIsoForest *model_ext,
|
|
1924
|
+
Imputer *imputer,
|
|
1925
|
+
TreesIndexer *indexer,
|
|
1926
|
+
char *optional_metadata
|
|
1927
|
+
);
|
|
1928
|
+
ISOTREE_EXPORTED
|
|
1929
|
+
void deserialize_combined
|
|
1930
|
+
(
|
|
1931
|
+
std::istream &in,
|
|
1932
|
+
IsoForest *model,
|
|
1933
|
+
ExtIsoForest *model_ext,
|
|
1934
|
+
Imputer *imputer,
|
|
1935
|
+
TreesIndexer *indexer,
|
|
1936
|
+
char *optional_metadata
|
|
1937
|
+
);
|
|
1938
|
+
ISOTREE_EXPORTED
|
|
1939
|
+
void deserialize_combined
|
|
1940
|
+
(
|
|
1941
|
+
const std::string &in,
|
|
1942
|
+
IsoForest *model,
|
|
1943
|
+
ExtIsoForest *model_ext,
|
|
1944
|
+
Imputer *imputer,
|
|
1945
|
+
TreesIndexer *indexer,
|
|
1946
|
+
char *optional_metadata
|
|
1947
|
+
);
|
|
1948
|
+
|
|
1949
|
+
|
|
1950
|
+
/* Serialize additional trees into previous serialized bytes
|
|
1951
|
+
*
|
|
1952
|
+
* Parameters
|
|
1953
|
+
* ==========
|
|
1954
|
+
* - model (in)
|
|
1955
|
+
* A model object to re-serialize, which had already been serialized into
|
|
1956
|
+
* 'serialized_bytes' with fewer trees than it currently has, and then
|
|
1957
|
+
* additional trees added through functions such as 'add_tree' or 'merge_models'.
|
|
1958
|
+
* - serialized_bytes (in) / old_bytes (out)
|
|
1959
|
+
* Serialized version of 'model', which had previously been produced with
|
|
1960
|
+
* fewer trees than it currently has and then additional trees added through
|
|
1961
|
+
* functions such as 'add_tree' or 'merge_models'.
|
|
1962
|
+
* Must have been produced in a setup with the same characteristics (e.g. width
|
|
1963
|
+
* of 'int' and 'size_t', endianness, etc.).
|
|
1964
|
+
* - old_ntrees
|
|
1965
|
+
* Number of trees which were serialized from 'model' into 'serialized_bytes'
|
|
1966
|
+
* before. Trees that come after this index are assumed to be the additional
|
|
1967
|
+
* trees to serialize.
|
|
1968
|
+
*
|
|
1969
|
+
* Returns
|
|
1970
|
+
* =======
|
|
1971
|
+
* - For functions 'check_can_undergo_incremental_serialization', whether the serialized
|
|
1972
|
+
* object can be incrementally serialized.
|
|
1973
|
+
* - For functions 'determine_serialized_size_additional_trees', additional size (in addition
|
|
1974
|
+
* to current size) that the new serialized objects will have if they undergo incremental
|
|
1975
|
+
* serialization.
|
|
1976
|
+
*/
|
|
1977
|
+
ISOTREE_EXPORTED
|
|
1978
|
+
bool check_can_undergo_incremental_serialization(const IsoForest &model, const char *serialized_bytes);
|
|
1979
|
+
ISOTREE_EXPORTED
|
|
1980
|
+
bool check_can_undergo_incremental_serialization(const ExtIsoForest &model, const char *serialized_bytes);
|
|
1981
|
+
ISOTREE_EXPORTED
|
|
1982
|
+
size_t determine_serialized_size_additional_trees(const IsoForest &model, size_t old_ntrees);
|
|
1983
|
+
ISOTREE_EXPORTED
|
|
1984
|
+
size_t determine_serialized_size_additional_trees(const ExtIsoForest &model, size_t old_ntrees);
|
|
1985
|
+
ISOTREE_EXPORTED
|
|
1986
|
+
size_t determine_serialized_size_additional_trees(const Imputer &model, size_t old_ntrees);
|
|
1987
|
+
ISOTREE_EXPORTED
|
|
1988
|
+
size_t determine_serialized_size_additional_trees(const TreesIndexer &model, size_t old_ntrees);
|
|
1989
|
+
ISOTREE_EXPORTED
|
|
1990
|
+
void incremental_serialize_IsoForest(const IsoForest &model, char *old_bytes_reallocated);
|
|
1991
|
+
ISOTREE_EXPORTED
|
|
1992
|
+
void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, char *old_bytes_reallocated);
|
|
1993
|
+
ISOTREE_EXPORTED
|
|
1994
|
+
void incremental_serialize_Imputer(const Imputer &model, char *old_bytes_reallocated);
|
|
1995
|
+
ISOTREE_EXPORTED
|
|
1996
|
+
void incremental_serialize_Indexer(const TreesIndexer &model, char *old_bytes_reallocated);
|
|
1997
|
+
ISOTREE_EXPORTED
|
|
1998
|
+
void incremental_serialize_IsoForest(const IsoForest &model, std::string &old_bytes);
|
|
1999
|
+
ISOTREE_EXPORTED
|
|
2000
|
+
void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, std::string &old_bytes);
|
|
2001
|
+
ISOTREE_EXPORTED
|
|
2002
|
+
void incremental_serialize_Imputer(const Imputer &model, std::string &old_bytes);
|
|
2003
|
+
ISOTREE_EXPORTED
|
|
2004
|
+
void incremental_serialize_Indexer(const TreesIndexer &model, std::string &old_bytes);
|
|
2005
|
+
|
|
2006
|
+
|
|
2007
|
+
/* Translate isolation forest model into a single SQL select statement
|
|
2008
|
+
*
|
|
2009
|
+
* Parameters
|
|
2010
|
+
* ==========
|
|
2011
|
+
* - model_outputs
|
|
2012
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
2013
|
+
* if the predictions are to be made from an extended model. Can only pass one of
|
|
2014
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
2015
|
+
* - model_outputs_ext
|
|
2016
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
2017
|
+
* if the predictions are to be made from a single-variable model. Can only pass one of
|
|
2018
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
2019
|
+
* - table_from
|
|
2020
|
+
* Table name from where the columns used in the model will be selected.
|
|
2021
|
+
* - select_as
|
|
2022
|
+
* Alias to give to the outlier score in the select statement.
|
|
2023
|
+
* - numeric_colnames
|
|
2024
|
+
* Names to use for the numerical columns.
|
|
2025
|
+
* - categ_colnames
|
|
2026
|
+
* Names to use for the categorical columns.
|
|
2027
|
+
* - categ_levels
|
|
2028
|
+
* Names to use for the levels/categories of each categorical column. These will be enclosed
|
|
2029
|
+
* in single quotes.
|
|
2030
|
+
* - index1
|
|
2031
|
+
* Whether to make the node numbers start their numeration at 1 instead of 0 in the
|
|
2032
|
+
* resulting statement. If passing 'output_tree_num=false', this will only affect the
|
|
2033
|
+
* commented lines which act as delimiters. If passing 'output_tree_num=true', will also
|
|
2034
|
+
* affect the results (which will also start at 1).
|
|
2035
|
+
* - nthreads
|
|
2036
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
2037
|
+
* allocated, even if the thread does not end up being used. Ignored when not building with
|
|
2038
|
+
* OpenMP support.
|
|
2039
|
+
*
|
|
2040
|
+
* Returns
|
|
2041
|
+
* =======
|
|
2042
|
+
* A string with the corresponding SQL statement that will calculate the outlier score
|
|
2043
|
+
* from the model.
|
|
2044
|
+
*/
|
|
2045
|
+
ISOTREE_EXPORTED
|
|
2046
|
+
std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
2047
|
+
std::string &table_from, std::string &select_as,
|
|
2048
|
+
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
|
2049
|
+
std::vector<std::vector<std::string>> &categ_levels,
|
|
2050
|
+
bool index1, int nthreads);
|
|
2051
|
+
|
|
2052
|
+
|
|
2053
|
+
/* Translate model trees into SQL select statements
|
|
2054
|
+
*
|
|
2055
|
+
* Parameters
|
|
2056
|
+
* ==========
|
|
2057
|
+
* - model_outputs
|
|
2058
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
2059
|
+
* if the predictions are to be made from an extended model. Can only pass one of
|
|
2060
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
2061
|
+
* - model_outputs_ext
|
|
2062
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
2063
|
+
* if the predictions are to be made from a single-variable model. Can only pass one of
|
|
2064
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
2065
|
+
* - numeric_colnames
|
|
2066
|
+
* Names to use for the numerical columns.
|
|
2067
|
+
* - categ_colnames
|
|
2068
|
+
* Names to use for the categorical columns.
|
|
2069
|
+
* - categ_levels
|
|
2070
|
+
* Names to use for the levels/categories of each categorical column. These will be enclosed
|
|
2071
|
+
* in single quotes.
|
|
2072
|
+
* - output_tree_num
|
|
2073
|
+
* Whether to output the terminal node number instead of the separation depth at each node.
|
|
2074
|
+
* - index1
|
|
2075
|
+
* Whether to make the node numbers start their numeration at 1 instead of 0 in the
|
|
2076
|
+
* resulting statement. If passing 'output_tree_num=false', this will only affect the
|
|
2077
|
+
* commented lines which act as delimiters. If passing 'output_tree_num=true', will also
|
|
2078
|
+
* affect the results (which will also start at 1).
|
|
2079
|
+
* - single_tree
|
|
2080
|
+
* Whether to generate the select statement for a single tree of the model instead of for
|
|
2081
|
+
* all. The tree number to generate is to be passed under 'tree_num'.
|
|
2082
|
+
* - tree_num
|
|
2083
|
+
* Tree number for which to generate an SQL select statement, if passing 'single_tree=true'.
|
|
2084
|
+
* - nthreads
|
|
2085
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
2086
|
+
* allocated, even if the thread does not end up being used. Ignored when not building with
|
|
2087
|
+
* OpenMP support.
|
|
2088
|
+
*
|
|
2089
|
+
* Returns
|
|
2090
|
+
* =======
|
|
2091
|
+
* A vector containing at each element the SQL statement for the corresponding tree in the model.
|
|
2092
|
+
* If passing 'single_tree=true', will contain only one element, corresponding to the tree given
|
|
2093
|
+
* in 'tree_num'. The statements will be node-by-node, with commented-out separators using '---'
|
|
2094
|
+
* as delimiters and including the node number as part of the comment.
|
|
2095
|
+
*/
|
|
2096
|
+
ISOTREE_EXPORTED
|
|
2097
|
+
std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
2098
|
+
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
|
2099
|
+
std::vector<std::vector<std::string>> &categ_levels,
|
|
2100
|
+
bool output_tree_num, bool index1, bool single_tree, size_t tree_num,
|
|
2101
|
+
int nthreads);
|
|
2102
|
+
|
|
2103
|
+
|
|
2104
|
+
ISOTREE_EXPORTED
|
|
2105
|
+
void set_reference_points(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext, TreesIndexer *indexer,
|
|
2106
|
+
const bool with_distances,
|
|
2107
|
+
real_t *numeric_data, int *categ_data,
|
|
2108
|
+
bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
2109
|
+
real_t *Xc, sparse_ix *Xc_ind, sparse_ix *Xc_indptr,
|
|
2110
|
+
real_t *Xr, sparse_ix *Xr_ind, sparse_ix *Xr_indptr,
|
|
2111
|
+
size_t nrows, int nthreads);
|