outliertree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +674 -0
- data/NOTICE.txt +14 -0
- data/README.md +107 -0
- data/ext/outliertree/ext.cpp +260 -0
- data/ext/outliertree/extconf.rb +21 -0
- data/lib/outliertree.rb +17 -0
- data/lib/outliertree/dataset.rb +35 -0
- data/lib/outliertree/model.rb +128 -0
- data/lib/outliertree/result.rb +190 -0
- data/lib/outliertree/version.rb +3 -0
- data/vendor/outliertree/LICENSE +674 -0
- data/vendor/outliertree/README.md +155 -0
- data/vendor/outliertree/src/Makevars +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +123 -0
- data/vendor/outliertree/src/Rwrapper.cpp +1225 -0
- data/vendor/outliertree/src/cat_outlier.cpp +328 -0
- data/vendor/outliertree/src/clusters.cpp +972 -0
- data/vendor/outliertree/src/fit_model.cpp +1932 -0
- data/vendor/outliertree/src/misc.cpp +685 -0
- data/vendor/outliertree/src/outlier_tree.hpp +758 -0
- data/vendor/outliertree/src/predict.cpp +706 -0
- data/vendor/outliertree/src/split.cpp +1098 -0
- metadata +150 -0
@@ -0,0 +1,758 @@
|
|
1
|
+
/********************************************************************************************************************
|
2
|
+
* Explainable outlier detection
|
3
|
+
*
|
4
|
+
* Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
|
5
|
+
* each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
|
6
|
+
* are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
|
7
|
+
* to "predict" (will not generate a score for each observation).
|
8
|
+
* Splits are based on gain, while outlierness is based on confidence intervals.
|
9
|
+
* Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
|
10
|
+
* Cortes, David. "Explainable outlier detection through decision tree conditioning."
|
11
|
+
* arXiv preprint arXiv:2001.00636 (2020).
|
12
|
+
*
|
13
|
+
*
|
14
|
+
* Copyright 2020 David Cortes.
|
15
|
+
*
|
16
|
+
* Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
|
17
|
+
* such as R or Python.
|
18
|
+
*
|
19
|
+
* This file is part of OutlierTree.
|
20
|
+
*
|
21
|
+
* OutlierTree is free software: you can redistribute it and/or modify
|
22
|
+
* it under the terms of the GNU General Public License as published by
|
23
|
+
* the Free Software Foundation, either version 3 of the License, or
|
24
|
+
* (at your option) any later version.
|
25
|
+
*
|
26
|
+
* OutlierTree is distributed in the hope that it will be useful,
|
27
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
28
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
29
|
+
* GNU General Public License for more details.
|
30
|
+
*
|
31
|
+
* You should have received a copy of the GNU General Public License
|
32
|
+
* along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
|
33
|
+
********************************************************************************************************************/
|
34
|
+
|
35
|
+
/***********************
|
36
|
+
Standard headers
|
37
|
+
************************/
|
38
|
+
#include <iostream>
|
39
|
+
#include <vector>
|
40
|
+
#include <memory>
|
41
|
+
#include <algorithm>
|
42
|
+
#include <numeric>
|
43
|
+
#include <unordered_set>
|
44
|
+
#include <math.h>
|
45
|
+
#include <cmath>
|
46
|
+
#include <stddef.h>
|
47
|
+
#include <limits.h>
|
48
|
+
#include <stdlib.h>
|
49
|
+
#include <stddef.h>
|
50
|
+
#include <string.h>
|
51
|
+
#ifdef _OPENMP
|
52
|
+
#include <omp.h>
|
53
|
+
#endif
|
54
|
+
|
55
|
+
/************************
|
56
|
+
Short Functions
|
57
|
+
*************************/
|
58
|
+
/* Value (0 or 1) of the bit at position 'pos' in 'value'. */
/* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
#define extract_bit(value, pos) (((value) >> (pos)) & 1)

/* 2^k as a 'size_t'. */
/* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
#define pow2(k) ( ((size_t) 1) << (k) )

/* Midpoint of two numbers, and a number times itself (arguments are evaluated more than once) */
#define avg_between(lo, hi) (((lo) + (hi)) * 0.5)
#define square(v) ((v) * (v))

/* When 'isinf' / 'isnan' are not available as macros, fall back to the ones under 'std' */
#if !defined(isinf)
    #define isinf std::isinf
#endif
#if !defined(isnan)
    #define isnan std::isnan
#endif

/* True for NaN and for +/- infinity */
#define is_na_or_inf(v) (isnan(v) || isinf(v))
|
69
|
+
|
70
|
+
/*
 * 'restrict' (no-aliasing hint used for compiler optimizations): expands to '__restrict'
 * on the compilers listed below, and to nothing on any other compiler.
 */
#if !defined(__GNUG__) && !defined(__GNUC__) && !defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
    #define restrict
#else
    #define restrict __restrict
#endif
|
76
|
+
|
77
|
+
/*
 * Type to use for the counter of OpenMP-parallelized loops.
 * OpenMP before 3.0 (which is all that MSVC offered at the time of writing) does not support
 * unsigned iterators, so a signed type is used there and on Windows; otherwise plain 'size_t'.
 */
#if defined(_OPENMP) && ((_OPENMP < 200801) || defined(_WIN32) || defined(_WIN64))
    #define size_t_for long long
#else
    #define size_t_for size_t
#endif

/* Without OpenMP there is a single thread, whose number is always zero */
#if !defined(_OPENMP)
    #define omp_get_thread_num() 0
#endif
|
91
|
+
|
92
|
+
|
93
|
+
/****************************************************************
|
94
|
+
Data types and structs that are returned from this module
|
95
|
+
*****************************************************************/
|
96
|
+
/* Kind of column; 'NoType' is the placeholder used before a column has been assigned */
typedef enum ColType {
    Numeric,
    Categorical,
    Ordinal,
    NoType
} ColType;
|
97
|
+
/* Kind of condition / branch in a split (the order of the enumerators fixes their integer values) */
typedef enum SplitType {
    /* for numerical and ordinal */
    LessOrEqual,
    Greater,
    /* will try to simplify to these post-hoc if possible */
    Equal,
    NotEqual,
    /* for categoricals */
    InSubset,
    NotInSubset,
    /* one branch per category of a categorical column */
    SingleCateg,
    SubTrees,
    IsNa,
    Root
} SplitType;
|
104
|
+
/* Transformation to apply to a numeric column */
typedef enum ColTransf {
    NoTransf,
    Log,
    Exp
} ColTransf;
|
105
|
+
|
106
|
+
/*
|
107
|
+
* 1-d clusters that define homogeneous groups in which observations can be outliers.
|
108
|
+
* Note that these are associated to a tree and define one extra condition from what
|
109
|
+
* the tree already specifies. The branch they follow is stored in the cluster, unlike
|
110
|
+
* for trees in which it's always left and right branch, as these get discarded more often.
|
111
|
+
*/
|
112
|
+
typedef struct Cluster {
|
113
|
+
ColType column_type = NoType;
|
114
|
+
size_t col_num = 0; /* numer of the column by which its being split, the target column is given by index of the cluster vector */
|
115
|
+
SplitType split_type = Root;
|
116
|
+
double split_point = HUGE_VAL; /* numerical */
|
117
|
+
std::vector<char> split_subset = std::vector<char>(); /* categorical */
|
118
|
+
int split_lev = INT_MAX; /* ordinal */
|
119
|
+
bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
|
120
|
+
|
121
|
+
size_t cluster_size = 0;
|
122
|
+
double lower_lim = HUGE_VAL; /* numerical target column */
|
123
|
+
double upper_lim = -HUGE_VAL; /* numerical target column */
|
124
|
+
double perc_below = HUGE_VAL; /* numerical target column */
|
125
|
+
double perc_above = HUGE_VAL; /* numerical target column */
|
126
|
+
double display_lim_low = HUGE_VAL; /* numerical target column */
|
127
|
+
double display_lim_high = -HUGE_VAL; /* numerical target column */
|
128
|
+
double display_mean = -HUGE_VAL; /* numerical target column */
|
129
|
+
double display_sd = -HUGE_VAL; /* numerical target column */
|
130
|
+
std::vector<char> subset_common = std::vector<char>(); /* categorical or ordinal target column (=0 is common) */
|
131
|
+
double perc_in_subset = HUGE_VAL; /* categorical or ordinal target column */
|
132
|
+
double perc_next_most_comm = -HUGE_VAL; /* categorical or ordinal target column */ /* TODO */
|
133
|
+
int categ_maj = -1; /* when using majority-criterion for categorical outliers */
|
134
|
+
|
135
|
+
double cluster_mean; /* used to calculate outlier scores at prediction time */
|
136
|
+
double cluster_sd; /* used to calculate outlier scores at prediction time */
|
137
|
+
std::vector<double> score_categ; /* used to calculate outlier scores at prediction time */
|
138
|
+
|
139
|
+
/* constructors in order to use C++'s vector emplace */
|
140
|
+
|
141
|
+
/* full data (no conditions) */
|
142
|
+
Cluster(ColType column_type, SplitType split_type)
|
143
|
+
{
|
144
|
+
this->column_type = column_type;
|
145
|
+
this->split_type = split_type;
|
146
|
+
}
|
147
|
+
|
148
|
+
/* numerical split */
|
149
|
+
Cluster(ColType column_type, size_t col_num, SplitType split_type, double split_point, bool has_NA_branch = false)
|
150
|
+
{
|
151
|
+
this->column_type = column_type;
|
152
|
+
this->col_num = col_num;
|
153
|
+
this->split_type = split_type;
|
154
|
+
this->split_point = split_point;
|
155
|
+
this->has_NA_branch = has_NA_branch;
|
156
|
+
}
|
157
|
+
|
158
|
+
/* categorical split */
|
159
|
+
Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
|
160
|
+
{
|
161
|
+
this->column_type = column_type;
|
162
|
+
this->col_num = col_num;
|
163
|
+
this->split_type = split_type;
|
164
|
+
if (split_type != IsNa) this->split_subset.assign(split_subset, split_subset + ncat);
|
165
|
+
this->split_subset.shrink_to_fit();
|
166
|
+
this->has_NA_branch = has_NA_branch;
|
167
|
+
}
|
168
|
+
|
169
|
+
/* categorical split with only one level */
|
170
|
+
Cluster(size_t col_num, int cat, int ncat, bool has_NA_branch = false)
|
171
|
+
{
|
172
|
+
this->column_type = Categorical;
|
173
|
+
this->col_num = col_num;
|
174
|
+
this->has_NA_branch = has_NA_branch;
|
175
|
+
this->split_type = Equal;
|
176
|
+
this->split_lev = cat;
|
177
|
+
}
|
178
|
+
|
179
|
+
/* ordinal split */
|
180
|
+
Cluster(ColType column_type, size_t col_num, SplitType split_type, int split_lev, bool has_NA_branch = false)
|
181
|
+
{
|
182
|
+
this->column_type = column_type;
|
183
|
+
this->col_num = col_num;
|
184
|
+
this->split_type = split_type;
|
185
|
+
this->split_lev = split_lev;
|
186
|
+
this->has_NA_branch = has_NA_branch;
|
187
|
+
}
|
188
|
+
|
189
|
+
/* this is for serialization with cereal */
|
190
|
+
template<class Archive>
|
191
|
+
void serialize(Archive &archive)
|
192
|
+
{
|
193
|
+
archive(
|
194
|
+
this->column_type,
|
195
|
+
this->col_num,
|
196
|
+
this->split_type,
|
197
|
+
this->split_point,
|
198
|
+
this->split_subset,
|
199
|
+
this->split_lev,
|
200
|
+
this->has_NA_branch,
|
201
|
+
this->cluster_size,
|
202
|
+
this->lower_lim,
|
203
|
+
this->upper_lim,
|
204
|
+
this->perc_below,
|
205
|
+
this->perc_above,
|
206
|
+
this->display_lim_low,
|
207
|
+
this->display_lim_high,
|
208
|
+
this->display_mean,
|
209
|
+
this->display_sd,
|
210
|
+
this->subset_common,
|
211
|
+
this->perc_in_subset,
|
212
|
+
this->perc_next_most_comm,
|
213
|
+
this->cluster_mean,
|
214
|
+
this->cluster_sd,
|
215
|
+
this->score_categ
|
216
|
+
);
|
217
|
+
}
|
218
|
+
|
219
|
+
/* this is for serialization with both cereal and cython auto-pickle */
|
220
|
+
Cluster() = default;
|
221
|
+
|
222
|
+
} Cluster;
|
223
|
+
|
224
|
+
/*
|
225
|
+
* Trees that host the aforementioned clusters. These work as follows:
|
226
|
+
* - Each tree contains a split column and condition for splitting.
|
227
|
+
* - The trees that follow them are specified in tree_left/right/NA.
|
228
|
+
* - If the tree is dropped or not used, that branch gets an index of zero.
|
229
|
+
* - The child tree will however remember which branch it took.
|
230
|
+
* - At prediction time, the output will tell into which cluster and which tree
|
231
|
+
* is each row an outlier (if they fall into any).
|
232
|
+
* - The exact conditions are reconstructed by following the trees backwards
|
233
|
+
* (i.e. first the cluster, then deepest tree, then follow parent tree until root).
|
234
|
+
* This way, all the necessary information can be obtained without storing redundant
|
235
|
+
* info, and without needing to reconstruct the conditions as the 'predict'
|
236
|
+
* function is being called (which makes it easier to wrap into other languages).
|
237
|
+
* - At prediction time, as the observation is passed down trees, all the clusters
|
238
|
+
* in all those trees have to be tested for (so if a cluster is discarded, it can
|
239
|
+
* keep only one branch of its split in the struct).
|
240
|
+
* - As a side effect, in ordinal columns, the trees cannot be simplified to 'Equal'.
|
241
|
+
* - All of this is ignored when using 'follow_all', in which case the trees work just
|
242
|
+
* like the clusters, with an array 'all_branches' which contains all trees that have
|
243
|
+
* to be followed from one particular tree.
|
244
|
+
*/
|
245
|
+
typedef struct ClusterTree {
|
246
|
+
size_t parent = 0; /* index in a vector */
|
247
|
+
SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
|
248
|
+
std::vector<size_t> clusters = std::vector<size_t>(); /* these clusters define additional splits */
|
249
|
+
|
250
|
+
SplitType split_this_branch = Root; /* when using 'follow_all' */
|
251
|
+
std::vector<size_t> all_branches = std::vector<size_t>(); /* when using 'follow_all' */
|
252
|
+
|
253
|
+
ColType column_type = NoType;
|
254
|
+
size_t col_num = 0;
|
255
|
+
double split_point = HUGE_VAL;
|
256
|
+
std::vector<char> split_subset = std::vector<char>();
|
257
|
+
int split_lev = INT_MAX;
|
258
|
+
|
259
|
+
size_t tree_NA = 0; /* binary splits */
|
260
|
+
size_t tree_left = 0; /* binary splits */
|
261
|
+
size_t tree_right = 0; /* binary splits */
|
262
|
+
std::vector<size_t> binary_branches = std::vector<size_t>(); /* multiple splits (single category or binarized categories) */
|
263
|
+
|
264
|
+
ClusterTree(size_t parent, SplitType parent_branch)
|
265
|
+
{
|
266
|
+
this->parent = parent;
|
267
|
+
this->parent_branch = parent_branch;
|
268
|
+
}
|
269
|
+
|
270
|
+
/* when using 'follow_all' */
|
271
|
+
ClusterTree(size_t parent, size_t col_num, double split_point, SplitType split_this_branch)
|
272
|
+
{
|
273
|
+
this->parent = parent;
|
274
|
+
this->col_num = col_num;
|
275
|
+
this->column_type = Numeric;
|
276
|
+
this->split_this_branch = split_this_branch;
|
277
|
+
this->split_point = split_point;
|
278
|
+
}
|
279
|
+
|
280
|
+
ClusterTree(size_t parent, size_t col_num, int split_lev, SplitType split_this_branch)
|
281
|
+
{
|
282
|
+
this->parent = parent;
|
283
|
+
this->col_num = col_num;
|
284
|
+
this->column_type = Ordinal;
|
285
|
+
this->split_this_branch = split_this_branch;
|
286
|
+
this->split_lev = split_lev;
|
287
|
+
}
|
288
|
+
|
289
|
+
ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
|
290
|
+
{
|
291
|
+
this->parent = parent;
|
292
|
+
this->col_num = col_num;
|
293
|
+
this->column_type = Categorical;
|
294
|
+
if (split_this_branch != IsNa) {
|
295
|
+
this->split_this_branch = split_this_branch;
|
296
|
+
this->split_subset.assign(split_subset, split_subset + ncat);
|
297
|
+
this->split_subset.shrink_to_fit();
|
298
|
+
} else {
|
299
|
+
this->split_this_branch = IsNa;
|
300
|
+
}
|
301
|
+
}
|
302
|
+
|
303
|
+
ClusterTree(size_t parent, size_t col_num, int cat_chosen)
|
304
|
+
{
|
305
|
+
this->parent = parent;
|
306
|
+
this->col_num = col_num;
|
307
|
+
this->column_type = Categorical;
|
308
|
+
this->split_this_branch = Equal;
|
309
|
+
this->split_lev = cat_chosen;
|
310
|
+
}
|
311
|
+
|
312
|
+
/* this is for serialization with cereal */
|
313
|
+
template<class Archive>
|
314
|
+
void serialize(Archive &archive)
|
315
|
+
{
|
316
|
+
archive(
|
317
|
+
this->parent,
|
318
|
+
this->parent_branch,
|
319
|
+
this->clusters,
|
320
|
+
this->split_this_branch,
|
321
|
+
this->all_branches,
|
322
|
+
this->column_type,
|
323
|
+
this->col_num,
|
324
|
+
this->split_point,
|
325
|
+
this->split_subset,
|
326
|
+
this->split_lev,
|
327
|
+
this->tree_NA,
|
328
|
+
this->tree_left,
|
329
|
+
this->tree_right,
|
330
|
+
this->binary_branches
|
331
|
+
);
|
332
|
+
}
|
333
|
+
|
334
|
+
/* this is for serialization with both cereal and cython auto-pickle */
|
335
|
+
ClusterTree() = default;
|
336
|
+
|
337
|
+
} ClusterTree;
|
338
|
+
|
339
|
+
/* these are needed for prediction time, and are thus returned from the function that fits the model */
|
340
|
+
typedef struct ModelOutputs {
|
341
|
+
std::vector< std::vector<ClusterTree> > all_trees; /* clusters in which observations can be outliers, required for prediction time */
|
342
|
+
std::vector< std::vector<Cluster> > all_clusters; /* decision trees that host the clusters, required for prediction time */
|
343
|
+
std::vector<double> outlier_scores_final; /* if an outlier is flagged, this indicates its score (lower is more outlier) as an upper probability bound */
|
344
|
+
std::vector<size_t> outlier_clusters_final; /* if an outlier is flagged, this indicates the most suitable cluster in which to flag it as outlier */
|
345
|
+
std::vector<size_t> outlier_columns_final; /* if an outlier is flagged, this indicates the column that makes it an outlier */
|
346
|
+
std::vector<size_t> outlier_trees_final; /* if an outlier is flagged, this indicates the tree under which the cluster is found */
|
347
|
+
std::vector<size_t> outlier_depth_final; /* if an outlier is flagged, this indicates the split depth under which the cluster is found */
|
348
|
+
std::vector<int> outlier_decimals_distr; /* if an outlier is flagged, and it's a numeric column, this will indicate how many decimals to print for it */
|
349
|
+
std::vector<size_t> start_ix_cat_counts; /* this is to determine where to index the proportions */
|
350
|
+
std::vector<long double> prop_categ; /* this is just for statistics to show, it's not used for anything */
|
351
|
+
std::vector<ColTransf> col_transf; /* tells whether each numerical columns underwent log/exp transformations */
|
352
|
+
std::vector<double> transf_offset; /* value subtracted for log transform, mean subtracted for exp transform */
|
353
|
+
std::vector<double> sd_div; /* standard deviation with which exp-transformed columns were standardized */
|
354
|
+
std::vector<int> min_decimals_col; /* number of decimals to show for split conditions in numeric columns */
|
355
|
+
std::vector<int> ncat; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
|
356
|
+
std::vector<int> ncat_ord; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
|
357
|
+
size_t ncols_numeric; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
|
358
|
+
size_t ncols_categ; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
|
359
|
+
size_t ncols_ord; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
|
360
|
+
std::vector<double> min_outlier_any_cl; /* redundant info which speeds up prediction */
|
361
|
+
std::vector<double> max_outlier_any_cl; /* redundant info which speeds up prediction */
|
362
|
+
std::vector<std::vector<bool>> cat_outlier_any_cl; /* redundant info which speeds up prediction */
|
363
|
+
size_t max_depth; /* redundant info which speeds up prediction */
|
364
|
+
|
365
|
+
|
366
|
+
/* this is for serialization with cereal */
|
367
|
+
template<class Archive>
|
368
|
+
void serialize(Archive &archive)
|
369
|
+
{
|
370
|
+
archive(
|
371
|
+
this->all_trees,
|
372
|
+
this->all_clusters,
|
373
|
+
this->outlier_scores_final,
|
374
|
+
this->outlier_clusters_final,
|
375
|
+
this->outlier_columns_final,
|
376
|
+
this->outlier_trees_final,
|
377
|
+
this->outlier_depth_final,
|
378
|
+
this->start_ix_cat_counts,
|
379
|
+
this->prop_categ,
|
380
|
+
this->col_transf,
|
381
|
+
this->transf_offset,
|
382
|
+
this->sd_div,
|
383
|
+
this->ncat,
|
384
|
+
this->ncat_ord,
|
385
|
+
this->ncols_numeric,
|
386
|
+
this->ncols_categ,
|
387
|
+
this->ncols_ord,
|
388
|
+
this->min_outlier_any_cl,
|
389
|
+
this->max_outlier_any_cl,
|
390
|
+
this->cat_outlier_any_cl,
|
391
|
+
this->max_depth
|
392
|
+
);
|
393
|
+
}
|
394
|
+
|
395
|
+
/* this is for serialization with both cereal and cython auto-pickle */
|
396
|
+
ModelOutputs() = default;
|
397
|
+
|
398
|
+
} ModelOutputs;
|
399
|
+
|
400
|
+
/*
|
401
|
+
* Note: the vectors with proportions in these structs are supposed to be all small numbers so 'long double' is an overkill for them
|
402
|
+
* and does not make them translate into SIMD instructions in regular x86-64 CPUs, but if setting them as 'double' and then doing casts
|
403
|
+
* from/between 'double' and the 'size_t' and 'long double's of other arrays (such as in function 'find_outlier_categories'), comparisons
|
404
|
+
* such as '<=' will oftentimes fail even with small counts - this is an example that will fail when mixing the 3 types together:
|
405
|
+
* >>> (2 / (88+1)) * 0.5 <= (1 / 89) --> produces FALSE (right answer is TRUE)
|
406
|
+
* All due to decimals (in that example) right of the 10th digit, and ends up creating categorical clusters that it should not create.
|
407
|
+
* So don't change them back to regular 'double', or if necessary, change every 'long double' to 'double' too.
|
408
|
+
*/
|
409
|
+
|
410
|
+
/******************************************
|
411
|
+
Prototypes from fit_model.cpp
|
412
|
+
(This is the main module from which
|
413
|
+
the model is generated)
|
414
|
+
*******************************************/
|
415
|
+
/*
 * Declaration of the model-fitting entry point. Results are written into 'model_outputs'.
 * All the arguments from 'cols_ignore' onwards are optional and take the defaults shown below.
 * NOTE(review): the meaning of the returned 'bool' is not visible from this header - confirm in fit_model.cpp.
 */
bool fit_outliers_models(
    ModelOutputs &model_outputs,
    double *restrict numeric_data,
    size_t ncols_numeric,
    int *restrict categorical_data,
    size_t ncols_categ,
    int *restrict ncat,
    int *restrict ordinal_data,
    size_t ncols_ord,
    int *restrict ncat_ord,
    size_t nrows,
    char *restrict cols_ignore = NULL,
    int nthreads = 1,
    bool categ_as_bin = true,
    bool ord_as_bin = true,
    bool cat_bruteforce_subset = false,
    bool categ_from_maj = false,
    bool take_mid = true,
    size_t max_depth = 3,
    double max_perc_outliers = 0.01,
    size_t min_size_numeric = 25,
    size_t min_size_categ = 50,
    double min_gain = 1e-2,
    bool gain_as_pct = false,
    bool follow_all = false,
    double z_norm = 2.67,
    double z_outlier = 8.0
);
|
423
|
+
|
424
|
+
typedef struct {
|
425
|
+
|
426
|
+
std::vector<size_t> ix_arr; /* indices from the target column */
|
427
|
+
size_t st; /* chunk of the indices to take for current function calls */
|
428
|
+
size_t end; /* chunk of the indices to take for current function calls */
|
429
|
+
std::vector<double> outlier_scores; /* these hold the model outputs for 1 column before combining them */
|
430
|
+
std::vector<size_t> outlier_clusters; /* these hold the model outputs for 1 column before combining them */
|
431
|
+
std::vector<size_t> outlier_trees; /* these hold the model outputs for 1 column before combining them */
|
432
|
+
std::vector<size_t> outlier_depth; /* these hold the model outputs for 1 column before combining them */
|
433
|
+
size_t target_col_num; /* if categorical or ordinal, gets subtracted the number of numeric columns (used to index other arrays) */
|
434
|
+
long double sd_y; /* numerical only (standard deviation before splitting) */
|
435
|
+
double mean_y; /* numerical only (used to standardize numbers for extra FP precision) */
|
436
|
+
long double base_info; /* categorical and ordinal (information before splitting and before binarizing) */
|
437
|
+
long double base_info_orig; /* categorical and ordinal (information before splitting and after binarizing if needed) */
|
438
|
+
bool log_transf; /* numerical - whether the target variable underwent a logarithmic transformation */
|
439
|
+
bool exp_transf; /* numerical - whether the target variable underwent exponentiation on its Z values */
|
440
|
+
double *target_numeric_col; /* dynamic pointer */
|
441
|
+
int *target_categ_col; /* dynamic pointer */
|
442
|
+
std::vector<double> buffer_transf_y; /* if applying logarithm or exponentiation, transformed values are stored here */
|
443
|
+
std::vector<int> buffer_bin_y; /* if binarizing, transformed values are stored here */
|
444
|
+
std::vector<Cluster> *clusters; /* dynamic pointer, don't change to reference as it otherwise cannot be reassigned */
|
445
|
+
std::vector<ClusterTree> *tree; /* dynamic pointer, don't change to reference as it otherwise cannot be reassigned */
|
446
|
+
bool has_outliers; /* temporary variable from which the other two are updated */
|
447
|
+
bool lev_has_outliers; /* whether the particular depth level has outliers (if so, wil remove them at the end before new split) */
|
448
|
+
bool col_has_outliers; /* whether there's any outliers in the column (will later merge them into the outputs) */
|
449
|
+
double left_tail; /* approximate value where a long left tail ends */
|
450
|
+
double right_tail; /* approximate value where a long right tail ends */
|
451
|
+
|
452
|
+
bool col_is_bin; /* whether the target categorical/ordinal column has 2 categories or has been forcibly binarized */
|
453
|
+
long double *prop_small_this; /* dynamic pointer */
|
454
|
+
long double *prior_prob; /* dynamic pointer */
|
455
|
+
|
456
|
+
double orig_mean; /* value to reconstruct originals from exponentiated */
|
457
|
+
double orig_sd; /* value to reconstruct originals from exponentiated */
|
458
|
+
double log_minval; /* value to reconstruct originals from logarithms */
|
459
|
+
double *orig_target_col; /* column as it was before applying log/exp (dynamic pointer) */
|
460
|
+
int *untransf_target_col; /* column as it was before forcibly binarizing (dynamic pointer) */
|
461
|
+
int *temp_ptr_x; /* dynamic pointer */
|
462
|
+
|
463
|
+
std::vector<char> buffer_subset_categ_best; /* categorical split that gave the best gain */
|
464
|
+
long double this_gain; /* buffer where to store gain */
|
465
|
+
double this_split_point; /* numeric split threshold */
|
466
|
+
int this_split_lev; /* ordinal split threshold */
|
467
|
+
size_t this_split_ix; /* index at which the data is partitioned */
|
468
|
+
size_t this_split_NA; /* index at which the non-NA values start */
|
469
|
+
long double best_gain; /* buffer where to store the info of the splitting column that produced the highest gain */
|
470
|
+
ColType column_type_best; /* buffer where to store the info of the splitting column that produced the highest gain */
|
471
|
+
double split_point_best; /* buffer where to store the info of the splitting column that produced the highest gain */
|
472
|
+
int split_lev_best; /* buffer where to store the info of the splitting column that produced the highest gain */
|
473
|
+
size_t col_best; /* buffer where to store the info of the splitting column that produced the highest gain */
|
474
|
+
|
475
|
+
std::vector<long double> buffer_cat_sum; /* buffer arrays where to allocate values required by functions and not used outside them */
|
476
|
+
std::vector<long double> buffer_cat_sum_sq; /* buffer arrays where to allocate values required by functions and not used outside them */
|
477
|
+
std::vector<size_t> buffer_crosstab; /* buffer arrays where to allocate values required by functions and not used outside them */
|
478
|
+
std::vector<size_t> buffer_cat_cnt; /* buffer arrays where to allocate values required by functions and not used outside them */
|
479
|
+
std::vector<size_t> buffer_cat_sorted; /* buffer arrays where to allocate values required by functions and not used outside them */
|
480
|
+
std::vector<char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
|
481
|
+
std::vector<char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
|
482
|
+
std::vector<long double> buffer_sd; /* used for a more numerically-stable two-pass gain calculation */
|
483
|
+
|
484
|
+
bool drop_cluster; /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
|
485
|
+
bool already_split_main; /* when binarizing categoricals/ordinals, avoid attempting the same split with numerical and ordinals that take the non-binarized data */
|
486
|
+
bool target_col_is_ord; /* whether the target column is ordinal (rest is the same as in categoricals) */
|
487
|
+
int ncat_this; /* number of categories in the target column */
|
488
|
+
|
489
|
+
} Workspace;
|
490
|
+
|
491
|
+
/* info holders to shorten function call arguments */
|
492
|
+
/* Bundle of fitting parameters, passed around as one object to shorten function call arguments */
typedef struct ModelParams {
    /* handling of categorical / ordinal columns */
    bool categ_as_bin;
    bool ord_as_bin;
    bool cat_bruteforce_subset;
    bool categ_from_maj;

    bool take_mid;

    /* limits on depth, outliers and sizes */
    size_t max_depth;
    double max_perc_outliers;
    size_t min_size_numeric;
    size_t min_size_categ;

    /* gain criterion */
    double min_gain;
    bool gain_as_pct;

    bool follow_all;

    /* z thresholds */
    double z_norm;
    double z_outlier;
    double z_tail;

    std::vector<long double> prop_small; /* this is not a parameter, but a shared array determined from the parameters and data */
} ModelParams;
|
510
|
+
|
511
|
+
/* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
|
512
|
+
typedef struct {
|
513
|
+
double *restrict numeric_data; size_t ncols_numeric;
|
514
|
+
int *restrict categorical_data; size_t ncols_categ; int *restrict ncat;
|
515
|
+
int *restrict ordinal_data; size_t ncols_ord; int *restrict ncat_ord;
|
516
|
+
size_t nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ;
|
517
|
+
std::vector<size_t> cat_counts;
|
518
|
+
} InputData;
|
519
|
+
|
520
|
+
|
521
|
+
void process_numeric_col(std::vector<Cluster> &cluster_root,
|
522
|
+
std::vector<ClusterTree> &tree_root,
|
523
|
+
size_t target_col_num,
|
524
|
+
Workspace &workspace,
|
525
|
+
InputData &input_data,
|
526
|
+
ModelParams &model_params,
|
527
|
+
ModelOutputs &model_outputs);
|
528
|
+
void recursive_split_numeric(Workspace &workspace,
|
529
|
+
InputData &input_data,
|
530
|
+
ModelParams &model_params,
|
531
|
+
size_t curr_depth, bool is_NA_branch);
|
532
|
+
void process_categ_col(std::vector<Cluster> &cluster_root,
|
533
|
+
std::vector<ClusterTree> &tree_root,
|
534
|
+
size_t target_col_num, bool is_ord,
|
535
|
+
Workspace &workspace,
|
536
|
+
InputData &input_data,
|
537
|
+
ModelParams &model_params,
|
538
|
+
ModelOutputs &model_outputs);
|
539
|
+
void recursive_split_categ(Workspace &workspace,
|
540
|
+
InputData &input_data,
|
541
|
+
ModelParams &model_params,
|
542
|
+
size_t curr_depth, bool is_NA_branch);
|
543
|
+
|
544
|
+
|
545
|
+
/*******************************************
|
546
|
+
Prototypes from predict.cpp
|
547
|
+
(This is the module from which
|
548
|
+
new data can be flagged as outliers)
|
549
|
+
********************************************/
|
550
|
+
typedef struct {
|
551
|
+
double *restrict numeric_data;
|
552
|
+
int *restrict categorical_data;
|
553
|
+
int *restrict ordinal_data;
|
554
|
+
size_t nrows;
|
555
|
+
} PredictionData;
|
556
|
+
|
557
|
+
bool find_new_outliers(double *restrict numeric_data,
|
558
|
+
int *restrict categorical_data,
|
559
|
+
int *restrict ordinal_data,
|
560
|
+
size_t nrows, int nthreads, ModelOutputs &model_outputs);
|
561
|
+
bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, size_t curr_tree, size_t curr_depth,
|
562
|
+
size_t_for row, size_t_for col, bool col_is_num, double num_val_this, int cat_val_this);
|
563
|
+
bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr_depth, size_t curr_tree,
|
564
|
+
ModelOutputs &model_outputs, PredictionData &prediction_data, size_t_for row, size_t_for col,
|
565
|
+
bool col_is_num, double num_val_this, int cat_val_this);
|
566
|
+
|
567
|
+
|
568
|
+
/********************************
|
569
|
+
Prototypes from split.cpp
|
570
|
+
*********************************/
|
571
|
+
#define SD_REG 1e-5 /* Regularization for standard deviation estimation */
|
572
|
+
|
573
|
+
typedef struct { /* Count / sum / sum-of-squares triple for one branch; the same three quantities accepted by calc_sd(cnt, sum, sum_sq), which also has an overload taking a NumericBranch& */
|
574
|
+
size_t cnt; /* number of values accumulated */
|
575
|
+
long double sum; /* running sum of the accumulated values */
|
576
|
+
long double sum_sq; /* running sum of squares of the accumulated values */
|
577
|
+
} NumericBranch;
|
578
|
+
|
579
|
+
typedef struct { /* One NumericBranch accumulator per side of a split (NA / left / right); consumed by numeric_gain(NumericSplit&, tot_sd). Every member is zeroed by its default member initializer. */
|
580
|
+
NumericBranch NA_branch = {0, 0, 0}; /* accumulator for the missing-value branch; field order of the braces is cnt, sum, sum_sq */
|
581
|
+
NumericBranch left_branch = {0, 0, 0}; /* accumulator for the left branch */
|
582
|
+
NumericBranch right_branch = {0, 0, 0}; /* accumulator for the right branch */
|
583
|
+
} NumericSplit;
|
584
|
+
|
585
|
+
typedef struct { /* Per-branch category counts for a split on a categorical target; taken BY VALUE by categ_gain(CategSplit, base_info). Only the three size_* members have default initializers - the pointers, ncat and tot are indeterminate until assigned. */
|
586
|
+
size_t *restrict NA_branch; /* array of counts of the target variable's categories */
|
587
|
+
size_t *restrict left_branch; /* array of counts of the target variable's categories */
|
588
|
+
size_t *restrict right_branch; /* array of counts of the target variable's categories */
|
589
|
+
size_t ncat; /* number of categories/entries in the arrays above */
|
590
|
+
size_t tot; /* size_NA + size_left + size_right */
|
591
|
+
size_t size_NA = 0; /* number of observations in the NA branch; starts at 0 */
|
592
|
+
size_t size_left = 0; /* number of observations in the left branch; starts at 0 */
|
593
|
+
size_t size_right = 0; /* number of observations in the right branch; starts at 0 */
|
594
|
+
} CategSplit;
|
595
|
+
|
596
|
+
void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, bool onehot[]);
|
597
|
+
size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x);
|
598
|
+
void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
|
599
|
+
long double calc_sd(size_t cnt, long double sum, long double sum_sq);
|
600
|
+
long double calc_sd(NumericBranch &branch);
|
601
|
+
long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean);
|
602
|
+
long double numeric_gain(NumericSplit &split_info, long double tot_sd);
|
603
|
+
long double numeric_gain(long double tot_sd, long double info_left, long double info_right, long double info_NA, long double cnt);
|
604
|
+
long double total_info(size_t categ_counts[], size_t ncat);
|
605
|
+
long double total_info(size_t categ_counts[], size_t ncat, size_t tot);
|
606
|
+
long double total_info(size_t *restrict ix_arr, int *restrict x, size_t st, size_t end, size_t ncat, size_t *restrict buffer_cat_cnt);
|
607
|
+
long double categ_gain(CategSplit split_info, long double base_info);
|
608
|
+
long double categ_gain(size_t *restrict categ_counts, size_t ncat, size_t *restrict ncat_col, size_t maxcat, long double base_info, size_t tot);
|
609
|
+
long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size_t st, size_t st_non_na, size_t split_ix, size_t end,
|
610
|
+
size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
|
611
|
+
void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
|
612
|
+
long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
|
613
|
+
long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
|
614
|
+
void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
|
615
|
+
bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
|
616
|
+
long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
|
617
|
+
bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point);
|
618
|
+
void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
|
619
|
+
size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
|
620
|
+
bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
|
621
|
+
size_t *restrict split_left, size_t *restrict split_NA);
|
622
|
+
void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
623
|
+
size_t ncat_y, size_t ncat_x, long double base_info,
|
624
|
+
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
|
625
|
+
bool has_na, size_t min_size, long double *gain, int *split_point);
|
626
|
+
void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
627
|
+
size_t ncat_x, long double base_info,
|
628
|
+
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
|
629
|
+
bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
|
630
|
+
void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
631
|
+
size_t ncat_x, size_t ncat_y, long double base_info,
|
632
|
+
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
|
633
|
+
bool has_na, size_t min_size, long double *gain);
|
634
|
+
void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
635
|
+
size_t ncat_x, size_t ncat_y, long double base_info,
|
636
|
+
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
|
637
|
+
bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
|
638
|
+
|
639
|
+
|
640
|
+
|
641
|
+
/***********************************
|
642
|
+
Prototypes from clusters.cpp
|
643
|
+
************************************/
|
644
|
+
/* Maximum number of observations out of `n` that may be flagged as outliers when a fraction
   `perc` is tolerated: n*perc + 2*sqrt(n*perc*(1 - perc)) + 1, i.e. the form of a binomial mean
   plus two binomial standard deviations, plus one. Evaluates to long double.
   Every use of each argument is parenthesized so that expression arguments such as `a + b`
   expand with the intended precedence. Both `n` and `perc` are evaluated more than once, so
   arguments must not have side effects. */
#define calculate_max_outliers(n, perc) ( (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - (perc)) ) + (long double)1 )
|
645
|
+
/* Signed distance of `val` from `center`, expressed in units of `scale`. Each argument is evaluated exactly once. */
#define z_score(val, center, scale) ( ((val) - (center)) / (scale) )
|
646
|
+
/* Reciprocal of the squared argument: 1 / square(k). Relies on `square`, which is defined outside this section. */
#define chebyshyov_bound(k) (1.0 / square(k))
|
647
|
+
|
648
|
+
bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
649
|
+
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
650
|
+
size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
|
651
|
+
bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
|
652
|
+
double left_tail, double right_tail, double *restrict orig_x,
|
653
|
+
double max_perc_outliers, double z_norm, double z_outlier);
|
654
|
+
void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
|
655
|
+
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
656
|
+
size_t *restrict outlier_depth, Cluster &cluster,
|
657
|
+
size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
|
658
|
+
bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
|
659
|
+
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
660
|
+
size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
|
661
|
+
size_t cluster_num, size_t tree_num, size_t tree_depth,
|
662
|
+
double max_perc_outliers, double z_norm, double z_outlier,
|
663
|
+
long double *restrict perc_threshold, long double *restrict prop_prior,
|
664
|
+
size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
|
665
|
+
size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
|
666
|
+
bool *restrict drop_cluster);
|
667
|
+
void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]);
|
668
|
+
void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]);
|
669
|
+
#ifdef TEST_MODE_DEFINE
|
670
|
+
void prune_unused_trees(std::vector<ClusterTree> &trees);
|
671
|
+
#endif
|
672
|
+
bool check_tree_is_not_needed(ClusterTree &tree);
|
673
|
+
void calculate_cluster_minimums(ModelOutputs &model_outputs, size_t col);
|
674
|
+
void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size_t col_rel);
|
675
|
+
|
676
|
+
|
677
|
+
/**************************************
|
678
|
+
Prototypes from cat_outlier.cpp
|
679
|
+
***************************************/
|
680
|
+
/* Maximum number of observations that may be flagged as categorical outliers: 1 + n*perc / z_norm,
   evaluated as long double. Note: this is a heuristic - it is neither probabilistic nor based on
   provable bounds. All arguments are parenthesized so that expression arguments (e.g. `a + b` as
   the divisor) expand with the intended precedence. */
#define calculate_max_cat_outliers(n, perc, z_norm) ((long double)1 + ((n) * (perc) / (z_norm)))
|
681
|
+
void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
682
|
+
long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
|
683
|
+
double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
|
684
|
+
void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
685
|
+
long double prior_prob[], double z_outlier, char is_outlier[],
|
686
|
+
bool *found_outliers, bool *new_is_outlier, int *categ_maj);
|
687
|
+
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
|
688
|
+
char is_outlier[], double *next_most_comm);
|
689
|
+
|
690
|
+
|
691
|
+
|
692
|
+
/*************************************************
|
693
|
+
Prototypes from misc.cpp and other structs
|
694
|
+
**************************************************/
|
695
|
+
|
696
|
+
/* an inefficient workaround for coding up option 'follow_all' */
|
697
|
+
typedef struct { /* Saved copy of per-recursion working values; passed together with a Workspace& to backup_recursion_state() and restore_recursion_state(). Which Workspace member each field mirrors is not visible in this header - see misc.cpp. */
|
698
|
+
double gain_restore; /* NOTE(review): stored as double, while the split_* routines report gain through long double* - possible narrowing; confirm against the Workspace field type */
|
699
|
+
double gain_best_restore; /* NOTE(review): same double vs. long double question as gain_restore */
|
700
|
+
double split_point_restore; /* saved numeric split threshold */
|
701
|
+
int split_lev_restore; /* saved ordinal split level */
|
702
|
+
std::vector<char> split_subset_restore; /* saved categorical split subset; owned copy, so the struct is not trivially copyable */
|
703
|
+
size_t ix1_restore; /* ix1..ix4: four saved indices; their meaning is not documented here - TODO confirm in misc.cpp */
|
704
|
+
size_t ix2_restore;
|
705
|
+
size_t ix3_restore;
|
706
|
+
size_t ix4_restore;
|
707
|
+
int * temp_ptr_x; /* raw, non-owning (presumably) pointer to an int column; lifetime is managed elsewhere - confirm */
|
708
|
+
size_t col_best_restore; /* saved index of the best split column */
|
709
|
+
ColType col_type_best_rememer; /* NOTE(review): "rememer" looks like a misspelling of "remember" and, unlike its siblings, the name lacks the _restore suffix; renaming requires updating every user in the other translation units */
|
710
|
+
double split_point_best_restore; /* saved threshold of the best numeric split */
|
711
|
+
int split_lev_best_restore; /* saved level of the best ordinal split */
|
712
|
+
std::vector<char> split_subset_best_restore; /* saved subset of the best categorical split */
|
713
|
+
long double base_info_restore; /* saved base information value (long double, matching the base_info parameters of the split_* routines) */
|
714
|
+
long double base_info_orig_restore; /* saved original base information value */
|
715
|
+
double sd_y_restore; /* NOTE(review): double here, whereas split_numericx_numericy() takes sd_y as long double - confirm no precision is lost */
|
716
|
+
bool has_outliers_restore; /* saved outlier flag */
|
717
|
+
bool lev_has_outliers_restore; /* saved per-level outlier flag */
|
718
|
+
} RecursionState;
|
719
|
+
|
720
|
+
|
721
|
+
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ = 0);
|
722
|
+
void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
723
|
+
int categorical_data[], size_t ncols, size_t nrows,
|
724
|
+
bool has_NA[], bool skip_col[], int nthreads);
|
725
|
+
void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
726
|
+
size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads);
|
727
|
+
void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
|
728
|
+
size_t start_ix_cat_counts[], size_t cat_counts[],
|
729
|
+
size_t ncols, size_t nrows, double z_norm, double z_tail);
|
730
|
+
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
|
731
|
+
bool skip_col[], int min_decimals[], int nthreads);
|
732
|
+
void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central);
|
733
|
+
void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
|
734
|
+
double z_norm, double max_perc_outliers,
|
735
|
+
double *restrict buffer_x, double mean, double sd,
|
736
|
+
double *restrict left_tail, double *restrict right_tail,
|
737
|
+
bool *exp_transf, bool *log_transf);
|
738
|
+
size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t st, size_t end);
|
739
|
+
size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA);
|
740
|
+
size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end);
|
741
|
+
void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right);
|
742
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
|
743
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right);
|
744
|
+
bool check_workspace_is_allocated(Workspace &workspace);
|
745
|
+
void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
|
746
|
+
void backup_recursion_state(Workspace &workspace, RecursionState &state_backup);
|
747
|
+
void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
|
748
|
+
void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
|
749
|
+
void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col);
|
750
|
+
void set_tree_as_categorical(ClusterTree &tree, size_t col);
|
751
|
+
void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat);
|
752
|
+
void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
|
753
|
+
void forget_row_outputs(ModelOutputs &model_outputs);
|
754
|
+
void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth);
|
755
|
+
void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[]);
|
756
|
+
void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
|
757
|
+
int decimals_diff(double val1, double val2);
|
758
|
+
void dealloc_ModelOutputs(ModelOutputs &model_outputs);
|