isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
|
@@ -18,11 +18,29 @@
|
|
|
18
18
|
* [5] https://sourceforge.net/projects/iforest/
|
|
19
19
|
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
20
|
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
|
21
|
-
* [8] Cortes, David.
|
|
22
|
-
*
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM."
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
23
41
|
*
|
|
24
42
|
* BSD 2-Clause License
|
|
25
|
-
* Copyright (c)
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
26
44
|
* All rights reserved.
|
|
27
45
|
* Redistribution and use in source and binary forms, with or without
|
|
28
46
|
* modification, are permitted provided that the following conditions are met:
|
|
@@ -43,73 +61,196 @@
|
|
|
43
61
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
44
62
|
*/
|
|
45
63
|
|
|
64
|
+
#ifndef ISOTREE_H
|
|
65
|
+
#define ISOTREE_H
|
|
66
|
+
|
|
67
|
+
/* This is only used for the serialization format and might not reflect the
|
|
68
|
+
actual version of the library, do not use for anything else. */
|
|
69
|
+
#define ISOTREE_VERSION_MAJOR 0
|
|
70
|
+
#define ISOTREE_VERSION_MINOR 5
|
|
71
|
+
#define ISOTREE_VERSION_PATCH 6
|
|
72
|
+
|
|
73
|
+
/* For MinGW, needs to be defined before including any headers */
|
|
74
|
+
#if (defined(_WIN32) || defined(_WIN64)) && (SIZE_MAX >= UINT64_MAX)
|
|
75
|
+
# if defined(__GNUG__) || defined(__GNUC__)
|
|
76
|
+
# ifndef _FILE_OFFSET_BITS
|
|
77
|
+
# define _FILE_OFFSET_BITS 64
|
|
78
|
+
# endif
|
|
79
|
+
# endif
|
|
80
|
+
#endif
|
|
81
|
+
#ifdef _MSC_VER
|
|
82
|
+
# define _CRT_SECURE_NO_WARNINGS
|
|
83
|
+
#endif
|
|
84
|
+
|
|
85
|
+
|
|
46
86
|
/* Standard headers */
|
|
47
|
-
#include <
|
|
48
|
-
#include <
|
|
49
|
-
#include <
|
|
50
|
-
#include <
|
|
51
|
-
#include <
|
|
87
|
+
#include <cstddef>
|
|
88
|
+
#include <cmath>
|
|
89
|
+
#include <climits>
|
|
90
|
+
#include <cstring>
|
|
91
|
+
#include <cerrno>
|
|
52
92
|
#include <vector>
|
|
53
93
|
#include <iterator>
|
|
54
94
|
#include <numeric>
|
|
55
95
|
#include <algorithm>
|
|
56
96
|
#include <random>
|
|
57
|
-
#include <unordered_set>
|
|
58
|
-
#include <unordered_map>
|
|
59
97
|
#include <memory>
|
|
60
98
|
#include <utility>
|
|
61
99
|
#include <cstdint>
|
|
100
|
+
#include <cinttypes>
|
|
101
|
+
#include <exception>
|
|
102
|
+
#include <stdexcept>
|
|
103
|
+
#include <cassert>
|
|
104
|
+
#include <cfloat>
|
|
62
105
|
#include <iostream>
|
|
63
|
-
#
|
|
64
|
-
|
|
65
|
-
#
|
|
106
|
+
#include <string>
|
|
107
|
+
|
|
108
|
+
#ifdef _FOR_R
|
|
66
109
|
extern "C" {
|
|
67
110
|
#include <R_ext/Print.h>
|
|
68
111
|
}
|
|
69
112
|
#define printf Rprintf
|
|
70
113
|
#define fprintf(f, message) REprintf(message)
|
|
114
|
+
#elif defined(_FOR_PYTHON)
|
|
115
|
+
extern "C" void cy_warning(const char *msg);
|
|
116
|
+
#define fprintf(f, message) cy_warning(message)
|
|
117
|
+
#else
|
|
118
|
+
#include <cstdio>
|
|
119
|
+
using std::printf;
|
|
120
|
+
using std::fprintf;
|
|
71
121
|
#endif
|
|
72
122
|
#ifdef _OPENMP
|
|
73
123
|
#include <omp.h>
|
|
74
124
|
#endif
|
|
75
|
-
#ifdef
|
|
76
|
-
#include <
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
125
|
+
#ifdef _FOR_R
|
|
126
|
+
#include <Rcpp.h>
|
|
127
|
+
#endif
|
|
128
|
+
#include <csignal>
|
|
129
|
+
typedef void (*sig_t_)(int);
|
|
130
|
+
using std::signal;
|
|
131
|
+
using std::raise;
|
|
132
|
+
|
|
133
|
+
using std::size_t;
|
|
134
|
+
using std::memset;
|
|
135
|
+
using std::memcpy;
|
|
136
|
+
|
|
137
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
138
|
+
#define likely(x) __builtin_expect((bool)(x), true)
|
|
139
|
+
#define unlikely(x) __builtin_expect((bool)(x), false)
|
|
140
|
+
#else
|
|
141
|
+
#define likely(x) (x)
|
|
142
|
+
#define unlikely(x) (x)
|
|
143
|
+
#endif
|
|
144
|
+
|
|
145
|
+
#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
|
|
146
|
+
#define unexpected_error() throw std::runtime_error(\
|
|
147
|
+
std::string("Unexpected error in ") + \
|
|
148
|
+
std::string(__FILE__) + \
|
|
149
|
+
std::string(":") + \
|
|
150
|
+
std::to_string(__LINE__) + \
|
|
151
|
+
std::string(". Please open an issue in GitHub with this information, indicating the installed version of 'isotree'.\n"))
|
|
152
|
+
#else
|
|
153
|
+
#define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
|
|
81
154
|
#endif
|
|
82
155
|
|
|
83
|
-
/* By default, will use
|
|
84
|
-
#ifdef
|
|
156
|
+
/* By default, will use Xoshiro256++ or Xoshiro128++ for RNG, but can be switched to something faster */
|
|
157
|
+
#ifdef _USE_XOSHIRO
|
|
158
|
+
#include "xoshiro.hpp"
|
|
85
159
|
#if SIZE_MAX >= UINT64_MAX /* 64-bit systems or higher */
|
|
86
|
-
#define RNG_engine
|
|
160
|
+
#define RNG_engine Xoshiro::Xoshiro256PP
|
|
87
161
|
#else /* 32-bit systems and non-standard architectures */
|
|
88
|
-
#define RNG_engine
|
|
162
|
+
#define RNG_engine Xoshiro::Xoshiro128PP
|
|
163
|
+
#endif
|
|
164
|
+
#if defined(DBL_MANT_DIG) && (DBL_MANT_DIG == 53) && (FLT_RADIX == 2)
|
|
165
|
+
using Xoshiro::UniformUnitInterval;
|
|
166
|
+
using Xoshiro::UniformMinusOneToOne;
|
|
167
|
+
using Xoshiro::StandardNormalDistr;
|
|
168
|
+
#else
|
|
169
|
+
#define UniformUnitInterval std::uniform_real_distribution<double>
|
|
170
|
+
#define UniformMinusOneToOne std::uniform_real_distribution<double>
|
|
171
|
+
#define StandardNormalDistr std::normal_distribution<double>
|
|
172
|
+
#endif
|
|
173
|
+
#else
|
|
174
|
+
#if defined(_USE_MERSENNE_TWISTER)
|
|
175
|
+
#if SIZE_MAX >= UINT64_MAX /* 64-bit systems or higher */
|
|
176
|
+
#define RNG_engine std::mt19937_64
|
|
177
|
+
#else /* 32-bit systems and non-standard architectures */
|
|
178
|
+
#define RNG_engine std::mt19937
|
|
179
|
+
#endif
|
|
180
|
+
#else
|
|
181
|
+
#define RNG_engine std::default_random_engine
|
|
182
|
+
#endif
|
|
183
|
+
|
|
184
|
+
#define UniformUnitInterval std::uniform_real_distribution<double>
|
|
185
|
+
#define UniformMinusOneToOne std::uniform_real_distribution<double>
|
|
186
|
+
#define StandardNormalDistr std::normal_distribution<double>
|
|
187
|
+
#endif
|
|
188
|
+
|
|
189
|
+
/* At the time of writing, this brought a sizeable speed up compared to
|
|
190
|
+
'unordered_map' and 'unordered_set' from both GCC and CLANG.
|
|
191
|
+
But perhaps should consider others in the future, such as this:
|
|
192
|
+
https://github.com/ktprime/emhash */
|
|
193
|
+
#if defined(_USE_ROBIN_MAP)
|
|
194
|
+
#ifndef _USE_SYSTEM_ROBIN
|
|
195
|
+
#include "robinmap/include/tsl/robin_growth_policy.h"
|
|
196
|
+
#include "robinmap/include/tsl/robin_hash.h"
|
|
197
|
+
#include "robinmap/include/tsl/robin_set.h"
|
|
198
|
+
#include "robinmap/include/tsl/robin_map.h"
|
|
199
|
+
#else
|
|
200
|
+
#include "tsl/robin_growth_policy.h"
|
|
201
|
+
#include "tsl/robin_hash.h"
|
|
202
|
+
#include "tsl/robin_set.h"
|
|
203
|
+
#include "tsl/robin_map.h"
|
|
89
204
|
#endif
|
|
205
|
+
#define hashed_set tsl::robin_set
|
|
206
|
+
#define hashed_map tsl::robin_map
|
|
90
207
|
#else
|
|
91
|
-
#
|
|
208
|
+
#include <unordered_set>
|
|
209
|
+
#include <unordered_map>
|
|
210
|
+
#define hashed_set std::unordered_set
|
|
211
|
+
#define hashed_map std::unordered_map
|
|
92
212
|
#endif
|
|
93
213
|
|
|
94
214
|
/* Short functions */
|
|
95
|
-
#define ix_parent(ix) (((ix) - 1) / 2) /* integer division takes care of deciding left-right */
|
|
96
|
-
#define ix_child(ix) (2 * (ix) + 1)
|
|
97
215
|
/* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
|
|
98
216
|
#define pow2(n) ( ((size_t) 1) << (n) )
|
|
217
|
+
#define div2(n) ((n) >> 1)
|
|
218
|
+
#define mult2(n) ((n) << 1)
|
|
219
|
+
#define ix_parent(ix) (div2((ix) - (size_t)1)) /* integer division takes care of deciding left-right */
|
|
220
|
+
#define ix_child(ix) (mult2(ix) + (size_t)1)
|
|
99
221
|
#define square(x) ((x) * (x))
|
|
222
|
+
#ifndef _FOR_R
|
|
223
|
+
#if defined(__GNUC__) && (__GNUC__ >= 5)
|
|
224
|
+
#pragma GCC diagnostic push
|
|
225
|
+
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
|
226
|
+
#elif defined(__clang__) && !defined(_FOR_R)
|
|
227
|
+
#pragma clang diagnostic push
|
|
228
|
+
#pragma clang diagnostic ignored "-Wuninitialized"
|
|
229
|
+
#endif
|
|
230
|
+
#endif
|
|
100
231
|
/* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
|
|
101
232
|
#define extract_bit(number, bit) (((number) >> (bit)) & 1)
|
|
102
|
-
#ifndef
|
|
103
|
-
#
|
|
233
|
+
#ifndef _FOR_R
|
|
234
|
+
#if defined(__GNUC__) && (__GNUC__ >= 5)
|
|
235
|
+
#pragma GCC diagnostic pop
|
|
236
|
+
#elif defined(__clang__)
|
|
237
|
+
#pragma clang diagnostic pop
|
|
238
|
+
#pragma clang diagnostic push
|
|
239
|
+
#pragma clang diagnostic ignored "-Wunknown-attributes"
|
|
240
|
+
#endif
|
|
104
241
|
#endif
|
|
105
|
-
#
|
|
106
|
-
|
|
242
|
+
#define is_na_or_inf(x) (std::isnan(x) || std::isinf(x))
|
|
243
|
+
|
|
244
|
+
/* MSVC doesn't support long doubles, so this avoids unnecessarily increasing library size.
|
|
245
|
+
MinGW supports them but has issues with their computations.
|
|
246
|
+
See https://sourceforge.net/p/mingw-w64/bugs/909/ */
|
|
247
|
+
#if defined(_WIN32) && !defined(NO_LONG_DOUBLE)
|
|
248
|
+
#define NO_LONG_DOUBLE
|
|
107
249
|
#endif
|
|
108
|
-
#define is_na_or_inf(x) (isnan(x) || isinf(x))
|
|
109
250
|
|
|
110
251
|
|
|
111
252
|
/* Aliasing for compiler optimizations */
|
|
112
|
-
#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
|
|
253
|
+
#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
|
|
113
254
|
#define restrict __restrict
|
|
114
255
|
#else
|
|
115
256
|
#define restrict
|
|
@@ -118,7 +259,7 @@
|
|
|
118
259
|
/* MSVC is stuck with an OpenMP version that's 19 years old at the time of writing and does not support unsigned iterators */
|
|
119
260
|
#ifdef _OPENMP
|
|
120
261
|
#if (_OPENMP < 200801) || defined(_WIN32) || defined(_WIN64) /* OpenMP < 3.0 */
|
|
121
|
-
#define size_t_for long
|
|
262
|
+
#define size_t_for long long
|
|
122
263
|
#else
|
|
123
264
|
#define size_t_for size_t
|
|
124
265
|
#endif
|
|
@@ -126,33 +267,51 @@
|
|
|
126
267
|
#define size_t_for size_t
|
|
127
268
|
#endif
|
|
128
269
|
|
|
270
|
+
#if defined(_FOR_R) || defined(_FOR_PYTHON)
|
|
271
|
+
#define ISOTREE_EXPORTED
|
|
272
|
+
#else
|
|
273
|
+
#if defined(_WIN32)
|
|
274
|
+
#ifdef ISOTREE_COMPILE_TIME
|
|
275
|
+
#define ISOTREE_EXPORTED __declspec(dllexport)
|
|
276
|
+
#else
|
|
277
|
+
#define ISOTREE_EXPORTED __declspec(dllimport)
|
|
278
|
+
#endif
|
|
279
|
+
#else
|
|
280
|
+
#if defined(EXPLICITLTY_EXPORT_SYMBOLS) && defined(ISOTREE_COMPILE_TIME)
|
|
281
|
+
#define ISOTREE_EXPORTED [[gnu::visibility("default")]]
|
|
282
|
+
#else
|
|
283
|
+
#define ISOTREE_EXPORTED
|
|
284
|
+
#endif
|
|
285
|
+
#endif
|
|
286
|
+
#endif
|
|
287
|
+
|
|
129
288
|
|
|
130
|
-
/*
|
|
289
|
+
/* Apple at some point decided to drop OMP library and headers from its compiler distribution
|
|
131
290
|
* and to alias 'gcc' to 'clang', which work differently when given flags they cannot interpret,
|
|
132
291
|
* causing installation issues with pretty much all scientific software due to OMP headers that
|
|
133
292
|
* would normally do nothing. This piece of code is to allow compilation without OMP header. */
|
|
134
293
|
#ifndef _OPENMP
|
|
135
|
-
#define omp_get_thread_num() 0
|
|
294
|
+
#define omp_get_thread_num() (0)
|
|
136
295
|
#endif
|
|
137
296
|
|
|
297
|
+
/* Some aggregation functions will prefer more precise data types when the data is large */
|
|
298
|
+
#define THRESHOLD_LONG_DOUBLE (size_t)1e6
|
|
138
299
|
|
|
139
|
-
/*
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
300
|
+
/* Types used through the package */
|
|
301
|
+
typedef enum NewCategAction {Weighted=0, Smallest=11, Random=12} NewCategAction; /* Weighted means Impute in the extended model */
|
|
302
|
+
typedef enum MissingAction {Divide=21, Impute=22, Fail=0} MissingAction; /* Divide is only for non-extended model */
|
|
303
|
+
typedef enum ColType {Numeric=31, Categorical=32, NotUsed=0} ColType;
|
|
304
|
+
typedef enum CategSplit {SubSet=0, SingleCateg=41} CategSplit;
|
|
305
|
+
typedef enum CoefType {Uniform=61, Normal=0} CoefType; /* For extended model */
|
|
306
|
+
typedef enum UseDepthImp {Lower=71, Higher=0, Same=72} UseDepthImp; /* For NA imputation */
|
|
307
|
+
typedef enum WeighImpRows {Inverse=0, Prop=81, Flat=82} WeighImpRows; /* For NA imputation */
|
|
308
|
+
typedef enum ScoringMetric {Depth=0, Density=92, BoxedDensity=94, BoxedDensity2=96, BoxedRatio=95,
|
|
309
|
+
AdjDepth=91, AdjDensity=93} ScoringMetric;
|
|
145
310
|
|
|
311
|
+
/* These are only used internally */
|
|
312
|
+
typedef enum ColCriterion {Uniformly=0, ByRange=1, ByVar=2, ByKurt=3} ColCriterion; /* For proportional choices */
|
|
313
|
+
typedef enum GainCriterion {NoCrit=0, Averaged=1, Pooled=2, FullGain=3, DensityCrit=4} Criterion; /* For guided splits */
|
|
146
314
|
|
|
147
|
-
/* Types used through the package */
|
|
148
|
-
typedef enum NewCategAction {Weighted, Smallest, Random} NewCategAction; /* Weighted means Impute in the extended model */
|
|
149
|
-
typedef enum MissingAction {Divide, Impute, Fail} MissingAction; /* Divide is only for non-extended model */
|
|
150
|
-
typedef enum ColType {Numeric, Categorical, NotUsed} ColType;
|
|
151
|
-
typedef enum CategSplit {SubSet, SingleCateg} CategSplit;
|
|
152
|
-
typedef enum GainCriterion {Averaged, Pooled, NoCrit} Criterion; /* For guided splits */
|
|
153
|
-
typedef enum CoefType {Uniform, Normal} CoefType; /* For extended model */
|
|
154
|
-
typedef enum UseDepthImp {Lower, Higher, Same} UseDepthImp; /* For NA imputation */
|
|
155
|
-
typedef enum WeighImpRows {Inverse, Prop, Flat} WeighImpRows; /* For NA imputation */
|
|
156
315
|
|
|
157
316
|
/* Notes about new categorical action:
|
|
158
317
|
* - For single-variable case, if using 'Smallest', can then pass data at prediction time
|
|
@@ -167,10 +326,10 @@ typedef enum WeighImpRows {Inverse, Prop, Flat} WeighImpRows; /
|
|
|
167
326
|
|
|
168
327
|
/* Structs that are output (modified) from the main function */
|
|
169
328
|
typedef struct IsoTree {
|
|
170
|
-
ColType col_type = NotUsed; /* issues with uninitialized values
|
|
329
|
+
ColType col_type = NotUsed; /* issues with uninitialized values when serializing */
|
|
171
330
|
size_t col_num;
|
|
172
331
|
double num_split;
|
|
173
|
-
std::vector<char> cat_split;
|
|
332
|
+
std::vector<signed char> cat_split;
|
|
174
333
|
int chosen_cat;
|
|
175
334
|
size_t tree_left;
|
|
176
335
|
size_t tree_right;
|
|
@@ -180,29 +339,7 @@ typedef struct IsoTree {
|
|
|
180
339
|
double range_high = HUGE_VAL;
|
|
181
340
|
double remainder; /* only used for distance/similarity */
|
|
182
341
|
|
|
183
|
-
#ifdef _ENABLE_CEREAL
|
|
184
|
-
template<class Archive>
|
|
185
|
-
void serialize(Archive &archive)
|
|
186
|
-
{
|
|
187
|
-
archive(
|
|
188
|
-
this->col_type,
|
|
189
|
-
this->col_num,
|
|
190
|
-
this->num_split,
|
|
191
|
-
this->cat_split,
|
|
192
|
-
this->chosen_cat,
|
|
193
|
-
this->tree_left,
|
|
194
|
-
this->tree_right,
|
|
195
|
-
this->pct_tree_left,
|
|
196
|
-
this->score,
|
|
197
|
-
this->range_low,
|
|
198
|
-
this->range_high,
|
|
199
|
-
this->remainder
|
|
200
|
-
);
|
|
201
|
-
}
|
|
202
|
-
#endif
|
|
203
|
-
|
|
204
342
|
IsoTree() = default;
|
|
205
|
-
|
|
206
343
|
} IsoTree;
|
|
207
344
|
|
|
208
345
|
typedef struct IsoHPlane {
|
|
@@ -223,30 +360,6 @@ typedef struct IsoHPlane {
|
|
|
223
360
|
double range_high = HUGE_VAL;
|
|
224
361
|
double remainder; /* only used for distance/similarity */
|
|
225
362
|
|
|
226
|
-
#ifdef _ENABLE_CEREAL
|
|
227
|
-
template<class Archive>
|
|
228
|
-
void serialize(Archive &archive)
|
|
229
|
-
{
|
|
230
|
-
archive(
|
|
231
|
-
this->col_num,
|
|
232
|
-
this->col_type,
|
|
233
|
-
this->coef,
|
|
234
|
-
this->mean,
|
|
235
|
-
this->cat_coef,
|
|
236
|
-
this->chosen_cat,
|
|
237
|
-
this->fill_val,
|
|
238
|
-
this->fill_new,
|
|
239
|
-
this->split_point,
|
|
240
|
-
this->hplane_left,
|
|
241
|
-
this->hplane_right,
|
|
242
|
-
this->score,
|
|
243
|
-
this->range_low,
|
|
244
|
-
this->range_high,
|
|
245
|
-
this->remainder
|
|
246
|
-
);
|
|
247
|
-
}
|
|
248
|
-
#endif
|
|
249
|
-
|
|
250
363
|
IsoHPlane() = default;
|
|
251
364
|
} IsoHPlane;
|
|
252
365
|
|
|
@@ -258,25 +371,11 @@ typedef struct IsoForest {
|
|
|
258
371
|
NewCategAction new_cat_action;
|
|
259
372
|
CategSplit cat_split_type;
|
|
260
373
|
MissingAction missing_action;
|
|
374
|
+
ScoringMetric scoring_metric;
|
|
261
375
|
double exp_avg_depth;
|
|
262
376
|
double exp_avg_sep;
|
|
263
377
|
size_t orig_sample_size;
|
|
264
|
-
|
|
265
|
-
#ifdef _ENABLE_CEREAL
|
|
266
|
-
template<class Archive>
|
|
267
|
-
void serialize(Archive &archive)
|
|
268
|
-
{
|
|
269
|
-
archive(
|
|
270
|
-
this->trees,
|
|
271
|
-
this->new_cat_action,
|
|
272
|
-
this->cat_split_type,
|
|
273
|
-
this->missing_action,
|
|
274
|
-
this->exp_avg_depth,
|
|
275
|
-
this->exp_avg_sep,
|
|
276
|
-
this->orig_sample_size
|
|
277
|
-
);
|
|
278
|
-
}
|
|
279
|
-
#endif
|
|
378
|
+
bool has_range_penalty;
|
|
280
379
|
|
|
281
380
|
IsoForest() = default;
|
|
282
381
|
} IsoForest;
|
|
@@ -286,25 +385,11 @@ typedef struct ExtIsoForest {
|
|
|
286
385
|
NewCategAction new_cat_action;
|
|
287
386
|
CategSplit cat_split_type;
|
|
288
387
|
MissingAction missing_action;
|
|
388
|
+
ScoringMetric scoring_metric;
|
|
289
389
|
double exp_avg_depth;
|
|
290
390
|
double exp_avg_sep;
|
|
291
391
|
size_t orig_sample_size;
|
|
292
|
-
|
|
293
|
-
#ifdef _ENABLE_CEREAL
|
|
294
|
-
template<class Archive>
|
|
295
|
-
void serialize(Archive &archive)
|
|
296
|
-
{
|
|
297
|
-
archive(
|
|
298
|
-
this->hplanes,
|
|
299
|
-
this->new_cat_action,
|
|
300
|
-
this->cat_split_type,
|
|
301
|
-
this->missing_action,
|
|
302
|
-
this->exp_avg_depth,
|
|
303
|
-
this->exp_avg_sep,
|
|
304
|
-
this->orig_sample_size
|
|
305
|
-
);
|
|
306
|
-
}
|
|
307
|
-
#endif
|
|
392
|
+
bool has_range_penalty;
|
|
308
393
|
|
|
309
394
|
ExtIsoForest() = default;
|
|
310
395
|
} ExtIsoForest;
|
|
@@ -316,19 +401,6 @@ typedef struct ImputeNode {
|
|
|
316
401
|
std::vector<double> cat_weight;
|
|
317
402
|
size_t parent;
|
|
318
403
|
|
|
319
|
-
#ifdef _ENABLE_CEREAL
|
|
320
|
-
template<class Archive>
|
|
321
|
-
void serialize(Archive &archive)
|
|
322
|
-
{
|
|
323
|
-
archive(
|
|
324
|
-
this->num_sum,
|
|
325
|
-
this->num_weight,
|
|
326
|
-
this->cat_sum,
|
|
327
|
-
this->cat_weight,
|
|
328
|
-
this->parent
|
|
329
|
-
);
|
|
330
|
-
}
|
|
331
|
-
#endif
|
|
332
404
|
ImputeNode() = default;
|
|
333
405
|
|
|
334
406
|
ImputeNode(size_t parent)
|
|
@@ -345,30 +417,31 @@ typedef struct Imputer {
|
|
|
345
417
|
std::vector<std::vector<ImputeNode>> imputer_tree;
|
|
346
418
|
std::vector<double> col_means;
|
|
347
419
|
std::vector<int> col_modes;
|
|
420
|
+
|
|
421
|
+
Imputer() = default;
|
|
422
|
+
} Imputer;
|
|
348
423
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
this->col_means,
|
|
359
|
-
this->col_modes
|
|
360
|
-
);
|
|
361
|
-
}
|
|
362
|
-
#endif
|
|
424
|
+
typedef struct SingleTreeIndex {
|
|
425
|
+
std::vector<size_t> terminal_node_mappings;
|
|
426
|
+
std::vector<double> node_distances;
|
|
427
|
+
std::vector<double> node_depths;
|
|
428
|
+
std::vector<size_t> reference_points;
|
|
429
|
+
std::vector<size_t> reference_indptr;
|
|
430
|
+
std::vector<size_t> reference_mapping;
|
|
431
|
+
size_t n_terminal;
|
|
432
|
+
} TreeNodeIndex;
|
|
363
433
|
|
|
364
|
-
|
|
434
|
+
typedef struct TreesIndexer {
|
|
435
|
+
std::vector<SingleTreeIndex> indices;
|
|
365
436
|
|
|
366
|
-
|
|
437
|
+
TreesIndexer() = default;
|
|
438
|
+
} TreesIndexer;
|
|
367
439
|
|
|
368
440
|
|
|
369
441
|
/* Structs that are only used internally */
|
|
370
|
-
|
|
371
|
-
|
|
442
|
+
template <class real_t, class sparse_ix>
|
|
443
|
+
struct InputData {
|
|
444
|
+
real_t* numeric_data;
|
|
372
445
|
size_t ncols_numeric;
|
|
373
446
|
int* categ_data;
|
|
374
447
|
int* ncat;
|
|
@@ -376,10 +449,10 @@ typedef struct {
|
|
|
376
449
|
size_t ncols_categ;
|
|
377
450
|
size_t nrows;
|
|
378
451
|
size_t ncols_tot;
|
|
379
|
-
|
|
452
|
+
real_t* sample_weights;
|
|
380
453
|
bool weight_as_sample;
|
|
381
|
-
|
|
382
|
-
|
|
454
|
+
real_t* col_weights;
|
|
455
|
+
real_t* Xc; /* only for sparse matrices */
|
|
383
456
|
sparse_ix* Xc_ind; /* only for sparse matrices */
|
|
384
457
|
sparse_ix* Xc_indptr; /* only for sparse matrices */
|
|
385
458
|
size_t log2_n; /* only when using weights for sampling */
|
|
@@ -387,37 +460,58 @@ typedef struct {
|
|
|
387
460
|
std::vector<double> btree_weights_init; /* only when using weights for sampling */
|
|
388
461
|
std::vector<char> has_missing; /* only used when producing missing imputations on-the-fly */
|
|
389
462
|
size_t n_missing; /* only used when producing missing imputations on-the-fly */
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
double*
|
|
463
|
+
void* preinitialized_col_sampler; /* only when using column weights */
|
|
464
|
+
double* range_low; /* only when calculating variable ranges or boxed densities with no sub-sampling */
|
|
465
|
+
double* range_high; /* only when calculating variable ranges or boxed densities with no sub-sampling */
|
|
466
|
+
int* ncat_; /* only when calculating boxed densities with no sub-sampling */
|
|
467
|
+
std::vector<double> all_kurtoses; /* only when using 'prob_pick_col_by_kurtosis' or mixing 'weigh_by_kurt' with 'prob_pick_col*' with no sub-sampling */
|
|
468
|
+
|
|
469
|
+
std::vector<double> X_row_major; /* created by this library, only used when calculating full gain */
|
|
470
|
+
std::vector<double> Xr; /* created by this library, only used when calculating full gain */
|
|
471
|
+
std::vector<size_t> Xr_ind; /* created by this library, only used when calculating full gain */
|
|
472
|
+
std::vector<size_t> Xr_indptr; /* created by this library, only used when calculating full gain */
|
|
473
|
+
};
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
template <class real_t, class sparse_ix>
|
|
477
|
+
struct PredictionData {
|
|
478
|
+
real_t* numeric_data;
|
|
395
479
|
int* categ_data;
|
|
396
480
|
size_t nrows;
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
sparse_ix*
|
|
402
|
-
sparse_ix*
|
|
403
|
-
|
|
481
|
+
bool is_col_major;
|
|
482
|
+
size_t ncols_numeric; /* only required for row-major data */
|
|
483
|
+
size_t ncols_categ; /* only required for row-major data */
|
|
484
|
+
real_t* Xc; /* only for sparse matrices */
|
|
485
|
+
sparse_ix* Xc_ind; /* only for sparse matrices */
|
|
486
|
+
sparse_ix* Xc_indptr; /* only for sparse matrices */
|
|
487
|
+
real_t* Xr; /* only for sparse matrices */
|
|
488
|
+
sparse_ix* Xr_ind; /* only for sparse matrices */
|
|
489
|
+
sparse_ix* Xr_indptr; /* only for sparse matrices */
|
|
490
|
+
};
|
|
404
491
|
|
|
405
492
|
typedef struct {
|
|
406
493
|
bool with_replacement;
|
|
407
494
|
size_t sample_size;
|
|
408
495
|
size_t ntrees;
|
|
496
|
+
size_t ncols_per_tree;
|
|
409
497
|
size_t max_depth;
|
|
410
498
|
bool penalize_range;
|
|
499
|
+
bool standardize_data;
|
|
411
500
|
uint64_t random_seed;
|
|
412
501
|
bool weigh_by_kurt;
|
|
413
502
|
double prob_pick_by_gain_avg;
|
|
414
|
-
double prob_split_by_gain_avg;
|
|
415
503
|
double prob_pick_by_gain_pl;
|
|
416
|
-
double
|
|
504
|
+
double prob_pick_by_full_gain;
|
|
505
|
+
double prob_pick_by_dens;
|
|
506
|
+
double prob_pick_col_by_range;
|
|
507
|
+
double prob_pick_col_by_var;
|
|
508
|
+
double prob_pick_col_by_kurt;
|
|
417
509
|
double min_gain;
|
|
418
510
|
CategSplit cat_split_type;
|
|
419
511
|
NewCategAction new_cat_action;
|
|
420
512
|
MissingAction missing_action;
|
|
513
|
+
ScoringMetric scoring_metric;
|
|
514
|
+
bool fast_bratio;
|
|
421
515
|
bool all_perm;
|
|
422
516
|
|
|
423
517
|
size_t ndim; /* only for extended model */
|
|
@@ -431,16 +525,17 @@ typedef struct {
|
|
|
431
525
|
|
|
432
526
|
UseDepthImp depth_imp; /* only when building NA imputer */
|
|
433
527
|
WeighImpRows weigh_imp_rows; /* only when building NA imputer */
|
|
434
|
-
size_t min_imp_obs;
|
|
528
|
+
size_t min_imp_obs; /* only when building NA imputer */
|
|
435
529
|
} ModelParams;
|
|
436
530
|
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
std::vector<
|
|
440
|
-
std::vector<
|
|
441
|
-
std::vector<
|
|
442
|
-
std::vector<
|
|
443
|
-
std::vector<
|
|
531
|
+
template <class sparse_ix, class ldouble_safe>
|
|
532
|
+
struct ImputedData {
|
|
533
|
+
std::vector<ldouble_safe> num_sum;
|
|
534
|
+
std::vector<ldouble_safe> num_weight;
|
|
535
|
+
std::vector<std::vector<ldouble_safe>> cat_sum;
|
|
536
|
+
std::vector<ldouble_safe> cat_weight;
|
|
537
|
+
std::vector<ldouble_safe> sp_num_sum;
|
|
538
|
+
std::vector<ldouble_safe> sp_num_weight;
|
|
444
539
|
|
|
445
540
|
std::vector<size_t> missing_num;
|
|
446
541
|
std::vector<size_t> missing_cat;
|
|
@@ -451,56 +546,288 @@ typedef struct ImputedData {
|
|
|
451
546
|
|
|
452
547
|
ImputedData() {};
|
|
453
548
|
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
549
|
+
template <class InputData>
|
|
550
|
+
ImputedData(InputData &input_data, size_t row)
|
|
551
|
+
{
|
|
552
|
+
initialize_impute_calc(*this, input_data, row);
|
|
553
|
+
}
|
|
457
554
|
|
|
458
|
-
|
|
555
|
+
};
|
|
556
|
+
|
|
557
|
+
/* This class provides efficient methods for sampling columns at random,
|
|
558
|
+
given that at a given node a column might no longer be splittable,
|
|
559
|
+
and when that happens, it also makes it non-splittable in any children
|
|
560
|
+
node from there onwards. The idea is to provide efficient methods for
|
|
561
|
+
passing the state from a parent node to a left node and then restore
|
|
562
|
+
the state before going for the right node.
|
|
563
|
+
It can be used in 3 modes:
|
|
564
|
+
- As a uniform sampler with replacement.
|
|
565
|
+
- As a weighted sampler with replacement.
|
|
566
|
+
- As an array that keeps track of which columns are still splittable. */
|
|
567
|
+
template <class ldouble_safe>
|
|
568
|
+
class ColumnSampler
|
|
569
|
+
{
|
|
570
|
+
public:
|
|
571
|
+
std::vector<size_t> col_indices;
|
|
572
|
+
std::vector<double> tree_weights;
|
|
573
|
+
size_t curr_pos;
|
|
574
|
+
size_t curr_col;
|
|
575
|
+
size_t last_given;
|
|
576
|
+
size_t n_cols;
|
|
577
|
+
size_t tree_levels;
|
|
578
|
+
size_t offset;
|
|
579
|
+
size_t n_dropped;
|
|
580
|
+
template <class real_t>
|
|
581
|
+
void initialize(real_t weights[], size_t n_cols);
|
|
582
|
+
void initialize(size_t n_cols);
|
|
583
|
+
void drop_weights();
|
|
584
|
+
void leave_m_cols(size_t m, RNG_engine &rnd_generator);
|
|
585
|
+
bool sample_col(size_t &col, RNG_engine &rnd_generator);
|
|
586
|
+
void prepare_full_pass(); /* when passing through all columns */
|
|
587
|
+
bool sample_col(size_t &col); /* when passing through all columns */
|
|
588
|
+
void drop_col(size_t col, size_t nobs_left);
|
|
589
|
+
void drop_col(size_t col);
|
|
590
|
+
void drop_from_tail(size_t col);
|
|
591
|
+
void shuffle_remainder(RNG_engine &rnd_generator);
|
|
592
|
+
bool has_weights();
|
|
593
|
+
size_t get_remaining_cols();
|
|
594
|
+
void get_array_remaining_cols(std::vector<size_t> &restrict cols);
|
|
595
|
+
template <class other_t>
|
|
596
|
+
ColumnSampler& operator=(const ColumnSampler<other_t> &other);
|
|
597
|
+
ColumnSampler() = default;
|
|
598
|
+
};
|
|
599
|
+
|
|
600
|
+
template <class ldouble_safe, class real_t>
|
|
601
|
+
class DensityCalculator
|
|
602
|
+
{
|
|
603
|
+
public:
|
|
604
|
+
std::vector<ldouble_safe> multipliers;
|
|
605
|
+
double xmin;
|
|
606
|
+
double xmax;
|
|
607
|
+
std::vector<size_t> counts;
|
|
608
|
+
int n_present;
|
|
609
|
+
int n_left;
|
|
610
|
+
std::vector<double> box_low;
|
|
611
|
+
std::vector<double> box_high;
|
|
612
|
+
std::vector<double> queue_box;
|
|
613
|
+
bool fast_bratio;
|
|
614
|
+
std::vector<ldouble_safe> ranges;
|
|
615
|
+
std::vector<int> ncat;
|
|
616
|
+
std::vector<int> queue_ncat;
|
|
617
|
+
std::vector<int> ncat_orig;
|
|
618
|
+
std::vector<double> vals_ext_box;
|
|
619
|
+
std::vector<double> queue_ext_box;
|
|
620
|
+
|
|
621
|
+
void initialize(size_t max_depth, int max_categ, bool reserve_counts, ScoringMetric scoring_metric);
|
|
622
|
+
template <class InputData>
|
|
623
|
+
#ifndef _FOR_R
|
|
624
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
625
|
+
#endif
|
|
626
|
+
void initialize_bdens(const InputData &input_data,
|
|
627
|
+
const ModelParams &model_params,
|
|
628
|
+
std::vector<size_t> &ix_arr,
|
|
629
|
+
ColumnSampler<ldouble_safe> &col_sampler);
|
|
630
|
+
template <class InputData>
|
|
631
|
+
void initialize_bdens_ext(const InputData &input_data,
|
|
632
|
+
const ModelParams &model_params,
|
|
633
|
+
std::vector<size_t> &ix_arr,
|
|
634
|
+
ColumnSampler<ldouble_safe> &col_sampler,
|
|
635
|
+
bool col_sampler_is_fresh);
|
|
636
|
+
#ifndef _FOR_R
|
|
637
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
638
|
+
#endif
|
|
639
|
+
void push_density(double xmin, double xmax, double split_point);
|
|
640
|
+
void push_density(size_t counts[], int ncat);
|
|
641
|
+
void push_density(int n_left, int n_present);
|
|
642
|
+
void push_density(int n_present);
|
|
643
|
+
void push_density();
|
|
644
|
+
void push_adj(double xmin, double xmax, double split_point, double pct_tree_left, ScoringMetric scoring_metric);
|
|
645
|
+
void push_adj(signed char *restrict categ_present, size_t *restrict counts, int ncat, ScoringMetric scoring_metric);
|
|
646
|
+
void push_adj(size_t *restrict counts, int ncat, int chosen_cat, ScoringMetric scoring_metric);
|
|
647
|
+
void push_adj(double pct_tree_left, ScoringMetric scoring_metric);
|
|
648
|
+
void push_bdens(double split_point, size_t col);
|
|
649
|
+
void push_bdens(int ncat_branch_left, size_t col);
|
|
650
|
+
void push_bdens(const std::vector<signed char> &cat_split, size_t col);
|
|
651
|
+
#ifndef _FOR_R
|
|
652
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
653
|
+
#endif
|
|
654
|
+
void push_bdens_fast_route(double split_point, size_t col);
|
|
655
|
+
void push_bdens_internal(double split_point, size_t col);
|
|
656
|
+
#ifndef _FOR_R
|
|
657
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
658
|
+
#endif
|
|
659
|
+
void push_bdens_fast_route(int ncat_branch_left, size_t col);
|
|
660
|
+
void push_bdens_internal(int ncat_branch_left, size_t col);
|
|
661
|
+
#ifndef _FOR_R
|
|
662
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
663
|
+
#endif
|
|
664
|
+
void push_bdens_fast_route(const std::vector<signed char> &cat_split, size_t col);
|
|
665
|
+
void push_bdens_internal(const std::vector<signed char> &cat_split, size_t col);
|
|
666
|
+
#ifndef _FOR_R
|
|
667
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
668
|
+
#endif
|
|
669
|
+
void push_bdens_ext(const IsoHPlane &hplane, const ModelParams &model_params);
|
|
670
|
+
void pop();
|
|
671
|
+
void pop_right();
|
|
672
|
+
void pop_bdens(size_t col);
|
|
673
|
+
void pop_bdens_right(size_t col);
|
|
674
|
+
void pop_bdens_cat(size_t col);
|
|
675
|
+
void pop_bdens_cat_right(size_t col);
|
|
676
|
+
void pop_bdens_fast_route(size_t col);
|
|
677
|
+
void pop_bdens_internal(size_t col);
|
|
678
|
+
void pop_bdens_right_fast_route(size_t col);
|
|
679
|
+
void pop_bdens_right_internal(size_t col);
|
|
680
|
+
void pop_bdens_cat_fast_route(size_t col);
|
|
681
|
+
void pop_bdens_cat_internal(size_t col);
|
|
682
|
+
void pop_bdens_cat_right_fast_route(size_t col);
|
|
683
|
+
void pop_bdens_cat_right_internal(size_t col);
|
|
684
|
+
void pop_bdens_ext();
|
|
685
|
+
void pop_bdens_ext_right();
|
|
686
|
+
#ifndef _FOR_R
|
|
687
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
688
|
+
#endif
|
|
689
|
+
double calc_density(ldouble_safe remainder, size_t sample_size);
|
|
690
|
+
ldouble_safe calc_adj_depth();
|
|
691
|
+
double calc_adj_density();
|
|
692
|
+
#ifndef _FOR_R
|
|
693
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
694
|
+
#endif
|
|
695
|
+
ldouble_safe calc_bratio_log();
|
|
696
|
+
#ifndef _FOR_R
|
|
697
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
698
|
+
#endif
|
|
699
|
+
ldouble_safe calc_bratio_inv_log();
|
|
700
|
+
#ifndef _FOR_R
|
|
701
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
702
|
+
#endif
|
|
703
|
+
double calc_bratio();
|
|
704
|
+
#ifndef _FOR_R
|
|
705
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
706
|
+
#endif
|
|
707
|
+
double calc_bdens(ldouble_safe remainder, size_t sample_size);
|
|
708
|
+
#ifndef _FOR_R
|
|
709
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
710
|
+
#endif
|
|
711
|
+
double calc_bdens2(ldouble_safe remainder, size_t sample_size);
|
|
712
|
+
#ifndef _FOR_R
|
|
713
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
714
|
+
#endif
|
|
715
|
+
ldouble_safe calc_bratio_log_ext();
|
|
716
|
+
#ifndef _FOR_R
|
|
717
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
718
|
+
#endif
|
|
719
|
+
double calc_bratio_ext();
|
|
720
|
+
#ifndef _FOR_R
|
|
721
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
722
|
+
#endif
|
|
723
|
+
double calc_bdens_ext(ldouble_safe remainder, size_t sample_size);
|
|
724
|
+
void save_range(double xmin, double xmax);
|
|
725
|
+
void restore_range(double &restrict xmin, double &restrict xmax);
|
|
726
|
+
void save_counts(size_t *restrict cat_counts, int ncat);
|
|
727
|
+
void save_n_present_and_left(signed char *restrict split_left, int ncat);
|
|
728
|
+
void save_n_present(size_t *restrict cat_counts, int ncat);
|
|
729
|
+
};
|
|
730
|
+
|
|
731
|
+
template <class ldouble_safe, class real_t>
|
|
732
|
+
class SingleNodeColumnSampler
|
|
733
|
+
{
|
|
734
|
+
public:
|
|
735
|
+
double *restrict weights_orig;
|
|
736
|
+
std::vector<bool> inifinite_weights;
|
|
737
|
+
ldouble_safe cumw;
|
|
738
|
+
size_t n_inf;
|
|
739
|
+
size_t *restrict col_indices;
|
|
740
|
+
size_t curr_pos;
|
|
741
|
+
bool using_tree;
|
|
742
|
+
|
|
743
|
+
bool backup_weights;
|
|
744
|
+
std::vector<double> weights_own;
|
|
745
|
+
size_t n_left;
|
|
746
|
+
|
|
747
|
+
std::vector<double> tree_weights;
|
|
748
|
+
size_t offset;
|
|
749
|
+
size_t tree_levels;
|
|
750
|
+
std::vector<double> used_weights;
|
|
751
|
+
std::vector<size_t> mapped_indices;
|
|
752
|
+
std::vector<size_t> mapped_inf_indices;
|
|
753
|
+
|
|
754
|
+
bool initialize(
|
|
755
|
+
double *restrict weights,
|
|
756
|
+
std::vector<size_t> *col_indices,
|
|
757
|
+
size_t curr_pos,
|
|
758
|
+
size_t n_sample,
|
|
759
|
+
bool backup_weights
|
|
760
|
+
);
|
|
761
|
+
|
|
762
|
+
bool sample_col(size_t &col_chosen, RNG_engine &rnd_generator);
|
|
763
|
+
|
|
764
|
+
void backup(SingleNodeColumnSampler<ldouble_safe, real_t> &other, size_t ncols_tot);
|
|
765
|
+
|
|
766
|
+
void restore(const SingleNodeColumnSampler<ldouble_safe, real_t> &other);
|
|
767
|
+
};
|
|
768
|
+
|
|
769
|
+
template <class ImputedData, class ldouble_safe, class real_t>
|
|
770
|
+
struct WorkerMemory {
|
|
459
771
|
std::vector<size_t> ix_arr;
|
|
460
772
|
std::vector<size_t> ix_all;
|
|
461
773
|
RNG_engine rnd_generator;
|
|
462
|
-
|
|
463
|
-
std::uniform_real_distribution<double> rbin;
|
|
774
|
+
UniformUnitInterval rbin;
|
|
464
775
|
size_t st;
|
|
465
776
|
size_t end;
|
|
466
777
|
size_t st_NA;
|
|
467
778
|
size_t end_NA;
|
|
468
779
|
size_t split_ix;
|
|
469
|
-
|
|
470
|
-
std::vector<double> weights_arr;
|
|
780
|
+
hashed_map<size_t, double> weights_map;
|
|
781
|
+
std::vector<double> weights_arr; /* when not ignoring NAs and when using weights as density */
|
|
782
|
+
bool changed_weights; /* when using 'missing_action'='Divide' or density weights */
|
|
471
783
|
double xmin;
|
|
472
784
|
double xmax;
|
|
473
|
-
size_t npresent;
|
|
785
|
+
size_t npresent; /* 'npresent' and 'ncols_tried' are used interchangeably and for unrelated things */
|
|
474
786
|
bool unsplittable;
|
|
475
787
|
std::vector<bool> is_repeated;
|
|
476
|
-
std::vector<char>
|
|
477
|
-
size_t ncols_tried;
|
|
788
|
+
std::vector<signed char> categs;
|
|
789
|
+
size_t ncols_tried; /* 'npresent' and 'ncols_tried' are used interchangeably and for unrelated things */
|
|
478
790
|
int ncat_tried;
|
|
479
|
-
std::vector<
|
|
480
|
-
|
|
481
|
-
|
|
791
|
+
std::vector<double> btree_weights; /* only when using weights for sampling */
|
|
792
|
+
ColumnSampler<ldouble_safe> col_sampler; /* columns can get eliminated, keep a copy for each thread */
|
|
793
|
+
SingleNodeColumnSampler<ldouble_safe, real_t> node_col_sampler;
|
|
794
|
+
SingleNodeColumnSampler<ldouble_safe, real_t> node_col_sampler_backup;
|
|
482
795
|
|
|
483
796
|
/* for split criterion */
|
|
484
797
|
std::vector<double> buffer_dbl;
|
|
485
798
|
std::vector<size_t> buffer_szt;
|
|
486
|
-
std::vector<char>
|
|
799
|
+
std::vector<signed char> buffer_chr;
|
|
487
800
|
double prob_split_type;
|
|
801
|
+
ColCriterion col_criterion;
|
|
488
802
|
GainCriterion criterion;
|
|
489
803
|
double this_gain;
|
|
490
804
|
double this_split_point;
|
|
491
805
|
int this_categ;
|
|
492
|
-
std::vector<char>
|
|
806
|
+
std::vector<signed char> this_split_categ;
|
|
493
807
|
bool determine_split;
|
|
808
|
+
std::vector<double> imputed_x_buffer;
|
|
809
|
+
double saved_xmedian;
|
|
810
|
+
double best_xmedian;
|
|
811
|
+
int saved_cat_mode;
|
|
812
|
+
int best_cat_mode;
|
|
813
|
+
std::vector<size_t> col_indices; /* only for full gain calculation */
|
|
814
|
+
|
|
815
|
+
/* for weighted column choices */
|
|
816
|
+
std::vector<double> node_col_weights;
|
|
817
|
+
std::vector<double> saved_stat1;
|
|
818
|
+
std::vector<double> saved_stat2;
|
|
819
|
+
bool has_saved_stats;
|
|
820
|
+
double* tree_kurtoses; /* only when mixing 'weigh_by_kurt' with 'prob_pick_col*' */
|
|
494
821
|
|
|
495
822
|
/* for the extended model */
|
|
496
823
|
size_t ntry;
|
|
497
824
|
size_t ntaken;
|
|
498
825
|
size_t ntaken_best;
|
|
499
|
-
|
|
500
|
-
|
|
826
|
+
size_t ntried;
|
|
827
|
+
bool try_all;
|
|
828
|
+
size_t col_chosen; /* also used as placeholder in the single-variable model */
|
|
501
829
|
ColType col_type;
|
|
502
830
|
double ext_sd;
|
|
503
|
-
std::vector<size_t> cols_shuffled;
|
|
504
831
|
std::vector<double> comb_val;
|
|
505
832
|
std::vector<size_t> col_take;
|
|
506
833
|
std::vector<ColType> col_take_type;
|
|
@@ -510,9 +837,10 @@ typedef struct {
|
|
|
510
837
|
std::vector<double> ext_fill_val;
|
|
511
838
|
std::vector<double> ext_fill_new;
|
|
512
839
|
std::vector<int> chosen_cat;
|
|
513
|
-
std::vector<std::vector<double>>
|
|
514
|
-
|
|
515
|
-
|
|
840
|
+
std::vector<std::vector<double>> ext_cat_coef;
|
|
841
|
+
UniformMinusOneToOne coef_unif;
|
|
842
|
+
StandardNormalDistr coef_norm;
|
|
843
|
+
std::vector<double> sample_weights; /* when using weights and split criterion */
|
|
516
844
|
|
|
517
845
|
/* for similarity/distance calculations */
|
|
518
846
|
std::vector<double> tmat_sep;
|
|
@@ -522,9 +850,11 @@ typedef struct {
|
|
|
522
850
|
|
|
523
851
|
/* when imputing NAs on-the-fly */
|
|
524
852
|
std::vector<ImputedData> impute_vec;
|
|
525
|
-
|
|
853
|
+
hashed_map<size_t, ImputedData> impute_map;
|
|
526
854
|
|
|
527
|
-
|
|
855
|
+
/* for non-depth scoring metric */
|
|
856
|
+
DensityCalculator<ldouble_safe, real_t> density_calculator;
|
|
857
|
+
};
|
|
528
858
|
|
|
529
859
|
typedef struct WorkerForSimilarity {
|
|
530
860
|
std::vector<size_t> ix_arr;
|
|
@@ -538,55 +868,138 @@ typedef struct WorkerForSimilarity {
|
|
|
538
868
|
bool assume_full_distr; /* doesn't need to have one copy per worker */
|
|
539
869
|
} WorkerForSimilarity;
|
|
540
870
|
|
|
541
|
-
typedef struct {
|
|
871
|
+
typedef struct WorkerForPredictCSC {
|
|
872
|
+
std::vector<size_t> ix_arr;
|
|
873
|
+
size_t st;
|
|
874
|
+
size_t end;
|
|
875
|
+
std::vector<double> comb_val;
|
|
876
|
+
std::vector<double> weights_arr;
|
|
877
|
+
std::vector<double> depths;
|
|
878
|
+
} WorkerForPredictCSC;
|
|
879
|
+
|
|
880
|
+
class RecursionState {
|
|
881
|
+
public:
|
|
542
882
|
size_t st;
|
|
543
883
|
size_t st_NA;
|
|
544
884
|
size_t end_NA;
|
|
545
885
|
size_t split_ix;
|
|
546
886
|
size_t end;
|
|
887
|
+
size_t sampler_pos;
|
|
888
|
+
size_t n_dropped;
|
|
889
|
+
bool changed_weights;
|
|
890
|
+
bool full_state;
|
|
547
891
|
std::vector<size_t> ix_arr;
|
|
548
892
|
std::vector<bool> cols_possible;
|
|
893
|
+
std::vector<double> col_sampler_weights;
|
|
549
894
|
std::unique_ptr<double[]> weights_arr;
|
|
550
|
-
|
|
551
|
-
|
|
895
|
+
|
|
896
|
+
RecursionState() = default;
|
|
897
|
+
template <class WorkerMemory>
|
|
898
|
+
RecursionState(WorkerMemory &workspace, bool full_state);
|
|
899
|
+
template <class WorkerMemory>
|
|
900
|
+
void restore_state(WorkerMemory &workspace);
|
|
901
|
+
};
|
|
552
902
|
|
|
553
903
|
/* Function prototypes */
|
|
554
904
|
|
|
555
905
|
/* fit_model.cpp */
|
|
556
|
-
|
|
906
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
907
|
+
int fit_iforest_internal(
|
|
908
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
909
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
910
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
911
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
912
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
913
|
+
real_t sample_weights[], bool with_replacement, bool weight_as_sample,
|
|
914
|
+
size_t nrows, size_t sample_size, size_t ntrees,
|
|
915
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
916
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
917
|
+
ScoringMetric scoring_metric, bool fast_bratio,
|
|
918
|
+
bool standardize_dist, double tmat[],
|
|
919
|
+
double output_depths[], bool standardize_depth,
|
|
920
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
921
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
922
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
923
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
924
|
+
double prob_pick_col_by_kurt,
|
|
925
|
+
double min_gain, MissingAction missing_action,
|
|
926
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
927
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
928
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
|
929
|
+
uint64_t random_seed, int nthreads);
|
|
930
|
+
template <class real_t, class sparse_ix>
|
|
557
931
|
int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
558
|
-
|
|
932
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
559
933
|
int categ_data[], size_t ncols_categ, int ncat[],
|
|
560
|
-
|
|
934
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
561
935
|
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
562
|
-
|
|
563
|
-
size_t nrows, size_t sample_size, size_t ntrees,
|
|
564
|
-
|
|
936
|
+
real_t sample_weights[], bool with_replacement, bool weight_as_sample,
|
|
937
|
+
size_t nrows, size_t sample_size, size_t ntrees,
|
|
938
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
939
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
940
|
+
ScoringMetric scoring_metric, bool fast_bratio,
|
|
565
941
|
bool standardize_dist, double tmat[],
|
|
566
942
|
double output_depths[], bool standardize_depth,
|
|
567
|
-
|
|
568
|
-
double
|
|
569
|
-
double
|
|
943
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
944
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
945
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
946
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
947
|
+
double prob_pick_col_by_kurt,
|
|
570
948
|
double min_gain, MissingAction missing_action,
|
|
571
949
|
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
572
950
|
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
573
951
|
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
|
574
|
-
uint64_t random_seed, bool
|
|
952
|
+
uint64_t random_seed, bool use_long_double, int nthreads);
|
|
953
|
+
template <class real_t, class sparse_ix>
|
|
575
954
|
int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
576
|
-
|
|
955
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
577
956
|
int categ_data[], size_t ncols_categ, int ncat[],
|
|
578
|
-
|
|
957
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
579
958
|
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
959
|
+
real_t sample_weights[], size_t nrows,
|
|
960
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
961
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
962
|
+
bool fast_bratio,
|
|
963
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
964
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
965
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
966
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
967
|
+
double prob_pick_col_by_kurt,
|
|
585
968
|
double min_gain, MissingAction missing_action,
|
|
586
969
|
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
587
970
|
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
|
588
|
-
bool all_perm,
|
|
971
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
972
|
+
TreesIndexer *indexer,
|
|
973
|
+
real_t ref_numeric_data[], int ref_categ_data[],
|
|
974
|
+
bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
|
|
975
|
+
real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
|
|
976
|
+
uint64_t random_seed, bool use_long_double);
|
|
977
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
978
|
+
int add_tree_internal(
|
|
979
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
980
|
+
real_t numeric_data[], size_t ncols_numeric,
|
|
981
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
982
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
983
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
984
|
+
real_t sample_weights[], size_t nrows,
|
|
985
|
+
size_t max_depth, size_t ncols_per_tree,
|
|
986
|
+
bool limit_depth, bool penalize_range, bool standardize_data,
|
|
987
|
+
bool fast_bratio,
|
|
988
|
+
real_t col_weights[], bool weigh_by_kurt,
|
|
989
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
990
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
991
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
992
|
+
double prob_pick_col_by_kurt,
|
|
993
|
+
double min_gain, MissingAction missing_action,
|
|
994
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
995
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
|
996
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
|
997
|
+
TreesIndexer *indexer,
|
|
998
|
+
real_t ref_numeric_data[], int ref_categ_data[],
|
|
999
|
+
bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
|
|
1000
|
+
real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
|
|
589
1001
|
uint64_t random_seed);
|
|
1002
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
590
1003
|
void fit_itree(std::vector<IsoTree> *tree_root,
|
|
591
1004
|
std::vector<IsoHPlane> *hplane_root,
|
|
592
1005
|
WorkerMemory &workspace,
|
|
@@ -596,6 +1009,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
|
|
|
596
1009
|
size_t tree_num);
|
|
597
1010
|
|
|
598
1011
|
/* isoforest.cpp */
|
|
1012
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
599
1013
|
void split_itree_recursive(std::vector<IsoTree> &trees,
|
|
600
1014
|
WorkerMemory &workspace,
|
|
601
1015
|
InputData &input_data,
|
|
@@ -604,31 +1018,55 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
|
|
|
604
1018
|
size_t curr_depth);
|
|
605
1019
|
|
|
606
1020
|
/* extended.cpp */
|
|
1021
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
607
1022
|
void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
|
|
608
1023
|
WorkerMemory &workspace,
|
|
609
1024
|
InputData &input_data,
|
|
610
1025
|
ModelParams &model_params,
|
|
611
1026
|
std::vector<ImputeNode> *impute_nodes,
|
|
612
1027
|
size_t curr_depth);
|
|
1028
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
613
1029
|
void add_chosen_column(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params,
|
|
614
|
-
std::vector<bool> &col_is_taken,
|
|
1030
|
+
std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s);
|
|
615
1031
|
void shrink_to_fit_hplane(IsoHPlane &hplane, bool clear_vectors);
|
|
1032
|
+
template <class InputData, class WorkerMemory>
|
|
616
1033
|
void simplify_hplane(IsoHPlane &hplane, WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
|
|
617
1034
|
|
|
618
1035
|
|
|
619
1036
|
/* predict.cpp */
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
1037
|
+
template <class real_t, class sparse_ix>
|
|
1038
|
+
#ifndef _FOR_R
|
|
1039
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno"), gnu::hot]]
|
|
1040
|
+
#endif
|
|
1041
|
+
void predict_iforest(real_t *restrict numeric_data, int *restrict categ_data,
|
|
1042
|
+
bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
1043
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1044
|
+
real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
|
|
623
1045
|
size_t nrows, int nthreads, bool standardize,
|
|
624
1046
|
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
625
|
-
double output_depths
|
|
1047
|
+
double *restrict output_depths, sparse_ix *restrict tree_num,
|
|
1048
|
+
double *restrict per_tree_depths,
|
|
1049
|
+
TreesIndexer *indexer);
|
|
1050
|
+
template <class real_t, class sparse_ix>
|
|
1051
|
+
[[gnu::hot]]
|
|
1052
|
+
void traverse_itree_fast(std::vector<IsoTree> &tree,
|
|
1053
|
+
IsoForest &model_outputs,
|
|
1054
|
+
real_t *restrict row_numeric_data,
|
|
1055
|
+
double &restrict output_depth,
|
|
1056
|
+
sparse_ix *restrict tree_num,
|
|
1057
|
+
double *restrict tree_depth,
|
|
1058
|
+
size_t row) noexcept;
|
|
1059
|
+
template <class PredictionData, class sparse_ix>
|
|
1060
|
+
[[gnu::hot]]
|
|
626
1061
|
void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
|
|
627
1062
|
IsoForest &model_outputs,
|
|
628
1063
|
PredictionData &prediction_data,
|
|
629
|
-
double
|
|
1064
|
+
double &restrict output_depth,
|
|
630
1065
|
sparse_ix *restrict tree_num,
|
|
631
|
-
|
|
1066
|
+
double *restrict tree_depth,
|
|
1067
|
+
size_t row) noexcept;
|
|
1068
|
+
template <class PredictionData, class sparse_ix, class ImputedData>
|
|
1069
|
+
[[gnu::hot]]
|
|
632
1070
|
double traverse_itree(std::vector<IsoTree> &tree,
|
|
633
1071
|
IsoForest &model_outputs,
|
|
634
1072
|
PredictionData &prediction_data,
|
|
@@ -637,63 +1075,181 @@ double traverse_itree(std::vector<IsoTree> &tree,
|
|
|
637
1075
|
double curr_weight,
|
|
638
1076
|
size_t row,
|
|
639
1077
|
sparse_ix *restrict tree_num,
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
1078
|
+
double *restrict tree_depth,
|
|
1079
|
+
size_t curr_lev) noexcept;
|
|
1080
|
+
template <class PredictionData, class sparse_ix>
|
|
1081
|
+
[[gnu::hot]]
|
|
1082
|
+
void traverse_hplane_fast_colmajor(std::vector<IsoHPlane> &hplane,
|
|
1083
|
+
ExtIsoForest &model_outputs,
|
|
1084
|
+
PredictionData &prediction_data,
|
|
1085
|
+
double &restrict output_depth,
|
|
1086
|
+
sparse_ix *restrict tree_num,
|
|
1087
|
+
double *restrict tree_depth,
|
|
1088
|
+
size_t row) noexcept;
|
|
1089
|
+
template <class real_t, class sparse_ix>
|
|
1090
|
+
[[gnu::hot]]
|
|
1091
|
+
void traverse_hplane_fast_rowmajor(std::vector<IsoHPlane> &hplane,
|
|
1092
|
+
ExtIsoForest &model_outputs,
|
|
1093
|
+
real_t *restrict row_numeric_data,
|
|
1094
|
+
double &restrict output_depth,
|
|
1095
|
+
sparse_ix *restrict tree_num,
|
|
1096
|
+
double *restrict tree_depth,
|
|
1097
|
+
size_t row) noexcept;
|
|
1098
|
+
template <class PredictionData, class sparse_ix, class ImputedData>
|
|
1099
|
+
[[gnu::hot]]
|
|
647
1100
|
void traverse_hplane(std::vector<IsoHPlane> &hplane,
|
|
648
1101
|
ExtIsoForest &model_outputs,
|
|
649
1102
|
PredictionData &prediction_data,
|
|
650
|
-
double
|
|
1103
|
+
double &restrict output_depth,
|
|
651
1104
|
std::vector<ImputeNode> *impute_nodes,
|
|
652
1105
|
ImputedData *imputed_data,
|
|
653
1106
|
sparse_ix *restrict tree_num,
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
void
|
|
658
|
-
|
|
1107
|
+
double *restrict tree_depth,
|
|
1108
|
+
size_t row) noexcept;
|
|
1109
|
+
template <class real_t, class sparse_ix>
|
|
1110
|
+
void batched_csc_predict(PredictionData<real_t, sparse_ix> &prediction_data, int nthreads,
|
|
1111
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1112
|
+
double *restrict output_depths, sparse_ix *restrict tree_num,
|
|
1113
|
+
double *restrict per_tree_depths);
|
|
1114
|
+
template <class PredictionData, class sparse_ix>
|
|
1115
|
+
void traverse_itree_csc(WorkerForPredictCSC &workspace,
|
|
1116
|
+
std::vector<IsoTree> &trees,
|
|
1117
|
+
IsoForest &model_outputs,
|
|
1118
|
+
PredictionData &prediction_data,
|
|
1119
|
+
sparse_ix *restrict tree_num,
|
|
1120
|
+
double *restrict per_tree_depths,
|
|
1121
|
+
size_t curr_tree,
|
|
1122
|
+
bool has_range_penalty);
|
|
1123
|
+
template <class PredictionData, class sparse_ix>
|
|
1124
|
+
void traverse_hplane_csc(WorkerForPredictCSC &workspace,
|
|
1125
|
+
std::vector<IsoHPlane> &hplanes,
|
|
1126
|
+
ExtIsoForest &model_outputs,
|
|
1127
|
+
PredictionData &prediction_data,
|
|
1128
|
+
sparse_ix *restrict tree_num,
|
|
1129
|
+
double *restrict per_tree_depths,
|
|
1130
|
+
size_t curr_tree,
|
|
1131
|
+
bool has_range_penalty);
|
|
1132
|
+
template <class PredictionData>
|
|
1133
|
+
void add_csc_range_penalty(WorkerForPredictCSC &workspace,
|
|
1134
|
+
PredictionData &prediction_data,
|
|
1135
|
+
double *restrict weights_arr,
|
|
1136
|
+
size_t col_num,
|
|
1137
|
+
double range_low,
|
|
1138
|
+
double range_high);
|
|
1139
|
+
template <class PredictionData>
|
|
1140
|
+
double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num) noexcept;
|
|
1141
|
+
template <class PredictionData, class sparse_ix>
|
|
1142
|
+
static inline double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num, size_t lb, size_t ub) noexcept;
|
|
1143
|
+
template <class PredictionData, class sparse_ix>
|
|
1144
|
+
double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num) noexcept;
|
|
1145
|
+
template <class sparse_ix>
|
|
1146
|
+
void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept;
|
|
1147
|
+
template <class sparse_ix>
|
|
1148
|
+
void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept;
|
|
659
1149
|
|
|
660
1150
|
/* dist.hpp */
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
1151
|
+
template <class real_t, class sparse_ix>
|
|
1152
|
+
void calc_similarity(real_t numeric_data[], int categ_data[],
|
|
1153
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
1154
|
+
size_t nrows, bool use_long_double, int nthreads,
|
|
1155
|
+
bool assume_full_distr, bool standardize_dist, bool as_kernel,
|
|
664
1156
|
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
665
|
-
double tmat[], double rmat[], size_t n_from
|
|
1157
|
+
double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
|
|
1158
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ);
|
|
1159
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
1160
|
+
void calc_similarity_internal(
|
|
1161
|
+
real_t numeric_data[], int categ_data[],
|
|
1162
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
1163
|
+
size_t nrows, int nthreads,
|
|
1164
|
+
bool assume_full_distr, bool standardize_dist, bool as_kernel,
|
|
1165
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1166
|
+
double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
|
|
1167
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ);
|
|
1168
|
+
template <class PredictionData, class ldouble_safe>
|
|
666
1169
|
void traverse_tree_sim(WorkerForSimilarity &workspace,
|
|
667
1170
|
PredictionData &prediction_data,
|
|
668
1171
|
IsoForest &model_outputs,
|
|
669
1172
|
std::vector<IsoTree> &trees,
|
|
670
|
-
size_t curr_tree
|
|
1173
|
+
size_t curr_tree,
|
|
1174
|
+
const bool as_kernel);
|
|
1175
|
+
template <class PredictionData, class ldouble_safe>
|
|
671
1176
|
void traverse_hplane_sim(WorkerForSimilarity &workspace,
|
|
672
1177
|
PredictionData &prediction_data,
|
|
673
1178
|
ExtIsoForest &model_outputs,
|
|
674
1179
|
std::vector<IsoHPlane> &hplanes,
|
|
675
|
-
size_t curr_tree
|
|
1180
|
+
size_t curr_tree,
|
|
1181
|
+
const bool as_kernel);
|
|
1182
|
+
template <class PredictionData, class InputData, class WorkerMemory>
|
|
1183
|
+
#ifndef _FOR_R
|
|
1184
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
1185
|
+
#endif
|
|
676
1186
|
void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
|
|
677
1187
|
std::vector<WorkerMemory> *worker_memory_m,
|
|
678
1188
|
PredictionData *prediction_data, InputData *input_data,
|
|
679
1189
|
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
680
1190
|
double *restrict tmat, double *restrict rmat, size_t n_from,
|
|
681
1191
|
size_t ntrees, bool assume_full_distr,
|
|
682
|
-
bool standardize_dist, int nthreads);
|
|
1192
|
+
bool standardize_dist, bool as_kernel, int nthreads);
|
|
1193
|
+
template <class PredictionData>
|
|
683
1194
|
void initialize_worker_for_sim(WorkerForSimilarity &workspace,
|
|
684
1195
|
PredictionData &prediction_data,
|
|
685
1196
|
IsoForest *model_outputs,
|
|
686
1197
|
ExtIsoForest *model_outputs_ext,
|
|
687
1198
|
size_t n_from,
|
|
688
1199
|
bool assume_full_distr);
|
|
1200
|
+
template <class real_t, class sparse_ix>
|
|
1201
|
+
#ifndef _FOR_R
|
|
1202
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
1203
|
+
#endif
|
|
1204
|
+
void calc_similarity_from_indexer
|
|
1205
|
+
(
|
|
1206
|
+
real_t *restrict numeric_data, int *restrict categ_data,
|
|
1207
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1208
|
+
size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
|
|
1209
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1210
|
+
double *restrict tmat, double *restrict rmat, size_t n_from,
|
|
1211
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
|
|
1212
|
+
);
|
|
1213
|
+
template <class real_t, class sparse_ix>
|
|
1214
|
+
#ifndef _FOR_R
|
|
1215
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
1216
|
+
#endif
|
|
1217
|
+
void calc_similarity_from_indexer_with_references
|
|
1218
|
+
(
|
|
1219
|
+
real_t *restrict numeric_data, int *restrict categ_data,
|
|
1220
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1221
|
+
size_t nrows, int nthreads, bool standardize_dist,
|
|
1222
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1223
|
+
double *restrict rmat,
|
|
1224
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
|
|
1225
|
+
);
|
|
1226
|
+
template <class real_t, class sparse_ix>
|
|
1227
|
+
void kernel_to_references(TreesIndexer &indexer,
|
|
1228
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1229
|
+
real_t *restrict numeric_data, int *restrict categ_data,
|
|
1230
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1231
|
+
bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
1232
|
+
size_t nrows, int nthreads,
|
|
1233
|
+
double *restrict rmat,
|
|
1234
|
+
bool standardize);
|
|
689
1235
|
|
|
690
1236
|
/* impute.hpp */
|
|
691
|
-
|
|
692
|
-
|
|
1237
|
+
template <class real_t, class sparse_ix>
|
|
1238
|
+
void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
|
|
1239
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
|
1240
|
+
size_t nrows, bool use_long_double, int nthreads,
|
|
1241
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1242
|
+
Imputer &imputer);
|
|
1243
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
1244
|
+
void impute_missing_values_internal(
|
|
1245
|
+
real_t numeric_data[], int categ_data[], bool is_col_major,
|
|
1246
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
|
693
1247
|
size_t nrows, int nthreads,
|
|
694
1248
|
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
695
1249
|
Imputer &imputer);
|
|
1250
|
+
template <class InputData, class ldouble_safe>
|
|
696
1251
|
void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads);
|
|
1252
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
697
1253
|
void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
698
1254
|
InputData &input_data, ModelParams &model_params,
|
|
699
1255
|
std::vector<ImputeNode> &imputer_tree,
|
|
@@ -702,232 +1258,1085 @@ void shrink_impute_node(ImputeNode &imputer);
|
|
|
702
1258
|
void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
703
1259
|
std::vector<IsoTree> *trees,
|
|
704
1260
|
std::vector<IsoHPlane> *hplanes);
|
|
705
|
-
|
|
1261
|
+
template <class ImputedData>
|
|
1262
|
+
void combine_imp_single(ImputedData &restrict imp_addfrom, ImputedData &restrict imp_addto);
|
|
1263
|
+
template <class ImputedData, class WorkerMemory>
|
|
706
1264
|
void combine_tree_imputations(WorkerMemory &workspace,
|
|
707
1265
|
std::vector<ImputedData> &impute_vec,
|
|
708
|
-
|
|
1266
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
|
709
1267
|
std::vector<char> &has_missing,
|
|
710
1268
|
int nthreads);
|
|
1269
|
+
template <class ImputedData>
|
|
711
1270
|
void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w);
|
|
1271
|
+
template <class InputData, class WorkerMemory>
|
|
712
1272
|
void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data);
|
|
713
|
-
template <class imp_arr>
|
|
1273
|
+
template <class imp_arr, class InputData>
|
|
714
1274
|
void apply_imputation_results(imp_arr &impute_vec,
|
|
715
1275
|
Imputer &imputer,
|
|
716
1276
|
InputData &input_data,
|
|
717
1277
|
int nthreads);
|
|
1278
|
+
template <class ImputedData, class InputData>
|
|
718
1279
|
void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
|
719
|
-
|
|
1280
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
|
720
1281
|
Imputer &imputer,
|
|
721
1282
|
InputData &input_data,
|
|
722
1283
|
int nthreads);
|
|
1284
|
+
template <class PredictionData, class ImputedData>
|
|
723
1285
|
void apply_imputation_results(PredictionData &prediction_data,
|
|
724
1286
|
ImputedData &imp,
|
|
725
1287
|
Imputer &imputer,
|
|
726
1288
|
size_t row);
|
|
1289
|
+
template <class ImputedData, class InputData>
|
|
727
1290
|
void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row);
|
|
1291
|
+
template <class ImputedData, class PredictionData>
|
|
728
1292
|
void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row);
|
|
1293
|
+
template <class ImputedData, class InputData>
|
|
729
1294
|
void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads);
|
|
730
|
-
|
|
1295
|
+
template <class ImputedData, class InputData>
|
|
1296
|
+
void allocate_imp_map(hashed_map<size_t, ImputedData> &impute_map, InputData &input_data);
|
|
1297
|
+
template <class ImputedData, class InputData>
|
|
731
1298
|
void allocate_imp(InputData &input_data,
|
|
732
1299
|
std::vector<ImputedData> &impute_vec,
|
|
733
|
-
|
|
1300
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
|
734
1301
|
int nthreads);
|
|
1302
|
+
template <class ImputedData, class InputData>
|
|
735
1303
|
void check_for_missing(InputData &input_data,
|
|
736
1304
|
std::vector<ImputedData> &impute_vec,
|
|
737
|
-
|
|
1305
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
|
738
1306
|
int nthreads);
|
|
1307
|
+
template <class PredictionData>
|
|
739
1308
|
size_t check_for_missing(PredictionData &prediction_data,
|
|
740
1309
|
Imputer &imputer,
|
|
741
1310
|
size_t ix_arr[],
|
|
742
1311
|
int nthreads);
|
|
743
1312
|
|
|
744
1313
|
/* helpers_iforest.hpp */
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
1314
|
+
/* Returns the number of elements held in 'model.trees'. */
static inline size_t get_ntrees(const IsoForest &model)
|
|
1315
|
+
{
|
|
1316
|
+
return model.trees.size();
|
|
1317
|
+
}
|
|
1318
|
+
|
|
1319
|
+
/* Returns the number of elements held in 'model.hplanes'. */
static inline size_t get_ntrees(const ExtIsoForest &model)
|
|
1320
|
+
{
|
|
1321
|
+
return model.hplanes.size();
|
|
1322
|
+
}
|
|
1323
|
+
|
|
1324
|
+
/* Returns the number of elements held in 'model.imputer_tree'. */
static inline size_t get_ntrees(const Imputer &model)
|
|
1325
|
+
{
|
|
1326
|
+
return model.imputer_tree.size();
|
|
1327
|
+
}
|
|
1328
|
+
|
|
1329
|
+
/* Returns the number of elements held in 'model.indices'. */
static inline size_t get_ntrees(const TreesIndexer &model)
|
|
1330
|
+
{
|
|
1331
|
+
return model.indices.size();
|
|
1332
|
+
}
|
|
1333
|
+
template <class InputData, class WorkerMemory>
|
|
751
1334
|
void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params, IsoTree &tree);
|
|
1335
|
+
template <class InputData, class WorkerMemory>
|
|
752
1336
|
void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
|
|
1337
|
+
template <class InputData, class WorkerMemory>
|
|
1338
|
+
void get_split_range_v2(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
|
|
1339
|
+
template <class InputData, class WorkerMemory>
|
|
753
1340
|
int choose_cat_from_present(WorkerMemory &workspace, InputData &input_data, size_t col_num);
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
void set_col_as_taken(std::vector<bool> &col_is_taken,
|
|
1341
|
+
bool is_col_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
|
|
1342
|
+
size_t col_num);
|
|
1343
|
+
template <class InputData>
|
|
1344
|
+
void set_col_as_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
|
|
758
1345
|
InputData &input_data, size_t col_num, ColType col_type);
|
|
1346
|
+
template <class InputData>
|
|
1347
|
+
void set_col_as_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
|
|
1348
|
+
InputData &input_data, size_t col_num);
|
|
1349
|
+
template <class InputData, class WorkerMemory>
|
|
759
1350
|
void add_separation_step(WorkerMemory &workspace, InputData &input_data, double remainder);
|
|
760
|
-
|
|
1351
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
1352
|
+
void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, ldouble_safe sum_weight);
|
|
1353
|
+
template <class PredictionData, class sparse_ix>
|
|
761
1354
|
void remap_terminal_trees(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
762
1355
|
PredictionData &prediction_data, sparse_ix *restrict tree_num, int nthreads);
|
|
763
|
-
|
|
764
|
-
|
|
1356
|
+
template <class InputData, class ldouble_safe>
|
|
1357
|
+
std::vector<double> calc_kurtosis_all_data(InputData &input_data, ModelParams &model_params, RNG_engine &rnd_generator);
|
|
1358
|
+
template <class InputData, class WorkerMemory>
|
|
1359
|
+
void calc_ranges_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
|
|
1360
|
+
double *restrict ranges, double *restrict saved_xmin, double *restrict saved_xmax);
|
|
1361
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
1362
|
+
void calc_var_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
|
|
1363
|
+
double *restrict variances, double *restrict saved_xmin, double *restrict saved_xmax,
|
|
1364
|
+
double *restrict saved_means, double *restrict saved_sds);
|
|
1365
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
1366
|
+
void calc_kurt_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
|
|
1367
|
+
double *restrict kurtosis, double *restrict saved_xmin, double *restrict saved_xmax);
|
|
1368
|
+
bool is_boxed_metric(const ScoringMetric scoring_metric);
|
|
765
1369
|
|
|
766
1370
|
|
|
767
1371
|
/* utils.cpp */
|
|
1372
|
+
/* Expands to: (ncomb + (j - i)) - 1 - div2((n - i) * (n - i - 1)).
   NOTE(review): presumably the position of the pair (i, j), with i < j, inside a
   flattened array holding one entry per unordered pair among 'n' items, where
   'ncomb' is the total number of such pairs (see 'calc_ncomb') - confirm against callers.
   Being a macro, 'i' and 'n' are expanded more than once, so the arguments
   should be free of side effects. */
#define ix_comb_(i, j, n, ncomb) ( ((ncomb) + ((j) - (i))) - (size_t)1 - div2(((n) - (i)) * ((n) - (i) - (size_t)1)) )
|
|
1373
|
+
/* Order-insensitive front end for 'ix_comb_': when i < j the arguments are passed
   through as (i, j), otherwise (including i == j) they are swapped to (j, i).
   'i' and 'j' are expanded several times, so they should be free of side effects. */
#define ix_comb(i, j, n, ncomb) ( ((i) < (j))? ix_comb_(i, j, n, ncomb) : ix_comb_(j, i, n, ncomb) )
|
|
1374
|
+
/* Number of unordered pairs among 'n' items, i.e. n*(n-1)/2.
   The halving (via 'div2') is applied to whichever of 'n' or 'n-1' is even, before
   multiplying, so the intermediate product is never larger than the final result.
   The whole expansion is wrapped in parentheses so that the conditional expression
   stays intact when the macro is used inside a larger expression
   (e.g. 'calc_ncomb(n) + 1' or '2 * calc_ncomb(n)').
   'n' is expanded more than once, so it should be free of side effects. */
#define calc_ncomb(n) ( (((n) % 2) == 0)? (div2(n) * ((n)-(size_t)1)) : ((n) * div2((n)-(size_t)1)) )
|
|
768
1375
|
size_t log2ceil(size_t x);
|
|
1376
|
+
#ifndef _FOR_R
|
|
1377
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
1378
|
+
#endif
|
|
1379
|
+
double digamma(double x);
|
|
1380
|
+
template <class ldouble_safe>
|
|
1381
|
+
#ifndef _FOR_R
|
|
1382
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
1383
|
+
#endif
|
|
769
1384
|
double harmonic(size_t n);
|
|
770
1385
|
double harmonic_recursive(double a, double b);
|
|
1386
|
+
template <class ldouble_safe>
|
|
771
1387
|
double expected_avg_depth(size_t sample_size);
|
|
772
|
-
|
|
1388
|
+
template <class ldouble_safe>
|
|
1389
|
+
#ifndef _FOR_R
|
|
1390
|
+
[[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
|
|
1391
|
+
#endif
|
|
1392
|
+
double expected_avg_depth(ldouble_safe approx_sample_size);
|
|
773
1393
|
double expected_separation_depth(size_t n);
|
|
774
1394
|
double expected_separation_depth_hotstart(double curr, size_t n_curr, size_t n_final);
|
|
775
|
-
|
|
1395
|
+
template <class ldouble_safe>
|
|
1396
|
+
double expected_separation_depth(ldouble_safe n);
|
|
776
1397
|
void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n, double counter[], double exp_remainder);
|
|
777
1398
|
void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
|
|
778
1399
|
double *restrict counter, double *restrict weights, double exp_remainder);
|
|
779
1400
|
void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
|
|
780
|
-
double counter[],
|
|
1401
|
+
double counter[], hashed_map<size_t, double> &weights, double exp_remainder);
|
|
781
1402
|
void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
|
|
782
1403
|
double counter[], double exp_remainder);
|
|
783
1404
|
void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
|
|
784
1405
|
double *restrict counter, double *restrict weights, double exp_remainder);
|
|
785
|
-
void tmat_to_dense(double *restrict tmat, double *restrict dmat, size_t n,
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
void sample_random_rows(std::vector<size_t> &ix_arr, size_t nrows, bool with_replacement,
|
|
791
|
-
RNG_engine &rnd_generator, std::vector<size_t> &ix_all,
|
|
792
|
-
|
|
1406
|
+
void tmat_to_dense(double *restrict tmat, double *restrict dmat, size_t n, double fill_diag);
|
|
1407
|
+
template <class real_t=double>
|
|
1408
|
+
void build_btree_sampler(std::vector<double> &btree_weights, real_t *restrict sample_weights,
|
|
1409
|
+
size_t nrows, size_t &restrict log2_n, size_t &restrict btree_offset);
|
|
1410
|
+
template <class real_t=double, class ldouble_safe>
|
|
1411
|
+
void sample_random_rows(std::vector<size_t> &restrict ix_arr, size_t nrows, bool with_replacement,
|
|
1412
|
+
RNG_engine &rnd_generator, std::vector<size_t> &restrict ix_all,
|
|
1413
|
+
real_t *restrict sample_weights, std::vector<double> &restrict btree_weights,
|
|
793
1414
|
size_t log2_n, size_t btree_offset, std::vector<bool> &is_repeated);
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
1415
|
+
template <class real_t=double>
|
|
1416
|
+
void weighted_shuffle(size_t *restrict outp, size_t n, real_t *restrict weights, double *restrict buffer_arr, RNG_engine &rnd_generator);
|
|
1417
|
+
double sample_random_uniform(double xmin, double xmax, RNG_engine &rng) noexcept;
|
|
1418
|
+
size_t divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point) noexcept;
|
|
1419
|
+
template <class real_t=double>
|
|
1420
|
+
void divide_subset_split(size_t *restrict ix_arr, real_t x[], size_t st, size_t end, double split_point,
|
|
1421
|
+
MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
|
|
1422
|
+
template <class real_t, class sparse_ix>
|
|
1423
|
+
void divide_subset_split(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
|
|
1424
|
+
real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr, double split_point,
|
|
1425
|
+
MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
|
|
1426
|
+
void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, signed char split_categ[],
|
|
1427
|
+
MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
|
|
1428
|
+
void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, signed char split_categ[],
|
|
804
1429
|
int ncat, MissingAction missing_action, NewCategAction new_cat_action,
|
|
805
|
-
bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
|
806
|
-
void divide_subset_split(size_t ix_arr
|
|
807
|
-
MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
|
808
|
-
void divide_subset_split(size_t ix_arr
|
|
1430
|
+
bool move_new_to_left, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
|
|
1431
|
+
void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, int split_categ,
|
|
1432
|
+
MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
|
|
1433
|
+
void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end,
|
|
809
1434
|
MissingAction missing_action, NewCategAction new_cat_action,
|
|
810
|
-
bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
1435
|
+
bool move_new_to_left, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
|
|
1436
|
+
template <class real_t=double>
|
|
1437
|
+
void get_range(size_t ix_arr[], real_t *restrict x, size_t st, size_t end,
|
|
1438
|
+
MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
|
|
1439
|
+
template <class real_t>
|
|
1440
|
+
void get_range(real_t *restrict x, size_t n,
|
|
1441
|
+
MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
|
|
1442
|
+
template <class real_t, class sparse_ix>
|
|
1443
|
+
void get_range(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
|
|
1444
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1445
|
+
MissingAction missing_action, double &restrict xmin_, double &restrict xmax_, bool &unsplittable) noexcept;
|
|
1446
|
+
template <class real_t, class sparse_ix>
|
|
1447
|
+
void get_range(size_t col_num, size_t nrows,
|
|
1448
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1449
|
+
MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
|
|
1450
|
+
void get_categs(size_t *restrict ix_arr, int x[], size_t st, size_t end, int ncat,
|
|
1451
|
+
MissingAction missing_action, signed char categs[], size_t &restrict npresent, bool &unsplittable) noexcept;
|
|
1452
|
+
template <class real_t>
|
|
1453
|
+
bool check_more_than_two_unique_values(size_t ix_arr[], size_t st, size_t end, real_t x[], MissingAction missing_action);
|
|
1454
|
+
bool check_more_than_two_unique_values(size_t ix_arr[], size_t st, size_t end, int x[], MissingAction missing_action);
|
|
1455
|
+
template <class real_t, class sparse_ix>
|
|
1456
|
+
bool check_more_than_two_unique_values(size_t *restrict ix_arr, size_t st, size_t end, size_t col,
|
|
1457
|
+
sparse_ix *restrict Xc_indptr, sparse_ix *restrict Xc_ind, real_t *restrict Xc,
|
|
1458
|
+
MissingAction missing_action);
|
|
1459
|
+
template <class real_t, class sparse_ix>
|
|
1460
|
+
bool check_more_than_two_unique_values(size_t nrows, size_t col,
|
|
1461
|
+
sparse_ix *restrict Xc_indptr, sparse_ix *restrict Xc_ind, real_t *restrict Xc,
|
|
1462
|
+
MissingAction missing_action);
|
|
1463
|
+
void count_categs(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat, size_t *restrict counts);
|
|
1464
|
+
int count_ncateg_in_col(const int x[], const size_t n, const int ncat, unsigned char buffer[]);
|
|
1465
|
+
template <class ldouble_safe>
|
|
1466
|
+
ldouble_safe calculate_sum_weights(std::vector<size_t> &ix_arr, size_t st, size_t end, size_t curr_depth,
|
|
1467
|
+
std::vector<double> &weights_arr, hashed_map<size_t, double> &weights_map);
|
|
1468
|
+
extern bool interrupt_switch;
|
|
1469
|
+
extern bool signal_is_locked;
|
|
820
1470
|
void set_interrup_global_variable(int s);
|
|
1471
|
+
#ifdef _FOR_PYTHON
|
|
1472
|
+
bool cy_check_interrupt_switch();
|
|
1473
|
+
void cy_tick_off_interrupt_switch();
|
|
1474
|
+
#endif
|
|
1475
|
+
/* Declaration only: the constructor, destructor and 'restore_handle' are defined elsewhere.
   NOTE(review): from the member names this looks like a scope guard that swaps in a
   custom signal handler on construction and puts the previous one ('old_sig') back on
   destruction or on an explicit 'restore_handle()' call, with 'is_active' tracking
   whether a swap is currently in effect - confirm against the implementation.
   NOTE(review): the class declares a destructor but no copy/move operations; if the
   destructor does restore a handler, copies would restore it twice - confirm whether
   copying should be disabled. */
class SignalSwitcher
|
|
1476
|
+
{
|
|
1477
|
+
public:
|
|
1478
|
+
sig_t_ old_sig;
|
|
1479
|
+
bool is_active;
|
|
1480
|
+
SignalSwitcher();
|
|
1481
|
+
~SignalSwitcher();
|
|
1482
|
+
void restore_handle();
|
|
1483
|
+
};
|
|
1484
|
+
void check_interrupt_switch(SignalSwitcher &ss);
|
|
1485
|
+
bool has_long_double();
|
|
821
1486
|
int return_EXIT_SUCCESS();
|
|
822
1487
|
int return_EXIT_FAILURE();
|
|
823
1488
|
|
|
824
1489
|
|
|
825
1490
|
|
|
826
|
-
|
|
827
|
-
size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end,
|
|
1491
|
+
template <class real_t=double>
|
|
1492
|
+
size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, real_t x[]);
|
|
1493
|
+
template <class real_t, class sparse_ix>
|
|
1494
|
+
size_t move_NAs_to_front(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr);
|
|
828
1495
|
size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, int x[]);
|
|
829
|
-
size_t center_NAs(size_t
|
|
830
|
-
|
|
831
|
-
|
|
1496
|
+
size_t center_NAs(size_t ix_arr[], size_t st_left, size_t st, size_t curr_pos);
|
|
1497
|
+
template <class real_t>
|
|
1498
|
+
void fill_NAs_with_median(size_t *restrict ix_arr, size_t st_orig, size_t st, size_t end, real_t *restrict x,
|
|
1499
|
+
double *restrict buffer_imputed_x, double *restrict xmedian);
|
|
1500
|
+
template <class real_t, class sparse_ix>
|
|
1501
|
+
void todense(size_t *restrict ix_arr, size_t st, size_t end,
|
|
1502
|
+
size_t col_num, real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
832
1503
|
double *restrict buffer_arr);
|
|
1504
|
+
template <class real_t>
|
|
1505
|
+
void colmajor_to_rowmajor(real_t *restrict X, size_t nrows, size_t ncols, std::vector<double> &X_row_major);
|
|
1506
|
+
template <class real_t, class sparse_ix>
|
|
1507
|
+
void colmajor_to_rowmajor(real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1508
|
+
size_t nrows, size_t ncols,
|
|
1509
|
+
std::vector<double> &Xr, std::vector<size_t> &Xr_ind, std::vector<size_t> &Xr_indptr);
|
|
1510
|
+
template <class sparse_ix=size_t>
|
|
1511
|
+
bool check_indices_are_sorted(sparse_ix indices[], size_t n);
|
|
1512
|
+
template <class real_t, class sparse_ix>
|
|
1513
|
+
void sort_csc_indices(real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr, size_t ncols_numeric);
|
|
833
1514
|
|
|
834
1515
|
/* mult.hpp */
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
1516
|
+
template <class real_t, class real_t_>
|
|
1517
|
+
void calc_mean_and_sd_t(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x,
|
|
1518
|
+
MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
|
|
1519
|
+
template <class real_t_, class ldouble_safe>
|
|
1520
|
+
void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x,
|
|
1521
|
+
MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
|
|
1522
|
+
template <class real_t_>
|
|
1523
|
+
double calc_mean_only(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x);
|
|
1524
|
+
template <class real_t_, class mapping, class ldouble_safe>
|
|
1525
|
+
void calc_mean_and_sd_weighted(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x, mapping &restrict w,
|
|
1526
|
+
MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
|
|
1527
|
+
template <class real_t_, class mapping>
|
|
1528
|
+
double calc_mean_only_weighted(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x, mapping &restrict w);
|
|
1529
|
+
template <class real_t_, class sparse_ix, class ldouble_safe>
|
|
1530
|
+
void calc_mean_and_sd(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
|
|
1531
|
+
real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1532
|
+
double &restrict x_sd, double &restrict x_mean);
|
|
1533
|
+
template <class real_t_, class sparse_ix, class ldouble_safe>
|
|
1534
|
+
double calc_mean_only(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
|
|
1535
|
+
real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr);
|
|
1536
|
+
template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
|
|
1537
|
+
void calc_mean_and_sd_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
|
|
1538
|
+
real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1539
|
+
double &restrict x_sd, double &restrict x_mean, mapping &restrict w);
|
|
1540
|
+
template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
|
|
1541
|
+
double calc_mean_only_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
|
|
1542
|
+
real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1543
|
+
mapping &restrict w);
|
|
1544
|
+
template <class real_t_>
|
|
840
1545
|
void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
|
|
841
|
-
|
|
1546
|
+
real_t_ *restrict x, double &restrict coef, double x_sd, double x_mean, double &restrict fill_val,
|
|
842
1547
|
MissingAction missing_action, double *restrict buffer_arr,
|
|
843
1548
|
size_t *restrict buffer_NAs, bool first_run);
|
|
1549
|
+
template <class real_t_, class mapping, class ldouble_safe>
|
|
1550
|
+
void add_linear_comb_weighted(size_t ix_arr[], size_t st, size_t end, double *restrict res,
|
|
1551
|
+
real_t_ *restrict x, double &restrict coef, double x_sd, double x_mean, double &restrict fill_val,
|
|
1552
|
+
MissingAction missing_action, double *restrict buffer_arr,
|
|
1553
|
+
size_t *restrict buffer_NAs, bool first_run, mapping &restrict w);
|
|
1554
|
+
template <class real_t_, class sparse_ix>
|
|
844
1555
|
void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
|
|
845
|
-
|
|
846
|
-
double &coef, double x_sd, double x_mean, double &fill_val, MissingAction missing_action,
|
|
1556
|
+
real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1557
|
+
double &restrict coef, double x_sd, double x_mean, double &restrict fill_val, MissingAction missing_action,
|
|
847
1558
|
double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run);
|
|
1559
|
+
template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
|
|
1560
|
+
void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
|
|
1561
|
+
real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1562
|
+
double &restrict coef, double x_sd, double x_mean, double &restrict fill_val, MissingAction missing_action,
|
|
1563
|
+
double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run, mapping &restrict w);
|
|
1564
|
+
template <class mapping>
|
|
1565
|
+
void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
|
|
1566
|
+
int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
|
|
1567
|
+
double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_pos,
|
|
1568
|
+
NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type,
|
|
1569
|
+
bool first_run, mapping &restrict w);
|
|
1570
|
+
template <class ldouble_safe>
|
|
848
1571
|
void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
|
|
849
1572
|
int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
|
|
850
|
-
double &fill_val, double &fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
|
|
1573
|
+
double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
|
|
851
1574
|
NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run);
|
|
1575
|
+
template <class mapping, class ldouble_safe>
|
|
1576
|
+
void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
|
|
1577
|
+
int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
|
|
1578
|
+
double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_pos,
|
|
1579
|
+
NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type,
|
|
1580
|
+
bool first_run, mapping &restrict w);
|
|
852
1581
|
|
|
853
1582
|
/* crit.hpp */
|
|
854
|
-
|
|
855
|
-
double calc_kurtosis(size_t ix_arr[], size_t st, size_t end,
|
|
856
|
-
|
|
1583
|
+
template <class real_t, class ldouble_safe>
|
|
1584
|
+
double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, real_t x[], MissingAction missing_action);
|
|
1585
|
+
template <class real_t, class ldouble_safe>
|
|
1586
|
+
double calc_kurtosis(real_t x[], size_t n, MissingAction missing_action);
|
|
1587
|
+
template <class real_t, class mapping, class ldouble_safe>
|
|
1588
|
+
double calc_kurtosis_weighted(size_t ix_arr[], size_t st, size_t end, real_t x[],
|
|
1589
|
+
MissingAction missing_action, mapping &restrict w);
|
|
1590
|
+
template <class real_t, class ldouble_safe>
|
|
1591
|
+
double calc_kurtosis_weighted(real_t *restrict x, size_t n_, MissingAction missing_action, real_t *restrict w);
|
|
1592
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
1593
|
+
double calc_kurtosis(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
|
|
1594
|
+
real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1595
|
+
MissingAction missing_action);
|
|
1596
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
1597
|
+
double calc_kurtosis(size_t col_num, size_t nrows,
|
|
1598
|
+
real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
857
1599
|
MissingAction missing_action);
|
|
858
|
-
|
|
1600
|
+
template <class real_t, class sparse_ix, class mapping, class ldouble_safe>
|
|
1601
|
+
double calc_kurtosis_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
|
|
1602
|
+
real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1603
|
+
MissingAction missing_action, mapping &restrict w);
|
|
1604
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
1605
|
+
double calc_kurtosis_weighted(size_t col_num, size_t nrows,
|
|
1606
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1607
|
+
MissingAction missing_action, real_t *restrict w);
|
|
1608
|
+
template <class ldouble_safe>
|
|
1609
|
+
double calc_kurtosis_internal(size_t cnt, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
|
|
1610
|
+
MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
|
|
1611
|
+
template <class ldouble_safe>
|
|
1612
|
+
double calc_kurtosis(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat, size_t *restrict buffer_cnt, double buffer_prob[],
|
|
859
1613
|
MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
|
|
860
|
-
|
|
861
|
-
double
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
double
|
|
877
|
-
|
|
878
|
-
|
|
1614
|
+
template <class ldouble_safe>
|
|
1615
|
+
double calc_kurtosis(size_t nrows, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
|
|
1616
|
+
MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
|
|
1617
|
+
template <class mapping, class ldouble_safe>
|
|
1618
|
+
double calc_kurtosis_weighted_internal(std::vector<ldouble_safe> &buffer_cnt, int x[], int ncat,
|
|
1619
|
+
double buffer_prob[], MissingAction missing_action, CategSplit cat_split_type,
|
|
1620
|
+
RNG_engine &rnd_generator, mapping &restrict w);
|
|
1621
|
+
template <class mapping, class ldouble_safe>
|
|
1622
|
+
double calc_kurtosis_weighted(size_t ix_arr[], size_t st, size_t end, int x[], int ncat, double buffer_prob[],
|
|
1623
|
+
MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator,
|
|
1624
|
+
mapping &restrict w);
|
|
1625
|
+
template <class real_t, class ldouble_safe>
|
|
1626
|
+
double calc_kurtosis_weighted(size_t nrows, int x[], int ncat, double *restrict buffer_prob,
|
|
1627
|
+
MissingAction missing_action, CategSplit cat_split_type,
|
|
1628
|
+
RNG_engine &rnd_generator, real_t *restrict w);
|
|
1629
|
+
template <class int_t, class ldouble_safe>
|
|
1630
|
+
double expected_sd_cat(double p[], size_t n, int_t pos[]);
|
|
1631
|
+
template <class number, class int_t, class ldouble_safe>
|
|
1632
|
+
double expected_sd_cat(number *restrict counts, double *restrict p, size_t n, int_t *restrict pos);
|
|
1633
|
+
template <class number, class int_t, class ldouble_safe>
|
|
1634
|
+
double expected_sd_cat_single(number *restrict counts, double *restrict p, size_t n, int_t *restrict pos, size_t cat_exclude, number cnt);
|
|
1635
|
+
template <class number, class int_t, class ldouble_safe>
|
|
1636
|
+
double expected_sd_cat_internal(int ncat, number *restrict buffer_cnt, ldouble_safe cnt_l,
|
|
1637
|
+
int_t *restrict buffer_pos, double *restrict buffer_prob);
|
|
1638
|
+
template <class int_t, class ldouble_safe>
|
|
1639
|
+
double expected_sd_cat(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat,
|
|
1640
|
+
MissingAction missing_action,
|
|
1641
|
+
size_t *restrict buffer_cnt, int_t *restrict buffer_pos, double buffer_prob[]);
|
|
1642
|
+
template <class mapping, class int_t, class ldouble_safe>
|
|
1643
|
+
double expected_sd_cat_weighted(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat,
|
|
1644
|
+
MissingAction missing_action, mapping &restrict w,
|
|
1645
|
+
double *restrict buffer_cnt, int_t *restrict buffer_pos, double *restrict buffer_prob);
|
|
1646
|
+
template <class number, class ldouble_safe>
|
|
1647
|
+
double categ_gain(number cnt_left, number cnt_right,
|
|
1648
|
+
ldouble_safe s_left, ldouble_safe s_right,
|
|
1649
|
+
ldouble_safe base_info, ldouble_safe cnt);
|
|
1650
|
+
template <class real_t, class real_t_>
|
|
1651
|
+
double find_split_rel_gain_t(real_t_ *restrict x, size_t n, double &restrict split_point);
|
|
1652
|
+
template <class real_t_, class ldouble_safe>
|
|
1653
|
+
double find_split_rel_gain(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &restrict split_point, size_t &restrict split_ix);
|
|
1654
|
+
template <class real_t, class real_t_>
|
|
1655
|
+
double find_split_rel_gain_t(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &split_point, size_t &restrict split_ix);
|
|
1656
|
+
template <class real_t_, class ldouble_safe>
|
|
1657
|
+
double find_split_rel_gain(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double &split_point, size_t &split_ix);
|
|
1658
|
+
template <class real_t, class real_t_, class mapping>
|
|
1659
|
+
double find_split_rel_gain_weighted_t(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &split_point, size_t &restrict split_ix, mapping &restrict w);
|
|
1660
|
+
template <class real_t_, class mapping, class ldouble_safe>
|
|
1661
|
+
double find_split_rel_gain_weighted(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
|
|
1662
|
+
template <class real_t, class real_t_=double>
|
|
1663
|
+
real_t calc_sd_right_to_left(real_t_ *restrict x, size_t n, double *restrict sd_arr);
|
|
1664
|
+
template <class real_t_, class ldouble_safe>
|
|
1665
|
+
ldouble_safe calc_sd_right_to_left_weighted(real_t_ *restrict x, size_t n, double *restrict sd_arr,
|
|
1666
|
+
double *restrict w, ldouble_safe &cumw, size_t *restrict sorted_ix);
|
|
1667
|
+
template <class real_t, class real_t_>
|
|
1668
|
+
real_t calc_sd_right_to_left(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr);
|
|
1669
|
+
template <class real_t_, class mapping, class ldouble_safe>
|
|
1670
|
+
ldouble_safe calc_sd_right_to_left_weighted(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end,
|
|
1671
|
+
double *restrict sd_arr, mapping &restrict w, ldouble_safe &cumw);
|
|
1672
|
+
template <class real_t, class real_t_>
|
|
1673
|
+
double find_split_std_gain_t(real_t_ *restrict x, size_t n, double *restrict sd_arr,
|
|
1674
|
+
GainCriterion criterion, double min_gain, double &restrict split_point);
|
|
1675
|
+
template <class real_t_, class ldouble_safe>
|
|
1676
|
+
double find_split_std_gain(real_t_ *restrict x, size_t n, double *restrict sd_arr,
|
|
1677
|
+
GainCriterion criterion, double min_gain, double &restrict split_point);
|
|
1678
|
+
template <class real_t, class ldouble_safe>
|
|
1679
|
+
double find_split_std_gain_weighted(real_t *restrict x, size_t n, double *restrict sd_arr,
|
|
1680
|
+
GainCriterion criterion, double min_gain, double &restrict split_point,
|
|
1681
|
+
double *restrict w, size_t *restrict sorted_ix);
|
|
1682
|
+
template <class real_t, class real_t_>
|
|
1683
|
+
double find_split_std_gain_t(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
|
|
1684
|
+
GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix);
|
|
1685
|
+
template <class real_t_, class ldouble_safe>
|
|
1686
|
+
double find_split_std_gain(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
|
|
1687
|
+
GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix);
|
|
1688
|
+
template <class real_t, class mapping, class ldouble_safe>
|
|
1689
|
+
double find_split_std_gain_weighted(real_t *restrict x, real_t xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
|
|
1690
|
+
GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
|
|
1691
|
+
template <class real_t, class ldouble_safe>
|
|
1692
|
+
double find_split_full_gain(real_t *restrict x, size_t st, size_t end, size_t *restrict ix_arr,
|
|
1693
|
+
size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
|
|
1694
|
+
double *restrict X_row_major, size_t ncols,
|
|
1695
|
+
double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
|
|
1696
|
+
double *restrict buffer_sum_left, double *restrict buffer_sum_tot,
|
|
1697
|
+
size_t &restrict split_ix, double &restrict split_point,
|
|
1698
|
+
bool x_uses_ix_arr);
|
|
1699
|
+
template <class real_t, class mapping, class ldouble_safe>
|
|
1700
|
+
double find_split_full_gain_weighted(real_t *restrict x, size_t st, size_t end, size_t *restrict ix_arr,
|
|
1701
|
+
size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
|
|
1702
|
+
double *restrict X_row_major, size_t ncols,
|
|
1703
|
+
double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
|
|
1704
|
+
double *restrict buffer_sum_left, double *restrict buffer_sum_tot,
|
|
1705
|
+
size_t &restrict split_ix, double &restrict split_point,
|
|
1706
|
+
bool x_uses_ix_arr,
|
|
1707
|
+
mapping &restrict w);
|
|
1708
|
+
template <class real_t_, class real_t>
|
|
1709
|
+
double find_split_dens_shortform_t(real_t *restrict x, size_t n, double &restrict split_point);
|
|
1710
|
+
template <class real_t, class ldouble_safe>
|
|
1711
|
+
double find_split_dens_shortform(real_t *restrict x, size_t n, double &restrict split_point);
|
|
1712
|
+
template <class real_t_, class real_t, class mapping>
|
|
1713
|
+
double find_split_dens_shortform_weighted_t(real_t *restrict x, size_t n, double &restrict split_point, mapping &restrict w, size_t *restrict buffer_indices);
|
|
1714
|
+
template <class real_t, class mapping, class ldouble_safe>
|
|
1715
|
+
double find_split_dens_shortform_weighted(real_t *restrict x, size_t n, double &restrict split_point, mapping &restrict w, size_t *restrict buffer_indices);
|
|
1716
|
+
template <class real_t>
|
|
1717
|
+
double find_split_dens_shortform(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
|
1718
|
+
double &restrict split_point, size_t &restrict split_ix);
|
|
1719
|
+
template <class real_t, class mapping>
|
|
1720
|
+
double find_split_dens_shortform_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
|
1721
|
+
double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
|
|
1722
|
+
template <class real_t, class ldouble_safe>
|
|
1723
|
+
double find_split_dens_longform(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
|
1724
|
+
double &restrict split_point, size_t &restrict split_ix);
|
|
1725
|
+
template <class real_t, class mapping, class ldouble_safe>
|
|
1726
|
+
double find_split_dens_longform_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
|
1727
|
+
double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
|
|
1728
|
+
template <class real_t, class ldouble_safe>
|
|
1729
|
+
double find_split_dens(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
|
1730
|
+
double &restrict split_point, size_t &restrict split_ix);
|
|
1731
|
+
template <class real_t, class mapping, class ldouble_safe>
|
|
1732
|
+
double find_split_dens_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
|
1733
|
+
double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
|
|
1734
|
+
template <class int_t, class ldouble_safe>
|
|
1735
|
+
double find_split_dens_longform(int *restrict x, int ncat, size_t *restrict ix_arr, size_t st, size_t end,
|
|
1736
|
+
CategSplit cat_split_type, MissingAction missing_action,
|
|
1737
|
+
int &restrict chosen_cat, signed char *restrict split_categ, int *restrict saved_cat_mode,
|
|
1738
|
+
size_t *restrict buffer_cnt, int_t *restrict buffer_indices);
|
|
1739
|
+
template <class mapping, class int_t, class ldouble_safe>
|
|
1740
|
+
double find_split_dens_longform_weighted(int *restrict x, int ncat, size_t *restrict ix_arr, size_t st, size_t end,
|
|
1741
|
+
CategSplit cat_split_type, MissingAction missing_action,
|
|
1742
|
+
int &restrict chosen_cat, signed char *restrict split_categ, int *restrict saved_cat_mode,
|
|
1743
|
+
int_t *restrict buffer_indices, mapping &restrict w);
|
|
1744
|
+
template <class ldouble_safe>
|
|
1745
|
+
double eval_guided_crit(double *restrict x, size_t n, GainCriterion criterion,
|
|
1746
|
+
double min_gain, bool as_relative_gain, double *restrict buffer_sd,
|
|
1747
|
+
double &restrict split_point, double &restrict xmin, double &restrict xmax,
|
|
1748
|
+
size_t *restrict ix_arr_plus_st,
|
|
1749
|
+
size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
|
|
1750
|
+
double *restrict X_row_major, size_t ncols,
|
|
1751
|
+
double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
|
|
1752
|
+
template <class ldouble_safe>
|
|
1753
|
+
double eval_guided_crit_weighted(double *restrict x, size_t n, GainCriterion criterion,
|
|
1754
|
+
double min_gain, bool as_relative_gain, double *restrict buffer_sd,
|
|
1755
|
+
double &restrict split_point, double &restrict xmin, double &restrict xmax,
|
|
1756
|
+
double *restrict w, size_t *restrict buffer_indices,
|
|
1757
|
+
size_t *restrict ix_arr_plus_st,
|
|
1758
|
+
size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
|
|
1759
|
+
double *restrict X_row_major, size_t ncols,
|
|
1760
|
+
double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
|
|
1761
|
+
template <class real_t_, class ldouble_safe>
|
|
1762
|
+
double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, real_t_ *restrict x,
|
|
1763
|
+
double *restrict buffer_sd, bool as_relative_gain,
|
|
1764
|
+
double *restrict buffer_imputed_x, double *restrict saved_xmedian,
|
|
1765
|
+
size_t &split_ix, double &restrict split_point, double &restrict xmin, double &restrict xmax,
|
|
1766
|
+
GainCriterion criterion, double min_gain, MissingAction missing_action,
|
|
1767
|
+
size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
|
|
1768
|
+
double *restrict X_row_major, size_t ncols,
|
|
1769
|
+
double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
|
|
1770
|
+
template <class real_t_, class mapping, class ldouble_safe>
|
|
1771
|
+
double eval_guided_crit_weighted(size_t *restrict ix_arr, size_t st, size_t end, real_t_ *restrict x,
|
|
1772
|
+
double *restrict buffer_sd, bool as_relative_gain,
|
|
1773
|
+
double *restrict buffer_imputed_x, double *restrict saved_xmedian,
|
|
1774
|
+
size_t &split_ix, double &restrict split_point, double &restrict xmin, double &restrict xmax,
|
|
1775
|
+
GainCriterion criterion, double min_gain, MissingAction missing_action,
|
|
1776
|
+
size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
|
|
1777
|
+
double *restrict X_row_major, size_t ncols,
|
|
1778
|
+
double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
|
|
1779
|
+
mapping &restrict w);
|
|
1780
|
+
template <class real_t_, class sparse_ix, class ldouble_safe>
|
|
879
1781
|
double eval_guided_crit(size_t ix_arr[], size_t st, size_t end,
|
|
880
|
-
size_t col_num,
|
|
881
|
-
double buffer_arr[], size_t buffer_pos[],
|
|
1782
|
+
size_t col_num, real_t_ Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
1783
|
+
double buffer_arr[], size_t buffer_pos[], bool as_relative_gain,
|
|
1784
|
+
double *restrict saved_xmedian,
|
|
882
1785
|
double &split_point, double &xmin, double &xmax,
|
|
883
|
-
GainCriterion criterion, double min_gain, MissingAction missing_action
|
|
1786
|
+
GainCriterion criterion, double min_gain, MissingAction missing_action,
|
|
1787
|
+
size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
|
|
1788
|
+
double *restrict X_row_major, size_t ncols,
|
|
1789
|
+
double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
|
|
1790
|
+
template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
|
|
1791
|
+
double eval_guided_crit_weighted(size_t ix_arr[], size_t st, size_t end,
|
|
1792
|
+
size_t col_num, real_t_ Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
1793
|
+
double buffer_arr[], size_t buffer_pos[], bool as_relative_gain,
|
|
1794
|
+
double *restrict saved_xmedian,
|
|
1795
|
+
double &restrict split_point, double &restrict xmin, double &restrict xmax,
|
|
1796
|
+
GainCriterion criterion, double min_gain, MissingAction missing_action,
|
|
1797
|
+
size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
|
|
1798
|
+
double *restrict X_row_major, size_t ncols,
|
|
1799
|
+
double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
|
|
1800
|
+
mapping &restrict w);
|
|
1801
|
+
template <class ldouble_safe>
|
|
884
1802
|
double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int ncat,
|
|
1803
|
+
int *restrict saved_cat_mode,
|
|
885
1804
|
size_t *restrict buffer_cnt, size_t *restrict buffer_pos, double *restrict buffer_prob,
|
|
886
|
-
int &chosen_cat, char *restrict split_categ, char *restrict buffer_split,
|
|
887
|
-
GainCriterion criterion, double min_gain, bool all_perm,
|
|
1805
|
+
int &restrict chosen_cat, signed char *restrict split_categ, signed char *restrict buffer_split,
|
|
1806
|
+
GainCriterion criterion, double min_gain, bool all_perm,
|
|
1807
|
+
MissingAction missing_action, CategSplit cat_split_type);
|
|
1808
|
+
template <class mapping, class ldouble_safe>
|
|
1809
|
+
double eval_guided_crit_weighted(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int ncat,
|
|
1810
|
+
int *restrict saved_cat_mode,
|
|
1811
|
+
size_t *restrict buffer_pos, double *restrict buffer_prob,
|
|
1812
|
+
int &restrict chosen_cat, signed char *restrict split_categ, signed char *restrict buffer_split,
|
|
1813
|
+
GainCriterion criterion, double min_gain, bool all_perm,
|
|
1814
|
+
MissingAction missing_action, CategSplit cat_split_type,
|
|
1815
|
+
mapping &restrict w);
|
|
1816
|
+
|
|
1817
|
+
/* indexer.cpp */
|
|
1818
|
+
template <class Tree>
|
|
1819
|
+
void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<Tree> &tree);
|
|
1820
|
+
void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoTree> &tree);
|
|
1821
|
+
void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoHPlane> &tree);
|
|
1822
|
+
template <class Model>
|
|
1823
|
+
void build_terminal_node_mappings(TreesIndexer &indexer, const Model &model);
|
|
1824
|
+
template <class Node>
|
|
1825
|
+
void build_dindex_recursive
|
|
1826
|
+
(
|
|
1827
|
+
const size_t curr_node,
|
|
1828
|
+
const size_t n_terminal, const size_t ncomb,
|
|
1829
|
+
const size_t st, const size_t end,
|
|
1830
|
+
std::vector<size_t> &restrict node_indices, /* array with all terminal indices in 'tree' */
|
|
1831
|
+
const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
|
|
1832
|
+
std::vector<double> &restrict node_distances, /* indexed by terminal_index */
|
|
1833
|
+
std::vector<double> &restrict node_depths, /* indexed by terminal_index */
|
|
1834
|
+
size_t curr_depth,
|
|
1835
|
+
const std::vector<Node> &tree
|
|
1836
|
+
);
|
|
1837
|
+
template <class Node>
|
|
1838
|
+
void build_dindex
|
|
1839
|
+
(
|
|
1840
|
+
std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
|
|
1841
|
+
const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
|
|
1842
|
+
std::vector<double> &restrict node_distances, /* indexed by terminal_index */
|
|
1843
|
+
std::vector<double> &restrict node_depths, /* indexed by terminal_index */
|
|
1844
|
+
const size_t n_terminal,
|
|
1845
|
+
const std::vector<Node> &tree
|
|
1846
|
+
);
|
|
1847
|
+
void build_dindex
|
|
1848
|
+
(
|
|
1849
|
+
std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
|
|
1850
|
+
const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
|
|
1851
|
+
std::vector<double> &restrict node_distances, /* indexed by terminal_index */
|
|
1852
|
+
std::vector<double> &restrict node_depths, /* indexed by terminal_index */
|
|
1853
|
+
const size_t n_terminal,
|
|
1854
|
+
const std::vector<IsoTree> &tree
|
|
1855
|
+
);
|
|
1856
|
+
void build_dindex
|
|
1857
|
+
(
|
|
1858
|
+
std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
|
|
1859
|
+
const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
|
|
1860
|
+
std::vector<double> &restrict node_distances, /* indexed by terminal_index */
|
|
1861
|
+
std::vector<double> &restrict node_depths, /* indexed by terminal_index */
|
|
1862
|
+
const size_t n_terminal,
|
|
1863
|
+
const std::vector<IsoHPlane> &tree
|
|
1864
|
+
);
|
|
1865
|
+
template <class Model>
|
|
1866
|
+
void build_distance_mappings(TreesIndexer &indexer, const Model &model, int nthreads);
|
|
1867
|
+
template <class Model>
|
|
1868
|
+
void build_tree_indices(TreesIndexer &indexer, const Model &model, int nthreads, const bool with_distances);
|
|
1869
|
+
ISOTREE_EXPORTED
|
|
1870
|
+
void build_tree_indices(TreesIndexer &indexer, const IsoForest &model, int nthreads, const bool with_distances);
|
|
1871
|
+
ISOTREE_EXPORTED
|
|
1872
|
+
void build_tree_indices(TreesIndexer &indexer, const ExtIsoForest &model, int nthreads, const bool with_distances);
|
|
1873
|
+
ISOTREE_EXPORTED
|
|
1874
|
+
void build_tree_indices
|
|
1875
|
+
(
|
|
1876
|
+
TreesIndexer *indexer,
|
|
1877
|
+
const IsoForest *model_outputs,
|
|
1878
|
+
const ExtIsoForest *model_outputs_ext,
|
|
1879
|
+
int nthreads,
|
|
1880
|
+
const bool with_distances
|
|
1881
|
+
);
|
|
1882
|
+
ISOTREE_EXPORTED
|
|
1883
|
+
size_t get_number_of_reference_points(const TreesIndexer &indexer) noexcept;
|
|
1884
|
+
void build_ref_node(SingleTreeIndex &node);
|
|
1885
|
+
|
|
1886
|
+
/* ref_indexer.hpp */
|
|
1887
|
+
template <class Model, class real_t, class sparse_ix>
|
|
1888
|
+
void set_reference_points(TreesIndexer &indexer, Model &model, const bool with_distances,
|
|
1889
|
+
real_t *restrict numeric_data, int *restrict categ_data,
|
|
1890
|
+
bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
1891
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1892
|
+
real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
|
|
1893
|
+
size_t nrows, int nthreads);
|
|
1894
|
+
template <class real_t, class sparse_ix>
|
|
1895
|
+
void set_reference_points(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext, TreesIndexer *indexer,
|
|
1896
|
+
const bool with_distances,
|
|
1897
|
+
real_t *restrict numeric_data, int *restrict categ_data,
|
|
1898
|
+
bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
1899
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1900
|
+
real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
|
|
1901
|
+
size_t nrows, int nthreads);
|
|
888
1902
|
|
|
889
1903
|
/* merge_models.cpp */
|
|
1904
|
+
ISOTREE_EXPORTED
|
|
890
1905
|
void merge_models(IsoForest* model, IsoForest* other,
|
|
891
1906
|
ExtIsoForest* ext_model, ExtIsoForest* ext_other,
|
|
892
|
-
Imputer* imputer, Imputer* iother
|
|
1907
|
+
Imputer* imputer, Imputer* iother,
|
|
1908
|
+
TreesIndexer* indexer, TreesIndexer* ind_other);
|
|
1909
|
+
|
|
1910
|
+
/* subset_models.cpp */
|
|
1911
|
+
ISOTREE_EXPORTED
|
|
1912
|
+
void subset_model(IsoForest* model, IsoForest* model_new,
|
|
1913
|
+
ExtIsoForest* ext_model, ExtIsoForest* ext_model_new,
|
|
1914
|
+
Imputer* imputer, Imputer* imputer_new,
|
|
1915
|
+
TreesIndexer* indexer, TreesIndexer* indexer_new,
|
|
1916
|
+
size_t *trees_take, size_t ntrees_take);
|
|
893
1917
|
|
|
894
|
-
#ifdef _ENABLE_CEREAL
|
|
895
1918
|
/* serialize.cpp */
|
|
896
|
-
|
|
897
|
-
void
|
|
898
|
-
|
|
899
|
-
void
|
|
900
|
-
|
|
901
|
-
void
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
#
|
|
1919
|
+
[[noreturn]]
|
|
1920
|
+
void throw_errno();
|
|
1921
|
+
[[noreturn]]
|
|
1922
|
+
void throw_ferror(FILE *file);
|
|
1923
|
+
[[noreturn]]
|
|
1924
|
+
void throw_feoferr();
|
|
1925
|
+
class FileHandle
|
|
1926
|
+
{
|
|
1927
|
+
public:
|
|
1928
|
+
FILE *handle = NULL;
|
|
1929
|
+
FileHandle(const char *fname, const char *mode)
|
|
1930
|
+
{
|
|
1931
|
+
this->handle = std::fopen(fname, mode);
|
|
1932
|
+
if (!(this->handle))
|
|
1933
|
+
throw_errno();
|
|
1934
|
+
}
|
|
1935
|
+
~FileHandle()
|
|
1936
|
+
{
|
|
1937
|
+
if (this->handle) {
|
|
1938
|
+
int err = std::fclose(this->handle);
|
|
1939
|
+
if (err)
|
|
1940
|
+
fprintf(stderr, "Error: could not close file.\n");
|
|
1941
|
+
}
|
|
1942
|
+
this->handle = NULL;
|
|
1943
|
+
}
|
|
1944
|
+
};
|
|
1945
|
+
|
|
1946
|
+
#if defined(_WIN32) && (defined(_MSC_VER) || defined(__GNUC__))
|
|
1947
|
+
#define WCHAR_T_FUNS
|
|
1948
|
+
#include <stdio.h>
|
|
1949
|
+
class WFileHandle
|
|
1950
|
+
{
|
|
1951
|
+
public:
|
|
1952
|
+
FILE *handle = NULL;
|
|
1953
|
+
WFileHandle(const wchar_t *fname, const wchar_t *mode)
|
|
1954
|
+
{
|
|
1955
|
+
this->handle = _wfopen(fname, mode);
|
|
1956
|
+
if (!(this->handle))
|
|
1957
|
+
throw_errno();
|
|
1958
|
+
}
|
|
1959
|
+
~WFileHandle()
|
|
1960
|
+
{
|
|
1961
|
+
if (this->handle) {
|
|
1962
|
+
int err = std::fclose(this->handle);
|
|
1963
|
+
if (err)
|
|
1964
|
+
fprintf(stderr, "Error: could not close file.\n");
|
|
1965
|
+
}
|
|
1966
|
+
this->handle = NULL;
|
|
1967
|
+
}
|
|
1968
|
+
};
|
|
1969
|
+
#endif
|
|
1970
|
+
ISOTREE_EXPORTED
|
|
1971
|
+
bool has_wchar_t_file_serializers() noexcept;
|
|
1972
|
+
ISOTREE_EXPORTED
|
|
1973
|
+
size_t determine_serialized_size(const IsoForest &model) noexcept;
|
|
1974
|
+
ISOTREE_EXPORTED
|
|
1975
|
+
size_t determine_serialized_size(const ExtIsoForest &model) noexcept;
|
|
1976
|
+
ISOTREE_EXPORTED
|
|
1977
|
+
size_t determine_serialized_size(const Imputer &model) noexcept;
|
|
1978
|
+
ISOTREE_EXPORTED
|
|
1979
|
+
size_t determine_serialized_size(const TreesIndexer &model) noexcept;
|
|
1980
|
+
ISOTREE_EXPORTED
|
|
1981
|
+
void serialize_IsoForest(const IsoForest &model, char *out);
|
|
1982
|
+
ISOTREE_EXPORTED
|
|
1983
|
+
void serialize_IsoForest(const IsoForest &model, FILE *out);
|
|
1984
|
+
ISOTREE_EXPORTED
|
|
1985
|
+
void serialize_IsoForest(const IsoForest &model, std::ostream &out);
|
|
1986
|
+
ISOTREE_EXPORTED
|
|
1987
|
+
std::string serialize_IsoForest(const IsoForest &model);
|
|
1988
|
+
ISOTREE_EXPORTED
|
|
1989
|
+
void serialize_IsoForest_ToFile(const IsoForest &model, const char *fname);
|
|
1990
|
+
#ifdef WCHAR_T_FUNS
|
|
1991
|
+
ISOTREE_EXPORTED
|
|
1992
|
+
void serialize_IsoForest_ToFile(const IsoForest &model, const wchar_t *fname);
|
|
1993
|
+
#endif
|
|
1994
|
+
/* Deserialization entry points for an IsoForest, one per kind of input:
   a const char buffer, a FILE*, a std::istream, a std::string, or a file
   identified by name. The model is received by non-const reference and
   nothing is returned.
   NOTE(review): presumably 'model' is overwritten with the decoded
   contents - confirm against the definition. */
ISOTREE_EXPORTED
|
|
1995
|
+
void deserialize_IsoForest(IsoForest &model, const char *in);
|
|
1996
|
+
ISOTREE_EXPORTED
|
|
1997
|
+
void deserialize_IsoForest(IsoForest &model, FILE *in);
|
|
1998
|
+
ISOTREE_EXPORTED
|
|
1999
|
+
void deserialize_IsoForest(IsoForest &model, std::istream &in);
|
|
2000
|
+
ISOTREE_EXPORTED
|
|
2001
|
+
void deserialize_IsoForest(IsoForest &model, const std::string &in);
|
|
2002
|
+
ISOTREE_EXPORTED
|
|
2003
|
+
void deserialize_IsoForest_FromFile(IsoForest &model, const char *fname);
|
|
2004
|
+
/* The wide-character file-name overload is declared only when WCHAR_T_FUNS is defined. */
#ifdef WCHAR_T_FUNS
|
|
2005
|
+
ISOTREE_EXPORTED
|
|
2006
|
+
void deserialize_IsoForest_FromFile(IsoForest &model, const wchar_t *fname);
|
|
2007
|
+
#endif
|
|
2008
|
+
/* Serialization entry points for an ExtIsoForest (taken by const
   reference), one per kind of output: a caller-supplied char buffer, a
   FILE*, a std::ostream, a returned std::string, or a file identified by
   name.
   NOTE(review): the 'char *out' overload receives no buffer length -
   presumably the caller must size it with determine_serialized_size(model);
   confirm against the definition. */
ISOTREE_EXPORTED
|
|
2009
|
+
void serialize_ExtIsoForest(const ExtIsoForest &model, char *out);
|
|
2010
|
+
ISOTREE_EXPORTED
|
|
2011
|
+
void serialize_ExtIsoForest(const ExtIsoForest &model, FILE *out);
|
|
2012
|
+
ISOTREE_EXPORTED
|
|
2013
|
+
void serialize_ExtIsoForest(const ExtIsoForest &model, std::ostream &out);
|
|
2014
|
+
ISOTREE_EXPORTED
|
|
2015
|
+
std::string serialize_ExtIsoForest(const ExtIsoForest &model);
|
|
2016
|
+
ISOTREE_EXPORTED
|
|
2017
|
+
void serialize_ExtIsoForest_ToFile(const ExtIsoForest &model, const char *fname);
|
|
2018
|
+
/* The wide-character file-name overload is declared only when WCHAR_T_FUNS is defined. */
#ifdef WCHAR_T_FUNS
|
|
2019
|
+
ISOTREE_EXPORTED
|
|
2020
|
+
void serialize_ExtIsoForest_ToFile(const ExtIsoForest &model, const wchar_t *fname);
|
|
2021
|
+
#endif
|
|
2022
|
+
/* Deserialization entry points for an ExtIsoForest, one per kind of input:
   a const char buffer, a FILE*, a std::istream, a std::string, or a file
   identified by name. The model is received by non-const reference and
   nothing is returned.
   NOTE(review): presumably 'model' is overwritten with the decoded
   contents - confirm against the definition. */
ISOTREE_EXPORTED
|
|
2023
|
+
void deserialize_ExtIsoForest(ExtIsoForest &model, const char *in);
|
|
2024
|
+
ISOTREE_EXPORTED
|
|
2025
|
+
void deserialize_ExtIsoForest(ExtIsoForest &model, FILE *in);
|
|
2026
|
+
ISOTREE_EXPORTED
|
|
2027
|
+
void deserialize_ExtIsoForest(ExtIsoForest &model, std::istream &in);
|
|
2028
|
+
ISOTREE_EXPORTED
|
|
2029
|
+
void deserialize_ExtIsoForest(ExtIsoForest &model, const std::string &in);
|
|
2030
|
+
ISOTREE_EXPORTED
|
|
2031
|
+
void deserialize_ExtIsoForest_FromFile(ExtIsoForest &model, const char *fname);
|
|
2032
|
+
/* The wide-character file-name overload is declared only when WCHAR_T_FUNS is defined. */
#ifdef WCHAR_T_FUNS
|
|
2033
|
+
ISOTREE_EXPORTED
|
|
2034
|
+
void deserialize_ExtIsoForest_FromFile(ExtIsoForest &model, const wchar_t *fname);
|
|
2035
|
+
#endif
|
|
2036
|
+
/* Serialization entry points for an Imputer (taken by const reference),
   one per kind of output: a caller-supplied char buffer, a FILE*, a
   std::ostream, a returned std::string, or a file identified by name.
   NOTE(review): the 'char *out' overload receives no buffer length -
   presumably the caller must size it with determine_serialized_size(model);
   confirm against the definition. */
ISOTREE_EXPORTED
|
|
2037
|
+
void serialize_Imputer(const Imputer &model, char *out);
|
|
2038
|
+
ISOTREE_EXPORTED
|
|
2039
|
+
void serialize_Imputer(const Imputer &model, FILE *out);
|
|
2040
|
+
ISOTREE_EXPORTED
|
|
2041
|
+
void serialize_Imputer(const Imputer &model, std::ostream &out);
|
|
2042
|
+
ISOTREE_EXPORTED
|
|
2043
|
+
std::string serialize_Imputer(const Imputer &model);
|
|
2044
|
+
ISOTREE_EXPORTED
|
|
2045
|
+
void serialize_Imputer_ToFile(const Imputer &model, const char *fname);
|
|
2046
|
+
/* The wide-character file-name overload is declared only when WCHAR_T_FUNS is defined. */
#ifdef WCHAR_T_FUNS
|
|
2047
|
+
ISOTREE_EXPORTED
|
|
2048
|
+
void serialize_Imputer_ToFile(const Imputer &model, const wchar_t *fname);
|
|
2049
|
+
#endif
|
|
2050
|
+
/* Deserialization entry points for an Imputer, one per kind of input:
   a const char buffer, a FILE*, a std::istream, a std::string, or a file
   identified by name. The model is received by non-const reference and
   nothing is returned.
   NOTE(review): presumably 'model' is overwritten with the decoded
   contents - confirm against the definition. */
ISOTREE_EXPORTED
|
|
2051
|
+
void deserialize_Imputer(Imputer &model, const char *in);
|
|
2052
|
+
ISOTREE_EXPORTED
|
|
2053
|
+
void deserialize_Imputer(Imputer &model, FILE *in);
|
|
2054
|
+
ISOTREE_EXPORTED
|
|
2055
|
+
void deserialize_Imputer(Imputer &model, std::istream &in);
|
|
2056
|
+
ISOTREE_EXPORTED
|
|
2057
|
+
void deserialize_Imputer(Imputer &model, const std::string &in);
|
|
2058
|
+
ISOTREE_EXPORTED
|
|
2059
|
+
void deserialize_Imputer_FromFile(Imputer &model, const char *fname);
|
|
2060
|
+
/* The wide-character file-name overload is declared only when WCHAR_T_FUNS is defined. */
#ifdef WCHAR_T_FUNS
|
|
2061
|
+
ISOTREE_EXPORTED
|
|
2062
|
+
void deserialize_Imputer_FromFile(Imputer &model, const wchar_t *fname);
|
|
2063
|
+
#endif
|
|
2064
|
+
/* Serialization entry points for a TreesIndexer (the functions are named
   '*_Indexer' but the parameter type is TreesIndexer), one per kind of
   output: a caller-supplied char buffer, a FILE*, a std::ostream, a
   returned std::string, or a file identified by name.
   NOTE(review): the 'char *out' overload receives no buffer length -
   presumably the caller must size it with determine_serialized_size(model);
   confirm against the definition. */
ISOTREE_EXPORTED
|
|
2065
|
+
void serialize_Indexer(const TreesIndexer &model, char *out);
|
|
2066
|
+
ISOTREE_EXPORTED
|
|
2067
|
+
void serialize_Indexer(const TreesIndexer &model, FILE *out);
|
|
2068
|
+
ISOTREE_EXPORTED
|
|
2069
|
+
void serialize_Indexer(const TreesIndexer &model, std::ostream &out);
|
|
2070
|
+
ISOTREE_EXPORTED
|
|
2071
|
+
std::string serialize_Indexer(const TreesIndexer &model);
|
|
2072
|
+
ISOTREE_EXPORTED
|
|
2073
|
+
void serialize_Indexer_ToFile(const TreesIndexer &model, const char *fname);
|
|
2074
|
+
/* The wide-character file-name overload is declared only when WCHAR_T_FUNS is defined. */
#ifdef WCHAR_T_FUNS
|
|
2075
|
+
ISOTREE_EXPORTED
|
|
2076
|
+
void serialize_Indexer_ToFile(const TreesIndexer &model, const wchar_t *fname);
|
|
2077
|
+
#endif
|
|
2078
|
+
/* Deserialization entry points for a TreesIndexer, one per kind of input:
   a const char buffer, a FILE*, a std::istream, a std::string, or a file
   identified by name. The model is received by non-const reference and
   nothing is returned.
   NOTE(review): presumably 'model' is overwritten with the decoded
   contents - confirm against the definition. */
ISOTREE_EXPORTED
|
|
2079
|
+
void deserialize_Indexer(TreesIndexer &model, const char *in);
|
|
2080
|
+
ISOTREE_EXPORTED
|
|
2081
|
+
void deserialize_Indexer(TreesIndexer &model, FILE *in);
|
|
2082
|
+
ISOTREE_EXPORTED
|
|
2083
|
+
void deserialize_Indexer(TreesIndexer &model, std::istream &in);
|
|
2084
|
+
ISOTREE_EXPORTED
|
|
2085
|
+
void deserialize_Indexer(TreesIndexer &model, const std::string &in);
|
|
2086
|
+
ISOTREE_EXPORTED
|
|
2087
|
+
void deserialize_Indexer_FromFile(TreesIndexer &model, const char *fname);
|
|
2088
|
+
/* The wide-character file-name overload is declared only when WCHAR_T_FUNS is defined. */
#ifdef WCHAR_T_FUNS
|
|
2089
|
+
ISOTREE_EXPORTED
|
|
2090
|
+
void deserialize_Indexer_FromFile(TreesIndexer &model, const wchar_t *fname);
|
|
2091
|
+
#endif
|
|
2092
|
+
/* Overload sets that share a single name across the four model types
   (IsoForest, ExtIsoForest, Imputer, TreesIndexer) and work only on raw
   char buffers. Unlike the type-suffixed declarations in this header, these
   are not marked ISOTREE_EXPORTED.
   NOTE(review): presumably internal helpers that the exported, type-suffixed
   functions forward to - confirm against the definitions. */
void serialize_isotree(const IsoForest &model, char *out);
|
|
2093
|
+
void serialize_isotree(const ExtIsoForest &model, char *out);
|
|
2094
|
+
void serialize_isotree(const Imputer &model, char *out);
|
|
2095
|
+
void serialize_isotree(const TreesIndexer &model, char *out);
|
|
2096
|
+
/* Reading counterparts: model by non-const reference, input bytes by const char pointer. */
void deserialize_isotree(IsoForest &model, const char *in);
|
|
2097
|
+
void deserialize_isotree(ExtIsoForest &model, const char *in);
|
|
2098
|
+
void deserialize_isotree(Imputer &model, const char *in);
|
|
2099
|
+
void deserialize_isotree(TreesIndexer &model, const char *in);
|
|
2100
|
+
/* Incremental variants taking a writable buffer named 'old_bytes_reallocated'.
   NOTE(review): the parameter name suggests the caller has already grown the
   buffer holding an earlier serialization - confirm the required size. */
void incremental_serialize_isotree(const IsoForest &model, char *old_bytes_reallocated);
|
|
2101
|
+
void incremental_serialize_isotree(const ExtIsoForest &model, char *old_bytes_reallocated);
|
|
2102
|
+
void incremental_serialize_isotree(const Imputer &model, char *old_bytes_reallocated);
|
|
2103
|
+
void incremental_serialize_isotree(const TreesIndexer &model, char *old_bytes_reallocated);
|
|
2104
|
+
/* Exported incremental serializers, one per model type, operating on a
   std::string passed by non-const reference.
   NOTE(review): presumably 'old_bytes' holds an earlier serialization of
   the same model and is updated in place to include what was added since -
   confirm against the definitions. */
ISOTREE_EXPORTED
|
|
2105
|
+
void incremental_serialize_IsoForest(const IsoForest &model, std::string &old_bytes);
|
|
2106
|
+
ISOTREE_EXPORTED
|
|
2107
|
+
void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, std::string &old_bytes);
|
|
2108
|
+
ISOTREE_EXPORTED
|
|
2109
|
+
void incremental_serialize_Imputer(const Imputer &model, std::string &old_bytes);
|
|
2110
|
+
ISOTREE_EXPORTED
|
|
2111
|
+
void incremental_serialize_Indexer(const TreesIndexer &model, std::string &old_bytes);
|
|
2112
|
+
/* Four exported overloads that differ only in the type of the first
   parameter 'serialized_bytes' (const char*, FILE*, std::istream&,
   const std::string&). The function returns void; every result is
   delivered through the nine reference parameters that follow (eight bool
   flags and one size_t).
   NOTE(review): the flag names suggest they describe what the serialized
   bytes contain (which object types, whether metadata is present and its
   size) - confirm the exact meaning of each against the definition. */
ISOTREE_EXPORTED
|
|
2113
|
+
void inspect_serialized_object
|
|
2114
|
+
(
|
|
2115
|
+
const char *serialized_bytes,
|
|
2116
|
+
bool &is_isotree_model,
|
|
2117
|
+
bool &is_compatible,
|
|
2118
|
+
bool &has_combined_objects,
|
|
2119
|
+
bool &has_IsoForest,
|
|
2120
|
+
bool &has_ExtIsoForest,
|
|
2121
|
+
bool &has_Imputer,
|
|
2122
|
+
bool &has_Indexer,
|
|
2123
|
+
bool &has_metadata,
|
|
2124
|
+
size_t &size_metadata
|
|
2125
|
+
);
|
|
2126
|
+
/* Same outputs, reading from a FILE*. */
ISOTREE_EXPORTED
|
|
2127
|
+
void inspect_serialized_object
|
|
2128
|
+
(
|
|
2129
|
+
FILE *serialized_bytes,
|
|
2130
|
+
bool &is_isotree_model,
|
|
2131
|
+
bool &is_compatible,
|
|
2132
|
+
bool &has_combined_objects,
|
|
2133
|
+
bool &has_IsoForest,
|
|
2134
|
+
bool &has_ExtIsoForest,
|
|
2135
|
+
bool &has_Imputer,
|
|
2136
|
+
bool &has_Indexer,
|
|
2137
|
+
bool &has_metadata,
|
|
2138
|
+
size_t &size_metadata
|
|
2139
|
+
);
|
|
2140
|
+
/* Same outputs, reading from a std::istream. */
ISOTREE_EXPORTED
|
|
2141
|
+
void inspect_serialized_object
|
|
2142
|
+
(
|
|
2143
|
+
std::istream &serialized_bytes,
|
|
2144
|
+
bool &is_isotree_model,
|
|
2145
|
+
bool &is_compatible,
|
|
2146
|
+
bool &has_combined_objects,
|
|
2147
|
+
bool &has_IsoForest,
|
|
2148
|
+
bool &has_ExtIsoForest,
|
|
2149
|
+
bool &has_Imputer,
|
|
2150
|
+
bool &has_Indexer,
|
|
2151
|
+
bool &has_metadata,
|
|
2152
|
+
size_t &size_metadata
|
|
2153
|
+
);
|
|
2154
|
+
/* Same outputs, reading from a std::string. */
ISOTREE_EXPORTED
|
|
2155
|
+
void inspect_serialized_object
|
|
2156
|
+
(
|
|
2157
|
+
const std::string &serialized_bytes,
|
|
2158
|
+
bool &is_isotree_model,
|
|
2159
|
+
bool &is_compatible,
|
|
2160
|
+
bool &has_combined_objects,
|
|
2161
|
+
bool &has_IsoForest,
|
|
2162
|
+
bool &has_ExtIsoForest,
|
|
2163
|
+
bool &has_Imputer,
|
|
2164
|
+
bool &has_Indexer,
|
|
2165
|
+
bool &has_metadata,
|
|
2166
|
+
size_t &size_metadata
|
|
2167
|
+
);
|
|
2168
|
+
/* One exported overload per model type. Each receives the model by const
   reference together with a const char pointer to serialized bytes and
   returns a bool. These declarations carry no noexcept specifier.
   NOTE(review): presumably true means 'serialized_bytes' is an earlier
   serialization of 'model' that the incremental serializers can extend -
   confirm against the definitions. */
ISOTREE_EXPORTED
|
|
2169
|
+
bool check_can_undergo_incremental_serialization(const IsoForest &model, const char *serialized_bytes);
|
|
2170
|
+
ISOTREE_EXPORTED
|
|
2171
|
+
bool check_can_undergo_incremental_serialization(const ExtIsoForest &model, const char *serialized_bytes);
|
|
2172
|
+
ISOTREE_EXPORTED
|
|
2173
|
+
bool check_can_undergo_incremental_serialization(const Imputer &model, const char *serialized_bytes);
|
|
2174
|
+
ISOTREE_EXPORTED
|
|
2175
|
+
bool check_can_undergo_incremental_serialization(const TreesIndexer &model, const char *serialized_bytes);
|
|
2176
|
+
/* One exported overload per model type. Each takes the model by const
   reference plus a tree count 'old_ntrees', returns a size_t and is
   declared noexcept.
   NOTE(review): presumably the extra bytes needed to serialize the trees
   beyond the first 'old_ntrees' - confirm against the definitions. */
ISOTREE_EXPORTED
|
|
2177
|
+
size_t determine_serialized_size_additional_trees(const IsoForest &model, size_t old_ntrees) noexcept;
|
|
2178
|
+
ISOTREE_EXPORTED
|
|
2179
|
+
size_t determine_serialized_size_additional_trees(const ExtIsoForest &model, size_t old_ntrees) noexcept;
|
|
2180
|
+
ISOTREE_EXPORTED
|
|
2181
|
+
size_t determine_serialized_size_additional_trees(const Imputer &model, size_t old_ntrees) noexcept;
|
|
2182
|
+
ISOTREE_EXPORTED
|
|
2183
|
+
size_t determine_serialized_size_additional_trees(const TreesIndexer &model, size_t old_ntrees) noexcept;
|
|
2184
|
+
/* Exported incremental serializers, one per model type, writing into a
   raw char buffer; no buffer length is passed.
   NOTE(review): the parameter name 'old_bytes_reallocated' suggests the
   caller must already have enlarged the buffer that holds the earlier
   serialization - confirm the required size against the definitions. */
ISOTREE_EXPORTED
|
|
2185
|
+
void incremental_serialize_IsoForest(const IsoForest &model, char *old_bytes_reallocated);
|
|
2186
|
+
ISOTREE_EXPORTED
|
|
2187
|
+
void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, char *old_bytes_reallocated);
|
|
2188
|
+
ISOTREE_EXPORTED
|
|
2189
|
+
void incremental_serialize_Imputer(const Imputer &model, char *old_bytes_reallocated);
|
|
2190
|
+
ISOTREE_EXPORTED
|
|
2191
|
+
void incremental_serialize_Indexer(const TreesIndexer &model, char *old_bytes_reallocated);
|
|
2192
|
+
/* Two exported, noexcept overloads returning a size_t. This one takes
   pointers to the four model object types plus the size of optional
   metadata.
   NOTE(review): components are passed by pointer, so presumably a null
   pointer means "not included", and the result is presumably the buffer
   size needed by serialize_combined - confirm against the definition. */
ISOTREE_EXPORTED
|
|
2193
|
+
size_t determine_serialized_size_combined
|
|
2194
|
+
(
|
|
2195
|
+
const IsoForest *model,
|
|
2196
|
+
const ExtIsoForest *model_ext,
|
|
2197
|
+
const Imputer *imputer,
|
|
2198
|
+
const TreesIndexer *indexer,
|
|
2199
|
+
const size_t size_optional_metadata
|
|
2200
|
+
) noexcept;
|
|
2201
|
+
/* Same computation, but each component is given as a const char pointer
   (named 'serialized_*') instead of a model object. */
ISOTREE_EXPORTED
|
|
2202
|
+
size_t determine_serialized_size_combined
|
|
2203
|
+
(
|
|
2204
|
+
const char *serialized_model,
|
|
2205
|
+
const char *serialized_model_ext,
|
|
2206
|
+
const char *serialized_imputer,
|
|
2207
|
+
const char *serialized_indexer,
|
|
2208
|
+
const size_t size_optional_metadata
|
|
2209
|
+
) noexcept;
|
|
2210
|
+
/* Seven exported overloads of serialize_combined. The first four take
   pointers to model objects; the last three take const char pointers named
   'serialized_*'. All accept an 'optional_metadata' byte pointer with its
   size. Output goes to a char buffer (model-object form only), a FILE*, a
   std::ostream, or is returned as a std::string.
   NOTE(review): components are passed by pointer, so presumably a null
   pointer omits that component - confirm against the definitions.

   Model objects -> caller-supplied char buffer (no length is passed). */
ISOTREE_EXPORTED
|
|
2211
|
+
void serialize_combined
|
|
2212
|
+
(
|
|
2213
|
+
const IsoForest *model,
|
|
2214
|
+
const ExtIsoForest *model_ext,
|
|
2215
|
+
const Imputer *imputer,
|
|
2216
|
+
const TreesIndexer *indexer,
|
|
2217
|
+
const char *optional_metadata,
|
|
2218
|
+
const size_t size_optional_metadata,
|
|
2219
|
+
char *out
|
|
2220
|
+
);
|
|
2221
|
+
/* Model objects -> FILE*. */
ISOTREE_EXPORTED
|
|
2222
|
+
void serialize_combined
|
|
2223
|
+
(
|
|
2224
|
+
const IsoForest *model,
|
|
2225
|
+
const ExtIsoForest *model_ext,
|
|
2226
|
+
const Imputer *imputer,
|
|
2227
|
+
const TreesIndexer *indexer,
|
|
2228
|
+
const char *optional_metadata,
|
|
2229
|
+
const size_t size_optional_metadata,
|
|
2230
|
+
FILE *out
|
|
2231
|
+
);
|
|
2232
|
+
/* Model objects -> std::ostream. */
ISOTREE_EXPORTED
|
|
2233
|
+
void serialize_combined
|
|
2234
|
+
(
|
|
2235
|
+
const IsoForest *model,
|
|
2236
|
+
const ExtIsoForest *model_ext,
|
|
2237
|
+
const Imputer *imputer,
|
|
2238
|
+
const TreesIndexer *indexer,
|
|
2239
|
+
const char *optional_metadata,
|
|
2240
|
+
const size_t size_optional_metadata,
|
|
2241
|
+
std::ostream &out
|
|
2242
|
+
);
|
|
2243
|
+
/* Model objects -> returned std::string. */
ISOTREE_EXPORTED
|
|
2244
|
+
std::string serialize_combined
|
|
2245
|
+
(
|
|
2246
|
+
const IsoForest *model,
|
|
2247
|
+
const ExtIsoForest *model_ext,
|
|
2248
|
+
const Imputer *imputer,
|
|
2249
|
+
const TreesIndexer *indexer,
|
|
2250
|
+
const char *optional_metadata,
|
|
2251
|
+
const size_t size_optional_metadata
|
|
2252
|
+
);
|
|
2253
|
+
/* Serialized-byte pointers -> FILE*. */
ISOTREE_EXPORTED
|
|
2254
|
+
void serialize_combined
|
|
2255
|
+
(
|
|
2256
|
+
const char *serialized_model,
|
|
2257
|
+
const char *serialized_model_ext,
|
|
2258
|
+
const char *serialized_imputer,
|
|
2259
|
+
const char *serialized_indexer,
|
|
2260
|
+
const char *optional_metadata,
|
|
2261
|
+
const size_t size_optional_metadata,
|
|
2262
|
+
FILE *out
|
|
2263
|
+
);
|
|
2264
|
+
/* Serialized-byte pointers -> std::ostream. */
ISOTREE_EXPORTED
|
|
2265
|
+
void serialize_combined
|
|
2266
|
+
(
|
|
2267
|
+
const char *serialized_model,
|
|
2268
|
+
const char *serialized_model_ext,
|
|
2269
|
+
const char *serialized_imputer,
|
|
2270
|
+
const char *serialized_indexer,
|
|
2271
|
+
const char *optional_metadata,
|
|
2272
|
+
const size_t size_optional_metadata,
|
|
2273
|
+
std::ostream &out
|
|
2274
|
+
);
|
|
2275
|
+
/* Serialized-byte pointers -> returned std::string. */
ISOTREE_EXPORTED
|
|
2276
|
+
std::string serialize_combined
|
|
2277
|
+
(
|
|
2278
|
+
const char *serialized_model,
|
|
2279
|
+
const char *serialized_model_ext,
|
|
2280
|
+
const char *serialized_imputer,
|
|
2281
|
+
const char *serialized_indexer,
|
|
2282
|
+
const char *optional_metadata,
|
|
2283
|
+
const size_t size_optional_metadata
|
|
2284
|
+
);
|
|
2285
|
+
/* Four exported overloads that differ only in the type of the input 'in'
   (const char*, FILE*, std::istream&, const std::string&). Each receives
   non-const pointers to the four model object types and a writable
   'optional_metadata' char pointer, and returns void.
   NOTE(review): presumably the pointed-to objects are filled from the
   input and 'optional_metadata' receives the stored metadata bytes; no
   metadata buffer length is passed, so the caller presumably sizes it from
   inspect_serialized_object - confirm against the definitions. */
ISOTREE_EXPORTED
|
|
2286
|
+
void deserialize_combined
|
|
2287
|
+
(
|
|
2288
|
+
const char* in,
|
|
2289
|
+
IsoForest *model,
|
|
2290
|
+
ExtIsoForest *model_ext,
|
|
2291
|
+
Imputer *imputer,
|
|
2292
|
+
TreesIndexer *indexer,
|
|
2293
|
+
char *optional_metadata
|
|
2294
|
+
);
|
|
2295
|
+
/* Same outputs, reading from a FILE*. */
ISOTREE_EXPORTED
|
|
2296
|
+
void deserialize_combined
|
|
2297
|
+
(
|
|
2298
|
+
FILE* in,
|
|
2299
|
+
IsoForest *model,
|
|
2300
|
+
ExtIsoForest *model_ext,
|
|
2301
|
+
Imputer *imputer,
|
|
2302
|
+
TreesIndexer *indexer,
|
|
2303
|
+
char *optional_metadata
|
|
2304
|
+
);
|
|
2305
|
+
/* Same outputs, reading from a std::istream. */
ISOTREE_EXPORTED
|
|
2306
|
+
void deserialize_combined
|
|
2307
|
+
(
|
|
2308
|
+
std::istream &in,
|
|
2309
|
+
IsoForest *model,
|
|
2310
|
+
ExtIsoForest *model_ext,
|
|
2311
|
+
Imputer *imputer,
|
|
2312
|
+
TreesIndexer *indexer,
|
|
2313
|
+
char *optional_metadata
|
|
2314
|
+
);
|
|
2315
|
+
/* Same outputs, reading from a std::string. */
ISOTREE_EXPORTED
|
|
2316
|
+
void deserialize_combined
|
|
2317
|
+
(
|
|
2318
|
+
const std::string &in,
|
|
2319
|
+
IsoForest *model,
|
|
2320
|
+
ExtIsoForest *model_ext,
|
|
2321
|
+
Imputer *imputer,
|
|
2322
|
+
TreesIndexer *indexer,
|
|
2323
|
+
char *optional_metadata
|
|
2324
|
+
);
|
|
2325
|
+
/* Non-exported, noexcept helpers. check_model_has_range_penalty is declared
   for IsoForest and ExtIsoForest only and returns a bool;
   add_range_penalty is declared for all four model types and takes the
   model by non-const reference.
   NOTE(review): what "range penalty" means, and what the Imputer and
   TreesIndexer overloads of add_range_penalty do, cannot be told from
   these declarations - confirm against the definitions. */
bool check_model_has_range_penalty(const IsoForest &model) noexcept;
|
|
2326
|
+
bool check_model_has_range_penalty(const ExtIsoForest &model) noexcept;
|
|
2327
|
+
void add_range_penalty(IsoForest &model) noexcept;
|
|
2328
|
+
void add_range_penalty(ExtIsoForest &model) noexcept;
|
|
2329
|
+
void add_range_penalty(Imputer &model) noexcept;
|
|
2330
|
+
void add_range_penalty(TreesIndexer &model) noexcept;
|
|
924
2331
|
|
|
925
2332
|
/* sql.cpp */
|
|
2333
|
+
/* Exported function returning a vector of strings. Takes pointers to an
   IsoForest and an ExtIsoForest, column-name and category-level vectors by
   non-const reference, several bool options, a tree number and a thread
   count.
   NOTE(review): presumably each returned string is the SQL generated for
   one tree, and presumably only one of the two model pointers is expected
   to be non-null - confirm against the definition in sql.cpp. */
ISOTREE_EXPORTED
|
|
926
2334
|
std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
927
2335
|
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
|
928
2336
|
std::vector<std::vector<std::string>> &categ_levels,
|
|
929
2337
|
bool output_tree_num, bool index1, bool single_tree, size_t tree_num,
|
|
930
2338
|
int nthreads);
|
|
2339
|
+
ISOTREE_EXPORTED
|
|
931
2340
|
std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
932
2341
|
std::string &table_from, std::string &select_as,
|
|
933
2342
|
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
|
@@ -935,7 +2344,8 @@ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest
|
|
|
935
2344
|
bool index1, int nthreads);
|
|
936
2345
|
void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
|
|
937
2346
|
size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
|
|
938
|
-
std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right
|
|
2347
|
+
std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right,
|
|
2348
|
+
const IsoForest *model_outputs, const ExtIsoForest *model_outputs_ext);
|
|
939
2349
|
void extract_cond_isotree(IsoForest &model, IsoTree &tree,
|
|
940
2350
|
std::string &cond_left, std::string &cond_right,
|
|
941
2351
|
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
|
@@ -945,7 +2355,9 @@ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
|
|
|
945
2355
|
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
|
946
2356
|
std::vector<std::vector<std::string>> &categ_levels);
|
|
947
2357
|
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
2358
|
+
#ifndef _FOR_R
|
|
2359
|
+
#if defined(__clang__)
|
|
2360
|
+
#pragma clang diagnostic pop
|
|
2361
|
+
#endif
|
|
2362
|
+
#endif
|
|
2363
|
+
#endif /* ISOTREE_H */
|