isotree 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -18,11 +18,29 @@
18
18
  * [5] https://sourceforge.net/projects/iforest/
19
19
  * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
20
  * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
21
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM."
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
23
41
  *
24
42
  * BSD 2-Clause License
25
- * Copyright (c) 2020, David Cortes
43
+ * Copyright (c) 2019-2022, David Cortes
26
44
  * All rights reserved.
27
45
  * Redistribution and use in source and binary forms, with or without
28
46
  * modification, are permitted provided that the following conditions are met:
@@ -43,73 +61,196 @@
43
61
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
62
  */
45
63
 
64
+ #ifndef ISOTREE_H
65
+ #define ISOTREE_H
66
+
67
+ /* This is only used for the serialization format and might not reflect the
68
+ actual version of the library, do not use for anything else. */
69
+ #define ISOTREE_VERSION_MAJOR 0
70
+ #define ISOTREE_VERSION_MINOR 5
71
+ #define ISOTREE_VERSION_PATCH 6
72
+
73
+ /* For MinGW, needs to be defined before including any headers */
74
+ #if (defined(_WIN32) || defined(_WIN64)) && (SIZE_MAX >= UINT64_MAX)
75
+ # if defined(__GNUG__) || defined(__GNUC__)
76
+ # ifndef _FILE_OFFSET_BITS
77
+ # define _FILE_OFFSET_BITS 64
78
+ # endif
79
+ # endif
80
+ #endif
81
+ #ifdef _MSC_VER
82
+ # define _CRT_SECURE_NO_WARNINGS
83
+ #endif
84
+
85
+
46
86
  /* Standard headers */
47
- #include <stddef.h>
48
- #include <math.h>
49
- #include <limits.h>
50
- #include <string.h>
51
- #include <signal.h>
87
+ #include <cstddef>
88
+ #include <cmath>
89
+ #include <climits>
90
+ #include <cstring>
91
+ #include <cerrno>
52
92
  #include <vector>
53
93
  #include <iterator>
54
94
  #include <numeric>
55
95
  #include <algorithm>
56
96
  #include <random>
57
- #include <unordered_set>
58
- #include <unordered_map>
59
97
  #include <memory>
60
98
  #include <utility>
61
99
  #include <cstdint>
100
+ #include <cinttypes>
101
+ #include <exception>
102
+ #include <stdexcept>
103
+ #include <cassert>
104
+ #include <cfloat>
62
105
  #include <iostream>
63
- #ifndef _FOR_R
64
- #include <stdio.h>
65
- #else
106
+ #include <string>
107
+
108
+ #ifdef _FOR_R
66
109
  extern "C" {
67
110
  #include <R_ext/Print.h>
68
111
  }
69
112
  #define printf Rprintf
70
113
  #define fprintf(f, message) REprintf(message)
114
+ #elif defined(_FOR_PYTHON)
115
+ extern "C" void cy_warning(const char *msg);
116
+ #define fprintf(f, message) cy_warning(message)
117
+ #else
118
+ #include <cstdio>
119
+ using std::printf;
120
+ using std::fprintf;
71
121
  #endif
72
122
  #ifdef _OPENMP
73
123
  #include <omp.h>
74
124
  #endif
75
- #ifdef _ENABLE_CEREAL
76
- #include <cereal/archives/binary.hpp>
77
- #include <cereal/types/vector.hpp>
78
- #include <sstream>
79
- #include <string>
80
- #include <fstream>
125
+ #ifdef _FOR_R
126
+ #include <Rcpp.h>
127
+ #endif
128
+ #include <csignal>
129
+ typedef void (*sig_t_)(int);
130
+ using std::signal;
131
+ using std::raise;
132
+
133
+ using std::size_t;
134
+ using std::memset;
135
+ using std::memcpy;
136
+
137
+ #if defined(__GNUC__) || defined(__clang__)
138
+ #define likely(x) __builtin_expect((bool)(x), true)
139
+ #define unlikely(x) __builtin_expect((bool)(x), false)
140
+ #else
141
+ #define likely(x) (x)
142
+ #define unlikely(x) (x)
143
+ #endif
144
+
145
+ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
146
+ #define unexpected_error() throw std::runtime_error(\
147
+ std::string("Unexpected error in ") + \
148
+ std::string(__FILE__) + \
149
+ std::string(":") + \
150
+ std::to_string(__LINE__) + \
151
+ std::string(". Please open an issue in GitHub with this information, indicating the installed version of 'isotree'.\n"))
152
+ #else
153
+ #define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
81
154
  #endif
82
155
 
83
- /* By default, will use Mersenne-Twister for RNG, but can be switched to something faster */
84
- #ifdef _USE_MERSENNE_TWISTER
156
+ /* By default, will use Xoshiro256++ or Xoshiro128++ for RNG, but can be switched to something faster */
157
+ #ifdef _USE_XOSHIRO
158
+ #include "xoshiro.hpp"
85
159
  #if SIZE_MAX >= UINT64_MAX /* 64-bit systems or higher */
86
- #define RNG_engine std::mt19937_64
160
+ #define RNG_engine Xoshiro::Xoshiro256PP
87
161
  #else /* 32-bit systems and non-standard architectures */
88
- #define RNG_engine std::mt19937
162
+ #define RNG_engine Xoshiro::Xoshiro128PP
163
+ #endif
164
+ #if defined(DBL_MANT_DIG) && (DBL_MANT_DIG == 53) && (FLT_RADIX == 2)
165
+ using Xoshiro::UniformUnitInterval;
166
+ using Xoshiro::UniformMinusOneToOne;
167
+ using Xoshiro::StandardNormalDistr;
168
+ #else
169
+ #define UniformUnitInterval std::uniform_real_distribution<double>
170
+ #define UniformMinusOneToOne std::uniform_real_distribution<double>
171
+ #define StandardNormalDistr std::normal_distribution<double>
172
+ #endif
173
+ #else
174
+ #if defined(_USE_MERSENNE_TWISTER)
175
+ #if SIZE_MAX >= UINT64_MAX /* 64-bit systems or higher */
176
+ #define RNG_engine std::mt19937_64
177
+ #else /* 32-bit systems and non-standard architectures */
178
+ #define RNG_engine std::mt19937
179
+ #endif
180
+ #else
181
+ #define RNG_engine std::default_random_engine
182
+ #endif
183
+
184
+ #define UniformUnitInterval std::uniform_real_distribution<double>
185
+ #define UniformMinusOneToOne std::uniform_real_distribution<double>
186
+ #define StandardNormalDistr std::normal_distribution<double>
187
+ #endif
188
+
189
+ /* At the time of writing, this brought a sizeable speed up compared to
190
+ 'unordered_map' and 'unordered_set' from both GCC and CLANG.
191
+ But perhaps should consider others in the future, such as this:
192
+ https://github.com/ktprime/emhash */
193
+ #if defined(_USE_ROBIN_MAP)
194
+ #ifndef _USE_SYSTEM_ROBIN
195
+ #include "robinmap/include/tsl/robin_growth_policy.h"
196
+ #include "robinmap/include/tsl/robin_hash.h"
197
+ #include "robinmap/include/tsl/robin_set.h"
198
+ #include "robinmap/include/tsl/robin_map.h"
199
+ #else
200
+ #include "tsl/robin_growth_policy.h"
201
+ #include "tsl/robin_hash.h"
202
+ #include "tsl/robin_set.h"
203
+ #include "tsl/robin_map.h"
89
204
  #endif
205
+ #define hashed_set tsl::robin_set
206
+ #define hashed_map tsl::robin_map
90
207
  #else
91
- #define RNG_engine std::default_random_engine
208
+ #include <unordered_set>
209
+ #include <unordered_map>
210
+ #define hashed_set std::unordered_set
211
+ #define hashed_map std::unordered_map
92
212
  #endif
93
213
 
94
214
  /* Short functions */
95
- #define ix_parent(ix) (((ix) - 1) / 2) /* integer division takes care of deciding left-right */
96
- #define ix_child(ix) (2 * (ix) + 1)
97
215
  /* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
98
216
  #define pow2(n) ( ((size_t) 1) << (n) )
217
+ #define div2(n) ((n) >> 1)
218
+ #define mult2(n) ((n) << 1)
219
+ #define ix_parent(ix) (div2((ix) - (size_t)1)) /* integer division takes care of deciding left-right */
220
+ #define ix_child(ix) (mult2(ix) + (size_t)1)
99
221
  #define square(x) ((x) * (x))
222
+ #ifndef _FOR_R
223
+ #if defined(__GNUC__) && (__GNUC__ >= 5)
224
+ #pragma GCC diagnostic push
225
+ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
226
+ #elif defined(__clang__) && !defined(_FOR_R)
227
+ #pragma clang diagnostic push
228
+ #pragma clang diagnostic ignored "-Wuninitialized"
229
+ #endif
230
+ #endif
100
231
  /* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
101
232
  #define extract_bit(number, bit) (((number) >> (bit)) & 1)
102
- #ifndef isinf
103
- #define isinf std::isinf
233
+ #ifndef _FOR_R
234
+ #if defined(__GNUC__) && (__GNUC__ >= 5)
235
+ #pragma GCC diagnostic pop
236
+ #elif defined(__clang__)
237
+ #pragma clang diagnostic pop
238
+ #pragma clang diagnostic push
239
+ #pragma clang diagnostic ignored "-Wunknown-attributes"
240
+ #endif
104
241
  #endif
105
- #ifndef isnan
106
- #define isnan std::isnan
242
+ #define is_na_or_inf(x) (std::isnan(x) || std::isinf(x))
243
+
244
+ /* MSVC doesn't support long doubles, so this avoids unnecessarily increasing library size.
245
+ MinGW supports them but has issues with their computations.
246
+ See https://sourceforge.net/p/mingw-w64/bugs/909/ */
247
+ #if defined(_WIN32) && !defined(NO_LONG_DOUBLE)
248
+ #define NO_LONG_DOUBLE
107
249
  #endif
108
- #define is_na_or_inf(x) (isnan(x) || isinf(x))
109
250
 
110
251
 
111
252
  /* Aliasing for compiler optimizations */
112
- #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
253
+ #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
113
254
  #define restrict __restrict
114
255
  #else
115
256
  #define restrict
@@ -118,7 +259,7 @@
118
259
  /* MSVC is stuck with an OpenMP version that's 19 years old at the time of writing and does not support unsigned iterators */
119
260
  #ifdef _OPENMP
120
261
  #if (_OPENMP < 200801) || defined(_WIN32) || defined(_WIN64) /* OpenMP < 3.0 */
121
- #define size_t_for long
262
+ #define size_t_for long long
122
263
  #else
123
264
  #define size_t_for size_t
124
265
  #endif
@@ -126,33 +267,51 @@
126
267
  #define size_t_for size_t
127
268
  #endif
128
269
 
270
+ #if defined(_FOR_R) || defined(_FOR_PYTHON)
271
+ #define ISOTREE_EXPORTED
272
+ #else
273
+ #if defined(_WIN32)
274
+ #ifdef ISOTREE_COMPILE_TIME
275
+ #define ISOTREE_EXPORTED __declspec(dllexport)
276
+ #else
277
+ #define ISOTREE_EXPORTED __declspec(dllimport)
278
+ #endif
279
+ #else
280
+ #if defined(EXPLICITLTY_EXPORT_SYMBOLS) && defined(ISOTREE_COMPILE_TIME)
281
+ #define ISOTREE_EXPORTED [[gnu::visibility("default")]]
282
+ #else
283
+ #define ISOTREE_EXPORTED
284
+ #endif
285
+ #endif
286
+ #endif
287
+
129
288
 
130
- /* Apple at some point decided to drop OMP library and headersfrom its compiler distribution
289
+ /* Apple at some point decided to drop OMP library and headers from its compiler distribution
131
290
  * and to alias 'gcc' to 'clang', which work differently when given flags they cannot interpret,
132
291
  * causing installation issues with pretty much all scientific software due to OMP headers that
133
292
  * would normally do nothing. This piece of code is to allow compilation without OMP header. */
134
293
  #ifndef _OPENMP
135
- #define omp_get_thread_num() 0
294
+ #define omp_get_thread_num() (0)
136
295
  #endif
137
296
 
297
+ /* Some aggregation functions will prefer more precise data types when the data is large */
298
+ #define THRESHOLD_LONG_DOUBLE (size_t)1e6
138
299
 
139
- /* For sparse matrices */
140
- #ifdef _FOR_R
141
- #define sparse_ix int
142
- #else
143
- #define sparse_ix size_t
144
- #endif
300
+ /* Types used through the package */
301
+ typedef enum NewCategAction {Weighted=0, Smallest=11, Random=12} NewCategAction; /* Weighted means Impute in the extended model */
302
+ typedef enum MissingAction {Divide=21, Impute=22, Fail=0} MissingAction; /* Divide is only for non-extended model */
303
+ typedef enum ColType {Numeric=31, Categorical=32, NotUsed=0} ColType;
304
+ typedef enum CategSplit {SubSet=0, SingleCateg=41} CategSplit;
305
+ typedef enum CoefType {Uniform=61, Normal=0} CoefType; /* For extended model */
306
+ typedef enum UseDepthImp {Lower=71, Higher=0, Same=72} UseDepthImp; /* For NA imputation */
307
+ typedef enum WeighImpRows {Inverse=0, Prop=81, Flat=82} WeighImpRows; /* For NA imputation */
308
+ typedef enum ScoringMetric {Depth=0, Density=92, BoxedDensity=94, BoxedDensity2=96, BoxedRatio=95,
309
+ AdjDepth=91, AdjDensity=93} ScoringMetric;
145
310
 
311
+ /* These are only used internally */
312
+ typedef enum ColCriterion {Uniformly=0, ByRange=1, ByVar=2, ByKurt=3} ColCriterion; /* For proportional choices */
313
+ typedef enum GainCriterion {NoCrit=0, Averaged=1, Pooled=2, FullGain=3, DensityCrit=4} Criterion; /* For guided splits */
146
314
 
147
- /* Types used through the package */
148
- typedef enum NewCategAction {Weighted, Smallest, Random} NewCategAction; /* Weighted means Impute in the extended model */
149
- typedef enum MissingAction {Divide, Impute, Fail} MissingAction; /* Divide is only for non-extended model */
150
- typedef enum ColType {Numeric, Categorical, NotUsed} ColType;
151
- typedef enum CategSplit {SubSet, SingleCateg} CategSplit;
152
- typedef enum GainCriterion {Averaged, Pooled, NoCrit} Criterion; /* For guided splits */
153
- typedef enum CoefType {Uniform, Normal} CoefType; /* For extended model */
154
- typedef enum UseDepthImp {Lower, Higher, Same} UseDepthImp; /* For NA imputation */
155
- typedef enum WeighImpRows {Inverse, Prop, Flat} WeighImpRows; /* For NA imputation */
156
315
 
157
316
  /* Notes about new categorical action:
158
317
  * - For single-variable case, if using 'Smallest', can then pass data at prediction time
@@ -167,10 +326,10 @@ typedef enum WeighImpRows {Inverse, Prop, Flat} WeighImpRows; /
167
326
 
168
327
  /* Structs that are output (modified) from the main function */
169
328
  typedef struct IsoTree {
170
- ColType col_type = NotUsed; /* issues with uninitialized values passed to Cereal */
329
+ ColType col_type = NotUsed; /* issues with uninitialized values when serializing */
171
330
  size_t col_num;
172
331
  double num_split;
173
- std::vector<char> cat_split;
332
+ std::vector<signed char> cat_split;
174
333
  int chosen_cat;
175
334
  size_t tree_left;
176
335
  size_t tree_right;
@@ -180,29 +339,7 @@ typedef struct IsoTree {
180
339
  double range_high = HUGE_VAL;
181
340
  double remainder; /* only used for distance/similarity */
182
341
 
183
- #ifdef _ENABLE_CEREAL
184
- template<class Archive>
185
- void serialize(Archive &archive)
186
- {
187
- archive(
188
- this->col_type,
189
- this->col_num,
190
- this->num_split,
191
- this->cat_split,
192
- this->chosen_cat,
193
- this->tree_left,
194
- this->tree_right,
195
- this->pct_tree_left,
196
- this->score,
197
- this->range_low,
198
- this->range_high,
199
- this->remainder
200
- );
201
- }
202
- #endif
203
-
204
342
  IsoTree() = default;
205
-
206
343
  } IsoTree;
207
344
 
208
345
  typedef struct IsoHPlane {
@@ -223,30 +360,6 @@ typedef struct IsoHPlane {
223
360
  double range_high = HUGE_VAL;
224
361
  double remainder; /* only used for distance/similarity */
225
362
 
226
- #ifdef _ENABLE_CEREAL
227
- template<class Archive>
228
- void serialize(Archive &archive)
229
- {
230
- archive(
231
- this->col_num,
232
- this->col_type,
233
- this->coef,
234
- this->mean,
235
- this->cat_coef,
236
- this->chosen_cat,
237
- this->fill_val,
238
- this->fill_new,
239
- this->split_point,
240
- this->hplane_left,
241
- this->hplane_right,
242
- this->score,
243
- this->range_low,
244
- this->range_high,
245
- this->remainder
246
- );
247
- }
248
- #endif
249
-
250
363
  IsoHPlane() = default;
251
364
  } IsoHPlane;
252
365
 
@@ -258,25 +371,11 @@ typedef struct IsoForest {
258
371
  NewCategAction new_cat_action;
259
372
  CategSplit cat_split_type;
260
373
  MissingAction missing_action;
374
+ ScoringMetric scoring_metric;
261
375
  double exp_avg_depth;
262
376
  double exp_avg_sep;
263
377
  size_t orig_sample_size;
264
-
265
- #ifdef _ENABLE_CEREAL
266
- template<class Archive>
267
- void serialize(Archive &archive)
268
- {
269
- archive(
270
- this->trees,
271
- this->new_cat_action,
272
- this->cat_split_type,
273
- this->missing_action,
274
- this->exp_avg_depth,
275
- this->exp_avg_sep,
276
- this->orig_sample_size
277
- );
278
- }
279
- #endif
378
+ bool has_range_penalty;
280
379
 
281
380
  IsoForest() = default;
282
381
  } IsoForest;
@@ -286,25 +385,11 @@ typedef struct ExtIsoForest {
286
385
  NewCategAction new_cat_action;
287
386
  CategSplit cat_split_type;
288
387
  MissingAction missing_action;
388
+ ScoringMetric scoring_metric;
289
389
  double exp_avg_depth;
290
390
  double exp_avg_sep;
291
391
  size_t orig_sample_size;
292
-
293
- #ifdef _ENABLE_CEREAL
294
- template<class Archive>
295
- void serialize(Archive &archive)
296
- {
297
- archive(
298
- this->hplanes,
299
- this->new_cat_action,
300
- this->cat_split_type,
301
- this->missing_action,
302
- this->exp_avg_depth,
303
- this->exp_avg_sep,
304
- this->orig_sample_size
305
- );
306
- }
307
- #endif
392
+ bool has_range_penalty;
308
393
 
309
394
  ExtIsoForest() = default;
310
395
  } ExtIsoForest;
@@ -316,19 +401,6 @@ typedef struct ImputeNode {
316
401
  std::vector<double> cat_weight;
317
402
  size_t parent;
318
403
 
319
- #ifdef _ENABLE_CEREAL
320
- template<class Archive>
321
- void serialize(Archive &archive)
322
- {
323
- archive(
324
- this->num_sum,
325
- this->num_weight,
326
- this->cat_sum,
327
- this->cat_weight,
328
- this->parent
329
- );
330
- }
331
- #endif
332
404
  ImputeNode() = default;
333
405
 
334
406
  ImputeNode(size_t parent)
@@ -345,30 +417,31 @@ typedef struct Imputer {
345
417
  std::vector<std::vector<ImputeNode>> imputer_tree;
346
418
  std::vector<double> col_means;
347
419
  std::vector<int> col_modes;
420
+
421
+ Imputer() = default;
422
+ } Imputer;
348
423
 
349
- #ifdef _ENABLE_CEREAL
350
- template<class Archive>
351
- void serialize(Archive &archive)
352
- {
353
- archive(
354
- this->ncols_numeric,
355
- this->ncols_categ,
356
- this->ncat,
357
- this->imputer_tree,
358
- this->col_means,
359
- this->col_modes
360
- );
361
- }
362
- #endif
424
+ typedef struct SingleTreeIndex {
425
+ std::vector<size_t> terminal_node_mappings;
426
+ std::vector<double> node_distances;
427
+ std::vector<double> node_depths;
428
+ std::vector<size_t> reference_points;
429
+ std::vector<size_t> reference_indptr;
430
+ std::vector<size_t> reference_mapping;
431
+ size_t n_terminal;
432
+ } TreeNodeIndex;
363
433
 
364
- Imputer() = default;
434
+ typedef struct TreesIndexer {
435
+ std::vector<SingleTreeIndex> indices;
365
436
 
366
- } Imputer;
437
+ TreesIndexer() = default;
438
+ } TreesIndexer;
367
439
 
368
440
 
369
441
  /* Structs that are only used internally */
370
- typedef struct {
371
- double* numeric_data;
442
+ template <class real_t, class sparse_ix>
443
+ struct InputData {
444
+ real_t* numeric_data;
372
445
  size_t ncols_numeric;
373
446
  int* categ_data;
374
447
  int* ncat;
@@ -376,10 +449,10 @@ typedef struct {
376
449
  size_t ncols_categ;
377
450
  size_t nrows;
378
451
  size_t ncols_tot;
379
- double* sample_weights;
452
+ real_t* sample_weights;
380
453
  bool weight_as_sample;
381
- double* col_weights;
382
- double* Xc; /* only for sparse matrices */
454
+ real_t* col_weights;
455
+ real_t* Xc; /* only for sparse matrices */
383
456
  sparse_ix* Xc_ind; /* only for sparse matrices */
384
457
  sparse_ix* Xc_indptr; /* only for sparse matrices */
385
458
  size_t log2_n; /* only when using weights for sampling */
@@ -387,37 +460,58 @@ typedef struct {
387
460
  std::vector<double> btree_weights_init; /* only when using weights for sampling */
388
461
  std::vector<char> has_missing; /* only used when producing missing imputations on-the-fly */
389
462
  size_t n_missing; /* only used when producing missing imputations on-the-fly */
390
- } InputData;
391
-
392
-
393
- typedef struct {
394
- double* numeric_data;
463
+ void* preinitialized_col_sampler; /* only when using column weights */
464
+ double* range_low; /* only when calculating variable ranges or boxed densities with no sub-sampling */
465
+ double* range_high; /* only when calculating variable ranges or boxed densities with no sub-sampling */
466
+ int* ncat_; /* only when calculating boxed densities with no sub-sampling */
467
+ std::vector<double> all_kurtoses; /* only when using 'prob_pick_col_by_kurtosis' or mixing 'weigh_by_kurt' with 'prob_pick_col*' with no sub-sampling */
468
+
469
+ std::vector<double> X_row_major; /* created by this library, only used when calculating full gain */
470
+ std::vector<double> Xr; /* created by this library, only used when calculating full gain */
471
+ std::vector<size_t> Xr_ind; /* created by this library, only used when calculating full gain */
472
+ std::vector<size_t> Xr_indptr; /* created by this library, only used when calculating full gain */
473
+ };
474
+
475
+
476
+ template <class real_t, class sparse_ix>
477
+ struct PredictionData {
478
+ real_t* numeric_data;
395
479
  int* categ_data;
396
480
  size_t nrows;
397
- double* Xc; /* only for sparse matrices */
398
- sparse_ix* Xc_ind; /* only for sparse matrices */
399
- sparse_ix* Xc_indptr; /* only for sparse matrices */
400
- double* Xr; /* only for sparse matrices */
401
- sparse_ix* Xr_ind; /* only for sparse matrices */
402
- sparse_ix* Xr_indptr; /* only for sparse matrices */
403
- } PredictionData;
481
+ bool is_col_major;
482
+ size_t ncols_numeric; /* only required for row-major data */
483
+ size_t ncols_categ; /* only required for row-major data */
484
+ real_t* Xc; /* only for sparse matrices */
485
+ sparse_ix* Xc_ind; /* only for sparse matrices */
486
+ sparse_ix* Xc_indptr; /* only for sparse matrices */
487
+ real_t* Xr; /* only for sparse matrices */
488
+ sparse_ix* Xr_ind; /* only for sparse matrices */
489
+ sparse_ix* Xr_indptr; /* only for sparse matrices */
490
+ };
404
491
 
405
492
  typedef struct {
406
493
  bool with_replacement;
407
494
  size_t sample_size;
408
495
  size_t ntrees;
496
+ size_t ncols_per_tree;
409
497
  size_t max_depth;
410
498
  bool penalize_range;
499
+ bool standardize_data;
411
500
  uint64_t random_seed;
412
501
  bool weigh_by_kurt;
413
502
  double prob_pick_by_gain_avg;
414
- double prob_split_by_gain_avg;
415
503
  double prob_pick_by_gain_pl;
416
- double prob_split_by_gain_pl;
504
+ double prob_pick_by_full_gain;
505
+ double prob_pick_by_dens;
506
+ double prob_pick_col_by_range;
507
+ double prob_pick_col_by_var;
508
+ double prob_pick_col_by_kurt;
417
509
  double min_gain;
418
510
  CategSplit cat_split_type;
419
511
  NewCategAction new_cat_action;
420
512
  MissingAction missing_action;
513
+ ScoringMetric scoring_metric;
514
+ bool fast_bratio;
421
515
  bool all_perm;
422
516
 
423
517
  size_t ndim; /* only for extended model */
@@ -431,16 +525,17 @@ typedef struct {
431
525
 
432
526
  UseDepthImp depth_imp; /* only when building NA imputer */
433
527
  WeighImpRows weigh_imp_rows; /* only when building NA imputer */
434
- size_t min_imp_obs; /* only when building NA imputer */
528
+ size_t min_imp_obs; /* only when building NA imputer */
435
529
  } ModelParams;
436
530
 
437
- typedef struct ImputedData {
438
- std::vector<long double> num_sum;
439
- std::vector<long double> num_weight;
440
- std::vector<std::vector<long double>> cat_sum;
441
- std::vector<long double> cat_weight;
442
- std::vector<long double> sp_num_sum;
443
- std::vector<long double> sp_num_weight;
531
+ template <class sparse_ix, class ldouble_safe>
532
+ struct ImputedData {
533
+ std::vector<ldouble_safe> num_sum;
534
+ std::vector<ldouble_safe> num_weight;
535
+ std::vector<std::vector<ldouble_safe>> cat_sum;
536
+ std::vector<ldouble_safe> cat_weight;
537
+ std::vector<ldouble_safe> sp_num_sum;
538
+ std::vector<ldouble_safe> sp_num_weight;
444
539
 
445
540
  std::vector<size_t> missing_num;
446
541
  std::vector<size_t> missing_cat;
@@ -451,56 +546,288 @@ typedef struct ImputedData {
451
546
 
452
547
  ImputedData() {};
453
548
 
454
- ImputedData(InputData &input_data, size_t row);
455
-
456
- } ImputedData;
549
+ template <class InputData>
550
+ ImputedData(InputData &input_data, size_t row)
551
+ {
552
+ initialize_impute_calc(*this, input_data, row);
553
+ }
457
554
 
458
- typedef struct {
555
+ };
556
+
557
+ /* This class provides efficient methods for sampling columns at random,
558
+ given that at a given node a column might no longer be splittable,
559
+ and when that happens, it also makes it non-splittable in any children
560
+ node from there onwards. The idea is to provide efficient methods for
561
+ passing the state from a parent node to a left node and then restore
562
+ the state before going for the right node.
563
+ It can be used in 3 modes:
564
+ - As a uniform sampler with replacement.
565
+ - As a weighted sampler with replacement.
566
+ - As an array that keeps track of which columns are still splittable. */
567
+ template <class ldouble_safe>
568
+ class ColumnSampler
569
+ {
570
+ public:
571
+ std::vector<size_t> col_indices;
572
+ std::vector<double> tree_weights;
573
+ size_t curr_pos;
574
+ size_t curr_col;
575
+ size_t last_given;
576
+ size_t n_cols;
577
+ size_t tree_levels;
578
+ size_t offset;
579
+ size_t n_dropped;
580
+ template <class real_t>
581
+ void initialize(real_t weights[], size_t n_cols);
582
+ void initialize(size_t n_cols);
583
+ void drop_weights();
584
+ void leave_m_cols(size_t m, RNG_engine &rnd_generator);
585
+ bool sample_col(size_t &col, RNG_engine &rnd_generator);
586
+ void prepare_full_pass(); /* when passing through all columns */
587
+ bool sample_col(size_t &col); /* when passing through all columns */
588
+ void drop_col(size_t col, size_t nobs_left);
589
+ void drop_col(size_t col);
590
+ void drop_from_tail(size_t col);
591
+ void shuffle_remainder(RNG_engine &rnd_generator);
592
+ bool has_weights();
593
+ size_t get_remaining_cols();
594
+ void get_array_remaining_cols(std::vector<size_t> &restrict cols);
595
+ template <class other_t>
596
+ ColumnSampler& operator=(const ColumnSampler<other_t> &other);
597
+ ColumnSampler() = default;
598
+ };
599
+
600
+ template <class ldouble_safe, class real_t>
601
+ class DensityCalculator
602
+ {
603
+ public:
604
+ std::vector<ldouble_safe> multipliers;
605
+ double xmin;
606
+ double xmax;
607
+ std::vector<size_t> counts;
608
+ int n_present;
609
+ int n_left;
610
+ std::vector<double> box_low;
611
+ std::vector<double> box_high;
612
+ std::vector<double> queue_box;
613
+ bool fast_bratio;
614
+ std::vector<ldouble_safe> ranges;
615
+ std::vector<int> ncat;
616
+ std::vector<int> queue_ncat;
617
+ std::vector<int> ncat_orig;
618
+ std::vector<double> vals_ext_box;
619
+ std::vector<double> queue_ext_box;
620
+
621
+ void initialize(size_t max_depth, int max_categ, bool reserve_counts, ScoringMetric scoring_metric);
622
+ template <class InputData>
623
+ #ifndef _FOR_R
624
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
625
+ #endif
626
+ void initialize_bdens(const InputData &input_data,
627
+ const ModelParams &model_params,
628
+ std::vector<size_t> &ix_arr,
629
+ ColumnSampler<ldouble_safe> &col_sampler);
630
+ template <class InputData>
631
+ void initialize_bdens_ext(const InputData &input_data,
632
+ const ModelParams &model_params,
633
+ std::vector<size_t> &ix_arr,
634
+ ColumnSampler<ldouble_safe> &col_sampler,
635
+ bool col_sampler_is_fresh);
636
+ #ifndef _FOR_R
637
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
638
+ #endif
639
+ void push_density(double xmin, double xmax, double split_point);
640
+ void push_density(size_t counts[], int ncat);
641
+ void push_density(int n_left, int n_present);
642
+ void push_density(int n_present);
643
+ void push_density();
644
+ void push_adj(double xmin, double xmax, double split_point, double pct_tree_left, ScoringMetric scoring_metric);
645
+ void push_adj(signed char *restrict categ_present, size_t *restrict counts, int ncat, ScoringMetric scoring_metric);
646
+ void push_adj(size_t *restrict counts, int ncat, int chosen_cat, ScoringMetric scoring_metric);
647
+ void push_adj(double pct_tree_left, ScoringMetric scoring_metric);
648
+ void push_bdens(double split_point, size_t col);
649
+ void push_bdens(int ncat_branch_left, size_t col);
650
+ void push_bdens(const std::vector<signed char> &cat_split, size_t col);
651
+ #ifndef _FOR_R
652
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
653
+ #endif
654
+ void push_bdens_fast_route(double split_point, size_t col);
655
+ void push_bdens_internal(double split_point, size_t col);
656
+ #ifndef _FOR_R
657
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
658
+ #endif
659
+ void push_bdens_fast_route(int ncat_branch_left, size_t col);
660
+ void push_bdens_internal(int ncat_branch_left, size_t col);
661
+ #ifndef _FOR_R
662
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
663
+ #endif
664
+ void push_bdens_fast_route(const std::vector<signed char> &cat_split, size_t col);
665
+ void push_bdens_internal(const std::vector<signed char> &cat_split, size_t col);
666
+ #ifndef _FOR_R
667
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
668
+ #endif
669
+ void push_bdens_ext(const IsoHPlane &hplane, const ModelParams &model_params);
670
+ void pop();
671
+ void pop_right();
672
+ void pop_bdens(size_t col);
673
+ void pop_bdens_right(size_t col);
674
+ void pop_bdens_cat(size_t col);
675
+ void pop_bdens_cat_right(size_t col);
676
+ void pop_bdens_fast_route(size_t col);
677
+ void pop_bdens_internal(size_t col);
678
+ void pop_bdens_right_fast_route(size_t col);
679
+ void pop_bdens_right_internal(size_t col);
680
+ void pop_bdens_cat_fast_route(size_t col);
681
+ void pop_bdens_cat_internal(size_t col);
682
+ void pop_bdens_cat_right_fast_route(size_t col);
683
+ void pop_bdens_cat_right_internal(size_t col);
684
+ void pop_bdens_ext();
685
+ void pop_bdens_ext_right();
686
+ #ifndef _FOR_R
687
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
688
+ #endif
689
+ double calc_density(ldouble_safe remainder, size_t sample_size);
690
+ ldouble_safe calc_adj_depth();
691
+ double calc_adj_density();
692
+ #ifndef _FOR_R
693
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
694
+ #endif
695
+ ldouble_safe calc_bratio_log();
696
+ #ifndef _FOR_R
697
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
698
+ #endif
699
+ ldouble_safe calc_bratio_inv_log();
700
+ #ifndef _FOR_R
701
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
702
+ #endif
703
+ double calc_bratio();
704
+ #ifndef _FOR_R
705
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
706
+ #endif
707
+ double calc_bdens(ldouble_safe remainder, size_t sample_size);
708
+ #ifndef _FOR_R
709
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
710
+ #endif
711
+ double calc_bdens2(ldouble_safe remainder, size_t sample_size);
712
+ #ifndef _FOR_R
713
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
714
+ #endif
715
+ ldouble_safe calc_bratio_log_ext();
716
+ #ifndef _FOR_R
717
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
718
+ #endif
719
+ double calc_bratio_ext();
720
+ #ifndef _FOR_R
721
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
722
+ #endif
723
+ double calc_bdens_ext(ldouble_safe remainder, size_t sample_size);
724
+ void save_range(double xmin, double xmax);
725
+ void restore_range(double &restrict xmin, double &restrict xmax);
726
+ void save_counts(size_t *restrict cat_counts, int ncat);
727
+ void save_n_present_and_left(signed char *restrict split_left, int ncat);
728
+ void save_n_present(size_t *restrict cat_counts, int ncat);
729
+ };
730
+
731
+ template <class ldouble_safe, class real_t>
732
+ class SingleNodeColumnSampler
733
+ {
734
+ public:
735
+ double *restrict weights_orig;
736
+ std::vector<bool> inifinite_weights;
737
+ ldouble_safe cumw;
738
+ size_t n_inf;
739
+ size_t *restrict col_indices;
740
+ size_t curr_pos;
741
+ bool using_tree;
742
+
743
+ bool backup_weights;
744
+ std::vector<double> weights_own;
745
+ size_t n_left;
746
+
747
+ std::vector<double> tree_weights;
748
+ size_t offset;
749
+ size_t tree_levels;
750
+ std::vector<double> used_weights;
751
+ std::vector<size_t> mapped_indices;
752
+ std::vector<size_t> mapped_inf_indices;
753
+
754
+ bool initialize(
755
+ double *restrict weights,
756
+ std::vector<size_t> *col_indices,
757
+ size_t curr_pos,
758
+ size_t n_sample,
759
+ bool backup_weights
760
+ );
761
+
762
+ bool sample_col(size_t &col_chosen, RNG_engine &rnd_generator);
763
+
764
+ void backup(SingleNodeColumnSampler<ldouble_safe, real_t> &other, size_t ncols_tot);
765
+
766
+ void restore(const SingleNodeColumnSampler<ldouble_safe, real_t> &other);
767
+ };
768
+
769
+ template <class ImputedData, class ldouble_safe, class real_t>
770
+ struct WorkerMemory {
459
771
  std::vector<size_t> ix_arr;
460
772
  std::vector<size_t> ix_all;
461
773
  RNG_engine rnd_generator;
462
- std::uniform_int_distribution<size_t> runif;
463
- std::uniform_real_distribution<double> rbin;
774
+ UniformUnitInterval rbin;
464
775
  size_t st;
465
776
  size_t end;
466
777
  size_t st_NA;
467
778
  size_t end_NA;
468
779
  size_t split_ix;
469
- std::unordered_map<size_t, double> weights_map;
470
- std::vector<double> weights_arr; /* when not ignoring NAs and when using weights as density */
780
+ hashed_map<size_t, double> weights_map;
781
+ std::vector<double> weights_arr; /* when not ignoring NAs and when using weights as density */
782
+ bool changed_weights; /* when using 'missing_action'='Divide' or density weights */
471
783
  double xmin;
472
784
  double xmax;
473
- size_t npresent; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
785
+ size_t npresent; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
474
786
  bool unsplittable;
475
787
  std::vector<bool> is_repeated;
476
- std::vector<char> categs;
477
- size_t ncols_tried; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
788
+ std::vector<signed char> categs;
789
+ size_t ncols_tried; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
478
790
  int ncat_tried;
479
- std::vector<bool> cols_possible;
480
- std::vector<double> btree_weights; /* only when using weights for sampling */
481
- std::discrete_distribution<size_t> col_sampler; /* columns can get eliminated, keep a copy for each thread */
791
+ std::vector<double> btree_weights; /* only when using weights for sampling */
792
+ ColumnSampler<ldouble_safe> col_sampler; /* columns can get eliminated, keep a copy for each thread */
793
+ SingleNodeColumnSampler<ldouble_safe, real_t> node_col_sampler;
794
+ SingleNodeColumnSampler<ldouble_safe, real_t> node_col_sampler_backup;
482
795
 
483
796
  /* for split criterion */
484
797
  std::vector<double> buffer_dbl;
485
798
  std::vector<size_t> buffer_szt;
486
- std::vector<char> buffer_chr;
799
+ std::vector<signed char> buffer_chr;
487
800
  double prob_split_type;
801
+ ColCriterion col_criterion;
488
802
  GainCriterion criterion;
489
803
  double this_gain;
490
804
  double this_split_point;
491
805
  int this_categ;
492
- std::vector<char> this_split_categ;
806
+ std::vector<signed char> this_split_categ;
493
807
  bool determine_split;
808
+ std::vector<double> imputed_x_buffer;
809
+ double saved_xmedian;
810
+ double best_xmedian;
811
+ int saved_cat_mode;
812
+ int best_cat_mode;
813
+ std::vector<size_t> col_indices; /* only for full gain calculation */
814
+
815
+ /* for weighted column choices */
816
+ std::vector<double> node_col_weights;
817
+ std::vector<double> saved_stat1;
818
+ std::vector<double> saved_stat2;
819
+ bool has_saved_stats;
820
+ double* tree_kurtoses; /* only when mixing 'weigh_by_kurt' with 'prob_pick_col*' */
494
821
 
495
822
  /* for the extended model */
496
823
  size_t ntry;
497
824
  size_t ntaken;
498
825
  size_t ntaken_best;
499
- bool tried_all;
500
- size_t col_chosen;
826
+ size_t ntried;
827
+ bool try_all;
828
+ size_t col_chosen; /* also used as placeholder in the single-variable model */
501
829
  ColType col_type;
502
830
  double ext_sd;
503
- std::vector<size_t> cols_shuffled;
504
831
  std::vector<double> comb_val;
505
832
  std::vector<size_t> col_take;
506
833
  std::vector<ColType> col_take_type;
@@ -510,9 +837,10 @@ typedef struct {
510
837
  std::vector<double> ext_fill_val;
511
838
  std::vector<double> ext_fill_new;
512
839
  std::vector<int> chosen_cat;
513
- std::vector<std::vector<double>> ext_cat_coef;
514
- std::uniform_real_distribution<double> coef_unif;
515
- std::normal_distribution<double> coef_norm;
840
+ std::vector<std::vector<double>> ext_cat_coef;
841
+ UniformMinusOneToOne coef_unif;
842
+ StandardNormalDistr coef_norm;
843
+ std::vector<double> sample_weights; /* when using weights and split criterion */
516
844
 
517
845
  /* for similarity/distance calculations */
518
846
  std::vector<double> tmat_sep;
@@ -522,9 +850,11 @@ typedef struct {
522
850
 
523
851
  /* when imputing NAs on-the-fly */
524
852
  std::vector<ImputedData> impute_vec;
525
- std::unordered_map<size_t, ImputedData> impute_map;
853
+ hashed_map<size_t, ImputedData> impute_map;
526
854
 
527
- } WorkerMemory;
855
+ /* for non-depth scoring metric */
856
+ DensityCalculator<ldouble_safe, real_t> density_calculator;
857
+ };
528
858
 
529
859
  typedef struct WorkerForSimilarity {
530
860
  std::vector<size_t> ix_arr;
@@ -538,55 +868,138 @@ typedef struct WorkerForSimilarity {
538
868
  bool assume_full_distr; /* doesn't need to have one copy per worker */
539
869
  } WorkerForSimilarity;
540
870
 
541
- typedef struct {
871
+ typedef struct WorkerForPredictCSC {
872
+ std::vector<size_t> ix_arr;
873
+ size_t st;
874
+ size_t end;
875
+ std::vector<double> comb_val;
876
+ std::vector<double> weights_arr;
877
+ std::vector<double> depths;
878
+ } WorkerForPredictCSC;
879
+
880
+ class RecursionState {
881
+ public:
542
882
  size_t st;
543
883
  size_t st_NA;
544
884
  size_t end_NA;
545
885
  size_t split_ix;
546
886
  size_t end;
887
+ size_t sampler_pos;
888
+ size_t n_dropped;
889
+ bool changed_weights;
890
+ bool full_state;
547
891
  std::vector<size_t> ix_arr;
548
892
  std::vector<bool> cols_possible;
893
+ std::vector<double> col_sampler_weights;
549
894
  std::unique_ptr<double[]> weights_arr;
550
- std::discrete_distribution<size_t> col_sampler;
551
- } RecursionState;
895
+
896
+ RecursionState() = default;
897
+ template <class WorkerMemory>
898
+ RecursionState(WorkerMemory &workspace, bool full_state);
899
+ template <class WorkerMemory>
900
+ void restore_state(WorkerMemory &workspace);
901
+ };
552
902
 
553
903
  /* Function prototypes */
554
904
 
555
905
  /* fit_model.cpp */
556
- extern bool interrupt_switch;
906
+ template <class real_t, class sparse_ix, class ldouble_safe>
907
+ int fit_iforest_internal(
908
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
909
+ real_t numeric_data[], size_t ncols_numeric,
910
+ int categ_data[], size_t ncols_categ, int ncat[],
911
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
912
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
913
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
914
+ size_t nrows, size_t sample_size, size_t ntrees,
915
+ size_t max_depth, size_t ncols_per_tree,
916
+ bool limit_depth, bool penalize_range, bool standardize_data,
917
+ ScoringMetric scoring_metric, bool fast_bratio,
918
+ bool standardize_dist, double tmat[],
919
+ double output_depths[], bool standardize_depth,
920
+ real_t col_weights[], bool weigh_by_kurt,
921
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
922
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
923
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
924
+ double prob_pick_col_by_kurt,
925
+ double min_gain, MissingAction missing_action,
926
+ CategSplit cat_split_type, NewCategAction new_cat_action,
927
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
928
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
929
+ uint64_t random_seed, int nthreads);
930
+ template <class real_t, class sparse_ix>
557
931
  int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
558
- double numeric_data[], size_t ncols_numeric,
932
+ real_t numeric_data[], size_t ncols_numeric,
559
933
  int categ_data[], size_t ncols_categ, int ncat[],
560
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
934
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
561
935
  size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
562
- double sample_weights[], bool with_replacement, bool weight_as_sample,
563
- size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
564
- bool limit_depth, bool penalize_range,
936
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
937
+ size_t nrows, size_t sample_size, size_t ntrees,
938
+ size_t max_depth, size_t ncols_per_tree,
939
+ bool limit_depth, bool penalize_range, bool standardize_data,
940
+ ScoringMetric scoring_metric, bool fast_bratio,
565
941
  bool standardize_dist, double tmat[],
566
942
  double output_depths[], bool standardize_depth,
567
- double col_weights[], bool weigh_by_kurt,
568
- double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
569
- double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
943
+ real_t col_weights[], bool weigh_by_kurt,
944
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
945
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
946
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
947
+ double prob_pick_col_by_kurt,
570
948
  double min_gain, MissingAction missing_action,
571
949
  CategSplit cat_split_type, NewCategAction new_cat_action,
572
950
  bool all_perm, Imputer *imputer, size_t min_imp_obs,
573
951
  UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
574
- uint64_t random_seed, bool handle_interrupt, int nthreads);
952
+ uint64_t random_seed, bool use_long_double, int nthreads);
953
+ template <class real_t, class sparse_ix>
575
954
  int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
576
- double numeric_data[], size_t ncols_numeric,
955
+ real_t numeric_data[], size_t ncols_numeric,
577
956
  int categ_data[], size_t ncols_categ, int ncat[],
578
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
957
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
579
958
  size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
580
- double sample_weights[], size_t nrows, size_t max_depth,
581
- bool limit_depth, bool penalize_range,
582
- double col_weights[], bool weigh_by_kurt,
583
- double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
584
- double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
959
+ real_t sample_weights[], size_t nrows,
960
+ size_t max_depth, size_t ncols_per_tree,
961
+ bool limit_depth, bool penalize_range, bool standardize_data,
962
+ bool fast_bratio,
963
+ real_t col_weights[], bool weigh_by_kurt,
964
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
965
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
966
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
967
+ double prob_pick_col_by_kurt,
585
968
  double min_gain, MissingAction missing_action,
586
969
  CategSplit cat_split_type, NewCategAction new_cat_action,
587
970
  UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
588
- bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
971
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
972
+ TreesIndexer *indexer,
973
+ real_t ref_numeric_data[], int ref_categ_data[],
974
+ bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
975
+ real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
976
+ uint64_t random_seed, bool use_long_double);
977
+ template <class real_t, class sparse_ix, class ldouble_safe>
978
+ int add_tree_internal(
979
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
980
+ real_t numeric_data[], size_t ncols_numeric,
981
+ int categ_data[], size_t ncols_categ, int ncat[],
982
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
983
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
984
+ real_t sample_weights[], size_t nrows,
985
+ size_t max_depth, size_t ncols_per_tree,
986
+ bool limit_depth, bool penalize_range, bool standardize_data,
987
+ bool fast_bratio,
988
+ real_t col_weights[], bool weigh_by_kurt,
989
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
990
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
991
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
992
+ double prob_pick_col_by_kurt,
993
+ double min_gain, MissingAction missing_action,
994
+ CategSplit cat_split_type, NewCategAction new_cat_action,
995
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
996
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
997
+ TreesIndexer *indexer,
998
+ real_t ref_numeric_data[], int ref_categ_data[],
999
+ bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
1000
+ real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
589
1001
  uint64_t random_seed);
1002
+ template <class InputData, class WorkerMemory, class ldouble_safe>
590
1003
  void fit_itree(std::vector<IsoTree> *tree_root,
591
1004
  std::vector<IsoHPlane> *hplane_root,
592
1005
  WorkerMemory &workspace,
@@ -596,6 +1009,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
596
1009
  size_t tree_num);
597
1010
 
598
1011
  /* isoforest.cpp */
1012
+ template <class InputData, class WorkerMemory, class ldouble_safe>
599
1013
  void split_itree_recursive(std::vector<IsoTree> &trees,
600
1014
  WorkerMemory &workspace,
601
1015
  InputData &input_data,
@@ -604,31 +1018,55 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
604
1018
  size_t curr_depth);
605
1019
 
606
1020
  /* extended.cpp */
1021
+ template <class InputData, class WorkerMemory, class ldouble_safe>
607
1022
  void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
608
1023
  WorkerMemory &workspace,
609
1024
  InputData &input_data,
610
1025
  ModelParams &model_params,
611
1026
  std::vector<ImputeNode> *impute_nodes,
612
1027
  size_t curr_depth);
1028
+ template <class InputData, class WorkerMemory, class ldouble_safe>
613
1029
  void add_chosen_column(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params,
614
- std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s);
1030
+ std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s);
615
1031
  void shrink_to_fit_hplane(IsoHPlane &hplane, bool clear_vectors);
1032
+ template <class InputData, class WorkerMemory>
616
1033
  void simplify_hplane(IsoHPlane &hplane, WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
617
1034
 
618
1035
 
619
1036
  /* predict.cpp */
620
- void predict_iforest(double numeric_data[], int categ_data[],
621
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
622
- double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
1037
+ template <class real_t, class sparse_ix>
1038
+ #ifndef _FOR_R
1039
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno"), gnu::hot]]
1040
+ #endif
1041
+ void predict_iforest(real_t *restrict numeric_data, int *restrict categ_data,
1042
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1043
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1044
+ real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
623
1045
  size_t nrows, int nthreads, bool standardize,
624
1046
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
625
- double output_depths[], sparse_ix tree_num[]);
1047
+ double *restrict output_depths, sparse_ix *restrict tree_num,
1048
+ double *restrict per_tree_depths,
1049
+ TreesIndexer *indexer);
1050
+ template <class real_t, class sparse_ix>
1051
+ [[gnu::hot]]
1052
+ void traverse_itree_fast(std::vector<IsoTree> &tree,
1053
+ IsoForest &model_outputs,
1054
+ real_t *restrict row_numeric_data,
1055
+ double &restrict output_depth,
1056
+ sparse_ix *restrict tree_num,
1057
+ double *restrict tree_depth,
1058
+ size_t row) noexcept;
1059
+ template <class PredictionData, class sparse_ix>
1060
+ [[gnu::hot]]
626
1061
  void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
627
1062
  IsoForest &model_outputs,
628
1063
  PredictionData &prediction_data,
629
- double &output_depth,
1064
+ double &restrict output_depth,
630
1065
  sparse_ix *restrict tree_num,
631
- size_t row);
1066
+ double *restrict tree_depth,
1067
+ size_t row) noexcept;
1068
+ template <class PredictionData, class sparse_ix, class ImputedData>
1069
+ [[gnu::hot]]
632
1070
  double traverse_itree(std::vector<IsoTree> &tree,
633
1071
  IsoForest &model_outputs,
634
1072
  PredictionData &prediction_data,
@@ -637,63 +1075,181 @@ double traverse_itree(std::vector<IsoTree> &tree,
637
1075
  double curr_weight,
638
1076
  size_t row,
639
1077
  sparse_ix *restrict tree_num,
640
- size_t curr_lev);
641
- void traverse_hplane_fast(std::vector<IsoHPlane> &hplane,
642
- ExtIsoForest &model_outputs,
643
- PredictionData &prediction_data,
644
- double &output_depth,
645
- sparse_ix *restrict tree_num,
646
- size_t row);
1078
+ double *restrict tree_depth,
1079
+ size_t curr_lev) noexcept;
1080
+ template <class PredictionData, class sparse_ix>
1081
+ [[gnu::hot]]
1082
+ void traverse_hplane_fast_colmajor(std::vector<IsoHPlane> &hplane,
1083
+ ExtIsoForest &model_outputs,
1084
+ PredictionData &prediction_data,
1085
+ double &restrict output_depth,
1086
+ sparse_ix *restrict tree_num,
1087
+ double *restrict tree_depth,
1088
+ size_t row) noexcept;
1089
+ template <class real_t, class sparse_ix>
1090
+ [[gnu::hot]]
1091
+ void traverse_hplane_fast_rowmajor(std::vector<IsoHPlane> &hplane,
1092
+ ExtIsoForest &model_outputs,
1093
+ real_t *restrict row_numeric_data,
1094
+ double &restrict output_depth,
1095
+ sparse_ix *restrict tree_num,
1096
+ double *restrict tree_depth,
1097
+ size_t row) noexcept;
1098
+ template <class PredictionData, class sparse_ix, class ImputedData>
1099
+ [[gnu::hot]]
647
1100
  void traverse_hplane(std::vector<IsoHPlane> &hplane,
648
1101
  ExtIsoForest &model_outputs,
649
1102
  PredictionData &prediction_data,
650
- double &output_depth,
1103
+ double &restrict output_depth,
651
1104
  std::vector<ImputeNode> *impute_nodes,
652
1105
  ImputedData *imputed_data,
653
1106
  sparse_ix *restrict tree_num,
654
- size_t row);
655
- double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num);
656
- double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num);
657
- void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads);
658
- void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads);
1107
+ double *restrict tree_depth,
1108
+ size_t row) noexcept;
1109
+ template <class real_t, class sparse_ix>
1110
+ void batched_csc_predict(PredictionData<real_t, sparse_ix> &prediction_data, int nthreads,
1111
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1112
+ double *restrict output_depths, sparse_ix *restrict tree_num,
1113
+ double *restrict per_tree_depths);
1114
+ template <class PredictionData, class sparse_ix>
1115
+ void traverse_itree_csc(WorkerForPredictCSC &workspace,
1116
+ std::vector<IsoTree> &trees,
1117
+ IsoForest &model_outputs,
1118
+ PredictionData &prediction_data,
1119
+ sparse_ix *restrict tree_num,
1120
+ double *restrict per_tree_depths,
1121
+ size_t curr_tree,
1122
+ bool has_range_penalty);
1123
+ template <class PredictionData, class sparse_ix>
1124
+ void traverse_hplane_csc(WorkerForPredictCSC &workspace,
1125
+ std::vector<IsoHPlane> &hplanes,
1126
+ ExtIsoForest &model_outputs,
1127
+ PredictionData &prediction_data,
1128
+ sparse_ix *restrict tree_num,
1129
+ double *restrict per_tree_depths,
1130
+ size_t curr_tree,
1131
+ bool has_range_penalty);
1132
+ template <class PredictionData>
1133
+ void add_csc_range_penalty(WorkerForPredictCSC &workspace,
1134
+ PredictionData &prediction_data,
1135
+ double *restrict weights_arr,
1136
+ size_t col_num,
1137
+ double range_low,
1138
+ double range_high);
1139
+ template <class PredictionData>
1140
+ double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num) noexcept;
1141
+ template <class PredictionData, class sparse_ix>
1142
+ static inline double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num, size_t lb, size_t ub) noexcept;
1143
+ template <class PredictionData, class sparse_ix>
1144
+ double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num) noexcept;
1145
+ template <class sparse_ix>
1146
+ void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept;
1147
+ template <class sparse_ix>
1148
+ void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept;
659
1149
 
660
1150
  /* dist.cpp */
661
- void calc_similarity(double numeric_data[], int categ_data[],
662
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
663
- size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
1151
+ template <class real_t, class sparse_ix>
1152
+ void calc_similarity(real_t numeric_data[], int categ_data[],
1153
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1154
+ size_t nrows, bool use_long_double, int nthreads,
1155
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
664
1156
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
665
- double tmat[], double rmat[], size_t n_from);
1157
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
1158
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ);
1159
+ template <class real_t, class sparse_ix, class ldouble_safe>
1160
+ void calc_similarity_internal(
1161
+ real_t numeric_data[], int categ_data[],
1162
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1163
+ size_t nrows, int nthreads,
1164
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
1165
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1166
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
1167
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ);
1168
+ template <class PredictionData, class ldouble_safe>
666
1169
  void traverse_tree_sim(WorkerForSimilarity &workspace,
667
1170
  PredictionData &prediction_data,
668
1171
  IsoForest &model_outputs,
669
1172
  std::vector<IsoTree> &trees,
670
- size_t curr_tree);
1173
+ size_t curr_tree,
1174
+ const bool as_kernel);
1175
+ template <class PredictionData, class ldouble_safe>
671
1176
  void traverse_hplane_sim(WorkerForSimilarity &workspace,
672
1177
  PredictionData &prediction_data,
673
1178
  ExtIsoForest &model_outputs,
674
1179
  std::vector<IsoHPlane> &hplanes,
675
- size_t curr_tree);
1180
+ size_t curr_tree,
1181
+ const bool as_kernel);
1182
+ template <class PredictionData, class InputData, class WorkerMemory>
1183
+ #ifndef _FOR_R
1184
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1185
+ #endif
676
1186
  void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
677
1187
  std::vector<WorkerMemory> *worker_memory_m,
678
1188
  PredictionData *prediction_data, InputData *input_data,
679
1189
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
680
1190
  double *restrict tmat, double *restrict rmat, size_t n_from,
681
1191
  size_t ntrees, bool assume_full_distr,
682
- bool standardize_dist, int nthreads);
1192
+ bool standardize_dist, bool as_kernel, int nthreads);
1193
+ template <class PredictionData>
683
1194
  void initialize_worker_for_sim(WorkerForSimilarity &workspace,
684
1195
  PredictionData &prediction_data,
685
1196
  IsoForest *model_outputs,
686
1197
  ExtIsoForest *model_outputs_ext,
687
1198
  size_t n_from,
688
1199
  bool assume_full_distr);
1200
+ template <class real_t, class sparse_ix>
1201
+ #ifndef _FOR_R
1202
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1203
+ #endif
1204
+ void calc_similarity_from_indexer
1205
+ (
1206
+ real_t *restrict numeric_data, int *restrict categ_data,
1207
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1208
+ size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
1209
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1210
+ double *restrict tmat, double *restrict rmat, size_t n_from,
1211
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
1212
+ );
1213
+ template <class real_t, class sparse_ix>
1214
+ #ifndef _FOR_R
1215
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1216
+ #endif
1217
+ void calc_similarity_from_indexer_with_references
1218
+ (
1219
+ real_t *restrict numeric_data, int *restrict categ_data,
1220
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1221
+ size_t nrows, int nthreads, bool standardize_dist,
1222
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1223
+ double *restrict rmat,
1224
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
1225
+ );
1226
+ template <class real_t, class sparse_ix>
1227
+ void kernel_to_references(TreesIndexer &indexer,
1228
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1229
+ real_t *restrict numeric_data, int *restrict categ_data,
1230
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1231
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1232
+ size_t nrows, int nthreads,
1233
+ double *restrict rmat,
1234
+ bool standardize);
689
1235
 
690
1236
  /* impute.cpp */
691
- void impute_missing_values(double numeric_data[], int categ_data[],
692
- double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
1237
+ template <class real_t, class sparse_ix>
1238
+ void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
1239
+ real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
1240
+ size_t nrows, bool use_long_double, int nthreads,
1241
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1242
+ Imputer &imputer);
1243
+ template <class real_t, class sparse_ix, class ldouble_safe>
1244
+ void impute_missing_values_internal(
1245
+ real_t numeric_data[], int categ_data[], bool is_col_major,
1246
+ real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
693
1247
  size_t nrows, int nthreads,
694
1248
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
695
1249
  Imputer &imputer);
1250
+ template <class InputData, class ldouble_safe>
696
1251
  void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads);
1252
+ template <class InputData, class WorkerMemory, class ldouble_safe>
697
1253
  void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
698
1254
  InputData &input_data, ModelParams &model_params,
699
1255
  std::vector<ImputeNode> &imputer_tree,
@@ -702,232 +1258,1085 @@ void shrink_impute_node(ImputeNode &imputer);
702
1258
  void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
703
1259
  std::vector<IsoTree> *trees,
704
1260
  std::vector<IsoHPlane> *hplanes);
705
- void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto);
1261
+ template <class ImputedData>
1262
+ void combine_imp_single(ImputedData &restrict imp_addfrom, ImputedData &restrict imp_addto);
1263
+ template <class ImputedData, class WorkerMemory>
706
1264
  void combine_tree_imputations(WorkerMemory &workspace,
707
1265
  std::vector<ImputedData> &impute_vec,
708
- std::unordered_map<size_t, ImputedData> &impute_map,
1266
+ hashed_map<size_t, ImputedData> &impute_map,
709
1267
  std::vector<char> &has_missing,
710
1268
  int nthreads);
1269
+ template <class ImputedData>
711
1270
  void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w);
1271
+ template <class InputData, class WorkerMemory>
712
1272
  void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data);
713
- template <class imp_arr>
1273
+ template <class imp_arr, class InputData>
714
1274
  void apply_imputation_results(imp_arr &impute_vec,
715
1275
  Imputer &imputer,
716
1276
  InputData &input_data,
717
1277
  int nthreads);
1278
+ template <class ImputedData, class InputData>
718
1279
  void apply_imputation_results(std::vector<ImputedData> &impute_vec,
719
- std::unordered_map<size_t, ImputedData> &impute_map,
1280
+ hashed_map<size_t, ImputedData> &impute_map,
720
1281
  Imputer &imputer,
721
1282
  InputData &input_data,
722
1283
  int nthreads);
1284
+ template <class PredictionData, class ImputedData>
723
1285
  void apply_imputation_results(PredictionData &prediction_data,
724
1286
  ImputedData &imp,
725
1287
  Imputer &imputer,
726
1288
  size_t row);
1289
+ template <class ImputedData, class InputData>
727
1290
  void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row);
1291
+ template <class ImputedData, class PredictionData>
728
1292
  void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row);
1293
+ template <class ImputedData, class InputData>
729
1294
  void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads);
730
- void allocate_imp_map(std::unordered_map<size_t, ImputedData> &impute_map, InputData &input_data);
1295
+ template <class ImputedData, class InputData>
1296
+ void allocate_imp_map(hashed_map<size_t, ImputedData> &impute_map, InputData &input_data);
1297
+ template <class ImputedData, class InputData>
731
1298
  void allocate_imp(InputData &input_data,
732
1299
  std::vector<ImputedData> &impute_vec,
733
- std::unordered_map<size_t, ImputedData> &impute_map,
1300
+ hashed_map<size_t, ImputedData> &impute_map,
734
1301
  int nthreads);
1302
+ template <class ImputedData, class InputData>
735
1303
  void check_for_missing(InputData &input_data,
736
1304
  std::vector<ImputedData> &impute_vec,
737
- std::unordered_map<size_t, ImputedData> &impute_map,
1305
+ hashed_map<size_t, ImputedData> &impute_map,
738
1306
  int nthreads);
1307
+ template <class PredictionData>
739
1308
  size_t check_for_missing(PredictionData &prediction_data,
740
1309
  Imputer &imputer,
741
1310
  size_t ix_arr[],
742
1311
  int nthreads);
743
1312
 
744
1313
  /* helpers_iforest.cpp */
745
- void decide_column(size_t ncols_numeric, size_t ncols_categ, size_t &col_chosen, ColType &col_type,
746
- RNG_engine &rnd_generator, std::uniform_int_distribution<size_t> &runif,
747
- std::discrete_distribution<size_t> &col_sampler);
748
- void add_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data);
749
- void add_unsplittable_col(WorkerMemory &workspace, InputData &input_data);
750
- bool check_is_not_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data);
1314
+ static inline size_t get_ntrees(const IsoForest &model)
1315
+ {
1316
+ return model.trees.size();
1317
+ }
1318
+
1319
+ static inline size_t get_ntrees(const ExtIsoForest &model)
1320
+ {
1321
+ return model.hplanes.size();
1322
+ }
1323
+
1324
+ static inline size_t get_ntrees(const Imputer &model)
1325
+ {
1326
+ return model.imputer_tree.size();
1327
+ }
1328
+
1329
+ static inline size_t get_ntrees(const TreesIndexer &model)
1330
+ {
1331
+ return model.indices.size();
1332
+ }
1333
+ template <class InputData, class WorkerMemory>
751
1334
  void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params, IsoTree &tree);
1335
+ template <class InputData, class WorkerMemory>
752
1336
  void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
1337
+ template <class InputData, class WorkerMemory>
1338
+ void get_split_range_v2(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
1339
+ template <class InputData, class WorkerMemory>
753
1340
  int choose_cat_from_present(WorkerMemory &workspace, InputData &input_data, size_t col_num);
754
- void update_col_sampler(WorkerMemory &workspace, InputData &input_data);
755
- bool is_col_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
756
- InputData &input_data, size_t col_num, ColType col_type);
757
- void set_col_as_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
1341
+ bool is_col_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
1342
+ size_t col_num);
1343
+ template <class InputData>
1344
+ void set_col_as_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
758
1345
  InputData &input_data, size_t col_num, ColType col_type);
1346
+ template <class InputData>
1347
+ void set_col_as_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
1348
+ InputData &input_data, size_t col_num);
1349
+ template <class InputData, class WorkerMemory>
759
1350
  void add_separation_step(WorkerMemory &workspace, InputData &input_data, double remainder);
760
- void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, long double sum_weight);
1351
+ template <class InputData, class WorkerMemory, class ldouble_safe>
1352
+ void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, ldouble_safe sum_weight);
1353
+ template <class PredictionData, class sparse_ix>
761
1354
  void remap_terminal_trees(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
762
1355
  PredictionData &prediction_data, sparse_ix *restrict tree_num, int nthreads);
763
- void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state);
764
- void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state);
1356
+ template <class InputData, class ldouble_safe>
1357
+ std::vector<double> calc_kurtosis_all_data(InputData &input_data, ModelParams &model_params, RNG_engine &rnd_generator);
1358
+ template <class InputData, class WorkerMemory>
1359
+ void calc_ranges_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
1360
+ double *restrict ranges, double *restrict saved_xmin, double *restrict saved_xmax);
1361
+ template <class InputData, class WorkerMemory, class ldouble_safe>
1362
+ void calc_var_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
1363
+ double *restrict variances, double *restrict saved_xmin, double *restrict saved_xmax,
1364
+ double *restrict saved_means, double *restrict saved_sds);
1365
+ template <class InputData, class WorkerMemory, class ldouble_safe>
1366
+ void calc_kurt_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
1367
+ double *restrict kurtosis, double *restrict saved_xmin, double *restrict saved_xmax);
1368
+ bool is_boxed_metric(const ScoringMetric scoring_metric);
765
1369
 
766
1370
 
767
1371
  /* utils.cpp */
1372
+ #define ix_comb_(i, j, n, ncomb) ( ((ncomb) + ((j) - (i))) - (size_t)1 - div2(((n) - (i)) * ((n) - (i) - (size_t)1)) )
1373
+ #define ix_comb(i, j, n, ncomb) ( ((i) < (j))? ix_comb_(i, j, n, ncomb) : ix_comb_(j, i, n, ncomb) )
1374
+ #define calc_ncomb(n) (((n) % 2) == 0)? (div2(n) * ((n)-(size_t)1)) : ((n) * div2((n)-(size_t)1))
768
1375
  size_t log2ceil(size_t x);
1376
+ #ifndef _FOR_R
1377
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1378
+ #endif
1379
+ double digamma(double x);
1380
+ template <class ldouble_safe>
1381
+ #ifndef _FOR_R
1382
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1383
+ #endif
769
1384
  double harmonic(size_t n);
770
1385
  double harmonic_recursive(double a, double b);
1386
+ template <class ldouble_safe>
771
1387
  double expected_avg_depth(size_t sample_size);
772
- double expected_avg_depth(long double approx_sample_size);
1388
+ template <class ldouble_safe>
1389
+ #ifndef _FOR_R
1390
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1391
+ #endif
1392
+ double expected_avg_depth(ldouble_safe approx_sample_size);
773
1393
  double expected_separation_depth(size_t n);
774
1394
  double expected_separation_depth_hotstart(double curr, size_t n_curr, size_t n_final);
775
- double expected_separation_depth(long double n);
1395
+ template <class ldouble_safe>
1396
+ double expected_separation_depth(ldouble_safe n);
776
1397
  void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n, double counter[], double exp_remainder);
777
1398
  void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
778
1399
  double *restrict counter, double *restrict weights, double exp_remainder);
779
1400
  void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
780
- double counter[], std::unordered_map<size_t, double> &weights, double exp_remainder);
1401
+ double counter[], hashed_map<size_t, double> &weights, double exp_remainder);
781
1402
  void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
782
1403
  double counter[], double exp_remainder);
783
1404
  void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
784
1405
  double *restrict counter, double *restrict weights, double exp_remainder);
785
- void tmat_to_dense(double *restrict tmat, double *restrict dmat, size_t n, bool diag_to_one);
786
- double calc_sd_raw(size_t cnt, long double sum, long double sum_sq);
787
- long double calc_sd_raw_l(size_t cnt, long double sum, long double sum_sq);
788
- void build_btree_sampler(std::vector<double> &btree_weights, double *restrict sample_weights,
789
- size_t nrows, size_t &log2_n, size_t &btree_offset);
790
- void sample_random_rows(std::vector<size_t> &ix_arr, size_t nrows, bool with_replacement,
791
- RNG_engine &rnd_generator, std::vector<size_t> &ix_all,
792
- double sample_weights[], std::vector<double> &btree_weights,
1406
+ void tmat_to_dense(double *restrict tmat, double *restrict dmat, size_t n, double fill_diag);
1407
+ template <class real_t=double>
1408
+ void build_btree_sampler(std::vector<double> &btree_weights, real_t *restrict sample_weights,
1409
+ size_t nrows, size_t &restrict log2_n, size_t &restrict btree_offset);
1410
+ template <class real_t=double, class ldouble_safe>
1411
+ void sample_random_rows(std::vector<size_t> &restrict ix_arr, size_t nrows, bool with_replacement,
1412
+ RNG_engine &rnd_generator, std::vector<size_t> &restrict ix_all,
1413
+ real_t *restrict sample_weights, std::vector<double> &restrict btree_weights,
793
1414
  size_t log2_n, size_t btree_offset, std::vector<bool> &is_repeated);
794
- void weighted_shuffle(size_t *restrict outp, size_t n, double *restrict weights, double *restrict buffer_arr, RNG_engine &rnd_generator);
795
- size_t divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point);
796
- void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point,
797
- MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
798
- void divide_subset_split(size_t ix_arr[], size_t st, size_t end, size_t col_num,
799
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[], double split_point,
800
- MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
801
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char split_categ[],
802
- MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
803
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char split_categ[],
1415
+ template <class real_t=double>
1416
+ void weighted_shuffle(size_t *restrict outp, size_t n, real_t *restrict weights, double *restrict buffer_arr, RNG_engine &rnd_generator);
1417
+ double sample_random_uniform(double xmin, double xmax, RNG_engine &rng) noexcept;
1418
+ size_t divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point) noexcept;
1419
+ template <class real_t=double>
1420
+ void divide_subset_split(size_t *restrict ix_arr, real_t x[], size_t st, size_t end, double split_point,
1421
+ MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1422
+ template <class real_t, class sparse_ix>
1423
+ void divide_subset_split(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1424
+ real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr, double split_point,
1425
+ MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1426
+ void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, signed char split_categ[],
1427
+ MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1428
+ void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, signed char split_categ[],
804
1429
  int ncat, MissingAction missing_action, NewCategAction new_cat_action,
805
- bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
806
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_categ,
807
- MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
808
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end,
1430
+ bool move_new_to_left, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1431
+ void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, int split_categ,
1432
+ MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1433
+ void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end,
809
1434
  MissingAction missing_action, NewCategAction new_cat_action,
810
- bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
811
- void get_range(size_t ix_arr[], double x[], size_t st, size_t end,
812
- MissingAction missing_action, double &xmin, double &xmax, bool &unsplittable);
813
- void get_range(size_t ix_arr[], size_t st, size_t end, size_t col_num,
814
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
815
- MissingAction missing_action, double &xmin, double &xmax, bool &unsplittable);
816
- void get_categs(size_t ix_arr[], int x[], size_t st, size_t end, int ncat,
817
- MissingAction missing_action, char categs[], size_t &npresent, bool &unsplittable);
818
- long double calculate_sum_weights(std::vector<size_t> &ix_arr, size_t st, size_t end, size_t curr_depth,
819
- std::vector<double> &weights_arr, std::unordered_map<size_t, double> &weights_map);
1435
+ bool move_new_to_left, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1436
+ template <class real_t=double>
1437
+ void get_range(size_t ix_arr[], real_t *restrict x, size_t st, size_t end,
1438
+ MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
1439
+ template <class real_t>
1440
+ void get_range(real_t *restrict x, size_t n,
1441
+ MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
1442
+ template <class real_t, class sparse_ix>
1443
+ void get_range(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1444
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1445
+ MissingAction missing_action, double &restrict xmin_, double &restrict xmax_, bool &unsplittable) noexcept;
1446
+ template <class real_t, class sparse_ix>
1447
+ void get_range(size_t col_num, size_t nrows,
1448
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1449
+ MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
1450
+ void get_categs(size_t *restrict ix_arr, int x[], size_t st, size_t end, int ncat,
1451
+ MissingAction missing_action, signed char categs[], size_t &restrict npresent, bool &unsplittable) noexcept;
1452
+ template <class real_t>
1453
+ bool check_more_than_two_unique_values(size_t ix_arr[], size_t st, size_t end, real_t x[], MissingAction missing_action);
1454
+ bool check_more_than_two_unique_values(size_t ix_arr[], size_t st, size_t end, int x[], MissingAction missing_action);
1455
+ template <class real_t, class sparse_ix>
1456
+ bool check_more_than_two_unique_values(size_t *restrict ix_arr, size_t st, size_t end, size_t col,
1457
+ sparse_ix *restrict Xc_indptr, sparse_ix *restrict Xc_ind, real_t *restrict Xc,
1458
+ MissingAction missing_action);
1459
+ template <class real_t, class sparse_ix>
1460
+ bool check_more_than_two_unique_values(size_t nrows, size_t col,
1461
+ sparse_ix *restrict Xc_indptr, sparse_ix *restrict Xc_ind, real_t *restrict Xc,
1462
+ MissingAction missing_action);
1463
+ void count_categs(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat, size_t *restrict counts);
1464
+ int count_ncateg_in_col(const int x[], const size_t n, const int ncat, unsigned char buffer[]);
1465
+ template <class ldouble_safe>
1466
+ ldouble_safe calculate_sum_weights(std::vector<size_t> &ix_arr, size_t st, size_t end, size_t curr_depth,
1467
+ std::vector<double> &weights_arr, hashed_map<size_t, double> &weights_map);
1468
+ extern bool interrupt_switch;
1469
+ extern bool signal_is_locked;
820
1470
  void set_interrup_global_variable(int s);
1471
+ #ifdef _FOR_PYTHON
1472
+ bool cy_check_interrupt_switch();
1473
+ void cy_tick_off_interrupt_switch();
1474
+ #endif
1475
+ class SignalSwitcher
1476
+ {
1477
+ public:
1478
+ sig_t_ old_sig;
1479
+ bool is_active;
1480
+ SignalSwitcher();
1481
+ ~SignalSwitcher();
1482
+ void restore_handle();
1483
+ };
1484
+ void check_interrupt_switch(SignalSwitcher &ss);
1485
+ bool has_long_double();
821
1486
  int return_EXIT_SUCCESS();
822
1487
  int return_EXIT_FAILURE();
823
1488
 
824
1489
 
825
1490
 
826
- size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, double x[]);
827
- size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, size_t col_num, double Xc[], size_t Xc_ind[], size_t Xc_indptr[]);
1491
+ template <class real_t=double>
1492
+ size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, real_t x[]);
1493
+ template <class real_t, class sparse_ix>
1494
+ size_t move_NAs_to_front(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr);
828
1495
  size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, int x[]);
829
- size_t center_NAs(size_t *restrict ix_arr, size_t st_left, size_t st, size_t curr_pos);
830
- void todense(size_t ix_arr[], size_t st, size_t end,
831
- size_t col_num, double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1496
+ size_t center_NAs(size_t ix_arr[], size_t st_left, size_t st, size_t curr_pos);
1497
+ template <class real_t>
1498
+ void fill_NAs_with_median(size_t *restrict ix_arr, size_t st_orig, size_t st, size_t end, real_t *restrict x,
1499
+ double *restrict buffer_imputed_x, double *restrict xmedian);
1500
+ template <class real_t, class sparse_ix>
1501
+ void todense(size_t *restrict ix_arr, size_t st, size_t end,
1502
+ size_t col_num, real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
832
1503
  double *restrict buffer_arr);
1504
+ template <class real_t>
1505
+ void colmajor_to_rowmajor(real_t *restrict X, size_t nrows, size_t ncols, std::vector<double> &X_row_major);
1506
+ template <class real_t, class sparse_ix>
1507
+ void colmajor_to_rowmajor(real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1508
+ size_t nrows, size_t ncols,
1509
+ std::vector<double> &Xr, std::vector<size_t> &Xr_ind, std::vector<size_t> &Xr_indptr);
1510
+ template <class sparse_ix=size_t>
1511
+ bool check_indices_are_sorted(sparse_ix indices[], size_t n);
1512
+ template <class real_t, class sparse_ix>
1513
+ void sort_csc_indices(real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr, size_t ncols_numeric);
833
1514
 
834
1515
  /* mult.cpp */
835
- void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double *restrict x,
836
- MissingAction missing_action, double &x_sd, double &x_mean);
837
- void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, size_t col_num,
838
- double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
839
- double &x_sd, double &x_mean);
1516
+ template <class real_t, class real_t_>
1517
+ void calc_mean_and_sd_t(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x,
1518
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
1519
+ template <class real_t_, class ldouble_safe>
1520
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x,
1521
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
1522
+ template <class real_t_>
1523
+ double calc_mean_only(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x);
1524
+ template <class real_t_, class mapping, class ldouble_safe>
1525
+ void calc_mean_and_sd_weighted(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x, mapping &restrict w,
1526
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
1527
+ template <class real_t_, class mapping>
1528
+ double calc_mean_only_weighted(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x, mapping &restrict w);
1529
+ template <class real_t_, class sparse_ix, class ldouble_safe>
1530
+ void calc_mean_and_sd(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1531
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1532
+ double &restrict x_sd, double &restrict x_mean);
1533
+ template <class real_t_, class sparse_ix, class ldouble_safe>
1534
+ double calc_mean_only(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1535
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr);
1536
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
1537
+ void calc_mean_and_sd_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1538
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1539
+ double &restrict x_sd, double &restrict x_mean, mapping &restrict w);
1540
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
1541
+ double calc_mean_only_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1542
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1543
+ mapping &restrict w);
1544
+ template <class real_t_>
840
1545
  void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
841
- double *restrict x, double &coef, double x_sd, double x_mean, double &fill_val,
1546
+ real_t_ *restrict x, double &restrict coef, double x_sd, double x_mean, double &restrict fill_val,
842
1547
  MissingAction missing_action, double *restrict buffer_arr,
843
1548
  size_t *restrict buffer_NAs, bool first_run);
1549
+ template <class real_t_, class mapping, class ldouble_safe>
1550
+ void add_linear_comb_weighted(size_t ix_arr[], size_t st, size_t end, double *restrict res,
1551
+ real_t_ *restrict x, double &restrict coef, double x_sd, double x_mean, double &restrict fill_val,
1552
+ MissingAction missing_action, double *restrict buffer_arr,
1553
+ size_t *restrict buffer_NAs, bool first_run, mapping &restrict w);
1554
+ template <class real_t_, class sparse_ix>
844
1555
  void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
845
- double *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
846
- double &coef, double x_sd, double x_mean, double &fill_val, MissingAction missing_action,
1556
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1557
+ double &restrict coef, double x_sd, double x_mean, double &restrict fill_val, MissingAction missing_action,
847
1558
  double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run);
1559
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
1560
+ void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
1561
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1562
+ double &restrict coef, double x_sd, double x_mean, double &restrict fill_val, MissingAction missing_action,
1563
+ double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run, mapping &restrict w);
1564
+ template <class mapping>
1565
+ void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
1566
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
1567
+ double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_pos,
1568
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type,
1569
+ bool first_run, mapping &restrict w);
1570
+ template <class ldouble_safe>
848
1571
  void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
849
1572
  int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
850
- double &fill_val, double &fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
1573
+ double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
851
1574
  NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run);
1575
+ template <class mapping, class ldouble_safe>
1576
+ void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
1577
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
1578
+ double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_pos,
1579
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type,
1580
+ bool first_run, mapping &restrict w);
852
1581
 
853
1582
  /* crit.cpp */
854
- double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, double x[], MissingAction missing_action);
855
- double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, size_t col_num,
856
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1583
+ template <class real_t, class ldouble_safe>
1584
+ double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, real_t x[], MissingAction missing_action);
1585
+ template <class real_t, class ldouble_safe>
1586
+ double calc_kurtosis(real_t x[], size_t n, MissingAction missing_action);
1587
+ template <class real_t, class mapping, class ldouble_safe>
1588
+ double calc_kurtosis_weighted(size_t ix_arr[], size_t st, size_t end, real_t x[],
1589
+ MissingAction missing_action, mapping &restrict w);
1590
+ template <class real_t, class ldouble_safe>
1591
+ double calc_kurtosis_weighted(real_t *restrict x, size_t n_, MissingAction missing_action, real_t *restrict w);
1592
+ template <class real_t, class sparse_ix, class ldouble_safe>
1593
+ double calc_kurtosis(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1594
+ real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1595
+ MissingAction missing_action);
1596
+ template <class real_t, class sparse_ix, class ldouble_safe>
1597
+ double calc_kurtosis(size_t col_num, size_t nrows,
1598
+ real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
857
1599
  MissingAction missing_action);
858
- double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
1600
+ template <class real_t, class sparse_ix, class mapping, class ldouble_safe>
1601
+ double calc_kurtosis_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1602
+ real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1603
+ MissingAction missing_action, mapping &restrict w);
1604
+ template <class real_t, class sparse_ix, class ldouble_safe>
1605
+ double calc_kurtosis_weighted(size_t col_num, size_t nrows,
1606
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1607
+ MissingAction missing_action, real_t *restrict w);
1608
+ template <class ldouble_safe>
1609
+ double calc_kurtosis_internal(size_t cnt, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
1610
+ MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
1611
+ template <class ldouble_safe>
1612
+ double calc_kurtosis(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat, size_t *restrict buffer_cnt, double buffer_prob[],
859
1613
  MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
860
- double expected_sd_cat(double p[], size_t n, size_t pos[]);
861
- double expected_sd_cat(size_t counts[], double p[], size_t n, size_t pos[]);
862
- double expected_sd_cat_single(size_t counts[], double p[], size_t n, size_t pos[], size_t cat_exclude, size_t cnt);
863
- double numeric_gain(size_t cnt_left, size_t cnt_right,
864
- long double sum_left, long double sum_right,
865
- long double sum_sq_left, long double sum_sq_right,
866
- double sd_full, long double cnt);
867
- double numeric_gain_no_div(size_t cnt_left, size_t cnt_right,
868
- long double sum_left, long double sum_right,
869
- long double sum_sq_left, long double sum_sq_right,
870
- double sd_full, long double cnt);
871
- double categ_gain(size_t cnt_left, size_t cnt_right,
872
- long double s_left, long double s_right,
873
- long double base_info, long double cnt);
874
- double eval_guided_crit(double *restrict x, size_t n, GainCriterion criterion, double min_gain,
875
- double &split_point, double &xmin, double &xmax);
876
- double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x,
877
- size_t &split_ix, double &split_point, double &xmin, double &xmax,
878
- GainCriterion criterion, double min_gain, MissingAction missing_action);
1614
+ template <class ldouble_safe>
1615
+ double calc_kurtosis(size_t nrows, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
1616
+ MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
1617
+ template <class mapping, class ldouble_safe>
1618
+ double calc_kurtosis_weighted_internal(std::vector<ldouble_safe> &buffer_cnt, int x[], int ncat,
1619
+ double buffer_prob[], MissingAction missing_action, CategSplit cat_split_type,
1620
+ RNG_engine &rnd_generator, mapping &restrict w);
1621
+ template <class mapping, class ldouble_safe>
1622
+ double calc_kurtosis_weighted(size_t ix_arr[], size_t st, size_t end, int x[], int ncat, double buffer_prob[],
1623
+ MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator,
1624
+ mapping &restrict w);
1625
+ template <class real_t, class ldouble_safe>
1626
+ double calc_kurtosis_weighted(size_t nrows, int x[], int ncat, double *restrict buffer_prob,
1627
+ MissingAction missing_action, CategSplit cat_split_type,
1628
+ RNG_engine &rnd_generator, real_t *restrict w);
1629
+ template <class int_t, class ldouble_safe>
1630
+ double expected_sd_cat(double p[], size_t n, int_t pos[]);
1631
+ template <class number, class int_t, class ldouble_safe>
1632
+ double expected_sd_cat(number *restrict counts, double *restrict p, size_t n, int_t *restrict pos);
1633
+ template <class number, class int_t, class ldouble_safe>
1634
+ double expected_sd_cat_single(number *restrict counts, double *restrict p, size_t n, int_t *restrict pos, size_t cat_exclude, number cnt);
1635
+ template <class number, class int_t, class ldouble_safe>
1636
+ double expected_sd_cat_internal(int ncat, number *restrict buffer_cnt, ldouble_safe cnt_l,
1637
+ int_t *restrict buffer_pos, double *restrict buffer_prob);
1638
+ template <class int_t, class ldouble_safe>
1639
+ double expected_sd_cat(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat,
1640
+ MissingAction missing_action,
1641
+ size_t *restrict buffer_cnt, int_t *restrict buffer_pos, double buffer_prob[]);
1642
+ template <class mapping, class int_t, class ldouble_safe>
1643
+ double expected_sd_cat_weighted(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat,
1644
+ MissingAction missing_action, mapping &restrict w,
1645
+ double *restrict buffer_cnt, int_t *restrict buffer_pos, double *restrict buffer_prob);
1646
+ template <class number, class ldouble_safe>
1647
+ double categ_gain(number cnt_left, number cnt_right,
1648
+ ldouble_safe s_left, ldouble_safe s_right,
1649
+ ldouble_safe base_info, ldouble_safe cnt);
1650
+ template <class real_t, class real_t_>
1651
+ double find_split_rel_gain_t(real_t_ *restrict x, size_t n, double &restrict split_point);
1652
+ template <class real_t_, class ldouble_safe>
1653
+ double find_split_rel_gain(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &restrict split_point, size_t &restrict split_ix);
1654
+ template <class real_t, class real_t_>
1655
+ double find_split_rel_gain_t(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &split_point, size_t &restrict split_ix);
1656
+ template <class real_t_, class ldouble_safe>
1657
+ double find_split_rel_gain(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double &split_point, size_t &split_ix);
1658
+ template <class real_t, class real_t_, class mapping>
1659
+ double find_split_rel_gain_weighted_t(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &split_point, size_t &restrict split_ix, mapping &restrict w);
1660
+ template <class real_t_, class mapping, class ldouble_safe>
1661
+ double find_split_rel_gain_weighted(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1662
+ template <class real_t, class real_t_=double>
1663
+ real_t calc_sd_right_to_left(real_t_ *restrict x, size_t n, double *restrict sd_arr);
1664
+ template <class real_t_, class ldouble_safe>
1665
+ ldouble_safe calc_sd_right_to_left_weighted(real_t_ *restrict x, size_t n, double *restrict sd_arr,
1666
+ double *restrict w, ldouble_safe &cumw, size_t *restrict sorted_ix);
1667
+ template <class real_t, class real_t_>
1668
+ real_t calc_sd_right_to_left(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr);
1669
+ template <class real_t_, class mapping, class ldouble_safe>
1670
+ ldouble_safe calc_sd_right_to_left_weighted(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end,
1671
+ double *restrict sd_arr, mapping &restrict w, ldouble_safe &cumw);
1672
+ template <class real_t, class real_t_>
1673
+ double find_split_std_gain_t(real_t_ *restrict x, size_t n, double *restrict sd_arr,
1674
+ GainCriterion criterion, double min_gain, double &restrict split_point);
1675
+ template <class real_t_, class ldouble_safe>
1676
+ double find_split_std_gain(real_t_ *restrict x, size_t n, double *restrict sd_arr,
1677
+ GainCriterion criterion, double min_gain, double &restrict split_point);
1678
+ template <class real_t, class ldouble_safe>
1679
+ double find_split_std_gain_weighted(real_t *restrict x, size_t n, double *restrict sd_arr,
1680
+ GainCriterion criterion, double min_gain, double &restrict split_point,
1681
+ double *restrict w, size_t *restrict sorted_ix);
1682
+ template <class real_t, class real_t_>
1683
+ double find_split_std_gain_t(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
1684
+ GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix);
1685
+ template <class real_t_, class ldouble_safe>
1686
+ double find_split_std_gain(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
1687
+ GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix);
1688
+ template <class real_t, class mapping, class ldouble_safe>
1689
+ double find_split_std_gain_weighted(real_t *restrict x, real_t xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
1690
+ GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1691
+ template <class real_t, class ldouble_safe>
1692
+ double find_split_full_gain(real_t *restrict x, size_t st, size_t end, size_t *restrict ix_arr,
1693
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1694
+ double *restrict X_row_major, size_t ncols,
1695
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
1696
+ double *restrict buffer_sum_left, double *restrict buffer_sum_tot,
1697
+ size_t &restrict split_ix, double &restrict split_point,
1698
+ bool x_uses_ix_arr);
1699
+ template <class real_t, class mapping, class ldouble_safe>
1700
+ double find_split_full_gain_weighted(real_t *restrict x, size_t st, size_t end, size_t *restrict ix_arr,
1701
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1702
+ double *restrict X_row_major, size_t ncols,
1703
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
1704
+ double *restrict buffer_sum_left, double *restrict buffer_sum_tot,
1705
+ size_t &restrict split_ix, double &restrict split_point,
1706
+ bool x_uses_ix_arr,
1707
+ mapping &restrict w);
1708
+ template <class real_t_, class real_t>
1709
+ double find_split_dens_shortform_t(real_t *restrict x, size_t n, double &restrict split_point);
1710
+ template <class real_t, class ldouble_safe>
1711
+ double find_split_dens_shortform(real_t *restrict x, size_t n, double &restrict split_point);
1712
+ template <class real_t_, class real_t, class mapping>
1713
+ double find_split_dens_shortform_weighted_t(real_t *restrict x, size_t n, double &restrict split_point, mapping &restrict w, size_t *restrict buffer_indices);
1714
+ template <class real_t, class mapping, class ldouble_safe>
1715
+ double find_split_dens_shortform_weighted(real_t *restrict x, size_t n, double &restrict split_point, mapping &restrict w, size_t *restrict buffer_indices);
1716
+ template <class real_t>
1717
+ double find_split_dens_shortform(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1718
+ double &restrict split_point, size_t &restrict split_ix);
1719
+ template <class real_t, class mapping>
1720
+ double find_split_dens_shortform_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1721
+ double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1722
+ template <class real_t, class ldouble_safe>
1723
+ double find_split_dens_longform(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1724
+ double &restrict split_point, size_t &restrict split_ix);
1725
+ template <class real_t, class mapping, class ldouble_safe>
1726
+ double find_split_dens_longform_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1727
+ double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1728
+ template <class real_t, class ldouble_safe>
1729
+ double find_split_dens(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1730
+ double &restrict split_point, size_t &restrict split_ix);
1731
+ template <class real_t, class mapping, class ldouble_safe>
1732
+ double find_split_dens_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1733
+ double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1734
+ template <class int_t, class ldouble_safe>
1735
+ double find_split_dens_longform(int *restrict x, int ncat, size_t *restrict ix_arr, size_t st, size_t end,
1736
+ CategSplit cat_split_type, MissingAction missing_action,
1737
+ int &restrict chosen_cat, signed char *restrict split_categ, int *restrict saved_cat_mode,
1738
+ size_t *restrict buffer_cnt, int_t *restrict buffer_indices);
1739
+ template <class mapping, class int_t, class ldouble_safe>
1740
+ double find_split_dens_longform_weighted(int *restrict x, int ncat, size_t *restrict ix_arr, size_t st, size_t end,
1741
+ CategSplit cat_split_type, MissingAction missing_action,
1742
+ int &restrict chosen_cat, signed char *restrict split_categ, int *restrict saved_cat_mode,
1743
+ int_t *restrict buffer_indices, mapping &restrict w);
1744
+ template <class ldouble_safe>
1745
+ double eval_guided_crit(double *restrict x, size_t n, GainCriterion criterion,
1746
+ double min_gain, bool as_relative_gain, double *restrict buffer_sd,
1747
+ double &restrict split_point, double &restrict xmin, double &restrict xmax,
1748
+ size_t *restrict ix_arr_plus_st,
1749
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1750
+ double *restrict X_row_major, size_t ncols,
1751
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
1752
+ template <class ldouble_safe>
1753
+ double eval_guided_crit_weighted(double *restrict x, size_t n, GainCriterion criterion,
1754
+ double min_gain, bool as_relative_gain, double *restrict buffer_sd,
1755
+ double &restrict split_point, double &restrict xmin, double &restrict xmax,
1756
+ double *restrict w, size_t *restrict buffer_indices,
1757
+ size_t *restrict ix_arr_plus_st,
1758
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1759
+ double *restrict X_row_major, size_t ncols,
1760
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
1761
+ template <class real_t_, class ldouble_safe>
1762
+ double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, real_t_ *restrict x,
1763
+ double *restrict buffer_sd, bool as_relative_gain,
1764
+ double *restrict buffer_imputed_x, double *restrict saved_xmedian,
1765
+ size_t &split_ix, double &restrict split_point, double &restrict xmin, double &restrict xmax,
1766
+ GainCriterion criterion, double min_gain, MissingAction missing_action,
1767
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1768
+ double *restrict X_row_major, size_t ncols,
1769
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
1770
+ template <class real_t_, class mapping, class ldouble_safe>
1771
+ double eval_guided_crit_weighted(size_t *restrict ix_arr, size_t st, size_t end, real_t_ *restrict x,
1772
+ double *restrict buffer_sd, bool as_relative_gain,
1773
+ double *restrict buffer_imputed_x, double *restrict saved_xmedian,
1774
+ size_t &split_ix, double &restrict split_point, double &restrict xmin, double &restrict xmax,
1775
+ GainCriterion criterion, double min_gain, MissingAction missing_action,
1776
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1777
+ double *restrict X_row_major, size_t ncols,
1778
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
1779
+ mapping &restrict w);
1780
+ template <class real_t_, class sparse_ix, class ldouble_safe>
879
1781
  double eval_guided_crit(size_t ix_arr[], size_t st, size_t end,
880
- size_t col_num, double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
881
- double buffer_arr[], size_t buffer_pos[],
1782
+ size_t col_num, real_t_ Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1783
+ double buffer_arr[], size_t buffer_pos[], bool as_relative_gain,
1784
+ double *restrict saved_xmedian,
882
1785
  double &split_point, double &xmin, double &xmax,
883
- GainCriterion criterion, double min_gain, MissingAction missing_action);
1786
+ GainCriterion criterion, double min_gain, MissingAction missing_action,
1787
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1788
+ double *restrict X_row_major, size_t ncols,
1789
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
1790
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
1791
+ double eval_guided_crit_weighted(size_t ix_arr[], size_t st, size_t end,
1792
+ size_t col_num, real_t_ Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1793
+ double buffer_arr[], size_t buffer_pos[], bool as_relative_gain,
1794
+ double *restrict saved_xmedian,
1795
+ double &restrict split_point, double &restrict xmin, double &restrict xmax,
1796
+ GainCriterion criterion, double min_gain, MissingAction missing_action,
1797
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1798
+ double *restrict X_row_major, size_t ncols,
1799
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
1800
+ mapping &restrict w);
1801
+ template <class ldouble_safe>
884
1802
  double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int ncat,
1803
+ int *restrict saved_cat_mode,
885
1804
  size_t *restrict buffer_cnt, size_t *restrict buffer_pos, double *restrict buffer_prob,
886
- int &chosen_cat, char *restrict split_categ, char *restrict buffer_split,
887
- GainCriterion criterion, double min_gain, bool all_perm, MissingAction missing_action, CategSplit cat_split_type);
1805
+ int &restrict chosen_cat, signed char *restrict split_categ, signed char *restrict buffer_split,
1806
+ GainCriterion criterion, double min_gain, bool all_perm,
1807
+ MissingAction missing_action, CategSplit cat_split_type);
1808
+ template <class mapping, class ldouble_safe>
1809
+ double eval_guided_crit_weighted(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int ncat,
1810
+ int *restrict saved_cat_mode,
1811
+ size_t *restrict buffer_pos, double *restrict buffer_prob,
1812
+ int &restrict chosen_cat, signed char *restrict split_categ, signed char *restrict buffer_split,
1813
+ GainCriterion criterion, double min_gain, bool all_perm,
1814
+ MissingAction missing_action, CategSplit cat_split_type,
1815
+ mapping &restrict w);
1816
+
1817
+ /* indexer.cpp */
1818
+ template <class Tree>
1819
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<Tree> &tree);
1820
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoTree> &tree);
1821
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoHPlane> &tree);
1822
+ template <class Model>
1823
+ void build_terminal_node_mappings(TreesIndexer &indexer, const Model &model);
1824
+ template <class Node>
1825
+ void build_dindex_recursive
1826
+ (
1827
+ const size_t curr_node,
1828
+ const size_t n_terminal, const size_t ncomb,
1829
+ const size_t st, const size_t end,
1830
+ std::vector<size_t> &restrict node_indices, /* array with all terminal indices in 'tree' */
1831
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
1832
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
1833
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
1834
+ size_t curr_depth,
1835
+ const std::vector<Node> &tree
1836
+ );
1837
+ template <class Node>
1838
+ void build_dindex
1839
+ (
1840
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
1841
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
1842
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
1843
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
1844
+ const size_t n_terminal,
1845
+ const std::vector<Node> &tree
1846
+ );
1847
+ void build_dindex
1848
+ (
1849
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
1850
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
1851
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
1852
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
1853
+ const size_t n_terminal,
1854
+ const std::vector<IsoTree> &tree
1855
+ );
1856
+ void build_dindex
1857
+ (
1858
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
1859
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
1860
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
1861
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
1862
+ const size_t n_terminal,
1863
+ const std::vector<IsoHPlane> &tree
1864
+ );
1865
+ template <class Model>
1866
+ void build_distance_mappings(TreesIndexer &indexer, const Model &model, int nthreads);
1867
+ template <class Model>
1868
+ void build_tree_indices(TreesIndexer &indexer, const Model &model, int nthreads, const bool with_distances);
1869
+ ISOTREE_EXPORTED
1870
+ void build_tree_indices(TreesIndexer &indexer, const IsoForest &model, int nthreads, const bool with_distances);
1871
+ ISOTREE_EXPORTED
1872
+ void build_tree_indices(TreesIndexer &indexer, const ExtIsoForest &model, int nthreads, const bool with_distances);
1873
+ ISOTREE_EXPORTED
1874
+ void build_tree_indices
1875
+ (
1876
+ TreesIndexer *indexer,
1877
+ const IsoForest *model_outputs,
1878
+ const ExtIsoForest *model_outputs_ext,
1879
+ int nthreads,
1880
+ const bool with_distances
1881
+ );
1882
+ ISOTREE_EXPORTED
1883
+ size_t get_number_of_reference_points(const TreesIndexer &indexer) noexcept;
1884
+ void build_ref_node(SingleTreeIndex &node);
1885
+
1886
+ /* ref_indexer.hpp */
1887
+ template <class Model, class real_t, class sparse_ix>
1888
+ void set_reference_points(TreesIndexer &indexer, Model &model, const bool with_distances,
1889
+ real_t *restrict numeric_data, int *restrict categ_data,
1890
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1891
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1892
+ real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
1893
+ size_t nrows, int nthreads);
1894
+ template <class real_t, class sparse_ix>
1895
+ void set_reference_points(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext, TreesIndexer *indexer,
1896
+ const bool with_distances,
1897
+ real_t *restrict numeric_data, int *restrict categ_data,
1898
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1899
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1900
+ real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
1901
+ size_t nrows, int nthreads);
888
1902
 
889
1903
  /* merge_models.cpp */
1904
+ ISOTREE_EXPORTED
890
1905
  void merge_models(IsoForest* model, IsoForest* other,
891
1906
  ExtIsoForest* ext_model, ExtIsoForest* ext_other,
892
- Imputer* imputer, Imputer* iother);
1907
+ Imputer* imputer, Imputer* iother,
1908
+ TreesIndexer* indexer, TreesIndexer* ind_other);
1909
+
1910
+ /* subset_models.cpp */
1911
+ ISOTREE_EXPORTED
1912
+ void subset_model(IsoForest* model, IsoForest* model_new,
1913
+ ExtIsoForest* ext_model, ExtIsoForest* ext_model_new,
1914
+ Imputer* imputer, Imputer* imputer_new,
1915
+ TreesIndexer* indexer, TreesIndexer* indexer_new,
1916
+ size_t *trees_take, size_t ntrees_take);
893
1917
 
894
- #ifdef _ENABLE_CEREAL
895
1918
  /* serialize.cpp */
896
- void serialize_isoforest(IsoForest &model, std::ostream &output);
897
- void serialize_isoforest(IsoForest &model, const char *output_file_path);
898
- std::string serialize_isoforest(IsoForest &model);
899
- void deserialize_isoforest(IsoForest &output_obj, std::istream &serialized);
900
- void deserialize_isoforest(IsoForest &output_obj, const char *input_file_path);
901
- void deserialize_isoforest(IsoForest &output_obj, std::string &serialized, bool move_str);
902
- void serialize_ext_isoforest(ExtIsoForest &model, std::ostream &output);
903
- void serialize_ext_isoforest(ExtIsoForest &model, const char *output_file_path);
904
- std::string serialize_ext_isoforest(ExtIsoForest &model);
905
- void deserialize_ext_isoforest(ExtIsoForest &output_obj, std::istream &serialized);
906
- void deserialize_ext_isoforest(ExtIsoForest &output_obj, const char *input_file_path);
907
- void deserialize_ext_isoforest(ExtIsoForest &output_obj, std::string &serialized, bool move_str);
908
- void serialize_imputer(Imputer &imputer, std::ostream &output);
909
- void serialize_imputer(Imputer &imputer, const char *output_file_path);
910
- std::string serialize_imputer(Imputer &imputer);
911
- void deserialize_imputer(Imputer &output_obj, std::istream &serialized);
912
- void deserialize_imputer(Imputer &output_obj, const char *input_file_path);
913
- void deserialize_imputer(Imputer &output_obj, std::string &serialized, bool move_str);
914
- #ifdef _MSC_VER
915
- void serialize_isoforest(IsoForest &model, const wchar_t *output_file_path);
916
- void deserialize_isoforest(IsoForest &output_obj, const wchar_t *input_file_path);
917
- void serialize_ext_isoforest(ExtIsoForest &model, const wchar_t *output_file_path);
918
- void deserialize_ext_isoforest(ExtIsoForest &output_obj, const wchar_t *input_file_path);
919
- void serialize_imputer(Imputer &imputer, const wchar_t *output_file_path);
920
- void deserialize_imputer(Imputer &output_obj, const wchar_t *input_file_path);
921
- #endif /* _MSC_VER */
922
- bool has_msvc();
923
- #endif /* _ENABLE_CEREAL */
1919
/* Error helpers used by the file wrappers below. Each one is declared
   [[noreturn]], so control never continues past a call to any of them.
   NOTE(review): judging by the names they raise an exception built from
   errno / the stream's error state -- confirm against their definitions. */
[[noreturn]]
void throw_errno();
[[noreturn]]
void throw_ferror(FILE *file);
[[noreturn]]
void throw_feoferr();

/* Owning wrapper around a C `FILE*`.

   The constructor opens `fname` with `std::fopen(fname, mode)` and calls
   `throw_errno()` if the open fails, so a successfully constructed object
   always holds a non-null `handle`. The destructor closes the stream; a
   failed close is only reported on stderr because a destructor must not throw.

   Ownership is unique: copying is disabled, since two objects holding the
   same `FILE*` would both call `fclose` on it (undefined behavior). Moving
   transfers the stream and leaves the source object empty. */
class FileHandle
{
public:
    FILE *handle = nullptr;

    /* Opens `fname` in the given `fopen` mode; does not return normally on failure. */
    FileHandle(const char *fname, const char *mode)
    {
        this->handle = std::fopen(fname, mode);
        if (!(this->handle))
            throw_errno();
    }

    /* Non-copyable: exactly one object may close the stream. */
    FileHandle(const FileHandle&) = delete;
    FileHandle& operator=(const FileHandle&) = delete;

    /* Move construction steals the stream from `other`. */
    FileHandle(FileHandle &&other) noexcept : handle(other.handle)
    {
        other.handle = nullptr;
    }

    /* Move assignment closes the currently-held stream (if any) first. */
    FileHandle& operator=(FileHandle &&other) noexcept
    {
        if (this != &other) {
            this->close_quietly();
            this->handle = other.handle;
            other.handle = nullptr;
        }
        return *this;
    }

    ~FileHandle()
    {
        this->close_quietly();
    }

private:
    /* Closes the stream if one is held and resets `handle`; never throws. */
    void close_quietly() noexcept
    {
        if (this->handle) {
            int err = std::fclose(this->handle);
            if (err)
                fprintf(stderr, "Error: could not close file.\n");
        }
        this->handle = nullptr;
    }
};
1945
+
1946
/* Wide-character file support, only on Windows toolchains (MSVC or a
   GCC-compatible compiler targeting _WIN32). `WCHAR_T_FUNS` is the switch
   that enables the `const wchar_t *fname` overloads of the `*_ToFile` /
   `*_FromFile` functions declared further down in this header. */
#if defined(_WIN32) && (defined(_MSC_VER) || defined(__GNUC__))
#define WCHAR_T_FUNS
#include <stdio.h>
/* Owning wrapper around a `FILE*` opened through `_wfopen`, for paths given
   as wide strings.

   The constructor calls `throw_errno()` if the open fails, so a successfully
   constructed object always holds a non-null `handle`. The destructor closes
   the stream; a failed close is only reported on stderr because a destructor
   must not throw.

   Ownership is unique: copying is disabled, since two objects holding the
   same `FILE*` would both call `fclose` on it (undefined behavior). Moving
   transfers the stream and leaves the source object empty. */
class WFileHandle
{
public:
    FILE *handle = nullptr;

    /* Opens `fname` in the given mode; does not return normally on failure. */
    WFileHandle(const wchar_t *fname, const wchar_t *mode)
    {
        this->handle = _wfopen(fname, mode);
        if (!(this->handle))
            throw_errno();
    }

    /* Non-copyable: exactly one object may close the stream. */
    WFileHandle(const WFileHandle&) = delete;
    WFileHandle& operator=(const WFileHandle&) = delete;

    /* Move construction steals the stream from `other`. */
    WFileHandle(WFileHandle &&other) noexcept : handle(other.handle)
    {
        other.handle = nullptr;
    }

    /* Move assignment closes the currently-held stream (if any) first. */
    WFileHandle& operator=(WFileHandle &&other) noexcept
    {
        if (this != &other) {
            this->close_quietly();
            this->handle = other.handle;
            other.handle = nullptr;
        }
        return *this;
    }

    ~WFileHandle()
    {
        this->close_quietly();
    }

private:
    /* Closes the stream if one is held and resets `handle`; never throws. */
    void close_quietly() noexcept
    {
        if (this->handle) {
            int err = std::fclose(this->handle);
            if (err)
                fprintf(stderr, "Error: could not close file.\n");
        }
        this->handle = nullptr;
    }
};
#endif
1970
+ ISOTREE_EXPORTED
1971
+ bool has_wchar_t_file_serializers() noexcept;
1972
+ ISOTREE_EXPORTED
1973
+ size_t determine_serialized_size(const IsoForest &model) noexcept;
1974
+ ISOTREE_EXPORTED
1975
+ size_t determine_serialized_size(const ExtIsoForest &model) noexcept;
1976
+ ISOTREE_EXPORTED
1977
+ size_t determine_serialized_size(const Imputer &model) noexcept;
1978
+ ISOTREE_EXPORTED
1979
+ size_t determine_serialized_size(const TreesIndexer &model) noexcept;
1980
+ ISOTREE_EXPORTED
1981
+ void serialize_IsoForest(const IsoForest &model, char *out);
1982
+ ISOTREE_EXPORTED
1983
+ void serialize_IsoForest(const IsoForest &model, FILE *out);
1984
+ ISOTREE_EXPORTED
1985
+ void serialize_IsoForest(const IsoForest &model, std::ostream &out);
1986
+ ISOTREE_EXPORTED
1987
+ std::string serialize_IsoForest(const IsoForest &model);
1988
+ ISOTREE_EXPORTED
1989
+ void serialize_IsoForest_ToFile(const IsoForest &model, const char *fname);
1990
+ #ifdef WCHAR_T_FUNS
1991
+ ISOTREE_EXPORTED
1992
+ void serialize_IsoForest_ToFile(const IsoForest &model, const wchar_t *fname);
1993
+ #endif
1994
+ ISOTREE_EXPORTED
1995
+ void deserialize_IsoForest(IsoForest &model, const char *in);
1996
+ ISOTREE_EXPORTED
1997
+ void deserialize_IsoForest(IsoForest &model, FILE *in);
1998
+ ISOTREE_EXPORTED
1999
+ void deserialize_IsoForest(IsoForest &model, std::istream &in);
2000
+ ISOTREE_EXPORTED
2001
+ void deserialize_IsoForest(IsoForest &model, const std::string &in);
2002
+ ISOTREE_EXPORTED
2003
+ void deserialize_IsoForest_FromFile(IsoForest &model, const char *fname);
2004
+ #ifdef WCHAR_T_FUNS
2005
+ ISOTREE_EXPORTED
2006
+ void deserialize_IsoForest_FromFile(IsoForest &model, const wchar_t *fname);
2007
+ #endif
2008
+ ISOTREE_EXPORTED
2009
+ void serialize_ExtIsoForest(const ExtIsoForest &model, char *out);
2010
+ ISOTREE_EXPORTED
2011
+ void serialize_ExtIsoForest(const ExtIsoForest &model, FILE *out);
2012
+ ISOTREE_EXPORTED
2013
+ void serialize_ExtIsoForest(const ExtIsoForest &model, std::ostream &out);
2014
+ ISOTREE_EXPORTED
2015
+ std::string serialize_ExtIsoForest(const ExtIsoForest &model);
2016
+ ISOTREE_EXPORTED
2017
+ void serialize_ExtIsoForest_ToFile(const ExtIsoForest &model, const char *fname);
2018
+ #ifdef WCHAR_T_FUNS
2019
+ ISOTREE_EXPORTED
2020
+ void serialize_ExtIsoForest_ToFile(const ExtIsoForest &model, const wchar_t *fname);
2021
+ #endif
2022
+ ISOTREE_EXPORTED
2023
+ void deserialize_ExtIsoForest(ExtIsoForest &model, const char *in);
2024
+ ISOTREE_EXPORTED
2025
+ void deserialize_ExtIsoForest(ExtIsoForest &model, FILE *in);
2026
+ ISOTREE_EXPORTED
2027
+ void deserialize_ExtIsoForest(ExtIsoForest &model, std::istream &in);
2028
+ ISOTREE_EXPORTED
2029
+ void deserialize_ExtIsoForest(ExtIsoForest &model, const std::string &in);
2030
+ ISOTREE_EXPORTED
2031
+ void deserialize_ExtIsoForest_FromFile(ExtIsoForest &model, const char *fname);
2032
+ #ifdef WCHAR_T_FUNS
2033
+ ISOTREE_EXPORTED
2034
+ void deserialize_ExtIsoForest_FromFile(ExtIsoForest &model, const wchar_t *fname);
2035
+ #endif
2036
+ ISOTREE_EXPORTED
2037
+ void serialize_Imputer(const Imputer &model, char *out);
2038
+ ISOTREE_EXPORTED
2039
+ void serialize_Imputer(const Imputer &model, FILE *out);
2040
+ ISOTREE_EXPORTED
2041
+ void serialize_Imputer(const Imputer &model, std::ostream &out);
2042
+ ISOTREE_EXPORTED
2043
+ std::string serialize_Imputer(const Imputer &model);
2044
+ ISOTREE_EXPORTED
2045
+ void serialize_Imputer_ToFile(const Imputer &model, const char *fname);
2046
+ #ifdef WCHAR_T_FUNS
2047
+ ISOTREE_EXPORTED
2048
+ void serialize_Imputer_ToFile(const Imputer &model, const wchar_t *fname);
2049
+ #endif
2050
+ ISOTREE_EXPORTED
2051
+ void deserialize_Imputer(Imputer &model, const char *in);
2052
+ ISOTREE_EXPORTED
2053
+ void deserialize_Imputer(Imputer &model, FILE *in);
2054
+ ISOTREE_EXPORTED
2055
+ void deserialize_Imputer(Imputer &model, std::istream &in);
2056
+ ISOTREE_EXPORTED
2057
+ void deserialize_Imputer(Imputer &model, const std::string &in);
2058
+ ISOTREE_EXPORTED
2059
+ void deserialize_Imputer_FromFile(Imputer &model, const char *fname);
2060
+ #ifdef WCHAR_T_FUNS
2061
+ ISOTREE_EXPORTED
2062
+ void deserialize_Imputer_FromFile(Imputer &model, const wchar_t *fname);
2063
+ #endif
2064
+ ISOTREE_EXPORTED
2065
+ void serialize_Indexer(const TreesIndexer &model, char *out);
2066
+ ISOTREE_EXPORTED
2067
+ void serialize_Indexer(const TreesIndexer &model, FILE *out);
2068
+ ISOTREE_EXPORTED
2069
+ void serialize_Indexer(const TreesIndexer &model, std::ostream &out);
2070
+ ISOTREE_EXPORTED
2071
+ std::string serialize_Indexer(const TreesIndexer &model);
2072
+ ISOTREE_EXPORTED
2073
+ void serialize_Indexer_ToFile(const TreesIndexer &model, const char *fname);
2074
+ #ifdef WCHAR_T_FUNS
2075
+ ISOTREE_EXPORTED
2076
+ void serialize_Indexer_ToFile(const TreesIndexer &model, const wchar_t *fname);
2077
+ #endif
2078
+ ISOTREE_EXPORTED
2079
+ void deserialize_Indexer(TreesIndexer &model, const char *in);
2080
+ ISOTREE_EXPORTED
2081
+ void deserialize_Indexer(TreesIndexer &model, FILE *in);
2082
+ ISOTREE_EXPORTED
2083
+ void deserialize_Indexer(TreesIndexer &model, std::istream &in);
2084
+ ISOTREE_EXPORTED
2085
+ void deserialize_Indexer(TreesIndexer &model, const std::string &in);
2086
+ ISOTREE_EXPORTED
2087
+ void deserialize_Indexer_FromFile(TreesIndexer &model, const char *fname);
2088
+ #ifdef WCHAR_T_FUNS
2089
+ ISOTREE_EXPORTED
2090
+ void deserialize_Indexer_FromFile(TreesIndexer &model, const wchar_t *fname);
2091
+ #endif
2092
+ void serialize_isotree(const IsoForest &model, char *out);
2093
+ void serialize_isotree(const ExtIsoForest &model, char *out);
2094
+ void serialize_isotree(const Imputer &model, char *out);
2095
+ void serialize_isotree(const TreesIndexer &model, char *out);
2096
+ void deserialize_isotree(IsoForest &model, const char *in);
2097
+ void deserialize_isotree(ExtIsoForest &model, const char *in);
2098
+ void deserialize_isotree(Imputer &model, const char *in);
2099
+ void deserialize_isotree(TreesIndexer &model, const char *in);
2100
+ void incremental_serialize_isotree(const IsoForest &model, char *old_bytes_reallocated);
2101
+ void incremental_serialize_isotree(const ExtIsoForest &model, char *old_bytes_reallocated);
2102
+ void incremental_serialize_isotree(const Imputer &model, char *old_bytes_reallocated);
2103
+ void incremental_serialize_isotree(const TreesIndexer &model, char *old_bytes_reallocated);
2104
+ ISOTREE_EXPORTED
2105
+ void incremental_serialize_IsoForest(const IsoForest &model, std::string &old_bytes);
2106
+ ISOTREE_EXPORTED
2107
+ void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, std::string &old_bytes);
2108
+ ISOTREE_EXPORTED
2109
+ void incremental_serialize_Imputer(const Imputer &model, std::string &old_bytes);
2110
+ ISOTREE_EXPORTED
2111
+ void incremental_serialize_Indexer(const TreesIndexer &model, std::string &old_bytes);
2112
+ ISOTREE_EXPORTED
2113
+ void inspect_serialized_object
2114
+ (
2115
+ const char *serialized_bytes,
2116
+ bool &is_isotree_model,
2117
+ bool &is_compatible,
2118
+ bool &has_combined_objects,
2119
+ bool &has_IsoForest,
2120
+ bool &has_ExtIsoForest,
2121
+ bool &has_Imputer,
2122
+ bool &has_Indexer,
2123
+ bool &has_metadata,
2124
+ size_t &size_metadata
2125
+ );
2126
+ ISOTREE_EXPORTED
2127
+ void inspect_serialized_object
2128
+ (
2129
+ FILE *serialized_bytes,
2130
+ bool &is_isotree_model,
2131
+ bool &is_compatible,
2132
+ bool &has_combined_objects,
2133
+ bool &has_IsoForest,
2134
+ bool &has_ExtIsoForest,
2135
+ bool &has_Imputer,
2136
+ bool &has_Indexer,
2137
+ bool &has_metadata,
2138
+ size_t &size_metadata
2139
+ );
2140
+ ISOTREE_EXPORTED
2141
+ void inspect_serialized_object
2142
+ (
2143
+ std::istream &serialized_bytes,
2144
+ bool &is_isotree_model,
2145
+ bool &is_compatible,
2146
+ bool &has_combined_objects,
2147
+ bool &has_IsoForest,
2148
+ bool &has_ExtIsoForest,
2149
+ bool &has_Imputer,
2150
+ bool &has_Indexer,
2151
+ bool &has_metadata,
2152
+ size_t &size_metadata
2153
+ );
2154
+ ISOTREE_EXPORTED
2155
+ void inspect_serialized_object
2156
+ (
2157
+ const std::string &serialized_bytes,
2158
+ bool &is_isotree_model,
2159
+ bool &is_compatible,
2160
+ bool &has_combined_objects,
2161
+ bool &has_IsoForest,
2162
+ bool &has_ExtIsoForest,
2163
+ bool &has_Imputer,
2164
+ bool &has_Indexer,
2165
+ bool &has_metadata,
2166
+ size_t &size_metadata
2167
+ );
2168
+ ISOTREE_EXPORTED
2169
+ bool check_can_undergo_incremental_serialization(const IsoForest &model, const char *serialized_bytes);
2170
+ ISOTREE_EXPORTED
2171
+ bool check_can_undergo_incremental_serialization(const ExtIsoForest &model, const char *serialized_bytes);
2172
+ ISOTREE_EXPORTED
2173
+ bool check_can_undergo_incremental_serialization(const Imputer &model, const char *serialized_bytes);
2174
+ ISOTREE_EXPORTED
2175
+ bool check_can_undergo_incremental_serialization(const TreesIndexer &model, const char *serialized_bytes);
2176
+ ISOTREE_EXPORTED
2177
+ size_t determine_serialized_size_additional_trees(const IsoForest &model, size_t old_ntrees) noexcept;
2178
+ ISOTREE_EXPORTED
2179
+ size_t determine_serialized_size_additional_trees(const ExtIsoForest &model, size_t old_ntrees) noexcept;
2180
+ ISOTREE_EXPORTED
2181
+ size_t determine_serialized_size_additional_trees(const Imputer &model, size_t old_ntrees) noexcept;
2182
+ ISOTREE_EXPORTED
2183
+ size_t determine_serialized_size_additional_trees(const TreesIndexer &model, size_t old_ntrees) noexcept;
2184
+ ISOTREE_EXPORTED
2185
+ void incremental_serialize_IsoForest(const IsoForest &model, char *old_bytes_reallocated);
2186
+ ISOTREE_EXPORTED
2187
+ void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, char *old_bytes_reallocated);
2188
+ ISOTREE_EXPORTED
2189
+ void incremental_serialize_Imputer(const Imputer &model, char *old_bytes_reallocated);
2190
+ ISOTREE_EXPORTED
2191
+ void incremental_serialize_Indexer(const TreesIndexer &model, char *old_bytes_reallocated);
2192
+ ISOTREE_EXPORTED
2193
+ size_t determine_serialized_size_combined
2194
+ (
2195
+ const IsoForest *model,
2196
+ const ExtIsoForest *model_ext,
2197
+ const Imputer *imputer,
2198
+ const TreesIndexer *indexer,
2199
+ const size_t size_optional_metadata
2200
+ ) noexcept;
2201
+ ISOTREE_EXPORTED
2202
+ size_t determine_serialized_size_combined
2203
+ (
2204
+ const char *serialized_model,
2205
+ const char *serialized_model_ext,
2206
+ const char *serialized_imputer,
2207
+ const char *serialized_indexer,
2208
+ const size_t size_optional_metadata
2209
+ ) noexcept;
2210
+ ISOTREE_EXPORTED
2211
+ void serialize_combined
2212
+ (
2213
+ const IsoForest *model,
2214
+ const ExtIsoForest *model_ext,
2215
+ const Imputer *imputer,
2216
+ const TreesIndexer *indexer,
2217
+ const char *optional_metadata,
2218
+ const size_t size_optional_metadata,
2219
+ char *out
2220
+ );
2221
+ ISOTREE_EXPORTED
2222
+ void serialize_combined
2223
+ (
2224
+ const IsoForest *model,
2225
+ const ExtIsoForest *model_ext,
2226
+ const Imputer *imputer,
2227
+ const TreesIndexer *indexer,
2228
+ const char *optional_metadata,
2229
+ const size_t size_optional_metadata,
2230
+ FILE *out
2231
+ );
2232
+ ISOTREE_EXPORTED
2233
+ void serialize_combined
2234
+ (
2235
+ const IsoForest *model,
2236
+ const ExtIsoForest *model_ext,
2237
+ const Imputer *imputer,
2238
+ const TreesIndexer *indexer,
2239
+ const char *optional_metadata,
2240
+ const size_t size_optional_metadata,
2241
+ std::ostream &out
2242
+ );
2243
+ ISOTREE_EXPORTED
2244
+ std::string serialize_combined
2245
+ (
2246
+ const IsoForest *model,
2247
+ const ExtIsoForest *model_ext,
2248
+ const Imputer *imputer,
2249
+ const TreesIndexer *indexer,
2250
+ const char *optional_metadata,
2251
+ const size_t size_optional_metadata
2252
+ );
2253
+ ISOTREE_EXPORTED
2254
+ void serialize_combined
2255
+ (
2256
+ const char *serialized_model,
2257
+ const char *serialized_model_ext,
2258
+ const char *serialized_imputer,
2259
+ const char *serialized_indexer,
2260
+ const char *optional_metadata,
2261
+ const size_t size_optional_metadata,
2262
+ FILE *out
2263
+ );
2264
+ ISOTREE_EXPORTED
2265
+ void serialize_combined
2266
+ (
2267
+ const char *serialized_model,
2268
+ const char *serialized_model_ext,
2269
+ const char *serialized_imputer,
2270
+ const char *serialized_indexer,
2271
+ const char *optional_metadata,
2272
+ const size_t size_optional_metadata,
2273
+ std::ostream &out
2274
+ );
2275
+ ISOTREE_EXPORTED
2276
+ std::string serialize_combined
2277
+ (
2278
+ const char *serialized_model,
2279
+ const char *serialized_model_ext,
2280
+ const char *serialized_imputer,
2281
+ const char *serialized_indexer,
2282
+ const char *optional_metadata,
2283
+ const size_t size_optional_metadata
2284
+ );
2285
+ ISOTREE_EXPORTED
2286
+ void deserialize_combined
2287
+ (
2288
+ const char* in,
2289
+ IsoForest *model,
2290
+ ExtIsoForest *model_ext,
2291
+ Imputer *imputer,
2292
+ TreesIndexer *indexer,
2293
+ char *optional_metadata
2294
+ );
2295
+ ISOTREE_EXPORTED
2296
+ void deserialize_combined
2297
+ (
2298
+ FILE* in,
2299
+ IsoForest *model,
2300
+ ExtIsoForest *model_ext,
2301
+ Imputer *imputer,
2302
+ TreesIndexer *indexer,
2303
+ char *optional_metadata
2304
+ );
2305
+ ISOTREE_EXPORTED
2306
+ void deserialize_combined
2307
+ (
2308
+ std::istream &in,
2309
+ IsoForest *model,
2310
+ ExtIsoForest *model_ext,
2311
+ Imputer *imputer,
2312
+ TreesIndexer *indexer,
2313
+ char *optional_metadata
2314
+ );
2315
+ ISOTREE_EXPORTED
2316
+ void deserialize_combined
2317
+ (
2318
+ const std::string &in,
2319
+ IsoForest *model,
2320
+ ExtIsoForest *model_ext,
2321
+ Imputer *imputer,
2322
+ TreesIndexer *indexer,
2323
+ char *optional_metadata
2324
+ );
2325
+ bool check_model_has_range_penalty(const IsoForest &model) noexcept;
2326
+ bool check_model_has_range_penalty(const ExtIsoForest &model) noexcept;
2327
+ void add_range_penalty(IsoForest &model) noexcept;
2328
+ void add_range_penalty(ExtIsoForest &model) noexcept;
2329
+ void add_range_penalty(Imputer &model) noexcept;
2330
+ void add_range_penalty(TreesIndexer &model) noexcept;
924
2331
 
925
2332
  /* sql.cpp */
2333
+ ISOTREE_EXPORTED
926
2334
  std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
927
2335
  std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
928
2336
  std::vector<std::vector<std::string>> &categ_levels,
929
2337
  bool output_tree_num, bool index1, bool single_tree, size_t tree_num,
930
2338
  int nthreads);
2339
+ ISOTREE_EXPORTED
931
2340
  std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
932
2341
  std::string &table_from, std::string &select_as,
933
2342
  std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
@@ -935,7 +2344,8 @@ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest
935
2344
  bool index1, int nthreads);
936
2345
  void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
937
2346
  size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
938
- std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right);
2347
+ std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right,
2348
+ const IsoForest *model_outputs, const ExtIsoForest *model_outputs_ext);
939
2349
  void extract_cond_isotree(IsoForest &model, IsoTree &tree,
940
2350
  std::string &cond_left, std::string &cond_right,
941
2351
  std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
@@ -945,7 +2355,9 @@ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
945
2355
  std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
946
2356
  std::vector<std::vector<std::string>> &categ_levels);
947
2357
 
948
- /* dealloc.cpp */
949
- void dealloc_IsoForest(IsoForest &model_outputs);
950
- void dealloc_IsoExtForest(ExtIsoForest &model_outputs_ext);
951
- void dealloc_Imputer(Imputer &imputer);
2358
+ #ifndef _FOR_R
2359
+ #if defined(__clang__)
2360
+ #pragma clang diagnostic pop
2361
+ #endif
2362
+ #endif
2363
+ #endif /* ISOTREE_H */