isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -18,11 +18,29 @@
18
18
  * [5] https://sourceforge.net/projects/iforest/
19
19
  * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
20
  * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
23
41
  *
24
42
  * BSD 2-Clause License
25
- * Copyright (c) 2020, David Cortes
43
+ * Copyright (c) 2019-2022, David Cortes
26
44
  * All rights reserved.
27
45
  * Redistribution and use in source and binary forms, with or without
28
46
  * modification, are permitted provided that the following conditions are met:
@@ -43,73 +61,196 @@
43
61
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
62
  */
45
63
 
64
+ #ifndef ISOTREE_H
65
+ #define ISOTREE_H
66
+
67
+ /* This is only used for the serialization format and might not reflect the
68
+ actual version of the library, do not use for anything else. */
69
+ #define ISOTREE_VERSION_MAJOR 0
70
+ #define ISOTREE_VERSION_MINOR 5
71
+ #define ISOTREE_VERSION_PATCH 6
72
+
73
+ /* For MinGW, needs to be defined before including any headers */
74
+ #if (defined(_WIN32) || defined(_WIN64)) && (SIZE_MAX >= UINT64_MAX)
75
+ # if defined(__GNUG__) || defined(__GNUC__)
76
+ # ifndef _FILE_OFFSET_BITS
77
+ # define _FILE_OFFSET_BITS 64
78
+ # endif
79
+ # endif
80
+ #endif
81
+ #ifdef _MSC_VER
82
+ # define _CRT_SECURE_NO_WARNINGS
83
+ #endif
84
+
85
+
46
86
  /* Standard headers */
47
- #include <stddef.h>
48
- #include <math.h>
49
- #include <limits.h>
50
- #include <string.h>
51
- #include <signal.h>
87
+ #include <cstddef>
88
+ #include <cmath>
89
+ #include <climits>
90
+ #include <cstring>
91
+ #include <cerrno>
52
92
  #include <vector>
53
93
  #include <iterator>
54
94
  #include <numeric>
55
95
  #include <algorithm>
56
96
  #include <random>
57
- #include <unordered_set>
58
- #include <unordered_map>
59
97
  #include <memory>
60
98
  #include <utility>
61
99
  #include <cstdint>
100
+ #include <cinttypes>
101
+ #include <exception>
102
+ #include <stdexcept>
103
+ #include <cassert>
104
+ #include <cfloat>
62
105
  #include <iostream>
63
- #ifndef _FOR_R
64
- #include <stdio.h>
65
- #else
106
+ #include <string>
107
+
108
+ #ifdef _FOR_R
66
109
  extern "C" {
67
110
  #include <R_ext/Print.h>
68
111
  }
69
112
  #define printf Rprintf
70
113
  #define fprintf(f, message) REprintf(message)
114
+ #elif defined(_FOR_PYTHON)
115
+ extern "C" void cy_warning(const char *msg);
116
+ #define fprintf(f, message) cy_warning(message)
117
+ #else
118
+ #include <cstdio>
119
+ using std::printf;
120
+ using std::fprintf;
71
121
  #endif
72
122
  #ifdef _OPENMP
73
123
  #include <omp.h>
74
124
  #endif
75
- #ifdef _ENABLE_CEREAL
76
- #include <cereal/archives/binary.hpp>
77
- #include <cereal/types/vector.hpp>
78
- #include <sstream>
79
- #include <string>
80
- #include <fstream>
125
+ #ifdef _FOR_R
126
+ #include <Rcpp.h>
127
+ #endif
128
+ #include <csignal>
129
+ typedef void (*sig_t_)(int);
130
+ using std::signal;
131
+ using std::raise;
132
+
133
+ using std::size_t;
134
+ using std::memset;
135
+ using std::memcpy;
136
+
137
+ #if defined(__GNUC__) || defined(__clang__)
138
+ #define likely(x) __builtin_expect((bool)(x), true)
139
+ #define unlikely(x) __builtin_expect((bool)(x), false)
140
+ #else
141
+ #define likely(x) (x)
142
+ #define unlikely(x) (x)
143
+ #endif
144
+
145
+ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
146
+ #define unexpected_error() throw std::runtime_error(\
147
+ std::string("Unexpected error in ") + \
148
+ std::string(__FILE__) + \
149
+ std::string(":") + \
150
+ std::to_string(__LINE__) + \
151
+ std::string(". Please open an issue in GitHub with this information, indicating the installed version of 'isotree'.\n"))
152
+ #else
153
+ #define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
81
154
  #endif
82
155
 
83
- /* By default, will use Mersenne-Twister for RNG, but can be switched to something faster */
84
- #ifdef _USE_MERSENNE_TWISTER
156
+ /* By default, will use Xoshiro256++ or Xoshiro128++ for RNG, but can be switched to something faster */
157
+ #ifdef _USE_XOSHIRO
158
+ #include "xoshiro.hpp"
85
159
  #if SIZE_MAX >= UINT64_MAX /* 64-bit systems or higher */
86
- #define RNG_engine std::mt19937_64
160
+ #define RNG_engine Xoshiro::Xoshiro256PP
87
161
  #else /* 32-bit systems and non-standard architectures */
88
- #define RNG_engine std::mt19937
162
+ #define RNG_engine Xoshiro::Xoshiro128PP
163
+ #endif
164
+ #if defined(DBL_MANT_DIG) && (DBL_MANT_DIG == 53) && (FLT_RADIX == 2)
165
+ using Xoshiro::UniformUnitInterval;
166
+ using Xoshiro::UniformMinusOneToOne;
167
+ using Xoshiro::StandardNormalDistr;
168
+ #else
169
+ #define UniformUnitInterval std::uniform_real_distribution<double>
170
+ #define UniformMinusOneToOne std::uniform_real_distribution<double>
171
+ #define StandardNormalDistr std::normal_distribution<double>
172
+ #endif
173
+ #else
174
+ #if defined(_USE_MERSENNE_TWISTER)
175
+ #if SIZE_MAX >= UINT64_MAX /* 64-bit systems or higher */
176
+ #define RNG_engine std::mt19937_64
177
+ #else /* 32-bit systems and non-standard architectures */
178
+ #define RNG_engine std::mt19937
179
+ #endif
180
+ #else
181
+ #define RNG_engine std::default_random_engine
182
+ #endif
183
+
184
+ #define UniformUnitInterval std::uniform_real_distribution<double>
185
+ #define UniformMinusOneToOne std::uniform_real_distribution<double>
186
+ #define StandardNormalDistr std::normal_distribution<double>
187
+ #endif
188
+
189
+ /* At the time of writing, this brought a sizeable speed up compared to
190
+ 'unordered_map' and 'unordered_set' from both GCC and CLANG.
191
+ But perhaps should consider others in the future, such as this:
192
+ https://github.com/ktprime/emhash */
193
+ #if defined(_USE_ROBIN_MAP)
194
+ #ifndef _USE_SYSTEM_ROBIN
195
+ #include "robinmap/include/tsl/robin_growth_policy.h"
196
+ #include "robinmap/include/tsl/robin_hash.h"
197
+ #include "robinmap/include/tsl/robin_set.h"
198
+ #include "robinmap/include/tsl/robin_map.h"
199
+ #else
200
+ #include "tsl/robin_growth_policy.h"
201
+ #include "tsl/robin_hash.h"
202
+ #include "tsl/robin_set.h"
203
+ #include "tsl/robin_map.h"
89
204
  #endif
205
+ #define hashed_set tsl::robin_set
206
+ #define hashed_map tsl::robin_map
90
207
  #else
91
- #define RNG_engine std::default_random_engine
208
+ #include <unordered_set>
209
+ #include <unordered_map>
210
+ #define hashed_set std::unordered_set
211
+ #define hashed_map std::unordered_map
92
212
  #endif
93
213
 
94
214
  /* Short functions */
95
- #define ix_parent(ix) (((ix) - 1) / 2) /* integer division takes care of deciding left-right */
96
- #define ix_child(ix) (2 * (ix) + 1)
97
215
  /* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
98
216
  #define pow2(n) ( ((size_t) 1) << (n) )
217
+ #define div2(n) ((n) >> 1)
218
+ #define mult2(n) ((n) << 1)
219
+ #define ix_parent(ix) (div2((ix) - (size_t)1)) /* integer division takes care of deciding left-right */
220
+ #define ix_child(ix) (mult2(ix) + (size_t)1)
99
221
  #define square(x) ((x) * (x))
222
+ #ifndef _FOR_R
223
+ #if defined(__GNUC__) && (__GNUC__ >= 5)
224
+ #pragma GCC diagnostic push
225
+ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
226
+ #elif defined(__clang__) && !defined(_FOR_R)
227
+ #pragma clang diagnostic push
228
+ #pragma clang diagnostic ignored "-Wuninitialized"
229
+ #endif
230
+ #endif
100
231
  /* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
101
232
  #define extract_bit(number, bit) (((number) >> (bit)) & 1)
102
- #ifndef isinf
103
- #define isinf std::isinf
233
+ #ifndef _FOR_R
234
+ #if defined(__GNUC__) && (__GNUC__ >= 5)
235
+ #pragma GCC diagnostic pop
236
+ #elif defined(__clang__)
237
+ #pragma clang diagnostic pop
238
+ #pragma clang diagnostic push
239
+ #pragma clang diagnostic ignored "-Wunknown-attributes"
240
+ #endif
104
241
  #endif
105
- #ifndef isnan
106
- #define isnan std::isnan
242
+ #define is_na_or_inf(x) (std::isnan(x) || std::isinf(x))
243
+
244
+ /* MSVC doesn't support long doubles, so this avoids unnecessarily increasing library size.
245
+ MinGW supports them but has issues with their computations.
246
+ See https://sourceforge.net/p/mingw-w64/bugs/909/ */
247
+ #if defined(_WIN32) && !defined(NO_LONG_DOUBLE)
248
+ #define NO_LONG_DOUBLE
107
249
  #endif
108
- #define is_na_or_inf(x) (isnan(x) || isinf(x))
109
250
 
110
251
 
111
252
  /* Aliasing for compiler optimizations */
112
- #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
253
+ #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
113
254
  #define restrict __restrict
114
255
  #else
115
256
  #define restrict
@@ -118,7 +259,7 @@
118
259
  /* MSVC is stuck with an OpenMP version that's 19 years old at the time of writing and does not support unsigned iterators */
119
260
  #ifdef _OPENMP
120
261
  #if (_OPENMP < 200801) || defined(_WIN32) || defined(_WIN64) /* OpenMP < 3.0 */
121
- #define size_t_for long
262
+ #define size_t_for long long
122
263
  #else
123
264
  #define size_t_for size_t
124
265
  #endif
@@ -126,33 +267,51 @@
126
267
  #define size_t_for size_t
127
268
  #endif
128
269
 
270
+ #if defined(_FOR_R) || defined(_FOR_PYTHON)
271
+ #define ISOTREE_EXPORTED
272
+ #else
273
+ #if defined(_WIN32)
274
+ #ifdef ISOTREE_COMPILE_TIME
275
+ #define ISOTREE_EXPORTED __declspec(dllexport)
276
+ #else
277
+ #define ISOTREE_EXPORTED __declspec(dllimport)
278
+ #endif
279
+ #else
280
+ #if defined(EXPLICITLTY_EXPORT_SYMBOLS) && defined(ISOTREE_COMPILE_TIME)
281
+ #define ISOTREE_EXPORTED [[gnu::visibility("default")]]
282
+ #else
283
+ #define ISOTREE_EXPORTED
284
+ #endif
285
+ #endif
286
+ #endif
287
+
129
288
 
130
- /* Apple at some point decided to drop OMP library and headersfrom its compiler distribution
289
+ /* Apple at some point decided to drop OMP library and headers from its compiler distribution
131
290
  * and to alias 'gcc' to 'clang', which work differently when given flags they cannot interpret,
132
291
  * causing installation issues with pretty much all scientific software due to OMP headers that
133
292
  * would normally do nothing. This piece of code is to allow compilation without OMP header. */
134
293
  #ifndef _OPENMP
135
- #define omp_get_thread_num() 0
294
+ #define omp_get_thread_num() (0)
136
295
  #endif
137
296
 
297
+ /* Some aggregation functions will prefer more precise data types when the data is large */
298
+ #define THRESHOLD_LONG_DOUBLE (size_t)1e6
138
299
 
139
- /* For sparse matrices */
140
- #ifdef _FOR_R
141
- #define sparse_ix int
142
- #else
143
- #define sparse_ix size_t
144
- #endif
300
+ /* Types used through the package */
301
+ typedef enum NewCategAction {Weighted=0, Smallest=11, Random=12} NewCategAction; /* Weighted means Impute in the extended model */
302
+ typedef enum MissingAction {Divide=21, Impute=22, Fail=0} MissingAction; /* Divide is only for non-extended model */
303
+ typedef enum ColType {Numeric=31, Categorical=32, NotUsed=0} ColType;
304
+ typedef enum CategSplit {SubSet=0, SingleCateg=41} CategSplit;
305
+ typedef enum CoefType {Uniform=61, Normal=0} CoefType; /* For extended model */
306
+ typedef enum UseDepthImp {Lower=71, Higher=0, Same=72} UseDepthImp; /* For NA imputation */
307
+ typedef enum WeighImpRows {Inverse=0, Prop=81, Flat=82} WeighImpRows; /* For NA imputation */
308
+ typedef enum ScoringMetric {Depth=0, Density=92, BoxedDensity=94, BoxedDensity2=96, BoxedRatio=95,
309
+ AdjDepth=91, AdjDensity=93} ScoringMetric;
145
310
 
311
+ /* These are only used internally */
312
+ typedef enum ColCriterion {Uniformly=0, ByRange=1, ByVar=2, ByKurt=3} ColCriterion; /* For proportional choices */
313
+ typedef enum GainCriterion {NoCrit=0, Averaged=1, Pooled=2, FullGain=3, DensityCrit=4} Criterion; /* For guided splits */
146
314
 
147
- /* Types used through the package */
148
- typedef enum NewCategAction {Weighted, Smallest, Random} NewCategAction; /* Weighted means Impute in the extended model */
149
- typedef enum MissingAction {Divide, Impute, Fail} MissingAction; /* Divide is only for non-extended model */
150
- typedef enum ColType {Numeric, Categorical, NotUsed} ColType;
151
- typedef enum CategSplit {SubSet, SingleCateg} CategSplit;
152
- typedef enum GainCriterion {Averaged, Pooled, NoCrit} Criterion; /* For guided splits */
153
- typedef enum CoefType {Uniform, Normal} CoefType; /* For extended model */
154
- typedef enum UseDepthImp {Lower, Higher, Same} UseDepthImp; /* For NA imputation */
155
- typedef enum WeighImpRows {Inverse, Prop, Flat} WeighImpRows; /* For NA imputation */
156
315
 
157
316
  /* Notes about new categorical action:
158
317
  * - For single-variable case, if using 'Smallest', can then pass data at prediction time
@@ -167,10 +326,10 @@ typedef enum WeighImpRows {Inverse, Prop, Flat} WeighImpRows; /
167
326
 
168
327
  /* Structs that are output (modified) from the main function */
169
328
  typedef struct IsoTree {
170
- ColType col_type = NotUsed; /* issues with uninitialized values passed to Cereal */
329
+ ColType col_type = NotUsed; /* issues with uninitialized values when serializing */
171
330
  size_t col_num;
172
331
  double num_split;
173
- std::vector<char> cat_split;
332
+ std::vector<signed char> cat_split;
174
333
  int chosen_cat;
175
334
  size_t tree_left;
176
335
  size_t tree_right;
@@ -180,29 +339,7 @@ typedef struct IsoTree {
180
339
  double range_high = HUGE_VAL;
181
340
  double remainder; /* only used for distance/similarity */
182
341
 
183
- #ifdef _ENABLE_CEREAL
184
- template<class Archive>
185
- void serialize(Archive &archive)
186
- {
187
- archive(
188
- this->col_type,
189
- this->col_num,
190
- this->num_split,
191
- this->cat_split,
192
- this->chosen_cat,
193
- this->tree_left,
194
- this->tree_right,
195
- this->pct_tree_left,
196
- this->score,
197
- this->range_low,
198
- this->range_high,
199
- this->remainder
200
- );
201
- }
202
- #endif
203
-
204
342
  IsoTree() = default;
205
-
206
343
  } IsoTree;
207
344
 
208
345
  typedef struct IsoHPlane {
@@ -223,30 +360,6 @@ typedef struct IsoHPlane {
223
360
  double range_high = HUGE_VAL;
224
361
  double remainder; /* only used for distance/similarity */
225
362
 
226
- #ifdef _ENABLE_CEREAL
227
- template<class Archive>
228
- void serialize(Archive &archive)
229
- {
230
- archive(
231
- this->col_num,
232
- this->col_type,
233
- this->coef,
234
- this->mean,
235
- this->cat_coef,
236
- this->chosen_cat,
237
- this->fill_val,
238
- this->fill_new,
239
- this->split_point,
240
- this->hplane_left,
241
- this->hplane_right,
242
- this->score,
243
- this->range_low,
244
- this->range_high,
245
- this->remainder
246
- );
247
- }
248
- #endif
249
-
250
363
  IsoHPlane() = default;
251
364
  } IsoHPlane;
252
365
 
@@ -258,25 +371,11 @@ typedef struct IsoForest {
258
371
  NewCategAction new_cat_action;
259
372
  CategSplit cat_split_type;
260
373
  MissingAction missing_action;
374
+ ScoringMetric scoring_metric;
261
375
  double exp_avg_depth;
262
376
  double exp_avg_sep;
263
377
  size_t orig_sample_size;
264
-
265
- #ifdef _ENABLE_CEREAL
266
- template<class Archive>
267
- void serialize(Archive &archive)
268
- {
269
- archive(
270
- this->trees,
271
- this->new_cat_action,
272
- this->cat_split_type,
273
- this->missing_action,
274
- this->exp_avg_depth,
275
- this->exp_avg_sep,
276
- this->orig_sample_size
277
- );
278
- }
279
- #endif
378
+ bool has_range_penalty;
280
379
 
281
380
  IsoForest() = default;
282
381
  } IsoForest;
@@ -286,25 +385,11 @@ typedef struct ExtIsoForest {
286
385
  NewCategAction new_cat_action;
287
386
  CategSplit cat_split_type;
288
387
  MissingAction missing_action;
388
+ ScoringMetric scoring_metric;
289
389
  double exp_avg_depth;
290
390
  double exp_avg_sep;
291
391
  size_t orig_sample_size;
292
-
293
- #ifdef _ENABLE_CEREAL
294
- template<class Archive>
295
- void serialize(Archive &archive)
296
- {
297
- archive(
298
- this->hplanes,
299
- this->new_cat_action,
300
- this->cat_split_type,
301
- this->missing_action,
302
- this->exp_avg_depth,
303
- this->exp_avg_sep,
304
- this->orig_sample_size
305
- );
306
- }
307
- #endif
392
+ bool has_range_penalty;
308
393
 
309
394
  ExtIsoForest() = default;
310
395
  } ExtIsoForest;
@@ -316,19 +401,6 @@ typedef struct ImputeNode {
316
401
  std::vector<double> cat_weight;
317
402
  size_t parent;
318
403
 
319
- #ifdef _ENABLE_CEREAL
320
- template<class Archive>
321
- void serialize(Archive &archive)
322
- {
323
- archive(
324
- this->num_sum,
325
- this->num_weight,
326
- this->cat_sum,
327
- this->cat_weight,
328
- this->parent
329
- );
330
- }
331
- #endif
332
404
  ImputeNode() = default;
333
405
 
334
406
  ImputeNode(size_t parent)
@@ -345,30 +417,31 @@ typedef struct Imputer {
345
417
  std::vector<std::vector<ImputeNode>> imputer_tree;
346
418
  std::vector<double> col_means;
347
419
  std::vector<int> col_modes;
420
+
421
+ Imputer() = default;
422
+ } Imputer;
348
423
 
349
- #ifdef _ENABLE_CEREAL
350
- template<class Archive>
351
- void serialize(Archive &archive)
352
- {
353
- archive(
354
- this->ncols_numeric,
355
- this->ncols_categ,
356
- this->ncat,
357
- this->imputer_tree,
358
- this->col_means,
359
- this->col_modes
360
- );
361
- }
362
- #endif
424
+ typedef struct SingleTreeIndex {
425
+ std::vector<size_t> terminal_node_mappings;
426
+ std::vector<double> node_distances;
427
+ std::vector<double> node_depths;
428
+ std::vector<size_t> reference_points;
429
+ std::vector<size_t> reference_indptr;
430
+ std::vector<size_t> reference_mapping;
431
+ size_t n_terminal;
432
+ } TreeNodeIndex;
363
433
 
364
- Imputer() = default;
434
+ typedef struct TreesIndexer {
435
+ std::vector<SingleTreeIndex> indices;
365
436
 
366
- } Imputer;
437
+ TreesIndexer() = default;
438
+ } TreesIndexer;
367
439
 
368
440
 
369
441
  /* Structs that are only used internally */
370
- typedef struct {
371
- double* numeric_data;
442
+ template <class real_t, class sparse_ix>
443
+ struct InputData {
444
+ real_t* numeric_data;
372
445
  size_t ncols_numeric;
373
446
  int* categ_data;
374
447
  int* ncat;
@@ -376,10 +449,10 @@ typedef struct {
376
449
  size_t ncols_categ;
377
450
  size_t nrows;
378
451
  size_t ncols_tot;
379
- double* sample_weights;
452
+ real_t* sample_weights;
380
453
  bool weight_as_sample;
381
- double* col_weights;
382
- double* Xc; /* only for sparse matrices */
454
+ real_t* col_weights;
455
+ real_t* Xc; /* only for sparse matrices */
383
456
  sparse_ix* Xc_ind; /* only for sparse matrices */
384
457
  sparse_ix* Xc_indptr; /* only for sparse matrices */
385
458
  size_t log2_n; /* only when using weights for sampling */
@@ -387,37 +460,58 @@ typedef struct {
387
460
  std::vector<double> btree_weights_init; /* only when using weights for sampling */
388
461
  std::vector<char> has_missing; /* only used when producing missing imputations on-the-fly */
389
462
  size_t n_missing; /* only used when producing missing imputations on-the-fly */
390
- } InputData;
391
-
392
-
393
- typedef struct {
394
- double* numeric_data;
463
+ void* preinitialized_col_sampler; /* only when using column weights */
464
+ double* range_low; /* only when calculating variable ranges or boxed densities with no sub-sampling */
465
+ double* range_high; /* only when calculating variable ranges or boxed densities with no sub-sampling */
466
+ int* ncat_; /* only when calculating boxed densities with no sub-sampling */
467
+ std::vector<double> all_kurtoses; /* only when using 'prob_pick_col_by_kurtosis' or mixing 'weigh_by_kurt' with 'prob_pick_col*' with no sub-sampling */
468
+
469
+ std::vector<double> X_row_major; /* created by this library, only used when calculating full gain */
470
+ std::vector<double> Xr; /* created by this library, only used when calculating full gain */
471
+ std::vector<size_t> Xr_ind; /* created by this library, only used when calculating full gain */
472
+ std::vector<size_t> Xr_indptr; /* created by this library, only used when calculating full gain */
473
+ };
474
+
475
+
476
+ template <class real_t, class sparse_ix>
477
+ struct PredictionData {
478
+ real_t* numeric_data;
395
479
  int* categ_data;
396
480
  size_t nrows;
397
- double* Xc; /* only for sparse matrices */
398
- sparse_ix* Xc_ind; /* only for sparse matrices */
399
- sparse_ix* Xc_indptr; /* only for sparse matrices */
400
- double* Xr; /* only for sparse matrices */
401
- sparse_ix* Xr_ind; /* only for sparse matrices */
402
- sparse_ix* Xr_indptr; /* only for sparse matrices */
403
- } PredictionData;
481
+ bool is_col_major;
482
+ size_t ncols_numeric; /* only required for row-major data */
483
+ size_t ncols_categ; /* only required for row-major data */
484
+ real_t* Xc; /* only for sparse matrices */
485
+ sparse_ix* Xc_ind; /* only for sparse matrices */
486
+ sparse_ix* Xc_indptr; /* only for sparse matrices */
487
+ real_t* Xr; /* only for sparse matrices */
488
+ sparse_ix* Xr_ind; /* only for sparse matrices */
489
+ sparse_ix* Xr_indptr; /* only for sparse matrices */
490
+ };
404
491
 
405
492
  typedef struct {
406
493
  bool with_replacement;
407
494
  size_t sample_size;
408
495
  size_t ntrees;
496
+ size_t ncols_per_tree;
409
497
  size_t max_depth;
410
498
  bool penalize_range;
499
+ bool standardize_data;
411
500
  uint64_t random_seed;
412
501
  bool weigh_by_kurt;
413
502
  double prob_pick_by_gain_avg;
414
- double prob_split_by_gain_avg;
415
503
  double prob_pick_by_gain_pl;
416
- double prob_split_by_gain_pl;
504
+ double prob_pick_by_full_gain;
505
+ double prob_pick_by_dens;
506
+ double prob_pick_col_by_range;
507
+ double prob_pick_col_by_var;
508
+ double prob_pick_col_by_kurt;
417
509
  double min_gain;
418
510
  CategSplit cat_split_type;
419
511
  NewCategAction new_cat_action;
420
512
  MissingAction missing_action;
513
+ ScoringMetric scoring_metric;
514
+ bool fast_bratio;
421
515
  bool all_perm;
422
516
 
423
517
  size_t ndim; /* only for extended model */
@@ -431,16 +525,17 @@ typedef struct {
431
525
 
432
526
  UseDepthImp depth_imp; /* only when building NA imputer */
433
527
  WeighImpRows weigh_imp_rows; /* only when building NA imputer */
434
- size_t min_imp_obs; /* only when building NA imputer */
528
+ size_t min_imp_obs; /* only when building NA imputer */
435
529
  } ModelParams;
436
530
 
437
- typedef struct ImputedData {
438
- std::vector<long double> num_sum;
439
- std::vector<long double> num_weight;
440
- std::vector<std::vector<long double>> cat_sum;
441
- std::vector<long double> cat_weight;
442
- std::vector<long double> sp_num_sum;
443
- std::vector<long double> sp_num_weight;
531
+ template <class sparse_ix, class ldouble_safe>
532
+ struct ImputedData {
533
+ std::vector<ldouble_safe> num_sum;
534
+ std::vector<ldouble_safe> num_weight;
535
+ std::vector<std::vector<ldouble_safe>> cat_sum;
536
+ std::vector<ldouble_safe> cat_weight;
537
+ std::vector<ldouble_safe> sp_num_sum;
538
+ std::vector<ldouble_safe> sp_num_weight;
444
539
 
445
540
  std::vector<size_t> missing_num;
446
541
  std::vector<size_t> missing_cat;
@@ -451,56 +546,288 @@ typedef struct ImputedData {
451
546
 
452
547
  ImputedData() {};
453
548
 
454
- ImputedData(InputData &input_data, size_t row);
455
-
456
- } ImputedData;
549
+ template <class InputData>
550
+ ImputedData(InputData &input_data, size_t row)
551
+ {
552
+ initialize_impute_calc(*this, input_data, row);
553
+ }
457
554
 
458
- typedef struct {
555
+ };
556
+
557
+ /* This class provides efficient methods for sampling columns at random,
558
+ given that at a given node a column might no longer be splittable,
559
+ and when that happens, it also makes it non-splittable in any children
560
+ node from there onwards. The idea is to provide efficient methods for
561
+ passing the state from a parent node to a left node and then restore
562
+ the state before going for the right node.
563
+ It can be used in 3 modes:
564
+ - As a uniform sampler with replacement.
565
+ - As a weighted sampler with replacement.
566
+ - As an array that keeps track of which columns are still splittable. */
567
+ template <class ldouble_safe>
568
+ class ColumnSampler
569
+ {
570
+ public:
571
+ std::vector<size_t> col_indices;
572
+ std::vector<double> tree_weights;
573
+ size_t curr_pos;
574
+ size_t curr_col;
575
+ size_t last_given;
576
+ size_t n_cols;
577
+ size_t tree_levels;
578
+ size_t offset;
579
+ size_t n_dropped;
580
+ template <class real_t>
581
+ void initialize(real_t weights[], size_t n_cols);
582
+ void initialize(size_t n_cols);
583
+ void drop_weights();
584
+ void leave_m_cols(size_t m, RNG_engine &rnd_generator);
585
+ bool sample_col(size_t &col, RNG_engine &rnd_generator);
586
+ void prepare_full_pass(); /* when passing through all columns */
587
+ bool sample_col(size_t &col); /* when passing through all columns */
588
+ void drop_col(size_t col, size_t nobs_left);
589
+ void drop_col(size_t col);
590
+ void drop_from_tail(size_t col);
591
+ void shuffle_remainder(RNG_engine &rnd_generator);
592
+ bool has_weights();
593
+ size_t get_remaining_cols();
594
+ void get_array_remaining_cols(std::vector<size_t> &restrict cols);
595
+ template <class other_t>
596
+ ColumnSampler& operator=(const ColumnSampler<other_t> &other);
597
+ ColumnSampler() = default;
598
+ };
599
+
600
+ template <class ldouble_safe, class real_t>
601
+ class DensityCalculator
602
+ {
603
+ public:
604
+ std::vector<ldouble_safe> multipliers;
605
+ double xmin;
606
+ double xmax;
607
+ std::vector<size_t> counts;
608
+ int n_present;
609
+ int n_left;
610
+ std::vector<double> box_low;
611
+ std::vector<double> box_high;
612
+ std::vector<double> queue_box;
613
+ bool fast_bratio;
614
+ std::vector<ldouble_safe> ranges;
615
+ std::vector<int> ncat;
616
+ std::vector<int> queue_ncat;
617
+ std::vector<int> ncat_orig;
618
+ std::vector<double> vals_ext_box;
619
+ std::vector<double> queue_ext_box;
620
+
621
+ void initialize(size_t max_depth, int max_categ, bool reserve_counts, ScoringMetric scoring_metric);
622
+ template <class InputData>
623
+ #ifndef _FOR_R
624
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
625
+ #endif
626
+ void initialize_bdens(const InputData &input_data,
627
+ const ModelParams &model_params,
628
+ std::vector<size_t> &ix_arr,
629
+ ColumnSampler<ldouble_safe> &col_sampler);
630
+ template <class InputData>
631
+ void initialize_bdens_ext(const InputData &input_data,
632
+ const ModelParams &model_params,
633
+ std::vector<size_t> &ix_arr,
634
+ ColumnSampler<ldouble_safe> &col_sampler,
635
+ bool col_sampler_is_fresh);
636
+ #ifndef _FOR_R
637
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
638
+ #endif
639
+ void push_density(double xmin, double xmax, double split_point);
640
+ void push_density(size_t counts[], int ncat);
641
+ void push_density(int n_left, int n_present);
642
+ void push_density(int n_present);
643
+ void push_density();
644
+ void push_adj(double xmin, double xmax, double split_point, double pct_tree_left, ScoringMetric scoring_metric);
645
+ void push_adj(signed char *restrict categ_present, size_t *restrict counts, int ncat, ScoringMetric scoring_metric);
646
+ void push_adj(size_t *restrict counts, int ncat, int chosen_cat, ScoringMetric scoring_metric);
647
+ void push_adj(double pct_tree_left, ScoringMetric scoring_metric);
648
+ void push_bdens(double split_point, size_t col);
649
+ void push_bdens(int ncat_branch_left, size_t col);
650
+ void push_bdens(const std::vector<signed char> &cat_split, size_t col);
651
+ #ifndef _FOR_R
652
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
653
+ #endif
654
+ void push_bdens_fast_route(double split_point, size_t col);
655
+ void push_bdens_internal(double split_point, size_t col);
656
+ #ifndef _FOR_R
657
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
658
+ #endif
659
+ void push_bdens_fast_route(int ncat_branch_left, size_t col);
660
+ void push_bdens_internal(int ncat_branch_left, size_t col);
661
+ #ifndef _FOR_R
662
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
663
+ #endif
664
+ void push_bdens_fast_route(const std::vector<signed char> &cat_split, size_t col);
665
+ void push_bdens_internal(const std::vector<signed char> &cat_split, size_t col);
666
+ #ifndef _FOR_R
667
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
668
+ #endif
669
+ void push_bdens_ext(const IsoHPlane &hplane, const ModelParams &model_params);
670
+ void pop();
671
+ void pop_right();
672
+ void pop_bdens(size_t col);
673
+ void pop_bdens_right(size_t col);
674
+ void pop_bdens_cat(size_t col);
675
+ void pop_bdens_cat_right(size_t col);
676
+ void pop_bdens_fast_route(size_t col);
677
+ void pop_bdens_internal(size_t col);
678
+ void pop_bdens_right_fast_route(size_t col);
679
+ void pop_bdens_right_internal(size_t col);
680
+ void pop_bdens_cat_fast_route(size_t col);
681
+ void pop_bdens_cat_internal(size_t col);
682
+ void pop_bdens_cat_right_fast_route(size_t col);
683
+ void pop_bdens_cat_right_internal(size_t col);
684
+ void pop_bdens_ext();
685
+ void pop_bdens_ext_right();
686
+ #ifndef _FOR_R
687
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
688
+ #endif
689
+ double calc_density(ldouble_safe remainder, size_t sample_size);
690
+ ldouble_safe calc_adj_depth();
691
+ double calc_adj_density();
692
+ #ifndef _FOR_R
693
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
694
+ #endif
695
+ ldouble_safe calc_bratio_log();
696
+ #ifndef _FOR_R
697
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
698
+ #endif
699
+ ldouble_safe calc_bratio_inv_log();
700
+ #ifndef _FOR_R
701
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
702
+ #endif
703
+ double calc_bratio();
704
+ #ifndef _FOR_R
705
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
706
+ #endif
707
+ double calc_bdens(ldouble_safe remainder, size_t sample_size);
708
+ #ifndef _FOR_R
709
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
710
+ #endif
711
+ double calc_bdens2(ldouble_safe remainder, size_t sample_size);
712
+ #ifndef _FOR_R
713
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
714
+ #endif
715
+ ldouble_safe calc_bratio_log_ext();
716
+ #ifndef _FOR_R
717
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
718
+ #endif
719
+ double calc_bratio_ext();
720
+ #ifndef _FOR_R
721
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
722
+ #endif
723
+ double calc_bdens_ext(ldouble_safe remainder, size_t sample_size);
724
+ void save_range(double xmin, double xmax);
725
+ void restore_range(double &restrict xmin, double &restrict xmax);
726
+ void save_counts(size_t *restrict cat_counts, int ncat);
727
+ void save_n_present_and_left(signed char *restrict split_left, int ncat);
728
+ void save_n_present(size_t *restrict cat_counts, int ncat);
729
+ };
730
+
731
+ template <class ldouble_safe, class real_t>
732
+ class SingleNodeColumnSampler
733
+ {
734
+ public:
735
+ double *restrict weights_orig;
736
+ std::vector<bool> inifinite_weights;
737
+ ldouble_safe cumw;
738
+ size_t n_inf;
739
+ size_t *restrict col_indices;
740
+ size_t curr_pos;
741
+ bool using_tree;
742
+
743
+ bool backup_weights;
744
+ std::vector<double> weights_own;
745
+ size_t n_left;
746
+
747
+ std::vector<double> tree_weights;
748
+ size_t offset;
749
+ size_t tree_levels;
750
+ std::vector<double> used_weights;
751
+ std::vector<size_t> mapped_indices;
752
+ std::vector<size_t> mapped_inf_indices;
753
+
754
+ bool initialize(
755
+ double *restrict weights,
756
+ std::vector<size_t> *col_indices,
757
+ size_t curr_pos,
758
+ size_t n_sample,
759
+ bool backup_weights
760
+ );
761
+
762
+ bool sample_col(size_t &col_chosen, RNG_engine &rnd_generator);
763
+
764
+ void backup(SingleNodeColumnSampler<ldouble_safe, real_t> &other, size_t ncols_tot);
765
+
766
+ void restore(const SingleNodeColumnSampler<ldouble_safe, real_t> &other);
767
+ };
768
+
769
+ template <class ImputedData, class ldouble_safe, class real_t>
770
+ struct WorkerMemory {
459
771
  std::vector<size_t> ix_arr;
460
772
  std::vector<size_t> ix_all;
461
773
  RNG_engine rnd_generator;
462
- std::uniform_int_distribution<size_t> runif;
463
- std::uniform_real_distribution<double> rbin;
774
+ UniformUnitInterval rbin;
464
775
  size_t st;
465
776
  size_t end;
466
777
  size_t st_NA;
467
778
  size_t end_NA;
468
779
  size_t split_ix;
469
- std::unordered_map<size_t, double> weights_map;
470
- std::vector<double> weights_arr; /* when not ignoring NAs and when using weights as density */
780
+ hashed_map<size_t, double> weights_map;
781
+ std::vector<double> weights_arr; /* when not ignoring NAs and when using weights as density */
782
+ bool changed_weights; /* when using 'missing_action'='Divide' or density weights */
471
783
  double xmin;
472
784
  double xmax;
473
- size_t npresent; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
785
+ size_t npresent; /* 'npresent' and 'ncols_tried' are used interchangeably and for unrelated things */
474
786
  bool unsplittable;
475
787
  std::vector<bool> is_repeated;
476
- std::vector<char> categs;
477
- size_t ncols_tried; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
788
+ std::vector<signed char> categs;
789
+ size_t ncols_tried; /* 'npresent' and 'ncols_tried' are used interchangeably and for unrelated things */
478
790
  int ncat_tried;
479
- std::vector<bool> cols_possible;
480
- std::vector<double> btree_weights; /* only when using weights for sampling */
481
- std::discrete_distribution<size_t> col_sampler; /* columns can get eliminated, keep a copy for each thread */
791
+ std::vector<double> btree_weights; /* only when using weights for sampling */
792
+ ColumnSampler<ldouble_safe> col_sampler; /* columns can get eliminated, keep a copy for each thread */
793
+ SingleNodeColumnSampler<ldouble_safe, real_t> node_col_sampler;
794
+ SingleNodeColumnSampler<ldouble_safe, real_t> node_col_sampler_backup;
482
795
 
483
796
  /* for split criterion */
484
797
  std::vector<double> buffer_dbl;
485
798
  std::vector<size_t> buffer_szt;
486
- std::vector<char> buffer_chr;
799
+ std::vector<signed char> buffer_chr;
487
800
  double prob_split_type;
801
+ ColCriterion col_criterion;
488
802
  GainCriterion criterion;
489
803
  double this_gain;
490
804
  double this_split_point;
491
805
  int this_categ;
492
- std::vector<char> this_split_categ;
806
+ std::vector<signed char> this_split_categ;
493
807
  bool determine_split;
808
+ std::vector<double> imputed_x_buffer;
809
+ double saved_xmedian;
810
+ double best_xmedian;
811
+ int saved_cat_mode;
812
+ int best_cat_mode;
813
+ std::vector<size_t> col_indices; /* only for full gain calculation */
814
+
815
+ /* for weighted column choices */
816
+ std::vector<double> node_col_weights;
817
+ std::vector<double> saved_stat1;
818
+ std::vector<double> saved_stat2;
819
+ bool has_saved_stats;
820
+ double* tree_kurtoses; /* only when mixing 'weigh_by_kurt' with 'prob_pick_col*' */
494
821
 
495
822
  /* for the extended model */
496
823
  size_t ntry;
497
824
  size_t ntaken;
498
825
  size_t ntaken_best;
499
- bool tried_all;
500
- size_t col_chosen;
826
+ size_t ntried;
827
+ bool try_all;
828
+ size_t col_chosen; /* also used as placeholder in the single-variable model */
501
829
  ColType col_type;
502
830
  double ext_sd;
503
- std::vector<size_t> cols_shuffled;
504
831
  std::vector<double> comb_val;
505
832
  std::vector<size_t> col_take;
506
833
  std::vector<ColType> col_take_type;
@@ -510,9 +837,10 @@ typedef struct {
510
837
  std::vector<double> ext_fill_val;
511
838
  std::vector<double> ext_fill_new;
512
839
  std::vector<int> chosen_cat;
513
- std::vector<std::vector<double>> ext_cat_coef;
514
- std::uniform_real_distribution<double> coef_unif;
515
- std::normal_distribution<double> coef_norm;
840
+ std::vector<std::vector<double>> ext_cat_coef;
841
+ UniformMinusOneToOne coef_unif;
842
+ StandardNormalDistr coef_norm;
843
+ std::vector<double> sample_weights; /* when using weights and split criterion */
516
844
 
517
845
  /* for similarity/distance calculations */
518
846
  std::vector<double> tmat_sep;
@@ -522,9 +850,11 @@ typedef struct {
522
850
 
523
851
  /* when imputing NAs on-the-fly */
524
852
  std::vector<ImputedData> impute_vec;
525
- std::unordered_map<size_t, ImputedData> impute_map;
853
+ hashed_map<size_t, ImputedData> impute_map;
526
854
 
527
- } WorkerMemory;
855
+ /* for non-depth scoring metric */
856
+ DensityCalculator<ldouble_safe, real_t> density_calculator;
857
+ };
528
858
 
529
859
  typedef struct WorkerForSimilarity {
530
860
  std::vector<size_t> ix_arr;
@@ -538,55 +868,138 @@ typedef struct WorkerForSimilarity {
538
868
  bool assume_full_distr; /* doesn't need to have one copy per worker */
539
869
  } WorkerForSimilarity;
540
870
 
541
- typedef struct {
871
+ typedef struct WorkerForPredictCSC {
872
+ std::vector<size_t> ix_arr;
873
+ size_t st;
874
+ size_t end;
875
+ std::vector<double> comb_val;
876
+ std::vector<double> weights_arr;
877
+ std::vector<double> depths;
878
+ } WorkerForPredictCSC;
879
+
880
+ class RecursionState {
881
+ public:
542
882
  size_t st;
543
883
  size_t st_NA;
544
884
  size_t end_NA;
545
885
  size_t split_ix;
546
886
  size_t end;
887
+ size_t sampler_pos;
888
+ size_t n_dropped;
889
+ bool changed_weights;
890
+ bool full_state;
547
891
  std::vector<size_t> ix_arr;
548
892
  std::vector<bool> cols_possible;
893
+ std::vector<double> col_sampler_weights;
549
894
  std::unique_ptr<double[]> weights_arr;
550
- std::discrete_distribution<size_t> col_sampler;
551
- } RecursionState;
895
+
896
+ RecursionState() = default;
897
+ template <class WorkerMemory>
898
+ RecursionState(WorkerMemory &workspace, bool full_state);
899
+ template <class WorkerMemory>
900
+ void restore_state(WorkerMemory &workspace);
901
+ };
552
902
 
553
903
  /* Function prototypes */
554
904
 
555
905
  /* fit_model.cpp */
556
- extern bool interrupt_switch;
906
+ template <class real_t, class sparse_ix, class ldouble_safe>
907
+ int fit_iforest_internal(
908
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
909
+ real_t numeric_data[], size_t ncols_numeric,
910
+ int categ_data[], size_t ncols_categ, int ncat[],
911
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
912
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
913
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
914
+ size_t nrows, size_t sample_size, size_t ntrees,
915
+ size_t max_depth, size_t ncols_per_tree,
916
+ bool limit_depth, bool penalize_range, bool standardize_data,
917
+ ScoringMetric scoring_metric, bool fast_bratio,
918
+ bool standardize_dist, double tmat[],
919
+ double output_depths[], bool standardize_depth,
920
+ real_t col_weights[], bool weigh_by_kurt,
921
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
922
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
923
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
924
+ double prob_pick_col_by_kurt,
925
+ double min_gain, MissingAction missing_action,
926
+ CategSplit cat_split_type, NewCategAction new_cat_action,
927
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
928
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
929
+ uint64_t random_seed, int nthreads);
930
+ template <class real_t, class sparse_ix>
557
931
  int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
558
- double numeric_data[], size_t ncols_numeric,
932
+ real_t numeric_data[], size_t ncols_numeric,
559
933
  int categ_data[], size_t ncols_categ, int ncat[],
560
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
934
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
561
935
  size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
562
- double sample_weights[], bool with_replacement, bool weight_as_sample,
563
- size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
564
- bool limit_depth, bool penalize_range,
936
+ real_t sample_weights[], bool with_replacement, bool weight_as_sample,
937
+ size_t nrows, size_t sample_size, size_t ntrees,
938
+ size_t max_depth, size_t ncols_per_tree,
939
+ bool limit_depth, bool penalize_range, bool standardize_data,
940
+ ScoringMetric scoring_metric, bool fast_bratio,
565
941
  bool standardize_dist, double tmat[],
566
942
  double output_depths[], bool standardize_depth,
567
- double col_weights[], bool weigh_by_kurt,
568
- double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
569
- double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
943
+ real_t col_weights[], bool weigh_by_kurt,
944
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
945
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
946
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
947
+ double prob_pick_col_by_kurt,
570
948
  double min_gain, MissingAction missing_action,
571
949
  CategSplit cat_split_type, NewCategAction new_cat_action,
572
950
  bool all_perm, Imputer *imputer, size_t min_imp_obs,
573
951
  UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
574
- uint64_t random_seed, bool handle_interrupt, int nthreads);
952
+ uint64_t random_seed, bool use_long_double, int nthreads);
953
+ template <class real_t, class sparse_ix>
575
954
  int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
576
- double numeric_data[], size_t ncols_numeric,
955
+ real_t numeric_data[], size_t ncols_numeric,
577
956
  int categ_data[], size_t ncols_categ, int ncat[],
578
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
957
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
579
958
  size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
580
- double sample_weights[], size_t nrows, size_t max_depth,
581
- bool limit_depth, bool penalize_range,
582
- double col_weights[], bool weigh_by_kurt,
583
- double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
584
- double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
959
+ real_t sample_weights[], size_t nrows,
960
+ size_t max_depth, size_t ncols_per_tree,
961
+ bool limit_depth, bool penalize_range, bool standardize_data,
962
+ bool fast_bratio,
963
+ real_t col_weights[], bool weigh_by_kurt,
964
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
965
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
966
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
967
+ double prob_pick_col_by_kurt,
585
968
  double min_gain, MissingAction missing_action,
586
969
  CategSplit cat_split_type, NewCategAction new_cat_action,
587
970
  UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
588
- bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
971
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
972
+ TreesIndexer *indexer,
973
+ real_t ref_numeric_data[], int ref_categ_data[],
974
+ bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
975
+ real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
976
+ uint64_t random_seed, bool use_long_double);
977
+ template <class real_t, class sparse_ix, class ldouble_safe>
978
+ int add_tree_internal(
979
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
980
+ real_t numeric_data[], size_t ncols_numeric,
981
+ int categ_data[], size_t ncols_categ, int ncat[],
982
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
983
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
984
+ real_t sample_weights[], size_t nrows,
985
+ size_t max_depth, size_t ncols_per_tree,
986
+ bool limit_depth, bool penalize_range, bool standardize_data,
987
+ bool fast_bratio,
988
+ real_t col_weights[], bool weigh_by_kurt,
989
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
990
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
991
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
992
+ double prob_pick_col_by_kurt,
993
+ double min_gain, MissingAction missing_action,
994
+ CategSplit cat_split_type, NewCategAction new_cat_action,
995
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
996
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
997
+ TreesIndexer *indexer,
998
+ real_t ref_numeric_data[], int ref_categ_data[],
999
+ bool ref_is_col_major, size_t ref_ld_numeric, size_t ref_ld_categ,
1000
+ real_t ref_Xc[], sparse_ix ref_Xc_ind[], sparse_ix ref_Xc_indptr[],
589
1001
  uint64_t random_seed);
1002
+ template <class InputData, class WorkerMemory, class ldouble_safe>
590
1003
  void fit_itree(std::vector<IsoTree> *tree_root,
591
1004
  std::vector<IsoHPlane> *hplane_root,
592
1005
  WorkerMemory &workspace,
@@ -596,6 +1009,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
596
1009
  size_t tree_num);
597
1010
 
598
1011
  /* isoforest.cpp */
1012
+ template <class InputData, class WorkerMemory, class ldouble_safe>
599
1013
  void split_itree_recursive(std::vector<IsoTree> &trees,
600
1014
  WorkerMemory &workspace,
601
1015
  InputData &input_data,
@@ -604,31 +1018,55 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
604
1018
  size_t curr_depth);
605
1019
 
606
1020
  /* extended.cpp */
1021
+ template <class InputData, class WorkerMemory, class ldouble_safe>
607
1022
  void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
608
1023
  WorkerMemory &workspace,
609
1024
  InputData &input_data,
610
1025
  ModelParams &model_params,
611
1026
  std::vector<ImputeNode> *impute_nodes,
612
1027
  size_t curr_depth);
1028
+ template <class InputData, class WorkerMemory, class ldouble_safe>
613
1029
  void add_chosen_column(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params,
614
- std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s);
1030
+ std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s);
615
1031
  void shrink_to_fit_hplane(IsoHPlane &hplane, bool clear_vectors);
1032
+ template <class InputData, class WorkerMemory>
616
1033
  void simplify_hplane(IsoHPlane &hplane, WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
617
1034
 
618
1035
 
619
1036
  /* predict.cpp */
620
- void predict_iforest(double numeric_data[], int categ_data[],
621
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
622
- double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
1037
+ template <class real_t, class sparse_ix>
1038
+ #ifndef _FOR_R
1039
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno"), gnu::hot]]
1040
+ #endif
1041
+ void predict_iforest(real_t *restrict numeric_data, int *restrict categ_data,
1042
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1043
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1044
+ real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
623
1045
  size_t nrows, int nthreads, bool standardize,
624
1046
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
625
- double output_depths[], sparse_ix tree_num[]);
1047
+ double *restrict output_depths, sparse_ix *restrict tree_num,
1048
+ double *restrict per_tree_depths,
1049
+ TreesIndexer *indexer);
1050
+ template <class real_t, class sparse_ix>
1051
+ [[gnu::hot]]
1052
+ void traverse_itree_fast(std::vector<IsoTree> &tree,
1053
+ IsoForest &model_outputs,
1054
+ real_t *restrict row_numeric_data,
1055
+ double &restrict output_depth,
1056
+ sparse_ix *restrict tree_num,
1057
+ double *restrict tree_depth,
1058
+ size_t row) noexcept;
1059
+ template <class PredictionData, class sparse_ix>
1060
+ [[gnu::hot]]
626
1061
  void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
627
1062
  IsoForest &model_outputs,
628
1063
  PredictionData &prediction_data,
629
- double &output_depth,
1064
+ double &restrict output_depth,
630
1065
  sparse_ix *restrict tree_num,
631
- size_t row);
1066
+ double *restrict tree_depth,
1067
+ size_t row) noexcept;
1068
+ template <class PredictionData, class sparse_ix, class ImputedData>
1069
+ [[gnu::hot]]
632
1070
  double traverse_itree(std::vector<IsoTree> &tree,
633
1071
  IsoForest &model_outputs,
634
1072
  PredictionData &prediction_data,
@@ -637,63 +1075,181 @@ double traverse_itree(std::vector<IsoTree> &tree,
637
1075
  double curr_weight,
638
1076
  size_t row,
639
1077
  sparse_ix *restrict tree_num,
640
- size_t curr_lev);
641
- void traverse_hplane_fast(std::vector<IsoHPlane> &hplane,
642
- ExtIsoForest &model_outputs,
643
- PredictionData &prediction_data,
644
- double &output_depth,
645
- sparse_ix *restrict tree_num,
646
- size_t row);
1078
+ double *restrict tree_depth,
1079
+ size_t curr_lev) noexcept;
1080
+ template <class PredictionData, class sparse_ix>
1081
+ [[gnu::hot]]
1082
+ void traverse_hplane_fast_colmajor(std::vector<IsoHPlane> &hplane,
1083
+ ExtIsoForest &model_outputs,
1084
+ PredictionData &prediction_data,
1085
+ double &restrict output_depth,
1086
+ sparse_ix *restrict tree_num,
1087
+ double *restrict tree_depth,
1088
+ size_t row) noexcept;
1089
+ template <class real_t, class sparse_ix>
1090
+ [[gnu::hot]]
1091
+ void traverse_hplane_fast_rowmajor(std::vector<IsoHPlane> &hplane,
1092
+ ExtIsoForest &model_outputs,
1093
+ real_t *restrict row_numeric_data,
1094
+ double &restrict output_depth,
1095
+ sparse_ix *restrict tree_num,
1096
+ double *restrict tree_depth,
1097
+ size_t row) noexcept;
1098
+ template <class PredictionData, class sparse_ix, class ImputedData>
1099
+ [[gnu::hot]]
647
1100
  void traverse_hplane(std::vector<IsoHPlane> &hplane,
648
1101
  ExtIsoForest &model_outputs,
649
1102
  PredictionData &prediction_data,
650
- double &output_depth,
1103
+ double &restrict output_depth,
651
1104
  std::vector<ImputeNode> *impute_nodes,
652
1105
  ImputedData *imputed_data,
653
1106
  sparse_ix *restrict tree_num,
654
- size_t row);
655
- double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num);
656
- double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num);
657
- void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads);
658
- void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads);
1107
+ double *restrict tree_depth,
1108
+ size_t row) noexcept;
1109
+ template <class real_t, class sparse_ix>
1110
+ void batched_csc_predict(PredictionData<real_t, sparse_ix> &prediction_data, int nthreads,
1111
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1112
+ double *restrict output_depths, sparse_ix *restrict tree_num,
1113
+ double *restrict per_tree_depths);
1114
+ template <class PredictionData, class sparse_ix>
1115
+ void traverse_itree_csc(WorkerForPredictCSC &workspace,
1116
+ std::vector<IsoTree> &trees,
1117
+ IsoForest &model_outputs,
1118
+ PredictionData &prediction_data,
1119
+ sparse_ix *restrict tree_num,
1120
+ double *restrict per_tree_depths,
1121
+ size_t curr_tree,
1122
+ bool has_range_penalty);
1123
+ template <class PredictionData, class sparse_ix>
1124
+ void traverse_hplane_csc(WorkerForPredictCSC &workspace,
1125
+ std::vector<IsoHPlane> &hplanes,
1126
+ ExtIsoForest &model_outputs,
1127
+ PredictionData &prediction_data,
1128
+ sparse_ix *restrict tree_num,
1129
+ double *restrict per_tree_depths,
1130
+ size_t curr_tree,
1131
+ bool has_range_penalty);
1132
+ template <class PredictionData>
1133
+ void add_csc_range_penalty(WorkerForPredictCSC &workspace,
1134
+ PredictionData &prediction_data,
1135
+ double *restrict weights_arr,
1136
+ size_t col_num,
1137
+ double range_low,
1138
+ double range_high);
1139
+ template <class PredictionData>
1140
+ double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num) noexcept;
1141
+ template <class PredictionData, class sparse_ix>
1142
+ static inline double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num, size_t lb, size_t ub) noexcept;
1143
+ template <class PredictionData, class sparse_ix>
1144
+ double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num) noexcept;
1145
+ template <class sparse_ix>
1146
+ void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept;
1147
+ template <class sparse_ix>
1148
+ void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept;
659
1149
 
660
1150
  /* dist.cpp */
661
- void calc_similarity(double numeric_data[], int categ_data[],
662
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
663
- size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
1151
+ template <class real_t, class sparse_ix>
1152
+ void calc_similarity(real_t numeric_data[], int categ_data[],
1153
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1154
+ size_t nrows, bool use_long_double, int nthreads,
1155
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
664
1156
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
665
- double tmat[], double rmat[], size_t n_from);
1157
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
1158
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ);
1159
+ template <class real_t, class sparse_ix, class ldouble_safe>
1160
+ void calc_similarity_internal(
1161
+ real_t numeric_data[], int categ_data[],
1162
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1163
+ size_t nrows, int nthreads,
1164
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
1165
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1166
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
1167
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ);
1168
+ template <class PredictionData, class ldouble_safe>
666
1169
  void traverse_tree_sim(WorkerForSimilarity &workspace,
667
1170
  PredictionData &prediction_data,
668
1171
  IsoForest &model_outputs,
669
1172
  std::vector<IsoTree> &trees,
670
- size_t curr_tree);
1173
+ size_t curr_tree,
1174
+ const bool as_kernel);
1175
+ template <class PredictionData, class ldouble_safe>
671
1176
  void traverse_hplane_sim(WorkerForSimilarity &workspace,
672
1177
  PredictionData &prediction_data,
673
1178
  ExtIsoForest &model_outputs,
674
1179
  std::vector<IsoHPlane> &hplanes,
675
- size_t curr_tree);
1180
+ size_t curr_tree,
1181
+ const bool as_kernel);
1182
+ template <class PredictionData, class InputData, class WorkerMemory>
1183
+ #ifndef _FOR_R
1184
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1185
+ #endif
676
1186
  void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
677
1187
  std::vector<WorkerMemory> *worker_memory_m,
678
1188
  PredictionData *prediction_data, InputData *input_data,
679
1189
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
680
1190
  double *restrict tmat, double *restrict rmat, size_t n_from,
681
1191
  size_t ntrees, bool assume_full_distr,
682
- bool standardize_dist, int nthreads);
1192
+ bool standardize_dist, bool as_kernel, int nthreads);
1193
+ template <class PredictionData>
683
1194
  void initialize_worker_for_sim(WorkerForSimilarity &workspace,
684
1195
  PredictionData &prediction_data,
685
1196
  IsoForest *model_outputs,
686
1197
  ExtIsoForest *model_outputs_ext,
687
1198
  size_t n_from,
688
1199
  bool assume_full_distr);
1200
+ template <class real_t, class sparse_ix>
1201
+ #ifndef _FOR_R
1202
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1203
+ #endif
1204
+ void calc_similarity_from_indexer
1205
+ (
1206
+ real_t *restrict numeric_data, int *restrict categ_data,
1207
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1208
+ size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
1209
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1210
+ double *restrict tmat, double *restrict rmat, size_t n_from,
1211
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
1212
+ );
1213
+ template <class real_t, class sparse_ix>
1214
+ #ifndef _FOR_R
1215
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1216
+ #endif
1217
+ void calc_similarity_from_indexer_with_references
1218
+ (
1219
+ real_t *restrict numeric_data, int *restrict categ_data,
1220
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1221
+ size_t nrows, int nthreads, bool standardize_dist,
1222
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1223
+ double *restrict rmat,
1224
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
1225
+ );
1226
+ template <class real_t, class sparse_ix>
1227
+ void kernel_to_references(TreesIndexer &indexer,
1228
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1229
+ real_t *restrict numeric_data, int *restrict categ_data,
1230
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1231
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1232
+ size_t nrows, int nthreads,
1233
+ double *restrict rmat,
1234
+ bool standardize);
689
1235
 
690
1236
  /* impute.cpp */
691
- void impute_missing_values(double numeric_data[], int categ_data[],
692
- double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
1237
+ template <class real_t, class sparse_ix>
1238
+ void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
1239
+ real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
1240
+ size_t nrows, bool use_long_double, int nthreads,
1241
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1242
+ Imputer &imputer);
1243
+ template <class real_t, class sparse_ix, class ldouble_safe>
1244
+ void impute_missing_values_internal(
1245
+ real_t numeric_data[], int categ_data[], bool is_col_major,
1246
+ real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
693
1247
  size_t nrows, int nthreads,
694
1248
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
695
1249
  Imputer &imputer);
1250
+ template <class InputData, class ldouble_safe>
696
1251
  void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads);
1252
+ template <class InputData, class WorkerMemory, class ldouble_safe>
697
1253
  void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
698
1254
  InputData &input_data, ModelParams &model_params,
699
1255
  std::vector<ImputeNode> &imputer_tree,
@@ -702,232 +1258,1085 @@ void shrink_impute_node(ImputeNode &imputer);
702
1258
  void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
703
1259
  std::vector<IsoTree> *trees,
704
1260
  std::vector<IsoHPlane> *hplanes);
705
- void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto);
1261
+ template <class ImputedData>
1262
+ void combine_imp_single(ImputedData &restrict imp_addfrom, ImputedData &restrict imp_addto);
1263
+ template <class ImputedData, class WorkerMemory>
706
1264
  void combine_tree_imputations(WorkerMemory &workspace,
707
1265
  std::vector<ImputedData> &impute_vec,
708
- std::unordered_map<size_t, ImputedData> &impute_map,
1266
+ hashed_map<size_t, ImputedData> &impute_map,
709
1267
  std::vector<char> &has_missing,
710
1268
  int nthreads);
1269
+ template <class ImputedData>
711
1270
  void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w);
1271
+ template <class InputData, class WorkerMemory>
712
1272
  void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data);
713
- template <class imp_arr>
1273
+ template <class imp_arr, class InputData>
714
1274
  void apply_imputation_results(imp_arr &impute_vec,
715
1275
  Imputer &imputer,
716
1276
  InputData &input_data,
717
1277
  int nthreads);
1278
+ template <class ImputedData, class InputData>
718
1279
  void apply_imputation_results(std::vector<ImputedData> &impute_vec,
719
- std::unordered_map<size_t, ImputedData> &impute_map,
1280
+ hashed_map<size_t, ImputedData> &impute_map,
720
1281
  Imputer &imputer,
721
1282
  InputData &input_data,
722
1283
  int nthreads);
1284
+ template <class PredictionData, class ImputedData>
723
1285
  void apply_imputation_results(PredictionData &prediction_data,
724
1286
  ImputedData &imp,
725
1287
  Imputer &imputer,
726
1288
  size_t row);
1289
+ template <class ImputedData, class InputData>
727
1290
  void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row);
1291
+ template <class ImputedData, class PredictionData>
728
1292
  void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row);
1293
+ template <class ImputedData, class InputData>
729
1294
  void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads);
730
- void allocate_imp_map(std::unordered_map<size_t, ImputedData> &impute_map, InputData &input_data);
1295
+ template <class ImputedData, class InputData>
1296
+ void allocate_imp_map(hashed_map<size_t, ImputedData> &impute_map, InputData &input_data);
1297
+ template <class ImputedData, class InputData>
731
1298
  void allocate_imp(InputData &input_data,
732
1299
  std::vector<ImputedData> &impute_vec,
733
- std::unordered_map<size_t, ImputedData> &impute_map,
1300
+ hashed_map<size_t, ImputedData> &impute_map,
734
1301
  int nthreads);
1302
+ template <class ImputedData, class InputData>
735
1303
  void check_for_missing(InputData &input_data,
736
1304
  std::vector<ImputedData> &impute_vec,
737
- std::unordered_map<size_t, ImputedData> &impute_map,
1305
+ hashed_map<size_t, ImputedData> &impute_map,
738
1306
  int nthreads);
1307
+ template <class PredictionData>
739
1308
  size_t check_for_missing(PredictionData &prediction_data,
740
1309
  Imputer &imputer,
741
1310
  size_t ix_arr[],
742
1311
  int nthreads);
743
1312
 
744
1313
  /* helpers_iforest.cpp */
745
- void decide_column(size_t ncols_numeric, size_t ncols_categ, size_t &col_chosen, ColType &col_type,
746
- RNG_engine &rnd_generator, std::uniform_int_distribution<size_t> &runif,
747
- std::discrete_distribution<size_t> &col_sampler);
748
- void add_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data);
749
- void add_unsplittable_col(WorkerMemory &workspace, InputData &input_data);
750
- bool check_is_not_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data);
1314
+ static inline size_t get_ntrees(const IsoForest &model)
1315
+ {
1316
+ return model.trees.size();
1317
+ }
1318
+
1319
+ static inline size_t get_ntrees(const ExtIsoForest &model)
1320
+ {
1321
+ return model.hplanes.size();
1322
+ }
1323
+
1324
+ static inline size_t get_ntrees(const Imputer &model)
1325
+ {
1326
+ return model.imputer_tree.size();
1327
+ }
1328
+
1329
+ static inline size_t get_ntrees(const TreesIndexer &model)
1330
+ {
1331
+ return model.indices.size();
1332
+ }
1333
+ template <class InputData, class WorkerMemory>
751
1334
  void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params, IsoTree &tree);
1335
+ template <class InputData, class WorkerMemory>
752
1336
  void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
1337
+ template <class InputData, class WorkerMemory>
1338
+ void get_split_range_v2(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
1339
+ template <class InputData, class WorkerMemory>
753
1340
  int choose_cat_from_present(WorkerMemory &workspace, InputData &input_data, size_t col_num);
754
- void update_col_sampler(WorkerMemory &workspace, InputData &input_data);
755
- bool is_col_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
756
- InputData &input_data, size_t col_num, ColType col_type);
757
- void set_col_as_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
1341
+ bool is_col_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
1342
+ size_t col_num);
1343
+ template <class InputData>
1344
+ void set_col_as_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
758
1345
  InputData &input_data, size_t col_num, ColType col_type);
1346
+ template <class InputData>
1347
+ void set_col_as_taken(std::vector<bool> &col_is_taken, hashed_set<size_t> &col_is_taken_s,
1348
+ InputData &input_data, size_t col_num);
1349
+ template <class InputData, class WorkerMemory>
759
1350
  void add_separation_step(WorkerMemory &workspace, InputData &input_data, double remainder);
760
- void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, long double sum_weight);
1351
+ template <class InputData, class WorkerMemory, class ldouble_safe>
1352
+ void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, ldouble_safe sum_weight);
1353
+ template <class PredictionData, class sparse_ix>
761
1354
  void remap_terminal_trees(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
762
1355
  PredictionData &prediction_data, sparse_ix *restrict tree_num, int nthreads);
763
- void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state);
764
- void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state);
1356
+ template <class InputData, class ldouble_safe>
1357
+ std::vector<double> calc_kurtosis_all_data(InputData &input_data, ModelParams &model_params, RNG_engine &rnd_generator);
1358
+ template <class InputData, class WorkerMemory>
1359
+ void calc_ranges_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
1360
+ double *restrict ranges, double *restrict saved_xmin, double *restrict saved_xmax);
1361
+ template <class InputData, class WorkerMemory, class ldouble_safe>
1362
+ void calc_var_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
1363
+ double *restrict variances, double *restrict saved_xmin, double *restrict saved_xmax,
1364
+ double *restrict saved_means, double *restrict saved_sds);
1365
+ template <class InputData, class WorkerMemory, class ldouble_safe>
1366
+ void calc_kurt_all_cols(InputData &input_data, WorkerMemory &workspace, ModelParams &model_params,
1367
+ double *restrict kurtosis, double *restrict saved_xmin, double *restrict saved_xmax);
1368
+ bool is_boxed_metric(const ScoringMetric scoring_metric);
765
1369
 
766
1370
 
767
1371
  /* utils.cpp */
1372
+ #define ix_comb_(i, j, n, ncomb) ( ((ncomb) + ((j) - (i))) - (size_t)1 - div2(((n) - (i)) * ((n) - (i) - (size_t)1)) )
1373
+ #define ix_comb(i, j, n, ncomb) ( ((i) < (j))? ix_comb_(i, j, n, ncomb) : ix_comb_(j, i, n, ncomb) )
1374
+ #define calc_ncomb(n) (((n) % 2) == 0)? (div2(n) * ((n)-(size_t)1)) : ((n) * div2((n)-(size_t)1))
768
1375
  size_t log2ceil(size_t x);
1376
+ #ifndef _FOR_R
1377
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1378
+ #endif
1379
+ double digamma(double x);
1380
+ template <class ldouble_safe>
1381
+ #ifndef _FOR_R
1382
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1383
+ #endif
769
1384
  double harmonic(size_t n);
770
1385
  double harmonic_recursive(double a, double b);
1386
+ template <class ldouble_safe>
771
1387
  double expected_avg_depth(size_t sample_size);
772
- double expected_avg_depth(long double approx_sample_size);
1388
+ template <class ldouble_safe>
1389
+ #ifndef _FOR_R
1390
+ [[gnu::optimize("no-trapping-math"), gnu::optimize("no-math-errno")]]
1391
+ #endif
1392
+ double expected_avg_depth(ldouble_safe approx_sample_size);
773
1393
  double expected_separation_depth(size_t n);
774
1394
  double expected_separation_depth_hotstart(double curr, size_t n_curr, size_t n_final);
775
- double expected_separation_depth(long double n);
1395
+ template <class ldouble_safe>
1396
+ double expected_separation_depth(ldouble_safe n);
776
1397
  void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n, double counter[], double exp_remainder);
777
1398
  void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
778
1399
  double *restrict counter, double *restrict weights, double exp_remainder);
779
1400
  void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
780
- double counter[], std::unordered_map<size_t, double> &weights, double exp_remainder);
1401
+ double counter[], hashed_map<size_t, double> &weights, double exp_remainder);
781
1402
  void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
782
1403
  double counter[], double exp_remainder);
783
1404
  void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
784
1405
  double *restrict counter, double *restrict weights, double exp_remainder);
785
- void tmat_to_dense(double *restrict tmat, double *restrict dmat, size_t n, bool diag_to_one);
786
- double calc_sd_raw(size_t cnt, long double sum, long double sum_sq);
787
- long double calc_sd_raw_l(size_t cnt, long double sum, long double sum_sq);
788
- void build_btree_sampler(std::vector<double> &btree_weights, double *restrict sample_weights,
789
- size_t nrows, size_t &log2_n, size_t &btree_offset);
790
- void sample_random_rows(std::vector<size_t> &ix_arr, size_t nrows, bool with_replacement,
791
- RNG_engine &rnd_generator, std::vector<size_t> &ix_all,
792
- double sample_weights[], std::vector<double> &btree_weights,
1406
+ void tmat_to_dense(double *restrict tmat, double *restrict dmat, size_t n, double fill_diag);
1407
+ template <class real_t=double>
1408
+ void build_btree_sampler(std::vector<double> &btree_weights, real_t *restrict sample_weights,
1409
+ size_t nrows, size_t &restrict log2_n, size_t &restrict btree_offset);
1410
+ template <class real_t=double, class ldouble_safe>
1411
+ void sample_random_rows(std::vector<size_t> &restrict ix_arr, size_t nrows, bool with_replacement,
1412
+ RNG_engine &rnd_generator, std::vector<size_t> &restrict ix_all,
1413
+ real_t *restrict sample_weights, std::vector<double> &restrict btree_weights,
793
1414
  size_t log2_n, size_t btree_offset, std::vector<bool> &is_repeated);
794
- void weighted_shuffle(size_t *restrict outp, size_t n, double *restrict weights, double *restrict buffer_arr, RNG_engine &rnd_generator);
795
- size_t divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point);
796
- void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point,
797
- MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
798
- void divide_subset_split(size_t ix_arr[], size_t st, size_t end, size_t col_num,
799
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[], double split_point,
800
- MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
801
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char split_categ[],
802
- MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
803
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char split_categ[],
1415
+ template <class real_t=double>
1416
+ void weighted_shuffle(size_t *restrict outp, size_t n, real_t *restrict weights, double *restrict buffer_arr, RNG_engine &rnd_generator);
1417
+ double sample_random_uniform(double xmin, double xmax, RNG_engine &rng) noexcept;
1418
+ size_t divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point) noexcept;
1419
+ template <class real_t=double>
1420
+ void divide_subset_split(size_t *restrict ix_arr, real_t x[], size_t st, size_t end, double split_point,
1421
+ MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1422
+ template <class real_t, class sparse_ix>
1423
+ void divide_subset_split(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1424
+ real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr, double split_point,
1425
+ MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1426
+ void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, signed char split_categ[],
1427
+ MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1428
+ void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, signed char split_categ[],
804
1429
  int ncat, MissingAction missing_action, NewCategAction new_cat_action,
805
- bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
806
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_categ,
807
- MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
808
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end,
1430
+ bool move_new_to_left, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1431
+ void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end, int split_categ,
1432
+ MissingAction missing_action, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1433
+ void divide_subset_split(size_t *restrict ix_arr, int x[], size_t st, size_t end,
809
1434
  MissingAction missing_action, NewCategAction new_cat_action,
810
- bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
811
- void get_range(size_t ix_arr[], double x[], size_t st, size_t end,
812
- MissingAction missing_action, double &xmin, double &xmax, bool &unsplittable);
813
- void get_range(size_t ix_arr[], size_t st, size_t end, size_t col_num,
814
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
815
- MissingAction missing_action, double &xmin, double &xmax, bool &unsplittable);
816
- void get_categs(size_t ix_arr[], int x[], size_t st, size_t end, int ncat,
817
- MissingAction missing_action, char categs[], size_t &npresent, bool &unsplittable);
818
- long double calculate_sum_weights(std::vector<size_t> &ix_arr, size_t st, size_t end, size_t curr_depth,
819
- std::vector<double> &weights_arr, std::unordered_map<size_t, double> &weights_map);
1435
+ bool move_new_to_left, size_t &restrict st_NA, size_t &restrict end_NA, size_t &restrict split_ix) noexcept;
1436
+ template <class real_t=double>
1437
+ void get_range(size_t ix_arr[], real_t *restrict x, size_t st, size_t end,
1438
+ MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
1439
+ template <class real_t>
1440
+ void get_range(real_t *restrict x, size_t n,
1441
+ MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
1442
+ template <class real_t, class sparse_ix>
1443
+ void get_range(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1444
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1445
+ MissingAction missing_action, double &restrict xmin_, double &restrict xmax_, bool &unsplittable) noexcept;
1446
+ template <class real_t, class sparse_ix>
1447
+ void get_range(size_t col_num, size_t nrows,
1448
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1449
+ MissingAction missing_action, double &restrict xmin, double &restrict xmax, bool &unsplittable) noexcept;
1450
+ void get_categs(size_t *restrict ix_arr, int x[], size_t st, size_t end, int ncat,
1451
+ MissingAction missing_action, signed char categs[], size_t &restrict npresent, bool &unsplittable) noexcept;
1452
+ template <class real_t>
1453
+ bool check_more_than_two_unique_values(size_t ix_arr[], size_t st, size_t end, real_t x[], MissingAction missing_action);
1454
+ bool check_more_than_two_unique_values(size_t ix_arr[], size_t st, size_t end, int x[], MissingAction missing_action);
1455
+ template <class real_t, class sparse_ix>
1456
+ bool check_more_than_two_unique_values(size_t *restrict ix_arr, size_t st, size_t end, size_t col,
1457
+ sparse_ix *restrict Xc_indptr, sparse_ix *restrict Xc_ind, real_t *restrict Xc,
1458
+ MissingAction missing_action);
1459
+ template <class real_t, class sparse_ix>
1460
+ bool check_more_than_two_unique_values(size_t nrows, size_t col,
1461
+ sparse_ix *restrict Xc_indptr, sparse_ix *restrict Xc_ind, real_t *restrict Xc,
1462
+ MissingAction missing_action);
1463
+ void count_categs(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat, size_t *restrict counts);
1464
+ int count_ncateg_in_col(const int x[], const size_t n, const int ncat, unsigned char buffer[]);
1465
+ template <class ldouble_safe>
1466
+ ldouble_safe calculate_sum_weights(std::vector<size_t> &ix_arr, size_t st, size_t end, size_t curr_depth,
1467
+ std::vector<double> &weights_arr, hashed_map<size_t, double> &weights_map);
1468
+ extern bool interrupt_switch;
1469
+ extern bool signal_is_locked;
820
1470
  void set_interrup_global_variable(int s);
1471
+ #ifdef _FOR_PYTHON
1472
+ bool cy_check_interrupt_switch();
1473
+ void cy_tick_off_interrupt_switch();
1474
+ #endif
1475
+ class SignalSwitcher
1476
+ {
1477
+ public:
1478
+ sig_t_ old_sig;
1479
+ bool is_active;
1480
+ SignalSwitcher();
1481
+ ~SignalSwitcher();
1482
+ void restore_handle();
1483
+ };
1484
+ void check_interrupt_switch(SignalSwitcher &ss);
1485
+ bool has_long_double();
821
1486
  int return_EXIT_SUCCESS();
822
1487
  int return_EXIT_FAILURE();
823
1488
 
824
1489
 
825
1490
 
826
- size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, double x[]);
827
- size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, size_t col_num, double Xc[], size_t Xc_ind[], size_t Xc_indptr[]);
1491
+ template <class real_t=double>
1492
+ size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, real_t x[]);
1493
+ template <class real_t, class sparse_ix>
1494
+ size_t move_NAs_to_front(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr);
828
1495
  size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, int x[]);
829
- size_t center_NAs(size_t *restrict ix_arr, size_t st_left, size_t st, size_t curr_pos);
830
- void todense(size_t ix_arr[], size_t st, size_t end,
831
- size_t col_num, double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1496
+ size_t center_NAs(size_t ix_arr[], size_t st_left, size_t st, size_t curr_pos);
1497
+ template <class real_t>
1498
+ void fill_NAs_with_median(size_t *restrict ix_arr, size_t st_orig, size_t st, size_t end, real_t *restrict x,
1499
+ double *restrict buffer_imputed_x, double *restrict xmedian);
1500
+ template <class real_t, class sparse_ix>
1501
+ void todense(size_t *restrict ix_arr, size_t st, size_t end,
1502
+ size_t col_num, real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
832
1503
  double *restrict buffer_arr);
1504
+ template <class real_t>
1505
+ void colmajor_to_rowmajor(real_t *restrict X, size_t nrows, size_t ncols, std::vector<double> &X_row_major);
1506
+ template <class real_t, class sparse_ix>
1507
+ void colmajor_to_rowmajor(real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1508
+ size_t nrows, size_t ncols,
1509
+ std::vector<double> &Xr, std::vector<size_t> &Xr_ind, std::vector<size_t> &Xr_indptr);
1510
+ template <class sparse_ix=size_t>
1511
+ bool check_indices_are_sorted(sparse_ix indices[], size_t n);
1512
+ template <class real_t, class sparse_ix>
1513
+ void sort_csc_indices(real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr, size_t ncols_numeric);
833
1514
 
834
1515
  /* mult.cpp */
835
- void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double *restrict x,
836
- MissingAction missing_action, double &x_sd, double &x_mean);
837
- void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, size_t col_num,
838
- double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
839
- double &x_sd, double &x_mean);
1516
+ template <class real_t, class real_t_>
1517
+ void calc_mean_and_sd_t(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x,
1518
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
1519
+ template <class real_t_, class ldouble_safe>
1520
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x,
1521
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
1522
+ template <class real_t_>
1523
+ double calc_mean_only(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x);
1524
+ template <class real_t_, class mapping, class ldouble_safe>
1525
+ void calc_mean_and_sd_weighted(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x, mapping &restrict w,
1526
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean);
1527
+ template <class real_t_, class mapping>
1528
+ double calc_mean_only_weighted(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x, mapping &restrict w);
1529
+ template <class real_t_, class sparse_ix, class ldouble_safe>
1530
+ void calc_mean_and_sd(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1531
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1532
+ double &restrict x_sd, double &restrict x_mean);
1533
+ template <class real_t_, class sparse_ix, class ldouble_safe>
1534
+ double calc_mean_only(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1535
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr);
1536
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
1537
+ void calc_mean_and_sd_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1538
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1539
+ double &restrict x_sd, double &restrict x_mean, mapping &restrict w);
1540
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
1541
+ double calc_mean_only_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1542
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1543
+ mapping &restrict w);
1544
+ template <class real_t_>
840
1545
  void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
841
- double *restrict x, double &coef, double x_sd, double x_mean, double &fill_val,
1546
+ real_t_ *restrict x, double &restrict coef, double x_sd, double x_mean, double &restrict fill_val,
842
1547
  MissingAction missing_action, double *restrict buffer_arr,
843
1548
  size_t *restrict buffer_NAs, bool first_run);
1549
+ template <class real_t_, class mapping, class ldouble_safe>
1550
+ void add_linear_comb_weighted(size_t ix_arr[], size_t st, size_t end, double *restrict res,
1551
+ real_t_ *restrict x, double &restrict coef, double x_sd, double x_mean, double &restrict fill_val,
1552
+ MissingAction missing_action, double *restrict buffer_arr,
1553
+ size_t *restrict buffer_NAs, bool first_run, mapping &restrict w);
1554
+ template <class real_t_, class sparse_ix>
844
1555
  void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
845
- double *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
846
- double &coef, double x_sd, double x_mean, double &fill_val, MissingAction missing_action,
1556
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1557
+ double &restrict coef, double x_sd, double x_mean, double &restrict fill_val, MissingAction missing_action,
847
1558
  double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run);
1559
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
1560
+ void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
1561
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1562
+ double &restrict coef, double x_sd, double x_mean, double &restrict fill_val, MissingAction missing_action,
1563
+ double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run, mapping &restrict w);
1564
+ template <class mapping>
1565
+ void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
1566
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
1567
+ double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_pos,
1568
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type,
1569
+ bool first_run, mapping &restrict w);
1570
+ template <class ldouble_safe>
848
1571
  void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
849
1572
  int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
850
- double &fill_val, double &fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
1573
+ double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
851
1574
  NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run);
1575
+ template <class mapping, class ldouble_safe>
1576
+ void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
1577
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
1578
+ double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_pos,
1579
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type,
1580
+ bool first_run, mapping &restrict w);
852
1581
 
853
1582
  /* crit.cpp */
854
- double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, double x[], MissingAction missing_action);
855
- double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, size_t col_num,
856
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1583
+ template <class real_t, class ldouble_safe>
1584
+ double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, real_t x[], MissingAction missing_action);
1585
+ template <class real_t, class ldouble_safe>
1586
+ double calc_kurtosis(real_t x[], size_t n, MissingAction missing_action);
1587
+ template <class real_t, class mapping, class ldouble_safe>
1588
+ double calc_kurtosis_weighted(size_t ix_arr[], size_t st, size_t end, real_t x[],
1589
+ MissingAction missing_action, mapping &restrict w);
1590
+ template <class real_t, class ldouble_safe>
1591
+ double calc_kurtosis_weighted(real_t *restrict x, size_t n_, MissingAction missing_action, real_t *restrict w);
1592
+ template <class real_t, class sparse_ix, class ldouble_safe>
1593
+ double calc_kurtosis(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1594
+ real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1595
+ MissingAction missing_action);
1596
+ template <class real_t, class sparse_ix, class ldouble_safe>
1597
+ double calc_kurtosis(size_t col_num, size_t nrows,
1598
+ real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
857
1599
  MissingAction missing_action);
858
- double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
1600
+ template <class real_t, class sparse_ix, class mapping, class ldouble_safe>
1601
+ double calc_kurtosis_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
1602
+ real_t Xc[], sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1603
+ MissingAction missing_action, mapping &restrict w);
1604
+ template <class real_t, class sparse_ix, class ldouble_safe>
1605
+ double calc_kurtosis_weighted(size_t col_num, size_t nrows,
1606
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1607
+ MissingAction missing_action, real_t *restrict w);
1608
+ template <class ldouble_safe>
1609
+ double calc_kurtosis_internal(size_t cnt, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
1610
+ MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
1611
+ template <class ldouble_safe>
1612
+ double calc_kurtosis(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat, size_t *restrict buffer_cnt, double buffer_prob[],
859
1613
  MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
860
- double expected_sd_cat(double p[], size_t n, size_t pos[]);
861
- double expected_sd_cat(size_t counts[], double p[], size_t n, size_t pos[]);
862
- double expected_sd_cat_single(size_t counts[], double p[], size_t n, size_t pos[], size_t cat_exclude, size_t cnt);
863
- double numeric_gain(size_t cnt_left, size_t cnt_right,
864
- long double sum_left, long double sum_right,
865
- long double sum_sq_left, long double sum_sq_right,
866
- double sd_full, long double cnt);
867
- double numeric_gain_no_div(size_t cnt_left, size_t cnt_right,
868
- long double sum_left, long double sum_right,
869
- long double sum_sq_left, long double sum_sq_right,
870
- double sd_full, long double cnt);
871
- double categ_gain(size_t cnt_left, size_t cnt_right,
872
- long double s_left, long double s_right,
873
- long double base_info, long double cnt);
874
- double eval_guided_crit(double *restrict x, size_t n, GainCriterion criterion, double min_gain,
875
- double &split_point, double &xmin, double &xmax);
876
- double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x,
877
- size_t &split_ix, double &split_point, double &xmin, double &xmax,
878
- GainCriterion criterion, double min_gain, MissingAction missing_action);
1614
+ template <class ldouble_safe>
1615
+ double calc_kurtosis(size_t nrows, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
1616
+ MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
1617
+ template <class mapping, class ldouble_safe>
1618
+ double calc_kurtosis_weighted_internal(std::vector<ldouble_safe> &buffer_cnt, int x[], int ncat,
1619
+ double buffer_prob[], MissingAction missing_action, CategSplit cat_split_type,
1620
+ RNG_engine &rnd_generator, mapping &restrict w);
1621
+ template <class mapping, class ldouble_safe>
1622
+ double calc_kurtosis_weighted(size_t ix_arr[], size_t st, size_t end, int x[], int ncat, double buffer_prob[],
1623
+ MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator,
1624
+ mapping &restrict w);
1625
+ template <class real_t, class ldouble_safe>
1626
+ double calc_kurtosis_weighted(size_t nrows, int x[], int ncat, double *restrict buffer_prob,
1627
+ MissingAction missing_action, CategSplit cat_split_type,
1628
+ RNG_engine &rnd_generator, real_t *restrict w);
1629
+ template <class int_t, class ldouble_safe>
1630
+ double expected_sd_cat(double p[], size_t n, int_t pos[]);
1631
+ template <class number, class int_t, class ldouble_safe>
1632
+ double expected_sd_cat(number *restrict counts, double *restrict p, size_t n, int_t *restrict pos);
1633
+ template <class number, class int_t, class ldouble_safe>
1634
+ double expected_sd_cat_single(number *restrict counts, double *restrict p, size_t n, int_t *restrict pos, size_t cat_exclude, number cnt);
1635
+ template <class number, class int_t, class ldouble_safe>
1636
+ double expected_sd_cat_internal(int ncat, number *restrict buffer_cnt, ldouble_safe cnt_l,
1637
+ int_t *restrict buffer_pos, double *restrict buffer_prob);
1638
+ template <class int_t, class ldouble_safe>
1639
+ double expected_sd_cat(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat,
1640
+ MissingAction missing_action,
1641
+ size_t *restrict buffer_cnt, int_t *restrict buffer_pos, double buffer_prob[]);
1642
+ template <class mapping, class int_t, class ldouble_safe>
1643
+ double expected_sd_cat_weighted(size_t *restrict ix_arr, size_t st, size_t end, int x[], int ncat,
1644
+ MissingAction missing_action, mapping &restrict w,
1645
+ double *restrict buffer_cnt, int_t *restrict buffer_pos, double *restrict buffer_prob);
1646
+ template <class number, class ldouble_safe>
1647
+ double categ_gain(number cnt_left, number cnt_right,
1648
+ ldouble_safe s_left, ldouble_safe s_right,
1649
+ ldouble_safe base_info, ldouble_safe cnt);
1650
+ template <class real_t, class real_t_>
1651
+ double find_split_rel_gain_t(real_t_ *restrict x, size_t n, double &restrict split_point);
1652
+ template <class real_t_, class ldouble_safe>
1653
+ double find_split_rel_gain(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &restrict split_point, size_t &restrict split_ix);
1654
+ template <class real_t, class real_t_>
1655
+ double find_split_rel_gain_t(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &split_point, size_t &restrict split_ix);
1656
+ template <class real_t_, class ldouble_safe>
1657
+ double find_split_rel_gain(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double &split_point, size_t &split_ix);
1658
+ template <class real_t, class real_t_, class mapping>
1659
+ double find_split_rel_gain_weighted_t(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &split_point, size_t &restrict split_ix, mapping &restrict w);
1660
+ template <class real_t_, class mapping, class ldouble_safe>
1661
+ double find_split_rel_gain_weighted(real_t_ *restrict x, real_t_ xmean, size_t *restrict ix_arr, size_t st, size_t end, double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1662
+ template <class real_t, class real_t_=double>
1663
+ real_t calc_sd_right_to_left(real_t_ *restrict x, size_t n, double *restrict sd_arr);
1664
+ template <class real_t_, class ldouble_safe>
1665
+ ldouble_safe calc_sd_right_to_left_weighted(real_t_ *restrict x, size_t n, double *restrict sd_arr,
1666
+ double *restrict w, ldouble_safe &cumw, size_t *restrict sorted_ix);
1667
+ template <class real_t, class real_t_>
1668
+ real_t calc_sd_right_to_left(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr);
1669
+ template <class real_t_, class mapping, class ldouble_safe>
1670
+ ldouble_safe calc_sd_right_to_left_weighted(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end,
1671
+ double *restrict sd_arr, mapping &restrict w, ldouble_safe &cumw);
1672
+ template <class real_t, class real_t_>
1673
+ double find_split_std_gain_t(real_t_ *restrict x, size_t n, double *restrict sd_arr,
1674
+ GainCriterion criterion, double min_gain, double &restrict split_point);
1675
+ template <class real_t_, class ldouble_safe>
1676
+ double find_split_std_gain(real_t_ *restrict x, size_t n, double *restrict sd_arr,
1677
+ GainCriterion criterion, double min_gain, double &restrict split_point);
1678
+ template <class real_t, class ldouble_safe>
1679
+ double find_split_std_gain_weighted(real_t *restrict x, size_t n, double *restrict sd_arr,
1680
+ GainCriterion criterion, double min_gain, double &restrict split_point,
1681
+ double *restrict w, size_t *restrict sorted_ix);
1682
+ template <class real_t, class real_t_>
1683
+ double find_split_std_gain_t(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
1684
+ GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix);
1685
+ template <class real_t_, class ldouble_safe>
1686
+ double find_split_std_gain(real_t_ *restrict x, real_t_ xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
1687
+ GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix);
1688
+ template <class real_t, class mapping, class ldouble_safe>
1689
+ double find_split_std_gain_weighted(real_t *restrict x, real_t xmean, size_t ix_arr[], size_t st, size_t end, double *restrict sd_arr,
1690
+ GainCriterion criterion, double min_gain, double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1691
+ template <class real_t, class ldouble_safe>
1692
+ double find_split_full_gain(real_t *restrict x, size_t st, size_t end, size_t *restrict ix_arr,
1693
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1694
+ double *restrict X_row_major, size_t ncols,
1695
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
1696
+ double *restrict buffer_sum_left, double *restrict buffer_sum_tot,
1697
+ size_t &restrict split_ix, double &restrict split_point,
1698
+ bool x_uses_ix_arr);
1699
+ template <class real_t, class mapping, class ldouble_safe>
1700
+ double find_split_full_gain_weighted(real_t *restrict x, size_t st, size_t end, size_t *restrict ix_arr,
1701
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1702
+ double *restrict X_row_major, size_t ncols,
1703
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
1704
+ double *restrict buffer_sum_left, double *restrict buffer_sum_tot,
1705
+ size_t &restrict split_ix, double &restrict split_point,
1706
+ bool x_uses_ix_arr,
1707
+ mapping &restrict w);
1708
+ template <class real_t_, class real_t>
1709
+ double find_split_dens_shortform_t(real_t *restrict x, size_t n, double &restrict split_point);
1710
+ template <class real_t, class ldouble_safe>
1711
+ double find_split_dens_shortform(real_t *restrict x, size_t n, double &restrict split_point);
1712
+ template <class real_t_, class real_t, class mapping>
1713
+ double find_split_dens_shortform_weighted_t(real_t *restrict x, size_t n, double &restrict split_point, mapping &restrict w, size_t *restrict buffer_indices);
1714
+ template <class real_t, class mapping, class ldouble_safe>
1715
+ double find_split_dens_shortform_weighted(real_t *restrict x, size_t n, double &restrict split_point, mapping &restrict w, size_t *restrict buffer_indices);
1716
+ template <class real_t>
1717
+ double find_split_dens_shortform(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1718
+ double &restrict split_point, size_t &restrict split_ix);
1719
+ template <class real_t, class mapping>
1720
+ double find_split_dens_shortform_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1721
+ double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1722
+ template <class real_t, class ldouble_safe>
1723
+ double find_split_dens_longform(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1724
+ double &restrict split_point, size_t &restrict split_ix);
1725
+ template <class real_t, class mapping, class ldouble_safe>
1726
+ double find_split_dens_longform_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1727
+ double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1728
+ template <class real_t, class ldouble_safe>
1729
+ double find_split_dens(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1730
+ double &restrict split_point, size_t &restrict split_ix);
1731
+ template <class real_t, class mapping, class ldouble_safe>
1732
+ double find_split_dens_weighted(real_t *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
1733
+ double &restrict split_point, size_t &restrict split_ix, mapping &restrict w);
1734
+ template <class int_t, class ldouble_safe>
1735
+ double find_split_dens_longform(int *restrict x, int ncat, size_t *restrict ix_arr, size_t st, size_t end,
1736
+ CategSplit cat_split_type, MissingAction missing_action,
1737
+ int &restrict chosen_cat, signed char *restrict split_categ, int *restrict saved_cat_mode,
1738
+ size_t *restrict buffer_cnt, int_t *restrict buffer_indices);
1739
+ template <class mapping, class int_t, class ldouble_safe>
1740
+ double find_split_dens_longform_weighted(int *restrict x, int ncat, size_t *restrict ix_arr, size_t st, size_t end,
1741
+ CategSplit cat_split_type, MissingAction missing_action,
1742
+ int &restrict chosen_cat, signed char *restrict split_categ, int *restrict saved_cat_mode,
1743
+ int_t *restrict buffer_indices, mapping &restrict w);
1744
+ template <class ldouble_safe>
1745
+ double eval_guided_crit(double *restrict x, size_t n, GainCriterion criterion,
1746
+ double min_gain, bool as_relative_gain, double *restrict buffer_sd,
1747
+ double &restrict split_point, double &restrict xmin, double &restrict xmax,
1748
+ size_t *restrict ix_arr_plus_st,
1749
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1750
+ double *restrict X_row_major, size_t ncols,
1751
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
1752
+ template <class ldouble_safe>
1753
+ double eval_guided_crit_weighted(double *restrict x, size_t n, GainCriterion criterion,
1754
+ double min_gain, bool as_relative_gain, double *restrict buffer_sd,
1755
+ double &restrict split_point, double &restrict xmin, double &restrict xmax,
1756
+ double *restrict w, size_t *restrict buffer_indices,
1757
+ size_t *restrict ix_arr_plus_st,
1758
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1759
+ double *restrict X_row_major, size_t ncols,
1760
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
1761
+ template <class real_t_, class ldouble_safe>
1762
+ double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, real_t_ *restrict x,
1763
+ double *restrict buffer_sd, bool as_relative_gain,
1764
+ double *restrict buffer_imputed_x, double *restrict saved_xmedian,
1765
+ size_t &split_ix, double &restrict split_point, double &restrict xmin, double &restrict xmax,
1766
+ GainCriterion criterion, double min_gain, MissingAction missing_action,
1767
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1768
+ double *restrict X_row_major, size_t ncols,
1769
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
1770
+ template <class real_t_, class mapping, class ldouble_safe>
1771
+ double eval_guided_crit_weighted(size_t *restrict ix_arr, size_t st, size_t end, real_t_ *restrict x,
1772
+ double *restrict buffer_sd, bool as_relative_gain,
1773
+ double *restrict buffer_imputed_x, double *restrict saved_xmedian,
1774
+ size_t &split_ix, double &restrict split_point, double &restrict xmin, double &restrict xmax,
1775
+ GainCriterion criterion, double min_gain, MissingAction missing_action,
1776
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1777
+ double *restrict X_row_major, size_t ncols,
1778
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
1779
+ mapping &restrict w);
1780
+ template <class real_t_, class sparse_ix, class ldouble_safe>
879
1781
  double eval_guided_crit(size_t ix_arr[], size_t st, size_t end,
880
- size_t col_num, double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
881
- double buffer_arr[], size_t buffer_pos[],
1782
+ size_t col_num, real_t_ Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1783
+ double buffer_arr[], size_t buffer_pos[], bool as_relative_gain,
1784
+ double *restrict saved_xmedian,
882
1785
  double &split_point, double &xmin, double &xmax,
883
- GainCriterion criterion, double min_gain, MissingAction missing_action);
1786
+ GainCriterion criterion, double min_gain, MissingAction missing_action,
1787
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1788
+ double *restrict X_row_major, size_t ncols,
1789
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr);
1790
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
1791
+ double eval_guided_crit_weighted(size_t ix_arr[], size_t st, size_t end,
1792
+ size_t col_num, real_t_ Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
1793
+ double buffer_arr[], size_t buffer_pos[], bool as_relative_gain,
1794
+ double *restrict saved_xmedian,
1795
+ double &restrict split_point, double &restrict xmin, double &restrict xmax,
1796
+ GainCriterion criterion, double min_gain, MissingAction missing_action,
1797
+ size_t *restrict cols_use, size_t ncols_use, bool force_cols_use,
1798
+ double *restrict X_row_major, size_t ncols,
1799
+ double *restrict Xr, size_t *restrict Xr_ind, size_t *restrict Xr_indptr,
1800
+ mapping &restrict w);
1801
+ template <class ldouble_safe>
884
1802
  double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int ncat,
1803
+ int *restrict saved_cat_mode,
885
1804
  size_t *restrict buffer_cnt, size_t *restrict buffer_pos, double *restrict buffer_prob,
886
- int &chosen_cat, char *restrict split_categ, char *restrict buffer_split,
887
- GainCriterion criterion, double min_gain, bool all_perm, MissingAction missing_action, CategSplit cat_split_type);
1805
+ int &restrict chosen_cat, signed char *restrict split_categ, signed char *restrict buffer_split,
1806
+ GainCriterion criterion, double min_gain, bool all_perm,
1807
+ MissingAction missing_action, CategSplit cat_split_type);
1808
+ template <class mapping, class ldouble_safe>
1809
+ double eval_guided_crit_weighted(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int ncat,
1810
+ int *restrict saved_cat_mode,
1811
+ size_t *restrict buffer_pos, double *restrict buffer_prob,
1812
+ int &restrict chosen_cat, signed char *restrict split_categ, signed char *restrict buffer_split,
1813
+ GainCriterion criterion, double min_gain, bool all_perm,
1814
+ MissingAction missing_action, CategSplit cat_split_type,
1815
+ mapping &restrict w);
1816
+
1817
+ /* indexer.cpp */
1818
+ template <class Tree>
1819
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<Tree> &tree);
1820
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoTree> &tree);
1821
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoHPlane> &tree);
1822
+ template <class Model>
1823
+ void build_terminal_node_mappings(TreesIndexer &indexer, const Model &model);
1824
+ template <class Node>
1825
+ void build_dindex_recursive
1826
+ (
1827
+ const size_t curr_node,
1828
+ const size_t n_terminal, const size_t ncomb,
1829
+ const size_t st, const size_t end,
1830
+ std::vector<size_t> &restrict node_indices, /* array with all terminal indices in 'tree' */
1831
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
1832
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
1833
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
1834
+ size_t curr_depth,
1835
+ const std::vector<Node> &tree
1836
+ );
1837
+ template <class Node>
1838
+ void build_dindex
1839
+ (
1840
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
1841
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
1842
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
1843
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
1844
+ const size_t n_terminal,
1845
+ const std::vector<Node> &tree
1846
+ );
1847
+ void build_dindex
1848
+ (
1849
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
1850
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
1851
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
1852
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
1853
+ const size_t n_terminal,
1854
+ const std::vector<IsoTree> &tree
1855
+ );
1856
+ void build_dindex
1857
+ (
1858
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
1859
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
1860
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
1861
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
1862
+ const size_t n_terminal,
1863
+ const std::vector<IsoHPlane> &tree
1864
+ );
1865
+ template <class Model>
1866
+ void build_distance_mappings(TreesIndexer &indexer, const Model &model, int nthreads);
1867
+ template <class Model>
1868
+ void build_tree_indices(TreesIndexer &indexer, const Model &model, int nthreads, const bool with_distances);
1869
+ ISOTREE_EXPORTED
1870
+ void build_tree_indices(TreesIndexer &indexer, const IsoForest &model, int nthreads, const bool with_distances);
1871
+ ISOTREE_EXPORTED
1872
+ void build_tree_indices(TreesIndexer &indexer, const ExtIsoForest &model, int nthreads, const bool with_distances);
1873
+ ISOTREE_EXPORTED
1874
+ void build_tree_indices
1875
+ (
1876
+ TreesIndexer *indexer,
1877
+ const IsoForest *model_outputs,
1878
+ const ExtIsoForest *model_outputs_ext,
1879
+ int nthreads,
1880
+ const bool with_distances
1881
+ );
1882
+ ISOTREE_EXPORTED
1883
+ size_t get_number_of_reference_points(const TreesIndexer &indexer) noexcept;
1884
+ void build_ref_node(SingleTreeIndex &node);
1885
+
1886
+ /* ref_indexer.hpp */
1887
+ template <class Model, class real_t, class sparse_ix>
1888
+ void set_reference_points(TreesIndexer &indexer, Model &model, const bool with_distances,
1889
+ real_t *restrict numeric_data, int *restrict categ_data,
1890
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1891
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1892
+ real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
1893
+ size_t nrows, int nthreads);
1894
+ template <class real_t, class sparse_ix>
1895
+ void set_reference_points(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext, TreesIndexer *indexer,
1896
+ const bool with_distances,
1897
+ real_t *restrict numeric_data, int *restrict categ_data,
1898
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1899
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1900
+ real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
1901
+ size_t nrows, int nthreads);
888
1902
 
889
1903
  /* merge_models.cpp */
1904
+ ISOTREE_EXPORTED
890
1905
  void merge_models(IsoForest* model, IsoForest* other,
891
1906
  ExtIsoForest* ext_model, ExtIsoForest* ext_other,
892
- Imputer* imputer, Imputer* iother);
1907
+ Imputer* imputer, Imputer* iother,
1908
+ TreesIndexer* indexer, TreesIndexer* ind_other);
1909
+
1910
+ /* subset_models.cpp */
1911
+ ISOTREE_EXPORTED
1912
+ void subset_model(IsoForest* model, IsoForest* model_new,
1913
+ ExtIsoForest* ext_model, ExtIsoForest* ext_model_new,
1914
+ Imputer* imputer, Imputer* imputer_new,
1915
+ TreesIndexer* indexer, TreesIndexer* indexer_new,
1916
+ size_t *trees_take, size_t ntrees_take);
893
1917
 
894
- #ifdef _ENABLE_CEREAL
895
1918
  /* serialize.cpp */
896
- void serialize_isoforest(IsoForest &model, std::ostream &output);
897
- void serialize_isoforest(IsoForest &model, const char *output_file_path);
898
- std::string serialize_isoforest(IsoForest &model);
899
- void deserialize_isoforest(IsoForest &output_obj, std::istream &serialized);
900
- void deserialize_isoforest(IsoForest &output_obj, const char *input_file_path);
901
- void deserialize_isoforest(IsoForest &output_obj, std::string &serialized, bool move_str);
902
- void serialize_ext_isoforest(ExtIsoForest &model, std::ostream &output);
903
- void serialize_ext_isoforest(ExtIsoForest &model, const char *output_file_path);
904
- std::string serialize_ext_isoforest(ExtIsoForest &model);
905
- void deserialize_ext_isoforest(ExtIsoForest &output_obj, std::istream &serialized);
906
- void deserialize_ext_isoforest(ExtIsoForest &output_obj, const char *input_file_path);
907
- void deserialize_ext_isoforest(ExtIsoForest &output_obj, std::string &serialized, bool move_str);
908
- void serialize_imputer(Imputer &imputer, std::ostream &output);
909
- void serialize_imputer(Imputer &imputer, const char *output_file_path);
910
- std::string serialize_imputer(Imputer &imputer);
911
- void deserialize_imputer(Imputer &output_obj, std::istream &serialized);
912
- void deserialize_imputer(Imputer &output_obj, const char *input_file_path);
913
- void deserialize_imputer(Imputer &output_obj, std::string &serialized, bool move_str);
914
- #ifdef _MSC_VER
915
- void serialize_isoforest(IsoForest &model, const wchar_t *output_file_path);
916
- void deserialize_isoforest(IsoForest &output_obj, const wchar_t *input_file_path);
917
- void serialize_ext_isoforest(ExtIsoForest &model, const wchar_t *output_file_path);
918
- void deserialize_ext_isoforest(ExtIsoForest &output_obj, const wchar_t *input_file_path);
919
- void serialize_imputer(Imputer &imputer, const wchar_t *output_file_path);
920
- void deserialize_imputer(Imputer &output_obj, const wchar_t *input_file_path);
921
- #endif /* _MSC_VER */
922
- bool has_msvc();
923
- #endif /* _ENABLE_CEREAL */
1919
/* Error helpers, all declared [[noreturn]] (control never comes back to the
 * caller). throw_errno() is what the FileHandle / WFileHandle constructors
 * call when opening a file fails.
 * NOTE(review): presumably each one throws an exception built from errno or
 * from the stream's error / end-of-file state - confirm in the definitions. */
+ [[noreturn]]
1920
+ void throw_errno();
1921
+ [[noreturn]]
1922
+ void throw_ferror(FILE *file);
1923
+ [[noreturn]]
1924
+ void throw_feoferr();
1925
/* FileHandle - small owner of a C `FILE*` tied to the object's lifetime.
 *
 * Constructor: opens `fname` with `std::fopen(fname, mode)`; if the result
 * is null it calls throw_errno(), which is declared [[noreturn]].
 * Destructor: if `handle` is non-null, closes it with `std::fclose`; a
 * non-zero return is only reported on stderr (nothing is thrown from the
 * destructor). `handle` is then reset to NULL.
 *
 * `handle` is a public member, so callers use it directly with C stdio calls.
 *
 * NOTE(review): no copy/move constructor or assignment is declared, so the
 * class is implicitly copyable. A copy would hold the same `FILE*` and both
 * destructors would call `std::fclose` on it. Consider deleting the copy
 * operations (and adding a move that nulls the source) in the real header. */
+ class FileHandle
1926
+ {
1927
+ public:
1928
+ FILE *handle = NULL;
1929
+ FileHandle(const char *fname, const char *mode)
1930
+ {
1931
+ this->handle = std::fopen(fname, mode);
1932
+ if (!(this->handle))
1933
+ throw_errno();
1934
+ }
1935
+ ~FileHandle()
1936
+ {
1937
+ if (this->handle) {
1938
+ int err = std::fclose(this->handle);
1939
+ if (err)
1940
+ fprintf(stderr, "Error: could not close file.\n");
1941
+ }
1942
+ this->handle = NULL;
1943
+ }
1944
+ };
1945
+
1946
/* Wide-character counterpart of FileHandle.
 *
 * Compiled only when `_WIN32` is defined together with `_MSC_VER` or
 * `__GNUC__`. Inside that guard the macro WCHAR_T_FUNS is defined; the
 * `#ifdef WCHAR_T_FUNS` sections further down in this header use it to
 * enable the `const wchar_t *fname` overloads.
 *
 * Constructor: opens the file with `_wfopen(fname, mode)` and calls
 * throw_errno() (declared [[noreturn]]) when the result is null.
 * Destructor: closes a non-null `handle` with `std::fclose`, reports a
 * failed close on stderr only, then resets `handle` to NULL.
 *
 * NOTE(review): as with FileHandle, no copy/move members are declared, so an
 * implicit copy would lead to the same `FILE*` being closed twice. Consider
 * deleting the copy operations in the real header. */
+ #if defined(_WIN32) && (defined(_MSC_VER) || defined(__GNUC__))
1947
+ #define WCHAR_T_FUNS
1948
+ #include <stdio.h>
1949
+ class WFileHandle
1950
+ {
1951
+ public:
1952
+ FILE *handle = NULL;
1953
+ WFileHandle(const wchar_t *fname, const wchar_t *mode)
1954
+ {
1955
+ this->handle = _wfopen(fname, mode);
1956
+ if (!(this->handle))
1957
+ throw_errno();
1958
+ }
1959
+ ~WFileHandle()
1960
+ {
1961
+ if (this->handle) {
1962
+ int err = std::fclose(this->handle);
1963
+ if (err)
1964
+ fprintf(stderr, "Error: could not close file.\n");
1965
+ }
1966
+ this->handle = NULL;
1967
+ }
1968
+ };
1969
+ #endif
1970
/* Exported, noexcept queries.
 * has_wchar_t_file_serializers() returns a bool and takes no arguments.
 * determine_serialized_size() is overloaded once per model type
 * (IsoForest, ExtIsoForest, Imputer, TreesIndexer), takes the model by
 * const reference and returns a size_t.
 * NOTE(review): presumably the first reports whether the `wchar_t` file
 * overloads (WCHAR_T_FUNS) were compiled in, and the second gives the byte
 * count needed by the `char *out` serializers - confirm in the definitions. */
+ ISOTREE_EXPORTED
1971
+ bool has_wchar_t_file_serializers() noexcept;
1972
+ ISOTREE_EXPORTED
1973
+ size_t determine_serialized_size(const IsoForest &model) noexcept;
1974
+ ISOTREE_EXPORTED
1975
+ size_t determine_serialized_size(const ExtIsoForest &model) noexcept;
1976
+ ISOTREE_EXPORTED
1977
+ size_t determine_serialized_size(const Imputer &model) noexcept;
1978
+ ISOTREE_EXPORTED
1979
+ size_t determine_serialized_size(const TreesIndexer &model) noexcept;
1980
/* IsoForest serialization entry points (all ISOTREE_EXPORTED).
 * serialize_IsoForest: overloads writing to a raw `char*` buffer, a `FILE*`,
 * a `std::ostream`, or returning a `std::string`; the model is const.
 * serialize_IsoForest_ToFile: takes a file name (`const char*`; a
 * `const wchar_t*` overload exists only when WCHAR_T_FUNS is defined).
 * deserialize_IsoForest / deserialize_IsoForest_FromFile: the mirror image,
 * filling a non-const `IsoForest &model` from the same kinds of source.
 * NOTE(review): the `char *out` overload takes no length, so the caller
 * presumably must size the buffer beforehand (e.g. with
 * determine_serialized_size) - confirm. */
+ ISOTREE_EXPORTED
1981
+ void serialize_IsoForest(const IsoForest &model, char *out);
1982
+ ISOTREE_EXPORTED
1983
+ void serialize_IsoForest(const IsoForest &model, FILE *out);
1984
+ ISOTREE_EXPORTED
1985
+ void serialize_IsoForest(const IsoForest &model, std::ostream &out);
1986
+ ISOTREE_EXPORTED
1987
+ std::string serialize_IsoForest(const IsoForest &model);
1988
+ ISOTREE_EXPORTED
1989
+ void serialize_IsoForest_ToFile(const IsoForest &model, const char *fname);
1990
+ #ifdef WCHAR_T_FUNS
1991
+ ISOTREE_EXPORTED
1992
+ void serialize_IsoForest_ToFile(const IsoForest &model, const wchar_t *fname);
1993
+ #endif
1994
+ ISOTREE_EXPORTED
1995
+ void deserialize_IsoForest(IsoForest &model, const char *in);
1996
+ ISOTREE_EXPORTED
1997
+ void deserialize_IsoForest(IsoForest &model, FILE *in);
1998
+ ISOTREE_EXPORTED
1999
+ void deserialize_IsoForest(IsoForest &model, std::istream &in);
2000
+ ISOTREE_EXPORTED
2001
+ void deserialize_IsoForest(IsoForest &model, const std::string &in);
2002
+ ISOTREE_EXPORTED
2003
+ void deserialize_IsoForest_FromFile(IsoForest &model, const char *fname);
2004
+ #ifdef WCHAR_T_FUNS
2005
+ ISOTREE_EXPORTED
2006
+ void deserialize_IsoForest_FromFile(IsoForest &model, const wchar_t *fname);
2007
+ #endif
2008
/* ExtIsoForest serialization entry points (all ISOTREE_EXPORTED).
 * Same overload set as declared for IsoForest: serialize to `char*`,
 * `FILE*`, `std::ostream` or a returned `std::string`; `*_ToFile` /
 * `*_FromFile` take a file name, with `const wchar_t*` variants only when
 * WCHAR_T_FUNS is defined; deserialize overloads fill a non-const model.
 * NOTE(review): the `char *out` overload takes no length - the caller
 * presumably sizes the buffer beforehand; confirm. */
+ ISOTREE_EXPORTED
2009
+ void serialize_ExtIsoForest(const ExtIsoForest &model, char *out);
2010
+ ISOTREE_EXPORTED
2011
+ void serialize_ExtIsoForest(const ExtIsoForest &model, FILE *out);
2012
+ ISOTREE_EXPORTED
2013
+ void serialize_ExtIsoForest(const ExtIsoForest &model, std::ostream &out);
2014
+ ISOTREE_EXPORTED
2015
+ std::string serialize_ExtIsoForest(const ExtIsoForest &model);
2016
+ ISOTREE_EXPORTED
2017
+ void serialize_ExtIsoForest_ToFile(const ExtIsoForest &model, const char *fname);
2018
+ #ifdef WCHAR_T_FUNS
2019
+ ISOTREE_EXPORTED
2020
+ void serialize_ExtIsoForest_ToFile(const ExtIsoForest &model, const wchar_t *fname);
2021
+ #endif
2022
+ ISOTREE_EXPORTED
2023
+ void deserialize_ExtIsoForest(ExtIsoForest &model, const char *in);
2024
+ ISOTREE_EXPORTED
2025
+ void deserialize_ExtIsoForest(ExtIsoForest &model, FILE *in);
2026
+ ISOTREE_EXPORTED
2027
+ void deserialize_ExtIsoForest(ExtIsoForest &model, std::istream &in);
2028
+ ISOTREE_EXPORTED
2029
+ void deserialize_ExtIsoForest(ExtIsoForest &model, const std::string &in);
2030
+ ISOTREE_EXPORTED
2031
+ void deserialize_ExtIsoForest_FromFile(ExtIsoForest &model, const char *fname);
2032
+ #ifdef WCHAR_T_FUNS
2033
+ ISOTREE_EXPORTED
2034
+ void deserialize_ExtIsoForest_FromFile(ExtIsoForest &model, const wchar_t *fname);
2035
+ #endif
2036
/* Imputer serialization entry points (all ISOTREE_EXPORTED).
 * Same overload set as declared for the forest types: serialize to `char*`,
 * `FILE*`, `std::ostream` or a returned `std::string`; `*_ToFile` /
 * `*_FromFile` take a file name, with `const wchar_t*` variants only when
 * WCHAR_T_FUNS is defined; deserialize overloads fill a non-const Imputer.
 * NOTE(review): the `char *out` overload takes no length - the caller
 * presumably sizes the buffer beforehand; confirm. */
+ ISOTREE_EXPORTED
2037
+ void serialize_Imputer(const Imputer &model, char *out);
2038
+ ISOTREE_EXPORTED
2039
+ void serialize_Imputer(const Imputer &model, FILE *out);
2040
+ ISOTREE_EXPORTED
2041
+ void serialize_Imputer(const Imputer &model, std::ostream &out);
2042
+ ISOTREE_EXPORTED
2043
+ std::string serialize_Imputer(const Imputer &model);
2044
+ ISOTREE_EXPORTED
2045
+ void serialize_Imputer_ToFile(const Imputer &model, const char *fname);
2046
+ #ifdef WCHAR_T_FUNS
2047
+ ISOTREE_EXPORTED
2048
+ void serialize_Imputer_ToFile(const Imputer &model, const wchar_t *fname);
2049
+ #endif
2050
+ ISOTREE_EXPORTED
2051
+ void deserialize_Imputer(Imputer &model, const char *in);
2052
+ ISOTREE_EXPORTED
2053
+ void deserialize_Imputer(Imputer &model, FILE *in);
2054
+ ISOTREE_EXPORTED
2055
+ void deserialize_Imputer(Imputer &model, std::istream &in);
2056
+ ISOTREE_EXPORTED
2057
+ void deserialize_Imputer(Imputer &model, const std::string &in);
2058
+ ISOTREE_EXPORTED
2059
+ void deserialize_Imputer_FromFile(Imputer &model, const char *fname);
2060
+ #ifdef WCHAR_T_FUNS
2061
+ ISOTREE_EXPORTED
2062
+ void deserialize_Imputer_FromFile(Imputer &model, const wchar_t *fname);
2063
+ #endif
2064
/* TreesIndexer serialization entry points (all ISOTREE_EXPORTED).
 * The functions are named `*_Indexer` while the type is `TreesIndexer`.
 * Same overload set as for the other model types: serialize to `char*`,
 * `FILE*`, `std::ostream` or a returned `std::string`; `*_ToFile` /
 * `*_FromFile` take a file name, with `const wchar_t*` variants only when
 * WCHAR_T_FUNS is defined; deserialize overloads fill a non-const object.
 * NOTE(review): the `char *out` overload takes no length - the caller
 * presumably sizes the buffer beforehand; confirm. */
+ ISOTREE_EXPORTED
2065
+ void serialize_Indexer(const TreesIndexer &model, char *out);
2066
+ ISOTREE_EXPORTED
2067
+ void serialize_Indexer(const TreesIndexer &model, FILE *out);
2068
+ ISOTREE_EXPORTED
2069
+ void serialize_Indexer(const TreesIndexer &model, std::ostream &out);
2070
+ ISOTREE_EXPORTED
2071
+ std::string serialize_Indexer(const TreesIndexer &model);
2072
+ ISOTREE_EXPORTED
2073
+ void serialize_Indexer_ToFile(const TreesIndexer &model, const char *fname);
2074
+ #ifdef WCHAR_T_FUNS
2075
+ ISOTREE_EXPORTED
2076
+ void serialize_Indexer_ToFile(const TreesIndexer &model, const wchar_t *fname);
2077
+ #endif
2078
+ ISOTREE_EXPORTED
2079
+ void deserialize_Indexer(TreesIndexer &model, const char *in);
2080
+ ISOTREE_EXPORTED
2081
+ void deserialize_Indexer(TreesIndexer &model, FILE *in);
2082
+ ISOTREE_EXPORTED
2083
+ void deserialize_Indexer(TreesIndexer &model, std::istream &in);
2084
+ ISOTREE_EXPORTED
2085
+ void deserialize_Indexer(TreesIndexer &model, const std::string &in);
2086
+ ISOTREE_EXPORTED
2087
+ void deserialize_Indexer_FromFile(TreesIndexer &model, const char *fname);
2088
+ #ifdef WCHAR_T_FUNS
2089
+ ISOTREE_EXPORTED
2090
+ void deserialize_Indexer_FromFile(TreesIndexer &model, const wchar_t *fname);
2091
+ #endif
2092
/* Type-overloaded helpers operating on raw byte buffers. Unlike the
 * neighbouring declarations, none of these carries ISOTREE_EXPORTED.
 * Each name (serialize_isotree, deserialize_isotree,
 * incremental_serialize_isotree) is overloaded for IsoForest, ExtIsoForest,
 * Imputer and TreesIndexer, so the model argument's type selects the overload.
 * NOTE(review): presumably these are the common back ends that the exported
 * `serialize_<Type>` wrappers forward to - confirm in the definitions. */
+ void serialize_isotree(const IsoForest &model, char *out);
2093
+ void serialize_isotree(const ExtIsoForest &model, char *out);
2094
+ void serialize_isotree(const Imputer &model, char *out);
2095
+ void serialize_isotree(const TreesIndexer &model, char *out);
2096
+ void deserialize_isotree(IsoForest &model, const char *in);
2097
+ void deserialize_isotree(ExtIsoForest &model, const char *in);
2098
+ void deserialize_isotree(Imputer &model, const char *in);
2099
+ void deserialize_isotree(TreesIndexer &model, const char *in);
2100
+ void incremental_serialize_isotree(const IsoForest &model, char *old_bytes_reallocated);
2101
+ void incremental_serialize_isotree(const ExtIsoForest &model, char *old_bytes_reallocated);
2102
+ void incremental_serialize_isotree(const Imputer &model, char *old_bytes_reallocated);
2103
+ void incremental_serialize_isotree(const TreesIndexer &model, char *old_bytes_reallocated);
2104
/* Exported incremental serializers, `std::string` flavour: one function per
 * model type. `old_bytes` is taken by non-const reference, so the function
 * is able to modify the caller's string.
 * NOTE(review): presumably `old_bytes` holds an earlier serialization of the
 * same model and is updated in place with the trees added since - confirm. */
+ ISOTREE_EXPORTED
2105
+ void incremental_serialize_IsoForest(const IsoForest &model, std::string &old_bytes);
2106
+ ISOTREE_EXPORTED
2107
+ void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, std::string &old_bytes);
2108
+ ISOTREE_EXPORTED
2109
+ void incremental_serialize_Imputer(const Imputer &model, std::string &old_bytes);
2110
+ ISOTREE_EXPORTED
2111
+ void incremental_serialize_Indexer(const TreesIndexer &model, std::string &old_bytes);
2112
/* inspect_serialized_object - four exported overloads that differ only in
 * the type of the first parameter (`const char*`, `FILE*`, `std::istream&`,
 * `const std::string&`). The function returns void; every result is
 * delivered through the nine reference parameters that follow: eight `bool`
 * flags and one `size_t` (`size_metadata`).
 * NOTE(review): presumably the flags describe what the serialized bytes
 * contain and `size_metadata` is the length of the optional metadata that
 * deserialize_combined writes out - confirm in the definitions. */
+ ISOTREE_EXPORTED
2113
+ void inspect_serialized_object
2114
+ (
2115
+ const char *serialized_bytes,
2116
+ bool &is_isotree_model,
2117
+ bool &is_compatible,
2118
+ bool &has_combined_objects,
2119
+ bool &has_IsoForest,
2120
+ bool &has_ExtIsoForest,
2121
+ bool &has_Imputer,
2122
+ bool &has_Indexer,
2123
+ bool &has_metadata,
2124
+ size_t &size_metadata
2125
+ );
2126
+ ISOTREE_EXPORTED
2127
+ void inspect_serialized_object
2128
+ (
2129
+ FILE *serialized_bytes,
2130
+ bool &is_isotree_model,
2131
+ bool &is_compatible,
2132
+ bool &has_combined_objects,
2133
+ bool &has_IsoForest,
2134
+ bool &has_ExtIsoForest,
2135
+ bool &has_Imputer,
2136
+ bool &has_Indexer,
2137
+ bool &has_metadata,
2138
+ size_t &size_metadata
2139
+ );
2140
+ ISOTREE_EXPORTED
2141
+ void inspect_serialized_object
2142
+ (
2143
+ std::istream &serialized_bytes,
2144
+ bool &is_isotree_model,
2145
+ bool &is_compatible,
2146
+ bool &has_combined_objects,
2147
+ bool &has_IsoForest,
2148
+ bool &has_ExtIsoForest,
2149
+ bool &has_Imputer,
2150
+ bool &has_Indexer,
2151
+ bool &has_metadata,
2152
+ size_t &size_metadata
2153
+ );
2154
+ ISOTREE_EXPORTED
2155
+ void inspect_serialized_object
2156
+ (
2157
+ const std::string &serialized_bytes,
2158
+ bool &is_isotree_model,
2159
+ bool &is_compatible,
2160
+ bool &has_combined_objects,
2161
+ bool &has_IsoForest,
2162
+ bool &has_ExtIsoForest,
2163
+ bool &has_Imputer,
2164
+ bool &has_Indexer,
2165
+ bool &has_metadata,
2166
+ size_t &size_metadata
2167
+ );
2168
/* check_can_undergo_incremental_serialization - exported, one overload per
 * model type; takes the model by const reference plus a `const char*` to
 * serialized bytes and returns a bool. Unlike the determine_serialized_size
 * family, these are not declared noexcept.
 * NOTE(review): presumably returns true when `serialized_bytes` is an older
 * serialization of `model` that incremental_serialize_* can extend - confirm. */
+ ISOTREE_EXPORTED
2169
+ bool check_can_undergo_incremental_serialization(const IsoForest &model, const char *serialized_bytes);
2170
+ ISOTREE_EXPORTED
2171
+ bool check_can_undergo_incremental_serialization(const ExtIsoForest &model, const char *serialized_bytes);
2172
+ ISOTREE_EXPORTED
2173
+ bool check_can_undergo_incremental_serialization(const Imputer &model, const char *serialized_bytes);
2174
+ ISOTREE_EXPORTED
2175
+ bool check_can_undergo_incremental_serialization(const TreesIndexer &model, const char *serialized_bytes);
2176
/* determine_serialized_size_additional_trees - exported, noexcept, one
 * overload per model type; takes the model and `old_ntrees`, returns size_t.
 * NOTE(review): presumably the extra byte count needed to serialize the
 * trees beyond the first `old_ntrees` - confirm in the definitions. */
+ ISOTREE_EXPORTED
2177
+ size_t determine_serialized_size_additional_trees(const IsoForest &model, size_t old_ntrees) noexcept;
2178
+ ISOTREE_EXPORTED
2179
+ size_t determine_serialized_size_additional_trees(const ExtIsoForest &model, size_t old_ntrees) noexcept;
2180
+ ISOTREE_EXPORTED
2181
+ size_t determine_serialized_size_additional_trees(const Imputer &model, size_t old_ntrees) noexcept;
2182
+ ISOTREE_EXPORTED
2183
+ size_t determine_serialized_size_additional_trees(const TreesIndexer &model, size_t old_ntrees) noexcept;
2184
/* Exported incremental serializers, raw-buffer flavour: one function per
 * model type, writing through a non-const `char*`.
 * NOTE(review): the parameter name `old_bytes_reallocated` suggests that the
 * caller must already have grown the buffer to the new size before calling
 * (no length is passed) - confirm against the definitions. */
+ ISOTREE_EXPORTED
2185
+ void incremental_serialize_IsoForest(const IsoForest &model, char *old_bytes_reallocated);
2186
+ ISOTREE_EXPORTED
2187
+ void incremental_serialize_ExtIsoForest(const ExtIsoForest &model, char *old_bytes_reallocated);
2188
+ ISOTREE_EXPORTED
2189
+ void incremental_serialize_Imputer(const Imputer &model, char *old_bytes_reallocated);
2190
+ ISOTREE_EXPORTED
2191
+ void incremental_serialize_Indexer(const TreesIndexer &model, char *old_bytes_reallocated);
2192
/* determine_serialized_size_combined - exported, noexcept, returns size_t.
 * Two overloads with parallel parameter lists: one takes pointers to the
 * four model objects, the other takes `const char*` pointers named
 * `serialized_*`. Both end with `size_optional_metadata`.
 * NOTE(review): components are passed by pointer, so presumably a null
 * pointer means "component not present" - confirm in the definitions. */
+ ISOTREE_EXPORTED
2193
+ size_t determine_serialized_size_combined
2194
+ (
2195
+ const IsoForest *model,
2196
+ const ExtIsoForest *model_ext,
2197
+ const Imputer *imputer,
2198
+ const TreesIndexer *indexer,
2199
+ const size_t size_optional_metadata
2200
+ ) noexcept;
2201
+ ISOTREE_EXPORTED
2202
+ size_t determine_serialized_size_combined
2203
+ (
2204
+ const char *serialized_model,
2205
+ const char *serialized_model_ext,
2206
+ const char *serialized_imputer,
2207
+ const char *serialized_indexer,
2208
+ const size_t size_optional_metadata
2209
+ ) noexcept;
2210
/* serialize_combined - seven exported overloads.
 * Four take pointers to the model objects and differ in the destination:
 * `char *out`, `FILE *out`, `std::ostream &out`, or a returned `std::string`.
 * Three take `const char*` pointers named `serialized_*` and offer `FILE*`,
 * `std::ostream&` and returned-`std::string` destinations; no `char *out`
 * overload is declared for that form.
 * Every overload also receives `optional_metadata` together with its length
 * `size_optional_metadata`.
 * NOTE(review): presumably null component pointers are allowed and mean
 * "not included" - confirm in the definitions. */
+ ISOTREE_EXPORTED
2211
+ void serialize_combined
2212
+ (
2213
+ const IsoForest *model,
2214
+ const ExtIsoForest *model_ext,
2215
+ const Imputer *imputer,
2216
+ const TreesIndexer *indexer,
2217
+ const char *optional_metadata,
2218
+ const size_t size_optional_metadata,
2219
+ char *out
2220
+ );
2221
+ ISOTREE_EXPORTED
2222
+ void serialize_combined
2223
+ (
2224
+ const IsoForest *model,
2225
+ const ExtIsoForest *model_ext,
2226
+ const Imputer *imputer,
2227
+ const TreesIndexer *indexer,
2228
+ const char *optional_metadata,
2229
+ const size_t size_optional_metadata,
2230
+ FILE *out
2231
+ );
2232
+ ISOTREE_EXPORTED
2233
+ void serialize_combined
2234
+ (
2235
+ const IsoForest *model,
2236
+ const ExtIsoForest *model_ext,
2237
+ const Imputer *imputer,
2238
+ const TreesIndexer *indexer,
2239
+ const char *optional_metadata,
2240
+ const size_t size_optional_metadata,
2241
+ std::ostream &out
2242
+ );
2243
+ ISOTREE_EXPORTED
2244
+ std::string serialize_combined
2245
+ (
2246
+ const IsoForest *model,
2247
+ const ExtIsoForest *model_ext,
2248
+ const Imputer *imputer,
2249
+ const TreesIndexer *indexer,
2250
+ const char *optional_metadata,
2251
+ const size_t size_optional_metadata
2252
+ );
2253
+ ISOTREE_EXPORTED
2254
+ void serialize_combined
2255
+ (
2256
+ const char *serialized_model,
2257
+ const char *serialized_model_ext,
2258
+ const char *serialized_imputer,
2259
+ const char *serialized_indexer,
2260
+ const char *optional_metadata,
2261
+ const size_t size_optional_metadata,
2262
+ FILE *out
2263
+ );
2264
+ ISOTREE_EXPORTED
2265
+ void serialize_combined
2266
+ (
2267
+ const char *serialized_model,
2268
+ const char *serialized_model_ext,
2269
+ const char *serialized_imputer,
2270
+ const char *serialized_indexer,
2271
+ const char *optional_metadata,
2272
+ const size_t size_optional_metadata,
2273
+ std::ostream &out
2274
+ );
2275
+ ISOTREE_EXPORTED
2276
+ std::string serialize_combined
2277
+ (
2278
+ const char *serialized_model,
2279
+ const char *serialized_model_ext,
2280
+ const char *serialized_imputer,
2281
+ const char *serialized_indexer,
2282
+ const char *optional_metadata,
2283
+ const size_t size_optional_metadata
2284
+ );
2285
/* deserialize_combined - four exported overloads that differ only in the
 * input source (`const char*`, `FILE*`, `std::istream&`,
 * `const std::string&`). Results are written through the non-const model
 * pointers and through `optional_metadata`.
 * NOTE(review): `optional_metadata` is a bare `char*` with no length
 * parameter, so the caller presumably has to size it beforehand (e.g. from
 * the `size_metadata` reported by inspect_serialized_object) - confirm. */
+ ISOTREE_EXPORTED
2286
+ void deserialize_combined
2287
+ (
2288
+ const char* in,
2289
+ IsoForest *model,
2290
+ ExtIsoForest *model_ext,
2291
+ Imputer *imputer,
2292
+ TreesIndexer *indexer,
2293
+ char *optional_metadata
2294
+ );
2295
+ ISOTREE_EXPORTED
2296
+ void deserialize_combined
2297
+ (
2298
+ FILE* in,
2299
+ IsoForest *model,
2300
+ ExtIsoForest *model_ext,
2301
+ Imputer *imputer,
2302
+ TreesIndexer *indexer,
2303
+ char *optional_metadata
2304
+ );
2305
+ ISOTREE_EXPORTED
2306
+ void deserialize_combined
2307
+ (
2308
+ std::istream &in,
2309
+ IsoForest *model,
2310
+ ExtIsoForest *model_ext,
2311
+ Imputer *imputer,
2312
+ TreesIndexer *indexer,
2313
+ char *optional_metadata
2314
+ );
2315
+ ISOTREE_EXPORTED
2316
+ void deserialize_combined
2317
+ (
2318
+ const std::string &in,
2319
+ IsoForest *model,
2320
+ ExtIsoForest *model_ext,
2321
+ Imputer *imputer,
2322
+ TreesIndexer *indexer,
2323
+ char *optional_metadata
2324
+ );
2325
/* Range-penalty helpers; all noexcept and none marked ISOTREE_EXPORTED.
 * check_model_has_range_penalty is declared for IsoForest and ExtIsoForest
 * only and takes the model by const reference; add_range_penalty is
 * declared for all four types and takes the object by non-const reference.
 * NOTE(review): what add_range_penalty changes on an Imputer or TreesIndexer
 * cannot be told from the declarations - check the definitions. */
+ bool check_model_has_range_penalty(const IsoForest &model) noexcept;
2326
+ bool check_model_has_range_penalty(const ExtIsoForest &model) noexcept;
2327
+ void add_range_penalty(IsoForest &model) noexcept;
2328
+ void add_range_penalty(ExtIsoForest &model) noexcept;
2329
+ void add_range_penalty(Imputer &model) noexcept;
2330
+ void add_range_penalty(TreesIndexer &model) noexcept;
924
2331
 
925
2332
  /* sql.cpp */
2333
/* generate_sql - the only change in this listing is the added
 * ISOTREE_EXPORTED prefix; the signature rows are unchanged context.
 * Returns a `std::vector<std::string>`; takes both model pointers, column
 * names and categorical levels by non-const reference, four tree-selection /
 * formatting arguments and the thread count.
 * NOTE(review): presumably one SQL string per tree, or a single one when
 * `single_tree` is set - confirm in the definition. */
+ ISOTREE_EXPORTED
926
2334
  std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
927
2335
  std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
928
2336
  std::vector<std::vector<std::string>> &categ_levels,
929
2337
  bool output_tree_num, bool index1, bool single_tree, size_t tree_num,
930
2338
  int nthreads);
2339
+ ISOTREE_EXPORTED
931
2340
  std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
932
2341
  std::string &table_from, std::string &select_as,
933
2342
  std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
@@ -935,7 +2344,8 @@ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest
935
2344
  bool index1, int nthreads);
936
2345
  /* generate_tree_rules - the '-'/'+' rows show the signature change: two
   * trailing parameters are appended, `const IsoForest *model_outputs` and
   * `const ExtIsoForest *model_outputs_ext`. No default values are declared,
   * so calls written against the previous parameter list no longer match
   * this declaration. */
  void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
937
2346
  size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
938
- std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right);
2347
+ std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right,
2348
+ const IsoForest *model_outputs, const ExtIsoForest *model_outputs_ext);
939
2349
  void extract_cond_isotree(IsoForest &model, IsoTree &tree,
940
2350
  std::string &cond_left, std::string &cond_right,
941
2351
  std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
@@ -945,7 +2355,9 @@ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
945
2355
  std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
946
2356
  std::vector<std::vector<std::string>> &categ_levels);
947
2357
 
948
- /* dealloc.cpp */
949
- void dealloc_IsoForest(IsoForest &model_outputs);
950
- void dealloc_IsoExtForest(ExtIsoForest &model_outputs_ext);
951
- void dealloc_Imputer(Imputer &imputer);
2358
+ #ifndef _FOR_R
2359
+ #if defined(__clang__)
2360
+ #pragma clang diagnostic pop
2361
+ #endif
2362
+ #endif
2363
+ #endif /* ISOTREE_H */