isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -0,0 +1,515 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+ #include "isotree.hpp"
64
+
65
+ static inline bool is_terminal_node(const IsoTree &node)
66
+ {
67
+ return node.tree_left == 0;
68
+ }
69
+
70
+ static inline bool is_terminal_node(const IsoHPlane &node)
71
+ {
72
+ return node.hplane_left == 0;
73
+ }
74
+
75
/* Numbers the terminal nodes of one tree consecutively, in order of appearance.
 *
 * - mappings   (out) Resized to 'tree.size()'; entry 'k' receives the sequential
 *              terminal number of node 'k' if it is terminal, and zero otherwise.
 * - n_terminal (out) Total count of terminal nodes found in 'tree'.
 * - tree       Nodes of the tree being indexed.
 */
template <class Tree>
void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<Tree> &tree)
{
    const size_t n_nodes = tree.size();
    mappings.assign(n_nodes, (size_t)0);
    mappings.shrink_to_fit();

    size_t n_leaves = 0;
    for (size_t ix = 0; ix < n_nodes; ix++)
    {
        if (!is_terminal_node(tree[ix])) continue;
        mappings[ix] = n_leaves++;
    }
    n_terminal = n_leaves;
}
92
+
93
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoTree> &tree)
94
+ {
95
+ build_terminal_node_mappings_single_tree<IsoTree>(mappings, n_terminal, tree);
96
+ }
97
+
98
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoHPlane> &tree)
99
+ {
100
+ build_terminal_node_mappings_single_tree<IsoHPlane>(mappings, n_terminal, tree);
101
+ }
102
+
103
+ static inline const std::vector<IsoTree>& get_tree(const IsoForest &model, size_t tree)
104
+ {
105
+ return model.trees[tree];
106
+ }
107
+
108
+ static inline const std::vector<IsoHPlane>& get_tree(const ExtIsoForest &model, size_t tree)
109
+ {
110
+ return model.hplanes[tree];
111
+ }
112
+
113
+ template <class Model>
114
+ void build_terminal_node_mappings(TreesIndexer &indexer, const Model &model)
115
+ {
116
+ indexer.indices.resize(get_ntrees(model));
117
+ indexer.indices.shrink_to_fit();
118
+
119
+ if (!indexer.indices.empty() && !indexer.indices.front().reference_points.empty())
120
+ {
121
+ for (auto &ind : indexer.indices)
122
+ {
123
+ ind.reference_points.clear();
124
+ ind.reference_indptr.clear();
125
+ ind.reference_mapping.clear();
126
+ }
127
+ }
128
+
129
+ for (size_t tree = 0; tree < indexer.indices.size(); tree++)
130
+ {
131
+ build_terminal_node_mappings_single_tree(indexer.indices[tree].terminal_node_mappings,
132
+ indexer.indices[tree].n_terminal,
133
+ get_tree(model, tree));
134
+ }
135
+ }
136
+
137
+ static inline size_t get_idx_tree_left(const IsoTree &node)
138
+ {
139
+ return node.tree_left;
140
+ }
141
+
142
+ static inline size_t get_idx_tree_left(const IsoHPlane &node)
143
+ {
144
+ return node.hplane_left;
145
+ }
146
+
147
+ static inline size_t get_idx_tree_right(const IsoTree &node)
148
+ {
149
+ return node.tree_right;
150
+ }
151
+
152
+ static inline size_t get_idx_tree_right(const IsoHPlane &node)
153
+ {
154
+ return node.hplane_right;
155
+ }
156
+
157
+ template <class Node>
158
+ void build_dindex_recursive
159
+ (
160
+ const size_t curr_node,
161
+ const size_t n_terminal, const size_t ncomb,
162
+ const size_t st, const size_t end,
163
+ std::vector<size_t> &restrict node_indices, /* array with all terminal indices in 'tree' */
164
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
165
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
166
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
167
+ size_t curr_depth,
168
+ const std::vector<Node> &tree
169
+ )
170
+ {
171
+ if (end > st)
172
+ {
173
+ size_t i, j;
174
+ for (size_t el1 = st; el1 < end; el1++)
175
+ {
176
+ for (size_t el2 = el1 + 1; el2 <= end; el2++)
177
+ {
178
+ i = node_mappings[node_indices[el1]];
179
+ j = node_mappings[node_indices[el2]];
180
+ node_distances[ix_comb(i, j, n_terminal, ncomb)]++;
181
+ }
182
+ }
183
+ }
184
+
185
+ if (!is_terminal_node(tree[curr_node]))
186
+ {
187
+ const size_t delim = get_idx_tree_right(tree[curr_node]);
188
+ size_t frontier = st;
189
+ size_t temp;
190
+ for (size_t ix = st; ix <= end; ix++)
191
+ {
192
+ if (node_indices[ix] < delim)
193
+ {
194
+ temp = node_indices[frontier];
195
+ node_indices[frontier] = node_indices[ix];
196
+ node_indices[ix] = temp;
197
+ frontier++;
198
+ }
199
+ }
200
+
201
+ if (unlikely(frontier == st)) unexpected_error();
202
+
203
+ curr_depth++;
204
+ build_dindex_recursive<Node>(get_idx_tree_left(tree[curr_node]),
205
+ n_terminal, ncomb,
206
+ st, frontier-1,
207
+ node_indices,
208
+ node_mappings,
209
+ node_distances,
210
+ node_depths,
211
+ curr_depth,
212
+ tree);
213
+ build_dindex_recursive<Node>(get_idx_tree_right(tree[curr_node]),
214
+ n_terminal, ncomb,
215
+ frontier, end,
216
+ node_indices,
217
+ node_mappings,
218
+ node_distances,
219
+ node_depths,
220
+ curr_depth,
221
+ tree);
222
+ }
223
+
224
+ else
225
+ {
226
+ node_depths[node_mappings[curr_node]] = curr_depth;
227
+ }
228
+ }
229
+
230
+ template <class Node>
231
+ void build_dindex
232
+ (
233
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
234
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
235
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
236
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
237
+ const size_t n_terminal,
238
+ const std::vector<Node> &tree
239
+ )
240
+ {
241
+ if (tree.size() <= 1) return;
242
+
243
+ std::fill(node_distances.begin(), node_distances.end(), 0.);
244
+
245
+ node_indices.clear();
246
+ for (size_t node = 0; node < tree.size(); node++)
247
+ {
248
+ if (is_terminal_node(tree[node]))
249
+ node_indices.push_back(node);
250
+ }
251
+
252
+ node_depths.resize(n_terminal);
253
+
254
+ build_dindex_recursive<Node>(
255
+ (size_t)0,
256
+ node_indices.size(), calc_ncomb(node_indices.size()),
257
+ 0, node_indices.size()-1,
258
+ node_indices,
259
+ node_mappings,
260
+ node_distances,
261
+ node_depths,
262
+ (size_t)0,
263
+ tree
264
+ );
265
+ }
266
+
267
+ void build_dindex
268
+ (
269
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
270
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
271
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
272
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
273
+ const size_t n_terminal,
274
+ const std::vector<IsoTree> &tree
275
+ )
276
+ {
277
+ build_dindex<IsoTree>(
278
+ node_indices,
279
+ node_mappings,
280
+ node_distances,
281
+ node_depths,
282
+ n_terminal,
283
+ tree
284
+ );
285
+ }
286
+
287
+ void build_dindex
288
+ (
289
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
290
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
291
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
292
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
293
+ const size_t n_terminal,
294
+ const std::vector<IsoHPlane> &tree
295
+ )
296
+ {
297
+ build_dindex<IsoHPlane>(
298
+ node_indices,
299
+ node_mappings,
300
+ node_distances,
301
+ node_depths,
302
+ n_terminal,
303
+ tree
304
+ );
305
+ }
306
+
307
+ template <class Model>
308
+ void build_distance_mappings(TreesIndexer &indexer, const Model &model, int nthreads)
309
+ {
310
+ build_terminal_node_mappings(indexer, model);
311
+ SignalSwitcher ss = SignalSwitcher();
312
+
313
+ size_t ntrees = get_ntrees(model);
314
+ std::vector<size_t> n_terminal(ntrees);
315
+ for (size_t tree = 0; tree < ntrees; tree++)
316
+ n_terminal[tree] = indexer.indices[tree].n_terminal;
317
+
318
+ size_t max_n_terminal = *std::max_element(n_terminal.begin(), n_terminal.end());
319
+ check_interrupt_switch(ss);
320
+ if (max_n_terminal <= 1) return;
321
+
322
+ #ifndef _OPENMP
323
+ nthreads = 1;
324
+ #endif
325
+ std::vector<std::vector<size_t>> thread_buffer_indices(nthreads);
326
+ for (std::vector<size_t> &v : thread_buffer_indices)
327
+ v.reserve(max_n_terminal);
328
+ check_interrupt_switch(ss);
329
+
330
+
331
+
332
+ bool threw_exception = false;
333
+ std::exception_ptr ex = NULL;
334
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(indexer, model, n_terminal, threw_exception, ex)
335
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
336
+ {
337
+ if (interrupt_switch || threw_exception) continue;
338
+
339
+ try
340
+ {
341
+ size_t n_terminal_this = n_terminal[tree];
342
+ size_t ncomb = calc_ncomb(n_terminal_this);
343
+ indexer.indices[tree].node_distances.assign(ncomb, 0.);
344
+ indexer.indices[tree].node_distances.shrink_to_fit();
345
+ build_dindex(
346
+ thread_buffer_indices[omp_get_thread_num()],
347
+ indexer.indices[tree].terminal_node_mappings,
348
+ indexer.indices[tree].node_distances,
349
+ indexer.indices[tree].node_depths,
350
+ n_terminal_this,
351
+ get_tree(model, tree)
352
+ );
353
+ }
354
+
355
+ catch (...)
356
+ {
357
+ #pragma omp critical
358
+ {
359
+ if (!threw_exception)
360
+ {
361
+ threw_exception = true;
362
+ ex = std::current_exception();
363
+ }
364
+ }
365
+ }
366
+ }
367
+
368
+ if (interrupt_switch || threw_exception)
369
+ {
370
+ indexer.indices.clear();
371
+ }
372
+
373
+ check_interrupt_switch(ss);
374
+ if (threw_exception) std::rethrow_exception(ex);
375
+ }
376
+
377
+ template <class Model>
378
+ void build_tree_indices(TreesIndexer &indexer, const Model &model, int nthreads, const bool with_distances)
379
+ {
380
+ if (!indexer.indices.empty() && !indexer.indices.front().reference_points.empty())
381
+ {
382
+ for (auto &ind : indexer.indices)
383
+ {
384
+ ind.reference_points.clear();
385
+ ind.reference_indptr.clear();
386
+ ind.reference_mapping.clear();
387
+ }
388
+ }
389
+
390
+
391
+ try
392
+ {
393
+ if (with_distances) {
394
+ build_distance_mappings(indexer, model, nthreads);
395
+ }
396
+
397
+ else {
398
+ if (!indexer.indices.empty() && !indexer.indices.front().node_distances.empty())
399
+ {
400
+ for (auto &ind : indexer.indices)
401
+ {
402
+ ind.node_distances.clear();
403
+ ind.node_depths.clear();
404
+ }
405
+ }
406
+
407
+ build_terminal_node_mappings(indexer, model);
408
+ }
409
+ }
410
+
411
+ catch (...)
412
+ {
413
+ indexer.indices.clear();
414
+ throw;
415
+ }
416
+ }
417
+
418
+ void build_tree_indices(TreesIndexer &indexer, const IsoForest &model, int nthreads, const bool with_distances)
419
+ {
420
+ if (model.trees.empty())
421
+ throw std::runtime_error("Cannot build indexed for unfitted model.\n");
422
+ if (model.missing_action == Divide)
423
+ throw std::runtime_error("Cannot build tree indexer with 'missing_action=Divide'.\n");
424
+ if (model.new_cat_action == Weighted && model.cat_split_type == SubSet)
425
+ {
426
+ for (const std::vector<IsoTree> &tree : model.trees)
427
+ {
428
+ for (const IsoTree &node : tree)
429
+ {
430
+ if (!is_terminal_node(node) && node.col_type == Categorical)
431
+ throw std::runtime_error("Cannot build tree indexer with 'new_cat_action=Weighted'.\n");
432
+ }
433
+ }
434
+ }
435
+
436
+ build_tree_indices<IsoForest>(indexer, model, nthreads, with_distances);
437
+ }
438
+
439
+ void build_tree_indices(TreesIndexer &indexer, const ExtIsoForest &model, int nthreads, const bool with_distances)
440
+ {
441
+ if (model.hplanes.empty())
442
+ throw std::runtime_error("Cannot build indexed for unfitted model.\n");
443
+ build_tree_indices<ExtIsoForest>(indexer, model, nthreads, with_distances);
444
+ }
445
+
446
+ /* Build indexer for faster terminal node predictions and/or distance calculations
447
+ *
448
+ * Parameters
449
+ * ==========
450
+ * - indexer
451
+ * Pointer or reference to an indexer object which will be associated to a fitted model and in
452
+ * which indices for terminal nodes and potentially node distances will be stored.
453
+ * - model / model_outputs / model_outputs_ext
454
+ * Pointer or reference to a fitted model object for which an indexer will be built.
455
+ * - nthreads
456
+ * Number of parallel threads to use. This operation will only be multi-threaded when passing
457
+ * 'with_distances=true'.
458
+ * - with_distances
459
+ * Whether to also pre-calculate node distances in order to speed up 'calc_similarity' (distances).
460
+ * Note that this will consume a lot more memory and make the resulting object significantly
461
+ * heavier.
462
+ */
463
+ void build_tree_indices
464
+ (
465
+ TreesIndexer *indexer,
466
+ const IsoForest *model_outputs,
467
+ const ExtIsoForest *model_outputs_ext,
468
+ int nthreads,
469
+ const bool with_distances
470
+ )
471
+ {
472
+ if (model_outputs != NULL)
473
+ build_tree_indices(*indexer, *model_outputs, nthreads, with_distances);
474
+ else
475
+ build_tree_indices(*indexer, *model_outputs_ext, nthreads, with_distances);
476
+ }
477
+
478
+ /* Gets the number of reference points stored in an indexer object */
479
+ size_t get_number_of_reference_points(const TreesIndexer &indexer) noexcept
480
+ {
481
+ if (indexer.indices.empty()) return 0;
482
+ return indexer.indices.front().reference_points.size();
483
+ }
484
+
485
+ /* This assumes it already has the indexer and 'reference_points' were just added.
486
+ It builds up 'reference_mapping' and 'reference_indptr' from it. */
487
+ void build_ref_node(SingleTreeIndex &node)
488
+ {
489
+ node.reference_mapping.resize(node.reference_points.size());
490
+ node.reference_mapping.shrink_to_fit();
491
+ std::iota(node.reference_mapping.begin(), node.reference_mapping.end(), (size_t)0);
492
+ std::sort(node.reference_mapping.begin(), node.reference_mapping.end(),
493
+ [&node](const size_t a, const size_t b)
494
+ {return node.reference_points[a] < node.reference_points[b];});
495
+
496
+ size_t n_terminal = node.n_terminal;
497
+ node.reference_indptr.assign(n_terminal+1, (size_t)0);
498
+ node.reference_indptr.shrink_to_fit();
499
+
500
+ std::vector<size_t>::iterator curr_begin = node.reference_mapping.begin();
501
+ std::vector<size_t>::iterator new_begin;
502
+ size_t curr_node;
503
+ while (curr_begin != node.reference_mapping.end())
504
+ {
505
+ curr_node = node.reference_points[*curr_begin];
506
+ new_begin = std::upper_bound(curr_begin, node.reference_mapping.end(), curr_node,
507
+ [&node](const size_t a, const size_t b)
508
+ {return a < node.reference_points[b];});
509
+ node.reference_indptr[curr_node+1] = std::distance(curr_begin, new_begin);
510
+ curr_begin = new_begin;
511
+ }
512
+
513
+ for (size_t ix = 1; ix < n_terminal; ix++)
514
+ node.reference_indptr[ix+1] += node.reference_indptr[ix];
515
+ }
@@ -0,0 +1,118 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+
64
+ /* Note: the R and Python versions call the 'sort_csc_indices' templated function,
65
+ so it's not enough to just include 'isotree_exportable.hpp' under them and let
66
+ this same file instantiate all supported templated types.
67
+ Also, Cython makes it hard to use overloaded functions since they have to
68
+ be declared multiple times. */
69
+
70
+ #if !defined(_FOR_R) && !defined(_FOR_PYTHON)
71
+
72
+ #include "headers_joined.hpp"
73
+
74
+ #define real_t double
75
+ #define sparse_ix int
76
+ #include "instantiate_template_headers.hpp"
77
+ #undef real_t
78
+ #undef sparse_ix
79
+
80
+ #ifndef NO_TEMPLATED_VERSIONS
81
+
82
+ #define real_t double
83
+ #define sparse_ix int64_t
84
+ #include "instantiate_template_headers.hpp"
85
+ #undef real_t
86
+ #undef sparse_ix
87
+
88
+ #define real_t double
89
+ #define sparse_ix size_t
90
+ #include "instantiate_template_headers.hpp"
91
+ #undef real_t
92
+ #undef sparse_ix
93
+
94
+ #define _NO_REAL_T
95
+
96
+ #define real_t float
97
+ #define sparse_ix int
98
+ #include "instantiate_template_headers.hpp"
99
+ #undef real_t
100
+ #undef sparse_ix
101
+
102
+ #define real_t float
103
+ #define sparse_ix int64_t
104
+ #include "instantiate_template_headers.hpp"
105
+ #undef real_t
106
+ #undef sparse_ix
107
+
108
+ #define real_t float
109
+ #define sparse_ix size_t
110
+ #include "instantiate_template_headers.hpp"
111
+ #undef real_t
112
+ #undef sparse_ix
113
+
114
+ #undef _NO_REAL_T
115
+
116
+ #endif /* NO_TEMPLATED_VERSIONS */
117
+
118
+ #endif