isotree 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -0,0 +1,515 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+ #include "isotree.hpp"
64
+
65
+ static inline bool is_terminal_node(const IsoTree &node)
66
+ {
67
+ return node.tree_left == 0;
68
+ }
69
+
70
+ static inline bool is_terminal_node(const IsoHPlane &node)
71
+ {
72
+ return node.hplane_left == 0;
73
+ }
74
+
75
/* Assigns consecutive zero-based ids to the terminal nodes of one tree.
 *
 * On return, 'mappings' has one entry per node in 'tree': for a terminal node it
 * holds that node's terminal id (numbered in the order in which terminal nodes
 * appear in 'tree'), and for any other node it holds zero. 'n_terminal' receives
 * the number of terminal nodes found. */
template <class Tree>
void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<Tree> &tree)
{
    const size_t n_nodes = tree.size();
    mappings.assign(n_nodes, (size_t)0);
    mappings.shrink_to_fit();

    size_t next_id = 0;
    for (size_t ix = 0; ix < n_nodes; ix++)
    {
        if (!is_terminal_node(tree[ix])) continue;
        mappings[ix] = next_id;
        next_id++;
    }
    n_terminal = next_id;
}
92
+
93
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoTree> &tree)
94
+ {
95
+ build_terminal_node_mappings_single_tree<IsoTree>(mappings, n_terminal, tree);
96
+ }
97
+
98
+ void build_terminal_node_mappings_single_tree(std::vector<size_t> &mappings, size_t &n_terminal, const std::vector<IsoHPlane> &tree)
99
+ {
100
+ build_terminal_node_mappings_single_tree<IsoHPlane>(mappings, n_terminal, tree);
101
+ }
102
+
103
+ static inline const std::vector<IsoTree>& get_tree(const IsoForest &model, size_t tree)
104
+ {
105
+ return model.trees[tree];
106
+ }
107
+
108
+ static inline const std::vector<IsoHPlane>& get_tree(const ExtIsoForest &model, size_t tree)
109
+ {
110
+ return model.hplanes[tree];
111
+ }
112
+
113
+ template <class Model>
114
+ void build_terminal_node_mappings(TreesIndexer &indexer, const Model &model)
115
+ {
116
+ indexer.indices.resize(get_ntrees(model));
117
+ indexer.indices.shrink_to_fit();
118
+
119
+ if (!indexer.indices.empty() && !indexer.indices.front().reference_points.empty())
120
+ {
121
+ for (auto &ind : indexer.indices)
122
+ {
123
+ ind.reference_points.clear();
124
+ ind.reference_indptr.clear();
125
+ ind.reference_mapping.clear();
126
+ }
127
+ }
128
+
129
+ for (size_t tree = 0; tree < indexer.indices.size(); tree++)
130
+ {
131
+ build_terminal_node_mappings_single_tree(indexer.indices[tree].terminal_node_mappings,
132
+ indexer.indices[tree].n_terminal,
133
+ get_tree(model, tree));
134
+ }
135
+ }
136
+
137
+ static inline size_t get_idx_tree_left(const IsoTree &node)
138
+ {
139
+ return node.tree_left;
140
+ }
141
+
142
+ static inline size_t get_idx_tree_left(const IsoHPlane &node)
143
+ {
144
+ return node.hplane_left;
145
+ }
146
+
147
+ static inline size_t get_idx_tree_right(const IsoTree &node)
148
+ {
149
+ return node.tree_right;
150
+ }
151
+
152
+ static inline size_t get_idx_tree_right(const IsoHPlane &node)
153
+ {
154
+ return node.hplane_right;
155
+ }
156
+
157
+ template <class Node>
158
+ void build_dindex_recursive
159
+ (
160
+ const size_t curr_node,
161
+ const size_t n_terminal, const size_t ncomb,
162
+ const size_t st, const size_t end,
163
+ std::vector<size_t> &restrict node_indices, /* array with all terminal indices in 'tree' */
164
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
165
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
166
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
167
+ size_t curr_depth,
168
+ const std::vector<Node> &tree
169
+ )
170
+ {
171
+ if (end > st)
172
+ {
173
+ size_t i, j;
174
+ for (size_t el1 = st; el1 < end; el1++)
175
+ {
176
+ for (size_t el2 = el1 + 1; el2 <= end; el2++)
177
+ {
178
+ i = node_mappings[node_indices[el1]];
179
+ j = node_mappings[node_indices[el2]];
180
+ node_distances[ix_comb(i, j, n_terminal, ncomb)]++;
181
+ }
182
+ }
183
+ }
184
+
185
+ if (!is_terminal_node(tree[curr_node]))
186
+ {
187
+ const size_t delim = get_idx_tree_right(tree[curr_node]);
188
+ size_t frontier = st;
189
+ size_t temp;
190
+ for (size_t ix = st; ix <= end; ix++)
191
+ {
192
+ if (node_indices[ix] < delim)
193
+ {
194
+ temp = node_indices[frontier];
195
+ node_indices[frontier] = node_indices[ix];
196
+ node_indices[ix] = temp;
197
+ frontier++;
198
+ }
199
+ }
200
+
201
+ if (unlikely(frontier == st)) unexpected_error();
202
+
203
+ curr_depth++;
204
+ build_dindex_recursive<Node>(get_idx_tree_left(tree[curr_node]),
205
+ n_terminal, ncomb,
206
+ st, frontier-1,
207
+ node_indices,
208
+ node_mappings,
209
+ node_distances,
210
+ node_depths,
211
+ curr_depth,
212
+ tree);
213
+ build_dindex_recursive<Node>(get_idx_tree_right(tree[curr_node]),
214
+ n_terminal, ncomb,
215
+ frontier, end,
216
+ node_indices,
217
+ node_mappings,
218
+ node_distances,
219
+ node_depths,
220
+ curr_depth,
221
+ tree);
222
+ }
223
+
224
+ else
225
+ {
226
+ node_depths[node_mappings[curr_node]] = curr_depth;
227
+ }
228
+ }
229
+
230
+ template <class Node>
231
+ void build_dindex
232
+ (
233
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
234
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
235
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
236
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
237
+ const size_t n_terminal,
238
+ const std::vector<Node> &tree
239
+ )
240
+ {
241
+ if (tree.size() <= 1) return;
242
+
243
+ std::fill(node_distances.begin(), node_distances.end(), 0.);
244
+
245
+ node_indices.clear();
246
+ for (size_t node = 0; node < tree.size(); node++)
247
+ {
248
+ if (is_terminal_node(tree[node]))
249
+ node_indices.push_back(node);
250
+ }
251
+
252
+ node_depths.resize(n_terminal);
253
+
254
+ build_dindex_recursive<Node>(
255
+ (size_t)0,
256
+ node_indices.size(), calc_ncomb(node_indices.size()),
257
+ 0, node_indices.size()-1,
258
+ node_indices,
259
+ node_mappings,
260
+ node_distances,
261
+ node_depths,
262
+ (size_t)0,
263
+ tree
264
+ );
265
+ }
266
+
267
+ void build_dindex
268
+ (
269
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
270
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
271
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
272
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
273
+ const size_t n_terminal,
274
+ const std::vector<IsoTree> &tree
275
+ )
276
+ {
277
+ build_dindex<IsoTree>(
278
+ node_indices,
279
+ node_mappings,
280
+ node_distances,
281
+ node_depths,
282
+ n_terminal,
283
+ tree
284
+ );
285
+ }
286
+
287
+ void build_dindex
288
+ (
289
+ std::vector<size_t> &restrict node_indices, /* empty, but correctly sized */
290
+ const std::vector<size_t> &restrict node_mappings, /* tree_index : terminal_index */
291
+ std::vector<double> &restrict node_distances, /* indexed by terminal_index */
292
+ std::vector<double> &restrict node_depths, /* indexed by terminal_index */
293
+ const size_t n_terminal,
294
+ const std::vector<IsoHPlane> &tree
295
+ )
296
+ {
297
+ build_dindex<IsoHPlane>(
298
+ node_indices,
299
+ node_mappings,
300
+ node_distances,
301
+ node_depths,
302
+ n_terminal,
303
+ tree
304
+ );
305
+ }
306
+
307
+ template <class Model>
308
+ void build_distance_mappings(TreesIndexer &indexer, const Model &model, int nthreads)
309
+ {
310
+ build_terminal_node_mappings(indexer, model);
311
+ SignalSwitcher ss = SignalSwitcher();
312
+
313
+ size_t ntrees = get_ntrees(model);
314
+ std::vector<size_t> n_terminal(ntrees);
315
+ for (size_t tree = 0; tree < ntrees; tree++)
316
+ n_terminal[tree] = indexer.indices[tree].n_terminal;
317
+
318
+ size_t max_n_terminal = *std::max_element(n_terminal.begin(), n_terminal.end());
319
+ check_interrupt_switch(ss);
320
+ if (max_n_terminal <= 1) return;
321
+
322
+ #ifndef _OPENMP
323
+ nthreads = 1;
324
+ #endif
325
+ std::vector<std::vector<size_t>> thread_buffer_indices(nthreads);
326
+ for (std::vector<size_t> &v : thread_buffer_indices)
327
+ v.reserve(max_n_terminal);
328
+ check_interrupt_switch(ss);
329
+
330
+
331
+
332
+ bool threw_exception = false;
333
+ std::exception_ptr ex = NULL;
334
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(indexer, model, n_terminal, threw_exception, ex)
335
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
336
+ {
337
+ if (interrupt_switch || threw_exception) continue;
338
+
339
+ try
340
+ {
341
+ size_t n_terminal_this = n_terminal[tree];
342
+ size_t ncomb = calc_ncomb(n_terminal_this);
343
+ indexer.indices[tree].node_distances.assign(ncomb, 0.);
344
+ indexer.indices[tree].node_distances.shrink_to_fit();
345
+ build_dindex(
346
+ thread_buffer_indices[omp_get_thread_num()],
347
+ indexer.indices[tree].terminal_node_mappings,
348
+ indexer.indices[tree].node_distances,
349
+ indexer.indices[tree].node_depths,
350
+ n_terminal_this,
351
+ get_tree(model, tree)
352
+ );
353
+ }
354
+
355
+ catch (...)
356
+ {
357
+ #pragma omp critical
358
+ {
359
+ if (!threw_exception)
360
+ {
361
+ threw_exception = true;
362
+ ex = std::current_exception();
363
+ }
364
+ }
365
+ }
366
+ }
367
+
368
+ if (interrupt_switch || threw_exception)
369
+ {
370
+ indexer.indices.clear();
371
+ }
372
+
373
+ check_interrupt_switch(ss);
374
+ if (threw_exception) std::rethrow_exception(ex);
375
+ }
376
+
377
+ template <class Model>
378
+ void build_tree_indices(TreesIndexer &indexer, const Model &model, int nthreads, const bool with_distances)
379
+ {
380
+ if (!indexer.indices.empty() && !indexer.indices.front().reference_points.empty())
381
+ {
382
+ for (auto &ind : indexer.indices)
383
+ {
384
+ ind.reference_points.clear();
385
+ ind.reference_indptr.clear();
386
+ ind.reference_mapping.clear();
387
+ }
388
+ }
389
+
390
+
391
+ try
392
+ {
393
+ if (with_distances) {
394
+ build_distance_mappings(indexer, model, nthreads);
395
+ }
396
+
397
+ else {
398
+ if (!indexer.indices.empty() && !indexer.indices.front().node_distances.empty())
399
+ {
400
+ for (auto &ind : indexer.indices)
401
+ {
402
+ ind.node_distances.clear();
403
+ ind.node_depths.clear();
404
+ }
405
+ }
406
+
407
+ build_terminal_node_mappings(indexer, model);
408
+ }
409
+ }
410
+
411
+ catch (...)
412
+ {
413
+ indexer.indices.clear();
414
+ throw;
415
+ }
416
+ }
417
+
418
+ void build_tree_indices(TreesIndexer &indexer, const IsoForest &model, int nthreads, const bool with_distances)
419
+ {
420
+ if (model.trees.empty())
421
+ throw std::runtime_error("Cannot build indexed for unfitted model.\n");
422
+ if (model.missing_action == Divide)
423
+ throw std::runtime_error("Cannot build tree indexer with 'missing_action=Divide'.\n");
424
+ if (model.new_cat_action == Weighted && model.cat_split_type == SubSet)
425
+ {
426
+ for (const std::vector<IsoTree> &tree : model.trees)
427
+ {
428
+ for (const IsoTree &node : tree)
429
+ {
430
+ if (!is_terminal_node(node) && node.col_type == Categorical)
431
+ throw std::runtime_error("Cannot build tree indexer with 'new_cat_action=Weighted'.\n");
432
+ }
433
+ }
434
+ }
435
+
436
+ build_tree_indices<IsoForest>(indexer, model, nthreads, with_distances);
437
+ }
438
+
439
+ void build_tree_indices(TreesIndexer &indexer, const ExtIsoForest &model, int nthreads, const bool with_distances)
440
+ {
441
+ if (model.hplanes.empty())
442
+ throw std::runtime_error("Cannot build indexed for unfitted model.\n");
443
+ build_tree_indices<ExtIsoForest>(indexer, model, nthreads, with_distances);
444
+ }
445
+
446
+ /* Build indexer for faster terminal node predictions and/or distance calculations
447
+ *
448
+ * Parameters
449
+ * ==========
450
+ * - indexer
451
+ * Pointer or reference to an indexer object which will be associated to a fitted model and in
452
+ * which indices for terminal nodes and potentially node distances will be stored.
453
+ * - model / model_outputs / model_outputs_ext
454
+ * Pointer or reference to a fitted model object for which an indexer will be built.
455
+ * - nthreads
456
+ * Number of parallel threads to use. This operation will only be multi-threaded when passing
457
+ * 'with_distances=true'.
458
+ * - with_distances
459
+ * Whether to also pre-calculate node distances in order to speed up 'calc_similarity' (distances).
460
+ * Note that this will consume a lot more memory and make the resulting object significantly
461
+ * heavier.
462
+ */
463
+ void build_tree_indices
464
+ (
465
+ TreesIndexer *indexer,
466
+ const IsoForest *model_outputs,
467
+ const ExtIsoForest *model_outputs_ext,
468
+ int nthreads,
469
+ const bool with_distances
470
+ )
471
+ {
472
+ if (model_outputs != NULL)
473
+ build_tree_indices(*indexer, *model_outputs, nthreads, with_distances);
474
+ else
475
+ build_tree_indices(*indexer, *model_outputs_ext, nthreads, with_distances);
476
+ }
477
+
478
+ /* Gets the number of reference points stored in an indexer object */
479
+ size_t get_number_of_reference_points(const TreesIndexer &indexer) noexcept
480
+ {
481
+ if (indexer.indices.empty()) return 0;
482
+ return indexer.indices.front().reference_points.size();
483
+ }
484
+
485
+ /* This assumes it already has the indexer and 'reference_points' were just added.
486
+ It builds up 'reference_mapping' and 'reference_indptr' from it. */
487
+ void build_ref_node(SingleTreeIndex &node)
488
+ {
489
+ node.reference_mapping.resize(node.reference_points.size());
490
+ node.reference_mapping.shrink_to_fit();
491
+ std::iota(node.reference_mapping.begin(), node.reference_mapping.end(), (size_t)0);
492
+ std::sort(node.reference_mapping.begin(), node.reference_mapping.end(),
493
+ [&node](const size_t a, const size_t b)
494
+ {return node.reference_points[a] < node.reference_points[b];});
495
+
496
+ size_t n_terminal = node.n_terminal;
497
+ node.reference_indptr.assign(n_terminal+1, (size_t)0);
498
+ node.reference_indptr.shrink_to_fit();
499
+
500
+ std::vector<size_t>::iterator curr_begin = node.reference_mapping.begin();
501
+ std::vector<size_t>::iterator new_begin;
502
+ size_t curr_node;
503
+ while (curr_begin != node.reference_mapping.end())
504
+ {
505
+ curr_node = node.reference_points[*curr_begin];
506
+ new_begin = std::upper_bound(curr_begin, node.reference_mapping.end(), curr_node,
507
+ [&node](const size_t a, const size_t b)
508
+ {return a < node.reference_points[b];});
509
+ node.reference_indptr[curr_node+1] = std::distance(curr_begin, new_begin);
510
+ curr_begin = new_begin;
511
+ }
512
+
513
+ for (size_t ix = 1; ix < n_terminal; ix++)
514
+ node.reference_indptr[ix+1] += node.reference_indptr[ix];
515
+ }
@@ -0,0 +1,118 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+
64
+ /* Note: the R and Python versions call the 'sort_csc_indices' templated function,
65
+ so it's not enough to just include 'isotree_exportable.hpp' under them and let
66
+ this same file instantiate all supported templated types.
67
+ Also, Cython makes it hard to use overloaded functions since they have to
68
+ be declared multiple times. */
69
+
70
+ #if !defined(_FOR_R) && !defined(_FOR_PYTHON)
71
+
72
+ #include "headers_joined.hpp"
73
+
74
+ #define real_t double
75
+ #define sparse_ix int
76
+ #include "instantiate_template_headers.hpp"
77
+ #undef real_t
78
+ #undef sparse_ix
79
+
80
+ #ifndef NO_TEMPLATED_VERSIONS
81
+
82
+ #define real_t double
83
+ #define sparse_ix int64_t
84
+ #include "instantiate_template_headers.hpp"
85
+ #undef real_t
86
+ #undef sparse_ix
87
+
88
+ #define real_t double
89
+ #define sparse_ix size_t
90
+ #include "instantiate_template_headers.hpp"
91
+ #undef real_t
92
+ #undef sparse_ix
93
+
94
+ #define _NO_REAL_T
95
+
96
+ #define real_t float
97
+ #define sparse_ix int
98
+ #include "instantiate_template_headers.hpp"
99
+ #undef real_t
100
+ #undef sparse_ix
101
+
102
+ #define real_t float
103
+ #define sparse_ix int64_t
104
+ #include "instantiate_template_headers.hpp"
105
+ #undef real_t
106
+ #undef sparse_ix
107
+
108
+ #define real_t float
109
+ #define sparse_ix size_t
110
+ #include "instantiate_template_headers.hpp"
111
+ #undef real_t
112
+ #undef sparse_ix
113
+
114
+ #undef _NO_REAL_T
115
+
116
+ #endif /* NO_TEMPLATED_VERSIONS */
117
+
118
+ #endif