isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
data/vendor/isotree/src/sql.cpp
CHANGED
|
@@ -18,11 +18,29 @@
|
|
|
18
18
|
* [5] https://sourceforge.net/projects/iforest/
|
|
19
19
|
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
20
|
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
|
21
|
-
* [8] Cortes, David.
|
|
22
|
-
*
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
23
41
|
*
|
|
24
42
|
* BSD 2-Clause License
|
|
25
|
-
* Copyright (c)
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
26
44
|
* All rights reserved.
|
|
27
45
|
* Redistribution and use in source and binary forms, with or without
|
|
28
46
|
* modification, are permitted provided that the following conditions are met:
|
|
@@ -93,7 +111,26 @@ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest
|
|
|
93
111
|
categ_levels,
|
|
94
112
|
false, index1, false, 0,
|
|
95
113
|
nthreads);
|
|
96
|
-
|
|
114
|
+
bool is_density = (model_outputs != NULL && model_outputs->scoring_metric == Density) ||
|
|
115
|
+
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == Density);
|
|
116
|
+
bool is_bdens = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) ||
|
|
117
|
+
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity);
|
|
118
|
+
bool is_bdens2 = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity2) ||
|
|
119
|
+
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity2);
|
|
120
|
+
bool is_bratio = (model_outputs != NULL && model_outputs->scoring_metric == BoxedRatio) ||
|
|
121
|
+
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedRatio);
|
|
122
|
+
is_density = is_density || is_bdens2;
|
|
123
|
+
std::string out = std::accumulate(tree_conds.begin(), tree_conds.end(),
|
|
124
|
+
is_density?
|
|
125
|
+
std::string("SELECT\n(-(0.0")
|
|
126
|
+
:
|
|
127
|
+
(is_bdens?
|
|
128
|
+
std::string("SELECT\n((0.0")
|
|
129
|
+
:
|
|
130
|
+
(is_bratio?
|
|
131
|
+
std::string("SELECT\n((0.0")
|
|
132
|
+
:
|
|
133
|
+
std::string("SELECT\nPOWER(2.0, -(0.0"))),
|
|
97
134
|
[&tree_conds, &index1](std::string &a, std::string &b)
|
|
98
135
|
{return a
|
|
99
136
|
+ std::string(" + \n---BEGIN TREE ")
|
|
@@ -104,11 +141,11 @@ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest
|
|
|
104
141
|
+ std::to_string((size_t)std::distance(tree_conds.data(), &b) + (size_t)index1)
|
|
105
142
|
+ std::string("---\n");});
|
|
106
143
|
size_t ntrees = (model_outputs != NULL)? (model_outputs->trees.size()) : (model_outputs_ext->hplanes.size());
|
|
107
|
-
|
|
144
|
+
return
|
|
108
145
|
out
|
|
109
146
|
+ std::string(") / ")
|
|
110
|
-
+ std::to_string((
|
|
111
|
-
|
|
147
|
+
+ std::to_string((double)ntrees * ((model_outputs != NULL)?
|
|
148
|
+
(model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth)))
|
|
112
149
|
+ std::string(") AS ")
|
|
113
150
|
+ select_as
|
|
114
151
|
+ std::string("\nFROM ")
|
|
@@ -174,13 +211,13 @@ std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *mo
|
|
|
174
211
|
size_t_for loop_end = ntrees_use;
|
|
175
212
|
if (single_tree)
|
|
176
213
|
{
|
|
177
|
-
loop_st = tree_num;
|
|
214
|
+
loop_st = tree_num - index1;
|
|
178
215
|
loop_end = loop_st + 1;
|
|
179
216
|
}
|
|
180
217
|
|
|
181
218
|
/* determine maximum number of nodes in a tree */
|
|
182
219
|
size_t max_nodes = 0;
|
|
183
|
-
for (size_t tree = loop_st; tree < loop_end; tree++)
|
|
220
|
+
for (size_t tree = loop_st; tree < (size_t)loop_end; tree++)
|
|
184
221
|
max_nodes = std::max(max_nodes,
|
|
185
222
|
(model_outputs != NULL)?
|
|
186
223
|
(model_outputs->trees[tree].size()) : (model_outputs_ext->hplanes[tree].size()));
|
|
@@ -192,80 +229,111 @@ std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *mo
|
|
|
192
229
|
|
|
193
230
|
size_t tree_use;
|
|
194
231
|
|
|
232
|
+
bool threw_exception = false;
|
|
233
|
+
std::exception_ptr ex = NULL;
|
|
234
|
+
|
|
195
235
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
|
196
236
|
shared(model_outputs, model_outputs_ext, numeric_colnames, categ_colnames, categ_levels, \
|
|
197
|
-
loop_st, loop_end, index1, single_tree, all_node_rules, out) \
|
|
237
|
+
loop_st, loop_end, index1, single_tree, all_node_rules, out, ex, threw_exception) \
|
|
198
238
|
firstprivate(conditions_left, conditions_right) private(tree_use)
|
|
199
239
|
for (size_t_for tree = loop_st; tree < loop_end; tree++)
|
|
200
240
|
{
|
|
201
|
-
if (
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node],
|
|
205
|
-
conditions_left[node], conditions_right[node],
|
|
206
|
-
numeric_colnames, categ_colnames,
|
|
207
|
-
categ_levels);
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
else
|
|
241
|
+
if (threw_exception) continue;
|
|
242
|
+
|
|
243
|
+
try
|
|
211
244
|
{
|
|
212
|
-
|
|
213
|
-
|
|
245
|
+
if (model_outputs != NULL)
|
|
246
|
+
{
|
|
247
|
+
for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
|
|
248
|
+
extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node],
|
|
214
249
|
conditions_left[node], conditions_right[node],
|
|
215
250
|
numeric_colnames, categ_colnames,
|
|
216
251
|
categ_levels);
|
|
217
|
-
|
|
252
|
+
}
|
|
218
253
|
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
254
|
+
else
|
|
255
|
+
{
|
|
256
|
+
for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
|
|
257
|
+
extract_cond_ext_isotree(*model_outputs_ext, model_outputs_ext->hplanes[tree][node],
|
|
258
|
+
conditions_left[node], conditions_right[node],
|
|
259
|
+
numeric_colnames, categ_colnames,
|
|
260
|
+
categ_levels);
|
|
261
|
+
}
|
|
226
262
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
263
|
+
generate_tree_rules(
|
|
264
|
+
(model_outputs == NULL)? (NULL) : &(model_outputs->trees[tree]),
|
|
265
|
+
(model_outputs_ext == NULL)? (NULL) : &(model_outputs_ext->hplanes[tree]),
|
|
266
|
+
output_score,
|
|
267
|
+
0, index1, initial_str, all_node_rules[single_tree? 0 : tree],
|
|
268
|
+
conditions_left, conditions_right,
|
|
269
|
+
model_outputs, model_outputs_ext
|
|
270
|
+
);
|
|
231
271
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
272
|
+
/* Code below doesn't compile with MSVC (stuck with an OMP standard that's >20 years old) */
|
|
273
|
+
// if (single_tree)
|
|
274
|
+
// tree = 0;
|
|
275
|
+
tree_use = single_tree? (size_t)0 : tree;
|
|
276
|
+
|
|
277
|
+
if (all_node_rules[tree_use].size() <= 1)
|
|
278
|
+
{
|
|
279
|
+
for (std::string &rule : all_node_rules[tree_use])
|
|
280
|
+
rule = std::string("WHEN TRUE THEN ")
|
|
281
|
+
+ std::to_string((model_outputs != NULL)?
|
|
282
|
+
(model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth))
|
|
283
|
+
+ std::string(" ");
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
out[tree_use] = std::accumulate(all_node_rules[tree_use].begin(), all_node_rules[tree_use].end(),
|
|
287
|
+
std::string("CASE\n"),
|
|
288
|
+
[&all_node_rules, &tree_use, &index1](std::string &a, std::string &b)
|
|
289
|
+
{return a
|
|
290
|
+
+ std::string("---begin terminal node ")
|
|
291
|
+
+ std::to_string((size_t)std::distance(&(all_node_rules[tree_use][0]), &b) + (size_t)index1)
|
|
292
|
+
+ std::string("---\n")
|
|
293
|
+
+ b;})
|
|
294
|
+
+ std::string("END\n");
|
|
295
|
+
all_node_rules[tree_use].clear();
|
|
239
296
|
}
|
|
240
297
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
298
|
+
catch (...)
|
|
299
|
+
{
|
|
300
|
+
#pragma omp critical
|
|
301
|
+
{
|
|
302
|
+
if (!threw_exception)
|
|
303
|
+
{
|
|
304
|
+
threw_exception = true;
|
|
305
|
+
ex = std::current_exception();
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
}
|
|
251
309
|
}
|
|
252
310
|
|
|
311
|
+
if (threw_exception)
|
|
312
|
+
std::rethrow_exception(ex);
|
|
313
|
+
|
|
253
314
|
return out;
|
|
254
315
|
}
|
|
255
316
|
|
|
256
317
|
|
|
257
318
|
void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
|
|
258
319
|
size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
|
|
259
|
-
std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right
|
|
320
|
+
std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right,
|
|
321
|
+
const IsoForest *model_outputs, const ExtIsoForest *model_outputs_ext)
|
|
260
322
|
{
|
|
261
|
-
if ((trees != NULL && (*trees)[curr_ix].score >= 0) ||
|
|
262
|
-
|
|
323
|
+
// if ((trees != NULL && (*trees)[curr_ix].score >= 0) ||
|
|
324
|
+
// (hplanes != NULL && (*hplanes)[curr_ix].score >= 0))
|
|
325
|
+
if ((trees != NULL && (*trees)[curr_ix].tree_left == 0) ||
|
|
326
|
+
(hplanes != NULL && (*hplanes)[curr_ix].hplane_left == 0))
|
|
263
327
|
{
|
|
264
328
|
node_rules.push_back(prev_cond
|
|
265
329
|
+ std::string("\tTHEN ")
|
|
266
330
|
+ (output_score?
|
|
267
331
|
(std::to_string((trees != NULL)?
|
|
268
|
-
((
|
|
332
|
+
((model_outputs->scoring_metric != Density && model_outputs->scoring_metric != BoxedRatio)?
|
|
333
|
+
(*trees)[curr_ix].score : (-(*trees)[curr_ix].score))
|
|
334
|
+
:
|
|
335
|
+
((model_outputs_ext->scoring_metric != Density && model_outputs_ext->scoring_metric != BoxedRatio)?
|
|
336
|
+
(*hplanes)[curr_ix].score : (-(*hplanes)[curr_ix].score))))
|
|
269
337
|
:
|
|
270
338
|
(std::to_string(node_rules.size() + (size_t)index1)))
|
|
271
339
|
+ std::string("\n---end of terminal node ")
|
|
@@ -283,7 +351,7 @@ void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hp
|
|
|
283
351
|
(trees != NULL)?
|
|
284
352
|
((*trees)[curr_ix].tree_left) : ((*hplanes)[curr_ix].hplane_left),
|
|
285
353
|
index1, cond_left, node_rules,
|
|
286
|
-
conditions_left, conditions_right);
|
|
354
|
+
conditions_left, conditions_right, model_outputs, model_outputs_ext);
|
|
287
355
|
cond_left.clear();
|
|
288
356
|
std::string cond_right = prev_cond
|
|
289
357
|
+ ((curr_ix > 0)? std::string("\t\tAND (") : std::string("\t\t ("))
|
|
@@ -293,7 +361,7 @@ void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hp
|
|
|
293
361
|
(trees != NULL)?
|
|
294
362
|
((*trees)[curr_ix].tree_right) : ((*hplanes)[curr_ix].hplane_right),
|
|
295
363
|
index1, cond_right, node_rules,
|
|
296
|
-
conditions_left, conditions_right);
|
|
364
|
+
conditions_left, conditions_right, model_outputs, model_outputs_ext);
|
|
297
365
|
}
|
|
298
366
|
|
|
299
367
|
|
|
@@ -304,7 +372,8 @@ void extract_cond_isotree(IsoForest &model, IsoTree &tree,
|
|
|
304
372
|
{
|
|
305
373
|
cond_left = std::string("");
|
|
306
374
|
cond_right = std::string("");
|
|
307
|
-
if (tree.score >= 0.)
|
|
375
|
+
// if (tree.score >= 0.)
|
|
376
|
+
if (tree.tree_left == 0)
|
|
308
377
|
return;
|
|
309
378
|
|
|
310
379
|
switch(tree.col_type)
|
|
@@ -457,6 +526,12 @@ void extract_cond_isotree(IsoForest &model, IsoTree &tree,
|
|
|
457
526
|
}
|
|
458
527
|
break;
|
|
459
528
|
}
|
|
529
|
+
|
|
530
|
+
default:
|
|
531
|
+
{
|
|
532
|
+
unexpected_error();
|
|
533
|
+
break;
|
|
534
|
+
}
|
|
460
535
|
}
|
|
461
536
|
}
|
|
462
537
|
|
|
@@ -467,7 +542,8 @@ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
|
|
|
467
542
|
{
|
|
468
543
|
cond_left = std::string("");
|
|
469
544
|
cond_right = std::string("");
|
|
470
|
-
if (hplane.score >= 0.)
|
|
545
|
+
// if (hplane.score >= 0.)
|
|
546
|
+
if (hplane.hplane_left == 0)
|
|
471
547
|
return;
|
|
472
548
|
|
|
473
549
|
std::string hplane_conds = std::string("");
|
|
@@ -535,6 +611,12 @@ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
|
|
|
535
611
|
n_visited_categ++;
|
|
536
612
|
break;
|
|
537
613
|
}
|
|
614
|
+
|
|
615
|
+
default:
|
|
616
|
+
{
|
|
617
|
+
unexpected_error();
|
|
618
|
+
break;
|
|
619
|
+
}
|
|
538
620
|
}
|
|
539
621
|
hplane_conds += ((model.missing_action == Impute)?
|
|
540
622
|
(std::string(", ") + std::to_string(hplane.fill_val[ix]) + std::string(")")) : (std::string("")));
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
|
2
|
+
* of categorical variables and missing values.
|
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
|
4
|
+
*
|
|
5
|
+
* This library is based on the following works:
|
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
7
|
+
* "Isolation forest."
|
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
10
|
+
* "Isolation-based anomaly detection."
|
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
|
13
|
+
* "Extended Isolation Forest."
|
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
|
+
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
41
|
+
*
|
|
42
|
+
* BSD 2-Clause License
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
44
|
+
* All rights reserved.
|
|
45
|
+
* Redistribution and use in source and binary forms, with or without
|
|
46
|
+
* modification, are permitted provided that the following conditions are met:
|
|
47
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
|
48
|
+
* list of conditions and the following disclaimer.
|
|
49
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
50
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
51
|
+
* and/or other materials provided with the distribution.
|
|
52
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
53
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
54
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
55
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
56
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
57
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
58
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
59
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
60
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
61
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
62
|
+
*/
|
|
63
|
+
#include "isotree.hpp"
|
|
64
|
+
|
|
65
|
+
/* Create a model containing a sub-set of the trees from another model
|
|
66
|
+
*
|
|
67
|
+
* Parameters
|
|
68
|
+
* ==========
|
|
69
|
+
* - model (in)
|
|
70
|
+
* Pointer to isolation forest model which has already been fit through 'fit_iforest',
|
|
71
|
+
* from which the desired trees will be copied into a new model object.
|
|
72
|
+
* Pass NULL if using the extended model.
|
|
73
|
+
* - ext_model (in)
|
|
74
|
+
* Pointer to extended isolation forest model which has already been fit through 'fit_iforest',
|
|
75
|
+
* from which the desired trees will be copied into a new model object.
|
|
76
|
+
* Pass NULL if using the single-variable model.
|
|
77
|
+
* - imputer (in)
|
|
78
|
+
* Pointer to imputation object which has already been fit through 'fit_iforest' along with
|
|
79
|
+
* either 'model' or 'ext_model' in the same call to 'fit_iforest'.
|
|
80
|
+
* Pass NULL if the model was built without an imputer.
|
|
81
|
+
* - indexer (in)
|
|
82
|
+
* Pointer to indexer object which has already been fit through 'fit_iforest' along with
|
|
83
|
+
* either 'model' or 'ext_model' in the same call to 'fit_iforest' or through another specialized function.
|
|
84
|
+
* Pass NULL if the model was built without an indexer.
|
|
85
|
+
* - model_new (out)
|
|
86
|
+
* Pointer to already-allocated isolation forest model, which will be reset and to
|
|
87
|
+
* which the selected trees from 'model' will be copied.
|
|
88
|
+
* Pass NULL if using the extended model.
|
|
89
|
+
* - ext_model_new (out)
|
|
90
|
+
* Pointer to already-allocated extended isolation forest model, which will be reset and to
|
|
91
|
+
* which the selected hyperplanes from 'ext_model' will be copied.
|
|
92
|
+
* Pass NULL if using the single-variable model.
|
|
93
|
+
* - imputer_new (out)
|
|
94
|
+
* Pointer to already-allocated imputation object, which will be reset and to
|
|
95
|
+
* which the selected nodes from 'imputer' (matching to those of either 'model'
|
|
96
|
+
* or 'ext_model') will be copied.
|
|
97
|
+
* Pass NULL if the model was built without an imputer.
|
|
98
|
+
* - indexer_new (out)
|
|
99
|
+
* Pointer to already-allocated indexer object, which will be reset and to
|
|
100
|
+
* which the selected nodes from 'indexer' (matching to those of either 'model'
|
|
101
|
+
* or 'ext_model') will be copied.
|
|
102
|
+
* Pass NULL if the model was built without an indexer.
|
|
103
|
+
*/
|
|
104
|
+
void subset_model(IsoForest* model, IsoForest* model_new,
|
|
105
|
+
ExtIsoForest* ext_model, ExtIsoForest* ext_model_new,
|
|
106
|
+
Imputer* imputer, Imputer* imputer_new,
|
|
107
|
+
TreesIndexer* indexer, TreesIndexer* indexer_new,
|
|
108
|
+
size_t *trees_take, size_t ntrees_take)
|
|
109
|
+
{
|
|
110
|
+
if (model != NULL)
|
|
111
|
+
{
|
|
112
|
+
if (model_new == NULL)
|
|
113
|
+
throw std::runtime_error("Must pass an already-allocated 'model_new'.\n");
|
|
114
|
+
if (imputer != NULL && model->trees.size() != imputer->imputer_tree.size())
|
|
115
|
+
throw std::runtime_error("Number of trees in imputer does not match with model.\n");
|
|
116
|
+
if (ext_model != NULL)
|
|
117
|
+
throw std::runtime_error("Should pass only one of 'model' or 'ext_model'.\n");
|
|
118
|
+
model_new->new_cat_action = model->new_cat_action;
|
|
119
|
+
model_new->cat_split_type = model->cat_split_type;
|
|
120
|
+
model_new->missing_action = model->missing_action;
|
|
121
|
+
model_new->exp_avg_depth = model->exp_avg_depth;
|
|
122
|
+
model_new->exp_avg_sep = model->exp_avg_sep;
|
|
123
|
+
model_new->orig_sample_size = model->orig_sample_size;
|
|
124
|
+
|
|
125
|
+
model_new->trees.resize(ntrees_take);
|
|
126
|
+
for (size_t ix = 0; ix < ntrees_take; ix++)
|
|
127
|
+
model_new->trees[ix] = model->trees[trees_take[ix]];
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
else if (ext_model != NULL)
|
|
131
|
+
{
|
|
132
|
+
if (ext_model_new == NULL)
|
|
133
|
+
throw std::runtime_error("Must pass an already-allocated 'ext_model_new'.");
|
|
134
|
+
if (imputer != NULL && ext_model->hplanes.size() != imputer->imputer_tree.size())
|
|
135
|
+
throw std::runtime_error("Number of trees in imputer does not match with model.\n");
|
|
136
|
+
if (model != NULL)
|
|
137
|
+
throw std::runtime_error("Should pass only one of 'model' or 'ext_model'.\n");
|
|
138
|
+
ext_model_new->new_cat_action = ext_model->new_cat_action;
|
|
139
|
+
ext_model_new->cat_split_type = ext_model->cat_split_type;
|
|
140
|
+
ext_model_new->missing_action = ext_model->missing_action;
|
|
141
|
+
ext_model_new->exp_avg_depth = ext_model->exp_avg_depth;
|
|
142
|
+
ext_model_new->exp_avg_sep = ext_model->exp_avg_sep;
|
|
143
|
+
ext_model_new->orig_sample_size = ext_model->orig_sample_size;
|
|
144
|
+
|
|
145
|
+
ext_model_new->hplanes.resize(ntrees_take);
|
|
146
|
+
for (size_t ix = 0; ix < ntrees_take; ix++)
|
|
147
|
+
ext_model_new->hplanes[ix] = ext_model->hplanes[trees_take[ix]];
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
if (imputer != NULL)
|
|
151
|
+
{
|
|
152
|
+
if (imputer_new == NULL)
|
|
153
|
+
throw std::runtime_error("Must pass an already-allocated 'imputer_new'.");
|
|
154
|
+
imputer_new->ncols_numeric = imputer->ncols_numeric;
|
|
155
|
+
imputer_new->ncols_categ = imputer->ncols_categ;
|
|
156
|
+
imputer_new->ncat = imputer->ncat;
|
|
157
|
+
imputer_new->col_means = imputer->col_means;
|
|
158
|
+
imputer_new->col_modes = imputer->col_modes;
|
|
159
|
+
|
|
160
|
+
imputer_new->imputer_tree.resize(ntrees_take);
|
|
161
|
+
for (size_t ix = 0; ix < ntrees_take; ix++)
|
|
162
|
+
imputer_new->imputer_tree[ix] = imputer->imputer_tree[trees_take[ix]];
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
if (indexer != NULL)
|
|
166
|
+
{
|
|
167
|
+
if (indexer_new == NULL)
|
|
168
|
+
throw std::runtime_error("Must pass an already-allocated 'indexer_new'.");
|
|
169
|
+
|
|
170
|
+
indexer_new->indices.resize(ntrees_take);
|
|
171
|
+
for (size_t ix = 0; ix < ntrees_take; ix++)
|
|
172
|
+
indexer_new->indices[ix] = indexer->indices[trees_take[ix]];
|
|
173
|
+
}
|
|
174
|
+
}
|