isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -18,11 +18,29 @@
18
18
  * [5] https://sourceforge.net/projects/iforest/
19
19
  * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
20
  * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
23
41
  *
24
42
  * BSD 2-Clause License
25
- * Copyright (c) 2020, David Cortes
43
+ * Copyright (c) 2019-2022, David Cortes
26
44
  * All rights reserved.
27
45
  * Redistribution and use in source and binary forms, with or without
28
46
  * modification, are permitted provided that the following conditions are met:
@@ -93,7 +111,26 @@ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest
93
111
  categ_levels,
94
112
  false, index1, false, 0,
95
113
  nthreads);
96
- std::string out = std::accumulate(tree_conds.begin(), tree_conds.end(), std::string("SELECT\nPOWER(2.0, -(0.0"),
114
+ bool is_density = (model_outputs != NULL && model_outputs->scoring_metric == Density) ||
115
+ (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == Density);
116
+ bool is_bdens = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) ||
117
+ (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity);
118
+ bool is_bdens2 = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) ||
119
+ (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity);
120
+ bool is_bratio = (model_outputs != NULL && model_outputs->scoring_metric == BoxedRatio) ||
121
+ (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedRatio);
122
+ is_density = is_density || is_bdens2;
123
+ std::string out = std::accumulate(tree_conds.begin(), tree_conds.end(),
124
+ is_density?
125
+ std::string("SELECT\n(-(0.0")
126
+ :
127
+ (is_bdens?
128
+ std::string("SELECT\n((0.0")
129
+ :
130
+ (is_bratio?
131
+ std::string("SELECT\n((0.0")
132
+ :
133
+ std::string("SELECT\nPOWER(2.0, -(0.0"))),
97
134
  [&tree_conds, &index1](std::string &a, std::string &b)
98
135
  {return a
99
136
  + std::string(" + \n---BEGIN TREE ")
@@ -104,11 +141,11 @@ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest
104
141
  + std::to_string((size_t)std::distance(tree_conds.data(), &b) + (size_t)index1)
105
142
  + std::string("---\n");});
106
143
  size_t ntrees = (model_outputs != NULL)? (model_outputs->trees.size()) : (model_outputs_ext->hplanes.size());
107
- return
144
+ return
108
145
  out
109
146
  + std::string(") / ")
110
- + std::to_string((long double)ntrees * ((model_outputs != NULL)?
111
- (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth)))
147
+ + std::to_string((double)ntrees * ((model_outputs != NULL)?
148
+ (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth)))
112
149
  + std::string(") AS ")
113
150
  + select_as
114
151
  + std::string("\nFROM ")
@@ -174,13 +211,13 @@ std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *mo
174
211
  size_t_for loop_end = ntrees_use;
175
212
  if (single_tree)
176
213
  {
177
- loop_st = tree_num;
214
+ loop_st = tree_num - index1;
178
215
  loop_end = loop_st + 1;
179
216
  }
180
217
 
181
218
  /* determine maximum number of nodes in a tree */
182
219
  size_t max_nodes = 0;
183
- for (size_t tree = loop_st; tree < loop_end; tree++)
220
+ for (size_t tree = loop_st; tree < (size_t)loop_end; tree++)
184
221
  max_nodes = std::max(max_nodes,
185
222
  (model_outputs != NULL)?
186
223
  (model_outputs->trees[tree].size()) : (model_outputs_ext->hplanes[tree].size()));
@@ -192,80 +229,111 @@ std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *mo
192
229
 
193
230
  size_t tree_use;
194
231
 
232
+ bool threw_exception = false;
233
+ std::exception_ptr ex = NULL;
234
+
195
235
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
196
236
  shared(model_outputs, model_outputs_ext, numeric_colnames, categ_colnames, categ_levels, \
197
- loop_st, loop_end, index1, single_tree, all_node_rules, out) \
237
+ loop_st, loop_end, index1, single_tree, all_node_rules, out, ex, threw_exception) \
198
238
  firstprivate(conditions_left, conditions_right) private(tree_use)
199
239
  for (size_t_for tree = loop_st; tree < loop_end; tree++)
200
240
  {
201
- if (model_outputs != NULL)
202
- {
203
- for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
204
- extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node],
205
- conditions_left[node], conditions_right[node],
206
- numeric_colnames, categ_colnames,
207
- categ_levels);
208
- }
209
-
210
- else
241
+ if (threw_exception) continue;
242
+
243
+ try
211
244
  {
212
- for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
213
- extract_cond_ext_isotree(*model_outputs_ext, model_outputs_ext->hplanes[tree][node],
245
+ if (model_outputs != NULL)
246
+ {
247
+ for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
248
+ extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node],
214
249
  conditions_left[node], conditions_right[node],
215
250
  numeric_colnames, categ_colnames,
216
251
  categ_levels);
217
- }
252
+ }
218
253
 
219
- generate_tree_rules(
220
- (model_outputs == NULL)? (NULL) : &(model_outputs->trees[tree]),
221
- (model_outputs_ext == NULL)? (NULL) : &(model_outputs_ext->hplanes[tree]),
222
- output_score,
223
- 0, index1, initial_str, all_node_rules[single_tree? 0 : tree],
224
- conditions_left, conditions_right
225
- );
254
+ else
255
+ {
256
+ for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
257
+ extract_cond_ext_isotree(*model_outputs_ext, model_outputs_ext->hplanes[tree][node],
258
+ conditions_left[node], conditions_right[node],
259
+ numeric_colnames, categ_colnames,
260
+ categ_levels);
261
+ }
226
262
 
227
- /* Code below doesn't compile with MSVC (stuck with an OMP standard that's >20 years old) */
228
- // if (single_tree)
229
- // tree = 0;
230
- tree_use = single_tree? (size_t)0 : tree;
263
+ generate_tree_rules(
264
+ (model_outputs == NULL)? (NULL) : &(model_outputs->trees[tree]),
265
+ (model_outputs_ext == NULL)? (NULL) : &(model_outputs_ext->hplanes[tree]),
266
+ output_score,
267
+ 0, index1, initial_str, all_node_rules[single_tree? 0 : tree],
268
+ conditions_left, conditions_right,
269
+ model_outputs, model_outputs_ext
270
+ );
231
271
 
232
- if (all_node_rules[tree_use].size() <= 1)
233
- {
234
- for (std::string &rule : all_node_rules[tree_use])
235
- rule = std::string("WHEN TRUE THEN ")
236
- + std::to_string((model_outputs != NULL)?
237
- (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth))
238
- + std::string(" ");
272
+ /* Code below doesn't compile with MSVC (stuck with an OMP standard that's >20 years old) */
273
+ // if (single_tree)
274
+ // tree = 0;
275
+ tree_use = single_tree? (size_t)0 : tree;
276
+
277
+ if (all_node_rules[tree_use].size() <= 1)
278
+ {
279
+ for (std::string &rule : all_node_rules[tree_use])
280
+ rule = std::string("WHEN TRUE THEN ")
281
+ + std::to_string((model_outputs != NULL)?
282
+ (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth))
283
+ + std::string(" ");
284
+ }
285
+
286
+ out[tree_use] = std::accumulate(all_node_rules[tree_use].begin(), all_node_rules[tree_use].end(),
287
+ std::string("CASE\n"),
288
+ [&all_node_rules, &tree_use, &index1](std::string &a, std::string &b)
289
+ {return a
290
+ + std::string("---begin terminal node ")
291
+ + std::to_string((size_t)std::distance(&(all_node_rules[tree_use][0]), &b) + (size_t)index1)
292
+ + std::string("---\n")
293
+ + b;})
294
+ + std::string("END\n");
295
+ all_node_rules[tree_use].clear();
239
296
  }
240
297
 
241
- out[tree_use] = std::accumulate(all_node_rules[tree_use].begin(), all_node_rules[tree_use].end(),
242
- std::string("CASE\n"),
243
- [&all_node_rules, &tree_use, &index1](std::string &a, std::string &b)
244
- {return a
245
- + std::string("---begin terminal node ")
246
- + std::to_string((size_t)std::distance(&(all_node_rules[tree_use][0]), &b) + (size_t)index1)
247
- + std::string("---\n")
248
- + b;})
249
- + std::string("END\n");
250
- all_node_rules[tree_use].clear();
298
+ catch (...)
299
+ {
300
+ #pragma omp critical
301
+ {
302
+ if (!threw_exception)
303
+ {
304
+ threw_exception = true;
305
+ ex = std::current_exception();
306
+ }
307
+ }
308
+ }
251
309
  }
252
310
 
311
+ if (threw_exception)
312
+ std::rethrow_exception(ex);
313
+
253
314
  return out;
254
315
  }
255
316
 
256
317
 
257
318
  void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
258
319
  size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
259
- std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right)
320
+ std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right,
321
+ const IsoForest *model_outputs, const ExtIsoForest *model_outputs_ext)
260
322
  {
261
- if ((trees != NULL && (*trees)[curr_ix].score >= 0) ||
262
- (hplanes != NULL && (*hplanes)[curr_ix].score >= 0))
323
+ // if ((trees != NULL && (*trees)[curr_ix].score >= 0) ||
324
+ // (hplanes != NULL && (*hplanes)[curr_ix].score >= 0))
325
+ if ((trees != NULL && (*trees)[curr_ix].tree_left == 0) ||
326
+ (hplanes != NULL && (*hplanes)[curr_ix].hplane_left == 0))
263
327
  {
264
328
  node_rules.push_back(prev_cond
265
329
  + std::string("\tTHEN ")
266
330
  + (output_score?
267
331
  (std::to_string((trees != NULL)?
268
- ((*trees)[curr_ix].score) : ((*hplanes)[curr_ix].score)))
332
+ ((model_outputs->scoring_metric != Density && model_outputs->scoring_metric != BoxedRatio)?
333
+ (*trees)[curr_ix].score : (-(*trees)[curr_ix].score))
334
+ :
335
+ ((model_outputs_ext->scoring_metric != Density && model_outputs_ext->scoring_metric != BoxedRatio)?
336
+ (*hplanes)[curr_ix].score : (-(*hplanes)[curr_ix].score))))
269
337
  :
270
338
  (std::to_string(node_rules.size() + (size_t)index1)))
271
339
  + std::string("\n---end of terminal node ")
@@ -283,7 +351,7 @@ void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hp
283
351
  (trees != NULL)?
284
352
  ((*trees)[curr_ix].tree_left) : ((*hplanes)[curr_ix].hplane_left),
285
353
  index1, cond_left, node_rules,
286
- conditions_left, conditions_right);
354
+ conditions_left, conditions_right, model_outputs, model_outputs_ext);
287
355
  cond_left.clear();
288
356
  std::string cond_right = prev_cond
289
357
  + ((curr_ix > 0)? std::string("\t\tAND (") : std::string("\t\t ("))
@@ -293,7 +361,7 @@ void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hp
293
361
  (trees != NULL)?
294
362
  ((*trees)[curr_ix].tree_right) : ((*hplanes)[curr_ix].hplane_right),
295
363
  index1, cond_right, node_rules,
296
- conditions_left, conditions_right);
364
+ conditions_left, conditions_right, model_outputs, model_outputs_ext);
297
365
  }
298
366
 
299
367
 
@@ -304,7 +372,8 @@ void extract_cond_isotree(IsoForest &model, IsoTree &tree,
304
372
  {
305
373
  cond_left = std::string("");
306
374
  cond_right = std::string("");
307
- if (tree.score >= 0.)
375
+ // if (tree.score >= 0.)
376
+ if (tree.tree_left == 0)
308
377
  return;
309
378
 
310
379
  switch(tree.col_type)
@@ -457,6 +526,12 @@ void extract_cond_isotree(IsoForest &model, IsoTree &tree,
457
526
  }
458
527
  break;
459
528
  }
529
+
530
+ default:
531
+ {
532
+ unexpected_error();
533
+ break;
534
+ }
460
535
  }
461
536
  }
462
537
 
@@ -467,7 +542,8 @@ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
467
542
  {
468
543
  cond_left = std::string("");
469
544
  cond_right = std::string("");
470
- if (hplane.score >= 0.)
545
+ // if (hplane.score >= 0.)
546
+ if (hplane.hplane_left == 0)
471
547
  return;
472
548
 
473
549
  std::string hplane_conds = std::string("");
@@ -535,6 +611,12 @@ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
535
611
  n_visited_categ++;
536
612
  break;
537
613
  }
614
+
615
+ default:
616
+ {
617
+ unexpected_error();
618
+ break;
619
+ }
538
620
  }
539
621
  hplane_conds += ((model.missing_action == Impute)?
540
622
  (std::string(", ") + std::to_string(hplane.fill_val[ix]) + std::string(")")) : (std::string("")));
@@ -0,0 +1,174 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+ #include "isotree.hpp"
64
+
65
+ /* Create a model containing a sub-set of the trees from another model
66
+ *
67
+ * Parameters
68
+ * ==========
69
+ * - model (in)
70
+ * Pointer to isolation forest model which has already been fit through 'fit_iforest',
71
+ * from which the desired trees will be copied into a new model object.
72
+ * Pass NULL if using the extended model.
73
+ * - ext_model (in)
74
+ * Pointer to extended isolation forest model which has already been fit through 'fit_iforest',
75
+ * from which the desired trees will be copied into a new model object.
76
+ * Pass NULL if using the single-variable model.
77
+ * - imputer (in)
78
+ * Pointer to imputation object which has already been fit through 'fit_iforest' along with
79
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest'.
80
+ * Pass NULL if the model was built without an imputer.
81
+ * - indexer (in)
82
+ * Pointer to indexer object which has already been fit through 'fit_iforest' along with
83
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest' or through another specialized function.
84
+ * Pass NULL if the model was built without an indexer.
85
+ * - model_new (out)
86
+ * Pointer to already-allocated isolation forest model, which will be reset and to
87
+ * which the selected trees from 'model' will be copied.
88
+ * Pass NULL if using the extended model.
89
+ * - ext_model_new (out)
90
+ * Pointer to already-allocated extended isolation forest model, which will be reset and to
91
+ * which the selected hyperplanes from 'ext_model' will be copied.
92
+ * Pass NULL if using the single-variable model.
93
+ * - imputer_new (out)
94
+ * Pointer to already-allocated imputation object, which will be reset and to
95
+ * which the selected nodes from 'imputer' (matching to those of either 'model'
96
+ * or 'ext_model') will be copied.
97
+ * Pass NULL if the model was built without an imputer.
98
+ * - indexer_new (out)
99
+ * Pointer to already-allocated indexer object, which will be reset and to
100
+ * which the selected nodes from 'indexer' (matching to those of either 'model'
101
+ * or 'ext_model') will be copied.
102
+ * Pass NULL if the model was built without an indexer.
103
+ */
104
+ void subset_model(IsoForest* model, IsoForest* model_new,
105
+ ExtIsoForest* ext_model, ExtIsoForest* ext_model_new,
106
+ Imputer* imputer, Imputer* imputer_new,
107
+ TreesIndexer* indexer, TreesIndexer* indexer_new,
108
+ size_t *trees_take, size_t ntrees_take)
109
+ {
110
+ if (model != NULL)
111
+ {
112
+ if (model_new == NULL)
113
+ throw std::runtime_error("Must pass an already-allocated 'model_new'.\n");
114
+ if (imputer != NULL && model->trees.size() != imputer->imputer_tree.size())
115
+ throw std::runtime_error("Number of trees in imputer does not match with model.\n");
116
+ if (ext_model != NULL)
117
+ throw std::runtime_error("Should pass only one of 'model' or 'ext_model'.\n");
118
+ model_new->new_cat_action = model->new_cat_action;
119
+ model_new->cat_split_type = model->cat_split_type;
120
+ model_new->missing_action = model->missing_action;
121
+ model_new->exp_avg_depth = model->exp_avg_depth;
122
+ model_new->exp_avg_sep = model->exp_avg_sep;
123
+ model_new->orig_sample_size = model->orig_sample_size;
124
+
125
+ model_new->trees.resize(ntrees_take);
126
+ for (size_t ix = 0; ix < ntrees_take; ix++)
127
+ model_new->trees[ix] = model->trees[trees_take[ix]];
128
+ }
129
+
130
+ else if (ext_model != NULL)
131
+ {
132
+ if (ext_model_new == NULL)
133
+ throw std::runtime_error("Must pass an already-allocated 'ext_model_new'.");
134
+ if (imputer != NULL && ext_model->hplanes.size() != imputer->imputer_tree.size())
135
+ throw std::runtime_error("Number of trees in imputer does not match with model.\n");
136
+ if (model != NULL)
137
+ throw std::runtime_error("Should pass only one of 'model' or 'ext_model'.\n");
138
+ ext_model_new->new_cat_action = ext_model->new_cat_action;
139
+ ext_model_new->cat_split_type = ext_model->cat_split_type;
140
+ ext_model_new->missing_action = ext_model->missing_action;
141
+ ext_model_new->exp_avg_depth = ext_model->exp_avg_depth;
142
+ ext_model_new->exp_avg_sep = ext_model->exp_avg_sep;
143
+ ext_model_new->orig_sample_size = ext_model->orig_sample_size;
144
+
145
+ ext_model_new->hplanes.resize(ntrees_take);
146
+ for (size_t ix = 0; ix < ntrees_take; ix++)
147
+ ext_model_new->hplanes[ix] = ext_model->hplanes[trees_take[ix]];
148
+ }
149
+
150
+ if (imputer != NULL)
151
+ {
152
+ if (imputer_new == NULL)
153
+ throw std::runtime_error("Must pass an already-allocated 'imputer_new'.");
154
+ imputer_new->ncols_numeric = imputer->ncols_numeric;
155
+ imputer_new->ncols_categ = imputer->ncols_categ;
156
+ imputer_new->ncat = imputer->ncat;
157
+ imputer_new->col_means = imputer->col_means;
158
+ imputer_new->col_modes = imputer->col_modes;
159
+
160
+ imputer_new->imputer_tree.resize(ntrees_take);
161
+ for (size_t ix = 0; ix < ntrees_take; ix++)
162
+ imputer_new->imputer_tree[ix] = imputer->imputer_tree[trees_take[ix]];
163
+ }
164
+
165
+ if (indexer != NULL)
166
+ {
167
+ if (indexer_new == NULL)
168
+ throw std::runtime_error("Must pass an already-allocated 'indexer_new'.");
169
+
170
+ indexer_new->indices.resize(ntrees_take);
171
+ for (size_t ix = 0; ix < ntrees_take; ix++)
172
+ indexer_new->indices[ix] = indexer->indices[trees_take[ix]];
173
+ }
174
+ }