isotree 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (118) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/LICENSE.txt +2 -2
  4. data/README.md +22 -1
  5. data/ext/isotree/ext.cpp +26 -0
  6. data/ext/isotree/extconf.rb +3 -3
  7. data/lib/isotree.rb +1 -0
  8. data/lib/isotree/isolation_forest.rb +86 -1
  9. data/lib/isotree/version.rb +1 -1
  10. data/vendor/cereal/LICENSE +24 -0
  11. data/vendor/cereal/README.md +85 -0
  12. data/vendor/cereal/include/cereal/access.hpp +351 -0
  13. data/vendor/cereal/include/cereal/archives/adapters.hpp +163 -0
  14. data/vendor/cereal/include/cereal/archives/binary.hpp +169 -0
  15. data/vendor/cereal/include/cereal/archives/json.hpp +1019 -0
  16. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +334 -0
  17. data/vendor/cereal/include/cereal/archives/xml.hpp +956 -0
  18. data/vendor/cereal/include/cereal/cereal.hpp +1089 -0
  19. data/vendor/cereal/include/cereal/details/helpers.hpp +422 -0
  20. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +796 -0
  21. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +65 -0
  22. data/vendor/cereal/include/cereal/details/static_object.hpp +127 -0
  23. data/vendor/cereal/include/cereal/details/traits.hpp +1411 -0
  24. data/vendor/cereal/include/cereal/details/util.hpp +84 -0
  25. data/vendor/cereal/include/cereal/external/base64.hpp +134 -0
  26. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +284 -0
  27. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +78 -0
  28. data/vendor/cereal/include/cereal/external/rapidjson/document.h +2652 -0
  29. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +299 -0
  30. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +716 -0
  31. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +74 -0
  32. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +161 -0
  33. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +99 -0
  34. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +104 -0
  35. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +151 -0
  36. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +290 -0
  37. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +271 -0
  38. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +245 -0
  39. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +78 -0
  40. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +308 -0
  41. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +186 -0
  42. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +55 -0
  43. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +740 -0
  44. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +232 -0
  45. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +69 -0
  46. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +290 -0
  47. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +46 -0
  48. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +128 -0
  49. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +70 -0
  50. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +71 -0
  51. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +316 -0
  52. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +300 -0
  53. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +81 -0
  54. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +1414 -0
  55. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +277 -0
  56. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +656 -0
  57. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +2230 -0
  58. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +2497 -0
  59. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +223 -0
  60. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +121 -0
  61. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +709 -0
  62. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +52 -0
  63. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +406 -0
  64. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +2624 -0
  65. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +175 -0
  66. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +428 -0
  67. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +123 -0
  68. data/vendor/cereal/include/cereal/macros.hpp +154 -0
  69. data/vendor/cereal/include/cereal/specialize.hpp +139 -0
  70. data/vendor/cereal/include/cereal/types/array.hpp +79 -0
  71. data/vendor/cereal/include/cereal/types/atomic.hpp +55 -0
  72. data/vendor/cereal/include/cereal/types/base_class.hpp +203 -0
  73. data/vendor/cereal/include/cereal/types/bitset.hpp +176 -0
  74. data/vendor/cereal/include/cereal/types/boost_variant.hpp +164 -0
  75. data/vendor/cereal/include/cereal/types/chrono.hpp +72 -0
  76. data/vendor/cereal/include/cereal/types/common.hpp +129 -0
  77. data/vendor/cereal/include/cereal/types/complex.hpp +56 -0
  78. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +73 -0
  79. data/vendor/cereal/include/cereal/types/deque.hpp +62 -0
  80. data/vendor/cereal/include/cereal/types/forward_list.hpp +68 -0
  81. data/vendor/cereal/include/cereal/types/functional.hpp +43 -0
  82. data/vendor/cereal/include/cereal/types/list.hpp +62 -0
  83. data/vendor/cereal/include/cereal/types/map.hpp +36 -0
  84. data/vendor/cereal/include/cereal/types/memory.hpp +425 -0
  85. data/vendor/cereal/include/cereal/types/optional.hpp +66 -0
  86. data/vendor/cereal/include/cereal/types/polymorphic.hpp +483 -0
  87. data/vendor/cereal/include/cereal/types/queue.hpp +132 -0
  88. data/vendor/cereal/include/cereal/types/set.hpp +103 -0
  89. data/vendor/cereal/include/cereal/types/stack.hpp +76 -0
  90. data/vendor/cereal/include/cereal/types/string.hpp +61 -0
  91. data/vendor/cereal/include/cereal/types/tuple.hpp +123 -0
  92. data/vendor/cereal/include/cereal/types/unordered_map.hpp +36 -0
  93. data/vendor/cereal/include/cereal/types/unordered_set.hpp +99 -0
  94. data/vendor/cereal/include/cereal/types/utility.hpp +47 -0
  95. data/vendor/cereal/include/cereal/types/valarray.hpp +89 -0
  96. data/vendor/cereal/include/cereal/types/variant.hpp +109 -0
  97. data/vendor/cereal/include/cereal/types/vector.hpp +112 -0
  98. data/vendor/cereal/include/cereal/version.hpp +52 -0
  99. data/vendor/isotree/LICENSE +1 -1
  100. data/vendor/isotree/README.md +2 -1
  101. data/vendor/isotree/src/RcppExports.cpp +44 -4
  102. data/vendor/isotree/src/Rwrapper.cpp +141 -51
  103. data/vendor/isotree/src/crit.cpp +1 -1
  104. data/vendor/isotree/src/dealloc.cpp +1 -1
  105. data/vendor/isotree/src/dist.cpp +6 -6
  106. data/vendor/isotree/src/extended.cpp +5 -5
  107. data/vendor/isotree/src/fit_model.cpp +30 -19
  108. data/vendor/isotree/src/helpers_iforest.cpp +26 -11
  109. data/vendor/isotree/src/impute.cpp +7 -7
  110. data/vendor/isotree/src/isoforest.cpp +7 -7
  111. data/vendor/isotree/src/isotree.hpp +27 -5
  112. data/vendor/isotree/src/merge_models.cpp +1 -1
  113. data/vendor/isotree/src/mult.cpp +1 -1
  114. data/vendor/isotree/src/predict.cpp +20 -16
  115. data/vendor/isotree/src/serialize.cpp +1 -1
  116. data/vendor/isotree/src/sql.cpp +545 -0
  117. data/vendor/isotree/src/utils.cpp +36 -44
  118. metadata +98 -92
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -137,7 +137,7 @@ void predict_iforest(double numeric_data[], int categ_data[],
137
137
  if (
138
138
  model_outputs->missing_action == Fail &&
139
139
  (model_outputs->new_cat_action != Weighted || prediction_data.categ_data == NULL) &&
140
- prediction_data.Xc == NULL && prediction_data.Xr == NULL
140
+ prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL
141
141
  )
142
142
  {
143
143
  #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs, prediction_data, output_depths, tree_num)
@@ -180,8 +180,8 @@ void predict_iforest(double numeric_data[], int categ_data[],
180
180
  if (
181
181
  model_outputs_ext->missing_action == Fail &&
182
182
  prediction_data.categ_data == NULL &&
183
- prediction_data.Xc == NULL &&
184
- prediction_data.Xr == NULL
183
+ prediction_data.Xc_indptr == NULL &&
184
+ prediction_data.Xr_indptr == NULL
185
185
  )
186
186
  {
187
187
  #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num)
@@ -252,6 +252,8 @@ void predict_iforest(double numeric_data[], int categ_data[],
252
252
  }
253
253
 
254
254
 
255
+ /* TODO: these functions would be faster if done with row-major order,
256
+ should at least give the option of taking arrays as row-major. */
255
257
  void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
256
258
  IsoForest &model_outputs,
257
259
  PredictionData &prediction_data,
@@ -381,7 +383,7 @@ double traverse_itree(std::vector<IsoTree> &tree,
381
383
  double range_penalty = 0;
382
384
 
383
385
  sparse_ix *row_st = NULL, *row_end = NULL;
384
- if (prediction_data.Xr != NULL)
386
+ if (prediction_data.Xr_indptr != NULL)
385
387
  {
386
388
  row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
387
389
  row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
@@ -406,12 +408,12 @@ double traverse_itree(std::vector<IsoTree> &tree,
406
408
  case Numeric:
407
409
  {
408
410
 
409
- if (prediction_data.Xc == NULL && prediction_data.Xr == NULL)
411
+ if (prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL)
410
412
  xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
411
- else if (row_st != NULL)
412
- xval = extract_spR(prediction_data, row_st, row_end, tree[curr_lev].col_num);
413
- else
413
+ else if (prediction_data.Xc_indptr != NULL)
414
414
  xval = extract_spC(prediction_data, row, tree[curr_lev].col_num);
415
+ else
416
+ xval = extract_spR(prediction_data, row_st, row_end, tree[curr_lev].col_num);
415
417
 
416
418
  if (isnan(xval))
417
419
  {
@@ -682,7 +684,7 @@ void traverse_hplane(std::vector<IsoHPlane> &hplane,
682
684
  size_t ncols_numeric, ncols_categ;
683
685
 
684
686
  sparse_ix *row_st = NULL, *row_end = NULL;
685
- if (prediction_data.Xr != NULL)
687
+ if (prediction_data.Xr_indptr != NULL)
686
688
  {
687
689
  row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
688
690
  row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
@@ -712,12 +714,12 @@ void traverse_hplane(std::vector<IsoHPlane> &hplane,
712
714
  {
713
715
  case Numeric:
714
716
  {
715
- if (prediction_data.Xc == NULL && prediction_data.Xr == NULL)
717
+ if (prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL)
716
718
  xval = prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
717
- else if (row_st != NULL)
718
- xval = extract_spR(prediction_data, row_st, row_end, hplane[curr_lev].col_num[col]);
719
- else
719
+ else if (prediction_data.Xc_indptr != NULL)
720
720
  xval = extract_spC(prediction_data, row, hplane[curr_lev].col_num[col]);
721
+ else
722
+ xval = extract_spR(prediction_data, row_st, row_end, hplane[curr_lev].col_num[col]);
721
723
 
722
724
  if (is_na_or_inf(xval))
723
725
  {
@@ -805,16 +807,18 @@ double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num)
805
807
  ||
806
808
  *search_res != row
807
809
  )
808
- return 0;
810
+ return 0.;
809
811
  else
810
812
  return prediction_data.Xc[search_res - prediction_data.Xc_ind];
811
813
  }
812
814
 
813
815
  double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num)
814
816
  {
817
+ if (row_end == row_st)
818
+ return 0.;
815
819
  sparse_ix *search_res = std::lower_bound(row_st, row_end, (sparse_ix) col_num);
816
820
  if (search_res == row_end || *search_res != (sparse_ix)col_num)
817
- return 0;
821
+ return 0.;
818
822
  else
819
823
  return prediction_data.Xr[search_res - prediction_data.Xr_ind];
820
824
  }
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -0,0 +1,545 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2020, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+ /* Translate isolation forest model into a single SQL select statement
48
+ *
49
+ * Parameters
50
+ * ==========
51
+ * - model_outputs
52
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
53
+ * if the SQL statement is to be generated from an extended model. Can only pass one of
54
+ * 'model_outputs' and 'model_outputs_ext'.
55
+ * - model_outputs_ext
56
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
57
+ * if the SQL statement is to be generated from a single-variable model. Can only pass one of
58
+ * 'model_outputs' and 'model_outputs_ext'.
59
+ * - table_from
60
+ * Table name from where the columns used in the model will be selected.
61
+ * - select_as
62
+ * Alias to give to the outlier score in the select statement.
63
+ * - numeric_colnames
64
+ * Names to use for the numerical columns.
65
+ * - categ_colnames
66
+ * Names to use for the categorical columns.
67
+ * - categ_levels
68
+ * Names to use for the levels/categories of each categorical column. These will be enclosed
69
+ * in single quotes.
70
+ * - index1
71
+ * Whether to make the node numbers start their numeration at 1 instead of 0 in the
72
+ * resulting statement. If passing 'output_tree_num=false', this will only affect the
73
+ * commented lines which act as delimiters. If passing 'output_tree_num=true', will also
74
+ * affect the results (which will also start at 1).
75
+ * - nthreads
76
+ * Number of parallel threads to use. Note that the more threads are requested, the more memory will be
77
+ * allocated, even if the thread does not end up being used. Ignored when not building with
78
+ * OpenMP support.
79
+ *
80
+ * Returns
81
+ * =======
82
+ * A string with the corresponding SQL statement that will calculate the outlier score
83
+ * from the model.
84
+ */
85
+ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
86
+ std::string &table_from, std::string &select_as,
87
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
88
+ std::vector<std::vector<std::string>> &categ_levels,
89
+ bool index1, int nthreads)
90
+ {
91
+ std::vector<std::string> tree_conds = generate_sql(model_outputs, model_outputs_ext,
92
+ numeric_colnames, categ_colnames,
93
+ categ_levels,
94
+ false, index1, false, 0,
95
+ nthreads);
96
+ std::string out = std::accumulate(tree_conds.begin(), tree_conds.end(), std::string("SELECT\nPOWER(2.0, -(0.0"),
97
+ [&tree_conds, &index1](std::string &a, std::string &b)
98
+ {return a
99
+ + std::string(" + \n---BEGIN TREE ")
100
+ + std::to_string((size_t)std::distance(tree_conds.data(), &b) + (size_t)index1)
101
+ + std::string("---\n")
102
+ + b
103
+ + std::string("\n---END OF TREE ")
104
+ + std::to_string((size_t)std::distance(tree_conds.data(), &b) + (size_t)index1)
105
+ + std::string("---\n");});
106
+ size_t ntrees = (model_outputs != NULL)? (model_outputs->trees.size()) : (model_outputs_ext->hplanes.size());
107
+ return
108
+ out
109
+ + std::string(") / ")
110
+ + std::to_string((long double)ntrees * ((model_outputs != NULL)?
111
+ (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth)))
112
+ + std::string(") AS ")
113
+ + select_as
114
+ + std::string("\nFROM ")
115
+ + table_from;
116
+ }
117
+
118
+ /* Translate model trees into SQL select statements
119
+ *
120
+ * Parameters
121
+ * ==========
122
+ * - model_outputs
123
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
124
+ * if the SQL statements are to be generated from an extended model. Can only pass one of
125
+ * 'model_outputs' and 'model_outputs_ext'.
126
+ * - model_outputs_ext
127
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
128
+ * if the SQL statements are to be generated from a single-variable model. Can only pass one of
129
+ * 'model_outputs' and 'model_outputs_ext'.
130
+ * - numeric_colnames
131
+ * Names to use for the numerical columns.
132
+ * - categ_colnames
133
+ * Names to use for the categorical columns.
134
+ * - categ_levels
135
+ * Names to use for the levels/categories of each categorical column. These will be enclosed
136
+ * in single quotes.
137
+ * - output_tree_num
138
+ * Whether to output the terminal node number instead of the separation depth at each node.
139
+ * - index1
140
+ * Whether to make the node numbers start their numeration at 1 instead of 0 in the
141
+ * resulting statement. If passing 'output_tree_num=false', this will only affect the
142
+ * commented lines which act as delimiters. If passing 'output_tree_num=true', will also
143
+ * affect the results (which will also start at 1).
144
+ * - single_tree
145
+ * Whether to generate the select statement for a single tree of the model instead of for
146
+ * all. The tree number to generate is to be passed under 'tree_num'.
147
+ * - tree_num
148
+ * Tree number for which to generate an SQL select statement, if passing 'single_tree=true'.
149
+ * - nthreads
150
+ * Number of parallel threads to use. Note that the more threads are requested, the more memory will be
151
+ * allocated, even if the thread does not end up being used. Ignored when not building with
152
+ * OpenMP support.
153
+ *
154
+ * Returns
155
+ * =======
156
+ * A vector containing at each element the SQL statement for the corresponding tree in the model.
157
+ * If passing 'single_tree=true', will contain only one element, corresponding to the tree given
158
+ * in 'tree_num'. The statements will be node-by-node, with commented-out separators using '---'
159
+ * as delimiters and including the node number as part of the comment.
160
+ */
161
+ std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
162
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
163
+ std::vector<std::vector<std::string>> &categ_levels,
164
+ bool output_tree_num, bool index1, bool single_tree, size_t tree_num,
165
+ int nthreads)
166
+ {
167
+ bool output_score = !output_tree_num;
168
+ size_t ntrees_use = single_tree?
169
+ 1 : ((model_outputs != NULL)?
170
+ model_outputs->trees.size() : model_outputs_ext->hplanes.size());
171
+ std::string initial_str = std::string("\tWHEN\n");
172
+
173
+ size_t_for loop_st = 0;
174
+ size_t_for loop_end = ntrees_use;
175
+ if (single_tree)
176
+ {
177
+ loop_st = tree_num;
178
+ loop_end = loop_st + 1;
179
+ }
180
+
181
+ /* determine maximum number of nodes in a tree */
182
+ size_t max_nodes = 0;
183
+ for (size_t tree = loop_st; tree < loop_end; tree++)
184
+ max_nodes = std::max(max_nodes,
185
+ (model_outputs != NULL)?
186
+ (model_outputs->trees[tree].size()) : (model_outputs_ext->hplanes[tree].size()));
187
+ std::vector<std::string> conditions_left(max_nodes);
188
+ std::vector<std::string> conditions_right(max_nodes);
189
+
190
+ std::vector<std::vector<std::string>> all_node_rules(ntrees_use);
191
+ std::vector<std::string> out(ntrees_use);
192
+
193
+ size_t tree_use;
194
+
195
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
196
+ shared(model_outputs, model_outputs_ext, numeric_colnames, categ_colnames, categ_levels, \
197
+ loop_st, loop_end, index1, single_tree, all_node_rules, out) \
198
+ firstprivate(conditions_left, conditions_right) private(tree_use)
199
+ for (size_t_for tree = loop_st; tree < loop_end; tree++)
200
+ {
201
+ if (model_outputs != NULL)
202
+ {
203
+ for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
204
+ extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node],
205
+ conditions_left[node], conditions_right[node],
206
+ numeric_colnames, categ_colnames,
207
+ categ_levels);
208
+ }
209
+
210
+ else
211
+ {
212
+ for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
213
+ extract_cond_ext_isotree(*model_outputs_ext, model_outputs_ext->hplanes[tree][node],
214
+ conditions_left[node], conditions_right[node],
215
+ numeric_colnames, categ_colnames,
216
+ categ_levels);
217
+ }
218
+
219
+ generate_tree_rules(
220
+ (model_outputs == NULL)? (NULL) : &(model_outputs->trees[tree]),
221
+ (model_outputs_ext == NULL)? (NULL) : &(model_outputs_ext->hplanes[tree]),
222
+ output_score,
223
+ 0, index1, initial_str, all_node_rules[single_tree? 0 : tree],
224
+ conditions_left, conditions_right
225
+ );
226
+
227
+ /* Code below doesn't compile with MSVC (stuck with an OMP standard that's >20 years old) */
228
+ // if (single_tree)
229
+ // tree = 0;
230
+ tree_use = single_tree? (size_t)0 : tree;
231
+
232
+ if (all_node_rules[tree_use].size() <= 1)
233
+ {
234
+ for (std::string &rule : all_node_rules[tree_use])
235
+ rule = std::string("WHEN TRUE THEN ")
236
+ + std::to_string((model_outputs != NULL)?
237
+ (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth))
238
+ + std::string(" ");
239
+ }
240
+
241
+ out[tree_use] = std::accumulate(all_node_rules[tree_use].begin(), all_node_rules[tree_use].end(),
242
+ std::string("CASE\n"),
243
+ [&all_node_rules, &tree_use, &index1](std::string &a, std::string &b)
244
+ {return a
245
+ + std::string("---begin terminal node ")
246
+ + std::to_string((size_t)std::distance(&(all_node_rules[tree_use][0]), &b) + (size_t)index1)
247
+ + std::string("---\n")
248
+ + b;})
249
+ + std::string("END\n");
250
+ all_node_rules[tree_use].clear();
251
+ }
252
+
253
+ return out;
254
+ }
255
+
256
+
257
+ void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
258
+ size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
259
+ std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right)
260
+ {
261
+ if ((trees != NULL && (*trees)[curr_ix].score >= 0) ||
262
+ (hplanes != NULL && (*hplanes)[curr_ix].score >= 0))
263
+ {
264
+ node_rules.push_back(prev_cond
265
+ + std::string("\tTHEN ")
266
+ + (output_score?
267
+ (std::to_string((trees != NULL)?
268
+ ((*trees)[curr_ix].score) : ((*hplanes)[curr_ix].score)))
269
+ :
270
+ (std::to_string(node_rules.size() + (size_t)index1)))
271
+ + std::string("\n---end of terminal node ")
272
+ + std::to_string(node_rules.size() + (size_t)index1)
273
+ + std::string("---\n"));
274
+ return;
275
+ }
276
+
277
+
278
+ std::string cond_left = prev_cond
279
+ + ((curr_ix > 0)? std::string("\t\tAND (") : std::string("\t\t ("))
280
+ + conditions_left[curr_ix]
281
+ + std::string(")\n");
282
+ generate_tree_rules(trees, hplanes, output_score,
283
+ (trees != NULL)?
284
+ ((*trees)[curr_ix].tree_left) : ((*hplanes)[curr_ix].hplane_left),
285
+ index1, cond_left, node_rules,
286
+ conditions_left, conditions_right);
287
+ cond_left.clear();
288
+ std::string cond_right = prev_cond
289
+ + ((curr_ix > 0)? std::string("\t\tAND (") : std::string("\t\t ("))
290
+ + conditions_right[curr_ix]
291
+ + std::string(")\n");
292
+ generate_tree_rules(trees, hplanes, output_score,
293
+ (trees != NULL)?
294
+ ((*trees)[curr_ix].tree_right) : ((*hplanes)[curr_ix].hplane_right),
295
+ index1, cond_right, node_rules,
296
+ conditions_left, conditions_right);
297
+ }
298
+
299
+
300
+ void extract_cond_isotree(IsoForest &model, IsoTree &tree,
301
+ std::string &cond_left, std::string &cond_right,
302
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
303
+ std::vector<std::vector<std::string>> &categ_levels)
304
+ {
305
+ cond_left = std::string("");
306
+ cond_right = std::string("");
307
+ if (tree.score >= 0.)
308
+ return;
309
+
310
+ switch(tree.col_type)
311
+ {
312
+ case Numeric:
313
+ {
314
+ cond_left = ((model.missing_action != Impute)? (std::string("")) :
315
+ ((tree.pct_tree_left >= .5)?
316
+ (numeric_colnames[tree.col_num]
317
+ + std::string(" IS NULL OR "))
318
+ :
319
+ (numeric_colnames[tree.col_num]
320
+ + std::string(" IS NOT NULL AND "))))
321
+ + numeric_colnames[tree.col_num]
322
+ + std::string(" <= ")
323
+ + std::to_string(tree.num_split);
324
+ cond_right = ((model.missing_action != Impute)? (std::string("")) :
325
+ ((tree.pct_tree_left >= .5)?
326
+ (numeric_colnames[tree.col_num]
327
+ + std::string(" IS NOT NULL AND "))
328
+ :
329
+ (numeric_colnames[tree.col_num]
330
+ + std::string(" IS NULL OR "))))
331
+ + numeric_colnames[tree.col_num]
332
+ + std::string(" > ")
333
+ + std::to_string(tree.num_split);
334
+ break;
335
+ }
336
+
337
+ case Categorical:
338
+ {
339
+ switch(model.cat_split_type)
340
+ {
341
+ case SingleCateg:
342
+ {
343
+ cond_left = ((model.missing_action != Impute)? (std::string("")) :
344
+ ((model.missing_action == Impute && tree.pct_tree_left >= .5)?
345
+ (categ_colnames[tree.col_num]
346
+ + std::string(" IS NULL OR "))
347
+ :
348
+ (categ_colnames[tree.col_num]
349
+ + std::string(" IS NOT NULL AND "))))
350
+ + categ_colnames[tree.col_num]
351
+ + std::string(" = '")
352
+ + categ_levels[tree.col_num][tree.chosen_cat]
353
+ + std::string("'");
354
+ cond_right = ((model.missing_action != Impute)? (std::string("")) :
355
+ ((model.missing_action == Impute && tree.pct_tree_left >= .5)?
356
+ (categ_colnames[tree.col_num]
357
+ + std::string(" IS NOT NULL AND "))
358
+ :
359
+ (categ_colnames[tree.col_num]
360
+ + std::string(" IS NULL OR "))))
361
+ + categ_colnames[tree.col_num]
362
+ + std::string(" != '")
363
+ + categ_levels[tree.col_num][tree.chosen_cat]
364
+ + std::string("'");
365
+ break;
366
+ }
367
+
368
+ case SubSet:
369
+ {
370
+ cond_left = categ_colnames[tree.col_num] + std::string(" IN (");
371
+ cond_right = cond_left;
372
+ if (model.missing_action == Impute)
373
+ {
374
+ if (tree.pct_tree_left >= .5)
375
+ {
376
+ cond_left = categ_colnames[tree.col_num] + std::string(" IS NULL OR ") + cond_left;
377
+ cond_right = categ_colnames[tree.col_num] + std::string(" IS NOT NULL AND ") + cond_right;
378
+ }
379
+
380
+ else
381
+ {
382
+ cond_left = categ_colnames[tree.col_num] + std::string(" IS NOT NULL AND ") + cond_left;
383
+ cond_right = categ_colnames[tree.col_num] + std::string(" IS NULL OR ") + cond_right;
384
+ }
385
+ }
386
+ bool added_left = false;
387
+ bool added_right = false;
388
+ for (size_t categ = 0; categ < tree.cat_split.size(); categ++)
389
+ {
390
+ switch(tree.cat_split[categ])
391
+ {
392
+ case 1:
393
+ {
394
+ cond_left
395
+ +=
396
+ ((added_left)? (std::string(", ")) : (std::string("")))
397
+ + std::string("'")
398
+ + categ_levels[tree.col_num][categ]
399
+ + std::string("'");
400
+ added_left = true;
401
+ break;
402
+ }
403
+
404
+ case 0:
405
+ {
406
+ cond_right
407
+ +=
408
+ ((added_right)? (std::string(", ")) : (std::string("")))
409
+ + std::string("'")
410
+ + categ_levels[tree.col_num][categ]
411
+ + std::string("'");
412
+ added_right = true;
413
+ break;
414
+ }
415
+
416
+ case -1:
417
+ {
418
+ if (model.new_cat_action == Smallest || model.missing_action == Impute)
419
+ {
420
+ if ((model.new_cat_action == Smallest && tree.pct_tree_left < .5) ||
421
+ (model.missing_action == Impute && tree.pct_tree_left >= .5))
422
+ {
423
+ cond_left
424
+ +=
425
+ ((added_left)? (std::string(", ")) : (std::string("")))
426
+ + std::string("'")
427
+ + categ_levels[tree.col_num][categ]
428
+ + std::string("'");
429
+ added_left = true;
430
+ }
431
+ else
432
+ {
433
+ cond_right
434
+ +=
435
+ ((added_right)? (std::string(", ")) : (std::string("")))
436
+ + std::string("'")
437
+ + categ_levels[tree.col_num][categ]
438
+ + std::string("'");
439
+ added_right = true;
440
+ }
441
+ }
442
+ break;
443
+ }
444
+ }
445
+ }
446
+ if (added_left)
447
+ cond_left += std::string(")");
448
+ else
449
+ cond_left = std::string("");
450
+ if (added_right)
451
+ cond_right += std::string(")");
452
+ else
453
+ cond_right = std::string("");
454
+
455
+ break;
456
+ }
457
+ }
458
+ break;
459
+ }
460
+ }
461
+ }
462
+
463
+ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
464
+ std::string &cond_left, std::string &cond_right,
465
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
466
+ std::vector<std::vector<std::string>> &categ_levels)
467
+ {
468
+ cond_left = std::string("");
469
+ cond_right = std::string("");
470
+ if (hplane.score >= 0.)
471
+ return;
472
+
473
+ std::string hplane_conds = std::string("");
474
+
475
+ size_t n_visited_numeric = 0;
476
+ size_t n_visited_categ = 0;
477
+ for (size_t ix = 0; ix < hplane.col_num.size(); ix++)
478
+ {
479
+ hplane_conds
480
+ +=
481
+ ((hplane_conds.length())? (std::string(" + ")) : (std::string("")))
482
+ + ((model.missing_action == Impute)? (std::string("COALESCE(")) : (std::string("")));
483
+ switch(hplane.col_type[ix])
484
+ {
485
+ case Numeric:
486
+ {
487
+ hplane_conds
488
+ +=
489
+ std::to_string(hplane.coef[n_visited_numeric])
490
+ + std::string(" * (")
491
+ + numeric_colnames[hplane.col_num[ix]]
492
+ + ((hplane.mean[n_visited_numeric] >= 0.)? (std::string(" - ")) : (std::string(" - (")))
493
+ + std::to_string(hplane.mean[n_visited_numeric])
494
+ + ((hplane.mean[n_visited_numeric] >= 0.)? (std::string(")")) : (std::string("))")));
495
+ n_visited_numeric++;
496
+ break;
497
+ }
498
+
499
+ case Categorical:
500
+ {
501
+ switch(model.cat_split_type)
502
+ {
503
+ case SingleCateg:
504
+ {
505
+ hplane_conds
506
+ +=
507
+ std::string("CASE WHEN ")
508
+ + categ_colnames[hplane.col_num[ix]]
509
+ + std::string(" = '")
510
+ + categ_levels[hplane.col_num[ix]][hplane.chosen_cat[n_visited_categ]]
511
+ + std::string("' THEN ")
512
+ + std::to_string(hplane.fill_new[n_visited_categ])
513
+ + std::string(" ELSE 0.0 END");
514
+ break;
515
+ }
516
+
517
+ case SubSet:
518
+ {
519
+ hplane_conds += std::string("CASE ") + categ_colnames[hplane.col_num[ix]];
520
+ for (size_t categ = 0; categ < hplane.cat_coef[hplane.col_num[ix]].size(); categ++)
521
+ {
522
+ hplane_conds
523
+ +=
524
+ std::string(" WHEN '")
525
+ + categ_levels[hplane.col_num[ix]][categ]
526
+ + std::string("' THEN ")
527
+ + std::to_string( hplane.cat_coef[hplane.col_num[ix]][categ]);
528
+ }
529
+ if (model.new_cat_action == Smallest)
530
+ hplane_conds += std::string(" ELSE ") + std::to_string(hplane.fill_new[n_visited_categ]);
531
+ hplane_conds += std::string(" END");
532
+ break;
533
+ }
534
+ }
535
+ n_visited_categ++;
536
+ break;
537
+ }
538
+ }
539
+ hplane_conds += ((model.missing_action == Impute)?
540
+ (std::string(", ") + std::to_string(hplane.fill_val[ix]) + std::string(")")) : (std::string("")));
541
+ }
542
+
543
+ cond_left = hplane_conds + std::string(" <= ") + std::to_string(hplane.split_point);
544
+ cond_right = hplane_conds + std::string(" > ") + std::to_string(hplane.split_point);
545
+ }