isotree 0.2.2 → 0.3.0

Files changed (151)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -1,853 +0,0 @@
- /* Isolation forests and variations thereof, with adjustments for incorporation
- * of categorical variables and missing values.
- * Writen for C++11 standard and aimed at being used in R and Python.
- *
- * This library is based on the following works:
- * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- * "Isolation forest."
- * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
- * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- * "Isolation-based anomaly detection."
- * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
- * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
- * "Extended Isolation Forest."
- * arXiv preprint arXiv:1811.02141 (2018).
- * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
- * "On detecting clustered anomalies using SCiForest."
- * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
- * [5] https://sourceforge.net/projects/iforest/
- * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
- * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
- *
- * BSD 2-Clause License
- * Copyright (c) 2020, David Cortes
- * All rights reserved.
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
- #include "isotree.hpp"
-
- /* Predict outlier score, average depth, or terminal node numbers
- *
- * Parameters
- * ==========
- * - numeric_data[nrows * ncols_numeric]
- * Pointer to numeric data for which to make predictions. Must be ordered by columns like Fortran,
- * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
- * and the column order must be the same as in the data that was used to fit the model.
- * Pass NULL if there are no dense numeric columns.
- * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
- * - categ_data[nrows * ncols_categ]
- * Pointer to categorical data for which to make predictions. Must be ordered by columns like Fortran,
- * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
- * and the column order must be the same as in the data that was used to fit the model.
- * Pass NULL if there are no categorical columns.
- * Each category should be represented as an integer, and these integers must start at zero and
- * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
- * present when the model was fit (note that they are not treated as being ordinal, this is just
- * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
- * must be the same as was used in the data to which the model was fit.
- * - Xc[nnz]
- * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
- * Pass NULL if there are no sparse numeric columns.
- * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
- * - Xc_ind[nnz]
- * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
- * Pass NULL if there are no sparse numeric columns in CSC format.
- * - Xc_indptr[ncols_categ + 1]
- * Pointer to column index pointers that tell at entry [col] where does column 'col'
- * start and at entry [col + 1] where does column 'col' end.
- * Pass NULL if there are no sparse numeric columns in CSC format.
- * - Xr[nnz]
- * Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
- * Pass NULL if there are no sparse numeric columns.
- * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
- * - Xr_ind[nnz]
- * Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
- * Pass NULL if there are no sparse numeric columns in CSR format.
- * - Xr_indptr[nrows + 1]
- * Pointer to row index pointers that tell at entry [row] where does row 'row'
- * start and at entry [row + 1] where does row 'row' end.
- * Pass NULL if there are no sparse numeric columns in CSR format.
- * - nrows
- * Number of rows in 'numeric_data', 'Xc', 'Xr, 'categ_data'.
- * - nthreads
- * Number of parallel threads to use. Note that, the more threads, the more memory will be
- * allocated, even if the thread does not end up being used. Ignored when not building with
- * OpenMP support.
- * - standardize
- * Whether to standardize the average depths for each row according to their relative magnitude
- * compared to the expected average, in order to obtain an outlier score. If passing 'false',
- * will output the average depth instead.
- * Ignored when not passing 'output_depths'.
- * - model_outputs
- * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
- * if the predictions are to be made from an extended model. Can only pass one of
- * 'model_outputs' and 'model_outputs_ext'.
- * - model_outputs_ext
- * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
- * if the predictions are to be made from a single-variable model. Can only pass one of
- * 'model_outputs' and 'model_outputs_ext'.
- * - output_depths[nrows] (out)
- * Pointer to array where the output average depths or outlier scores will be written into
- * (the return type is control according to parameter 'standardize').
- * Must already be initialized to zeros. Must also be passed and when the desired output
- * is terminal node numbers.
- * - tree_num[nrows * ntrees] (out)
- * Pointer to array where the output terminal node numbers will be written into.
- * Note that the mapping between tree node and terminal tree node is not stored in
- * the model object for efficiency reasons, so this mapping will be determined on-the-fly
- * when passing this parameter, and as such, there will be some overhead regardless of
- * the actual number of rows. Pass NULL if only average depths or outlier scores are desired.
- */
- void predict_iforest(double numeric_data[], int categ_data[],
- double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
- double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
- size_t nrows, int nthreads, bool standardize,
- IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
- double output_depths[], sparse_ix tree_num[])
- {
- /* put data in a struct for passing it in fewer lines */
- PredictionData prediction_data = {numeric_data, categ_data, nrows,
- Xc, Xc_ind, Xc_indptr,
- Xr, Xr_ind, Xr_indptr};
-
- if ((size_t)nthreads > nrows)
- nthreads = nrows;
-
- if (model_outputs != NULL)
- {
- if (
- model_outputs->missing_action == Fail &&
- (model_outputs->new_cat_action != Weighted || prediction_data.categ_data == NULL) &&
- prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL
- )
- {
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs, prediction_data, output_depths, tree_num)
- for (size_t_for row = 0; row < nrows; row++)
- {
- for (std::vector<IsoTree> &tree : model_outputs->trees)
- {
- traverse_itree_no_recurse(tree,
- *model_outputs,
- prediction_data,
- output_depths[row],
- (tree_num == NULL)? NULL : tree_num + nrows * (&tree - &(model_outputs->trees[0])),
- (size_t) row);
- }
- }
- }
-
- else
- {
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs, prediction_data, output_depths, tree_num)
- for (size_t_for row = 0; row < nrows; row++)
- {
- for (std::vector<IsoTree> &tree : model_outputs->trees)
- {
- output_depths[row] += traverse_itree(tree,
- *model_outputs,
- prediction_data,
- NULL, NULL, 0,
- (size_t) row,
- (tree_num == NULL)? NULL : tree_num + nrows * (&tree - &(model_outputs->trees[0])),
- (size_t) 0);
- }
- }
- }
- }
-
-
- else
- {
- if (
- model_outputs_ext->missing_action == Fail &&
- prediction_data.categ_data == NULL &&
- prediction_data.Xc_indptr == NULL &&
- prediction_data.Xr_indptr == NULL
- )
- {
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num)
- for (size_t_for row = 0; row < nrows; row++)
- {
- for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
- {
- traverse_hplane_fast(hplane,
- *model_outputs_ext,
- prediction_data,
- output_depths[row],
- (tree_num == NULL)? NULL : tree_num + nrows * (&hplane - &(model_outputs_ext->hplanes[0])),
- (size_t) row);
- }
- }
- }
-
- else
- {
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num)
- for (size_t_for row = 0; row < nrows; row++)
- {
- for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
- {
- traverse_hplane(hplane,
- *model_outputs_ext,
- prediction_data,
- output_depths[row],
- NULL, NULL,
- (tree_num == NULL)? NULL : tree_num + nrows * (&hplane - &(model_outputs_ext->hplanes[0])),
- (size_t) row);
- }
- }
- }
- }
-
- /* translate sum-of-depths to outlier score */
- double ntrees, depth_divisor;
- if (model_outputs != NULL)
- {
- ntrees = (double) model_outputs->trees.size();
- depth_divisor = ntrees * (model_outputs->exp_avg_depth);
- }
-
- else
- {
- ntrees = (double) model_outputs_ext->hplanes.size();
- depth_divisor = ntrees * (model_outputs_ext->exp_avg_depth);
- }
-
- if (standardize)
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, output_depths, depth_divisor)
- for (size_t_for row = 0; row < nrows; row++)
- output_depths[row] = exp2( - output_depths[row] / depth_divisor );
- else
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(nrows, output_depths, ntrees)
- for (size_t_for row = 0; row < nrows; row++)
- output_depths[row] /= ntrees;
-
-
- /* re-map tree numbers to start at zero (if predicting tree numbers) */
- /* Note: usually this type of 'prediction' is not required,
- thus this mapping is not stored in the model objects so as to
- save memory */
- if (tree_num != NULL)
- remap_terminal_trees(model_outputs, model_outputs_ext,
- prediction_data, tree_num, nthreads);
- }
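For reference, the standardization branch above implements the usual isolation-forest anomaly score. Writing T for the number of trees (ntrees), \bar{d} for the model's expected average depth (exp_avg_depth), and D(x) = \sum_{t=1}^{T} d_t(x) for the per-row sum of depths accumulated into output_depths, the value written back is

    s(x) = 2^{-D(x) / (T \, \bar{d})} = 2^{-\overline{d(x)} / \bar{d}}

so rows that are isolated in fewer steps than expected score close to 1 (more anomalous), while typical rows land near 0.5 or below; with standardize = false the function instead returns the plain average depth D(x) / T.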
-
-
- /* TODO: these functions would be faster if done with row-major order,
- should at least give the option of taking arrays as row-major. */
- void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
- IsoForest &model_outputs,
- PredictionData &prediction_data,
- double &output_depth,
- sparse_ix *restrict tree_num,
- size_t row)
- {
- size_t curr_lev = 0;
- double xval;
- while (true)
- {
- if (tree[curr_lev].score > 0)
- {
- output_depth += tree[curr_lev].score;
- if (tree_num != NULL)
- tree_num[row] = curr_lev;
- break;
- }
-
- else
- {
- switch(tree[curr_lev].col_type)
- {
- case Numeric:
- {
- xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
- curr_lev = (xval <= tree[curr_lev].num_split)?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- output_depth += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
- break;
- }
-
- case Categorical:
- {
- switch(model_outputs.cat_split_type)
- {
- case SubSet:
- {
-
- if (!tree[curr_lev].cat_split.size()) /* this is for binary columns */
- {
- if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] <= 1)
- {
- curr_lev = (
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- == 0
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- }
-
- else /* can only work with 'Smallest' + no NAs if reaching this point */
- {
- curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- }
- }
-
- else
- {
-
- switch(model_outputs.new_cat_action)
- {
- case Random:
- {
- curr_lev = (tree[curr_lev].cat_split[
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- ]
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- break;
- }
-
- case Smallest:
- {
- if (
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- >= (int)tree[curr_lev].cat_split.size()
- )
- {
- curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- }
-
- else
- {
- curr_lev = (tree[curr_lev].cat_split[
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- ]
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- }
- break;
- }
- }
- }
- break;
- }
-
- case SingleCateg:
- {
- curr_lev = (
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- ==
- tree[curr_lev].chosen_cat
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- break;
- }
- }
- break;
- }
- }
- }
- }
- }
-
-
- double traverse_itree(std::vector<IsoTree> &tree,
- IsoForest &model_outputs,
- PredictionData &prediction_data,
- std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
- ImputedData *imputed_data, /* only when imputing missing */
- double curr_weight, /* only when imputing missing */
- size_t row,
- sparse_ix *restrict tree_num,
- size_t curr_lev)
- {
- double xval;
- double range_penalty = 0;
-
- sparse_ix *row_st = NULL, *row_end = NULL;
- if (prediction_data.Xr_indptr != NULL)
- {
- row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
- row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
- }
-
- while (true)
- {
- if (tree[curr_lev].score >= 0.)
- {
- if (tree_num != NULL)
- tree_num[row] = curr_lev;
- if (imputed_data != NULL)
- add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, curr_weight);
-
- return tree[curr_lev].score + range_penalty;
- }
-
- else
- {
- switch(tree[curr_lev].col_type)
- {
- case Numeric:
- {
-
- if (prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL)
- xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
- else if (prediction_data.Xc_indptr != NULL)
- xval = extract_spC(prediction_data, row, tree[curr_lev].col_num);
- else
- xval = extract_spR(prediction_data, row_st, row_end, tree[curr_lev].col_num);
-
- if (isnan(xval))
- {
- switch(model_outputs.missing_action)
- {
- case Divide:
- {
- return
- tree[curr_lev].pct_tree_left
- * traverse_itree(tree, model_outputs, prediction_data,
- impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
- row, NULL, tree[curr_lev].tree_left)
- + (1 - tree[curr_lev].pct_tree_left)
- * traverse_itree(tree, model_outputs, prediction_data,
- impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
- row, NULL, tree[curr_lev].tree_right)
- + range_penalty;
- }
-
- case Impute:
- {
- curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- break;
- }
-
- case Fail:
- {
- return NAN;
- }
- }
- }
-
- else
- {
- curr_lev = (xval <=tree[curr_lev].num_split)?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- range_penalty += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
- }
- break;
- }
-
- case Categorical:
- {
-
- if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] < 0)
- {
- switch(model_outputs.missing_action)
- {
- case Divide:
- {
- return
- tree[curr_lev].pct_tree_left
- * traverse_itree(tree, model_outputs, prediction_data,
- impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
- row, NULL, tree[curr_lev].tree_left)
- + (1 - tree[curr_lev].pct_tree_left)
- * traverse_itree(tree, model_outputs, prediction_data,
- impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
- row, NULL, tree[curr_lev].tree_right)
- + range_penalty;
- }
-
- case Impute:
- {
- curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- break;
- }
-
- case Fail:
- {
- return NAN;
- }
- }
- }
-
- else
- {
- switch(model_outputs.cat_split_type)
- {
- case SingleCateg:
- {
- curr_lev = (
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- ==
- tree[curr_lev].chosen_cat
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- break;
- }
-
- case SubSet:
- {
-
- if (!tree[curr_lev].cat_split.size())
- {
- if (prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows] <= 1)
- {
- curr_lev = (
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- == 0
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- }
-
- else
- {
- switch(model_outputs.new_cat_action)
- {
- case Smallest:
- {
- curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- break;
- }
-
- case Weighted:
- {
- return
- tree[curr_lev].pct_tree_left
- * traverse_itree(tree, model_outputs, prediction_data,
- impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
- row, NULL, tree[curr_lev].tree_left)
- + (1 - tree[curr_lev].pct_tree_left)
- * traverse_itree(tree, model_outputs, prediction_data,
- impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
- row, NULL, tree[curr_lev].tree_right)
- + range_penalty;
- }
- }
- }
- }
-
- else
- {
- switch(model_outputs.new_cat_action)
- {
- case Random:
- {
- curr_lev = (tree[curr_lev].cat_split[
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- ]
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- break;
- }
-
- case Smallest:
- {
- if (
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- >= (int)tree[curr_lev].cat_split.size()
- )
- {
- curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- }
-
- else
- {
- curr_lev = (tree[curr_lev].cat_split[
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- ]
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- }
- break;
- }
-
- case Weighted:
- {
- if (
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- >= (int)tree[curr_lev].cat_split.size()
- ||
- tree[curr_lev].cat_split[
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- ]
- == (-1)
- )
- {
- return
- tree[curr_lev].pct_tree_left
- * traverse_itree(tree, model_outputs, prediction_data,
- impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
- row, NULL, tree[curr_lev].tree_left)
- + (1 - tree[curr_lev].pct_tree_left)
- * traverse_itree(tree, model_outputs, prediction_data,
- impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
- row, NULL, tree[curr_lev].tree_right)
- + range_penalty;
- }
-
- else
- {
- curr_lev = (tree[curr_lev].cat_split[
- prediction_data.categ_data[row + tree[curr_lev].col_num * prediction_data.nrows]
- ]
- )?
- tree[curr_lev].tree_left : tree[curr_lev].tree_right;
- }
- break;
- }
- }
- }
- break;
- }
- }
- }
- break;
- }
- }
- }
- }
- }
-
- /* this is a simpler version for situations in which there is
- only numeric data in dense arrays and no missing values */
- void traverse_hplane_fast(std::vector<IsoHPlane> &hplane,
- ExtIsoForest &model_outputs,
- PredictionData &prediction_data,
- double &output_depth,
- sparse_ix *restrict tree_num,
- size_t row)
- {
- size_t curr_lev = 0;
- double hval;
-
- while(true)
- {
- if (hplane[curr_lev].score > 0)
- {
- output_depth += hplane[curr_lev].score;
- if (tree_num != NULL)
- tree_num[row] = curr_lev;
- return;
- }
-
- else
- {
- hval = 0;
- for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
- hval += (prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows]
- - hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
- }
-
- output_depth += (hval < hplane[curr_lev].range_low) ||
- (hval > hplane[curr_lev].range_high);
- curr_lev = (hval <= hplane[curr_lev].split_point)?
- hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
- }
- }
-
- /* this is the full version that works with potentially missing values, sparse matrices, and categoricals */
- void traverse_hplane(std::vector<IsoHPlane> &hplane,
- ExtIsoForest &model_outputs,
- PredictionData &prediction_data,
- double &output_depth,
- std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
- ImputedData *imputed_data, /* only when imputing missing */
- sparse_ix *restrict tree_num,
- size_t row)
- {
- size_t curr_lev = 0;
- double xval;
- int cval;
- double hval;
-
- size_t ncols_numeric, ncols_categ;
-
- sparse_ix *row_st = NULL, *row_end = NULL;
- if (prediction_data.Xr_indptr != NULL)
- {
- row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
- row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
- }
-
- while(true)
- {
- if (hplane[curr_lev].score > 0)
- {
- output_depth += hplane[curr_lev].score;
- if (tree_num != NULL)
- tree_num[row] = curr_lev;
- if (imputed_data != NULL)
- {
- add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, (double)1);
- }
- return;
- }
-
- else
- {
- hval = 0;
- ncols_numeric = 0; ncols_categ = 0;
- for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
- {
- switch(hplane[curr_lev].col_type[col])
- {
- case Numeric:
- {
- if (prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL)
- xval = prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
- else if (prediction_data.Xc_indptr != NULL)
- xval = extract_spC(prediction_data, row, hplane[curr_lev].col_num[col]);
- else
- xval = extract_spR(prediction_data, row_st, row_end, hplane[curr_lev].col_num[col]);
-
- if (is_na_or_inf(xval))
- {
- if (model_outputs.missing_action != Fail)
- {
- hval += hplane[curr_lev].fill_val[col];
- }
-
- else
- {
- output_depth = NAN;
- return;
- }
- }
-
- else
- {
- hval += (xval - hplane[curr_lev].mean[ncols_numeric]) * hplane[curr_lev].coef[ncols_numeric];
- }
-
- ncols_numeric++;
- break;
- }
-
- case Categorical:
- {
- cval = prediction_data.categ_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
- if (cval < 0)
- {
- if (model_outputs.missing_action != Fail)
- {
- hval += hplane[curr_lev].fill_val[col];
- }
-
- else
- {
- output_depth = NAN;
- return;
- }
- }
-
- else
- {
- switch(model_outputs.cat_split_type)
- {
- case SingleCateg:
- {
- hval += (cval == hplane[curr_lev].chosen_cat[ncols_categ])? hplane[curr_lev].fill_new[ncols_categ] : 0;
- break;
- }
-
- case SubSet:
- {
- if (cval >= (int)hplane[curr_lev].cat_coef[ncols_categ].size())
- hval += hplane[curr_lev].fill_new[ncols_categ];
- else
- hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
- break;
- }
- }
- }
-
- ncols_categ++;
- break;
- }
- }
-
- }
-
- output_depth += (hval < hplane[curr_lev].range_low) ||
- (hval > hplane[curr_lev].range_high);
- curr_lev = (hval <= hplane[curr_lev].split_point)?
- hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
- }
- }
-
-
- double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num)
- {
- sparse_ix *search_res = std::lower_bound(prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num],
- prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1],
- (sparse_ix) row);
- if (
- search_res == (prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1])
- ||
- *search_res != row
- )
- return 0.;
- else
- return prediction_data.Xc[search_res - prediction_data.Xc_ind];
- }
-
- double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num)
- {
- if (row_end == row_st)
- return 0.;
- sparse_ix *search_res = std::lower_bound(row_st, row_end, (sparse_ix) col_num);
- if (search_res == row_end || *search_res != (sparse_ix)col_num)
- return 0.;
- else
- return prediction_data.Xr[search_res - prediction_data.Xr_ind];
- }
-
- void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads)
- {
- std::fill(n_terminal, n_terminal + model_outputs.trees.size(), 0);
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
- for (size_t_for tree = 0; tree < model_outputs.trees.size(); tree++)
- {
- n_nodes[tree] = model_outputs.trees[tree].size();
- for (IsoTree &node : model_outputs.trees[tree])
- {
- n_terminal[tree] += (node.score > 0);
- }
- }
- }
-
- void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads)
- {
- std::fill(n_terminal, n_terminal + model_outputs.hplanes.size(), 0);
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
- for (size_t_for hplane = 0; hplane < model_outputs.hplanes.size(); hplane++)
- {
- n_nodes[hplane] = model_outputs.hplanes[hplane].size();
- for (IsoHPlane &node : model_outputs.hplanes[hplane])
- {
- n_terminal[hplane] += (node.score > 0);
- }
- }
- }
-
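As a usage illustration for the prediction routine shown above (taken from the removed predict.cpp; the replacement predict.hpp in 0.3.0 may expose a different signature), here is a minimal sketch of scoring a dense, column-major numeric matrix with a single-variable model. The fitting step is not part of this diff, so the model pointer is assumed to come from fit_iforest elsewhere, and the wrapper name score_rows is illustrative only.

#include <cstddef>
#include <vector>
#include "isotree.hpp"

/* Sketch: standardized outlier scores for nrows observations, dense numeric data only.
   numeric_data must be column-major (Fortran order) with the same column order used
   when the model was fit, as required by the documentation block above. */
std::vector<double> score_rows(IsoForest *model, double *numeric_data, size_t nrows)
{
    std::vector<double> scores(nrows, 0.);  /* must start at zero: depths are accumulated */
    predict_iforest(numeric_data, NULL,     /* no categorical columns */
                    NULL, NULL, NULL,       /* no CSC inputs */
                    NULL, NULL, NULL,       /* no CSR inputs */
                    nrows, 1, true,         /* single thread, standardize into scores */
                    model, NULL,            /* single-variable model, no extended model */
                    scores.data(), NULL);   /* no terminal-node output */
    return scores;                          /* values near 1 flag likely outliers */
}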