isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -0,0 +1,1932 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ *  Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+ #include "isotree.hpp"
64
+
65
+ /* TODO: should create versions of these functions that would work on the
66
+ serialized raw bytes instead, as it will likely be faster due to better
67
+    cache utilization and those objects use less memory. */
68
+
69
+ /* TODO: these trees are all created in a depth-first fashion, which will
70
+ not be cache-friendly when predictions are sent to a right-side branch. In
71
+ order to make predictions faster, could re-arrange the trees after-the-fact
72
+ so that they contain batches of consecutive nodes (parent and children and
73
+ grandchildren) up to some depth - that way these prediction functions would
74
+ run faster. After that, could also do a manual tree leaves unroll within each
75
+ batch with stack-assigned variables for an even faster prediction function. */
76
+
77
+
78
+ /* Predict outlier score, average depth, or terminal node numbers
79
+ *
80
+ * Parameters
81
+ * ==========
82
+ * - numeric_data[nrows * ncols_numeric]
83
+ * Pointer to numeric data for which to make predictions. May be ordered by rows
84
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
85
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
86
+ * (see parameter 'is_col_major').
87
+ * Pass NULL if there are no dense numeric columns.
88
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
89
+ * - categ_data[nrows * ncols_categ]
90
+ * Pointer to categorical data for which to make predictions. May be ordered by rows
91
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
92
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
93
+ * (see parameter 'is_col_major').
94
+ * Pass NULL if there are no categorical columns.
95
+ * Each category should be represented as an integer, and these integers must start at zero and
96
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
97
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
98
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
99
+ * must be the same as was used in the data to which the model was fit.
100
+ * - is_col_major
101
+ * Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
102
+ * model was fit. If passing 'false', will assume they are in row-major order. Note that most of
103
+ * the functions in this library work only with column-major order, but here both are suitable
104
+ * and row-major is preferred. Both arrays must have the same orientation (row/column major).
105
+ * If there is numeric sparse data in combination with categorical dense data and there are many
106
+ * rows, it is recommended to pass the categorical data in column major order, as it will take
107
+ * a faster route.
108
+ * If passing 'is_col_major=true', must also provide 'ld_numeric' and/or 'ld_categ'.
109
+ * - ld_numeric
110
+ * Leading dimension of the array 'numeric_data', if it is passed in row-major format.
111
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
112
+ * be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
113
+ * 'numeric_data' in column-major order, this is ignored and will be assumed that the
114
+ * leading dimension corresponds to the number of rows. This is ignored when passing numeric
115
+ * data in sparse format.
116
+ * - ld_categ
117
+ * Leading dimension of the array 'categ_data', if it is passed in row-major format.
118
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
119
+ * be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
120
+ * 'categ_data' in column-major order, this is ignored and will be assumed that the
121
+ * leading dimension corresponds to the number of rows.
122
+ * - Xc[nnz]
123
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
124
+ * Pass NULL if there are no sparse numeric columns.
125
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
126
+ * - Xc_ind[nnz]
127
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
128
+ * Must be in sorted order, otherwise results will be incorrect.
129
+ * Pass NULL if there are no sparse numeric columns in CSC format.
130
+ * - Xc_indptr[ncols_numeric + 1]
131
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
132
+ * start and at entry [col + 1] where does column 'col' end.
133
+ * Pass NULL if there are no sparse numeric columns in CSC format.
134
+ * - Xr[nnz]
135
+ * Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
136
+ * Pass NULL if there are no sparse numeric columns.
137
+ * Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
138
+ * - Xr_ind[nnz]
139
+ * Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
140
+ * Must be in sorted order, otherwise results will be incorrect.
141
+ * Pass NULL if there are no sparse numeric columns in CSR format.
142
+ * - Xr_indptr[nrows + 1]
143
+ * Pointer to row index pointers that tell at entry [row] where does row 'row'
144
+ * start and at entry [row + 1] where does row 'row' end.
145
+ * Pass NULL if there are no sparse numeric columns in CSR format.
146
+ * - nrows
147
+ *       Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
148
+ * - nthreads
149
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
150
+ * allocated, even if the thread does not end up being used. Ignored when not building with
151
+ * OpenMP support.
152
+ * - standardize
153
+ * Whether to standardize the average depths for each row according to their relative magnitude
154
+ * compared to the expected average, in order to obtain an outlier score. If passing 'false',
155
+ * will output the average depth instead.
156
+ * Ignored when not passing 'output_depths'.
157
+ * - model_outputs
158
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
159
+ * if the predictions are to be made from an extended model. Can only pass one of
160
+ * 'model_outputs' and 'model_outputs_ext'.
161
+ * - model_outputs_ext
162
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
163
+ * if the predictions are to be made from a single-variable model. Can only pass one of
164
+ * 'model_outputs' and 'model_outputs_ext'.
165
+ * - output_depths[nrows] (out)
166
+ * Pointer to array where the output average depths or outlier scores will be written into
167
+ * (the return type is controlled according to parameter 'standardize').
168
+ * Should always be passed when calling this function (it is not optional).
169
+ * - tree_num[nrows * ntrees] (out)
170
+ * Pointer to array where the output terminal node numbers will be written into.
171
+ * Note that the mapping between tree node and terminal tree node is not stored in
172
+ * the model object for efficiency reasons, so this mapping will be determined on-the-fly
173
+ * when passing this parameter, and as such, there will be some overhead regardless of
174
+ * the actual number of rows. Output will be in column-major order ([nrows, ntrees]).
175
+ * This will not be calculable when using 'ndim==1' alongside with either
176
+ * 'missing_action==Divide' or 'new_categ_action=Weighted'.
177
+ * Pass NULL if this type of output is not needed.
178
+ * - per_tree_depths[nrows * ntrees] (out)
179
+ * Pointer to array where to output per-tree depths or expected depths for each row.
180
+ *       Note that these will not include range penalties ('penalize_range=true').
181
+ * Output will be in row-major order ([nrows, ntrees]).
182
+ * This will not be calculable when using 'ndim==1' alongside with either
183
+ * 'missing_action==Divide' or 'new_categ_action=Weighted'.
184
+ * Pass NULL if this type of output is not needed.
185
+ * - indexer
186
+ * Pointer to associated tree indexer for the model being used, if it was constructed,
187
+ * which can be used to speed up tree numbers/indices predictions.
188
+ * This is ignored when not passing 'tree_num'.
189
+ * Pass NULL if the indexer has not been constructed.
190
+ */
191
/* Template parameters:
   - real_t    : floating-point type of the dense/sparse numeric input arrays.
   - sparse_ix : integer type of the sparse-index arrays and of 'tree_num'.
   See the large documentation comment above for the meaning of each argument.
   Note: 'size_t_for' is presumably a macro/typedef for the OpenMP loop-index
   type (defined elsewhere in this project) — TODO confirm. */
template <class real_t, class sparse_ix>
void predict_iforest(real_t *restrict numeric_data, int *restrict categ_data,
                     bool is_col_major, size_t ld_numeric, size_t ld_categ,
                     real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
                     real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
                     size_t nrows, int nthreads, bool standardize,
                     IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
                     double *restrict output_depths, sparse_ix *restrict tree_num,
                     double *restrict per_tree_depths,
                     TreesIndexer *indexer)
{
    /* nothing to predict for an empty input */
    if (unlikely(!nrows)) return;

    /* put data in a struct for passing it in fewer lines */
    PredictionData<real_t, sparse_ix>
        prediction_data = {numeric_data, categ_data, nrows,
                           is_col_major, ld_numeric, ld_categ,
                           Xc, Xc_ind, Xc_indptr,
                           Xr, Xr_ind, Xr_indptr};

    /* no point in spawning more threads than there are rows to predict on
       (the original request is kept for the batched CSC route below) */
    int nthreads_orig = nthreads;
    if ((size_t)nthreads > nrows)
        nthreads = nrows;

    /* For batch predictions of sparse CSC, will take a specialized route */
    if (prediction_data.Xc_indptr != NULL && (prediction_data.categ_data == NULL || prediction_data.is_col_major))
    {
        batched_csc_predict(prediction_data, nthreads_orig,
                            model_outputs, model_outputs_ext,
                            output_depths, tree_num,
                            per_tree_depths);
    }

    /* Regular case (no specialized CSC route): single-variable model */
    else if (model_outputs != NULL)
    {
        /* Fast path: no missing-value handling, no weighted categorical
           splits, dense data only, and no range penalty to accumulate. */
        if (
            model_outputs->missing_action == Fail &&
            (model_outputs->new_cat_action != Weighted || model_outputs->cat_split_type == SingleCateg || prediction_data.categ_data == NULL) &&
            prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL &&
            !model_outputs->has_range_penalty
        )
        {
            /* Numeric-only data that is row-major (or a single row): each row is
               a contiguous slice, so a raw pointer to the row can be passed. */
            if (prediction_data.categ_data == NULL && (nrows == 1 || !prediction_data.is_col_major))
            {
                #pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
                        shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
                for (size_t_for row = 0; row < (decltype(row))nrows; row++)
                {
                    double score = 0;
                    for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
                    {
                        traverse_itree_fast(model_outputs->trees[tree],
                                            *model_outputs,
                                            prediction_data.numeric_data + row * prediction_data.ncols_numeric,
                                            score,
                                            (tree_num == NULL)? NULL : (tree_num + nrows * tree),
                                            (per_tree_depths == NULL)?
                                                NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
                                            (size_t) row);
                    }
                    output_depths[row] = score;
                }
            }

            /* Otherwise still a fast traversal, but indexing through the
               prediction-data struct instead of a per-row pointer. */
            else
            {
                #pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
                        shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
                for (size_t_for row = 0; row < (decltype(row))nrows; row++)
                {
                    double score = 0;
                    for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
                    {
                        traverse_itree_no_recurse(model_outputs->trees[tree],
                                                  *model_outputs,
                                                  prediction_data,
                                                  score,
                                                  (tree_num == NULL)? NULL : (tree_num + nrows * tree),
                                                  (per_tree_depths == NULL)?
                                                      NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
                                                  (size_t) row);
                    }
                    output_depths[row] = score;
                }
            }
        }

        /* General path: full traversal that handles missing values,
           weighted splits, sparse inputs, and range penalties. */
        else
        {
            #pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
                    shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
            for (size_t_for row = 0; row < (decltype(row))nrows; row++)
            {
                double score = 0;
                for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
                {
                    /* NULL impute arguments: no imputation is done during prediction */
                    score += traverse_itree(model_outputs->trees[tree],
                                            *model_outputs,
                                            prediction_data,
                                            (std::vector<ImputeNode>*)NULL,
                                            (ImputedData<sparse_ix, double>*)NULL,
                                            (double)0,
                                            (size_t) row,
                                            (tree_num == NULL)? NULL : (tree_num + nrows * tree),
                                            (per_tree_depths == NULL)?
                                                NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
                                            (size_t) 0);
                }
                output_depths[row] = score;
            }
        }
    }


    /* Extended (multi-variable) model */
    else
    {
        /* Fast path: dense numeric-only data, no missing-value handling,
           no range penalty. */
        if (
            model_outputs_ext->missing_action == Fail &&
            prediction_data.categ_data == NULL &&
            prediction_data.Xc_indptr == NULL &&
            prediction_data.Xr_indptr == NULL &&
            !model_outputs_ext->has_range_penalty
        )
        {
            if (prediction_data.is_col_major && nrows > 1)
            {
                #pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
                        shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
                for (size_t_for row = 0; row < (decltype(row))nrows; row++)
                {
                    double score = 0;
                    for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
                    {
                        traverse_hplane_fast_colmajor(model_outputs_ext->hplanes[tree],
                                                      *model_outputs_ext,
                                                      prediction_data,
                                                      score,
                                                      (tree_num == NULL)? NULL : (tree_num + nrows * tree),
                                                      (per_tree_depths == NULL)?
                                                          NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
                                                      (size_t) row);
                    }
                    output_depths[row] = score;
                }
            }

            /* Row-major (or single-row) data: pass a raw pointer to the row. */
            else
            {
                #pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
                        shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
                for (size_t_for row = 0; row < (decltype(row))nrows; row++)
                {
                    double score = 0;
                    for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
                    {
                        traverse_hplane_fast_rowmajor(model_outputs_ext->hplanes[tree],
                                                      *model_outputs_ext,
                                                      prediction_data.numeric_data + row * prediction_data.ncols_numeric,
                                                      score,
                                                      (tree_num == NULL)? NULL : (tree_num + nrows * tree),
                                                      (per_tree_depths == NULL)?
                                                          NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
                                                      (size_t) row);
                    }
                    output_depths[row] = score;
                }
            }
        }

        /* General path for the extended model */
        else
        {
            #pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
                    shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
            for (size_t_for row = 0; row < (decltype(row))nrows; row++)
            {
                double score = 0;
                for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
                {
                    /* NULL impute arguments: no imputation is done during prediction */
                    traverse_hplane(model_outputs_ext->hplanes[tree],
                                    *model_outputs_ext,
                                    prediction_data,
                                    score,
                                    (std::vector<ImputeNode>*)NULL,
                                    (ImputedData<sparse_ix, double>*)NULL,
                                    (tree_num == NULL)? NULL : (tree_num + nrows * tree),
                                    (per_tree_depths == NULL)?
                                        NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
                                    (size_t) row);
                }
                output_depths[row] = score;
            }
        }
    }

    /* translate sum-of-depths to outlier score */
    double ntrees, depth_divisor;
    if (model_outputs != NULL)
    {
        ntrees = (double) model_outputs->trees.size();
        depth_divisor = ntrees * (model_outputs->exp_avg_depth);
    }

    else
    {
        ntrees = (double) model_outputs_ext->hplanes.size();
        depth_divisor = ntrees * (model_outputs_ext->exp_avg_depth);
    }


    /* for density and boxed_ratio, each tree will have 'log(d)'' instead of 'd' */
    bool is_density = (model_outputs != NULL && model_outputs->scoring_metric == Density) ||
                      (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == Density);
    bool is_bratio = (model_outputs != NULL && model_outputs->scoring_metric == BoxedRatio) ||
                     (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedRatio);
    bool is_bdens = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) ||
                    (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity);
    bool is_bdens2 = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity2) ||
                     (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity2);

    if (standardize)
    {
        if (is_density || is_bdens2)
        {
            /* negating the divisor both averages and flips the sign in one
               division per row (presumably so that larger values are more
               outlier-like — confirm against the scoring-metric docs) */
            ntrees = -ntrees;
            for (size_t row = 0; row < nrows; row++)
                output_depths[row] /= ntrees;
        }

        else if (is_bdens)
        {
            /* 'omp simd' guarded out on Windows — presumably because MSVC's
               OpenMP support lacks it; TODO confirm */
            #ifndef _WIN32
            #pragma omp simd
            #endif
            for (size_t row = 0; row < nrows; row++)
                output_depths[row] = -std::exp(output_depths[row] / ntrees);
        }

        else if (is_bratio)
        {
            for (size_t row = 0; row < nrows; row++)
                output_depths[row] = output_depths[row] / ntrees;
        }

        /* standard isolation-forest score: 2^(-avg_depth / expected_avg_depth) */
        else
        {
            #ifndef _WIN32
            #pragma omp simd
            #endif
            for (size_t row = 0; row < nrows; row++)
                output_depths[row] = std::exp2( - output_depths[row] / depth_divisor );
        }
    }

    /* non-standardized outputs: plain per-tree averages */
    else
    {
        if (is_density || is_bdens || is_bdens2)
        {
            #ifndef _WIN32
            #pragma omp simd
            #endif
            for (size_t row = 0; row < nrows; row++)
                output_depths[row] = std::exp(output_depths[row] / ntrees);
        }

        else if (is_bratio)
        {
            /* negated divisor flips the sign while averaging (see above) */
            ntrees = -ntrees;
            for (size_t row = 0; row < nrows; row++)
                output_depths[row] /= ntrees;
        }

        else
        {
            for (size_t row = 0; row < nrows; row++)
                output_depths[row] /= ntrees;
        }
    }

    /* density-type metrics accumulated logs, so per-tree outputs are exponentiated back */
    if (per_tree_depths != NULL && (is_density || is_bdens || is_bdens2))
    {
        size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
        #ifndef _WIN32
        #pragma omp simd
        #endif
        for (size_t ix = 0; ix < nrows*ntrees; ix++)
            per_tree_depths[ix] = std::exp(per_tree_depths[ix]);
    }


    /* re-map tree numbers to start at zero (if predicting tree numbers) */
    /* Note: usually this type of 'prediction' is not required,
       thus this mapping is not stored in the model objects so as to
       save memory */
    if (tree_num != NULL)
    {
        if (indexer != NULL && !indexer->indices.empty())
        {
            /* (this 'ntrees' shadows the outer 'double ntrees' above) */
            size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
            if (model_outputs != NULL)
            {
                /* these configurations bypass the indexer's precomputed
                   mappings and fall back to on-the-fly remapping */
                if (model_outputs->missing_action == Divide)
                    goto manual_remap;
                if (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && categ_data != NULL)
                    goto manual_remap;
            }

            /* fast route: use the indexer's precomputed terminal-node mappings */
            for (size_t tree = 0; tree < ntrees; tree++)
            {
                size_t *restrict mapping = indexer->indices[tree].terminal_node_mappings.data();
                for (size_t row = 0; row < nrows; row++)
                {
                    tree_num[row + tree*nrows] = mapping[tree_num[row + tree*nrows]];
                }
            }
        }

        else
        {
            manual_remap:
            remap_terminal_trees(model_outputs, model_outputs_ext,
                                 prediction_data, tree_num, nthreads);
        }
    }
}
516
+
517
+ template <class real_t, class sparse_ix>
518
+ void traverse_itree_fast(std::vector<IsoTree> &tree,
519
+ IsoForest &model_outputs,
520
+ real_t *restrict row_numeric_data,
521
+ double &restrict output_depth,
522
+ sparse_ix *restrict tree_num,
523
+ double *restrict tree_depth,
524
+ size_t row) noexcept
525
+ {
526
+ size_t curr_lev = 0;
527
+ double xval;
528
+ while (true)
529
+ {
530
+ if (unlikely(tree[curr_lev].tree_left == 0))
531
+ {
532
+ output_depth += tree[curr_lev].score;
533
+ if (unlikely(tree_num != NULL))
534
+ tree_num[row] = curr_lev;
535
+ if (unlikely(tree_depth != NULL))
536
+ *tree_depth = tree[curr_lev].score;
537
+ break;
538
+ }
539
+
540
+ else
541
+ {
542
+ xval = row_numeric_data[tree[curr_lev].col_num];
543
+ curr_lev = (xval <= tree[curr_lev].num_split)?
544
+ tree[curr_lev].tree_left : tree[curr_lev].tree_right;
545
+ }
546
+ }
547
+ }
548
+
549
/* Iterative (non-recursive) traversal of one isolation tree for a single row.
   Fast path for dense inputs with no missing values and no range penalty;
   unlike 'traverse_itree_fast', it supports categorical columns and both
   row-major and column-major layouts.
   Adds the terminal node's score to 'output_depth'; if 'tree_num' is passed,
   records the terminal node index in 'tree_num[row]'; if 'tree_depth' is
   passed, stores the terminal score in '*tree_depth'. */
template <class PredictionData, class sparse_ix>
void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
                               IsoForest &model_outputs,
                               PredictionData &prediction_data,
                               double &restrict output_depth,
                               sparse_ix *restrict tree_num,
                               double *restrict tree_depth,
                               size_t row) noexcept
{
    size_t curr_lev = 0;
    double xval;
    int cval;
    while (true)
    {
        // if (tree[curr_lev].score > 0)
        /* terminal nodes are encoded with 'tree_left == 0' */
        if (unlikely(tree[curr_lev].tree_left == 0))
        {
            output_depth += tree[curr_lev].score;
            if (unlikely(tree_num != NULL))
                tree_num[row] = curr_lev;
            if (unlikely(tree_depth != NULL))
                *tree_depth = tree[curr_lev].score;
            break;
        }

        else
        {
            switch (tree[curr_lev].col_type)
            {
                case Numeric:
                {
                    /* pick the element according to the data's memory layout */
                    xval = prediction_data.numeric_data[
                        prediction_data.is_col_major?
                            (row + tree[curr_lev].col_num * prediction_data.nrows)
                                :
                            (tree[curr_lev].col_num + row * prediction_data.ncols_numeric)
                    ];
                    curr_lev = (xval <= tree[curr_lev].num_split)?
                                    tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                    break;
                }

                case Categorical:
                {
                    cval = prediction_data.categ_data[
                        prediction_data.is_col_major?
                            (row + tree[curr_lev].col_num * prediction_data.nrows)
                                :
                            (tree[curr_lev].col_num + row * prediction_data.ncols_categ)
                    ];
                    switch (model_outputs.cat_split_type)
                    {
                        case SubSet:
                        {

                            if (tree[curr_lev].cat_split.empty()) /* this is for binary columns */
                            {
                                if (cval <= 1)
                                {
                                    curr_lev = (cval == 0)?
                                                    tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                }

                                else /* can only work with 'Smallest' + no NAs if reaching this point */
                                {
                                    /* unseen category: send towards the smaller branch */
                                    curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                }
                            }

                            else
                            {

                                switch (model_outputs.new_cat_action)
                                {
                                    case Random:
                                    {
                                        /* unseen categories are mapped deterministically via modulo */
                                        cval = (cval >= (int)tree[curr_lev].cat_split.size())?
                                                    (cval % (int)tree[curr_lev].cat_split.size()) : cval;
                                        curr_lev = (tree[curr_lev].cat_split[cval])?
                                                        tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                        break;
                                    }

                                    case Smallest:
                                    {
                                        if (unlikely(cval >= (int)tree[curr_lev].cat_split.size()))
                                        {
                                            /* unseen category: send towards the smaller branch */
                                            curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                        }

                                        else
                                        {
                                            curr_lev = (tree[curr_lev].cat_split[cval])?
                                                            tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                        }
                                        break;
                                    }

                                    default:
                                    {
                                        /* 'Weighted' is handled by the recursive traversal function instead */
                                        assert(0);
                                        break;
                                    }
                                }
                            }
                            break;
                        }

                        case SingleCateg:
                        {
                            curr_lev = (cval == tree[curr_lev].chosen_cat)?
                                            tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                            break;
                        }
                    }
                    break;
                }

                default:
                {
                    assert(0);
                    break;
                }
            }
        }
    }
}
676
+
677
/* Memory layout of the numeric part of the input data, decided once per row
   traversal so the inner loop can dispatch on it cheaply. */
enum NumericConfig {DenseRowMajor, DenseColMajor, SparseCSR, SparseCSC};
678
+
679
/* Full recursive traversal of one isolation tree for a single row: handles
   dense row/column-major and CSR/CSC sparse numeric data, categorical columns,
   missing values, the range penalty, and (optionally) accumulation of
   imputation statistics.
   Returns the terminal node's score minus the accumulated range penalty, or
   NAN when a missing value is found and 'missing_action == Fail'.
   'impute_nodes' / 'imputed_data' / 'curr_weight' are used only when imputing
   missing values; 'curr_lev' is the node at which to start (non-zero in the
   recursive calls made for the 'Divide'/'Weighted' actions). */
template <class PredictionData, class sparse_ix, class ImputedData>
double traverse_itree(std::vector<IsoTree> &tree,
                      IsoForest &model_outputs,
                      PredictionData &prediction_data,
                      std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
                      ImputedData *imputed_data, /* only when imputing missing */
                      double curr_weight, /* only when imputing missing */
                      size_t row,
                      sparse_ix *restrict tree_num,
                      double *restrict tree_depth,
                      size_t curr_lev) noexcept
{
    double xval;
    int cval;
    double range_penalty = 0;

    /* determine the numeric-data layout once, before walking the tree */
    NumericConfig numeric_config;
    if (prediction_data.Xr_indptr != NULL)
        numeric_config = SparseCSR;
    else if (prediction_data.Xc_indptr != NULL)
        numeric_config = SparseCSC;
    else if (prediction_data.is_col_major)
        numeric_config = DenseColMajor;
    else
        numeric_config = DenseRowMajor;

    /* for CSR, pre-compute this row's slice of the indices array */
    sparse_ix *row_st = NULL, *row_end = NULL;
    if (numeric_config == SparseCSR)
    {
        row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
        row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
    }

    while (true)
    {
        // if (tree[curr_lev].score >= 0.)
        /* terminal nodes are encoded with 'tree_left == 0' */
        if (unlikely(tree[curr_lev].tree_left == 0))
        {
            if (unlikely(tree_num != NULL))
                tree_num[row] = curr_lev;
            if (unlikely(tree_depth != NULL))
                *tree_depth = tree[curr_lev].score;
            if (unlikely(imputed_data != NULL))
                add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, curr_weight);

            return tree[curr_lev].score - range_penalty;
        }

        else
        {
            switch(tree[curr_lev].col_type)
            {
                case Numeric:
                {
                    switch(numeric_config)
                    {
                        case DenseRowMajor:
                        {
                            xval = prediction_data.numeric_data[tree[curr_lev].col_num + row * prediction_data.ncols_numeric];
                            break;
                        }

                        case DenseColMajor:
                        {
                            xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
                            break;
                        }

                        case SparseCSR:
                        {
                            xval = extract_spR(prediction_data, row_st, row_end, tree[curr_lev].col_num);
                            break;
                        }

                        case SparseCSC:
                        {
                            xval = extract_spC(prediction_data, row, tree[curr_lev].col_num);
                            break;
                        }
                    }

                    if (unlikely(std::isnan(xval)))
                    {
                        switch(model_outputs.missing_action)
                        {
                            case Divide:
                            {
                                /* follow both branches recursively and return the
                                   weighted average of their results */
                                return
                                    tree[curr_lev].pct_tree_left
                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                         impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
                                                         row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
                                    + (1. - tree[curr_lev].pct_tree_left)
                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                         impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
                                                         row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
                                    - range_penalty;
                            }

                            case Impute:
                            {
                                /* follow the branch that had more observations */
                                curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
                                                tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                break;
                            }

                            case Fail:
                            {
                                return NAN;
                            }
                        }
                    }

                    else
                    {
                        /* penalize values outside the range seen at fitting time */
                        range_penalty += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
                        curr_lev = (xval <= tree[curr_lev].num_split)?
                                        tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                    }
                    break;
                }

                case Categorical:
                {
                    cval = prediction_data.categ_data[
                        prediction_data.is_col_major?
                            (row + tree[curr_lev].col_num * prediction_data.nrows)
                                :
                            (tree[curr_lev].col_num + row * prediction_data.ncols_categ)
                    ];
                    /* negative category encodes a missing value */
                    if (unlikely(cval < 0))
                    {
                        switch(model_outputs.missing_action)
                        {
                            case Divide:
                            {
                                return
                                    tree[curr_lev].pct_tree_left
                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                         impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
                                                         row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
                                    + (1. - tree[curr_lev].pct_tree_left)
                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                         impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
                                                         row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
                                    - range_penalty;
                            }

                            case Impute:
                            {
                                curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
                                                tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                break;
                            }

                            case Fail:
                            {
                                return NAN;
                            }
                        }
                    }

                    else
                    {
                        switch(model_outputs.cat_split_type)
                        {
                            case SingleCateg:
                            {
                                curr_lev = (cval == tree[curr_lev].chosen_cat)?
                                                tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                break;
                            }

                            case SubSet:
                            {

                                if (tree[curr_lev].cat_split.empty()) /* binary column: no explicit subset stored */
                                {
                                    if (cval <= 1)
                                    {
                                        curr_lev = (cval == 0)?
                                                        tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                    }

                                    else
                                    {
                                        /* category unseen at fitting time */
                                        switch(model_outputs.new_cat_action)
                                        {
                                            case Smallest:
                                            {
                                                curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                                break;
                                            }

                                            case Weighted:
                                            {
                                                return
                                                    tree[curr_lev].pct_tree_left
                                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                                         impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
                                                                         row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
                                                    + (1. - tree[curr_lev].pct_tree_left)
                                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                                         impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
                                                                         row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
                                                    - range_penalty;
                                            }

                                            default:
                                            {
                                                assert(0);
                                                break;
                                            }
                                        }
                                    }
                                }

                                else
                                {
                                    switch(model_outputs.new_cat_action)
                                    {
                                        case Random:
                                        {
                                            /* unseen categories are mapped deterministically via modulo */
                                            cval = (cval >= (int)tree[curr_lev].cat_split.size())?
                                                        (cval % (int)tree[curr_lev].cat_split.size()) : cval;
                                            curr_lev = (tree[curr_lev].cat_split[cval])?
                                                            tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                            break;
                                        }

                                        case Smallest:
                                        {
                                            if (unlikely(cval >= (int)tree[curr_lev].cat_split.size()))
                                            {
                                                curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                            }

                                            else
                                            {
                                                curr_lev = (tree[curr_lev].cat_split[cval])?
                                                                tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                            }
                                            break;
                                        }

                                        case Weighted:
                                        {
                                            /* -1 in 'cat_split' marks a category that was
                                               not present in the node at fitting time */
                                            if (cval >= (int)tree[curr_lev].cat_split.size()
                                                    ||
                                                tree[curr_lev].cat_split[cval] == (-1))
                                            {
                                                return
                                                    tree[curr_lev].pct_tree_left
                                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                                         impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
                                                                         row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
                                                    + (1. - tree[curr_lev].pct_tree_left)
                                                        * traverse_itree(tree, model_outputs, prediction_data,
                                                                         impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
                                                                         row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
                                                    - range_penalty;
                                            }

                                            else
                                            {
                                                curr_lev = (tree[curr_lev].cat_split[cval])?
                                                                tree[curr_lev].tree_left : tree[curr_lev].tree_right;
                                            }
                                            break;
                                        }
                                    }
                                }
                                break;
                            }
                        }
                    }
                    break;
                }

                default:
                {
                    assert(0);
                    break;
                }
            }
        }
    }
}
967
+
968
+ /* this is a simpler version for situations in which there is
969
+ only numeric data in dense arrays, no missing values, no range penalty */
970
+ template <class PredictionData, class sparse_ix>
971
+ void traverse_hplane_fast_colmajor(std::vector<IsoHPlane> &hplane,
972
+ ExtIsoForest &model_outputs,
973
+ PredictionData &prediction_data,
974
+ double &restrict output_depth,
975
+ sparse_ix *restrict tree_num,
976
+ double *restrict tree_depth,
977
+ size_t row) noexcept
978
+ {
979
+ size_t curr_lev = 0;
980
+ double hval;
981
+
982
+ while(true)
983
+ {
984
+ // if (hplane[curr_lev].score > 0)
985
+ if (unlikely(hplane[curr_lev].hplane_left == 0))
986
+ {
987
+ output_depth += hplane[curr_lev].score;
988
+ if (unlikely(tree_num != NULL))
989
+ tree_num[row] = curr_lev;
990
+ if (unlikely(tree_depth != NULL))
991
+ *tree_depth = hplane[curr_lev].score;
992
+ return;
993
+ }
994
+
995
+ else
996
+ {
997
+ hval = 0;
998
+ for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
999
+ hval += (prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows]
1000
+ - hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
1001
+
1002
+ curr_lev = (hval <= hplane[curr_lev].split_point)?
1003
+ hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
1004
+
1005
+ }
1006
+ }
1007
+ }
1008
+
1009
+ template <class real_t, class sparse_ix>
1010
+ void traverse_hplane_fast_rowmajor(std::vector<IsoHPlane> &hplane,
1011
+ ExtIsoForest &model_outputs,
1012
+ real_t *restrict row_numeric_data,
1013
+ double &restrict output_depth,
1014
+ sparse_ix *restrict tree_num,
1015
+ double *restrict tree_depth,
1016
+ size_t row) noexcept
1017
+ {
1018
+ size_t curr_lev = 0;
1019
+ double hval;
1020
+
1021
+ while(true)
1022
+ {
1023
+ // if (hplane[curr_lev].score > 0)
1024
+ if (unlikely(hplane[curr_lev].hplane_left == 0))
1025
+ {
1026
+ output_depth += hplane[curr_lev].score;
1027
+ if (unlikely(tree_num != NULL))
1028
+ tree_num[row] = curr_lev;
1029
+ if (unlikely(tree_depth != NULL))
1030
+ *tree_depth = hplane[curr_lev].score;
1031
+ return;
1032
+ }
1033
+
1034
+ else
1035
+ {
1036
+ hval = 0;
1037
+ for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
1038
+ hval += (row_numeric_data[hplane[curr_lev].col_num[col]]
1039
+ - hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
1040
+
1041
+ curr_lev = (hval <= hplane[curr_lev].split_point)?
1042
+ hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
1043
+
1044
+ }
1045
+ }
1046
+ }
1047
+
1048
/* Full extended-model (hyperplane) traversal for a single row: handles
   potentially-missing values, sparse (CSR/CSC) and dense (row/col-major)
   numeric data, and categorical columns. Adds the terminal node's score to
   'output_depth' (sets it to NAN and returns early when a missing value is
   found and 'missing_action == Fail'); optionally records the terminal node
   index, the terminal score, and imputation statistics. */
template <class PredictionData, class sparse_ix, class ImputedData>
void traverse_hplane(std::vector<IsoHPlane> &hplane,
                     ExtIsoForest &model_outputs,
                     PredictionData &prediction_data,
                     double &restrict output_depth,
                     std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
                     ImputedData *imputed_data, /* only when imputing missing */
                     sparse_ix *restrict tree_num,
                     double *restrict tree_depth,
                     size_t row) noexcept
{
    size_t curr_lev = 0;
    double xval;
    int cval;
    double hval;

    /* running positions into this node's numeric / categorical coefficients */
    size_t ncols_numeric, ncols_categ;

    /* determine the numeric-data layout once, before walking the tree */
    NumericConfig numeric_config;
    if (prediction_data.Xr_indptr != NULL)
        numeric_config = SparseCSR;
    else if (prediction_data.Xc_indptr != NULL)
        numeric_config = SparseCSC;
    else if (prediction_data.is_col_major)
        numeric_config = DenseColMajor;
    else
        numeric_config = DenseRowMajor;

    sparse_ix *row_st = NULL, *row_end = NULL;
    /* lb/ub: first and last column index present in this CSR row, used to
       short-circuit the column search in 'extract_spR'.
       NOTE(review): if a CSR row has no non-zero entries, 'row_st == row_end'
       and these dereferences read adjacent index entries — presumably harmless
       since 'extract_spR' then has an empty range, but verify. */
    size_t lb, ub;
    if (numeric_config == SparseCSR)
    {
        row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
        row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
        lb = *row_st;
        ub = *(row_end-1);
    }

    while (true)
    {
        // if (hplane[curr_lev].score > 0)
        /* terminal nodes are encoded with 'hplane_left == 0' */
        if (unlikely(hplane[curr_lev].hplane_left == 0))
        {
            output_depth += hplane[curr_lev].score;
            if (unlikely(tree_num != NULL))
                tree_num[row] = curr_lev;
            if (unlikely(tree_depth != NULL))
                *tree_depth = hplane[curr_lev].score;
            if (unlikely(imputed_data != NULL))
            {
                add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, (double)1);
            }
            return;
        }

        else
        {
            /* accumulate the row's projection onto this node's hyperplane */
            hval = 0;
            ncols_numeric = 0; ncols_categ = 0;
            for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
            {
                switch(hplane[curr_lev].col_type[col])
                {
                    case Numeric:
                    {
                        switch(numeric_config)
                        {
                            case DenseRowMajor:
                            {
                                xval = prediction_data.numeric_data[hplane[curr_lev].col_num[col] + row * prediction_data.ncols_numeric];
                                break;
                            }

                            case DenseColMajor:
                            {
                                xval = prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
                                break;
                            }

                            case SparseCSR:
                            {
                                xval = extract_spR(prediction_data, row_st, row_end, hplane[curr_lev].col_num[col], lb, ub);
                                break;
                            }

                            case SparseCSC:
                            {
                                xval = extract_spC(prediction_data, row, hplane[curr_lev].col_num[col]);
                                break;
                            }
                        }

                        if (unlikely(is_na_or_inf(xval)))
                        {
                            /* missing/non-finite value: substitute the node's stored fill value */
                            if (model_outputs.missing_action != Fail)
                            {
                                hval += hplane[curr_lev].fill_val[col];
                            }

                            else
                            {
                                output_depth = NAN;
                                return;
                            }
                        }

                        else
                        {
                            hval += (xval - hplane[curr_lev].mean[ncols_numeric]) * hplane[curr_lev].coef[ncols_numeric];
                        }

                        ncols_numeric++;
                        break;
                    }

                    case Categorical:
                    {
                        cval = prediction_data.categ_data[
                            prediction_data.is_col_major?
                                (row + hplane[curr_lev].col_num[col] * prediction_data.nrows)
                                    :
                                (hplane[curr_lev].col_num[col] + row * prediction_data.ncols_categ)
                        ];
                        /* negative category encodes a missing value */
                        if (unlikely(cval < 0))
                        {
                            if (model_outputs.missing_action != Fail)
                            {
                                hval += hplane[curr_lev].fill_val[col];
                            }

                            else
                            {
                                output_depth = NAN;
                                return;
                            }
                        }

                        else
                        {
                            switch(model_outputs.cat_split_type)
                            {
                                case SingleCateg:
                                {
                                    hval += (cval == hplane[curr_lev].chosen_cat[ncols_categ])? hplane[curr_lev].fill_new[ncols_categ] : 0;
                                    break;
                                }

                                case SubSet:
                                {
                                    /* category unseen at fitting time */
                                    if (unlikely(cval >= (int)hplane[curr_lev].cat_coef[ncols_categ].size()))
                                    {
                                        if (model_outputs.new_cat_action == Random) {
                                            cval = cval % (int)hplane[curr_lev].cat_coef[ncols_categ].size();
                                            hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
                                        }

                                        else {
                                            hval += hplane[curr_lev].fill_new[ncols_categ];
                                        }
                                    }

                                    else
                                    {
                                        hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
                                    }

                                    break;
                                }
                            }
                        }

                        ncols_categ++;
                        break;
                    }

                    default:
                    {
                        assert(0);
                        break;
                    }
                }

            }

            /* range penalty: subtract one when the projection falls outside
               the range observed for this node at fitting time */
            output_depth -= (hval < hplane[curr_lev].range_low) ||
                            (hval > hplane[curr_lev].range_high);
            curr_lev = (hval <= hplane[curr_lev].split_point)?
                            hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
        }
    }
}
1240
+
1241
+ template <class real_t, class sparse_ix>
1242
+ void batched_csc_predict(PredictionData<real_t, sparse_ix> &prediction_data, int nthreads,
1243
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1244
+ double *restrict output_depths, sparse_ix *restrict tree_num,
1245
+ double *restrict per_tree_depths)
1246
+ {
1247
+ #ifdef _OPENMP
1248
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
1249
+ if ((size_t)nthreads > ntrees)
1250
+ nthreads = ntrees;
1251
+ #else
1252
+ nthreads = 1;
1253
+ #endif
1254
+ std::vector<WorkerForPredictCSC> worker_memory(nthreads);
1255
+
1256
+ bool threw_exception = false;
1257
+ std::exception_ptr ex = NULL;
1258
+
1259
+ if (model_outputs != NULL)
1260
+ {
1261
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
1262
+ shared(worker_memory, model_outputs, prediction_data, tree_num, per_tree_depths, threw_exception, ex)
1263
+ for (size_t_for tree = 0; tree < (decltype(tree))model_outputs->trees.size(); tree++)
1264
+ {
1265
+ if (threw_exception) continue;
1266
+ try
1267
+ {
1268
+ WorkerForPredictCSC *ptr_worker = &worker_memory[omp_get_thread_num()];
1269
+ if (!ptr_worker->depths.size())
1270
+ {
1271
+ ptr_worker->depths.resize(prediction_data.nrows);
1272
+ ptr_worker->ix_arr.resize(prediction_data.nrows);
1273
+ std::iota(ptr_worker->ix_arr.begin(),
1274
+ ptr_worker->ix_arr.end(),
1275
+ (size_t)0);
1276
+
1277
+ if (model_outputs->missing_action == Divide ||
1278
+ (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && prediction_data.categ_data != NULL)
1279
+ ) {
1280
+ ptr_worker->weights_arr.resize(prediction_data.nrows);
1281
+ }
1282
+ }
1283
+
1284
+ ptr_worker->st = 0;
1285
+ ptr_worker->end = prediction_data.nrows - 1;
1286
+ if (model_outputs->missing_action == Divide)
1287
+ std::fill(ptr_worker->weights_arr.begin(),
1288
+ ptr_worker->weights_arr.end(),
1289
+ (double)1);
1290
+
1291
+ traverse_itree_csc(*ptr_worker,
1292
+ model_outputs->trees[tree],
1293
+ *model_outputs,
1294
+ prediction_data,
1295
+ (tree_num == NULL)?
1296
+ ((sparse_ix*)NULL) : (tree_num + tree*prediction_data.nrows),
1297
+ per_tree_depths,
1298
+ (size_t)0,
1299
+ model_outputs->has_range_penalty);
1300
+ }
1301
+
1302
+ catch (...)
1303
+ {
1304
+ #pragma omp critical
1305
+ {
1306
+ if (!threw_exception)
1307
+ {
1308
+ threw_exception = true;
1309
+ ex = std::current_exception();
1310
+ }
1311
+ }
1312
+ }
1313
+ }
1314
+ }
1315
+
1316
+ else
1317
+ {
1318
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
1319
+ shared(worker_memory, model_outputs_ext, prediction_data, tree_num, per_tree_depths, threw_exception, ex)
1320
+ for (size_t_for tree = 0; tree < (decltype(tree))model_outputs_ext->hplanes.size(); tree++)
1321
+ {
1322
+ if (threw_exception) continue;
1323
+ try
1324
+ {
1325
+ WorkerForPredictCSC *ptr_worker = &worker_memory[omp_get_thread_num()];
1326
+ if (!ptr_worker->depths.size())
1327
+ {
1328
+ ptr_worker->depths.resize(prediction_data.nrows);
1329
+ ptr_worker->comb_val.resize(prediction_data.nrows);
1330
+ ptr_worker->ix_arr.resize(prediction_data.nrows);
1331
+ std::iota(ptr_worker->ix_arr.begin(),
1332
+ ptr_worker->ix_arr.end(),
1333
+ (size_t)0);
1334
+ }
1335
+
1336
+ ptr_worker->st = 0;
1337
+ ptr_worker->end = prediction_data.nrows - 1;
1338
+
1339
+ traverse_hplane_csc(*ptr_worker,
1340
+ model_outputs_ext->hplanes[tree],
1341
+ *model_outputs_ext,
1342
+ prediction_data,
1343
+ (tree_num == NULL)?
1344
+ ((sparse_ix*)NULL) : (tree_num + tree*prediction_data.nrows),
1345
+ per_tree_depths,
1346
+ (size_t)0,
1347
+ model_outputs_ext->has_range_penalty);
1348
+ }
1349
+
1350
+ catch (...)
1351
+ {
1352
+ #pragma omp critical
1353
+ {
1354
+ if (!threw_exception)
1355
+ {
1356
+ threw_exception = true;
1357
+ ex = std::current_exception();
1358
+ }
1359
+ }
1360
+ }
1361
+ }
1362
+
1363
+ if (threw_exception)
1364
+ std::rethrow_exception(ex);
1365
+ }
1366
+
1367
+ #ifdef _OPENMP
1368
+ if (nthreads <= 1)
1369
+ #endif
1370
+ {
1371
+ std::copy(worker_memory.front().depths.begin(), worker_memory.front().depths.end(), output_depths);
1372
+ }
1373
+
1374
+ #ifdef _OPENMP
1375
+ else
1376
+ {
1377
+ std::fill(output_depths, output_depths + prediction_data.nrows, (double)0);
1378
+ for (auto &workspace : worker_memory)
1379
+ if (workspace.depths.size())
1380
+ #if !defined(_MSC_VER) && !defined(_WIN32)
1381
+ #pragma omp simd
1382
+ #endif
1383
+ for (size_t row = 0; row < prediction_data.nrows; row++)
1384
+ output_depths[row] += workspace.depths[row];
1385
+ }
1386
+ #endif
1387
+ }
1388
+
1389
/* Recursive traversal of one isolation tree over a *batch* of rows stored in
   CSC format: instead of walking the tree once per row, the rows in
   'workspace.ix_arr[st..end]' are partitioned at each node and both halves
   are descended into recursively.
   Accumulates scores into 'workspace.depths' (weighted by
   'workspace.weights_arr' when 'missing_action == Divide'); optionally
   records terminal node numbers and per-tree depths.
   'curr_tree' is the current *node* index within 'trees'; pass 0 to start at
   the root. */
template <class PredictionData, class sparse_ix>
void traverse_itree_csc(WorkerForPredictCSC &workspace,
                        std::vector<IsoTree> &trees,
                        IsoForest &model_outputs,
                        PredictionData &prediction_data,
                        sparse_ix *restrict tree_num,
                        double *restrict per_tree_depths,
                        size_t curr_tree,
                        bool has_range_penalty)
{
    // if (trees[curr_tree].score >= 0)
    /* terminal node ('tree_left == 0'): add the score for every row that reached it */
    if (unlikely(trees[curr_tree].tree_left == 0))
    {
        if (model_outputs.missing_action != Divide)
            for (size_t row = workspace.st; row <= workspace.end; row++)
                workspace.depths[workspace.ix_arr[row]] += trees[curr_tree].score;
        else
            for (size_t row = workspace.st; row <= workspace.end; row++)
                workspace.depths[workspace.ix_arr[row]] += workspace.weights_arr[workspace.ix_arr[row]] * trees[curr_tree].score;
        if (unlikely(tree_num != NULL))
            for (size_t row = workspace.st; row <= workspace.end; row++)
                tree_num[workspace.ix_arr[row]] = curr_tree;
        if (unlikely(per_tree_depths != NULL))
            for (size_t row = workspace.st; row <= workspace.end; row++)
                per_tree_depths[workspace.ix_arr[row]] = trees[curr_tree].score;
        return;
    }

    /* in this case, the indices are sorted in the csc penalty function */
    if (!(has_range_penalty && model_outputs.missing_action != Divide && curr_tree > 0) && trees[curr_tree].col_type == Numeric)
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);

    /* TODO: should mix the splitting function with the range penalty */

    /* divide according to tree */
    size_t orig_end = workspace.end;
    /* after the split: [st, st_NA) goes left, [st_NA, end_NA) is missing,
       [end_NA, orig_end] goes right; 'split_ix' is the boundary when there
       are no missing values */
    size_t st_NA, end_NA, split_ix;
    switch (trees[curr_tree].col_type)
    {
        case Numeric:
        {
            divide_subset_split(workspace.ix_arr.data(), workspace.st, workspace.end, trees[curr_tree].col_num,
                                prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
                                trees[curr_tree].num_split, model_outputs.missing_action,
                                st_NA, end_NA, split_ix);
            break;
        }

        case Categorical:
        {
            switch (model_outputs.cat_split_type)
            {
                case SingleCateg:
                {
                    divide_subset_split(workspace.ix_arr.data(),
                                        prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                        workspace.st, workspace.end, trees[curr_tree].chosen_cat,
                                        model_outputs.missing_action, st_NA, end_NA, split_ix);
                    break;
                }

                case SubSet:
                {
                    /* an empty 'cat_split' encodes a binary column */
                    if (!trees[curr_tree].cat_split.size())
                        divide_subset_split(workspace.ix_arr.data(),
                                            prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                            workspace.st, workspace.end,
                                            model_outputs.missing_action, model_outputs.new_cat_action,
                                            trees[curr_tree].pct_tree_left < .5, st_NA, end_NA, split_ix);
                    else
                        divide_subset_split(workspace.ix_arr.data(),
                                            prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                            workspace.st, workspace.end, trees[curr_tree].cat_split.data(),
                                            (int) trees[curr_tree].cat_split.size(),
                                            model_outputs.missing_action, model_outputs.new_cat_action,
                                            (bool)(trees[curr_tree].pct_tree_left < .5), st_NA, end_NA, split_ix);
                    break;
                }
            }
            break;
        }

        default:
        {
            assert(0);
            break;
        }
    }

    /* continue splitting recursively */
    /* the 'Weighted' new-category case reuses the 'Divide' machinery below */
    if (unlikely(model_outputs.new_cat_action == Weighted && model_outputs.cat_split_type == SubSet && prediction_data.categ_data != NULL))
        goto missing_action_divide;
    switch (model_outputs.missing_action)
    {
        case Impute:
        {
            /* send all missing rows towards the branch with more observations,
               then handle the partition exactly like the 'Fail' case */
            split_ix = (trees[curr_tree].pct_tree_left >= .5)? end_NA : st_NA;
        }
        /* fall through */

        case Fail:
        {
            /* left branch: rows in [st, split_ix) */
            if (split_ix > workspace.st)
            {
                workspace.end = split_ix - 1;

                if (has_range_penalty && trees[curr_tree].col_type == Numeric)
                    add_csc_range_penalty(workspace,
                                          prediction_data,
                                          (double*)NULL,
                                          trees[curr_tree].col_num,
                                          trees[curr_tree].range_low,
                                          trees[curr_tree].range_high);

                traverse_itree_csc(workspace,
                                   trees,
                                   model_outputs,
                                   prediction_data,
                                   tree_num,
                                   per_tree_depths,
                                   trees[curr_tree].tree_left,
                                   has_range_penalty);
            }


            /* right branch: rows in [split_ix, orig_end] */
            if (split_ix <= orig_end)
            {
                workspace.st = split_ix;
                workspace.end = orig_end;

                if (has_range_penalty && trees[curr_tree].col_type == Numeric)
                    add_csc_range_penalty(workspace,
                                          prediction_data,
                                          (double*)NULL,
                                          trees[curr_tree].col_num,
                                          trees[curr_tree].range_low,
                                          trees[curr_tree].range_high);

                traverse_itree_csc(workspace,
                                   trees,
                                   model_outputs,
                                   prediction_data,
                                   tree_num,
                                   per_tree_depths,
                                   trees[curr_tree].tree_right,
                                   has_range_penalty);
            }
            break;
        }

        case Divide:
        {
            missing_action_divide:
            /* TODO: maybe here it shouldn't copy the whole ix_arr,
               but then it'd need to re-generate it from outside too */
            /* snapshot of state to restore before descending right, since the
               left descent reorders 'ix_arr' and rescales 'weights_arr'.
               NOTE(review): only the first 'end_NA' *positions* of both arrays
               are saved, while 'weights_arr' is indexed by row id elsewhere —
               looks intentional but confirm the indexing convention. */
            std::vector<double> weights_arr;
            std::vector<size_t> ix_arr;
            if (end_NA > workspace.st)
            {
                weights_arr.assign(workspace.weights_arr.begin(),
                                   workspace.weights_arr.begin() + end_NA);
                ix_arr.assign(workspace.ix_arr.data(),
                              workspace.ix_arr.data() + end_NA);
            }

            if (has_range_penalty && trees[curr_tree].col_type == Numeric)
            {
                size_t st = workspace.st;
                size_t end = workspace.end;

                /* apply the range penalty separately to the non-missing
                   segments on each side of the NA block */
                if (workspace.st < st_NA)
                {
                    workspace.end = st_NA - 1;
                    add_csc_range_penalty(workspace,
                                          prediction_data,
                                          workspace.weights_arr.data(),
                                          trees[curr_tree].col_num,
                                          trees[curr_tree].range_low,
                                          trees[curr_tree].range_high);
                }

                if (workspace.end >= end_NA)
                {
                    workspace.st = end_NA;
                    workspace.end = end;
                    add_csc_range_penalty(workspace,
                                          prediction_data,
                                          workspace.weights_arr.data(),
                                          trees[curr_tree].col_num,
                                          trees[curr_tree].range_low,
                                          trees[curr_tree].range_high);
                }

                workspace.st = st;
                workspace.end = end;
            }

            /* left branch: non-missing left rows plus NA rows weighted by pct_tree_left */
            if (end_NA > workspace.st)
            {
                workspace.end = end_NA - 1;
                for (size_t row = st_NA; row < end_NA; row++)
                    workspace.weights_arr[workspace.ix_arr[row]] *= trees[curr_tree].pct_tree_left;
                traverse_itree_csc(workspace,
                                   trees,
                                   model_outputs,
                                   prediction_data,
                                   tree_num,
                                   per_tree_depths,
                                   trees[curr_tree].tree_left,
                                   has_range_penalty);
            }

            /* right branch: NA rows (weighted by the complement) plus non-missing right rows */
            if (st_NA <= orig_end)
            {
                workspace.st = st_NA;
                workspace.end = orig_end;
                if (weights_arr.size())
                {
                    /* restore the state saved before the left descent */
                    std::copy(weights_arr.begin(),
                              weights_arr.end(),
                              workspace.weights_arr.begin());
                    std::copy(ix_arr.begin(),
                              ix_arr.end(),
                              workspace.ix_arr.begin());
                    weights_arr.clear();
                    weights_arr.shrink_to_fit();
                    ix_arr.clear();
                    ix_arr.shrink_to_fit();
                }

                for (size_t row = st_NA; row < end_NA; row++)
                    workspace.weights_arr[workspace.ix_arr[row]] *= (1. - trees[curr_tree].pct_tree_left);
                traverse_itree_csc(workspace,
                                   trees,
                                   model_outputs,
                                   prediction_data,
                                   tree_num,
                                   per_tree_depths,
                                   trees[curr_tree].tree_right,
                                   has_range_penalty);
            }
            break;
        }
    }
}
1633
+
1634
/* Traverses one node of a single tree of an extended (hyperplane-split) isolation
   forest over the rows currently active in 'workspace', with numeric inputs in CSC
   sparse format, accumulating per-row isolation depths and recursing into the child
   nodes until terminal nodes are reached.

   Parameters
   ==========
   - workspace
       Holds the active row subset as positions [st, end] of 'ix_arr', the per-row
       output 'depths', and the scratch buffer 'comb_val' in which the linear
       combination for each active row is accumulated.
   - hplanes
       All nodes of the tree being traversed; 'curr_tree' indexes into this vector
       (i.e. it is a node index within the tree, despite the name).
   - model_outputs
       Fitted extended model; consulted for 'missing_action', 'cat_split_type',
       and 'new_cat_action'.
   - prediction_data
       CSC arrays Xc / Xc_ind / Xc_indptr, plus optional column-major dense
       categorical data in 'categ_data' (NULL when absent).
   - tree_num
       If non-NULL, records the terminal node index reached by each row.
   - per_tree_depths
       If non-NULL, records each row's terminal score for this tree separately.
   - has_range_penalty
       When true, rows whose combination value falls outside the node's
       [range_low, range_high] get their depth decreased by 1 at this node. */
template <class PredictionData, class sparse_ix>
void traverse_hplane_csc(WorkerForPredictCSC &workspace,
                         std::vector<IsoHPlane> &hplanes,
                         ExtIsoForest &model_outputs,
                         PredictionData &prediction_data,
                         sparse_ix *restrict tree_num,
                         double *restrict per_tree_depths,
                         size_t curr_tree,
                         bool has_range_penalty)
{
    /* Terminal node: 'hplane_left == 0' marks a leaf; deposit the stored score
       for every active row and stop recursing. */
    // if (hplanes[curr_tree].score >= 0)
    if (unlikely(hplanes[curr_tree].hplane_left == 0))
    {
        for (size_t row = workspace.st; row <= workspace.end; row++)
            workspace.depths[workspace.ix_arr[row]] += hplanes[curr_tree].score;
        if (unlikely(tree_num != NULL))
            for (size_t row = workspace.st; row <= workspace.end; row++)
                tree_num[workspace.ix_arr[row]] = curr_tree;
        if (unlikely(per_tree_depths != NULL))
            for (size_t row = workspace.st; row <= workspace.end; row++)
                per_tree_depths[workspace.ix_arr[row]] = hplanes[curr_tree].score;
        return;
    }

    /* the CSC routines below walk the active rows in lockstep with the column's
       sorted row indices, so the active subset must be in sorted order */
    std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
    /* comb_val[i] accumulates the hyperplane combination for the i-th active row */
    std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0.);
    /* placeholder passed as the fill value when missing_action == Fail;
       deliberately left uninitialized — presumably never read in that mode
       (NOTE(review): confirm against add_linear_comb) */
    double unused;

    if (likely(prediction_data.categ_data == NULL))
    {
        /* numeric-only model: every column of the hyperplane is numeric, so
           coef/mean/fill_val are all indexed by the same position 'col' */
        for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
            add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
                            hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
                            prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
                            hplanes[curr_tree].coef[col], (double)0, hplanes[curr_tree].mean[col],
                            (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
                            model_outputs.missing_action, NULL, NULL, false);
    }

    else
    {
        /* mixed numeric + categorical: coef/mean are packed per numeric column and
           cat_coef/fill_new/chosen_cat per categorical column, so each type keeps
           its own running counter; fill_val stays indexed by the overall position */
        size_t ncols_numeric = 0;
        size_t ncols_categ = 0;
        for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
        {
            switch (hplanes[curr_tree].col_type[col])
            {
                case Numeric:
                {
                    add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
                                    hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
                                    prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
                                    hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
                                    (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
                                    model_outputs.missing_action, NULL, NULL, false);
                    ncols_numeric++;
                    break;
                }

                case Categorical:
                {
                    /* categ_data is column-major: column j starts at j * nrows.
                       The SubSet / SingleCateg arguments are mutually exclusive —
                       only the ones matching cat_split_type carry real values. */
                    add_linear_comb<double>(
                        workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                        prediction_data.categ_data + hplanes[curr_tree].col_num[col] * prediction_data.nrows,
                        (model_outputs.cat_split_type == SubSet)? (int)hplanes[curr_tree].cat_coef[ncols_categ].size() : 0,
                        (model_outputs.cat_split_type == SubSet)? hplanes[curr_tree].cat_coef[ncols_categ].data() : NULL,
                        (model_outputs.cat_split_type == SingleCateg)? hplanes[curr_tree].fill_new[ncols_categ] : 0.,
                        (model_outputs.cat_split_type == SingleCateg)? hplanes[curr_tree].chosen_cat[ncols_categ] : 0,
                        hplanes[curr_tree].fill_val[col], hplanes[curr_tree].fill_new[ncols_categ], NULL, NULL,
                        model_outputs.new_cat_action, model_outputs.missing_action, model_outputs.cat_split_type, false);
                    ncols_categ++;
                    break;
                }

                default:
                {
                    /* col_type should only ever be Numeric or Categorical here */
                    assert(0);
                    break;
                }
            }
        }
    }

    /* subtract 1 from the depth of each row whose combination falls outside the
       range of values observed at this node during fitting (boolean arithmetic) */
    if (has_range_penalty)
    {
        for (size_t row = workspace.st; row <= workspace.end; row++)
            workspace.depths[workspace.ix_arr[row]]
                -=
            (workspace.comb_val[row - workspace.st] < hplanes[curr_tree].range_low) ||
            (workspace.comb_val[row - workspace.st] > hplanes[curr_tree].range_high);
    }

    /* divide data */
    /* partitions ix_arr in place: rows with comb_val <= split_point come first;
       'split_ix' is the first position belonging to the right branch */
    size_t split_ix = divide_subset_split(workspace.ix_arr.data(), workspace.comb_val.data(),
                                          workspace.st, workspace.end, hplanes[curr_tree].split_point);

    /* continue splitting recursively */
    size_t orig_end = workspace.end;
    if (split_ix > workspace.st)
    {
        /* left branch operates on [st, split_ix - 1] */
        workspace.end = split_ix - 1;
        traverse_hplane_csc(workspace,
                            hplanes,
                            model_outputs,
                            prediction_data,
                            tree_num,
                            per_tree_depths,
                            hplanes[curr_tree].hplane_left,
                            has_range_penalty);
    }

    if (split_ix <= orig_end)
    {
        /* right branch operates on [split_ix, orig_end]; st/end are restored
           here because the left recursion may have modified them */
        workspace.st = split_ix;
        workspace.end = orig_end;
        traverse_hplane_csc(workspace,
                            hplanes,
                            model_outputs,
                            prediction_data,
                            tree_num,
                            per_tree_depths,
                            hplanes[curr_tree].hplane_right,
                            has_range_penalty);
    }
}
1759
+
1760
/* Applies the range penalty for a single numeric column, in CSC format, to the
   rows currently active in 'workspace' (positions [st, end] of 'ix_arr'): each
   row whose value in 'col_num' is a real number (not NaN) lying outside
   [range_low, range_high] has its depth decreased — by 1, or by its weight when
   'weights_arr' is passed.

   - weights_arr : optional per-row weights, indexed by row number; NULL means
                   every row counts as 1.
   - col_num     : column whose values are checked against the range.
   - range_low / range_high : reference range for the penalty.

   Side effect: sorts the active segment of workspace.ix_arr. */
template <class PredictionData>
void add_csc_range_penalty(WorkerForPredictCSC &workspace,
                           PredictionData &prediction_data,
                           double *restrict weights_arr,
                           size_t col_num,
                           double range_low,
                           double range_high)
{
    /* the two-pointer walk below advances ix_arr and the column's row indices
       in lockstep, so the active subset must be in sorted order */
    std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);

    /* bounds of this column's slice of Xc / Xc_ind (note: 'end_col' is inclusive) */
    size_t st_col = prediction_data.Xc_indptr[col_num];
    size_t end_col = prediction_data.Xc_indptr[col_num + 1] - 1;
    size_t curr_pos = st_col;
    size_t ind_end_col = prediction_data.Xc_ind[end_col];
    /* first active row that could match a stored (explicit) entry of this column */
    size_t *ptr_st = std::lower_bound(workspace.ix_arr.data() + workspace.st,
                                      workspace.ix_arr.data() + workspace.end + 1,
                                      prediction_data.Xc_ind[st_col]);

    if (range_low <= 0 && range_high >= 0)
    {
        /* Zero lies inside the range, so rows without a stored entry (implicit
           zeros) are never penalized: only stored values need checking. */
        for (size_t *row = ptr_st;
             row != workspace.ix_arr.data() + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
            )
        {
            if (prediction_data.Xc_ind[curr_pos] == (decltype(*prediction_data.Xc_ind))(*row))
            {
                /* matched a stored entry: penalize if it is a real value outside the range */
                if (likely(!std::isnan(prediction_data.Xc[curr_pos])
                             &&
                           (   prediction_data.Xc[curr_pos] < range_low ||
                               prediction_data.Xc[curr_pos] > range_high   )))
                {
                    workspace.depths[*row] -= (weights_arr == NULL)? 1. : weights_arr[*row];
                }

                /* advance both sides; the next column position is found by binary
                   search for the next active row */
                if (row == workspace.ix_arr.data() + workspace.end || curr_pos == end_col) break;
                curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
                                            prediction_data.Xc_ind + end_col + 1,
                                            *(++row))
                            - prediction_data.Xc_ind;
            }

            else
            {
                /* no match: jump whichever side is behind via binary search */
                if (prediction_data.Xc_ind[curr_pos] > (decltype(*prediction_data.Xc_ind))(*row))
                    row = std::lower_bound(row + 1,
                                           workspace.ix_arr.data() + workspace.end + 1,
                                           prediction_data.Xc_ind[curr_pos]);
                else
                    curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
                                                prediction_data.Xc_ind + end_col + 1,
                                                *row)
                                - prediction_data.Xc_ind;
            }
        }
    }

    else
    {
        /* Zero itself is out of range, so every implicit zero must be penalized:
           penalize all active rows up front, then refund the rows whose stored
           value is NaN or falls inside the range. */
        if (likely(weights_arr == NULL))
            for (size_t row = workspace.st; row <= workspace.end; row++)
                workspace.depths[workspace.ix_arr[row]]--;
        else
            for (size_t row = workspace.st; row <= workspace.end; row++)
                workspace.depths[workspace.ix_arr[row]] -= weights_arr[workspace.ix_arr[row]];


        /* same two-pointer intersection walk as above, but adding the weight
           back for entries that should not have been penalized */
        for (size_t *row = ptr_st;
             row != workspace.ix_arr.data() + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
            )
        {
            if (prediction_data.Xc_ind[curr_pos] == (decltype(*prediction_data.Xc_ind))(*row))
            {
                /* refund: NaN values and in-range values are not penalized */
                if (likely(std::isnan(prediction_data.Xc[curr_pos])
                             ||
                           (   prediction_data.Xc[curr_pos] >= range_low &&
                               prediction_data.Xc[curr_pos] <= range_high   )))
                {
                    workspace.depths[*row] += (weights_arr == NULL)? 1. : weights_arr[*row];
                }

                if (row == workspace.ix_arr.data() + workspace.end || curr_pos == end_col) break;
                curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
                                            prediction_data.Xc_ind + end_col + 1,
                                            *(++row))
                            - prediction_data.Xc_ind;
            }

            else
            {
                /* no match: jump whichever side is behind via binary search */
                if (prediction_data.Xc_ind[curr_pos] > (decltype(*prediction_data.Xc_ind))(*row))
                    row = std::lower_bound(row + 1,
                                           workspace.ix_arr.data() + workspace.end + 1,
                                           prediction_data.Xc_ind[curr_pos]);
                else
                    curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
                                                prediction_data.Xc_ind + end_col + 1,
                                                *row)
                                - prediction_data.Xc_ind;
            }
        }
    }
}
1862
+
1863
/* Looks up the value at (row, col_num) in a CSC-format sparse matrix.
   Binary-searches the column's slice of the row-index array (Xc_ind, delimited
   by Xc_indptr) and returns the stored value from Xc on a hit, or 0 for an
   implicit (non-stored) zero. */
template <class PredictionData>
double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num) noexcept
{
    /* slice of the index array belonging to this column */
    auto *col_begin = prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num];
    auto *col_end = prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1];

    auto *match = std::lower_bound(col_begin, col_end, row);
    if (match == col_end)
        return 0.;
    using ix_t = typename std::remove_pointer<decltype(match)>::type;
    if (*match != static_cast<ix_t>(row))
        return 0.;
    /* values array is parallel to the index array */
    return prediction_data.Xc[match - prediction_data.Xc_ind];
}
1879
+
1880
/* Looks up the value at column 'col_num' within a single CSR row whose column
   indices span [row_st, row_end). Returns 0 when the row is empty, when
   'col_num' lies outside the bounds [lb, ub], or when the entry is not stored
   (implicit zero); otherwise returns the matching value from Xr (read at the
   same offset relative to Xr_ind). */
template <class PredictionData, class sparse_ix>
static inline double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num, size_t lb, size_t ub) noexcept
{
    /* empty row, or column outside the requested bounds -> implicit zero */
    if (row_st == row_end || col_num < lb || col_num > ub)
        return 0.;
    const sparse_ix target = (sparse_ix) col_num;
    sparse_ix *found = std::lower_bound(row_st, row_end, target);
    return (found != row_end && *found == target)?
            prediction_data.Xr[found - prediction_data.Xr_ind] : 0.;
}
1891
+
1892
/* Looks up the value at column 'col_num' within a single CSR row whose column
   indices span [row_st, row_end). Returns the stored value from Xr on a hit
   (read at the same offset relative to Xr_ind), or 0 for an implicit zero. */
template <class PredictionData, class sparse_ix>
double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num) noexcept
{
    /* empty row -> implicit zero */
    if (row_st == row_end)
        return 0.;
    const sparse_ix target = (sparse_ix) col_num;
    sparse_ix *found = std::lower_bound(row_st, row_end, target);
    if (found == row_end || *found != target)
        return 0.;
    return prediction_data.Xr[found - prediction_data.Xr_ind];
}
1903
+
1904
+ template <class sparse_ix>
1905
+ void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept
1906
+ {
1907
+ std::fill(n_terminal, n_terminal + model_outputs.trees.size(), 0);
1908
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
1909
+ for (size_t_for tree = 0; tree < (decltype(tree))model_outputs.trees.size(); tree++)
1910
+ {
1911
+ n_nodes[tree] = model_outputs.trees[tree].size();
1912
+ for (IsoTree &node : model_outputs.trees[tree])
1913
+ {
1914
+ n_terminal[tree] += (node.tree_left == 0);
1915
+ }
1916
+ }
1917
+ }
1918
+
1919
+ template <class sparse_ix>
1920
+ void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept
1921
+ {
1922
+ std::fill(n_terminal, n_terminal + model_outputs.hplanes.size(), 0);
1923
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
1924
+ for (size_t_for hplane = 0; hplane <(decltype(hplane)) model_outputs.hplanes.size(); hplane++)
1925
+ {
1926
+ n_nodes[hplane] = model_outputs.hplanes[hplane].size();
1927
+ for (IsoHPlane &node : model_outputs.hplanes[hplane])
1928
+ {
1929
+ n_terminal[hplane] += (node.hplane_left == 0);
1930
+ }
1931
+ }
1932
+ }