isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
|
@@ -0,0 +1,1932 @@
|
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
|
2
|
+
* of categorical variables and missing values.
|
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
|
4
|
+
*
|
|
5
|
+
* This library is based on the following works:
|
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
7
|
+
* "Isolation forest."
|
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
10
|
+
* "Isolation-based anomaly detection."
|
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
|
13
|
+
* "Extended Isolation Forest."
|
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
|
+
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
41
|
+
*
|
|
42
|
+
* BSD 2-Clause License
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
44
|
+
* All rights reserved.
|
|
45
|
+
* Redistribution and use in source and binary forms, with or without
|
|
46
|
+
* modification, are permitted provided that the following conditions are met:
|
|
47
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
|
48
|
+
* list of conditions and the following disclaimer.
|
|
49
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
50
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
51
|
+
* and/or other materials provided with the distribution.
|
|
52
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
53
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
54
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
55
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
56
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
57
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
58
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
59
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
60
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
61
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
62
|
+
*/
|
|
63
|
+
#include "isotree.hpp"
|
|
64
|
+
|
|
65
|
+
/* TODO: should create versions of these functions that would work on the
|
|
66
|
+
serialized raw bytes instead, as it will likely be faster due to better
|
|
67
|
+
cache utilization, and those objects use less memory. */
|
|
68
|
+
|
|
69
|
+
/* TODO: these trees are all created in a depth-first fashion, which will
|
|
70
|
+
not be cache-friendly when predictions are sent to a right-side branch. In
|
|
71
|
+
order to make predictions faster, could re-arrange the trees after-the-fact
|
|
72
|
+
so that they contain batches of consecutive nodes (parent and children and
|
|
73
|
+
grandchildren) up to some depth - that way these prediction functions would
|
|
74
|
+
run faster. After that, could also do a manual tree leaves unroll within each
|
|
75
|
+
batch with stack-assigned variables for an even faster prediction function. */
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
/* Predict outlier score, average depth, or terminal node numbers
|
|
79
|
+
*
|
|
80
|
+
* Parameters
|
|
81
|
+
* ==========
|
|
82
|
+
* - numeric_data[nrows * ncols_numeric]
|
|
83
|
+
* Pointer to numeric data for which to make predictions. May be ordered by rows
|
|
84
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
85
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
86
|
+
* (see parameter 'is_col_major').
|
|
87
|
+
* Pass NULL if there are no dense numeric columns.
|
|
88
|
+
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
89
|
+
* - categ_data[nrows * ncols_categ]
|
|
90
|
+
* Pointer to categorical data for which to make predictions. May be ordered by rows
|
|
91
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
92
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
93
|
+
* (see parameter 'is_col_major').
|
|
94
|
+
* Pass NULL if there are no categorical columns.
|
|
95
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
96
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
|
97
|
+
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
|
98
|
+
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
|
99
|
+
* must be the same as was used in the data to which the model was fit.
|
|
100
|
+
* - is_col_major
|
|
101
|
+
* Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
|
|
102
|
+
* model was fit. If passing 'false', will assume they are in row-major order. Note that most of
|
|
103
|
+
* the functions in this library work only with column-major order, but here both are suitable
|
|
104
|
+
* and row-major is preferred. Both arrays must have the same orientation (row/column major).
|
|
105
|
+
* If there is numeric sparse data in combination with categorical dense data and there are many
|
|
106
|
+
* rows, it is recommended to pass the categorical data in column major order, as it will take
|
|
107
|
+
* a faster route.
|
|
108
|
+
* If passing 'is_col_major=false' (row-major), must also provide 'ld_numeric' and/or 'ld_categ'.
|
|
109
|
+
* - ld_numeric
|
|
110
|
+
* Leading dimension of the array 'numeric_data', if it is passed in row-major format.
|
|
111
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
112
|
+
* be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
|
|
113
|
+
* 'numeric_data' in column-major order, this is ignored and will be assumed that the
|
|
114
|
+
* leading dimension corresponds to the number of rows. This is ignored when passing numeric
|
|
115
|
+
* data in sparse format.
|
|
116
|
+
* - ld_categ
|
|
117
|
+
* Leading dimension of the array 'categ_data', if it is passed in row-major format.
|
|
118
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
119
|
+
* be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
|
|
120
|
+
* 'categ_data' in column-major order, this is ignored and will be assumed that the
|
|
121
|
+
* leading dimension corresponds to the number of rows.
|
|
122
|
+
* - Xc[nnz]
|
|
123
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed).
|
|
124
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
125
|
+
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
126
|
+
* - Xc_ind[nnz]
|
|
127
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds.
|
|
128
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
129
|
+
* Pass NULL if there are no sparse numeric columns in CSC format.
|
|
130
|
+
* - Xc_indptr[ncols_numeric + 1]
|
|
131
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
132
|
+
* start and at entry [col + 1] where does column 'col' end.
|
|
133
|
+
* Pass NULL if there are no sparse numeric columns in CSC format.
|
|
134
|
+
* - Xr[nnz]
|
|
135
|
+
* Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
|
|
136
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
137
|
+
* Can only pass one of 'numeric_data', 'Xc' + 'Xc_ind' + 'Xc_indptr', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
138
|
+
* - Xr_ind[nnz]
|
|
139
|
+
* Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
|
|
140
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
141
|
+
* Pass NULL if there are no sparse numeric columns in CSR format.
|
|
142
|
+
* - Xr_indptr[nrows + 1]
|
|
143
|
+
* Pointer to row index pointers that tell at entry [row] where does row 'row'
|
|
144
|
+
* start and at entry [row + 1] where does row 'row' end.
|
|
145
|
+
* Pass NULL if there are no sparse numeric columns in CSR format.
|
|
146
|
+
* - nrows
|
|
147
|
+
* Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
|
|
148
|
+
* - nthreads
|
|
149
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
150
|
+
* allocated, even if the thread does not end up being used. Ignored when not building with
|
|
151
|
+
* OpenMP support.
|
|
152
|
+
* - standardize
|
|
153
|
+
* Whether to standardize the average depths for each row according to their relative magnitude
|
|
154
|
+
* compared to the expected average, in order to obtain an outlier score. If passing 'false',
|
|
155
|
+
* will output the average depth instead.
|
|
156
|
+
* Ignored when not passing 'output_depths'.
|
|
157
|
+
* - model_outputs
|
|
158
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
159
|
+
* if the predictions are to be made from an extended model. Can only pass one of
|
|
160
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
161
|
+
* - model_outputs_ext
|
|
162
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
163
|
+
* if the predictions are to be made from a single-variable model. Can only pass one of
|
|
164
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
165
|
+
* - output_depths[nrows] (out)
|
|
166
|
+
* Pointer to array where the output average depths or outlier scores will be written into
|
|
167
|
+
* (the return type is controlled according to parameter 'standardize').
|
|
168
|
+
* Should always be passed when calling this function (it is not optional).
|
|
169
|
+
* - tree_num[nrows * ntrees] (out)
|
|
170
|
+
* Pointer to array where the output terminal node numbers will be written into.
|
|
171
|
+
* Note that the mapping between tree node and terminal tree node is not stored in
|
|
172
|
+
* the model object for efficiency reasons, so this mapping will be determined on-the-fly
|
|
173
|
+
* when passing this parameter, and as such, there will be some overhead regardless of
|
|
174
|
+
* the actual number of rows. Output will be in column-major order ([nrows, ntrees]).
|
|
175
|
+
* This will not be calculable when using 'ndim==1' alongside with either
|
|
176
|
+
* 'missing_action==Divide' or 'new_cat_action==Weighted'.
|
|
177
|
+
* Pass NULL if this type of output is not needed.
|
|
178
|
+
* - per_tree_depths[nrows * ntrees] (out)
|
|
179
|
+
* Pointer to array where to output per-tree depths or expected depths for each row.
|
|
180
|
+
* Note that these will not include range penalties ('penalize_range=true').
|
|
181
|
+
* Output will be in row-major order ([nrows, ntrees]).
|
|
182
|
+
* This will not be calculable when using 'ndim==1' alongside with either
|
|
183
|
+
* 'missing_action==Divide' or 'new_cat_action==Weighted'.
|
|
184
|
+
* Pass NULL if this type of output is not needed.
|
|
185
|
+
* - indexer
|
|
186
|
+
* Pointer to associated tree indexer for the model being used, if it was constructed,
|
|
187
|
+
* which can be used to speed up tree numbers/indices predictions.
|
|
188
|
+
* This is ignored when not passing 'tree_num'.
|
|
189
|
+
* Pass NULL if the indexer has not been constructed.
|
|
190
|
+
*/
|
|
191
|
+
template <class real_t, class sparse_ix>
|
|
192
|
+
void predict_iforest(real_t *restrict numeric_data, int *restrict categ_data,
|
|
193
|
+
bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
194
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
195
|
+
real_t *restrict Xr, sparse_ix *restrict Xr_ind, sparse_ix *restrict Xr_indptr,
|
|
196
|
+
size_t nrows, int nthreads, bool standardize,
|
|
197
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
198
|
+
double *restrict output_depths, sparse_ix *restrict tree_num,
|
|
199
|
+
double *restrict per_tree_depths,
|
|
200
|
+
TreesIndexer *indexer)
|
|
201
|
+
{
|
|
202
|
+
if (unlikely(!nrows)) return;
|
|
203
|
+
|
|
204
|
+
/* put data in a struct for passing it in fewer lines */
|
|
205
|
+
PredictionData<real_t, sparse_ix>
|
|
206
|
+
prediction_data = {numeric_data, categ_data, nrows,
|
|
207
|
+
is_col_major, ld_numeric, ld_categ,
|
|
208
|
+
Xc, Xc_ind, Xc_indptr,
|
|
209
|
+
Xr, Xr_ind, Xr_indptr};
|
|
210
|
+
|
|
211
|
+
int nthreads_orig = nthreads;
|
|
212
|
+
if ((size_t)nthreads > nrows)
|
|
213
|
+
nthreads = nrows;
|
|
214
|
+
|
|
215
|
+
/* For batch predictions of sparse CSC, will take a specialized route */
|
|
216
|
+
if (prediction_data.Xc_indptr != NULL && (prediction_data.categ_data == NULL || prediction_data.is_col_major))
|
|
217
|
+
{
|
|
218
|
+
batched_csc_predict(prediction_data, nthreads_orig,
|
|
219
|
+
model_outputs, model_outputs_ext,
|
|
220
|
+
output_depths, tree_num,
|
|
221
|
+
per_tree_depths);
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
/* Regular case (no specialized CSC route) */
|
|
225
|
+
else if (model_outputs != NULL)
|
|
226
|
+
{
|
|
227
|
+
if (
|
|
228
|
+
model_outputs->missing_action == Fail &&
|
|
229
|
+
(model_outputs->new_cat_action != Weighted || model_outputs->cat_split_type == SingleCateg || prediction_data.categ_data == NULL) &&
|
|
230
|
+
prediction_data.Xc_indptr == NULL && prediction_data.Xr_indptr == NULL &&
|
|
231
|
+
!model_outputs->has_range_penalty
|
|
232
|
+
)
|
|
233
|
+
{
|
|
234
|
+
if (prediction_data.categ_data == NULL && (nrows == 1 || !prediction_data.is_col_major))
|
|
235
|
+
{
|
|
236
|
+
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
|
|
237
|
+
shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
|
|
238
|
+
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
|
|
239
|
+
{
|
|
240
|
+
double score = 0;
|
|
241
|
+
for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
|
|
242
|
+
{
|
|
243
|
+
traverse_itree_fast(model_outputs->trees[tree],
|
|
244
|
+
*model_outputs,
|
|
245
|
+
prediction_data.numeric_data + row * prediction_data.ncols_numeric,
|
|
246
|
+
score,
|
|
247
|
+
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
|
|
248
|
+
(per_tree_depths == NULL)?
|
|
249
|
+
NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
|
|
250
|
+
(size_t) row);
|
|
251
|
+
}
|
|
252
|
+
output_depths[row] = score;
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
else
|
|
257
|
+
{
|
|
258
|
+
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
|
|
259
|
+
shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
|
|
260
|
+
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
|
|
261
|
+
{
|
|
262
|
+
double score = 0;
|
|
263
|
+
for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
|
|
264
|
+
{
|
|
265
|
+
traverse_itree_no_recurse(model_outputs->trees[tree],
|
|
266
|
+
*model_outputs,
|
|
267
|
+
prediction_data,
|
|
268
|
+
score,
|
|
269
|
+
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
|
|
270
|
+
(per_tree_depths == NULL)?
|
|
271
|
+
NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
|
|
272
|
+
(size_t) row);
|
|
273
|
+
}
|
|
274
|
+
output_depths[row] = score;
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
else
|
|
280
|
+
{
|
|
281
|
+
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
|
|
282
|
+
shared(nrows, model_outputs, prediction_data, output_depths, tree_num, per_tree_depths)
|
|
283
|
+
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
|
|
284
|
+
{
|
|
285
|
+
double score = 0;
|
|
286
|
+
for (size_t tree = 0; tree < model_outputs->trees.size(); tree++)
|
|
287
|
+
{
|
|
288
|
+
score += traverse_itree(model_outputs->trees[tree],
|
|
289
|
+
*model_outputs,
|
|
290
|
+
prediction_data,
|
|
291
|
+
(std::vector<ImputeNode>*)NULL,
|
|
292
|
+
(ImputedData<sparse_ix, double>*)NULL,
|
|
293
|
+
(double)0,
|
|
294
|
+
(size_t) row,
|
|
295
|
+
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
|
|
296
|
+
(per_tree_depths == NULL)?
|
|
297
|
+
NULL : (per_tree_depths + tree + row*model_outputs->trees.size()),
|
|
298
|
+
(size_t) 0);
|
|
299
|
+
}
|
|
300
|
+
output_depths[row] = score;
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
else
|
|
307
|
+
{
|
|
308
|
+
if (
|
|
309
|
+
model_outputs_ext->missing_action == Fail &&
|
|
310
|
+
prediction_data.categ_data == NULL &&
|
|
311
|
+
prediction_data.Xc_indptr == NULL &&
|
|
312
|
+
prediction_data.Xr_indptr == NULL &&
|
|
313
|
+
!model_outputs_ext->has_range_penalty
|
|
314
|
+
)
|
|
315
|
+
{
|
|
316
|
+
if (prediction_data.is_col_major && nrows > 1)
|
|
317
|
+
{
|
|
318
|
+
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
|
|
319
|
+
shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
|
|
320
|
+
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
|
|
321
|
+
{
|
|
322
|
+
double score = 0;
|
|
323
|
+
for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
|
|
324
|
+
{
|
|
325
|
+
traverse_hplane_fast_colmajor(model_outputs_ext->hplanes[tree],
|
|
326
|
+
*model_outputs_ext,
|
|
327
|
+
prediction_data,
|
|
328
|
+
score,
|
|
329
|
+
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
|
|
330
|
+
(per_tree_depths == NULL)?
|
|
331
|
+
NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
|
|
332
|
+
(size_t) row);
|
|
333
|
+
}
|
|
334
|
+
output_depths[row] = score;
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
else
|
|
339
|
+
{
|
|
340
|
+
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
|
|
341
|
+
shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
|
|
342
|
+
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
|
|
343
|
+
{
|
|
344
|
+
double score = 0;
|
|
345
|
+
for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
|
|
346
|
+
{
|
|
347
|
+
traverse_hplane_fast_rowmajor(model_outputs_ext->hplanes[tree],
|
|
348
|
+
*model_outputs_ext,
|
|
349
|
+
prediction_data.numeric_data + row * prediction_data.ncols_numeric,
|
|
350
|
+
score,
|
|
351
|
+
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
|
|
352
|
+
(per_tree_depths == NULL)?
|
|
353
|
+
NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
|
|
354
|
+
(size_t) row);
|
|
355
|
+
}
|
|
356
|
+
output_depths[row] = score;
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
else
|
|
362
|
+
{
|
|
363
|
+
#pragma omp parallel for if(nrows > 1) schedule(static) num_threads(nthreads) \
|
|
364
|
+
shared(nrows, model_outputs_ext, prediction_data, output_depths, tree_num, per_tree_depths)
|
|
365
|
+
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
|
|
366
|
+
{
|
|
367
|
+
double score = 0;
|
|
368
|
+
for (size_t tree = 0; tree < model_outputs_ext->hplanes.size(); tree++)
|
|
369
|
+
{
|
|
370
|
+
traverse_hplane(model_outputs_ext->hplanes[tree],
|
|
371
|
+
*model_outputs_ext,
|
|
372
|
+
prediction_data,
|
|
373
|
+
score,
|
|
374
|
+
(std::vector<ImputeNode>*)NULL,
|
|
375
|
+
(ImputedData<sparse_ix, double>*)NULL,
|
|
376
|
+
(tree_num == NULL)? NULL : (tree_num + nrows * tree),
|
|
377
|
+
(per_tree_depths == NULL)?
|
|
378
|
+
NULL : (per_tree_depths + tree + row*model_outputs_ext->hplanes.size()),
|
|
379
|
+
(size_t) row);
|
|
380
|
+
}
|
|
381
|
+
output_depths[row] = score;
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
/* translate sum-of-depths to outlier score */
|
|
387
|
+
double ntrees, depth_divisor;
|
|
388
|
+
if (model_outputs != NULL)
|
|
389
|
+
{
|
|
390
|
+
ntrees = (double) model_outputs->trees.size();
|
|
391
|
+
depth_divisor = ntrees * (model_outputs->exp_avg_depth);
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
else
|
|
395
|
+
{
|
|
396
|
+
ntrees = (double) model_outputs_ext->hplanes.size();
|
|
397
|
+
depth_divisor = ntrees * (model_outputs_ext->exp_avg_depth);
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
/* for density and boxed_ratio, each tree will have 'log(d)' instead of 'd' */
|
|
402
|
+
bool is_density = (model_outputs != NULL && model_outputs->scoring_metric == Density) ||
|
|
403
|
+
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == Density);
|
|
404
|
+
bool is_bratio = (model_outputs != NULL && model_outputs->scoring_metric == BoxedRatio) ||
|
|
405
|
+
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedRatio);
|
|
406
|
+
bool is_bdens = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) ||
|
|
407
|
+
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity);
|
|
408
|
+
bool is_bdens2 = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity2) ||
|
|
409
|
+
(model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity2);
|
|
410
|
+
|
|
411
|
+
if (standardize)
|
|
412
|
+
{
|
|
413
|
+
if (is_density || is_bdens2)
|
|
414
|
+
{
|
|
415
|
+
ntrees = -ntrees;
|
|
416
|
+
for (size_t row = 0; row < nrows; row++)
|
|
417
|
+
output_depths[row] /= ntrees;
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
else if (is_bdens)
|
|
421
|
+
{
|
|
422
|
+
#ifndef _WIN32
|
|
423
|
+
#pragma omp simd
|
|
424
|
+
#endif
|
|
425
|
+
for (size_t row = 0; row < nrows; row++)
|
|
426
|
+
output_depths[row] = -std::exp(output_depths[row] / ntrees);
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
else if (is_bratio)
|
|
430
|
+
{
|
|
431
|
+
for (size_t row = 0; row < nrows; row++)
|
|
432
|
+
output_depths[row] = output_depths[row] / ntrees;
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
else
|
|
436
|
+
{
|
|
437
|
+
#ifndef _WIN32
|
|
438
|
+
#pragma omp simd
|
|
439
|
+
#endif
|
|
440
|
+
for (size_t row = 0; row < nrows; row++)
|
|
441
|
+
output_depths[row] = std::exp2( - output_depths[row] / depth_divisor );
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
else
|
|
446
|
+
{
|
|
447
|
+
if (is_density || is_bdens || is_bdens2)
|
|
448
|
+
{
|
|
449
|
+
#ifndef _WIN32
|
|
450
|
+
#pragma omp simd
|
|
451
|
+
#endif
|
|
452
|
+
for (size_t row = 0; row < nrows; row++)
|
|
453
|
+
output_depths[row] = std::exp(output_depths[row] / ntrees);
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
else if (is_bratio)
|
|
457
|
+
{
|
|
458
|
+
ntrees = -ntrees;
|
|
459
|
+
for (size_t row = 0; row < nrows; row++)
|
|
460
|
+
output_depths[row] /= ntrees;
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
else
|
|
464
|
+
{
|
|
465
|
+
for (size_t row = 0; row < nrows; row++)
|
|
466
|
+
output_depths[row] /= ntrees;
|
|
467
|
+
}
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
if (per_tree_depths != NULL && (is_density || is_bdens || is_bdens2))
|
|
471
|
+
{
|
|
472
|
+
size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
|
|
473
|
+
#ifndef _WIN32
|
|
474
|
+
#pragma omp simd
|
|
475
|
+
#endif
|
|
476
|
+
for (size_t ix = 0; ix < nrows*ntrees; ix++)
|
|
477
|
+
per_tree_depths[ix] = std::exp(per_tree_depths[ix]);
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
/* re-map tree numbers to start at zero (if predicting tree numbers) */
|
|
482
|
+
/* Note: usually this type of 'prediction' is not required,
|
|
483
|
+
thus this mapping is not stored in the model objects so as to
|
|
484
|
+
save memory */
|
|
485
|
+
if (tree_num != NULL)
|
|
486
|
+
{
|
|
487
|
+
if (indexer != NULL && !indexer->indices.empty())
|
|
488
|
+
{
|
|
489
|
+
size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
|
|
490
|
+
if (model_outputs != NULL)
|
|
491
|
+
{
|
|
492
|
+
if (model_outputs->missing_action == Divide)
|
|
493
|
+
goto manual_remap;
|
|
494
|
+
if (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && categ_data != NULL)
|
|
495
|
+
goto manual_remap;
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
for (size_t tree = 0; tree < ntrees; tree++)
|
|
499
|
+
{
|
|
500
|
+
size_t *restrict mapping = indexer->indices[tree].terminal_node_mappings.data();
|
|
501
|
+
for (size_t row = 0; row < nrows; row++)
|
|
502
|
+
{
|
|
503
|
+
tree_num[row + tree*nrows] = mapping[tree_num[row + tree*nrows]];
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
else
|
|
509
|
+
{
|
|
510
|
+
manual_remap:
|
|
511
|
+
remap_terminal_trees(model_outputs, model_outputs_ext,
|
|
512
|
+
prediction_data, tree_num, nthreads);
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
template <class real_t, class sparse_ix>
|
|
518
|
+
void traverse_itree_fast(std::vector<IsoTree> &tree,
|
|
519
|
+
IsoForest &model_outputs,
|
|
520
|
+
real_t *restrict row_numeric_data,
|
|
521
|
+
double &restrict output_depth,
|
|
522
|
+
sparse_ix *restrict tree_num,
|
|
523
|
+
double *restrict tree_depth,
|
|
524
|
+
size_t row) noexcept
|
|
525
|
+
{
|
|
526
|
+
size_t curr_lev = 0;
|
|
527
|
+
double xval;
|
|
528
|
+
while (true)
|
|
529
|
+
{
|
|
530
|
+
if (unlikely(tree[curr_lev].tree_left == 0))
|
|
531
|
+
{
|
|
532
|
+
output_depth += tree[curr_lev].score;
|
|
533
|
+
if (unlikely(tree_num != NULL))
|
|
534
|
+
tree_num[row] = curr_lev;
|
|
535
|
+
if (unlikely(tree_depth != NULL))
|
|
536
|
+
*tree_depth = tree[curr_lev].score;
|
|
537
|
+
break;
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
else
|
|
541
|
+
{
|
|
542
|
+
xval = row_numeric_data[tree[curr_lev].col_num];
|
|
543
|
+
curr_lev = (xval <= tree[curr_lev].num_split)?
|
|
544
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
template <class PredictionData, class sparse_ix>
|
|
550
|
+
void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
|
|
551
|
+
IsoForest &model_outputs,
|
|
552
|
+
PredictionData &prediction_data,
|
|
553
|
+
double &restrict output_depth,
|
|
554
|
+
sparse_ix *restrict tree_num,
|
|
555
|
+
double *restrict tree_depth,
|
|
556
|
+
size_t row) noexcept
|
|
557
|
+
{
|
|
558
|
+
size_t curr_lev = 0;
|
|
559
|
+
double xval;
|
|
560
|
+
int cval;
|
|
561
|
+
while (true)
|
|
562
|
+
{
|
|
563
|
+
// if (tree[curr_lev].score > 0)
|
|
564
|
+
if (unlikely(tree[curr_lev].tree_left == 0))
|
|
565
|
+
{
|
|
566
|
+
output_depth += tree[curr_lev].score;
|
|
567
|
+
if (unlikely(tree_num != NULL))
|
|
568
|
+
tree_num[row] = curr_lev;
|
|
569
|
+
if (unlikely(tree_depth != NULL))
|
|
570
|
+
*tree_depth = tree[curr_lev].score;
|
|
571
|
+
break;
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
else
|
|
575
|
+
{
|
|
576
|
+
switch (tree[curr_lev].col_type)
|
|
577
|
+
{
|
|
578
|
+
case Numeric:
|
|
579
|
+
{
|
|
580
|
+
xval = prediction_data.numeric_data[
|
|
581
|
+
prediction_data.is_col_major?
|
|
582
|
+
(row + tree[curr_lev].col_num * prediction_data.nrows)
|
|
583
|
+
:
|
|
584
|
+
(tree[curr_lev].col_num + row * prediction_data.ncols_numeric)
|
|
585
|
+
];
|
|
586
|
+
curr_lev = (xval <= tree[curr_lev].num_split)?
|
|
587
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
588
|
+
break;
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
case Categorical:
|
|
592
|
+
{
|
|
593
|
+
cval = prediction_data.categ_data[
|
|
594
|
+
prediction_data.is_col_major?
|
|
595
|
+
(row + tree[curr_lev].col_num * prediction_data.nrows)
|
|
596
|
+
:
|
|
597
|
+
(tree[curr_lev].col_num + row * prediction_data.ncols_categ)
|
|
598
|
+
];
|
|
599
|
+
switch (model_outputs.cat_split_type)
|
|
600
|
+
{
|
|
601
|
+
case SubSet:
|
|
602
|
+
{
|
|
603
|
+
|
|
604
|
+
if (tree[curr_lev].cat_split.empty()) /* this is for binary columns */
|
|
605
|
+
{
|
|
606
|
+
if (cval <= 1)
|
|
607
|
+
{
|
|
608
|
+
curr_lev = (cval == 0)?
|
|
609
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
else /* can only work with 'Smallest' + no NAs if reaching this point */
|
|
613
|
+
{
|
|
614
|
+
curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
615
|
+
}
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
else
|
|
619
|
+
{
|
|
620
|
+
|
|
621
|
+
switch (model_outputs.new_cat_action)
|
|
622
|
+
{
|
|
623
|
+
case Random:
|
|
624
|
+
{
|
|
625
|
+
cval = (cval >= (int)tree[curr_lev].cat_split.size())?
|
|
626
|
+
(cval % (int)tree[curr_lev].cat_split.size()) : cval;
|
|
627
|
+
curr_lev = (tree[curr_lev].cat_split[cval])?
|
|
628
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
629
|
+
break;
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
case Smallest:
|
|
633
|
+
{
|
|
634
|
+
if (unlikely(cval >= (int)tree[curr_lev].cat_split.size()))
|
|
635
|
+
{
|
|
636
|
+
curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
else
|
|
640
|
+
{
|
|
641
|
+
curr_lev = (tree[curr_lev].cat_split[cval])?
|
|
642
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
643
|
+
}
|
|
644
|
+
break;
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
default:
|
|
648
|
+
{
|
|
649
|
+
assert(0);
|
|
650
|
+
break;
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
break;
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
case SingleCateg:
|
|
658
|
+
{
|
|
659
|
+
curr_lev = (cval == tree[curr_lev].chosen_cat)?
|
|
660
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
661
|
+
break;
|
|
662
|
+
}
|
|
663
|
+
}
|
|
664
|
+
break;
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
default:
|
|
668
|
+
{
|
|
669
|
+
assert(0);
|
|
670
|
+
break;
|
|
671
|
+
}
|
|
672
|
+
}
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
/* Layout in which the numeric prediction inputs are supplied. The full
   traversal routines below pick one value per call: 'SparseCSR' when
   'Xr_indptr' is set, 'SparseCSC' when 'Xc_indptr' is set, otherwise
   'DenseColMajor' or 'DenseRowMajor' according to 'is_col_major'. */
enum NumericConfig {DenseRowMajor, DenseColMajor, SparseCSR, SparseCSC};
|
|
678
|
+
|
|
679
|
+
template <class PredictionData, class sparse_ix, class ImputedData>
|
|
680
|
+
double traverse_itree(std::vector<IsoTree> &tree,
|
|
681
|
+
IsoForest &model_outputs,
|
|
682
|
+
PredictionData &prediction_data,
|
|
683
|
+
std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
|
|
684
|
+
ImputedData *imputed_data, /* only when imputing missing */
|
|
685
|
+
double curr_weight, /* only when imputing missing */
|
|
686
|
+
size_t row,
|
|
687
|
+
sparse_ix *restrict tree_num,
|
|
688
|
+
double *restrict tree_depth,
|
|
689
|
+
size_t curr_lev) noexcept
|
|
690
|
+
{
|
|
691
|
+
double xval;
|
|
692
|
+
int cval;
|
|
693
|
+
double range_penalty = 0;
|
|
694
|
+
|
|
695
|
+
NumericConfig numeric_config;
|
|
696
|
+
if (prediction_data.Xr_indptr != NULL)
|
|
697
|
+
numeric_config = SparseCSR;
|
|
698
|
+
else if (prediction_data.Xc_indptr != NULL)
|
|
699
|
+
numeric_config = SparseCSC;
|
|
700
|
+
else if (prediction_data.is_col_major)
|
|
701
|
+
numeric_config = DenseColMajor;
|
|
702
|
+
else
|
|
703
|
+
numeric_config = DenseRowMajor;
|
|
704
|
+
|
|
705
|
+
sparse_ix *row_st = NULL, *row_end = NULL;
|
|
706
|
+
if (numeric_config == SparseCSR)
|
|
707
|
+
{
|
|
708
|
+
row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
|
|
709
|
+
row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
|
|
710
|
+
}
|
|
711
|
+
|
|
712
|
+
while (true)
|
|
713
|
+
{
|
|
714
|
+
// if (tree[curr_lev].score >= 0.)
|
|
715
|
+
if (unlikely(tree[curr_lev].tree_left == 0))
|
|
716
|
+
{
|
|
717
|
+
if (unlikely(tree_num != NULL))
|
|
718
|
+
tree_num[row] = curr_lev;
|
|
719
|
+
if (unlikely(tree_depth != NULL))
|
|
720
|
+
*tree_depth = tree[curr_lev].score;
|
|
721
|
+
if (unlikely(imputed_data != NULL))
|
|
722
|
+
add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, curr_weight);
|
|
723
|
+
|
|
724
|
+
return tree[curr_lev].score - range_penalty;
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
else
|
|
728
|
+
{
|
|
729
|
+
switch(tree[curr_lev].col_type)
|
|
730
|
+
{
|
|
731
|
+
case Numeric:
|
|
732
|
+
{
|
|
733
|
+
switch(numeric_config)
|
|
734
|
+
{
|
|
735
|
+
case DenseRowMajor:
|
|
736
|
+
{
|
|
737
|
+
xval = prediction_data.numeric_data[tree[curr_lev].col_num + row * prediction_data.ncols_numeric];
|
|
738
|
+
break;
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
case DenseColMajor:
|
|
742
|
+
{
|
|
743
|
+
xval = prediction_data.numeric_data[row + tree[curr_lev].col_num * prediction_data.nrows];
|
|
744
|
+
break;
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
case SparseCSR:
|
|
748
|
+
{
|
|
749
|
+
xval = extract_spR(prediction_data, row_st, row_end, tree[curr_lev].col_num);
|
|
750
|
+
break;
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
case SparseCSC:
|
|
754
|
+
{
|
|
755
|
+
xval = extract_spC(prediction_data, row, tree[curr_lev].col_num);
|
|
756
|
+
break;
|
|
757
|
+
}
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
if (unlikely(std::isnan(xval)))
|
|
761
|
+
{
|
|
762
|
+
switch(model_outputs.missing_action)
|
|
763
|
+
{
|
|
764
|
+
case Divide:
|
|
765
|
+
{
|
|
766
|
+
return
|
|
767
|
+
tree[curr_lev].pct_tree_left
|
|
768
|
+
* traverse_itree(tree, model_outputs, prediction_data,
|
|
769
|
+
impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
|
|
770
|
+
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
|
|
771
|
+
+ (1. - tree[curr_lev].pct_tree_left)
|
|
772
|
+
* traverse_itree(tree, model_outputs, prediction_data,
|
|
773
|
+
impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
|
|
774
|
+
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
|
|
775
|
+
- range_penalty;
|
|
776
|
+
}
|
|
777
|
+
|
|
778
|
+
case Impute:
|
|
779
|
+
{
|
|
780
|
+
curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
|
|
781
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
782
|
+
break;
|
|
783
|
+
}
|
|
784
|
+
|
|
785
|
+
case Fail:
|
|
786
|
+
{
|
|
787
|
+
return NAN;
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
else
|
|
793
|
+
{
|
|
794
|
+
range_penalty += (xval < tree[curr_lev].range_low) || (xval > tree[curr_lev].range_high);
|
|
795
|
+
curr_lev = (xval <= tree[curr_lev].num_split)?
|
|
796
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
797
|
+
}
|
|
798
|
+
break;
|
|
799
|
+
}
|
|
800
|
+
|
|
801
|
+
case Categorical:
|
|
802
|
+
{
|
|
803
|
+
cval = prediction_data.categ_data[
|
|
804
|
+
prediction_data.is_col_major?
|
|
805
|
+
(row + tree[curr_lev].col_num * prediction_data.nrows)
|
|
806
|
+
:
|
|
807
|
+
(tree[curr_lev].col_num + row * prediction_data.ncols_categ)
|
|
808
|
+
];
|
|
809
|
+
if (unlikely(cval < 0))
|
|
810
|
+
{
|
|
811
|
+
switch(model_outputs.missing_action)
|
|
812
|
+
{
|
|
813
|
+
case Divide:
|
|
814
|
+
{
|
|
815
|
+
return
|
|
816
|
+
tree[curr_lev].pct_tree_left
|
|
817
|
+
* traverse_itree(tree, model_outputs, prediction_data,
|
|
818
|
+
impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
|
|
819
|
+
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
|
|
820
|
+
+ (1. - tree[curr_lev].pct_tree_left)
|
|
821
|
+
* traverse_itree(tree, model_outputs, prediction_data,
|
|
822
|
+
impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
|
|
823
|
+
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
|
|
824
|
+
- range_penalty;
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
case Impute:
|
|
828
|
+
{
|
|
829
|
+
curr_lev = (tree[curr_lev].pct_tree_left >= .5)?
|
|
830
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
831
|
+
break;
|
|
832
|
+
}
|
|
833
|
+
|
|
834
|
+
case Fail:
|
|
835
|
+
{
|
|
836
|
+
return NAN;
|
|
837
|
+
}
|
|
838
|
+
}
|
|
839
|
+
}
|
|
840
|
+
|
|
841
|
+
else
|
|
842
|
+
{
|
|
843
|
+
switch(model_outputs.cat_split_type)
|
|
844
|
+
{
|
|
845
|
+
case SingleCateg:
|
|
846
|
+
{
|
|
847
|
+
curr_lev = (cval == tree[curr_lev].chosen_cat)?
|
|
848
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
849
|
+
break;
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
case SubSet:
|
|
853
|
+
{
|
|
854
|
+
|
|
855
|
+
if (tree[curr_lev].cat_split.empty())
|
|
856
|
+
{
|
|
857
|
+
if (cval <= 1)
|
|
858
|
+
{
|
|
859
|
+
curr_lev = (cval == 0)?
|
|
860
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
861
|
+
}
|
|
862
|
+
|
|
863
|
+
else
|
|
864
|
+
{
|
|
865
|
+
switch(model_outputs.new_cat_action)
|
|
866
|
+
{
|
|
867
|
+
case Smallest:
|
|
868
|
+
{
|
|
869
|
+
curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
870
|
+
break;
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
case Weighted:
|
|
874
|
+
{
|
|
875
|
+
return
|
|
876
|
+
tree[curr_lev].pct_tree_left
|
|
877
|
+
* traverse_itree(tree, model_outputs, prediction_data,
|
|
878
|
+
impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
|
|
879
|
+
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
|
|
880
|
+
+ (1. - tree[curr_lev].pct_tree_left)
|
|
881
|
+
* traverse_itree(tree, model_outputs, prediction_data,
|
|
882
|
+
impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
|
|
883
|
+
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
|
|
884
|
+
- range_penalty;
|
|
885
|
+
}
|
|
886
|
+
|
|
887
|
+
default:
|
|
888
|
+
{
|
|
889
|
+
assert(0);
|
|
890
|
+
break;
|
|
891
|
+
}
|
|
892
|
+
}
|
|
893
|
+
}
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
else
|
|
897
|
+
{
|
|
898
|
+
switch(model_outputs.new_cat_action)
|
|
899
|
+
{
|
|
900
|
+
case Random:
|
|
901
|
+
{
|
|
902
|
+
cval = (cval >= (int)tree[curr_lev].cat_split.size())?
|
|
903
|
+
(cval % (int)tree[curr_lev].cat_split.size()) : cval;
|
|
904
|
+
curr_lev = (tree[curr_lev].cat_split[cval])?
|
|
905
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
906
|
+
break;
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
case Smallest:
|
|
910
|
+
{
|
|
911
|
+
if (unlikely(cval >= (int)tree[curr_lev].cat_split.size()))
|
|
912
|
+
{
|
|
913
|
+
curr_lev = (tree[curr_lev].pct_tree_left < .5)? tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
else
|
|
917
|
+
{
|
|
918
|
+
curr_lev = (tree[curr_lev].cat_split[cval])?
|
|
919
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
920
|
+
}
|
|
921
|
+
break;
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
case Weighted:
|
|
925
|
+
{
|
|
926
|
+
if (cval >= (int)tree[curr_lev].cat_split.size()
|
|
927
|
+
||
|
|
928
|
+
tree[curr_lev].cat_split[cval] == (-1))
|
|
929
|
+
{
|
|
930
|
+
return
|
|
931
|
+
tree[curr_lev].pct_tree_left
|
|
932
|
+
* traverse_itree(tree, model_outputs, prediction_data,
|
|
933
|
+
impute_nodes, imputed_data, curr_weight * tree[curr_lev].pct_tree_left,
|
|
934
|
+
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_left)
|
|
935
|
+
+ (1. - tree[curr_lev].pct_tree_left)
|
|
936
|
+
* traverse_itree(tree, model_outputs, prediction_data,
|
|
937
|
+
impute_nodes, imputed_data, curr_weight * (1 - tree[curr_lev].pct_tree_left),
|
|
938
|
+
row, (sparse_ix*)NULL, tree_depth, tree[curr_lev].tree_right)
|
|
939
|
+
- range_penalty;
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
else
|
|
943
|
+
{
|
|
944
|
+
curr_lev = (tree[curr_lev].cat_split[cval])?
|
|
945
|
+
tree[curr_lev].tree_left : tree[curr_lev].tree_right;
|
|
946
|
+
}
|
|
947
|
+
break;
|
|
948
|
+
}
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
break;
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
break;
|
|
956
|
+
}
|
|
957
|
+
|
|
958
|
+
default:
|
|
959
|
+
{
|
|
960
|
+
assert(0);
|
|
961
|
+
break;
|
|
962
|
+
}
|
|
963
|
+
}
|
|
964
|
+
}
|
|
965
|
+
}
|
|
966
|
+
}
|
|
967
|
+
|
|
968
|
+
/* this is a simpler version for situations in which there is
|
|
969
|
+
only numeric data in dense arrays, no missing values, no range penalty */
|
|
970
|
+
template <class PredictionData, class sparse_ix>
|
|
971
|
+
void traverse_hplane_fast_colmajor(std::vector<IsoHPlane> &hplane,
|
|
972
|
+
ExtIsoForest &model_outputs,
|
|
973
|
+
PredictionData &prediction_data,
|
|
974
|
+
double &restrict output_depth,
|
|
975
|
+
sparse_ix *restrict tree_num,
|
|
976
|
+
double *restrict tree_depth,
|
|
977
|
+
size_t row) noexcept
|
|
978
|
+
{
|
|
979
|
+
size_t curr_lev = 0;
|
|
980
|
+
double hval;
|
|
981
|
+
|
|
982
|
+
while(true)
|
|
983
|
+
{
|
|
984
|
+
// if (hplane[curr_lev].score > 0)
|
|
985
|
+
if (unlikely(hplane[curr_lev].hplane_left == 0))
|
|
986
|
+
{
|
|
987
|
+
output_depth += hplane[curr_lev].score;
|
|
988
|
+
if (unlikely(tree_num != NULL))
|
|
989
|
+
tree_num[row] = curr_lev;
|
|
990
|
+
if (unlikely(tree_depth != NULL))
|
|
991
|
+
*tree_depth = hplane[curr_lev].score;
|
|
992
|
+
return;
|
|
993
|
+
}
|
|
994
|
+
|
|
995
|
+
else
|
|
996
|
+
{
|
|
997
|
+
hval = 0;
|
|
998
|
+
for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
|
|
999
|
+
hval += (prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows]
|
|
1000
|
+
- hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
|
|
1001
|
+
|
|
1002
|
+
curr_lev = (hval <= hplane[curr_lev].split_point)?
|
|
1003
|
+
hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
|
|
1004
|
+
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
1007
|
+
}
|
|
1008
|
+
|
|
1009
|
+
template <class real_t, class sparse_ix>
|
|
1010
|
+
void traverse_hplane_fast_rowmajor(std::vector<IsoHPlane> &hplane,
|
|
1011
|
+
ExtIsoForest &model_outputs,
|
|
1012
|
+
real_t *restrict row_numeric_data,
|
|
1013
|
+
double &restrict output_depth,
|
|
1014
|
+
sparse_ix *restrict tree_num,
|
|
1015
|
+
double *restrict tree_depth,
|
|
1016
|
+
size_t row) noexcept
|
|
1017
|
+
{
|
|
1018
|
+
size_t curr_lev = 0;
|
|
1019
|
+
double hval;
|
|
1020
|
+
|
|
1021
|
+
while(true)
|
|
1022
|
+
{
|
|
1023
|
+
// if (hplane[curr_lev].score > 0)
|
|
1024
|
+
if (unlikely(hplane[curr_lev].hplane_left == 0))
|
|
1025
|
+
{
|
|
1026
|
+
output_depth += hplane[curr_lev].score;
|
|
1027
|
+
if (unlikely(tree_num != NULL))
|
|
1028
|
+
tree_num[row] = curr_lev;
|
|
1029
|
+
if (unlikely(tree_depth != NULL))
|
|
1030
|
+
*tree_depth = hplane[curr_lev].score;
|
|
1031
|
+
return;
|
|
1032
|
+
}
|
|
1033
|
+
|
|
1034
|
+
else
|
|
1035
|
+
{
|
|
1036
|
+
hval = 0;
|
|
1037
|
+
for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
|
|
1038
|
+
hval += (row_numeric_data[hplane[curr_lev].col_num[col]]
|
|
1039
|
+
- hplane[curr_lev].mean[col]) * hplane[curr_lev].coef[col];
|
|
1040
|
+
|
|
1041
|
+
curr_lev = (hval <= hplane[curr_lev].split_point)?
|
|
1042
|
+
hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
|
|
1043
|
+
|
|
1044
|
+
}
|
|
1045
|
+
}
|
|
1046
|
+
}
|
|
1047
|
+
|
|
1048
|
+
/* this is the full version that works with potentially missing values, sparse matrices, and categoricals */
|
|
1049
|
+
template <class PredictionData, class sparse_ix, class ImputedData>
|
|
1050
|
+
void traverse_hplane(std::vector<IsoHPlane> &hplane,
|
|
1051
|
+
ExtIsoForest &model_outputs,
|
|
1052
|
+
PredictionData &prediction_data,
|
|
1053
|
+
double &restrict output_depth,
|
|
1054
|
+
std::vector<ImputeNode> *impute_nodes, /* only when imputing missing */
|
|
1055
|
+
ImputedData *imputed_data, /* only when imputing missing */
|
|
1056
|
+
sparse_ix *restrict tree_num,
|
|
1057
|
+
double *restrict tree_depth,
|
|
1058
|
+
size_t row) noexcept
|
|
1059
|
+
{
|
|
1060
|
+
size_t curr_lev = 0;
|
|
1061
|
+
double xval;
|
|
1062
|
+
int cval;
|
|
1063
|
+
double hval;
|
|
1064
|
+
|
|
1065
|
+
size_t ncols_numeric, ncols_categ;
|
|
1066
|
+
|
|
1067
|
+
NumericConfig numeric_config;
|
|
1068
|
+
if (prediction_data.Xr_indptr != NULL)
|
|
1069
|
+
numeric_config = SparseCSR;
|
|
1070
|
+
else if (prediction_data.Xc_indptr != NULL)
|
|
1071
|
+
numeric_config = SparseCSC;
|
|
1072
|
+
else if (prediction_data.is_col_major)
|
|
1073
|
+
numeric_config = DenseColMajor;
|
|
1074
|
+
else
|
|
1075
|
+
numeric_config = DenseRowMajor;
|
|
1076
|
+
|
|
1077
|
+
sparse_ix *row_st = NULL, *row_end = NULL;
|
|
1078
|
+
size_t lb, ub;
|
|
1079
|
+
if (numeric_config == SparseCSR)
|
|
1080
|
+
{
|
|
1081
|
+
row_st = prediction_data.Xr_ind + prediction_data.Xr_indptr[row];
|
|
1082
|
+
row_end = prediction_data.Xr_ind + prediction_data.Xr_indptr[row + 1];
|
|
1083
|
+
lb = *row_st;
|
|
1084
|
+
ub = *(row_end-1);
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
while (true)
|
|
1088
|
+
{
|
|
1089
|
+
// if (hplane[curr_lev].score > 0)
|
|
1090
|
+
if (unlikely(hplane[curr_lev].hplane_left == 0))
|
|
1091
|
+
{
|
|
1092
|
+
output_depth += hplane[curr_lev].score;
|
|
1093
|
+
if (unlikely(tree_num != NULL))
|
|
1094
|
+
tree_num[row] = curr_lev;
|
|
1095
|
+
if (unlikely(tree_depth != NULL))
|
|
1096
|
+
*tree_depth = hplane[curr_lev].score;
|
|
1097
|
+
if (unlikely(imputed_data != NULL))
|
|
1098
|
+
{
|
|
1099
|
+
add_from_impute_node((*impute_nodes)[curr_lev], *imputed_data, (double)1);
|
|
1100
|
+
}
|
|
1101
|
+
return;
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
else
|
|
1105
|
+
{
|
|
1106
|
+
hval = 0;
|
|
1107
|
+
ncols_numeric = 0; ncols_categ = 0;
|
|
1108
|
+
for (size_t col = 0; col < hplane[curr_lev].col_num.size(); col++)
|
|
1109
|
+
{
|
|
1110
|
+
switch(hplane[curr_lev].col_type[col])
|
|
1111
|
+
{
|
|
1112
|
+
case Numeric:
|
|
1113
|
+
{
|
|
1114
|
+
switch(numeric_config)
|
|
1115
|
+
{
|
|
1116
|
+
case DenseRowMajor:
|
|
1117
|
+
{
|
|
1118
|
+
xval = prediction_data.numeric_data[hplane[curr_lev].col_num[col] + row * prediction_data.ncols_numeric];
|
|
1119
|
+
break;
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
case DenseColMajor:
|
|
1123
|
+
{
|
|
1124
|
+
xval = prediction_data.numeric_data[row + hplane[curr_lev].col_num[col] * prediction_data.nrows];
|
|
1125
|
+
break;
|
|
1126
|
+
}
|
|
1127
|
+
|
|
1128
|
+
case SparseCSR:
|
|
1129
|
+
{
|
|
1130
|
+
xval = extract_spR(prediction_data, row_st, row_end, hplane[curr_lev].col_num[col], lb, ub);
|
|
1131
|
+
break;
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
case SparseCSC:
|
|
1135
|
+
{
|
|
1136
|
+
xval = extract_spC(prediction_data, row, hplane[curr_lev].col_num[col]);
|
|
1137
|
+
break;
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
|
|
1141
|
+
if (unlikely(is_na_or_inf(xval)))
|
|
1142
|
+
{
|
|
1143
|
+
if (model_outputs.missing_action != Fail)
|
|
1144
|
+
{
|
|
1145
|
+
hval += hplane[curr_lev].fill_val[col];
|
|
1146
|
+
}
|
|
1147
|
+
|
|
1148
|
+
else
|
|
1149
|
+
{
|
|
1150
|
+
output_depth = NAN;
|
|
1151
|
+
return;
|
|
1152
|
+
}
|
|
1153
|
+
}
|
|
1154
|
+
|
|
1155
|
+
else
|
|
1156
|
+
{
|
|
1157
|
+
hval += (xval - hplane[curr_lev].mean[ncols_numeric]) * hplane[curr_lev].coef[ncols_numeric];
|
|
1158
|
+
}
|
|
1159
|
+
|
|
1160
|
+
ncols_numeric++;
|
|
1161
|
+
break;
|
|
1162
|
+
}
|
|
1163
|
+
|
|
1164
|
+
case Categorical:
|
|
1165
|
+
{
|
|
1166
|
+
cval = prediction_data.categ_data[
|
|
1167
|
+
prediction_data.is_col_major?
|
|
1168
|
+
(row + hplane[curr_lev].col_num[col] * prediction_data.nrows)
|
|
1169
|
+
:
|
|
1170
|
+
(hplane[curr_lev].col_num[col] + row * prediction_data.ncols_categ)
|
|
1171
|
+
];
|
|
1172
|
+
if (unlikely(cval < 0))
|
|
1173
|
+
{
|
|
1174
|
+
if (model_outputs.missing_action != Fail)
|
|
1175
|
+
{
|
|
1176
|
+
hval += hplane[curr_lev].fill_val[col];
|
|
1177
|
+
}
|
|
1178
|
+
|
|
1179
|
+
else
|
|
1180
|
+
{
|
|
1181
|
+
output_depth = NAN;
|
|
1182
|
+
return;
|
|
1183
|
+
}
|
|
1184
|
+
}
|
|
1185
|
+
|
|
1186
|
+
else
|
|
1187
|
+
{
|
|
1188
|
+
switch(model_outputs.cat_split_type)
|
|
1189
|
+
{
|
|
1190
|
+
case SingleCateg:
|
|
1191
|
+
{
|
|
1192
|
+
hval += (cval == hplane[curr_lev].chosen_cat[ncols_categ])? hplane[curr_lev].fill_new[ncols_categ] : 0;
|
|
1193
|
+
break;
|
|
1194
|
+
}
|
|
1195
|
+
|
|
1196
|
+
case SubSet:
|
|
1197
|
+
{
|
|
1198
|
+
if (unlikely(cval >= (int)hplane[curr_lev].cat_coef[ncols_categ].size()))
|
|
1199
|
+
{
|
|
1200
|
+
if (model_outputs.new_cat_action == Random) {
|
|
1201
|
+
cval = cval % (int)hplane[curr_lev].cat_coef[ncols_categ].size();
|
|
1202
|
+
hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1205
|
+
else {
|
|
1206
|
+
hval += hplane[curr_lev].fill_new[ncols_categ];
|
|
1207
|
+
}
|
|
1208
|
+
}
|
|
1209
|
+
|
|
1210
|
+
else
|
|
1211
|
+
{
|
|
1212
|
+
hval += hplane[curr_lev].cat_coef[ncols_categ][cval];
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1215
|
+
break;
|
|
1216
|
+
}
|
|
1217
|
+
}
|
|
1218
|
+
}
|
|
1219
|
+
|
|
1220
|
+
ncols_categ++;
|
|
1221
|
+
break;
|
|
1222
|
+
}
|
|
1223
|
+
|
|
1224
|
+
default:
|
|
1225
|
+
{
|
|
1226
|
+
assert(0);
|
|
1227
|
+
break;
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1231
|
+
}
|
|
1232
|
+
|
|
1233
|
+
output_depth -= (hval < hplane[curr_lev].range_low) ||
|
|
1234
|
+
(hval > hplane[curr_lev].range_high);
|
|
1235
|
+
curr_lev = (hval <= hplane[curr_lev].split_point)?
|
|
1236
|
+
hplane[curr_lev].hplane_left : hplane[curr_lev].hplane_right;
|
|
1237
|
+
}
|
|
1238
|
+
}
|
|
1239
|
+
}
|
|
1240
|
+
|
|
1241
|
+
template <class real_t, class sparse_ix>
|
|
1242
|
+
void batched_csc_predict(PredictionData<real_t, sparse_ix> &prediction_data, int nthreads,
|
|
1243
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1244
|
+
double *restrict output_depths, sparse_ix *restrict tree_num,
|
|
1245
|
+
double *restrict per_tree_depths)
|
|
1246
|
+
{
|
|
1247
|
+
#ifdef _OPENMP
|
|
1248
|
+
size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
|
|
1249
|
+
if ((size_t)nthreads > ntrees)
|
|
1250
|
+
nthreads = ntrees;
|
|
1251
|
+
#else
|
|
1252
|
+
nthreads = 1;
|
|
1253
|
+
#endif
|
|
1254
|
+
std::vector<WorkerForPredictCSC> worker_memory(nthreads);
|
|
1255
|
+
|
|
1256
|
+
bool threw_exception = false;
|
|
1257
|
+
std::exception_ptr ex = NULL;
|
|
1258
|
+
|
|
1259
|
+
if (model_outputs != NULL)
|
|
1260
|
+
{
|
|
1261
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
|
1262
|
+
shared(worker_memory, model_outputs, prediction_data, tree_num, per_tree_depths, threw_exception, ex)
|
|
1263
|
+
for (size_t_for tree = 0; tree < (decltype(tree))model_outputs->trees.size(); tree++)
|
|
1264
|
+
{
|
|
1265
|
+
if (threw_exception) continue;
|
|
1266
|
+
try
|
|
1267
|
+
{
|
|
1268
|
+
WorkerForPredictCSC *ptr_worker = &worker_memory[omp_get_thread_num()];
|
|
1269
|
+
if (!ptr_worker->depths.size())
|
|
1270
|
+
{
|
|
1271
|
+
ptr_worker->depths.resize(prediction_data.nrows);
|
|
1272
|
+
ptr_worker->ix_arr.resize(prediction_data.nrows);
|
|
1273
|
+
std::iota(ptr_worker->ix_arr.begin(),
|
|
1274
|
+
ptr_worker->ix_arr.end(),
|
|
1275
|
+
(size_t)0);
|
|
1276
|
+
|
|
1277
|
+
if (model_outputs->missing_action == Divide ||
|
|
1278
|
+
(model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && prediction_data.categ_data != NULL)
|
|
1279
|
+
) {
|
|
1280
|
+
ptr_worker->weights_arr.resize(prediction_data.nrows);
|
|
1281
|
+
}
|
|
1282
|
+
}
|
|
1283
|
+
|
|
1284
|
+
ptr_worker->st = 0;
|
|
1285
|
+
ptr_worker->end = prediction_data.nrows - 1;
|
|
1286
|
+
if (model_outputs->missing_action == Divide)
|
|
1287
|
+
std::fill(ptr_worker->weights_arr.begin(),
|
|
1288
|
+
ptr_worker->weights_arr.end(),
|
|
1289
|
+
(double)1);
|
|
1290
|
+
|
|
1291
|
+
traverse_itree_csc(*ptr_worker,
|
|
1292
|
+
model_outputs->trees[tree],
|
|
1293
|
+
*model_outputs,
|
|
1294
|
+
prediction_data,
|
|
1295
|
+
(tree_num == NULL)?
|
|
1296
|
+
((sparse_ix*)NULL) : (tree_num + tree*prediction_data.nrows),
|
|
1297
|
+
per_tree_depths,
|
|
1298
|
+
(size_t)0,
|
|
1299
|
+
model_outputs->has_range_penalty);
|
|
1300
|
+
}
|
|
1301
|
+
|
|
1302
|
+
catch (...)
|
|
1303
|
+
{
|
|
1304
|
+
#pragma omp critical
|
|
1305
|
+
{
|
|
1306
|
+
if (!threw_exception)
|
|
1307
|
+
{
|
|
1308
|
+
threw_exception = true;
|
|
1309
|
+
ex = std::current_exception();
|
|
1310
|
+
}
|
|
1311
|
+
}
|
|
1312
|
+
}
|
|
1313
|
+
}
|
|
1314
|
+
}
|
|
1315
|
+
|
|
1316
|
+
else
|
|
1317
|
+
{
|
|
1318
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
|
1319
|
+
shared(worker_memory, model_outputs_ext, prediction_data, tree_num, per_tree_depths, threw_exception, ex)
|
|
1320
|
+
for (size_t_for tree = 0; tree < (decltype(tree))model_outputs_ext->hplanes.size(); tree++)
|
|
1321
|
+
{
|
|
1322
|
+
if (threw_exception) continue;
|
|
1323
|
+
try
|
|
1324
|
+
{
|
|
1325
|
+
WorkerForPredictCSC *ptr_worker = &worker_memory[omp_get_thread_num()];
|
|
1326
|
+
if (!ptr_worker->depths.size())
|
|
1327
|
+
{
|
|
1328
|
+
ptr_worker->depths.resize(prediction_data.nrows);
|
|
1329
|
+
ptr_worker->comb_val.resize(prediction_data.nrows);
|
|
1330
|
+
ptr_worker->ix_arr.resize(prediction_data.nrows);
|
|
1331
|
+
std::iota(ptr_worker->ix_arr.begin(),
|
|
1332
|
+
ptr_worker->ix_arr.end(),
|
|
1333
|
+
(size_t)0);
|
|
1334
|
+
}
|
|
1335
|
+
|
|
1336
|
+
ptr_worker->st = 0;
|
|
1337
|
+
ptr_worker->end = prediction_data.nrows - 1;
|
|
1338
|
+
|
|
1339
|
+
traverse_hplane_csc(*ptr_worker,
|
|
1340
|
+
model_outputs_ext->hplanes[tree],
|
|
1341
|
+
*model_outputs_ext,
|
|
1342
|
+
prediction_data,
|
|
1343
|
+
(tree_num == NULL)?
|
|
1344
|
+
((sparse_ix*)NULL) : (tree_num + tree*prediction_data.nrows),
|
|
1345
|
+
per_tree_depths,
|
|
1346
|
+
(size_t)0,
|
|
1347
|
+
model_outputs_ext->has_range_penalty);
|
|
1348
|
+
}
|
|
1349
|
+
|
|
1350
|
+
catch (...)
|
|
1351
|
+
{
|
|
1352
|
+
#pragma omp critical
|
|
1353
|
+
{
|
|
1354
|
+
if (!threw_exception)
|
|
1355
|
+
{
|
|
1356
|
+
threw_exception = true;
|
|
1357
|
+
ex = std::current_exception();
|
|
1358
|
+
}
|
|
1359
|
+
}
|
|
1360
|
+
}
|
|
1361
|
+
}
|
|
1362
|
+
|
|
1363
|
+
if (threw_exception)
|
|
1364
|
+
std::rethrow_exception(ex);
|
|
1365
|
+
}
|
|
1366
|
+
|
|
1367
|
+
#ifdef _OPENMP
|
|
1368
|
+
if (nthreads <= 1)
|
|
1369
|
+
#endif
|
|
1370
|
+
{
|
|
1371
|
+
std::copy(worker_memory.front().depths.begin(), worker_memory.front().depths.end(), output_depths);
|
|
1372
|
+
}
|
|
1373
|
+
|
|
1374
|
+
#ifdef _OPENMP
|
|
1375
|
+
else
|
|
1376
|
+
{
|
|
1377
|
+
std::fill(output_depths, output_depths + prediction_data.nrows, (double)0);
|
|
1378
|
+
for (auto &workspace : worker_memory)
|
|
1379
|
+
if (workspace.depths.size())
|
|
1380
|
+
#if !defined(_MSC_VER) && !defined(_WIN32)
|
|
1381
|
+
#pragma omp simd
|
|
1382
|
+
#endif
|
|
1383
|
+
for (size_t row = 0; row < prediction_data.nrows; row++)
|
|
1384
|
+
output_depths[row] += workspace.depths[row];
|
|
1385
|
+
}
|
|
1386
|
+
#endif
|
|
1387
|
+
}
|
|
1388
|
+
|
|
1389
|
+
template <class PredictionData, class sparse_ix>
|
|
1390
|
+
void traverse_itree_csc(WorkerForPredictCSC &workspace,
|
|
1391
|
+
std::vector<IsoTree> &trees,
|
|
1392
|
+
IsoForest &model_outputs,
|
|
1393
|
+
PredictionData &prediction_data,
|
|
1394
|
+
sparse_ix *restrict tree_num,
|
|
1395
|
+
double *restrict per_tree_depths,
|
|
1396
|
+
size_t curr_tree,
|
|
1397
|
+
bool has_range_penalty)
|
|
1398
|
+
{
|
|
1399
|
+
// if (trees[curr_tree].score >= 0)
|
|
1400
|
+
if (unlikely(trees[curr_tree].tree_left == 0))
|
|
1401
|
+
{
|
|
1402
|
+
if (model_outputs.missing_action != Divide)
|
|
1403
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1404
|
+
workspace.depths[workspace.ix_arr[row]] += trees[curr_tree].score;
|
|
1405
|
+
else
|
|
1406
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1407
|
+
workspace.depths[workspace.ix_arr[row]] += workspace.weights_arr[workspace.ix_arr[row]] * trees[curr_tree].score;
|
|
1408
|
+
if (unlikely(tree_num != NULL))
|
|
1409
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1410
|
+
tree_num[workspace.ix_arr[row]] = curr_tree;
|
|
1411
|
+
if (unlikely(per_tree_depths != NULL))
|
|
1412
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1413
|
+
per_tree_depths[workspace.ix_arr[row]] = trees[curr_tree].score;
|
|
1414
|
+
return;
|
|
1415
|
+
}
|
|
1416
|
+
|
|
1417
|
+
/* in this case, the indices are sorted in the csc penalty function */
|
|
1418
|
+
if (!(has_range_penalty && model_outputs.missing_action != Divide && curr_tree > 0) && trees[curr_tree].col_type == Numeric)
|
|
1419
|
+
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
|
1420
|
+
|
|
1421
|
+
/* TODO: should mix the splitting function with the range penalty */
|
|
1422
|
+
|
|
1423
|
+
/* divide according to tree */
|
|
1424
|
+
size_t orig_end = workspace.end;
|
|
1425
|
+
size_t st_NA, end_NA, split_ix;
|
|
1426
|
+
switch (trees[curr_tree].col_type)
|
|
1427
|
+
{
|
|
1428
|
+
case Numeric:
|
|
1429
|
+
{
|
|
1430
|
+
divide_subset_split(workspace.ix_arr.data(), workspace.st, workspace.end, trees[curr_tree].col_num,
|
|
1431
|
+
prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
|
|
1432
|
+
trees[curr_tree].num_split, model_outputs.missing_action,
|
|
1433
|
+
st_NA, end_NA, split_ix);
|
|
1434
|
+
break;
|
|
1435
|
+
}
|
|
1436
|
+
|
|
1437
|
+
case Categorical:
|
|
1438
|
+
{
|
|
1439
|
+
switch (model_outputs.cat_split_type)
|
|
1440
|
+
{
|
|
1441
|
+
case SingleCateg:
|
|
1442
|
+
{
|
|
1443
|
+
divide_subset_split(workspace.ix_arr.data(),
|
|
1444
|
+
prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
|
|
1445
|
+
workspace.st, workspace.end, trees[curr_tree].chosen_cat,
|
|
1446
|
+
model_outputs.missing_action, st_NA, end_NA, split_ix);
|
|
1447
|
+
break;
|
|
1448
|
+
}
|
|
1449
|
+
|
|
1450
|
+
case SubSet:
|
|
1451
|
+
{
|
|
1452
|
+
if (!trees[curr_tree].cat_split.size())
|
|
1453
|
+
divide_subset_split(workspace.ix_arr.data(),
|
|
1454
|
+
prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
|
|
1455
|
+
workspace.st, workspace.end,
|
|
1456
|
+
model_outputs.missing_action, model_outputs.new_cat_action,
|
|
1457
|
+
trees[curr_tree].pct_tree_left < .5, st_NA, end_NA, split_ix);
|
|
1458
|
+
else
|
|
1459
|
+
divide_subset_split(workspace.ix_arr.data(),
|
|
1460
|
+
prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
|
|
1461
|
+
workspace.st, workspace.end, trees[curr_tree].cat_split.data(),
|
|
1462
|
+
(int) trees[curr_tree].cat_split.size(),
|
|
1463
|
+
model_outputs.missing_action, model_outputs.new_cat_action,
|
|
1464
|
+
(bool)(trees[curr_tree].pct_tree_left < .5), st_NA, end_NA, split_ix);
|
|
1465
|
+
break;
|
|
1466
|
+
}
|
|
1467
|
+
}
|
|
1468
|
+
break;
|
|
1469
|
+
}
|
|
1470
|
+
|
|
1471
|
+
default:
|
|
1472
|
+
{
|
|
1473
|
+
assert(0);
|
|
1474
|
+
break;
|
|
1475
|
+
}
|
|
1476
|
+
}
|
|
1477
|
+
|
|
1478
|
+
/* continue splitting recursively */
|
|
1479
|
+
if (unlikely(model_outputs.new_cat_action == Weighted && model_outputs.cat_split_type == SubSet && prediction_data.categ_data != NULL))
|
|
1480
|
+
goto missing_action_divide;
|
|
1481
|
+
switch (model_outputs.missing_action)
|
|
1482
|
+
{
|
|
1483
|
+
case Impute:
|
|
1484
|
+
{
|
|
1485
|
+
split_ix = (trees[curr_tree].pct_tree_left >= .5)? end_NA : st_NA;
|
|
1486
|
+
}
|
|
1487
|
+
|
|
1488
|
+
case Fail:
|
|
1489
|
+
{
|
|
1490
|
+
if (split_ix > workspace.st)
|
|
1491
|
+
{
|
|
1492
|
+
workspace.end = split_ix - 1;
|
|
1493
|
+
|
|
1494
|
+
if (has_range_penalty && trees[curr_tree].col_type == Numeric)
|
|
1495
|
+
add_csc_range_penalty(workspace,
|
|
1496
|
+
prediction_data,
|
|
1497
|
+
(double*)NULL,
|
|
1498
|
+
trees[curr_tree].col_num,
|
|
1499
|
+
trees[curr_tree].range_low,
|
|
1500
|
+
trees[curr_tree].range_high);
|
|
1501
|
+
|
|
1502
|
+
traverse_itree_csc(workspace,
|
|
1503
|
+
trees,
|
|
1504
|
+
model_outputs,
|
|
1505
|
+
prediction_data,
|
|
1506
|
+
tree_num,
|
|
1507
|
+
per_tree_depths,
|
|
1508
|
+
trees[curr_tree].tree_left,
|
|
1509
|
+
has_range_penalty);
|
|
1510
|
+
}
|
|
1511
|
+
|
|
1512
|
+
|
|
1513
|
+
if (split_ix <= orig_end)
|
|
1514
|
+
{
|
|
1515
|
+
workspace.st = split_ix;
|
|
1516
|
+
workspace.end = orig_end;
|
|
1517
|
+
|
|
1518
|
+
if (has_range_penalty && trees[curr_tree].col_type == Numeric)
|
|
1519
|
+
add_csc_range_penalty(workspace,
|
|
1520
|
+
prediction_data,
|
|
1521
|
+
(double*)NULL,
|
|
1522
|
+
trees[curr_tree].col_num,
|
|
1523
|
+
trees[curr_tree].range_low,
|
|
1524
|
+
trees[curr_tree].range_high);
|
|
1525
|
+
|
|
1526
|
+
traverse_itree_csc(workspace,
|
|
1527
|
+
trees,
|
|
1528
|
+
model_outputs,
|
|
1529
|
+
prediction_data,
|
|
1530
|
+
tree_num,
|
|
1531
|
+
per_tree_depths,
|
|
1532
|
+
trees[curr_tree].tree_right,
|
|
1533
|
+
has_range_penalty);
|
|
1534
|
+
}
|
|
1535
|
+
break;
|
|
1536
|
+
}
|
|
1537
|
+
|
|
1538
|
+
case Divide:
|
|
1539
|
+
{
|
|
1540
|
+
missing_action_divide:
|
|
1541
|
+
/* TODO: maybe here it shouldn't copy the whole ix_arr,
|
|
1542
|
+
but then it'd need to re-generate it from outside too */
|
|
1543
|
+
std::vector<double> weights_arr;
|
|
1544
|
+
std::vector<size_t> ix_arr;
|
|
1545
|
+
if (end_NA > workspace.st)
|
|
1546
|
+
{
|
|
1547
|
+
weights_arr.assign(workspace.weights_arr.begin(),
|
|
1548
|
+
workspace.weights_arr.begin() + end_NA);
|
|
1549
|
+
ix_arr.assign(workspace.ix_arr.data(),
|
|
1550
|
+
workspace.ix_arr.data() + end_NA);
|
|
1551
|
+
}
|
|
1552
|
+
|
|
1553
|
+
if (has_range_penalty && trees[curr_tree].col_type == Numeric)
|
|
1554
|
+
{
|
|
1555
|
+
size_t st = workspace.st;
|
|
1556
|
+
size_t end = workspace.end;
|
|
1557
|
+
|
|
1558
|
+
if (workspace.st < st_NA)
|
|
1559
|
+
{
|
|
1560
|
+
workspace.end = st_NA - 1;
|
|
1561
|
+
add_csc_range_penalty(workspace,
|
|
1562
|
+
prediction_data,
|
|
1563
|
+
workspace.weights_arr.data(),
|
|
1564
|
+
trees[curr_tree].col_num,
|
|
1565
|
+
trees[curr_tree].range_low,
|
|
1566
|
+
trees[curr_tree].range_high);
|
|
1567
|
+
}
|
|
1568
|
+
|
|
1569
|
+
if (workspace.end >= end_NA)
|
|
1570
|
+
{
|
|
1571
|
+
workspace.st = end_NA;
|
|
1572
|
+
workspace.end = end;
|
|
1573
|
+
add_csc_range_penalty(workspace,
|
|
1574
|
+
prediction_data,
|
|
1575
|
+
workspace.weights_arr.data(),
|
|
1576
|
+
trees[curr_tree].col_num,
|
|
1577
|
+
trees[curr_tree].range_low,
|
|
1578
|
+
trees[curr_tree].range_high);
|
|
1579
|
+
}
|
|
1580
|
+
|
|
1581
|
+
workspace.st = st;
|
|
1582
|
+
workspace.end = end;
|
|
1583
|
+
}
|
|
1584
|
+
|
|
1585
|
+
if (end_NA > workspace.st)
|
|
1586
|
+
{
|
|
1587
|
+
workspace.end = end_NA - 1;
|
|
1588
|
+
for (size_t row = st_NA; row < end_NA; row++)
|
|
1589
|
+
workspace.weights_arr[workspace.ix_arr[row]] *= trees[curr_tree].pct_tree_left;
|
|
1590
|
+
traverse_itree_csc(workspace,
|
|
1591
|
+
trees,
|
|
1592
|
+
model_outputs,
|
|
1593
|
+
prediction_data,
|
|
1594
|
+
tree_num,
|
|
1595
|
+
per_tree_depths,
|
|
1596
|
+
trees[curr_tree].tree_left,
|
|
1597
|
+
has_range_penalty);
|
|
1598
|
+
}
|
|
1599
|
+
|
|
1600
|
+
if (st_NA <= orig_end)
|
|
1601
|
+
{
|
|
1602
|
+
workspace.st = st_NA;
|
|
1603
|
+
workspace.end = orig_end;
|
|
1604
|
+
if (weights_arr.size())
|
|
1605
|
+
{
|
|
1606
|
+
std::copy(weights_arr.begin(),
|
|
1607
|
+
weights_arr.end(),
|
|
1608
|
+
workspace.weights_arr.begin());
|
|
1609
|
+
std::copy(ix_arr.begin(),
|
|
1610
|
+
ix_arr.end(),
|
|
1611
|
+
workspace.ix_arr.begin());
|
|
1612
|
+
weights_arr.clear();
|
|
1613
|
+
weights_arr.shrink_to_fit();
|
|
1614
|
+
ix_arr.clear();
|
|
1615
|
+
ix_arr.shrink_to_fit();
|
|
1616
|
+
}
|
|
1617
|
+
|
|
1618
|
+
for (size_t row = st_NA; row < end_NA; row++)
|
|
1619
|
+
workspace.weights_arr[workspace.ix_arr[row]] *= (1. - trees[curr_tree].pct_tree_left);
|
|
1620
|
+
traverse_itree_csc(workspace,
|
|
1621
|
+
trees,
|
|
1622
|
+
model_outputs,
|
|
1623
|
+
prediction_data,
|
|
1624
|
+
tree_num,
|
|
1625
|
+
per_tree_depths,
|
|
1626
|
+
trees[curr_tree].tree_right,
|
|
1627
|
+
has_range_penalty);
|
|
1628
|
+
}
|
|
1629
|
+
break;
|
|
1630
|
+
}
|
|
1631
|
+
}
|
|
1632
|
+
}
|
|
1633
|
+
|
|
1634
|
+
template <class PredictionData, class sparse_ix>
|
|
1635
|
+
void traverse_hplane_csc(WorkerForPredictCSC &workspace,
|
|
1636
|
+
std::vector<IsoHPlane> &hplanes,
|
|
1637
|
+
ExtIsoForest &model_outputs,
|
|
1638
|
+
PredictionData &prediction_data,
|
|
1639
|
+
sparse_ix *restrict tree_num,
|
|
1640
|
+
double *restrict per_tree_depths,
|
|
1641
|
+
size_t curr_tree,
|
|
1642
|
+
bool has_range_penalty)
|
|
1643
|
+
{
|
|
1644
|
+
// if (hplanes[curr_tree].score >= 0)
|
|
1645
|
+
if (unlikely(hplanes[curr_tree].hplane_left == 0))
|
|
1646
|
+
{
|
|
1647
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1648
|
+
workspace.depths[workspace.ix_arr[row]] += hplanes[curr_tree].score;
|
|
1649
|
+
if (unlikely(tree_num != NULL))
|
|
1650
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1651
|
+
tree_num[workspace.ix_arr[row]] = curr_tree;
|
|
1652
|
+
if (unlikely(per_tree_depths != NULL))
|
|
1653
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1654
|
+
per_tree_depths[workspace.ix_arr[row]] = hplanes[curr_tree].score;
|
|
1655
|
+
return;
|
|
1656
|
+
}
|
|
1657
|
+
|
|
1658
|
+
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
|
1659
|
+
std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0.);
|
|
1660
|
+
double unused;
|
|
1661
|
+
|
|
1662
|
+
if (likely(prediction_data.categ_data == NULL))
|
|
1663
|
+
{
|
|
1664
|
+
for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
|
|
1665
|
+
add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
1666
|
+
hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
|
|
1667
|
+
prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
|
|
1668
|
+
hplanes[curr_tree].coef[col], (double)0, hplanes[curr_tree].mean[col],
|
|
1669
|
+
(model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
|
|
1670
|
+
model_outputs.missing_action, NULL, NULL, false);
|
|
1671
|
+
}
|
|
1672
|
+
|
|
1673
|
+
else
|
|
1674
|
+
{
|
|
1675
|
+
size_t ncols_numeric = 0;
|
|
1676
|
+
size_t ncols_categ = 0;
|
|
1677
|
+
for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
|
|
1678
|
+
{
|
|
1679
|
+
switch (hplanes[curr_tree].col_type[col])
|
|
1680
|
+
{
|
|
1681
|
+
case Numeric:
|
|
1682
|
+
{
|
|
1683
|
+
add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
1684
|
+
hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
|
|
1685
|
+
prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
|
|
1686
|
+
hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
|
|
1687
|
+
(model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
|
|
1688
|
+
model_outputs.missing_action, NULL, NULL, false);
|
|
1689
|
+
ncols_numeric++;
|
|
1690
|
+
break;
|
|
1691
|
+
}
|
|
1692
|
+
|
|
1693
|
+
case Categorical:
|
|
1694
|
+
{
|
|
1695
|
+
add_linear_comb<double>(
|
|
1696
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
|
|
1697
|
+
prediction_data.categ_data + hplanes[curr_tree].col_num[col] * prediction_data.nrows,
|
|
1698
|
+
(model_outputs.cat_split_type == SubSet)? (int)hplanes[curr_tree].cat_coef[ncols_categ].size() : 0,
|
|
1699
|
+
(model_outputs.cat_split_type == SubSet)? hplanes[curr_tree].cat_coef[ncols_categ].data() : NULL,
|
|
1700
|
+
(model_outputs.cat_split_type == SingleCateg)? hplanes[curr_tree].fill_new[ncols_categ] : 0.,
|
|
1701
|
+
(model_outputs.cat_split_type == SingleCateg)? hplanes[curr_tree].chosen_cat[ncols_categ] : 0,
|
|
1702
|
+
hplanes[curr_tree].fill_val[col], hplanes[curr_tree].fill_new[ncols_categ], NULL, NULL,
|
|
1703
|
+
model_outputs.new_cat_action, model_outputs.missing_action, model_outputs.cat_split_type, false);
|
|
1704
|
+
ncols_categ++;
|
|
1705
|
+
break;
|
|
1706
|
+
}
|
|
1707
|
+
|
|
1708
|
+
default:
|
|
1709
|
+
{
|
|
1710
|
+
assert(0);
|
|
1711
|
+
break;
|
|
1712
|
+
}
|
|
1713
|
+
}
|
|
1714
|
+
}
|
|
1715
|
+
}
|
|
1716
|
+
|
|
1717
|
+
if (has_range_penalty)
|
|
1718
|
+
{
|
|
1719
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1720
|
+
workspace.depths[workspace.ix_arr[row]]
|
|
1721
|
+
-=
|
|
1722
|
+
(workspace.comb_val[row - workspace.st] < hplanes[curr_tree].range_low) ||
|
|
1723
|
+
(workspace.comb_val[row - workspace.st] > hplanes[curr_tree].range_high);
|
|
1724
|
+
}
|
|
1725
|
+
|
|
1726
|
+
/* divide data */
|
|
1727
|
+
size_t split_ix = divide_subset_split(workspace.ix_arr.data(), workspace.comb_val.data(),
|
|
1728
|
+
workspace.st, workspace.end, hplanes[curr_tree].split_point);
|
|
1729
|
+
|
|
1730
|
+
/* continue splitting recursively */
|
|
1731
|
+
size_t orig_end = workspace.end;
|
|
1732
|
+
if (split_ix > workspace.st)
|
|
1733
|
+
{
|
|
1734
|
+
workspace.end = split_ix - 1;
|
|
1735
|
+
traverse_hplane_csc(workspace,
|
|
1736
|
+
hplanes,
|
|
1737
|
+
model_outputs,
|
|
1738
|
+
prediction_data,
|
|
1739
|
+
tree_num,
|
|
1740
|
+
per_tree_depths,
|
|
1741
|
+
hplanes[curr_tree].hplane_left,
|
|
1742
|
+
has_range_penalty);
|
|
1743
|
+
}
|
|
1744
|
+
|
|
1745
|
+
if (split_ix <= orig_end)
|
|
1746
|
+
{
|
|
1747
|
+
workspace.st = split_ix;
|
|
1748
|
+
workspace.end = orig_end;
|
|
1749
|
+
traverse_hplane_csc(workspace,
|
|
1750
|
+
hplanes,
|
|
1751
|
+
model_outputs,
|
|
1752
|
+
prediction_data,
|
|
1753
|
+
tree_num,
|
|
1754
|
+
per_tree_depths,
|
|
1755
|
+
hplanes[curr_tree].hplane_right,
|
|
1756
|
+
has_range_penalty);
|
|
1757
|
+
}
|
|
1758
|
+
}
|
|
1759
|
+
|
|
1760
|
+
template <class PredictionData>
|
|
1761
|
+
void add_csc_range_penalty(WorkerForPredictCSC &workspace,
|
|
1762
|
+
PredictionData &prediction_data,
|
|
1763
|
+
double *restrict weights_arr,
|
|
1764
|
+
size_t col_num,
|
|
1765
|
+
double range_low,
|
|
1766
|
+
double range_high)
|
|
1767
|
+
{
|
|
1768
|
+
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
|
1769
|
+
|
|
1770
|
+
size_t st_col = prediction_data.Xc_indptr[col_num];
|
|
1771
|
+
size_t end_col = prediction_data.Xc_indptr[col_num + 1] - 1;
|
|
1772
|
+
size_t curr_pos = st_col;
|
|
1773
|
+
size_t ind_end_col = prediction_data.Xc_ind[end_col];
|
|
1774
|
+
size_t *ptr_st = std::lower_bound(workspace.ix_arr.data() + workspace.st,
|
|
1775
|
+
workspace.ix_arr.data() + workspace.end + 1,
|
|
1776
|
+
prediction_data.Xc_ind[st_col]);
|
|
1777
|
+
|
|
1778
|
+
if (range_low <= 0 && range_high >= 0)
|
|
1779
|
+
{
|
|
1780
|
+
for (size_t *row = ptr_st;
|
|
1781
|
+
row != workspace.ix_arr.data() + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
|
|
1782
|
+
)
|
|
1783
|
+
{
|
|
1784
|
+
if (prediction_data.Xc_ind[curr_pos] == (decltype(*prediction_data.Xc_ind))(*row))
|
|
1785
|
+
{
|
|
1786
|
+
if (likely(!std::isnan(prediction_data.Xc[curr_pos])
|
|
1787
|
+
&&
|
|
1788
|
+
( prediction_data.Xc[curr_pos] < range_low ||
|
|
1789
|
+
prediction_data.Xc[curr_pos] > range_high )))
|
|
1790
|
+
{
|
|
1791
|
+
workspace.depths[*row] -= (weights_arr == NULL)? 1. : weights_arr[*row];
|
|
1792
|
+
}
|
|
1793
|
+
|
|
1794
|
+
if (row == workspace.ix_arr.data() + workspace.end || curr_pos == end_col) break;
|
|
1795
|
+
curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
|
|
1796
|
+
prediction_data.Xc_ind + end_col + 1,
|
|
1797
|
+
*(++row))
|
|
1798
|
+
- prediction_data.Xc_ind;
|
|
1799
|
+
}
|
|
1800
|
+
|
|
1801
|
+
else
|
|
1802
|
+
{
|
|
1803
|
+
if (prediction_data.Xc_ind[curr_pos] > (decltype(*prediction_data.Xc_ind))(*row))
|
|
1804
|
+
row = std::lower_bound(row + 1,
|
|
1805
|
+
workspace.ix_arr.data() + workspace.end + 1,
|
|
1806
|
+
prediction_data.Xc_ind[curr_pos]);
|
|
1807
|
+
else
|
|
1808
|
+
curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
|
|
1809
|
+
prediction_data.Xc_ind + end_col + 1,
|
|
1810
|
+
*row)
|
|
1811
|
+
- prediction_data.Xc_ind;
|
|
1812
|
+
}
|
|
1813
|
+
}
|
|
1814
|
+
}
|
|
1815
|
+
|
|
1816
|
+
else
|
|
1817
|
+
{
|
|
1818
|
+
if (likely(weights_arr == NULL))
|
|
1819
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1820
|
+
workspace.depths[workspace.ix_arr[row]]--;
|
|
1821
|
+
else
|
|
1822
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
|
1823
|
+
workspace.depths[workspace.ix_arr[row]] -= weights_arr[workspace.ix_arr[row]];
|
|
1824
|
+
|
|
1825
|
+
|
|
1826
|
+
for (size_t *row = ptr_st;
|
|
1827
|
+
row != workspace.ix_arr.data() + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
|
|
1828
|
+
)
|
|
1829
|
+
{
|
|
1830
|
+
if (prediction_data.Xc_ind[curr_pos] == (decltype(*prediction_data.Xc_ind))(*row))
|
|
1831
|
+
{
|
|
1832
|
+
if (likely(std::isnan(prediction_data.Xc[curr_pos])
|
|
1833
|
+
||
|
|
1834
|
+
( prediction_data.Xc[curr_pos] >= range_low &&
|
|
1835
|
+
prediction_data.Xc[curr_pos] <= range_high )))
|
|
1836
|
+
{
|
|
1837
|
+
workspace.depths[*row] += (weights_arr == NULL)? 1. : weights_arr[*row];
|
|
1838
|
+
}
|
|
1839
|
+
|
|
1840
|
+
if (row == workspace.ix_arr.data() + workspace.end || curr_pos == end_col) break;
|
|
1841
|
+
curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
|
|
1842
|
+
prediction_data.Xc_ind + end_col + 1,
|
|
1843
|
+
*(++row))
|
|
1844
|
+
- prediction_data.Xc_ind;
|
|
1845
|
+
}
|
|
1846
|
+
|
|
1847
|
+
else
|
|
1848
|
+
{
|
|
1849
|
+
if (prediction_data.Xc_ind[curr_pos] > (decltype(*prediction_data.Xc_ind))(*row))
|
|
1850
|
+
row = std::lower_bound(row + 1,
|
|
1851
|
+
workspace.ix_arr.data() + workspace.end + 1,
|
|
1852
|
+
prediction_data.Xc_ind[curr_pos]);
|
|
1853
|
+
else
|
|
1854
|
+
curr_pos = std::lower_bound(prediction_data.Xc_ind + curr_pos + 1,
|
|
1855
|
+
prediction_data.Xc_ind + end_col + 1,
|
|
1856
|
+
*row)
|
|
1857
|
+
- prediction_data.Xc_ind;
|
|
1858
|
+
}
|
|
1859
|
+
}
|
|
1860
|
+
}
|
|
1861
|
+
}
|
|
1862
|
+
|
|
1863
|
+
/* Looks up the entry at ('row', 'col_num') of a CSC matrix described by the members
   Xc (values), Xc_ind (row indices) and Xc_indptr (column offsets) of 'prediction_data'.
   The row indices of the column are searched with a binary search (std::lower_bound);
   returns the stored value when 'row' is found and 0 otherwise. */
template <class PredictionData>
double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num) noexcept
{
    using index_ptr_t = decltype(prediction_data.Xc_indptr);
    using index_val_t = typename std::remove_pointer<index_ptr_t>::type;

    const auto col_first = prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num];
    const auto col_last  = prediction_data.Xc_ind + prediction_data.Xc_indptr[col_num + 1];

    index_ptr_t hit = std::lower_bound(col_first, col_last, row);

    const bool found = (hit != col_last) && ((*hit) == static_cast<index_val_t>(row));
    if (found)
        return prediction_data.Xc[hit - prediction_data.Xc_ind];
    return 0.;
}
|
|
1879
|
+
|
|
1880
|
+
/* Looks up column 'col_num' within one CSR row whose column indices span [row_st, row_end)
   (pointers into prediction_data.Xr_ind). 'lb' and 'ub' are bounds supplied by the caller:
   a column outside [lb, ub] yields 0 without searching. Otherwise a binary search is done
   and the matching entry of prediction_data.Xr is returned, or 0 when there is no match. */
template <class PredictionData, class sparse_ix>
static inline double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num, size_t lb, size_t ub) noexcept
{
    const bool row_is_empty   = (row_st == row_end);
    const bool outside_bounds = (col_num < lb) || (col_num > ub);
    if (row_is_empty || outside_bounds)
        return 0.;

    const sparse_ix wanted = (sparse_ix) col_num;
    sparse_ix *hit = std::lower_bound(row_st, row_end, wanted);
    if (hit != row_end && *hit == wanted)
        return prediction_data.Xr[hit - prediction_data.Xr_ind];
    return 0.;
}
|
|
1891
|
+
|
|
1892
|
+
/* Looks up column 'col_num' within one CSR row whose column indices span [row_st, row_end)
   (pointers into prediction_data.Xr_ind). Uses a binary search; returns the matching entry
   of prediction_data.Xr, or 0 when the row is empty or has no entry for that column. */
template <class PredictionData, class sparse_ix>
double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num) noexcept
{
    if (row_st != row_end)
    {
        const sparse_ix wanted = (sparse_ix) col_num;
        sparse_ix *hit = std::lower_bound(row_st, row_end, wanted);
        if (hit != row_end && *hit == wanted)
            return prediction_data.Xr[hit - prediction_data.Xr_ind];
    }
    return 0.;
}
|
|
1903
|
+
|
|
1904
|
+
template <class sparse_ix>
|
|
1905
|
+
void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept
|
|
1906
|
+
{
|
|
1907
|
+
std::fill(n_terminal, n_terminal + model_outputs.trees.size(), 0);
|
|
1908
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
|
|
1909
|
+
for (size_t_for tree = 0; tree < (decltype(tree))model_outputs.trees.size(); tree++)
|
|
1910
|
+
{
|
|
1911
|
+
n_nodes[tree] = model_outputs.trees[tree].size();
|
|
1912
|
+
for (IsoTree &node : model_outputs.trees[tree])
|
|
1913
|
+
{
|
|
1914
|
+
n_terminal[tree] += (node.tree_left == 0);
|
|
1915
|
+
}
|
|
1916
|
+
}
|
|
1917
|
+
}
|
|
1918
|
+
|
|
1919
|
+
template <class sparse_ix>
|
|
1920
|
+
void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads) noexcept
|
|
1921
|
+
{
|
|
1922
|
+
std::fill(n_terminal, n_terminal + model_outputs.hplanes.size(), 0);
|
|
1923
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(model_outputs, n_nodes, n_terminal)
|
|
1924
|
+
for (size_t_for hplane = 0; hplane <(decltype(hplane)) model_outputs.hplanes.size(); hplane++)
|
|
1925
|
+
{
|
|
1926
|
+
n_nodes[hplane] = model_outputs.hplanes[hplane].size();
|
|
1927
|
+
for (IsoHPlane &node : model_outputs.hplanes[hplane])
|
|
1928
|
+
{
|
|
1929
|
+
n_terminal[hplane] += (node.hplane_left == 0);
|
|
1930
|
+
}
|
|
1931
|
+
}
|
|
1932
|
+
}
|