isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
|
@@ -0,0 +1,1886 @@
|
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
|
2
|
+
* of categorical variables and missing values.
|
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
|
4
|
+
*
|
|
5
|
+
* This library is based on the following works:
|
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
7
|
+
* "Isolation forest."
|
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
10
|
+
* "Isolation-based anomaly detection."
|
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
|
13
|
+
* "Extended Isolation Forest."
|
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
|
+
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
41
|
+
*
|
|
42
|
+
* BSD 2-Clause License
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
44
|
+
* All rights reserved.
|
|
45
|
+
* Redistribution and use in source and binary forms, with or without
|
|
46
|
+
* modification, are permitted provided that the following conditions are met:
|
|
47
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
|
48
|
+
* list of conditions and the following disclaimer.
|
|
49
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
50
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
51
|
+
* and/or other materials provided with the distribution.
|
|
52
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
53
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
54
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
55
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
56
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
57
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
58
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
59
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
60
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
61
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
62
|
+
*/
|
|
63
|
+
#include "isotree.hpp"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
/* Calculate distance or similarity or kernel/proximity between data points
|
|
67
|
+
*
|
|
68
|
+
* Parameters
|
|
69
|
+
* ==========
|
|
70
|
+
* - numeric_data[nrows * ncols_numeric]
|
|
71
|
+
* Pointer to numeric data for which to make calculations. If not using 'indexer', must be
|
|
72
|
+
* ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
|
|
73
|
+
* column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
|
|
74
|
+
* row-major or column-major format (with row-major being faster).
|
|
75
|
+
* If categorical data is passed, must be in the same storage order (row-major / column-major)
|
|
76
|
+
* as numerical data (whether dense or sparse).
|
|
77
|
+
* The column order must be the same as in the data that was used to fit the model.
|
|
78
|
+
* If making calculations between two sets of observations/rows (see documentation for 'rmat'),
|
|
79
|
+
* the first group is assumed to be the earlier rows here.
|
|
80
|
+
* Pass NULL if there are no dense numeric columns.
|
|
81
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
82
|
+
* - categ_data[nrows * ncols_categ]
|
|
83
|
+
* Pointer to categorical data for which to make calculations. If not using 'indexer', must be
|
|
84
|
+
* ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
|
|
85
|
+
* column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
|
|
86
|
+
* row-major or column-major format (with row-major being faster).
|
|
87
|
+
* If numerical data is passed, must be in the same storage order (row-major / column-major)
|
|
88
|
+
* as categorical data (whether the numerical data is dense or sparse).
|
|
89
|
+
* Each category should be represented as an integer, and these integers must start at zero and
|
|
90
|
+
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
|
91
|
+
* present when the model was fit (note that they are not treated as being ordinal, this is just
|
|
92
|
+
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
|
93
|
+
* must be the same as was used in the data to which the model was fit.
|
|
94
|
+
* Pass NULL if there are no categorical columns.
|
|
95
|
+
* If making calculations between two sets of observations/rows (see documentation for 'rmat'),
|
|
96
|
+
* the first group is assumed to be the earlier rows here.
|
|
97
|
+
* - Xc[nnz]
|
|
98
|
+
* Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed),
|
|
99
|
+
* or optionally in CSR format (row-compressed) if using 'indexer' and passing 'is_col_major=false'
|
|
100
|
+
* (not recommended as the calculations will be slower if sparse data is passed as CSR).
|
|
101
|
+
* If categorical data is passed, must be in the same storage order (row-major or CSR / column-major or CSC)
|
|
102
|
+
* as numerical data (whether dense or sparse).
|
|
103
|
+
* Pass NULL if there are no sparse numeric columns.
|
|
104
|
+
* Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
|
|
105
|
+
* - Xc_ind[nnz]
|
|
106
|
+
* Pointer to row indices to which each non-zero entry in 'Xc' corresponds
|
|
107
|
+
* (column indices if 'Xc' is in CSR format).
|
|
108
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
109
|
+
* Pass NULL if there are no sparse numeric columns in CSC or CSR format.
|
|
110
|
+
* - Xc_indptr[ncols_numeric + 1]
|
|
111
|
+
* Pointer to column index pointers that tell at entry [col] where does column 'col'
|
|
112
|
+
* start and at entry [col + 1] where does column 'col' end
|
|
113
|
+
* (row index pointers if 'Xc' is passed in CSR format).
|
|
114
|
+
* Pass NULL if there are no sparse numeric columns in CSC or CSR format.
|
|
115
|
+
* If making calculations between two sets of observations/rows (see documentation for 'rmat'),
|
|
116
|
+
* the first group is assumed to be the earlier rows here.
|
|
117
|
+
* - nrows
|
|
118
|
+
* Number of rows in 'numeric_data', 'Xc', 'categ_data'.
|
|
119
|
+
* - use_long_double
|
|
120
|
+
* Whether to use 'long double' (extended precision) type for the calculations. This makes them
|
|
121
|
+
* more accurate (provided that the compiler used has wider long doubles than doubles), but
|
|
122
|
+
* slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
|
|
123
|
+
* Power8 platforms).
|
|
124
|
+
* - nthreads
|
|
125
|
+
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
126
|
+
* allocated, even if the thread does not end up being used (with one exception being kernel calculations
|
|
127
|
+
* with respect to reference points in an indexer). Ignored when not building with OpenMP support.
|
|
128
|
+
* - assume_full_distr
|
|
129
|
+
* Whether to assume that the fitted model represents a full population distribution (will use a
|
|
130
|
+
* standardizing criterion assuming infinite sample, and the results of the similarity between two points
|
|
131
|
+
* at prediction time will not depend on the presence of any third point that is similar to them, but will
|
|
132
|
+
* differ more compared to the pairwise distances between points from which the model was fit). If passing
|
|
133
|
+
* 'false', will calculate pairwise distances as if the new observations at prediction time were added to
|
|
134
|
+
* the sample to which each tree was fit, which will make the distances between two points potentially vary
|
|
135
|
+
* according to other newly introduced points.
|
|
136
|
+
* This was added for experimentation purposes only and it's not recommended to pass 'false'.
|
|
137
|
+
* Note that when calculating distances using 'indexer', there
|
|
138
|
+
* might be slight discrepancies between the numbers produced with or without the indexer due to what
|
|
139
|
+
* are considered "additional" observations in this calculation.
|
|
140
|
+
* This is ignored when passing 'as_kernel=true'.
|
|
141
|
+
* - standardize_dist
|
|
142
|
+
* Whether to standardize the resulting average separation depths between rows according
|
|
143
|
+
* to the expected average separation depth in a similar way as when predicting outlierness,
|
|
144
|
+
* in order to obtain a standardized distance. If passing 'false', will output the average
|
|
145
|
+
* separation depth instead.
|
|
146
|
+
* If passing 'as_kernel=true', this indicates whether to output a fraction (if 'true') or
|
|
147
|
+
* the raw number of matching trees (if 'false').
|
|
148
|
+
* - as_kernel
|
|
149
|
+
* Whether to calculate the "similarities" as isolation kernel or proximity matrix, which counts
|
|
150
|
+
* the proportion of trees in which two observations end up in the same terminal node. This is
|
|
151
|
+
* typically much faster than separation-based distance, but is typically not as good quality.
|
|
152
|
+
* Note that, for kernel calculations, the indexer is only used if it has reference points stored on it.
|
|
153
|
+
* - model_outputs
|
|
154
|
+
* Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
|
|
155
|
+
* if the calculations are to be made from an extended model. Can only pass one of
|
|
156
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
157
|
+
* - model_outputs_ext
|
|
158
|
+
* Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
|
|
159
|
+
* if the calculations are to be made from a single-variable model. Can only pass one of
|
|
160
|
+
* 'model_outputs' and 'model_outputs_ext'.
|
|
161
|
+
* - tmat[nrows * (nrows - 1) / 2] (out)
|
|
162
|
+
* Pointer to array where the resulting pairwise distances or average separation depths or kernels will
|
|
163
|
+
* be written into. As the output is a symmetric matrix, this function will only fill in the
|
|
164
|
+
* upper-triangular part, in which entry 0 <= i < j < n will be located at position
|
|
165
|
+
* p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
|
|
166
|
+
* Can be converted to a dense square matrix through function 'tmat_to_dense'.
|
|
167
|
+
* The array must already be initialized to zeros.
|
|
168
|
+
* If calculating distance/separation from a group of points to another group of points,
|
|
169
|
+
* pass NULL here and use 'rmat' instead.
|
|
170
|
+
* - rmat[nrows1 * nrows2] (out)
|
|
171
|
+
* Pointer to array where to write the distances or separation depths or kernels between each row in
|
|
172
|
+
* one set of observations and each row in a different set of observations. If doing these
|
|
173
|
+
* calculations for all pairs of observations/rows, pass 'tmat' instead.
|
|
174
|
+
* Will take the first group of observations as the rows in this matrix, and the second
|
|
175
|
+
* group as the columns. The groups are assumed to be in the same data arrays, with the
|
|
176
|
+
* first group corresponding to the earlier rows there.
|
|
177
|
+
* This matrix will be used in row-major order (i.e. entries 1..nrows2 contain the first row from nrows1).
|
|
178
|
+
* Must be already initialized to zeros.
|
|
179
|
+
* If passing 'use_indexed_references=true' plus an indexer object with reference points, this
|
|
180
|
+
* array should have dimension [nrows, n_references].
|
|
181
|
+
* Ignored when 'tmat' is passed.
|
|
182
|
+
* - n_from
|
|
183
|
+
* When calculating distances between two groups of points, this indicates the number of
|
|
184
|
+
* observations/rows belonging to the first group (the rows in 'rmat'), which will be
|
|
185
|
+
* assumed to be the first 'n_from' rows.
|
|
186
|
+
* Ignored when 'tmat' is passed or when 'use_indexed_references=true' plus an indexer with
|
|
187
|
+
* references are passed.
|
|
188
|
+
* - use_indexed_references
|
|
189
|
+
* Whether to calculate distances with respect to reference points stored in the indexer
|
|
190
|
+
* object, if it has any. This is only supported with 'assume_full_distr=true' or with 'as_kernel=true'.
|
|
191
|
+
* If passing 'use_indexed_references=true', then 'tmat' must be NULL, and 'rmat' must
|
|
192
|
+
* be of dimension [nrows, n_references].
|
|
193
|
+
* - indexer
|
|
194
|
+
* Pointer to associated tree indexer for the model being used, if it was constructed,
|
|
195
|
+
* which can be used to speed up distance calculations, assuming that it was built with
|
|
196
|
+
* option 'with_distances=true'. If it does not contain node distances, it will not be used.
|
|
197
|
+
* Pass NULL if the indexer has not been constructed or was constructed with 'with_distances=false'.
|
|
198
|
+
* If it contains reference points and passing 'use_indexed_references=true', distances will be
|
|
199
|
+
* calculated between the input data passed here and the reference points stored in this object.
|
|
200
|
+
* If passing 'as_kernel=true', the indexer can only be used for calculating kernels with respect to
|
|
201
|
+
* reference points in the indexer, otherwise it will not be used (which also means that the data must be
|
|
202
|
+
* passed in column-major order for all kernel calculations that are not with respect to reference points
|
|
203
|
+
* from an indexer).
|
|
204
|
+
* - is_col_major
|
|
205
|
+
* Whether the data comes in column-major order. If using 'indexer', predictions are also possible
|
|
206
|
+
* (and are even faster for the case of dense-only data) if passing the data in row-major format.
|
|
207
|
+
* Without 'indexer' (and with 'as_kernel=true' but without reference points in the indexer), data
|
|
208
|
+
* may only be passed in column-major format.
|
|
209
|
+
* If there is sparse numeric data, it is highly suggested to pass it in CSC/column-major format.
|
|
210
|
+
* - ld_numeric
|
|
211
|
+
* If passing 'is_col_major=false', this indicates the leading dimension of the array 'numeric_data'.
|
|
212
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
213
|
+
* be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
|
|
214
|
+
* 'numeric_data' in column-major order, this is ignored and will be assumed that the
|
|
215
|
+
* leading dimension corresponds to the number of rows. This is ignored when passing numeric
|
|
216
|
+
* data in sparse format.
|
|
217
|
+
* Note that data in row-major order is only accepted when using 'indexer'.
|
|
218
|
+
* - ld_categ
|
|
219
|
+
* If passing 'is_col_major=false', this indicates the leading dimension of the array 'categ_data'.
|
|
220
|
+
* Typically, this corresponds to the number of columns, but may be larger (the array will
|
|
221
|
+
* be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
|
|
222
|
+
* 'categ_data' in column-major order, this is ignored and will be assumed that the
|
|
223
|
+
* leading dimension corresponds to the number of rows.
|
|
224
|
+
* Note that data in row-major order is only accepted when using 'indexer'.
|
|
225
|
+
*/
|
|
226
|
+
template <class real_t, class sparse_ix>
|
|
227
|
+
void calc_similarity(real_t numeric_data[], int categ_data[],
|
|
228
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
229
|
+
size_t nrows, bool use_long_double, int nthreads,
|
|
230
|
+
bool assume_full_distr, bool standardize_dist, bool as_kernel,
|
|
231
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
232
|
+
double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
|
|
233
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ)
|
|
234
|
+
{
|
|
235
|
+
if (use_long_double && !has_long_double()) {
|
|
236
|
+
use_long_double = false;
|
|
237
|
+
fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
|
|
238
|
+
}
|
|
239
|
+
#ifndef NO_LONG_DOUBLE
|
|
240
|
+
if (likely(!use_long_double))
|
|
241
|
+
#endif
|
|
242
|
+
calc_similarity_internal<real_t, sparse_ix, double>(
|
|
243
|
+
numeric_data, categ_data,
|
|
244
|
+
Xc, Xc_ind, Xc_indptr,
|
|
245
|
+
nrows, nthreads,
|
|
246
|
+
assume_full_distr, standardize_dist, as_kernel,
|
|
247
|
+
model_outputs, model_outputs_ext,
|
|
248
|
+
tmat, rmat, n_from, use_indexed_references,
|
|
249
|
+
indexer, is_col_major, ld_numeric, ld_categ
|
|
250
|
+
);
|
|
251
|
+
#ifndef NO_LONG_DOUBLE
|
|
252
|
+
else
|
|
253
|
+
calc_similarity_internal<real_t, sparse_ix, long double>(
|
|
254
|
+
numeric_data, categ_data,
|
|
255
|
+
Xc, Xc_ind, Xc_indptr,
|
|
256
|
+
nrows, nthreads,
|
|
257
|
+
assume_full_distr, standardize_dist, as_kernel,
|
|
258
|
+
model_outputs, model_outputs_ext,
|
|
259
|
+
tmat, rmat, n_from, use_indexed_references,
|
|
260
|
+
indexer, is_col_major, ld_numeric, ld_categ
|
|
261
|
+
);
|
|
262
|
+
#endif
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
266
|
+
void calc_similarity_internal(
|
|
267
|
+
real_t numeric_data[], int categ_data[],
|
|
268
|
+
real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
|
269
|
+
size_t nrows, int nthreads,
|
|
270
|
+
bool assume_full_distr, bool standardize_dist, bool as_kernel,
|
|
271
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
272
|
+
double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
|
|
273
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ)
|
|
274
|
+
{
|
|
275
|
+
if (nrows < 2 && (!use_indexed_references || indexer == NULL || indexer->indices.empty() || indexer->indices.front().reference_points.empty()))
|
|
276
|
+
throw std::runtime_error("Cannot calculate distances from less than 2 rows.\n");
|
|
277
|
+
if (as_kernel && (tmat != NULL || !use_indexed_references || (indexer != NULL && !indexer->indices.empty() && indexer->indices.front().reference_points.empty())))
|
|
278
|
+
indexer = NULL;
|
|
279
|
+
|
|
280
|
+
if (indexer != NULL && model_outputs != NULL)
|
|
281
|
+
{
|
|
282
|
+
if (model_outputs->missing_action == Divide) {
|
|
283
|
+
indexer = NULL;
|
|
284
|
+
if (use_indexed_references) throw std::runtime_error("Invalid indexer - cannot use references from it.\n");
|
|
285
|
+
}
|
|
286
|
+
if (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && categ_data != NULL) {
|
|
287
|
+
indexer = NULL;
|
|
288
|
+
if (use_indexed_references) throw std::runtime_error("Invalid indexer - cannot use references from it.\n");
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
if (
|
|
292
|
+
!as_kernel &&
|
|
293
|
+
indexer != NULL &&
|
|
294
|
+
(indexer->indices.empty() || indexer->indices.front().node_distances.empty())
|
|
295
|
+
) {
|
|
296
|
+
if (use_indexed_references && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty())
|
|
297
|
+
throw std::runtime_error("Indexer was built without distances. Cannot use references from it.\n");
|
|
298
|
+
else {
|
|
299
|
+
indexer = NULL;
|
|
300
|
+
fprintf(stderr, "Indexer has no pre-computed distances, will not be used for distance calculations.\n");
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
if (
|
|
304
|
+
!is_col_major &&
|
|
305
|
+
indexer == NULL &&
|
|
306
|
+
(
|
|
307
|
+
Xc_indptr != NULL
|
|
308
|
+
||
|
|
309
|
+
(nrows != 1 &&
|
|
310
|
+
((numeric_data != NULL && ld_numeric > 1) || (categ_data != NULL && ld_categ > 1)))
|
|
311
|
+
)
|
|
312
|
+
)
|
|
313
|
+
throw std::runtime_error("Cannot calculate distances with row-major data without indexer.\n");
|
|
314
|
+
if (indexer != NULL)
|
|
315
|
+
{
|
|
316
|
+
if (use_indexed_references && tmat == NULL && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty())
|
|
317
|
+
{
|
|
318
|
+
if (unlikely(!assume_full_distr))
|
|
319
|
+
throw std::runtime_error("Cannot calculate distances to reference points in indexer with 'assume_full_distr=false'.\n");
|
|
320
|
+
|
|
321
|
+
if (!as_kernel)
|
|
322
|
+
{
|
|
323
|
+
calc_similarity_from_indexer_with_references(
|
|
324
|
+
numeric_data, categ_data,
|
|
325
|
+
Xc, Xc_ind, Xc_indptr,
|
|
326
|
+
nrows, nthreads, standardize_dist,
|
|
327
|
+
model_outputs, model_outputs_ext,
|
|
328
|
+
rmat,
|
|
329
|
+
indexer, is_col_major, ld_numeric, ld_categ
|
|
330
|
+
);
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
else
|
|
334
|
+
{
|
|
335
|
+
kernel_to_references(*indexer,
|
|
336
|
+
model_outputs, model_outputs_ext,
|
|
337
|
+
numeric_data, categ_data,
|
|
338
|
+
Xc, Xc_ind, Xc_indptr,
|
|
339
|
+
is_col_major, ld_numeric, ld_categ,
|
|
340
|
+
nrows, nthreads,
|
|
341
|
+
rmat,
|
|
342
|
+
standardize_dist);
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
else
|
|
347
|
+
{
|
|
348
|
+
if (as_kernel) goto skip_indexer_if_kernel;
|
|
349
|
+
calc_similarity_from_indexer(
|
|
350
|
+
numeric_data, categ_data,
|
|
351
|
+
Xc, Xc_ind, Xc_indptr,
|
|
352
|
+
nrows, nthreads, assume_full_distr, standardize_dist,
|
|
353
|
+
model_outputs, model_outputs_ext,
|
|
354
|
+
tmat, rmat, n_from,
|
|
355
|
+
indexer, is_col_major, ld_numeric, ld_categ
|
|
356
|
+
);
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
return;
|
|
360
|
+
}
|
|
361
|
+
skip_indexer_if_kernel:
|
|
362
|
+
|
|
363
|
+
PredictionData<real_t, sparse_ix>
|
|
364
|
+
prediction_data = {numeric_data, categ_data, nrows,
|
|
365
|
+
false, 0, 0,
|
|
366
|
+
Xc, Xc_ind, Xc_indptr,
|
|
367
|
+
NULL, NULL, NULL};
|
|
368
|
+
|
|
369
|
+
size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
|
|
370
|
+
|
|
371
|
+
if (tmat != NULL) n_from = 0;
|
|
372
|
+
|
|
373
|
+
if (n_from == 0) {
|
|
374
|
+
#if SIZE_MAX == UINT32_MAX
|
|
375
|
+
size_t lim_rows = (size_t)UINT16_MAX - (size_t)1;
|
|
376
|
+
#elif SIZE_MAX == UINT64_MAX
|
|
377
|
+
size_t lim_rows = (size_t)UINT32_MAX - (size_t)1;
|
|
378
|
+
#else
|
|
379
|
+
size_t lim_rows = (size_t)std::ceil(std::sqrt((ldouble_safe)SIZE_MAX));
|
|
380
|
+
#endif
|
|
381
|
+
if (nrows > lim_rows)
|
|
382
|
+
throw std::runtime_error("Number of rows implies too large distance matrix (integer overflow).");
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
if ((size_t)nthreads > ntrees)
|
|
386
|
+
nthreads = (int)ntrees;
|
|
387
|
+
#ifdef _OPENMP
|
|
388
|
+
std::vector<WorkerForSimilarity> worker_memory(nthreads);
|
|
389
|
+
#else
|
|
390
|
+
std::vector<WorkerForSimilarity> worker_memory(1);
|
|
391
|
+
nthreads = 1;
|
|
392
|
+
#endif
|
|
393
|
+
|
|
394
|
+
/* Global variable that determines if the procedure receives a stop signal */
|
|
395
|
+
SignalSwitcher ss = SignalSwitcher();
|
|
396
|
+
check_interrupt_switch(ss);
|
|
397
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
|
398
|
+
if (interrupt_switch) return;
|
|
399
|
+
#endif
|
|
400
|
+
/* For handling exceptions */
|
|
401
|
+
bool threw_exception = false;
|
|
402
|
+
std::exception_ptr ex = NULL;
|
|
403
|
+
|
|
404
|
+
if (
|
|
405
|
+
tmat == NULL &&
|
|
406
|
+
use_indexed_references &&
|
|
407
|
+
indexer != NULL &&
|
|
408
|
+
!indexer->indices.empty() &&
|
|
409
|
+
!indexer->indices.front().reference_points.empty() &&
|
|
410
|
+
(as_kernel || !indexer->indices.front().node_distances.empty())
|
|
411
|
+
) {
|
|
412
|
+
n_from = indexer->indices.front().reference_points.size();
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
if (model_outputs != NULL)
|
|
416
|
+
{
|
|
417
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
|
418
|
+
shared(ntrees, worker_memory, prediction_data, model_outputs, ex, threw_exception, n_from)
|
|
419
|
+
for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
|
|
420
|
+
{
|
|
421
|
+
if (threw_exception || interrupt_switch) continue;
|
|
422
|
+
try
|
|
423
|
+
{
|
|
424
|
+
initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
|
|
425
|
+
model_outputs, NULL, n_from, assume_full_distr);
|
|
426
|
+
traverse_tree_sim<PredictionData<real_t, sparse_ix>, ldouble_safe>(
|
|
427
|
+
worker_memory[omp_get_thread_num()],
|
|
428
|
+
prediction_data,
|
|
429
|
+
*model_outputs,
|
|
430
|
+
model_outputs->trees[tree],
|
|
431
|
+
(size_t)0,
|
|
432
|
+
as_kernel);
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
catch (...)
|
|
436
|
+
{
|
|
437
|
+
#pragma omp critical
|
|
438
|
+
{
|
|
439
|
+
if (!threw_exception)
|
|
440
|
+
{
|
|
441
|
+
threw_exception = true;
|
|
442
|
+
ex = std::current_exception();
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
else
|
|
450
|
+
{
|
|
451
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
|
452
|
+
shared(ntrees, worker_memory, prediction_data, model_outputs_ext, ex, threw_exception, n_from)
|
|
453
|
+
for (size_t_for hplane = 0; hplane < (decltype(hplane))ntrees; hplane++)
|
|
454
|
+
{
|
|
455
|
+
if (threw_exception || interrupt_switch) continue;
|
|
456
|
+
try
|
|
457
|
+
{
|
|
458
|
+
initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
|
|
459
|
+
NULL, model_outputs_ext, n_from, assume_full_distr);
|
|
460
|
+
traverse_hplane_sim<PredictionData<real_t, sparse_ix>, ldouble_safe>(
|
|
461
|
+
worker_memory[omp_get_thread_num()],
|
|
462
|
+
prediction_data,
|
|
463
|
+
*model_outputs_ext,
|
|
464
|
+
model_outputs_ext->hplanes[hplane],
|
|
465
|
+
(size_t)0,
|
|
466
|
+
as_kernel);
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
catch (...)
|
|
470
|
+
{
|
|
471
|
+
#pragma omp critical
|
|
472
|
+
{
|
|
473
|
+
if (!threw_exception)
|
|
474
|
+
{
|
|
475
|
+
threw_exception = true;
|
|
476
|
+
ex = std::current_exception();
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
check_interrupt_switch(ss);
|
|
484
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
|
485
|
+
if (interrupt_switch) return;
|
|
486
|
+
#endif
|
|
487
|
+
|
|
488
|
+
if (threw_exception)
|
|
489
|
+
std::rethrow_exception(ex);
|
|
490
|
+
|
|
491
|
+
/* gather and transform the results */
|
|
492
|
+
gather_sim_result< PredictionData<real_t, sparse_ix>,
|
|
493
|
+
InputData<real_t, sparse_ix>,
|
|
494
|
+
WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t> >
|
|
495
|
+
(&worker_memory, NULL,
|
|
496
|
+
&prediction_data, NULL,
|
|
497
|
+
model_outputs, model_outputs_ext,
|
|
498
|
+
tmat, rmat, n_from,
|
|
499
|
+
ntrees, assume_full_distr,
|
|
500
|
+
standardize_dist, as_kernel, nthreads);
|
|
501
|
+
|
|
502
|
+
check_interrupt_switch(ss);
|
|
503
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
|
504
|
+
if (interrupt_switch) return;
|
|
505
|
+
#endif
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
template <class PredictionData, class ldouble_safe>
|
|
509
|
+
void traverse_tree_sim(WorkerForSimilarity &workspace,
|
|
510
|
+
PredictionData &prediction_data,
|
|
511
|
+
IsoForest &model_outputs,
|
|
512
|
+
std::vector<IsoTree> &trees,
|
|
513
|
+
size_t curr_tree,
|
|
514
|
+
const bool as_kernel)
|
|
515
|
+
{
|
|
516
|
+
if (interrupt_switch)
|
|
517
|
+
return;
|
|
518
|
+
|
|
519
|
+
if (workspace.st == workspace.end)
|
|
520
|
+
return;
|
|
521
|
+
|
|
522
|
+
if (workspace.tmat_sep.empty())
|
|
523
|
+
{
|
|
524
|
+
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
|
525
|
+
if (workspace.ix_arr[workspace.st] >= workspace.n_from)
|
|
526
|
+
return;
|
|
527
|
+
if (workspace.ix_arr[workspace.end] < workspace.n_from)
|
|
528
|
+
return;
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
/* Note: the first separation step will not be added here, as it simply consists of adding +1
|
|
532
|
+
to every combination regardless. It has to be added at the end in 'gather_sim_result' to
|
|
533
|
+
obtain the average separation depth. */
|
|
534
|
+
if (trees[curr_tree].tree_left == 0)
|
|
535
|
+
{
|
|
536
|
+
ldouble_safe rem = (ldouble_safe) trees[curr_tree].remainder;
|
|
537
|
+
if (workspace.weights_arr.empty())
|
|
538
|
+
{
|
|
539
|
+
if (!as_kernel)
|
|
540
|
+
{
|
|
541
|
+
rem += (ldouble_safe)(workspace.end - workspace.st + 1);
|
|
542
|
+
if (!workspace.tmat_sep.empty())
|
|
543
|
+
increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
544
|
+
prediction_data.nrows, workspace.tmat_sep.data(),
|
|
545
|
+
workspace.assume_full_distr? 3. : expected_separation_depth(rem));
|
|
546
|
+
else if (!workspace.rmat.empty())
|
|
547
|
+
increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
548
|
+
workspace.n_from, prediction_data.nrows, workspace.rmat.data(),
|
|
549
|
+
workspace.assume_full_distr? 3. : expected_separation_depth(rem));
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
else
|
|
553
|
+
{
|
|
554
|
+
if (!workspace.tmat_sep.empty())
|
|
555
|
+
{
|
|
556
|
+
size_t i_, j_;
|
|
557
|
+
for (size_t i = workspace.st; i < workspace.end; i++)
|
|
558
|
+
{
|
|
559
|
+
i_ = workspace.ix_arr[i];
|
|
560
|
+
for (size_t j = i + 1; j <= workspace.end; j++)
|
|
561
|
+
{
|
|
562
|
+
j_ = workspace.ix_arr[j];
|
|
563
|
+
workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]++;
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
else if (!workspace.rmat.empty())
|
|
569
|
+
{
|
|
570
|
+
size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
|
|
571
|
+
std::lower_bound(workspace.ix_arr.begin() + workspace.st,
|
|
572
|
+
workspace.ix_arr.begin() + workspace.end + 1,
|
|
573
|
+
workspace.n_from));
|
|
574
|
+
double *restrict rmat_this;
|
|
575
|
+
for (size_t i = workspace.st; i < workspace.st + n_group; i++)
|
|
576
|
+
{
|
|
577
|
+
rmat_this = workspace.rmat.data() + workspace.ix_arr[i]*workspace.n_from;
|
|
578
|
+
for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
|
|
579
|
+
{
|
|
580
|
+
rmat_this[workspace.ix_arr[j] - workspace.n_from]++;
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
else
|
|
588
|
+
{
|
|
589
|
+
if (!as_kernel)
|
|
590
|
+
{
|
|
591
|
+
if (!workspace.assume_full_distr)
|
|
592
|
+
{
|
|
593
|
+
rem += std::accumulate(workspace.ix_arr.begin() + workspace.st,
|
|
594
|
+
workspace.ix_arr.begin() + workspace.end,
|
|
595
|
+
(ldouble_safe) 0.,
|
|
596
|
+
[&workspace](ldouble_safe curr, size_t ix)
|
|
597
|
+
{return curr + (ldouble_safe)workspace.weights_arr[ix];}
|
|
598
|
+
);
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
if (!workspace.tmat_sep.empty())
|
|
602
|
+
increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
603
|
+
prediction_data.nrows, workspace.tmat_sep.data(),
|
|
604
|
+
workspace.weights_arr.data(),
|
|
605
|
+
workspace.assume_full_distr? 3. : expected_separation_depth(rem));
|
|
606
|
+
else if (!workspace.rmat.empty())
|
|
607
|
+
increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
608
|
+
workspace.n_from, prediction_data.nrows,
|
|
609
|
+
workspace.rmat.data(), workspace.weights_arr.data(),
|
|
610
|
+
workspace.assume_full_distr? 3. : expected_separation_depth(rem));
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
else
|
|
614
|
+
{
|
|
615
|
+
if (!workspace.tmat_sep.empty())
|
|
616
|
+
{
|
|
617
|
+
size_t i_, j_;
|
|
618
|
+
double w_this;
|
|
619
|
+
for (size_t i = workspace.st; i < workspace.end; i++)
|
|
620
|
+
{
|
|
621
|
+
i_ = workspace.ix_arr[i];
|
|
622
|
+
w_this = workspace.weights_arr[i_];
|
|
623
|
+
for (size_t j = i + 1; j <= workspace.end; j++)
|
|
624
|
+
{
|
|
625
|
+
j_ = workspace.ix_arr[j];
|
|
626
|
+
workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]
|
|
627
|
+
+=
|
|
628
|
+
w_this * workspace.weights_arr[j_];
|
|
629
|
+
}
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
else if (!workspace.rmat.empty())
|
|
634
|
+
{
|
|
635
|
+
size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
|
|
636
|
+
std::lower_bound(workspace.ix_arr.begin() + workspace.st,
|
|
637
|
+
workspace.ix_arr.begin() + workspace.end + 1,
|
|
638
|
+
workspace.n_from));
|
|
639
|
+
double *restrict rmat_this;
|
|
640
|
+
double w_this;
|
|
641
|
+
size_t i_, j_;
|
|
642
|
+
for (size_t i = workspace.st; i < workspace.st + n_group; i++)
|
|
643
|
+
{
|
|
644
|
+
i_ = workspace.ix_arr[i];
|
|
645
|
+
rmat_this = workspace.rmat.data() + i_*workspace.n_from;
|
|
646
|
+
w_this = workspace.weights_arr[i_];
|
|
647
|
+
for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
|
|
648
|
+
{
|
|
649
|
+
j_ = workspace.ix_arr[j];
|
|
650
|
+
rmat_this[j_ - workspace.n_from]
|
|
651
|
+
+=
|
|
652
|
+
w_this * workspace.weights_arr[j_];
|
|
653
|
+
}
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
return;
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
else if (curr_tree > 0 && !as_kernel)
|
|
662
|
+
{
|
|
663
|
+
if (!workspace.tmat_sep.empty())
|
|
664
|
+
{
|
|
665
|
+
if (workspace.weights_arr.empty())
|
|
666
|
+
increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
667
|
+
prediction_data.nrows, workspace.tmat_sep.data(), -1.);
|
|
668
|
+
else
|
|
669
|
+
increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
670
|
+
prediction_data.nrows, workspace.tmat_sep.data(),
|
|
671
|
+
workspace.weights_arr.data(), -1.);
|
|
672
|
+
}
|
|
673
|
+
else if (!workspace.rmat.empty())
|
|
674
|
+
{
|
|
675
|
+
if (workspace.weights_arr.empty())
|
|
676
|
+
increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
677
|
+
workspace.n_from, prediction_data.nrows, workspace.rmat.data(), -1.);
|
|
678
|
+
else
|
|
679
|
+
increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
680
|
+
workspace.n_from, prediction_data.nrows,
|
|
681
|
+
workspace.rmat.data(), workspace.weights_arr.data(), -1.);
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
/* divide according to tree */
|
|
687
|
+
if (prediction_data.Xc_indptr != NULL && !workspace.tmat_sep.empty())
|
|
688
|
+
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
|
689
|
+
size_t st_NA, end_NA, split_ix;
|
|
690
|
+
switch (trees[curr_tree].col_type)
|
|
691
|
+
{
|
|
692
|
+
case Numeric:
|
|
693
|
+
{
|
|
694
|
+
if (prediction_data.Xc_indptr == NULL)
|
|
695
|
+
divide_subset_split(workspace.ix_arr.data(),
|
|
696
|
+
prediction_data.numeric_data + prediction_data.nrows * trees[curr_tree].col_num,
|
|
697
|
+
workspace.st, workspace.end, trees[curr_tree].num_split,
|
|
698
|
+
model_outputs.missing_action, st_NA, end_NA, split_ix);
|
|
699
|
+
else
|
|
700
|
+
divide_subset_split(workspace.ix_arr.data(), workspace.st, workspace.end, trees[curr_tree].col_num,
|
|
701
|
+
prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
|
|
702
|
+
trees[curr_tree].num_split, model_outputs.missing_action,
|
|
703
|
+
st_NA, end_NA, split_ix);
|
|
704
|
+
break;
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
case Categorical:
|
|
708
|
+
{
|
|
709
|
+
switch(model_outputs.cat_split_type)
|
|
710
|
+
{
|
|
711
|
+
case SingleCateg:
|
|
712
|
+
{
|
|
713
|
+
divide_subset_split(workspace.ix_arr.data(),
|
|
714
|
+
prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
|
|
715
|
+
workspace.st, workspace.end, trees[curr_tree].chosen_cat,
|
|
716
|
+
model_outputs.missing_action, st_NA, end_NA, split_ix);
|
|
717
|
+
break;
|
|
718
|
+
}
|
|
719
|
+
|
|
720
|
+
case SubSet:
|
|
721
|
+
{
|
|
722
|
+
if (!trees[curr_tree].cat_split.size())
|
|
723
|
+
divide_subset_split(workspace.ix_arr.data(),
|
|
724
|
+
prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
|
|
725
|
+
workspace.st, workspace.end,
|
|
726
|
+
model_outputs.missing_action, model_outputs.new_cat_action,
|
|
727
|
+
trees[curr_tree].pct_tree_left < .5, st_NA, end_NA, split_ix);
|
|
728
|
+
else
|
|
729
|
+
divide_subset_split(workspace.ix_arr.data(),
|
|
730
|
+
prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
|
|
731
|
+
workspace.st, workspace.end, trees[curr_tree].cat_split.data(),
|
|
732
|
+
(int) trees[curr_tree].cat_split.size(),
|
|
733
|
+
model_outputs.missing_action, model_outputs.new_cat_action,
|
|
734
|
+
(bool)(trees[curr_tree].pct_tree_left < .5), st_NA, end_NA, split_ix);
|
|
735
|
+
break;
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
break;
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
default:
|
|
742
|
+
{
|
|
743
|
+
assert(0);
|
|
744
|
+
break;
|
|
745
|
+
}
|
|
746
|
+
}
|
|
747
|
+
|
|
748
|
+
|
|
749
|
+
/* continue splitting recursively */
|
|
750
|
+
size_t orig_end = workspace.end;
|
|
751
|
+
if (model_outputs.new_cat_action == Weighted && model_outputs.cat_split_type == SubSet && prediction_data.categ_data != NULL) {
|
|
752
|
+
if (model_outputs.missing_action == Fail && trees[curr_tree].col_type == Numeric) {
|
|
753
|
+
st_NA = split_ix;
|
|
754
|
+
end_NA = split_ix;
|
|
755
|
+
}
|
|
756
|
+
goto missing_action_divide;
|
|
757
|
+
}
|
|
758
|
+
switch (model_outputs.missing_action)
|
|
759
|
+
{
|
|
760
|
+
case Impute:
|
|
761
|
+
{
|
|
762
|
+
split_ix = (trees[curr_tree].pct_tree_left >= .5)? end_NA : st_NA;
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
case Fail:
|
|
766
|
+
{
|
|
767
|
+
if (split_ix > workspace.st)
|
|
768
|
+
{
|
|
769
|
+
workspace.end = split_ix - 1;
|
|
770
|
+
traverse_tree_sim<PredictionData, ldouble_safe>(
|
|
771
|
+
workspace,
|
|
772
|
+
prediction_data,
|
|
773
|
+
model_outputs,
|
|
774
|
+
trees,
|
|
775
|
+
trees[curr_tree].tree_left,
|
|
776
|
+
as_kernel);
|
|
777
|
+
}
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
if (split_ix <= orig_end)
|
|
781
|
+
{
|
|
782
|
+
workspace.st = split_ix;
|
|
783
|
+
workspace.end = orig_end;
|
|
784
|
+
traverse_tree_sim<PredictionData, ldouble_safe>(
|
|
785
|
+
workspace,
|
|
786
|
+
prediction_data,
|
|
787
|
+
model_outputs,
|
|
788
|
+
trees,
|
|
789
|
+
trees[curr_tree].tree_right,
|
|
790
|
+
as_kernel);
|
|
791
|
+
}
|
|
792
|
+
break;
|
|
793
|
+
}
|
|
794
|
+
|
|
795
|
+
case Divide: /* new_cat_action = 'Weighted' will also fall here */
|
|
796
|
+
{
|
|
797
|
+
/* TODO: this one should also have a parameter 'changed_weoghts' like during fitting */
|
|
798
|
+
missing_action_divide:
|
|
799
|
+
/* TODO: maybe here it shouldn't copy the whole ix_arr,
|
|
800
|
+
but then it'd need to re-generate it from outside too */
|
|
801
|
+
std::vector<double> weights_arr;
|
|
802
|
+
std::vector<size_t> ix_arr;
|
|
803
|
+
if (end_NA > workspace.st)
|
|
804
|
+
{
|
|
805
|
+
weights_arr.assign(workspace.weights_arr.begin(),
|
|
806
|
+
workspace.weights_arr.begin() + end_NA);
|
|
807
|
+
ix_arr.assign(workspace.ix_arr.begin(),
|
|
808
|
+
workspace.ix_arr.begin() + end_NA);
|
|
809
|
+
}
|
|
810
|
+
|
|
811
|
+
if (end_NA > workspace.st)
|
|
812
|
+
{
|
|
813
|
+
workspace.end = end_NA - 1;
|
|
814
|
+
for (size_t row = st_NA; row < end_NA; row++)
|
|
815
|
+
workspace.weights_arr[workspace.ix_arr[row]] *= trees[curr_tree].pct_tree_left;
|
|
816
|
+
traverse_tree_sim<PredictionData, ldouble_safe>(
|
|
817
|
+
workspace,
|
|
818
|
+
prediction_data,
|
|
819
|
+
model_outputs,
|
|
820
|
+
trees,
|
|
821
|
+
trees[curr_tree].tree_left,
|
|
822
|
+
as_kernel);
|
|
823
|
+
}
|
|
824
|
+
|
|
825
|
+
if (st_NA <= orig_end)
|
|
826
|
+
{
|
|
827
|
+
workspace.st = st_NA;
|
|
828
|
+
workspace.end = orig_end;
|
|
829
|
+
if (!weights_arr.empty())
|
|
830
|
+
{
|
|
831
|
+
std::copy(weights_arr.begin(),
|
|
832
|
+
weights_arr.end(),
|
|
833
|
+
workspace.weights_arr.begin());
|
|
834
|
+
std::copy(ix_arr.begin(),
|
|
835
|
+
ix_arr.end(),
|
|
836
|
+
workspace.ix_arr.begin());
|
|
837
|
+
weights_arr.clear();
|
|
838
|
+
weights_arr.shrink_to_fit();
|
|
839
|
+
ix_arr.clear();
|
|
840
|
+
ix_arr.shrink_to_fit();
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
for (size_t row = st_NA; row < end_NA; row++)
|
|
844
|
+
workspace.weights_arr[workspace.ix_arr[row]] *= (1. - trees[curr_tree].pct_tree_left);
|
|
845
|
+
traverse_tree_sim<PredictionData, ldouble_safe>(
|
|
846
|
+
workspace,
|
|
847
|
+
prediction_data,
|
|
848
|
+
model_outputs,
|
|
849
|
+
trees,
|
|
850
|
+
trees[curr_tree].tree_right,
|
|
851
|
+
as_kernel);
|
|
852
|
+
}
|
|
853
|
+
break;
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
template <class PredictionData, class ldouble_safe>
|
|
859
|
+
void traverse_hplane_sim(WorkerForSimilarity &workspace,
|
|
860
|
+
PredictionData &prediction_data,
|
|
861
|
+
ExtIsoForest &model_outputs,
|
|
862
|
+
std::vector<IsoHPlane> &hplanes,
|
|
863
|
+
size_t curr_tree,
|
|
864
|
+
const bool as_kernel)
|
|
865
|
+
{
|
|
866
|
+
if (interrupt_switch)
|
|
867
|
+
return;
|
|
868
|
+
|
|
869
|
+
if (workspace.st == workspace.end)
|
|
870
|
+
return;
|
|
871
|
+
|
|
872
|
+
if (workspace.tmat_sep.empty())
|
|
873
|
+
{
|
|
874
|
+
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
|
875
|
+
if (workspace.ix_arr[workspace.st] >= workspace.n_from)
|
|
876
|
+
return;
|
|
877
|
+
if (workspace.ix_arr[workspace.end] < workspace.n_from)
|
|
878
|
+
return;
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
/* Note: the first separation step will not be added here, as it simply consists of adding +1
|
|
882
|
+
to every combination regardless. It has to be added at the end in 'gather_sim_result' to
|
|
883
|
+
obtain the average separation depth. */
|
|
884
|
+
if (hplanes[curr_tree].hplane_left == 0)
|
|
885
|
+
{
|
|
886
|
+
if (!as_kernel)
|
|
887
|
+
{
|
|
888
|
+
if (!workspace.tmat_sep.empty())
|
|
889
|
+
increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
890
|
+
prediction_data.nrows, workspace.tmat_sep.data(),
|
|
891
|
+
workspace.assume_full_distr? 3. :
|
|
892
|
+
expected_separation_depth((ldouble_safe) hplanes[curr_tree].remainder
|
|
893
|
+
+ (ldouble_safe)(workspace.end - workspace.st + 1))
|
|
894
|
+
);
|
|
895
|
+
else if (!workspace.rmat.empty())
|
|
896
|
+
increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
|
|
897
|
+
prediction_data.nrows, workspace.rmat.data(),
|
|
898
|
+
workspace.assume_full_distr? 3. :
|
|
899
|
+
expected_separation_depth((ldouble_safe) hplanes[curr_tree].remainder
|
|
900
|
+
+ (ldouble_safe)(workspace.end - workspace.st + 1))
|
|
901
|
+
);
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
else
|
|
905
|
+
{
|
|
906
|
+
if (!workspace.tmat_sep.empty())
|
|
907
|
+
{
|
|
908
|
+
size_t i_, j_;
|
|
909
|
+
for (size_t i = workspace.st; i < workspace.end; i++)
|
|
910
|
+
{
|
|
911
|
+
i_ = workspace.ix_arr[i];
|
|
912
|
+
for (size_t j = i + 1; j <= workspace.end; j++)
|
|
913
|
+
{
|
|
914
|
+
j_ = workspace.ix_arr[j];
|
|
915
|
+
workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]++;
|
|
916
|
+
}
|
|
917
|
+
}
|
|
918
|
+
}
|
|
919
|
+
|
|
920
|
+
else if (!workspace.rmat.empty())
|
|
921
|
+
{
|
|
922
|
+
size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
|
|
923
|
+
std::lower_bound(workspace.ix_arr.begin() + workspace.st,
|
|
924
|
+
workspace.ix_arr.begin() + workspace.end + 1,
|
|
925
|
+
workspace.n_from));
|
|
926
|
+
double *restrict rmat_this;
|
|
927
|
+
for (size_t i = workspace.st; i < workspace.st + n_group; i++)
|
|
928
|
+
{
|
|
929
|
+
rmat_this = workspace.rmat.data() + workspace.ix_arr[i]*workspace.n_from;
|
|
930
|
+
for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
|
|
931
|
+
{
|
|
932
|
+
rmat_this[workspace.ix_arr[j] - workspace.n_from]++;
|
|
933
|
+
}
|
|
934
|
+
}
|
|
935
|
+
}
|
|
936
|
+
}
|
|
937
|
+
return;
|
|
938
|
+
}
|
|
939
|
+
|
|
940
|
+
else if (curr_tree > 0 && !as_kernel)
|
|
941
|
+
{
|
|
942
|
+
if (!workspace.tmat_sep.empty())
|
|
943
|
+
increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
944
|
+
prediction_data.nrows, workspace.tmat_sep.data(), -1.);
|
|
945
|
+
else if (!workspace.rmat.empty())
|
|
946
|
+
increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
|
|
947
|
+
prediction_data.nrows, workspace.rmat.data(), -1.);
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
if (prediction_data.Xc_indptr != NULL && workspace.tmat_sep.size())
|
|
951
|
+
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
|
952
|
+
|
|
953
|
+
/* reconstruct linear combination */
|
|
954
|
+
size_t ncols_numeric = 0;
|
|
955
|
+
size_t ncols_categ = 0;
|
|
956
|
+
std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0);
|
|
957
|
+
double unused;
|
|
958
|
+
if (prediction_data.categ_data != NULL || prediction_data.Xc_indptr != NULL)
|
|
959
|
+
{
|
|
960
|
+
for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
|
|
961
|
+
{
|
|
962
|
+
switch(hplanes[curr_tree].col_type[col])
|
|
963
|
+
{
|
|
964
|
+
case Numeric:
|
|
965
|
+
{
|
|
966
|
+
if (prediction_data.Xc_indptr == NULL)
|
|
967
|
+
add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
|
|
968
|
+
prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
|
|
969
|
+
hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
|
|
970
|
+
(model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
|
|
971
|
+
model_outputs.missing_action, NULL, NULL, false);
|
|
972
|
+
else
|
|
973
|
+
add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
|
|
974
|
+
hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
|
|
975
|
+
prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
|
|
976
|
+
hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
|
|
977
|
+
(model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
|
|
978
|
+
model_outputs.missing_action, NULL, NULL, false);
|
|
979
|
+
ncols_numeric++;
|
|
980
|
+
break;
|
|
981
|
+
}
|
|
982
|
+
|
|
983
|
+
case Categorical:
|
|
984
|
+
{
|
|
985
|
+
switch(model_outputs.cat_split_type)
|
|
986
|
+
{
|
|
987
|
+
case SingleCateg:
|
|
988
|
+
{
|
|
989
|
+
add_linear_comb<ldouble_safe>(
|
|
990
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
|
|
991
|
+
prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
|
|
992
|
+
(int)0, NULL, hplanes[curr_tree].fill_new[ncols_categ],
|
|
993
|
+
hplanes[curr_tree].chosen_cat[ncols_categ],
|
|
994
|
+
(model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
|
|
995
|
+
workspace.comb_val[0], NULL, NULL, model_outputs.new_cat_action,
|
|
996
|
+
model_outputs.missing_action, SingleCateg, false);
|
|
997
|
+
break;
|
|
998
|
+
}
|
|
999
|
+
|
|
1000
|
+
case SubSet:
|
|
1001
|
+
{
|
|
1002
|
+
add_linear_comb<ldouble_safe>(
|
|
1003
|
+
workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
|
|
1004
|
+
prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
|
|
1005
|
+
(int) hplanes[curr_tree].cat_coef[ncols_categ].size(),
|
|
1006
|
+
hplanes[curr_tree].cat_coef[ncols_categ].data(), (double) 0, (int) 0,
|
|
1007
|
+
(model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
|
|
1008
|
+
hplanes[curr_tree].fill_new[ncols_categ], NULL, NULL,
|
|
1009
|
+
model_outputs.new_cat_action, model_outputs.missing_action, SubSet, false);
|
|
1010
|
+
break;
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
ncols_categ++;
|
|
1014
|
+
break;
|
|
1015
|
+
}
|
|
1016
|
+
|
|
1017
|
+
default:
|
|
1018
|
+
{
|
|
1019
|
+
assert(0);
|
|
1020
|
+
break;
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
1023
|
+
}
|
|
1024
|
+
}
|
|
1025
|
+
|
|
1026
|
+
|
|
1027
|
+
else /* faster version for numerical-only */
|
|
1028
|
+
{
|
|
1029
|
+
for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
|
|
1030
|
+
add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
|
|
1031
|
+
prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
|
|
1032
|
+
hplanes[curr_tree].coef[col], (double)0, hplanes[curr_tree].mean[col],
|
|
1033
|
+
(model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
|
|
1034
|
+
model_outputs.missing_action, NULL, NULL, false);
|
|
1035
|
+
}
|
|
1036
|
+
|
|
1037
|
+
/* divide data */
|
|
1038
|
+
size_t split_ix = divide_subset_split(workspace.ix_arr.data(), workspace.comb_val.data(),
|
|
1039
|
+
workspace.st, workspace.end, hplanes[curr_tree].split_point);
|
|
1040
|
+
|
|
1041
|
+
/* continue splitting recursively */
|
|
1042
|
+
size_t orig_end = workspace.end;
|
|
1043
|
+
if (split_ix > workspace.st)
|
|
1044
|
+
{
|
|
1045
|
+
workspace.end = split_ix - 1;
|
|
1046
|
+
traverse_hplane_sim<PredictionData, ldouble_safe>(
|
|
1047
|
+
workspace,
|
|
1048
|
+
prediction_data,
|
|
1049
|
+
model_outputs,
|
|
1050
|
+
hplanes,
|
|
1051
|
+
hplanes[curr_tree].hplane_left,
|
|
1052
|
+
as_kernel);
|
|
1053
|
+
}
|
|
1054
|
+
|
|
1055
|
+
if (split_ix <= orig_end)
|
|
1056
|
+
{
|
|
1057
|
+
workspace.st = split_ix;
|
|
1058
|
+
workspace.end = orig_end;
|
|
1059
|
+
traverse_hplane_sim<PredictionData, ldouble_safe>(
|
|
1060
|
+
workspace,
|
|
1061
|
+
prediction_data,
|
|
1062
|
+
model_outputs,
|
|
1063
|
+
hplanes,
|
|
1064
|
+
hplanes[curr_tree].hplane_right,
|
|
1065
|
+
as_kernel);
|
|
1066
|
+
}
|
|
1067
|
+
|
|
1068
|
+
}
|
|
1069
|
+
|
|
1070
|
+
template <class PredictionData, class InputData, class WorkerMemory>
|
|
1071
|
+
void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
|
|
1072
|
+
std::vector<WorkerMemory> *worker_memory_m,
|
|
1073
|
+
PredictionData *prediction_data, InputData *input_data,
|
|
1074
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1075
|
+
double *restrict tmat, double *restrict rmat, size_t n_from,
|
|
1076
|
+
size_t ntrees, bool assume_full_distr,
|
|
1077
|
+
bool standardize_dist, bool as_kernel, int nthreads)
|
|
1078
|
+
{
|
|
1079
|
+
if (interrupt_switch)
|
|
1080
|
+
return;
|
|
1081
|
+
|
|
1082
|
+
size_t nrows = (prediction_data != NULL)? prediction_data->nrows : input_data->nrows;
|
|
1083
|
+
size_t ncomb = calc_ncomb(nrows);
|
|
1084
|
+
size_t n_to = (prediction_data != NULL)? (prediction_data->nrows - n_from) : 0;
|
|
1085
|
+
|
|
1086
|
+
#ifdef _OPENMP
|
|
1087
|
+
if (nthreads > 1)
|
|
1088
|
+
{
|
|
1089
|
+
if (worker_memory != NULL)
|
|
1090
|
+
{
|
|
1091
|
+
for (WorkerForSimilarity &w : *worker_memory)
|
|
1092
|
+
{
|
|
1093
|
+
if (!w.tmat_sep.empty())
|
|
1094
|
+
{
|
|
1095
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory)
|
|
1096
|
+
for (size_t_for ix = 0; ix < (decltype(ix))ncomb; ix++)
|
|
1097
|
+
tmat[ix] += w.tmat_sep[ix];
|
|
1098
|
+
}
|
|
1099
|
+
else if (!w.rmat.empty())
|
|
1100
|
+
{
|
|
1101
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(rmat, w, worker_memory)
|
|
1102
|
+
for (size_t_for ix = 0; ix < (decltype(ix))w.rmat.size(); ix++)
|
|
1103
|
+
rmat[ix] += w.rmat[ix];
|
|
1104
|
+
}
|
|
1105
|
+
}
|
|
1106
|
+
}
|
|
1107
|
+
|
|
1108
|
+
else
|
|
1109
|
+
{
|
|
1110
|
+
for (WorkerMemory &w : *worker_memory_m)
|
|
1111
|
+
{
|
|
1112
|
+
if (!w.tmat_sep.empty())
|
|
1113
|
+
{
|
|
1114
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory_m)
|
|
1115
|
+
for (size_t_for ix = 0; ix < (decltype(ix))ncomb; ix++)
|
|
1116
|
+
tmat[ix] += w.tmat_sep[ix];
|
|
1117
|
+
}
|
|
1118
|
+
}
|
|
1119
|
+
}
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
else
|
|
1123
|
+
#endif
|
|
1124
|
+
{
|
|
1125
|
+
if (worker_memory != NULL)
|
|
1126
|
+
{
|
|
1127
|
+
if (!(*worker_memory)[0].tmat_sep.empty())
|
|
1128
|
+
std::copy((*worker_memory)[0].tmat_sep.begin(), (*worker_memory)[0].tmat_sep.end(), tmat);
|
|
1129
|
+
else
|
|
1130
|
+
std::copy((*worker_memory)[0].rmat.begin(), (*worker_memory)[0].rmat.end(), rmat);
|
|
1131
|
+
}
|
|
1132
|
+
|
|
1133
|
+
else
|
|
1134
|
+
{
|
|
1135
|
+
std::copy((*worker_memory_m)[0].tmat_sep.begin(), (*worker_memory_m)[0].tmat_sep.end(), tmat);
|
|
1136
|
+
}
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
double ntrees_dbl = (double) ntrees;
|
|
1140
|
+
if (standardize_dist)
|
|
1141
|
+
{
|
|
1142
|
+
if (as_kernel)
|
|
1143
|
+
{
|
|
1144
|
+
if (tmat != NULL)
|
|
1145
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1146
|
+
tmat[ix] /= ntrees_dbl;
|
|
1147
|
+
else
|
|
1148
|
+
for (size_t ix = 0; ix < (n_from * n_to); ix++)
|
|
1149
|
+
rmat[ix] /= ntrees_dbl;
|
|
1150
|
+
return;
|
|
1151
|
+
}
|
|
1152
|
+
|
|
1153
|
+
|
|
1154
|
+
/* Note: the separation distances up this point are missing the first hop, which is always
|
|
1155
|
+
a +1 to every combination. Thus, it needs to be added back for the average separation depth.
|
|
1156
|
+
For the standardized metric, it takes the expected divisor as 2(=3-1) instead of 3, given
|
|
1157
|
+
that every combination will always get a +1 at the beginning. Since what's obtained here
|
|
1158
|
+
is a sum across all trees, adding this +1 means adding the number of trees. */
|
|
1159
|
+
double div_trees = ntrees_dbl;
|
|
1160
|
+
if (assume_full_distr)
|
|
1161
|
+
{
|
|
1162
|
+
div_trees *= 2;
|
|
1163
|
+
}
|
|
1164
|
+
|
|
1165
|
+
else if (input_data != NULL)
|
|
1166
|
+
{
|
|
1167
|
+
div_trees *= (expected_separation_depth(input_data->nrows) - 1);
|
|
1168
|
+
}
|
|
1169
|
+
|
|
1170
|
+
else
|
|
1171
|
+
{
|
|
1172
|
+
div_trees *= ((
|
|
1173
|
+
(model_outputs != NULL)?
|
|
1174
|
+
expected_separation_depth_hotstart(model_outputs->exp_avg_sep,
|
|
1175
|
+
model_outputs->orig_sample_size,
|
|
1176
|
+
model_outputs->orig_sample_size + prediction_data->nrows)
|
|
1177
|
+
:
|
|
1178
|
+
expected_separation_depth_hotstart(model_outputs_ext->exp_avg_sep,
|
|
1179
|
+
model_outputs_ext->orig_sample_size,
|
|
1180
|
+
model_outputs_ext->orig_sample_size + prediction_data->nrows)
|
|
1181
|
+
) - 1);
|
|
1182
|
+
}
|
|
1183
|
+
|
|
1184
|
+
|
|
1185
|
+
if (tmat != NULL)
|
|
1186
|
+
#ifndef _WIN32
|
|
1187
|
+
#pragma omp simd
|
|
1188
|
+
#endif
|
|
1189
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1190
|
+
tmat[ix] = std::exp2( - tmat[ix] / div_trees);
|
|
1191
|
+
else
|
|
1192
|
+
#ifndef _WIN32
|
|
1193
|
+
#pragma omp simd
|
|
1194
|
+
#endif
|
|
1195
|
+
for (size_t ix = 0; ix < (n_from * n_to); ix++)
|
|
1196
|
+
rmat[ix] = std::exp2( - rmat[ix] / div_trees);
|
|
1197
|
+
}
|
|
1198
|
+
|
|
1199
|
+
else
|
|
1200
|
+
{
|
|
1201
|
+
if (as_kernel) return;
|
|
1202
|
+
|
|
1203
|
+
if (tmat != NULL)
|
|
1204
|
+
#ifndef _WIN32
|
|
1205
|
+
#pragma omp simd
|
|
1206
|
+
#endif
|
|
1207
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1208
|
+
tmat[ix] = (tmat[ix] + ntrees) / ntrees_dbl;
|
|
1209
|
+
else
|
|
1210
|
+
#ifndef _WIN32
|
|
1211
|
+
#pragma omp simd
|
|
1212
|
+
#endif
|
|
1213
|
+
for (size_t ix = 0; ix < (n_from * n_to); ix++)
|
|
1214
|
+
rmat[ix] = (rmat[ix] + ntrees) / ntrees_dbl;
|
|
1215
|
+
}
|
|
1216
|
+
}
|
|
1217
|
+
|
|
1218
|
+
template <class PredictionData>
|
|
1219
|
+
void initialize_worker_for_sim(WorkerForSimilarity &workspace,
|
|
1220
|
+
PredictionData &prediction_data,
|
|
1221
|
+
IsoForest *model_outputs,
|
|
1222
|
+
ExtIsoForest *model_outputs_ext,
|
|
1223
|
+
size_t n_from,
|
|
1224
|
+
bool assume_full_distr)
|
|
1225
|
+
{
|
|
1226
|
+
workspace.st = 0;
|
|
1227
|
+
workspace.end = prediction_data.nrows - 1;
|
|
1228
|
+
workspace.n_from = n_from;
|
|
1229
|
+
workspace.assume_full_distr = assume_full_distr; /* doesn't need to have one copy per worker */
|
|
1230
|
+
|
|
1231
|
+
if (workspace.ix_arr.empty())
|
|
1232
|
+
{
|
|
1233
|
+
workspace.ix_arr.resize(prediction_data.nrows);
|
|
1234
|
+
std::iota(workspace.ix_arr.begin(), workspace.ix_arr.end(), (size_t)0);
|
|
1235
|
+
if (!n_from)
|
|
1236
|
+
workspace.tmat_sep.resize(calc_ncomb(prediction_data.nrows), 0);
|
|
1237
|
+
else
|
|
1238
|
+
workspace.rmat.resize((prediction_data.nrows - n_from) * n_from, 0);
|
|
1239
|
+
}
|
|
1240
|
+
|
|
1241
|
+
if (model_outputs != NULL &&
|
|
1242
|
+
(model_outputs->missing_action == Divide ||
|
|
1243
|
+
(model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && prediction_data.categ_data != NULL)))
|
|
1244
|
+
{
|
|
1245
|
+
if (workspace.weights_arr.empty())
|
|
1246
|
+
workspace.weights_arr.resize(prediction_data.nrows, 1.);
|
|
1247
|
+
else
|
|
1248
|
+
std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), 1.);
|
|
1249
|
+
}
|
|
1250
|
+
|
|
1251
|
+
if (model_outputs_ext != NULL)
|
|
1252
|
+
{
|
|
1253
|
+
if (workspace.comb_val.empty())
|
|
1254
|
+
workspace.comb_val.resize(prediction_data.nrows, 0);
|
|
1255
|
+
else
|
|
1256
|
+
std::fill(workspace.comb_val.begin(), workspace.comb_val.end(), 0);
|
|
1257
|
+
}
|
|
1258
|
+
}
|
|
1259
|
+
|
|
1260
|
+
template <class real_t, class sparse_ix>
|
|
1261
|
+
void calc_similarity_from_indexer
|
|
1262
|
+
(
|
|
1263
|
+
real_t *restrict numeric_data, int *restrict categ_data,
|
|
1264
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1265
|
+
size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
|
|
1266
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1267
|
+
double *restrict tmat, double *restrict rmat, size_t n_from,
|
|
1268
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
|
|
1269
|
+
)
|
|
1270
|
+
{
|
|
1271
|
+
SignalSwitcher ss;
|
|
1272
|
+
size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
|
|
1273
|
+
std::vector<sparse_ix> terminal_indices(nrows * ntrees);
|
|
1274
|
+
std::unique_ptr<double[]> ignored(new double[nrows]);
|
|
1275
|
+
predict_iforest(numeric_data, categ_data,
|
|
1276
|
+
is_col_major, ld_numeric, ld_categ,
|
|
1277
|
+
is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
|
|
1278
|
+
is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
|
|
1279
|
+
nrows, nthreads, false,
|
|
1280
|
+
model_outputs, model_outputs_ext,
|
|
1281
|
+
ignored.get(), terminal_indices.data(),
|
|
1282
|
+
(double*)NULL,
|
|
1283
|
+
indexer);
|
|
1284
|
+
ignored.reset();
|
|
1285
|
+
|
|
1286
|
+
#ifndef _OPENMP
|
|
1287
|
+
nthreads = 1;
|
|
1288
|
+
#endif
|
|
1289
|
+
|
|
1290
|
+
check_interrupt_switch(ss);
|
|
1291
|
+
|
|
1292
|
+
if (n_from == 0)
|
|
1293
|
+
{
|
|
1294
|
+
size_t ncomb = calc_ncomb(nrows);
|
|
1295
|
+
std::fill_n(tmat, ncomb, 0.);
|
|
1296
|
+
|
|
1297
|
+
std::vector<std::vector<double>> sum_separations(nthreads);
|
|
1298
|
+
if (nthreads != 1) {
|
|
1299
|
+
for (auto &v : sum_separations) v.resize(ncomb);
|
|
1300
|
+
}
|
|
1301
|
+
|
|
1302
|
+
std::vector<std::vector<size_t>> thread_argsorted_nodes(nthreads);
|
|
1303
|
+
for (auto &v : thread_argsorted_nodes) v.resize(nrows);
|
|
1304
|
+
|
|
1305
|
+
std::vector<std::vector<size_t>> thread_sorted_nodes(nthreads);
|
|
1306
|
+
for (auto &v : thread_sorted_nodes) v.reserve(nrows); /* <- could shrink to max number of terminal nodes */
|
|
1307
|
+
|
|
1308
|
+
|
|
1309
|
+
bool threw_exception = false;
|
|
1310
|
+
std::exception_ptr ex = NULL;
|
|
1311
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) \
|
|
1312
|
+
shared(model_outputs, model_outputs_ext, nthreads, indexer, nrows, ncomb, terminal_indices, \
|
|
1313
|
+
sum_separations, thread_argsorted_nodes, thread_sorted_nodes, tmat, \
|
|
1314
|
+
threw_exception, ex)
|
|
1315
|
+
for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
|
|
1316
|
+
{
|
|
1317
|
+
if (interrupt_switch || threw_exception) continue;
|
|
1318
|
+
|
|
1319
|
+
if (unlikely(indexer->indices[tree].n_terminal <= 1))
|
|
1320
|
+
{
|
|
1321
|
+
for (auto &el : sum_separations[omp_get_thread_num()]) el += 1.;
|
|
1322
|
+
continue;
|
|
1323
|
+
}
|
|
1324
|
+
|
|
1325
|
+
double *restrict ptr_this_sep = sum_separations[omp_get_thread_num()].data();
|
|
1326
|
+
if (nthreads == 1) ptr_this_sep = tmat;
|
|
1327
|
+
double *restrict node_dist_this = indexer->indices[tree].node_distances.data();
|
|
1328
|
+
double *restrict node_depths_this = indexer->indices[tree].node_depths.data();
|
|
1329
|
+
size_t n_terminal_this = indexer->indices[tree].n_terminal;
|
|
1330
|
+
size_t ncomb_this = calc_ncomb(n_terminal_this);
|
|
1331
|
+
std::vector<IsoTree> *tree_this = (model_outputs != NULL)? &model_outputs->trees[tree] : nullptr;
|
|
1332
|
+
std::vector<IsoHPlane> *hplane_this = (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : nullptr;
|
|
1333
|
+
sparse_ix *restrict terminal_indices_this = terminal_indices.data() + nrows * tree;
|
|
1334
|
+
size_t i, j;
|
|
1335
|
+
double add_round;
|
|
1336
|
+
|
|
1337
|
+
if (assume_full_distr)
|
|
1338
|
+
{
|
|
1339
|
+
for (size_t el1 = 0; el1 < nrows-1; el1++)
|
|
1340
|
+
{
|
|
1341
|
+
i = terminal_indices_this[el1];
|
|
1342
|
+
for (size_t el2 = el1+1; el2 < nrows; el2++)
|
|
1343
|
+
{
|
|
1344
|
+
j = terminal_indices_this[el2];
|
|
1345
|
+
if (unlikely(i == j))
|
|
1346
|
+
add_round = node_depths_this[i] + 3.;
|
|
1347
|
+
else
|
|
1348
|
+
add_round = node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
|
|
1349
|
+
ptr_this_sep[ix_comb(el1, el2, nrows, ncomb)] += add_round;
|
|
1350
|
+
}
|
|
1351
|
+
}
|
|
1352
|
+
}
|
|
1353
|
+
|
|
1354
|
+
else
|
|
1355
|
+
{
|
|
1356
|
+
hashed_set<size_t> nodes_w_repeated;
|
|
1357
|
+
try
|
|
1358
|
+
{
|
|
1359
|
+
nodes_w_repeated.reserve(n_terminal_this);
|
|
1360
|
+
for (size_t el1 = 0; el1 < nrows-1; el1++)
|
|
1361
|
+
{
|
|
1362
|
+
i = terminal_indices_this[el1];
|
|
1363
|
+
for (size_t el2 = el1+1; el2 < nrows; el2++)
|
|
1364
|
+
{
|
|
1365
|
+
j = terminal_indices_this[el2];
|
|
1366
|
+
if (unlikely(i == j))
|
|
1367
|
+
nodes_w_repeated.insert(i);
|
|
1368
|
+
else
|
|
1369
|
+
ptr_this_sep[ix_comb(el1, el2, nrows, ncomb)]
|
|
1370
|
+
+=
|
|
1371
|
+
node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
|
|
1372
|
+
}
|
|
1373
|
+
}
|
|
1374
|
+
}
|
|
1375
|
+
|
|
1376
|
+
catch (...)
|
|
1377
|
+
{
|
|
1378
|
+
#pragma omp critical
|
|
1379
|
+
{
|
|
1380
|
+
if (!threw_exception)
|
|
1381
|
+
{
|
|
1382
|
+
threw_exception = true;
|
|
1383
|
+
ex = std::current_exception();
|
|
1384
|
+
}
|
|
1385
|
+
}
|
|
1386
|
+
}
|
|
1387
|
+
|
|
1388
|
+
if (likely(!nodes_w_repeated.empty()))
|
|
1389
|
+
{
|
|
1390
|
+
std::vector<size_t> *restrict argsorted_nodes = &thread_argsorted_nodes[omp_get_thread_num()];
|
|
1391
|
+
std::iota(argsorted_nodes->begin(), argsorted_nodes->end(), (size_t)0);
|
|
1392
|
+
std::sort(argsorted_nodes->begin(), argsorted_nodes->end(),
|
|
1393
|
+
[&terminal_indices_this](const size_t a, const size_t b)
|
|
1394
|
+
{return terminal_indices_this[a] < terminal_indices_this[b];});
|
|
1395
|
+
std::vector<size_t>::iterator curr_begin = argsorted_nodes->begin();
|
|
1396
|
+
std::vector<size_t>::iterator new_begin;
|
|
1397
|
+
|
|
1398
|
+
std::vector<size_t> *restrict sorted_nodes = &thread_sorted_nodes[omp_get_thread_num()];
|
|
1399
|
+
sorted_nodes->assign(nodes_w_repeated.begin(), nodes_w_repeated.end());
|
|
1400
|
+
std::sort(sorted_nodes->begin(), sorted_nodes->end());
|
|
1401
|
+
for (size_t node_ix : *sorted_nodes)
|
|
1402
|
+
{
|
|
1403
|
+
curr_begin = std::lower_bound(curr_begin, argsorted_nodes->end(),
|
|
1404
|
+
node_ix,
|
|
1405
|
+
[&terminal_indices_this](const size_t &a, const size_t &b)
|
|
1406
|
+
{return (size_t)terminal_indices_this[a] < b;});
|
|
1407
|
+
new_begin = std::upper_bound(curr_begin, argsorted_nodes->end(),
|
|
1408
|
+
node_ix,
|
|
1409
|
+
[&terminal_indices_this](const size_t &a, const size_t &b)
|
|
1410
|
+
{return a < (size_t)terminal_indices_this[b];});
|
|
1411
|
+
size_t n_this = std::distance(curr_begin, new_begin);
|
|
1412
|
+
double sep_this
|
|
1413
|
+
=
|
|
1414
|
+
n_this
|
|
1415
|
+
+
|
|
1416
|
+
((tree_this != NULL)?
|
|
1417
|
+
(*tree_this)[node_ix].remainder
|
|
1418
|
+
:
|
|
1419
|
+
(*hplane_this)[node_ix].remainder);
|
|
1420
|
+
double sep_this_ = expected_separation_depth(sep_this) + node_depths_this[node_ix];
|
|
1421
|
+
|
|
1422
|
+
size_t i, j;
|
|
1423
|
+
for (size_t el1 = 0; el1 < n_this-1; el1++)
|
|
1424
|
+
{
|
|
1425
|
+
i = *(curr_begin + el1);
|
|
1426
|
+
for (size_t el2 = el1+1; el2 < n_this; el2++)
|
|
1427
|
+
{
|
|
1428
|
+
j = *(curr_begin + el2);
|
|
1429
|
+
ptr_this_sep[ix_comb(i, j, nrows, ncomb)] += sep_this_;
|
|
1430
|
+
}
|
|
1431
|
+
}
|
|
1432
|
+
|
|
1433
|
+
curr_begin = new_begin;
|
|
1434
|
+
}
|
|
1435
|
+
}
|
|
1436
|
+
|
|
1437
|
+
}
|
|
1438
|
+
}
|
|
1439
|
+
|
|
1440
|
+
check_interrupt_switch(ss);
|
|
1441
|
+
|
|
1442
|
+
if (threw_exception)
|
|
1443
|
+
std::rethrow_exception(ex);
|
|
1444
|
+
|
|
1445
|
+
if (nthreads == 1)
|
|
1446
|
+
{
|
|
1447
|
+
/* Here 'tmat' already contains the sum of separations */
|
|
1448
|
+
}
|
|
1449
|
+
|
|
1450
|
+
else
|
|
1451
|
+
{
|
|
1452
|
+
for (int tid = 0; tid < nthreads; tid++)
|
|
1453
|
+
{
|
|
1454
|
+
double *restrict seps_thread = sum_separations[tid].data();
|
|
1455
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1456
|
+
tmat[ix] += seps_thread[ix];
|
|
1457
|
+
}
|
|
1458
|
+
}
|
|
1459
|
+
|
|
1460
|
+
check_interrupt_switch(ss);
|
|
1461
|
+
|
|
1462
|
+
if (standardize_dist)
|
|
1463
|
+
{
|
|
1464
|
+
double divisor;
|
|
1465
|
+
if (assume_full_distr)
|
|
1466
|
+
divisor = (double)(ntrees * 2);
|
|
1467
|
+
else
|
|
1468
|
+
divisor = (double)ntrees * ((model_outputs != NULL)? model_outputs->exp_avg_sep : model_outputs_ext->exp_avg_sep);
|
|
1469
|
+
|
|
1470
|
+
if (assume_full_distr)
|
|
1471
|
+
{
|
|
1472
|
+
double ntrees_dbl = (double)ntrees;
|
|
1473
|
+
#ifndef _WIN32
|
|
1474
|
+
#pragma omp simd
|
|
1475
|
+
#endif
|
|
1476
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1477
|
+
tmat[ix] = std::exp2( - (tmat[ix] - ntrees_dbl) / divisor);
|
|
1478
|
+
}
|
|
1479
|
+
|
|
1480
|
+
else
|
|
1481
|
+
{
|
|
1482
|
+
#ifndef _WIN32
|
|
1483
|
+
#pragma omp simd
|
|
1484
|
+
#endif
|
|
1485
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1486
|
+
tmat[ix] = std::exp2( - tmat[ix] / divisor);
|
|
1487
|
+
}
|
|
1488
|
+
}
|
|
1489
|
+
|
|
1490
|
+
else
|
|
1491
|
+
{
|
|
1492
|
+
double divisor = (double)ntrees;
|
|
1493
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1494
|
+
tmat[ix] /= divisor;
|
|
1495
|
+
}
|
|
1496
|
+
|
|
1497
|
+
check_interrupt_switch(ss);
|
|
1498
|
+
}
|
|
1499
|
+
|
|
1500
|
+
/* TODO: merge this with the block above, can simplify lots of things by a couple if-elses */
|
|
1501
|
+
else /* has 'rmat' / 'nfrom>0' */
|
|
1502
|
+
{
|
|
1503
|
+
size_t n_to = nrows - n_from;
|
|
1504
|
+
size_t ncomb = n_from * n_to;
|
|
1505
|
+
std::fill_n(rmat, ncomb, 0.);
|
|
1506
|
+
|
|
1507
|
+
std::vector<std::vector<double>> sum_separations(nthreads);
|
|
1508
|
+
if (nthreads != 1) {
|
|
1509
|
+
for (auto &v : sum_separations) v.resize(ncomb);
|
|
1510
|
+
}
|
|
1511
|
+
|
|
1512
|
+
std::vector<std::vector<size_t>> thread_argsorted_nodes(nthreads);
|
|
1513
|
+
for (auto &v : thread_argsorted_nodes) v.resize(nrows);
|
|
1514
|
+
|
|
1515
|
+
std::vector<std::vector<size_t>> thread_doubly_argsorted(nthreads);
|
|
1516
|
+
for (auto &v : thread_doubly_argsorted) v.reserve(nrows);
|
|
1517
|
+
|
|
1518
|
+
std::vector<std::vector<size_t>> thread_sorted_nodes(nthreads);
|
|
1519
|
+
for (auto &v : thread_sorted_nodes) v.reserve(nrows); /* <- could shrink to max number of terminal nodes */
|
|
1520
|
+
|
|
1521
|
+
bool threw_exception = false;
|
|
1522
|
+
std::exception_ptr ex = NULL;
|
|
1523
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) \
|
|
1524
|
+
shared(model_outputs, model_outputs_ext, nthreads, indexer, nrows, ncomb, terminal_indices, \
|
|
1525
|
+
sum_separations, thread_argsorted_nodes, thread_sorted_nodes, thread_doubly_argsorted, rmat, n_to, n_from, \
|
|
1526
|
+
threw_exception, ex)
|
|
1527
|
+
for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
|
|
1528
|
+
{
|
|
1529
|
+
if (interrupt_switch || threw_exception) continue;
|
|
1530
|
+
|
|
1531
|
+
if (unlikely(indexer->indices[tree].n_terminal <= 1))
|
|
1532
|
+
{
|
|
1533
|
+
for (auto &el : sum_separations[omp_get_thread_num()]) el += 1.;
|
|
1534
|
+
continue;
|
|
1535
|
+
}
|
|
1536
|
+
|
|
1537
|
+
double *restrict ptr_this_sep = sum_separations[omp_get_thread_num()].data();
|
|
1538
|
+
if (nthreads == 1) ptr_this_sep = rmat;
|
|
1539
|
+
double *restrict node_dist_this = indexer->indices[tree].node_distances.data();
|
|
1540
|
+
double *restrict node_depths_this = indexer->indices[tree].node_depths.data();
|
|
1541
|
+
size_t n_terminal_this = indexer->indices[tree].n_terminal;
|
|
1542
|
+
size_t ncomb_this = calc_ncomb(n_terminal_this);
|
|
1543
|
+
std::vector<IsoTree> *tree_this = (model_outputs != NULL)? &model_outputs->trees[tree] : nullptr;
|
|
1544
|
+
std::vector<IsoHPlane> *hplane_this = (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : nullptr;
|
|
1545
|
+
sparse_ix *restrict terminal_indices_this = terminal_indices.data() + nrows * tree;
|
|
1546
|
+
size_t i, j;
|
|
1547
|
+
double add_round;
|
|
1548
|
+
|
|
1549
|
+
if (assume_full_distr)
|
|
1550
|
+
{
|
|
1551
|
+
for (size_t el1 = 0; el1 < n_from; el1++)
|
|
1552
|
+
{
|
|
1553
|
+
i = terminal_indices_this[el1];
|
|
1554
|
+
double *ptr_this_sep_ = ptr_this_sep + el1*n_to;
|
|
1555
|
+
for (size_t el2 = n_from; el2 < nrows; el2++)
|
|
1556
|
+
{
|
|
1557
|
+
j = terminal_indices_this[el2];
|
|
1558
|
+
if (unlikely(i == j))
|
|
1559
|
+
add_round = node_depths_this[i] + 3.;
|
|
1560
|
+
else
|
|
1561
|
+
add_round = node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
|
|
1562
|
+
ptr_this_sep_[el2-n_from] += add_round;
|
|
1563
|
+
}
|
|
1564
|
+
}
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1567
|
+
else
|
|
1568
|
+
{
|
|
1569
|
+
hashed_set<size_t> nodes_w_repeated;
|
|
1570
|
+
try
|
|
1571
|
+
{
|
|
1572
|
+
nodes_w_repeated.reserve(n_terminal_this);
|
|
1573
|
+
for (size_t el1 = 0; el1 < n_from; el1++)
|
|
1574
|
+
{
|
|
1575
|
+
i = terminal_indices_this[el1];
|
|
1576
|
+
double *ptr_this_sep_ = ptr_this_sep + el1*n_to;
|
|
1577
|
+
for (size_t el2 = n_from; el2 < nrows; el2++)
|
|
1578
|
+
{
|
|
1579
|
+
j = terminal_indices_this[el2];
|
|
1580
|
+
if (unlikely(i == j))
|
|
1581
|
+
nodes_w_repeated.insert(i);
|
|
1582
|
+
else
|
|
1583
|
+
ptr_this_sep_[el2-n_from]
|
|
1584
|
+
+=
|
|
1585
|
+
node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
|
|
1586
|
+
}
|
|
1587
|
+
}
|
|
1588
|
+
|
|
1589
|
+
if (likely(!nodes_w_repeated.empty()))
|
|
1590
|
+
{
|
|
1591
|
+
std::vector<size_t> *restrict argsorted_nodes = &thread_argsorted_nodes[omp_get_thread_num()];
|
|
1592
|
+
std::iota(argsorted_nodes->begin(), argsorted_nodes->end(), (size_t)0);
|
|
1593
|
+
std::sort(argsorted_nodes->begin(), argsorted_nodes->end(),
|
|
1594
|
+
[&terminal_indices_this](const size_t a, const size_t b)
|
|
1595
|
+
{return terminal_indices_this[a] < terminal_indices_this[b];});
|
|
1596
|
+
std::vector<size_t>::iterator curr_begin = argsorted_nodes->begin();
|
|
1597
|
+
std::vector<size_t>::iterator new_begin;
|
|
1598
|
+
|
|
1599
|
+
std::vector<size_t> *restrict sorted_nodes = &thread_sorted_nodes[omp_get_thread_num()];
|
|
1600
|
+
sorted_nodes->assign(nodes_w_repeated.begin(), nodes_w_repeated.end());
|
|
1601
|
+
std::sort(sorted_nodes->begin(), sorted_nodes->end());
|
|
1602
|
+
for (size_t node_ix : *sorted_nodes)
|
|
1603
|
+
{
|
|
1604
|
+
curr_begin = std::lower_bound(curr_begin, argsorted_nodes->end(),
|
|
1605
|
+
node_ix,
|
|
1606
|
+
[&terminal_indices_this](const size_t &a, const size_t &b)
|
|
1607
|
+
{return (size_t)terminal_indices_this[a] < b;});
|
|
1608
|
+
new_begin = std::upper_bound(curr_begin, argsorted_nodes->end(),
|
|
1609
|
+
node_ix,
|
|
1610
|
+
[&terminal_indices_this](const size_t &a, const size_t &b)
|
|
1611
|
+
{return a < (size_t)terminal_indices_this[b];});
|
|
1612
|
+
size_t n_this = std::distance(curr_begin, new_begin);
|
|
1613
|
+
if (unlikely(!n_this)) unexpected_error();
|
|
1614
|
+
double sep_this
|
|
1615
|
+
=
|
|
1616
|
+
n_this
|
|
1617
|
+
+
|
|
1618
|
+
((tree_this != NULL)?
|
|
1619
|
+
(*tree_this)[node_ix].remainder
|
|
1620
|
+
:
|
|
1621
|
+
(*hplane_this)[node_ix].remainder);
|
|
1622
|
+
double sep_this_ = expected_separation_depth(sep_this) + node_depths_this[node_ix];
|
|
1623
|
+
|
|
1624
|
+
std::vector<size_t> *restrict doubly_argsorted = &thread_doubly_argsorted[omp_get_thread_num()];
|
|
1625
|
+
doubly_argsorted->assign(curr_begin, curr_begin + n_this);
|
|
1626
|
+
std::sort(doubly_argsorted->begin(), doubly_argsorted->end());
|
|
1627
|
+
std::vector<size_t>::iterator pos_n_from = std::lower_bound(doubly_argsorted->begin(),
|
|
1628
|
+
doubly_argsorted->end(),
|
|
1629
|
+
n_from);
|
|
1630
|
+
if (pos_n_from == doubly_argsorted->end()) unexpected_error();
|
|
1631
|
+
size_t n1 = std::distance(doubly_argsorted->begin(), pos_n_from);
|
|
1632
|
+
size_t i, j;
|
|
1633
|
+
double *ptr_this_sep__;
|
|
1634
|
+
for (size_t el1 = 0; el1 < n1; el1++)
|
|
1635
|
+
{
|
|
1636
|
+
i = (*doubly_argsorted)[el1];
|
|
1637
|
+
ptr_this_sep__ = ptr_this_sep + i*n_to;
|
|
1638
|
+
for (size_t el2 = n1; el2 < n_this; el2++)
|
|
1639
|
+
{
|
|
1640
|
+
j = (*doubly_argsorted)[el2];
|
|
1641
|
+
ptr_this_sep__[j-n_from] += sep_this_;
|
|
1642
|
+
}
|
|
1643
|
+
}
|
|
1644
|
+
|
|
1645
|
+
curr_begin = new_begin;
|
|
1646
|
+
}
|
|
1647
|
+
}
|
|
1648
|
+
}
|
|
1649
|
+
|
|
1650
|
+
catch (...)
|
|
1651
|
+
{
|
|
1652
|
+
#pragma omp critical
|
|
1653
|
+
{
|
|
1654
|
+
if (!threw_exception)
|
|
1655
|
+
{
|
|
1656
|
+
threw_exception = true;
|
|
1657
|
+
ex = std::current_exception();
|
|
1658
|
+
}
|
|
1659
|
+
}
|
|
1660
|
+
}
|
|
1661
|
+
}
|
|
1662
|
+
}
|
|
1663
|
+
|
|
1664
|
+
check_interrupt_switch(ss);
|
|
1665
|
+
|
|
1666
|
+
if (threw_exception)
|
|
1667
|
+
std::rethrow_exception(ex);
|
|
1668
|
+
|
|
1669
|
+
if (nthreads == 1)
|
|
1670
|
+
{
|
|
1671
|
+
/* Here 'rmat' already contains the sum of separations */
|
|
1672
|
+
}
|
|
1673
|
+
|
|
1674
|
+
else
|
|
1675
|
+
{
|
|
1676
|
+
for (int tid = 0; tid < nthreads; tid++)
|
|
1677
|
+
{
|
|
1678
|
+
double *restrict seps_thread = sum_separations[tid].data();
|
|
1679
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1680
|
+
rmat[ix] += seps_thread[ix];
|
|
1681
|
+
}
|
|
1682
|
+
}
|
|
1683
|
+
|
|
1684
|
+
check_interrupt_switch(ss);
|
|
1685
|
+
|
|
1686
|
+
if (standardize_dist)
|
|
1687
|
+
{
|
|
1688
|
+
double divisor;
|
|
1689
|
+
if (assume_full_distr)
|
|
1690
|
+
divisor = (double)(ntrees * 2);
|
|
1691
|
+
else
|
|
1692
|
+
divisor = (double)ntrees * ((model_outputs != NULL)? model_outputs->exp_avg_sep : model_outputs_ext->exp_avg_sep);
|
|
1693
|
+
|
|
1694
|
+
if (assume_full_distr)
|
|
1695
|
+
{
|
|
1696
|
+
double ntrees_dbl = (double)ntrees;
|
|
1697
|
+
#ifndef _WIN32
|
|
1698
|
+
#pragma omp simd
|
|
1699
|
+
#endif
|
|
1700
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1701
|
+
rmat[ix] = std::exp2( - (rmat[ix] - ntrees_dbl) / divisor);
|
|
1702
|
+
}
|
|
1703
|
+
|
|
1704
|
+
else
|
|
1705
|
+
{
|
|
1706
|
+
#ifndef _WIN32
|
|
1707
|
+
#pragma omp simd
|
|
1708
|
+
#endif
|
|
1709
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1710
|
+
rmat[ix] = std::exp2( - rmat[ix] / divisor);
|
|
1711
|
+
}
|
|
1712
|
+
}
|
|
1713
|
+
|
|
1714
|
+
else
|
|
1715
|
+
{
|
|
1716
|
+
double divisor = (double)ntrees;
|
|
1717
|
+
for (size_t ix = 0; ix < ncomb; ix++)
|
|
1718
|
+
rmat[ix] /= divisor;
|
|
1719
|
+
}
|
|
1720
|
+
|
|
1721
|
+
check_interrupt_switch(ss);
|
|
1722
|
+
}
|
|
1723
|
+
}
|
|
1724
|
+
|
|
1725
|
+
template <class real_t, class sparse_ix>
|
|
1726
|
+
void calc_similarity_from_indexer_with_references
|
|
1727
|
+
(
|
|
1728
|
+
real_t *restrict numeric_data, int *restrict categ_data,
|
|
1729
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1730
|
+
size_t nrows, int nthreads, bool standardize_dist,
|
|
1731
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1732
|
+
double *restrict rmat,
|
|
1733
|
+
TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
|
|
1734
|
+
)
|
|
1735
|
+
{
|
|
1736
|
+
size_t n_ref = get_number_of_reference_points(*indexer);
|
|
1737
|
+
if (unlikely(!n_ref)) unexpected_error();
|
|
1738
|
+
|
|
1739
|
+
SignalSwitcher ss;
|
|
1740
|
+
|
|
1741
|
+
size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
|
|
1742
|
+
std::vector<sparse_ix> terminal_indices(nrows * ntrees);
|
|
1743
|
+
std::unique_ptr<double[]> ignored(new double[nrows]);
|
|
1744
|
+
predict_iforest(numeric_data, categ_data,
|
|
1745
|
+
is_col_major, ld_numeric, ld_categ,
|
|
1746
|
+
is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
|
|
1747
|
+
is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
|
|
1748
|
+
nrows, nthreads, false,
|
|
1749
|
+
model_outputs, model_outputs_ext,
|
|
1750
|
+
ignored.get(), terminal_indices.data(),
|
|
1751
|
+
(double*)NULL,
|
|
1752
|
+
indexer);
|
|
1753
|
+
ignored.reset();
|
|
1754
|
+
|
|
1755
|
+
#ifndef _OPENMP
|
|
1756
|
+
nthreads = 1;
|
|
1757
|
+
#endif
|
|
1758
|
+
|
|
1759
|
+
check_interrupt_switch(ss);
|
|
1760
|
+
|
|
1761
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) \
|
|
1762
|
+
shared(rmat, terminal_indices, nrows, n_ref, indexer, ntrees)
|
|
1763
|
+
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
|
|
1764
|
+
{
|
|
1765
|
+
if (interrupt_switch) continue;
|
|
1766
|
+
|
|
1767
|
+
size_t i, j;
|
|
1768
|
+
size_t n_terminal_this;
|
|
1769
|
+
size_t ncomb_this;
|
|
1770
|
+
size_t *restrict ref_this;
|
|
1771
|
+
sparse_ix *restrict ind_this;
|
|
1772
|
+
double *restrict node_depths_this;
|
|
1773
|
+
double *restrict node_dist_this;
|
|
1774
|
+
double *rmat_this = rmat + row*n_ref;
|
|
1775
|
+
memset(rmat_this, 0, n_ref*sizeof(double));
|
|
1776
|
+
for (size_t tree = 0; tree < ntrees; tree++)
|
|
1777
|
+
{
|
|
1778
|
+
ref_this = indexer->indices[tree].reference_points.data();
|
|
1779
|
+
ind_this = terminal_indices.data() + tree*nrows;
|
|
1780
|
+
node_depths_this = indexer->indices[tree].node_depths.data();
|
|
1781
|
+
n_terminal_this = indexer->indices[tree].n_terminal;
|
|
1782
|
+
node_dist_this = indexer->indices[tree].node_distances.data();
|
|
1783
|
+
ncomb_this = calc_ncomb(n_terminal_this);
|
|
1784
|
+
for (size_t ref = 0; ref < n_ref; ref++)
|
|
1785
|
+
{
|
|
1786
|
+
i = ind_this[row];
|
|
1787
|
+
j = ref_this[ref];
|
|
1788
|
+
|
|
1789
|
+
if (unlikely(i == j))
|
|
1790
|
+
rmat_this[ref] += node_depths_this[i] + 3.;
|
|
1791
|
+
else
|
|
1792
|
+
rmat_this[ref] += node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
|
|
1793
|
+
}
|
|
1794
|
+
}
|
|
1795
|
+
}
|
|
1796
|
+
|
|
1797
|
+
check_interrupt_switch(ss);
|
|
1798
|
+
|
|
1799
|
+
size_t size_rmat = nrows * n_ref;
|
|
1800
|
+
if (standardize_dist)
|
|
1801
|
+
{
|
|
1802
|
+
double ntrees_dbl = (double)ntrees;
|
|
1803
|
+
double div_trees = (double)(mult2(ntrees));
|
|
1804
|
+
#ifndef _WIN32
|
|
1805
|
+
#pragma omp simd
|
|
1806
|
+
#endif
|
|
1807
|
+
for (size_t ix = 0; ix < size_rmat; ix++)
|
|
1808
|
+
rmat[ix] = std::exp2( - (rmat[ix] - ntrees_dbl) / div_trees);
|
|
1809
|
+
}
|
|
1810
|
+
|
|
1811
|
+
else
|
|
1812
|
+
{
|
|
1813
|
+
double div_trees = (double)ntrees;
|
|
1814
|
+
for (size_t ix = 0; ix < size_rmat; ix++)
|
|
1815
|
+
rmat[ix] /= div_trees;
|
|
1816
|
+
}
|
|
1817
|
+
|
|
1818
|
+
check_interrupt_switch(ss);
|
|
1819
|
+
}
|
|
1820
|
+
|
|
1821
|
+
template <class real_t, class sparse_ix>
|
|
1822
|
+
void kernel_to_references(TreesIndexer &indexer,
|
|
1823
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
1824
|
+
real_t *restrict numeric_data, int *restrict categ_data,
|
|
1825
|
+
real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
|
1826
|
+
bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
1827
|
+
size_t nrows, int nthreads,
|
|
1828
|
+
double *restrict rmat,
|
|
1829
|
+
bool standardize)
|
|
1830
|
+
{
|
|
1831
|
+
size_t ntrees = indexer.indices.size();
|
|
1832
|
+
size_t n_ref = indexer.indices.front().reference_points.size();
|
|
1833
|
+
|
|
1834
|
+
SignalSwitcher ss;
|
|
1835
|
+
|
|
1836
|
+
std::unique_ptr<sparse_ix[]> terminal_indices(new sparse_ix[nrows*ntrees]);
|
|
1837
|
+
std::unique_ptr<double[]> ignored(new double[nrows]);
|
|
1838
|
+
predict_iforest(numeric_data, categ_data,
|
|
1839
|
+
is_col_major, ld_numeric, ld_categ,
|
|
1840
|
+
is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
|
|
1841
|
+
is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
|
|
1842
|
+
nrows, nthreads, false,
|
|
1843
|
+
model_outputs, model_outputs_ext,
|
|
1844
|
+
ignored.get(), terminal_indices.get(),
|
|
1845
|
+
(double*)NULL,
|
|
1846
|
+
&indexer);
|
|
1847
|
+
ignored.reset();
|
|
1848
|
+
|
|
1849
|
+
check_interrupt_switch(ss);
|
|
1850
|
+
|
|
1851
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) \
|
|
1852
|
+
shared(indexer, terminal_indices, nrows, ntrees, n_ref, rmat)
|
|
1853
|
+
for (size_t_for row = 0; row < (decltype(row))nrows; row++)
|
|
1854
|
+
{
|
|
1855
|
+
if (interrupt_switch) continue;
|
|
1856
|
+
|
|
1857
|
+
SingleTreeIndex *restrict index_node;
|
|
1858
|
+
size_t idx_this;
|
|
1859
|
+
sparse_ix *restrict terminal_indices_this = terminal_indices.get() + row;
|
|
1860
|
+
double *restrict rmat_this = rmat + row*n_ref;
|
|
1861
|
+
memset(rmat_this, 0, n_ref*sizeof(double));
|
|
1862
|
+
|
|
1863
|
+
for (size_t tree = 0; tree < ntrees; tree++)
|
|
1864
|
+
{
|
|
1865
|
+
idx_this = terminal_indices_this[tree*nrows];
|
|
1866
|
+
index_node = &indexer.indices[tree];
|
|
1867
|
+
for (size_t ind = index_node->reference_indptr[idx_this];
|
|
1868
|
+
ind < index_node->reference_indptr[idx_this + 1];
|
|
1869
|
+
ind++)
|
|
1870
|
+
{
|
|
1871
|
+
rmat_this[index_node->reference_mapping[ind]]++;
|
|
1872
|
+
}
|
|
1873
|
+
}
|
|
1874
|
+
}
|
|
1875
|
+
|
|
1876
|
+
check_interrupt_switch(ss);
|
|
1877
|
+
|
|
1878
|
+
if (standardize)
|
|
1879
|
+
{
|
|
1880
|
+
double ntrees_dbl = (double)ntrees;
|
|
1881
|
+
for (size_t ix = 0; ix < nrows*n_ref; ix++)
|
|
1882
|
+
rmat[ix] /= ntrees_dbl;
|
|
1883
|
+
}
|
|
1884
|
+
|
|
1885
|
+
check_interrupt_switch(ss);
|
|
1886
|
+
}
|