isotree 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -0,0 +1,1886 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+ #include "isotree.hpp"
64
+
65
+
66
+ /* Calculate distance or similarity or kernel/proximity between data points
67
+ *
68
+ * Parameters
69
+ * ==========
70
+ * - numeric_data[nrows * ncols_numeric]
71
+ * Pointer to numeric data for which to make calculations. If not using 'indexer', must be
72
+ * ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
73
+ * column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
74
+ * row-major or column-major format (with row-major being faster).
75
+ * If categorical data is passed, must be in the same storage order (row-major / column-major)
76
+ * as numerical data (whether dense or sparse).
77
+ * The column order must be the same as in the data that was used to fit the model.
78
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
79
+ * the first group is assumed to be the earlier rows here.
80
+ * Pass NULL if there are no dense numeric columns.
81
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
82
+ * - categ_data[nrows * ncols_categ]
83
+ * Pointer to categorical data for which to make calculations. If not using 'indexer', must be
84
+ * ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
85
+ * column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
86
+ * row-major or column-major format (with row-major being faster).
87
+ * If numerical data is passed, must be in the same storage order (row-major / column-major)
88
+ * as categorical data (whether the numerical data is dense or sparse).
89
+ * Each category should be represented as an integer, and these integers must start at zero and
90
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
91
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
92
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
93
+ * must be the same as was used in the data to which the model was fit.
94
+ * Pass NULL if there are no categorical columns.
95
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
96
+ * the first group is assumed to be the earlier rows here.
97
+ * - Xc[nnz]
98
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed),
99
+ * or optionally in CSR format (row-compressed) if using 'indexer' and passing 'is_col_major=false'
100
+ * (not recommended as the calculations will be slower if sparse data is passed as CSR).
101
+ * If categorical data is passed, must be in the same storage order (row-major or CSR / column-major or CSC)
102
+ * as numerical data (whether dense or sparse).
103
+ * Pass NULL if there are no sparse numeric columns.
104
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
105
+ * - Xc_ind[nnz]
106
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds
107
+ * (column indices if 'Xc' is in CSR format).
108
+ * Must be in sorted order, otherwise results will be incorrect.
109
+ * Pass NULL if there are no sparse numeric columns in CSC or CSR format.
110
+ * - Xc_indptr[ncols_numeric + 1]
111
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
112
+ * start and at entry [col + 1] where does column 'col' end
113
+ * (row index pointers if 'Xc' is passed in CSR format).
114
+ * Pass NULL if there are no sparse numeric columns in CSC or CSR format.
115
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
116
+ * the first group is assumed to be the earlier rows here.
117
+ * - nrows
118
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
119
+ * - use_long_double
120
+ * Whether to use 'long double' (extended precision) type for the calculations. This makes them
121
+ * more accurate (provided that the compiler used has wider long doubles than doubles), but
122
+ * slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
123
+ * Power8 platforms).
124
+ * - nthreads
125
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
126
+ * allocated, even if the thread does not end up being used (with one exception being kernel calculations
127
+ * with respect to reference points in an indexer). Ignored when not building with OpenMP support.
128
+ * - assume_full_distr
129
+ * Whether to assume that the fitted model represents a full population distribution (will use a
130
+ * standardizing criterion assuming infinite sample, and the results of the similarity between two points
131
+ * at prediction time will not depend on the presence of any third point that is similar to them, but will
132
+ * differ more compared to the pairwise distances between points from which the model was fit). If passing
133
+ * 'false', will calculate pairwise distances as if the new observations at prediction time were added to
134
+ * the sample to which each tree was fit, which will make the distances between two points potentially vary
135
+ * according to other newly introduced points.
136
+ * This was added for experimentation purposes only and it's not recommended to pass 'false'.
137
+ * Note that when calculating distances using 'indexer', there
138
+ * might be slight discrepancies between the numbers produced with or without the indexer due to what
139
+ * are considered "additional" observations in this calculation.
140
+ * This is ignored when passing 'as_kernel=true'.
141
+ * - standardize_dist
142
+ * Whether to standardize the resulting average separation depths between rows according
143
+ * to the expected average separation depth in a similar way as when predicting outlierness,
144
+ * in order to obtain a standardized distance. If passing 'false', will output the average
145
+ * separation depth instead.
146
+ * If passing 'as_kernel=true', this indicates whether to output a fraction (if 'true') or
147
+ * the raw number of matching trees (if 'false').
148
+ * - as_kernel
149
+ * Whether to calculate the "similarities" as isolation kernel or proximity matrix, which counts
150
+ * the proportion of trees in which two observations end up in the same terminal node. This is
151
+ * typically much faster than separation-based distance, but is typically not as good quality.
152
+ * Note that, for kernel calculations, the indexer is only used if it has reference points stored on it.
153
+ * - model_outputs
154
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
155
+ * if the calculations are to be made from an extended model. Can only pass one of
156
+ * 'model_outputs' and 'model_outputs_ext'.
157
+ * - model_outputs_ext
158
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
159
+ * if the calculations are to be made from a single-variable model. Can only pass one of
160
+ * 'model_outputs' and 'model_outputs_ext'.
161
+ * - tmat[nrows * (nrows - 1) / 2] (out)
162
+ * Pointer to array where the resulting pairwise distances or average separation depths or kernels will
163
+ * be written into. As the output is a symmetric matrix, this function will only fill in the
164
+ * upper-triangular part, in which entry 0 <= i < j < n will be located at position
165
+ * p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
166
+ * Can be converted to a dense square matrix through function 'tmat_to_dense'.
167
+ * The array must already be initialized to zeros.
168
+ * If calculating distance/separation from a group of points to another group of points,
169
+ * pass NULL here and use 'rmat' instead.
170
+ * - rmat[nrows1 * nrows2] (out)
171
+ * Pointer to array where to write the distances or separation depths or kernels between each row in
172
+ * one set of observations and each row in a different set of observations. If doing these
173
+ * calculations for all pairs of observations/rows, pass 'tmat' instead.
174
+ * Will take the first group of observations as the rows in this matrix, and the second
175
+ * group as the columns. The groups are assumed to be in the same data arrays, with the
176
+ * first group corresponding to the earlier rows there.
177
+ * This matrix will be used in row-major order (i.e. entries 1..nrows2 contain the first row from nrows1).
178
+ * Must be already initialized to zeros.
179
+ * If passing 'use_indexed_references=true' plus an indexer object with reference points, this
180
+ * array should have dimension [nrows, n_references].
181
+ * Ignored when 'tmat' is passed.
182
+ * - n_from
183
+ * When calculating distances between two groups of points, this indicates the number of
184
+ * observations/rows belonging to the first group (the rows in 'rmat'), which will be
185
+ * assumed to be the first 'n_from' rows.
186
+ * Ignored when 'tmat' is passed or when 'use_indexed_references=true' plus an indexer with
187
+ * references are passed.
188
+ * - use_indexed_references
189
+ * Whether to calculate distances with respect to reference points stored in the indexer
190
+ * object, if it has any. This is only supported with 'assume_full_distr=true' or with 'as_kernel=true'.
191
+ * If passing 'use_indexed_references=true', then 'tmat' must be NULL, and 'rmat' must
192
+ * be of dimension [nrows, n_references].
193
+ * - indexer
194
+ * Pointer to associated tree indexer for the model being used, if it was constructed,
195
+ * which can be used to speed up distance calculations, assuming that it was built with
196
+ * option 'with_distances=true'. If it does not contain node distances, it will not be used.
197
+ * Pass NULL if the indexer has not been constructed or was constructed with 'with_distances=false'.
198
+ * If it contains reference points and passing 'use_indexed_references=true', distances will be
199
+ * calculated between the input data passed here and the reference points stored in this object.
200
+ * If passing 'as_kernel=true', the indexer can only be used for calculating kernels with respect to
201
+ * reference points in the indexer, otherwise it will not be used (which also means that the data must be
202
+ * passed in column-major order for all kernel calculations that are not with respect to reference points
203
+ * from an indexer).
204
+ * - is_col_major
205
+ * Whether the data comes in column-major order. If using 'indexer', predictions are also possible
206
+ * (and are even faster for the case of dense-only data) if passing the data in row-major format.
207
+ * Without 'indexer' (and with 'as_kernel=true' but without reference points in the indexer), data
208
+ * may only be passed in column-major format.
209
+ * If there is sparse numeric data, it is highly suggested to pass it in CSC/column-major format.
210
+ * - ld_numeric
211
+ * If passing 'is_col_major=false', this indicates the leading dimension of the array 'numeric_data'.
212
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
213
+ * be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
214
+ * 'numeric_data' in column-major order, this is ignored and will be assumed that the
215
+ * leading dimension corresponds to the number of rows. This is ignored when passing numeric
216
+ * data in sparse format.
217
+ * Note that data in row-major order is only accepted when using 'indexer'.
218
+ * - ld_categ
219
+ * If passing 'is_col_major=false', this indicates the leading dimension of the array 'categ_data'.
220
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
221
+ * be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
222
+ * 'categ_data' in column-major order, this is ignored and will be assumed that the
223
+ * leading dimension corresponds to the number of rows.
224
+ * Note that data in row-major order is only accepted when using 'indexer'.
225
+ */
226
+ template <class real_t, class sparse_ix>
227
+ void calc_similarity(real_t numeric_data[], int categ_data[],
228
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
229
+ size_t nrows, bool use_long_double, int nthreads,
230
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
231
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
232
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
233
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ)
234
+ {
235
+ if (use_long_double && !has_long_double()) {
236
+ use_long_double = false;
237
+ fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
238
+ }
239
+ #ifndef NO_LONG_DOUBLE
240
+ if (likely(!use_long_double))
241
+ #endif
242
+ calc_similarity_internal<real_t, sparse_ix, double>(
243
+ numeric_data, categ_data,
244
+ Xc, Xc_ind, Xc_indptr,
245
+ nrows, nthreads,
246
+ assume_full_distr, standardize_dist, as_kernel,
247
+ model_outputs, model_outputs_ext,
248
+ tmat, rmat, n_from, use_indexed_references,
249
+ indexer, is_col_major, ld_numeric, ld_categ
250
+ );
251
+ #ifndef NO_LONG_DOUBLE
252
+ else
253
+ calc_similarity_internal<real_t, sparse_ix, long double>(
254
+ numeric_data, categ_data,
255
+ Xc, Xc_ind, Xc_indptr,
256
+ nrows, nthreads,
257
+ assume_full_distr, standardize_dist, as_kernel,
258
+ model_outputs, model_outputs_ext,
259
+ tmat, rmat, n_from, use_indexed_references,
260
+ indexer, is_col_major, ld_numeric, ld_categ
261
+ );
262
+ #endif
263
+ }
264
+
265
+ template <class real_t, class sparse_ix, class ldouble_safe>
266
+ void calc_similarity_internal(
267
+ real_t numeric_data[], int categ_data[],
268
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
269
+ size_t nrows, int nthreads,
270
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
271
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
272
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
273
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ)
274
+ {
275
+ if (nrows < 2 && (!use_indexed_references || indexer == NULL || indexer->indices.empty() || indexer->indices.front().reference_points.empty()))
276
+ throw std::runtime_error("Cannot calculate distances from less than 2 rows.\n");
277
+ if (as_kernel && (tmat != NULL || !use_indexed_references || (indexer != NULL && !indexer->indices.empty() && indexer->indices.front().reference_points.empty())))
278
+ indexer = NULL;
279
+
280
+ if (indexer != NULL && model_outputs != NULL)
281
+ {
282
+ if (model_outputs->missing_action == Divide) {
283
+ indexer = NULL;
284
+ if (use_indexed_references) throw std::runtime_error("Invalid indexer - cannot use references from it.\n");
285
+ }
286
+ if (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && categ_data != NULL) {
287
+ indexer = NULL;
288
+ if (use_indexed_references) throw std::runtime_error("Invalid indexer - cannot use references from it.\n");
289
+ }
290
+ }
291
+ if (
292
+ !as_kernel &&
293
+ indexer != NULL &&
294
+ (indexer->indices.empty() || indexer->indices.front().node_distances.empty())
295
+ ) {
296
+ if (use_indexed_references && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty())
297
+ throw std::runtime_error("Indexer was built without distances. Cannot use references from it.\n");
298
+ else {
299
+ indexer = NULL;
300
+ fprintf(stderr, "Indexer has no pre-computed distances, will not be used for distance calculations.\n");
301
+ }
302
+ }
303
+ if (
304
+ !is_col_major &&
305
+ indexer == NULL &&
306
+ (
307
+ Xc_indptr != NULL
308
+ ||
309
+ (nrows != 1 &&
310
+ ((numeric_data != NULL && ld_numeric > 1) || (categ_data != NULL && ld_categ > 1)))
311
+ )
312
+ )
313
+ throw std::runtime_error("Cannot calculate distances with row-major data without indexer.\n");
314
+ if (indexer != NULL)
315
+ {
316
+ if (use_indexed_references && tmat == NULL && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty())
317
+ {
318
+ if (unlikely(!assume_full_distr))
319
+ throw std::runtime_error("Cannot calculate distances to reference points in indexer with 'assume_full_distr=false'.\n");
320
+
321
+ if (!as_kernel)
322
+ {
323
+ calc_similarity_from_indexer_with_references(
324
+ numeric_data, categ_data,
325
+ Xc, Xc_ind, Xc_indptr,
326
+ nrows, nthreads, standardize_dist,
327
+ model_outputs, model_outputs_ext,
328
+ rmat,
329
+ indexer, is_col_major, ld_numeric, ld_categ
330
+ );
331
+ }
332
+
333
+ else
334
+ {
335
+ kernel_to_references(*indexer,
336
+ model_outputs, model_outputs_ext,
337
+ numeric_data, categ_data,
338
+ Xc, Xc_ind, Xc_indptr,
339
+ is_col_major, ld_numeric, ld_categ,
340
+ nrows, nthreads,
341
+ rmat,
342
+ standardize_dist);
343
+ }
344
+ }
345
+
346
+ else
347
+ {
348
+ if (as_kernel) goto skip_indexer_if_kernel;
349
+ calc_similarity_from_indexer(
350
+ numeric_data, categ_data,
351
+ Xc, Xc_ind, Xc_indptr,
352
+ nrows, nthreads, assume_full_distr, standardize_dist,
353
+ model_outputs, model_outputs_ext,
354
+ tmat, rmat, n_from,
355
+ indexer, is_col_major, ld_numeric, ld_categ
356
+ );
357
+ }
358
+
359
+ return;
360
+ }
361
+ skip_indexer_if_kernel:
362
+
363
+ PredictionData<real_t, sparse_ix>
364
+ prediction_data = {numeric_data, categ_data, nrows,
365
+ false, 0, 0,
366
+ Xc, Xc_ind, Xc_indptr,
367
+ NULL, NULL, NULL};
368
+
369
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
370
+
371
+ if (tmat != NULL) n_from = 0;
372
+
373
+ if (n_from == 0) {
374
+ #if SIZE_MAX == UINT32_MAX
375
+ size_t lim_rows = (size_t)UINT16_MAX - (size_t)1;
376
+ #elif SIZE_MAX == UINT64_MAX
377
+ size_t lim_rows = (size_t)UINT32_MAX - (size_t)1;
378
+ #else
379
+ size_t lim_rows = (size_t)std::ceil(std::sqrt((ldouble_safe)SIZE_MAX));
380
+ #endif
381
+ if (nrows > lim_rows)
382
+ throw std::runtime_error("Number of rows implies too large distance matrix (integer overflow).");
383
+ }
384
+
385
+ if ((size_t)nthreads > ntrees)
386
+ nthreads = (int)ntrees;
387
+ #ifdef _OPENMP
388
+ std::vector<WorkerForSimilarity> worker_memory(nthreads);
389
+ #else
390
+ std::vector<WorkerForSimilarity> worker_memory(1);
391
+ nthreads = 1;
392
+ #endif
393
+
394
+ /* Global variable that determines if the procedure receives a stop signal */
395
+ SignalSwitcher ss = SignalSwitcher();
396
+ check_interrupt_switch(ss);
397
+ #if defined(DONT_THROW_ON_INTERRUPT)
398
+ if (interrupt_switch) return;
399
+ #endif
400
+ /* For handling exceptions */
401
+ bool threw_exception = false;
402
+ std::exception_ptr ex = NULL;
403
+
404
+ if (
405
+ tmat == NULL &&
406
+ use_indexed_references &&
407
+ indexer != NULL &&
408
+ !indexer->indices.empty() &&
409
+ !indexer->indices.front().reference_points.empty() &&
410
+ (as_kernel || !indexer->indices.front().node_distances.empty())
411
+ ) {
412
+ n_from = indexer->indices.front().reference_points.size();
413
+ }
414
+
415
+ if (model_outputs != NULL)
416
+ {
417
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
418
+ shared(ntrees, worker_memory, prediction_data, model_outputs, ex, threw_exception, n_from)
419
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
420
+ {
421
+ if (threw_exception || interrupt_switch) continue;
422
+ try
423
+ {
424
+ initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
425
+ model_outputs, NULL, n_from, assume_full_distr);
426
+ traverse_tree_sim<PredictionData<real_t, sparse_ix>, ldouble_safe>(
427
+ worker_memory[omp_get_thread_num()],
428
+ prediction_data,
429
+ *model_outputs,
430
+ model_outputs->trees[tree],
431
+ (size_t)0,
432
+ as_kernel);
433
+ }
434
+
435
+ catch (...)
436
+ {
437
+ #pragma omp critical
438
+ {
439
+ if (!threw_exception)
440
+ {
441
+ threw_exception = true;
442
+ ex = std::current_exception();
443
+ }
444
+ }
445
+ }
446
+ }
447
+ }
448
+
449
+ else
450
+ {
451
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
452
+ shared(ntrees, worker_memory, prediction_data, model_outputs_ext, ex, threw_exception, n_from)
453
+ for (size_t_for hplane = 0; hplane < (decltype(hplane))ntrees; hplane++)
454
+ {
455
+ if (threw_exception || interrupt_switch) continue;
456
+ try
457
+ {
458
+ initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
459
+ NULL, model_outputs_ext, n_from, assume_full_distr);
460
+ traverse_hplane_sim<PredictionData<real_t, sparse_ix>, ldouble_safe>(
461
+ worker_memory[omp_get_thread_num()],
462
+ prediction_data,
463
+ *model_outputs_ext,
464
+ model_outputs_ext->hplanes[hplane],
465
+ (size_t)0,
466
+ as_kernel);
467
+ }
468
+
469
+ catch (...)
470
+ {
471
+ #pragma omp critical
472
+ {
473
+ if (!threw_exception)
474
+ {
475
+ threw_exception = true;
476
+ ex = std::current_exception();
477
+ }
478
+ }
479
+ }
480
+ }
481
+ }
482
+
483
+ check_interrupt_switch(ss);
484
+ #if defined(DONT_THROW_ON_INTERRUPT)
485
+ if (interrupt_switch) return;
486
+ #endif
487
+
488
+ if (threw_exception)
489
+ std::rethrow_exception(ex);
490
+
491
+ /* gather and transform the results */
492
+ gather_sim_result< PredictionData<real_t, sparse_ix>,
493
+ InputData<real_t, sparse_ix>,
494
+ WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t> >
495
+ (&worker_memory, NULL,
496
+ &prediction_data, NULL,
497
+ model_outputs, model_outputs_ext,
498
+ tmat, rmat, n_from,
499
+ ntrees, assume_full_distr,
500
+ standardize_dist, as_kernel, nthreads);
501
+
502
+ check_interrupt_switch(ss);
503
+ #if defined(DONT_THROW_ON_INTERRUPT)
504
+ if (interrupt_switch) return;
505
+ #endif
506
+ }
507
+
508
/* Recursively accumulates pairwise separation depths (or, when 'as_kernel' is true,
   counts of pairs landing in the same terminal node) for one single-variable tree.
   Rows indexed by workspace.ix_arr[st..end] are partitioned at each split and both
   halves are traversed recursively. Results accumulate into workspace.tmat_sep
   (full pairwise upper-triangular matrix) or workspace.rmat (rows-vs-reference
   matrix when workspace.n_from > 0). Assumes the workspace was prepared by
   'initialize_worker_for_sim'. */
template <class PredictionData, class ldouble_safe>
void traverse_tree_sim(WorkerForSimilarity &workspace,
                       PredictionData &prediction_data,
                       IsoForest &model_outputs,
                       std::vector<IsoTree> &trees,
                       size_t curr_tree,
                       const bool as_kernel)
{
    if (interrupt_switch)
        return;

    /* a single-element range has no pairs left to separate */
    if (workspace.st == workspace.end)
        return;

    if (workspace.tmat_sep.empty())
    {
        /* rmat mode: once the range no longer mixes 'from' rows (index < n_from)
           with 'to' rows (index >= n_from), no cross-group pair remains here */
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
        if (workspace.ix_arr[workspace.st] >= workspace.n_from)
            return;
        if (workspace.ix_arr[workspace.end] < workspace.n_from)
            return;
    }

    /* Note: the first separation step will not be added here, as it simply consists of adding +1
       to every combination regardless. It has to be added at the end in 'gather_sim_result' to
       obtain the average separation depth. */
    if (trees[curr_tree].tree_left == 0)   /* terminal node */
    {
        ldouble_safe rem = (ldouble_safe) trees[curr_tree].remainder;
        if (workspace.weights_arr.empty())
        {
            if (!as_kernel)
            {
                rem += (ldouble_safe)(workspace.end - workspace.st + 1);
                if (!workspace.tmat_sep.empty())
                    increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                          prediction_data.nrows, workspace.tmat_sep.data(),
                                          workspace.assume_full_distr? 3. : expected_separation_depth(rem));
                else if (!workspace.rmat.empty())
                    increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                    workspace.n_from, prediction_data.nrows, workspace.rmat.data(),
                                                    workspace.assume_full_distr? 3. : expected_separation_depth(rem));
            }

            else
            {
                /* kernel: each pair sharing this terminal node contributes +1 */
                if (!workspace.tmat_sep.empty())
                {
                    size_t i_, j_;
                    for (size_t i = workspace.st; i < workspace.end; i++)
                    {
                        i_ = workspace.ix_arr[i];
                        for (size_t j = i + 1; j <= workspace.end; j++)
                        {
                            j_ = workspace.ix_arr[j];
                            workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]++;
                        }
                    }
                }

                else if (!workspace.rmat.empty())
                {
                    /* ix_arr[st..end] is sorted (see check above), so the 'from'
                       rows occupy the first n_group positions of the range */
                    size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
                                                   std::lower_bound(workspace.ix_arr.begin() + workspace.st,
                                                                    workspace.ix_arr.begin() + workspace.end + 1,
                                                                    workspace.n_from));
                    double *restrict rmat_this;
                    for (size_t i = workspace.st; i < workspace.st + n_group; i++)
                    {
                        rmat_this = workspace.rmat.data() + workspace.ix_arr[i]*workspace.n_from;
                        for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
                        {
                            rmat_this[workspace.ix_arr[j] - workspace.n_from]++;
                        }
                    }
                }
            }
        }

        else
        {
            /* weighted variant: weights_arr is populated when missing_action is
               'Divide' or new_cat_action is 'Weighted' (see initialize_worker_for_sim) */
            if (!as_kernel)
            {
                if (!workspace.assume_full_distr)
                {
                    rem += std::accumulate(workspace.ix_arr.begin() + workspace.st,
                                           workspace.ix_arr.begin() + workspace.end,
                                           (ldouble_safe) 0.,
                                           [&workspace](ldouble_safe curr, size_t ix)
                                           {return curr + (ldouble_safe)workspace.weights_arr[ix];}
                                           );
                }

                if (!workspace.tmat_sep.empty())
                    increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                          prediction_data.nrows, workspace.tmat_sep.data(),
                                          workspace.weights_arr.data(),
                                          workspace.assume_full_distr? 3. : expected_separation_depth(rem));
                else if (!workspace.rmat.empty())
                    increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                    workspace.n_from, prediction_data.nrows,
                                                    workspace.rmat.data(), workspace.weights_arr.data(),
                                                    workspace.assume_full_distr? 3. : expected_separation_depth(rem));
            }

            else
            {
                /* weighted kernel: a pair's contribution is the product of its row weights */
                if (!workspace.tmat_sep.empty())
                {
                    size_t i_, j_;
                    double w_this;
                    for (size_t i = workspace.st; i < workspace.end; i++)
                    {
                        i_ = workspace.ix_arr[i];
                        w_this = workspace.weights_arr[i_];
                        for (size_t j = i + 1; j <= workspace.end; j++)
                        {
                            j_ = workspace.ix_arr[j];
                            workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]
                                +=
                            w_this * workspace.weights_arr[j_];
                        }
                    }
                }

                else if (!workspace.rmat.empty())
                {
                    size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
                                                   std::lower_bound(workspace.ix_arr.begin() + workspace.st,
                                                                    workspace.ix_arr.begin() + workspace.end + 1,
                                                                    workspace.n_from));
                    double *restrict rmat_this;
                    double w_this;
                    size_t i_, j_;
                    for (size_t i = workspace.st; i < workspace.st + n_group; i++)
                    {
                        i_ = workspace.ix_arr[i];
                        rmat_this = workspace.rmat.data() + i_*workspace.n_from;
                        w_this = workspace.weights_arr[i_];
                        for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
                        {
                            j_ = workspace.ix_arr[j];
                            rmat_this[j_ - workspace.n_from]
                                +=
                            w_this * workspace.weights_arr[j_];
                        }
                    }
                }
            }
        }
        return;
    }

    else if (curr_tree > 0 && !as_kernel)
    {
        /* non-root internal node: apply a -1 adjustment for rows still traveling
           together (counter semantics are defined in increase_comb_counter) */
        if (!workspace.tmat_sep.empty())
        {
            if (workspace.weights_arr.empty())
                increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                      prediction_data.nrows, workspace.tmat_sep.data(), -1.);
            else
                increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                      prediction_data.nrows, workspace.tmat_sep.data(),
                                      workspace.weights_arr.data(), -1.);
        }
        else if (!workspace.rmat.empty())
        {
            if (workspace.weights_arr.empty())
                increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                workspace.n_from, prediction_data.nrows, workspace.rmat.data(), -1.);
            else
                increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
                                                workspace.n_from, prediction_data.nrows,
                                                workspace.rmat.data(), workspace.weights_arr.data(), -1.);
        }
    }


    /* divide according to tree */
    if (prediction_data.Xc_indptr != NULL && !workspace.tmat_sep.empty())
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1); /* sparse split routines need sorted indices */
    size_t st_NA, end_NA, split_ix;
    switch (trees[curr_tree].col_type)
    {
        case Numeric:
        {
            if (prediction_data.Xc_indptr == NULL)
                divide_subset_split(workspace.ix_arr.data(),
                                    prediction_data.numeric_data + prediction_data.nrows * trees[curr_tree].col_num,
                                    workspace.st, workspace.end, trees[curr_tree].num_split,
                                    model_outputs.missing_action, st_NA, end_NA, split_ix);
            else
                divide_subset_split(workspace.ix_arr.data(), workspace.st, workspace.end, trees[curr_tree].col_num,
                                    prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
                                    trees[curr_tree].num_split, model_outputs.missing_action,
                                    st_NA, end_NA, split_ix);
            break;
        }

        case Categorical:
        {
            switch(model_outputs.cat_split_type)
            {
                case SingleCateg:
                {
                    divide_subset_split(workspace.ix_arr.data(),
                                        prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                        workspace.st, workspace.end, trees[curr_tree].chosen_cat,
                                        model_outputs.missing_action, st_NA, end_NA, split_ix);
                    break;
                }

                case SubSet:
                {
                    if (!trees[curr_tree].cat_split.size()) /* split stored without an explicit subset vector */
                        divide_subset_split(workspace.ix_arr.data(),
                                            prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                            workspace.st, workspace.end,
                                            model_outputs.missing_action, model_outputs.new_cat_action,
                                            trees[curr_tree].pct_tree_left < .5, st_NA, end_NA, split_ix);
                    else
                        divide_subset_split(workspace.ix_arr.data(),
                                            prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
                                            workspace.st, workspace.end, trees[curr_tree].cat_split.data(),
                                            (int) trees[curr_tree].cat_split.size(),
                                            model_outputs.missing_action, model_outputs.new_cat_action,
                                            (bool)(trees[curr_tree].pct_tree_left < .5), st_NA, end_NA, split_ix);
                    break;
                }
            }
            break;
        }

        default:
        {
            assert(0); /* unreachable: single-variable trees only split Numeric/Categorical columns */
            break;
        }
    }


    /* continue splitting recursively */
    size_t orig_end = workspace.end;
    if (model_outputs.new_cat_action == Weighted && model_outputs.cat_split_type == SubSet && prediction_data.categ_data != NULL) {
        /* 'Weighted' new-category handling reuses the weighted 'Divide' path below */
        if (model_outputs.missing_action == Fail && trees[curr_tree].col_type == Numeric) {
            st_NA = split_ix;
            end_NA = split_ix;
        }
        goto missing_action_divide;
    }
    switch (model_outputs.missing_action)
    {
        case Impute:
        {
            /* send NAs to the more-populated side, then handle like 'Fail' */
            split_ix = (trees[curr_tree].pct_tree_left >= .5)? end_NA : st_NA;
        }
        /* intentional fall-through */

        case Fail:
        {
            if (split_ix > workspace.st)
            {
                workspace.end = split_ix - 1;
                traverse_tree_sim<PredictionData, ldouble_safe>(
                                  workspace,
                                  prediction_data,
                                  model_outputs,
                                  trees,
                                  trees[curr_tree].tree_left,
                                  as_kernel);
            }


            if (split_ix <= orig_end)
            {
                workspace.st = split_ix;
                workspace.end = orig_end;
                traverse_tree_sim<PredictionData, ldouble_safe>(
                                  workspace,
                                  prediction_data,
                                  model_outputs,
                                  trees,
                                  trees[curr_tree].tree_right,
                                  as_kernel);
            }
            break;
        }

        case Divide: /* new_cat_action = 'Weighted' will also fall here */
        {
            /* TODO: this one should also have a parameter 'changed_weights' like during fitting */
            missing_action_divide:
            /* TODO: maybe here it shouldn't copy the whole ix_arr,
               but then it'd need to re-generate it from outside too */
            std::vector<double> weights_arr;
            std::vector<size_t> ix_arr;
            if (end_NA > workspace.st)
            {
                /* snapshot state so the right-branch recursion can restore it
                   after the left branch mutates weights/indices */
                weights_arr.assign(workspace.weights_arr.begin(),
                                   workspace.weights_arr.begin() + end_NA);
                ix_arr.assign(workspace.ix_arr.begin(),
                              workspace.ix_arr.begin() + end_NA);
            }

            if (end_NA > workspace.st)
            {
                workspace.end = end_NA - 1;
                /* NA rows go left with weight scaled by pct_tree_left */
                for (size_t row = st_NA; row < end_NA; row++)
                    workspace.weights_arr[workspace.ix_arr[row]] *= trees[curr_tree].pct_tree_left;
                traverse_tree_sim<PredictionData, ldouble_safe>(
                                  workspace,
                                  prediction_data,
                                  model_outputs,
                                  trees,
                                  trees[curr_tree].tree_left,
                                  as_kernel);
            }

            if (st_NA <= orig_end)
            {
                workspace.st = st_NA;
                workspace.end = orig_end;
                if (!weights_arr.empty())
                {
                    /* restore state saved above, then release the temporaries */
                    std::copy(weights_arr.begin(),
                              weights_arr.end(),
                              workspace.weights_arr.begin());
                    std::copy(ix_arr.begin(),
                              ix_arr.end(),
                              workspace.ix_arr.begin());
                    weights_arr.clear();
                    weights_arr.shrink_to_fit();
                    ix_arr.clear();
                    ix_arr.shrink_to_fit();
                }

                /* NA rows also go right, with the complementary weight */
                for (size_t row = st_NA; row < end_NA; row++)
                    workspace.weights_arr[workspace.ix_arr[row]] *= (1. - trees[curr_tree].pct_tree_left);
                traverse_tree_sim<PredictionData, ldouble_safe>(
                                  workspace,
                                  prediction_data,
                                  model_outputs,
                                  trees,
                                  trees[curr_tree].tree_right,
                                  as_kernel);
            }
            break;
        }
    }
}
857
+
858
/* Extended-model counterpart of 'traverse_tree_sim': recursively accumulates
   pairwise separation depths (or same-node counts when 'as_kernel' is true)
   for one extended tree by reconstructing each node's linear combination of
   columns and partitioning rows at its split point. Results accumulate into
   workspace.tmat_sep or workspace.rmat. Assumes the workspace was prepared by
   'initialize_worker_for_sim' (in particular, comb_val is allocated). */
template <class PredictionData, class ldouble_safe>
void traverse_hplane_sim(WorkerForSimilarity &workspace,
                         PredictionData &prediction_data,
                         ExtIsoForest &model_outputs,
                         std::vector<IsoHPlane> &hplanes,
                         size_t curr_tree,
                         const bool as_kernel)
{
    if (interrupt_switch)
        return;

    /* a single-element range has no pairs left to separate */
    if (workspace.st == workspace.end)
        return;

    if (workspace.tmat_sep.empty())
    {
        /* rmat mode: stop once the range no longer mixes 'from' rows
           (index < n_from) with 'to' rows (index >= n_from) */
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
        if (workspace.ix_arr[workspace.st] >= workspace.n_from)
            return;
        if (workspace.ix_arr[workspace.end] < workspace.n_from)
            return;
    }

    /* Note: the first separation step will not be added here, as it simply consists of adding +1
       to every combination regardless. It has to be added at the end in 'gather_sim_result' to
       obtain the average separation depth. */
    if (hplanes[curr_tree].hplane_left == 0)   /* terminal node */
    {
        if (!as_kernel)
        {
            if (!workspace.tmat_sep.empty())
                increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                      prediction_data.nrows, workspace.tmat_sep.data(),
                                      workspace.assume_full_distr? 3. :
                                      expected_separation_depth((ldouble_safe) hplanes[curr_tree].remainder
                                                                  + (ldouble_safe)(workspace.end - workspace.st + 1))
                                      );
            else if (!workspace.rmat.empty())
                increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
                                                prediction_data.nrows, workspace.rmat.data(),
                                                workspace.assume_full_distr? 3. :
                                                expected_separation_depth((ldouble_safe) hplanes[curr_tree].remainder
                                                                            + (ldouble_safe)(workspace.end - workspace.st + 1))
                                                );
        }

        else
        {
            /* kernel: each pair sharing this terminal node contributes +1 */
            if (!workspace.tmat_sep.empty())
            {
                size_t i_, j_;
                for (size_t i = workspace.st; i < workspace.end; i++)
                {
                    i_ = workspace.ix_arr[i];
                    for (size_t j = i + 1; j <= workspace.end; j++)
                    {
                        j_ = workspace.ix_arr[j];
                        workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]++;
                    }
                }
            }

            else if (!workspace.rmat.empty())
            {
                /* ix_arr[st..end] is sorted (see check above), so the 'from'
                   rows occupy the first n_group positions of the range */
                size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
                                               std::lower_bound(workspace.ix_arr.begin() + workspace.st,
                                                                workspace.ix_arr.begin() + workspace.end + 1,
                                                                workspace.n_from));
                double *restrict rmat_this;
                for (size_t i = workspace.st; i < workspace.st + n_group; i++)
                {
                    rmat_this = workspace.rmat.data() + workspace.ix_arr[i]*workspace.n_from;
                    for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
                    {
                        rmat_this[workspace.ix_arr[j] - workspace.n_from]++;
                    }
                }
            }
        }
        return;
    }

    else if (curr_tree > 0 && !as_kernel)
    {
        /* non-root internal node: apply a -1 adjustment for rows still traveling
           together (counter semantics are defined in increase_comb_counter) */
        if (!workspace.tmat_sep.empty())
            increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
                                  prediction_data.nrows, workspace.tmat_sep.data(), -1.);
        else if (!workspace.rmat.empty())
            increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
                                            prediction_data.nrows, workspace.rmat.data(), -1.);
    }

    if (prediction_data.Xc_indptr != NULL && workspace.tmat_sep.size())
        std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1); /* sparse routines need sorted indices */

    /* reconstruct linear combination */
    size_t ncols_numeric = 0;
    size_t ncols_categ = 0;
    std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0);
    double unused; /* placeholder fill value: never read when missing_action == Fail */
    if (prediction_data.categ_data != NULL || prediction_data.Xc_indptr != NULL)
    {
        for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
        {
            switch(hplanes[curr_tree].col_type[col])
            {
                case Numeric:
                {
                    if (prediction_data.Xc_indptr == NULL)
                        add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                                        prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
                                        hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
                                        (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
                                        model_outputs.missing_action, NULL, NULL, false);
                    else
                        add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
                                        hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
                                        prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
                                        hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
                                        (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
                                        model_outputs.missing_action, NULL, NULL, false);
                    ncols_numeric++; /* coef/mean are indexed per numeric column used, not per 'col' */
                    break;
                }

                case Categorical:
                {
                    switch(model_outputs.cat_split_type)
                    {
                        case SingleCateg:
                        {
                            add_linear_comb<ldouble_safe>(
                                            workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                                            prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
                                            (int)0, NULL, hplanes[curr_tree].fill_new[ncols_categ],
                                            hplanes[curr_tree].chosen_cat[ncols_categ],
                                            (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
                                            workspace.comb_val[0], NULL, NULL, model_outputs.new_cat_action,
                                            model_outputs.missing_action, SingleCateg, false);
                            break;
                        }

                        case SubSet:
                        {
                            add_linear_comb<ldouble_safe>(
                                            workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                                            prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
                                            (int) hplanes[curr_tree].cat_coef[ncols_categ].size(),
                                            hplanes[curr_tree].cat_coef[ncols_categ].data(), (double) 0, (int) 0,
                                            (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
                                            hplanes[curr_tree].fill_new[ncols_categ], NULL, NULL,
                                            model_outputs.new_cat_action, model_outputs.missing_action, SubSet, false);
                            break;
                        }
                    }
                    ncols_categ++;
                    break;
                }

                default:
                {
                    assert(0); /* unreachable: column types are only Numeric/Categorical */
                    break;
                }
            }
        }
    }


    else /* faster version for numerical-only */
    {
        for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
            add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
                            prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
                            hplanes[curr_tree].coef[col], (double)0, hplanes[curr_tree].mean[col],
                            (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
                            model_outputs.missing_action, NULL, NULL, false);
    }

    /* divide data */
    size_t split_ix = divide_subset_split(workspace.ix_arr.data(), workspace.comb_val.data(),
                                          workspace.st, workspace.end, hplanes[curr_tree].split_point);

    /* continue splitting recursively */
    size_t orig_end = workspace.end;
    if (split_ix > workspace.st)
    {
        workspace.end = split_ix - 1;
        traverse_hplane_sim<PredictionData, ldouble_safe>(
                            workspace,
                            prediction_data,
                            model_outputs,
                            hplanes,
                            hplanes[curr_tree].hplane_left,
                            as_kernel);
    }

    if (split_ix <= orig_end)
    {
        workspace.st = split_ix;
        workspace.end = orig_end;
        traverse_hplane_sim<PredictionData, ldouble_safe>(
                            workspace,
                            prediction_data,
                            model_outputs,
                            hplanes,
                            hplanes[curr_tree].hplane_right,
                            as_kernel);
    }

}
1069
+
1070
/* Reduces per-worker accumulators into the caller's output buffers and applies
   the final transformation:
   - sums (or, single-threaded, copies) each worker's tmat_sep/rmat into tmat/rmat;
   - if 'standardize_dist': kernel results are divided by the tree count, distance
     results are mapped through 2^(-depth / div_trees);
   - otherwise: raw separation depths are averaged over trees (with the implicit
     +1 first hop added back, see note below).
   Exactly one of 'worker_memory' (similarity workers) / 'worker_memory_m'
   (fitting-time workers) is used; exactly one of tmat / rmat receives output. */
template <class PredictionData, class InputData, class WorkerMemory>
void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
                       std::vector<WorkerMemory> *worker_memory_m,
                       PredictionData *prediction_data, InputData *input_data,
                       IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
                       double *restrict tmat, double *restrict rmat, size_t n_from,
                       size_t ntrees, bool assume_full_distr,
                       bool standardize_dist, bool as_kernel, int nthreads)
{
    if (interrupt_switch)
        return;

    size_t nrows = (prediction_data != NULL)? prediction_data->nrows : input_data->nrows;
    size_t ncomb = calc_ncomb(nrows);
    size_t n_to = (prediction_data != NULL)? (prediction_data->nrows - n_from) : 0;

    #ifdef _OPENMP
    if (nthreads > 1)
    {
        /* multi-threaded: add every worker's private accumulator into the output */
        if (worker_memory != NULL)
        {
            for (WorkerForSimilarity &w : *worker_memory)
            {
                if (!w.tmat_sep.empty())
                {
                    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory)
                    for (size_t_for ix = 0; ix < (decltype(ix))ncomb; ix++)
                        tmat[ix] += w.tmat_sep[ix];
                }
                else if (!w.rmat.empty())
                {
                    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(rmat, w, worker_memory)
                    for (size_t_for ix = 0; ix < (decltype(ix))w.rmat.size(); ix++)
                        rmat[ix] += w.rmat[ix];
                }
            }
        }

        else
        {
            for (WorkerMemory &w : *worker_memory_m)
            {
                if (!w.tmat_sep.empty())
                {
                    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory_m)
                    for (size_t_for ix = 0; ix < (decltype(ix))ncomb; ix++)
                        tmat[ix] += w.tmat_sep[ix];
                }
            }
        }
    }

    else
    #endif
    {
        /* single worker: a plain copy suffices (no accumulation needed) */
        if (worker_memory != NULL)
        {
            if (!(*worker_memory)[0].tmat_sep.empty())
                std::copy((*worker_memory)[0].tmat_sep.begin(), (*worker_memory)[0].tmat_sep.end(), tmat);
            else
                std::copy((*worker_memory)[0].rmat.begin(), (*worker_memory)[0].rmat.end(), rmat);
        }

        else
        {
            std::copy((*worker_memory_m)[0].tmat_sep.begin(), (*worker_memory_m)[0].tmat_sep.end(), tmat);
        }
    }

    double ntrees_dbl = (double) ntrees;
    if (standardize_dist)
    {
        if (as_kernel)
        {
            /* kernel standardization: average same-node counts over trees */
            if (tmat != NULL)
                for (size_t ix = 0; ix < ncomb; ix++)
                    tmat[ix] /= ntrees_dbl;
            else
                for (size_t ix = 0; ix < (n_from * n_to); ix++)
                    rmat[ix] /= ntrees_dbl;
            return;
        }


        /* Note: the separation distances up this point are missing the first hop, which is always
           a +1 to every combination. Thus, it needs to be added back for the average separation depth.
           For the standardized metric, it takes the expected divisor as 2(=3-1) instead of 3, given
           that every combination will always get a +1 at the beginning. Since what's obtained here
           is a sum across all trees, adding this +1 means adding the number of trees. */
        double div_trees = ntrees_dbl;
        if (assume_full_distr)
        {
            div_trees *= 2;
        }

        else if (input_data != NULL)
        {
            div_trees *= (expected_separation_depth(input_data->nrows) - 1);
        }

        else
        {
            /* adjust the expected depth for the sample size the model was fit on
               plus the rows being predicted */
            div_trees *= ((
                (model_outputs != NULL)?
                    expected_separation_depth_hotstart(model_outputs->exp_avg_sep,
                                                       model_outputs->orig_sample_size,
                                                       model_outputs->orig_sample_size + prediction_data->nrows)
                        :
                    expected_separation_depth_hotstart(model_outputs_ext->exp_avg_sep,
                                                       model_outputs_ext->orig_sample_size,
                                                       model_outputs_ext->orig_sample_size + prediction_data->nrows)
            ) - 1);
        }


        if (tmat != NULL)
            #ifndef _WIN32
            #pragma omp simd
            #endif
            for (size_t ix = 0; ix < ncomb; ix++)
                tmat[ix] = std::exp2( - tmat[ix] / div_trees);
        else
            #ifndef _WIN32
            #pragma omp simd
            #endif
            for (size_t ix = 0; ix < (n_from * n_to); ix++)
                rmat[ix] = std::exp2( - rmat[ix] / div_trees);
    }

    else
    {
        /* unstandardized kernel counts are returned as-is */
        if (as_kernel) return;

        /* average separation depth: add back the implicit +1 per tree, then average */
        if (tmat != NULL)
            #ifndef _WIN32
            #pragma omp simd
            #endif
            for (size_t ix = 0; ix < ncomb; ix++)
                tmat[ix] = (tmat[ix] + ntrees) / ntrees_dbl;
        else
            #ifndef _WIN32
            #pragma omp simd
            #endif
            for (size_t ix = 0; ix < (n_from * n_to); ix++)
                rmat[ix] = (rmat[ix] + ntrees) / ntrees_dbl;
    }
}
1217
+
1218
+ template <class PredictionData>
1219
+ void initialize_worker_for_sim(WorkerForSimilarity &workspace,
1220
+ PredictionData &prediction_data,
1221
+ IsoForest *model_outputs,
1222
+ ExtIsoForest *model_outputs_ext,
1223
+ size_t n_from,
1224
+ bool assume_full_distr)
1225
+ {
1226
+ workspace.st = 0;
1227
+ workspace.end = prediction_data.nrows - 1;
1228
+ workspace.n_from = n_from;
1229
+ workspace.assume_full_distr = assume_full_distr; /* doesn't need to have one copy per worker */
1230
+
1231
+ if (workspace.ix_arr.empty())
1232
+ {
1233
+ workspace.ix_arr.resize(prediction_data.nrows);
1234
+ std::iota(workspace.ix_arr.begin(), workspace.ix_arr.end(), (size_t)0);
1235
+ if (!n_from)
1236
+ workspace.tmat_sep.resize(calc_ncomb(prediction_data.nrows), 0);
1237
+ else
1238
+ workspace.rmat.resize((prediction_data.nrows - n_from) * n_from, 0);
1239
+ }
1240
+
1241
+ if (model_outputs != NULL &&
1242
+ (model_outputs->missing_action == Divide ||
1243
+ (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && prediction_data.categ_data != NULL)))
1244
+ {
1245
+ if (workspace.weights_arr.empty())
1246
+ workspace.weights_arr.resize(prediction_data.nrows, 1.);
1247
+ else
1248
+ std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), 1.);
1249
+ }
1250
+
1251
+ if (model_outputs_ext != NULL)
1252
+ {
1253
+ if (workspace.comb_val.empty())
1254
+ workspace.comb_val.resize(prediction_data.nrows, 0);
1255
+ else
1256
+ std::fill(workspace.comb_val.begin(), workspace.comb_val.end(), 0);
1257
+ }
1258
+ }
1259
+
1260
+ template <class real_t, class sparse_ix>
1261
+ void calc_similarity_from_indexer
1262
+ (
1263
+ real_t *restrict numeric_data, int *restrict categ_data,
1264
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1265
+ size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
1266
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1267
+ double *restrict tmat, double *restrict rmat, size_t n_from,
1268
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
1269
+ )
1270
+ {
1271
+ SignalSwitcher ss;
1272
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
1273
+ std::vector<sparse_ix> terminal_indices(nrows * ntrees);
1274
+ std::unique_ptr<double[]> ignored(new double[nrows]);
1275
+ predict_iforest(numeric_data, categ_data,
1276
+ is_col_major, ld_numeric, ld_categ,
1277
+ is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
1278
+ is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
1279
+ nrows, nthreads, false,
1280
+ model_outputs, model_outputs_ext,
1281
+ ignored.get(), terminal_indices.data(),
1282
+ (double*)NULL,
1283
+ indexer);
1284
+ ignored.reset();
1285
+
1286
+ #ifndef _OPENMP
1287
+ nthreads = 1;
1288
+ #endif
1289
+
1290
+ check_interrupt_switch(ss);
1291
+
1292
+ if (n_from == 0)
1293
+ {
1294
+ size_t ncomb = calc_ncomb(nrows);
1295
+ std::fill_n(tmat, ncomb, 0.);
1296
+
1297
+ std::vector<std::vector<double>> sum_separations(nthreads);
1298
+ if (nthreads != 1) {
1299
+ for (auto &v : sum_separations) v.resize(ncomb);
1300
+ }
1301
+
1302
+ std::vector<std::vector<size_t>> thread_argsorted_nodes(nthreads);
1303
+ for (auto &v : thread_argsorted_nodes) v.resize(nrows);
1304
+
1305
+ std::vector<std::vector<size_t>> thread_sorted_nodes(nthreads);
1306
+ for (auto &v : thread_sorted_nodes) v.reserve(nrows); /* <- could shrink to max number of terminal nodes */
1307
+
1308
+
1309
+ bool threw_exception = false;
1310
+ std::exception_ptr ex = NULL;
1311
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
1312
+ shared(model_outputs, model_outputs_ext, nthreads, indexer, nrows, ncomb, terminal_indices, \
1313
+ sum_separations, thread_argsorted_nodes, thread_sorted_nodes, tmat, \
1314
+ threw_exception, ex)
1315
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
1316
+ {
1317
+ if (interrupt_switch || threw_exception) continue;
1318
+
1319
+ if (unlikely(indexer->indices[tree].n_terminal <= 1))
1320
+ {
1321
+ for (auto &el : sum_separations[omp_get_thread_num()]) el += 1.;
1322
+ continue;
1323
+ }
1324
+
1325
+ double *restrict ptr_this_sep = sum_separations[omp_get_thread_num()].data();
1326
+ if (nthreads == 1) ptr_this_sep = tmat;
1327
+ double *restrict node_dist_this = indexer->indices[tree].node_distances.data();
1328
+ double *restrict node_depths_this = indexer->indices[tree].node_depths.data();
1329
+ size_t n_terminal_this = indexer->indices[tree].n_terminal;
1330
+ size_t ncomb_this = calc_ncomb(n_terminal_this);
1331
+ std::vector<IsoTree> *tree_this = (model_outputs != NULL)? &model_outputs->trees[tree] : nullptr;
1332
+ std::vector<IsoHPlane> *hplane_this = (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : nullptr;
1333
+ sparse_ix *restrict terminal_indices_this = terminal_indices.data() + nrows * tree;
1334
+ size_t i, j;
1335
+ double add_round;
1336
+
1337
+ if (assume_full_distr)
1338
+ {
1339
+ for (size_t el1 = 0; el1 < nrows-1; el1++)
1340
+ {
1341
+ i = terminal_indices_this[el1];
1342
+ for (size_t el2 = el1+1; el2 < nrows; el2++)
1343
+ {
1344
+ j = terminal_indices_this[el2];
1345
+ if (unlikely(i == j))
1346
+ add_round = node_depths_this[i] + 3.;
1347
+ else
1348
+ add_round = node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1349
+ ptr_this_sep[ix_comb(el1, el2, nrows, ncomb)] += add_round;
1350
+ }
1351
+ }
1352
+ }
1353
+
1354
+ else
1355
+ {
1356
+ hashed_set<size_t> nodes_w_repeated;
1357
+ try
1358
+ {
1359
+ nodes_w_repeated.reserve(n_terminal_this);
1360
+ for (size_t el1 = 0; el1 < nrows-1; el1++)
1361
+ {
1362
+ i = terminal_indices_this[el1];
1363
+ for (size_t el2 = el1+1; el2 < nrows; el2++)
1364
+ {
1365
+ j = terminal_indices_this[el2];
1366
+ if (unlikely(i == j))
1367
+ nodes_w_repeated.insert(i);
1368
+ else
1369
+ ptr_this_sep[ix_comb(el1, el2, nrows, ncomb)]
1370
+ +=
1371
+ node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1372
+ }
1373
+ }
1374
+ }
1375
+
1376
+ catch (...)
1377
+ {
1378
+ #pragma omp critical
1379
+ {
1380
+ if (!threw_exception)
1381
+ {
1382
+ threw_exception = true;
1383
+ ex = std::current_exception();
1384
+ }
1385
+ }
1386
+ }
1387
+
1388
+ if (likely(!nodes_w_repeated.empty()))
1389
+ {
1390
+ std::vector<size_t> *restrict argsorted_nodes = &thread_argsorted_nodes[omp_get_thread_num()];
1391
+ std::iota(argsorted_nodes->begin(), argsorted_nodes->end(), (size_t)0);
1392
+ std::sort(argsorted_nodes->begin(), argsorted_nodes->end(),
1393
+ [&terminal_indices_this](const size_t a, const size_t b)
1394
+ {return terminal_indices_this[a] < terminal_indices_this[b];});
1395
+ std::vector<size_t>::iterator curr_begin = argsorted_nodes->begin();
1396
+ std::vector<size_t>::iterator new_begin;
1397
+
1398
+ std::vector<size_t> *restrict sorted_nodes = &thread_sorted_nodes[omp_get_thread_num()];
1399
+ sorted_nodes->assign(nodes_w_repeated.begin(), nodes_w_repeated.end());
1400
+ std::sort(sorted_nodes->begin(), sorted_nodes->end());
1401
+ for (size_t node_ix : *sorted_nodes)
1402
+ {
1403
+ curr_begin = std::lower_bound(curr_begin, argsorted_nodes->end(),
1404
+ node_ix,
1405
+ [&terminal_indices_this](const size_t &a, const size_t &b)
1406
+ {return (size_t)terminal_indices_this[a] < b;});
1407
+ new_begin = std::upper_bound(curr_begin, argsorted_nodes->end(),
1408
+ node_ix,
1409
+ [&terminal_indices_this](const size_t &a, const size_t &b)
1410
+ {return a < (size_t)terminal_indices_this[b];});
1411
+ size_t n_this = std::distance(curr_begin, new_begin);
1412
+ double sep_this
1413
+ =
1414
+ n_this
1415
+ +
1416
+ ((tree_this != NULL)?
1417
+ (*tree_this)[node_ix].remainder
1418
+ :
1419
+ (*hplane_this)[node_ix].remainder);
1420
+ double sep_this_ = expected_separation_depth(sep_this) + node_depths_this[node_ix];
1421
+
1422
+ size_t i, j;
1423
+ for (size_t el1 = 0; el1 < n_this-1; el1++)
1424
+ {
1425
+ i = *(curr_begin + el1);
1426
+ for (size_t el2 = el1+1; el2 < n_this; el2++)
1427
+ {
1428
+ j = *(curr_begin + el2);
1429
+ ptr_this_sep[ix_comb(i, j, nrows, ncomb)] += sep_this_;
1430
+ }
1431
+ }
1432
+
1433
+ curr_begin = new_begin;
1434
+ }
1435
+ }
1436
+
1437
+ }
1438
+ }
1439
+
1440
+ check_interrupt_switch(ss);
1441
+
1442
+ if (threw_exception)
1443
+ std::rethrow_exception(ex);
1444
+
1445
+ if (nthreads == 1)
1446
+ {
1447
+ /* Here 'tmat' already contains the sum of separations */
1448
+ }
1449
+
1450
+ else
1451
+ {
1452
+ for (int tid = 0; tid < nthreads; tid++)
1453
+ {
1454
+ double *restrict seps_thread = sum_separations[tid].data();
1455
+ for (size_t ix = 0; ix < ncomb; ix++)
1456
+ tmat[ix] += seps_thread[ix];
1457
+ }
1458
+ }
1459
+
1460
+ check_interrupt_switch(ss);
1461
+
1462
+ if (standardize_dist)
1463
+ {
1464
+ double divisor;
1465
+ if (assume_full_distr)
1466
+ divisor = (double)(ntrees * 2);
1467
+ else
1468
+ divisor = (double)ntrees * ((model_outputs != NULL)? model_outputs->exp_avg_sep : model_outputs_ext->exp_avg_sep);
1469
+
1470
+ if (assume_full_distr)
1471
+ {
1472
+ double ntrees_dbl = (double)ntrees;
1473
+ #ifndef _WIN32
1474
+ #pragma omp simd
1475
+ #endif
1476
+ for (size_t ix = 0; ix < ncomb; ix++)
1477
+ tmat[ix] = std::exp2( - (tmat[ix] - ntrees_dbl) / divisor);
1478
+ }
1479
+
1480
+ else
1481
+ {
1482
+ #ifndef _WIN32
1483
+ #pragma omp simd
1484
+ #endif
1485
+ for (size_t ix = 0; ix < ncomb; ix++)
1486
+ tmat[ix] = std::exp2( - tmat[ix] / divisor);
1487
+ }
1488
+ }
1489
+
1490
+ else
1491
+ {
1492
+ double divisor = (double)ntrees;
1493
+ for (size_t ix = 0; ix < ncomb; ix++)
1494
+ tmat[ix] /= divisor;
1495
+ }
1496
+
1497
+ check_interrupt_switch(ss);
1498
+ }
1499
+
1500
+ /* TODO: merge this with the block above, can simplify lots of things by a couple if-elses */
1501
+ else /* has 'rmat' / 'nfrom>0' */
1502
+ {
1503
+ size_t n_to = nrows - n_from;
1504
+ size_t ncomb = n_from * n_to;
1505
+ std::fill_n(rmat, ncomb, 0.);
1506
+
1507
+ std::vector<std::vector<double>> sum_separations(nthreads);
1508
+ if (nthreads != 1) {
1509
+ for (auto &v : sum_separations) v.resize(ncomb);
1510
+ }
1511
+
1512
+ std::vector<std::vector<size_t>> thread_argsorted_nodes(nthreads);
1513
+ for (auto &v : thread_argsorted_nodes) v.resize(nrows);
1514
+
1515
+ std::vector<std::vector<size_t>> thread_doubly_argsorted(nthreads);
1516
+ for (auto &v : thread_doubly_argsorted) v.reserve(nrows);
1517
+
1518
+ std::vector<std::vector<size_t>> thread_sorted_nodes(nthreads);
1519
+ for (auto &v : thread_sorted_nodes) v.reserve(nrows); /* <- could shrink to max number of terminal nodes */
1520
+
1521
+ bool threw_exception = false;
1522
+ std::exception_ptr ex = NULL;
1523
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
1524
+ shared(model_outputs, model_outputs_ext, nthreads, indexer, nrows, ncomb, terminal_indices, \
1525
+ sum_separations, thread_argsorted_nodes, thread_sorted_nodes, thread_doubly_argsorted, rmat, n_to, n_from, \
1526
+ threw_exception, ex)
1527
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
1528
+ {
1529
+ if (interrupt_switch || threw_exception) continue;
1530
+
1531
+ if (unlikely(indexer->indices[tree].n_terminal <= 1))
1532
+ {
1533
+ for (auto &el : sum_separations[omp_get_thread_num()]) el += 1.;
1534
+ continue;
1535
+ }
1536
+
1537
+ double *restrict ptr_this_sep = sum_separations[omp_get_thread_num()].data();
1538
+ if (nthreads == 1) ptr_this_sep = rmat;
1539
+ double *restrict node_dist_this = indexer->indices[tree].node_distances.data();
1540
+ double *restrict node_depths_this = indexer->indices[tree].node_depths.data();
1541
+ size_t n_terminal_this = indexer->indices[tree].n_terminal;
1542
+ size_t ncomb_this = calc_ncomb(n_terminal_this);
1543
+ std::vector<IsoTree> *tree_this = (model_outputs != NULL)? &model_outputs->trees[tree] : nullptr;
1544
+ std::vector<IsoHPlane> *hplane_this = (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : nullptr;
1545
+ sparse_ix *restrict terminal_indices_this = terminal_indices.data() + nrows * tree;
1546
+ size_t i, j;
1547
+ double add_round;
1548
+
1549
+ if (assume_full_distr)
1550
+ {
1551
+ for (size_t el1 = 0; el1 < n_from; el1++)
1552
+ {
1553
+ i = terminal_indices_this[el1];
1554
+ double *ptr_this_sep_ = ptr_this_sep + el1*n_to;
1555
+ for (size_t el2 = n_from; el2 < nrows; el2++)
1556
+ {
1557
+ j = terminal_indices_this[el2];
1558
+ if (unlikely(i == j))
1559
+ add_round = node_depths_this[i] + 3.;
1560
+ else
1561
+ add_round = node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1562
+ ptr_this_sep_[el2-n_from] += add_round;
1563
+ }
1564
+ }
1565
+ }
1566
+
1567
+ else
1568
+ {
1569
+ hashed_set<size_t> nodes_w_repeated;
1570
+ try
1571
+ {
1572
+ nodes_w_repeated.reserve(n_terminal_this);
1573
+ for (size_t el1 = 0; el1 < n_from; el1++)
1574
+ {
1575
+ i = terminal_indices_this[el1];
1576
+ double *ptr_this_sep_ = ptr_this_sep + el1*n_to;
1577
+ for (size_t el2 = n_from; el2 < nrows; el2++)
1578
+ {
1579
+ j = terminal_indices_this[el2];
1580
+ if (unlikely(i == j))
1581
+ nodes_w_repeated.insert(i);
1582
+ else
1583
+ ptr_this_sep_[el2-n_from]
1584
+ +=
1585
+ node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1586
+ }
1587
+ }
1588
+
1589
+ if (likely(!nodes_w_repeated.empty()))
1590
+ {
1591
+ std::vector<size_t> *restrict argsorted_nodes = &thread_argsorted_nodes[omp_get_thread_num()];
1592
+ std::iota(argsorted_nodes->begin(), argsorted_nodes->end(), (size_t)0);
1593
+ std::sort(argsorted_nodes->begin(), argsorted_nodes->end(),
1594
+ [&terminal_indices_this](const size_t a, const size_t b)
1595
+ {return terminal_indices_this[a] < terminal_indices_this[b];});
1596
+ std::vector<size_t>::iterator curr_begin = argsorted_nodes->begin();
1597
+ std::vector<size_t>::iterator new_begin;
1598
+
1599
+ std::vector<size_t> *restrict sorted_nodes = &thread_sorted_nodes[omp_get_thread_num()];
1600
+ sorted_nodes->assign(nodes_w_repeated.begin(), nodes_w_repeated.end());
1601
+ std::sort(sorted_nodes->begin(), sorted_nodes->end());
1602
+ for (size_t node_ix : *sorted_nodes)
1603
+ {
1604
+ curr_begin = std::lower_bound(curr_begin, argsorted_nodes->end(),
1605
+ node_ix,
1606
+ [&terminal_indices_this](const size_t &a, const size_t &b)
1607
+ {return (size_t)terminal_indices_this[a] < b;});
1608
+ new_begin = std::upper_bound(curr_begin, argsorted_nodes->end(),
1609
+ node_ix,
1610
+ [&terminal_indices_this](const size_t &a, const size_t &b)
1611
+ {return a < (size_t)terminal_indices_this[b];});
1612
+ size_t n_this = std::distance(curr_begin, new_begin);
1613
+ if (unlikely(!n_this)) unexpected_error();
1614
+ double sep_this
1615
+ =
1616
+ n_this
1617
+ +
1618
+ ((tree_this != NULL)?
1619
+ (*tree_this)[node_ix].remainder
1620
+ :
1621
+ (*hplane_this)[node_ix].remainder);
1622
+ double sep_this_ = expected_separation_depth(sep_this) + node_depths_this[node_ix];
1623
+
1624
+ std::vector<size_t> *restrict doubly_argsorted = &thread_doubly_argsorted[omp_get_thread_num()];
1625
+ doubly_argsorted->assign(curr_begin, curr_begin + n_this);
1626
+ std::sort(doubly_argsorted->begin(), doubly_argsorted->end());
1627
+ std::vector<size_t>::iterator pos_n_from = std::lower_bound(doubly_argsorted->begin(),
1628
+ doubly_argsorted->end(),
1629
+ n_from);
1630
+ if (pos_n_from == doubly_argsorted->end()) unexpected_error();
1631
+ size_t n1 = std::distance(doubly_argsorted->begin(), pos_n_from);
1632
+ size_t i, j;
1633
+ double *ptr_this_sep__;
1634
+ for (size_t el1 = 0; el1 < n1; el1++)
1635
+ {
1636
+ i = (*doubly_argsorted)[el1];
1637
+ ptr_this_sep__ = ptr_this_sep + i*n_to;
1638
+ for (size_t el2 = n1; el2 < n_this; el2++)
1639
+ {
1640
+ j = (*doubly_argsorted)[el2];
1641
+ ptr_this_sep__[j-n_from] += sep_this_;
1642
+ }
1643
+ }
1644
+
1645
+ curr_begin = new_begin;
1646
+ }
1647
+ }
1648
+ }
1649
+
1650
+ catch (...)
1651
+ {
1652
+ #pragma omp critical
1653
+ {
1654
+ if (!threw_exception)
1655
+ {
1656
+ threw_exception = true;
1657
+ ex = std::current_exception();
1658
+ }
1659
+ }
1660
+ }
1661
+ }
1662
+ }
1663
+
1664
+ check_interrupt_switch(ss);
1665
+
1666
+ if (threw_exception)
1667
+ std::rethrow_exception(ex);
1668
+
1669
+ if (nthreads == 1)
1670
+ {
1671
+ /* Here 'rmat' already contains the sum of separations */
1672
+ }
1673
+
1674
+ else
1675
+ {
1676
+ for (int tid = 0; tid < nthreads; tid++)
1677
+ {
1678
+ double *restrict seps_thread = sum_separations[tid].data();
1679
+ for (size_t ix = 0; ix < ncomb; ix++)
1680
+ rmat[ix] += seps_thread[ix];
1681
+ }
1682
+ }
1683
+
1684
+ check_interrupt_switch(ss);
1685
+
1686
+ if (standardize_dist)
1687
+ {
1688
+ double divisor;
1689
+ if (assume_full_distr)
1690
+ divisor = (double)(ntrees * 2);
1691
+ else
1692
+ divisor = (double)ntrees * ((model_outputs != NULL)? model_outputs->exp_avg_sep : model_outputs_ext->exp_avg_sep);
1693
+
1694
+ if (assume_full_distr)
1695
+ {
1696
+ double ntrees_dbl = (double)ntrees;
1697
+ #ifndef _WIN32
1698
+ #pragma omp simd
1699
+ #endif
1700
+ for (size_t ix = 0; ix < ncomb; ix++)
1701
+ rmat[ix] = std::exp2( - (rmat[ix] - ntrees_dbl) / divisor);
1702
+ }
1703
+
1704
+ else
1705
+ {
1706
+ #ifndef _WIN32
1707
+ #pragma omp simd
1708
+ #endif
1709
+ for (size_t ix = 0; ix < ncomb; ix++)
1710
+ rmat[ix] = std::exp2( - rmat[ix] / divisor);
1711
+ }
1712
+ }
1713
+
1714
+ else
1715
+ {
1716
+ double divisor = (double)ntrees;
1717
+ for (size_t ix = 0; ix < ncomb; ix++)
1718
+ rmat[ix] /= divisor;
1719
+ }
1720
+
1721
+ check_interrupt_switch(ss);
1722
+ }
1723
+ }
1724
+
1725
+ template <class real_t, class sparse_ix>
1726
+ void calc_similarity_from_indexer_with_references
1727
+ (
1728
+ real_t *restrict numeric_data, int *restrict categ_data,
1729
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1730
+ size_t nrows, int nthreads, bool standardize_dist,
1731
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1732
+ double *restrict rmat,
1733
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
1734
+ )
1735
+ {
1736
+ size_t n_ref = get_number_of_reference_points(*indexer);
1737
+ if (unlikely(!n_ref)) unexpected_error();
1738
+
1739
+ SignalSwitcher ss;
1740
+
1741
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
1742
+ std::vector<sparse_ix> terminal_indices(nrows * ntrees);
1743
+ std::unique_ptr<double[]> ignored(new double[nrows]);
1744
+ predict_iforest(numeric_data, categ_data,
1745
+ is_col_major, ld_numeric, ld_categ,
1746
+ is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
1747
+ is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
1748
+ nrows, nthreads, false,
1749
+ model_outputs, model_outputs_ext,
1750
+ ignored.get(), terminal_indices.data(),
1751
+ (double*)NULL,
1752
+ indexer);
1753
+ ignored.reset();
1754
+
1755
+ #ifndef _OPENMP
1756
+ nthreads = 1;
1757
+ #endif
1758
+
1759
+ check_interrupt_switch(ss);
1760
+
1761
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
1762
+ shared(rmat, terminal_indices, nrows, n_ref, indexer, ntrees)
1763
+ for (size_t_for row = 0; row < (decltype(row))nrows; row++)
1764
+ {
1765
+ if (interrupt_switch) continue;
1766
+
1767
+ size_t i, j;
1768
+ size_t n_terminal_this;
1769
+ size_t ncomb_this;
1770
+ size_t *restrict ref_this;
1771
+ sparse_ix *restrict ind_this;
1772
+ double *restrict node_depths_this;
1773
+ double *restrict node_dist_this;
1774
+ double *rmat_this = rmat + row*n_ref;
1775
+ memset(rmat_this, 0, n_ref*sizeof(double));
1776
+ for (size_t tree = 0; tree < ntrees; tree++)
1777
+ {
1778
+ ref_this = indexer->indices[tree].reference_points.data();
1779
+ ind_this = terminal_indices.data() + tree*nrows;
1780
+ node_depths_this = indexer->indices[tree].node_depths.data();
1781
+ n_terminal_this = indexer->indices[tree].n_terminal;
1782
+ node_dist_this = indexer->indices[tree].node_distances.data();
1783
+ ncomb_this = calc_ncomb(n_terminal_this);
1784
+ for (size_t ref = 0; ref < n_ref; ref++)
1785
+ {
1786
+ i = ind_this[row];
1787
+ j = ref_this[ref];
1788
+
1789
+ if (unlikely(i == j))
1790
+ rmat_this[ref] += node_depths_this[i] + 3.;
1791
+ else
1792
+ rmat_this[ref] += node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1793
+ }
1794
+ }
1795
+ }
1796
+
1797
+ check_interrupt_switch(ss);
1798
+
1799
+ size_t size_rmat = nrows * n_ref;
1800
+ if (standardize_dist)
1801
+ {
1802
+ double ntrees_dbl = (double)ntrees;
1803
+ double div_trees = (double)(mult2(ntrees));
1804
+ #ifndef _WIN32
1805
+ #pragma omp simd
1806
+ #endif
1807
+ for (size_t ix = 0; ix < size_rmat; ix++)
1808
+ rmat[ix] = std::exp2( - (rmat[ix] - ntrees_dbl) / div_trees);
1809
+ }
1810
+
1811
+ else
1812
+ {
1813
+ double div_trees = (double)ntrees;
1814
+ for (size_t ix = 0; ix < size_rmat; ix++)
1815
+ rmat[ix] /= div_trees;
1816
+ }
1817
+
1818
+ check_interrupt_switch(ss);
1819
+ }
1820
+
1821
+ template <class real_t, class sparse_ix>
1822
+ void kernel_to_references(TreesIndexer &indexer,
1823
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1824
+ real_t *restrict numeric_data, int *restrict categ_data,
1825
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1826
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1827
+ size_t nrows, int nthreads,
1828
+ double *restrict rmat,
1829
+ bool standardize)
1830
+ {
1831
+ size_t ntrees = indexer.indices.size();
1832
+ size_t n_ref = indexer.indices.front().reference_points.size();
1833
+
1834
+ SignalSwitcher ss;
1835
+
1836
+ std::unique_ptr<sparse_ix[]> terminal_indices(new sparse_ix[nrows*ntrees]);
1837
+ std::unique_ptr<double[]> ignored(new double[nrows]);
1838
+ predict_iforest(numeric_data, categ_data,
1839
+ is_col_major, ld_numeric, ld_categ,
1840
+ is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
1841
+ is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
1842
+ nrows, nthreads, false,
1843
+ model_outputs, model_outputs_ext,
1844
+ ignored.get(), terminal_indices.get(),
1845
+ (double*)NULL,
1846
+ &indexer);
1847
+ ignored.reset();
1848
+
1849
+ check_interrupt_switch(ss);
1850
+
1851
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
1852
+ shared(indexer, terminal_indices, nrows, ntrees, n_ref, rmat)
1853
+ for (size_t_for row = 0; row < (decltype(row))nrows; row++)
1854
+ {
1855
+ if (interrupt_switch) continue;
1856
+
1857
+ SingleTreeIndex *restrict index_node;
1858
+ size_t idx_this;
1859
+ sparse_ix *restrict terminal_indices_this = terminal_indices.get() + row;
1860
+ double *restrict rmat_this = rmat + row*n_ref;
1861
+ memset(rmat_this, 0, n_ref*sizeof(double));
1862
+
1863
+ for (size_t tree = 0; tree < ntrees; tree++)
1864
+ {
1865
+ idx_this = terminal_indices_this[tree*nrows];
1866
+ index_node = &indexer.indices[tree];
1867
+ for (size_t ind = index_node->reference_indptr[idx_this];
1868
+ ind < index_node->reference_indptr[idx_this + 1];
1869
+ ind++)
1870
+ {
1871
+ rmat_this[index_node->reference_mapping[ind]]++;
1872
+ }
1873
+ }
1874
+ }
1875
+
1876
+ check_interrupt_switch(ss);
1877
+
1878
+ if (standardize)
1879
+ {
1880
+ double ntrees_dbl = (double)ntrees;
1881
+ for (size_t ix = 0; ix < nrows*n_ref; ix++)
1882
+ rmat[ix] /= ntrees_dbl;
1883
+ }
1884
+
1885
+ check_interrupt_switch(ss);
1886
+ }