isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -0,0 +1,1886 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+ #include "isotree.hpp"
64
+
65
+
66
+ /* Calculate distance or similarity or kernel/proximity between data points
67
+ *
68
+ * Parameters
69
+ * ==========
70
+ * - numeric_data[nrows * ncols_numeric]
71
+ * Pointer to numeric data for which to make calculations. If not using 'indexer', must be
72
+ * ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
73
+ * column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
74
+ * row-major or column-major format (with row-major being faster).
75
+ * If categorical data is passed, must be in the same storage order (row-major / column-major)
76
+ * as numerical data (whether dense or sparse).
77
+ * The column order must be the same as in the data that was used to fit the model.
78
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
79
+ * the first group is assumed to be the earlier rows here.
80
+ * Pass NULL if there are no dense numeric columns.
81
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
82
+ * - categ_data[nrows * ncols_categ]
83
+ * Pointer to categorical data for which to make calculations. If not using 'indexer', must be
84
+ * ordered by columns like Fortran, not ordered by rows like C (i.e. entries 1..n contain
85
+ * column 0, n+1..2n column 1, etc.), while if using 'indexer', may be passed in either
86
+ * row-major or column-major format (with row-major being faster).
87
+ * If numerical data is passed, must be in the same storage order (row-major / column-major)
88
+ * as categorical data (whether the numerical data is dense or sparse).
89
+ * Each category should be represented as an integer, and these integers must start at zero and
90
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
91
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
92
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
93
+ * must be the same as was used in the data to which the model was fit.
94
+ * Pass NULL if there are no categorical columns.
95
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
96
+ * the first group is assumed to be the earlier rows here.
97
+ * - Xc[nnz]
98
+ * Pointer to numeric data in sparse numeric matrix in CSC format (column-compressed),
99
+ * or optionally in CSR format (row-compressed) if using 'indexer' and passing 'is_col_major=false'
100
+ * (not recommended as the calculations will be slower if sparse data is passed as CSR).
101
+ * If categorical data is passed, must be in the same storage order (row-major or CSR / column-major or CSC)
102
+ * as numerical data (whether dense or sparse).
103
+ * Pass NULL if there are no sparse numeric columns.
104
+ * Can only pass one of 'numeric_data' or 'Xc' + 'Xc_ind' + 'Xc_indptr'.
105
+ * - Xc_ind[nnz]
106
+ * Pointer to row indices to which each non-zero entry in 'Xc' corresponds
107
+ * (column indices if 'Xc' is in CSR format).
108
+ * Must be in sorted order, otherwise results will be incorrect.
109
+ * Pass NULL if there are no sparse numeric columns in CSC or CSR format.
110
+ * - Xc_indptr[ncols_numeric + 1]
111
+ * Pointer to column index pointers that tell at entry [col] where does column 'col'
112
+ * start and at entry [col + 1] where does column 'col' end
113
+ * (row index pointers if 'Xc' is passed in CSR format).
114
+ * Pass NULL if there are no sparse numeric columns in CSC or CSR format.
115
+ * If making calculations between two sets of observations/rows (see documentation for 'rmat'),
116
+ * the first group is assumed to be the earlier rows here.
117
+ * - nrows
118
+ * Number of rows in 'numeric_data', 'Xc', 'categ_data'.
119
+ * - use_long_double
120
+ * Whether to use 'long double' (extended precision) type for the calculations. This makes them
121
+ * more accurate (provided that the compiler used has wider long doubles than doubles), but
122
+ * slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
123
+ * Power8 platforms).
124
+ * - nthreads
125
+ * Number of parallel threads to use. Note that, the more threads, the more memory will be
126
+ * allocated, even if the thread does not end up being used (with one exception being kernel calculations
127
+ * with respect to reference points in an indexer). Ignored when not building with OpenMP support.
128
+ * - assume_full_distr
129
+ * Whether to assume that the fitted model represents a full population distribution (will use a
130
+ * standardizing criterion assuming infinite sample, and the results of the similarity between two points
131
+ * at prediction time will not depend on the presence of any third point that is similar to them, but will
132
+ * differ more compared to the pairwise distances between points from which the model was fit). If passing
133
+ * 'false', will calculate pairwise distances as if the new observations at prediction time were added to
134
+ * the sample to which each tree was fit, which will make the distances between two points potentially vary
135
+ * according to other newly introduced points.
136
+ * This was added for experimentation purposes only and it's not recommended to pass 'false'.
137
+ * Note that when calculating distances using 'indexer', there
138
+ * might be slight discrepancies between the numbers produced with or without the indexer due to what
139
+ * are considered "additional" observations in this calculation.
140
+ * This is ignored when passing 'as_kernel=true'.
141
+ * - standardize_dist
142
+ * Whether to standardize the resulting average separation depths between rows according
143
+ * to the expected average separation depth in a similar way as when predicting outlierness,
144
+ * in order to obtain a standardized distance. If passing 'false', will output the average
145
+ * separation depth instead.
146
+ * If passing 'as_kernel=true', this indicates whether to output a fraction (if 'true') or
147
+ * the raw number of matching trees (if 'false').
148
+ * - as_kernel
149
+ * Whether to calculate the "similarities" as isolation kernel or proximity matrix, which counts
150
+ * the proportion of trees in which two observations end up in the same terminal node. This is
151
+ * typically much faster than separation-based distance, but is typically not as good quality.
152
+ * Note that, for kernel calculations, the indexer is only used if it has reference points stored on it.
153
+ * - model_outputs
154
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
155
+ * if the calculations are to be made from an extended model. Can only pass one of
156
+ * 'model_outputs' and 'model_outputs_ext'.
157
+ * - model_outputs_ext
158
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
159
+ * if the calculations are to be made from a single-variable model. Can only pass one of
160
+ * 'model_outputs' and 'model_outputs_ext'.
161
+ * - tmat[nrows * (nrows - 1) / 2] (out)
162
+ * Pointer to array where the resulting pairwise distances or average separation depths or kernels will
163
+ * be written into. As the output is a symmetric matrix, this function will only fill in the
164
+ * upper-triangular part, in which entry 0 <= i < j < n will be located at position
165
+ * p(i,j) = (i * (n - (i+1)/2) + j - i - 1).
166
+ * Can be converted to a dense square matrix through function 'tmat_to_dense'.
167
+ * The array must already be initialized to zeros.
168
+ * If calculating distance/separation from a group of points to another group of points,
169
+ * pass NULL here and use 'rmat' instead.
170
+ * - rmat[nrows1 * nrows2] (out)
171
+ * Pointer to array where to write the distances or separation depths or kernels between each row in
172
+ * one set of observations and each row in a different set of observations. If doing these
173
+ * calculations for all pairs of observations/rows, pass 'tmat' instead.
174
+ * Will take the first group of observations as the rows in this matrix, and the second
175
+ * group as the columns. The groups are assumed to be in the same data arrays, with the
176
+ * first group corresponding to the earlier rows there.
177
+ * This matrix will be used in row-major order (i.e. entries 1..nrows2 contain the first row from nrows1).
178
+ * Must be already initialized to zeros.
179
+ * If passing 'use_indexed_references=true' plus an indexer object with reference points, this
180
+ * array should have dimension [nrows, n_references].
181
+ * Ignored when 'tmat' is passed.
182
+ * - n_from
183
+ * When calculating distances between two groups of points, this indicates the number of
184
+ * observations/rows belonging to the first group (the rows in 'rmat'), which will be
185
+ * assumed to be the first 'n_from' rows.
186
+ * Ignored when 'tmat' is passed or when 'use_indexed_references=true' plus an indexer with
187
+ * references are passed.
188
+ * - use_indexed_references
189
+ * Whether to calculate distances with respect to reference points stored in the indexer
190
+ * object, if it has any. This is only supported with 'assume_full_distr=true' or with 'as_kernel=true'.
191
+ * If passing 'use_indexed_references=true', then 'tmat' must be NULL, and 'rmat' must
192
+ * be of dimension [nrows, n_references].
193
+ * - indexer
194
+ * Pointer to associated tree indexer for the model being used, if it was constructed,
195
+ * which can be used to speed up distance calculations, assuming that it was built with
196
+ * option 'with_distances=true'. If it does not contain node distances, it will not be used.
197
+ * Pass NULL if the indexer has not been constructed or was constructed with 'with_distances=false'.
198
+ * If it contains reference points and passing 'use_indexed_references=true', distances will be
199
+ * calculated between the input data passed here and the reference points stored in this object.
200
+ * If passing 'as_kernel=true', the indexer can only be used for calculating kernels with respect to
201
+ * reference points in the indexer, otherwise it will not be used (which also means that the data must be
202
+ * passed in column-major order for all kernel calculations that are not with respect to reference points
203
+ * from an indexer).
204
+ * - is_col_major
205
+ * Whether the data comes in column-major order. If using 'indexer', predictions are also possible
206
+ * (and are even faster for the case of dense-only data) if passing the data in row-major format.
207
+ * Without 'indexer' (and with 'as_kernel=true' but without reference points in the indexer), data
208
+ * may only be passed in column-major format.
209
+ * If there is sparse numeric data, it is highly suggested to pass it in CSC/column-major format.
210
+ * - ld_numeric
211
+ * If passing 'is_col_major=false', this indicates the leading dimension of the array 'numeric_data'.
212
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
213
+ * be accessed assuming that row 'n' starts at 'numeric_data + n*ld_numeric'). If passing
214
+ * 'numeric_data' in column-major order, this is ignored and will be assumed that the
215
+ * leading dimension corresponds to the number of rows. This is ignored when passing numeric
216
+ * data in sparse format.
217
+ * Note that data in row-major order is only accepted when using 'indexer'.
218
+ * - ld_categ
219
+ * If passing 'is_col_major=false', this indicates the leading dimension of the array 'categ_data'.
220
+ * Typically, this corresponds to the number of columns, but may be larger (the array will
221
+ * be accessed assuming that row 'n' starts at 'categ_data + n*ld_categ'). If passing
222
+ * 'categ_data' in column-major order, this is ignored and will be assumed that the
223
+ * leading dimension corresponds to the number of rows.
224
+ * Note that data in row-major order is only accepted when using 'indexer'.
225
+ */
226
+ template <class real_t, class sparse_ix>
227
+ void calc_similarity(real_t numeric_data[], int categ_data[],
228
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
229
+ size_t nrows, bool use_long_double, int nthreads,
230
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
231
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
232
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
233
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ)
234
+ {
235
+ if (use_long_double && !has_long_double()) {
236
+ use_long_double = false;
237
+ fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
238
+ }
239
+ #ifndef NO_LONG_DOUBLE
240
+ if (likely(!use_long_double))
241
+ #endif
242
+ calc_similarity_internal<real_t, sparse_ix, double>(
243
+ numeric_data, categ_data,
244
+ Xc, Xc_ind, Xc_indptr,
245
+ nrows, nthreads,
246
+ assume_full_distr, standardize_dist, as_kernel,
247
+ model_outputs, model_outputs_ext,
248
+ tmat, rmat, n_from, use_indexed_references,
249
+ indexer, is_col_major, ld_numeric, ld_categ
250
+ );
251
+ #ifndef NO_LONG_DOUBLE
252
+ else
253
+ calc_similarity_internal<real_t, sparse_ix, long double>(
254
+ numeric_data, categ_data,
255
+ Xc, Xc_ind, Xc_indptr,
256
+ nrows, nthreads,
257
+ assume_full_distr, standardize_dist, as_kernel,
258
+ model_outputs, model_outputs_ext,
259
+ tmat, rmat, n_from, use_indexed_references,
260
+ indexer, is_col_major, ld_numeric, ld_categ
261
+ );
262
+ #endif
263
+ }
264
+
265
+ template <class real_t, class sparse_ix, class ldouble_safe>
266
+ void calc_similarity_internal(
267
+ real_t numeric_data[], int categ_data[],
268
+ real_t Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
269
+ size_t nrows, int nthreads,
270
+ bool assume_full_distr, bool standardize_dist, bool as_kernel,
271
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
272
+ double tmat[], double rmat[], size_t n_from, bool use_indexed_references,
273
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ)
274
+ {
275
+ if (nrows < 2 && (!use_indexed_references || indexer == NULL || indexer->indices.empty() || indexer->indices.front().reference_points.empty()))
276
+ throw std::runtime_error("Cannot calculate distances from less than 2 rows.\n");
277
+ if (as_kernel && (tmat != NULL || !use_indexed_references || (indexer != NULL && !indexer->indices.empty() && indexer->indices.front().reference_points.empty())))
278
+ indexer = NULL;
279
+
280
+ if (indexer != NULL && model_outputs != NULL)
281
+ {
282
+ if (model_outputs->missing_action == Divide) {
283
+ indexer = NULL;
284
+ if (use_indexed_references) throw std::runtime_error("Invalid indexer - cannot use references from it.\n");
285
+ }
286
+ if (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && categ_data != NULL) {
287
+ indexer = NULL;
288
+ if (use_indexed_references) throw std::runtime_error("Invalid indexer - cannot use references from it.\n");
289
+ }
290
+ }
291
+ if (
292
+ !as_kernel &&
293
+ indexer != NULL &&
294
+ (indexer->indices.empty() || indexer->indices.front().node_distances.empty())
295
+ ) {
296
+ if (use_indexed_references && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty())
297
+ throw std::runtime_error("Indexer was built without distances. Cannot use references from it.\n");
298
+ else {
299
+ indexer = NULL;
300
+ fprintf(stderr, "Indexer has no pre-computed distances, will not be used for distance calculations.\n");
301
+ }
302
+ }
303
+ if (
304
+ !is_col_major &&
305
+ indexer == NULL &&
306
+ (
307
+ Xc_indptr != NULL
308
+ ||
309
+ (nrows != 1 &&
310
+ ((numeric_data != NULL && ld_numeric > 1) || (categ_data != NULL && ld_categ > 1)))
311
+ )
312
+ )
313
+ throw std::runtime_error("Cannot calculate distances with row-major data without indexer.\n");
314
+ if (indexer != NULL)
315
+ {
316
+ if (use_indexed_references && tmat == NULL && !indexer->indices.empty() && !indexer->indices.front().reference_points.empty())
317
+ {
318
+ if (unlikely(!assume_full_distr))
319
+ throw std::runtime_error("Cannot calculate distances to reference points in indexer with 'assume_full_distr=false'.\n");
320
+
321
+ if (!as_kernel)
322
+ {
323
+ calc_similarity_from_indexer_with_references(
324
+ numeric_data, categ_data,
325
+ Xc, Xc_ind, Xc_indptr,
326
+ nrows, nthreads, standardize_dist,
327
+ model_outputs, model_outputs_ext,
328
+ rmat,
329
+ indexer, is_col_major, ld_numeric, ld_categ
330
+ );
331
+ }
332
+
333
+ else
334
+ {
335
+ kernel_to_references(*indexer,
336
+ model_outputs, model_outputs_ext,
337
+ numeric_data, categ_data,
338
+ Xc, Xc_ind, Xc_indptr,
339
+ is_col_major, ld_numeric, ld_categ,
340
+ nrows, nthreads,
341
+ rmat,
342
+ standardize_dist);
343
+ }
344
+ }
345
+
346
+ else
347
+ {
348
+ if (as_kernel) goto skip_indexer_if_kernel;
349
+ calc_similarity_from_indexer(
350
+ numeric_data, categ_data,
351
+ Xc, Xc_ind, Xc_indptr,
352
+ nrows, nthreads, assume_full_distr, standardize_dist,
353
+ model_outputs, model_outputs_ext,
354
+ tmat, rmat, n_from,
355
+ indexer, is_col_major, ld_numeric, ld_categ
356
+ );
357
+ }
358
+
359
+ return;
360
+ }
361
+ skip_indexer_if_kernel:
362
+
363
+ PredictionData<real_t, sparse_ix>
364
+ prediction_data = {numeric_data, categ_data, nrows,
365
+ false, 0, 0,
366
+ Xc, Xc_ind, Xc_indptr,
367
+ NULL, NULL, NULL};
368
+
369
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
370
+
371
+ if (tmat != NULL) n_from = 0;
372
+
373
+ if (n_from == 0) {
374
+ #if SIZE_MAX == UINT32_MAX
375
+ size_t lim_rows = (size_t)UINT16_MAX - (size_t)1;
376
+ #elif SIZE_MAX == UINT64_MAX
377
+ size_t lim_rows = (size_t)UINT32_MAX - (size_t)1;
378
+ #else
379
+ size_t lim_rows = (size_t)std::ceil(std::sqrt((ldouble_safe)SIZE_MAX));
380
+ #endif
381
+ if (nrows > lim_rows)
382
+ throw std::runtime_error("Number of rows implies too large distance matrix (integer overflow).");
383
+ }
384
+
385
+ if ((size_t)nthreads > ntrees)
386
+ nthreads = (int)ntrees;
387
+ #ifdef _OPENMP
388
+ std::vector<WorkerForSimilarity> worker_memory(nthreads);
389
+ #else
390
+ std::vector<WorkerForSimilarity> worker_memory(1);
391
+ nthreads = 1;
392
+ #endif
393
+
394
+ /* Global variable that determines if the procedure receives a stop signal */
395
+ SignalSwitcher ss = SignalSwitcher();
396
+ check_interrupt_switch(ss);
397
+ #if defined(DONT_THROW_ON_INTERRUPT)
398
+ if (interrupt_switch) return;
399
+ #endif
400
+ /* For handling exceptions */
401
+ bool threw_exception = false;
402
+ std::exception_ptr ex = NULL;
403
+
404
+ if (
405
+ tmat == NULL &&
406
+ use_indexed_references &&
407
+ indexer != NULL &&
408
+ !indexer->indices.empty() &&
409
+ !indexer->indices.front().reference_points.empty() &&
410
+ (as_kernel || !indexer->indices.front().node_distances.empty())
411
+ ) {
412
+ n_from = indexer->indices.front().reference_points.size();
413
+ }
414
+
415
+ if (model_outputs != NULL)
416
+ {
417
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
418
+ shared(ntrees, worker_memory, prediction_data, model_outputs, ex, threw_exception, n_from)
419
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
420
+ {
421
+ if (threw_exception || interrupt_switch) continue;
422
+ try
423
+ {
424
+ initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
425
+ model_outputs, NULL, n_from, assume_full_distr);
426
+ traverse_tree_sim<PredictionData<real_t, sparse_ix>, ldouble_safe>(
427
+ worker_memory[omp_get_thread_num()],
428
+ prediction_data,
429
+ *model_outputs,
430
+ model_outputs->trees[tree],
431
+ (size_t)0,
432
+ as_kernel);
433
+ }
434
+
435
+ catch (...)
436
+ {
437
+ #pragma omp critical
438
+ {
439
+ if (!threw_exception)
440
+ {
441
+ threw_exception = true;
442
+ ex = std::current_exception();
443
+ }
444
+ }
445
+ }
446
+ }
447
+ }
448
+
449
+ else
450
+ {
451
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
452
+ shared(ntrees, worker_memory, prediction_data, model_outputs_ext, ex, threw_exception, n_from)
453
+ for (size_t_for hplane = 0; hplane < (decltype(hplane))ntrees; hplane++)
454
+ {
455
+ if (threw_exception || interrupt_switch) continue;
456
+ try
457
+ {
458
+ initialize_worker_for_sim(worker_memory[omp_get_thread_num()], prediction_data,
459
+ NULL, model_outputs_ext, n_from, assume_full_distr);
460
+ traverse_hplane_sim<PredictionData<real_t, sparse_ix>, ldouble_safe>(
461
+ worker_memory[omp_get_thread_num()],
462
+ prediction_data,
463
+ *model_outputs_ext,
464
+ model_outputs_ext->hplanes[hplane],
465
+ (size_t)0,
466
+ as_kernel);
467
+ }
468
+
469
+ catch (...)
470
+ {
471
+ #pragma omp critical
472
+ {
473
+ if (!threw_exception)
474
+ {
475
+ threw_exception = true;
476
+ ex = std::current_exception();
477
+ }
478
+ }
479
+ }
480
+ }
481
+ }
482
+
483
+ check_interrupt_switch(ss);
484
+ #if defined(DONT_THROW_ON_INTERRUPT)
485
+ if (interrupt_switch) return;
486
+ #endif
487
+
488
+ if (threw_exception)
489
+ std::rethrow_exception(ex);
490
+
491
+ /* gather and transform the results */
492
+ gather_sim_result< PredictionData<real_t, sparse_ix>,
493
+ InputData<real_t, sparse_ix>,
494
+ WorkerMemory<ImputedData<sparse_ix, ldouble_safe>, ldouble_safe, real_t> >
495
+ (&worker_memory, NULL,
496
+ &prediction_data, NULL,
497
+ model_outputs, model_outputs_ext,
498
+ tmat, rmat, n_from,
499
+ ntrees, assume_full_distr,
500
+ standardize_dist, as_kernel, nthreads);
501
+
502
+ check_interrupt_switch(ss);
503
+ #if defined(DONT_THROW_ON_INTERRUPT)
504
+ if (interrupt_switch) return;
505
+ #endif
506
+ }
507
+
508
+ template <class PredictionData, class ldouble_safe>
509
+ void traverse_tree_sim(WorkerForSimilarity &workspace,
510
+ PredictionData &prediction_data,
511
+ IsoForest &model_outputs,
512
+ std::vector<IsoTree> &trees,
513
+ size_t curr_tree,
514
+ const bool as_kernel)
515
+ {
516
+ if (interrupt_switch)
517
+ return;
518
+
519
+ if (workspace.st == workspace.end)
520
+ return;
521
+
522
+ if (workspace.tmat_sep.empty())
523
+ {
524
+ std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
525
+ if (workspace.ix_arr[workspace.st] >= workspace.n_from)
526
+ return;
527
+ if (workspace.ix_arr[workspace.end] < workspace.n_from)
528
+ return;
529
+ }
530
+
531
+ /* Note: the first separation step will not be added here, as it simply consists of adding +1
532
+ to every combination regardless. It has to be added at the end in 'gather_sim_result' to
533
+ obtain the average separation depth. */
534
+ if (trees[curr_tree].tree_left == 0)
535
+ {
536
+ ldouble_safe rem = (ldouble_safe) trees[curr_tree].remainder;
537
+ if (workspace.weights_arr.empty())
538
+ {
539
+ if (!as_kernel)
540
+ {
541
+ rem += (ldouble_safe)(workspace.end - workspace.st + 1);
542
+ if (!workspace.tmat_sep.empty())
543
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
544
+ prediction_data.nrows, workspace.tmat_sep.data(),
545
+ workspace.assume_full_distr? 3. : expected_separation_depth(rem));
546
+ else if (!workspace.rmat.empty())
547
+ increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
548
+ workspace.n_from, prediction_data.nrows, workspace.rmat.data(),
549
+ workspace.assume_full_distr? 3. : expected_separation_depth(rem));
550
+ }
551
+
552
+ else
553
+ {
554
+ if (!workspace.tmat_sep.empty())
555
+ {
556
+ size_t i_, j_;
557
+ for (size_t i = workspace.st; i < workspace.end; i++)
558
+ {
559
+ i_ = workspace.ix_arr[i];
560
+ for (size_t j = i + 1; j <= workspace.end; j++)
561
+ {
562
+ j_ = workspace.ix_arr[j];
563
+ workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]++;
564
+ }
565
+ }
566
+ }
567
+
568
+ else if (!workspace.rmat.empty())
569
+ {
570
+ size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
571
+ std::lower_bound(workspace.ix_arr.begin() + workspace.st,
572
+ workspace.ix_arr.begin() + workspace.end + 1,
573
+ workspace.n_from));
574
+ double *restrict rmat_this;
575
+ for (size_t i = workspace.st; i < workspace.st + n_group; i++)
576
+ {
577
+ rmat_this = workspace.rmat.data() + workspace.ix_arr[i]*workspace.n_from;
578
+ for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
579
+ {
580
+ rmat_this[workspace.ix_arr[j] - workspace.n_from]++;
581
+ }
582
+ }
583
+ }
584
+ }
585
+ }
586
+
587
+ else
588
+ {
589
+ if (!as_kernel)
590
+ {
591
+ if (!workspace.assume_full_distr)
592
+ {
593
+ rem += std::accumulate(workspace.ix_arr.begin() + workspace.st,
594
+ workspace.ix_arr.begin() + workspace.end,
595
+ (ldouble_safe) 0.,
596
+ [&workspace](ldouble_safe curr, size_t ix)
597
+ {return curr + (ldouble_safe)workspace.weights_arr[ix];}
598
+ );
599
+ }
600
+
601
+ if (!workspace.tmat_sep.empty())
602
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
603
+ prediction_data.nrows, workspace.tmat_sep.data(),
604
+ workspace.weights_arr.data(),
605
+ workspace.assume_full_distr? 3. : expected_separation_depth(rem));
606
+ else if (!workspace.rmat.empty())
607
+ increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
608
+ workspace.n_from, prediction_data.nrows,
609
+ workspace.rmat.data(), workspace.weights_arr.data(),
610
+ workspace.assume_full_distr? 3. : expected_separation_depth(rem));
611
+ }
612
+
613
+ else
614
+ {
615
+ if (!workspace.tmat_sep.empty())
616
+ {
617
+ size_t i_, j_;
618
+ double w_this;
619
+ for (size_t i = workspace.st; i < workspace.end; i++)
620
+ {
621
+ i_ = workspace.ix_arr[i];
622
+ w_this = workspace.weights_arr[i_];
623
+ for (size_t j = i + 1; j <= workspace.end; j++)
624
+ {
625
+ j_ = workspace.ix_arr[j];
626
+ workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]
627
+ +=
628
+ w_this * workspace.weights_arr[j_];
629
+ }
630
+ }
631
+ }
632
+
633
+ else if (!workspace.rmat.empty())
634
+ {
635
+ size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
636
+ std::lower_bound(workspace.ix_arr.begin() + workspace.st,
637
+ workspace.ix_arr.begin() + workspace.end + 1,
638
+ workspace.n_from));
639
+ double *restrict rmat_this;
640
+ double w_this;
641
+ size_t i_, j_;
642
+ for (size_t i = workspace.st; i < workspace.st + n_group; i++)
643
+ {
644
+ i_ = workspace.ix_arr[i];
645
+ rmat_this = workspace.rmat.data() + i_*workspace.n_from;
646
+ w_this = workspace.weights_arr[i_];
647
+ for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
648
+ {
649
+ j_ = workspace.ix_arr[j];
650
+ rmat_this[j_ - workspace.n_from]
651
+ +=
652
+ w_this * workspace.weights_arr[j_];
653
+ }
654
+ }
655
+ }
656
+ }
657
+ }
658
+ return;
659
+ }
660
+
661
+ else if (curr_tree > 0 && !as_kernel)
662
+ {
663
+ if (!workspace.tmat_sep.empty())
664
+ {
665
+ if (workspace.weights_arr.empty())
666
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
667
+ prediction_data.nrows, workspace.tmat_sep.data(), -1.);
668
+ else
669
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
670
+ prediction_data.nrows, workspace.tmat_sep.data(),
671
+ workspace.weights_arr.data(), -1.);
672
+ }
673
+ else if (!workspace.rmat.empty())
674
+ {
675
+ if (workspace.weights_arr.empty())
676
+ increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
677
+ workspace.n_from, prediction_data.nrows, workspace.rmat.data(), -1.);
678
+ else
679
+ increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end,
680
+ workspace.n_from, prediction_data.nrows,
681
+ workspace.rmat.data(), workspace.weights_arr.data(), -1.);
682
+ }
683
+ }
684
+
685
+
686
+ /* divide according to tree */
687
+ if (prediction_data.Xc_indptr != NULL && !workspace.tmat_sep.empty())
688
+ std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
689
+ size_t st_NA, end_NA, split_ix;
690
+ switch (trees[curr_tree].col_type)
691
+ {
692
+ case Numeric:
693
+ {
694
+ if (prediction_data.Xc_indptr == NULL)
695
+ divide_subset_split(workspace.ix_arr.data(),
696
+ prediction_data.numeric_data + prediction_data.nrows * trees[curr_tree].col_num,
697
+ workspace.st, workspace.end, trees[curr_tree].num_split,
698
+ model_outputs.missing_action, st_NA, end_NA, split_ix);
699
+ else
700
+ divide_subset_split(workspace.ix_arr.data(), workspace.st, workspace.end, trees[curr_tree].col_num,
701
+ prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
702
+ trees[curr_tree].num_split, model_outputs.missing_action,
703
+ st_NA, end_NA, split_ix);
704
+ break;
705
+ }
706
+
707
+ case Categorical:
708
+ {
709
+ switch(model_outputs.cat_split_type)
710
+ {
711
+ case SingleCateg:
712
+ {
713
+ divide_subset_split(workspace.ix_arr.data(),
714
+ prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
715
+ workspace.st, workspace.end, trees[curr_tree].chosen_cat,
716
+ model_outputs.missing_action, st_NA, end_NA, split_ix);
717
+ break;
718
+ }
719
+
720
+ case SubSet:
721
+ {
722
+ if (!trees[curr_tree].cat_split.size())
723
+ divide_subset_split(workspace.ix_arr.data(),
724
+ prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
725
+ workspace.st, workspace.end,
726
+ model_outputs.missing_action, model_outputs.new_cat_action,
727
+ trees[curr_tree].pct_tree_left < .5, st_NA, end_NA, split_ix);
728
+ else
729
+ divide_subset_split(workspace.ix_arr.data(),
730
+ prediction_data.categ_data + prediction_data.nrows * trees[curr_tree].col_num,
731
+ workspace.st, workspace.end, trees[curr_tree].cat_split.data(),
732
+ (int) trees[curr_tree].cat_split.size(),
733
+ model_outputs.missing_action, model_outputs.new_cat_action,
734
+ (bool)(trees[curr_tree].pct_tree_left < .5), st_NA, end_NA, split_ix);
735
+ break;
736
+ }
737
+ }
738
+ break;
739
+ }
740
+
741
+ default:
742
+ {
743
+ assert(0);
744
+ break;
745
+ }
746
+ }
747
+
748
+
749
+ /* continue splitting recursively */
750
+ size_t orig_end = workspace.end;
751
+ if (model_outputs.new_cat_action == Weighted && model_outputs.cat_split_type == SubSet && prediction_data.categ_data != NULL) {
752
+ if (model_outputs.missing_action == Fail && trees[curr_tree].col_type == Numeric) {
753
+ st_NA = split_ix;
754
+ end_NA = split_ix;
755
+ }
756
+ goto missing_action_divide;
757
+ }
758
+ switch (model_outputs.missing_action)
759
+ {
760
+ case Impute:
761
+ {
762
+ split_ix = (trees[curr_tree].pct_tree_left >= .5)? end_NA : st_NA;
763
+ }
764
+
765
+ case Fail:
766
+ {
767
+ if (split_ix > workspace.st)
768
+ {
769
+ workspace.end = split_ix - 1;
770
+ traverse_tree_sim<PredictionData, ldouble_safe>(
771
+ workspace,
772
+ prediction_data,
773
+ model_outputs,
774
+ trees,
775
+ trees[curr_tree].tree_left,
776
+ as_kernel);
777
+ }
778
+
779
+
780
+ if (split_ix <= orig_end)
781
+ {
782
+ workspace.st = split_ix;
783
+ workspace.end = orig_end;
784
+ traverse_tree_sim<PredictionData, ldouble_safe>(
785
+ workspace,
786
+ prediction_data,
787
+ model_outputs,
788
+ trees,
789
+ trees[curr_tree].tree_right,
790
+ as_kernel);
791
+ }
792
+ break;
793
+ }
794
+
795
+ case Divide: /* new_cat_action = 'Weighted' will also fall here */
796
+ {
797
+ /* TODO: this one should also have a parameter 'changed_weoghts' like during fitting */
798
+ missing_action_divide:
799
+ /* TODO: maybe here it shouldn't copy the whole ix_arr,
800
+ but then it'd need to re-generate it from outside too */
801
+ std::vector<double> weights_arr;
802
+ std::vector<size_t> ix_arr;
803
+ if (end_NA > workspace.st)
804
+ {
805
+ weights_arr.assign(workspace.weights_arr.begin(),
806
+ workspace.weights_arr.begin() + end_NA);
807
+ ix_arr.assign(workspace.ix_arr.begin(),
808
+ workspace.ix_arr.begin() + end_NA);
809
+ }
810
+
811
+ if (end_NA > workspace.st)
812
+ {
813
+ workspace.end = end_NA - 1;
814
+ for (size_t row = st_NA; row < end_NA; row++)
815
+ workspace.weights_arr[workspace.ix_arr[row]] *= trees[curr_tree].pct_tree_left;
816
+ traverse_tree_sim<PredictionData, ldouble_safe>(
817
+ workspace,
818
+ prediction_data,
819
+ model_outputs,
820
+ trees,
821
+ trees[curr_tree].tree_left,
822
+ as_kernel);
823
+ }
824
+
825
+ if (st_NA <= orig_end)
826
+ {
827
+ workspace.st = st_NA;
828
+ workspace.end = orig_end;
829
+ if (!weights_arr.empty())
830
+ {
831
+ std::copy(weights_arr.begin(),
832
+ weights_arr.end(),
833
+ workspace.weights_arr.begin());
834
+ std::copy(ix_arr.begin(),
835
+ ix_arr.end(),
836
+ workspace.ix_arr.begin());
837
+ weights_arr.clear();
838
+ weights_arr.shrink_to_fit();
839
+ ix_arr.clear();
840
+ ix_arr.shrink_to_fit();
841
+ }
842
+
843
+ for (size_t row = st_NA; row < end_NA; row++)
844
+ workspace.weights_arr[workspace.ix_arr[row]] *= (1. - trees[curr_tree].pct_tree_left);
845
+ traverse_tree_sim<PredictionData, ldouble_safe>(
846
+ workspace,
847
+ prediction_data,
848
+ model_outputs,
849
+ trees,
850
+ trees[curr_tree].tree_right,
851
+ as_kernel);
852
+ }
853
+ break;
854
+ }
855
+ }
856
+ }
857
+
858
+ template <class PredictionData, class ldouble_safe>
859
+ void traverse_hplane_sim(WorkerForSimilarity &workspace,
860
+ PredictionData &prediction_data,
861
+ ExtIsoForest &model_outputs,
862
+ std::vector<IsoHPlane> &hplanes,
863
+ size_t curr_tree,
864
+ const bool as_kernel)
865
+ {
866
+ if (interrupt_switch)
867
+ return;
868
+
869
+ if (workspace.st == workspace.end)
870
+ return;
871
+
872
+ if (workspace.tmat_sep.empty())
873
+ {
874
+ std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
875
+ if (workspace.ix_arr[workspace.st] >= workspace.n_from)
876
+ return;
877
+ if (workspace.ix_arr[workspace.end] < workspace.n_from)
878
+ return;
879
+ }
880
+
881
+ /* Note: the first separation step will not be added here, as it simply consists of adding +1
882
+ to every combination regardless. It has to be added at the end in 'gather_sim_result' to
883
+ obtain the average separation depth. */
884
+ if (hplanes[curr_tree].hplane_left == 0)
885
+ {
886
+ if (!as_kernel)
887
+ {
888
+ if (!workspace.tmat_sep.empty())
889
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
890
+ prediction_data.nrows, workspace.tmat_sep.data(),
891
+ workspace.assume_full_distr? 3. :
892
+ expected_separation_depth((ldouble_safe) hplanes[curr_tree].remainder
893
+ + (ldouble_safe)(workspace.end - workspace.st + 1))
894
+ );
895
+ else if (!workspace.rmat.empty())
896
+ increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
897
+ prediction_data.nrows, workspace.rmat.data(),
898
+ workspace.assume_full_distr? 3. :
899
+ expected_separation_depth((ldouble_safe) hplanes[curr_tree].remainder
900
+ + (ldouble_safe)(workspace.end - workspace.st + 1))
901
+ );
902
+ }
903
+
904
+ else
905
+ {
906
+ if (!workspace.tmat_sep.empty())
907
+ {
908
+ size_t i_, j_;
909
+ for (size_t i = workspace.st; i < workspace.end; i++)
910
+ {
911
+ i_ = workspace.ix_arr[i];
912
+ for (size_t j = i + 1; j <= workspace.end; j++)
913
+ {
914
+ j_ = workspace.ix_arr[j];
915
+ workspace.tmat_sep[ix_comb(i_, j_, prediction_data.nrows, workspace.tmat_sep.size())]++;
916
+ }
917
+ }
918
+ }
919
+
920
+ else if (!workspace.rmat.empty())
921
+ {
922
+ size_t n_group = std::distance(workspace.ix_arr.begin() + workspace.st,
923
+ std::lower_bound(workspace.ix_arr.begin() + workspace.st,
924
+ workspace.ix_arr.begin() + workspace.end + 1,
925
+ workspace.n_from));
926
+ double *restrict rmat_this;
927
+ for (size_t i = workspace.st; i < workspace.st + n_group; i++)
928
+ {
929
+ rmat_this = workspace.rmat.data() + workspace.ix_arr[i]*workspace.n_from;
930
+ for (size_t j = workspace.st + n_group; j <= workspace.end; j++)
931
+ {
932
+ rmat_this[workspace.ix_arr[j] - workspace.n_from]++;
933
+ }
934
+ }
935
+ }
936
+ }
937
+ return;
938
+ }
939
+
940
+ else if (curr_tree > 0 && !as_kernel)
941
+ {
942
+ if (!workspace.tmat_sep.empty())
943
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
944
+ prediction_data.nrows, workspace.tmat_sep.data(), -1.);
945
+ else if (!workspace.rmat.empty())
946
+ increase_comb_counter_in_groups(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.n_from,
947
+ prediction_data.nrows, workspace.rmat.data(), -1.);
948
+ }
949
+
950
+ if (prediction_data.Xc_indptr != NULL && workspace.tmat_sep.size())
951
+ std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
952
+
953
+ /* reconstruct linear combination */
954
+ size_t ncols_numeric = 0;
955
+ size_t ncols_categ = 0;
956
+ std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0);
957
+ double unused;
958
+ if (prediction_data.categ_data != NULL || prediction_data.Xc_indptr != NULL)
959
+ {
960
+ for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
961
+ {
962
+ switch(hplanes[curr_tree].col_type[col])
963
+ {
964
+ case Numeric:
965
+ {
966
+ if (prediction_data.Xc_indptr == NULL)
967
+ add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
968
+ prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
969
+ hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
970
+ (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
971
+ model_outputs.missing_action, NULL, NULL, false);
972
+ else
973
+ add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end,
974
+ hplanes[curr_tree].col_num[col], workspace.comb_val.data(),
975
+ prediction_data.Xc, prediction_data.Xc_ind, prediction_data.Xc_indptr,
976
+ hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
977
+ (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
978
+ model_outputs.missing_action, NULL, NULL, false);
979
+ ncols_numeric++;
980
+ break;
981
+ }
982
+
983
+ case Categorical:
984
+ {
985
+ switch(model_outputs.cat_split_type)
986
+ {
987
+ case SingleCateg:
988
+ {
989
+ add_linear_comb<ldouble_safe>(
990
+ workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
991
+ prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
992
+ (int)0, NULL, hplanes[curr_tree].fill_new[ncols_categ],
993
+ hplanes[curr_tree].chosen_cat[ncols_categ],
994
+ (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
995
+ workspace.comb_val[0], NULL, NULL, model_outputs.new_cat_action,
996
+ model_outputs.missing_action, SingleCateg, false);
997
+ break;
998
+ }
999
+
1000
+ case SubSet:
1001
+ {
1002
+ add_linear_comb<ldouble_safe>(
1003
+ workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
1004
+ prediction_data.categ_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
1005
+ (int) hplanes[curr_tree].cat_coef[ncols_categ].size(),
1006
+ hplanes[curr_tree].cat_coef[ncols_categ].data(), (double) 0, (int) 0,
1007
+ (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
1008
+ hplanes[curr_tree].fill_new[ncols_categ], NULL, NULL,
1009
+ model_outputs.new_cat_action, model_outputs.missing_action, SubSet, false);
1010
+ break;
1011
+ }
1012
+ }
1013
+ ncols_categ++;
1014
+ break;
1015
+ }
1016
+
1017
+ default:
1018
+ {
1019
+ assert(0);
1020
+ break;
1021
+ }
1022
+ }
1023
+ }
1024
+ }
1025
+
1026
+
1027
+ else /* faster version for numerical-only */
1028
+ {
1029
+ for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
1030
+ add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
1031
+ prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
1032
+ hplanes[curr_tree].coef[col], (double)0, hplanes[curr_tree].mean[col],
1033
+ (model_outputs.missing_action == Fail)? unused : hplanes[curr_tree].fill_val[col],
1034
+ model_outputs.missing_action, NULL, NULL, false);
1035
+ }
1036
+
1037
+ /* divide data */
1038
+ size_t split_ix = divide_subset_split(workspace.ix_arr.data(), workspace.comb_val.data(),
1039
+ workspace.st, workspace.end, hplanes[curr_tree].split_point);
1040
+
1041
+ /* continue splitting recursively */
1042
+ size_t orig_end = workspace.end;
1043
+ if (split_ix > workspace.st)
1044
+ {
1045
+ workspace.end = split_ix - 1;
1046
+ traverse_hplane_sim<PredictionData, ldouble_safe>(
1047
+ workspace,
1048
+ prediction_data,
1049
+ model_outputs,
1050
+ hplanes,
1051
+ hplanes[curr_tree].hplane_left,
1052
+ as_kernel);
1053
+ }
1054
+
1055
+ if (split_ix <= orig_end)
1056
+ {
1057
+ workspace.st = split_ix;
1058
+ workspace.end = orig_end;
1059
+ traverse_hplane_sim<PredictionData, ldouble_safe>(
1060
+ workspace,
1061
+ prediction_data,
1062
+ model_outputs,
1063
+ hplanes,
1064
+ hplanes[curr_tree].hplane_right,
1065
+ as_kernel);
1066
+ }
1067
+
1068
+ }
1069
+
1070
+ template <class PredictionData, class InputData, class WorkerMemory>
1071
+ void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
1072
+ std::vector<WorkerMemory> *worker_memory_m,
1073
+ PredictionData *prediction_data, InputData *input_data,
1074
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1075
+ double *restrict tmat, double *restrict rmat, size_t n_from,
1076
+ size_t ntrees, bool assume_full_distr,
1077
+ bool standardize_dist, bool as_kernel, int nthreads)
1078
+ {
1079
+ if (interrupt_switch)
1080
+ return;
1081
+
1082
+ size_t nrows = (prediction_data != NULL)? prediction_data->nrows : input_data->nrows;
1083
+ size_t ncomb = calc_ncomb(nrows);
1084
+ size_t n_to = (prediction_data != NULL)? (prediction_data->nrows - n_from) : 0;
1085
+
1086
+ #ifdef _OPENMP
1087
+ if (nthreads > 1)
1088
+ {
1089
+ if (worker_memory != NULL)
1090
+ {
1091
+ for (WorkerForSimilarity &w : *worker_memory)
1092
+ {
1093
+ if (!w.tmat_sep.empty())
1094
+ {
1095
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory)
1096
+ for (size_t_for ix = 0; ix < (decltype(ix))ncomb; ix++)
1097
+ tmat[ix] += w.tmat_sep[ix];
1098
+ }
1099
+ else if (!w.rmat.empty())
1100
+ {
1101
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(rmat, w, worker_memory)
1102
+ for (size_t_for ix = 0; ix < (decltype(ix))w.rmat.size(); ix++)
1103
+ rmat[ix] += w.rmat[ix];
1104
+ }
1105
+ }
1106
+ }
1107
+
1108
+ else
1109
+ {
1110
+ for (WorkerMemory &w : *worker_memory_m)
1111
+ {
1112
+ if (!w.tmat_sep.empty())
1113
+ {
1114
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(ncomb, tmat, w, worker_memory_m)
1115
+ for (size_t_for ix = 0; ix < (decltype(ix))ncomb; ix++)
1116
+ tmat[ix] += w.tmat_sep[ix];
1117
+ }
1118
+ }
1119
+ }
1120
+ }
1121
+
1122
+ else
1123
+ #endif
1124
+ {
1125
+ if (worker_memory != NULL)
1126
+ {
1127
+ if (!(*worker_memory)[0].tmat_sep.empty())
1128
+ std::copy((*worker_memory)[0].tmat_sep.begin(), (*worker_memory)[0].tmat_sep.end(), tmat);
1129
+ else
1130
+ std::copy((*worker_memory)[0].rmat.begin(), (*worker_memory)[0].rmat.end(), rmat);
1131
+ }
1132
+
1133
+ else
1134
+ {
1135
+ std::copy((*worker_memory_m)[0].tmat_sep.begin(), (*worker_memory_m)[0].tmat_sep.end(), tmat);
1136
+ }
1137
+ }
1138
+
1139
+ double ntrees_dbl = (double) ntrees;
1140
+ if (standardize_dist)
1141
+ {
1142
+ if (as_kernel)
1143
+ {
1144
+ if (tmat != NULL)
1145
+ for (size_t ix = 0; ix < ncomb; ix++)
1146
+ tmat[ix] /= ntrees_dbl;
1147
+ else
1148
+ for (size_t ix = 0; ix < (n_from * n_to); ix++)
1149
+ rmat[ix] /= ntrees_dbl;
1150
+ return;
1151
+ }
1152
+
1153
+
1154
+ /* Note: the separation distances up this point are missing the first hop, which is always
1155
+ a +1 to every combination. Thus, it needs to be added back for the average separation depth.
1156
+ For the standardized metric, it takes the expected divisor as 2(=3-1) instead of 3, given
1157
+ that every combination will always get a +1 at the beginning. Since what's obtained here
1158
+ is a sum across all trees, adding this +1 means adding the number of trees. */
1159
+ double div_trees = ntrees_dbl;
1160
+ if (assume_full_distr)
1161
+ {
1162
+ div_trees *= 2;
1163
+ }
1164
+
1165
+ else if (input_data != NULL)
1166
+ {
1167
+ div_trees *= (expected_separation_depth(input_data->nrows) - 1);
1168
+ }
1169
+
1170
+ else
1171
+ {
1172
+ div_trees *= ((
1173
+ (model_outputs != NULL)?
1174
+ expected_separation_depth_hotstart(model_outputs->exp_avg_sep,
1175
+ model_outputs->orig_sample_size,
1176
+ model_outputs->orig_sample_size + prediction_data->nrows)
1177
+ :
1178
+ expected_separation_depth_hotstart(model_outputs_ext->exp_avg_sep,
1179
+ model_outputs_ext->orig_sample_size,
1180
+ model_outputs_ext->orig_sample_size + prediction_data->nrows)
1181
+ ) - 1);
1182
+ }
1183
+
1184
+
1185
+ if (tmat != NULL)
1186
+ #ifndef _WIN32
1187
+ #pragma omp simd
1188
+ #endif
1189
+ for (size_t ix = 0; ix < ncomb; ix++)
1190
+ tmat[ix] = std::exp2( - tmat[ix] / div_trees);
1191
+ else
1192
+ #ifndef _WIN32
1193
+ #pragma omp simd
1194
+ #endif
1195
+ for (size_t ix = 0; ix < (n_from * n_to); ix++)
1196
+ rmat[ix] = std::exp2( - rmat[ix] / div_trees);
1197
+ }
1198
+
1199
+ else
1200
+ {
1201
+ if (as_kernel) return;
1202
+
1203
+ if (tmat != NULL)
1204
+ #ifndef _WIN32
1205
+ #pragma omp simd
1206
+ #endif
1207
+ for (size_t ix = 0; ix < ncomb; ix++)
1208
+ tmat[ix] = (tmat[ix] + ntrees) / ntrees_dbl;
1209
+ else
1210
+ #ifndef _WIN32
1211
+ #pragma omp simd
1212
+ #endif
1213
+ for (size_t ix = 0; ix < (n_from * n_to); ix++)
1214
+ rmat[ix] = (rmat[ix] + ntrees) / ntrees_dbl;
1215
+ }
1216
+ }
1217
+
1218
+ template <class PredictionData>
1219
+ void initialize_worker_for_sim(WorkerForSimilarity &workspace,
1220
+ PredictionData &prediction_data,
1221
+ IsoForest *model_outputs,
1222
+ ExtIsoForest *model_outputs_ext,
1223
+ size_t n_from,
1224
+ bool assume_full_distr)
1225
+ {
1226
+ workspace.st = 0;
1227
+ workspace.end = prediction_data.nrows - 1;
1228
+ workspace.n_from = n_from;
1229
+ workspace.assume_full_distr = assume_full_distr; /* doesn't need to have one copy per worker */
1230
+
1231
+ if (workspace.ix_arr.empty())
1232
+ {
1233
+ workspace.ix_arr.resize(prediction_data.nrows);
1234
+ std::iota(workspace.ix_arr.begin(), workspace.ix_arr.end(), (size_t)0);
1235
+ if (!n_from)
1236
+ workspace.tmat_sep.resize(calc_ncomb(prediction_data.nrows), 0);
1237
+ else
1238
+ workspace.rmat.resize((prediction_data.nrows - n_from) * n_from, 0);
1239
+ }
1240
+
1241
+ if (model_outputs != NULL &&
1242
+ (model_outputs->missing_action == Divide ||
1243
+ (model_outputs->new_cat_action == Weighted && model_outputs->cat_split_type == SubSet && prediction_data.categ_data != NULL)))
1244
+ {
1245
+ if (workspace.weights_arr.empty())
1246
+ workspace.weights_arr.resize(prediction_data.nrows, 1.);
1247
+ else
1248
+ std::fill(workspace.weights_arr.begin(), workspace.weights_arr.end(), 1.);
1249
+ }
1250
+
1251
+ if (model_outputs_ext != NULL)
1252
+ {
1253
+ if (workspace.comb_val.empty())
1254
+ workspace.comb_val.resize(prediction_data.nrows, 0);
1255
+ else
1256
+ std::fill(workspace.comb_val.begin(), workspace.comb_val.end(), 0);
1257
+ }
1258
+ }
1259
+
1260
+ template <class real_t, class sparse_ix>
1261
+ void calc_similarity_from_indexer
1262
+ (
1263
+ real_t *restrict numeric_data, int *restrict categ_data,
1264
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1265
+ size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
1266
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1267
+ double *restrict tmat, double *restrict rmat, size_t n_from,
1268
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
1269
+ )
1270
+ {
1271
+ SignalSwitcher ss;
1272
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
1273
+ std::vector<sparse_ix> terminal_indices(nrows * ntrees);
1274
+ std::unique_ptr<double[]> ignored(new double[nrows]);
1275
+ predict_iforest(numeric_data, categ_data,
1276
+ is_col_major, ld_numeric, ld_categ,
1277
+ is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
1278
+ is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
1279
+ nrows, nthreads, false,
1280
+ model_outputs, model_outputs_ext,
1281
+ ignored.get(), terminal_indices.data(),
1282
+ (double*)NULL,
1283
+ indexer);
1284
+ ignored.reset();
1285
+
1286
+ #ifndef _OPENMP
1287
+ nthreads = 1;
1288
+ #endif
1289
+
1290
+ check_interrupt_switch(ss);
1291
+
1292
+ if (n_from == 0)
1293
+ {
1294
+ size_t ncomb = calc_ncomb(nrows);
1295
+ std::fill_n(tmat, ncomb, 0.);
1296
+
1297
+ std::vector<std::vector<double>> sum_separations(nthreads);
1298
+ if (nthreads != 1) {
1299
+ for (auto &v : sum_separations) v.resize(ncomb);
1300
+ }
1301
+
1302
+ std::vector<std::vector<size_t>> thread_argsorted_nodes(nthreads);
1303
+ for (auto &v : thread_argsorted_nodes) v.resize(nrows);
1304
+
1305
+ std::vector<std::vector<size_t>> thread_sorted_nodes(nthreads);
1306
+ for (auto &v : thread_sorted_nodes) v.reserve(nrows); /* <- could shrink to max number of terminal nodes */
1307
+
1308
+
1309
+ bool threw_exception = false;
1310
+ std::exception_ptr ex = NULL;
1311
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
1312
+ shared(model_outputs, model_outputs_ext, nthreads, indexer, nrows, ncomb, terminal_indices, \
1313
+ sum_separations, thread_argsorted_nodes, thread_sorted_nodes, tmat, \
1314
+ threw_exception, ex)
1315
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
1316
+ {
1317
+ if (interrupt_switch || threw_exception) continue;
1318
+
1319
+ if (unlikely(indexer->indices[tree].n_terminal <= 1))
1320
+ {
1321
+ for (auto &el : sum_separations[omp_get_thread_num()]) el += 1.;
1322
+ continue;
1323
+ }
1324
+
1325
+ double *restrict ptr_this_sep = sum_separations[omp_get_thread_num()].data();
1326
+ if (nthreads == 1) ptr_this_sep = tmat;
1327
+ double *restrict node_dist_this = indexer->indices[tree].node_distances.data();
1328
+ double *restrict node_depths_this = indexer->indices[tree].node_depths.data();
1329
+ size_t n_terminal_this = indexer->indices[tree].n_terminal;
1330
+ size_t ncomb_this = calc_ncomb(n_terminal_this);
1331
+ std::vector<IsoTree> *tree_this = (model_outputs != NULL)? &model_outputs->trees[tree] : nullptr;
1332
+ std::vector<IsoHPlane> *hplane_this = (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : nullptr;
1333
+ sparse_ix *restrict terminal_indices_this = terminal_indices.data() + nrows * tree;
1334
+ size_t i, j;
1335
+ double add_round;
1336
+
1337
+ if (assume_full_distr)
1338
+ {
1339
+ for (size_t el1 = 0; el1 < nrows-1; el1++)
1340
+ {
1341
+ i = terminal_indices_this[el1];
1342
+ for (size_t el2 = el1+1; el2 < nrows; el2++)
1343
+ {
1344
+ j = terminal_indices_this[el2];
1345
+ if (unlikely(i == j))
1346
+ add_round = node_depths_this[i] + 3.;
1347
+ else
1348
+ add_round = node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1349
+ ptr_this_sep[ix_comb(el1, el2, nrows, ncomb)] += add_round;
1350
+ }
1351
+ }
1352
+ }
1353
+
1354
+ else
1355
+ {
1356
+ hashed_set<size_t> nodes_w_repeated;
1357
+ try
1358
+ {
1359
+ nodes_w_repeated.reserve(n_terminal_this);
1360
+ for (size_t el1 = 0; el1 < nrows-1; el1++)
1361
+ {
1362
+ i = terminal_indices_this[el1];
1363
+ for (size_t el2 = el1+1; el2 < nrows; el2++)
1364
+ {
1365
+ j = terminal_indices_this[el2];
1366
+ if (unlikely(i == j))
1367
+ nodes_w_repeated.insert(i);
1368
+ else
1369
+ ptr_this_sep[ix_comb(el1, el2, nrows, ncomb)]
1370
+ +=
1371
+ node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1372
+ }
1373
+ }
1374
+ }
1375
+
1376
+ catch (...)
1377
+ {
1378
+ #pragma omp critical
1379
+ {
1380
+ if (!threw_exception)
1381
+ {
1382
+ threw_exception = true;
1383
+ ex = std::current_exception();
1384
+ }
1385
+ }
1386
+ }
1387
+
1388
+ if (likely(!nodes_w_repeated.empty()))
1389
+ {
1390
+ std::vector<size_t> *restrict argsorted_nodes = &thread_argsorted_nodes[omp_get_thread_num()];
1391
+ std::iota(argsorted_nodes->begin(), argsorted_nodes->end(), (size_t)0);
1392
+ std::sort(argsorted_nodes->begin(), argsorted_nodes->end(),
1393
+ [&terminal_indices_this](const size_t a, const size_t b)
1394
+ {return terminal_indices_this[a] < terminal_indices_this[b];});
1395
+ std::vector<size_t>::iterator curr_begin = argsorted_nodes->begin();
1396
+ std::vector<size_t>::iterator new_begin;
1397
+
1398
+ std::vector<size_t> *restrict sorted_nodes = &thread_sorted_nodes[omp_get_thread_num()];
1399
+ sorted_nodes->assign(nodes_w_repeated.begin(), nodes_w_repeated.end());
1400
+ std::sort(sorted_nodes->begin(), sorted_nodes->end());
1401
+ for (size_t node_ix : *sorted_nodes)
1402
+ {
1403
+ curr_begin = std::lower_bound(curr_begin, argsorted_nodes->end(),
1404
+ node_ix,
1405
+ [&terminal_indices_this](const size_t &a, const size_t &b)
1406
+ {return (size_t)terminal_indices_this[a] < b;});
1407
+ new_begin = std::upper_bound(curr_begin, argsorted_nodes->end(),
1408
+ node_ix,
1409
+ [&terminal_indices_this](const size_t &a, const size_t &b)
1410
+ {return a < (size_t)terminal_indices_this[b];});
1411
+ size_t n_this = std::distance(curr_begin, new_begin);
1412
+ double sep_this
1413
+ =
1414
+ n_this
1415
+ +
1416
+ ((tree_this != NULL)?
1417
+ (*tree_this)[node_ix].remainder
1418
+ :
1419
+ (*hplane_this)[node_ix].remainder);
1420
+ double sep_this_ = expected_separation_depth(sep_this) + node_depths_this[node_ix];
1421
+
1422
+ size_t i, j;
1423
+ for (size_t el1 = 0; el1 < n_this-1; el1++)
1424
+ {
1425
+ i = *(curr_begin + el1);
1426
+ for (size_t el2 = el1+1; el2 < n_this; el2++)
1427
+ {
1428
+ j = *(curr_begin + el2);
1429
+ ptr_this_sep[ix_comb(i, j, nrows, ncomb)] += sep_this_;
1430
+ }
1431
+ }
1432
+
1433
+ curr_begin = new_begin;
1434
+ }
1435
+ }
1436
+
1437
+ }
1438
+ }
1439
+
1440
+ check_interrupt_switch(ss);
1441
+
1442
+ if (threw_exception)
1443
+ std::rethrow_exception(ex);
1444
+
1445
+ if (nthreads == 1)
1446
+ {
1447
+ /* Here 'tmat' already contains the sum of separations */
1448
+ }
1449
+
1450
+ else
1451
+ {
1452
+ for (int tid = 0; tid < nthreads; tid++)
1453
+ {
1454
+ double *restrict seps_thread = sum_separations[tid].data();
1455
+ for (size_t ix = 0; ix < ncomb; ix++)
1456
+ tmat[ix] += seps_thread[ix];
1457
+ }
1458
+ }
1459
+
1460
+ check_interrupt_switch(ss);
1461
+
1462
+ if (standardize_dist)
1463
+ {
1464
+ double divisor;
1465
+ if (assume_full_distr)
1466
+ divisor = (double)(ntrees * 2);
1467
+ else
1468
+ divisor = (double)ntrees * ((model_outputs != NULL)? model_outputs->exp_avg_sep : model_outputs_ext->exp_avg_sep);
1469
+
1470
+ if (assume_full_distr)
1471
+ {
1472
+ double ntrees_dbl = (double)ntrees;
1473
+ #ifndef _WIN32
1474
+ #pragma omp simd
1475
+ #endif
1476
+ for (size_t ix = 0; ix < ncomb; ix++)
1477
+ tmat[ix] = std::exp2( - (tmat[ix] - ntrees_dbl) / divisor);
1478
+ }
1479
+
1480
+ else
1481
+ {
1482
+ #ifndef _WIN32
1483
+ #pragma omp simd
1484
+ #endif
1485
+ for (size_t ix = 0; ix < ncomb; ix++)
1486
+ tmat[ix] = std::exp2( - tmat[ix] / divisor);
1487
+ }
1488
+ }
1489
+
1490
+ else
1491
+ {
1492
+ double divisor = (double)ntrees;
1493
+ for (size_t ix = 0; ix < ncomb; ix++)
1494
+ tmat[ix] /= divisor;
1495
+ }
1496
+
1497
+ check_interrupt_switch(ss);
1498
+ }
1499
+
1500
+ /* TODO: merge this with the block above, can simplify lots of things by a couple if-elses */
1501
+ else /* has 'rmat' / 'nfrom>0' */
1502
+ {
1503
+ size_t n_to = nrows - n_from;
1504
+ size_t ncomb = n_from * n_to;
1505
+ std::fill_n(rmat, ncomb, 0.);
1506
+
1507
+ std::vector<std::vector<double>> sum_separations(nthreads);
1508
+ if (nthreads != 1) {
1509
+ for (auto &v : sum_separations) v.resize(ncomb);
1510
+ }
1511
+
1512
+ std::vector<std::vector<size_t>> thread_argsorted_nodes(nthreads);
1513
+ for (auto &v : thread_argsorted_nodes) v.resize(nrows);
1514
+
1515
+ std::vector<std::vector<size_t>> thread_doubly_argsorted(nthreads);
1516
+ for (auto &v : thread_doubly_argsorted) v.reserve(nrows);
1517
+
1518
+ std::vector<std::vector<size_t>> thread_sorted_nodes(nthreads);
1519
+ for (auto &v : thread_sorted_nodes) v.reserve(nrows); /* <- could shrink to max number of terminal nodes */
1520
+
1521
+ bool threw_exception = false;
1522
+ std::exception_ptr ex = NULL;
1523
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
1524
+ shared(model_outputs, model_outputs_ext, nthreads, indexer, nrows, ncomb, terminal_indices, \
1525
+ sum_separations, thread_argsorted_nodes, thread_sorted_nodes, thread_doubly_argsorted, rmat, n_to, n_from, \
1526
+ threw_exception, ex)
1527
+ for (size_t_for tree = 0; tree < (decltype(tree))ntrees; tree++)
1528
+ {
1529
+ if (interrupt_switch || threw_exception) continue;
1530
+
1531
+ if (unlikely(indexer->indices[tree].n_terminal <= 1))
1532
+ {
1533
+ for (auto &el : sum_separations[omp_get_thread_num()]) el += 1.;
1534
+ continue;
1535
+ }
1536
+
1537
+ double *restrict ptr_this_sep = sum_separations[omp_get_thread_num()].data();
1538
+ if (nthreads == 1) ptr_this_sep = rmat;
1539
+ double *restrict node_dist_this = indexer->indices[tree].node_distances.data();
1540
+ double *restrict node_depths_this = indexer->indices[tree].node_depths.data();
1541
+ size_t n_terminal_this = indexer->indices[tree].n_terminal;
1542
+ size_t ncomb_this = calc_ncomb(n_terminal_this);
1543
+ std::vector<IsoTree> *tree_this = (model_outputs != NULL)? &model_outputs->trees[tree] : nullptr;
1544
+ std::vector<IsoHPlane> *hplane_this = (model_outputs_ext != NULL)? &model_outputs_ext->hplanes[tree] : nullptr;
1545
+ sparse_ix *restrict terminal_indices_this = terminal_indices.data() + nrows * tree;
1546
+ size_t i, j;
1547
+ double add_round;
1548
+
1549
+ if (assume_full_distr)
1550
+ {
1551
+ for (size_t el1 = 0; el1 < n_from; el1++)
1552
+ {
1553
+ i = terminal_indices_this[el1];
1554
+ double *ptr_this_sep_ = ptr_this_sep + el1*n_to;
1555
+ for (size_t el2 = n_from; el2 < nrows; el2++)
1556
+ {
1557
+ j = terminal_indices_this[el2];
1558
+ if (unlikely(i == j))
1559
+ add_round = node_depths_this[i] + 3.;
1560
+ else
1561
+ add_round = node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1562
+ ptr_this_sep_[el2-n_from] += add_round;
1563
+ }
1564
+ }
1565
+ }
1566
+
1567
+ else
1568
+ {
1569
+ hashed_set<size_t> nodes_w_repeated;
1570
+ try
1571
+ {
1572
+ nodes_w_repeated.reserve(n_terminal_this);
1573
+ for (size_t el1 = 0; el1 < n_from; el1++)
1574
+ {
1575
+ i = terminal_indices_this[el1];
1576
+ double *ptr_this_sep_ = ptr_this_sep + el1*n_to;
1577
+ for (size_t el2 = n_from; el2 < nrows; el2++)
1578
+ {
1579
+ j = terminal_indices_this[el2];
1580
+ if (unlikely(i == j))
1581
+ nodes_w_repeated.insert(i);
1582
+ else
1583
+ ptr_this_sep_[el2-n_from]
1584
+ +=
1585
+ node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1586
+ }
1587
+ }
1588
+
1589
+ if (likely(!nodes_w_repeated.empty()))
1590
+ {
1591
+ std::vector<size_t> *restrict argsorted_nodes = &thread_argsorted_nodes[omp_get_thread_num()];
1592
+ std::iota(argsorted_nodes->begin(), argsorted_nodes->end(), (size_t)0);
1593
+ std::sort(argsorted_nodes->begin(), argsorted_nodes->end(),
1594
+ [&terminal_indices_this](const size_t a, const size_t b)
1595
+ {return terminal_indices_this[a] < terminal_indices_this[b];});
1596
+ std::vector<size_t>::iterator curr_begin = argsorted_nodes->begin();
1597
+ std::vector<size_t>::iterator new_begin;
1598
+
1599
+ std::vector<size_t> *restrict sorted_nodes = &thread_sorted_nodes[omp_get_thread_num()];
1600
+ sorted_nodes->assign(nodes_w_repeated.begin(), nodes_w_repeated.end());
1601
+ std::sort(sorted_nodes->begin(), sorted_nodes->end());
1602
+ for (size_t node_ix : *sorted_nodes)
1603
+ {
1604
+ curr_begin = std::lower_bound(curr_begin, argsorted_nodes->end(),
1605
+ node_ix,
1606
+ [&terminal_indices_this](const size_t &a, const size_t &b)
1607
+ {return (size_t)terminal_indices_this[a] < b;});
1608
+ new_begin = std::upper_bound(curr_begin, argsorted_nodes->end(),
1609
+ node_ix,
1610
+ [&terminal_indices_this](const size_t &a, const size_t &b)
1611
+ {return a < (size_t)terminal_indices_this[b];});
1612
+ size_t n_this = std::distance(curr_begin, new_begin);
1613
+ if (unlikely(!n_this)) unexpected_error();
1614
+ double sep_this
1615
+ =
1616
+ n_this
1617
+ +
1618
+ ((tree_this != NULL)?
1619
+ (*tree_this)[node_ix].remainder
1620
+ :
1621
+ (*hplane_this)[node_ix].remainder);
1622
+ double sep_this_ = expected_separation_depth(sep_this) + node_depths_this[node_ix];
1623
+
1624
+ std::vector<size_t> *restrict doubly_argsorted = &thread_doubly_argsorted[omp_get_thread_num()];
1625
+ doubly_argsorted->assign(curr_begin, curr_begin + n_this);
1626
+ std::sort(doubly_argsorted->begin(), doubly_argsorted->end());
1627
+ std::vector<size_t>::iterator pos_n_from = std::lower_bound(doubly_argsorted->begin(),
1628
+ doubly_argsorted->end(),
1629
+ n_from);
1630
+ if (pos_n_from == doubly_argsorted->end()) unexpected_error();
1631
+ size_t n1 = std::distance(doubly_argsorted->begin(), pos_n_from);
1632
+ size_t i, j;
1633
+ double *ptr_this_sep__;
1634
+ for (size_t el1 = 0; el1 < n1; el1++)
1635
+ {
1636
+ i = (*doubly_argsorted)[el1];
1637
+ ptr_this_sep__ = ptr_this_sep + i*n_to;
1638
+ for (size_t el2 = n1; el2 < n_this; el2++)
1639
+ {
1640
+ j = (*doubly_argsorted)[el2];
1641
+ ptr_this_sep__[j-n_from] += sep_this_;
1642
+ }
1643
+ }
1644
+
1645
+ curr_begin = new_begin;
1646
+ }
1647
+ }
1648
+ }
1649
+
1650
+ catch (...)
1651
+ {
1652
+ #pragma omp critical
1653
+ {
1654
+ if (!threw_exception)
1655
+ {
1656
+ threw_exception = true;
1657
+ ex = std::current_exception();
1658
+ }
1659
+ }
1660
+ }
1661
+ }
1662
+ }
1663
+
1664
+ check_interrupt_switch(ss);
1665
+
1666
+ if (threw_exception)
1667
+ std::rethrow_exception(ex);
1668
+
1669
+ if (nthreads == 1)
1670
+ {
1671
+ /* Here 'rmat' already contains the sum of separations */
1672
+ }
1673
+
1674
+ else
1675
+ {
1676
+ for (int tid = 0; tid < nthreads; tid++)
1677
+ {
1678
+ double *restrict seps_thread = sum_separations[tid].data();
1679
+ for (size_t ix = 0; ix < ncomb; ix++)
1680
+ rmat[ix] += seps_thread[ix];
1681
+ }
1682
+ }
1683
+
1684
+ check_interrupt_switch(ss);
1685
+
1686
+ if (standardize_dist)
1687
+ {
1688
+ double divisor;
1689
+ if (assume_full_distr)
1690
+ divisor = (double)(ntrees * 2);
1691
+ else
1692
+ divisor = (double)ntrees * ((model_outputs != NULL)? model_outputs->exp_avg_sep : model_outputs_ext->exp_avg_sep);
1693
+
1694
+ if (assume_full_distr)
1695
+ {
1696
+ double ntrees_dbl = (double)ntrees;
1697
+ #ifndef _WIN32
1698
+ #pragma omp simd
1699
+ #endif
1700
+ for (size_t ix = 0; ix < ncomb; ix++)
1701
+ rmat[ix] = std::exp2( - (rmat[ix] - ntrees_dbl) / divisor);
1702
+ }
1703
+
1704
+ else
1705
+ {
1706
+ #ifndef _WIN32
1707
+ #pragma omp simd
1708
+ #endif
1709
+ for (size_t ix = 0; ix < ncomb; ix++)
1710
+ rmat[ix] = std::exp2( - rmat[ix] / divisor);
1711
+ }
1712
+ }
1713
+
1714
+ else
1715
+ {
1716
+ double divisor = (double)ntrees;
1717
+ for (size_t ix = 0; ix < ncomb; ix++)
1718
+ rmat[ix] /= divisor;
1719
+ }
1720
+
1721
+ check_interrupt_switch(ss);
1722
+ }
1723
+ }
1724
+
1725
+ template <class real_t, class sparse_ix>
1726
+ void calc_similarity_from_indexer_with_references
1727
+ (
1728
+ real_t *restrict numeric_data, int *restrict categ_data,
1729
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1730
+ size_t nrows, int nthreads, bool standardize_dist,
1731
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1732
+ double *restrict rmat,
1733
+ TreesIndexer *indexer, bool is_col_major, size_t ld_numeric, size_t ld_categ
1734
+ )
1735
+ {
1736
+ size_t n_ref = get_number_of_reference_points(*indexer);
1737
+ if (unlikely(!n_ref)) unexpected_error();
1738
+
1739
+ SignalSwitcher ss;
1740
+
1741
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
1742
+ std::vector<sparse_ix> terminal_indices(nrows * ntrees);
1743
+ std::unique_ptr<double[]> ignored(new double[nrows]);
1744
+ predict_iforest(numeric_data, categ_data,
1745
+ is_col_major, ld_numeric, ld_categ,
1746
+ is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
1747
+ is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
1748
+ nrows, nthreads, false,
1749
+ model_outputs, model_outputs_ext,
1750
+ ignored.get(), terminal_indices.data(),
1751
+ (double*)NULL,
1752
+ indexer);
1753
+ ignored.reset();
1754
+
1755
+ #ifndef _OPENMP
1756
+ nthreads = 1;
1757
+ #endif
1758
+
1759
+ check_interrupt_switch(ss);
1760
+
1761
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
1762
+ shared(rmat, terminal_indices, nrows, n_ref, indexer, ntrees)
1763
+ for (size_t_for row = 0; row < (decltype(row))nrows; row++)
1764
+ {
1765
+ if (interrupt_switch) continue;
1766
+
1767
+ size_t i, j;
1768
+ size_t n_terminal_this;
1769
+ size_t ncomb_this;
1770
+ size_t *restrict ref_this;
1771
+ sparse_ix *restrict ind_this;
1772
+ double *restrict node_depths_this;
1773
+ double *restrict node_dist_this;
1774
+ double *rmat_this = rmat + row*n_ref;
1775
+ memset(rmat_this, 0, n_ref*sizeof(double));
1776
+ for (size_t tree = 0; tree < ntrees; tree++)
1777
+ {
1778
+ ref_this = indexer->indices[tree].reference_points.data();
1779
+ ind_this = terminal_indices.data() + tree*nrows;
1780
+ node_depths_this = indexer->indices[tree].node_depths.data();
1781
+ n_terminal_this = indexer->indices[tree].n_terminal;
1782
+ node_dist_this = indexer->indices[tree].node_distances.data();
1783
+ ncomb_this = calc_ncomb(n_terminal_this);
1784
+ for (size_t ref = 0; ref < n_ref; ref++)
1785
+ {
1786
+ i = ind_this[row];
1787
+ j = ref_this[ref];
1788
+
1789
+ if (unlikely(i == j))
1790
+ rmat_this[ref] += node_depths_this[i] + 3.;
1791
+ else
1792
+ rmat_this[ref] += node_dist_this[ix_comb(i, j, n_terminal_this, ncomb_this)];
1793
+ }
1794
+ }
1795
+ }
1796
+
1797
+ check_interrupt_switch(ss);
1798
+
1799
+ size_t size_rmat = nrows * n_ref;
1800
+ if (standardize_dist)
1801
+ {
1802
+ double ntrees_dbl = (double)ntrees;
1803
+ double div_trees = (double)(mult2(ntrees));
1804
+ #ifndef _WIN32
1805
+ #pragma omp simd
1806
+ #endif
1807
+ for (size_t ix = 0; ix < size_rmat; ix++)
1808
+ rmat[ix] = std::exp2( - (rmat[ix] - ntrees_dbl) / div_trees);
1809
+ }
1810
+
1811
+ else
1812
+ {
1813
+ double div_trees = (double)ntrees;
1814
+ for (size_t ix = 0; ix < size_rmat; ix++)
1815
+ rmat[ix] /= div_trees;
1816
+ }
1817
+
1818
+ check_interrupt_switch(ss);
1819
+ }
1820
+
1821
+ template <class real_t, class sparse_ix>
1822
+ void kernel_to_references(TreesIndexer &indexer,
1823
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
1824
+ real_t *restrict numeric_data, int *restrict categ_data,
1825
+ real_t *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
1826
+ bool is_col_major, size_t ld_numeric, size_t ld_categ,
1827
+ size_t nrows, int nthreads,
1828
+ double *restrict rmat,
1829
+ bool standardize)
1830
+ {
1831
+ size_t ntrees = indexer.indices.size();
1832
+ size_t n_ref = indexer.indices.front().reference_points.size();
1833
+
1834
+ SignalSwitcher ss;
1835
+
1836
+ std::unique_ptr<sparse_ix[]> terminal_indices(new sparse_ix[nrows*ntrees]);
1837
+ std::unique_ptr<double[]> ignored(new double[nrows]);
1838
+ predict_iforest(numeric_data, categ_data,
1839
+ is_col_major, ld_numeric, ld_categ,
1840
+ is_col_major? Xc : nullptr, is_col_major? Xc_ind : nullptr, is_col_major? Xc_indptr : nullptr,
1841
+ is_col_major? (real_t*)nullptr : Xc, is_col_major? (sparse_ix*)nullptr : Xc_ind, is_col_major? (sparse_ix*)nullptr : Xc_indptr,
1842
+ nrows, nthreads, false,
1843
+ model_outputs, model_outputs_ext,
1844
+ ignored.get(), terminal_indices.get(),
1845
+ (double*)NULL,
1846
+ &indexer);
1847
+ ignored.reset();
1848
+
1849
+ check_interrupt_switch(ss);
1850
+
1851
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
1852
+ shared(indexer, terminal_indices, nrows, ntrees, n_ref, rmat)
1853
+ for (size_t_for row = 0; row < (decltype(row))nrows; row++)
1854
+ {
1855
+ if (interrupt_switch) continue;
1856
+
1857
+ SingleTreeIndex *restrict index_node;
1858
+ size_t idx_this;
1859
+ sparse_ix *restrict terminal_indices_this = terminal_indices.get() + row;
1860
+ double *restrict rmat_this = rmat + row*n_ref;
1861
+ memset(rmat_this, 0, n_ref*sizeof(double));
1862
+
1863
+ for (size_t tree = 0; tree < ntrees; tree++)
1864
+ {
1865
+ idx_this = terminal_indices_this[tree*nrows];
1866
+ index_node = &indexer.indices[tree];
1867
+ for (size_t ind = index_node->reference_indptr[idx_this];
1868
+ ind < index_node->reference_indptr[idx_this + 1];
1869
+ ind++)
1870
+ {
1871
+ rmat_this[index_node->reference_mapping[ind]]++;
1872
+ }
1873
+ }
1874
+ }
1875
+
1876
+ check_interrupt_switch(ss);
1877
+
1878
+ if (standardize)
1879
+ {
1880
+ double ntrees_dbl = (double)ntrees;
1881
+ for (size_t ix = 0; ix < nrows*n_ref; ix++)
1882
+ rmat[ix] /= ntrees_dbl;
1883
+ }
1884
+
1885
+ check_interrupt_switch(ss);
1886
+ }