isotree 0.2.2 → 0.3.0

Files changed (151)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
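Much of the churn in this list is the vendored isotree sources moving their template code out of .cpp files and into headers: impute.cpp becomes impute.hpp, new template headers such as crit.hpp, dist.hpp, fit_model.hpp and predict.hpp appear while their .cpp counterparts (entries 143-151) are deleted, and instantiate_template_headers.{hpp,cpp} is added. The bundled cereal library (entries 53-141) is removed while serialize.cpp grows by several thousand lines. A generic sketch of the header-plus-explicit-instantiation pattern those renames point to is shown below; the names are invented for illustration and are not the library's actual declarations.

/* some_stats.hpp -- template definition kept in a header so every translation
   unit can instantiate it for the numeric type it needs (illustrative only) */
#pragma once
#include <cstddef>

template <class real_t>
real_t column_sum(const real_t *x, std::size_t n)
{
    real_t total = 0;
    for (std::size_t i = 0; i < n; i++)
        total += x[i];
    return total;
}

/* instantiate_templates.cpp -- one translation unit emits the explicit
   instantiations that the rest of the library links against */
#include "some_stats.hpp"
template double column_sum<double>(const double *, std::size_t);
template float  column_sum<float>(const float *, std::size_t);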
data/vendor/isotree/src/mult.hpp (new file)
@@ -0,0 +1,1321 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
41
+ *
42
+ * BSD 2-Clause License
43
+ * Copyright (c) 2019-2022, David Cortes
44
+ * All rights reserved.
45
+ * Redistribution and use in source and binary forms, with or without
46
+ * modification, are permitted provided that the following conditions are met:
47
+ * * Redistributions of source code must retain the above copyright notice, this
48
+ * list of conditions and the following disclaimer.
49
+ * * Redistributions in binary form must reproduce the above copyright notice,
50
+ * this list of conditions and the following disclaimer in the documentation
51
+ * and/or other materials provided with the distribution.
52
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
53
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
55
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
56
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
58
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
59
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
60
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+ */
63
+ #include "isotree.hpp"
64
+
65
+ /* FIXME / TODO: here the calculations of medians do not take weights into account */
66
+
67
+ #define SD_MIN 1e-10
68
+ /* https://www.johndcook.com/blog/standard_deviation/ */
69
+
70
+ /* for regular numerical */
71
+ template <class real_t, class real_t_>
72
+ void calc_mean_and_sd_t(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x,
73
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean)
74
+ {
75
+ real_t m = 0;
76
+ real_t s = 0;
77
+ real_t m_prev = x[ix_arr[st]];
78
+ real_t xval;
79
+
80
+ if (missing_action == Fail)
81
+ {
82
+ m_prev = x[ix_arr[st]];
83
+ for (size_t row = st; row <= end; row++)
84
+ {
85
+ xval = x[ix_arr[row]];
86
+ m += (xval - m) / (real_t)(row - st + 1);
87
+ s = std::fma(xval - m, xval - m_prev, s);
88
+ m_prev = m;
89
+ }
90
+
91
+ x_mean = m;
92
+ x_sd = std::sqrt(s / (real_t)(end - st + 1));
93
+ }
94
+
95
+ else
96
+ {
97
+ size_t cnt = 0;
98
+ while (is_na_or_inf(m_prev) && st <= end)
99
+ {
100
+ m_prev = x[ix_arr[++st]];
101
+ }
102
+
103
+ for (size_t row = st; row <= end; row++)
104
+ {
105
+ xval = x[ix_arr[row]];
106
+ if (likely(!is_na_or_inf(xval)))
107
+ {
108
+ cnt++;
109
+ m += (xval - m) / (real_t)cnt;
110
+ s = std::fma(xval - m, xval - m_prev, s);
111
+ m_prev = m;
112
+ }
113
+ }
114
+
115
+ x_mean = m;
116
+ x_sd = std::sqrt(s / (real_t)cnt);
117
+ }
118
+ }
119
+
120
+ template <class real_t_>
121
+ double calc_mean_only(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x)
122
+ {
123
+ size_t cnt = 0;
124
+ double m = 0;
125
+ real_t_ xval;
126
+ for (size_t row = st; row <= end; row++)
127
+ {
128
+ xval = x[ix_arr[row]];
129
+ if (likely(!is_na_or_inf(xval)))
130
+ {
131
+ cnt++;
132
+ m += (xval - m) / (double)cnt;
133
+ }
134
+ }
135
+
136
+ return m;
137
+ }
138
+
139
+ template <class real_t_, class ldouble_safe>
140
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x,
141
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean)
142
+ {
143
+ if (end - st + 1 < THRESHOLD_LONG_DOUBLE)
144
+ calc_mean_and_sd_t<double, real_t_>(ix_arr, st, end, x, missing_action, x_sd, x_mean);
145
+ else
146
+ calc_mean_and_sd_t<ldouble_safe, real_t_>(ix_arr, st, end, x, missing_action, x_sd, x_mean);
147
+ x_sd = std::fmax(x_sd, SD_MIN);
148
+ }
149
+
150
+ template <class real_t_, class mapping, class ldouble_safe>
151
+ void calc_mean_and_sd_weighted(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x, mapping &restrict w,
152
+ MissingAction missing_action, double &restrict x_sd, double &restrict x_mean)
153
+ {
154
+ ldouble_safe cnt = 0;
155
+ ldouble_safe w_this;
156
+ ldouble_safe m = 0;
157
+ ldouble_safe s = 0;
158
+ ldouble_safe m_prev = x[ix_arr[st]];
159
+ ldouble_safe xval;
160
+ while (is_na_or_inf(m_prev) && st <= end)
161
+ {
162
+ m_prev = x[ix_arr[++st]];
163
+ }
164
+
165
+ for (size_t row = st; row <= end; row++)
166
+ {
167
+ xval = x[ix_arr[row]];
168
+ if (likely(!is_na_or_inf(xval)))
169
+ {
170
+ w_this = w[ix_arr[row]];
171
+ cnt += w_this;
172
+ m = std::fma(w_this, (xval - m) / cnt, m);
173
+ s = std::fma(w_this, (xval - m) * (xval - m_prev), s);
174
+ m_prev = m;
175
+ }
176
+ }
177
+
178
+ x_mean = m;
179
+ x_sd = std::sqrt((ldouble_safe)s / (ldouble_safe)cnt);
180
+ }
181
+
182
+ template <class real_t_, class mapping>
183
+ double calc_mean_only_weighted(size_t ix_arr[], size_t st, size_t end, real_t_ *restrict x, mapping &restrict w)
184
+ {
185
+ double cnt = 0;
186
+ double w_this;
187
+ double m = 0;
188
+ for (size_t row = st; row <= end; row++)
189
+ {
190
+ if (likely(!is_na_or_inf(x[ix_arr[row]])))
191
+ {
192
+ w_this = w[ix_arr[row]];
193
+ cnt += w_this;
194
+ m = std::fma(w_this, (x[ix_arr[row]] - m) / cnt, m);
195
+ }
196
+ }
197
+
198
+ return m;
199
+ }
200
+
201
+ /* for sparse numerical */
202
+ template <class real_t_, class sparse_ix, class real_t>
203
+ void calc_mean_and_sd_(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
204
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
205
+ double &restrict x_sd, double &restrict x_mean)
206
+ {
207
+ /* ix_arr must be already sorted beforehand */
208
+ if (Xc_indptr[col_num] == Xc_indptr[col_num + 1])
209
+ {
210
+ x_sd = 0;
211
+ x_mean = 0;
212
+ return;
213
+ }
214
+ size_t st_col = Xc_indptr[col_num];
215
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
216
+ size_t curr_pos = st_col;
217
+ size_t ind_end_col = (size_t) Xc_ind[end_col];
218
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
219
+
220
+ size_t cnt = end - st + 1;
221
+ size_t added = 0;
222
+ real_t m = 0;
223
+ real_t s = 0;
224
+ real_t m_prev = 0;
225
+
226
+ for (size_t *row = ptr_st;
227
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
228
+ )
229
+ {
230
+ if (Xc_ind[curr_pos] == (sparse_ix)(*row))
231
+ {
232
+ if (unlikely(is_na_or_inf(Xc[curr_pos])))
233
+ {
234
+ cnt--;
235
+ }
236
+
237
+ else
238
+ {
239
+ if (added == 0) m_prev = Xc[curr_pos];
240
+ m += (Xc[curr_pos] - m) / (real_t)(++added);
241
+ s = std::fma(Xc[curr_pos] - m, Xc[curr_pos] - m_prev, s);
242
+ m_prev = m;
243
+ }
244
+
245
+ if (row == ix_arr + end || curr_pos == end_col) break;
246
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
247
+ }
248
+
249
+ else
250
+ {
251
+ if (Xc_ind[curr_pos] > (sparse_ix)(*row))
252
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
253
+ else
254
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
255
+ }
256
+ }
257
+
258
+ if (added == 0)
259
+ {
260
+ x_mean = 0;
261
+ x_sd = 0;
262
+ return;
263
+ }
264
+
265
+ /* Note: up to this point:
266
+ m = sum(x)/nnz
267
+ s = sum(x^2) - (1/nnz)*(sum(x)^2)
268
+ Here the standard deviation is given by:
269
+ sigma = (1/n)*(sum(x^2) - (1/n)*(sum(x)^2))
270
+ The difference can be put to a closed form. */
271
+ if (cnt > added)
272
+ {
273
+ s += square(m) * ((real_t)added * ((real_t)1 - (real_t)added/(real_t)cnt));
274
+ m *= (real_t)added / (real_t)cnt;
275
+ }
276
+
277
+ x_mean = m;
278
+ x_sd = std::sqrt(s / (real_t)cnt);
279
+ }
280
+
281
+ template <class real_t_, class sparse_ix, class ldouble_safe>
282
+ void calc_mean_and_sd(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
283
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
284
+ double &restrict x_sd, double &restrict x_mean)
285
+ {
286
+ if (end - st + 1 < THRESHOLD_LONG_DOUBLE)
287
+ calc_mean_and_sd_<real_t_, sparse_ix, double>(ix_arr, st, end, col_num, Xc, Xc_ind, Xc_indptr, x_sd, x_mean);
288
+ else
289
+ calc_mean_and_sd_<real_t_, sparse_ix, ldouble_safe>(ix_arr, st, end, col_num, Xc, Xc_ind, Xc_indptr, x_sd, x_mean);
290
+ x_sd = std::fmax(SD_MIN, x_sd);
291
+ }
292
+
293
+ template <class real_t_, class sparse_ix, class ldouble_safe>
294
+ double calc_mean_only(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
295
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr)
296
+ {
297
+ /* ix_arr must be already sorted beforehand */
298
+ if (Xc_indptr[col_num] == Xc_indptr[col_num + 1])
299
+ return 0;
300
+ size_t st_col = Xc_indptr[col_num];
301
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
302
+ size_t curr_pos = st_col;
303
+ size_t ind_end_col = (size_t) Xc_ind[end_col];
304
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
305
+
306
+ size_t cnt = end - st + 1;
307
+ size_t added = 0;
308
+ double m = 0;
309
+
310
+ for (size_t *row = ptr_st;
311
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
312
+ )
313
+ {
314
+ if (Xc_ind[curr_pos] == (sparse_ix)(*row))
315
+ {
316
+ if (unlikely(is_na_or_inf(Xc[curr_pos])))
317
+ cnt--;
318
+ else
319
+ m += (Xc[curr_pos] - m) / (double)(++added);
320
+
321
+ if (row == ix_arr + end || curr_pos == end_col) break;
322
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
323
+ }
324
+
325
+ else
326
+ {
327
+ if (Xc_ind[curr_pos] > (sparse_ix)(*row))
328
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
329
+ else
330
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
331
+ }
332
+ }
333
+
334
+ if (added == 0)
335
+ return 0;
336
+
337
+ if (cnt > added)
338
+ m *= ((ldouble_safe)added / (ldouble_safe)cnt);
339
+
340
+ return m;
341
+ }
342
+
343
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
344
+ void calc_mean_and_sd_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
345
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
346
+ double &restrict x_sd, double &restrict x_mean, mapping &restrict w)
347
+ {
348
+ /* ix_arr must be already sorted beforehand */
349
+ if (Xc_indptr[col_num] == Xc_indptr[col_num + 1])
350
+ {
351
+ x_sd = 0;
352
+ x_mean = 0;
353
+ return;
354
+ }
355
+ size_t st_col = Xc_indptr[col_num];
356
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
357
+ size_t curr_pos = st_col;
358
+ size_t ind_end_col = (size_t) Xc_ind[end_col];
359
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
360
+
361
+ ldouble_safe cnt = 0.;
362
+ for (size_t row = st; row <= end; row++)
363
+ cnt += w[ix_arr[row]];
364
+ ldouble_safe added = 0;
365
+ ldouble_safe m = 0;
366
+ ldouble_safe s = 0;
367
+ ldouble_safe m_prev = 0;
368
+ ldouble_safe w_this;
369
+
370
+ for (size_t *row = ptr_st;
371
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
372
+ )
373
+ {
374
+ if (Xc_ind[curr_pos] == (sparse_ix)(*row))
375
+ {
376
+ if (unlikely(is_na_or_inf(Xc[curr_pos])))
377
+ {
378
+ cnt -= w[*row];
379
+ }
380
+
381
+ else
382
+ {
383
+ w_this = w[*row];
384
+ if (added == 0) m_prev = Xc[curr_pos];
385
+ added += w_this;
386
+ m = std::fma(w_this, (Xc[curr_pos] - m) / added, m);
387
+ s = std::fma(w_this, (Xc[curr_pos] - m) * (Xc[curr_pos] - m_prev), s);
388
+ m_prev = m;
389
+ }
390
+
391
+ if (row == ix_arr + end || curr_pos == end_col) break;
392
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
393
+ }
394
+
395
+ else
396
+ {
397
+ if (Xc_ind[curr_pos] > (sparse_ix)(*row))
398
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
399
+ else
400
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
401
+ }
402
+ }
403
+
404
+ if (added == 0)
405
+ {
406
+ x_mean = 0;
407
+ x_sd = 0;
408
+ return;
409
+ }
410
+
411
+ /* Note: up to this point:
412
+ m = sum(x)/nnz
413
+ s = sum(x^2) - (1/nnz)*(sum(x)^2)
414
+ Here the standard deviation is given by:
415
+ sigma = (1/n)*(sum(x^2) - (1/n)*(sum(x)^2))
416
+ The difference can be put to a closed form. */
417
+ if (cnt > added)
418
+ {
419
+ s += square(m) * (added * ((ldouble_safe)1 - (ldouble_safe)added/(ldouble_safe)cnt));
420
+ m *= added / cnt;
421
+ }
422
+
423
+ x_mean = m;
424
+ x_sd = std::sqrt(s / (ldouble_safe)cnt);
425
+ }
426
+
427
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
428
+ double calc_mean_only_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num,
429
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
430
+ mapping &restrict w)
431
+ {
432
+ /* ix_arr must be already sorted beforehand */
433
+ if (Xc_indptr[col_num] == Xc_indptr[col_num + 1])
434
+ return 0;
435
+ size_t st_col = Xc_indptr[col_num];
436
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
437
+ size_t curr_pos = st_col;
438
+ size_t ind_end_col = (size_t) Xc_ind[end_col];
439
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
440
+
441
+ ldouble_safe cnt = 0.;
442
+ for (size_t row = st; row <= end; row++)
443
+ cnt += w[ix_arr[row]];
444
+ ldouble_safe added = 0;
445
+ ldouble_safe m = 0;
446
+ ldouble_safe w_this;
447
+
448
+ for (size_t *row = ptr_st;
449
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
450
+ )
451
+ {
452
+ if (Xc_ind[curr_pos] == (sparse_ix)(*row))
453
+ {
454
+ if (unlikely(is_na_or_inf(Xc[curr_pos]))) {
455
+ cnt -= w[*row];
456
+ }
457
+
458
+ else {
459
+ w_this = w[*row];
460
+ added += w_this;
461
+ m += w_this * (Xc[curr_pos] - m) / added;
462
+ }
463
+
464
+ if (row == ix_arr + end || curr_pos == end_col) break;
465
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
466
+ }
467
+
468
+ else
469
+ {
470
+ if (Xc_ind[curr_pos] > (sparse_ix)(*row))
471
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
472
+ else
473
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
474
+ }
475
+ }
476
+
477
+ if (added == 0)
478
+ return 0;
479
+
480
+ if (cnt > added)
481
+ m *= (ldouble_safe)added / (ldouble_safe)cnt;
482
+
483
+ return m;
484
+ }
485
+
486
+ /* Note about these functions: they write into an array that does not need to match to 'ix_arr',
487
+ and instead, the index that is stored in ix_arr[n] will have the value in res[n] */
488
+
489
+
490
+ /* for regular numerical */
491
+ template <class real_t_>
492
+ void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
493
+ real_t_ *restrict x, double &coef, double x_sd, double x_mean, double &restrict fill_val,
494
+ MissingAction missing_action, double *restrict buffer_arr,
495
+ size_t *restrict buffer_NAs, bool first_run)
496
+ {
497
+ /* TODO: here don't need the buffer for NAs */
498
+
499
+ if (first_run)
500
+ coef /= x_sd;
501
+
502
+ size_t cnt = 0;
503
+ size_t cnt_NA = 0;
504
+ double *restrict res_write = res - st;
505
+
506
+ if (missing_action == Fail)
507
+ {
508
+ for (size_t row = st; row <= end; row++)
509
+ res_write[row] = std::fma(x[ix_arr[row]] - x_mean, coef, res_write[row]);
510
+ }
511
+
512
+ else
513
+ {
514
+ if (first_run)
515
+ {
516
+ for (size_t row = st; row <= end; row++)
517
+ {
518
+ if (likely(!is_na_or_inf(x[ix_arr[row]])))
519
+ {
520
+ res_write[row] = std::fma(x[ix_arr[row]] - x_mean, coef, res_write[row]);
521
+ buffer_arr[cnt++] = x[ix_arr[row]];
522
+ }
523
+
524
+ else
525
+ {
526
+ buffer_NAs[cnt_NA++] = row;
527
+ }
528
+
529
+ }
530
+ }
531
+
532
+ else
533
+ {
534
+ for (size_t row = st; row <= end; row++)
535
+ {
536
+ res_write[row] += (is_na_or_inf(x[ix_arr[row]]))? fill_val : ( (x[ix_arr[row]]-x_mean) * coef );
537
+ }
538
+ return;
539
+ }
540
+
541
+ size_t mid_ceil = cnt / 2;
542
+ std::partial_sort(buffer_arr, buffer_arr + mid_ceil + 1, buffer_arr + cnt);
543
+
544
+ if ((cnt % 2) == 0)
545
+ fill_val = buffer_arr[mid_ceil-1] + (buffer_arr[mid_ceil] - buffer_arr[mid_ceil-1]) / 2.0;
546
+ else
547
+ fill_val = buffer_arr[mid_ceil];
548
+
549
+ fill_val = (fill_val - x_mean) * coef;
550
+ if (cnt_NA && fill_val)
551
+ {
552
+ for (size_t row = 0; row < cnt_NA; row++)
553
+ res_write[buffer_NAs[row]] += fill_val;
554
+ }
555
+
556
+ }
557
+ }
558
+
559
+ /* for regular numerical */
560
+ template <class real_t_, class mapping, class ldouble_safe>
561
+ void add_linear_comb_weighted(size_t ix_arr[], size_t st, size_t end, double *restrict res,
562
+ real_t_ *restrict x, double &coef, double x_sd, double x_mean, double &restrict fill_val,
563
+ MissingAction missing_action, double *restrict buffer_arr,
564
+ size_t *restrict buffer_NAs, bool first_run, mapping &restrict w)
565
+ {
566
+ /* TODO: here don't need the buffer for NAs */
567
+
568
+ if (first_run)
569
+ coef /= x_sd;
570
+
571
+ size_t cnt = 0;
572
+ size_t cnt_NA = 0;
573
+ double *restrict res_write = res - st;
574
+ ldouble_safe cumw = 0;
575
+ double w_this;
576
+ /* TODO: these buffers should be allocated externally */
577
+ std::vector<double> obs_weight;
578
+
579
+ if (first_run && missing_action != Fail)
580
+ {
581
+ obs_weight.resize(end - st + 1, 0.);
582
+ }
583
+
584
+ if (missing_action == Fail)
585
+ {
586
+ for (size_t row = st; row <= end; row++)
587
+ res_write[row] = std::fma(x[ix_arr[row]] - x_mean, coef, res_write[row]);
588
+ }
589
+
590
+ else
591
+ {
592
+ if (first_run)
593
+ {
594
+ for (size_t row = st; row <= end; row++)
595
+ {
596
+ if (likely(!is_na_or_inf(x[ix_arr[row]])))
597
+ {
598
+ w_this = w[ix_arr[row]];
599
+ res_write[row] = std::fma(x[ix_arr[row]] - x_mean, coef, res_write[row]);
600
+ obs_weight[cnt] = w_this;
601
+ buffer_arr[cnt++] = x[ix_arr[row]];
602
+ cumw += w_this;
603
+ }
604
+
605
+ else
606
+ {
607
+ buffer_NAs[cnt_NA++] = row;
608
+ }
609
+
610
+ }
611
+ }
612
+
613
+ else
614
+ {
615
+ for (size_t row = st; row <= end; row++)
616
+ {
617
+ res_write[row] += (is_na_or_inf(x[ix_arr[row]]))? fill_val : ( (x[ix_arr[row]]-x_mean) * coef );
618
+ }
619
+ return;
620
+ }
621
+
622
+
623
+ ldouble_safe mid_point = cumw / (ldouble_safe)2;
624
+ std::vector<size_t> sorted_ix(cnt);
625
+ std::iota(sorted_ix.begin(), sorted_ix.end(), (size_t)0);
626
+ std::sort(sorted_ix.begin(), sorted_ix.end(),
627
+ [&buffer_arr](const size_t a, const size_t b){return buffer_arr[a] < buffer_arr[b];});
628
+ ldouble_safe currw = 0;
629
+ fill_val = buffer_arr[sorted_ix.back()]; /* <- will overwrite later */
630
+ /* TODO: is this median calculation correct? should it do a weighted interpolation? */
631
+ for (size_t ix = 0; ix < cnt; ix++)
632
+ {
633
+ currw += obs_weight[sorted_ix[ix]];
634
+ if (currw >= mid_point)
635
+ {
636
+ if (currw == mid_point && ix < cnt-1)
637
+ fill_val = buffer_arr[sorted_ix[ix]] + (buffer_arr[sorted_ix[ix+1]] - buffer_arr[sorted_ix[ix]]) / 2.0;
638
+ else
639
+ fill_val = buffer_arr[sorted_ix[ix]];
640
+ break;
641
+ }
642
+ }
643
+
644
+ fill_val = (fill_val - x_mean) * coef;
645
+ if (cnt_NA && fill_val)
646
+ {
647
+ for (size_t row = 0; row < cnt_NA; row++)
648
+ res_write[buffer_NAs[row]] += fill_val;
649
+ }
650
+
651
+ }
652
+ }
653
+
654
+ /* for sparse numerical */
655
+ template <class real_t_, class sparse_ix>
656
+ void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
657
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
658
+ double &restrict coef, double x_sd, double x_mean, double &restrict fill_val, MissingAction missing_action,
659
+ double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run)
660
+ {
661
+ /* ix_arr must be already sorted beforehand */
662
+
663
+ /* if it's all zeros, no need to do anything, but this is not supposed
664
+ to happen while fitting because the range is determined before calling this */
665
+ if (
666
+ Xc_indptr[col_num] == Xc_indptr[col_num + 1] ||
667
+ Xc_ind[Xc_indptr[col_num]] > (sparse_ix)ix_arr[end] ||
668
+ Xc_ind[Xc_indptr[col_num + 1] - 1] < (sparse_ix)ix_arr[st]
669
+ )
670
+ {
671
+ if (first_run)
672
+ {
673
+ coef /= x_sd;
674
+ if (missing_action != Fail)
675
+ fill_val = 0;
676
+ }
677
+
678
+ double *restrict res_write = res - st;
679
+ double offset = x_mean * coef;
680
+ if (offset)
681
+ {
682
+ for (size_t row = st; row <= end; row++)
683
+ res_write[row] -= offset;
684
+ }
685
+
686
+ return;
687
+ }
688
+
689
+ size_t st_col = Xc_indptr[col_num];
690
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
691
+ size_t curr_pos = st_col;
692
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
693
+
694
+ size_t cnt_non_NA = 0; /* when NAs need to be imputed */
695
+ size_t cnt_NA = 0; /* when NAs need to be imputed */
696
+ size_t n_sample = end - st + 1;
697
+ size_t *ix_arr_plus_st = ix_arr + st;
698
+
699
+ if (first_run)
700
+ coef /= x_sd;
701
+
702
+ double *restrict res_write = res - st;
703
+ double offset = x_mean * coef;
704
+ if (offset)
705
+ {
706
+ for (size_t row = st; row <= end; row++)
707
+ res_write[row] -= offset;
708
+ }
709
+
710
+ size_t ind_end_col = Xc_ind[end_col];
711
+ size_t nmatches = 0;
712
+
713
+ if (missing_action != Fail)
714
+ {
715
+ if (first_run)
716
+ {
717
+ for (size_t *row = ptr_st;
718
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
719
+ )
720
+ {
721
+ if (Xc_ind[curr_pos] == (sparse_ix)(*row))
722
+ {
723
+ if (unlikely(is_na_or_inf(Xc[curr_pos])))
724
+ {
725
+ buffer_NAs[cnt_NA++] = row - ix_arr_plus_st;
726
+ }
727
+
728
+ else
729
+ {
730
+ buffer_arr[cnt_non_NA++] = Xc[curr_pos];
731
+ res[row - ix_arr_plus_st] = std::fma(Xc[curr_pos], coef, res[row - ix_arr_plus_st]);
732
+ }
733
+
734
+ nmatches++;
735
+ if (row == ix_arr + end || curr_pos == end_col) break;
736
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
737
+ }
738
+
739
+ else
740
+ {
741
+ if (Xc_ind[curr_pos] > (sparse_ix)(*row))
742
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
743
+ else
744
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
745
+ }
746
+ }
747
+ }
748
+
749
+ else
750
+ {
751
+ /* when impute value for missing has already been determined */
752
+ for (size_t *row = ptr_st;
753
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
754
+ )
755
+ {
756
+ if (Xc_ind[curr_pos] == (sparse_ix)(*row))
757
+ {
758
+ res[row - ix_arr_plus_st] += is_na_or_inf(Xc[curr_pos])?
759
+ (fill_val + offset) : (Xc[curr_pos] * coef);
760
+ if (row == ix_arr + end) break;
761
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
762
+ }
763
+
764
+ else
765
+ {
766
+ if (Xc_ind[curr_pos] > (sparse_ix)(*row))
767
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
768
+ else
769
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
770
+ }
771
+ }
772
+
773
+ return;
774
+ }
775
+
776
+
777
+ /* Determine imputation value */
778
+ std::sort(buffer_arr, buffer_arr + cnt_non_NA);
779
+ size_t mid_ceil = (n_sample - cnt_NA) / 2;
780
+ size_t nzeros = (end - st + 1) - nmatches;
781
+ if (nzeros > mid_ceil && buffer_arr[0] > 0)
782
+ {
783
+ fill_val = 0;
784
+ return;
785
+ }
786
+
787
+ else
788
+ {
789
+ size_t n_neg = (buffer_arr[0] > 0)?
790
+ 0 : ((buffer_arr[cnt_non_NA - 1] < 0)?
791
+ cnt_non_NA : std::lower_bound(buffer_arr, buffer_arr + cnt_non_NA, (double)0) - buffer_arr);
792
+
793
+
794
+ if (n_neg < (mid_ceil-1) && n_neg + nzeros > mid_ceil)
795
+ {
796
+ fill_val = 0;
797
+ return;
798
+ }
799
+
800
+ else
801
+ {
802
+ /* if the sample size is odd, take the middle, otherwise take a simple average */
803
+ if (((n_sample - cnt_NA) % 2) != 0)
804
+ {
805
+ if (mid_ceil < n_neg)
806
+ fill_val = buffer_arr[mid_ceil];
807
+ else if (mid_ceil < n_neg + nzeros)
808
+ fill_val = 0;
809
+ else
810
+ fill_val = buffer_arr[mid_ceil - nzeros];
811
+ }
812
+
813
+ else
814
+ {
815
+ if (mid_ceil < n_neg)
816
+ {
817
+ fill_val = (buffer_arr[mid_ceil - 1] + buffer_arr[mid_ceil]) / 2;
818
+ }
819
+
820
+ else if (mid_ceil < n_neg + nzeros)
821
+ {
822
+ if (mid_ceil == n_neg)
823
+ fill_val = buffer_arr[mid_ceil - 1] / 2;
824
+ else
825
+ fill_val = 0;
826
+ }
827
+
828
+ else
829
+ {
830
+ if (mid_ceil == n_neg + nzeros && nzeros > 0)
831
+ fill_val = buffer_arr[n_neg] / 2;
832
+ else
833
+ fill_val = (buffer_arr[mid_ceil - nzeros - 1] + buffer_arr[mid_ceil - nzeros]) / 2; /* WRONG!!!! */
834
+ }
835
+ }
836
+
837
+ /* fill missing if any */
838
+ fill_val *= coef;
839
+ if (cnt_NA && fill_val)
840
+ for (size_t ix = 0; ix < cnt_NA; ix++)
841
+ res[buffer_NAs[ix]] += fill_val;
842
+
843
+ /* next time, it will need to have the offset added */
844
+ fill_val -= offset;
845
+ }
846
+ }
847
+ }
848
+
849
+ else /* no NAs */
850
+ {
851
+ for (size_t *row = ptr_st;
852
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
853
+ )
854
+ {
855
+ if (Xc_ind[curr_pos] == (sparse_ix)(*row))
856
+ {
857
+ res[row - ix_arr_plus_st] += Xc[curr_pos] * coef;
858
+ if (row == ix_arr + end || curr_pos == end_col) break;
859
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
860
+ }
861
+
862
+ else
863
+ {
864
+ if (Xc_ind[curr_pos] > (sparse_ix)(*row))
865
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
866
+ else
867
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
868
+ }
869
+ }
870
+ }
871
+ }
872
+
873
+ template <class real_t_, class sparse_ix, class mapping, class ldouble_safe>
874
+ void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
875
+ real_t_ *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
876
+ double &restrict coef, double x_sd, double x_mean, double &restrict fill_val, MissingAction missing_action,
877
+ double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run, mapping &restrict w)
878
+ {
879
+ /* TODO: there's likely a better way of doing this directly with sparse inputs.
880
+ Think about some way of doing it efficiently. */
881
+ if (first_run && missing_action != Fail)
882
+ {
883
+ std::vector<double> denseX(end-st+1, 0.);
884
+ todense(ix_arr, st, end,
885
+ col_num, Xc, Xc_ind, Xc_indptr,
886
+ denseX.data());
887
+ std::vector<double> obs_weight(end-st+1);
888
+ for (size_t row = st; row <= end; row++)
889
+ obs_weight[row - st] = w[ix_arr[row]];
890
+
891
+ size_t end_new = end - st + 1;
892
+ for (size_t ix = 0; ix < end-st+1; ix++)
893
+ {
894
+ if (unlikely(is_na_or_inf(denseX[ix])))
895
+ {
896
+ std::swap(denseX[ix], denseX[--end_new]);
897
+ std::swap(obs_weight[ix], obs_weight[end_new]);
898
+ }
899
+ }
900
+
901
+ ldouble_safe cumw = std::accumulate(obs_weight.begin(), obs_weight.begin() + end_new, (ldouble_safe)0);
902
+ ldouble_safe mid_point = cumw / (ldouble_safe)2;
903
+ std::vector<size_t> sorted_ix(end_new);
904
+ std::iota(sorted_ix.begin(), sorted_ix.end(), (size_t)0);
905
+ std::sort(sorted_ix.begin(), sorted_ix.end(),
906
+ [&denseX](const size_t a, const size_t b){return denseX[a] < denseX[b];});
907
+ ldouble_safe currw = 0;
908
+ fill_val = denseX[sorted_ix.back()]; /* <- will overwrite later */
909
+ /* TODO: is this median calculation correct? should it do a weighted interpolation? */
910
+ for (size_t ix = 0; ix < end_new; ix++)
911
+ {
912
+ currw += obs_weight[sorted_ix[ix]];
913
+ if (currw >= mid_point)
914
+ {
915
+ if (currw == mid_point && ix < end_new-1)
916
+ fill_val = denseX[sorted_ix[ix]] + (denseX[sorted_ix[ix+1]] - denseX[sorted_ix[ix]]) / 2.0;
917
+ else
918
+ fill_val = denseX[sorted_ix[ix]];
919
+ break;
920
+ }
921
+ }
922
+
923
+ fill_val = (fill_val - x_mean) * (coef / x_sd);
924
+ denseX.clear();
925
+ obs_weight.clear();
926
+ sorted_ix.clear();
927
+
928
+ add_linear_comb(ix_arr, st, end, col_num, res,
929
+ Xc, Xc_ind, Xc_indptr,
930
+ coef, x_sd, x_mean, fill_val, missing_action,
931
+ buffer_arr, buffer_NAs, false);
932
+ }
933
+
934
+ else
935
+ {
936
+ add_linear_comb(ix_arr, st, end, col_num, res,
937
+ Xc, Xc_ind, Xc_indptr,
938
+ coef, x_sd, x_mean, fill_val, missing_action,
939
+ buffer_arr, buffer_NAs, first_run);
940
+ }
941
+ }
942
+
943
+ /* for categoricals */
944
+ template <class ldouble_safe>
945
+ void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
946
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
947
+ double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
948
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run)
949
+ {
950
+ double *restrict res_write = res - st;
951
+ switch(cat_split_type)
952
+ {
953
+ case SingleCateg:
954
+ {
955
+ /* in this case there's no need to make up an impute value for new categories, only for NAs */
956
+ switch(missing_action)
957
+ {
958
+ case Fail:
959
+ {
960
+ for (size_t row = st; row <= end; row++)
961
+ res_write[row] += (x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0;
962
+ return;
963
+ }
964
+
965
+ case Impute:
966
+ {
967
+ size_t cnt_NA = 0;
968
+ size_t cnt_this = 0;
969
+ size_t cnt = end - st + 1;
970
+ if (first_run)
971
+ {
972
+ for (size_t row = st; row <= end; row++)
973
+ {
974
+ if (unlikely(x[ix_arr[row]] < 0))
975
+ {
976
+ cnt_NA++;
977
+ }
978
+
979
+ else if (x[ix_arr[row]] == chosen_cat)
980
+ {
981
+ cnt_this++;
982
+ res_write[row] += single_cat_coef;
983
+ }
984
+ }
985
+ }
986
+
987
+ else
988
+ {
989
+ for (size_t row = st; row <= end; row++)
990
+ res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0);
991
+ return;
992
+ }
993
+
994
+ fill_val = (cnt_this > (cnt - cnt_NA - cnt_this))? single_cat_coef : 0;
995
+ if (cnt_NA && fill_val)
996
+ {
997
+ for (size_t row = st; row <= end; row++)
998
+ if (x[ix_arr[row]] < 0)
999
+ res_write[row] += fill_val;
1000
+ }
1001
+ return;
1002
+ }
1003
+
1004
+ default:
1005
+ {
1006
+ unexpected_error();
1007
+ break;
1008
+ }
1009
+ }
1010
+ }
1011
+
1012
+ case SubSet:
1013
+ {
1014
+ /* in this case, since the splits are by more than 1 variable, it's not possible to
1015
+ divide missing/new categoricals by assigning weights, so they have to be imputed
1016
+ in both cases, unless using random weights for the new ones, in which case they won't
1017
+ need to be imputed for new, but still need it for NA */
1018
+
1019
+ if (new_cat_action == Random && missing_action == Fail)
1020
+ {
1021
+ for (size_t row = st; row <= end; row++)
1022
+ res_write[row] += cat_coef[x[ix_arr[row]]];
1023
+ return;
1024
+ }
1025
+
1026
+ if (!first_run)
1027
+ {
1028
+ if (missing_action == Fail)
1029
+ {
1030
+ for (size_t row = st; row <= end; row++)
1031
+ res_write[row] += (x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]];
1032
+ }
1033
+
1034
+ else
1035
+ {
1036
+ for (size_t row = st; row <= end; row++)
1037
+ res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]]);
1038
+ }
1039
+ return;
1040
+ }
1041
+
1042
+ std::fill(buffer_cnt, buffer_cnt + ncat + 1, 0);
1043
+ switch(missing_action)
1044
+ {
1045
+ case Fail:
1046
+ {
1047
+ for (size_t row = st; row <= end; row++)
1048
+ {
1049
+ buffer_cnt[x[ix_arr[row]]]++;
1050
+ res_write[row] += cat_coef[x[ix_arr[row]]];
1051
+ }
1052
+ break;
1053
+ }
1054
+
1055
+ default:
1056
+ {
1057
+ for (size_t row = st; row <= end; row++)
1058
+ {
1059
+ if (x[ix_arr[row]] >= 0)
1060
+ {
1061
+ buffer_cnt[x[ix_arr[row]]]++;
1062
+ res_write[row] += cat_coef[x[ix_arr[row]]];
1063
+ }
1064
+
1065
+ else
1066
+ {
1067
+ buffer_cnt[ncat]++;
1068
+ }
1069
+
1070
+ }
1071
+ break;
1072
+ }
1073
+ }
1074
+
1075
+ switch(new_cat_action)
1076
+ {
1077
+ case Smallest:
1078
+ {
1079
+ size_t smallest = SIZE_MAX;
1080
+ int cat_smallest = 0;
1081
+ for (int cat = 0; cat < ncat; cat++)
1082
+ {
1083
+ if (buffer_cnt[cat] > 0 && buffer_cnt[cat] < smallest)
1084
+ {
1085
+ smallest = buffer_cnt[cat];
1086
+ cat_smallest = cat;
1087
+ }
1088
+ }
1089
+ fill_new = cat_coef[cat_smallest];
1090
+ if (missing_action == Fail) break;
1091
+ }
1092
+
1093
+ default:
1094
+ {
1095
+ /* Determine imputation value as the category in sorted order that gives 50% + 1 */
1096
+ ldouble_safe cnt_l = (ldouble_safe)((end - st + 1) - buffer_cnt[ncat]);
1097
+ std::iota(buffer_pos, buffer_pos + ncat, (size_t)0);
1098
+ std::sort(buffer_pos, buffer_pos + ncat, [&cat_coef](const size_t a, const size_t b){return cat_coef[a] < cat_coef[b];});
1099
+
1100
+ double cumprob = 0;
1101
+ int cat;
1102
+ for (cat = 0; cat < ncat; cat++)
1103
+ {
1104
+ cumprob += (ldouble_safe)buffer_cnt[buffer_pos[cat]] / cnt_l;
1105
+ if (cumprob >= .5) break;
1106
+ }
1107
+ // cat = std::min(cat, ncat); /* in case it picks the last one */
1108
+ fill_val = cat_coef[buffer_pos[cat]];
1109
+ if (new_cat_action != Smallest)
1110
+ fill_new = fill_val;
1111
+
1112
+ if (buffer_cnt[ncat] > 0 && fill_val) /* NAs */
1113
+ for (size_t row = st; row <= end; row++)
1114
+ if (unlikely(x[ix_arr[row]] < 0))
1115
+ res_write[row] += fill_val;
1116
+ }
1117
+ }
1118
+
1119
+ /* now fill unseen categories */
1120
+ if (new_cat_action != Random)
1121
+ for (int cat = 0; cat < ncat; cat++)
1122
+ if (!buffer_cnt[cat])
1123
+ cat_coef[cat] = fill_new;
1124
+
1125
+ }
1126
+ }
1127
+ }
1128
+
1129
+ template <class mapping, class ldouble_safe>
1130
+ void add_linear_comb_weighted(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
1131
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
1132
+ double &restrict fill_val, double &restrict fill_new, size_t *restrict buffer_pos,
1133
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type,
1134
+ bool first_run, mapping &restrict w)
1135
+ {
1136
+ double *restrict res_write = res - st;
1137
+ /* TODO: this buffer should be allocated externally */
1138
+
1139
+ switch(cat_split_type)
1140
+ {
1141
+ case SingleCateg:
1142
+ {
1143
+ /* in this case there's no need to make up an impute value for new categories, only for NAs */
1144
+ switch(missing_action)
1145
+ {
1146
+ case Fail:
1147
+ {
1148
+ for (size_t row = st; row <= end; row++)
1149
+ res_write[row] += (x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0;
1150
+ return;
1151
+ }
1152
+
1153
+ case Impute:
1154
+ {
1155
+ bool has_NA = false;
1156
+ ldouble_safe cnt_this = 0;
1157
+ ldouble_safe cnt_other = 0;
1158
+ if (first_run)
1159
+ {
1160
+ for (size_t row = st; row <= end; row++)
1161
+ {
1162
+ if (unlikely(x[ix_arr[row]] < 0))
1163
+ {
1164
+ has_NA = true;
1165
+ }
1166
+
1167
+ else if (x[ix_arr[row]] == chosen_cat)
1168
+ {
1169
+ cnt_this += w[ix_arr[row]];
1170
+ res_write[row] += single_cat_coef;
1171
+ }
1172
+
1173
+ else
1174
+ {
1175
+ cnt_other += w[ix_arr[row]];
1176
+ }
1177
+ }
1178
+ }
1179
+
1180
+ else
1181
+ {
1182
+ for (size_t row = st; row <= end; row++)
1183
+ res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0);
1184
+ return;
1185
+ }
1186
+
1187
+ fill_val = (cnt_this > cnt_other)? single_cat_coef : 0;
1188
+ if (has_NA && fill_val)
1189
+ {
1190
+ for (size_t row = st; row <= end; row++)
1191
+ if (unlikely(x[ix_arr[row]] < 0))
1192
+ res_write[row] += fill_val;
1193
+ }
1194
+ return;
1195
+ }
1196
+
1197
+ default:
1198
+ {
1199
+ unexpected_error();
1200
+ break;
1201
+ }
1202
+ }
1203
+ }
1204
+
1205
+ case SubSet:
1206
+ {
1207
+ /* in this case, since the splits are by more than 1 variable, it's not possible to
1208
+ divide missing/new categoricals by assigning weights, so they have to be imputed
1209
+ in both cases, unless using random weights for the new ones, in which case they won't
1210
+ need to be imputed for new, but still need it for NA */
1211
+
1212
+ if (new_cat_action == Random && missing_action == Fail)
1213
+ {
1214
+ for (size_t row = st; row <= end; row++)
1215
+ res_write[row] += cat_coef[x[ix_arr[row]]];
1216
+ return;
1217
+ }
1218
+
1219
+ if (!first_run)
1220
+ {
1221
+ if (missing_action == Fail)
1222
+ {
1223
+ for (size_t row = st; row <= end; row++)
1224
+ res_write[row] += (x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]];
1225
+ }
1226
+
1227
+ else
1228
+ {
1229
+ for (size_t row = st; row <= end; row++)
1230
+ res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]]);
1231
+ }
1232
+ return;
1233
+ }
1234
+
1235
+ /* TODO: this buffer should be allocated externally */
1236
+ std::vector<ldouble_safe> buffer_cnt(ncat+1, 0.);
1237
+ switch(missing_action)
1238
+ {
1239
+ case Fail:
1240
+ {
1241
+ for (size_t row = st; row <= end; row++)
1242
+ {
1243
+ buffer_cnt[x[ix_arr[row]]] += w[ix_arr[row]];
1244
+ res_write[row] += cat_coef[x[ix_arr[row]]];
1245
+ }
1246
+ break;
1247
+ }
1248
+
1249
+ default:
1250
+ {
1251
+ for (size_t row = st; row <= end; row++)
1252
+ {
1253
+ if (likely(x[ix_arr[row]] >= 0))
1254
+ {
1255
+ buffer_cnt[x[ix_arr[row]]] += w[ix_arr[row]];
1256
+ res_write[row] += cat_coef[x[ix_arr[row]]];
1257
+ }
1258
+
1259
+ else
1260
+ {
1261
+ buffer_cnt[ncat] += w[ix_arr[row]];
1262
+ }
1263
+
1264
+ }
1265
+ break;
1266
+ }
1267
+ }
1268
+
1269
+ switch(new_cat_action)
1270
+ {
1271
+ case Smallest:
1272
+ {
1273
+ ldouble_safe smallest = std::numeric_limits<ldouble_safe>::infinity();
1274
+ int cat_smallest = 0;
1275
+ for (int cat = 0; cat < ncat; cat++)
1276
+ {
1277
+ if (buffer_cnt[cat] > 0 && buffer_cnt[cat] < smallest)
1278
+ {
1279
+ smallest = buffer_cnt[cat];
1280
+ cat_smallest = cat;
1281
+ }
1282
+ }
1283
+ fill_new = cat_coef[cat_smallest];
1284
+ if (missing_action == Fail) break;
1285
+ }
1286
+
1287
+ default:
1288
+ {
1289
+ /* Determine imputation value as the category in sorted order that gives 50% + 1 */
1290
+ ldouble_safe cnt_l = std::accumulate(buffer_cnt.begin(), buffer_cnt.begin() + ncat, (ldouble_safe)0);
1291
+ std::iota(buffer_pos, buffer_pos + ncat, (size_t)0);
1292
+ std::sort(buffer_pos, buffer_pos + ncat, [&cat_coef](const size_t a, const size_t b){return cat_coef[a] < cat_coef[b];});
1293
+
1294
+ double cumprob = 0;
1295
+ int cat;
1296
+ for (cat = 0; cat < ncat; cat++)
1297
+ {
1298
+ cumprob += buffer_cnt[buffer_pos[cat]] / cnt_l;
1299
+ if (cumprob >= .5) break;
1300
+ }
1301
+ // cat = std::min(cat, ncat); /* in case it picks the last one */
1302
+ fill_val = cat_coef[buffer_pos[cat]];
1303
+ if (new_cat_action != Smallest)
1304
+ fill_new = fill_val;
1305
+
1306
+ if (buffer_cnt[ncat] > 0 && fill_val) /* NAs */
1307
+ for (size_t row = st; row <= end; row++)
1308
+ if (unlikely(x[ix_arr[row]] < 0))
1309
+ res_write[row] += fill_val;
1310
+ }
1311
+ }
1312
+
1313
+ /* now fill unseen categories */
1314
+ if (new_cat_action != Random)
1315
+ for (int cat = 0; cat < ncat; cat++)
1316
+ if (!buffer_cnt[cat])
1317
+ cat_coef[cat] = fill_new;
1318
+
1319
+ }
1320
+ }
1321
+ }
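For context on the mult.hpp hunk above: its column statistics build on two pieces that the in-code comments reference, namely a running mean/variance update in the style of the johndcook.com link near the top of the file, and the closed-form correction ("The difference can be put to a closed form") that extends statistics computed over a sparse column's stored nonzeros to the full sample, implicit zeros included. Below is a minimal standalone sketch of both ideas with invented names; it leaves out the missing-value, weighting and long-double handling that the real code layers on top.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

/* Welford-style running mean and sum of squared deviations, as in the
   johndcook.com reference cited in the file header. */
static void welford_mean_ss(const std::vector<double> &x, double &mean, double &ss)
{
    double m = 0.0, s = 0.0;
    for (std::size_t i = 0; i < x.size(); i++)
    {
        double m_prev = m;                              /* mean before this value */
        m += (x[i] - m) / static_cast<double>(i + 1);   /* updated mean */
        s += (x[i] - m) * (x[i] - m_prev);              /* updated sum of squared deviations */
    }
    mean = m;
    ss = s;
}

/* Closed-form adjustment for sparse columns: 'mean' and 'ss' were computed from
   only the nnz stored values; extend them to a sample of size n that also
   contains n - nnz implicit zeros. */
static void extend_to_full_sample(double &mean, double &ss, std::size_t nnz, std::size_t n)
{
    if (n > nnz)
    {
        double frac = static_cast<double>(nnz) / static_cast<double>(n);
        ss += mean * mean * static_cast<double>(nnz) * (1.0 - frac);
        mean *= frac;
    }
}

int main()
{
    std::vector<double> nonzeros = {3.0, 4.0, 5.0};  /* stored values of a sparse column */
    std::size_t n = 6;                               /* column length: 3 implicit zeros */
    double mean, ss;
    welford_mean_ss(nonzeros, mean, ss);             /* mean = 4, ss = 2 over the nonzeros */
    extend_to_full_sample(mean, ss, nonzeros.size(), n);
    double sd = std::sqrt(ss / static_cast<double>(n));  /* population SD, as in the library */
    std::printf("mean = %g, sd = %g\n", mean, sd);   /* mean = 2, sd ~= 2.08 */
    return 0;
}

The weighted variants in the diff follow the same recurrences with the 1/k counter replaced by cumulative observation weights, and the dispatching calc_mean_and_sd wrappers floor the resulting standard deviation at SD_MIN (1e-10) because add_linear_comb later divides the coefficient by it.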