isotree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,116 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+ /* Append trees from one model into another
48
+ *
49
+ * Parameters
50
+ * ==========
51
+ * - model (in, out)
52
+ * Pointer to isolation forest model which has already been fit through 'fit_iforest'.
53
+ * The trees from 'other' will be merged into this (will be at the end of vector member 'trees').
54
+ * Both 'model' and 'other' must have been fit with the same hyperparameters
55
+ * in order for this merge to work correctly - at the very least, should have
56
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
57
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
58
+ * Pass NULL if this is not to be used.
59
+ * - other
60
+ * Pointer to isolation forest model which has already been fit through 'fit_iforest'.
61
+ * The trees from this object will be added into 'model' (this object will not be modified).
62
+ * Both 'model' and 'other' must have been fit with the same hyperparameters
63
+ * in order for this merge to work correctly - at the very least, should have
64
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
65
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
66
+ * Pass NULL if this is not to be used.
67
+ * - ext_model (in, out)
68
+ * Pointer to extended isolation forest model which has already been fit through 'fit_iforest'.
69
+ * The trees/hyperplanes from 'ext_other' will be merged into this (will be at the end of vector member 'hplanes').
70
+ * Both 'ext_model' and 'ext_other' must have been fit with the same hyperparameters
71
+ * in order for this merge to work correctly - at the very least, should have
72
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
73
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
74
+ * Pass NULL if this is not to be used.
75
+ * - ext_other
76
+ * Pointer to extended isolation forest model which has already been fit through 'fit_iforest'.
77
+ * The trees/hyperplanes from this object will be added into 'ext_model' (this object will not be modified).
78
+ * Both 'ext_model' and 'ext_other' must have been fit with the same hyperparameters
79
+ * in order for this merge to work correctly - at the very least, should have
80
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
81
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
82
+ * Pass NULL if this is not to be used.
83
+ * - imputer (in, out)
84
+ * Pointer to imputation object which has already been fit through 'fit_iforest' along with
85
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest'.
86
+ * The imputation nodes from 'iother' will be merged into this (will be at the end of vector member 'imputer_tree').
87
+ * Hyperparameters related to imputation might differ between 'imputer' and 'iother' ('imputer' will preserve its
88
+ * hyperparameters after the merge).
89
+ * Pass NULL if this is not to be used.
90
+ * - iother
91
+ * Pointer to imputation object which has already been fit through 'fit_iforest' along with
92
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest'.
93
+ * The imputation nodes from this object will be added into 'imputer' (this object will not be modified).
94
+ * Hyperparameters related to imputation might differ between 'imputer' and 'iother' ('imputer' will preserve its
95
+ * hyperparameters after the merge).
96
+ * Pass NULL if this is not to be used.
97
+ */
98
+ void merge_models(IsoForest* model, IsoForest* other,
99
+ ExtIsoForest* ext_model, ExtIsoForest* ext_other,
100
+ Imputer* imputer, Imputer* iother)
101
+ {
102
+ if (model != NULL && other != NULL)
103
+ model->trees.insert(model->trees.end(),
104
+ other->trees.begin(),
105
+ other->trees.end());
106
+
107
+ if (ext_model != NULL && ext_other != NULL)
108
+ ext_model->hplanes.insert(ext_model->hplanes.end(),
109
+ ext_other->hplanes.begin(),
110
+ ext_other->hplanes.end());
111
+
112
+ if (imputer != NULL && iother != NULL)
113
+ imputer->imputer_tree.insert(imputer->imputer_tree.end(),
114
+ iother->imputer_tree.begin(),
115
+ iother->imputer_tree.end());
116
+ }
@@ -0,0 +1,607 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+ /* for regular numerical */
48
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double *restrict x,
49
+ MissingAction missing_action, double &x_sd, double &x_mean)
50
+ {
51
+ long double m = 0;
52
+ long double s = 0;
53
+ long double m_prev = 0;
54
+
55
+ if (missing_action == Fail)
56
+ {
57
+ for (size_t row = st; row <= end; row++)
58
+ {
59
+ m += (x[ix_arr[row]] - m) / (long double)(row - st + 1);
60
+ s += (x[ix_arr[row]] - m) * (x[ix_arr[row]] - m_prev);
61
+ m_prev = m;
62
+ }
63
+
64
+ x_mean = m;
65
+ x_sd = sqrtl(s / (long double)(end - st + 1));
66
+ }
67
+
68
+ else
69
+ {
70
+ size_t cnt = 0;
71
+ for (size_t row = st; row <= end; row++)
72
+ {
73
+ if (!is_na_or_inf(x[ix_arr[row]]))
74
+ {
75
+ cnt++;
76
+ m += (x[ix_arr[row]] - m) / (long double)cnt;
77
+ s += (x[ix_arr[row]] - m) * (x[ix_arr[row]] - m_prev);
78
+ m_prev = m;
79
+ }
80
+ }
81
+
82
+ x_mean = m;
83
+ x_sd = sqrtl(s / (long double)cnt);
84
+ }
85
+ }
86
+
87
+ /* for sparse numerical */
88
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, size_t col_num,
89
+ double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
90
+ double &x_sd, double &x_mean)
91
+ {
92
+ /* ix_arr must be already sorted beforehand */
93
+ if (Xc_indptr[col_num] == Xc_indptr[col_num + 1])
94
+ {
95
+ x_sd = 0;
96
+ x_mean = 0;
97
+ return;
98
+ }
99
+ size_t st_col = Xc_indptr[col_num];
100
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
101
+ size_t curr_pos = st_col;
102
+ size_t ind_end_col = (size_t) Xc_ind[end_col];
103
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
104
+
105
+ size_t cnt = end - st + 1;
106
+ long double sum = 0;
107
+ long double sum_sq = 0;
108
+
109
+ /* Note: this function will discard NAs regardless of chosen action. If reaching the point of calling
110
+ this function, chances are that the performance gain of not checking for them will not be important */
111
+
112
+ for (size_t *row = ptr_st;
113
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
114
+ )
115
+ {
116
+ if (Xc_ind[curr_pos] == *row)
117
+ {
118
+ if (is_na_or_inf(Xc[curr_pos]))
119
+ {
120
+ cnt--;
121
+ }
122
+
123
+ else
124
+ {
125
+ sum += Xc[curr_pos];
126
+ sum_sq += square(Xc[curr_pos]);
127
+ }
128
+
129
+ if (row == ix_arr + end || curr_pos == end_col) break;
130
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
131
+ }
132
+
133
+ else
134
+ {
135
+ if (Xc_ind[curr_pos] > *row)
136
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
137
+ else
138
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
139
+ }
140
+ }
141
+
142
+ x_mean = sum / (long double) cnt;
143
+ x_sd = calc_sd_raw(cnt, sum, sum_sq);
144
+ }
145
+
146
+
147
+ /* Note about these functions: they write into an array that does not need to match to 'ix_arr',
148
+ and instead, the index that is stored in ix_arr[n] will have the value in res[n] */
149
+
150
+ /* for regular numerical */
151
+ void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
152
+ double *restrict x, double &coef, double x_sd, double x_mean, double &fill_val,
153
+ MissingAction missing_action, double *restrict buffer_arr,
154
+ size_t *restrict buffer_NAs, bool first_run)
155
+ {
156
+ /* TODO: here don't need the buffer for NAs */
157
+
158
+ if (first_run)
159
+ coef /= x_sd;
160
+
161
+ size_t cnt = 0;
162
+ size_t cnt_NA = 0;
163
+ double *restrict res_write = res - st;
164
+
165
+ if (missing_action == Fail)
166
+ {
167
+ for (size_t row = st; row <= end; row++)
168
+ res_write[row] += (x[ix_arr[row]] - x_mean) * coef;
169
+ }
170
+
171
+ else
172
+ {
173
+ if (first_run)
174
+ {
175
+ for (size_t row = st; row <= end; row++)
176
+ {
177
+ if (!is_na_or_inf(x[ix_arr[row]]))
178
+ {
179
+ res_write[row] += (x[ix_arr[row]] - x_mean) * coef;
180
+ buffer_arr[cnt++] = x[ix_arr[row]];
181
+ }
182
+
183
+ else
184
+ {
185
+ buffer_NAs[cnt_NA++] = row;
186
+ }
187
+
188
+ }
189
+ }
190
+
191
+ else
192
+ {
193
+ for (size_t row = st; row <= end; row++)
194
+ {
195
+ res_write[row] += (is_na_or_inf(x[ix_arr[row]]))? fill_val : ( (x[ix_arr[row]]-x_mean) * coef );
196
+ }
197
+ return;
198
+ }
199
+
200
+ size_t mid_ceil = cnt / 2;
201
+ std::partial_sort(buffer_arr, buffer_arr + mid_ceil + 1, buffer_arr + cnt);
202
+
203
+ if ((cnt % 2) == 0)
204
+ fill_val = (buffer_arr[mid_ceil - 1] + buffer_arr[mid_ceil]) / 2.0;
205
+ else
206
+ fill_val = buffer_arr[mid_ceil];
207
+
208
+ fill_val = (fill_val - x_mean) * coef;
209
+ if (cnt_NA)
210
+ {
211
+ for (size_t row = 0; row < cnt_NA; row++)
212
+ res_write[buffer_NAs[row]] += fill_val;
213
+ }
214
+
215
+ }
216
+ }
217
+
218
+ /* for sparse numerical */
219
+ void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
220
+ double *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
221
+ double &coef, double x_sd, double x_mean, double &fill_val, MissingAction missing_action,
222
+ double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run)
223
+ {
224
+ /* ix_arr must be already sorted beforehand */
225
+
226
+ /* if it's all zeros, no need to do anything, but this is not supposed
227
+ to happen while fitting because the range is determined before calling this */
228
+ if (
229
+ Xc_indptr[col_num] == Xc_indptr[col_num + 1] ||
230
+ Xc_ind[Xc_indptr[col_num]] > ix_arr[end] ||
231
+ Xc_ind[Xc_indptr[col_num + 1] - 1] < ix_arr[st]
232
+ )
233
+ {
234
+ if (first_run)
235
+ {
236
+ coef /= x_sd;
237
+ if (missing_action != Fail)
238
+ fill_val = 0;
239
+ }
240
+
241
+ double *restrict res_write = res - st;
242
+ double offset = x_mean * coef;
243
+ for (size_t row = st; row <= end; row++)
244
+ res_write[row] -= offset;
245
+
246
+ return;
247
+ }
248
+
249
+ size_t st_col = Xc_indptr[col_num];
250
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
251
+ size_t curr_pos = st_col;
252
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
253
+
254
+ size_t cnt_non_NA = 0; /* when NAs need to be imputed */
255
+ size_t cnt_NA = 0; /* when NAs need to be imputed */
256
+ size_t n_sample = end - st + 1;
257
+ size_t *ix_arr_plus_st = ix_arr + st;
258
+
259
+ if (first_run)
260
+ coef /= x_sd;
261
+
262
+ double *restrict res_write = res - st;
263
+ double offset = x_mean * coef;
264
+ for (size_t row = st; row <= end; row++)
265
+ res_write[row] -= offset;
266
+
267
+ size_t ind_end_col = Xc_ind[end_col];
268
+ size_t nmatches = 0;
269
+
270
+ if (missing_action != Fail)
271
+ {
272
+ if (first_run)
273
+ {
274
+ for (size_t *row = ptr_st;
275
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
276
+ )
277
+ {
278
+ if (Xc_ind[curr_pos] == *row)
279
+ {
280
+ if (is_na_or_inf(Xc[curr_pos]))
281
+ {
282
+ buffer_NAs[cnt_NA++] = row - ix_arr_plus_st;
283
+ }
284
+
285
+ else
286
+ {
287
+ buffer_arr[cnt_non_NA++] = Xc[curr_pos];
288
+ res[row - ix_arr_plus_st] += Xc[curr_pos] * coef;
289
+ }
290
+
291
+ nmatches++;
292
+ if (row == ix_arr + end || curr_pos == end_col) break;
293
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
294
+ }
295
+
296
+ else
297
+ {
298
+ if (Xc_ind[curr_pos] > *row)
299
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
300
+ else
301
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
302
+ }
303
+ }
304
+ }
305
+
306
+ else
307
+ {
308
+ /* when impute value for missing has already been determined */
309
+ for (size_t *row = ptr_st;
310
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
311
+ )
312
+ {
313
+ if (Xc_ind[curr_pos] == *row)
314
+ {
315
+ res[row - ix_arr_plus_st] += is_na_or_inf(Xc[curr_pos])?
316
+ (fill_val + offset) : (Xc[curr_pos] * coef);
317
+ if (row == ix_arr + end) break;
318
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
319
+ }
320
+
321
+ else
322
+ {
323
+ if (Xc_ind[curr_pos] > *row)
324
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
325
+ else
326
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
327
+ }
328
+ }
329
+
330
+ return;
331
+ }
332
+
333
+
334
+ /* Determine imputation value */
335
+ std::sort(buffer_arr, buffer_arr + cnt_non_NA);
336
+ size_t mid_ceil = (n_sample - cnt_NA) / 2;
337
+ size_t nzeros = (end - st + 1) - nmatches;
338
+ if (nzeros > mid_ceil && buffer_arr[0] > 0)
339
+ {
340
+ fill_val = 0;
341
+ return;
342
+ }
343
+
344
+ else
345
+ {
346
+ size_t n_neg = (buffer_arr[0] > 0)?
347
+ 0 : ((buffer_arr[cnt_non_NA - 1] < 0)?
348
+ cnt_non_NA : std::lower_bound(buffer_arr, buffer_arr + cnt_non_NA, (double)0) - buffer_arr);
349
+
350
+
351
+ if (n_neg < (mid_ceil-1) && n_neg + nzeros > mid_ceil)
352
+ {
353
+ fill_val = 0;
354
+ return;
355
+ }
356
+
357
+ else
358
+ {
359
+ /* if the sample size is odd, take the middle, otherwise take a simple average */
360
+ if (((n_sample - cnt_NA) % 2) != 0)
361
+ {
362
+ if (mid_ceil < n_neg)
363
+ fill_val = buffer_arr[mid_ceil];
364
+ else if (mid_ceil < n_neg + nzeros)
365
+ fill_val = 0;
366
+ else
367
+ fill_val = buffer_arr[mid_ceil - nzeros];
368
+ }
369
+
370
+ else
371
+ {
372
+ if (mid_ceil < n_neg)
373
+ {
374
+ fill_val = (buffer_arr[mid_ceil - 1] + buffer_arr[mid_ceil]) / 2;
375
+ }
376
+
377
+ else if (mid_ceil < n_neg + nzeros)
378
+ {
379
+ if (mid_ceil == n_neg)
380
+ fill_val = buffer_arr[mid_ceil - 1] / 2;
381
+ else
382
+ fill_val = 0;
383
+ }
384
+
385
+ else
386
+ {
387
+ if (mid_ceil == n_neg + nzeros && nzeros > 0)
388
+ fill_val = buffer_arr[n_neg] / 2;
389
+ else
390
+ fill_val = (buffer_arr[mid_ceil - nzeros - 1] + buffer_arr[mid_ceil - nzeros]) / 2; /* WRONG!!!! */
391
+ }
392
+ }
393
+
394
+ /* fill missing if any */
395
+ fill_val *= coef;
396
+ if (cnt_NA && fill_val)
397
+ for (size_t ix = 0; ix < cnt_NA; ix++)
398
+ res[buffer_NAs[ix]] += fill_val;
399
+
400
+ /* next time, it will need to have the offset added */
401
+ fill_val -= offset;
402
+ }
403
+ }
404
+ }
405
+
406
+ else /* no NAs */
407
+ {
408
+ for (size_t *row = ptr_st;
409
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
410
+ )
411
+ {
412
+ if (Xc_ind[curr_pos] == *row)
413
+ {
414
+ res[row - ix_arr_plus_st] += Xc[curr_pos] * coef;
415
+ if (row == ix_arr + end || curr_pos == end_col) break;
416
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
417
+ }
418
+
419
+ else
420
+ {
421
+ if (Xc_ind[curr_pos] > *row)
422
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
423
+ else
424
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
425
+ }
426
+ }
427
+ }
428
+ }
429
+
430
+ /* for categoricals */
431
+ void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
432
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
433
+ double &fill_val, double &fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
434
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run)
435
+ {
436
+ double *restrict res_write = res - st;
437
+ switch(cat_split_type)
438
+ {
439
+ case SingleCateg:
440
+ {
441
+ /* in this case there's no need to make-up an impute value for new categories, only for NAs */
442
+ switch(missing_action)
443
+ {
444
+ case Fail:
445
+ {
446
+ for (size_t row = st; row <= end; row++)
447
+ res_write[row] += (x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0;
448
+ return;
449
+ }
450
+
451
+ case Impute:
452
+ {
453
+ size_t cnt_NA = 0;
454
+ size_t cnt_this = 0;
455
+ size_t cnt = end - st + 1;
456
+ if (first_run)
457
+ {
458
+ for (size_t row = st; row <= end; row++)
459
+ {
460
+ if (x[ix_arr[row]] < 0)
461
+ {
462
+ cnt_NA++;
463
+ }
464
+
465
+ else if (x[ix_arr[row]] == chosen_cat)
466
+ {
467
+ cnt_this++;
468
+ res_write[row] += single_cat_coef;
469
+ }
470
+ }
471
+ }
472
+
473
+ else
474
+ {
475
+ for (size_t row = st; row <= end; row++)
476
+ res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0);
477
+ return;
478
+ }
479
+
480
+ fill_val = (cnt_this > (cnt - cnt_NA - cnt_this))? single_cat_coef : 0;
481
+ if (cnt_NA)
482
+ {
483
+ for (size_t row = st; row <= end; row++)
484
+ if (x[ix_arr[row]] < 0)
485
+ res_write[row] += fill_val;
486
+ }
487
+ return;
488
+ }
489
+ }
490
+ }
491
+
492
+ case SubSet:
493
+ {
494
+ /* in this case, since the splits are by more than 1 variable, it's not possible to
495
+ divide missing/new categoricals by assigning weights, so they have to be imputed
496
+ in both cases, unless using random weights for the new ones, in which case they won't
497
+ need to be imputed for new, but sill need it for NA */
498
+
499
+ if (new_cat_action == Random && missing_action == Fail)
500
+ {
501
+ for (size_t row = st; row <= end; row++)
502
+ res_write[row] += cat_coef[x[ix_arr[row]]];
503
+ return;
504
+ }
505
+
506
+ if (!first_run)
507
+ {
508
+ if (missing_action == Fail)
509
+ {
510
+ for (size_t row = st; row <= end; row++)
511
+ res_write[row] += (x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]];
512
+ }
513
+
514
+ else
515
+ {
516
+ for (size_t row = st; row <= end; row++)
517
+ res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]]);
518
+ }
519
+ return;
520
+ }
521
+
522
+ std::fill(buffer_cnt, buffer_cnt + ncat + 1, 0);
523
+ switch(missing_action)
524
+ {
525
+ case Fail:
526
+ {
527
+ for (size_t row = st; row <= end; row++)
528
+ {
529
+ buffer_cnt[x[ix_arr[row]]]++;
530
+ res_write[row] += cat_coef[x[ix_arr[row]]];
531
+ }
532
+ break;
533
+ }
534
+
535
+ default:
536
+ {
537
+ for (size_t row = st; row <= end; row++)
538
+ {
539
+ if (x[ix_arr[row]] >= 0)
540
+ {
541
+ buffer_cnt[x[ix_arr[row]]]++;
542
+ res_write[row] += cat_coef[x[ix_arr[row]]];
543
+ }
544
+
545
+ else
546
+ {
547
+ buffer_cnt[ncat]++;
548
+ }
549
+
550
+ }
551
+ break;
552
+ }
553
+ }
554
+
555
+ switch(new_cat_action)
556
+ {
557
+ case Smallest:
558
+ {
559
+ size_t smallest = SIZE_MAX;
560
+ int cat_smallest;
561
+ for (int cat = 0; cat < ncat; cat++)
562
+ {
563
+ if (buffer_cnt[cat] > 0 && buffer_cnt[cat] < smallest)
564
+ {
565
+ smallest = buffer_cnt[cat];
566
+ cat_smallest = cat;
567
+ }
568
+ }
569
+ fill_new = cat_coef[cat_smallest];
570
+ if (missing_action == Fail) break;
571
+ }
572
+
573
+ default:
574
+ {
575
+ /* Determine imputation value as the category in sorted order that gives 50% + 1 */
576
+ long double cnt_l = (long double)((end - st + 1) - buffer_cnt[ncat]);
577
+ std::iota(buffer_pos, buffer_pos + ncat, (size_t)0);
578
+ std::sort(buffer_pos, buffer_pos + ncat, [&cat_coef](const size_t a, const size_t b){return cat_coef[a] < cat_coef[b];});
579
+
580
+ double cumprob = 0;
581
+ int cat;
582
+ for (cat = 0; cat < ncat; cat++)
583
+ {
584
+ cumprob += (long double)buffer_cnt[buffer_pos[cat]] / cnt_l;
585
+ if (cumprob >= .5) break;
586
+ }
587
+ // cat = std::min(cat, ncat); /* in case it picks the last one */
588
+ fill_val = cat_coef[buffer_pos[cat]];
589
+ if (new_cat_action != Smallest)
590
+ fill_new = fill_val;
591
+
592
+ if (buffer_cnt[ncat] > 0) /* NAs */
593
+ for (size_t row = st; row <= end; row++)
594
+ if (x[ix_arr[row]] < 0)
595
+ res_write[row] += fill_val;
596
+ }
597
+ }
598
+
599
+ /* now fill unseen categories */
600
+ if (new_cat_action != Random)
601
+ for (int cat = 0; cat < ncat; cat++)
602
+ if (!buffer_cnt[cat])
603
+ cat_coef[cat] = fill_new;
604
+
605
+ }
606
+ }
607
+ }