isotree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,116 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+ /* Append trees from one model into another
48
+ *
49
+ * Parameters
50
+ * ==========
51
+ * - model (in, out)
52
+ * Pointer to isolation forest model wich has already been fit through 'fit_iforest'.
53
+ * The trees from 'other' will be merged into this (will be at the end of vector member 'trees').
54
+ * Both 'model' and 'other' must have been fit with the same hyperparameters
55
+ * in order for this merge to work correctly - at the very least, should have
56
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
57
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
58
+ * Pass NULL if this is not to be used.
59
+ * - other
60
+ * Pointer to isolation forest model which has already been fit through 'fit_iforest'.
61
+ * The trees from this object will be added into 'model' (this object will not be modified).
62
+ * Both 'model' and 'other' must have been fit with the same hyperparameters
63
+ * in order for this merge to work correctly - at the very least, should have
64
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
65
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
66
+ * Pass NULL if this is not to be used.
67
+ * - ext_model (in, out)
68
+ * Pointer to extended isolation forest model which has already been fit through 'fit_iforest'.
69
+ * The trees/hyperplanes from 'ext_other' will be merged into this (will be at the end of vector member 'hplanes').
70
+ * Both 'ext_model' and 'ext_other' must have been fit with the same hyperparameters
71
+ * in order for this merge to work correctly - at the very least, should have
72
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
73
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
74
+ * Pass NULL if this is not to be used.
75
+ * - ext_other
76
+ * Pointer to extended isolation forest model which has already been fit through 'fit_iforest'.
77
+ * The trees/hyperplanes from this object will be added into 'ext_model' (this object will not be modified).
78
+ * Both 'ext_model' and 'ext_other' must have been fit with the same hyperparameters
79
+ * in order for this merge to work correctly - at the very least, should have
80
+ * the same 'missing_action', 'cat_split_type', 'new_cat_action'.
81
+ * Should only pass one of 'model'+'other' or 'ext_model'+'ext_other'.
82
+ * Pass NULL if this is not to be used.
83
+ * - imputer (in, out)
84
+ * Pointer to imputation object which has already been fit through 'fit_iforest' along with
85
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest'.
86
+ * The imputation nodes from 'iother' will be merged into this (will be at the end of vector member 'imputer_tree').
87
+ * Hyperparameters related to imputation might differ between 'imputer' and 'iother' ('imputer' will preserve its
88
+ * hyperparameters after the merge).
89
+ * Pass NULL if this is not to be used.
90
+ * - iother
91
+ * Pointer to imputation object which has already been fit through 'fit_iforest' along with
92
+ * either 'model' or 'ext_model' in the same call to 'fit_iforest'.
93
+ * The imputation nodes from this object will be added into 'imputer' (this object will not be modified).
94
+ * Hyperparameters related to imputation might differ between 'imputer' and 'iother' ('imputer' will preserve its
95
+ * hyperparameters after the merge).
96
+ * Pass NULL if this is not to be used.
97
+ */
98
+ void merge_models(IsoForest* model, IsoForest* other,
99
+ ExtIsoForest* ext_model, ExtIsoForest* ext_other,
100
+ Imputer* imputer, Imputer* iother)
101
+ {
102
+ if (model != NULL && other != NULL)
103
+ model->trees.insert(model->trees.end(),
104
+ other->trees.begin(),
105
+ other->trees.end());
106
+
107
+ if (ext_model != NULL && ext_other != NULL)
108
+ ext_model->hplanes.insert(ext_model->hplanes.end(),
109
+ ext_other->hplanes.begin(),
110
+ ext_other->hplanes.end());
111
+
112
+ if (imputer != NULL && iother != NULL)
113
+ imputer->imputer_tree.insert(imputer->imputer_tree.end(),
114
+ iother->imputer_tree.begin(),
115
+ iother->imputer_tree.end());
116
+ }
@@ -0,0 +1,607 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+ /* for regular numerical */
48
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double *restrict x,
49
+ MissingAction missing_action, double &x_sd, double &x_mean)
50
+ {
51
+ long double m = 0;
52
+ long double s = 0;
53
+ long double m_prev = 0;
54
+
55
+ if (missing_action == Fail)
56
+ {
57
+ for (size_t row = st; row <= end; row++)
58
+ {
59
+ m += (x[ix_arr[row]] - m) / (long double)(row - st + 1);
60
+ s += (x[ix_arr[row]] - m) * (x[ix_arr[row]] - m_prev);
61
+ m_prev = m;
62
+ }
63
+
64
+ x_mean = m;
65
+ x_sd = sqrtl(s / (long double)(end - st + 1));
66
+ }
67
+
68
+ else
69
+ {
70
+ size_t cnt = 0;
71
+ for (size_t row = st; row <= end; row++)
72
+ {
73
+ if (!is_na_or_inf(x[ix_arr[row]]))
74
+ {
75
+ cnt++;
76
+ m += (x[ix_arr[row]] - m) / (long double)cnt;
77
+ s += (x[ix_arr[row]] - m) * (x[ix_arr[row]] - m_prev);
78
+ m_prev = m;
79
+ }
80
+ }
81
+
82
+ x_mean = m;
83
+ x_sd = sqrtl(s / (long double)cnt);
84
+ }
85
+ }
86
+
87
+ /* for sparse numerical */
88
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, size_t col_num,
89
+ double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
90
+ double &x_sd, double &x_mean)
91
+ {
92
+ /* ix_arr must be already sorted beforehand */
93
+ if (Xc_indptr[col_num] == Xc_indptr[col_num + 1])
94
+ {
95
+ x_sd = 0;
96
+ x_mean = 0;
97
+ return;
98
+ }
99
+ size_t st_col = Xc_indptr[col_num];
100
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
101
+ size_t curr_pos = st_col;
102
+ size_t ind_end_col = (size_t) Xc_ind[end_col];
103
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
104
+
105
+ size_t cnt = end - st + 1;
106
+ long double sum = 0;
107
+ long double sum_sq = 0;
108
+
109
+ /* Note: this function will discard NAs regardless of chosen action. If reaching the point of calling
110
+ this function, chances are that the performance gain of not checking for them will not be important */
111
+
112
+ for (size_t *row = ptr_st;
113
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
114
+ )
115
+ {
116
+ if (Xc_ind[curr_pos] == *row)
117
+ {
118
+ if (is_na_or_inf(Xc[curr_pos]))
119
+ {
120
+ cnt--;
121
+ }
122
+
123
+ else
124
+ {
125
+ sum += Xc[curr_pos];
126
+ sum_sq += square(Xc[curr_pos]);
127
+ }
128
+
129
+ if (row == ix_arr + end || curr_pos == end_col) break;
130
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
131
+ }
132
+
133
+ else
134
+ {
135
+ if (Xc_ind[curr_pos] > *row)
136
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
137
+ else
138
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
139
+ }
140
+ }
141
+
142
+ x_mean = sum / (long double) cnt;
143
+ x_sd = calc_sd_raw(cnt, sum, sum_sq);
144
+ }
145
+
146
+
147
+ /* Note about these functions: they write into an array that does not need to match to 'ix_arr',
148
+ and instead, the index that is stored in ix_arr[n] will have the value in res[n] */
149
+
150
+ /* for regular numerical */
151
+ void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
152
+ double *restrict x, double &coef, double x_sd, double x_mean, double &fill_val,
153
+ MissingAction missing_action, double *restrict buffer_arr,
154
+ size_t *restrict buffer_NAs, bool first_run)
155
+ {
156
+ /* TODO: here don't need the buffer for NAs */
157
+
158
+ if (first_run)
159
+ coef /= x_sd;
160
+
161
+ size_t cnt = 0;
162
+ size_t cnt_NA = 0;
163
+ double *restrict res_write = res - st;
164
+
165
+ if (missing_action == Fail)
166
+ {
167
+ for (size_t row = st; row <= end; row++)
168
+ res_write[row] += (x[ix_arr[row]] - x_mean) * coef;
169
+ }
170
+
171
+ else
172
+ {
173
+ if (first_run)
174
+ {
175
+ for (size_t row = st; row <= end; row++)
176
+ {
177
+ if (!is_na_or_inf(x[ix_arr[row]]))
178
+ {
179
+ res_write[row] += (x[ix_arr[row]] - x_mean) * coef;
180
+ buffer_arr[cnt++] = x[ix_arr[row]];
181
+ }
182
+
183
+ else
184
+ {
185
+ buffer_NAs[cnt_NA++] = row;
186
+ }
187
+
188
+ }
189
+ }
190
+
191
+ else
192
+ {
193
+ for (size_t row = st; row <= end; row++)
194
+ {
195
+ res_write[row] += (is_na_or_inf(x[ix_arr[row]]))? fill_val : ( (x[ix_arr[row]]-x_mean) * coef );
196
+ }
197
+ return;
198
+ }
199
+
200
+ size_t mid_ceil = cnt / 2;
201
+ std::partial_sort(buffer_arr, buffer_arr + mid_ceil + 1, buffer_arr + cnt);
202
+
203
+ if ((cnt % 2) == 0)
204
+ fill_val = (buffer_arr[mid_ceil - 1] + buffer_arr[mid_ceil]) / 2.0;
205
+ else
206
+ fill_val = buffer_arr[mid_ceil];
207
+
208
+ fill_val = (fill_val - x_mean) * coef;
209
+ if (cnt_NA)
210
+ {
211
+ for (size_t row = 0; row < cnt_NA; row++)
212
+ res_write[buffer_NAs[row]] += fill_val;
213
+ }
214
+
215
+ }
216
+ }
217
+
218
+ /* for sparse numerical */
219
+ void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
220
+ double *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
221
+ double &coef, double x_sd, double x_mean, double &fill_val, MissingAction missing_action,
222
+ double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run)
223
+ {
224
+ /* ix_arr must be already sorted beforehand */
225
+
226
+ /* if it's all zeros, no need to do anything, but this is not supposed
227
+ to happen while fitting because the range is determined before calling this */
228
+ if (
229
+ Xc_indptr[col_num] == Xc_indptr[col_num + 1] ||
230
+ Xc_ind[Xc_indptr[col_num]] > ix_arr[end] ||
231
+ Xc_ind[Xc_indptr[col_num + 1] - 1] < ix_arr[st]
232
+ )
233
+ {
234
+ if (first_run)
235
+ {
236
+ coef /= x_sd;
237
+ if (missing_action != Fail)
238
+ fill_val = 0;
239
+ }
240
+
241
+ double *restrict res_write = res - st;
242
+ double offset = x_mean * coef;
243
+ for (size_t row = st; row <= end; row++)
244
+ res_write[row] -= offset;
245
+
246
+ return;
247
+ }
248
+
249
+ size_t st_col = Xc_indptr[col_num];
250
+ size_t end_col = Xc_indptr[col_num + 1] - 1;
251
+ size_t curr_pos = st_col;
252
+ size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
253
+
254
+ size_t cnt_non_NA = 0; /* when NAs need to be imputed */
255
+ size_t cnt_NA = 0; /* when NAs need to be imputed */
256
+ size_t n_sample = end - st + 1;
257
+ size_t *ix_arr_plus_st = ix_arr + st;
258
+
259
+ if (first_run)
260
+ coef /= x_sd;
261
+
262
+ double *restrict res_write = res - st;
263
+ double offset = x_mean * coef;
264
+ for (size_t row = st; row <= end; row++)
265
+ res_write[row] -= offset;
266
+
267
+ size_t ind_end_col = Xc_ind[end_col];
268
+ size_t nmatches = 0;
269
+
270
+ if (missing_action != Fail)
271
+ {
272
+ if (first_run)
273
+ {
274
+ for (size_t *row = ptr_st;
275
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
276
+ )
277
+ {
278
+ if (Xc_ind[curr_pos] == *row)
279
+ {
280
+ if (is_na_or_inf(Xc[curr_pos]))
281
+ {
282
+ buffer_NAs[cnt_NA++] = row - ix_arr_plus_st;
283
+ }
284
+
285
+ else
286
+ {
287
+ buffer_arr[cnt_non_NA++] = Xc[curr_pos];
288
+ res[row - ix_arr_plus_st] += Xc[curr_pos] * coef;
289
+ }
290
+
291
+ nmatches++;
292
+ if (row == ix_arr + end || curr_pos == end_col) break;
293
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
294
+ }
295
+
296
+ else
297
+ {
298
+ if (Xc_ind[curr_pos] > *row)
299
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
300
+ else
301
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
302
+ }
303
+ }
304
+ }
305
+
306
+ else
307
+ {
308
+ /* when impute value for missing has already been determined */
309
+ for (size_t *row = ptr_st;
310
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
311
+ )
312
+ {
313
+ if (Xc_ind[curr_pos] == *row)
314
+ {
315
+ res[row - ix_arr_plus_st] += is_na_or_inf(Xc[curr_pos])?
316
+ (fill_val + offset) : (Xc[curr_pos] * coef);
317
+ if (row == ix_arr + end) break;
318
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
319
+ }
320
+
321
+ else
322
+ {
323
+ if (Xc_ind[curr_pos] > *row)
324
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
325
+ else
326
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
327
+ }
328
+ }
329
+
330
+ return;
331
+ }
332
+
333
+
334
+ /* Determine imputation value */
335
+ std::sort(buffer_arr, buffer_arr + cnt_non_NA);
336
+ size_t mid_ceil = (n_sample - cnt_NA) / 2;
337
+ size_t nzeros = (end - st + 1) - nmatches;
338
+ if (nzeros > mid_ceil && buffer_arr[0] > 0)
339
+ {
340
+ fill_val = 0;
341
+ return;
342
+ }
343
+
344
+ else
345
+ {
346
+ size_t n_neg = (buffer_arr[0] > 0)?
347
+ 0 : ((buffer_arr[cnt_non_NA - 1] < 0)?
348
+ cnt_non_NA : std::lower_bound(buffer_arr, buffer_arr + cnt_non_NA, (double)0) - buffer_arr);
349
+
350
+
351
+ if (n_neg < (mid_ceil-1) && n_neg + nzeros > mid_ceil)
352
+ {
353
+ fill_val = 0;
354
+ return;
355
+ }
356
+
357
+ else
358
+ {
359
+ /* if the sample size is odd, take the middle, otherwise take a simple average */
360
+ if (((n_sample - cnt_NA) % 2) != 0)
361
+ {
362
+ if (mid_ceil < n_neg)
363
+ fill_val = buffer_arr[mid_ceil];
364
+ else if (mid_ceil < n_neg + nzeros)
365
+ fill_val = 0;
366
+ else
367
+ fill_val = buffer_arr[mid_ceil - nzeros];
368
+ }
369
+
370
+ else
371
+ {
372
+ if (mid_ceil < n_neg)
373
+ {
374
+ fill_val = (buffer_arr[mid_ceil - 1] + buffer_arr[mid_ceil]) / 2;
375
+ }
376
+
377
+ else if (mid_ceil < n_neg + nzeros)
378
+ {
379
+ if (mid_ceil == n_neg)
380
+ fill_val = buffer_arr[mid_ceil - 1] / 2;
381
+ else
382
+ fill_val = 0;
383
+ }
384
+
385
+ else
386
+ {
387
+ if (mid_ceil == n_neg + nzeros && nzeros > 0)
388
+ fill_val = buffer_arr[n_neg] / 2;
389
+ else
390
+ fill_val = (buffer_arr[mid_ceil - nzeros - 1] + buffer_arr[mid_ceil - nzeros]) / 2; /* WRONG!!!! */
391
+ }
392
+ }
393
+
394
+ /* fill missing if any */
395
+ fill_val *= coef;
396
+ if (cnt_NA && fill_val)
397
+ for (size_t ix = 0; ix < cnt_NA; ix++)
398
+ res[buffer_NAs[ix]] += fill_val;
399
+
400
+ /* next time, it will need to have the offset added */
401
+ fill_val -= offset;
402
+ }
403
+ }
404
+ }
405
+
406
+ else /* no NAs */
407
+ {
408
+ for (size_t *row = ptr_st;
409
+ row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
410
+ )
411
+ {
412
+ if (Xc_ind[curr_pos] == *row)
413
+ {
414
+ res[row - ix_arr_plus_st] += Xc[curr_pos] * coef;
415
+ if (row == ix_arr + end || curr_pos == end_col) break;
416
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
417
+ }
418
+
419
+ else
420
+ {
421
+ if (Xc_ind[curr_pos] > *row)
422
+ row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
423
+ else
424
+ curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
425
+ }
426
+ }
427
+ }
428
+ }
429
+
430
+ /* for categoricals */
431
+ void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
432
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
433
+ double &fill_val, double &fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
434
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run)
435
+ {
436
+ double *restrict res_write = res - st;
437
+ switch(cat_split_type)
438
+ {
439
+ case SingleCateg:
440
+ {
441
+ /* in this case there's no need to make-up an impute value for new categories, only for NAs */
442
+ switch(missing_action)
443
+ {
444
+ case Fail:
445
+ {
446
+ for (size_t row = st; row <= end; row++)
447
+ res_write[row] += (x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0;
448
+ return;
449
+ }
450
+
451
+ case Impute:
452
+ {
453
+ size_t cnt_NA = 0;
454
+ size_t cnt_this = 0;
455
+ size_t cnt = end - st + 1;
456
+ if (first_run)
457
+ {
458
+ for (size_t row = st; row <= end; row++)
459
+ {
460
+ if (x[ix_arr[row]] < 0)
461
+ {
462
+ cnt_NA++;
463
+ }
464
+
465
+ else if (x[ix_arr[row]] == chosen_cat)
466
+ {
467
+ cnt_this++;
468
+ res_write[row] += single_cat_coef;
469
+ }
470
+ }
471
+ }
472
+
473
+ else
474
+ {
475
+ for (size_t row = st; row <= end; row++)
476
+ res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0);
477
+ return;
478
+ }
479
+
480
+ fill_val = (cnt_this > (cnt - cnt_NA - cnt_this))? single_cat_coef : 0;
481
+ if (cnt_NA)
482
+ {
483
+ for (size_t row = st; row <= end; row++)
484
+ if (x[ix_arr[row]] < 0)
485
+ res_write[row] += fill_val;
486
+ }
487
+ return;
488
+ }
489
+ }
490
+ }
491
+
492
+ case SubSet:
493
+ {
494
+ /* in this case, since the splits are by more than 1 variable, it's not possible to
495
+ divide missing/new categoricals by assigning weights, so they have to be imputed
496
+ in both cases, unless using random weights for the new ones, in which case they won't
497
+ need to be imputed for new, but sill need it for NA */
498
+
499
+ if (new_cat_action == Random && missing_action == Fail)
500
+ {
501
+ for (size_t row = st; row <= end; row++)
502
+ res_write[row] += cat_coef[x[ix_arr[row]]];
503
+ return;
504
+ }
505
+
506
+ if (!first_run)
507
+ {
508
+ if (missing_action == Fail)
509
+ {
510
+ for (size_t row = st; row <= end; row++)
511
+ res_write[row] += (x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]];
512
+ }
513
+
514
+ else
515
+ {
516
+ for (size_t row = st; row <= end; row++)
517
+ res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]]);
518
+ }
519
+ return;
520
+ }
521
+
522
+ std::fill(buffer_cnt, buffer_cnt + ncat + 1, 0);
523
+ switch(missing_action)
524
+ {
525
+ case Fail:
526
+ {
527
+ for (size_t row = st; row <= end; row++)
528
+ {
529
+ buffer_cnt[x[ix_arr[row]]]++;
530
+ res_write[row] += cat_coef[x[ix_arr[row]]];
531
+ }
532
+ break;
533
+ }
534
+
535
+ default:
536
+ {
537
+ for (size_t row = st; row <= end; row++)
538
+ {
539
+ if (x[ix_arr[row]] >= 0)
540
+ {
541
+ buffer_cnt[x[ix_arr[row]]]++;
542
+ res_write[row] += cat_coef[x[ix_arr[row]]];
543
+ }
544
+
545
+ else
546
+ {
547
+ buffer_cnt[ncat]++;
548
+ }
549
+
550
+ }
551
+ break;
552
+ }
553
+ }
554
+
555
+ switch(new_cat_action)
556
+ {
557
+ case Smallest:
558
+ {
559
+ size_t smallest = SIZE_MAX;
560
+ int cat_smallest;
561
+ for (int cat = 0; cat < ncat; cat++)
562
+ {
563
+ if (buffer_cnt[cat] > 0 && buffer_cnt[cat] < smallest)
564
+ {
565
+ smallest = buffer_cnt[cat];
566
+ cat_smallest = cat;
567
+ }
568
+ }
569
+ fill_new = cat_coef[cat_smallest];
570
+ if (missing_action == Fail) break;
571
+ }
572
+
573
+ default:
574
+ {
575
+ /* Determine imputation value as the category in sorted order that gives 50% + 1 */
576
+ long double cnt_l = (long double)((end - st + 1) - buffer_cnt[ncat]);
577
+ std::iota(buffer_pos, buffer_pos + ncat, (size_t)0);
578
+ std::sort(buffer_pos, buffer_pos + ncat, [&cat_coef](const size_t a, const size_t b){return cat_coef[a] < cat_coef[b];});
579
+
580
+ double cumprob = 0;
581
+ int cat;
582
+ for (cat = 0; cat < ncat; cat++)
583
+ {
584
+ cumprob += (long double)buffer_cnt[buffer_pos[cat]] / cnt_l;
585
+ if (cumprob >= .5) break;
586
+ }
587
+ // cat = std::min(cat, ncat); /* in case it picks the last one */
588
+ fill_val = cat_coef[buffer_pos[cat]];
589
+ if (new_cat_action != Smallest)
590
+ fill_new = fill_val;
591
+
592
+ if (buffer_cnt[ncat] > 0) /* NAs */
593
+ for (size_t row = st; row <= end; row++)
594
+ if (x[ix_arr[row]] < 0)
595
+ res_write[row] += fill_val;
596
+ }
597
+ }
598
+
599
+ /* now fill unseen categories */
600
+ if (new_cat_action != Random)
601
+ for (int cat = 0; cat < ncat; cat++)
602
+ if (!buffer_cnt[cat])
603
+ cat_coef[cat] = fill_new;
604
+
605
+ }
606
+ }
607
+ }