isotree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,309 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+ void decide_column(size_t ncols_numeric, size_t ncols_categ, size_t &col_chosen, ColType &col_type,
48
+ RNG_engine &rnd_generator, std::uniform_int_distribution<size_t> &runif,
49
+ std::discrete_distribution<size_t> &col_sampler)
50
+ {
51
+ if (!col_sampler.max())
52
+ col_chosen = runif(rnd_generator);
53
+ else
54
+ col_chosen = col_sampler(rnd_generator);
55
+
56
+ if (col_chosen >= ncols_numeric)
57
+ {
58
+ col_chosen -= ncols_numeric;
59
+ col_type = Categorical;
60
+ }
61
+
62
+ else { col_type = Numeric; }
63
+ }
64
+
65
+ void add_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data)
66
+ {
67
+ if (tree.col_type == Numeric)
68
+ workspace.cols_possible[tree.col_num] = false;
69
+ else
70
+ workspace.cols_possible[tree.col_num + input_data.ncols_numeric] = false;
71
+ }
72
+
73
+ void add_unsplittable_col(WorkerMemory &workspace, InputData &input_data)
74
+ {
75
+ if (workspace.col_type == Numeric)
76
+ workspace.cols_possible[workspace.col_chosen] = false;
77
+ else
78
+ workspace.cols_possible[workspace.col_chosen + input_data.ncols_numeric] = false;
79
+ }
80
+
81
+ bool check_is_not_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data)
82
+ {
83
+ if (tree.col_type == Numeric)
84
+ return workspace.cols_possible[tree.col_num];
85
+ else
86
+ return workspace.cols_possible[tree.col_num + input_data.ncols_numeric];
87
+ }
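+
+ /* A small sketch (for illustration only; 'example_*' names are not part of the library)
+    of the indexing convention behind 'cols_possible' above: numeric columns come first,
+    and categorical columns are offset by the number of numeric columns. */
+ static bool example_cols_possible_lookup(const std::vector<bool> &cols_possible,
+                                          size_t col_num, ColType col_type,
+                                          size_t ncols_numeric)
+ {
+     size_t pos = (col_type == Numeric)? col_num : col_num + ncols_numeric;
+     return cols_possible[pos];
+ }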
88
+
89
+ /* for use in regular model */
90
+ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params, IsoTree &tree)
91
+ {
92
+ if (tree.col_type == Numeric)
93
+ {
94
+ if (input_data.Xc == NULL)
95
+ get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * tree.col_num,
96
+ workspace.st, workspace.end, model_params.missing_action,
97
+ workspace.xmin, workspace.xmax, workspace.unsplittable);
98
+ else
99
+ get_range(workspace.ix_arr.data(), workspace.st, workspace.end, tree.col_num,
100
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
101
+ model_params.missing_action, workspace.xmin, workspace.xmax, workspace.unsplittable);
102
+ }
103
+
104
+ else
105
+ {
106
+ get_categs(workspace.ix_arr.data(), input_data.categ_data + input_data.nrows * tree.col_num,
107
+ workspace.st, workspace.end, input_data.ncat[tree.col_num],
108
+ model_params.missing_action, workspace.categs.data(), workspace.npresent, workspace.unsplittable);
109
+ }
110
+ }
111
+
112
+ /* for use in extended model */
113
+ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params)
114
+ {
115
+ if (workspace.col_type == Numeric)
116
+ {
117
+ if (input_data.Xc == NULL)
118
+ get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * workspace.col_chosen,
119
+ workspace.st, workspace.end, model_params.missing_action,
120
+ workspace.xmin, workspace.xmax, workspace.unsplittable);
121
+ else
122
+ get_range(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.col_chosen,
123
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
124
+ model_params.missing_action, workspace.xmin, workspace.xmax, workspace.unsplittable);
125
+ }
126
+
127
+ else
128
+ {
129
+ get_categs(workspace.ix_arr.data(), input_data.categ_data + input_data.nrows * workspace.col_chosen,
130
+ workspace.st, workspace.end, input_data.ncat[workspace.col_chosen],
131
+ model_params.missing_action, workspace.categs.data(), workspace.npresent, workspace.unsplittable);
132
+ }
133
+ }
134
+
135
+ int choose_cat_from_present(WorkerMemory &workspace, InputData &input_data, size_t col_num)
136
+ {
137
+ int chosen_cat = std::uniform_int_distribution<int>
138
+ (0, workspace.npresent - 1)
139
+ (workspace.rnd_generator);
140
+ workspace.ncat_tried = 0;
141
+ for (int cat = 0; cat < input_data.ncat[col_num]; cat++)
142
+ {
143
+ if (workspace.categs[cat] > 0)
144
+ {
145
+ if (workspace.ncat_tried == chosen_cat)
146
+ return cat;
147
+ else
148
+ workspace.ncat_tried++;
149
+ }
150
+ }
151
+
152
+ return -1; /* this will never be reached, but CRAN complains otherwise */
153
+ }
154
+
155
+ void update_col_sampler(WorkerMemory &workspace, InputData &input_data)
156
+ {
157
+ if (!workspace.col_sampler.max())
158
+ return;
159
+
160
+ std::vector<double> col_weights = workspace.col_sampler.probabilities();
161
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
162
+ if (!workspace.cols_possible[col])
163
+ col_weights[col] = 0;
164
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
165
+ if (!workspace.cols_possible[col + input_data.ncols_numeric])
166
+ col_weights[col + input_data.ncols_numeric] = 0;
167
+ workspace.col_sampler = std::discrete_distribution<size_t>(col_weights.begin(), col_weights.end());
168
+ }
169
+
170
+ bool is_col_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
171
+ InputData &input_data, size_t col_num, ColType col_type)
172
+ {
173
+ col_num += ((col_type == Categorical)? 0 : input_data.ncols_categ);
174
+ if (col_is_taken.size())
175
+ return col_is_taken[col_num];
176
+ else
177
+ return col_is_taken_s.find(col_num) != col_is_taken_s.end();
178
+ }
179
+
180
+ void set_col_as_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
181
+ InputData &input_data, size_t col_num, ColType col_type)
182
+ {
183
+ col_num += ((col_type == Categorical)? 0 : input_data.ncols_categ);
184
+ if (col_is_taken.size())
185
+ col_is_taken[col_num] = true;
186
+ else
187
+ col_is_taken_s.insert(col_num);
188
+ }
189
+
190
+ void add_separation_step(WorkerMemory &workspace, InputData &input_data, double remainder)
191
+ {
192
+ if (workspace.weights_arr.size())
193
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
194
+ input_data.nrows, workspace.tmat_sep.data(), workspace.weights_arr.data(), remainder);
195
+ else if (workspace.weights_map.size())
196
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
197
+ input_data.nrows, workspace.tmat_sep.data(), workspace.weights_map, remainder);
198
+ else
199
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
200
+ input_data.nrows, workspace.tmat_sep.data(), remainder);
201
+ }
202
+
203
+ void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, long double sum_weight)
204
+ {
205
+ if (
206
+ ((workspace.end - workspace.st) > 0 && !workspace.weights_arr.size() && !workspace.weights_map.size()) ||
207
+ (sum_weight > 1 && (workspace.weights_arr.size() || workspace.weights_map.size()))
208
+ )
209
+ {
210
+ double expected_dsep;
211
+ if (!workspace.weights_arr.size() && !workspace.weights_map.size())
212
+ expected_dsep = expected_separation_depth(workspace.end - workspace.st + 1);
213
+ else
214
+ expected_dsep = expected_separation_depth(sum_weight);
215
+
216
+ add_separation_step(workspace, input_data, expected_dsep + 1);
217
+ }
218
+ }
219
+
220
+ void remap_terminal_trees(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
221
+ PredictionData &prediction_data, sparse_ix *restrict tree_num, int nthreads)
222
+ {
223
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
224
+ size_t max_tree, curr_term;
225
+ std::vector<sparse_ix> tree_mapping;
226
+ if (model_outputs != NULL)
227
+ {
228
+ max_tree = std::accumulate(model_outputs->trees.begin(),
229
+ model_outputs->trees.end(),
230
+ (size_t)0,
231
+ [](const size_t curr_max, const std::vector<IsoTree> &tr)
232
+ {return std::max(curr_max, tr.size());});
233
+ tree_mapping.resize(max_tree);
234
+ for (size_t tree = 0; tree < ntrees; tree++)
235
+ {
236
+ std::fill(tree_mapping.begin(), tree_mapping.end(), (size_t)0);
237
+ curr_term = 0;
238
+ for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
239
+ if (model_outputs->trees[tree][node].score >= 0)
240
+ tree_mapping[node] = curr_term++;
241
+
242
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(tree_num, tree_mapping, tree, prediction_data)
243
+ for (size_t_for row = 0; row < prediction_data.nrows; row++)
244
+ tree_num[row + tree * prediction_data.nrows] = tree_mapping[tree_num[row + tree * prediction_data.nrows]];
245
+ }
246
+ }
247
+
248
+ else
249
+ {
250
+ max_tree = std::accumulate(model_outputs_ext->hplanes.begin(),
251
+ model_outputs_ext->hplanes.end(),
252
+ (size_t)0,
253
+ [](const size_t curr_max, const std::vector<IsoHPlane> &tr)
254
+ {return std::max(curr_max, tr.size());});
255
+ tree_mapping.resize(max_tree);
256
+ for (size_t tree = 0; tree < ntrees; tree++)
257
+ {
258
+ std::fill(tree_mapping.begin(), tree_mapping.end(), (size_t)0);
259
+ curr_term = 0;
260
+ for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
261
+ if (model_outputs_ext->hplanes[tree][node].score >= 0)
262
+ tree_mapping[node] = curr_term++;
263
+
264
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(tree_num, tree_mapping, tree, prediction_data)
265
+ for (size_t_for row = 0; row < prediction_data.nrows; row++)
266
+ tree_num[row + tree * prediction_data.nrows] = tree_mapping[tree_num[row + tree * prediction_data.nrows]];
267
+ }
268
+ }
269
+ }
270
+
271
+ void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state)
272
+ {
273
+ recursion_state.st = workspace.st;
274
+ recursion_state.st_NA = workspace.st_NA;
275
+ recursion_state.end_NA = workspace.end_NA;
276
+ recursion_state.split_ix = workspace.split_ix;
277
+ recursion_state.end = workspace.end;
278
+ recursion_state.cols_possible = workspace.cols_possible;
279
+ recursion_state.col_sampler = workspace.col_sampler;
280
+
281
+ /* for the extended model, it's not necessary to copy everything */
282
+ if (!workspace.comb_val.size())
283
+ {
284
+ /* TODO: here only need to copy the left half, as the right one is untouched */
285
+ recursion_state.ix_arr = workspace.ix_arr;
286
+ recursion_state.weights_map = workspace.weights_map;
287
+ recursion_state.weights_arr = workspace.weights_arr;
288
+ }
289
+ }
290
+
291
+
292
+ void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state)
293
+ {
294
+ workspace.st = recursion_state.st;
295
+ workspace.st_NA = recursion_state.st_NA;
296
+ workspace.end_NA = recursion_state.end_NA;
297
+ workspace.split_ix = recursion_state.split_ix;
298
+ workspace.end = recursion_state.end;
299
+ workspace.cols_possible = std::move(recursion_state.cols_possible);
300
+ workspace.col_sampler = std::move(recursion_state.col_sampler);
301
+
302
+ if (!workspace.comb_val.size())
303
+ {
304
+ /* TODO: here only need to copy the left half, as the right one is untouched */
305
+ workspace.ix_arr = std::move(recursion_state.ix_arr);
306
+ workspace.weights_map = std::move(recursion_state.weights_map);
307
+ workspace.weights_arr = std::move(recursion_state.weights_arr);
308
+ }
309
+ }
@@ -0,0 +1,1205 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+
48
+ /* TODO: this file is a complete mess, needs a refactor from scratch along with the data structs */
49
+
50
+ /* Impute missing values in new data
51
+ *
52
+ * Parameters
53
+ * ==========
54
+ * - numeric_data[nrows * ncols_numeric] (in, out)
55
+ * Pointer to numeric data in which missing values will be imputed. Must be ordered by columns like Fortran,
56
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
57
+ * and the column order must be the same as in the data that was used to fit the model.
58
+ * Pass NULL if there are no dense numeric columns.
59
+ * Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
60
+ * Imputations will overwrite values in this same array.
61
+ * - ncols_numeric
62
+ * Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
63
+ * - categ_data[nrows * ncols_categ]
64
+ * Pointer to categorical data in which missing values will be imputed. Must be ordered by columns like Fortran,
65
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
66
+ * and the column order must be the same as in the data that was used to fit the model.
67
+ * Pass NULL if there are no categorical columns.
68
+ * Each category should be represented as an integer, and these integers must start at zero and
69
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
70
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
71
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
72
+ * must be the same as was used in the data to which the model was fit.
73
+ * Imputations will overwrite values in this same array.
74
+ * - ncols_categ
75
+ * Number of categorical columns in the data.
76
+ * - ncat[ncols_categ]
77
+ * Number of categories in each categorical column. E.g. if the highest code for a column is '4',
78
+ * the number of categories for that column is '5' (zero is one category).
79
+ * Must be the same as was passed to 'fit_iforest'.
80
+ * - Xr[nnz] (in, out)
81
+ * Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
82
+ * Pass NULL if there are no sparse numeric columns.
83
+ * Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
84
+ * Imputations will overwrite values in this same array.
85
+ * - Xr_ind[nnz]
86
+ * Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
87
+ * Pass NULL if there are no sparse numeric columns in CSR format.
88
+ * - Xr_indptr[nrows + 1]
89
+ * Pointer to row index pointers that indicate at entry [row] where row 'row' starts
90
+ * and at entry [row + 1] where it ends.
91
+ * Pass NULL if there are no sparse numeric columns in CSR format.
92
+ * - nrows
93
+ * Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
94
+ * - nthreads
95
+ * Number of parallel threads to use. Note that the more threads are used, the more memory will be
96
+ * allocated, even if a given thread does not end up being used. Ignored when not building with
97
+ * OpenMP support.
98
+ * - model_outputs
99
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
100
+ * if the predictions are to be made from an extended model. Can only pass one of
101
+ * 'model_outputs' and 'model_outputs_ext'.
102
+ * - model_outputs_ext
103
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
104
+ * if the predictions are to be made from a single-variable model. Can only pass one of
105
+ * 'model_outputs' and 'model_outputs_ext'.
106
+ * - impute_nodes
107
+ * Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
108
+ * as produced from function 'fit_iforest'.
109
+ */
110
+ void impute_missing_values(double numeric_data[], int categ_data[],
111
+ double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
112
+ size_t nrows, int nthreads,
113
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
114
+ Imputer &imputer)
115
+ {
116
+ PredictionData prediction_data = {numeric_data, categ_data, nrows,
117
+ NULL, NULL, NULL,
118
+ Xr, Xr_ind, Xr_indptr};
119
+
120
+ std::vector<size_t> ix_arr(nrows);
121
+ std::iota(ix_arr.begin(), ix_arr.end(), (size_t) 0);
122
+
123
+ size_t end = check_for_missing(prediction_data, imputer, ix_arr.data(), nthreads);
124
+
125
+ if (end == 0)
126
+ return;
127
+
128
+ if ((size_t)nthreads > end)
129
+ nthreads = (int)end;
130
+ #ifdef _OPENMP
131
+ std::vector<ImputedData> imp_memory(nthreads);
132
+ #else
133
+ std::vector<ImputedData> imp_memory(1);
134
+ #endif
135
+
136
+
137
+ if (model_outputs != NULL)
138
+ {
139
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
140
+ shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer)
141
+ for (size_t_for row = 0; row < end; row++)
142
+ {
143
+ initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
144
+
145
+ for (std::vector<IsoTree> &tree : model_outputs->trees)
146
+ {
147
+ traverse_itree(tree,
148
+ *model_outputs,
149
+ prediction_data,
150
+ &imputer.imputer_tree[&tree - &(model_outputs->trees[0])],
151
+ &imp_memory[omp_get_thread_num()],
152
+ (double) 1,
153
+ ix_arr[row],
154
+ NULL,
155
+ (size_t) 0);
156
+ }
157
+
158
+ apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
159
+
160
+ }
161
+ }
162
+
163
+ else
164
+ {
165
+ double temp;
166
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
167
+ shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer) \
168
+ private(temp)
169
+ for (size_t_for row = 0; row < end; row++)
170
+ {
171
+ initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
172
+
173
+ for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
174
+ {
175
+ traverse_hplane(hplane,
176
+ *model_outputs_ext,
177
+ prediction_data,
178
+ temp,
179
+ &imputer.imputer_tree[&hplane - &(model_outputs_ext->hplanes[0])],
180
+ &imp_memory[omp_get_thread_num()],
181
+ NULL,
182
+ ix_arr[row]);
183
+ }
184
+
185
+ apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
186
+
187
+ }
188
+ }
189
+
190
+ }
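+
+ /* A minimal, self-contained sketch (illustration only; not a function of the library) of the
+    data layout that the documentation of 'impute_missing_values' above describes: dense numeric
+    and categorical inputs are column-major ('Fortran' order), so entry (row, col) lives at
+    [row + col * nrows]; categories are consecutive integers starting at zero, and missing
+    values are encoded as negative numbers. */
+ static void example_input_layout()
+ {
+     const size_t nrows = 3;
+     /* two numeric columns, column-major: first 3 entries are column 0, next 3 are column 1 */
+     std::vector<double> numeric_data = {1.0, 4.5, NAN,    /* column 0 */
+                                         0.2, 0.3, 0.4};   /* column 1 */
+     /* one categorical column with categories {0, 1, 2}; -1 marks a missing value */
+     std::vector<int> categ_data = {2, -1, 0};
+     /* element (row = 1, col = 0) of the numeric data: */
+     double x10 = numeric_data[1 + 0 * nrows];             /* == 4.5 */
+     (void) x10; (void) categ_data;
+ }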
191
+
192
+ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads)
193
+ {
194
+ imputer.ncols_numeric = input_data.ncols_numeric;
195
+ imputer.ncols_categ = input_data.ncols_categ;
196
+ imputer.ncat.assign(input_data.ncat, input_data.ncat + input_data.ncols_categ);
197
+ if (imputer.col_means.size())
198
+ {
199
+ imputer.col_means.resize(input_data.ncols_numeric);
200
+ std::fill(imputer.col_means.begin(), imputer.col_means.end(), 0);
201
+ }
202
+
203
+ else
204
+ {
205
+ imputer.col_means.resize(input_data.ncols_numeric, 0);
206
+ }
207
+
208
+ imputer.col_modes.resize(input_data.ncols_categ);
209
+ imputer.imputer_tree = std::vector<std::vector<ImputeNode>>(ntrees);
210
+
211
+ size_t offset, cnt;
212
+ if (input_data.numeric_data != NULL)
213
+ {
214
+ #pragma omp parallel for schedule(static) num_threads(nthreads) private(cnt, offset) shared(input_data, imputer)
215
+ for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
216
+ {
217
+ cnt = input_data.nrows;
218
+ offset = col * input_data.nrows;
219
+ for (size_t row = 0; row < input_data.nrows; row++)
220
+ {
221
+ imputer.col_means[col] += (!is_na_or_inf(input_data.numeric_data[row + offset]))?
222
+ input_data.numeric_data[row + offset] : 0;
223
+ cnt -= is_na_or_inf(input_data.numeric_data[row + offset]);
224
+ }
225
+ imputer.col_means[col] /= (long double) cnt;
226
+ }
227
+ }
228
+
229
+ else if (input_data.Xc != NULL)
230
+ {
231
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
232
+ for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
233
+ {
234
+ cnt = input_data.nrows;
235
+ for (size_t ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
236
+ {
237
+ imputer.col_means[col] += (!is_na_or_inf(input_data.Xc[ix]))?
238
+ input_data.Xc[ix] : 0;
239
+ cnt -= is_na_or_inf(input_data.Xc[ix]);
240
+ }
241
+ imputer.col_means[col] /= (long double) cnt;
242
+ }
243
+ }
244
+
245
+ if (input_data.categ_data != NULL)
246
+ {
247
+ std::vector<size_t> cat_counts(input_data.max_categ);
248
+ #pragma omp parallel for schedule(static) num_threads(nthreads) firstprivate(cat_counts) private(offset) shared(input_data, imputer)
249
+ for (size_t_for col = 0; col < input_data.ncols_categ; col++)
250
+ {
251
+ std::fill(cat_counts.begin(), cat_counts.end(), 0);
252
+ offset = col * input_data.nrows;
253
+ for (size_t row = 0; row < input_data.nrows; row++)
254
+ {
255
+ if (input_data.categ_data[row + offset] >= 0)
256
+ cat_counts[input_data.categ_data[row + offset]]++;
257
+ }
258
+ imputer.col_modes[col] = (int) std::distance(cat_counts.begin(),
259
+ std::max_element(cat_counts.begin(),
260
+ cat_counts.begin() + input_data.ncat[col]));
261
+ }
262
+ }
263
+ }
264
+
265
+
266
+ /* https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
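+ /* A minimal sketch (for illustration, assuming plain doubles) of the compensated (Kahan)
+    summation pattern referenced above and used for the weighted sums inside 'build_impute_node':
+    'corr' carries the low-order bits lost in each addition. */
+ static long double kahan_sum_example(const std::vector<double> &values)
+ {
+     long double total = 0, corr = 0, val, diff;
+     for (const double x : values)
+     {
+         val   = x - corr;             /* re-apply the error recovered from the previous step */
+         diff  = total + val;          /* low-order bits of 'val' may be lost in this addition */
+         corr  = (diff - total) - val; /* algebraically zero; in floating point, the lost part */
+         total = diff;
+     }
+     return total;
+ }
+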
267
+ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
268
+ InputData &input_data, ModelParams &model_params,
269
+ std::vector<ImputeNode> &imputer_tree,
270
+ size_t curr_depth, size_t min_imp_obs)
271
+ {
272
+ double wsum;
273
+ bool has_weights = workspace.weights_arr.size() || workspace.weights_map.size();
274
+ if (!has_weights)
275
+ wsum = (double)(workspace.end - workspace.st + 1);
276
+ else
277
+ wsum = calculate_sum_weights(workspace.ix_arr, workspace.st, workspace.end, curr_depth,
278
+ workspace.weights_arr, workspace.weights_map);
279
+
280
+ imputer.num_sum.resize(input_data.ncols_numeric, 0);
281
+ imputer.num_weight.resize(input_data.ncols_numeric, 0);
282
+ imputer.cat_sum.resize(input_data.ncols_categ);
283
+ imputer.cat_weight.resize(input_data.ncols_categ, 0);
284
+ imputer.num_sum.shrink_to_fit();
285
+ imputer.num_weight.shrink_to_fit();
286
+ imputer.cat_sum.shrink_to_fit();
287
+ imputer.cat_weight.shrink_to_fit();
288
+
289
+ /* Note: in theory, 'num_weight' could be initialized to 'wsum',
290
+ and the weight of each missing entry subtracted from it afterwards, but due to rounding
291
+ error, this could produce cases in which a column with no present observations ends up with
292
+ positive weight, or cases of negative weight, so it's better to add the weight for each row after
293
+ checking for possible NAs, even though it's less computationally efficient.
294
+ For sparse matrices it's done the other way around, as otherwise it would be too slow. */
295
+
296
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
297
+ {
298
+ imputer.cat_sum[col].resize(input_data.ncat[col]);
299
+ imputer.cat_sum[col].shrink_to_fit();
300
+ }
301
+
302
+ double xnum;
303
+ int xcat;
304
+ double weight;
305
+ size_t ix;
306
+
307
+ if ((input_data.Xc == NULL && input_data.ncols_numeric) || input_data.ncols_categ)
308
+ {
309
+ if (!has_weights)
310
+ {
311
+ size_t cnt;
312
+ if (input_data.numeric_data != NULL)
313
+ {
314
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
315
+ {
316
+ cnt = 0;
317
+ for (size_t row = workspace.st; row <= workspace.end; row++)
318
+ {
319
+ xnum = input_data.numeric_data[workspace.ix_arr[row] + col * input_data.nrows];
320
+ if (!is_na_or_inf(xnum))
321
+ {
322
+ cnt++;
323
+ imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (long double)cnt;
324
+ }
325
+ }
326
+ imputer.num_weight[col] = (double) cnt;
327
+ }
328
+ }
329
+
330
+ if (input_data.categ_data != NULL)
331
+ {
332
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
333
+ {
334
+ cnt = 0;
335
+ for (size_t row = workspace.st; row <= workspace.end; row++)
336
+ {
337
+ xcat = input_data.categ_data[workspace.ix_arr[row] + col * input_data.nrows];
338
+ if (xcat >= 0)
339
+ {
340
+ cnt++;
341
+ imputer.cat_sum[col][xcat]++; /* later gets divided */
342
+ }
343
+ }
344
+ imputer.cat_weight[col] = (double) cnt;
345
+ }
346
+ }
347
+
348
+ }
349
+
350
+ else
351
+ {
352
+ long double prod_sum, corr, val, diff;
353
+ if (input_data.numeric_data != NULL)
354
+ {
355
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
356
+ {
357
+ prod_sum = 0; corr = 0;
358
+ for (size_t row = workspace.st; row <= workspace.end; row++)
359
+ {
360
+ xnum = input_data.numeric_data[workspace.ix_arr[row] + col * input_data.nrows];
361
+ if (!is_na_or_inf(xnum))
362
+ {
363
+ if (workspace.weights_arr.size())
364
+ weight = workspace.weights_arr[workspace.ix_arr[row]];
365
+ else
366
+ weight = workspace.weights_map[workspace.ix_arr[row]];
367
+
368
+ imputer.num_weight[col] += weight; /* these are always <= 1 */
369
+ val = (xnum * weight) - corr;
370
+ diff = prod_sum + val;
371
+ corr = (diff - prod_sum) - val;
372
+ prod_sum = diff;
373
+ }
374
+ }
375
+ imputer.num_sum[col] = prod_sum / imputer.num_weight[col];
376
+ }
377
+ }
378
+
379
+
380
+ if (input_data.ncols_categ)
381
+ {
382
+ for (size_t row = workspace.st; row <= workspace.end; row++)
383
+ {
384
+ ix = workspace.ix_arr[row];
385
+ if (workspace.weights_arr.size())
386
+ weight = workspace.weights_arr[ix];
387
+ else
388
+ weight = workspace.weights_map[ix];
389
+
390
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
391
+ {
392
+ xcat = input_data.categ_data[ix + col * input_data.nrows];
393
+ if (xcat >= 0)
394
+ {
395
+ imputer.cat_sum[col][xcat] += weight; /* later gets divided */
396
+ imputer.cat_weight[col] += weight;
397
+ }
398
+ }
399
+ }
400
+ }
401
+ }
402
+ }
403
+
404
+ if (input_data.Xc != NULL) /* sparse numeric */
405
+ {
406
+ size_t *ix_arr = workspace.ix_arr.data();
407
+ size_t st_col, end_col, ind_end_col, curr_pos;
408
+ std::fill(imputer.num_weight.begin(), imputer.num_weight.end(), wsum);
409
+
410
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
411
+ {
412
+ st_col = input_data.Xc_indptr[col];
413
+ end_col = input_data.Xc_indptr[col + 1] - 1;
414
+ ind_end_col = input_data.Xc_ind[end_col];
415
+ curr_pos = st_col;
416
+ for (size_t *row = std::lower_bound(ix_arr + workspace.st, ix_arr + workspace.end + 1, input_data.Xc_ind[st_col]);
417
+ row != ix_arr + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
418
+ )
419
+ {
420
+ if (input_data.Xc_ind[curr_pos] == *row)
421
+ {
422
+ xnum = input_data.Xc[curr_pos];
423
+ if (workspace.weights_arr.size())
424
+ weight = workspace.weights_arr[*row];
425
+ else if (workspace.weights_map.size())
426
+ weight = workspace.weights_map[*row];
427
+ else
428
+ weight = 1;
429
+
430
+ if (!is_na_or_inf(xnum))
431
+ {
432
+ imputer.num_sum[col] += weight * xnum;
433
+ }
434
+
435
+ else
436
+ {
437
+ imputer.num_weight[col] -= weight;
438
+ }
439
+
440
+ if (row == ix_arr + workspace.end || curr_pos == end_col) break;
441
+ curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos, input_data.Xc_ind + end_col + 1, *(++row)) - input_data.Xc_ind;
442
+ }
443
+
444
+ else
445
+ {
446
+ if (input_data.Xc_ind[curr_pos] > *row)
447
+ row = std::lower_bound(row + 1, ix_arr + workspace.end + 1, input_data.Xc_ind[curr_pos]);
448
+ else
449
+ curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos + 1, input_data.Xc_ind + end_col + 1, *row) - input_data.Xc_ind;
450
+ }
451
+ }
452
+
453
+ imputer.num_sum[col] /= imputer.num_weight[col];
454
+ }
455
+ }
456
+
457
+ /* If a value cannot be imputed at this node, look it up from the parent node, but assign it a lesser weight (a short sketch of this fallback follows the function).
458
+ Note: in theory, the parent node should always have some imputation value for every variable, but due to
459
+ numeric rounding errors, it might have a weight of zero, so in those cases it's looked up higher up the
460
+ tree instead. */
461
+ size_t look_aboves, curr_tree;
462
+ double min_imp_obs_dbl = (double) min_imp_obs;
463
+ if (imputer.num_sum.size())
464
+ {
465
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
466
+ {
467
+ if (imputer.num_weight[col] < min_imp_obs_dbl)
468
+ {
469
+ look_aboves = 1;
470
+ curr_tree = imputer.parent;
471
+ while (true)
472
+ {
473
+ if (!is_na_or_inf(imputer_tree[curr_tree].num_sum[col]))
474
+ {
475
+ imputer.num_sum[col] = imputer_tree[curr_tree].num_sum[col] / imputer_tree[curr_tree].num_weight[col];
476
+ imputer.num_weight[col] = wsum / (double)(2 * look_aboves);
477
+ break;
478
+ }
479
+
480
+ else if (curr_tree > 0)
481
+ {
482
+ curr_tree = imputer_tree[curr_tree].parent;
483
+ look_aboves++;
484
+ }
485
+
486
+ else /* will only happen if every single value is missing */
487
+ {
488
+ imputer.num_sum[col] = NAN;
489
+ imputer.num_weight[col] = 0;
490
+ break;
491
+ }
492
+ }
493
+ }
494
+ }
495
+ }
496
+
497
+ if (imputer.cat_sum.size())
498
+ {
499
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
500
+ {
501
+ if (imputer.cat_weight[col] >= min_imp_obs_dbl)
502
+ {
503
+ for (double &cat : imputer.cat_sum[col])
504
+ cat /= imputer.cat_weight[col];
505
+ }
506
+
507
+ else
508
+ {
509
+ look_aboves = 1;
510
+ curr_tree = imputer.parent;
511
+ while (true)
512
+ {
513
+ if (imputer_tree[curr_tree].cat_weight[col] > 0)
514
+ {
515
+ for (int cat = 0; cat < input_data.ncat[col]; cat++)
516
+ {
517
+ imputer.cat_sum[col][cat] += imputer_tree[curr_tree].cat_sum[col][cat] / imputer.cat_weight[col];
518
+ imputer.cat_weight[col] = wsum / (double)(2 * look_aboves);
519
+ }
520
+ break;
521
+ }
522
+
523
+ else if (curr_tree > 0)
524
+ {
525
+ curr_tree = imputer_tree[curr_tree].parent;
526
+ look_aboves++;
527
+ }
528
+
529
+ else /* will only happen if every single value is missing */
530
+ {
531
+ break;
532
+ }
533
+ }
534
+ imputer.cat_weight[col] = std::accumulate(imputer.cat_sum[col].begin(),
535
+ imputer.cat_sum[col].end(),
536
+ (double) 0);
537
+ }
538
+ }
539
+ }
540
+
541
+ /* re-adjust the weights according to parameters
542
+ (note that by this point, the weights are a sum) */
543
+ switch(model_params.weigh_imp_rows)
544
+ {
545
+ case Inverse:
546
+ {
547
+ double wsum_div = wsum * sqrt(wsum);
548
+ for (double &w : imputer.num_weight)
549
+ w /= wsum_div;
550
+
551
+ for (double &w : imputer.cat_weight)
552
+ w /= wsum_div;
553
+ break;
554
+ }
555
+
556
+ case Flat:
557
+ {
558
+ for (double &w : imputer.num_weight)
559
+ w /= wsum;
560
+ for (double &w : imputer.cat_weight)
561
+ w /= wsum;
562
+ break;
563
+ }
564
+
565
+ /* TODO: maybe divide by nrows for prop */
566
+ }
567
+
568
+ double curr_depth_dbl = (double) (curr_depth + 1);
569
+ switch(model_params.depth_imp)
570
+ {
571
+ case Lower:
572
+ {
573
+ for (double &w : imputer.num_weight)
574
+ w /= curr_depth_dbl;
575
+ for (double &w : imputer.cat_weight)
576
+ w /= curr_depth_dbl;
577
+ break;
578
+ }
579
+
580
+ case Higher:
581
+ {
582
+ for (double &w : imputer.num_weight)
583
+ w *= curr_depth_dbl;
584
+ for (double &w : imputer.cat_weight)
585
+ w *= curr_depth_dbl;
586
+ break;
587
+ }
588
+ }
589
+
590
+ /* now re-adjust sums */
591
+ if (model_params.weigh_imp_rows != Prop || model_params.depth_imp != Same)
592
+ {
593
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
594
+ imputer.num_sum[col] *= imputer.num_weight[col];
595
+
596
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
597
+ for (int cat = 0; cat < input_data.ncat[col]; cat++)
598
+ imputer.cat_sum[col][cat] *= imputer.cat_weight[col];
599
+ }
600
+ }
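+
+ /* A small sketch (illustration only; 'example_*' names are not part of the library) of the
+    parent-node fallback used inside 'build_impute_node' above: when a column's own weight at a
+    node is too small, the imputation value is taken from the nearest ancestor that has one, and
+    the weight assigned to it shrinks with the number of levels climbed
+    (wsum/2 one level up, wsum/4 two levels up, wsum/6 three levels up, ...). */
+ static void example_fallback_from_ancestor(double ancestor_sum, double ancestor_weight,
+                                            double wsum, size_t look_aboves,
+                                            double &num_sum_out, double &num_weight_out)
+ {
+     num_sum_out    = ancestor_sum / ancestor_weight;   /* the ancestor stores a weighted sum */
+     num_weight_out = wsum / (double)(2 * look_aboves); /* lesser weight the farther up it was found */
+ }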
601
+
602
+
603
+ void shrink_impute_node(ImputeNode &imputer)
604
+ {
605
+ imputer.num_sum.clear();
606
+ imputer.num_weight.clear();
607
+ imputer.cat_sum.clear();
608
+ imputer.cat_weight.clear();
609
+
610
+ imputer.num_sum.shrink_to_fit();
611
+ imputer.num_weight.shrink_to_fit();
612
+ imputer.cat_sum.shrink_to_fit();
613
+ imputer.cat_weight.shrink_to_fit();
614
+ }
615
+
616
+ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
617
+ std::vector<IsoTree> *trees,
618
+ std::vector<IsoHPlane> *hplanes)
619
+ {
620
+ if (trees != NULL)
621
+ {
622
+ for (size_t tr = 0; tr < trees->size(); tr++)
623
+ {
624
+ if ((*trees)[tr].score <= 0)
625
+ {
626
+ shrink_impute_node(imputer_tree[tr]);
627
+ }
628
+
629
+ else
630
+ {
631
+ /* cat_weight is not needed for anything else */
632
+ imputer_tree[tr].cat_weight.clear();
633
+ imputer_tree[tr].cat_weight.shrink_to_fit();
634
+ }
635
+ }
636
+ }
637
+
638
+ else
639
+ {
640
+ for (size_t tr = 0; tr < hplanes->size(); tr++)
641
+ {
642
+ if ((*hplanes)[tr].score <= 0)
643
+ {
644
+ shrink_impute_node(imputer_tree[tr]);
645
+ }
646
+
647
+ else
648
+ {
649
+ /* cat_weight is not needed for anything else */
650
+ imputer_tree[tr].cat_weight.clear();
651
+ imputer_tree[tr].cat_weight.shrink_to_fit();
652
+ }
653
+ }
654
+ }
655
+
656
+ imputer_tree.shrink_to_fit();
657
+ }
658
+
659
+ void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto)
660
+ {
661
+ size_t col;
662
+ for (size_t ix = 0; ix < imp_addfrom.n_missing_num; ix++)
663
+ {
664
+ imp_addto.num_sum[ix] += imp_addfrom.num_sum[ix];
665
+ imp_addto.num_weight[ix] += imp_addfrom.num_weight[ix];
666
+ }
667
+
668
+ for (size_t ix = 0; ix < imp_addfrom.n_missing_cat; ix++)
669
+ {
670
+ col = imp_addfrom.missing_cat[ix];
671
+ for (size_t cat = 0; cat < imp_addto.cat_sum[col].size(); cat++)
672
+ {
673
+ imp_addto.cat_sum[col][cat] += imp_addfrom.cat_sum[col][cat];
674
+ }
675
+ }
676
+
677
+ for (size_t ix = 0; ix < imp_addfrom.n_missing_sp; ix++)
678
+ {
679
+ imp_addto.sp_num_sum[ix] += imp_addfrom.sp_num_sum[ix];
680
+ imp_addto.sp_num_weight[ix] += imp_addfrom.sp_num_weight[ix];
681
+ }
682
+ }
683
+
684
+ void combine_tree_imputations(WorkerMemory &workspace,
685
+ std::vector<ImputedData> &impute_vec,
686
+ std::unordered_map<size_t, ImputedData> &impute_map,
687
+ std::vector<char> &has_missing,
688
+ int nthreads)
689
+ {
690
+ if (workspace.impute_vec.size())
691
+ {
692
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_vec)
693
+ for (size_t_for row = 0; row < has_missing.size(); row++)
694
+ if (has_missing[row])
695
+ combine_imp_single(workspace.impute_vec[row], impute_vec[row]);
696
+ }
697
+
698
+ else if (workspace.impute_map.size())
699
+ {
700
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_map)
701
+ for (size_t_for row = 0; row < has_missing.size(); row++)
702
+ if (has_missing[row])
703
+ combine_imp_single(workspace.impute_map[row], impute_map[row]);
704
+ }
705
+ }
706
+
707
+
708
+ void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w)
709
+ {
710
+ size_t col;
711
+ for (size_t ix = 0; ix < imputed_data.n_missing_num; ix++)
712
+ {
713
+ col = imputed_data.missing_num[ix];
714
+ imputed_data.num_sum[ix] += (!is_na_or_inf(imputer.num_sum[col]))? (w * imputer.num_sum[col]) : 0;
715
+ imputed_data.num_weight[ix] += w * imputer.num_weight[ix];
716
+ }
717
+
718
+ for (size_t ix = 0; ix < imputed_data.n_missing_sp; ix++)
719
+ {
720
+ col = imputed_data.missing_sp[ix];
721
+ imputed_data.sp_num_sum[ix] += (!is_na_or_inf(imputer.num_sum[col]))? (w * imputer.num_sum[col]) : 0;
722
+ imputed_data.sp_num_weight[ix] += w * imputer.num_weight[ix];
723
+ }
724
+
725
+ for (size_t ix = 0; ix < imputed_data.n_missing_cat; ix++)
726
+ {
727
+ col = imputed_data.missing_cat[ix];
728
+ for (size_t cat = 0; cat < imputer.cat_sum[col].size(); cat++)
729
+ imputed_data.cat_sum[col][cat] += w * imputer.cat_sum[col][cat];
730
+ }
731
+ }
732
+
733
+
734
+ void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data)
735
+ {
736
+ if (workspace.impute_vec.size())
737
+ {
738
+ if (!workspace.weights_arr.size() && !workspace.weights_map.size())
739
+ {
740
+ for (size_t row = workspace.st; row <= workspace.end; row++)
741
+ if (input_data.has_missing[workspace.ix_arr[row]])
742
+ add_from_impute_node(imputer,
743
+ workspace.impute_vec[workspace.ix_arr[row]],
744
+ (double)1);
745
+ }
746
+
747
+ else if (workspace.weights_arr.size())
748
+ {
749
+ for (size_t row = workspace.st; row <= workspace.end; row++)
750
+ if (input_data.has_missing[workspace.ix_arr[row]])
751
+ add_from_impute_node(imputer,
752
+ workspace.impute_vec[workspace.ix_arr[row]],
753
+ workspace.weights_arr[workspace.ix_arr[row]]);
754
+ }
755
+
756
+ else
757
+ {
758
+ for (size_t row = workspace.st; row <= workspace.end; row++)
759
+ if (input_data.has_missing[workspace.ix_arr[row]])
760
+ add_from_impute_node(imputer,
761
+ workspace.impute_vec[workspace.ix_arr[row]],
762
+ workspace.weights_map[workspace.ix_arr[row]]);
763
+ }
764
+ }
765
+
766
+ else if (workspace.impute_map.size())
767
+ {
768
+ if (!workspace.weights_arr.size() && !workspace.weights_map.size())
769
+ {
770
+ for (size_t row = workspace.st; row <= workspace.end; row++)
771
+ if (input_data.has_missing[workspace.ix_arr[row]])
772
+ add_from_impute_node(imputer,
773
+ workspace.impute_map[workspace.ix_arr[row]],
774
+ (double)1);
775
+ }
776
+
777
+ else if (workspace.weights_arr.size())
778
+ {
779
+ for (size_t row = workspace.st; row <= workspace.end; row++)
780
+ if (input_data.has_missing[workspace.ix_arr[row]])
781
+ add_from_impute_node(imputer,
782
+ workspace.impute_map[workspace.ix_arr[row]],
783
+ workspace.weights_arr[workspace.ix_arr[row]]);
784
+ }
785
+
786
+ else
787
+ {
788
+ for (size_t row = workspace.st; row <= workspace.end; row++)
789
+ if (input_data.has_missing[workspace.ix_arr[row]])
790
+ add_from_impute_node(imputer,
791
+ workspace.impute_map[workspace.ix_arr[row]],
792
+ workspace.weights_map[workspace.ix_arr[row]]);
793
+ }
794
+ }
795
+ }
796
+
797
+ template <class imp_arr>
798
+ void apply_imputation_results(imp_arr &impute_vec,
799
+ Imputer &imputer,
800
+ InputData &input_data,
801
+ int nthreads)
802
+ {
803
+ size_t col;
804
+
805
+ if (input_data.Xc != NULL)
806
+ {
807
+ std::vector<size_t> row_pos(input_data.nrows, 0);
808
+ size_t row;
809
+
810
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
811
+ {
812
+ for (sparse_ix ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
813
+ {
814
+ if (is_na_or_inf(input_data.Xc[ix]))
815
+ {
816
+ row = input_data.Xc_ind[ix];
817
+ if (impute_vec[row].sp_num_weight[row_pos[row]] > 0 && !is_na_or_inf(impute_vec[row].sp_num_sum[row_pos[row]]))
818
+ input_data.Xc[ix]
819
+ =
820
+ impute_vec[row].sp_num_sum[row_pos[row]]
821
+ /
822
+ impute_vec[row].sp_num_weight[row_pos[row]];
823
+ else
824
+ input_data.Xc[ix]
825
+ =
826
+ imputer.col_means[col];
827
+
828
+ row_pos[row]++;
829
+ }
830
+ }
831
+ }
832
+ }
833
+
834
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(input_data, impute_vec, imputer) private(col)
835
+ for (size_t_for row = 0; row < input_data.nrows; row++)
836
+ {
837
+ if (input_data.has_missing[row])
838
+ {
839
+ for (size_t ix = 0; ix < impute_vec[row].n_missing_num; ix++)
840
+ {
841
+ col = impute_vec[row].missing_num[ix];
842
+ if (impute_vec[row].num_weight[ix] > 0 && !is_na_or_inf(impute_vec[row].num_sum[ix]))
843
+ input_data.numeric_data[row + col * input_data.nrows]
844
+ =
845
+ impute_vec[row].num_sum[ix] / impute_vec[row].num_weight[ix];
846
+ else
847
+ input_data.numeric_data[row + col * input_data.nrows]
848
+ =
849
+ imputer.col_means[col];
850
+ }
851
+
852
+ for (size_t ix = 0; ix < impute_vec[row].n_missing_cat; ix++)
853
+ {
854
+ col = impute_vec[row].missing_cat[ix];
855
+ input_data.categ_data[row + col * input_data.nrows]
856
+ =
857
+ std::distance(impute_vec[row].cat_sum[col].begin(),
858
+ std::max_element(impute_vec[row].cat_sum[col].begin(),
859
+ impute_vec[row].cat_sum[col].end()));
860
+
861
+ if (input_data.categ_data[row + col * input_data.nrows] == 0 && impute_vec[row].cat_sum[col][0] <= 0)
862
+ input_data.categ_data[row + col * input_data.nrows]
863
+ =
864
+ imputer.col_modes[col];
865
+ }
866
+ }
867
+ }
868
+ }
869
+
870
+ void apply_imputation_results(std::vector<ImputedData> &impute_vec,
871
+ std::unordered_map<size_t, ImputedData> &impute_map,
872
+ Imputer &imputer,
873
+ InputData &input_data,
874
+ int nthreads)
875
+ {
876
+ if (impute_vec.size())
877
+ apply_imputation_results(impute_vec, imputer, input_data, nthreads);
878
+ else if (impute_map.size())
879
+ apply_imputation_results(impute_map, imputer, input_data, nthreads);
880
+ }
881
+
882
+
883
+ void apply_imputation_results(PredictionData &prediction_data,
884
+ ImputedData &imp,
885
+ Imputer &imputer,
886
+ size_t row)
887
+ {
888
+ size_t col;
889
+ size_t pos = 0;
890
+ for (size_t ix = 0; ix < imp.n_missing_num; ix++)
891
+ {
892
+ col = imp.missing_num[ix];
893
+ if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
894
+ prediction_data.numeric_data[row + col * prediction_data.nrows]
895
+ =
896
+ imp.num_sum[ix] / imp.num_weight[ix];
897
+ else
898
+ prediction_data.numeric_data[row + col * prediction_data.nrows]
899
+ =
900
+ imputer.col_means[col];
901
+ }
902
+
903
+ if (prediction_data.Xr != NULL)
904
+ for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
905
+ {
906
+ if (is_na_or_inf(prediction_data.Xr[ix]))
907
+ {
908
+ if (imp.sp_num_weight[pos] > 0 && !is_na_or_inf(imp.sp_num_sum[pos]))
909
+ prediction_data.Xr[ix]
910
+ =
911
+ imp.sp_num_sum[pos] / imp.sp_num_weight[pos];
912
+ else
913
+ prediction_data.Xr[ix]
914
+ =
915
+ imputer.col_means[imp.missing_sp[pos]];
916
+ pos++;
917
+ }
918
+ }
919
+
920
+ for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
921
+ {
922
+ col = imp.missing_cat[ix];
923
+ prediction_data.categ_data[row + col * prediction_data.nrows]
924
+ =
925
+ std::distance(imp.cat_sum[col].begin(),
926
+ std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
927
+
928
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0 && imp.cat_sum[col][0] <= 0)
929
+ prediction_data.categ_data[row + col * prediction_data.nrows]
930
+ =
931
+ imputer.col_modes[col];
932
+ }
933
+ }
934
+
935
+
936
+ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
937
+ {
938
+ imp.n_missing_num = 0;
939
+ imp.n_missing_cat = 0;
940
+ imp.n_missing_sp = 0;
941
+
942
+ if (input_data.numeric_data != NULL)
943
+ {
944
+ imp.missing_num.resize(input_data.ncols_numeric);
945
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
946
+ if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
947
+ imp.missing_num[imp.n_missing_num++] = col;
948
+ imp.missing_num.resize(imp.n_missing_num);
949
+ imp.num_sum.assign(imp.n_missing_num, 0);
950
+ imp.num_weight.assign(imp.n_missing_num, 0);
951
+ }
952
+
953
+ else if (input_data.Xc != NULL)
954
+ {
955
+ imp.missing_sp.resize(input_data.ncols_numeric);
956
+ sparse_ix *res;
957
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
958
+ {
959
+ res = std::lower_bound(input_data.Xc_ind + input_data.Xc_indptr[col],
960
+ input_data.Xc_ind + input_data.Xc_indptr[col + 1],
961
+ (sparse_ix) row);
962
+ if (
963
+ res != input_data.Xc_ind + input_data.Xc_indptr[col + 1] &&
964
+ *res == row &&
965
+ is_na_or_inf(input_data.Xc[res - input_data.Xc_ind])
966
+ )
967
+ {
968
+ imp.missing_sp[imp.n_missing_sp++] = col;
969
+ }
970
+ }
971
+ imp.sp_num_sum.assign(imp.n_missing_sp, 0);
972
+ imp.sp_num_weight.assign(imp.n_missing_sp, 0);
973
+ }
974
+
975
+ if (input_data.categ_data != NULL)
976
+ {
977
+ imp.missing_cat.resize(input_data.ncols_categ);
978
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
979
+ if (input_data.categ_data[row + col * input_data.nrows] < 0)
980
+ imp.missing_cat[imp.n_missing_cat++] = col;
981
+ imp.missing_cat.resize(imp.n_missing_cat);
982
+ imp.cat_weight.assign(imp.n_missing_cat, 0);
983
+ imp.cat_sum.resize(input_data.ncols_categ);
984
+ for (size_t cat = 0; cat < imp.n_missing_cat; cat++)
985
+ imp.cat_sum[imp.missing_cat[cat]].assign(input_data.ncat[imp.missing_cat[cat]], 0);
986
+ }
987
+ }
988
+
989
+ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row)
990
+ {
991
+ imp.n_missing_num = 0;
992
+ imp.n_missing_cat = 0;
993
+ imp.n_missing_sp = 0;
994
+
995
+ if (prediction_data.numeric_data != NULL)
996
+ {
997
+ if (!imp.missing_num.size())
998
+ imp.missing_num.resize(imputer.ncols_numeric);
999
+ for (size_t col = 0; col < imputer.ncols_numeric; col++)
1000
+ if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
1001
+ imp.missing_num[imp.n_missing_num++] = col;
1002
+
1003
+ if (!imp.num_sum.size())
1004
+ {
1005
+ imp.num_sum.resize(imputer.ncols_numeric, 0);
1006
+ imp.num_weight.resize(imputer.ncols_numeric, 0);
1007
+ }
1008
+
1009
+ else
1010
+ {
1011
+ std::fill(imp.num_sum.begin(), imp.num_sum.begin() + imp.n_missing_num, 0);
1012
+ std::fill(imp.num_weight.begin(), imp.num_weight.begin() + imp.n_missing_num, 0);
1013
+ }
1014
+ }
1015
+
1016
+ else if (prediction_data.Xr != NULL)
1017
+ {
1018
+ if (!imp.missing_sp.size())
1019
+ imp.missing_sp.resize(imputer.ncols_numeric);
1020
+ for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1021
+ if (is_na_or_inf(prediction_data.Xr[ix]))
1022
+ imp.missing_sp[imp.n_missing_sp++] = prediction_data.Xr_ind[ix];
1023
+
1024
+ if (!imp.sp_num_sum.size())
1025
+ {
1026
+ imp.sp_num_sum.resize(imputer.ncols_numeric, 0);
1027
+ imp.sp_num_weight.resize(imputer.ncols_numeric, 0);
1028
+ }
1029
+
1030
+ else
1031
+ {
1032
+ std::fill(imp.sp_num_sum.begin(), imp.sp_num_sum.begin() + imp.n_missing_sp, 0);
1033
+ std::fill(imp.sp_num_weight.begin(), imp.sp_num_weight.begin() + imp.n_missing_sp, 0);
1034
+ }
1035
+ }
1036
+
1037
+ if (prediction_data.categ_data != NULL)
1038
+ {
1039
+ if (!imp.missing_cat.size())
1040
+ imp.missing_cat.resize(imputer.ncols_categ);
1041
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1042
+ {
1043
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
1044
+ imp.missing_cat[imp.n_missing_cat++] = col;
1045
+ }
1046
+
1047
+ if (!imp.cat_weight.size())
1048
+ {
1049
+ imp.cat_weight.resize(imputer.ncols_categ, 0);
1050
+ imp.cat_sum.resize(imputer.ncols_categ);
1051
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1052
+ imp.cat_sum[col].resize(imputer.ncat[col], 0);
1053
+ }
1054
+
1055
+ else
1056
+ {
1057
+ std::fill(imp.cat_weight.begin(), imp.cat_weight.begin() + imp.n_missing_cat, 0);
1058
+ for (size_t col = 0; col < imp.n_missing_cat; col++)
1059
+ std::fill(imp.cat_sum[imp.missing_cat[col]].begin(),
1060
+ imp.cat_sum[imp.missing_cat[col]].end(),
1061
+ 0);
1062
+ }
1063
+ }
1064
+ }
1065
+
1066
+ ImputedData::ImputedData(InputData &input_data, size_t row)
1067
+ {
1068
+ initialize_impute_calc(*this, input_data, row);
1069
+ }
1070
+
1071
+ void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads)
1072
+ {
1073
+ impute_vec.resize(input_data.nrows);
1074
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(impute_vec, input_data)
1075
+ for (size_t_for row = 0; row < input_data.nrows; row++)
1076
+ if (input_data.has_missing[row])
1077
+ initialize_impute_calc(impute_vec[row], input_data, row);
1078
+ }
1079
+
1080
+
1081
+ void allocate_imp_map(std::unordered_map<size_t, ImputedData> &impute_map, InputData &input_data)
1082
+ {
1083
+ for (size_t row = 0; row < input_data.nrows; row++)
1084
+ if (input_data.has_missing[row])
1085
+ impute_map[row] = ImputedData(input_data, row);
1086
+ }
1087
+
1088
+ void allocate_imp(InputData &input_data,
1089
+ std::vector<ImputedData> &impute_vec,
1090
+ std::unordered_map<size_t, ImputedData> &impute_map,
1091
+ int nthreads)
1092
+ {
1093
+ if (input_data.n_missing == 0)
1094
+ return;
1095
+ else if (input_data.n_missing <= input_data.nrows / (nthreads * 10))
1096
+ allocate_imp_map(impute_map, input_data);
1097
+ else
1098
+ allocate_imp_vec(impute_vec, input_data, nthreads);
1099
+ }
1100
+
1101
+ void check_for_missing(InputData &input_data,
1102
+ std::vector<ImputedData> &impute_vec,
1103
+ std::unordered_map<size_t, ImputedData> &impute_map,
1104
+ int nthreads)
1105
+ {
1106
+ input_data.has_missing.assign(input_data.nrows, false);
1107
+
1108
+ if (input_data.Xc != NULL)
1109
+ {
1110
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
1111
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
1112
+ for (size_t_for ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
1113
+ if (is_na_or_inf(input_data.Xc[ix]))
1114
+ input_data.has_missing[input_data.Xc_ind[ix]] = true;
1115
+ #pragma omp barrier
1116
+ }
1117
+
1118
+ if (input_data.numeric_data != NULL || input_data.categ_data != NULL)
1119
+ {
1120
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data)
1121
+ for (size_t_for row = 0; row < input_data.nrows; row++)
1122
+ {
1123
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
1124
+ {
1125
+ if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
1126
+ {
1127
+ input_data.has_missing[row] = true;
1128
+ break;
1129
+ }
1130
+ }
1131
+
1132
+ if (!input_data.has_missing[row])
1133
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
1134
+ {
1135
+ if (input_data.categ_data[row + col * input_data.nrows] < 0)
1136
+ {
1137
+ input_data.has_missing[row] = true;
1138
+ break;
1139
+ }
1140
+ }
1141
+ }
1142
+ }
1143
+
1144
+ input_data.n_missing = std::accumulate(input_data.has_missing.begin(), input_data.has_missing.end(), (size_t)0);
1145
+ allocate_imp(input_data, impute_vec, impute_map, nthreads);
1146
+ }
1147
+
1148
+ size_t check_for_missing(PredictionData &prediction_data,
1149
+ Imputer &imputer,
1150
+ size_t ix_arr[],
1151
+ int nthreads)
1152
+ {
1153
+ std::vector<char> has_missing(prediction_data.nrows, false);
1154
+
1155
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(has_missing, prediction_data, imputer)
1156
+ for (size_t_for row = 0; row < prediction_data.nrows; row++)
1157
+ {
1158
+ if (prediction_data.numeric_data != NULL)
1159
+ for (size_t col = 0; col < imputer.ncols_numeric; col++)
1160
+ {
1161
+ if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
1162
+ {
1163
+ has_missing[row] = true;
1164
+ break;
1165
+ }
1166
+ }
1167
+ else if (prediction_data.Xr != NULL)
1168
+ for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1169
+ {
1170
+ if (is_na_or_inf(prediction_data.Xr[ix]))
1171
+ {
1172
+ has_missing[row] = true;
1173
+ break;
1174
+ }
1175
+ }
1176
+
1177
+ if (!has_missing[row])
1178
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1179
+ {
1180
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
1181
+ {
1182
+ has_missing[row] = true;
1183
+ break;
1184
+ }
1185
+ }
1186
+ }
1187
+
1188
+ size_t st = 0;
1189
+ size_t temp;
1190
+ for (size_t row = 0; row < prediction_data.nrows; row++)
1191
+ {
1192
+ if (has_missing[row])
1193
+ {
1194
+ temp = ix_arr[st];
1195
+ ix_arr[st] = ix_arr[row];
1196
+ ix_arr[row] = temp;
1197
+ st++;
1198
+ }
1199
+ }
1200
+
1201
+ if (st == 0)
1202
+ return 0;
1203
+
1204
+ return st;
1205
+ }
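+
+ /* A minimal sketch (illustration only; not a function of the library) of the CSR
+    ('row-compressed') convention used for 'Xr' / 'Xr_ind' / 'Xr_indptr' throughout this file:
+    the non-zero entries of row 'row' occupy positions Xr_indptr[row] through
+    Xr_indptr[row + 1] - 1 of 'Xr', and 'Xr_ind' holds the column index of each such entry. */
+ static double example_csr_row_sum(double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[], size_t row)
+ {
+     double total = 0;
+     for (sparse_ix ix = Xr_indptr[row]; ix < Xr_indptr[row + 1]; ix++)
+         total += Xr[ix];    /* Xr_ind[ix] would give the column of this entry */
+     (void) Xr_ind;
+     return total;
+ }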