isotree 0.1.0

@@ -0,0 +1,309 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+ void decide_column(size_t ncols_numeric, size_t ncols_categ, size_t &col_chosen, ColType &col_type,
48
+ RNG_engine &rnd_generator, std::uniform_int_distribution<size_t> &runif,
49
+ std::discrete_distribution<size_t> &col_sampler)
50
+ {
51
+ if (!col_sampler.max())
52
+ col_chosen = runif(rnd_generator);
53
+ else
54
+ col_chosen = col_sampler(rnd_generator);
55
+
56
+ if (col_chosen >= ncols_numeric)
57
+ {
58
+ col_chosen -= ncols_numeric;
59
+ col_type = Categorical;
60
+ }
61
+
62
+ else { col_type = Numeric; }
63
+ }
64
+
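/* Minimal sketch (illustrative helper name only) of the combined column-index
   convention used by 'decide_column' above: indices in [0, ncols_numeric) denote
   numeric columns, indices in [ncols_numeric, ncols_numeric + ncols_categ) denote
   categorical columns, and a categorical pick is re-based by subtracting
   'ncols_numeric' before being used to index the categorical arrays. */
size_t toy_decide(size_t ncols_numeric, size_t ncols_categ, RNG_engine &rnd_generator, ColType &col_type)
{
    std::uniform_int_distribution<size_t> runif(0, ncols_numeric + ncols_categ - 1);
    size_t col_chosen = runif(rnd_generator);
    if (col_chosen >= ncols_numeric)
    {
        col_type = Categorical;
        col_chosen -= ncols_numeric;   /* now an index into the categorical columns */
    }
    else { col_type = Numeric; }
    return col_chosen;
}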
65
+ void add_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data)
66
+ {
67
+ if (tree.col_type == Numeric)
68
+ workspace.cols_possible[tree.col_num] = false;
69
+ else
70
+ workspace.cols_possible[tree.col_num + input_data.ncols_numeric] = false;
71
+ }
72
+
73
+ void add_unsplittable_col(WorkerMemory &workspace, InputData &input_data)
74
+ {
75
+ if (workspace.col_type == Numeric)
76
+ workspace.cols_possible[workspace.col_chosen] = false;
77
+ else
78
+ workspace.cols_possible[workspace.col_chosen + input_data.ncols_numeric] = false;
79
+ }
80
+
81
+ bool check_is_not_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data)
82
+ {
83
+ if (tree.col_type == Numeric)
84
+ return workspace.cols_possible[tree.col_num];
85
+ else
86
+ return workspace.cols_possible[tree.col_num + input_data.ncols_numeric];
87
+ }
88
+
89
+ /* for use in regular model */
90
+ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params, IsoTree &tree)
91
+ {
92
+ if (tree.col_type == Numeric)
93
+ {
94
+ if (input_data.Xc == NULL)
95
+ get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * tree.col_num,
96
+ workspace.st, workspace.end, model_params.missing_action,
97
+ workspace.xmin, workspace.xmax, workspace.unsplittable);
98
+ else
99
+ get_range(workspace.ix_arr.data(), workspace.st, workspace.end, tree.col_num,
100
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
101
+ model_params.missing_action, workspace.xmin, workspace.xmax, workspace.unsplittable);
102
+ }
103
+
104
+ else
105
+ {
106
+ get_categs(workspace.ix_arr.data(), input_data.categ_data + input_data.nrows * tree.col_num,
107
+ workspace.st, workspace.end, input_data.ncat[tree.col_num],
108
+ model_params.missing_action, workspace.categs.data(), workspace.npresent, workspace.unsplittable);
109
+ }
110
+ }
111
+
112
+ /* for use in extended model */
113
+ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params)
114
+ {
115
+ if (workspace.col_type == Numeric)
116
+ {
117
+ if (input_data.Xc == NULL)
118
+ get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * workspace.col_chosen,
119
+ workspace.st, workspace.end, model_params.missing_action,
120
+ workspace.xmin, workspace.xmax, workspace.unsplittable);
121
+ else
122
+ get_range(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.col_chosen,
123
+ input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
124
+ model_params.missing_action, workspace.xmin, workspace.xmax, workspace.unsplittable);
125
+ }
126
+
127
+ else
128
+ {
129
+ get_categs(workspace.ix_arr.data(), input_data.categ_data + input_data.nrows * workspace.col_chosen,
130
+ workspace.st, workspace.end, input_data.ncat[workspace.col_chosen],
131
+ model_params.missing_action, workspace.categs.data(), workspace.npresent, workspace.unsplittable);
132
+ }
133
+ }
134
+
135
+ int choose_cat_from_present(WorkerMemory &workspace, InputData &input_data, size_t col_num)
136
+ {
137
+ int chosen_cat = std::uniform_int_distribution<int>
138
+ (0, workspace.npresent - 1)
139
+ (workspace.rnd_generator);
140
+ workspace.ncat_tried = 0;
141
+ for (int cat = 0; cat < input_data.ncat[col_num]; cat++)
142
+ {
143
+ if (workspace.categs[cat] > 0)
144
+ {
145
+ if (workspace.ncat_tried == chosen_cat)
146
+ return cat;
147
+ else
148
+ workspace.ncat_tried++;
149
+ }
150
+ }
151
+
152
+ return -1; /* this will never be reached, but CRAN complains otherwise */
153
+ }
154
+
155
+ void update_col_sampler(WorkerMemory &workspace, InputData &input_data)
156
+ {
157
+ if (!workspace.col_sampler.max())
158
+ return;
159
+
160
+ std::vector<double> col_weights = workspace.col_sampler.probabilities();
161
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
162
+ if (!workspace.cols_possible[col])
163
+ col_weights[col] = 0;
164
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
165
+ if (!workspace.cols_possible[col + input_data.ncols_numeric])
166
+ col_weights[col + input_data.ncols_numeric] = 0;
167
+ workspace.col_sampler = std::discrete_distribution<size_t>(col_weights.begin(), col_weights.end());
168
+ }
169
+
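/* Toy demonstration (not part of the library) of the <random> behaviour that
   'decide_column' and 'update_col_sampler' rely on: a default-constructed
   std::discrete_distribution has a single outcome, so its max() is zero, which is
   how an unused sampler is detected; and the distribution normalizes whatever
   non-negative weights it is given, so zeroing a column's weight removes it from
   future draws without re-scaling the remaining weights by hand. */
size_t toy_discrete_demo()
{
    std::mt19937 rng(123);
    std::vector<double> col_weights = {0.5, 0.0, 0.5};   /* middle column marked unsplittable */
    std::discrete_distribution<size_t> col_sampler(col_weights.begin(), col_weights.end());
    return col_sampler(rng);   /* can only return 0 or 2; index 1 has probability zero */
}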
170
+ bool is_col_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
171
+ InputData &input_data, size_t col_num, ColType col_type)
172
+ {
173
+ col_num += ((col_type == Categorical)? 0 : input_data.ncols_categ);
174
+ if (col_is_taken.size())
175
+ return col_is_taken[col_num];
176
+ else
177
+ return col_is_taken_s.find(col_num) != col_is_taken_s.end();
178
+ }
179
+
180
+ void set_col_as_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
181
+ InputData &input_data, size_t col_num, ColType col_type)
182
+ {
183
+ col_num += ((col_type == Categorical)? 0 : input_data.ncols_categ);
184
+ if (col_is_taken.size())
185
+ col_is_taken[col_num] = true;
186
+ else
187
+ col_is_taken_s.insert(col_num);
188
+ }
189
+
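/* Design note on the two helpers above: they support two interchangeable bookkeeping
   containers. The caller fills either a dense std::vector<bool> (convenient when many
   columns get marked) or an std::unordered_set of indices (cheaper when only a few do),
   and whichever container is non-empty is the one consulted. Note also that this
   combined index places numeric columns after the categorical ones (offset by
   'ncols_categ'), which is the reverse of the 'cols_possible' convention used above. */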
190
+ void add_separation_step(WorkerMemory &workspace, InputData &input_data, double remainder)
191
+ {
192
+ if (workspace.weights_arr.size())
193
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
194
+ input_data.nrows, workspace.tmat_sep.data(), workspace.weights_arr.data(), remainder);
195
+ else if (workspace.weights_map.size())
196
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
197
+ input_data.nrows, workspace.tmat_sep.data(), workspace.weights_map, remainder);
198
+ else
199
+ increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
200
+ input_data.nrows, workspace.tmat_sep.data(), remainder);
201
+ }
202
+
203
+ void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, long double sum_weight)
204
+ {
205
+ if (
206
+ ((workspace.end - workspace.st) > 0 && !workspace.weights_arr.size() && !workspace.weights_map.size()) ||
207
+ (sum_weight > 1 && (workspace.weights_arr.size() || workspace.weights_map.size()))
208
+ )
209
+ {
210
+ double expected_dsep;
211
+ if (!workspace.weights_arr.size() && !workspace.weights_map.size())
212
+ expected_dsep = expected_separation_depth(workspace.end - workspace.st + 1);
213
+ else
214
+ expected_dsep = expected_separation_depth(sum_weight);
215
+
216
+ add_separation_step(workspace, input_data, expected_dsep + 1);
217
+ }
218
+ }
219
+
220
+ void remap_terminal_trees(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
221
+ PredictionData &prediction_data, sparse_ix *restrict tree_num, int nthreads)
222
+ {
223
+ size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
224
+ size_t max_tree, curr_term;
225
+ std::vector<sparse_ix> tree_mapping;
226
+ if (model_outputs != NULL)
227
+ {
228
+ max_tree = std::accumulate(model_outputs->trees.begin(),
229
+ model_outputs->trees.end(),
230
+ (size_t)0,
231
+ [](const size_t curr_max, const std::vector<IsoTree> &tr)
232
+ {return std::max(curr_max, tr.size());});
233
+ tree_mapping.resize(max_tree);
234
+ for (size_t tree = 0; tree < ntrees; tree++)
235
+ {
236
+ std::fill(tree_mapping.begin(), tree_mapping.end(), (size_t)0);
237
+ curr_term = 0;
238
+ for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
239
+ if (model_outputs->trees[tree][node].score >= 0)
240
+ tree_mapping[node] = curr_term++;
241
+
242
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(tree_num, tree_mapping, tree, prediction_data)
243
+ for (size_t_for row = 0; row < prediction_data.nrows; row++)
244
+ tree_num[row + tree * prediction_data.nrows] = tree_mapping[tree_num[row + tree * prediction_data.nrows]];
245
+ }
246
+ }
247
+
248
+ else
249
+ {
250
+ max_tree = std::accumulate(model_outputs_ext->hplanes.begin(),
251
+ model_outputs_ext->hplanes.end(),
252
+ (size_t)0,
253
+ [](const size_t curr_max, const std::vector<IsoHPlane> &tr)
254
+ {return std::max(curr_max, tr.size());});
255
+ tree_mapping.resize(max_tree);
256
+ for (size_t tree = 0; tree < ntrees; tree++)
257
+ {
258
+ std::fill(tree_mapping.begin(), tree_mapping.end(), (size_t)0);
259
+ curr_term = 0;
260
+ for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
261
+ if (model_outputs_ext->hplanes[tree][node].score >= 0)
262
+ tree_mapping[node] = curr_term++;
263
+
264
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(tree_num, tree_mapping, tree, prediction_data)
265
+ for (size_t_for row = 0; row < prediction_data.nrows; row++)
266
+ tree_num[row + tree * prediction_data.nrows] = tree_mapping[tree_num[row + tree * prediction_data.nrows]];
267
+ }
268
+ }
269
+ }
270
+
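/* Toy illustration (made-up scores) of the remapping performed above: terminal nodes,
   identified by a non-negative score, are renumbered consecutively in order of
   appearance, so that a stored node index can be translated into a per-tree terminal
   ordinal. E.g. scores {-1, 0.3, -1, 0.7} produce the mapping {0, 0, 0, 1}. */
std::vector<size_t> toy_terminal_mapping(const std::vector<double> &node_scores)
{
    std::vector<size_t> mapping(node_scores.size(), 0);
    size_t curr_term = 0;
    for (size_t node = 0; node < node_scores.size(); node++)
        if (node_scores[node] >= 0)    /* terminal node */
            mapping[node] = curr_term++;
    return mapping;
}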
271
+ void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state)
272
+ {
273
+ recursion_state.st = workspace.st;
274
+ recursion_state.st_NA = workspace.st_NA;
275
+ recursion_state.end_NA = workspace.end_NA;
276
+ recursion_state.split_ix = workspace.split_ix;
277
+ recursion_state.end = workspace.end;
278
+ recursion_state.cols_possible = workspace.cols_possible;
279
+ recursion_state.col_sampler = workspace.col_sampler;
280
+
281
+ /* for the extended model, it's not necessary to copy everything */
282
+ if (!workspace.comb_val.size())
283
+ {
284
+ /* TODO: here only need to copy the left half, as the right one is untouched */
285
+ recursion_state.ix_arr = workspace.ix_arr;
286
+ recursion_state.weights_map = workspace.weights_map;
287
+ recursion_state.weights_arr = workspace.weights_arr;
288
+ }
289
+ }
290
+
291
+
292
+ void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state)
293
+ {
294
+ workspace.st = recursion_state.st;
295
+ workspace.st_NA = recursion_state.st_NA;
296
+ workspace.end_NA = recursion_state.end_NA;
297
+ workspace.split_ix = recursion_state.split_ix;
298
+ workspace.end = recursion_state.end;
299
+ workspace.cols_possible = std::move(recursion_state.cols_possible);
300
+ workspace.col_sampler = std::move(recursion_state.col_sampler);
301
+
302
+ if (!workspace.comb_val.size())
303
+ {
304
+ /* TODO: here only need to copy the left half, as the right one is untouched */
305
+ workspace.ix_arr = std::move(recursion_state.ix_arr);
306
+ workspace.weights_map = std::move(recursion_state.weights_map);
307
+ workspace.weights_arr = std::move(recursion_state.weights_arr);
308
+ }
309
+ }
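/* Sketch of the save/restore pattern the two functions above support (the call sites
   live in another file; this usage is assumed, not shown here): the worker buffers are
   reused for both branches of a split, so the relevant fields are backed up before
   descending into one branch and restored before descending into the other. */
void toy_recurse_both_branches(WorkerMemory &workspace)
{
    RecursionState recursion_state;
    backup_recursion_state(workspace, recursion_state);
    /* ... process the first branch, which overwrites parts of 'workspace' ... */
    restore_recursion_state(workspace, recursion_state);
    /* ... process the second branch starting from the restored state ... */
}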
@@ -0,0 +1,1205 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+ #include "isotree.hpp"
46
+
47
+
48
+ /* TODO: this file is a complete mess, needs a refactor from scratch along with the data structs */
49
+
50
+ /* Impute missing values in new data
51
+ *
52
+ * Parameters
53
+ * ==========
54
+ * - numeric_data[nrows * ncols_numeric] (in, out)
55
+ * Pointer to numeric data in which missing values will be imputed. Must be ordered by columns like Fortran,
56
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
57
+ * and the column order must be the same as in the data that was used to fit the model.
58
+ * Pass NULL if there are no dense numeric columns.
59
+ * Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
60
+ * Imputations will overwrite values in this same array.
61
+ * - ncols_numeric
62
+ * Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
63
+ * - categ_data[nrows * ncols_categ]
64
+ * Pointer to categorical data in which missing values will be imputed. Must be ordered by columns like Fortran,
65
+ * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
66
+ * and the column order must be the same as in the data that was used to fit the model.
67
+ * Pass NULL if there are no categorical columns.
68
+ * Each category should be represented as an integer, and these integers must start at zero and
69
+ * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
70
+ * present when the model was fit (note that they are not treated as being ordinal, this is just
71
+ * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
72
+ * must be the same as was used in the data to which the model was fit.
73
+ * Imputations will overwrite values in this same array.
74
+ * - ncols_categ
75
+ * Number of categorical columns in the data.
76
+ * - ncat[ncols_categ]
77
+ * Number of categories in each categorical column. E.g. if the highest code for a column is '4',
78
+ * the number of categories for that column is '5' (zero is one category).
79
+ * Must be the same as was passed to 'fit_iforest'.
80
+ * - Xr[nnz] (in, out)
81
+ * Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
82
+ * Pass NULL if there are no sparse numeric columns.
83
+ * Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
84
+ * Imputations will overwrite values in this same array.
85
+ * - Xr_ind[nnz]
86
+ * Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
87
+ * Pass NULL if there are no sparse numeric columns in CSR format.
88
+ * - Xr_indptr[nrows + 1]
89
+ * Pointer to row index pointers: entry [row] tells where row 'row' starts,
90
+ * and entry [row + 1] where it ends.
91
+ * Pass NULL if there are no sparse numeric columns in CSR format.
92
+ * - nrows
93
+ * Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
94
+ * - nthreads
95
+ * Number of parallel threads to use. Note that the more threads are used, the more memory will be
96
+ * allocated, even if a given thread does not end up being used. Ignored when not building with
97
+ * OpenMP support.
98
+ * - model_outputs
99
+ * Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
100
+ * if the predictions are to be made from an extended model. Can only pass one of
101
+ * 'model_outputs' and 'model_outputs_ext'.
102
+ * - model_outputs_ext
103
+ * Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
104
+ * if the predictions are to be made from a single-variable model. Can only pass one of
105
+ * 'model_outputs' and 'model_outputs_ext'.
106
+ * - impute_nodes
107
+ * Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
108
+ * as produced from function 'fit_iforest'.
109
+ */
110
+ void impute_missing_values(double numeric_data[], int categ_data[],
111
+ double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
112
+ size_t nrows, int nthreads,
113
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
114
+ Imputer &imputer)
115
+ {
116
+ PredictionData prediction_data = {numeric_data, categ_data, nrows,
117
+ NULL, NULL, NULL,
118
+ Xr, Xr_ind, Xr_indptr};
119
+
120
+ std::vector<size_t> ix_arr(nrows);
121
+ std::iota(ix_arr.begin(), ix_arr.end(), (size_t) 0);
122
+
123
+ size_t end = check_for_missing(prediction_data, imputer, ix_arr.data(), nthreads);
124
+
125
+ if (end == 0)
126
+ return;
127
+
128
+ if ((size_t)nthreads > end)
129
+ nthreads = (int)end;
130
+ #ifdef _OPENMP
131
+ std::vector<ImputedData> imp_memory(nthreads);
132
+ #else
133
+ std::vector<ImputedData> imp_memory(1);
134
+ #endif
135
+
136
+
137
+ if (model_outputs != NULL)
138
+ {
139
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
140
+ shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer)
141
+ for (size_t_for row = 0; row < end; row++)
142
+ {
143
+ initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
144
+
145
+ for (std::vector<IsoTree> &tree : model_outputs->trees)
146
+ {
147
+ traverse_itree(tree,
148
+ *model_outputs,
149
+ prediction_data,
150
+ &imputer.imputer_tree[&tree - &(model_outputs->trees[0])],
151
+ &imp_memory[omp_get_thread_num()],
152
+ (double) 1,
153
+ ix_arr[row],
154
+ NULL,
155
+ (size_t) 0);
156
+ }
157
+
158
+ apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
159
+
160
+ }
161
+ }
162
+
163
+ else
164
+ {
165
+ double temp;
166
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
167
+ shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer) \
168
+ private(temp)
169
+ for (size_t_for row = 0; row < end; row++)
170
+ {
171
+ initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
172
+
173
+ for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
174
+ {
175
+ traverse_hplane(hplane,
176
+ *model_outputs_ext,
177
+ prediction_data,
178
+ temp,
179
+ &imputer.imputer_tree[&hplane - &(model_outputs_ext->hplanes[0])],
180
+ &imp_memory[omp_get_thread_num()],
181
+ NULL,
182
+ ix_arr[row]);
183
+ }
184
+
185
+ apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
186
+
187
+ }
188
+ }
189
+
190
+ }
191
+
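/* Small sketch of the data layouts the documentation above assumes (toy helper names,
   plain size_t indices standing in for 'sparse_ix'): dense arrays are column-major,
   so column 'col' of row 'row' lives at numeric_data[row + col * nrows]; the CSR
   triplet (Xr, Xr_ind, Xr_indptr) stores, for each row, its explicitly-kept values
   and their column indices. */
double toy_get_dense(const double numeric_data[], size_t nrows, size_t row, size_t col)
{
    return numeric_data[row + col * nrows];   /* Fortran-style / column-major indexing */
}

double toy_get_csr(const double Xr[], const size_t Xr_ind[], const size_t Xr_indptr[],
                   size_t row, size_t col)
{
    for (size_t ix = Xr_indptr[row]; ix < Xr_indptr[row + 1]; ix++)
        if (Xr_ind[ix] == col)
            return Xr[ix];                    /* explicitly-stored entry */
    return 0.;                                /* implicit zero */
}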
192
+ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads)
193
+ {
194
+ imputer.ncols_numeric = input_data.ncols_numeric;
195
+ imputer.ncols_categ = input_data.ncols_categ;
196
+ imputer.ncat.assign(input_data.ncat, input_data.ncat + input_data.ncols_categ);
197
+ if (imputer.col_means.size())
198
+ {
199
+ imputer.col_means.resize(input_data.ncols_numeric);
200
+ std::fill(imputer.col_means.begin(), imputer.col_means.end(), 0);
201
+ }
202
+
203
+ else
204
+ {
205
+ imputer.col_means.resize(input_data.ncols_numeric, 0);
206
+ }
207
+
208
+ imputer.col_modes.resize(input_data.ncols_categ);
209
+ imputer.imputer_tree = std::vector<std::vector<ImputeNode>>(ntrees);
210
+
211
+ size_t offset, cnt;
212
+ if (input_data.numeric_data != NULL)
213
+ {
214
+ #pragma omp parallel for schedule(static) num_threads(nthreads) private(cnt, offset) shared(input_data, imputer)
215
+ for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
216
+ {
217
+ cnt = input_data.nrows;
218
+ offset = col * input_data.nrows;
219
+ for (size_t row = 0; row < input_data.nrows; row++)
220
+ {
221
+ imputer.col_means[col] += (!is_na_or_inf(input_data.numeric_data[row + offset]))?
222
+ input_data.numeric_data[row + offset] : 0;
223
+ cnt -= is_na_or_inf(input_data.numeric_data[row + offset]);
224
+ }
225
+ imputer.col_means[col] /= (long double) cnt;
226
+ }
227
+ }
228
+
229
+ else if (input_data.Xc != NULL)
230
+ {
231
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
232
+ for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
233
+ {
234
+ cnt = input_data.nrows;
235
+ for (size_t ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
236
+ {
237
+ imputer.col_means[col] += (!is_na_or_inf(input_data.Xc[ix]))?
238
+ input_data.Xc[ix] : 0;
239
+ cnt -= is_na_or_inf(input_data.Xc[ix]);
240
+ }
241
+ imputer.col_means[col] /= (long double) cnt;
242
+ }
243
+ }
244
+
245
+ if (input_data.categ_data != NULL)
246
+ {
247
+ std::vector<size_t> cat_counts(input_data.max_categ);
248
+ #pragma omp parallel for schedule(static) num_threads(nthreads) firstprivate(cat_counts) private(offset) shared(input_data, imputer)
249
+ for (size_t_for col = 0; col < input_data.ncols_categ; col++)
250
+ {
251
+ std::fill(cat_counts.begin(), cat_counts.end(), 0);
252
+ offset = col * input_data.nrows;
253
+ for (size_t row = 0; row < input_data.nrows; row++)
254
+ {
255
+ if (input_data.categ_data[row + offset] >= 0)
256
+ cat_counts[input_data.categ_data[row + offset]]++;
257
+ }
258
+ imputer.col_modes[col] = (int) std::distance(cat_counts.begin(),
259
+ std::max_element(cat_counts.begin(),
260
+ cat_counts.begin() + input_data.ncat[col]));
261
+ }
262
+ }
263
+ }
264
+
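/* Toy restatement of the per-column mode computation above, assuming the documented
   encoding where categories are coded 0..ncat-1: the modal category is simply the
   position of the largest count among the first 'ncat' entries. */
int toy_column_mode(const std::vector<size_t> &cat_counts, int ncat)
{
    return (int) std::distance(cat_counts.begin(),
                               std::max_element(cat_counts.begin(), cat_counts.begin() + ncat));
}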
265
+
266
+ /* https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
267
+ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
268
+ InputData &input_data, ModelParams &model_params,
269
+ std::vector<ImputeNode> &imputer_tree,
270
+ size_t curr_depth, size_t min_imp_obs)
271
+ {
272
+ double wsum;
273
+ bool has_weights = workspace.weights_arr.size() || workspace.weights_map.size();
274
+ if (!has_weights)
275
+ wsum = (double)(workspace.end - workspace.st + 1);
276
+ else
277
+ wsum = calculate_sum_weights(workspace.ix_arr, workspace.st, workspace.end, curr_depth,
278
+ workspace.weights_arr, workspace.weights_map);
279
+
280
+ imputer.num_sum.resize(input_data.ncols_numeric, 0);
281
+ imputer.num_weight.resize(input_data.ncols_numeric, 0);
282
+ imputer.cat_sum.resize(input_data.ncols_categ);
283
+ imputer.cat_weight.resize(input_data.ncols_categ, 0);
284
+ imputer.num_sum.shrink_to_fit();
285
+ imputer.num_weight.shrink_to_fit();
286
+ imputer.cat_sum.shrink_to_fit();
287
+ imputer.cat_weight.shrink_to_fit();
288
+
289
+ /* Note: in theory, 'num_weight' could be initialized to 'wsum',
290
+ and the weight of each missing entry could then be subtracted from it, but due to rounding
291
+ error, this could leave columns with no present observations carrying positive
292
+ weight, or even negative weight, so it's better to add the weight for each row after
293
+ checking for possible NAs, even though it's less computationally efficient.
294
+ For sparse matrices it's done the other way as otherwise it would be too slow. */
295
+
296
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
297
+ {
298
+ imputer.cat_sum[col].resize(input_data.ncat[col]);
299
+ imputer.cat_sum[col].shrink_to_fit();
300
+ }
301
+
302
+ double xnum;
303
+ int xcat;
304
+ double weight;
305
+ size_t ix;
306
+
307
+ if ((input_data.Xc == NULL && input_data.ncols_numeric) || input_data.ncols_categ)
308
+ {
309
+ if (!has_weights)
310
+ {
311
+ size_t cnt;
312
+ if (input_data.numeric_data != NULL)
313
+ {
314
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
315
+ {
316
+ cnt = 0;
317
+ for (size_t row = workspace.st; row <= workspace.end; row++)
318
+ {
319
+ xnum = input_data.numeric_data[workspace.ix_arr[row] + col * input_data.nrows];
320
+ if (!is_na_or_inf(xnum))
321
+ {
322
+ cnt++;
323
+ imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (long double)cnt;
324
+ }
325
+ }
326
+ imputer.num_weight[col] = (double) cnt;
327
+ }
328
+ }
329
+
330
+ if (input_data.categ_data != NULL)
331
+ {
332
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
333
+ {
334
+ cnt = 0;
335
+ for (size_t row = workspace.st; row <= workspace.end; row++)
336
+ {
337
+ xcat = input_data.categ_data[workspace.ix_arr[row] + col * input_data.nrows];
338
+ if (xcat >= 0)
339
+ {
340
+ cnt++;
341
+ imputer.cat_sum[col][xcat]++; /* later gets divided */
342
+ }
343
+ }
344
+ imputer.cat_weight[col] = (double) cnt;
345
+ }
346
+ }
347
+
348
+ }
349
+
350
+ else
351
+ {
352
+ long double prod_sum, corr, val, diff;
353
+ if (input_data.numeric_data != NULL)
354
+ {
355
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
356
+ {
357
+ prod_sum = 0; corr = 0;
358
+ for (size_t row = workspace.st; row <= workspace.end; row++)
359
+ {
360
+ xnum = input_data.numeric_data[workspace.ix_arr[row] + col * input_data.nrows];
361
+ if (!is_na_or_inf(xnum))
362
+ {
363
+ if (workspace.weights_arr.size())
364
+ weight = workspace.weights_arr[workspace.ix_arr[row]];
365
+ else
366
+ weight = workspace.weights_map[workspace.ix_arr[row]];
367
+
368
+ imputer.num_weight[col] += weight; /* these are always <= 1 */
369
+ val = (xnum * weight) - corr;
370
+ diff = prod_sum + val;
371
+ corr = (diff - prod_sum) - val;
372
+ prod_sum = diff;
373
+ }
374
+ }
375
+ imputer.num_sum[col] = prod_sum / imputer.num_weight[col];
376
+ }
377
+ }
378
+
379
+
380
+ if (input_data.ncols_categ)
381
+ {
382
+ for (size_t row = workspace.st; row <= workspace.end; row++)
383
+ {
384
+ ix = workspace.ix_arr[row];
385
+ if (workspace.weights_arr.size())
386
+ weight = workspace.weights_arr[ix];
387
+ else
388
+ weight = workspace.weights_map[ix];
389
+
390
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
391
+ {
392
+ xcat = input_data.categ_data[ix + col * input_data.nrows];
393
+ if (xcat >= 0)
394
+ {
395
+ imputer.cat_sum[col][xcat] += weight; /* later gets divided */
396
+ imputer.cat_weight[col] += weight;
397
+ }
398
+ }
399
+ }
400
+ }
401
+ }
402
+ }
403
+
404
+ if (input_data.Xc != NULL) /* sparse numeric */
405
+ {
406
+ size_t *ix_arr = workspace.ix_arr.data();
407
+ size_t st_col, end_col, ind_end_col, curr_pos;
408
+ std::fill(imputer.num_weight.begin(), imputer.num_weight.end(), wsum);
409
+
410
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
411
+ {
412
+ st_col = input_data.Xc_indptr[col];
413
+ end_col = input_data.Xc_indptr[col + 1] - 1;
414
+ ind_end_col = input_data.Xc_ind[end_col];
415
+ curr_pos = st_col;
416
+ for (size_t *row = std::lower_bound(ix_arr + workspace.st, ix_arr + workspace.end + 1, input_data.Xc_ind[st_col]);
417
+ row != ix_arr + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
418
+ )
419
+ {
420
+ if (input_data.Xc_ind[curr_pos] == *row)
421
+ {
422
+ xnum = input_data.Xc[curr_pos];
423
+ if (workspace.weights_arr.size())
424
+ weight = workspace.weights_arr[*row];
425
+ else if (workspace.weights_map.size())
426
+ weight = workspace.weights_map[*row];
427
+ else
428
+ weight = 1;
429
+
430
+ if (!is_na_or_inf(xnum))
431
+ {
432
+ imputer.num_sum[col] += weight * xnum;
433
+ }
434
+
435
+ else
436
+ {
437
+ imputer.num_weight[col] -= weight;
438
+ }
439
+
440
+ if (row == ix_arr + workspace.end || curr_pos == end_col) break;
441
+ curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos, input_data.Xc_ind + end_col + 1, *(++row)) - input_data.Xc_ind;
442
+ }
443
+
444
+ else
445
+ {
446
+ if (input_data.Xc_ind[curr_pos] > *row)
447
+ row = std::lower_bound(row + 1, ix_arr + workspace.end + 1, input_data.Xc_ind[curr_pos]);
448
+ else
449
+ curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos + 1, input_data.Xc_ind + end_col + 1, *row) - input_data.Xc_ind;
450
+ }
451
+ }
452
+
453
+ imputer.num_sum[col] /= imputer.num_weight[col];
454
+ }
455
+ }
456
+
457
+ /* if any value is not possible to impute, look it up from the parent tree, but assign a lesser weight
458
+ Note: in theory, the parent node should always have some imputation value for every variable, but due to
459
+ numeric rounding errors, it might have a weight of zero, so in those cases it's looked up higher up the
460
+ tree instead. */
461
+ size_t look_aboves, curr_tree;
462
+ double min_imp_obs_dbl = (double) min_imp_obs;
463
+ if (imputer.num_sum.size())
464
+ {
465
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
466
+ {
467
+ if (imputer.num_weight[col] < min_imp_obs_dbl)
468
+ {
469
+ look_aboves = 1;
470
+ curr_tree = imputer.parent;
471
+ while (true)
472
+ {
473
+ if (!is_na_or_inf(imputer_tree[curr_tree].num_sum[col]))
474
+ {
475
+ imputer.num_sum[col] = imputer_tree[curr_tree].num_sum[col] / imputer_tree[curr_tree].num_weight[col];
476
+ imputer.num_weight[col] = wsum / (double)(2 * look_aboves);
477
+ break;
478
+ }
479
+
480
+ else if (curr_tree > 0)
481
+ {
482
+ curr_tree = imputer_tree[curr_tree].parent;
483
+ look_aboves++;
484
+ }
485
+
486
+ else /* will only happen if every single value is missing */
487
+ {
488
+ imputer.num_sum[col] = NAN;
489
+ imputer.num_weight[col] = 0;
490
+ break;
491
+ }
492
+ }
493
+ }
494
+ }
495
+ }
496
+
497
+ if (imputer.cat_sum.size())
498
+ {
499
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
500
+ {
501
+ if (imputer.cat_weight[col] >= min_imp_obs_dbl)
502
+ {
503
+ for (double &cat : imputer.cat_sum[col])
504
+ cat /= imputer.cat_weight[col];
505
+ }
506
+
507
+ else
508
+ {
509
+ look_aboves = 1;
510
+ curr_tree = imputer.parent;
511
+ while (true)
512
+ {
513
+ if (imputer_tree[curr_tree].cat_weight[col] > 0)
514
+ {
515
+ for (int cat = 0; cat < input_data.ncat[col]; cat++)
516
+ {
517
+ imputer.cat_sum[col][cat] += imputer_tree[curr_tree].cat_sum[col][cat] / imputer.cat_weight[col];
518
+ imputer.cat_weight[col] = wsum / (double)(2 * look_aboves);
519
+ }
520
+ break;
521
+ }
522
+
523
+ else if (curr_tree > 0)
524
+ {
525
+ curr_tree = imputer_tree[curr_tree].parent;
526
+ look_aboves++;
527
+ }
528
+
529
+ else /* will only happen if every single value is missing */
530
+ {
531
+ break;
532
+ }
533
+ }
534
+ imputer.cat_weight[col] = std::accumulate(imputer.cat_sum[col].begin(),
535
+ imputer.cat_sum[col].end(),
536
+ (double) 0);
537
+ }
538
+ }
539
+ }
540
+
541
+ /* re-adjust the weights according to parameters
542
+ (note that by this point, the weights are a sum) */
543
+ switch(model_params.weigh_imp_rows)
544
+ {
545
+ case Inverse:
546
+ {
547
+ double wsum_div = wsum * sqrt(wsum);
548
+ for (double &w : imputer.num_weight)
549
+ w /= wsum_div;
550
+
551
+ for (double &w : imputer.cat_weight)
552
+ w /= wsum_div;
553
+ break;
554
+ }
555
+
556
+ case Flat:
557
+ {
558
+ for (double &w : imputer.num_weight)
559
+ w /= wsum;
560
+ for (double &w : imputer.cat_weight)
561
+ w /= wsum;
562
+ break;
563
+ }
564
+
565
+ /* TODO: maybe divide by nrows for prop */
566
+ }
567
+
568
+ double curr_depth_dbl = (double) (curr_depth + 1);
569
+ switch(model_params.depth_imp)
570
+ {
571
+ case Lower:
572
+ {
573
+ for (double &w : imputer.num_weight)
574
+ w /= curr_depth_dbl;
575
+ for (double &w : imputer.cat_weight)
576
+ w /= curr_depth_dbl;
577
+ break;
578
+ }
579
+
580
+ case Higher:
581
+ {
582
+ for (double &w : imputer.num_weight)
583
+ w *= curr_depth_dbl;
584
+ for (double &w : imputer.cat_weight)
585
+ w *= curr_depth_dbl;
586
+ break;
587
+ }
588
+ }
589
+
590
+ /* now re-adjust sums */
591
+ if (model_params.weigh_imp_rows != Prop || model_params.depth_imp != Same)
592
+ {
593
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
594
+ imputer.num_sum[col] *= imputer.num_weight[col];
595
+
596
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
597
+ for (int cat = 0; cat < input_data.ncat[col]; cat++)
598
+ imputer.cat_sum[col][cat] *= imputer.cat_weight[col];
599
+ }
600
+ }
601
+
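/* Self-contained sketch of the Kahan (compensated) summation used in the weighted
   branch of 'build_impute_node' above: the running correction term recovers the
   low-order bits that a plain accumulation would lose. */
long double toy_kahan_sum(const double x[], size_t n)
{
    long double sum = 0, corr = 0;
    for (size_t i = 0; i < n; i++)
    {
        long double val  = (long double)x[i] - corr;   /* apply the previous correction */
        long double next = sum + val;                  /* low-order bits of 'val' may be lost here */
        corr = (next - sum) - val;                     /* recover what was lost */
        sum  = next;
    }
    return sum;
}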
602
+
603
+ void shrink_impute_node(ImputeNode &imputer)
604
+ {
605
+ imputer.num_sum.clear();
606
+ imputer.num_weight.clear();
607
+ imputer.cat_sum.clear();
608
+ imputer.cat_weight.clear();
609
+
610
+ imputer.num_sum.shrink_to_fit();
611
+ imputer.num_weight.shrink_to_fit();
612
+ imputer.cat_sum.shrink_to_fit();
613
+ imputer.cat_weight.shrink_to_fit();
614
+ }
615
+
616
+ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
617
+ std::vector<IsoTree> *trees,
618
+ std::vector<IsoHPlane> *hplanes)
619
+ {
620
+ if (trees != NULL)
621
+ {
622
+ for (size_t tr = 0; tr < trees->size(); tr++)
623
+ {
624
+ if ((*trees)[tr].score <= 0)
625
+ {
626
+ shrink_impute_node(imputer_tree[tr]);
627
+ }
628
+
629
+ else
630
+ {
631
+ /* cat_weight is not needed for anything else */
632
+ imputer_tree[tr].cat_weight.clear();
633
+ imputer_tree[tr].cat_weight.shrink_to_fit();
634
+ }
635
+ }
636
+ }
637
+
638
+ else
639
+ {
640
+ for (size_t tr = 0; tr < hplanes->size(); tr++)
641
+ {
642
+ if ((*hplanes)[tr].score <= 0)
643
+ {
644
+ shrink_impute_node(imputer_tree[tr]);
645
+ }
646
+
647
+ else
648
+ {
649
+ /* cat_weight is not needed for anything else */
650
+ imputer_tree[tr].cat_weight.clear();
651
+ imputer_tree[tr].cat_weight.shrink_to_fit();
652
+ }
653
+ }
654
+ }
655
+
656
+ imputer_tree.shrink_to_fit();
657
+ }
658
+
659
+ void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto)
660
+ {
661
+ size_t col;
662
+ for (size_t ix = 0; ix < imp_addfrom.n_missing_num; ix++)
663
+ {
664
+ imp_addto.num_sum[ix] += imp_addfrom.num_sum[ix];
665
+ imp_addto.num_weight[ix] += imp_addfrom.num_weight[ix];
666
+ }
667
+
668
+ for (size_t ix = 0; ix < imp_addfrom.n_missing_cat; ix++)
669
+ {
670
+ col = imp_addfrom.missing_cat[ix];
671
+ for (size_t cat = 0; cat < imp_addto.cat_sum[col].size(); cat++)
672
+ {
673
+ imp_addto.cat_sum[col][cat] += imp_addfrom.cat_sum[col][cat];
674
+ }
675
+ }
676
+
677
+ for (size_t ix = 0; ix < imp_addfrom.n_missing_sp; ix++)
678
+ {
679
+ imp_addto.sp_num_sum[ix] += imp_addfrom.sp_num_sum[ix];
680
+ imp_addto.sp_num_weight[ix] += imp_addfrom.sp_num_weight[ix];
681
+ }
682
+ }
683
+
684
+ void combine_tree_imputations(WorkerMemory &workspace,
685
+ std::vector<ImputedData> &impute_vec,
686
+ std::unordered_map<size_t, ImputedData> &impute_map,
687
+ std::vector<char> &has_missing,
688
+ int nthreads)
689
+ {
690
+ if (workspace.impute_vec.size())
691
+ {
692
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_vec)
693
+ for (size_t_for row = 0; row < has_missing.size(); row++)
694
+ if (has_missing[row])
695
+ combine_imp_single(workspace.impute_vec[row], impute_vec[row]);
696
+ }
697
+
698
+ else if (workspace.impute_map.size())
699
+ {
700
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_map)
701
+ for (size_t_for row = 0; row < has_missing.size(); row++)
702
+ if (has_missing[row])
703
+ combine_imp_single(workspace.impute_map[row], impute_map[row]);
704
+ }
705
+ }
706
+
707
+
708
+ void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w)
709
+ {
710
+ size_t col;
711
+ for (size_t ix = 0; ix < imputed_data.n_missing_num; ix++)
712
+ {
713
+ col = imputed_data.missing_num[ix];
714
+ imputed_data.num_sum[ix] += (!is_na_or_inf(imputer.num_sum[col]))? (w * imputer.num_sum[col]) : 0;
715
+ imputed_data.num_weight[ix] += w * imputer.num_weight[col];
716
+ }
717
+
718
+ for (size_t ix = 0; ix < imputed_data.n_missing_sp; ix++)
719
+ {
720
+ col = imputed_data.missing_sp[ix];
721
+ imputed_data.sp_num_sum[ix] += (!is_na_or_inf(imputer.num_sum[col]))? (w * imputer.num_sum[col]) : 0;
722
+ imputed_data.sp_num_weight[ix] += w * imputer.num_weight[col];
723
+ }
724
+
725
+ for (size_t ix = 0; ix < imputed_data.n_missing_cat; ix++)
726
+ {
727
+ col = imputed_data.missing_cat[ix];
728
+ for (size_t cat = 0; cat < imputer.cat_sum[col].size(); cat++)
729
+ imputed_data.cat_sum[col][cat] += w * imputer.cat_sum[col][cat];
730
+ }
731
+ }
732
+
733
+
734
+ void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data)
735
+ {
736
+ if (workspace.impute_vec.size())
737
+ {
738
+ if (!workspace.weights_arr.size() && !workspace.weights_map.size())
739
+ {
740
+ for (size_t row = workspace.st; row <= workspace.end; row++)
741
+ if (input_data.has_missing[workspace.ix_arr[row]])
742
+ add_from_impute_node(imputer,
743
+ workspace.impute_vec[workspace.ix_arr[row]],
744
+ (double)1);
745
+ }
746
+
747
+ else if (workspace.weights_arr.size())
748
+ {
749
+ for (size_t row = workspace.st; row <= workspace.end; row++)
750
+ if (input_data.has_missing[workspace.ix_arr[row]])
751
+ add_from_impute_node(imputer,
752
+ workspace.impute_vec[workspace.ix_arr[row]],
753
+ workspace.weights_arr[workspace.ix_arr[row]]);
754
+ }
755
+
756
+ else
757
+ {
758
+ for (size_t row = workspace.st; row <= workspace.end; row++)
759
+ if (input_data.has_missing[workspace.ix_arr[row]])
760
+ add_from_impute_node(imputer,
761
+ workspace.impute_vec[workspace.ix_arr[row]],
762
+ workspace.weights_map[workspace.ix_arr[row]]);
763
+ }
764
+ }
765
+
766
+ else if (workspace.impute_map.size())
767
+ {
768
+ if (!workspace.weights_arr.size() && !workspace.weights_map.size())
769
+ {
770
+ for (size_t row = workspace.st; row <= workspace.end; row++)
771
+ if (input_data.has_missing[workspace.ix_arr[row]])
772
+ add_from_impute_node(imputer,
773
+ workspace.impute_map[workspace.ix_arr[row]],
774
+ (double)1);
775
+ }
776
+
777
+ else if (workspace.weights_arr.size())
778
+ {
779
+ for (size_t row = workspace.st; row <= workspace.end; row++)
780
+ if (input_data.has_missing[workspace.ix_arr[row]])
781
+ add_from_impute_node(imputer,
782
+ workspace.impute_map[workspace.ix_arr[row]],
783
+ workspace.weights_arr[workspace.ix_arr[row]]);
784
+ }
785
+
786
+ else
787
+ {
788
+ for (size_t row = workspace.st; row <= workspace.end; row++)
789
+ if (input_data.has_missing[workspace.ix_arr[row]])
790
+ add_from_impute_node(imputer,
791
+ workspace.impute_map[workspace.ix_arr[row]],
792
+ workspace.weights_map[workspace.ix_arr[row]]);
793
+ }
794
+ }
795
+ }
796
+
797
+ template <class imp_arr>
798
+ void apply_imputation_results(imp_arr &impute_vec,
799
+ Imputer &imputer,
800
+ InputData &input_data,
801
+ int nthreads)
802
+ {
803
+ size_t col;
804
+
805
+ if (input_data.Xc != NULL)
806
+ {
807
+ std::vector<size_t> row_pos(input_data.nrows, 0);
808
+ size_t row;
809
+
810
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
811
+ {
812
+ for (sparse_ix ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
813
+ {
814
+ if (is_na_or_inf(input_data.Xc[ix]))
815
+ {
816
+ row = input_data.Xc_ind[ix];
817
+ if (impute_vec[row].sp_num_weight[row_pos[row]] > 0 && !is_na_or_inf(impute_vec[row].sp_num_sum[row_pos[row]]))
818
+ input_data.Xc[ix]
819
+ =
820
+ impute_vec[row].sp_num_sum[row_pos[row]]
821
+ /
822
+ impute_vec[row].sp_num_weight[row_pos[row]];
823
+ else
824
+ input_data.Xc[ix]
825
+ =
826
+ imputer.col_means[col];
827
+
828
+ row_pos[row]++;
829
+ }
830
+ }
831
+ }
832
+ }
833
+
834
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(input_data, impute_vec, imputer) private(col)
835
+ for (size_t_for row = 0; row < input_data.nrows; row++)
836
+ {
837
+ if (input_data.has_missing[row])
838
+ {
839
+ for (size_t ix = 0; ix < impute_vec[row].n_missing_num; ix++)
840
+ {
841
+ col = impute_vec[row].missing_num[ix];
842
+ if (impute_vec[row].num_weight[ix] > 0 && !is_na_or_inf(impute_vec[row].num_sum[ix]))
843
+ input_data.numeric_data[row + col * input_data.nrows]
844
+ =
845
+ impute_vec[row].num_sum[ix] / impute_vec[row].num_weight[ix];
846
+ else
847
+ input_data.numeric_data[row + col * input_data.nrows]
848
+ =
849
+ imputer.col_means[col];
850
+ }
851
+
852
+ for (size_t ix = 0; ix < impute_vec[row].n_missing_cat; ix++)
853
+ {
854
+ col = impute_vec[row].missing_cat[ix];
855
+ input_data.categ_data[row + col * input_data.nrows]
856
+ =
857
+ std::distance(impute_vec[row].cat_sum[col].begin(),
858
+ std::max_element(impute_vec[row].cat_sum[col].begin(),
859
+ impute_vec[row].cat_sum[col].end()));
860
+
861
+ if (input_data.categ_data[row + col * input_data.nrows] == 0 && impute_vec[row].cat_sum[col][0] <= 0)
862
+ input_data.categ_data[row + col * input_data.nrows]
863
+ =
864
+ imputer.col_modes[col];
865
+ }
866
+ }
867
+ }
868
+ }
869
+
870
+ void apply_imputation_results(std::vector<ImputedData> &impute_vec,
871
+ std::unordered_map<size_t, ImputedData> &impute_map,
872
+ Imputer &imputer,
873
+ InputData &input_data,
874
+ int nthreads)
875
+ {
876
+ if (impute_vec.size())
877
+ apply_imputation_results(impute_vec, imputer, input_data, nthreads);
878
+ else if (impute_map.size())
879
+ apply_imputation_results(impute_map, imputer, input_data, nthreads);
880
+ }
881
+
882
+
883
+ void apply_imputation_results(PredictionData &prediction_data,
884
+ ImputedData &imp,
885
+ Imputer &imputer,
886
+ size_t row)
887
+ {
888
+ size_t col;
889
+ size_t pos = 0;
890
+ for (size_t ix = 0; ix < imp.n_missing_num; ix++)
891
+ {
892
+ col = imp.missing_num[ix];
893
+ if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
894
+ prediction_data.numeric_data[row + col * prediction_data.nrows]
895
+ =
896
+ imp.num_sum[ix] / imp.num_weight[ix];
897
+ else
898
+ prediction_data.numeric_data[row + col * prediction_data.nrows]
899
+ =
900
+ imputer.col_means[col];
901
+ }
902
+
903
+ if (prediction_data.Xr != NULL)
904
+ for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
905
+ {
906
+ if (is_na_or_inf(prediction_data.Xr[ix]))
907
+ {
908
+ if (imp.sp_num_weight[pos] > 0 && !is_na_or_inf(imp.sp_num_sum[pos]))
909
+ prediction_data.Xr[ix]
910
+ =
911
+ imp.sp_num_sum[pos] / imp.sp_num_weight[pos];
912
+ else
913
+ prediction_data.Xr[ix]
914
+ =
915
+ imputer.col_means[imp.missing_sp[pos]];
916
+ pos++;
917
+ }
918
+ }
919
+
920
+ for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
921
+ {
922
+ col = imp.missing_cat[ix];
923
+ prediction_data.categ_data[row + col * prediction_data.nrows]
924
+ =
925
+ std::distance(imp.cat_sum[col].begin(),
926
+ std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
927
+
928
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0 && imp.cat_sum[col][0] <= 0)
929
+ prediction_data.categ_data[row + col * prediction_data.nrows]
930
+ =
931
+ imputer.col_modes[col];
932
+ }
933
+ }
934
+
935
+
936
+ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
937
+ {
938
+ imp.n_missing_num = 0;
939
+ imp.n_missing_cat = 0;
940
+ imp.n_missing_sp = 0;
941
+
942
+ if (input_data.numeric_data != NULL)
943
+ {
944
+ imp.missing_num.resize(input_data.ncols_numeric);
945
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
946
+ if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
947
+ imp.missing_num[imp.n_missing_num++] = col;
948
+ imp.missing_num.resize(imp.n_missing_num);
949
+ imp.num_sum.assign(imp.n_missing_num, 0);
950
+ imp.num_weight.assign(imp.n_missing_num, 0);
951
+ }
952
+
953
+ else if (input_data.Xc != NULL)
954
+ {
955
+ imp.missing_sp.resize(input_data.ncols_numeric);
956
+ sparse_ix *res;
957
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
958
+ {
959
+ res = std::lower_bound(input_data.Xc_ind + input_data.Xc_indptr[col],
960
+ input_data.Xc_ind + input_data.Xc_indptr[col + 1],
961
+ (sparse_ix) row);
962
+ if (
963
+ res != input_data.Xc_ind + input_data.Xc_indptr[col + 1] &&
964
+ *res == row &&
965
+ is_na_or_inf(input_data.Xc[res - input_data.Xc_ind])
966
+ )
967
+ {
968
+ imp.missing_sp[imp.n_missing_sp++] = col;
969
+ }
970
+ }
971
+ imp.sp_num_sum.assign(imp.n_missing_sp, 0);
972
+ imp.sp_num_weight.assign(imp.n_missing_sp, 0);
973
+ }
974
+
975
+ if (input_data.categ_data != NULL)
976
+ {
977
+ imp.missing_cat.resize(input_data.ncols_categ);
978
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
979
+ if (input_data.categ_data[row + col * input_data.nrows] < 0)
980
+ imp.missing_cat[imp.n_missing_cat++] = col;
981
+ imp.missing_cat.resize(imp.n_missing_cat);
982
+ imp.cat_weight.assign(imp.n_missing_cat, 0);
983
+ imp.cat_sum.resize(input_data.ncols_categ);
984
+ for (size_t cat = 0; cat < imp.n_missing_cat; cat++)
985
+ imp.cat_sum[imp.missing_cat[cat]].assign(input_data.ncat[imp.missing_cat[cat]], 0);
986
+ }
987
+ }
988
+
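/* Sketch of the column-compressed (CSC) lookup pattern used above, with size_t indices
   standing in for 'sparse_ix': the row indices of column 'col' occupy
   Xc_ind[Xc_indptr[col] .. Xc_indptr[col + 1]) in ascending order, so std::lower_bound
   can tell whether a given row has an explicitly-stored entry in that column. */
bool toy_has_explicit_entry(const size_t Xc_ind[], const size_t Xc_indptr[],
                            size_t col, size_t row)
{
    const size_t *first = Xc_ind + Xc_indptr[col];
    const size_t *last  = Xc_ind + Xc_indptr[col + 1];
    const size_t *res   = std::lower_bound(first, last, row);
    return res != last && *res == row;
}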
989
+ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row)
990
+ {
991
+ imp.n_missing_num = 0;
992
+ imp.n_missing_cat = 0;
993
+ imp.n_missing_sp = 0;
994
+
995
+ if (prediction_data.numeric_data != NULL)
996
+ {
997
+ if (!imp.missing_num.size())
998
+ imp.missing_num.resize(imputer.ncols_numeric);
999
+ for (size_t col = 0; col < imputer.ncols_numeric; col++)
1000
+ if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
1001
+ imp.missing_num[imp.n_missing_num++] = col;
1002
+
1003
+ if (!imp.num_sum.size())
1004
+ {
1005
+ imp.num_sum.resize(imputer.ncols_numeric, 0);
1006
+ imp.num_weight.resize(imputer.ncols_numeric, 0);
1007
+ }
1008
+
1009
+ else
1010
+ {
1011
+ std::fill(imp.num_sum.begin(), imp.num_sum.begin() + imp.n_missing_num, 0);
1012
+ std::fill(imp.num_weight.begin(), imp.num_weight.begin() + imp.n_missing_num, 0);
1013
+ }
1014
+ }
1015
+
1016
+ else if (prediction_data.Xr != NULL)
1017
+ {
1018
+ if (!imp.missing_sp.size())
1019
+ imp.missing_sp.resize(imputer.ncols_numeric);
1020
+ for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1021
+ if (is_na_or_inf(prediction_data.Xr[ix]))
1022
+ imp.missing_sp[imp.n_missing_sp++] = prediction_data.Xr_ind[ix];
1023
+
1024
+ if (!imp.sp_num_sum.size())
1025
+ {
1026
+ imp.sp_num_sum.resize(imputer.ncols_numeric, 0);
1027
+ imp.sp_num_weight.resize(imputer.ncols_numeric, 0);
1028
+ }
1029
+
1030
+ else
1031
+ {
1032
+ std::fill(imp.sp_num_sum.begin(), imp.sp_num_sum.begin() + imp.n_missing_sp, 0);
1033
+ std::fill(imp.sp_num_weight.begin(), imp.sp_num_weight.begin() + imp.n_missing_sp, 0);
1034
+ }
1035
+ }
1036
+
1037
+ if (prediction_data.categ_data != NULL)
1038
+ {
1039
+ if (!imp.missing_cat.size())
1040
+ imp.missing_cat.resize(imputer.ncols_categ);
1041
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1042
+ {
1043
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
1044
+ imp.missing_cat[imp.n_missing_cat++] = col;
1045
+ }
1046
+
1047
+ if (!imp.cat_weight.size())
1048
+ {
1049
+ imp.cat_weight.resize(imputer.ncols_categ, 0);
1050
+ imp.cat_sum.resize(imputer.ncols_categ);
1051
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1052
+ imp.cat_sum[col].resize(imputer.ncat[col], 0);
1053
+ }
1054
+
1055
+ else
1056
+ {
1057
+ std::fill(imp.cat_weight.begin(), imp.cat_weight.begin() + imp.n_missing_cat, 0);
1058
+ for (size_t col = 0; col < imp.n_missing_cat; col++)
1059
+ std::fill(imp.cat_sum[imp.missing_cat[col]].begin(),
1060
+ imp.cat_sum[imp.missing_cat[col]].end(),
1061
+ 0);
1062
+ }
1063
+ }
1064
+ }
1065
+
1066
+ ImputedData::ImputedData(InputData &input_data, size_t row)
1067
+ {
1068
+ initialize_impute_calc(*this, input_data, row);
1069
+ }
1070
+
1071
+ void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads)
1072
+ {
1073
+ impute_vec.resize(input_data.nrows);
1074
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(impute_vec, input_data)
1075
+ for (size_t_for row = 0; row < input_data.nrows; row++)
1076
+ if (input_data.has_missing[row])
1077
+ initialize_impute_calc(impute_vec[row], input_data, row);
1078
+ }
1079
+
1080
+
1081
+ void allocate_imp_map(std::unordered_map<size_t, ImputedData> &impute_map, InputData &input_data)
1082
+ {
1083
+ for (size_t row = 0; row < input_data.nrows; row++)
1084
+ if (input_data.has_missing[row])
1085
+ impute_map[row] = ImputedData(input_data, row);
1086
+ }
1087
+
1088
+ void allocate_imp(InputData &input_data,
1089
+ std::vector<ImputedData> &impute_vec,
1090
+ std::unordered_map<size_t, ImputedData> &impute_map,
1091
+ int nthreads)
1092
+ {
1093
+ if (input_data.n_missing == 0)
1094
+ return;
1095
+ else if (input_data.n_missing <= input_data.nrows / (nthreads * 10))
1096
+ allocate_imp_map(impute_map, input_data);
1097
+ else
1098
+ allocate_imp_vec(impute_vec, input_data, nthreads);
1099
+ }
1100
+
1101
+ void check_for_missing(InputData &input_data,
1102
+ std::vector<ImputedData> &impute_vec,
1103
+ std::unordered_map<size_t, ImputedData> &impute_map,
1104
+ int nthreads)
1105
+ {
1106
+ input_data.has_missing.assign(input_data.nrows, false);
1107
+
1108
+ if (input_data.Xc != NULL)
1109
+ {
1110
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
1111
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
1112
+ for (size_t_for ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
1113
+ if (is_na_or_inf(input_data.Xc[ix]))
1114
+ input_data.has_missing[input_data.Xc_ind[ix]] = true;
1115
+ #pragma omp barrier
1116
+ }
1117
+
1118
+ if (input_data.numeric_data != NULL || input_data.categ_data != NULL)
1119
+ {
1120
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data)
1121
+ for (size_t_for row = 0; row < input_data.nrows; row++)
1122
+ {
1123
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
1124
+ {
1125
+ if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
1126
+ {
1127
+ input_data.has_missing[row] = true;
1128
+ break;
1129
+ }
1130
+ }
1131
+
1132
+ if (!input_data.has_missing[row])
1133
+ for (size_t col = 0; col < input_data.ncols_categ; col++)
1134
+ {
1135
+ if (input_data.categ_data[row + col * input_data.nrows] < 0)
1136
+ {
1137
+ input_data.has_missing[row] = true;
1138
+ break;
1139
+ }
1140
+ }
1141
+ }
1142
+ }
1143
+
1144
+ input_data.n_missing = std::accumulate(input_data.has_missing.begin(), input_data.has_missing.end(), (size_t)0);
1145
+ allocate_imp(input_data, impute_vec, impute_map, nthreads);
1146
+ }
1147
+
1148
+ size_t check_for_missing(PredictionData &prediction_data,
1149
+ Imputer &imputer,
1150
+ size_t ix_arr[],
1151
+ int nthreads)
1152
+ {
1153
+ std::vector<char> has_missing(prediction_data.nrows, false);
1154
+
1155
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(has_missing, prediction_data, imputer)
1156
+ for (size_t_for row = 0; row < prediction_data.nrows; row++)
1157
+ {
1158
+ if (prediction_data.numeric_data != NULL)
1159
+ for (size_t col = 0; col < imputer.ncols_numeric; col++)
1160
+ {
1161
+ if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
1162
+ {
1163
+ has_missing[row] = true;
1164
+ break;
1165
+ }
1166
+ }
1167
+ else if (prediction_data.Xr != NULL)
1168
+ for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1169
+ {
1170
+ if (is_na_or_inf(prediction_data.Xr[ix]))
1171
+ {
1172
+ has_missing[row] = true;
1173
+ break;
1174
+ }
1175
+ }
1176
+
1177
+ if (!has_missing[row])
1178
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1179
+ {
1180
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
1181
+ {
1182
+ has_missing[row] = true;
1183
+ break;
1184
+ }
1185
+ }
1186
+ }
1187
+
1188
+ size_t st = 0;
1189
+ size_t temp;
1190
+ for (size_t row = 0; row < prediction_data.nrows; row++)
1191
+ {
1192
+ if (has_missing[row])
1193
+ {
1194
+ temp = ix_arr[st];
1195
+ ix_arr[st] = ix_arr[row];
1196
+ ix_arr[row] = temp;
1197
+ st++;
1198
+ }
1199
+ }
1200
+
1201
+ if (st == 0)
1202
+ return 0;
1203
+
1204
+ return st;
1205
+ }
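/* One way to express what the final loop of 'check_for_missing' above achieves,
   assuming standard <algorithm> semantics: indices of rows that contain missing
   values are packed at the front of 'ix_arr' and their count is returned, which is
   then used as the 'end' bound for the imputation loops. */
size_t toy_partition_missing_first(std::vector<size_t> &ix_arr, const std::vector<char> &has_missing)
{
    auto it = std::stable_partition(ix_arr.begin(), ix_arr.end(),
                                    [&has_missing](size_t row) { return has_missing[row] != 0; });
    return (size_t) std::distance(ix_arr.begin(), it);
}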