isotree 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +25 -0
- data/README.md +111 -0
- data/ext/isotree/ext.cpp +178 -0
- data/ext/isotree/extconf.rb +21 -0
- data/lib/isotree.rb +9 -0
- data/lib/isotree/isolation_forest.rb +94 -0
- data/lib/isotree/version.rb +3 -0
- data/vendor/isotree/LICENSE +25 -0
- data/vendor/isotree/README.md +167 -0
- data/vendor/isotree/src/Makevars +4 -0
- data/vendor/isotree/src/RcppExports.cpp +267 -0
- data/vendor/isotree/src/Rwrapper.cpp +762 -0
- data/vendor/isotree/src/crit.cpp +912 -0
- data/vendor/isotree/src/dealloc.cpp +66 -0
- data/vendor/isotree/src/dist.cpp +749 -0
- data/vendor/isotree/src/extended.cpp +790 -0
- data/vendor/isotree/src/fit_model.cpp +1068 -0
- data/vendor/isotree/src/helpers_iforest.cpp +309 -0
- data/vendor/isotree/src/impute.cpp +1205 -0
- data/vendor/isotree/src/isoforest.cpp +771 -0
- data/vendor/isotree/src/isotree.hpp +929 -0
- data/vendor/isotree/src/merge_models.cpp +116 -0
- data/vendor/isotree/src/mult.cpp +607 -0
- data/vendor/isotree/src/predict.cpp +849 -0
- data/vendor/isotree/src/serialize.cpp +262 -0
- data/vendor/isotree/src/utils.cpp +1574 -0
- metadata +154 -0
data/vendor/isotree/src/helpers_iforest.cpp
@@ -0,0 +1,309 @@
+/* Isolation forests and variations thereof, with adjustments for incorporation
+ * of categorical variables and missing values.
+ * Written for C++11 standard and aimed at being used in R and Python.
+ *
+ * This library is based on the following works:
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ *     "Isolation forest."
+ *     2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ *     "Isolation-based anomaly detection."
+ *     ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+ *     "Extended Isolation Forest."
+ *     arXiv preprint arXiv:1811.02141 (2018).
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ *     "On detecting clustered anomalies using SCiForest."
+ *     Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+ * [5] https://sourceforge.net/projects/iforest/
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+ * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
+ *
+ * BSD 2-Clause License
+ * Copyright (c) 2019, David Cortes
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "isotree.hpp"
+
+void decide_column(size_t ncols_numeric, size_t ncols_categ, size_t &col_chosen, ColType &col_type,
+                   RNG_engine &rnd_generator, std::uniform_int_distribution<size_t> &runif,
+                   std::discrete_distribution<size_t> &col_sampler)
+{
+    if (!col_sampler.max())
+        col_chosen = runif(rnd_generator);
+    else
+        col_chosen = col_sampler(rnd_generator);
+
+    if (col_chosen >= ncols_numeric)
+    {
+        col_chosen -= ncols_numeric;
+        col_type = Categorical;
+    }
+
+    else { col_type = Numeric; }
+}
+
+void add_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data)
+{
+    if (tree.col_type == Numeric)
+        workspace.cols_possible[tree.col_num] = false;
+    else
+        workspace.cols_possible[tree.col_num + input_data.ncols_numeric] = false;
+}
+
+void add_unsplittable_col(WorkerMemory &workspace, InputData &input_data)
+{
+    if (workspace.col_type == Numeric)
+        workspace.cols_possible[workspace.col_chosen] = false;
+    else
+        workspace.cols_possible[workspace.col_chosen + input_data.ncols_numeric] = false;
+}
+
+bool check_is_not_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data)
+{
+    if (tree.col_type == Numeric)
+        return workspace.cols_possible[tree.col_num];
+    else
+        return workspace.cols_possible[tree.col_num + input_data.ncols_numeric];
+}
+
+/* for use in regular model */
+void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params, IsoTree &tree)
+{
+    if (tree.col_type == Numeric)
+    {
+        if (input_data.Xc == NULL)
+            get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * tree.col_num,
+                      workspace.st, workspace.end, model_params.missing_action,
+                      workspace.xmin, workspace.xmax, workspace.unsplittable);
+        else
+            get_range(workspace.ix_arr.data(), workspace.st, workspace.end, tree.col_num,
+                      input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                      model_params.missing_action, workspace.xmin, workspace.xmax, workspace.unsplittable);
+    }
+
+    else
+    {
+        get_categs(workspace.ix_arr.data(), input_data.categ_data + input_data.nrows * tree.col_num,
+                   workspace.st, workspace.end, input_data.ncat[tree.col_num],
+                   model_params.missing_action, workspace.categs.data(), workspace.npresent, workspace.unsplittable);
+    }
+}
+
+/* for use in extended model */
+void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params)
+{
+    if (workspace.col_type == Numeric)
+    {
+        if (input_data.Xc == NULL)
+            get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * workspace.col_chosen,
+                      workspace.st, workspace.end, model_params.missing_action,
+                      workspace.xmin, workspace.xmax, workspace.unsplittable);
+        else
+            get_range(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.col_chosen,
+                      input_data.Xc, input_data.Xc_ind, input_data.Xc_indptr,
+                      model_params.missing_action, workspace.xmin, workspace.xmax, workspace.unsplittable);
+    }
+
+    else
+    {
+        get_categs(workspace.ix_arr.data(), input_data.categ_data + input_data.nrows * workspace.col_chosen,
+                   workspace.st, workspace.end, input_data.ncat[workspace.col_chosen],
+                   model_params.missing_action, workspace.categs.data(), workspace.npresent, workspace.unsplittable);
+    }
+}
+
+int choose_cat_from_present(WorkerMemory &workspace, InputData &input_data, size_t col_num)
+{
+    int chosen_cat = std::uniform_int_distribution<int>
+                        (0, workspace.npresent - 1)
+                        (workspace.rnd_generator);
+    workspace.ncat_tried = 0;
+    for (int cat = 0; cat < input_data.ncat[col_num]; cat++)
+    {
+        if (workspace.categs[cat] > 0)
+        {
+            if (workspace.ncat_tried == chosen_cat)
+                return cat;
+            else
+                workspace.ncat_tried++;
+        }
+    }
+
+    return -1; /* this will never be reached, but CRAN complains otherwise */
+}
+
+void update_col_sampler(WorkerMemory &workspace, InputData &input_data)
+{
+    if (!workspace.col_sampler.max())
+        return;
+
+    std::vector<double> col_weights = workspace.col_sampler.probabilities();
+    for (size_t col = 0; col < input_data.ncols_numeric; col++)
+        if (!workspace.cols_possible[col])
+            col_weights[col] = 0;
+    for (size_t col = 0; col < input_data.ncols_categ; col++)
+        if (!workspace.cols_possible[col + input_data.ncols_numeric])
+            col_weights[col + input_data.ncols_numeric] = 0;
+    workspace.col_sampler = std::discrete_distribution<size_t>(col_weights.begin(), col_weights.end());
+}
+
+bool is_col_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
+                  InputData &input_data, size_t col_num, ColType col_type)
+{
+    col_num += ((col_type == Categorical)? 0 : input_data.ncols_categ);
+    if (col_is_taken.size())
+        return col_is_taken[col_num];
+    else
+        return col_is_taken_s.find(col_num) != col_is_taken_s.end();
+}
+
+void set_col_as_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
+                      InputData &input_data, size_t col_num, ColType col_type)
+{
+    col_num += ((col_type == Categorical)? 0 : input_data.ncols_categ);
+    if (col_is_taken.size())
+        col_is_taken[col_num] = true;
+    else
+        col_is_taken_s.insert(col_num);
+}
+
+void add_separation_step(WorkerMemory &workspace, InputData &input_data, double remainder)
+{
+    if (workspace.weights_arr.size())
+        increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                              input_data.nrows, workspace.tmat_sep.data(), workspace.weights_arr.data(), remainder);
+    else if (workspace.weights_map.size())
+        increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                              input_data.nrows, workspace.tmat_sep.data(), workspace.weights_map, remainder);
+    else
+        increase_comb_counter(workspace.ix_arr.data(), workspace.st, workspace.end,
+                              input_data.nrows, workspace.tmat_sep.data(), remainder);
+}
+
+void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, long double sum_weight)
+{
+    if (
+        ((workspace.end - workspace.st) > 0 && !workspace.weights_arr.size() && !workspace.weights_map.size()) ||
+        (sum_weight > 1 && (workspace.weights_arr.size() || workspace.weights_map.size()))
+       )
+    {
+        double expected_dsep;
+        if (!workspace.weights_arr.size() && !workspace.weights_map.size())
+            expected_dsep = expected_separation_depth(workspace.end - workspace.st + 1);
+        else
+            expected_dsep = expected_separation_depth(sum_weight);
+
+        add_separation_step(workspace, input_data, expected_dsep + 1);
+    }
+}
+
+void remap_terminal_trees(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+                          PredictionData &prediction_data, sparse_ix *restrict tree_num, int nthreads)
+{
+    size_t ntrees = (model_outputs != NULL)? model_outputs->trees.size() : model_outputs_ext->hplanes.size();
+    size_t max_tree, curr_term;
+    std::vector<sparse_ix> tree_mapping;
+    if (model_outputs != NULL)
+    {
+        max_tree = std::accumulate(model_outputs->trees.begin(),
+                                   model_outputs->trees.end(),
+                                   (size_t)0,
+                                   [](const size_t curr_max, const std::vector<IsoTree> &tr)
+                                   {return std::max(curr_max, tr.size());});
+        tree_mapping.resize(max_tree);
+        for (size_t tree = 0; tree < ntrees; tree++)
+        {
+            std::fill(tree_mapping.begin(), tree_mapping.end(), (size_t)0);
+            curr_term = 0;
+            for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
+                if (model_outputs->trees[tree][node].score >= 0)
+                    tree_mapping[node] = curr_term++;
+
+            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(tree_num, tree_mapping, tree, prediction_data)
+            for (size_t_for row = 0; row < prediction_data.nrows; row++)
+                tree_num[row + tree * prediction_data.nrows] = tree_mapping[tree_num[row + tree * prediction_data.nrows]];
+        }
+    }
+
+    else
+    {
+        max_tree = std::accumulate(model_outputs_ext->hplanes.begin(),
+                                   model_outputs_ext->hplanes.end(),
+                                   (size_t)0,
+                                   [](const size_t curr_max, const std::vector<IsoHPlane> &tr)
+                                   {return std::max(curr_max, tr.size());});
+        tree_mapping.resize(max_tree);
+        for (size_t tree = 0; tree < ntrees; tree++)
+        {
+            std::fill(tree_mapping.begin(), tree_mapping.end(), (size_t)0);
+            curr_term = 0;
+            for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
+                if (model_outputs_ext->hplanes[tree][node].score >= 0)
+                    tree_mapping[node] = curr_term++;
+
+            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(tree_num, tree_mapping, tree, prediction_data)
+            for (size_t_for row = 0; row < prediction_data.nrows; row++)
+                tree_num[row + tree * prediction_data.nrows] = tree_mapping[tree_num[row + tree * prediction_data.nrows]];
+        }
+    }
+}
+
+void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state)
+{
+    recursion_state.st = workspace.st;
+    recursion_state.st_NA = workspace.st_NA;
+    recursion_state.end_NA = workspace.end_NA;
+    recursion_state.split_ix = workspace.split_ix;
+    recursion_state.end = workspace.end;
+    recursion_state.cols_possible = workspace.cols_possible;
+    recursion_state.col_sampler = workspace.col_sampler;
+
+    /* for the extended model, it's not necessary to copy everything */
+    if (!workspace.comb_val.size())
+    {
+        /* TODO: here only need to copy the left half, as the right one is untouched */
+        recursion_state.ix_arr = workspace.ix_arr;
+        recursion_state.weights_map = workspace.weights_map;
+        recursion_state.weights_arr = workspace.weights_arr;
+    }
+}
+
+
+void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state)
+{
+    workspace.st = recursion_state.st;
+    workspace.st_NA = recursion_state.st_NA;
+    workspace.end_NA = recursion_state.end_NA;
+    workspace.split_ix = recursion_state.split_ix;
+    workspace.end = recursion_state.end;
+    workspace.cols_possible = std::move(recursion_state.cols_possible);
+    workspace.col_sampler = std::move(recursion_state.col_sampler);
+
+    if (!workspace.comb_val.size())
+    {
+        /* TODO: here only need to copy the left half, as the right one is untouched */
+        workspace.ix_arr = std::move(recursion_state.ix_arr);
+        workspace.weights_map = std::move(recursion_state.weights_map);
+        workspace.weights_arr = std::move(recursion_state.weights_arr);
+    }
+}
data/vendor/isotree/src/impute.cpp
@@ -0,0 +1,1205 @@
+/* Isolation forests and variations thereof, with adjustments for incorporation
+ * of categorical variables and missing values.
+ * Written for C++11 standard and aimed at being used in R and Python.
+ *
+ * This library is based on the following works:
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ *     "Isolation forest."
+ *     2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ *     "Isolation-based anomaly detection."
+ *     ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
+ *     "Extended Isolation Forest."
+ *     arXiv preprint arXiv:1811.02141 (2018).
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
+ *     "On detecting clustered anomalies using SCiForest."
+ *     Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
+ * [5] https://sourceforge.net/projects/iforest/
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
+ * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
+ *
+ * BSD 2-Clause License
+ * Copyright (c) 2019, David Cortes
+ * All rights reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "isotree.hpp"
+
+
+/* TODO: this file is a complete mess, needs a refactor from scratch along with the data structs */
+
+/* Impute missing values in new data
+ *
+ * Parameters
+ * ==========
+ * - numeric_data[nrows * ncols_numeric] (in, out)
+ *       Pointer to numeric data in which missing values will be imputed. Must be ordered by columns like Fortran,
+ *       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
+ *       and the column order must be the same as in the data that was used to fit the model.
+ *       Pass NULL if there are no dense numeric columns.
+ *       Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
+ *       Imputations will overwrite values in this same array.
+ * - ncols_numeric
+ *       Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
+ * - categ_data[nrows * ncols_categ]
+ *       Pointer to categorical data in which missing values will be imputed. Must be ordered by columns like Fortran,
+ *       not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
+ *       and the column order must be the same as in the data that was used to fit the model.
+ *       Pass NULL if there are no categorical columns.
+ *       Each category should be represented as an integer, and these integers must start at zero and
+ *       be in consecutive order - i.e. if category '3' is present, category '2' must have also been
+ *       present when the model was fit (note that they are not treated as being ordinal, this is just
+ *       an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
+ *       must be the same as was used in the data to which the model was fit.
+ *       Imputations will overwrite values in this same array.
+ * - ncols_categ
+ *       Number of categorical columns in the data.
+ * - ncat[ncols_categ]
+ *       Number of categories in each categorical column. E.g. if the highest code for a column is '4',
+ *       the number of categories for that column is '5' (zero is one category).
+ *       Must be the same as was passed to 'fit_iforest'.
+ * - Xr[nnz] (in, out)
+ *       Pointer to numeric data in sparse numeric matrix in CSR format (row-compressed).
+ *       Pass NULL if there are no sparse numeric columns.
+ *       Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
+ *       Imputations will overwrite values in this same array.
+ * - Xr_ind[nnz]
+ *       Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
+ *       Pass NULL if there are no sparse numeric columns in CSR format.
+ * - Xr_indptr[nrows + 1]
+ *       Pointer to row index pointers that tell at entry [row] where row 'row'
+ *       starts and at entry [row + 1] where it ends.
+ *       Pass NULL if there are no sparse numeric columns in CSR format.
+ * - nrows
+ *       Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
+ * - nthreads
+ *       Number of parallel threads to use. Note that the more threads, the more memory will be
+ *       allocated, even if the thread does not end up being used. Ignored when not building with
+ *       OpenMP support.
+ * - model_outputs
+ *       Pointer to fitted single-variable model object from function 'fit_iforest'. Pass NULL
+ *       if the predictions are to be made from an extended model. Can only pass one of
+ *       'model_outputs' and 'model_outputs_ext'.
+ * - model_outputs_ext
+ *       Pointer to fitted extended model object from function 'fit_iforest'. Pass NULL
+ *       if the predictions are to be made from a single-variable model. Can only pass one of
+ *       'model_outputs' and 'model_outputs_ext'.
+ * - impute_nodes
+ *       Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
+ *       as produced from function 'fit_iforest'.
+ */
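
Editorial aside: the parameter docs above describe two input layouts: dense arrays ordered column-by-column ("like Fortran"), and sparse numeric data as a CSR triplet 'Xr' + 'Xr_ind' + 'Xr_indptr' in which Xr_indptr[row]..Xr_indptr[row + 1] delimits a row's non-zero entries. A tiny worked sketch of both (illustrative only, not part of the package; the matrix values are made up):

    #include <cstdio>

    int main()
    {
        /* 3x2 matrix stored column-major: entries 0..2 are column 0,
           entries 3..5 are column 1 */
        double numeric_data[] = {1.0, 2.0, 3.0,    /* column 0 */
                                 4.0, 5.0, 6.0};   /* column 1 */
        int nrows = 3;
        /* element (row, col) sits at numeric_data[row + col * nrows] */
        std::printf("dense (2,1) = %g\n", numeric_data[2 + 1 * nrows]);  /* 6 */

        /* the same shape in CSR form, keeping only the nonzeros of
           {{1, 0}, {0, 5}, {3, 6}} */
        double Xr[]        = {1.0, 5.0, 3.0, 6.0};
        int    Xr_ind[]    = {0,   1,   0,   1};   /* column of each nonzero */
        int    Xr_indptr[] = {0, 1, 2, 4};         /* row i spans [Xr_indptr[i], Xr_indptr[i+1]) */
        for (int row = 0; row < 3; row++)
            for (int ix = Xr_indptr[row]; ix < Xr_indptr[row + 1]; ix++)
                std::printf("row %d, col %d: %g\n", row, Xr_ind[ix], Xr[ix]);
        return 0;
    }

The column-compressed (CSC) inputs 'Xc' + 'Xc_ind' + 'Xc_indptr' used elsewhere in this file follow the same scheme with rows and columns swapped.
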
+void impute_missing_values(double numeric_data[], int categ_data[],
+                           double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
+                           size_t nrows, int nthreads,
+                           IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
+                           Imputer &imputer)
+{
+    PredictionData prediction_data = {numeric_data, categ_data, nrows,
+                                      NULL, NULL, NULL,
+                                      Xr, Xr_ind, Xr_indptr};
+
+    std::vector<size_t> ix_arr(nrows);
+    std::iota(ix_arr.begin(), ix_arr.end(), (size_t) 0);
+
+    size_t end = check_for_missing(prediction_data, imputer, ix_arr.data(), nthreads);
+
+    if (end == 0)
+        return;
+
+    if ((size_t)nthreads > end)
+        nthreads = (int)end;
+    #ifdef _OPENMP
+    std::vector<ImputedData> imp_memory(nthreads);
+    #else
+    std::vector<ImputedData> imp_memory(1);
+    #endif
+
+
+    if (model_outputs != NULL)
+    {
+        #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
+                shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer)
+        for (size_t_for row = 0; row < end; row++)
+        {
+            initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
+
+            for (std::vector<IsoTree> &tree : model_outputs->trees)
+            {
+                traverse_itree(tree,
+                               *model_outputs,
+                               prediction_data,
+                               &imputer.imputer_tree[&tree - &(model_outputs->trees[0])],
+                               &imp_memory[omp_get_thread_num()],
+                               (double) 1,
+                               ix_arr[row],
+                               NULL,
+                               (size_t) 0);
+            }
+
+            apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
+
+        }
+    }
+
+    else
+    {
+        double temp;
+        #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
+                shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer) \
+                private(temp)
+        for (size_t_for row = 0; row < end; row++)
+        {
+            initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
+
+            for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
+            {
+                traverse_hplane(hplane,
+                                *model_outputs_ext,
+                                prediction_data,
+                                temp,
+                                &imputer.imputer_tree[&hplane - &(model_outputs_ext->hplanes[0])],
+                                &imp_memory[omp_get_thread_num()],
+                                NULL,
+                                ix_arr[row]);
+            }
+
+            apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
+
+        }
+    }
+
+}
+
+void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads)
+{
+    imputer.ncols_numeric = input_data.ncols_numeric;
+    imputer.ncols_categ = input_data.ncols_categ;
+    imputer.ncat.assign(input_data.ncat, input_data.ncat + input_data.ncols_categ);
+    if (imputer.col_means.size())
+    {
+        imputer.col_means.resize(input_data.ncols_numeric);
+        std::fill(imputer.col_means.begin(), imputer.col_means.end(), 0);
+    }
+
+    else
+    {
+        imputer.col_means.resize(input_data.ncols_numeric, 0);
+    }
+
+    imputer.col_modes.resize(input_data.ncols_categ);
+    imputer.imputer_tree = std::vector<std::vector<ImputeNode>>(ntrees);
+
+    size_t offset, cnt;
+    if (input_data.numeric_data != NULL)
+    {
+        #pragma omp parallel for schedule(static) num_threads(nthreads) private(cnt, offset) shared(input_data, imputer)
+        for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
+        {
+            cnt = input_data.nrows;
+            offset = col * input_data.nrows;
+            for (size_t row = 0; row < input_data.nrows; row++)
+            {
+                imputer.col_means[col] += (!is_na_or_inf(input_data.numeric_data[row + offset]))?
+                                           input_data.numeric_data[row + offset] : 0;
+                cnt -= is_na_or_inf(input_data.numeric_data[row + offset]);
+            }
+            imputer.col_means[col] /= (long double) cnt;
+        }
+    }
+
+    else if (input_data.Xc != NULL)
+    {
+        #pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
+        for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
+        {
+            cnt = input_data.nrows;
+            for (size_t ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
+            {
+                imputer.col_means[col] += (!is_na_or_inf(input_data.Xc[ix]))?
+                                           input_data.Xc[ix] : 0;
+                cnt -= is_na_or_inf(input_data.Xc[ix]);
+            }
+            imputer.col_means[col] /= (long double) cnt;
+        }
+    }
+
+    if (input_data.categ_data != NULL)
+    {
+        std::vector<size_t> cat_counts(input_data.max_categ);
+        #pragma omp parallel for schedule(static) num_threads(nthreads) firstprivate(cat_counts) private(offset) shared(input_data, imputer)
+        for (size_t_for col = 0; col < input_data.ncols_categ; col++)
+        {
+            std::fill(cat_counts.begin(), cat_counts.end(), 0);
+            offset = col * input_data.nrows;
+            for (size_t row = 0; row < input_data.nrows; row++)
+            {
+                if (input_data.categ_data[row + offset] >= 0)
+                    cat_counts[input_data.categ_data[row + offset]]++;
+            }
+            imputer.col_modes[col] = (int) std::distance(cat_counts.begin(),
+                                                         std::max_element(cat_counts.begin(),
+                                                                          cat_counts.begin() + input_data.ncat[col]));
+        }
+    }
+}
+
+
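Editorial aside: the Wikipedia reference opening the next function points at the compensated ("Kahan") summation that build_impute_node applies to its weighted sums via the val/diff/corr/prod_sum update. A standalone sketch of that exact update, with made-up inputs chosen so the effect is visible (not part of the package):

    #include <cstdio>

    int main()
    {
        /* 1e16, then ten 1.0's, then -1e16: a plain double sum rounds away
           every +1, while the compensated sum keeps them */
        double data[12];
        data[0] = 1e16;
        for (int i = 1; i <= 10; i++) data[i] = 1.0;
        data[11] = -1e16;

        double naive = 0;
        double prod_sum = 0, corr = 0, val, diff;
        for (double x : data)
        {
            naive += x;
            val      = x - corr;                 /* same update as build_impute_node */
            diff     = prod_sum + val;
            corr     = (diff - prod_sum) - val;  /* error the addition just made */
            prod_sum = diff;
        }
        std::printf("naive: %g, compensated: %g\n", naive, prod_sum);  /* 0 vs 10 */
        return 0;
    }
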
|
266
|
+
/* https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
|
267
|
+
void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
268
|
+
InputData &input_data, ModelParams &model_params,
|
269
|
+
std::vector<ImputeNode> &imputer_tree,
|
270
|
+
size_t curr_depth, size_t min_imp_obs)
|
271
|
+
{
|
272
|
+
double wsum;
|
273
|
+
bool has_weights = workspace.weights_arr.size() || workspace.weights_map.size();
|
274
|
+
if (!has_weights)
|
275
|
+
wsum = (double)(workspace.end - workspace.st + 1);
|
276
|
+
else
|
277
|
+
wsum = calculate_sum_weights(workspace.ix_arr, workspace.st, workspace.end, curr_depth,
|
278
|
+
workspace.weights_arr, workspace.weights_map);
|
279
|
+
|
280
|
+
imputer.num_sum.resize(input_data.ncols_numeric, 0);
|
281
|
+
imputer.num_weight.resize(input_data.ncols_numeric, 0);
|
282
|
+
imputer.cat_sum.resize(input_data.ncols_categ);
|
283
|
+
imputer.cat_weight.resize(input_data.ncols_categ, 0);
|
284
|
+
imputer.num_sum.shrink_to_fit();
|
285
|
+
imputer.num_weight.shrink_to_fit();
|
286
|
+
imputer.cat_sum.shrink_to_fit();
|
287
|
+
imputer.cat_weight.shrink_to_fit();
|
288
|
+
|
289
|
+
/* Note: in theory, 'num_weight' could be initialized to 'wsum',
|
290
|
+
and the entries could get subtracted the weight afterwards, but due to rounding
|
291
|
+
error, this could produce some cases of no-present observations having positive
|
292
|
+
weight, or cases of negative weight, so it's better to add it for each row after
|
293
|
+
checking for possible NAs, even though it's less computationally efficient.
|
294
|
+
For sparse matrices it's done the other way as otherwise it would be too slow. */
|
295
|
+
|
296
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++)
|
297
|
+
{
|
298
|
+
imputer.cat_sum[col].resize(input_data.ncat[col]);
|
299
|
+
imputer.cat_sum[col].shrink_to_fit();
|
300
|
+
}
|
301
|
+
|
302
|
+
double xnum;
|
303
|
+
int xcat;
|
304
|
+
double weight;
|
305
|
+
size_t ix;
|
306
|
+
|
307
|
+
if ((input_data.Xc == NULL && input_data.ncols_numeric) || input_data.ncols_categ)
|
308
|
+
{
|
309
|
+
if (!has_weights)
|
310
|
+
{
|
311
|
+
size_t cnt;
|
312
|
+
if (input_data.numeric_data != NULL)
|
313
|
+
{
|
314
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
315
|
+
{
|
316
|
+
cnt = 0;
|
317
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
318
|
+
{
|
319
|
+
xnum = input_data.numeric_data[workspace.ix_arr[row] + col * input_data.nrows];
|
320
|
+
if (!is_na_or_inf(xnum))
|
321
|
+
{
|
322
|
+
cnt++;
|
323
|
+
imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (long double)cnt;
|
324
|
+
}
|
325
|
+
}
|
326
|
+
imputer.num_weight[col] = (double) cnt;
|
327
|
+
}
|
328
|
+
}
|
329
|
+
|
330
|
+
if (input_data.categ_data != NULL)
|
331
|
+
{
|
332
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++)
|
333
|
+
{
|
334
|
+
cnt = 0;
|
335
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
336
|
+
{
|
337
|
+
xcat = input_data.categ_data[workspace.ix_arr[row] + col * input_data.nrows];
|
338
|
+
if (xcat >= 0)
|
339
|
+
{
|
340
|
+
cnt++;
|
341
|
+
imputer.cat_sum[col][xcat]++; /* later gets divided */
|
342
|
+
}
|
343
|
+
}
|
344
|
+
imputer.cat_weight[col] = (double) cnt;
|
345
|
+
}
|
346
|
+
}
|
347
|
+
|
348
|
+
}
|
349
|
+
|
350
|
+
else
|
351
|
+
{
|
352
|
+
long double prod_sum, corr, val, diff;
|
353
|
+
if (input_data.numeric_data != NULL)
|
354
|
+
{
|
355
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
356
|
+
{
|
357
|
+
prod_sum = 0; corr = 0;
|
358
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
359
|
+
{
|
360
|
+
xnum = input_data.numeric_data[workspace.ix_arr[row] + col * input_data.nrows];
|
361
|
+
if (!is_na_or_inf(xnum))
|
362
|
+
{
|
363
|
+
if (workspace.weights_arr.size())
|
364
|
+
weight = workspace.weights_arr[workspace.ix_arr[row]];
|
365
|
+
else
|
366
|
+
weight = workspace.weights_map[workspace.ix_arr[row]];
|
367
|
+
|
368
|
+
imputer.num_weight[col] += weight; /* these are always <= 1 */
|
369
|
+
val = (xnum * weight) - corr;
|
370
|
+
diff = prod_sum + val;
|
371
|
+
corr = (diff - prod_sum) - val;
|
372
|
+
prod_sum = diff;
|
373
|
+
}
|
374
|
+
}
|
375
|
+
imputer.num_sum[col] = prod_sum / imputer.num_weight[col];
|
376
|
+
}
|
377
|
+
}
|
378
|
+
|
379
|
+
|
380
|
+
if (input_data.ncols_categ)
|
381
|
+
{
|
382
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
383
|
+
{
|
384
|
+
ix = workspace.ix_arr[row];
|
385
|
+
if (workspace.weights_arr.size())
|
386
|
+
weight = workspace.weights_arr[ix];
|
387
|
+
else
|
388
|
+
weight = workspace.weights_map[ix];
|
389
|
+
|
390
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++)
|
391
|
+
{
|
392
|
+
xcat = input_data.categ_data[ix + col * input_data.nrows];
|
393
|
+
if (xcat >= 0)
|
394
|
+
{
|
395
|
+
imputer.cat_sum[col][xcat] += weight; /* later gets divided */
|
396
|
+
imputer.cat_weight[col] += weight;
|
397
|
+
}
|
398
|
+
}
|
399
|
+
}
|
400
|
+
}
|
401
|
+
}
|
402
|
+
}
|
403
|
+
|
404
|
+
if (input_data.Xc != NULL) /* sparse numeric */
|
405
|
+
{
|
406
|
+
size_t *ix_arr = workspace.ix_arr.data();
|
407
|
+
size_t st_col, end_col, ind_end_col, curr_pos;
|
408
|
+
std::fill(imputer.num_weight.begin(), imputer.num_weight.end(), wsum);
|
409
|
+
|
410
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
411
|
+
{
|
412
|
+
st_col = input_data.Xc_indptr[col];
|
413
|
+
end_col = input_data.Xc_indptr[col + 1] - 1;
|
414
|
+
ind_end_col = input_data.Xc_ind[end_col];
|
415
|
+
curr_pos = st_col;
|
416
|
+
for (size_t *row = std::lower_bound(ix_arr + workspace.st, ix_arr + workspace.end + 1, input_data.Xc_ind[st_col]);
|
417
|
+
row != ix_arr + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
|
418
|
+
)
|
419
|
+
{
|
420
|
+
if (input_data.Xc_ind[curr_pos] == *row)
|
421
|
+
{
|
422
|
+
xnum = input_data.Xc[curr_pos];
|
423
|
+
if (workspace.weights_arr.size())
|
424
|
+
weight = workspace.weights_arr[*row];
|
425
|
+
else if (workspace.weights_map.size())
|
426
|
+
weight = workspace.weights_map[*row];
|
427
|
+
else
|
428
|
+
weight = 1;
|
429
|
+
|
430
|
+
if (!is_na_or_inf(xnum))
|
431
|
+
{
|
432
|
+
imputer.num_sum[col] += weight * xnum;
|
433
|
+
}
|
434
|
+
|
435
|
+
else
|
436
|
+
{
|
437
|
+
imputer.num_weight[col] -= weight;
|
438
|
+
}
|
439
|
+
|
440
|
+
if (row == ix_arr + workspace.end || curr_pos == end_col) break;
|
441
|
+
curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos, input_data.Xc_ind + end_col + 1, *(++row)) - input_data.Xc_ind;
|
442
|
+
}
|
443
|
+
|
444
|
+
else
|
445
|
+
{
|
446
|
+
if (input_data.Xc_ind[curr_pos] > *row)
|
447
|
+
row = std::lower_bound(row + 1, ix_arr + workspace.end + 1, input_data.Xc_ind[curr_pos]);
|
448
|
+
else
|
449
|
+
curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos + 1, input_data.Xc_ind + end_col + 1, *row) - input_data.Xc_ind;
|
450
|
+
}
|
451
|
+
}
|
452
|
+
|
453
|
+
imputer.num_sum[col] /= imputer.num_weight[col];
|
454
|
+
}
|
455
|
+
}
|
456
|
+
|
457
|
+
/* if any value is not possible to impute, look it up from the parent tree, but assign a lesser weight
|
458
|
+
Note: in theory, the parent node should always have some imputation value for every variable, but due to
|
459
|
+
numeric rounding errors, it might have a weight of zero, so in those cases it's looked up higher up the
|
460
|
+
tree instead. */
|
461
|
+
size_t look_aboves, curr_tree;
|
462
|
+
double min_imp_obs_dbl = (double) min_imp_obs;
|
463
|
+
if (imputer.num_sum.size())
|
464
|
+
{
|
465
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
466
|
+
{
|
467
|
+
if (imputer.num_weight[col] < min_imp_obs_dbl)
|
468
|
+
{
|
469
|
+
look_aboves = 1;
|
470
|
+
curr_tree = imputer.parent;
|
471
|
+
while (true)
|
472
|
+
{
|
473
|
+
if (!is_na_or_inf(imputer_tree[curr_tree].num_sum[col]))
|
474
|
+
{
|
475
|
+
imputer.num_sum[col] = imputer_tree[curr_tree].num_sum[col] / imputer_tree[curr_tree].num_weight[col];
|
476
|
+
imputer.num_weight[col] = wsum / (double)(2 * look_aboves);
|
477
|
+
break;
|
478
|
+
}
|
479
|
+
|
480
|
+
else if (curr_tree > 0)
|
481
|
+
{
|
482
|
+
curr_tree = imputer_tree[curr_tree].parent;
|
483
|
+
look_aboves++;
|
484
|
+
}
|
485
|
+
|
486
|
+
else /* will only happen if every single value is missing */
|
487
|
+
{
|
488
|
+
imputer.num_sum[col] = NAN;
|
489
|
+
imputer.num_weight[col] = 0;
|
490
|
+
break;
|
491
|
+
}
|
492
|
+
}
|
493
|
+
}
|
494
|
+
}
|
495
|
+
}
|
496
|
+
|
497
|
+
if (imputer.cat_sum.size())
|
498
|
+
{
|
499
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++)
|
500
|
+
{
|
501
|
+
if (imputer.cat_weight[col] >= min_imp_obs_dbl)
|
502
|
+
{
|
503
|
+
for (double &cat : imputer.cat_sum[col])
|
504
|
+
cat /= imputer.cat_weight[col];
|
505
|
+
}
|
506
|
+
|
507
|
+
else
|
508
|
+
{
|
509
|
+
look_aboves = 1;
|
510
|
+
curr_tree = imputer.parent;
|
511
|
+
while (true)
|
512
|
+
{
|
513
|
+
if (imputer_tree[curr_tree].cat_weight[col] > 0)
|
514
|
+
{
|
515
|
+
for (int cat = 0; cat < input_data.ncat[col]; cat++)
|
516
|
+
{
|
517
|
+
imputer.cat_sum[col][cat] += imputer_tree[curr_tree].cat_sum[col][cat] / imputer.cat_weight[col];
|
518
|
+
imputer.cat_weight[col] = wsum / (double)(2 * look_aboves);
|
519
|
+
}
|
520
|
+
break;
|
521
|
+
}
|
522
|
+
|
523
|
+
else if (curr_tree > 0)
|
524
|
+
{
|
525
|
+
curr_tree = imputer_tree[curr_tree].parent;
|
526
|
+
look_aboves++;
|
527
|
+
}
|
528
|
+
|
529
|
+
else /* will only happen if every single value is missing */
|
530
|
+
{
|
531
|
+
break;
|
532
|
+
}
|
533
|
+
}
|
534
|
+
imputer.cat_weight[col] = std::accumulate(imputer.cat_sum[col].begin(),
|
535
|
+
imputer.cat_sum[col].end(),
|
536
|
+
(double) 0);
|
537
|
+
}
|
538
|
+
}
|
539
|
+
}
|
540
|
+
|
541
|
+
/* re-adjust the weights according to parameters
|
542
|
+
(note that by this point, the weights are a sum) */
|
543
|
+
switch(model_params.weigh_imp_rows)
|
544
|
+
{
|
545
|
+
case Inverse:
|
546
|
+
{
|
547
|
+
double wsum_div = wsum * sqrt(wsum);
|
548
|
+
for (double &w : imputer.num_weight)
|
549
|
+
w /= wsum_div;
|
550
|
+
|
551
|
+
for (double &w : imputer.cat_weight)
|
552
|
+
w /= wsum_div;
|
553
|
+
break;
|
554
|
+
}
|
555
|
+
|
556
|
+
case Flat:
|
557
|
+
{
|
558
|
+
for (double &w : imputer.num_weight)
|
559
|
+
w /= wsum;
|
560
|
+
for (double &w : imputer.cat_weight)
|
561
|
+
w /= wsum;
|
562
|
+
break;
|
563
|
+
}
|
564
|
+
|
565
|
+
/* TODO: maybe divide by nrows for prop */
|
566
|
+
}
|
567
|
+
|
568
|
+
double curr_depth_dbl = (double) (curr_depth + 1);
|
569
|
+
switch(model_params.depth_imp)
|
570
|
+
{
|
571
|
+
case Lower:
|
572
|
+
{
|
573
|
+
for (double &w : imputer.num_weight)
|
574
|
+
w /= curr_depth_dbl;
|
575
|
+
for (double &w : imputer.cat_weight)
|
576
|
+
w /= curr_depth_dbl;
|
577
|
+
break;
|
578
|
+
}
|
579
|
+
|
580
|
+
case Higher:
|
581
|
+
{
|
582
|
+
for (double &w : imputer.num_weight)
|
583
|
+
w *= curr_depth_dbl;
|
584
|
+
for (double &w : imputer.cat_weight)
|
585
|
+
w *= curr_depth_dbl;
|
586
|
+
break;
|
587
|
+
}
|
588
|
+
}
|
589
|
+
|
590
|
+
/* now re-adjust sums */
|
591
|
+
if (model_params.weigh_imp_rows != Prop || model_params.depth_imp != Same)
|
592
|
+
{
|
593
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
594
|
+
imputer.num_sum[col] *= imputer.num_weight[col];
|
595
|
+
|
596
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++)
|
597
|
+
for (int cat = 0; cat < input_data.ncat[col]; cat++)
|
598
|
+
imputer.cat_sum[col][cat] *= imputer.cat_weight[col];
|
599
|
+
}
|
600
|
+
}
|
601
|
+
|
602
|
+
|
603
|
+
void shrink_impute_node(ImputeNode &imputer)
|
604
|
+
{
|
605
|
+
imputer.num_sum.clear();
|
606
|
+
imputer.num_weight.clear();
|
607
|
+
imputer.cat_sum.clear();
|
608
|
+
imputer.cat_weight.clear();
|
609
|
+
|
610
|
+
imputer.num_sum.shrink_to_fit();
|
611
|
+
imputer.num_weight.shrink_to_fit();
|
612
|
+
imputer.cat_sum.shrink_to_fit();
|
613
|
+
imputer.cat_weight.shrink_to_fit();
|
614
|
+
}
|
615
|
+
|
616
|
+
void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
617
|
+
std::vector<IsoTree> *trees,
|
618
|
+
std::vector<IsoHPlane> *hplanes)
|
619
|
+
{
|
620
|
+
if (trees != NULL)
|
621
|
+
{
|
622
|
+
for (size_t tr = 0; tr < trees->size(); tr++)
|
623
|
+
{
|
624
|
+
if ((*trees)[tr].score <= 0)
|
625
|
+
{
|
626
|
+
shrink_impute_node(imputer_tree[tr]);
|
627
|
+
}
|
628
|
+
|
629
|
+
else
|
630
|
+
{
|
631
|
+
/* cat_weight is not needed for anything else */
|
632
|
+
imputer_tree[tr].cat_weight.clear();
|
633
|
+
imputer_tree[tr].cat_weight.shrink_to_fit();
|
634
|
+
}
|
635
|
+
}
|
636
|
+
}
|
637
|
+
|
638
|
+
else
|
639
|
+
{
|
640
|
+
for (size_t tr = 0; tr < hplanes->size(); tr++)
|
641
|
+
{
|
642
|
+
if ((*hplanes)[tr].score <= 0)
|
643
|
+
{
|
644
|
+
shrink_impute_node(imputer_tree[tr]);
|
645
|
+
}
|
646
|
+
|
647
|
+
else
|
648
|
+
{
|
649
|
+
/* cat_weight is not needed for anything else */
|
650
|
+
imputer_tree[tr].cat_weight.clear();
|
651
|
+
imputer_tree[tr].cat_weight.shrink_to_fit();
|
652
|
+
}
|
653
|
+
}
|
654
|
+
}
|
655
|
+
|
656
|
+
imputer_tree.shrink_to_fit();
|
657
|
+
}
|
658
|
+
|
659
|
+
void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto)
|
660
|
+
{
|
661
|
+
size_t col;
|
662
|
+
for (size_t ix = 0; ix < imp_addfrom.n_missing_num; ix++)
|
663
|
+
{
|
664
|
+
imp_addto.num_sum[ix] += imp_addfrom.num_sum[ix];
|
665
|
+
imp_addto.num_weight[ix] += imp_addfrom.num_weight[ix];
|
666
|
+
}
|
667
|
+
|
668
|
+
for (size_t ix = 0; ix < imp_addfrom.n_missing_cat; ix++)
|
669
|
+
{
|
670
|
+
col = imp_addfrom.missing_cat[ix];
|
671
|
+
for (size_t cat = 0; cat < imp_addto.cat_sum[col].size(); cat++)
|
672
|
+
{
|
673
|
+
imp_addto.cat_sum[col][cat] += imp_addfrom.cat_sum[col][cat];
|
674
|
+
}
|
675
|
+
}
|
676
|
+
|
677
|
+
for (size_t ix = 0; ix < imp_addfrom.n_missing_sp; ix++)
|
678
|
+
{
|
679
|
+
imp_addto.sp_num_sum[ix] += imp_addfrom.sp_num_sum[ix];
|
680
|
+
imp_addto.sp_num_weight[ix] += imp_addfrom.sp_num_weight[ix];
|
681
|
+
}
|
682
|
+
}
|
683
|
+
|
684
|
+
void combine_tree_imputations(WorkerMemory &workspace,
|
685
|
+
std::vector<ImputedData> &impute_vec,
|
686
|
+
std::unordered_map<size_t, ImputedData> &impute_map,
|
687
|
+
std::vector<char> &has_missing,
|
688
|
+
int nthreads)
|
689
|
+
{
|
690
|
+
if (workspace.impute_vec.size())
|
691
|
+
{
|
692
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_vec)
|
693
|
+
for (size_t_for row = 0; row < has_missing.size(); row++)
|
694
|
+
if (has_missing[row])
|
695
|
+
combine_imp_single(workspace.impute_vec[row], impute_vec[row]);
|
696
|
+
}
|
697
|
+
|
698
|
+
else if (workspace.impute_map.size())
|
699
|
+
{
|
700
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_map)
|
701
|
+
for (size_t_for row = 0; row < has_missing.size(); row++)
|
702
|
+
if (has_missing[row])
|
703
|
+
combine_imp_single(workspace.impute_map[row], impute_map[row]);
|
704
|
+
}
|
705
|
+
}
|
706
|
+
|
707
|
+
|
708
|
+
void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w)
|
709
|
+
{
|
710
|
+
size_t col;
|
711
|
+
for (size_t ix = 0; ix < imputed_data.n_missing_num; ix++)
|
712
|
+
{
|
713
|
+
col = imputed_data.missing_num[ix];
|
714
|
+
imputed_data.num_sum[ix] += (!is_na_or_inf(imputer.num_sum[col]))? (w * imputer.num_sum[col]) : 0;
|
715
|
+
imputed_data.num_weight[ix] += w * imputer.num_weight[ix];
|
716
|
+
}
|
717
|
+
|
718
|
+
for (size_t ix = 0; ix < imputed_data.n_missing_sp; ix++)
|
719
|
+
{
|
720
|
+
col = imputed_data.missing_sp[ix];
|
721
|
+
imputed_data.sp_num_sum[ix] += (!is_na_or_inf(imputer.num_sum[col]))? (w * imputer.num_sum[col]) : 0;
|
722
|
+
imputed_data.sp_num_weight[ix] += w * imputer.num_weight[ix];
|
723
|
+
}
|
724
|
+
|
725
|
+
for (size_t ix = 0; ix < imputed_data.n_missing_cat; ix++)
|
726
|
+
{
|
727
|
+
col = imputed_data.missing_cat[ix];
|
728
|
+
for (size_t cat = 0; cat < imputer.cat_sum[col].size(); cat++)
|
729
|
+
imputed_data.cat_sum[col][cat] += w * imputer.cat_sum[col][cat];
|
730
|
+
}
|
731
|
+
}
|
732
|
+
|
733
|
+
|
734
|
+
void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data)
|
735
|
+
{
|
736
|
+
if (workspace.impute_vec.size())
|
737
|
+
{
|
738
|
+
if (!workspace.weights_arr.size() && !workspace.weights_map.size())
|
739
|
+
{
|
740
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
741
|
+
if (input_data.has_missing[workspace.ix_arr[row]])
|
742
|
+
add_from_impute_node(imputer,
|
743
|
+
workspace.impute_vec[workspace.ix_arr[row]],
|
744
|
+
(double)1);
|
745
|
+
}
|
746
|
+
|
747
|
+
else if (workspace.weights_arr.size())
|
748
|
+
{
|
749
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
750
|
+
if (input_data.has_missing[workspace.ix_arr[row]])
|
751
|
+
add_from_impute_node(imputer,
|
752
|
+
workspace.impute_vec[workspace.ix_arr[row]],
|
753
|
+
workspace.weights_arr[workspace.ix_arr[row]]);
|
754
|
+
}
|
755
|
+
|
756
|
+
else
|
757
|
+
{
|
758
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
759
|
+
if (input_data.has_missing[workspace.ix_arr[row]])
|
760
|
+
add_from_impute_node(imputer,
|
761
|
+
workspace.impute_vec[workspace.ix_arr[row]],
|
762
|
+
workspace.weights_map[workspace.ix_arr[row]]);
|
763
|
+
}
|
764
|
+
}
|
765
|
+
|
766
|
+
else if (workspace.impute_map.size())
|
767
|
+
{
|
768
|
+
if (!workspace.weights_arr.size() && !workspace.weights_map.size())
|
769
|
+
{
|
770
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
771
|
+
if (input_data.has_missing[workspace.ix_arr[row]])
|
772
|
+
add_from_impute_node(imputer,
|
773
|
+
workspace.impute_map[workspace.ix_arr[row]],
|
774
|
+
(double)1);
|
775
|
+
}
|
776
|
+
|
777
|
+
else if (workspace.weights_arr.size())
|
778
|
+
{
|
779
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
780
|
+
if (input_data.has_missing[workspace.ix_arr[row]])
|
781
|
+
add_from_impute_node(imputer,
|
782
|
+
workspace.impute_map[workspace.ix_arr[row]],
|
783
|
+
workspace.weights_arr[workspace.ix_arr[row]]);
|
784
|
+
}
|
785
|
+
|
786
|
+
else
|
787
|
+
{
|
788
|
+
for (size_t row = workspace.st; row <= workspace.end; row++)
|
789
|
+
if (input_data.has_missing[workspace.ix_arr[row]])
|
790
|
+
add_from_impute_node(imputer,
|
791
|
+
workspace.impute_map[workspace.ix_arr[row]],
|
792
|
+
workspace.weights_map[workspace.ix_arr[row]]);
|
793
|
+
}
|
794
|
+
}
|
795
|
+
}
|
796
|
+
|
797
|
+
template <class imp_arr>
|
798
|
+
void apply_imputation_results(imp_arr &impute_vec,
|
799
|
+
Imputer &imputer,
|
800
|
+
InputData &input_data,
|
801
|
+
int nthreads)
|
802
|
+
{
|
803
|
+
size_t col;
|
804
|
+
|
805
|
+
if (input_data.Xc != NULL)
|
806
|
+
{
|
807
|
+
std::vector<size_t> row_pos(input_data.nrows, 0);
|
808
|
+
size_t row;
|
809
|
+
|
810
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
811
|
+
{
|
812
|
+
for (sparse_ix ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
813
|
+
{
|
814
|
+
if (is_na_or_inf(input_data.Xc[ix]))
|
815
|
+
{
|
816
|
+
row = input_data.Xc_ind[ix];
|
817
|
+
if (impute_vec[row].sp_num_weight[row_pos[row]] > 0 && !is_na_or_inf(impute_vec[row].sp_num_sum[row_pos[row]]))
|
818
|
+
input_data.Xc[ix]
|
819
|
+
=
|
820
|
+
impute_vec[row].sp_num_sum[row_pos[row]]
|
821
|
+
/
|
822
|
+
impute_vec[row].sp_num_weight[row_pos[row]];
|
823
|
+
else
|
824
|
+
input_data.Xc[ix]
|
825
|
+
=
|
826
|
+
imputer.col_means[col];
|
827
|
+
|
828
|
+
row_pos[row]++;
|
829
|
+
}
|
830
|
+
}
|
831
|
+
}
|
832
|
+
}
|
833
|
+
|
834
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(input_data, impute_vec, imputer) private(col)
|
835
|
+
for (size_t_for row = 0; row < input_data.nrows; row++)
|
836
|
+
{
|
837
|
+
if (input_data.has_missing[row])
|
838
|
+
{
|
839
|
+
for (size_t ix = 0; ix < impute_vec[row].n_missing_num; ix++)
|
840
|
+
{
|
841
|
+
col = impute_vec[row].missing_num[ix];
|
842
|
+
if (impute_vec[row].num_weight[ix] > 0 && !is_na_or_inf(impute_vec[row].num_sum[ix]))
|
843
|
+
input_data.numeric_data[row + col * input_data.nrows]
|
844
|
+
=
|
845
|
+
impute_vec[row].num_sum[ix] / impute_vec[row].num_weight[ix];
|
846
|
+
else
|
847
|
+
input_data.numeric_data[row + col * input_data.nrows]
|
848
|
+
=
|
849
|
+
imputer.col_means[col];
|
850
|
+
}
|
851
|
+
|
852
|
+
for (size_t ix = 0; ix < impute_vec[row].n_missing_cat; ix++)
|
853
|
+
{
|
854
|
+
col = impute_vec[row].missing_cat[ix];
|
855
|
+
input_data.categ_data[row + col * input_data.nrows]
|
856
|
+
=
|
857
|
+
std::distance(impute_vec[row].cat_sum[col].begin(),
|
858
|
+
std::max_element(impute_vec[row].cat_sum[col].begin(),
|
859
|
+
impute_vec[row].cat_sum[col].end()));
|
860
|
+
|
861
|
+
if (input_data.categ_data[row + col * input_data.nrows] == 0 && impute_vec[row].cat_sum[col][0] <= 0)
|
862
|
+
input_data.categ_data[row + col * input_data.nrows]
|
863
|
+
=
|
864
|
+
imputer.col_modes[col];
|
865
|
+
}
|
866
|
+
}
|
867
|
+
}
|
868
|
+
}
|
869
|
+
|
870
|
+
void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
871
|
+
std::unordered_map<size_t, ImputedData> &impute_map,
|
872
|
+
Imputer &imputer,
|
873
|
+
InputData &input_data,
|
874
|
+
int nthreads)
|
875
|
+
{
|
876
|
+
if (impute_vec.size())
|
877
|
+
apply_imputation_results(impute_vec, imputer, input_data, nthreads);
|
878
|
+
else if (impute_map.size())
|
879
|
+
apply_imputation_results(impute_map, imputer, input_data, nthreads);
|
880
|
+
}
|
881
|
+
|
882
|
+
|
883
|
+
void apply_imputation_results(PredictionData &prediction_data,
|
884
|
+
ImputedData &imp,
|
885
|
+
Imputer &imputer,
|
886
|
+
size_t row)
|
887
|
+
{
|
888
|
+
size_t col;
|
889
|
+
size_t pos = 0;
|
890
|
+
for (size_t ix = 0; ix < imp.n_missing_num; ix++)
|
891
|
+
{
|
892
|
+
col = imp.missing_num[ix];
|
893
|
+
if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
|
894
|
+
prediction_data.numeric_data[row + col * prediction_data.nrows]
|
895
|
+
=
|
896
|
+
imp.num_sum[ix] / imp.num_weight[ix];
|
897
|
+
else
|
898
|
+
prediction_data.numeric_data[row + col * prediction_data.nrows]
|
899
|
+
=
|
900
|
+
imputer.col_means[col];
|
901
|
+
}
|
902
|
+
|
903
|
+
if (prediction_data.Xr != NULL)
|
904
|
+
for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
905
|
+
{
|
906
|
+
if (is_na_or_inf(prediction_data.Xr[ix]))
|
907
|
+
{
|
908
|
+
if (imp.sp_num_weight[pos] > 0 && !is_na_or_inf(imp.sp_num_sum[pos]))
|
909
|
+
prediction_data.Xr[ix]
|
910
|
+
=
|
911
|
+
imp.sp_num_sum[pos] / imp.sp_num_weight[pos];
|
912
|
+
else
|
913
|
+
prediction_data.Xr[ix]
|
914
|
+
=
|
915
|
+
imputer.col_means[imp.missing_sp[pos]];
|
916
|
+
pos++;
|
917
|
+
}
|
918
|
+
}
|
919
|
+
|
920
|
+
for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
|
921
|
+
{
|
922
|
+
col = imp.missing_cat[ix];
|
923
|
+
prediction_data.categ_data[row + col * prediction_data.nrows]
|
924
|
+
=
|
925
|
+
std::distance(imp.cat_sum[col].begin(),
|
926
|
+
std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
|
927
|
+
|
928
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0 && imp.cat_sum[col][0] <= 0)
|
929
|
+
prediction_data.categ_data[row + col * prediction_data.nrows]
|
930
|
+
=
|
931
|
+
imputer.col_modes[col];
|
932
|
+
}
|
933
|
+
}
|
934
|
+
|
935
|
+
|
936
|
+
void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
937
|
+
{
|
938
|
+
imp.n_missing_num = 0;
|
939
|
+
imp.n_missing_cat = 0;
|
940
|
+
imp.n_missing_sp = 0;
|
941
|
+
|
942
|
+
if (input_data.numeric_data != NULL)
|
943
|
+
{
|
944
|
+
imp.missing_num.resize(input_data.ncols_numeric);
|
945
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
946
|
+
if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
|
947
|
+
imp.missing_num[imp.n_missing_num++] = col;
|
948
|
+
imp.missing_num.resize(imp.n_missing_num);
|
949
|
+
imp.num_sum.assign(imp.n_missing_num, 0);
|
950
|
+
imp.num_weight.assign(imp.n_missing_num, 0);
|
951
|
+
}
|
952
|
+
|
953
|
+
else if (input_data.Xc != NULL)
|
954
|
+
{
|
955
|
+
imp.missing_sp.resize(input_data.ncols_numeric);
|
956
|
+
sparse_ix *res;
|
957
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
958
|
+
{
|
959
|
+
res = std::lower_bound(input_data.Xc_ind + input_data.Xc_indptr[col],
|
960
|
+
input_data.Xc_ind + input_data.Xc_indptr[col + 1],
|
961
|
+
(sparse_ix) row);
|
962
|
+
if (
|
963
|
+
res != input_data.Xc_ind + input_data.Xc_indptr[col + 1] &&
|
964
|
+
*res == row &&
|
965
|
+
is_na_or_inf(input_data.Xc[res - input_data.Xc_ind])
|
966
|
+
)
|
967
|
+
{
|
968
|
+
imp.missing_sp[imp.n_missing_sp++] = col;
|
969
|
+
}
|
970
|
+
}
|
971
|
+
imp.sp_num_sum.assign(imp.n_missing_sp, 0);
|
972
|
+
imp.sp_num_weight.assign(imp.n_missing_sp, 0);
|
973
|
+
}
|
974
|
+
|
975
|
+
if (input_data.categ_data != NULL)
|
976
|
+
{
|
977
|
+
imp.missing_cat.resize(input_data.ncols_categ);
|
978
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++)
|
979
|
+
if (input_data.categ_data[row + col * input_data.nrows] < 0)
|
980
|
+
imp.missing_cat[imp.n_missing_cat++] = col;
|
981
|
+
imp.missing_cat.resize(imp.n_missing_cat);
|
982
|
+
imp.cat_weight.assign(imp.n_missing_cat, 0);
|
983
|
+
imp.cat_sum.resize(input_data.ncols_categ);
|
984
|
+
for (size_t cat = 0; cat < imp.n_missing_cat; cat++)
|
985
|
+
imp.cat_sum[imp.missing_cat[cat]].assign(input_data.ncat[imp.missing_cat[cat]], 0);
|
986
|
+
}
|
987
|
+
}
|
988
|
+
|
989
|
+
void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row)
{
    imp.n_missing_num = 0;
    imp.n_missing_cat = 0;
    imp.n_missing_sp  = 0;

    if (prediction_data.numeric_data != NULL)
    {
        if (!imp.missing_num.size())
            imp.missing_num.resize(imputer.ncols_numeric);
        for (size_t col = 0; col < imputer.ncols_numeric; col++)
            if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
                imp.missing_num[imp.n_missing_num++] = col;

        if (!imp.num_sum.size())
        {
            imp.num_sum.resize(imputer.ncols_numeric, 0);
            imp.num_weight.resize(imputer.ncols_numeric, 0);
        }

        else
        {
            std::fill(imp.num_sum.begin(), imp.num_sum.begin() + imp.n_missing_num, 0);
            std::fill(imp.num_weight.begin(), imp.num_weight.begin() + imp.n_missing_num, 0);
        }
    }

    else if (prediction_data.Xr != NULL)
    {
        if (!imp.missing_sp.size())
            imp.missing_sp.resize(imputer.ncols_numeric);
        for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
            if (is_na_or_inf(prediction_data.Xr[ix]))
                imp.missing_sp[imp.n_missing_sp++] = prediction_data.Xr_ind[ix];

        if (!imp.sp_num_sum.size())
        {
            imp.sp_num_sum.resize(imputer.ncols_numeric, 0);
            imp.sp_num_weight.resize(imputer.ncols_numeric, 0);
        }

        else
        {
            std::fill(imp.sp_num_sum.begin(), imp.sp_num_sum.begin() + imp.n_missing_sp, 0);
            std::fill(imp.sp_num_weight.begin(), imp.sp_num_weight.begin() + imp.n_missing_sp, 0);
        }
    }

    if (prediction_data.categ_data != NULL)
    {
        if (!imp.missing_cat.size())
            imp.missing_cat.resize(imputer.ncols_categ);
        for (size_t col = 0; col < imputer.ncols_categ; col++)
        {
            if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
                imp.missing_cat[imp.n_missing_cat++] = col;
        }

        if (!imp.cat_weight.size())
        {
            imp.cat_weight.resize(imputer.ncols_categ, 0);
            imp.cat_sum.resize(imputer.ncols_categ);
            for (size_t col = 0; col < imputer.ncols_categ; col++)
                imp.cat_sum[col].resize(imputer.ncat[col], 0);
        }

        else
        {
            std::fill(imp.cat_weight.begin(), imp.cat_weight.begin() + imp.n_missing_cat, 0);
            for (size_t col = 0; col < imp.n_missing_cat; col++)
                std::fill(imp.cat_sum[imp.missing_cat[col]].begin(),
                          imp.cat_sum[imp.missing_cat[col]].end(),
                          0);
        }
    }
}

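/* Unlike the fit-time overload, this prediction-time overload is written to
 * be called once per row on a recycled ImputedData object: buffers are sized
 * on first use, and on later rows only the prefix the current row will
 * actually use (of length n_missing_*) is re-zeroed with std::fill. The
 * accumulators are positional (slot i belongs to the i-th missing column of
 * the row), so stale values past that prefix are never read, and the per-row
 * cost stays proportional to the number of missing values rather than the
 * number of columns. A minimal sketch of that recycling pattern, with a
 * hypothetical ScratchSums type: */
#include <vector>
#include <algorithm>
#include <cstddef>

struct ScratchSums {
    std::vector<double> sums;

    /* n_needed = number of accumulator slots the current row will use */
    void reset_for_row(size_t ncols, size_t n_needed)
    {
        if (sums.empty())
            sums.resize(ncols, 0.);   /* first row: allocate once, zero-filled */
        else                          /* later rows: zero only the slots about to be used */
            std::fill(sums.begin(), sums.begin() + n_needed, 0.);
    }
};
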
ImputedData::ImputedData(InputData &input_data, size_t row)
{
    initialize_impute_calc(*this, input_data, row);
}

void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads)
{
    impute_vec.resize(input_data.nrows);
    #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(impute_vec, input_data)
    for (size_t_for row = 0; row < input_data.nrows; row++)
        if (input_data.has_missing[row])
            initialize_impute_calc(impute_vec[row], input_data, row);
}

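/* size_t_for in the OpenMP loops here and below is presumably a typedef that
 * degrades to a signed integer on toolchains whose OpenMP support (notably
 * MSVC's OpenMP 2.0) rejects unsigned loop counters in 'parallel for'. A
 * sketch of such a guard, under that assumption (the typedef name is
 * illustrative): */
#include <cstddef>

#if defined(_MSC_VER)
typedef long long omp_loop_ix;   /* signed, for MSVC's OpenMP 2.0 loops */
#else
typedef size_t omp_loop_ix;      /* elsewhere, plain size_t works */
#endif
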
void allocate_imp_map(std::unordered_map<size_t, ImputedData> &impute_map, InputData &input_data)
{
    for (size_t row = 0; row < input_data.nrows; row++)
        if (input_data.has_missing[row])
            impute_map[row] = ImputedData(input_data, row);
}

void allocate_imp(InputData &input_data,
                  std::vector<ImputedData> &impute_vec,
                  std::unordered_map<size_t, ImputedData> &impute_map,
                  int nthreads)
{
    if (input_data.n_missing == 0)
        return;
    else if (input_data.n_missing <= input_data.nrows / (nthreads * 10))
        allocate_imp_map(impute_map, input_data);
    else
        allocate_imp_vec(impute_vec, input_data, nthreads);
}

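/* allocate_imp above picks the accumulator container by missing-row density:
 * when at most nrows / (10 * nthreads) rows have missing values, a hash map
 * keyed by row number avoids allocating an ImputedData slot for every row,
 * while denser missingness favors the vector (contiguous storage, and each
 * thread of the parallel loop writes its own row slot without coordination).
 * The same threshold rule as a toy function (names are illustrative): */
#include <cstddef>

enum class ImpStorage { None, Map, Vec };

static ImpStorage choose_storage(size_t n_missing, size_t nrows, int nthreads)
{
    if (n_missing == 0)                                 return ImpStorage::None;
    if (n_missing <= nrows / ((size_t) nthreads * 10))  return ImpStorage::Map;
    return ImpStorage::Vec;
}
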
void check_for_missing(InputData &input_data,
                       std::vector<ImputedData> &impute_vec,
                       std::unordered_map<size_t, ImputedData> &impute_map,
                       int nthreads)
{
    input_data.has_missing.assign(input_data.nrows, false);

    if (input_data.Xc != NULL)
    {
        for (size_t col = 0; col < input_data.ncols_numeric; col++)
            #pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
            for (size_t_for ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
                if (is_na_or_inf(input_data.Xc[ix]))
                    input_data.has_missing[input_data.Xc_ind[ix]] = true;
        #pragma omp barrier
    }

    if (input_data.numeric_data != NULL || input_data.categ_data != NULL)
    {
        #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data)
        for (size_t_for row = 0; row < input_data.nrows; row++)
        {
            for (size_t col = 0; col < input_data.ncols_numeric; col++)
            {
                if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
                {
                    input_data.has_missing[row] = true;
                    break;
                }
            }

            if (!input_data.has_missing[row])
                for (size_t col = 0; col < input_data.ncols_categ; col++)
                {
                    if (input_data.categ_data[row + col * input_data.nrows] < 0)
                    {
                        input_data.has_missing[row] = true;
                        break;
                    }
                }
        }
    }

    input_data.n_missing = std::accumulate(input_data.has_missing.begin(), input_data.has_missing.end(), (size_t)0);
    allocate_imp(input_data, impute_vec, impute_map, nthreads);
}

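/* check_for_missing above flags each row containing any NA/Inf (CSC columns
 * first, then dense numeric and categorical columns, breaking out of a row as
 * soon as one missing value is found), then counts the flags by summing them
 * with std::accumulate. The (size_t)0 seed matters: it fixes the accumulator
 * type, avoiding the overflow a narrower type could hit on very tall inputs.
 * The counting idiom in isolation (function name is illustrative): */
#include <vector>
#include <numeric>
#include <cstddef>

static size_t count_flagged(const std::vector<char> &has_missing)
{
    /* sums 0/1 char flags into a size_t accumulator */
    return std::accumulate(has_missing.begin(), has_missing.end(), (size_t) 0);
}
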
size_t check_for_missing(PredictionData &prediction_data,
                         Imputer &imputer,
                         size_t ix_arr[],
                         int nthreads)
{
    std::vector<char> has_missing(prediction_data.nrows, false);

    #pragma omp parallel for schedule(static) num_threads(nthreads) shared(has_missing, prediction_data, imputer)
    for (size_t_for row = 0; row < prediction_data.nrows; row++)
    {
        if (prediction_data.numeric_data != NULL)
            for (size_t col = 0; col < imputer.ncols_numeric; col++)
            {
                if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
                {
                    has_missing[row] = true;
                    break;
                }
            }
        else if (prediction_data.Xr != NULL)
            for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
            {
                if (is_na_or_inf(prediction_data.Xr[ix]))
                {
                    has_missing[row] = true;
                    break;
                }
            }

        if (!has_missing[row])
            for (size_t col = 0; col < imputer.ncols_categ; col++)
            {
                if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
                {
                    has_missing[row] = true;
                    break;
                }
            }
    }

    size_t st = 0;
    size_t temp;
    for (size_t row = 0; row < prediction_data.nrows; row++)
    {
        if (has_missing[row])
        {
            temp = ix_arr[st];
            ix_arr[st] = ix_arr[row];
            ix_arr[row] = temp;
            st++;
        }
    }

    if (st == 0)
        return 0;

    return st;
}
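/* The epilogue of the function above is an in-place partition: assuming
 * ix_arr initially holds the row numbers 0..nrows-1 in order, the indices of
 * rows that need imputation are swapped to the front and their count st is
 * returned, so the caller can run the (more expensive) imputation pass over
 * ix_arr[0..st) only. The same effect can be obtained with std::partition,
 * sketched here with illustrative names: */
#include <algorithm>
#include <vector>
#include <cstddef>

static size_t front_load_missing(std::vector<size_t> &ix_arr,
                                 const std::vector<char> &has_missing)
{
    std::vector<size_t>::iterator mid =
        std::partition(ix_arr.begin(), ix_arr.end(),
                       [&has_missing](size_t row) { return has_missing[row] != 0; });
    return (size_t) std::distance(ix_arr.begin(), mid);
}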