isotree 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -1
- data/LICENSE.txt +2 -2
- data/README.md +41 -23
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/dataset.rb +0 -1
- data/lib/isotree/isolation_forest.rb +114 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +40 -106
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
@@ -18,11 +18,29 @@
|
|
18
18
|
* [5] https://sourceforge.net/projects/iforest/
|
19
19
|
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
20
20
|
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
21
|
-
* [8] Cortes, David.
|
22
|
-
*
|
21
|
+
* [8] Cortes, David.
|
22
|
+
* "Distance approximation using Isolation Forests."
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
24
|
+
* [9] Cortes, David.
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
28
|
+
* [11] Cortes, David.
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
31
|
+
* [12] Guha, Sudipto, et al.
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
34
|
+
* [13] Cortes, David.
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
23
41
|
*
|
24
42
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
26
44
|
* All rights reserved.
|
27
45
|
* Redistribution and use in source and binary forms, with or without
|
28
46
|
* modification, are permitted provided that the following conditions are met:
|
@@ -52,18 +70,18 @@
|
|
52
70
|
* Parameters
|
53
71
|
* ==========
|
54
72
|
* - numeric_data[nrows * ncols_numeric] (in, out)
|
55
|
-
* Pointer to numeric data in which missing values will be imputed.
|
56
|
-
*
|
57
|
-
*
|
73
|
+
* Pointer to numeric data in which missing values will be imputed. May be ordered by rows
|
74
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
75
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
76
|
+
* (see parameter 'is_col_major').
|
58
77
|
* Pass NULL if there are no dense numeric columns.
|
59
78
|
* Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
60
79
|
* Imputations will overwrite values in this same array.
|
61
|
-
* - ncols_numeric
|
62
|
-
* Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
|
63
80
|
* - categ_data[nrows * ncols_categ]
|
64
|
-
* Pointer to categorical data in which missing values will be imputed.
|
65
|
-
*
|
66
|
-
*
|
81
|
+
* Pointer to categorical data in which missing values will be imputed. May be ordered by rows
|
82
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
83
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
84
|
+
* (see parameter 'is_col_major').
|
67
85
|
* Pass NULL if there are no categorical columns.
|
68
86
|
* Each category should be represented as an integer, and these integers must start at zero and
|
69
87
|
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
@@ -71,6 +89,11 @@
|
|
71
89
|
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
72
90
|
* must be the same as was used in the data to which the model was fit.
|
73
91
|
* Imputations will overwrite values in this same array.
|
92
|
+
* - is_col_major
|
93
|
+
* Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
|
94
|
+
* model was fit. If passing 'false', will assume they are in row-major order. Note that most of
|
95
|
+
* the functions in this library work only with column-major order, but here both are suitable
|
96
|
+
* and row-major is preferred. Both arrays must have the same orientation (row/column major).
|
74
97
|
* - ncols_categ
|
75
98
|
* Number of categorical columns in the data.
|
76
99
|
* - ncat[ncols_categ]
|
@@ -84,6 +107,7 @@
|
|
84
107
|
* Imputations will overwrite values in this same array.
|
85
108
|
* - Xr_ind[nnz]
|
86
109
|
* Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
|
110
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
87
111
|
* Pass NULL if there are no sparse numeric columns in CSR format.
|
88
112
|
* - Xr_indptr[nrows + 1]
|
89
113
|
* Pointer to row index pointers that tell at entry [row] where does row 'row'
|
@@ -91,6 +115,11 @@
|
|
91
115
|
* Pass NULL if there are no sparse numeric columns in CSR format.
|
92
116
|
* - nrows
|
93
117
|
* Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
|
118
|
+
* - use_long_double
|
119
|
+
* Whether to use 'long double' (extended precision) type for the calculations. This makes them
|
120
|
+
* more accurate (provided that the compiler used has wider long doubles than doubles), but
|
121
|
+
* slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
|
122
|
+
* Power8 platforms).
|
94
123
|
* - nthreads
|
95
124
|
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
96
125
|
* allocated, even if the thread does not end up being used. Ignored when not building with
|
@@ -107,13 +136,50 @@
|
|
107
136
|
* Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
|
108
137
|
* as produced from function 'fit_iforest',
|
109
138
|
*/
|
110
|
-
|
111
|
-
|
139
|
+
template <class real_t, class sparse_ix>
|
140
|
+
void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
|
141
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
142
|
+
size_t nrows, bool use_long_double, int nthreads,
|
143
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
144
|
+
Imputer &imputer)
|
145
|
+
{
|
146
|
+
if (use_long_double && !has_long_double()) {
|
147
|
+
use_long_double = false;
|
148
|
+
fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
|
149
|
+
}
|
150
|
+
#ifndef NO_LONG_DOUBLE
|
151
|
+
if (likely(!use_long_double))
|
152
|
+
#endif
|
153
|
+
impute_missing_values_internal<real_t, sparse_ix, double>(
|
154
|
+
numeric_data, categ_data, is_col_major,
|
155
|
+
Xr, Xr_ind, Xr_indptr,
|
156
|
+
nrows, nthreads,
|
157
|
+
model_outputs, model_outputs_ext,
|
158
|
+
imputer
|
159
|
+
);
|
160
|
+
#ifndef NO_LONG_DOUBLE
|
161
|
+
else
|
162
|
+
impute_missing_values_internal<real_t, sparse_ix, long double>(
|
163
|
+
numeric_data, categ_data, is_col_major,
|
164
|
+
Xr, Xr_ind, Xr_indptr,
|
165
|
+
nrows, nthreads,
|
166
|
+
model_outputs, model_outputs_ext,
|
167
|
+
imputer
|
168
|
+
);
|
169
|
+
#endif
|
170
|
+
}
|
171
|
+
|
172
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
173
|
+
void impute_missing_values_internal(
|
174
|
+
real_t numeric_data[], int categ_data[], bool is_col_major,
|
175
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
112
176
|
size_t nrows, int nthreads,
|
113
177
|
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
114
178
|
Imputer &imputer)
|
115
179
|
{
|
116
|
-
PredictionData
|
180
|
+
PredictionData<real_t, sparse_ix>
|
181
|
+
prediction_data = {numeric_data, categ_data, nrows,
|
182
|
+
is_col_major, imputer.ncols_numeric, imputer.ncols_categ,
|
117
183
|
NULL, NULL, NULL,
|
118
184
|
Xr, Xr_ind, Xr_indptr};
|
119
185
|
|
@@ -128,34 +194,53 @@ void impute_missing_values(double numeric_data[], int categ_data[],
|
|
128
194
|
if ((size_t)nthreads > end)
|
129
195
|
nthreads = (int)end;
|
130
196
|
#ifdef _OPENMP
|
131
|
-
std::vector<ImputedData
|
197
|
+
std::vector<ImputedData<sparse_ix, ldouble_safe>> imp_memory(nthreads);
|
132
198
|
#else
|
133
|
-
std::vector<ImputedData
|
199
|
+
std::vector<ImputedData<sparse_ix, ldouble_safe>> imp_memory(1);
|
134
200
|
#endif
|
135
201
|
|
202
|
+
bool threw_exception = false;
|
203
|
+
std::exception_ptr ex = NULL;
|
136
204
|
|
137
205
|
if (model_outputs != NULL)
|
138
206
|
{
|
139
207
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
140
|
-
shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer)
|
141
|
-
for (size_t_for row = 0; row < end; row++)
|
208
|
+
shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer, ex, threw_exception)
|
209
|
+
for (size_t_for row = 0; row < (decltype(row))end; row++)
|
142
210
|
{
|
143
|
-
|
144
|
-
|
145
|
-
for (std::vector<IsoTree> &tree : model_outputs->trees)
|
211
|
+
if (threw_exception) continue;
|
212
|
+
try
|
146
213
|
{
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
214
|
+
initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
|
215
|
+
|
216
|
+
for (std::vector<IsoTree> &tree : model_outputs->trees)
|
217
|
+
{
|
218
|
+
traverse_itree(tree,
|
219
|
+
*model_outputs,
|
220
|
+
prediction_data,
|
221
|
+
&imputer.imputer_tree[&tree - &(model_outputs->trees[0])],
|
222
|
+
&imp_memory[omp_get_thread_num()],
|
223
|
+
(double) 1,
|
224
|
+
ix_arr[row],
|
225
|
+
(sparse_ix*)NULL,
|
226
|
+
(double*)NULL,
|
227
|
+
(size_t) 0);
|
228
|
+
}
|
229
|
+
|
230
|
+
apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
|
156
231
|
}
|
157
232
|
|
158
|
-
|
233
|
+
catch(...)
|
234
|
+
{
|
235
|
+
#pragma omp critical
|
236
|
+
{
|
237
|
+
if (!threw_exception)
|
238
|
+
{
|
239
|
+
threw_exception = true;
|
240
|
+
ex = std::current_exception();
|
241
|
+
}
|
242
|
+
}
|
243
|
+
}
|
159
244
|
|
160
245
|
}
|
161
246
|
}
|
@@ -164,31 +249,51 @@ void impute_missing_values(double numeric_data[], int categ_data[],
|
|
164
249
|
{
|
165
250
|
double temp;
|
166
251
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
167
|
-
shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer) \
|
252
|
+
shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer, ex, threw_exception) \
|
168
253
|
private(temp)
|
169
|
-
for (size_t_for row = 0; row < end; row++)
|
254
|
+
for (size_t_for row = 0; row < (decltype(row))end; row++)
|
170
255
|
{
|
171
|
-
|
172
|
-
|
173
|
-
for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
|
256
|
+
if (threw_exception) continue;
|
257
|
+
try
|
174
258
|
{
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
259
|
+
initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
|
260
|
+
|
261
|
+
for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
|
262
|
+
{
|
263
|
+
traverse_hplane(hplane,
|
264
|
+
*model_outputs_ext,
|
265
|
+
prediction_data,
|
266
|
+
temp,
|
267
|
+
&imputer.imputer_tree[&hplane - &(model_outputs_ext->hplanes[0])],
|
268
|
+
&imp_memory[omp_get_thread_num()],
|
269
|
+
(sparse_ix*)NULL,
|
270
|
+
(double*)NULL,
|
271
|
+
ix_arr[row]);
|
272
|
+
}
|
273
|
+
|
274
|
+
apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
|
183
275
|
}
|
184
276
|
|
185
|
-
|
277
|
+
catch (...)
|
278
|
+
{
|
279
|
+
#pragma omp critical
|
280
|
+
{
|
281
|
+
if (!threw_exception)
|
282
|
+
{
|
283
|
+
threw_exception = true;
|
284
|
+
ex = std::current_exception();
|
285
|
+
}
|
286
|
+
}
|
287
|
+
}
|
186
288
|
|
187
289
|
}
|
188
290
|
}
|
189
291
|
|
292
|
+
if (threw_exception)
|
293
|
+
std::rethrow_exception(ex);
|
190
294
|
}
|
191
295
|
|
296
|
+
template <class InputData, class ldouble_safe>
|
192
297
|
void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads)
|
193
298
|
{
|
194
299
|
imputer.ncols_numeric = input_data.ncols_numeric;
|
@@ -212,7 +317,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
212
317
|
if (input_data.numeric_data != NULL)
|
213
318
|
{
|
214
319
|
#pragma omp parallel for schedule(static) num_threads(nthreads) private(cnt, offset) shared(input_data, imputer)
|
215
|
-
for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
|
320
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_numeric; col++)
|
216
321
|
{
|
217
322
|
cnt = input_data.nrows;
|
218
323
|
offset = col * input_data.nrows;
|
@@ -222,23 +327,23 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
222
327
|
input_data.numeric_data[row + offset] : 0;
|
223
328
|
cnt -= is_na_or_inf(input_data.numeric_data[row + offset]);
|
224
329
|
}
|
225
|
-
imputer.col_means[col] /= (
|
330
|
+
imputer.col_means[col] /= (ldouble_safe) cnt;
|
226
331
|
}
|
227
332
|
}
|
228
333
|
|
229
334
|
else if (input_data.Xc_indptr != NULL)
|
230
335
|
{
|
231
336
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
|
232
|
-
for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
|
337
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_numeric; col++)
|
233
338
|
{
|
234
339
|
cnt = input_data.nrows;
|
235
|
-
for (
|
340
|
+
for (auto ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
236
341
|
{
|
237
342
|
imputer.col_means[col] += (!is_na_or_inf(input_data.Xc[ix]))?
|
238
343
|
input_data.Xc[ix] : 0;
|
239
344
|
cnt -= is_na_or_inf(input_data.Xc[ix]);
|
240
345
|
}
|
241
|
-
imputer.col_means[col] /= (
|
346
|
+
imputer.col_means[col] /= (ldouble_safe) cnt;
|
242
347
|
}
|
243
348
|
}
|
244
349
|
|
@@ -246,7 +351,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
246
351
|
{
|
247
352
|
std::vector<size_t> cat_counts(input_data.max_categ);
|
248
353
|
#pragma omp parallel for schedule(static) num_threads(nthreads) firstprivate(cat_counts) private(offset) shared(input_data, imputer)
|
249
|
-
for (size_t_for col = 0; col < input_data.ncols_categ; col++)
|
354
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_categ; col++)
|
250
355
|
{
|
251
356
|
std::fill(cat_counts.begin(), cat_counts.end(), 0);
|
252
357
|
offset = col * input_data.nrows;
|
@@ -264,6 +369,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
264
369
|
|
265
370
|
|
266
371
|
/* https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
|
372
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
267
373
|
void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
268
374
|
InputData &input_data, ModelParams &model_params,
|
269
375
|
std::vector<ImputeNode> &imputer_tree,
|
@@ -274,7 +380,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
274
380
|
if (!has_weights)
|
275
381
|
wsum = (double)(workspace.end - workspace.st + 1);
|
276
382
|
else
|
277
|
-
wsum = calculate_sum_weights(
|
383
|
+
wsum = calculate_sum_weights<ldouble_safe>(
|
384
|
+
workspace.ix_arr, workspace.st, workspace.end, curr_depth,
|
278
385
|
workspace.weights_arr, workspace.weights_map);
|
279
386
|
|
280
387
|
imputer.num_sum.resize(input_data.ncols_numeric, 0);
|
@@ -320,7 +427,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
320
427
|
if (!is_na_or_inf(xnum))
|
321
428
|
{
|
322
429
|
cnt++;
|
323
|
-
imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (
|
430
|
+
imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (ldouble_safe)cnt;
|
324
431
|
}
|
325
432
|
}
|
326
433
|
imputer.num_weight[col] = (double) cnt;
|
@@ -349,7 +456,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
349
456
|
|
350
457
|
else
|
351
458
|
{
|
352
|
-
|
459
|
+
ldouble_safe prod_sum, corr, val, diff;
|
353
460
|
if (input_data.numeric_data != NULL)
|
354
461
|
{
|
355
462
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
@@ -417,7 +524,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
417
524
|
row != ix_arr + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
|
418
525
|
)
|
419
526
|
{
|
420
|
-
if (input_data.Xc_ind[curr_pos] == *row)
|
527
|
+
if (input_data.Xc_ind[curr_pos] == static_cast<typename std::remove_pointer<decltype(input_data.Xc_ind)>::type>(*row))
|
421
528
|
{
|
422
529
|
xnum = input_data.Xc[curr_pos];
|
423
530
|
if (workspace.weights_arr.size())
|
@@ -443,7 +550,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
443
550
|
|
444
551
|
else
|
445
552
|
{
|
446
|
-
if (input_data.Xc_ind[curr_pos] > *row)
|
553
|
+
if (input_data.Xc_ind[curr_pos] > static_cast<typename std::remove_pointer<decltype(input_data.Xc_ind)>::type>(*row))
|
447
554
|
row = std::lower_bound(row + 1, ix_arr + workspace.end + 1, input_data.Xc_ind[curr_pos]);
|
448
555
|
else
|
449
556
|
curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos + 1, input_data.Xc_ind + end_col + 1, *row) - input_data.Xc_ind;
|
@@ -514,7 +621,10 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
514
621
|
{
|
515
622
|
for (int cat = 0; cat < input_data.ncat[col]; cat++)
|
516
623
|
{
|
517
|
-
imputer.cat_sum[col][cat]
|
624
|
+
imputer.cat_sum[col][cat]
|
625
|
+
+=
|
626
|
+
(imputer_tree[curr_tree].cat_sum[col][cat] > 0)?
|
627
|
+
(imputer_tree[curr_tree].cat_sum[col][cat] / imputer.cat_weight[col]) : 0.;
|
518
628
|
imputer.cat_weight[col] = wsum / (double)(2 * look_aboves);
|
519
629
|
}
|
520
630
|
break;
|
@@ -544,7 +654,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
544
654
|
{
|
545
655
|
case Inverse:
|
546
656
|
{
|
547
|
-
double wsum_div = wsum * sqrt(wsum);
|
657
|
+
double wsum_div = wsum * std::sqrt(wsum);
|
548
658
|
for (double &w : imputer.num_weight)
|
549
659
|
w /= wsum_div;
|
550
660
|
|
@@ -562,6 +672,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
562
672
|
break;
|
563
673
|
}
|
564
674
|
|
675
|
+
default: {}
|
676
|
+
|
565
677
|
/* TODO: maybe divide by nrows for prop */
|
566
678
|
}
|
567
679
|
|
@@ -585,6 +697,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
585
697
|
w *= curr_depth_dbl;
|
586
698
|
break;
|
587
699
|
}
|
700
|
+
|
701
|
+
default: {}
|
588
702
|
}
|
589
703
|
|
590
704
|
/* now re-adjust sums */
|
@@ -621,7 +735,7 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
621
735
|
{
|
622
736
|
for (size_t tr = 0; tr < trees->size(); tr++)
|
623
737
|
{
|
624
|
-
if ((*trees)[tr].
|
738
|
+
if ((*trees)[tr].tree_left != 0)
|
625
739
|
{
|
626
740
|
shrink_impute_node(imputer_tree[tr]);
|
627
741
|
}
|
@@ -639,7 +753,7 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
639
753
|
{
|
640
754
|
for (size_t tr = 0; tr < hplanes->size(); tr++)
|
641
755
|
{
|
642
|
-
if ((*hplanes)[tr].
|
756
|
+
if ((*hplanes)[tr].hplane_left != 0)
|
643
757
|
{
|
644
758
|
shrink_impute_node(imputer_tree[tr]);
|
645
759
|
}
|
@@ -656,7 +770,8 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
656
770
|
imputer_tree.shrink_to_fit();
|
657
771
|
}
|
658
772
|
|
659
|
-
|
773
|
+
template <class ImputedData>
|
774
|
+
void combine_imp_single(ImputedData &restrict imp_addfrom, ImputedData &restrict imp_addto)
|
660
775
|
{
|
661
776
|
size_t col;
|
662
777
|
for (size_t ix = 0; ix < imp_addfrom.n_missing_num; ix++)
|
@@ -681,16 +796,17 @@ void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto)
|
|
681
796
|
}
|
682
797
|
}
|
683
798
|
|
799
|
+
template <class ImputedData, class WorkerMemory>
|
684
800
|
void combine_tree_imputations(WorkerMemory &workspace,
|
685
801
|
std::vector<ImputedData> &impute_vec,
|
686
|
-
|
802
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
687
803
|
std::vector<char> &has_missing,
|
688
804
|
int nthreads)
|
689
805
|
{
|
690
806
|
if (workspace.impute_vec.size())
|
691
807
|
{
|
692
808
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_vec)
|
693
|
-
for (size_t_for row = 0; row < has_missing.size(); row++)
|
809
|
+
for (size_t_for row = 0; row < (decltype(row))has_missing.size(); row++)
|
694
810
|
if (has_missing[row])
|
695
811
|
combine_imp_single(workspace.impute_vec[row], impute_vec[row]);
|
696
812
|
}
|
@@ -698,13 +814,14 @@ void combine_tree_imputations(WorkerMemory &workspace,
|
|
698
814
|
else if (workspace.impute_map.size())
|
699
815
|
{
|
700
816
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_map)
|
701
|
-
for (size_t_for row = 0; row < has_missing.size(); row++)
|
817
|
+
for (size_t_for row = 0; row < (decltype(row))has_missing.size(); row++)
|
702
818
|
if (has_missing[row])
|
703
819
|
combine_imp_single(workspace.impute_map[row], impute_map[row]);
|
704
820
|
}
|
705
821
|
}
|
706
822
|
|
707
823
|
|
824
|
+
template <class ImputedData>
|
708
825
|
void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w)
|
709
826
|
{
|
710
827
|
size_t col;
|
@@ -731,6 +848,7 @@ void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double
|
|
731
848
|
}
|
732
849
|
|
733
850
|
|
851
|
+
template <class InputData, class WorkerMemory>
|
734
852
|
void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data)
|
735
853
|
{
|
736
854
|
if (workspace.impute_vec.size())
|
@@ -794,7 +912,7 @@ void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputDat
|
|
794
912
|
}
|
795
913
|
}
|
796
914
|
|
797
|
-
template <class imp_arr>
|
915
|
+
template <class imp_arr, class InputData>
|
798
916
|
void apply_imputation_results(imp_arr &impute_vec,
|
799
917
|
Imputer &imputer,
|
800
918
|
InputData &input_data,
|
@@ -809,7 +927,7 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
809
927
|
|
810
928
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
811
929
|
{
|
812
|
-
for (
|
930
|
+
for (auto ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
813
931
|
{
|
814
932
|
if (is_na_or_inf(input_data.Xc[ix]))
|
815
933
|
{
|
@@ -832,7 +950,7 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
832
950
|
}
|
833
951
|
|
834
952
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(input_data, impute_vec, imputer) private(col)
|
835
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
953
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
836
954
|
{
|
837
955
|
if (input_data.has_missing[row])
|
838
956
|
{
|
@@ -867,8 +985,9 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
867
985
|
}
|
868
986
|
}
|
869
987
|
|
988
|
+
template <class ImputedData, class InputData>
|
870
989
|
void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
871
|
-
|
990
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
872
991
|
Imputer &imputer,
|
873
992
|
InputData &input_data,
|
874
993
|
int nthreads)
|
@@ -880,6 +999,7 @@ void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
|
880
999
|
}
|
881
1000
|
|
882
1001
|
|
1002
|
+
template <class PredictionData, class ImputedData>
|
883
1003
|
void apply_imputation_results(PredictionData &prediction_data,
|
884
1004
|
ImputedData &imp,
|
885
1005
|
Imputer &imputer,
|
@@ -887,21 +1007,40 @@ void apply_imputation_results(PredictionData &prediction_data,
|
|
887
1007
|
{
|
888
1008
|
size_t col;
|
889
1009
|
size_t pos = 0;
|
890
|
-
|
1010
|
+
if (prediction_data.is_col_major)
|
891
1011
|
{
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
1012
|
+
for (size_t ix = 0; ix < imp.n_missing_num; ix++)
|
1013
|
+
{
|
1014
|
+
col = imp.missing_num[ix];
|
1015
|
+
if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
|
1016
|
+
prediction_data.numeric_data[row + col * prediction_data.nrows]
|
1017
|
+
=
|
1018
|
+
imp.num_sum[ix] / imp.num_weight[ix];
|
1019
|
+
else
|
1020
|
+
prediction_data.numeric_data[row + col * prediction_data.nrows]
|
1021
|
+
=
|
1022
|
+
imputer.col_means[col];
|
1023
|
+
}
|
1024
|
+
}
|
1025
|
+
|
1026
|
+
else
|
1027
|
+
{
|
1028
|
+
for (size_t ix = 0; ix < imp.n_missing_num; ix++)
|
1029
|
+
{
|
1030
|
+
col = imp.missing_num[ix];
|
1031
|
+
if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
|
1032
|
+
prediction_data.numeric_data[col + row * imputer.ncols_numeric]
|
1033
|
+
=
|
1034
|
+
imp.num_sum[ix] / imp.num_weight[ix];
|
1035
|
+
else
|
1036
|
+
prediction_data.numeric_data[col + row * imputer.ncols_numeric]
|
1037
|
+
=
|
1038
|
+
imputer.col_means[col];
|
1039
|
+
}
|
901
1040
|
}
|
902
1041
|
|
903
1042
|
if (prediction_data.Xr != NULL)
|
904
|
-
for (
|
1043
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
905
1044
|
{
|
906
1045
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
907
1046
|
{
|
@@ -917,22 +1056,43 @@ void apply_imputation_results(PredictionData &prediction_data,
|
|
917
1056
|
}
|
918
1057
|
}
|
919
1058
|
|
920
|
-
|
1059
|
+
if (prediction_data.is_col_major)
|
921
1060
|
{
|
922
|
-
|
923
|
-
|
1061
|
+
for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
|
1062
|
+
{
|
1063
|
+
col = imp.missing_cat[ix];
|
1064
|
+
prediction_data.categ_data[row + col * prediction_data.nrows]
|
1065
|
+
=
|
1066
|
+
std::distance(imp.cat_sum[col].begin(),
|
1067
|
+
std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
|
1068
|
+
|
1069
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0 && imp.cat_sum[col][0] <= 0)
|
1070
|
+
prediction_data.categ_data[row + col * prediction_data.nrows]
|
924
1071
|
=
|
925
|
-
|
926
|
-
|
1072
|
+
imputer.col_modes[col];
|
1073
|
+
}
|
1074
|
+
}
|
927
1075
|
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
1076
|
+
else
|
1077
|
+
{
|
1078
|
+
for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
|
1079
|
+
{
|
1080
|
+
col = imp.missing_cat[ix];
|
1081
|
+
prediction_data.categ_data[col + row * imputer.ncols_categ]
|
1082
|
+
=
|
1083
|
+
std::distance(imp.cat_sum[col].begin(),
|
1084
|
+
std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
|
1085
|
+
|
1086
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] == 0 && imp.cat_sum[col][0] <= 0)
|
1087
|
+
prediction_data.categ_data[col + row * imputer.ncols_categ]
|
1088
|
+
=
|
1089
|
+
imputer.col_modes[col];
|
1090
|
+
}
|
932
1091
|
}
|
933
1092
|
}
|
934
1093
|
|
935
1094
|
|
1095
|
+
template <class ImputedData, class InputData>
|
936
1096
|
void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
937
1097
|
{
|
938
1098
|
imp.n_missing_num = 0;
|
@@ -953,15 +1113,15 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
|
953
1113
|
else if (input_data.Xc_indptr != NULL)
|
954
1114
|
{
|
955
1115
|
imp.missing_sp.resize(input_data.ncols_numeric);
|
956
|
-
|
1116
|
+
decltype(input_data.Xc_indptr) res;
|
957
1117
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
958
1118
|
{
|
959
1119
|
res = std::lower_bound(input_data.Xc_ind + input_data.Xc_indptr[col],
|
960
1120
|
input_data.Xc_ind + input_data.Xc_indptr[col + 1],
|
961
|
-
|
1121
|
+
row);
|
962
1122
|
if (
|
963
1123
|
res != input_data.Xc_ind + input_data.Xc_indptr[col + 1] &&
|
964
|
-
*res == row &&
|
1124
|
+
*res == static_cast<typename std::remove_pointer<decltype(res)>::type>(row) &&
|
965
1125
|
is_na_or_inf(input_data.Xc[res - input_data.Xc_ind])
|
966
1126
|
)
|
967
1127
|
{
|
@@ -986,6 +1146,7 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
|
986
1146
|
}
|
987
1147
|
}
|
988
1148
|
|
1149
|
+
template <class ImputedData, class PredictionData>
|
989
1150
|
void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row)
|
990
1151
|
{
|
991
1152
|
imp.n_missing_num = 0;
|
@@ -996,9 +1157,20 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
996
1157
|
{
|
997
1158
|
if (!imp.missing_num.size())
|
998
1159
|
imp.missing_num.resize(imputer.ncols_numeric);
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1160
|
+
|
1161
|
+
if (prediction_data.is_col_major)
|
1162
|
+
{
|
1163
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
1164
|
+
if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
|
1165
|
+
imp.missing_num[imp.n_missing_num++] = col;
|
1166
|
+
}
|
1167
|
+
|
1168
|
+
else
|
1169
|
+
{
|
1170
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
1171
|
+
if (is_na_or_inf(prediction_data.numeric_data[col + row * imputer.ncols_numeric]))
|
1172
|
+
imp.missing_num[imp.n_missing_num++] = col;
|
1173
|
+
}
|
1002
1174
|
|
1003
1175
|
if (!imp.num_sum.size())
|
1004
1176
|
{
|
@@ -1017,7 +1189,7 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
1017
1189
|
{
|
1018
1190
|
if (!imp.missing_sp.size())
|
1019
1191
|
imp.missing_sp.resize(imputer.ncols_numeric);
|
1020
|
-
for (
|
1192
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
1021
1193
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
1022
1194
|
imp.missing_sp[imp.n_missing_sp++] = prediction_data.Xr_ind[ix];
|
1023
1195
|
|
@@ -1038,10 +1210,23 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
1038
1210
|
{
|
1039
1211
|
if (!imp.missing_cat.size())
|
1040
1212
|
imp.missing_cat.resize(imputer.ncols_categ);
|
1041
|
-
|
1213
|
+
|
1214
|
+
if (prediction_data.is_col_major)
|
1042
1215
|
{
|
1043
|
-
|
1044
|
-
|
1216
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
1217
|
+
{
|
1218
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
|
1219
|
+
imp.missing_cat[imp.n_missing_cat++] = col;
|
1220
|
+
}
|
1221
|
+
}
|
1222
|
+
|
1223
|
+
else
|
1224
|
+
{
|
1225
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
1226
|
+
{
|
1227
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] < 0)
|
1228
|
+
imp.missing_cat[imp.n_missing_cat++] = col;
|
1229
|
+
}
|
1045
1230
|
}
|
1046
1231
|
|
1047
1232
|
if (!imp.cat_weight.size())
|
@@ -1063,31 +1248,35 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
1063
1248
|
}
|
1064
1249
|
}
|
1065
1250
|
|
1066
|
-
ImputedData
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1251
|
+
// template class ImputedData <class InputData>
|
1252
|
+
// ImputedData::ImputedData(InputData &input_data, size_t row)
|
1253
|
+
// {
|
1254
|
+
// initialize_impute_calc(*this, input_data, row);
|
1255
|
+
// }
|
1070
1256
|
|
1257
|
+
template <class ImputedData, class InputData>
|
1071
1258
|
void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads)
|
1072
1259
|
{
|
1073
1260
|
impute_vec.resize(input_data.nrows);
|
1074
1261
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(impute_vec, input_data)
|
1075
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
1262
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
1076
1263
|
if (input_data.has_missing[row])
|
1077
1264
|
initialize_impute_calc(impute_vec[row], input_data, row);
|
1078
1265
|
}
|
1079
1266
|
|
1080
1267
|
|
1081
|
-
|
1268
|
+
template <class ImputedData, class InputData>
|
1269
|
+
void allocate_imp_map(hashed_map<size_t, ImputedData> &impute_map, InputData &input_data)
|
1082
1270
|
{
|
1083
1271
|
for (size_t row = 0; row < input_data.nrows; row++)
|
1084
1272
|
if (input_data.has_missing[row])
|
1085
1273
|
impute_map[row] = ImputedData(input_data, row);
|
1086
1274
|
}
|
1087
1275
|
|
1276
|
+
template <class ImputedData, class InputData>
|
1088
1277
|
void allocate_imp(InputData &input_data,
|
1089
1278
|
std::vector<ImputedData> &impute_vec,
|
1090
|
-
|
1279
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
1091
1280
|
int nthreads)
|
1092
1281
|
{
|
1093
1282
|
if (input_data.n_missing == 0)
|
@@ -1098,9 +1287,10 @@ void allocate_imp(InputData &input_data,
|
|
1098
1287
|
allocate_imp_vec(impute_vec, input_data, nthreads);
|
1099
1288
|
}
|
1100
1289
|
|
1290
|
+
template <class ImputedData, class InputData>
|
1101
1291
|
void check_for_missing(InputData &input_data,
|
1102
1292
|
std::vector<ImputedData> &impute_vec,
|
1103
|
-
|
1293
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
1104
1294
|
int nthreads)
|
1105
1295
|
{
|
1106
1296
|
input_data.has_missing.assign(input_data.nrows, false);
|
@@ -1109,7 +1299,7 @@ void check_for_missing(InputData &input_data,
|
|
1109
1299
|
{
|
1110
1300
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
1111
1301
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
|
1112
|
-
for (size_t_for ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
1302
|
+
for (size_t_for ix = input_data.Xc_indptr[col]; ix < (decltype(ix))input_data.Xc_indptr[col + 1]; ix++)
|
1113
1303
|
if (is_na_or_inf(input_data.Xc[ix]))
|
1114
1304
|
input_data.has_missing[input_data.Xc_ind[ix]] = true;
|
1115
1305
|
#pragma omp barrier
|
@@ -1118,14 +1308,17 @@ void check_for_missing(InputData &input_data,
|
|
1118
1308
|
if (input_data.numeric_data != NULL || input_data.categ_data != NULL)
|
1119
1309
|
{
|
1120
1310
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data)
|
1121
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
1311
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
1122
1312
|
{
|
1123
|
-
|
1313
|
+
if (input_data.Xc_indptr == NULL)
|
1124
1314
|
{
|
1125
|
-
|
1315
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
1126
1316
|
{
|
1127
|
-
input_data.
|
1128
|
-
|
1317
|
+
if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
|
1318
|
+
{
|
1319
|
+
input_data.has_missing[row] = true;
|
1320
|
+
break;
|
1321
|
+
}
|
1129
1322
|
}
|
1130
1323
|
}
|
1131
1324
|
|
@@ -1145,6 +1338,7 @@ void check_for_missing(InputData &input_data,
|
|
1145
1338
|
allocate_imp(input_data, impute_vec, impute_map, nthreads);
|
1146
1339
|
}
|
1147
1340
|
|
1341
|
+
template <class PredictionData>
|
1148
1342
|
size_t check_for_missing(PredictionData &prediction_data,
|
1149
1343
|
Imputer &imputer,
|
1150
1344
|
size_t ix_arr[],
|
@@ -1153,19 +1347,38 @@ size_t check_for_missing(PredictionData &prediction_data,
|
|
1153
1347
|
std::vector<char> has_missing(prediction_data.nrows, false);
|
1154
1348
|
|
1155
1349
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(has_missing, prediction_data, imputer)
|
1156
|
-
for (size_t_for row = 0; row < prediction_data.nrows; row++)
|
1350
|
+
for (size_t_for row = 0; row < (decltype(row))prediction_data.nrows; row++)
|
1157
1351
|
{
|
1158
1352
|
if (prediction_data.numeric_data != NULL)
|
1159
|
-
|
1353
|
+
{
|
1354
|
+
if (prediction_data.is_col_major)
|
1160
1355
|
{
|
1161
|
-
|
1356
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
1162
1357
|
{
|
1163
|
-
|
1164
|
-
|
1358
|
+
if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
|
1359
|
+
{
|
1360
|
+
has_missing[row] = true;
|
1361
|
+
break;
|
1362
|
+
}
|
1165
1363
|
}
|
1166
1364
|
}
|
1365
|
+
|
1366
|
+
else
|
1367
|
+
{
|
1368
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
1369
|
+
{
|
1370
|
+
if (is_na_or_inf(prediction_data.numeric_data[col + row * imputer.ncols_numeric]))
|
1371
|
+
{
|
1372
|
+
has_missing[row] = true;
|
1373
|
+
break;
|
1374
|
+
}
|
1375
|
+
}
|
1376
|
+
}
|
1377
|
+
}
|
1378
|
+
|
1167
1379
|
else if (prediction_data.Xr != NULL)
|
1168
|
-
|
1380
|
+
{
|
1381
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
1169
1382
|
{
|
1170
1383
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
1171
1384
|
{
|
@@ -1173,16 +1386,34 @@ size_t check_for_missing(PredictionData &prediction_data,
|
|
1173
1386
|
break;
|
1174
1387
|
}
|
1175
1388
|
}
|
1389
|
+
}
|
1176
1390
|
|
1177
1391
|
if (!has_missing[row])
|
1178
|
-
|
1392
|
+
{
|
1393
|
+
if (prediction_data.is_col_major)
|
1179
1394
|
{
|
1180
|
-
|
1395
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
1181
1396
|
{
|
1182
|
-
|
1183
|
-
|
1397
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
|
1398
|
+
{
|
1399
|
+
has_missing[row] = true;
|
1400
|
+
break;
|
1401
|
+
}
|
1402
|
+
}
|
1403
|
+
}
|
1404
|
+
|
1405
|
+
else
|
1406
|
+
{
|
1407
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
1408
|
+
{
|
1409
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] < 0)
|
1410
|
+
{
|
1411
|
+
has_missing[row] = true;
|
1412
|
+
break;
|
1413
|
+
}
|
1184
1414
|
}
|
1185
1415
|
}
|
1416
|
+
}
|
1186
1417
|
}
|
1187
1418
|
|
1188
1419
|
size_t st = 0;
|