isotree 0.2.2 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2116 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +132 -0
- data/vendor/isotree/src/RcppExports.cpp +594 -57
- data/vendor/isotree/src/Rwrapper.cpp +2452 -304
- data/vendor/isotree/src/c_interface.cpp +958 -0
- data/vendor/isotree/src/crit.hpp +4236 -0
- data/vendor/isotree/src/digamma.hpp +184 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +814 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +382 -123
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1815 -394
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +844 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +114 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1639 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4316 -139
- data/vendor/isotree/src/sql.cpp +143 -61
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3786 -0
- data/vendor/isotree/src/xoshiro.hpp +463 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +40 -105
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
@@ -18,11 +18,29 @@
|
|
18
18
|
* [5] https://sourceforge.net/projects/iforest/
|
19
19
|
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
20
20
|
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
21
|
-
* [8] Cortes, David.
|
22
|
-
*
|
21
|
+
* [8] Cortes, David.
|
22
|
+
* "Distance approximation using Isolation Forests."
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
24
|
+
* [9] Cortes, David.
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
28
|
+
* [11] Cortes, David.
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
31
|
+
* [12] Guha, Sudipto, et al.
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
34
|
+
* [13] Cortes, David.
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
23
41
|
*
|
24
42
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
26
44
|
* All rights reserved.
|
27
45
|
* Redistribution and use in source and binary forms, with or without
|
28
46
|
* modification, are permitted provided that the following conditions are met:
|
@@ -52,18 +70,18 @@
|
|
52
70
|
* Parameters
|
53
71
|
* ==========
|
54
72
|
* - numeric_data[nrows * ncols_numeric] (in, out)
|
55
|
-
* Pointer to numeric data in which missing values will be imputed.
|
56
|
-
*
|
57
|
-
*
|
73
|
+
* Pointer to numeric data in which missing values will be imputed. May be ordered by rows
|
74
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
75
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
76
|
+
* (see parameter 'is_col_major').
|
58
77
|
* Pass NULL if there are no dense numeric columns.
|
59
78
|
* Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
60
79
|
* Imputations will overwrite values in this same array.
|
61
|
-
* - ncols_numeric
|
62
|
-
* Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
|
63
80
|
* - categ_data[nrows * ncols_categ]
|
64
|
-
* Pointer to categorical data in which missing values will be imputed.
|
65
|
-
*
|
66
|
-
*
|
81
|
+
* Pointer to categorical data in which missing values will be imputed. May be ordered by rows
|
82
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
83
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
84
|
+
* (see parameter 'is_col_major').
|
67
85
|
* Pass NULL if there are no categorical columns.
|
68
86
|
* Each category should be represented as an integer, and these integers must start at zero and
|
69
87
|
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
@@ -71,6 +89,11 @@
|
|
71
89
|
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
72
90
|
* must be the same as was used in the data to which the model was fit.
|
73
91
|
* Imputations will overwrite values in this same array.
|
92
|
+
* - is_col_major
|
93
|
+
* Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
|
94
|
+
* model was fit. If passing 'false', will assume they are in row-major order. Note that most of
|
95
|
+
* the functions in this library work only with column-major order, but here both are suitable
|
96
|
+
* and row-major is preferred. Both arrays must have the same orientation (row/column major).
|
74
97
|
* - ncols_categ
|
75
98
|
* Number of categorical columns in the data.
|
76
99
|
* - ncat[ncols_categ]
|
@@ -84,6 +107,7 @@
|
|
84
107
|
* Imputations will overwrite values in this same array.
|
85
108
|
* - Xr_ind[nnz]
|
86
109
|
* Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
|
110
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
87
111
|
* Pass NULL if there are no sparse numeric columns in CSR format.
|
88
112
|
* - Xr_indptr[nrows + 1]
|
89
113
|
* Pointer to row index pointers that tell at entry [row] where does row 'row'
|
@@ -91,6 +115,11 @@
|
|
91
115
|
* Pass NULL if there are no sparse numeric columns in CSR format.
|
92
116
|
* - nrows
|
93
117
|
* Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
|
118
|
+
* - use_long_double
|
119
|
+
* Whether to use 'long double' (extended precision) type for the calculations. This makes them
|
120
|
+
* more accurate (provided that the compiler used has wider long doubles than doubles), but
|
121
|
+
* slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
|
122
|
+
* Power8 platforms).
|
94
123
|
* - nthreads
|
95
124
|
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
96
125
|
* allocated, even if the thread does not end up being used. Ignored when not building with
|
@@ -107,13 +136,50 @@
|
|
107
136
|
* Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
|
108
137
|
* as produced from function 'fit_iforest',
|
109
138
|
*/
|
110
|
-
|
111
|
-
|
139
|
+
template <class real_t, class sparse_ix>
|
140
|
+
void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
|
141
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
142
|
+
size_t nrows, bool use_long_double, int nthreads,
|
143
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
144
|
+
Imputer &imputer)
|
145
|
+
{
|
146
|
+
if (use_long_double && !has_long_double()) {
|
147
|
+
use_long_double = false;
|
148
|
+
print_errmsg("Passed 'use_long_double=true', but library was compiled without long double support.\n");
|
149
|
+
}
|
150
|
+
#ifndef NO_LONG_DOUBLE
|
151
|
+
if (likely(!use_long_double))
|
152
|
+
#endif
|
153
|
+
impute_missing_values_internal<real_t, sparse_ix, double>(
|
154
|
+
numeric_data, categ_data, is_col_major,
|
155
|
+
Xr, Xr_ind, Xr_indptr,
|
156
|
+
nrows, nthreads,
|
157
|
+
model_outputs, model_outputs_ext,
|
158
|
+
imputer
|
159
|
+
);
|
160
|
+
#ifndef NO_LONG_DOUBLE
|
161
|
+
else
|
162
|
+
impute_missing_values_internal<real_t, sparse_ix, long double>(
|
163
|
+
numeric_data, categ_data, is_col_major,
|
164
|
+
Xr, Xr_ind, Xr_indptr,
|
165
|
+
nrows, nthreads,
|
166
|
+
model_outputs, model_outputs_ext,
|
167
|
+
imputer
|
168
|
+
);
|
169
|
+
#endif
|
170
|
+
}
|
171
|
+
|
172
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
173
|
+
void impute_missing_values_internal(
|
174
|
+
real_t numeric_data[], int categ_data[], bool is_col_major,
|
175
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
112
176
|
size_t nrows, int nthreads,
|
113
177
|
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
114
178
|
Imputer &imputer)
|
115
179
|
{
|
116
|
-
PredictionData
|
180
|
+
PredictionData<real_t, sparse_ix>
|
181
|
+
prediction_data = {numeric_data, categ_data, nrows,
|
182
|
+
is_col_major, imputer.ncols_numeric, imputer.ncols_categ,
|
117
183
|
NULL, NULL, NULL,
|
118
184
|
Xr, Xr_ind, Xr_indptr};
|
119
185
|
|
@@ -128,34 +194,53 @@ void impute_missing_values(double numeric_data[], int categ_data[],
|
|
128
194
|
if ((size_t)nthreads > end)
|
129
195
|
nthreads = (int)end;
|
130
196
|
#ifdef _OPENMP
|
131
|
-
std::vector<ImputedData
|
197
|
+
std::vector<ImputedData<sparse_ix, ldouble_safe>> imp_memory(nthreads);
|
132
198
|
#else
|
133
|
-
std::vector<ImputedData
|
199
|
+
std::vector<ImputedData<sparse_ix, ldouble_safe>> imp_memory(1);
|
134
200
|
#endif
|
135
201
|
|
202
|
+
bool threw_exception = false;
|
203
|
+
std::exception_ptr ex = NULL;
|
136
204
|
|
137
205
|
if (model_outputs != NULL)
|
138
206
|
{
|
139
207
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
140
|
-
shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer)
|
141
|
-
for (size_t_for row = 0; row < end; row++)
|
208
|
+
shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer, ex, threw_exception)
|
209
|
+
for (size_t_for row = 0; row < (decltype(row))end; row++)
|
142
210
|
{
|
143
|
-
|
144
|
-
|
145
|
-
for (std::vector<IsoTree> &tree : model_outputs->trees)
|
211
|
+
if (threw_exception) continue;
|
212
|
+
try
|
146
213
|
{
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
214
|
+
initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
|
215
|
+
|
216
|
+
for (std::vector<IsoTree> &tree : model_outputs->trees)
|
217
|
+
{
|
218
|
+
traverse_itree(tree,
|
219
|
+
*model_outputs,
|
220
|
+
prediction_data,
|
221
|
+
&imputer.imputer_tree[&tree - &(model_outputs->trees[0])],
|
222
|
+
&imp_memory[omp_get_thread_num()],
|
223
|
+
(double) 1,
|
224
|
+
ix_arr[row],
|
225
|
+
(sparse_ix*)NULL,
|
226
|
+
(double*)NULL,
|
227
|
+
(size_t) 0);
|
228
|
+
}
|
229
|
+
|
230
|
+
apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
|
156
231
|
}
|
157
232
|
|
158
|
-
|
233
|
+
catch(...)
|
234
|
+
{
|
235
|
+
#pragma omp critical
|
236
|
+
{
|
237
|
+
if (!threw_exception)
|
238
|
+
{
|
239
|
+
threw_exception = true;
|
240
|
+
ex = std::current_exception();
|
241
|
+
}
|
242
|
+
}
|
243
|
+
}
|
159
244
|
|
160
245
|
}
|
161
246
|
}
|
@@ -164,31 +249,51 @@ void impute_missing_values(double numeric_data[], int categ_data[],
|
|
164
249
|
{
|
165
250
|
double temp;
|
166
251
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
167
|
-
shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer) \
|
252
|
+
shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer, ex, threw_exception) \
|
168
253
|
private(temp)
|
169
|
-
for (size_t_for row = 0; row < end; row++)
|
254
|
+
for (size_t_for row = 0; row < (decltype(row))end; row++)
|
170
255
|
{
|
171
|
-
|
172
|
-
|
173
|
-
for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
|
256
|
+
if (threw_exception) continue;
|
257
|
+
try
|
174
258
|
{
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
259
|
+
initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
|
260
|
+
|
261
|
+
for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
|
262
|
+
{
|
263
|
+
traverse_hplane(hplane,
|
264
|
+
*model_outputs_ext,
|
265
|
+
prediction_data,
|
266
|
+
temp,
|
267
|
+
&imputer.imputer_tree[&hplane - &(model_outputs_ext->hplanes[0])],
|
268
|
+
&imp_memory[omp_get_thread_num()],
|
269
|
+
(sparse_ix*)NULL,
|
270
|
+
(double*)NULL,
|
271
|
+
ix_arr[row]);
|
272
|
+
}
|
273
|
+
|
274
|
+
apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
|
183
275
|
}
|
184
276
|
|
185
|
-
|
277
|
+
catch (...)
|
278
|
+
{
|
279
|
+
#pragma omp critical
|
280
|
+
{
|
281
|
+
if (!threw_exception)
|
282
|
+
{
|
283
|
+
threw_exception = true;
|
284
|
+
ex = std::current_exception();
|
285
|
+
}
|
286
|
+
}
|
287
|
+
}
|
186
288
|
|
187
289
|
}
|
188
290
|
}
|
189
291
|
|
292
|
+
if (threw_exception)
|
293
|
+
std::rethrow_exception(ex);
|
190
294
|
}
|
191
295
|
|
296
|
+
template <class InputData, class ldouble_safe>
|
192
297
|
void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads)
|
193
298
|
{
|
194
299
|
imputer.ncols_numeric = input_data.ncols_numeric;
|
@@ -208,11 +313,12 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
208
313
|
imputer.col_modes.resize(input_data.ncols_categ);
|
209
314
|
imputer.imputer_tree = std::vector<std::vector<ImputeNode>>(ntrees);
|
210
315
|
|
316
|
+
/* TODO: here should use sample weights if specified as density */
|
211
317
|
size_t offset, cnt;
|
212
318
|
if (input_data.numeric_data != NULL)
|
213
319
|
{
|
214
320
|
#pragma omp parallel for schedule(static) num_threads(nthreads) private(cnt, offset) shared(input_data, imputer)
|
215
|
-
for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
|
321
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_numeric; col++)
|
216
322
|
{
|
217
323
|
cnt = input_data.nrows;
|
218
324
|
offset = col * input_data.nrows;
|
@@ -222,23 +328,25 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
222
328
|
input_data.numeric_data[row + offset] : 0;
|
223
329
|
cnt -= is_na_or_inf(input_data.numeric_data[row + offset]);
|
224
330
|
}
|
225
|
-
imputer.col_means[col] /= (
|
331
|
+
imputer.col_means[col] /= (ldouble_safe) cnt;
|
332
|
+
if (!cnt) imputer.col_means[col] = NAN;
|
226
333
|
}
|
227
334
|
}
|
228
335
|
|
229
336
|
else if (input_data.Xc_indptr != NULL)
|
230
337
|
{
|
231
338
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
|
232
|
-
for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
|
339
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_numeric; col++)
|
233
340
|
{
|
234
341
|
cnt = input_data.nrows;
|
235
|
-
for (
|
342
|
+
for (auto ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
236
343
|
{
|
237
344
|
imputer.col_means[col] += (!is_na_or_inf(input_data.Xc[ix]))?
|
238
345
|
input_data.Xc[ix] : 0;
|
239
346
|
cnt -= is_na_or_inf(input_data.Xc[ix]);
|
240
347
|
}
|
241
|
-
imputer.col_means[col] /= (
|
348
|
+
imputer.col_means[col] /= (ldouble_safe) cnt;
|
349
|
+
if (!cnt) imputer.col_means[col] = NAN;
|
242
350
|
}
|
243
351
|
}
|
244
352
|
|
@@ -246,7 +354,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
246
354
|
{
|
247
355
|
std::vector<size_t> cat_counts(input_data.max_categ);
|
248
356
|
#pragma omp parallel for schedule(static) num_threads(nthreads) firstprivate(cat_counts) private(offset) shared(input_data, imputer)
|
249
|
-
for (size_t_for col = 0; col < input_data.ncols_categ; col++)
|
357
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_categ; col++)
|
250
358
|
{
|
251
359
|
std::fill(cat_counts.begin(), cat_counts.end(), 0);
|
252
360
|
offset = col * input_data.nrows;
|
@@ -264,6 +372,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
264
372
|
|
265
373
|
|
266
374
|
/* https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
|
375
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
267
376
|
void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
268
377
|
InputData &input_data, ModelParams &model_params,
|
269
378
|
std::vector<ImputeNode> &imputer_tree,
|
@@ -274,7 +383,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
274
383
|
if (!has_weights)
|
275
384
|
wsum = (double)(workspace.end - workspace.st + 1);
|
276
385
|
else
|
277
|
-
wsum = calculate_sum_weights(
|
386
|
+
wsum = calculate_sum_weights<ldouble_safe>(
|
387
|
+
workspace.ix_arr, workspace.st, workspace.end, curr_depth,
|
278
388
|
workspace.weights_arr, workspace.weights_map);
|
279
389
|
|
280
390
|
imputer.num_sum.resize(input_data.ncols_numeric, 0);
|
@@ -320,7 +430,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
320
430
|
if (!is_na_or_inf(xnum))
|
321
431
|
{
|
322
432
|
cnt++;
|
323
|
-
imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (
|
433
|
+
imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (ldouble_safe)cnt;
|
324
434
|
}
|
325
435
|
}
|
326
436
|
imputer.num_weight[col] = (double) cnt;
|
@@ -349,7 +459,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
349
459
|
|
350
460
|
else
|
351
461
|
{
|
352
|
-
|
462
|
+
ldouble_safe prod_sum, corr, val, diff;
|
353
463
|
if (input_data.numeric_data != NULL)
|
354
464
|
{
|
355
465
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
@@ -417,7 +527,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
417
527
|
row != ix_arr + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
|
418
528
|
)
|
419
529
|
{
|
420
|
-
if (input_data.Xc_ind[curr_pos] == *row)
|
530
|
+
if (input_data.Xc_ind[curr_pos] == static_cast<typename std::remove_pointer<decltype(input_data.Xc_ind)>::type>(*row))
|
421
531
|
{
|
422
532
|
xnum = input_data.Xc[curr_pos];
|
423
533
|
if (workspace.weights_arr.size())
|
@@ -443,7 +553,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
443
553
|
|
444
554
|
else
|
445
555
|
{
|
446
|
-
if (input_data.Xc_ind[curr_pos] > *row)
|
556
|
+
if (input_data.Xc_ind[curr_pos] > static_cast<typename std::remove_pointer<decltype(input_data.Xc_ind)>::type>(*row))
|
447
557
|
row = std::lower_bound(row + 1, ix_arr + workspace.end + 1, input_data.Xc_ind[curr_pos]);
|
448
558
|
else
|
449
559
|
curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos + 1, input_data.Xc_ind + end_col + 1, *row) - input_data.Xc_ind;
|
@@ -514,7 +624,10 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
514
624
|
{
|
515
625
|
for (int cat = 0; cat < input_data.ncat[col]; cat++)
|
516
626
|
{
|
517
|
-
imputer.cat_sum[col][cat]
|
627
|
+
imputer.cat_sum[col][cat]
|
628
|
+
+=
|
629
|
+
(imputer_tree[curr_tree].cat_sum[col][cat] > 0)?
|
630
|
+
(imputer_tree[curr_tree].cat_sum[col][cat] / imputer_tree[curr_tree].cat_weight[col]) : 0.;
|
518
631
|
imputer.cat_weight[col] = wsum / (double)(2 * look_aboves);
|
519
632
|
}
|
520
633
|
break;
|
@@ -544,7 +657,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
544
657
|
{
|
545
658
|
case Inverse:
|
546
659
|
{
|
547
|
-
double wsum_div = wsum * sqrt(wsum);
|
660
|
+
double wsum_div = wsum * std::sqrt(wsum);
|
548
661
|
for (double &w : imputer.num_weight)
|
549
662
|
w /= wsum_div;
|
550
663
|
|
@@ -562,6 +675,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
562
675
|
break;
|
563
676
|
}
|
564
677
|
|
678
|
+
default: {}
|
679
|
+
|
565
680
|
/* TODO: maybe divide by nrows for prop */
|
566
681
|
}
|
567
682
|
|
@@ -585,6 +700,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
585
700
|
w *= curr_depth_dbl;
|
586
701
|
break;
|
587
702
|
}
|
703
|
+
|
704
|
+
default: {}
|
588
705
|
}
|
589
706
|
|
590
707
|
/* now re-adjust sums */
|
@@ -621,7 +738,7 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
621
738
|
{
|
622
739
|
for (size_t tr = 0; tr < trees->size(); tr++)
|
623
740
|
{
|
624
|
-
if ((*trees)[tr].
|
741
|
+
if ((*trees)[tr].tree_left != 0)
|
625
742
|
{
|
626
743
|
shrink_impute_node(imputer_tree[tr]);
|
627
744
|
}
|
@@ -639,7 +756,7 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
639
756
|
{
|
640
757
|
for (size_t tr = 0; tr < hplanes->size(); tr++)
|
641
758
|
{
|
642
|
-
if ((*hplanes)[tr].
|
759
|
+
if ((*hplanes)[tr].hplane_left != 0)
|
643
760
|
{
|
644
761
|
shrink_impute_node(imputer_tree[tr]);
|
645
762
|
}
|
@@ -656,7 +773,8 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
656
773
|
imputer_tree.shrink_to_fit();
|
657
774
|
}
|
658
775
|
|
659
|
-
|
776
|
+
template <class ImputedData>
|
777
|
+
void combine_imp_single(ImputedData &restrict imp_addfrom, ImputedData &restrict imp_addto)
|
660
778
|
{
|
661
779
|
size_t col;
|
662
780
|
for (size_t ix = 0; ix < imp_addfrom.n_missing_num; ix++)
|
@@ -681,16 +799,17 @@ void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto)
|
|
681
799
|
}
|
682
800
|
}
|
683
801
|
|
802
|
+
template <class ImputedData, class WorkerMemory>
|
684
803
|
void combine_tree_imputations(WorkerMemory &workspace,
|
685
804
|
std::vector<ImputedData> &impute_vec,
|
686
|
-
|
805
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
687
806
|
std::vector<char> &has_missing,
|
688
807
|
int nthreads)
|
689
808
|
{
|
690
809
|
if (workspace.impute_vec.size())
|
691
810
|
{
|
692
811
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_vec)
|
693
|
-
for (size_t_for row = 0; row < has_missing.size(); row++)
|
812
|
+
for (size_t_for row = 0; row < (decltype(row))has_missing.size(); row++)
|
694
813
|
if (has_missing[row])
|
695
814
|
combine_imp_single(workspace.impute_vec[row], impute_vec[row]);
|
696
815
|
}
|
@@ -698,13 +817,14 @@ void combine_tree_imputations(WorkerMemory &workspace,
|
|
698
817
|
else if (workspace.impute_map.size())
|
699
818
|
{
|
700
819
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_map)
|
701
|
-
for (size_t_for row = 0; row < has_missing.size(); row++)
|
820
|
+
for (size_t_for row = 0; row < (decltype(row))has_missing.size(); row++)
|
702
821
|
if (has_missing[row])
|
703
822
|
combine_imp_single(workspace.impute_map[row], impute_map[row]);
|
704
823
|
}
|
705
824
|
}
|
706
825
|
|
707
826
|
|
827
|
+
template <class ImputedData>
|
708
828
|
void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w)
|
709
829
|
{
|
710
830
|
size_t col;
|
@@ -731,6 +851,7 @@ void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double
|
|
731
851
|
}
|
732
852
|
|
733
853
|
|
854
|
+
template <class InputData, class WorkerMemory>
|
734
855
|
void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data)
|
735
856
|
{
|
736
857
|
if (workspace.impute_vec.size())
|
@@ -794,7 +915,7 @@ void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputDat
|
|
794
915
|
}
|
795
916
|
}
|
796
917
|
|
797
|
-
template <class imp_arr>
|
918
|
+
template <class imp_arr, class InputData>
|
798
919
|
void apply_imputation_results(imp_arr &impute_vec,
|
799
920
|
Imputer &imputer,
|
800
921
|
InputData &input_data,
|
@@ -809,7 +930,7 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
809
930
|
|
810
931
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
811
932
|
{
|
812
|
-
for (
|
933
|
+
for (auto ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
813
934
|
{
|
814
935
|
if (is_na_or_inf(input_data.Xc[ix]))
|
815
936
|
{
|
@@ -832,7 +953,7 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
832
953
|
}
|
833
954
|
|
834
955
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(input_data, impute_vec, imputer) private(col)
|
835
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
956
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
836
957
|
{
|
837
958
|
if (input_data.has_missing[row])
|
838
959
|
{
|
@@ -867,8 +988,9 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
867
988
|
}
|
868
989
|
}
|
869
990
|
|
991
|
+
template <class ImputedData, class InputData>
|
870
992
|
void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
871
|
-
|
993
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
872
994
|
Imputer &imputer,
|
873
995
|
InputData &input_data,
|
874
996
|
int nthreads)
|
@@ -880,6 +1002,12 @@ void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
|
880
1002
|
}
|
881
1003
|
|
882
1004
|
|
1005
|
+
/* TODO: investigate why in the case of all-missing numeric columns the node weights still
|
1006
|
+
get filled when using extended model, then remove the workaround that was added here that
|
1007
|
+
checks if the sum is zero and column is all-nan. Should also modify the earlier code to
|
1008
|
+
remove these cases from the imputation tracking list when doing the imputations on-the-fly
|
1009
|
+
as the model is being fit. */
|
1010
|
+
template <class PredictionData, class ImputedData>
|
883
1011
|
void apply_imputation_results(PredictionData &prediction_data,
|
884
1012
|
ImputedData &imp,
|
885
1013
|
Imputer &imputer,
|
@@ -887,21 +1015,40 @@ void apply_imputation_results(PredictionData &prediction_data,
|
|
887
1015
|
{
|
888
1016
|
size_t col;
|
889
1017
|
size_t pos = 0;
|
890
|
-
|
1018
|
+
if (prediction_data.is_col_major)
|
891
1019
|
{
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
1020
|
+
for (size_t ix = 0; ix < imp.n_missing_num; ix++)
|
1021
|
+
{
|
1022
|
+
col = imp.missing_num[ix];
|
1023
|
+
if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]) && !(imp.num_sum[ix] == 0 && std::isnan(imputer.col_means[col])))
|
1024
|
+
prediction_data.numeric_data[row + col * prediction_data.nrows]
|
1025
|
+
=
|
1026
|
+
imp.num_sum[ix] / imp.num_weight[ix];
|
1027
|
+
else
|
1028
|
+
prediction_data.numeric_data[row + col * prediction_data.nrows]
|
1029
|
+
=
|
1030
|
+
imputer.col_means[col];
|
1031
|
+
}
|
1032
|
+
}
|
1033
|
+
|
1034
|
+
else
|
1035
|
+
{
|
1036
|
+
for (size_t ix = 0; ix < imp.n_missing_num; ix++)
|
1037
|
+
{
|
1038
|
+
col = imp.missing_num[ix];
|
1039
|
+
if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]) && !(imp.num_sum[ix] == 0 && std::isnan(imputer.col_means[col])))
|
1040
|
+
prediction_data.numeric_data[col + row * imputer.ncols_numeric]
|
1041
|
+
=
|
1042
|
+
imp.num_sum[ix] / imp.num_weight[ix];
|
1043
|
+
else
|
1044
|
+
prediction_data.numeric_data[col + row * imputer.ncols_numeric]
|
1045
|
+
=
|
1046
|
+
imputer.col_means[col];
|
1047
|
+
}
|
901
1048
|
}
|
902
1049
|
|
903
1050
|
if (prediction_data.Xr != NULL)
|
904
|
-
for (
|
1051
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
905
1052
|
{
|
906
1053
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
907
1054
|
{
|
@@ -917,22 +1064,63 @@ void apply_imputation_results(PredictionData &prediction_data,
|
|
917
1064
|
}
|
918
1065
|
}
|
919
1066
|
|
920
|
-
|
1067
|
+
if (prediction_data.is_col_major)
|
921
1068
|
{
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
std::distance(imp.cat_sum[col].begin(),
|
926
|
-
std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
|
927
|
-
|
928
|
-
if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0 && imp.cat_sum[col][0] <= 0)
|
1069
|
+
for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
|
1070
|
+
{
|
1071
|
+
col = imp.missing_cat[ix];
|
929
1072
|
prediction_data.categ_data[row + col * prediction_data.nrows]
|
930
|
-
|
931
|
-
|
1073
|
+
=
|
1074
|
+
std::distance(imp.cat_sum[col].begin(),
|
1075
|
+
std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
|
1076
|
+
|
1077
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0)
|
1078
|
+
{
|
1079
|
+
if (imp.cat_sum.empty() || imp.cat_sum[col].empty())
|
1080
|
+
{
|
1081
|
+
prediction_data.categ_data[row + col * prediction_data.nrows] = -1;
|
1082
|
+
}
|
1083
|
+
|
1084
|
+
else if (imp.cat_sum[col][0] <= 0)
|
1085
|
+
{
|
1086
|
+
prediction_data.categ_data[row + col * prediction_data.nrows]
|
1087
|
+
=
|
1088
|
+
imputer.col_modes[col];
|
1089
|
+
}
|
1090
|
+
}
|
1091
|
+
}
|
1092
|
+
}
|
1093
|
+
|
1094
|
+
else
|
1095
|
+
{
|
1096
|
+
for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
|
1097
|
+
{
|
1098
|
+
col = imp.missing_cat[ix];
|
1099
|
+
prediction_data.categ_data[col + row * imputer.ncols_categ]
|
1100
|
+
=
|
1101
|
+
std::distance(imp.cat_sum[col].begin(),
|
1102
|
+
std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
|
1103
|
+
|
1104
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] == 0)
|
1105
|
+
{
|
1106
|
+
if (imp.cat_sum.empty() || imp.cat_sum[col].empty())
|
1107
|
+
{
|
1108
|
+
prediction_data.categ_data[col + row * imputer.ncols_categ] = -1;
|
1109
|
+
}
|
1110
|
+
|
1111
|
+
else if (imp.cat_sum[col][0] <= 0)
|
1112
|
+
{
|
1113
|
+
prediction_data.categ_data[col + row * imputer.ncols_categ]
|
1114
|
+
=
|
1115
|
+
imputer.col_modes[col];
|
1116
|
+
}
|
1117
|
+
}
|
1118
|
+
}
|
932
1119
|
}
|
933
1120
|
}
|
934
1121
|
|
935
1122
|
|
1123
|
+
template <class ImputedData, class InputData>
|
936
1124
|
void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
937
1125
|
{
|
938
1126
|
imp.n_missing_num = 0;
|
@@ -953,15 +1141,15 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
|
953
1141
|
else if (input_data.Xc_indptr != NULL)
|
954
1142
|
{
|
955
1143
|
imp.missing_sp.resize(input_data.ncols_numeric);
|
956
|
-
|
1144
|
+
decltype(input_data.Xc_indptr) res;
|
957
1145
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
958
1146
|
{
|
959
1147
|
res = std::lower_bound(input_data.Xc_ind + input_data.Xc_indptr[col],
|
960
1148
|
input_data.Xc_ind + input_data.Xc_indptr[col + 1],
|
961
|
-
|
1149
|
+
row);
|
962
1150
|
if (
|
963
1151
|
res != input_data.Xc_ind + input_data.Xc_indptr[col + 1] &&
|
964
|
-
*res == row &&
|
1152
|
+
*res == static_cast<typename std::remove_pointer<decltype(res)>::type>(row) &&
|
965
1153
|
is_na_or_inf(input_data.Xc[res - input_data.Xc_ind])
|
966
1154
|
)
|
967
1155
|
{
|
@@ -986,6 +1174,7 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
|
986
1174
|
}
|
987
1175
|
}
|
988
1176
|
|
1177
|
+
template <class ImputedData, class PredictionData>
|
989
1178
|
void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row)
|
990
1179
|
{
|
991
1180
|
imp.n_missing_num = 0;
|
@@ -996,9 +1185,20 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
996
1185
|
{
|
997
1186
|
if (!imp.missing_num.size())
|
998
1187
|
imp.missing_num.resize(imputer.ncols_numeric);
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1188
|
+
|
1189
|
+
if (prediction_data.is_col_major)
|
1190
|
+
{
|
1191
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
1192
|
+
if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
|
1193
|
+
imp.missing_num[imp.n_missing_num++] = col;
|
1194
|
+
}
|
1195
|
+
|
1196
|
+
else
|
1197
|
+
{
|
1198
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
1199
|
+
if (is_na_or_inf(prediction_data.numeric_data[col + row * imputer.ncols_numeric]))
|
1200
|
+
imp.missing_num[imp.n_missing_num++] = col;
|
1201
|
+
}
|
1002
1202
|
|
1003
1203
|
if (!imp.num_sum.size())
|
1004
1204
|
{
|
@@ -1017,7 +1217,7 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
1017
1217
|
{
|
1018
1218
|
if (!imp.missing_sp.size())
|
1019
1219
|
imp.missing_sp.resize(imputer.ncols_numeric);
|
1020
|
-
for (
|
1220
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
1021
1221
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
1022
1222
|
imp.missing_sp[imp.n_missing_sp++] = prediction_data.Xr_ind[ix];
|
1023
1223
|
|
@@ -1038,10 +1238,23 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
1038
1238
|
{
|
1039
1239
|
if (!imp.missing_cat.size())
|
1040
1240
|
imp.missing_cat.resize(imputer.ncols_categ);
|
1041
|
-
|
1241
|
+
|
1242
|
+
if (prediction_data.is_col_major)
|
1042
1243
|
{
|
1043
|
-
|
1044
|
-
|
1244
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
1245
|
+
{
|
1246
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
|
1247
|
+
imp.missing_cat[imp.n_missing_cat++] = col;
|
1248
|
+
}
|
1249
|
+
}
|
1250
|
+
|
1251
|
+
else
|
1252
|
+
{
|
1253
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
1254
|
+
{
|
1255
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] < 0)
|
1256
|
+
imp.missing_cat[imp.n_missing_cat++] = col;
|
1257
|
+
}
|
1045
1258
|
}
|
1046
1259
|
|
1047
1260
|
if (!imp.cat_weight.size())
|
@@ -1063,31 +1276,35 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
1063
1276
|
}
|
1064
1277
|
}
|
1065
1278
|
|
1066
|
-
ImputedData
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1279
|
+
// template class ImputedData <class InputData>
|
1280
|
+
// ImputedData::ImputedData(InputData &input_data, size_t row)
|
1281
|
+
// {
|
1282
|
+
// initialize_impute_calc(*this, input_data, row);
|
1283
|
+
// }
|
1070
1284
|
|
1285
|
+
template <class ImputedData, class InputData>
|
1071
1286
|
void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads)
|
1072
1287
|
{
|
1073
1288
|
impute_vec.resize(input_data.nrows);
|
1074
1289
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(impute_vec, input_data)
|
1075
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
1290
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
1076
1291
|
if (input_data.has_missing[row])
|
1077
1292
|
initialize_impute_calc(impute_vec[row], input_data, row);
|
1078
1293
|
}
|
1079
1294
|
|
1080
1295
|
|
1081
|
-
|
1296
|
+
template <class ImputedData, class InputData>
|
1297
|
+
void allocate_imp_map(hashed_map<size_t, ImputedData> &impute_map, InputData &input_data)
|
1082
1298
|
{
|
1083
1299
|
for (size_t row = 0; row < input_data.nrows; row++)
|
1084
1300
|
if (input_data.has_missing[row])
|
1085
1301
|
impute_map[row] = ImputedData(input_data, row);
|
1086
1302
|
}
|
1087
1303
|
|
1304
|
+
template <class ImputedData, class InputData>
|
1088
1305
|
void allocate_imp(InputData &input_data,
|
1089
1306
|
std::vector<ImputedData> &impute_vec,
|
1090
|
-
|
1307
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
1091
1308
|
int nthreads)
|
1092
1309
|
{
|
1093
1310
|
if (input_data.n_missing == 0)
|
@@ -1098,9 +1315,10 @@ void allocate_imp(InputData &input_data,
|
|
1098
1315
|
allocate_imp_vec(impute_vec, input_data, nthreads);
|
1099
1316
|
}
|
1100
1317
|
|
1318
|
+
template <class ImputedData, class InputData>
|
1101
1319
|
void check_for_missing(InputData &input_data,
|
1102
1320
|
std::vector<ImputedData> &impute_vec,
|
1103
|
-
|
1321
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
1104
1322
|
int nthreads)
|
1105
1323
|
{
|
1106
1324
|
input_data.has_missing.assign(input_data.nrows, false);
|
@@ -1109,7 +1327,7 @@ void check_for_missing(InputData &input_data,
|
|
1109
1327
|
{
|
1110
1328
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
1111
1329
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
|
1112
|
-
for (size_t_for ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
1330
|
+
for (size_t_for ix = input_data.Xc_indptr[col]; ix < (decltype(ix))input_data.Xc_indptr[col + 1]; ix++)
|
1113
1331
|
if (is_na_or_inf(input_data.Xc[ix]))
|
1114
1332
|
input_data.has_missing[input_data.Xc_ind[ix]] = true;
|
1115
1333
|
#pragma omp barrier
|
@@ -1118,14 +1336,17 @@ void check_for_missing(InputData &input_data,
|
|
1118
1336
|
if (input_data.numeric_data != NULL || input_data.categ_data != NULL)
|
1119
1337
|
{
|
1120
1338
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data)
|
1121
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
1339
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
1122
1340
|
{
|
1123
|
-
|
1341
|
+
if (input_data.Xc_indptr == NULL)
|
1124
1342
|
{
|
1125
|
-
|
1343
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
1126
1344
|
{
|
1127
|
-
input_data.
|
1128
|
-
|
1345
|
+
if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
|
1346
|
+
{
|
1347
|
+
input_data.has_missing[row] = true;
|
1348
|
+
break;
|
1349
|
+
}
|
1129
1350
|
}
|
1130
1351
|
}
|
1131
1352
|
|
@@ -1145,6 +1366,7 @@ void check_for_missing(InputData &input_data,
|
|
1145
1366
|
allocate_imp(input_data, impute_vec, impute_map, nthreads);
|
1146
1367
|
}
|
1147
1368
|
|
1369
|
+
template <class PredictionData>
|
1148
1370
|
size_t check_for_missing(PredictionData &prediction_data,
|
1149
1371
|
Imputer &imputer,
|
1150
1372
|
size_t ix_arr[],
|
@@ -1153,19 +1375,38 @@ size_t check_for_missing(PredictionData &prediction_data,
|
|
1153
1375
|
std::vector<char> has_missing(prediction_data.nrows, false);
|
1154
1376
|
|
1155
1377
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(has_missing, prediction_data, imputer)
|
1156
|
-
for (size_t_for row = 0; row < prediction_data.nrows; row++)
|
1378
|
+
for (size_t_for row = 0; row < (decltype(row))prediction_data.nrows; row++)
|
1157
1379
|
{
|
1158
1380
|
if (prediction_data.numeric_data != NULL)
|
1159
|
-
|
1381
|
+
{
|
1382
|
+
if (prediction_data.is_col_major)
|
1160
1383
|
{
|
1161
|
-
|
1384
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
1162
1385
|
{
|
1163
|
-
|
1164
|
-
|
1386
|
+
if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
|
1387
|
+
{
|
1388
|
+
has_missing[row] = true;
|
1389
|
+
break;
|
1390
|
+
}
|
1165
1391
|
}
|
1166
1392
|
}
|
1393
|
+
|
1394
|
+
else
|
1395
|
+
{
|
1396
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
1397
|
+
{
|
1398
|
+
if (is_na_or_inf(prediction_data.numeric_data[col + row * imputer.ncols_numeric]))
|
1399
|
+
{
|
1400
|
+
has_missing[row] = true;
|
1401
|
+
break;
|
1402
|
+
}
|
1403
|
+
}
|
1404
|
+
}
|
1405
|
+
}
|
1406
|
+
|
1167
1407
|
else if (prediction_data.Xr != NULL)
|
1168
|
-
|
1408
|
+
{
|
1409
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
1169
1410
|
{
|
1170
1411
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
1171
1412
|
{
|
@@ -1173,16 +1414,34 @@ size_t check_for_missing(PredictionData &prediction_data,
|
|
1173
1414
|
break;
|
1174
1415
|
}
|
1175
1416
|
}
|
1417
|
+
}
|
1176
1418
|
|
1177
1419
|
if (!has_missing[row])
|
1178
|
-
|
1420
|
+
{
|
1421
|
+
if (prediction_data.is_col_major)
|
1179
1422
|
{
|
1180
|
-
|
1423
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
1181
1424
|
{
|
1182
|
-
|
1183
|
-
|
1425
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
|
1426
|
+
{
|
1427
|
+
has_missing[row] = true;
|
1428
|
+
break;
|
1429
|
+
}
|
1430
|
+
}
|
1431
|
+
}
|
1432
|
+
|
1433
|
+
else
|
1434
|
+
{
|
1435
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
1436
|
+
{
|
1437
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] < 0)
|
1438
|
+
{
|
1439
|
+
has_missing[row] = true;
|
1440
|
+
break;
|
1441
|
+
}
|
1184
1442
|
}
|
1185
1443
|
}
|
1444
|
+
}
|
1186
1445
|
}
|
1187
1446
|
|
1188
1447
|
size_t st = 0;
|