isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
|
@@ -18,11 +18,29 @@
|
|
|
18
18
|
* [5] https://sourceforge.net/projects/iforest/
|
|
19
19
|
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
20
|
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
|
21
|
-
* [8] Cortes, David.
|
|
22
|
-
*
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
23
41
|
*
|
|
24
42
|
* BSD 2-Clause License
|
|
25
|
-
* Copyright (c)
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
26
44
|
* All rights reserved.
|
|
27
45
|
* Redistribution and use in source and binary forms, with or without
|
|
28
46
|
* modification, are permitted provided that the following conditions are met:
|
|
@@ -52,18 +70,18 @@
|
|
|
52
70
|
* Parameters
|
|
53
71
|
* ==========
|
|
54
72
|
* - numeric_data[nrows * ncols_numeric] (in, out)
|
|
55
|
-
* Pointer to numeric data in which missing values will be imputed.
|
|
56
|
-
*
|
|
57
|
-
*
|
|
73
|
+
* Pointer to numeric data in which missing values will be imputed. May be ordered by rows
|
|
74
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
75
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
76
|
+
* (see parameter 'is_col_major').
|
|
58
77
|
* Pass NULL if there are no dense numeric columns.
|
|
59
78
|
* Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
|
|
60
79
|
* Imputations will overwrite values in this same array.
|
|
61
|
-
* - ncols_numeric
|
|
62
|
-
* Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
|
|
63
80
|
* - categ_data[nrows * ncols_categ]
|
|
64
|
-
* Pointer to categorical data in which missing values will be imputed.
|
|
65
|
-
*
|
|
66
|
-
*
|
|
81
|
+
* Pointer to categorical data in which missing values will be imputed. May be ordered by rows
|
|
82
|
+
* (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
|
|
83
|
+
* columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
|
|
84
|
+
* (see parameter 'is_col_major').
|
|
67
85
|
* Pass NULL if there are no categorical columns.
|
|
68
86
|
* Each category should be represented as an integer, and these integers must start at zero and
|
|
69
87
|
* be in consecutive order - i.e. if category '3' is present, category '2' must have also been
|
|
@@ -71,6 +89,11 @@
|
|
|
71
89
|
* an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
|
|
72
90
|
* must be the same as was used in the data to which the model was fit.
|
|
73
91
|
* Imputations will overwrite values in this same array.
|
|
92
|
+
* - is_col_major
|
|
93
|
+
* Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
|
|
94
|
+
* model was fit. If passing 'false', will assume they are in row-major order. Note that most of
|
|
95
|
+
* the functions in this library work only with column-major order, but here both are suitable
|
|
96
|
+
* and row-major is preferred. Both arrays must have the same orientation (row/column major).
|
|
74
97
|
* - ncols_categ
|
|
75
98
|
* Number of categorical columns in the data.
|
|
76
99
|
* - ncat[ncols_categ]
|
|
@@ -84,6 +107,7 @@
|
|
|
84
107
|
* Imputations will overwrite values in this same array.
|
|
85
108
|
* - Xr_ind[nnz]
|
|
86
109
|
* Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
|
|
110
|
+
* Must be in sorted order, otherwise results will be incorrect.
|
|
87
111
|
* Pass NULL if there are no sparse numeric columns in CSR format.
|
|
88
112
|
* - Xr_indptr[nrows + 1]
|
|
89
113
|
* Pointer to row index pointers that tell at entry [row] where does row 'row'
|
|
@@ -91,6 +115,11 @@
|
|
|
91
115
|
* Pass NULL if there are no sparse numeric columns in CSR format.
|
|
92
116
|
* - nrows
|
|
93
117
|
* Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
|
|
118
|
+
* - use_long_double
|
|
119
|
+
* Whether to use 'long double' (extended precision) type for the calculations. This makes them
|
|
120
|
+
* more accurate (provided that the compiler used has wider long doubles than doubles), but
|
|
121
|
+
* slower - especially in platforms in which 'long double' is a software-emulated type (e.g.
|
|
122
|
+
* Power8 platforms).
|
|
94
123
|
* - nthreads
|
|
95
124
|
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
|
96
125
|
* allocated, even if the thread does not end up being used. Ignored when not building with
|
|
@@ -107,13 +136,50 @@
|
|
|
107
136
|
* Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
|
|
108
137
|
* as produced from function 'fit_iforest'.
|
|
109
138
|
*/
|
|
110
|
-
|
|
111
|
-
|
|
139
|
+
template <class real_t, class sparse_ix>
|
|
140
|
+
void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
|
|
141
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
|
142
|
+
size_t nrows, bool use_long_double, int nthreads,
|
|
143
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
144
|
+
Imputer &imputer)
|
|
145
|
+
{
|
|
146
|
+
if (use_long_double && !has_long_double()) {
|
|
147
|
+
use_long_double = false;
|
|
148
|
+
fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
|
|
149
|
+
}
|
|
150
|
+
#ifndef NO_LONG_DOUBLE
|
|
151
|
+
if (likely(!use_long_double))
|
|
152
|
+
#endif
|
|
153
|
+
impute_missing_values_internal<real_t, sparse_ix, double>(
|
|
154
|
+
numeric_data, categ_data, is_col_major,
|
|
155
|
+
Xr, Xr_ind, Xr_indptr,
|
|
156
|
+
nrows, nthreads,
|
|
157
|
+
model_outputs, model_outputs_ext,
|
|
158
|
+
imputer
|
|
159
|
+
);
|
|
160
|
+
#ifndef NO_LONG_DOUBLE
|
|
161
|
+
else
|
|
162
|
+
impute_missing_values_internal<real_t, sparse_ix, long double>(
|
|
163
|
+
numeric_data, categ_data, is_col_major,
|
|
164
|
+
Xr, Xr_ind, Xr_indptr,
|
|
165
|
+
nrows, nthreads,
|
|
166
|
+
model_outputs, model_outputs_ext,
|
|
167
|
+
imputer
|
|
168
|
+
);
|
|
169
|
+
#endif
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
template <class real_t, class sparse_ix, class ldouble_safe>
|
|
173
|
+
void impute_missing_values_internal(
|
|
174
|
+
real_t numeric_data[], int categ_data[], bool is_col_major,
|
|
175
|
+
real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
|
112
176
|
size_t nrows, int nthreads,
|
|
113
177
|
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
114
178
|
Imputer &imputer)
|
|
115
179
|
{
|
|
116
|
-
PredictionData
|
|
180
|
+
PredictionData<real_t, sparse_ix>
|
|
181
|
+
prediction_data = {numeric_data, categ_data, nrows,
|
|
182
|
+
is_col_major, imputer.ncols_numeric, imputer.ncols_categ,
|
|
117
183
|
NULL, NULL, NULL,
|
|
118
184
|
Xr, Xr_ind, Xr_indptr};
|
|
119
185
|
|
|
@@ -128,34 +194,53 @@ void impute_missing_values(double numeric_data[], int categ_data[],
|
|
|
128
194
|
if ((size_t)nthreads > end)
|
|
129
195
|
nthreads = (int)end;
|
|
130
196
|
#ifdef _OPENMP
|
|
131
|
-
std::vector<ImputedData
|
|
197
|
+
std::vector<ImputedData<sparse_ix, ldouble_safe>> imp_memory(nthreads);
|
|
132
198
|
#else
|
|
133
|
-
std::vector<ImputedData
|
|
199
|
+
std::vector<ImputedData<sparse_ix, ldouble_safe>> imp_memory(1);
|
|
134
200
|
#endif
|
|
135
201
|
|
|
202
|
+
bool threw_exception = false;
|
|
203
|
+
std::exception_ptr ex = NULL;
|
|
136
204
|
|
|
137
205
|
if (model_outputs != NULL)
|
|
138
206
|
{
|
|
139
207
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
|
140
|
-
shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer)
|
|
141
|
-
for (size_t_for row = 0; row < end; row++)
|
|
208
|
+
shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer, ex, threw_exception)
|
|
209
|
+
for (size_t_for row = 0; row < (decltype(row))end; row++)
|
|
142
210
|
{
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
for (std::vector<IsoTree> &tree : model_outputs->trees)
|
|
211
|
+
if (threw_exception) continue;
|
|
212
|
+
try
|
|
146
213
|
{
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
214
|
+
initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
|
|
215
|
+
|
|
216
|
+
for (std::vector<IsoTree> &tree : model_outputs->trees)
|
|
217
|
+
{
|
|
218
|
+
traverse_itree(tree,
|
|
219
|
+
*model_outputs,
|
|
220
|
+
prediction_data,
|
|
221
|
+
&imputer.imputer_tree[&tree - &(model_outputs->trees[0])],
|
|
222
|
+
&imp_memory[omp_get_thread_num()],
|
|
223
|
+
(double) 1,
|
|
224
|
+
ix_arr[row],
|
|
225
|
+
(sparse_ix*)NULL,
|
|
226
|
+
(double*)NULL,
|
|
227
|
+
(size_t) 0);
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
|
|
156
231
|
}
|
|
157
232
|
|
|
158
|
-
|
|
233
|
+
catch(...)
|
|
234
|
+
{
|
|
235
|
+
#pragma omp critical
|
|
236
|
+
{
|
|
237
|
+
if (!threw_exception)
|
|
238
|
+
{
|
|
239
|
+
threw_exception = true;
|
|
240
|
+
ex = std::current_exception();
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
}
|
|
159
244
|
|
|
160
245
|
}
|
|
161
246
|
}
|
|
@@ -164,31 +249,51 @@ void impute_missing_values(double numeric_data[], int categ_data[],
|
|
|
164
249
|
{
|
|
165
250
|
double temp;
|
|
166
251
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
|
|
167
|
-
shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer) \
|
|
252
|
+
shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer, ex, threw_exception) \
|
|
168
253
|
private(temp)
|
|
169
|
-
for (size_t_for row = 0; row < end; row++)
|
|
254
|
+
for (size_t_for row = 0; row < (decltype(row))end; row++)
|
|
170
255
|
{
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
|
|
256
|
+
if (threw_exception) continue;
|
|
257
|
+
try
|
|
174
258
|
{
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
259
|
+
initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
|
|
260
|
+
|
|
261
|
+
for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
|
|
262
|
+
{
|
|
263
|
+
traverse_hplane(hplane,
|
|
264
|
+
*model_outputs_ext,
|
|
265
|
+
prediction_data,
|
|
266
|
+
temp,
|
|
267
|
+
&imputer.imputer_tree[&hplane - &(model_outputs_ext->hplanes[0])],
|
|
268
|
+
&imp_memory[omp_get_thread_num()],
|
|
269
|
+
(sparse_ix*)NULL,
|
|
270
|
+
(double*)NULL,
|
|
271
|
+
ix_arr[row]);
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
|
|
183
275
|
}
|
|
184
276
|
|
|
185
|
-
|
|
277
|
+
catch (...)
|
|
278
|
+
{
|
|
279
|
+
#pragma omp critical
|
|
280
|
+
{
|
|
281
|
+
if (!threw_exception)
|
|
282
|
+
{
|
|
283
|
+
threw_exception = true;
|
|
284
|
+
ex = std::current_exception();
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
}
|
|
186
288
|
|
|
187
289
|
}
|
|
188
290
|
}
|
|
189
291
|
|
|
292
|
+
if (threw_exception)
|
|
293
|
+
std::rethrow_exception(ex);
|
|
190
294
|
}
|
|
191
295
|
|
|
296
|
+
template <class InputData, class ldouble_safe>
|
|
192
297
|
void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads)
|
|
193
298
|
{
|
|
194
299
|
imputer.ncols_numeric = input_data.ncols_numeric;
|
|
@@ -212,7 +317,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
|
212
317
|
if (input_data.numeric_data != NULL)
|
|
213
318
|
{
|
|
214
319
|
#pragma omp parallel for schedule(static) num_threads(nthreads) private(cnt, offset) shared(input_data, imputer)
|
|
215
|
-
for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
|
|
320
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_numeric; col++)
|
|
216
321
|
{
|
|
217
322
|
cnt = input_data.nrows;
|
|
218
323
|
offset = col * input_data.nrows;
|
|
@@ -222,23 +327,23 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
|
222
327
|
input_data.numeric_data[row + offset] : 0;
|
|
223
328
|
cnt -= is_na_or_inf(input_data.numeric_data[row + offset]);
|
|
224
329
|
}
|
|
225
|
-
imputer.col_means[col] /= (
|
|
330
|
+
imputer.col_means[col] /= (ldouble_safe) cnt;
|
|
226
331
|
}
|
|
227
332
|
}
|
|
228
333
|
|
|
229
334
|
else if (input_data.Xc_indptr != NULL)
|
|
230
335
|
{
|
|
231
336
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
|
|
232
|
-
for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
|
|
337
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_numeric; col++)
|
|
233
338
|
{
|
|
234
339
|
cnt = input_data.nrows;
|
|
235
|
-
for (
|
|
340
|
+
for (auto ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
|
236
341
|
{
|
|
237
342
|
imputer.col_means[col] += (!is_na_or_inf(input_data.Xc[ix]))?
|
|
238
343
|
input_data.Xc[ix] : 0;
|
|
239
344
|
cnt -= is_na_or_inf(input_data.Xc[ix]);
|
|
240
345
|
}
|
|
241
|
-
imputer.col_means[col] /= (
|
|
346
|
+
imputer.col_means[col] /= (ldouble_safe) cnt;
|
|
242
347
|
}
|
|
243
348
|
}
|
|
244
349
|
|
|
@@ -246,7 +351,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
|
246
351
|
{
|
|
247
352
|
std::vector<size_t> cat_counts(input_data.max_categ);
|
|
248
353
|
#pragma omp parallel for schedule(static) num_threads(nthreads) firstprivate(cat_counts) private(offset) shared(input_data, imputer)
|
|
249
|
-
for (size_t_for col = 0; col < input_data.ncols_categ; col++)
|
|
354
|
+
for (size_t_for col = 0; col < (decltype(col))input_data.ncols_categ; col++)
|
|
250
355
|
{
|
|
251
356
|
std::fill(cat_counts.begin(), cat_counts.end(), 0);
|
|
252
357
|
offset = col * input_data.nrows;
|
|
@@ -264,6 +369,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
|
264
369
|
|
|
265
370
|
|
|
266
371
|
/* https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
|
|
372
|
+
template <class InputData, class WorkerMemory, class ldouble_safe>
|
|
267
373
|
void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
268
374
|
InputData &input_data, ModelParams &model_params,
|
|
269
375
|
std::vector<ImputeNode> &imputer_tree,
|
|
@@ -274,7 +380,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
274
380
|
if (!has_weights)
|
|
275
381
|
wsum = (double)(workspace.end - workspace.st + 1);
|
|
276
382
|
else
|
|
277
|
-
wsum = calculate_sum_weights(
|
|
383
|
+
wsum = calculate_sum_weights<ldouble_safe>(
|
|
384
|
+
workspace.ix_arr, workspace.st, workspace.end, curr_depth,
|
|
278
385
|
workspace.weights_arr, workspace.weights_map);
|
|
279
386
|
|
|
280
387
|
imputer.num_sum.resize(input_data.ncols_numeric, 0);
|
|
@@ -320,7 +427,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
320
427
|
if (!is_na_or_inf(xnum))
|
|
321
428
|
{
|
|
322
429
|
cnt++;
|
|
323
|
-
imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (
|
|
430
|
+
imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (ldouble_safe)cnt;
|
|
324
431
|
}
|
|
325
432
|
}
|
|
326
433
|
imputer.num_weight[col] = (double) cnt;
|
|
@@ -349,7 +456,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
349
456
|
|
|
350
457
|
else
|
|
351
458
|
{
|
|
352
|
-
|
|
459
|
+
ldouble_safe prod_sum, corr, val, diff;
|
|
353
460
|
if (input_data.numeric_data != NULL)
|
|
354
461
|
{
|
|
355
462
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
|
@@ -417,7 +524,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
417
524
|
row != ix_arr + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
|
|
418
525
|
)
|
|
419
526
|
{
|
|
420
|
-
if (input_data.Xc_ind[curr_pos] == *row)
|
|
527
|
+
if (input_data.Xc_ind[curr_pos] == static_cast<typename std::remove_pointer<decltype(input_data.Xc_ind)>::type>(*row))
|
|
421
528
|
{
|
|
422
529
|
xnum = input_data.Xc[curr_pos];
|
|
423
530
|
if (workspace.weights_arr.size())
|
|
@@ -443,7 +550,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
443
550
|
|
|
444
551
|
else
|
|
445
552
|
{
|
|
446
|
-
if (input_data.Xc_ind[curr_pos] > *row)
|
|
553
|
+
if (input_data.Xc_ind[curr_pos] > static_cast<typename std::remove_pointer<decltype(input_data.Xc_ind)>::type>(*row))
|
|
447
554
|
row = std::lower_bound(row + 1, ix_arr + workspace.end + 1, input_data.Xc_ind[curr_pos]);
|
|
448
555
|
else
|
|
449
556
|
curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos + 1, input_data.Xc_ind + end_col + 1, *row) - input_data.Xc_ind;
|
|
@@ -514,7 +621,10 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
514
621
|
{
|
|
515
622
|
for (int cat = 0; cat < input_data.ncat[col]; cat++)
|
|
516
623
|
{
|
|
517
|
-
imputer.cat_sum[col][cat]
|
|
624
|
+
imputer.cat_sum[col][cat]
|
|
625
|
+
+=
|
|
626
|
+
(imputer_tree[curr_tree].cat_sum[col][cat] > 0)?
|
|
627
|
+
(imputer_tree[curr_tree].cat_sum[col][cat] / imputer.cat_weight[col]) : 0.;
|
|
518
628
|
imputer.cat_weight[col] = wsum / (double)(2 * look_aboves);
|
|
519
629
|
}
|
|
520
630
|
break;
|
|
@@ -544,7 +654,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
544
654
|
{
|
|
545
655
|
case Inverse:
|
|
546
656
|
{
|
|
547
|
-
double wsum_div = wsum * sqrt(wsum);
|
|
657
|
+
double wsum_div = wsum * std::sqrt(wsum);
|
|
548
658
|
for (double &w : imputer.num_weight)
|
|
549
659
|
w /= wsum_div;
|
|
550
660
|
|
|
@@ -562,6 +672,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
562
672
|
break;
|
|
563
673
|
}
|
|
564
674
|
|
|
675
|
+
default: {}
|
|
676
|
+
|
|
565
677
|
/* TODO: maybe divide by nrows for prop */
|
|
566
678
|
}
|
|
567
679
|
|
|
@@ -585,6 +697,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
|
585
697
|
w *= curr_depth_dbl;
|
|
586
698
|
break;
|
|
587
699
|
}
|
|
700
|
+
|
|
701
|
+
default: {}
|
|
588
702
|
}
|
|
589
703
|
|
|
590
704
|
/* now re-adjust sums */
|
|
@@ -621,7 +735,7 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
|
621
735
|
{
|
|
622
736
|
for (size_t tr = 0; tr < trees->size(); tr++)
|
|
623
737
|
{
|
|
624
|
-
if ((*trees)[tr].
|
|
738
|
+
if ((*trees)[tr].tree_left != 0)
|
|
625
739
|
{
|
|
626
740
|
shrink_impute_node(imputer_tree[tr]);
|
|
627
741
|
}
|
|
@@ -639,7 +753,7 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
|
639
753
|
{
|
|
640
754
|
for (size_t tr = 0; tr < hplanes->size(); tr++)
|
|
641
755
|
{
|
|
642
|
-
if ((*hplanes)[tr].
|
|
756
|
+
if ((*hplanes)[tr].hplane_left != 0)
|
|
643
757
|
{
|
|
644
758
|
shrink_impute_node(imputer_tree[tr]);
|
|
645
759
|
}
|
|
@@ -656,7 +770,8 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
|
|
656
770
|
imputer_tree.shrink_to_fit();
|
|
657
771
|
}
|
|
658
772
|
|
|
659
|
-
|
|
773
|
+
template <class ImputedData>
|
|
774
|
+
void combine_imp_single(ImputedData &restrict imp_addfrom, ImputedData &restrict imp_addto)
|
|
660
775
|
{
|
|
661
776
|
size_t col;
|
|
662
777
|
for (size_t ix = 0; ix < imp_addfrom.n_missing_num; ix++)
|
|
@@ -681,16 +796,17 @@ void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto)
|
|
|
681
796
|
}
|
|
682
797
|
}
|
|
683
798
|
|
|
799
|
+
template <class ImputedData, class WorkerMemory>
|
|
684
800
|
void combine_tree_imputations(WorkerMemory &workspace,
|
|
685
801
|
std::vector<ImputedData> &impute_vec,
|
|
686
|
-
|
|
802
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
|
687
803
|
std::vector<char> &has_missing,
|
|
688
804
|
int nthreads)
|
|
689
805
|
{
|
|
690
806
|
if (workspace.impute_vec.size())
|
|
691
807
|
{
|
|
692
808
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_vec)
|
|
693
|
-
for (size_t_for row = 0; row < has_missing.size(); row++)
|
|
809
|
+
for (size_t_for row = 0; row < (decltype(row))has_missing.size(); row++)
|
|
694
810
|
if (has_missing[row])
|
|
695
811
|
combine_imp_single(workspace.impute_vec[row], impute_vec[row]);
|
|
696
812
|
}
|
|
@@ -698,13 +814,14 @@ void combine_tree_imputations(WorkerMemory &workspace,
|
|
|
698
814
|
else if (workspace.impute_map.size())
|
|
699
815
|
{
|
|
700
816
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_map)
|
|
701
|
-
for (size_t_for row = 0; row < has_missing.size(); row++)
|
|
817
|
+
for (size_t_for row = 0; row < (decltype(row))has_missing.size(); row++)
|
|
702
818
|
if (has_missing[row])
|
|
703
819
|
combine_imp_single(workspace.impute_map[row], impute_map[row]);
|
|
704
820
|
}
|
|
705
821
|
}
|
|
706
822
|
|
|
707
823
|
|
|
824
|
+
template <class ImputedData>
|
|
708
825
|
void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w)
|
|
709
826
|
{
|
|
710
827
|
size_t col;
|
|
@@ -731,6 +848,7 @@ void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double
|
|
|
731
848
|
}
|
|
732
849
|
|
|
733
850
|
|
|
851
|
+
template <class InputData, class WorkerMemory>
|
|
734
852
|
void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data)
|
|
735
853
|
{
|
|
736
854
|
if (workspace.impute_vec.size())
|
|
@@ -794,7 +912,7 @@ void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputDat
|
|
|
794
912
|
}
|
|
795
913
|
}
|
|
796
914
|
|
|
797
|
-
template <class imp_arr>
|
|
915
|
+
template <class imp_arr, class InputData>
|
|
798
916
|
void apply_imputation_results(imp_arr &impute_vec,
|
|
799
917
|
Imputer &imputer,
|
|
800
918
|
InputData &input_data,
|
|
@@ -809,7 +927,7 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
|
809
927
|
|
|
810
928
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
|
811
929
|
{
|
|
812
|
-
for (
|
|
930
|
+
for (auto ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
|
813
931
|
{
|
|
814
932
|
if (is_na_or_inf(input_data.Xc[ix]))
|
|
815
933
|
{
|
|
@@ -832,7 +950,7 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
|
832
950
|
}
|
|
833
951
|
|
|
834
952
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(input_data, impute_vec, imputer) private(col)
|
|
835
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
|
953
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
|
836
954
|
{
|
|
837
955
|
if (input_data.has_missing[row])
|
|
838
956
|
{
|
|
@@ -867,8 +985,9 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
|
867
985
|
}
|
|
868
986
|
}
|
|
869
987
|
|
|
988
|
+
template <class ImputedData, class InputData>
|
|
870
989
|
void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
|
871
|
-
|
|
990
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
|
872
991
|
Imputer &imputer,
|
|
873
992
|
InputData &input_data,
|
|
874
993
|
int nthreads)
|
|
@@ -880,6 +999,7 @@ void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
|
|
880
999
|
}
|
|
881
1000
|
|
|
882
1001
|
|
|
1002
|
+
template <class PredictionData, class ImputedData>
|
|
883
1003
|
void apply_imputation_results(PredictionData &prediction_data,
|
|
884
1004
|
ImputedData &imp,
|
|
885
1005
|
Imputer &imputer,
|
|
@@ -887,21 +1007,40 @@ void apply_imputation_results(PredictionData &prediction_data,
|
|
|
887
1007
|
{
|
|
888
1008
|
size_t col;
|
|
889
1009
|
size_t pos = 0;
|
|
890
|
-
|
|
1010
|
+
if (prediction_data.is_col_major)
|
|
891
1011
|
{
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
1012
|
+
for (size_t ix = 0; ix < imp.n_missing_num; ix++)
|
|
1013
|
+
{
|
|
1014
|
+
col = imp.missing_num[ix];
|
|
1015
|
+
if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
|
|
1016
|
+
prediction_data.numeric_data[row + col * prediction_data.nrows]
|
|
1017
|
+
=
|
|
1018
|
+
imp.num_sum[ix] / imp.num_weight[ix];
|
|
1019
|
+
else
|
|
1020
|
+
prediction_data.numeric_data[row + col * prediction_data.nrows]
|
|
1021
|
+
=
|
|
1022
|
+
imputer.col_means[col];
|
|
1023
|
+
}
|
|
1024
|
+
}
|
|
1025
|
+
|
|
1026
|
+
else
|
|
1027
|
+
{
|
|
1028
|
+
for (size_t ix = 0; ix < imp.n_missing_num; ix++)
|
|
1029
|
+
{
|
|
1030
|
+
col = imp.missing_num[ix];
|
|
1031
|
+
if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
|
|
1032
|
+
prediction_data.numeric_data[col + row * imputer.ncols_numeric]
|
|
1033
|
+
=
|
|
1034
|
+
imp.num_sum[ix] / imp.num_weight[ix];
|
|
1035
|
+
else
|
|
1036
|
+
prediction_data.numeric_data[col + row * imputer.ncols_numeric]
|
|
1037
|
+
=
|
|
1038
|
+
imputer.col_means[col];
|
|
1039
|
+
}
|
|
901
1040
|
}
|
|
902
1041
|
|
|
903
1042
|
if (prediction_data.Xr != NULL)
|
|
904
|
-
for (
|
|
1043
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
|
905
1044
|
{
|
|
906
1045
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
|
907
1046
|
{
|
|
@@ -917,22 +1056,43 @@ void apply_imputation_results(PredictionData &prediction_data,
|
|
|
917
1056
|
}
|
|
918
1057
|
}
|
|
919
1058
|
|
|
920
|
-
|
|
1059
|
+
if (prediction_data.is_col_major)
|
|
921
1060
|
{
|
|
922
|
-
|
|
923
|
-
|
|
1061
|
+
for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
|
|
1062
|
+
{
|
|
1063
|
+
col = imp.missing_cat[ix];
|
|
1064
|
+
prediction_data.categ_data[row + col * prediction_data.nrows]
|
|
1065
|
+
=
|
|
1066
|
+
std::distance(imp.cat_sum[col].begin(),
|
|
1067
|
+
std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
|
|
1068
|
+
|
|
1069
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0 && imp.cat_sum[col][0] <= 0)
|
|
1070
|
+
prediction_data.categ_data[row + col * prediction_data.nrows]
|
|
924
1071
|
=
|
|
925
|
-
|
|
926
|
-
|
|
1072
|
+
imputer.col_modes[col];
|
|
1073
|
+
}
|
|
1074
|
+
}
|
|
927
1075
|
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
1076
|
+
else
|
|
1077
|
+
{
|
|
1078
|
+
for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
|
|
1079
|
+
{
|
|
1080
|
+
col = imp.missing_cat[ix];
|
|
1081
|
+
prediction_data.categ_data[col + row * imputer.ncols_categ]
|
|
1082
|
+
=
|
|
1083
|
+
std::distance(imp.cat_sum[col].begin(),
|
|
1084
|
+
std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
|
|
1085
|
+
|
|
1086
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] == 0 && imp.cat_sum[col][0] <= 0)
|
|
1087
|
+
prediction_data.categ_data[col + row * imputer.ncols_categ]
|
|
1088
|
+
=
|
|
1089
|
+
imputer.col_modes[col];
|
|
1090
|
+
}
|
|
932
1091
|
}
|
|
933
1092
|
}
|
|
934
1093
|
|
|
935
1094
|
|
|
1095
|
+
template <class ImputedData, class InputData>
|
|
936
1096
|
void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
|
937
1097
|
{
|
|
938
1098
|
imp.n_missing_num = 0;
|
|
@@ -953,15 +1113,15 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
|
|
953
1113
|
else if (input_data.Xc_indptr != NULL)
|
|
954
1114
|
{
|
|
955
1115
|
imp.missing_sp.resize(input_data.ncols_numeric);
|
|
956
|
-
|
|
1116
|
+
decltype(input_data.Xc_indptr) res;
|
|
957
1117
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
|
958
1118
|
{
|
|
959
1119
|
res = std::lower_bound(input_data.Xc_ind + input_data.Xc_indptr[col],
|
|
960
1120
|
input_data.Xc_ind + input_data.Xc_indptr[col + 1],
|
|
961
|
-
|
|
1121
|
+
row);
|
|
962
1122
|
if (
|
|
963
1123
|
res != input_data.Xc_ind + input_data.Xc_indptr[col + 1] &&
|
|
964
|
-
*res == row &&
|
|
1124
|
+
*res == static_cast<typename std::remove_pointer<decltype(res)>::type>(row) &&
|
|
965
1125
|
is_na_or_inf(input_data.Xc[res - input_data.Xc_ind])
|
|
966
1126
|
)
|
|
967
1127
|
{
|
|
@@ -986,6 +1146,7 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
|
|
986
1146
|
}
|
|
987
1147
|
}
|
|
988
1148
|
|
|
1149
|
+
template <class ImputedData, class PredictionData>
|
|
989
1150
|
void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row)
|
|
990
1151
|
{
|
|
991
1152
|
imp.n_missing_num = 0;
|
|
@@ -996,9 +1157,20 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
|
996
1157
|
{
|
|
997
1158
|
if (!imp.missing_num.size())
|
|
998
1159
|
imp.missing_num.resize(imputer.ncols_numeric);
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1160
|
+
|
|
1161
|
+
if (prediction_data.is_col_major)
|
|
1162
|
+
{
|
|
1163
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
|
1164
|
+
if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
|
|
1165
|
+
imp.missing_num[imp.n_missing_num++] = col;
|
|
1166
|
+
}
|
|
1167
|
+
|
|
1168
|
+
else
|
|
1169
|
+
{
|
|
1170
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
|
1171
|
+
if (is_na_or_inf(prediction_data.numeric_data[col + row * imputer.ncols_numeric]))
|
|
1172
|
+
imp.missing_num[imp.n_missing_num++] = col;
|
|
1173
|
+
}
|
|
1002
1174
|
|
|
1003
1175
|
if (!imp.num_sum.size())
|
|
1004
1176
|
{
|
|
@@ -1017,7 +1189,7 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
|
1017
1189
|
{
|
|
1018
1190
|
if (!imp.missing_sp.size())
|
|
1019
1191
|
imp.missing_sp.resize(imputer.ncols_numeric);
|
|
1020
|
-
for (
|
|
1192
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
|
1021
1193
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
|
1022
1194
|
imp.missing_sp[imp.n_missing_sp++] = prediction_data.Xr_ind[ix];
|
|
1023
1195
|
|
|
@@ -1038,10 +1210,23 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
|
1038
1210
|
{
|
|
1039
1211
|
if (!imp.missing_cat.size())
|
|
1040
1212
|
imp.missing_cat.resize(imputer.ncols_categ);
|
|
1041
|
-
|
|
1213
|
+
|
|
1214
|
+
if (prediction_data.is_col_major)
|
|
1042
1215
|
{
|
|
1043
|
-
|
|
1044
|
-
|
|
1216
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
|
1217
|
+
{
|
|
1218
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
|
|
1219
|
+
imp.missing_cat[imp.n_missing_cat++] = col;
|
|
1220
|
+
}
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
else
|
|
1224
|
+
{
|
|
1225
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
|
1226
|
+
{
|
|
1227
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] < 0)
|
|
1228
|
+
imp.missing_cat[imp.n_missing_cat++] = col;
|
|
1229
|
+
}
|
|
1045
1230
|
}
|
|
1046
1231
|
|
|
1047
1232
|
if (!imp.cat_weight.size())
|
|
@@ -1063,31 +1248,35 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
|
|
|
1063
1248
|
}
|
|
1064
1249
|
}
|
|
1065
1250
|
|
|
1066
|
-
ImputedData
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1251
|
+
// template class ImputedData <class InputData>
|
|
1252
|
+
// ImputedData::ImputedData(InputData &input_data, size_t row)
|
|
1253
|
+
// {
|
|
1254
|
+
// initialize_impute_calc(*this, input_data, row);
|
|
1255
|
+
// }
|
|
1070
1256
|
|
|
1257
|
+
template <class ImputedData, class InputData>
|
|
1071
1258
|
void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads)
|
|
1072
1259
|
{
|
|
1073
1260
|
impute_vec.resize(input_data.nrows);
|
|
1074
1261
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(impute_vec, input_data)
|
|
1075
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
|
1262
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
|
1076
1263
|
if (input_data.has_missing[row])
|
|
1077
1264
|
initialize_impute_calc(impute_vec[row], input_data, row);
|
|
1078
1265
|
}
|
|
1079
1266
|
|
|
1080
1267
|
|
|
1081
|
-
|
|
1268
|
+
template <class ImputedData, class InputData>
|
|
1269
|
+
void allocate_imp_map(hashed_map<size_t, ImputedData> &impute_map, InputData &input_data)
|
|
1082
1270
|
{
|
|
1083
1271
|
for (size_t row = 0; row < input_data.nrows; row++)
|
|
1084
1272
|
if (input_data.has_missing[row])
|
|
1085
1273
|
impute_map[row] = ImputedData(input_data, row);
|
|
1086
1274
|
}
|
|
1087
1275
|
|
|
1276
|
+
template <class ImputedData, class InputData>
|
|
1088
1277
|
void allocate_imp(InputData &input_data,
|
|
1089
1278
|
std::vector<ImputedData> &impute_vec,
|
|
1090
|
-
|
|
1279
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
|
1091
1280
|
int nthreads)
|
|
1092
1281
|
{
|
|
1093
1282
|
if (input_data.n_missing == 0)
|
|
@@ -1098,9 +1287,10 @@ void allocate_imp(InputData &input_data,
|
|
|
1098
1287
|
allocate_imp_vec(impute_vec, input_data, nthreads);
|
|
1099
1288
|
}
|
|
1100
1289
|
|
|
1290
|
+
template <class ImputedData, class InputData>
|
|
1101
1291
|
void check_for_missing(InputData &input_data,
|
|
1102
1292
|
std::vector<ImputedData> &impute_vec,
|
|
1103
|
-
|
|
1293
|
+
hashed_map<size_t, ImputedData> &impute_map,
|
|
1104
1294
|
int nthreads)
|
|
1105
1295
|
{
|
|
1106
1296
|
input_data.has_missing.assign(input_data.nrows, false);
|
|
@@ -1109,7 +1299,7 @@ void check_for_missing(InputData &input_data,
|
|
|
1109
1299
|
{
|
|
1110
1300
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
|
1111
1301
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
|
|
1112
|
-
for (size_t_for ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
|
|
1302
|
+
for (size_t_for ix = input_data.Xc_indptr[col]; ix < (decltype(ix))input_data.Xc_indptr[col + 1]; ix++)
|
|
1113
1303
|
if (is_na_or_inf(input_data.Xc[ix]))
|
|
1114
1304
|
input_data.has_missing[input_data.Xc_ind[ix]] = true;
|
|
1115
1305
|
#pragma omp barrier
|
|
@@ -1118,14 +1308,17 @@ void check_for_missing(InputData &input_data,
|
|
|
1118
1308
|
if (input_data.numeric_data != NULL || input_data.categ_data != NULL)
|
|
1119
1309
|
{
|
|
1120
1310
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data)
|
|
1121
|
-
for (size_t_for row = 0; row < input_data.nrows; row++)
|
|
1311
|
+
for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
|
|
1122
1312
|
{
|
|
1123
|
-
|
|
1313
|
+
if (input_data.Xc_indptr == NULL)
|
|
1124
1314
|
{
|
|
1125
|
-
|
|
1315
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
|
1126
1316
|
{
|
|
1127
|
-
input_data.
|
|
1128
|
-
|
|
1317
|
+
if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
|
|
1318
|
+
{
|
|
1319
|
+
input_data.has_missing[row] = true;
|
|
1320
|
+
break;
|
|
1321
|
+
}
|
|
1129
1322
|
}
|
|
1130
1323
|
}
|
|
1131
1324
|
|
|
@@ -1145,6 +1338,7 @@ void check_for_missing(InputData &input_data,
|
|
|
1145
1338
|
allocate_imp(input_data, impute_vec, impute_map, nthreads);
|
|
1146
1339
|
}
|
|
1147
1340
|
|
|
1341
|
+
template <class PredictionData>
|
|
1148
1342
|
size_t check_for_missing(PredictionData &prediction_data,
|
|
1149
1343
|
Imputer &imputer,
|
|
1150
1344
|
size_t ix_arr[],
|
|
@@ -1153,19 +1347,38 @@ size_t check_for_missing(PredictionData &prediction_data,
|
|
|
1153
1347
|
std::vector<char> has_missing(prediction_data.nrows, false);
|
|
1154
1348
|
|
|
1155
1349
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(has_missing, prediction_data, imputer)
|
|
1156
|
-
for (size_t_for row = 0; row < prediction_data.nrows; row++)
|
|
1350
|
+
for (size_t_for row = 0; row < (decltype(row))prediction_data.nrows; row++)
|
|
1157
1351
|
{
|
|
1158
1352
|
if (prediction_data.numeric_data != NULL)
|
|
1159
|
-
|
|
1353
|
+
{
|
|
1354
|
+
if (prediction_data.is_col_major)
|
|
1160
1355
|
{
|
|
1161
|
-
|
|
1356
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
|
1162
1357
|
{
|
|
1163
|
-
|
|
1164
|
-
|
|
1358
|
+
if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
|
|
1359
|
+
{
|
|
1360
|
+
has_missing[row] = true;
|
|
1361
|
+
break;
|
|
1362
|
+
}
|
|
1165
1363
|
}
|
|
1166
1364
|
}
|
|
1365
|
+
|
|
1366
|
+
else
|
|
1367
|
+
{
|
|
1368
|
+
for (size_t col = 0; col < imputer.ncols_numeric; col++)
|
|
1369
|
+
{
|
|
1370
|
+
if (is_na_or_inf(prediction_data.numeric_data[col + row * imputer.ncols_numeric]))
|
|
1371
|
+
{
|
|
1372
|
+
has_missing[row] = true;
|
|
1373
|
+
break;
|
|
1374
|
+
}
|
|
1375
|
+
}
|
|
1376
|
+
}
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1167
1379
|
else if (prediction_data.Xr != NULL)
|
|
1168
|
-
|
|
1380
|
+
{
|
|
1381
|
+
for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
|
|
1169
1382
|
{
|
|
1170
1383
|
if (is_na_or_inf(prediction_data.Xr[ix]))
|
|
1171
1384
|
{
|
|
@@ -1173,16 +1386,34 @@ size_t check_for_missing(PredictionData &prediction_data,
|
|
|
1173
1386
|
break;
|
|
1174
1387
|
}
|
|
1175
1388
|
}
|
|
1389
|
+
}
|
|
1176
1390
|
|
|
1177
1391
|
if (!has_missing[row])
|
|
1178
|
-
|
|
1392
|
+
{
|
|
1393
|
+
if (prediction_data.is_col_major)
|
|
1179
1394
|
{
|
|
1180
|
-
|
|
1395
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
|
1181
1396
|
{
|
|
1182
|
-
|
|
1183
|
-
|
|
1397
|
+
if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
|
|
1398
|
+
{
|
|
1399
|
+
has_missing[row] = true;
|
|
1400
|
+
break;
|
|
1401
|
+
}
|
|
1402
|
+
}
|
|
1403
|
+
}
|
|
1404
|
+
|
|
1405
|
+
else
|
|
1406
|
+
{
|
|
1407
|
+
for (size_t col = 0; col < imputer.ncols_categ; col++)
|
|
1408
|
+
{
|
|
1409
|
+
if (prediction_data.categ_data[col + row * imputer.ncols_categ] < 0)
|
|
1410
|
+
{
|
|
1411
|
+
has_missing[row] = true;
|
|
1412
|
+
break;
|
|
1413
|
+
}
|
|
1184
1414
|
}
|
|
1185
1415
|
}
|
|
1416
|
+
}
|
|
1186
1417
|
}
|
|
1187
1418
|
|
|
1188
1419
|
size_t st = 0;
|