isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -18,11 +18,29 @@
18
18
  * [5] https://sourceforge.net/projects/iforest/
19
19
  * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
20
  * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
23
41
  *
24
42
  * BSD 2-Clause License
25
- * Copyright (c) 2020, David Cortes
43
+ * Copyright (c) 2019-2022, David Cortes
26
44
  * All rights reserved.
27
45
  * Redistribution and use in source and binary forms, with or without
28
46
  * modification, are permitted provided that the following conditions are met:
@@ -52,18 +70,18 @@
52
70
  * Parameters
53
71
  * ==========
54
72
  * - numeric_data[nrows * ncols_numeric] (in, out)
55
- * Pointer to numeric data in which missing values will be imputed. Must be ordered by columns like Fortran,
56
- * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
57
- * and the column order must be the same as in the data that was used to fit the model.
73
+ * Pointer to numeric data in which missing values will be imputed. May be ordered by rows
74
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
75
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
76
+ * (see parameter 'is_col_major').
58
77
  * Pass NULL if there are no dense numeric columns.
59
78
  * Can only pass one of 'numeric_data', 'Xr' + 'Xr_ind' + 'Xr_indptr'.
60
79
  * Imputations will overwrite values in this same array.
61
- * - ncols_numeric
62
- * Number of numeric columns in the data (whether they come in a sparse matrix or dense array).
63
80
  * - categ_data[nrows * ncols_categ]
64
- * Pointer to categorical data in which missing values will be imputed. Must be ordered by columns like Fortran,
65
- * not ordered by rows like C (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.),
66
- * and the column order must be the same as in the data that was used to fit the model.
81
+ * Pointer to categorical data in which missing values will be imputed. May be ordered by rows
82
+ * (i.e. entries 1..n contain row 0, n+1..2n row 1, etc.) - a.k.a. row-major - or by
83
+ * columns (i.e. entries 1..n contain column 0, n+1..2n column 1, etc.) - a.k.a. column-major
84
+ * (see parameter 'is_col_major').
67
85
  * Pass NULL if there are no categorical columns.
68
86
  * Each category should be represented as an integer, and these integers must start at zero and
69
87
  * be in consecutive order - i.e. if category '3' is present, category '2' must have also been
@@ -71,6 +89,11 @@
71
89
  * an encoding). Missing values should be encoded as negative numbers such as (-1). The encoding
72
90
  * must be the same as was used in the data to which the model was fit.
73
91
  * Imputations will overwrite values in this same array.
92
+ * - is_col_major
93
+ * Whether 'numeric_data' and 'categ_data' come in column-major order, like the data to which the
94
+ * model was fit. If passing 'false', will assume they are in row-major order. Note that most of
95
+ * the functions in this library work only with column-major order, but here both are suitable
96
+ * and row-major is preferred. Both arrays must have the same orientation (row/column major).
74
97
  * - ncols_categ
75
98
  * Number of categorical columns in the data.
76
99
  * - ncat[ncols_categ]
@@ -84,6 +107,7 @@
84
107
  * Imputations will overwrite values in this same array.
85
108
  * - Xr_ind[nnz]
86
109
  * Pointer to column indices to which each non-zero entry in 'Xr' corresponds.
110
+ * Must be in sorted order, otherwise results will be incorrect.
87
111
  * Pass NULL if there are no sparse numeric columns in CSR format.
88
112
  * - Xr_indptr[nrows + 1]
89
113
  * Pointer to row index pointers that tell at entry [row] where does row 'row'
@@ -91,6 +115,11 @@
91
115
  * Pass NULL if there are no sparse numeric columns in CSR format.
92
116
  * - nrows
93
117
  * Number of rows in 'numeric_data', 'Xc', 'Xr', 'categ_data'.
118
+ * - use_long_double
119
+ * Whether to use 'long double' (extended precision) type for the calculations. This makes them
120
+ * more accurate (provided that the compiler used has wider long doubles than doubles), but
121
+ * slower - especially on platforms where 'long double' is a software-emulated type (e.g.
122
+ * Power8 platforms).
94
123
  * - nthreads
95
124
  * Number of parallel threads to use. Note that, the more threads, the more memory will be
96
125
  * allocated, even if the thread does not end up being used. Ignored when not building with
@@ -107,13 +136,50 @@
107
136
  * Pointer to fitted imputation node objects for the same trees as in 'model_outputs' or 'model_outputs_ext',
108
137
  * as produced from function 'fit_iforest',
109
138
  */
110
- void impute_missing_values(double numeric_data[], int categ_data[],
111
- double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
139
+ template <class real_t, class sparse_ix>
140
+ void impute_missing_values(real_t numeric_data[], int categ_data[], bool is_col_major,
141
+ real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
142
+ size_t nrows, bool use_long_double, int nthreads,
143
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
144
+ Imputer &imputer)
145
+ {
146
+ if (use_long_double && !has_long_double()) {
147
+ use_long_double = false;
148
+ fprintf(stderr, "Passed 'use_long_double=true', but library was compiled without long double support.\n");
149
+ }
150
+ #ifndef NO_LONG_DOUBLE
151
+ if (likely(!use_long_double))
152
+ #endif
153
+ impute_missing_values_internal<real_t, sparse_ix, double>(
154
+ numeric_data, categ_data, is_col_major,
155
+ Xr, Xr_ind, Xr_indptr,
156
+ nrows, nthreads,
157
+ model_outputs, model_outputs_ext,
158
+ imputer
159
+ );
160
+ #ifndef NO_LONG_DOUBLE
161
+ else
162
+ impute_missing_values_internal<real_t, sparse_ix, long double>(
163
+ numeric_data, categ_data, is_col_major,
164
+ Xr, Xr_ind, Xr_indptr,
165
+ nrows, nthreads,
166
+ model_outputs, model_outputs_ext,
167
+ imputer
168
+ );
169
+ #endif
170
+ }
171
+
172
+ template <class real_t, class sparse_ix, class ldouble_safe>
173
+ void impute_missing_values_internal(
174
+ real_t numeric_data[], int categ_data[], bool is_col_major,
175
+ real_t Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
112
176
  size_t nrows, int nthreads,
113
177
  IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
114
178
  Imputer &imputer)
115
179
  {
116
- PredictionData prediction_data = {numeric_data, categ_data, nrows,
180
+ PredictionData<real_t, sparse_ix>
181
+ prediction_data = {numeric_data, categ_data, nrows,
182
+ is_col_major, imputer.ncols_numeric, imputer.ncols_categ,
117
183
  NULL, NULL, NULL,
118
184
  Xr, Xr_ind, Xr_indptr};
119
185
 
@@ -128,34 +194,53 @@ void impute_missing_values(double numeric_data[], int categ_data[],
128
194
  if ((size_t)nthreads > end)
129
195
  nthreads = (int)end;
130
196
  #ifdef _OPENMP
131
- std::vector<ImputedData> imp_memory(nthreads);
197
+ std::vector<ImputedData<sparse_ix, ldouble_safe>> imp_memory(nthreads);
132
198
  #else
133
- std::vector<ImputedData> imp_memory(1);
199
+ std::vector<ImputedData<sparse_ix, ldouble_safe>> imp_memory(1);
134
200
  #endif
135
201
 
202
+ bool threw_exception = false;
203
+ std::exception_ptr ex = NULL;
136
204
 
137
205
  if (model_outputs != NULL)
138
206
  {
139
207
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
140
- shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer)
141
- for (size_t_for row = 0; row < end; row++)
208
+ shared(end, imp_memory, prediction_data, model_outputs, ix_arr, imputer, ex, threw_exception)
209
+ for (size_t_for row = 0; row < (decltype(row))end; row++)
142
210
  {
143
- initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
144
-
145
- for (std::vector<IsoTree> &tree : model_outputs->trees)
211
+ if (threw_exception) continue;
212
+ try
146
213
  {
147
- traverse_itree(tree,
148
- *model_outputs,
149
- prediction_data,
150
- &imputer.imputer_tree[&tree - &(model_outputs->trees[0])],
151
- &imp_memory[omp_get_thread_num()],
152
- (double) 1,
153
- ix_arr[row],
154
- NULL,
155
- (size_t) 0);
214
+ initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
215
+
216
+ for (std::vector<IsoTree> &tree : model_outputs->trees)
217
+ {
218
+ traverse_itree(tree,
219
+ *model_outputs,
220
+ prediction_data,
221
+ &imputer.imputer_tree[&tree - &(model_outputs->trees[0])],
222
+ &imp_memory[omp_get_thread_num()],
223
+ (double) 1,
224
+ ix_arr[row],
225
+ (sparse_ix*)NULL,
226
+ (double*)NULL,
227
+ (size_t) 0);
228
+ }
229
+
230
+ apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
156
231
  }
157
232
 
158
- apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
233
+ catch(...)
234
+ {
235
+ #pragma omp critical
236
+ {
237
+ if (!threw_exception)
238
+ {
239
+ threw_exception = true;
240
+ ex = std::current_exception();
241
+ }
242
+ }
243
+ }
159
244
 
160
245
  }
161
246
  }
@@ -164,31 +249,51 @@ void impute_missing_values(double numeric_data[], int categ_data[],
164
249
  {
165
250
  double temp;
166
251
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
167
- shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer) \
252
+ shared(end, imp_memory, prediction_data, model_outputs_ext, ix_arr, imputer, ex, threw_exception) \
168
253
  private(temp)
169
- for (size_t_for row = 0; row < end; row++)
254
+ for (size_t_for row = 0; row < (decltype(row))end; row++)
170
255
  {
171
- initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
172
-
173
- for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
256
+ if (threw_exception) continue;
257
+ try
174
258
  {
175
- traverse_hplane(hplane,
176
- *model_outputs_ext,
177
- prediction_data,
178
- temp,
179
- &imputer.imputer_tree[&hplane - &(model_outputs_ext->hplanes[0])],
180
- &imp_memory[omp_get_thread_num()],
181
- NULL,
182
- ix_arr[row]);
259
+ initialize_impute_calc(imp_memory[omp_get_thread_num()], prediction_data, imputer, ix_arr[row]);
260
+
261
+ for (std::vector<IsoHPlane> &hplane : model_outputs_ext->hplanes)
262
+ {
263
+ traverse_hplane(hplane,
264
+ *model_outputs_ext,
265
+ prediction_data,
266
+ temp,
267
+ &imputer.imputer_tree[&hplane - &(model_outputs_ext->hplanes[0])],
268
+ &imp_memory[omp_get_thread_num()],
269
+ (sparse_ix*)NULL,
270
+ (double*)NULL,
271
+ ix_arr[row]);
272
+ }
273
+
274
+ apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
183
275
  }
184
276
 
185
- apply_imputation_results(prediction_data, imp_memory[omp_get_thread_num()], imputer, (size_t) ix_arr[row]);
277
+ catch (...)
278
+ {
279
+ #pragma omp critical
280
+ {
281
+ if (!threw_exception)
282
+ {
283
+ threw_exception = true;
284
+ ex = std::current_exception();
285
+ }
286
+ }
287
+ }
186
288
 
187
289
  }
188
290
  }
189
291
 
292
+ if (threw_exception)
293
+ std::rethrow_exception(ex);
190
294
  }
191
295
 
296
+ template <class InputData, class ldouble_safe>
192
297
  void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads)
193
298
  {
194
299
  imputer.ncols_numeric = input_data.ncols_numeric;
@@ -212,7 +317,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
212
317
  if (input_data.numeric_data != NULL)
213
318
  {
214
319
  #pragma omp parallel for schedule(static) num_threads(nthreads) private(cnt, offset) shared(input_data, imputer)
215
- for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
320
+ for (size_t_for col = 0; col < (decltype(col))input_data.ncols_numeric; col++)
216
321
  {
217
322
  cnt = input_data.nrows;
218
323
  offset = col * input_data.nrows;
@@ -222,23 +327,23 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
222
327
  input_data.numeric_data[row + offset] : 0;
223
328
  cnt -= is_na_or_inf(input_data.numeric_data[row + offset]);
224
329
  }
225
- imputer.col_means[col] /= (long double) cnt;
330
+ imputer.col_means[col] /= (ldouble_safe) cnt;
226
331
  }
227
332
  }
228
333
 
229
334
  else if (input_data.Xc_indptr != NULL)
230
335
  {
231
336
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
232
- for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
337
+ for (size_t_for col = 0; col < (decltype(col))input_data.ncols_numeric; col++)
233
338
  {
234
339
  cnt = input_data.nrows;
235
- for (size_t ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
340
+ for (auto ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
236
341
  {
237
342
  imputer.col_means[col] += (!is_na_or_inf(input_data.Xc[ix]))?
238
343
  input_data.Xc[ix] : 0;
239
344
  cnt -= is_na_or_inf(input_data.Xc[ix]);
240
345
  }
241
- imputer.col_means[col] /= (long double) cnt;
346
+ imputer.col_means[col] /= (ldouble_safe) cnt;
242
347
  }
243
348
  }
244
349
 
@@ -246,7 +351,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
246
351
  {
247
352
  std::vector<size_t> cat_counts(input_data.max_categ);
248
353
  #pragma omp parallel for schedule(static) num_threads(nthreads) firstprivate(cat_counts) private(offset) shared(input_data, imputer)
249
- for (size_t_for col = 0; col < input_data.ncols_categ; col++)
354
+ for (size_t_for col = 0; col < (decltype(col))input_data.ncols_categ; col++)
250
355
  {
251
356
  std::fill(cat_counts.begin(), cat_counts.end(), 0);
252
357
  offset = col * input_data.nrows;
@@ -264,6 +369,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
264
369
 
265
370
 
266
371
  /* https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
372
+ template <class InputData, class WorkerMemory, class ldouble_safe>
267
373
  void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
268
374
  InputData &input_data, ModelParams &model_params,
269
375
  std::vector<ImputeNode> &imputer_tree,
@@ -274,7 +380,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
274
380
  if (!has_weights)
275
381
  wsum = (double)(workspace.end - workspace.st + 1);
276
382
  else
277
- wsum = calculate_sum_weights(workspace.ix_arr, workspace.st, workspace.end, curr_depth,
383
+ wsum = calculate_sum_weights<ldouble_safe>(
384
+ workspace.ix_arr, workspace.st, workspace.end, curr_depth,
278
385
  workspace.weights_arr, workspace.weights_map);
279
386
 
280
387
  imputer.num_sum.resize(input_data.ncols_numeric, 0);
@@ -320,7 +427,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
320
427
  if (!is_na_or_inf(xnum))
321
428
  {
322
429
  cnt++;
323
- imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (long double)cnt;
430
+ imputer.num_sum[col] += (xnum - imputer.num_sum[col]) / (ldouble_safe)cnt;
324
431
  }
325
432
  }
326
433
  imputer.num_weight[col] = (double) cnt;
@@ -349,7 +456,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
349
456
 
350
457
  else
351
458
  {
352
- long double prod_sum, corr, val, diff;
459
+ ldouble_safe prod_sum, corr, val, diff;
353
460
  if (input_data.numeric_data != NULL)
354
461
  {
355
462
  for (size_t col = 0; col < input_data.ncols_numeric; col++)
@@ -417,7 +524,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
417
524
  row != ix_arr + workspace.end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
418
525
  )
419
526
  {
420
- if (input_data.Xc_ind[curr_pos] == *row)
527
+ if (input_data.Xc_ind[curr_pos] == static_cast<typename std::remove_pointer<decltype(input_data.Xc_ind)>::type>(*row))
421
528
  {
422
529
  xnum = input_data.Xc[curr_pos];
423
530
  if (workspace.weights_arr.size())
@@ -443,7 +550,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
443
550
 
444
551
  else
445
552
  {
446
- if (input_data.Xc_ind[curr_pos] > *row)
553
+ if (input_data.Xc_ind[curr_pos] > static_cast<typename std::remove_pointer<decltype(input_data.Xc_ind)>::type>(*row))
447
554
  row = std::lower_bound(row + 1, ix_arr + workspace.end + 1, input_data.Xc_ind[curr_pos]);
448
555
  else
449
556
  curr_pos = std::lower_bound(input_data.Xc_ind + curr_pos + 1, input_data.Xc_ind + end_col + 1, *row) - input_data.Xc_ind;
@@ -514,7 +621,10 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
514
621
  {
515
622
  for (int cat = 0; cat < input_data.ncat[col]; cat++)
516
623
  {
517
- imputer.cat_sum[col][cat] += imputer_tree[curr_tree].cat_sum[col][cat] / imputer.cat_weight[col];
624
+ imputer.cat_sum[col][cat]
625
+ +=
626
+ (imputer_tree[curr_tree].cat_sum[col][cat] > 0)?
627
+ (imputer_tree[curr_tree].cat_sum[col][cat] / imputer.cat_weight[col]) : 0.;
518
628
  imputer.cat_weight[col] = wsum / (double)(2 * look_aboves);
519
629
  }
520
630
  break;
@@ -544,7 +654,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
544
654
  {
545
655
  case Inverse:
546
656
  {
547
- double wsum_div = wsum * sqrt(wsum);
657
+ double wsum_div = wsum * std::sqrt(wsum);
548
658
  for (double &w : imputer.num_weight)
549
659
  w /= wsum_div;
550
660
 
@@ -562,6 +672,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
562
672
  break;
563
673
  }
564
674
 
675
+ default: {}
676
+
565
677
  /* TODO: maybe divide by nrows for prop */
566
678
  }
567
679
 
@@ -585,6 +697,8 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
585
697
  w *= curr_depth_dbl;
586
698
  break;
587
699
  }
700
+
701
+ default: {}
588
702
  }
589
703
 
590
704
  /* now re-adjust sums */
@@ -621,7 +735,7 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
621
735
  {
622
736
  for (size_t tr = 0; tr < trees->size(); tr++)
623
737
  {
624
- if ((*trees)[tr].score <= 0)
738
+ if ((*trees)[tr].tree_left != 0)
625
739
  {
626
740
  shrink_impute_node(imputer_tree[tr]);
627
741
  }
@@ -639,7 +753,7 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
639
753
  {
640
754
  for (size_t tr = 0; tr < hplanes->size(); tr++)
641
755
  {
642
- if ((*hplanes)[tr].score <= 0)
756
+ if ((*hplanes)[tr].hplane_left != 0)
643
757
  {
644
758
  shrink_impute_node(imputer_tree[tr]);
645
759
  }
@@ -656,7 +770,8 @@ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
656
770
  imputer_tree.shrink_to_fit();
657
771
  }
658
772
 
659
- void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto)
773
+ template <class ImputedData>
774
+ void combine_imp_single(ImputedData &restrict imp_addfrom, ImputedData &restrict imp_addto)
660
775
  {
661
776
  size_t col;
662
777
  for (size_t ix = 0; ix < imp_addfrom.n_missing_num; ix++)
@@ -681,16 +796,17 @@ void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto)
681
796
  }
682
797
  }
683
798
 
799
+ template <class ImputedData, class WorkerMemory>
684
800
  void combine_tree_imputations(WorkerMemory &workspace,
685
801
  std::vector<ImputedData> &impute_vec,
686
- std::unordered_map<size_t, ImputedData> &impute_map,
802
+ hashed_map<size_t, ImputedData> &impute_map,
687
803
  std::vector<char> &has_missing,
688
804
  int nthreads)
689
805
  {
690
806
  if (workspace.impute_vec.size())
691
807
  {
692
808
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_vec)
693
- for (size_t_for row = 0; row < has_missing.size(); row++)
809
+ for (size_t_for row = 0; row < (decltype(row))has_missing.size(); row++)
694
810
  if (has_missing[row])
695
811
  combine_imp_single(workspace.impute_vec[row], impute_vec[row]);
696
812
  }
@@ -698,13 +814,14 @@ void combine_tree_imputations(WorkerMemory &workspace,
698
814
  else if (workspace.impute_map.size())
699
815
  {
700
816
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(has_missing, workspace, impute_map)
701
- for (size_t_for row = 0; row < has_missing.size(); row++)
817
+ for (size_t_for row = 0; row < (decltype(row))has_missing.size(); row++)
702
818
  if (has_missing[row])
703
819
  combine_imp_single(workspace.impute_map[row], impute_map[row]);
704
820
  }
705
821
  }
706
822
 
707
823
 
824
+ template <class ImputedData>
708
825
  void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w)
709
826
  {
710
827
  size_t col;
@@ -731,6 +848,7 @@ void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double
731
848
  }
732
849
 
733
850
 
851
+ template <class InputData, class WorkerMemory>
734
852
  void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data)
735
853
  {
736
854
  if (workspace.impute_vec.size())
@@ -794,7 +912,7 @@ void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputDat
794
912
  }
795
913
  }
796
914
 
797
- template <class imp_arr>
915
+ template <class imp_arr, class InputData>
798
916
  void apply_imputation_results(imp_arr &impute_vec,
799
917
  Imputer &imputer,
800
918
  InputData &input_data,
@@ -809,7 +927,7 @@ void apply_imputation_results(imp_arr &impute_vec,
809
927
 
810
928
  for (size_t col = 0; col < input_data.ncols_numeric; col++)
811
929
  {
812
- for (sparse_ix ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
930
+ for (auto ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
813
931
  {
814
932
  if (is_na_or_inf(input_data.Xc[ix]))
815
933
  {
@@ -832,7 +950,7 @@ void apply_imputation_results(imp_arr &impute_vec,
832
950
  }
833
951
 
834
952
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(input_data, impute_vec, imputer) private(col)
835
- for (size_t_for row = 0; row < input_data.nrows; row++)
953
+ for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
836
954
  {
837
955
  if (input_data.has_missing[row])
838
956
  {
@@ -867,8 +985,9 @@ void apply_imputation_results(imp_arr &impute_vec,
867
985
  }
868
986
  }
869
987
 
988
+ template <class ImputedData, class InputData>
870
989
  void apply_imputation_results(std::vector<ImputedData> &impute_vec,
871
- std::unordered_map<size_t, ImputedData> &impute_map,
990
+ hashed_map<size_t, ImputedData> &impute_map,
872
991
  Imputer &imputer,
873
992
  InputData &input_data,
874
993
  int nthreads)
@@ -880,6 +999,7 @@ void apply_imputation_results(std::vector<ImputedData> &impute_vec,
880
999
  }
881
1000
 
882
1001
 
1002
+ template <class PredictionData, class ImputedData>
883
1003
  void apply_imputation_results(PredictionData &prediction_data,
884
1004
  ImputedData &imp,
885
1005
  Imputer &imputer,
@@ -887,21 +1007,40 @@ void apply_imputation_results(PredictionData &prediction_data,
887
1007
  {
888
1008
  size_t col;
889
1009
  size_t pos = 0;
890
- for (size_t ix = 0; ix < imp.n_missing_num; ix++)
1010
+ if (prediction_data.is_col_major)
891
1011
  {
892
- col = imp.missing_num[ix];
893
- if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
894
- prediction_data.numeric_data[row + col * prediction_data.nrows]
895
- =
896
- imp.num_sum[ix] / imp.num_weight[ix];
897
- else
898
- prediction_data.numeric_data[row + col * prediction_data.nrows]
899
- =
900
- imputer.col_means[col];
1012
+ for (size_t ix = 0; ix < imp.n_missing_num; ix++)
1013
+ {
1014
+ col = imp.missing_num[ix];
1015
+ if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
1016
+ prediction_data.numeric_data[row + col * prediction_data.nrows]
1017
+ =
1018
+ imp.num_sum[ix] / imp.num_weight[ix];
1019
+ else
1020
+ prediction_data.numeric_data[row + col * prediction_data.nrows]
1021
+ =
1022
+ imputer.col_means[col];
1023
+ }
1024
+ }
1025
+
1026
+ else
1027
+ {
1028
+ for (size_t ix = 0; ix < imp.n_missing_num; ix++)
1029
+ {
1030
+ col = imp.missing_num[ix];
1031
+ if (imp.num_weight[ix] > 0 && !is_na_or_inf(imp.num_sum[ix]))
1032
+ prediction_data.numeric_data[col + row * imputer.ncols_numeric]
1033
+ =
1034
+ imp.num_sum[ix] / imp.num_weight[ix];
1035
+ else
1036
+ prediction_data.numeric_data[col + row * imputer.ncols_numeric]
1037
+ =
1038
+ imputer.col_means[col];
1039
+ }
901
1040
  }
902
1041
 
903
1042
  if (prediction_data.Xr != NULL)
904
- for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1043
+ for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
905
1044
  {
906
1045
  if (is_na_or_inf(prediction_data.Xr[ix]))
907
1046
  {
@@ -917,22 +1056,43 @@ void apply_imputation_results(PredictionData &prediction_data,
917
1056
  }
918
1057
  }
919
1058
 
920
- for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
1059
+ if (prediction_data.is_col_major)
921
1060
  {
922
- col = imp.missing_cat[ix];
923
- prediction_data.categ_data[row + col * prediction_data.nrows]
1061
+ for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
1062
+ {
1063
+ col = imp.missing_cat[ix];
1064
+ prediction_data.categ_data[row + col * prediction_data.nrows]
1065
+ =
1066
+ std::distance(imp.cat_sum[col].begin(),
1067
+ std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
1068
+
1069
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0 && imp.cat_sum[col][0] <= 0)
1070
+ prediction_data.categ_data[row + col * prediction_data.nrows]
924
1071
  =
925
- std::distance(imp.cat_sum[col].begin(),
926
- std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
1072
+ imputer.col_modes[col];
1073
+ }
1074
+ }
927
1075
 
928
- if (prediction_data.categ_data[row + col * prediction_data.nrows] == 0 && imp.cat_sum[col][0] <= 0)
929
- prediction_data.categ_data[row + col * prediction_data.nrows]
930
- =
931
- imputer.col_modes[col];
1076
+ else
1077
+ {
1078
+ for (size_t ix = 0; ix < imp.n_missing_cat; ix++)
1079
+ {
1080
+ col = imp.missing_cat[ix];
1081
+ prediction_data.categ_data[col + row * imputer.ncols_categ]
1082
+ =
1083
+ std::distance(imp.cat_sum[col].begin(),
1084
+ std::max_element(imp.cat_sum[col].begin(), imp.cat_sum[col].end()));
1085
+
1086
+ if (prediction_data.categ_data[col + row * imputer.ncols_categ] == 0 && imp.cat_sum[col][0] <= 0)
1087
+ prediction_data.categ_data[col + row * imputer.ncols_categ]
1088
+ =
1089
+ imputer.col_modes[col];
1090
+ }
932
1091
  }
933
1092
  }
934
1093
 
935
1094
 
1095
+ template <class ImputedData, class InputData>
936
1096
  void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
937
1097
  {
938
1098
  imp.n_missing_num = 0;
@@ -953,15 +1113,15 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
953
1113
  else if (input_data.Xc_indptr != NULL)
954
1114
  {
955
1115
  imp.missing_sp.resize(input_data.ncols_numeric);
956
- sparse_ix *res;
1116
+ decltype(input_data.Xc_indptr) res;
957
1117
  for (size_t col = 0; col < input_data.ncols_numeric; col++)
958
1118
  {
959
1119
  res = std::lower_bound(input_data.Xc_ind + input_data.Xc_indptr[col],
960
1120
  input_data.Xc_ind + input_data.Xc_indptr[col + 1],
961
- (sparse_ix) row);
1121
+ row);
962
1122
  if (
963
1123
  res != input_data.Xc_ind + input_data.Xc_indptr[col + 1] &&
964
- *res == row &&
1124
+ *res == static_cast<typename std::remove_pointer<decltype(res)>::type>(row) &&
965
1125
  is_na_or_inf(input_data.Xc[res - input_data.Xc_ind])
966
1126
  )
967
1127
  {
@@ -986,6 +1146,7 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
986
1146
  }
987
1147
  }
988
1148
 
1149
+ template <class ImputedData, class PredictionData>
989
1150
  void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row)
990
1151
  {
991
1152
  imp.n_missing_num = 0;
@@ -996,9 +1157,20 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
996
1157
  {
997
1158
  if (!imp.missing_num.size())
998
1159
  imp.missing_num.resize(imputer.ncols_numeric);
999
- for (size_t col = 0; col < imputer.ncols_numeric; col++)
1000
- if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
1001
- imp.missing_num[imp.n_missing_num++] = col;
1160
+
1161
+ if (prediction_data.is_col_major)
1162
+ {
1163
+ for (size_t col = 0; col < imputer.ncols_numeric; col++)
1164
+ if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
1165
+ imp.missing_num[imp.n_missing_num++] = col;
1166
+ }
1167
+
1168
+ else
1169
+ {
1170
+ for (size_t col = 0; col < imputer.ncols_numeric; col++)
1171
+ if (is_na_or_inf(prediction_data.numeric_data[col + row * imputer.ncols_numeric]))
1172
+ imp.missing_num[imp.n_missing_num++] = col;
1173
+ }
1002
1174
 
1003
1175
  if (!imp.num_sum.size())
1004
1176
  {
@@ -1017,7 +1189,7 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
1017
1189
  {
1018
1190
  if (!imp.missing_sp.size())
1019
1191
  imp.missing_sp.resize(imputer.ncols_numeric);
1020
- for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1192
+ for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1021
1193
  if (is_na_or_inf(prediction_data.Xr[ix]))
1022
1194
  imp.missing_sp[imp.n_missing_sp++] = prediction_data.Xr_ind[ix];
1023
1195
 
@@ -1038,10 +1210,23 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
1038
1210
  {
1039
1211
  if (!imp.missing_cat.size())
1040
1212
  imp.missing_cat.resize(imputer.ncols_categ);
1041
- for (size_t col = 0; col < imputer.ncols_categ; col++)
1213
+
1214
+ if (prediction_data.is_col_major)
1042
1215
  {
1043
- if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
1044
- imp.missing_cat[imp.n_missing_cat++] = col;
1216
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1217
+ {
1218
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
1219
+ imp.missing_cat[imp.n_missing_cat++] = col;
1220
+ }
1221
+ }
1222
+
1223
+ else
1224
+ {
1225
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1226
+ {
1227
+ if (prediction_data.categ_data[col + row * imputer.ncols_categ] < 0)
1228
+ imp.missing_cat[imp.n_missing_cat++] = col;
1229
+ }
1045
1230
  }
1046
1231
 
1047
1232
  if (!imp.cat_weight.size())
@@ -1063,31 +1248,35 @@ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, I
1063
1248
  }
1064
1249
  }
1065
1250
 
1066
- ImputedData::ImputedData(InputData &input_data, size_t row)
1067
- {
1068
- initialize_impute_calc(*this, input_data, row);
1069
- }
1251
+ // template class ImputedData <class InputData>
1252
+ // ImputedData::ImputedData(InputData &input_data, size_t row)
1253
+ // {
1254
+ // initialize_impute_calc(*this, input_data, row);
1255
+ // }
1070
1256
 
1257
+ template <class ImputedData, class InputData>
1071
1258
  void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads)
1072
1259
  {
1073
1260
  impute_vec.resize(input_data.nrows);
1074
1261
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(impute_vec, input_data)
1075
- for (size_t_for row = 0; row < input_data.nrows; row++)
1262
+ for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
1076
1263
  if (input_data.has_missing[row])
1077
1264
  initialize_impute_calc(impute_vec[row], input_data, row);
1078
1265
  }
1079
1266
 
1080
1267
 
1081
- void allocate_imp_map(std::unordered_map<size_t, ImputedData> &impute_map, InputData &input_data)
1268
+ template <class ImputedData, class InputData>
1269
+ void allocate_imp_map(hashed_map<size_t, ImputedData> &impute_map, InputData &input_data)
1082
1270
  {
1083
1271
  for (size_t row = 0; row < input_data.nrows; row++)
1084
1272
  if (input_data.has_missing[row])
1085
1273
  impute_map[row] = ImputedData(input_data, row);
1086
1274
  }
1087
1275
 
1276
+ template <class ImputedData, class InputData>
1088
1277
  void allocate_imp(InputData &input_data,
1089
1278
  std::vector<ImputedData> &impute_vec,
1090
- std::unordered_map<size_t, ImputedData> &impute_map,
1279
+ hashed_map<size_t, ImputedData> &impute_map,
1091
1280
  int nthreads)
1092
1281
  {
1093
1282
  if (input_data.n_missing == 0)
@@ -1098,9 +1287,10 @@ void allocate_imp(InputData &input_data,
1098
1287
  allocate_imp_vec(impute_vec, input_data, nthreads);
1099
1288
  }
1100
1289
 
1290
+ template <class ImputedData, class InputData>
1101
1291
  void check_for_missing(InputData &input_data,
1102
1292
  std::vector<ImputedData> &impute_vec,
1103
- std::unordered_map<size_t, ImputedData> &impute_map,
1293
+ hashed_map<size_t, ImputedData> &impute_map,
1104
1294
  int nthreads)
1105
1295
  {
1106
1296
  input_data.has_missing.assign(input_data.nrows, false);
@@ -1109,7 +1299,7 @@ void check_for_missing(InputData &input_data,
1109
1299
  {
1110
1300
  for (size_t col = 0; col < input_data.ncols_numeric; col++)
1111
1301
  #pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
1112
- for (size_t_for ix = input_data.Xc_indptr[col]; ix < input_data.Xc_indptr[col + 1]; ix++)
1302
+ for (size_t_for ix = input_data.Xc_indptr[col]; ix < (decltype(ix))input_data.Xc_indptr[col + 1]; ix++)
1113
1303
  if (is_na_or_inf(input_data.Xc[ix]))
1114
1304
  input_data.has_missing[input_data.Xc_ind[ix]] = true;
1115
1305
  #pragma omp barrier
@@ -1118,14 +1308,17 @@ void check_for_missing(InputData &input_data,
1118
1308
  if (input_data.numeric_data != NULL || input_data.categ_data != NULL)
1119
1309
  {
1120
1310
  #pragma omp parallel for schedule(static) num_threads(nthreads) shared(input_data)
1121
- for (size_t_for row = 0; row < input_data.nrows; row++)
1311
+ for (size_t_for row = 0; row < (decltype(row))input_data.nrows; row++)
1122
1312
  {
1123
- for (size_t col = 0; col < input_data.ncols_numeric; col++)
1313
+ if (input_data.Xc_indptr == NULL)
1124
1314
  {
1125
- if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
1315
+ for (size_t col = 0; col < input_data.ncols_numeric; col++)
1126
1316
  {
1127
- input_data.has_missing[row] = true;
1128
- break;
1317
+ if (is_na_or_inf(input_data.numeric_data[row + col * input_data.nrows]))
1318
+ {
1319
+ input_data.has_missing[row] = true;
1320
+ break;
1321
+ }
1129
1322
  }
1130
1323
  }
1131
1324
 
@@ -1145,6 +1338,7 @@ void check_for_missing(InputData &input_data,
1145
1338
  allocate_imp(input_data, impute_vec, impute_map, nthreads);
1146
1339
  }
1147
1340
 
1341
+ template <class PredictionData>
1148
1342
  size_t check_for_missing(PredictionData &prediction_data,
1149
1343
  Imputer &imputer,
1150
1344
  size_t ix_arr[],
@@ -1153,19 +1347,38 @@ size_t check_for_missing(PredictionData &prediction_data,
1153
1347
  std::vector<char> has_missing(prediction_data.nrows, false);
1154
1348
 
1155
1349
  #pragma omp parallel for schedule(static) num_threads(nthreads) shared(has_missing, prediction_data, imputer)
1156
- for (size_t_for row = 0; row < prediction_data.nrows; row++)
1350
+ for (size_t_for row = 0; row < (decltype(row))prediction_data.nrows; row++)
1157
1351
  {
1158
1352
  if (prediction_data.numeric_data != NULL)
1159
- for (size_t col = 0; col < imputer.ncols_numeric; col++)
1353
+ {
1354
+ if (prediction_data.is_col_major)
1160
1355
  {
1161
- if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
1356
+ for (size_t col = 0; col < imputer.ncols_numeric; col++)
1162
1357
  {
1163
- has_missing[row] = true;
1164
- break;
1358
+ if (is_na_or_inf(prediction_data.numeric_data[row + col * prediction_data.nrows]))
1359
+ {
1360
+ has_missing[row] = true;
1361
+ break;
1362
+ }
1165
1363
  }
1166
1364
  }
1365
+
1366
+ else
1367
+ {
1368
+ for (size_t col = 0; col < imputer.ncols_numeric; col++)
1369
+ {
1370
+ if (is_na_or_inf(prediction_data.numeric_data[col + row * imputer.ncols_numeric]))
1371
+ {
1372
+ has_missing[row] = true;
1373
+ break;
1374
+ }
1375
+ }
1376
+ }
1377
+ }
1378
+
1167
1379
  else if (prediction_data.Xr != NULL)
1168
- for (size_t ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1380
+ {
1381
+ for (auto ix = prediction_data.Xr_indptr[row]; ix < prediction_data.Xr_indptr[row + 1]; ix++)
1169
1382
  {
1170
1383
  if (is_na_or_inf(prediction_data.Xr[ix]))
1171
1384
  {
@@ -1173,16 +1386,34 @@ size_t check_for_missing(PredictionData &prediction_data,
1173
1386
  break;
1174
1387
  }
1175
1388
  }
1389
+ }
1176
1390
 
1177
1391
  if (!has_missing[row])
1178
- for (size_t col = 0; col < imputer.ncols_categ; col++)
1392
+ {
1393
+ if (prediction_data.is_col_major)
1179
1394
  {
1180
- if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
1395
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1181
1396
  {
1182
- has_missing[row] = true;
1183
- break;
1397
+ if (prediction_data.categ_data[row + col * prediction_data.nrows] < 0)
1398
+ {
1399
+ has_missing[row] = true;
1400
+ break;
1401
+ }
1402
+ }
1403
+ }
1404
+
1405
+ else
1406
+ {
1407
+ for (size_t col = 0; col < imputer.ncols_categ; col++)
1408
+ {
1409
+ if (prediction_data.categ_data[col + row * imputer.ncols_categ] < 0)
1410
+ {
1411
+ has_missing[row] = true;
1412
+ break;
1413
+ }
1184
1414
  }
1185
1415
  }
1416
+ }
1186
1417
  }
1187
1418
 
1188
1419
  size_t st = 0;