isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -1,607 +0,0 @@
1
- /* Isolation forests and variations thereof, with adjustments for incorporation
2
- * of categorical variables and missing values.
3
- * Written for C++11 standard and aimed at being used in R and Python.
4
- *
5
- * This library is based on the following works:
6
- * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
- * "Isolation forest."
8
- * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
- * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
- * "Isolation-based anomaly detection."
11
- * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
- * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
- * "Extended Isolation Forest."
14
- * arXiv preprint arXiv:1811.02141 (2018).
15
- * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
- * "On detecting clustered anomalies using SCiForest."
17
- * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
- * [5] https://sourceforge.net/projects/iforest/
19
- * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
- * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
- *
24
- * BSD 2-Clause License
25
- * Copyright (c) 2020, David Cortes
26
- * All rights reserved.
27
- * Redistribution and use in source and binary forms, with or without
28
- * modification, are permitted provided that the following conditions are met:
29
- * * Redistributions of source code must retain the above copyright notice, this
30
- * list of conditions and the following disclaimer.
31
- * * Redistributions in binary form must reproduce the above copyright notice,
32
- * this list of conditions and the following disclaimer in the documentation
33
- * and/or other materials provided with the distribution.
34
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
- */
45
- #include "isotree.hpp"
46
-
47
- /* for regular numerical */
48
- void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double *restrict x,
49
- MissingAction missing_action, double &x_sd, double &x_mean)
50
- {
51
- long double m = 0;
52
- long double s = 0;
53
- long double m_prev = 0;
54
-
55
- if (missing_action == Fail)
56
- {
57
- for (size_t row = st; row <= end; row++)
58
- {
59
- m += (x[ix_arr[row]] - m) / (long double)(row - st + 1);
60
- s += (x[ix_arr[row]] - m) * (x[ix_arr[row]] - m_prev);
61
- m_prev = m;
62
- }
63
-
64
- x_mean = m;
65
- x_sd = sqrtl(s / (long double)(end - st + 1));
66
- }
67
-
68
- else
69
- {
70
- size_t cnt = 0;
71
- for (size_t row = st; row <= end; row++)
72
- {
73
- if (!is_na_or_inf(x[ix_arr[row]]))
74
- {
75
- cnt++;
76
- m += (x[ix_arr[row]] - m) / (long double)cnt;
77
- s += (x[ix_arr[row]] - m) * (x[ix_arr[row]] - m_prev);
78
- m_prev = m;
79
- }
80
- }
81
-
82
- x_mean = m;
83
- x_sd = sqrtl(s / (long double)cnt);
84
- }
85
- }
86
-
87
- /* for sparse numerical */
88
- void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, size_t col_num,
89
- double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
90
- double &x_sd, double &x_mean)
91
- {
92
- /* ix_arr must be already sorted beforehand */
93
- if (Xc_indptr[col_num] == Xc_indptr[col_num + 1])
94
- {
95
- x_sd = 0;
96
- x_mean = 0;
97
- return;
98
- }
99
- size_t st_col = Xc_indptr[col_num];
100
- size_t end_col = Xc_indptr[col_num + 1] - 1;
101
- size_t curr_pos = st_col;
102
- size_t ind_end_col = (size_t) Xc_ind[end_col];
103
- size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
104
-
105
- size_t cnt = end - st + 1;
106
- long double sum = 0;
107
- long double sum_sq = 0;
108
-
109
- /* Note: this function will discard NAs regardless of chosen action. If reaching the point of calling
110
- this function, chances are that the performance gain of not checking for them will not be important */
111
-
112
- for (size_t *row = ptr_st;
113
- row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
114
- )
115
- {
116
- if (Xc_ind[curr_pos] == *row)
117
- {
118
- if (is_na_or_inf(Xc[curr_pos]))
119
- {
120
- cnt--;
121
- }
122
-
123
- else
124
- {
125
- sum += Xc[curr_pos];
126
- sum_sq += square(Xc[curr_pos]);
127
- }
128
-
129
- if (row == ix_arr + end || curr_pos == end_col) break;
130
- curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
131
- }
132
-
133
- else
134
- {
135
- if (Xc_ind[curr_pos] > *row)
136
- row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
137
- else
138
- curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
139
- }
140
- }
141
-
142
- x_mean = sum / (long double) cnt;
143
- x_sd = calc_sd_raw(cnt, sum, sum_sq);
144
- }
145
-
146
-
147
- /* Note about these functions: they write into an array that does not need to match to 'ix_arr',
148
- and instead, the index that is stored in ix_arr[n] will have the value in res[n] */
149
-
150
- /* for regular numerical */
151
- void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
152
- double *restrict x, double &coef, double x_sd, double x_mean, double &fill_val,
153
- MissingAction missing_action, double *restrict buffer_arr,
154
- size_t *restrict buffer_NAs, bool first_run)
155
- {
156
- /* TODO: here don't need the buffer for NAs */
157
-
158
- if (first_run)
159
- coef /= x_sd;
160
-
161
- size_t cnt = 0;
162
- size_t cnt_NA = 0;
163
- double *restrict res_write = res - st;
164
-
165
- if (missing_action == Fail)
166
- {
167
- for (size_t row = st; row <= end; row++)
168
- res_write[row] += (x[ix_arr[row]] - x_mean) * coef;
169
- }
170
-
171
- else
172
- {
173
- if (first_run)
174
- {
175
- for (size_t row = st; row <= end; row++)
176
- {
177
- if (!is_na_or_inf(x[ix_arr[row]]))
178
- {
179
- res_write[row] += (x[ix_arr[row]] - x_mean) * coef;
180
- buffer_arr[cnt++] = x[ix_arr[row]];
181
- }
182
-
183
- else
184
- {
185
- buffer_NAs[cnt_NA++] = row;
186
- }
187
-
188
- }
189
- }
190
-
191
- else
192
- {
193
- for (size_t row = st; row <= end; row++)
194
- {
195
- res_write[row] += (is_na_or_inf(x[ix_arr[row]]))? fill_val : ( (x[ix_arr[row]]-x_mean) * coef );
196
- }
197
- return;
198
- }
199
-
200
- size_t mid_ceil = cnt / 2;
201
- std::partial_sort(buffer_arr, buffer_arr + mid_ceil + 1, buffer_arr + cnt);
202
-
203
- if ((cnt % 2) == 0)
204
- fill_val = (buffer_arr[mid_ceil - 1] + buffer_arr[mid_ceil]) / 2.0;
205
- else
206
- fill_val = buffer_arr[mid_ceil];
207
-
208
- fill_val = (fill_val - x_mean) * coef;
209
- if (cnt_NA)
210
- {
211
- for (size_t row = 0; row < cnt_NA; row++)
212
- res_write[buffer_NAs[row]] += fill_val;
213
- }
214
-
215
- }
216
- }
217
-
218
- /* for sparse numerical */
219
- void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
220
- double *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
221
- double &coef, double x_sd, double x_mean, double &fill_val, MissingAction missing_action,
222
- double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run)
223
- {
224
- /* ix_arr must be already sorted beforehand */
225
-
226
- /* if it's all zeros, no need to do anything, but this is not supposed
227
- to happen while fitting because the range is determined before calling this */
228
- if (
229
- Xc_indptr[col_num] == Xc_indptr[col_num + 1] ||
230
- Xc_ind[Xc_indptr[col_num]] > ix_arr[end] ||
231
- Xc_ind[Xc_indptr[col_num + 1] - 1] < ix_arr[st]
232
- )
233
- {
234
- if (first_run)
235
- {
236
- coef /= x_sd;
237
- if (missing_action != Fail)
238
- fill_val = 0;
239
- }
240
-
241
- double *restrict res_write = res - st;
242
- double offset = x_mean * coef;
243
- for (size_t row = st; row <= end; row++)
244
- res_write[row] -= offset;
245
-
246
- return;
247
- }
248
-
249
- size_t st_col = Xc_indptr[col_num];
250
- size_t end_col = Xc_indptr[col_num + 1] - 1;
251
- size_t curr_pos = st_col;
252
- size_t *ptr_st = std::lower_bound(ix_arr + st, ix_arr + end + 1, (size_t)Xc_ind[st_col]);
253
-
254
- size_t cnt_non_NA = 0; /* when NAs need to be imputed */
255
- size_t cnt_NA = 0; /* when NAs need to be imputed */
256
- size_t n_sample = end - st + 1;
257
- size_t *ix_arr_plus_st = ix_arr + st;
258
-
259
- if (first_run)
260
- coef /= x_sd;
261
-
262
- double *restrict res_write = res - st;
263
- double offset = x_mean * coef;
264
- for (size_t row = st; row <= end; row++)
265
- res_write[row] -= offset;
266
-
267
- size_t ind_end_col = Xc_ind[end_col];
268
- size_t nmatches = 0;
269
-
270
- if (missing_action != Fail)
271
- {
272
- if (first_run)
273
- {
274
- for (size_t *row = ptr_st;
275
- row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
276
- )
277
- {
278
- if (Xc_ind[curr_pos] == *row)
279
- {
280
- if (is_na_or_inf(Xc[curr_pos]))
281
- {
282
- buffer_NAs[cnt_NA++] = row - ix_arr_plus_st;
283
- }
284
-
285
- else
286
- {
287
- buffer_arr[cnt_non_NA++] = Xc[curr_pos];
288
- res[row - ix_arr_plus_st] += Xc[curr_pos] * coef;
289
- }
290
-
291
- nmatches++;
292
- if (row == ix_arr + end || curr_pos == end_col) break;
293
- curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
294
- }
295
-
296
- else
297
- {
298
- if (Xc_ind[curr_pos] > *row)
299
- row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
300
- else
301
- curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
302
- }
303
- }
304
- }
305
-
306
- else
307
- {
308
- /* when impute value for missing has already been determined */
309
- for (size_t *row = ptr_st;
310
- row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
311
- )
312
- {
313
- if (Xc_ind[curr_pos] == *row)
314
- {
315
- res[row - ix_arr_plus_st] += is_na_or_inf(Xc[curr_pos])?
316
- (fill_val + offset) : (Xc[curr_pos] * coef);
317
- if (row == ix_arr + end) break;
318
- curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
319
- }
320
-
321
- else
322
- {
323
- if (Xc_ind[curr_pos] > *row)
324
- row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
325
- else
326
- curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
327
- }
328
- }
329
-
330
- return;
331
- }
332
-
333
-
334
- /* Determine imputation value */
335
- std::sort(buffer_arr, buffer_arr + cnt_non_NA);
336
- size_t mid_ceil = (n_sample - cnt_NA) / 2;
337
- size_t nzeros = (end - st + 1) - nmatches;
338
- if (nzeros > mid_ceil && buffer_arr[0] > 0)
339
- {
340
- fill_val = 0;
341
- return;
342
- }
343
-
344
- else
345
- {
346
- size_t n_neg = (buffer_arr[0] > 0)?
347
- 0 : ((buffer_arr[cnt_non_NA - 1] < 0)?
348
- cnt_non_NA : std::lower_bound(buffer_arr, buffer_arr + cnt_non_NA, (double)0) - buffer_arr);
349
-
350
-
351
- if (n_neg < (mid_ceil-1) && n_neg + nzeros > mid_ceil)
352
- {
353
- fill_val = 0;
354
- return;
355
- }
356
-
357
- else
358
- {
359
- /* if the sample size is odd, take the middle, otherwise take a simple average */
360
- if (((n_sample - cnt_NA) % 2) != 0)
361
- {
362
- if (mid_ceil < n_neg)
363
- fill_val = buffer_arr[mid_ceil];
364
- else if (mid_ceil < n_neg + nzeros)
365
- fill_val = 0;
366
- else
367
- fill_val = buffer_arr[mid_ceil - nzeros];
368
- }
369
-
370
- else
371
- {
372
- if (mid_ceil < n_neg)
373
- {
374
- fill_val = (buffer_arr[mid_ceil - 1] + buffer_arr[mid_ceil]) / 2;
375
- }
376
-
377
- else if (mid_ceil < n_neg + nzeros)
378
- {
379
- if (mid_ceil == n_neg)
380
- fill_val = buffer_arr[mid_ceil - 1] / 2;
381
- else
382
- fill_val = 0;
383
- }
384
-
385
- else
386
- {
387
- if (mid_ceil == n_neg + nzeros && nzeros > 0)
388
- fill_val = buffer_arr[n_neg] / 2;
389
- else
390
- fill_val = (buffer_arr[mid_ceil - nzeros - 1] + buffer_arr[mid_ceil - nzeros]) / 2; /* WRONG!!!! */
391
- }
392
- }
393
-
394
- /* fill missing if any */
395
- fill_val *= coef;
396
- if (cnt_NA && fill_val)
397
- for (size_t ix = 0; ix < cnt_NA; ix++)
398
- res[buffer_NAs[ix]] += fill_val;
399
-
400
- /* next time, it will need to have the offset added */
401
- fill_val -= offset;
402
- }
403
- }
404
- }
405
-
406
- else /* no NAs */
407
- {
408
- for (size_t *row = ptr_st;
409
- row != ix_arr + end + 1 && curr_pos != end_col + 1 && ind_end_col >= *row;
410
- )
411
- {
412
- if (Xc_ind[curr_pos] == *row)
413
- {
414
- res[row - ix_arr_plus_st] += Xc[curr_pos] * coef;
415
- if (row == ix_arr + end || curr_pos == end_col) break;
416
- curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *(++row)) - Xc_ind;
417
- }
418
-
419
- else
420
- {
421
- if (Xc_ind[curr_pos] > *row)
422
- row = std::lower_bound(row + 1, ix_arr + end + 1, Xc_ind[curr_pos]);
423
- else
424
- curr_pos = std::lower_bound(Xc_ind + curr_pos + 1, Xc_ind + end_col + 1, *row) - Xc_ind;
425
- }
426
- }
427
- }
428
- }
429
-
430
- /* for categoricals */
431
- void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
432
- int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
433
- double &fill_val, double &fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
434
- NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run)
435
- {
436
- double *restrict res_write = res - st;
437
- switch(cat_split_type)
438
- {
439
- case SingleCateg:
440
- {
441
- /* in this case there's no need to make-up an impute value for new categories, only for NAs */
442
- switch(missing_action)
443
- {
444
- case Fail:
445
- {
446
- for (size_t row = st; row <= end; row++)
447
- res_write[row] += (x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0;
448
- return;
449
- }
450
-
451
- case Impute:
452
- {
453
- size_t cnt_NA = 0;
454
- size_t cnt_this = 0;
455
- size_t cnt = end - st + 1;
456
- if (first_run)
457
- {
458
- for (size_t row = st; row <= end; row++)
459
- {
460
- if (x[ix_arr[row]] < 0)
461
- {
462
- cnt_NA++;
463
- }
464
-
465
- else if (x[ix_arr[row]] == chosen_cat)
466
- {
467
- cnt_this++;
468
- res_write[row] += single_cat_coef;
469
- }
470
- }
471
- }
472
-
473
- else
474
- {
475
- for (size_t row = st; row <= end; row++)
476
- res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] == chosen_cat)? single_cat_coef : 0);
477
- return;
478
- }
479
-
480
- fill_val = (cnt_this > (cnt - cnt_NA - cnt_this))? single_cat_coef : 0;
481
- if (cnt_NA)
482
- {
483
- for (size_t row = st; row <= end; row++)
484
- if (x[ix_arr[row]] < 0)
485
- res_write[row] += fill_val;
486
- }
487
- return;
488
- }
489
- }
490
- }
491
-
492
- case SubSet:
493
- {
494
- /* in this case, since the splits are by more than 1 variable, it's not possible to
495
- divide missing/new categoricals by assigning weights, so they have to be imputed
496
- in both cases, unless using random weights for the new ones, in which case they won't
497
- need to be imputed for new, but sill need it for NA */
498
-
499
- if (new_cat_action == Random && missing_action == Fail)
500
- {
501
- for (size_t row = st; row <= end; row++)
502
- res_write[row] += cat_coef[x[ix_arr[row]]];
503
- return;
504
- }
505
-
506
- if (!first_run)
507
- {
508
- if (missing_action == Fail)
509
- {
510
- for (size_t row = st; row <= end; row++)
511
- res_write[row] += (x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]];
512
- }
513
-
514
- else
515
- {
516
- for (size_t row = st; row <= end; row++)
517
- res_write[row] += (x[ix_arr[row]] < 0)? fill_val : ((x[ix_arr[row]] >= ncat)? fill_new : cat_coef[x[ix_arr[row]]]);
518
- }
519
- return;
520
- }
521
-
522
- std::fill(buffer_cnt, buffer_cnt + ncat + 1, 0);
523
- switch(missing_action)
524
- {
525
- case Fail:
526
- {
527
- for (size_t row = st; row <= end; row++)
528
- {
529
- buffer_cnt[x[ix_arr[row]]]++;
530
- res_write[row] += cat_coef[x[ix_arr[row]]];
531
- }
532
- break;
533
- }
534
-
535
- default:
536
- {
537
- for (size_t row = st; row <= end; row++)
538
- {
539
- if (x[ix_arr[row]] >= 0)
540
- {
541
- buffer_cnt[x[ix_arr[row]]]++;
542
- res_write[row] += cat_coef[x[ix_arr[row]]];
543
- }
544
-
545
- else
546
- {
547
- buffer_cnt[ncat]++;
548
- }
549
-
550
- }
551
- break;
552
- }
553
- }
554
-
555
- switch(new_cat_action)
556
- {
557
- case Smallest:
558
- {
559
- size_t smallest = SIZE_MAX;
560
- int cat_smallest;
561
- for (int cat = 0; cat < ncat; cat++)
562
- {
563
- if (buffer_cnt[cat] > 0 && buffer_cnt[cat] < smallest)
564
- {
565
- smallest = buffer_cnt[cat];
566
- cat_smallest = cat;
567
- }
568
- }
569
- fill_new = cat_coef[cat_smallest];
570
- if (missing_action == Fail) break;
571
- }
572
-
573
- default:
574
- {
575
- /* Determine imputation value as the category in sorted order that gives 50% + 1 */
576
- long double cnt_l = (long double)((end - st + 1) - buffer_cnt[ncat]);
577
- std::iota(buffer_pos, buffer_pos + ncat, (size_t)0);
578
- std::sort(buffer_pos, buffer_pos + ncat, [&cat_coef](const size_t a, const size_t b){return cat_coef[a] < cat_coef[b];});
579
-
580
- double cumprob = 0;
581
- int cat;
582
- for (cat = 0; cat < ncat; cat++)
583
- {
584
- cumprob += (long double)buffer_cnt[buffer_pos[cat]] / cnt_l;
585
- if (cumprob >= .5) break;
586
- }
587
- // cat = std::min(cat, ncat); /* in case it picks the last one */
588
- fill_val = cat_coef[buffer_pos[cat]];
589
- if (new_cat_action != Smallest)
590
- fill_new = fill_val;
591
-
592
- if (buffer_cnt[ncat] > 0) /* NAs */
593
- for (size_t row = st; row <= end; row++)
594
- if (x[ix_arr[row]] < 0)
595
- res_write[row] += fill_val;
596
- }
597
- }
598
-
599
- /* now fill unseen categories */
600
- if (new_cat_action != Random)
601
- for (int cat = 0; cat < ncat; cat++)
602
- if (!buffer_cnt[cat])
603
- cat_coef[cat] = fill_new;
604
-
605
- }
606
- }
607
- }