isotree 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -18,11 +18,29 @@
18
18
  * [5] https://sourceforge.net/projects/iforest/
19
19
  * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
20
  * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM"
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
23
41
  *
24
42
  * BSD 2-Clause License
25
- * Copyright (c) 2020, David Cortes
43
+ * Copyright (c) 2019-2022, David Cortes
26
44
  * All rights reserved.
27
45
  * Redistribution and use in source and binary forms, with or without
28
46
  * modification, are permitted provided that the following conditions are met:
@@ -45,76 +63,138 @@
45
63
  #ifdef _FOR_R
46
64
 
47
65
  #include <Rcpp.h>
48
- // [[Rcpp::plugins(cpp11)]]
66
+ #include <Rcpp/unwindProtect.h>
67
+ // [[Rcpp::plugins(unwindProtect)]]
68
+ #include <Rinternals.h>
49
69
 
50
- /* This is to serialize the model objects */
51
- // [[Rcpp::depends(Rcereal)]]
52
- #include <cereal/archives/binary.hpp>
53
- #include <cereal/types/vector.hpp>
54
- #include <sstream>
55
- #include <string>
70
+ #ifndef _FOR_R
71
+ #define FOR_R
72
+ #endif
56
73
 
57
74
  /* This is the package's header */
58
75
  #include "isotree.hpp"
59
76
 
77
+ /* Library is templated, base R comes with only these 2 types though */
78
+ #include "headers_joined.hpp"
79
+ #define real_t double
80
+ #define sparse_ix int
81
+ #include "instantiate_template_headers.hpp"
82
+
83
+ /* For imputing CSR matrices with differing columns from input */
84
+ #include "other_helpers.hpp"
85
+
86
+ /* Note: the R version calls the 'sort_csc_indices' templated function,
87
+ so it's not enough to just include 'isotree_exportable.hpp' and let
88
+ the templates be instantiated elsewhere. */
89
+
90
+ #define throw_mem_err() Rcpp::stop("Error: insufficient memory. Try smaller sample sizes and fewer trees.\n")
91
+
92
+ SEXP alloc_RawVec(void *data)
93
+ {
94
+ size_t vecsize = *(size_t*)data;
95
+ if (unlikely(vecsize > (size_t)std::numeric_limits<R_xlen_t>::max()))
96
+ Rcpp::stop("Object is too big for R to handle.");
97
+ return Rcpp::RawVector((R_xlen_t)vecsize);
98
+ }
99
+
100
+ SEXP safe_copy_vec(void *data)
101
+ {
102
+ std::vector<double> *vec = (std::vector<double>*)data;
103
+ return Rcpp::NumericVector(vec->begin(), vec->end());
104
+ }
105
+
106
+ SEXP safe_copy_intvec(void *data)
107
+ {
108
+ std::vector<int> *vec = (std::vector<int>*)data;
109
+ return Rcpp::IntegerVector(vec->begin(), vec->end());
110
+ }
111
+
112
+ SEXP safe_int_matrix(void *dims)
113
+ {
114
+ size_t *dims_ = (size_t*)dims;
115
+ size_t nrows = dims_[0];
116
+ size_t ncols = dims_[1];
117
+ return Rcpp::IntegerMatrix(nrows, ncols);
118
+ }
119
+
120
+ template <class Model>
121
+ SEXP safe_XPtr(void *model_ptr)
122
+ {
123
+ return Rcpp::XPtr<Model>((Model*)model_ptr, true);
124
+ }
125
+
126
+ SEXP safe_errlist(void *ignored)
127
+ {
128
+ return Rcpp::List::create(Rcpp::_["err"] = Rcpp::LogicalVector::create(1));
129
+ }
130
+
131
+ SEXP safe_FALSE(void *ignored)
132
+ {
133
+ return Rcpp::LogicalVector::create(0);
134
+ }
135
+
136
+ Rcpp::RawVector resize_vec(Rcpp::RawVector inp, size_t new_size)
137
+ {
138
+ Rcpp::RawVector out = Rcpp::unwindProtect(alloc_RawVec, (void*)&new_size);
139
+ memcpy(RAW(out), RAW(inp), std::min((size_t)inp.size(), new_size));
140
+ return out;
141
+ }
142
+
60
143
  /* for model serialization and re-usage in R */
61
144
  /* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
62
145
  /* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
63
- #include <Rinternals.h>
64
- template <class T>
65
- Rcpp::RawVector serialize_cpp_obj(T *model_outputs)
146
+ template <class Model>
147
+ Rcpp::RawVector serialize_cpp_obj(const Model *model_outputs)
66
148
  {
67
- std::stringstream ss;
68
- {
69
- cereal::BinaryOutputArchive oarchive(ss); // Create an output archive
70
- oarchive(*model_outputs);
71
- }
72
- ss.seekg(0, ss.end);
73
- /* Checking for potential integer overflows */
74
- std::stringstream::pos_type vec_size = ss.tellg();
75
- if (vec_size <= 0) {
76
- Rcpp::Rcerr << "Error: model is too big to serialize, resulting object will not be usable.\n" << std::endl;
77
- return Rcpp::RawVector();
78
- }
79
- Rcpp::RawVector retval((size_t) vec_size);
80
- ss.seekg(0, ss.beg);
81
- ss.read(reinterpret_cast<char*>(&retval[0]), retval.size());
82
- return retval;
149
+ size_t serialized_size = determine_serialized_size(*model_outputs);
150
+ if (unlikely(!serialized_size))
151
+ Rcpp::stop("Unexpected error.");
152
+ if (unlikely(serialized_size > (size_t)std::numeric_limits<R_xlen_t>::max()))
153
+ Rcpp::stop("Resulting model is too large for R to handle.");
154
+ Rcpp::RawVector out = Rcpp::unwindProtect(alloc_RawVec, (void*)&serialized_size);
155
+ char *out_ = (char*)RAW(out);
156
+ serialize_isotree(*model_outputs, out_);
157
+ return out;
83
158
  }
84
159
 
85
- template <class T>
160
+ template <class Model>
86
161
  SEXP deserialize_cpp_obj(Rcpp::RawVector src)
87
162
  {
88
- std::stringstream ss;
89
- ss.write(reinterpret_cast<char*>(&src[0]), src.size());
90
- ss.seekg(0, ss.beg);
91
- std::unique_ptr<T> model_outputs = std::unique_ptr<T>(new T());
92
- {
93
- cereal::BinaryInputArchive iarchive(ss);
94
- iarchive(*model_outputs);
95
- }
96
- return Rcpp::XPtr<T>(model_outputs.release(), true);
163
+ if (unlikely(!src.size()))
164
+ Rcpp::stop("Unexpected error.");
165
+ std::unique_ptr<Model> out(new Model());
166
+ const char *inp = (const char*)RAW(src);
167
+ deserialize_isotree(*out, inp);
168
+ SEXP out_ = Rcpp::unwindProtect(safe_XPtr<Model>, out.get());
169
+ out.release();
170
+ return out_;
97
171
  }
98
172
 
99
- // [[Rcpp::export]]
173
+ // [[Rcpp::export(rng = false)]]
100
174
  SEXP deserialize_IsoForest(Rcpp::RawVector src)
101
175
  {
102
176
  return deserialize_cpp_obj<IsoForest>(src);
103
177
  }
104
178
 
105
- // [[Rcpp::export]]
179
+ // [[Rcpp::export(rng = false)]]
106
180
  SEXP deserialize_ExtIsoForest(Rcpp::RawVector src)
107
181
  {
108
182
  return deserialize_cpp_obj<ExtIsoForest>(src);
109
183
  }
110
184
 
111
- // [[Rcpp::export]]
185
+ // [[Rcpp::export(rng = false)]]
112
186
  SEXP deserialize_Imputer(Rcpp::RawVector src)
113
187
  {
114
188
  return deserialize_cpp_obj<Imputer>(src);
115
189
  }
116
190
 
117
- // [[Rcpp::export]]
191
+ // [[Rcpp::export(rng = false)]]
192
+ SEXP deserialize_Indexer(Rcpp::RawVector src)
193
+ {
194
+ return deserialize_cpp_obj<TreesIndexer>(src);
195
+ }
196
+
197
+ // [[Rcpp::export(rng = false)]]
118
198
  Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
119
199
  {
120
200
  return Rcpp::LogicalVector(R_ExternalPtrAddr(ptr_model) == NULL);
@@ -123,79 +203,87 @@ Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
123
203
  double* set_R_nan_as_C_nan(double *x, size_t n, std::vector<double> &v, int nthreads)
124
204
  {
125
205
  v.assign(x, x + n);
126
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(x, v, n)
127
- for (size_t_for i = 0; i < n; i++)
128
- if (isnan(v[i]))
129
- v[i] = NAN;
206
+ for (size_t i = 0; i < n; i++)
207
+ if (unlikely(std::isnan(v[i]))) v[i] = NAN;
130
208
  return v.data();
131
209
  }
132
210
 
211
+ double* set_R_nan_as_C_nan(double *x, size_t n, Rcpp::NumericVector &v, int nthreads)
212
+ {
213
+ v = Rcpp::NumericVector(x, x + n);
214
+ for (size_t i = 0; i < n; i++)
215
+ if (unlikely(std::isnan(v[i]))) v[i] = NAN;
216
+ return REAL(v);
217
+ }
218
+
133
219
  double* set_R_nan_as_C_nan(double *x, size_t n, int nthreads)
134
220
  {
135
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(x, n)
136
- for (size_t_for i = 0; i < n; i++)
137
- if (isnan(x[i]))
138
- x[i] = NAN;
221
+ for (size_t i = 0; i < n; i++)
222
+ if (unlikely(std::isnan(x[i]))) x[i] = NAN;
139
223
  return x;
140
224
  }
141
225
 
142
- // [[Rcpp::export]]
226
+ // [[Rcpp::export(rng = false)]]
143
227
  Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
144
228
  Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
145
229
  Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
146
230
  size_t nrows, size_t ncols_numeric, size_t ncols_categ, size_t ndim, size_t ntry,
147
231
  Rcpp::CharacterVector coef_type, bool coef_by_prop, bool with_replacement, bool weight_as_sample,
148
- size_t sample_size, size_t ntrees, size_t max_depth, bool limit_depth,
149
- bool penalize_range, bool calc_dist, bool standardize_dist, bool sq_dist,
232
+ size_t sample_size, size_t ntrees, size_t max_depth, size_t ncols_per_tree, bool limit_depth,
233
+ bool penalize_range, bool standardize_data,
234
+ Rcpp::CharacterVector scoring_metric, bool fast_bratio,
235
+ bool calc_dist, bool standardize_dist, bool sq_dist,
150
236
  bool calc_depth, bool standardize_depth, bool weigh_by_kurt,
151
- double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
152
- double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain,
237
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
238
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
239
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
240
+ double prob_pick_col_by_kurt, double min_gain,
153
241
  Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
154
242
  Rcpp::CharacterVector missing_action, bool all_perm,
155
243
  bool build_imputer, bool output_imputations, size_t min_imp_obs,
156
244
  Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
157
- int random_seed, bool handle_interrupt, int nthreads)
245
+ int random_seed, bool use_long_double, int nthreads)
158
246
  {
159
247
  double* numeric_data_ptr = NULL;
160
248
  int* categ_data_ptr = NULL;
161
249
  int* ncat_ptr = NULL;
162
250
  double* Xc_ptr = NULL;
163
- sparse_ix* Xc_ind_ptr = NULL;
164
- sparse_ix* Xc_indptr_ptr = NULL;
251
+ int* Xc_ind_ptr = NULL;
252
+ int* Xc_indptr_ptr = NULL;
165
253
  double* sample_weights_ptr = NULL;
166
254
  double* col_weights_ptr = NULL;
167
- std::vector<double> Xcpp;
255
+ Rcpp::NumericVector Xcpp;
168
256
 
169
257
  if (X_num.size())
170
258
  {
171
- numeric_data_ptr = &X_num[0];
172
- if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
259
+ numeric_data_ptr = REAL(X_num);
260
+ if (Rcpp::as<std::string>(missing_action) != "fail")
173
261
  numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, nrows * ncols_numeric, Xcpp, nthreads);
174
262
  }
175
263
 
176
264
  if (X_cat.size())
177
265
  {
178
- categ_data_ptr = &X_cat[0];
179
- ncat_ptr = &ncat[0];
266
+ categ_data_ptr = INTEGER(X_cat);
267
+ ncat_ptr = INTEGER(ncat);
180
268
  }
181
269
 
182
270
  if (Xc.size())
183
271
  {
184
- Xc_ptr = &Xc[0];
185
- Xc_ind_ptr = &Xc_ind[0];
186
- Xc_indptr_ptr = &Xc_indptr[0];
187
- if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
272
+ Xc_ptr = REAL(Xc);
273
+ Xc_ind_ptr = INTEGER(Xc_ind);
274
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
275
+ if (Rcpp::as<std::string>(missing_action) != "fail")
188
276
  Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
189
277
  }
190
278
 
191
279
  if (sample_weights.size())
192
280
  {
193
- sample_weights_ptr = &sample_weights[0];
281
+ sample_weights_ptr = REAL(sample_weights);
194
282
  }
195
283
 
196
284
  if (col_weights.size())
197
285
  {
198
- col_weights_ptr = &col_weights[0];
286
+ col_weights_ptr = REAL(col_weights);
199
287
  }
200
288
 
201
289
  CoefType coef_type_C = Normal;
@@ -204,47 +292,72 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
204
292
  MissingAction missing_action_C = Divide;
205
293
  UseDepthImp depth_imp_C = Higher;
206
294
  WeighImpRows weigh_imp_rows_C = Inverse;
295
+ ScoringMetric scoring_metric_C = Depth;
207
296
 
208
- if (Rcpp::as<std::string>(coef_type) == std::string("uniform"))
297
+ if (Rcpp::as<std::string>(coef_type) == "uniform")
209
298
  {
210
299
  coef_type_C = Uniform;
211
300
  }
212
- if (Rcpp::as<std::string>(cat_split_type) == std::string("single_categ"))
301
+ if (Rcpp::as<std::string>(cat_split_type) == "single_categ")
213
302
  {
214
303
  cat_split_type_C = SingleCateg;
215
304
  }
216
- if (Rcpp::as<std::string>(new_cat_action) == std::string("smallest"))
305
+ if (Rcpp::as<std::string>(new_cat_action) == "smallest")
217
306
  {
218
307
  new_cat_action_C = Smallest;
219
308
  }
220
- else if (Rcpp::as<std::string>(new_cat_action) == std::string("random"))
309
+ else if (Rcpp::as<std::string>(new_cat_action) == "random")
221
310
  {
222
311
  new_cat_action_C = Random;
223
312
  }
224
- if (Rcpp::as<std::string>(missing_action) == std::string("impute"))
313
+ if (Rcpp::as<std::string>(missing_action) == "impute")
225
314
  {
226
315
  missing_action_C = Impute;
227
316
  }
228
- else if (Rcpp::as<std::string>(missing_action) == std::string("fail"))
317
+ else if (Rcpp::as<std::string>(missing_action) == "fail")
229
318
  {
230
319
  missing_action_C = Fail;
231
320
  }
232
- if (Rcpp::as<std::string>(depth_imp) == std::string("lower"))
321
+ if (Rcpp::as<std::string>(depth_imp) == "lower")
233
322
  {
234
323
  depth_imp_C = Lower;
235
324
  }
236
- else if (Rcpp::as<std::string>(depth_imp) == std::string("same"))
325
+ else if (Rcpp::as<std::string>(depth_imp) == "same")
237
326
  {
238
327
  depth_imp_C = Same;
239
328
  }
240
- if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("prop"))
329
+ if (Rcpp::as<std::string>(weigh_imp_rows) == "prop")
241
330
  {
242
331
  weigh_imp_rows_C = Prop;
243
332
  }
244
- else if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("flat"))
333
+ else if (Rcpp::as<std::string>(weigh_imp_rows) == "flat")
245
334
  {
246
335
  weigh_imp_rows_C = Flat;
247
336
  }
337
+ if (Rcpp::as<std::string>(scoring_metric) == "adj_depth")
338
+ {
339
+ scoring_metric_C = AdjDepth;
340
+ }
341
+ else if (Rcpp::as<std::string>(scoring_metric) == "density")
342
+ {
343
+ scoring_metric_C = Density;
344
+ }
345
+ else if (Rcpp::as<std::string>(scoring_metric) == "adj_density")
346
+ {
347
+ scoring_metric_C = AdjDensity;
348
+ }
349
+ else if (Rcpp::as<std::string>(scoring_metric) == "boxed_density")
350
+ {
351
+ scoring_metric_C = BoxedDensity;
352
+ }
353
+ else if (Rcpp::as<std::string>(scoring_metric) == "boxed_density2")
354
+ {
355
+ scoring_metric_C = BoxedDensity2;
356
+ }
357
+ else if (Rcpp::as<std::string>(scoring_metric) == "boxed_ratio")
358
+ {
359
+ scoring_metric_C = BoxedRatio;
360
+ }
248
361
 
249
362
  Rcpp::NumericVector tmat = Rcpp::NumericVector();
250
363
  Rcpp::NumericMatrix dmat = Rcpp::NumericMatrix();
@@ -255,24 +368,37 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
255
368
 
256
369
  if (calc_dist)
257
370
  {
258
- tmat = Rcpp::NumericVector((nrows * (nrows - 1)) / 2);
259
- tmat_ptr = &tmat[0];
371
+ tmat = Rcpp::NumericVector(calc_ncomb(nrows));
372
+ tmat_ptr = REAL(tmat);
260
373
  if (sq_dist)
261
374
  {
262
- dmat = Rcpp::NumericMatrix(nrows);
263
- dmat_ptr = &dmat(0, 0);
375
+ dmat = Rcpp::NumericMatrix(nrows, nrows);
376
+ dmat_ptr = REAL(dmat);
264
377
  }
265
378
  }
266
379
 
267
380
  if (calc_depth)
268
381
  {
269
382
  depths = Rcpp::NumericVector(nrows);
270
- depths_ptr = &depths[0];
383
+ depths_ptr = REAL(depths);
271
384
  }
272
385
 
273
- std::unique_ptr<IsoForest> model_ptr = std::unique_ptr<IsoForest>();
274
- std::unique_ptr<ExtIsoForest> ext_model_ptr = std::unique_ptr<ExtIsoForest>();
275
- std::unique_ptr<Imputer> imputer_ptr = std::unique_ptr<Imputer>();
386
+ Rcpp::List outp = Rcpp::List::create(
387
+ Rcpp::_["depths"] = depths,
388
+ Rcpp::_["tmat"] = tmat,
389
+ Rcpp::_["dmat"] = dmat,
390
+ Rcpp::_["ptr"] = R_NilValue,
391
+ Rcpp::_["serialized"] = R_NilValue,
392
+ Rcpp::_["imp_ptr"] = R_NilValue,
393
+ Rcpp::_["imp_ser"] = R_NilValue,
394
+ Rcpp::_["imputed_num"] = R_NilValue,
395
+ Rcpp::_["imputed_cat"] = R_NilValue,
396
+ Rcpp::_["err"] = Rcpp::LogicalVector::create(1)
397
+ );
398
+
399
+ std::unique_ptr<IsoForest> model_ptr(nullptr);
400
+ std::unique_ptr<ExtIsoForest> ext_model_ptr(nullptr);
401
+ std::unique_ptr<Imputer> imputer_ptr(nullptr);
276
402
 
277
403
  if (ndim == 1)
278
404
  model_ptr = std::unique_ptr<IsoForest>(new IsoForest());
@@ -282,68 +408,86 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
282
408
  if (build_imputer)
283
409
  imputer_ptr = std::unique_ptr<Imputer>(new Imputer());
284
410
 
285
- int ret_val =
411
+ int ret_val;
412
+ try {
413
+ ret_val =
286
414
  fit_iforest(model_ptr.get(), ext_model_ptr.get(),
287
415
  numeric_data_ptr, ncols_numeric,
288
416
  categ_data_ptr, ncols_categ, ncat_ptr,
289
417
  Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
290
418
  ndim, ntry, coef_type_C, coef_by_prop,
291
419
  sample_weights_ptr, with_replacement, weight_as_sample,
292
- nrows, sample_size, ntrees, max_depth,
293
- limit_depth, penalize_range,
420
+ nrows, sample_size, ntrees, max_depth, ncols_per_tree,
421
+ limit_depth, penalize_range, standardize_data,
422
+ scoring_metric_C, fast_bratio,
294
423
  standardize_dist, tmat_ptr,
295
424
  depths_ptr, standardize_depth,
296
425
  col_weights_ptr, weigh_by_kurt,
297
- prob_pick_by_gain_avg, prob_split_by_gain_avg,
298
- prob_pick_by_gain_pl, prob_split_by_gain_pl,
426
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
427
+ prob_pick_by_full_gain, prob_pick_by_dens,
428
+ prob_pick_col_by_range, prob_pick_col_by_var,
429
+ prob_pick_col_by_kurt,
299
430
  min_gain, missing_action_C,
300
431
  cat_split_type_C, new_cat_action_C,
301
432
  all_perm, imputer_ptr.get(), min_imp_obs,
302
433
  depth_imp_C, weigh_imp_rows_C, output_imputations,
303
- (uint64_t) random_seed, handle_interrupt, nthreads);
434
+ (uint64_t) random_seed, use_long_double, nthreads);
435
+ }
436
+ catch (std::bad_alloc &e) {
437
+ throw_mem_err();
438
+ }
439
+ Rcpp::checkUserInterrupt();
304
440
 
305
441
  if (ret_val == EXIT_FAILURE)
306
442
  {
307
- return Rcpp::List::create(Rcpp::_["err"] = Rcpp::LogicalVector::create(1));
443
+ return Rcpp::unwindProtect(safe_errlist, nullptr);
308
444
  }
309
445
 
310
446
  if (calc_dist && sq_dist)
311
- tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
447
+ tmat_to_dense(tmat_ptr, dmat_ptr, nrows, standardize_dist? 0. : std::numeric_limits<double>::infinity());
312
448
 
313
449
  bool serialization_failed = false;
314
450
  Rcpp::RawVector serialized_obj;
315
- if (ndim == 1)
316
- serialized_obj = serialize_cpp_obj(model_ptr.get());
317
- else
318
- serialized_obj = serialize_cpp_obj(ext_model_ptr.get());
319
- if (!serialized_obj.size()) serialization_failed = true;
320
- if (serialization_failed) {
451
+ try {
452
+ if (ndim == 1)
453
+ serialized_obj = serialize_cpp_obj(model_ptr.get());
454
+ else
455
+ serialized_obj = serialize_cpp_obj(ext_model_ptr.get());
456
+ }
457
+ catch (std::bad_alloc &e) {
458
+ throw_mem_err();
459
+ }
460
+ if (unlikely(!serialized_obj.size())) serialization_failed = true;
461
+ if (unlikely(serialization_failed)) {
321
462
  if (ndim == 1)
322
463
  model_ptr.reset();
323
464
  else
324
465
  ext_model_ptr.reset();
325
466
  }
326
467
 
327
- Rcpp::List outp = Rcpp::List::create(
328
- Rcpp::_["serialized_obj"] = serialized_obj,
329
- Rcpp::_["depths"] = depths,
330
- Rcpp::_["tmat"] = tmat,
331
- Rcpp::_["dmat"] = dmat
332
- );
333
-
334
468
  if (!serialization_failed)
335
469
  {
336
- if (ndim == 1)
337
- outp["model_ptr"] = Rcpp::XPtr<IsoForest>(model_ptr.release(), true);
338
- else
339
- outp["model_ptr"] = Rcpp::XPtr<ExtIsoForest>(ext_model_ptr.release(), true);
470
+ outp["serialized"] = serialized_obj;
471
+ if (ndim == 1) {
472
+ outp["ptr"] = Rcpp::unwindProtect(safe_XPtr<IsoForest>, model_ptr.get());
473
+ model_ptr.release();
474
+ }
475
+ else {
476
+ outp["ptr"] = Rcpp::unwindProtect(safe_XPtr<ExtIsoForest>, ext_model_ptr.get());
477
+ ext_model_ptr.release();
478
+ }
340
479
  } else
341
- outp["model_ptr"] = R_NilValue;
480
+ outp["ptr"] = R_NilValue;
342
481
 
343
482
  if (build_imputer && !serialization_failed)
344
483
  {
345
- outp["imputer_ser"] = serialize_cpp_obj(imputer_ptr.get());
346
- if (!Rf_xlength(outp["imputer_ser"]))
484
+ try {
485
+ outp["imp_ser"] = serialize_cpp_obj(imputer_ptr.get());
486
+ }
487
+ catch (std::bad_alloc &e) {
488
+ throw_mem_err();
489
+ }
490
+ if (!Rf_xlength(outp["imp_ser"]))
347
491
  {
348
492
  serialization_failed = true;
349
493
  imputer_ptr.reset();
@@ -351,79 +495,122 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
351
495
  model_ptr.reset();
352
496
  else
353
497
  ext_model_ptr.reset();
354
- outp["imputer_ptr"] = R_NilValue;
355
- outp["model_ptr"] = R_NilValue;
356
- } else
357
- outp["imputer_ptr"] = Rcpp::XPtr<Imputer>(imputer_ptr.release(), true);
498
+ outp["imp_ptr"] = R_NilValue;
499
+ outp["ptr"] = R_NilValue;
500
+ } else {
501
+ outp["imp_ptr"] = Rcpp::unwindProtect(safe_XPtr<Imputer>, imputer_ptr.get());
502
+ imputer_ptr.release();
503
+ }
358
504
  }
359
505
 
360
506
  if (output_imputations && !serialization_failed)
361
507
  {
362
- outp["imputed_num"] = Rcpp::NumericVector(Xcpp.begin(), Xcpp.end());
508
+ outp["imputed_num"] = Xcpp;
363
509
  outp["imputed_cat"] = X_cat;
364
510
  }
365
511
 
366
- outp["err"] = Rcpp::LogicalVector::create(0);
367
-
512
+ outp["err"] = Rcpp::unwindProtect(safe_FALSE, nullptr);
368
513
  return outp;
369
514
  }
370
515
 
371
- // [[Rcpp::export]]
372
- Rcpp::RawVector fit_tree(SEXP model_R_ptr,
373
- Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
374
- Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
375
- Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
376
- size_t nrows, size_t ncols_numeric, size_t ncols_categ,
377
- size_t ndim, size_t ntry, Rcpp::CharacterVector coef_type, bool coef_by_prop,
378
- size_t max_depth, bool limit_depth, bool penalize_range,
379
- bool weigh_by_kurt,
380
- double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
381
- double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain,
382
- Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
383
- Rcpp::CharacterVector missing_action, bool build_imputer, size_t min_imp_obs, SEXP imp_R_ptr,
384
- Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
385
- bool all_perm, uint64_t random_seed)
516
+ // [[Rcpp::export(rng = false)]]
517
+ void fit_tree(SEXP model_R_ptr, Rcpp::RawVector serialized_obj, Rcpp::RawVector serialized_imputer,
518
+ SEXP indexer_R_ptr, Rcpp::RawVector serialized_indexer,
519
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
520
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
521
+ Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
522
+ size_t nrows, size_t ncols_numeric, size_t ncols_categ,
523
+ size_t ndim, size_t ntry, Rcpp::CharacterVector coef_type, bool coef_by_prop,
524
+ size_t max_depth, size_t ncols_per_tree, bool limit_depth, bool penalize_range,
525
+ bool standardize_data, bool fast_bratio, bool weigh_by_kurt,
526
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
527
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
528
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
529
+ double prob_pick_col_by_kurt, double min_gain,
530
+ Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
531
+ Rcpp::CharacterVector missing_action, bool build_imputer, size_t min_imp_obs, SEXP imp_R_ptr,
532
+ Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
533
+ bool all_perm,
534
+ Rcpp::NumericVector ref_X_num, Rcpp::IntegerVector ref_X_cat,
535
+ Rcpp::NumericVector ref_Xc, Rcpp::IntegerVector ref_Xc_ind, Rcpp::IntegerVector ref_Xc_indptr,
536
+ uint64_t random_seed, bool use_long_double,
537
+ Rcpp::List &model_cpp_obj_update, Rcpp::List &model_params_update)
386
538
  {
539
+ Rcpp::List out = Rcpp::List::create(
540
+ Rcpp::_["serialized"] = R_NilValue,
541
+ Rcpp::_["imp_ser"] = R_NilValue,
542
+ Rcpp::_["ind_ser"] = R_NilValue
543
+ );
544
+
545
+ Rcpp::IntegerVector ntrees_plus1 = Rcpp::IntegerVector::create(Rf_asInteger(model_params_update["ntrees"]) + 1);
546
+
387
547
  double* numeric_data_ptr = NULL;
388
548
  int* categ_data_ptr = NULL;
389
549
  int* ncat_ptr = NULL;
390
550
  double* Xc_ptr = NULL;
391
- sparse_ix* Xc_ind_ptr = NULL;
392
- sparse_ix* Xc_indptr_ptr = NULL;
551
+ int* Xc_ind_ptr = NULL;
552
+ int* Xc_indptr_ptr = NULL;
393
553
  double* sample_weights_ptr = NULL;
394
554
  double* col_weights_ptr = NULL;
395
- std::vector<double> Xcpp;
555
+ Rcpp::NumericVector Xcpp;
396
556
 
397
557
  if (X_num.size())
398
558
  {
399
- numeric_data_ptr = &X_num[0];
400
- if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
559
+ numeric_data_ptr = REAL(X_num);
560
+ if (Rcpp::as<std::string>(missing_action) != "fail")
401
561
  numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, nrows * ncols_numeric, Xcpp, 1);
402
562
  }
403
563
 
404
564
  if (X_cat.size())
405
565
  {
406
- categ_data_ptr = &X_cat[0];
407
- ncat_ptr = &ncat[0];
566
+ categ_data_ptr = INTEGER(X_cat);
567
+ ncat_ptr = INTEGER(ncat);
408
568
  }
409
569
 
410
570
  if (Xc.size())
411
571
  {
412
- Xc_ptr = &Xc[0];
413
- Xc_ind_ptr = &Xc_ind[0];
414
- Xc_indptr_ptr = &Xc_indptr[0];
415
- if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
572
+ Xc_ptr = REAL(Xc);
573
+ Xc_ind_ptr = INTEGER(Xc_ind);
574
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
575
+ if (Rcpp::as<std::string>(missing_action) != "fail")
416
576
  Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, 1);
417
577
  }
418
578
 
579
+ double* ref_numeric_data_ptr = NULL;
580
+ int* ref_categ_data_ptr = NULL;
581
+ double* ref_Xc_ptr = NULL;
582
+ int* ref_Xc_ind_ptr = NULL;
583
+ int* ref_Xc_indptr_ptr = NULL;
584
+ Rcpp::NumericVector ref_Xcpp;
585
+ if (ref_X_num.size())
586
+ {
587
+ ref_numeric_data_ptr = REAL(ref_X_num);
588
+ if (Rcpp::as<std::string>(missing_action) != "fail")
589
+ ref_numeric_data_ptr = set_R_nan_as_C_nan(ref_numeric_data_ptr, ref_X_num.size(), ref_Xcpp, 1);
590
+ }
591
+
592
+ if (ref_X_cat.size())
593
+ {
594
+ ref_categ_data_ptr = INTEGER(ref_X_cat);
595
+ }
596
+
597
+ if (ref_Xc.size())
598
+ {
599
+ ref_Xc_ptr = REAL(ref_Xc);
600
+ ref_Xc_ind_ptr = INTEGER(ref_Xc_ind);
601
+ ref_Xc_indptr_ptr = INTEGER(ref_Xc_indptr);
602
+ if (Rcpp::as<std::string>(missing_action) != "fail")
603
+ ref_Xc_ptr = set_R_nan_as_C_nan(ref_Xc_ptr, ref_Xc.size(), ref_Xcpp, 1);
604
+ }
605
+
419
606
  if (sample_weights.size())
420
607
  {
421
- sample_weights_ptr = &sample_weights[0];
608
+ sample_weights_ptr = REAL(sample_weights);
422
609
  }
423
610
 
424
611
  if (col_weights.size())
425
612
  {
426
- col_weights_ptr = &col_weights[0];
613
+ col_weights_ptr = REAL(col_weights);
427
614
  }
428
615
 
429
616
  CoefType coef_type_C = Normal;
@@ -433,62 +620,66 @@ Rcpp::RawVector fit_tree(SEXP model_R_ptr,
433
620
  UseDepthImp depth_imp_C = Higher;
434
621
  WeighImpRows weigh_imp_rows_C = Inverse;
435
622
 
436
- if (Rcpp::as<std::string>(coef_type) == std::string("uniform"))
623
+ if (Rcpp::as<std::string>(coef_type) == "uniform")
437
624
  {
438
625
  coef_type_C = Uniform;
439
626
  }
440
- if (Rcpp::as<std::string>(cat_split_type) == std::string("single_categ"))
627
+ if (Rcpp::as<std::string>(cat_split_type) == "single_categ")
441
628
  {
442
629
  cat_split_type_C = SingleCateg;
443
630
  }
444
- if (Rcpp::as<std::string>(new_cat_action) == std::string("smallest"))
631
+ if (Rcpp::as<std::string>(new_cat_action) == "smallest")
445
632
  {
446
633
  new_cat_action_C = Smallest;
447
634
  }
448
- else if (Rcpp::as<std::string>(new_cat_action) == std::string("random"))
635
+ else if (Rcpp::as<std::string>(new_cat_action) == "random")
449
636
  {
450
637
  new_cat_action_C = Random;
451
638
  }
452
- if (Rcpp::as<std::string>(missing_action) == std::string("impute"))
639
+ if (Rcpp::as<std::string>(missing_action) == "impute")
453
640
  {
454
641
  missing_action_C = Impute;
455
642
  }
456
- else if (Rcpp::as<std::string>(missing_action) == std::string("fail"))
643
+ else if (Rcpp::as<std::string>(missing_action) == "fail")
457
644
  {
458
645
  missing_action_C = Fail;
459
646
  }
460
- if (Rcpp::as<std::string>(depth_imp) == std::string("lower"))
647
+ if (Rcpp::as<std::string>(depth_imp) == "lower")
461
648
  {
462
649
  depth_imp_C = Lower;
463
650
  }
464
- else if (Rcpp::as<std::string>(depth_imp) == std::string("same"))
651
+ else if (Rcpp::as<std::string>(depth_imp) == "same")
465
652
  {
466
653
  depth_imp_C = Same;
467
654
  }
468
- if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("prop"))
655
+ if (Rcpp::as<std::string>(weigh_imp_rows) == "prop")
469
656
  {
470
657
  weigh_imp_rows_C = Prop;
471
658
  }
472
- else if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("flat"))
659
+ else if (Rcpp::as<std::string>(weigh_imp_rows) == "flat")
473
660
  {
474
661
  weigh_imp_rows_C = Flat;
475
662
  }
663
+
476
664
 
477
665
  IsoForest* model_ptr = NULL;
478
666
  ExtIsoForest* ext_model_ptr = NULL;
479
- Imputer* imputer_ptr = NULL;
667
+ Imputer* imputer_ptr = NULL;
668
+ TreesIndexer* indexer_ptr = NULL;
480
669
  if (ndim == 1)
481
670
  model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
482
671
  else
483
672
  ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
484
673
 
485
- std::vector<ImputeNode> *imp_ptr = NULL;
486
674
  if (build_imputer)
487
- {
488
675
  imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imp_R_ptr));
489
- imputer_ptr->imputer_tree.emplace_back();
490
- imp_ptr = &imputer_ptr->imputer_tree.back();
491
- }
676
+
677
+ if (!Rf_isNull(indexer_R_ptr) && R_ExternalPtrAddr(indexer_R_ptr) != NULL)
678
+ indexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
679
+ if (indexer_ptr != NULL && indexer_ptr->indices.empty())
680
+ indexer_ptr = NULL;
681
+
682
+ size_t old_ntrees = (ndim == 1)? (model_ptr->trees.size()) : (ext_model_ptr->hplanes.size());
492
683
 
493
684
  add_tree(model_ptr, ext_model_ptr,
494
685
  numeric_data_ptr, ncols_numeric,
@@ -496,24 +687,153 @@ Rcpp::RawVector fit_tree(SEXP model_R_ptr,
496
687
  Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
497
688
  ndim, ntry, coef_type_C, coef_by_prop,
498
689
  sample_weights_ptr,
499
- nrows, max_depth,
500
- limit_depth, penalize_range,
690
+ nrows, max_depth, ncols_per_tree,
691
+ limit_depth, penalize_range, standardize_data, fast_bratio,
501
692
  col_weights_ptr, weigh_by_kurt,
502
- prob_pick_by_gain_avg, prob_split_by_gain_avg,
503
- prob_pick_by_gain_pl, prob_split_by_gain_pl,
693
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
694
+ prob_pick_by_full_gain, prob_pick_by_dens,
695
+ prob_pick_col_by_range, prob_pick_col_by_var,
696
+ prob_pick_col_by_kurt,
504
697
  min_gain, missing_action_C,
505
698
  cat_split_type_C, new_cat_action_C,
506
699
  depth_imp_C, weigh_imp_rows_C, all_perm,
507
- imp_ptr, min_imp_obs, (uint64_t)random_seed);
700
+ imputer_ptr, min_imp_obs,
701
+ indexer_ptr,
702
+ ref_numeric_data_ptr, ref_categ_data_ptr,
703
+ true, (size_t)0, (size_t)0,
704
+ ref_Xc_ptr, ref_Xc_ind_ptr, ref_Xc_indptr_ptr,
705
+ (uint64_t)random_seed, use_long_double);
706
+
707
+ Rcpp::RawVector new_serialized, new_imp_serialized, new_ind_serialized;
708
+ size_t new_size;
709
+ try
710
+ {
711
+ if (ndim == 1)
712
+ {
713
+ if (serialized_obj.size() &&
714
+ check_can_undergo_incremental_serialization(*model_ptr, (char*)RAW(serialized_obj)))
715
+ {
716
+ try {
717
+ new_size = serialized_obj.size()
718
+ + determine_serialized_size_additional_trees(*model_ptr, old_ntrees);
719
+ new_serialized = resize_vec(serialized_obj, new_size);
720
+ char *temp = (char*)RAW(new_serialized);
721
+ incremental_serialize_isotree(*model_ptr, temp);
722
+ out["serialized"] = new_serialized;
723
+ }
724
+
725
+ catch (std::runtime_error &e) {
726
+ goto serialize_anew_singlevar;
727
+ }
728
+ }
729
+
730
+ else {
731
+ serialize_anew_singlevar:
732
+ out["serialized"] = serialize_cpp_obj(model_ptr);
733
+ }
734
+ }
508
735
 
509
- if (ndim == 1)
510
- return serialize_cpp_obj(model_ptr);
511
- else
512
- return serialize_cpp_obj(ext_model_ptr);
736
+ else
737
+ {
738
+ if (serialized_obj.size() &&
739
+ check_can_undergo_incremental_serialization(*ext_model_ptr, (char*)RAW(serialized_obj)))
740
+ {
741
+ try {
742
+ new_size = serialized_obj.size()
743
+ + determine_serialized_size_additional_trees(*ext_model_ptr, old_ntrees);
744
+ new_serialized = resize_vec(serialized_obj, new_size);
745
+ char *temp = (char*)RAW(new_serialized);
746
+ incremental_serialize_isotree(*ext_model_ptr, temp);
747
+ out["serialized"] = new_serialized;
748
+ }
749
+
750
+ catch (std::runtime_error &e) {
751
+ goto serialize_anew_ext;
752
+ }
753
+ }
754
+
755
+ else {
756
+ serialize_anew_ext:
757
+ out["serialized"] = serialize_cpp_obj(ext_model_ptr);
758
+ }
759
+ }
760
+
761
+ if (imputer_ptr != NULL)
762
+ {
763
+ if (serialized_imputer.size() &&
764
+ check_can_undergo_incremental_serialization(*imputer_ptr, (char*)RAW(serialized_imputer)))
765
+ {
766
+ try {
767
+ new_size = serialized_imputer.size()
768
+ + determine_serialized_size_additional_trees(*imputer_ptr, old_ntrees);
769
+ new_imp_serialized = resize_vec(serialized_imputer, new_size);
770
+ char *temp = (char*)RAW(new_imp_serialized);
771
+ incremental_serialize_isotree(*imputer_ptr, temp);
772
+ out["imp_ser"] = new_imp_serialized;
773
+ }
774
+
775
+ catch (std::runtime_error &e) {
776
+ goto serialize_anew_imp;
777
+ }
778
+ }
779
+
780
+ else {
781
+ serialize_anew_imp:
782
+ out["imp_ser"] = serialize_cpp_obj(imputer_ptr);
783
+ }
784
+ }
785
+
786
+ if (indexer_ptr != NULL)
787
+ {
788
+ if (serialized_indexer.size() &&
789
+ check_can_undergo_incremental_serialization(*indexer_ptr, (char*)RAW(serialized_indexer)))
790
+ {
791
+ try {
792
+ new_size = serialized_indexer.size()
793
+ + determine_serialized_size_additional_trees(*indexer_ptr, old_ntrees);
794
+ new_ind_serialized = resize_vec(serialized_indexer, new_size);
795
+ char *temp = (char*)RAW(new_ind_serialized);
796
+ incremental_serialize_isotree(*indexer_ptr, temp);
797
+ out["ind_ser"] = new_ind_serialized;
798
+ }
799
+
800
+ catch (std::runtime_error &e) {
801
+ goto serialize_anew_ind;
802
+ }
803
+ }
804
+
805
+ else {
806
+ serialize_anew_ind:
807
+ out["ind_ser"] = serialize_cpp_obj(indexer_ptr);
808
+ }
809
+ }
810
+ }
811
+
812
+ catch (...)
813
+ {
814
+ if (ndim == 1)
815
+ model_ptr->trees.resize(old_ntrees);
816
+ else
817
+ ext_model_ptr->hplanes.resize(old_ntrees);
818
+ if (build_imputer)
819
+ imputer_ptr->imputer_tree.resize(old_ntrees);
820
+ if (indexer_ptr != NULL)
821
+ indexer_ptr->indices.resize(old_ntrees);
822
+ throw;
823
+ }
824
+
825
+ model_cpp_obj_update["serialized"] = out["serialized"];
826
+ if (build_imputer)
827
+ model_cpp_obj_update["imp_ser"] = out["imp_ser"];
828
+ if (indexer_ptr != NULL)
829
+ model_cpp_obj_update["ind_ser"] = out["ind_ser"];
830
+ model_params_update["ntrees"] = ntrees_plus1;
513
831
  }
514
832
 
515
- // [[Rcpp::export]]
516
- void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector tree_num, bool is_extended,
833
+ // [[Rcpp::export(rng = false)]]
834
+ void predict_iso(SEXP model_R_ptr, bool is_extended,
835
+ SEXP indexer_R_ptr,
836
+ Rcpp::NumericVector outp, Rcpp::IntegerMatrix tree_num, Rcpp::NumericMatrix tree_depths,
517
837
  Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
518
838
  Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
519
839
  Rcpp::NumericVector Xr, Rcpp::IntegerVector Xr_ind, Rcpp::IntegerVector Xr_indptr,
@@ -522,48 +842,40 @@ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector
522
842
  double* numeric_data_ptr = NULL;
523
843
  int* categ_data_ptr = NULL;
524
844
  double* Xc_ptr = NULL;
525
- sparse_ix* Xc_ind_ptr = NULL;
526
- sparse_ix* Xc_indptr_ptr = NULL;
845
+ int* Xc_ind_ptr = NULL;
846
+ int* Xc_indptr_ptr = NULL;
527
847
  double* Xr_ptr = NULL;
528
- sparse_ix* Xr_ind_ptr = NULL;
529
- sparse_ix* Xr_indptr_ptr = NULL;
530
- sparse_ix* tree_num_ptr = NULL;
531
- std::vector<double> Xcpp;
848
+ int* Xr_ind_ptr = NULL;
849
+ int* Xr_indptr_ptr = NULL;
850
+ Rcpp::NumericVector Xcpp;
532
851
 
533
852
  if (X_num.size())
534
853
  {
535
- numeric_data_ptr = &X_num[0];
854
+ numeric_data_ptr = REAL(X_num);
536
855
  }
537
856
 
538
857
  if (X_cat.size())
539
858
  {
540
- categ_data_ptr = &X_cat[0];
859
+ categ_data_ptr = INTEGER(X_cat);
541
860
  }
542
861
 
543
862
  if (Xc_indptr.size())
544
863
  {
545
- if (Xc.size())
546
- Xc_ptr = &Xc[0];
547
- if (Xc_ind.size())
548
- Xc_ind_ptr = &Xc_ind[0];
549
- Xc_indptr_ptr = &Xc_indptr[0];
864
+ Xc_ptr = REAL(Xc);
865
+ Xc_ind_ptr = INTEGER(Xc_ind);
866
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
550
867
  }
551
868
 
552
869
  if (Xr_indptr.size())
553
870
  {
554
- if (Xr.size())
555
- Xr_ptr = &Xr[0];
556
- if (Xr_ind.size())
557
- Xr_ind_ptr = &Xr_ind[0];
558
- Xr_indptr_ptr = &Xr_indptr[0];
559
- }
560
-
561
- if (tree_num.size())
562
- {
563
- tree_num_ptr = &tree_num[0];
871
+ Xr_ptr = REAL(Xr);
872
+ Xr_ind_ptr = INTEGER(Xr_ind);
873
+ Xr_indptr_ptr = INTEGER(Xr_indptr);
564
874
  }
565
875
 
566
- double* depths_ptr = &outp[0];
876
+ double *depths_ptr = REAL(outp);
877
+ double *tree_depths_ptr = tree_depths.size()? REAL(tree_depths) : NULL;
878
+ int *tree_num_ptr = tree_num.size()? INTEGER(tree_num) : NULL;
567
879
 
568
880
  IsoForest* model_ptr = NULL;
569
881
  ExtIsoForest* ext_model_ptr = NULL;
@@ -571,6 +883,11 @@ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector
571
883
  ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
572
884
  else
573
885
  model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
886
+ TreesIndexer* indexer = NULL;
887
+ if (!Rf_isNull(indexer_R_ptr) && R_ExternalPtrAddr(indexer_R_ptr) != NULL)
888
+ indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
889
+ if (indexer != NULL && indexer->indices.empty())
890
+ indexer = NULL;
574
891
 
575
892
  MissingAction missing_action = is_extended?
576
893
  ext_model_ptr->missing_action
@@ -583,58 +900,75 @@ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector
583
900
  if (Xr.size()) Xr_ptr = set_R_nan_as_C_nan(Xr_ptr, Xr.size(), Xcpp, nthreads);
584
901
  }
585
902
 
586
- predict_iforest(numeric_data_ptr, categ_data_ptr,
587
- Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
588
- Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
589
- nrows, nthreads, standardize,
590
- model_ptr, ext_model_ptr,
591
- depths_ptr, tree_num_ptr);
903
+ predict_iforest<double, int>(numeric_data_ptr, categ_data_ptr,
904
+ true, (size_t)0, (size_t)0,
905
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
906
+ Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
907
+ nrows, nthreads, standardize,
908
+ model_ptr, ext_model_ptr,
909
+ depths_ptr, tree_num_ptr,
910
+ tree_depths_ptr,
911
+ indexer);
592
912
  }
593
913
 
594
- // [[Rcpp::export]]
595
- void dist_iso(SEXP model_R_ptr, Rcpp::NumericVector tmat, Rcpp::NumericVector dmat,
596
- Rcpp::NumericVector rmat, bool is_extended,
914
+ // [[Rcpp::export(rng = false)]]
915
+ void dist_iso(SEXP model_R_ptr, SEXP indexer_R_ptr,
916
+ Rcpp::NumericVector tmat, Rcpp::NumericMatrix dmat,
917
+ Rcpp::NumericMatrix rmat, bool is_extended,
597
918
  Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
598
919
  Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
599
- size_t nrows, int nthreads, bool assume_full_distr,
600
- bool standardize_dist, bool sq_dist, size_t n_from)
920
+ size_t nrows, bool use_long_double, int nthreads, bool assume_full_distr,
921
+ bool standardize_dist, bool sq_dist, size_t n_from,
922
+ bool use_reference_points, bool as_kernel)
601
923
  {
602
924
  double* numeric_data_ptr = NULL;
603
925
  int* categ_data_ptr = NULL;
604
926
  double* Xc_ptr = NULL;
605
- sparse_ix* Xc_ind_ptr = NULL;
606
- sparse_ix* Xc_indptr_ptr = NULL;
607
- std::vector<double> Xcpp;
927
+ int* Xc_ind_ptr = NULL;
928
+ int* Xc_indptr_ptr = NULL;
929
+ Rcpp::NumericVector Xcpp;
608
930
 
609
931
  if (X_num.size())
610
932
  {
611
- numeric_data_ptr = &X_num[0];
933
+ numeric_data_ptr = REAL(X_num);
612
934
  }
613
935
 
614
936
  if (X_cat.size())
615
937
  {
616
- categ_data_ptr = &X_cat[0];
938
+ categ_data_ptr = INTEGER(X_cat);
617
939
  }
618
940
 
619
941
  if (Xc_indptr.size())
620
942
  {
621
- if (Xc.size())
622
- Xc_ptr = &Xc[0];
623
- if (Xc_ind.size())
624
- Xc_ind_ptr = &Xc_ind[0];
625
- Xc_indptr_ptr = &Xc_indptr[0];
943
+ Xc_ptr = REAL(Xc);
944
+ Xc_ind_ptr = INTEGER(Xc_ind);
945
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
626
946
  }
627
947
 
628
- double* tmat_ptr = n_from? (double*)NULL : &tmat[0];
629
- double* dmat_ptr = (sq_dist & !n_from)? &dmat[0] : NULL;
630
- double* rmat_ptr = n_from? &rmat[0] : NULL;
948
+ double* tmat_ptr = n_from? (double*)NULL : REAL(tmat);
949
+ double* dmat_ptr = (sq_dist & !n_from)? REAL(dmat) : NULL;
950
+ double* rmat_ptr = n_from? REAL(rmat) : NULL;
631
951
 
632
952
  IsoForest* model_ptr = NULL;
633
953
  ExtIsoForest* ext_model_ptr = NULL;
954
+ TreesIndexer* indexer = NULL;
634
955
  if (is_extended)
635
956
  ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
636
957
  else
637
958
  model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
959
+ if (!Rf_isNull(indexer_R_ptr) && R_ExternalPtrAddr(indexer_R_ptr) != NULL)
960
+ indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
961
+ if (indexer != NULL && (indexer->indices.empty() || (!as_kernel && indexer->indices.front().node_distances.empty())))
962
+ indexer = NULL;
963
+
964
+ if (use_reference_points && indexer != NULL && !indexer->indices.front().reference_points.empty()) {
965
+ tmat_ptr = NULL;
966
+ dmat_ptr = NULL;
967
+ rmat_ptr = REAL(rmat);
968
+ }
969
+ else {
970
+ use_reference_points = false;
971
+ }
638
972
 
639
973
 
640
974
  MissingAction missing_action = is_extended?
@@ -650,43 +984,58 @@ void dist_iso(SEXP model_R_ptr, Rcpp::NumericVector tmat, Rcpp::NumericVector dm
650
984
 
651
985
  calc_similarity(numeric_data_ptr, categ_data_ptr,
652
986
  Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
653
- nrows, nthreads, assume_full_distr, standardize_dist,
987
+ nrows, use_long_double, nthreads,
988
+ assume_full_distr, standardize_dist, as_kernel,
654
989
  model_ptr, ext_model_ptr,
655
- tmat_ptr, rmat_ptr, n_from);
990
+ tmat_ptr, rmat_ptr, n_from, use_reference_points,
991
+ indexer, true, (size_t)0, (size_t)0);
656
992
 
657
- if (sq_dist & !n_from)
658
- tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
993
+ if (tmat.size() && dmat.ncol() > 0)
994
+ {
995
+ double diag_filler;
996
+ if (as_kernel) {
997
+ if (standardize_dist)
998
+ diag_filler = 1.;
999
+ else
1000
+ diag_filler = (model_ptr != NULL)? model_ptr->trees.size() : ext_model_ptr->hplanes.size();
1001
+ }
1002
+ else {
1003
+ if (standardize_dist)
1004
+ diag_filler = 0;
1005
+ else
1006
+ diag_filler = std::numeric_limits<double>::infinity();
1007
+ }
1008
+ tmat_to_dense(tmat_ptr, dmat_ptr, nrows, diag_filler);
1009
+ }
659
1010
  }
660
1011
 
661
- // [[Rcpp::export]]
1012
+ // [[Rcpp::export(rng = false)]]
662
1013
  Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
663
1014
  Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
664
1015
  Rcpp::NumericVector Xr, Rcpp::IntegerVector Xr_ind, Rcpp::IntegerVector Xr_indptr,
665
- size_t nrows, int nthreads)
1016
+ size_t nrows, bool use_long_double, int nthreads)
666
1017
  {
667
1018
  double* numeric_data_ptr = NULL;
668
1019
  int* categ_data_ptr = NULL;
669
1020
  double* Xr_ptr = NULL;
670
- sparse_ix* Xr_ind_ptr = NULL;
671
- sparse_ix* Xr_indptr_ptr = NULL;
1021
+ int* Xr_ind_ptr = NULL;
1022
+ int* Xr_indptr_ptr = NULL;
672
1023
 
673
1024
  if (X_num.size())
674
1025
  {
675
- numeric_data_ptr = &X_num[0];
1026
+ numeric_data_ptr = REAL(X_num);
676
1027
  }
677
1028
 
678
1029
  if (X_cat.size())
679
1030
  {
680
- categ_data_ptr = &X_cat[0];
1031
+ categ_data_ptr = INTEGER(X_cat);
681
1032
  }
682
1033
 
683
1034
  if (Xr_indptr.size())
684
1035
  {
685
- if (Xr.size())
686
- Xr_ptr = &Xr[0];
687
- if (Xr_ind.size())
688
- Xr_ind_ptr = &Xr_ind[0];
689
- Xr_indptr_ptr = &Xr_indptr[0];
1036
+ Xr_ptr = REAL(Xr);
1037
+ Xr_ind_ptr = INTEGER(Xr_ind);
1038
+ Xr_indptr_ptr = INTEGER(Xr_indptr);
690
1039
  }
691
1040
 
692
1041
  if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), nthreads);
@@ -702,9 +1051,9 @@ Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
702
1051
  Imputer* imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imputer_R_ptr));
703
1052
 
704
1053
 
705
- impute_missing_values(numeric_data_ptr, categ_data_ptr,
1054
+ impute_missing_values(numeric_data_ptr, categ_data_ptr, true,
706
1055
  Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
707
- nrows, nthreads,
1056
+ nrows, use_long_double, nthreads,
708
1057
  model_ptr, ext_model_ptr,
709
1058
  *imputer_ptr);
710
1059
 
@@ -714,7 +1063,187 @@ Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
714
1063
  );
715
1064
  }
716
1065
 
717
- // [[Rcpp::export]]
1066
+ // [[Rcpp::export(rng = false)]]
1067
+ void drop_imputer(Rcpp::List lst_modify, Rcpp::List lst_modify2)
1068
+ {
1069
+ Rcpp::RawVector empty_ser = Rcpp::RawVector();
1070
+ Rcpp::LogicalVector FalseObj = Rcpp::LogicalVector::create(false);
1071
+ Rcpp::XPtr<Imputer> imp_ptr = lst_modify["imp_ptr"];
1072
+ imp_ptr.release();
1073
+
1074
+ lst_modify["imp_ser"] = empty_ser;
1075
+ lst_modify2["build_imputer"] = FalseObj;
1076
+ }
1077
+
1078
+ // [[Rcpp::export(rng = false)]]
1079
+ void drop_indexer(Rcpp::List lst_modify, Rcpp::List lst_modify2)
1080
+ {
1081
+ Rcpp::XPtr<TreesIndexer> empty_ptr = Rcpp::XPtr<TreesIndexer>(nullptr, false);
1082
+ Rcpp::RawVector empty_ser = Rcpp::RawVector();
1083
+ Rcpp::CharacterVector empty_char = Rcpp::CharacterVector();
1084
+ Rcpp::XPtr<TreesIndexer> indexer = lst_modify["indexer"];
1085
+ indexer.release();
1086
+
1087
+ lst_modify["ind_ser"] = empty_ser;
1088
+ lst_modify2["reference_names"] = empty_char;
1089
+ }
1090
+
1091
+ // [[Rcpp::export(rng = false)]]
1092
+ void drop_reference_points(Rcpp::List lst_modify, Rcpp::List lst_modify2)
1093
+ {
1094
+ Rcpp::CharacterVector empty_char = Rcpp::CharacterVector();
1095
+ Rcpp::RawVector empty_ser = Rcpp::RawVector();
1096
+ Rcpp::XPtr<TreesIndexer> indexer_R_ptr = lst_modify["indexer"];
1097
+ TreesIndexer *indexer_ptr = indexer_R_ptr.get();
1098
+ if (indexer_ptr == NULL) {
1099
+ lst_modify["ind_ser"] = empty_ser;
1100
+ lst_modify2["reference_names"] = empty_char;
1101
+ return;
1102
+ }
1103
+ if (indexer_ptr->indices.empty()) {
1104
+ indexer_R_ptr.release();
1105
+ lst_modify["ind_ser"] = empty_ser;
1106
+ lst_modify2["reference_names"] = empty_char;
1107
+ return;
1108
+ }
1109
+ if (indexer_ptr->indices.front().reference_points.empty()) {
1110
+ lst_modify2["reference_names"] = empty_char;
1111
+ return;
1112
+ }
1113
+
1114
+ std::unique_ptr<TreesIndexer> new_indexer(new TreesIndexer(*indexer_ptr));
1115
+ for (auto &tree : new_indexer->indices)
1116
+ {
1117
+ tree.reference_points.clear();
1118
+ tree.reference_indptr.clear();
1119
+ tree.reference_mapping.clear();
1120
+ }
1121
+ Rcpp::RawVector ind_ser = serialize_cpp_obj(new_indexer.get());
1122
+ *indexer_ptr = std::move(*new_indexer);
1123
+ new_indexer.release();
1124
+ lst_modify["ind_ser"] = ind_ser;
1125
+ lst_modify2["reference_names"] = empty_char;
1126
+ }
1127
+
1128
+ // [[Rcpp::export(rng = false)]]
1129
+ Rcpp::List subset_trees
1130
+ (
1131
+ SEXP model_R_ptr, SEXP imputer_R_ptr, SEXP indexer_R_ptr,
1132
+ bool is_extended, bool has_imputer,
1133
+ Rcpp::IntegerVector trees_take
1134
+ )
1135
+ {
1136
+ bool has_indexer = !Rf_isNull(indexer_R_ptr) && R_ExternalPtrAddr(indexer_R_ptr) != NULL;
1137
+
1138
+ Rcpp::List out = Rcpp::List::create(
1139
+ Rcpp::_["ptr"] = R_NilValue,
1140
+ Rcpp::_["serialized"] = R_NilValue,
1141
+ Rcpp::_["imp_ptr"] = R_NilValue,
1142
+ Rcpp::_["imp_ser"] = R_NilValue,
1143
+ Rcpp::_["indexer"] = R_NilValue,
1144
+ Rcpp::_["ind_ser"] = R_NilValue
1145
+ );
1146
+
1147
+ IsoForest* model_ptr = NULL;
1148
+ ExtIsoForest* ext_model_ptr = NULL;
1149
+ Imputer* imputer_ptr = NULL;
1150
+ TreesIndexer* indexer_ptr = NULL;
1151
+ std::unique_ptr<IsoForest> new_model_ptr(nullptr);
1152
+ std::unique_ptr<ExtIsoForest> new_ext_model_ptr(nullptr);
1153
+ std::unique_ptr<Imputer> new_imputer_ptr(nullptr);
1154
+ std::unique_ptr<TreesIndexer> new_indexer_ptr(nullptr);
1155
+
1156
+ if (is_extended) {
1157
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
1158
+ new_ext_model_ptr = std::unique_ptr<ExtIsoForest>(new ExtIsoForest());
1159
+ }
1160
+ else {
1161
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
1162
+ new_model_ptr = std::unique_ptr<IsoForest>(new IsoForest());
1163
+ }
1164
+
1165
+
1166
+ if (has_imputer) {
1167
+ imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imputer_R_ptr));
1168
+ new_imputer_ptr = std::unique_ptr<Imputer>(new Imputer());
1169
+ }
1170
+
1171
+ if (has_indexer) {
1172
+ indexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
1173
+ new_indexer_ptr = std::unique_ptr<TreesIndexer>(new TreesIndexer());
1174
+ }
1175
+
1176
+ std::unique_ptr<size_t[]> trees_take_(new size_t[trees_take.size()]);
1177
+ for (decltype(trees_take.size()) ix = 0; ix < trees_take.size(); ix++)
1178
+ trees_take_[ix] = (size_t)(trees_take[ix] - 1);
1179
+
1180
+ subset_model(model_ptr, new_model_ptr.get(),
1181
+ ext_model_ptr, new_ext_model_ptr.get(),
1182
+ imputer_ptr, new_imputer_ptr.get(),
1183
+ indexer_ptr, new_indexer_ptr.get(),
1184
+ trees_take_.get(), trees_take.size());
1185
+ trees_take_.reset();
1186
+
1187
+ if (!is_extended)
1188
+ out["serialized"] = serialize_cpp_obj(new_model_ptr.get());
1189
+ else
1190
+ out["serialized"] = serialize_cpp_obj(new_ext_model_ptr.get());
1191
+ if (has_imputer)
1192
+ out["imp_ser"] = serialize_cpp_obj(new_imputer_ptr.get());
1193
+ if (has_indexer)
1194
+ out["ind_ser"] = serialize_cpp_obj(new_indexer_ptr.get());
1195
+
1196
+ if (!is_extended) {
1197
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<IsoForest>, new_model_ptr.get());
1198
+ new_model_ptr.release();
1199
+ }
1200
+ else {
1201
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<ExtIsoForest>, new_ext_model_ptr.get());
1202
+ new_ext_model_ptr.release();
1203
+ }
1204
+ if (has_imputer) {
1205
+ out["imp_ptr"] = Rcpp::unwindProtect(safe_XPtr<Imputer>, new_imputer_ptr.get());
1206
+ new_imputer_ptr.release();
1207
+ }
1208
+ if (has_indexer) {
1209
+ out["indexer"] = Rcpp::unwindProtect(safe_XPtr<TreesIndexer>, new_indexer_ptr.get());
1210
+ new_indexer_ptr.release();
1211
+ }
1212
+ return out;
1213
+ }
1214
+
1215
+ // [[Rcpp::export(rng = false)]]
1216
+ void inplace_set_to_zero(SEXP obj)
1217
+ {
1218
+ auto obj_type = TYPEOF(obj);
1219
+ switch(obj_type)
1220
+ {
1221
+ case REALSXP:
1222
+ {
1223
+ REAL(obj)[0] = 0;
1224
+ break;
1225
+ }
1226
+
1227
+ case INTSXP:
1228
+ {
1229
+ INTEGER(obj)[0] = 0;
1230
+ break;
1231
+ }
1232
+
1233
+ case LGLSXP:
1234
+ {
1235
+ LOGICAL(obj)[0] = 0;
1236
+ break;
1237
+ }
1238
+
1239
+ default:
1240
+ {
1241
+ Rcpp::stop("Model object has incorrect structure.\n");
1242
+ }
1243
+ }
1244
+ }
1245
+
1246
+ // [[Rcpp::export(rng = false)]]
718
1247
  Rcpp::List get_n_nodes(SEXP model_R_ptr, bool is_extended, int nthreads)
719
1248
  {
720
1249
  size_t ntrees;
@@ -734,9 +1263,9 @@ Rcpp::List get_n_nodes(SEXP model_R_ptr, bool is_extended, int nthreads)
734
1263
  Rcpp::IntegerVector n_nodes(ntrees);
735
1264
  Rcpp::IntegerVector n_terminal(ntrees);
736
1265
  if (is_extended)
737
- get_num_nodes(*ext_model_ptr, &n_nodes[0], &n_terminal[0], nthreads);
1266
+ get_num_nodes(*ext_model_ptr, INTEGER(n_nodes), INTEGER(n_terminal), nthreads);
738
1267
  else
739
- get_num_nodes(*model_ptr, &n_nodes[0], &n_terminal[0], nthreads);
1268
+ get_num_nodes(*model_ptr, INTEGER(n_nodes), INTEGER(n_terminal), nthreads);
740
1269
 
741
1270
  return Rcpp::List::create(
742
1271
  Rcpp::_["total"] = n_nodes,
@@ -744,25 +1273,56 @@ Rcpp::List get_n_nodes(SEXP model_R_ptr, bool is_extended, int nthreads)
744
1273
  );
745
1274
  }
746
1275
 
747
- // [[Rcpp::export]]
748
- Rcpp::List append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
749
- SEXP imp_R_ptr, SEXP oimp_R_ptr,
750
- bool is_extended)
1276
+ // [[Rcpp::export(rng = false)]]
1277
+ void append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
1278
+ SEXP imp_R_ptr, SEXP oimp_R_ptr,
1279
+ SEXP ind_R_ptr, SEXP oind_R_ptr,
1280
+ bool is_extended,
1281
+ Rcpp::RawVector serialized_obj,
1282
+ Rcpp::RawVector serialized_imputer,
1283
+ Rcpp::RawVector serialized_indexer,
1284
+ Rcpp::List &model_cpp_obj_update,
1285
+ Rcpp::List &model_params_update)
751
1286
  {
752
- Rcpp::List out;
1287
+ if ((!Rf_isNull(imp_R_ptr) && R_ExternalPtrAddr(imp_R_ptr) != NULL)
1288
+ &&
1289
+ !(!Rf_isNull(oimp_R_ptr) && R_ExternalPtrAddr(oimp_R_ptr) != NULL))
1290
+ {
1291
+ Rcpp::stop("Model to append trees to has imputer, but model to append from doesn't. Try dropping the imputer.\n");
1292
+ }
1293
+ if ((!Rf_isNull(ind_R_ptr) && R_ExternalPtrAddr(ind_R_ptr) != NULL)
1294
+ &&
1295
+ !(!Rf_isNull(oind_R_ptr) && R_ExternalPtrAddr(oind_R_ptr) != NULL))
1296
+ {
1297
+ Rcpp::stop("Model to append trees to has indexer, but model to append from doesn't. Try dropping the indexer.\n");
1298
+ }
1299
+
1300
+ Rcpp::List out = Rcpp::List::create(
1301
+ Rcpp::_["serialized"] = R_NilValue,
1302
+ Rcpp::_["imp_ser"] = R_NilValue,
1303
+ Rcpp::_["ind_ser"] = R_NilValue
1304
+ );
1305
+
1306
+ Rcpp::IntegerVector ntrees_new = Rcpp::IntegerVector::create(Rf_asInteger(model_params_update["ntrees"]));
1307
+
753
1308
  IsoForest* model_ptr = NULL;
754
1309
  IsoForest* other_ptr = NULL;
755
1310
  ExtIsoForest* ext_model_ptr = NULL;
756
1311
  ExtIsoForest* ext_other_ptr = NULL;
757
1312
  Imputer* imputer_ptr = NULL;
758
1313
  Imputer* oimputer_ptr = NULL;
1314
+ TreesIndexer* indexer_ptr = NULL;
1315
+ TreesIndexer* oindexer_ptr = NULL;
1316
+ size_t old_ntrees;
759
1317
 
760
1318
  if (is_extended) {
761
1319
  ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
762
1320
  ext_other_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(other_R_ptr));
1321
+ old_ntrees = ext_model_ptr->hplanes.size();
763
1322
  } else {
764
1323
  model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
765
1324
  other_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(other_R_ptr));
1325
+ old_ntrees = model_ptr->trees.size();
766
1326
  }
767
1327
 
768
1328
  if (!Rf_isNull(imp_R_ptr) && !Rf_isNull(oimp_R_ptr) &&
@@ -773,23 +1333,158 @@ Rcpp::List append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
773
1333
  oimputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(oimp_R_ptr));
774
1334
  }
775
1335
 
1336
+ if (!Rf_isNull(ind_R_ptr) && !Rf_isNull(oind_R_ptr) &&
1337
+ R_ExternalPtrAddr(ind_R_ptr) != NULL &&
1338
+ R_ExternalPtrAddr(oind_R_ptr) != NULL)
1339
+ {
1340
+ indexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(ind_R_ptr));
1341
+ oindexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(oind_R_ptr));
1342
+ }
1343
+
776
1344
  merge_models(model_ptr, other_ptr,
777
1345
  ext_model_ptr, ext_other_ptr,
778
- imputer_ptr, oimputer_ptr);
1346
+ imputer_ptr, oimputer_ptr,
1347
+ indexer_ptr, oindexer_ptr);
779
1348
 
1349
+ Rcpp::RawVector new_serialized, new_imp_serialized, new_ind_serialized;
1350
+ size_t new_size;
1351
+ try
1352
+ {
1353
+ if (!is_extended)
1354
+ {
1355
+ if (serialized_obj.size() &&
1356
+ check_can_undergo_incremental_serialization(*model_ptr, (char*)RAW(serialized_obj)))
1357
+ {
1358
+ try {
1359
+ new_size = serialized_obj.size()
1360
+ + determine_serialized_size_additional_trees(*model_ptr, old_ntrees);
1361
+ new_serialized = resize_vec(serialized_obj, new_size);
1362
+ char *temp = (char*)RAW(new_serialized);
1363
+ incremental_serialize_isotree(*model_ptr, temp);
1364
+ out["serialized"] = new_serialized;
1365
+ }
1366
+
1367
+ catch (std::runtime_error &e) {
1368
+ goto serialize_anew_singlevar;
1369
+ }
1370
+ }
1371
+
1372
+ else {
1373
+ serialize_anew_singlevar:
1374
+ out["serialized"] = serialize_cpp_obj(model_ptr);
1375
+ }
1376
+ }
780
1377
 
781
- if (is_extended)
782
- out["serialized"] = serialize_cpp_obj(ext_model_ptr);
783
- else
784
- out["serialized"] = serialize_cpp_obj(model_ptr);
1378
+ else
1379
+ {
1380
+ if (serialized_obj.size() &&
1381
+ check_can_undergo_incremental_serialization(*ext_model_ptr, (char*)RAW(serialized_obj)))
1382
+ {
1383
+ try {
1384
+ new_size = serialized_obj.size()
1385
+ + determine_serialized_size_additional_trees(*ext_model_ptr, old_ntrees);
1386
+ new_serialized = resize_vec(serialized_obj, new_size);
1387
+ char *temp = (char*)RAW(new_serialized);
1388
+ incremental_serialize_isotree(*ext_model_ptr, temp);
1389
+ out["serialized"] = new_serialized;
1390
+ }
1391
+
1392
+ catch (std::runtime_error &e) {
1393
+ goto serialize_anew_ext;
1394
+ }
1395
+ }
1396
+
1397
+ else {
1398
+ serialize_anew_ext:
1399
+ out["serialized"] = serialize_cpp_obj(ext_model_ptr);
1400
+ }
1401
+ }
785
1402
 
786
- if (imputer_ptr != NULL && oimputer_ptr != NULL)
787
- out["imp_ser"] = serialize_cpp_obj(imputer_ptr);
1403
+ if (imputer_ptr != NULL)
1404
+ {
1405
+ if (serialized_imputer.size() &&
1406
+ check_can_undergo_incremental_serialization(*imputer_ptr, (char*)RAW(serialized_imputer)))
1407
+ {
1408
+ try {
1409
+ new_size = serialized_obj.size()
1410
+ + determine_serialized_size_additional_trees(*imputer_ptr, old_ntrees);
1411
+ new_imp_serialized = resize_vec(serialized_imputer, new_size);
1412
+ char *temp = (char*)RAW(new_imp_serialized);
1413
+ incremental_serialize_isotree(*imputer_ptr, temp);
1414
+ out["imp_ser"] = new_imp_serialized;
1415
+ }
1416
+
1417
+ catch (std::runtime_error &e) {
1418
+ goto serialize_anew_imp;
1419
+ }
1420
+ }
1421
+
1422
+ else {
1423
+ serialize_anew_imp:
1424
+ out["imp_ser"] = serialize_cpp_obj(imputer_ptr);
1425
+ }
1426
+ }
788
1427
 
789
- return out;
1428
+ if (indexer_ptr != NULL)
1429
+ {
1430
+ if (serialized_indexer.size() &&
1431
+ check_can_undergo_incremental_serialization(*indexer_ptr, (char*)RAW(serialized_indexer)))
1432
+ {
1433
+ try {
1434
+ new_size = serialized_obj.size()
1435
+ + determine_serialized_size_additional_trees(*indexer_ptr, old_ntrees);
1436
+ new_ind_serialized = resize_vec(serialized_indexer, new_size);
1437
+ char *temp = (char*)RAW(new_ind_serialized);
1438
+ incremental_serialize_isotree(*indexer_ptr, temp);
1439
+ out["ind_ser"] = new_ind_serialized;
1440
+ }
1441
+
1442
+ catch (std::runtime_error &e) {
1443
+ goto serialize_anew_ind;
1444
+ }
1445
+ }
1446
+
1447
+ else {
1448
+ serialize_anew_ind:
1449
+ out["ind_ser"] = serialize_cpp_obj(indexer_ptr);
1450
+ }
1451
+ }
1452
+ }
1453
+
1454
+ catch (...)
1455
+ {
1456
+ if (!is_extended)
1457
+ model_ptr->trees.resize(old_ntrees);
1458
+ else
1459
+ ext_model_ptr->hplanes.resize(old_ntrees);
1460
+
1461
+ if (imputer_ptr != NULL)
1462
+ imputer_ptr->imputer_tree.resize(old_ntrees);
1463
+ if (indexer_ptr != NULL)
1464
+ indexer_ptr->indices.resize(old_ntrees);
1465
+ throw;
1466
+ }
1467
+
1468
+ model_cpp_obj_update["serialized"] = out["serialized"];
1469
+ if (imputer_ptr)
1470
+ model_cpp_obj_update["imp_ser"] = out["imp_ser"];
1471
+ if (indexer_ptr)
1472
+ model_cpp_obj_update["ind_ser"] = out["ind_ser"];
1473
+ *(INTEGER(ntrees_new)) = is_extended? ext_model_ptr->hplanes.size() : model_ptr->trees.size();
1474
+ model_params_update["ntrees"] = ntrees_new;
790
1475
  }
791
1476
 
792
- // [[Rcpp::export]]
1477
+ SEXP alloc_List(void *data)
1478
+ {
1479
+ return Rcpp::List(*(size_t*)data);
1480
+ }
1481
+
1482
+ SEXP safe_CastString(void *data)
1483
+ {
1484
+ return Rcpp::CharacterVector(*(std::string*)data);
1485
+ }
1486
+
1487
+ // [[Rcpp::export(rng = false)]]
793
1488
  Rcpp::ListOf<Rcpp::CharacterVector> model_to_sql(SEXP model_R_ptr, bool is_extended,
794
1489
  Rcpp::CharacterVector numeric_colanmes,
795
1490
  Rcpp::CharacterVector categ_colnames,
@@ -814,13 +1509,16 @@ Rcpp::ListOf<Rcpp::CharacterVector> model_to_sql(SEXP model_R_ptr, bool is_exten
814
1509
  categ_levels_cpp,
815
1510
  output_tree_num, true, single_tree, tree_num,
816
1511
  nthreads);
817
- Rcpp::List out(res.size());
1512
+ /* TODO: this function could create objects through the ALTREP system instead.
1513
+ That way, it would avoid an extra copy of the data */
1514
+ size_t sz = res.size();
1515
+ Rcpp::List out = Rcpp::unwindProtect(alloc_List, (void*)&sz);
818
1516
  for (size_t ix = 0; ix < res.size(); ix++)
819
- out[ix] = Rcpp::CharacterVector(res[ix]);
1517
+ out[ix] = Rcpp::unwindProtect(safe_CastString, &(res[ix]));
820
1518
  return out;
821
1519
  }
822
1520
 
823
- // [[Rcpp::export]]
1521
+ // [[Rcpp::export(rng = false)]]
824
1522
  Rcpp::CharacterVector model_to_sql_with_select_from(SEXP model_R_ptr, bool is_extended,
825
1523
  Rcpp::CharacterVector numeric_colanmes,
826
1524
  Rcpp::CharacterVector categ_colnames,
@@ -842,11 +1540,976 @@ Rcpp::CharacterVector model_to_sql_with_select_from(SEXP model_R_ptr, bool is_ex
842
1540
  std::string table_from_cpp = Rcpp::as<std::string>(table_from);
843
1541
  std::string select_as_cpp = Rcpp::as<std::string>(select_as);
844
1542
 
845
- return generate_sql_with_select_from(model_ptr, ext_model_ptr,
846
- table_from_cpp, select_as_cpp,
847
- numeric_colanmes_cpp, categ_colanmes_cpp,
848
- categ_levels_cpp,
849
- true, nthreads);
1543
+ std::string out = generate_sql_with_select_from(model_ptr, ext_model_ptr,
1544
+ table_from_cpp, select_as_cpp,
1545
+ numeric_colanmes_cpp, categ_colanmes_cpp,
1546
+ categ_levels_cpp,
1547
+ true, nthreads);
1548
+ /* TODO: this function could create objects through the ALTREP system instead.
1549
+ That way, it would avoid an extra copy of the data */
1550
+ return Rcpp::unwindProtect(safe_CastString, &out);
1551
+ }
1552
+
1553
+ // [[Rcpp::export(rng = false)]]
1554
+ Rcpp::List copy_cpp_objects(SEXP model_R_ptr, bool is_extended, SEXP imp_R_ptr, bool has_imputer, SEXP ind_R_ptr)
1555
+ {
1556
+ bool has_indexer = !Rf_isNull(ind_R_ptr) && R_ExternalPtrAddr(ind_R_ptr) != NULL;
1557
+
1558
+ Rcpp::List out = Rcpp::List::create(
1559
+ Rcpp::_["ptr"] = R_NilValue,
1560
+ Rcpp::_["imp_ptr"] = R_NilValue,
1561
+ Rcpp::_["indexer"] = R_NilValue
1562
+ );
1563
+
1564
+ IsoForest* model_ptr = NULL;
1565
+ ExtIsoForest* ext_model_ptr = NULL;
1566
+ Imputer* imputer_ptr = NULL;
1567
+ TreesIndexer* indexer_ptr = NULL;
1568
+ if (is_extended)
1569
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
1570
+ else
1571
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
1572
+ if (has_imputer)
1573
+ imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imp_R_ptr));
1574
+ if (has_indexer)
1575
+ indexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(ind_R_ptr));
1576
+
1577
+ std::unique_ptr<IsoForest> copy_model(new IsoForest());
1578
+ std::unique_ptr<ExtIsoForest> copy_ext_model(new ExtIsoForest());
1579
+ std::unique_ptr<Imputer> copy_imputer(new Imputer());
1580
+ std::unique_ptr<TreesIndexer> copy_indexer(new TreesIndexer());
1581
+
1582
+ if (model_ptr != NULL)
1583
+ *copy_model = *model_ptr;
1584
+ if (ext_model_ptr != NULL)
1585
+ *copy_ext_model = *ext_model_ptr;
1586
+ if (imputer_ptr != NULL)
1587
+ *copy_imputer = *imputer_ptr;
1588
+ if (indexer_ptr != NULL)
1589
+ *copy_indexer = *indexer_ptr;
1590
+
1591
+ if (is_extended) {
1592
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<ExtIsoForest>, copy_ext_model.get());
1593
+ copy_ext_model.release();
1594
+ }
1595
+ else {
1596
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<IsoForest>, copy_model.get());
1597
+ copy_model.release();
1598
+ }
1599
+ if (has_imputer) {
1600
+ out["imp_ptr"] = Rcpp::unwindProtect(safe_XPtr<Imputer>, copy_imputer.get());
1601
+ copy_imputer.release();
1602
+ }
1603
+ if (has_indexer) {
1604
+ out["indexer"] = Rcpp::unwindProtect(safe_XPtr<TreesIndexer>, copy_indexer.get());
1605
+ copy_indexer.release();
1606
+ }
1607
+ return out;
1608
+ }
1609
+
1610
+ // [[Rcpp::export(rng = false)]]
1611
+ void build_tree_indices(Rcpp::List lst_modify, bool is_extended, bool with_distances, int nthreads)
1612
+ {
1613
+ Rcpp::RawVector ind_ser = Rcpp::RawVector();
1614
+ Rcpp::List empty_lst = Rcpp::List::create(Rcpp::_["indexer"] = R_NilValue);
1615
+ std::unique_ptr<TreesIndexer> indexer(new TreesIndexer());
1616
+
1617
+ if (!is_extended) {
1618
+ build_tree_indices(*indexer,
1619
+ *static_cast<IsoForest*>(R_ExternalPtrAddr(lst_modify["ptr"])),
1620
+ nthreads,
1621
+ with_distances);
1622
+ }
1623
+ else {
1624
+ build_tree_indices(*indexer,
1625
+ *static_cast<ExtIsoForest*>(R_ExternalPtrAddr(lst_modify["ptr"])),
1626
+ nthreads,
1627
+ with_distances);
1628
+ }
1629
+
1630
+ ind_ser = serialize_cpp_obj(indexer.get());
1631
+ empty_lst["indexer"] = Rcpp::unwindProtect(safe_XPtr<TreesIndexer>, indexer.get());
1632
+ if (!Rf_isNull(lst_modify["indexer"])) {
1633
+ Rcpp::XPtr<TreesIndexer> indexer_R_ptr = lst_modify["indexer"];
1634
+ indexer_R_ptr.release();
1635
+ }
1636
+
1637
+ lst_modify["ind_ser"] = ind_ser;
1638
+ lst_modify["indexer"] = empty_lst["indexer"];
1639
+ indexer.release();
1640
+ }
1641
+
1642
+ // [[Rcpp::export(rng = false)]]
1643
+ bool check_node_indexer_has_distances(SEXP indexer_R_ptr)
1644
+ {
1645
+ if (Rf_isNull(indexer_R_ptr) || R_ExternalPtrAddr(indexer_R_ptr) == NULL)
1646
+ return false;
1647
+ TreesIndexer *indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
1648
+ if (indexer->indices.empty()) return false;
1649
+ return !indexer->indices.front().node_distances.empty();
1650
+ }
1651
+
1652
+ // [[Rcpp::export(rng = false)]]
1653
+ void set_reference_points(Rcpp::List lst_modify, Rcpp::List lst_modify2, SEXP rnames, bool is_extended,
1654
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
1655
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
1656
+ size_t nrows, int nthreads, bool with_distances)
1657
+ {
1658
+ Rcpp::RawVector ind_ser = Rcpp::RawVector();
1659
+ Rcpp::XPtr<TreesIndexer> indexer_R_ptr = lst_modify["indexer"];
1660
+
1661
+ double* numeric_data_ptr = NULL;
1662
+ int* categ_data_ptr = NULL;
1663
+ double* Xc_ptr = NULL;
1664
+ int* Xc_ind_ptr = NULL;
1665
+ int* Xc_indptr_ptr = NULL;
1666
+ Rcpp::NumericVector Xcpp;
1667
+
1668
+ if (X_num.size())
1669
+ {
1670
+ numeric_data_ptr = REAL(X_num);
1671
+ }
1672
+
1673
+ if (X_cat.size())
1674
+ {
1675
+ categ_data_ptr = INTEGER(X_cat);
1676
+ }
1677
+
1678
+ if (Xc_indptr.size())
1679
+ {
1680
+ Xc_ptr = REAL(Xc);
1681
+ Xc_ind_ptr = INTEGER(Xc_ind);
1682
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
1683
+ }
1684
+
1685
+ IsoForest* model_ptr = NULL;
1686
+ ExtIsoForest* ext_model_ptr = NULL;
1687
+ TreesIndexer* indexer = NULL;
1688
+ if (is_extended)
1689
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(lst_modify["ptr"]));
1690
+ else
1691
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(lst_modify["ptr"]));
1692
+ indexer = indexer_R_ptr.get();
1693
+
1694
+ MissingAction missing_action = is_extended?
1695
+ ext_model_ptr->missing_action
1696
+ :
1697
+ model_ptr->missing_action;
1698
+ if (missing_action != Fail)
1699
+ {
1700
+ if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), Xcpp, nthreads);
1701
+ if (Xc.size()) Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
1702
+ }
1703
+
1704
+ std::unique_ptr<TreesIndexer> new_indexer(new TreesIndexer(*indexer));
1705
+
1706
+ set_reference_points(model_ptr, ext_model_ptr, new_indexer.get(),
1707
+ with_distances,
1708
+ numeric_data_ptr, categ_data_ptr,
1709
+ true, (size_t)0, (size_t)0,
1710
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
1711
+ (double*)NULL, (int*)NULL, (int*)NULL,
1712
+ nrows, nthreads);
1713
+
1714
+ ind_ser = serialize_cpp_obj(new_indexer.get());
1715
+ *indexer = std::move(*new_indexer);
1716
+ new_indexer.release();
1717
+ lst_modify["ind_ser"] = ind_ser;
1718
+ lst_modify2["reference_names"] = rnames;
1719
+ }
1720
+
1721
+ // [[Rcpp::export(rng = false)]]
1722
+ bool check_node_indexer_has_references(SEXP indexer_R_ptr)
1723
+ {
1724
+ if (Rf_isNull(indexer_R_ptr) || R_ExternalPtrAddr(indexer_R_ptr) == NULL)
1725
+ return false;
1726
+ TreesIndexer *indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
1727
+ if (indexer->indices.empty())
1728
+ return false;
1729
+ if (indexer->indices.front().reference_points.empty())
1730
+ return false;
1731
+ else
1732
+ return true;
1733
+ }
1734
+
1735
+ // [[Rcpp::export(rng = false)]]
1736
+ int get_num_references(SEXP indexer_R_ptr)
1737
+ {
1738
+ TreesIndexer *indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
1739
+ if (indexer == NULL || indexer->indices.empty()) return 0;
1740
+ return indexer->indices.front().reference_points.size();
1741
+ }
1742
+
1743
+ // [[Rcpp::export(rng = false)]]
1744
+ SEXP get_null_R_pointer()
1745
+ {
1746
+ return R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue);
1747
+ }
1748
+
1749
+ /* This library will use different code paths for opening a file path
1750
+ in order to support non-ASCII characters, depending on compiler and
1751
+ platform support. */
1752
+ #if (defined(_WIN32) || defined(_WIN64))
1753
+ # if defined(__GNUC__) && (__GNUC__ >= 5)
1754
+ # define USE_CODECVT
1755
+ # define TAKE_AS_UTF8 true
1756
+ # elif !defined(_FOR_CRAN)
1757
+ # define USE_RC_FOPEN
1758
+ # define TAKE_AS_UTF8 false
1759
+ # else
1760
+ # define USE_SIMPLE_FOPEN
1761
+ # define TAKE_AS_UTF8 false
1762
+ # endif
1763
+ #else
1764
+ # define USE_SIMPLE_FOPEN
1765
+ # define TAKE_AS_UTF8 false
1766
+ #endif
1767
+
1768
+ /* Now the actual implementations */
1769
+ #ifdef USE_CODECVT
1770
+ /* https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t */
1771
+ /* */
1772
+ #include <locale>
1773
+ #include <codecvt>
1774
+ #include <string>
1775
+ FILE* R_fopen(Rcpp::CharacterVector fname, const char *mode)
1776
+ {
1777
+ Rcpp::String s(fname[0], CE_UTF8);
1778
+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
1779
+ std::wstring wide = converter.from_bytes(s.get_cstring());
1780
+ std::string mode__(mode);
1781
+ std::wstring mode_ = converter.from_bytes(mode__);
1782
+ return _wfopen(wide.c_str(), mode_.c_str());
1783
+ }
1784
+ #endif
1785
+
1786
+ #ifdef USE_RC_FOPEN
1787
+ extern "C" {
1788
+ FILE *RC_fopen(const SEXP fn, const char *mode, const Rboolean expand);
1789
+ }
1790
+ FILE* R_fopen(Rcpp::CharacterVector fname, const char *mode)
1791
+ {
1792
+ return RC_fopen(fname[0], mode, FALSE);
1793
+ }
1794
+ #endif
1795
+
1796
+ #ifdef USE_SIMPLE_FOPEN
1797
+ FILE* R_fopen(Rcpp::CharacterVector fname, const char *mode)
1798
+ {
1799
+ return fopen(fname[0], mode);
1800
+ }
1801
+ #endif
1802
+
1803
+ class FileOpener
1804
+ {
1805
+ public:
1806
+ FILE *handle = NULL;
1807
+ FileOpener(const SEXP fname, const char *mode)
1808
+ {
1809
+ if (this->handle != NULL)
1810
+ this->close_file();
1811
+ this->handle = R_fopen(fname, mode);
1812
+ }
1813
+ FILE *get_handle()
1814
+ {
1815
+ return this->handle;
1816
+ }
1817
+ void close_file()
1818
+ {
1819
+ if (this->handle != NULL) {
1820
+ fclose(this->handle);
1821
+ this->handle = NULL;
1822
+ }
1823
+ }
1824
+ ~FileOpener()
1825
+ {
1826
+ this->close_file();
1827
+ }
1828
+ };
1829
+
1830
+ // [[Rcpp::export]]
1831
+ void serialize_to_file
1832
+ (
1833
+ Rcpp::RawVector serialized_obj,
1834
+ Rcpp::RawVector serialized_imputer,
1835
+ Rcpp::RawVector serialized_indexer,
1836
+ bool is_extended,
1837
+ Rcpp::RawVector metadata,
1838
+ Rcpp::CharacterVector fname
1839
+ )
1840
+ {
1841
+ FileOpener file_(fname[0], "wb");
1842
+ FILE *output_file = file_.get_handle();
1843
+ serialize_combined(
1844
+ is_extended? nullptr : (char*)RAW(serialized_obj),
1845
+ is_extended? (char*)RAW(serialized_obj) : nullptr,
1846
+ serialized_imputer.size()? (char*)RAW(serialized_imputer) : nullptr,
1847
+ serialized_indexer.size()? (char*)RAW(serialized_indexer) : nullptr,
1848
+ metadata.size()? (char*)RAW(metadata) : nullptr,
1849
+ metadata.size(),
1850
+ output_file
1851
+ );
1852
+ }
1853
+
1854
+ // [[Rcpp::export]]
1855
+ Rcpp::List deserialize_from_file(Rcpp::CharacterVector fname)
1856
+ {
1857
+ Rcpp::List out = Rcpp::List::create(
1858
+ Rcpp::_["ptr"] = R_NilValue,
1859
+ Rcpp::_["serialized"] = R_NilValue,
1860
+ Rcpp::_["imp_ptr"] = R_NilValue,
1861
+ Rcpp::_["imp_ser"] = R_NilValue,
1862
+ Rcpp::_["indexer"] = R_NilValue,
1863
+ Rcpp::_["ind_ser"] = R_NilValue,
1864
+ Rcpp::_["metadata"] = R_NilValue
1865
+ );
1866
+
1867
+ FileOpener file_(fname[0], "rb");
1868
+ FILE *input_file = file_.get_handle();
1869
+
1870
+ bool is_isotree_model;
1871
+ bool is_compatible;
1872
+ bool has_combined_objects;
1873
+ bool has_IsoForest;
1874
+ bool has_ExtIsoForest;
1875
+ bool has_Imputer;
1876
+ bool has_Indexer;
1877
+ bool has_metadata;
1878
+ size_t size_metadata;
1879
+
1880
+ inspect_serialized_object(
1881
+ input_file,
1882
+ is_isotree_model,
1883
+ is_compatible,
1884
+ has_combined_objects,
1885
+ has_IsoForest,
1886
+ has_ExtIsoForest,
1887
+ has_Imputer,
1888
+ has_Indexer,
1889
+ has_metadata,
1890
+ size_metadata
1891
+ );
1892
+
1893
+ if (!is_isotree_model || !has_combined_objects)
1894
+ Rcpp::stop("Input file is not a serialized isotree model.\n");
1895
+ if (!is_compatible)
1896
+ Rcpp::stop("Model file format is incompatible.\n");
1897
+ if (!size_metadata)
1898
+ Rcpp::stop("Input file does not contain metadata.\n");
1899
+
1900
+ out["metadata"] = Rcpp::unwindProtect(alloc_RawVec, (void*)&size_metadata);
1901
+
1902
+ std::unique_ptr<IsoForest> model(new IsoForest());
1903
+ std::unique_ptr<ExtIsoForest> model_ext(new ExtIsoForest());
1904
+ std::unique_ptr<Imputer> imputer(new Imputer());
1905
+ std::unique_ptr<TreesIndexer> indexer(new TreesIndexer());
1906
+
1907
+ IsoForest *ptr_model = NULL;
1908
+ ExtIsoForest *ptr_model_ext = NULL;
1909
+ Imputer *ptr_imputer = NULL;
1910
+ TreesIndexer *ptr_indexer = NULL;
1911
+ char *ptr_metadata = (char*)RAW(out["metadata"]);
1912
+
1913
+ if (has_IsoForest)
1914
+ ptr_model = model.get();
1915
+ if (has_ExtIsoForest)
1916
+ ptr_model_ext = model_ext.get();
1917
+ if (has_Imputer)
1918
+ ptr_imputer = imputer.get();
1919
+ if (has_Indexer)
1920
+ ptr_indexer = indexer.get();
1921
+
1922
+ deserialize_combined(
1923
+ input_file,
1924
+ ptr_model,
1925
+ ptr_model_ext,
1926
+ ptr_imputer,
1927
+ ptr_indexer,
1928
+ ptr_metadata
1929
+ );
1930
+
1931
+ if (has_IsoForest)
1932
+ out["serialized"] = serialize_cpp_obj(model.get());
1933
+ else
1934
+ out["serialized"] = serialize_cpp_obj(model_ext.get());
1935
+ if (has_Imputer)
1936
+ out["imp_ser"] = serialize_cpp_obj(imputer.get());
1937
+ if (has_Indexer)
1938
+ out["ind_ser"] = serialize_cpp_obj(indexer.get());
1939
+
1940
+ if (has_IsoForest) {
1941
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<IsoForest>, model.get());
1942
+ model.release();
1943
+ }
1944
+ else {
1945
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<ExtIsoForest>, model_ext.get());
1946
+ model_ext.release();
1947
+ }
1948
+ if (has_Imputer) {
1949
+ out["imp_ptr"] = Rcpp::unwindProtect(safe_XPtr<Imputer>, imputer.get());
1950
+ imputer.release();
1951
+ }
1952
+ if (has_Indexer) {
1953
+ out["indexer"] = Rcpp::unwindProtect(safe_XPtr<TreesIndexer>, indexer.get());
1954
+ indexer.release();
1955
+ }
1956
+
1957
+ return out;
1958
+ }
1959
+
1960
+ /* The functions below make up for missing functionality in the
1961
+ 'Matrix' and 'SparseM' packages for sub-setting the data */
1962
+
1963
+ // [[Rcpp::export(rng = false)]]
1964
+ void call_sort_csc_indices(Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr)
1965
+ {
1966
+ size_t ncols_numeric = Xc_indptr.size() - 1;
1967
+ sort_csc_indices(REAL(Xc), INTEGER(Xc_ind), INTEGER(Xc_indptr), ncols_numeric);
1968
+ }
1969
+
1970
+ // [[Rcpp::export(rng = false)]]
1971
+ void call_reconstruct_csr_sliced
1972
+ (
1973
+ Rcpp::NumericVector orig_Xr, Rcpp::IntegerVector orig_Xr_indptr,
1974
+ Rcpp::NumericVector rec_Xr, Rcpp::IntegerVector rec_Xr_indptr,
1975
+ size_t nrows
1976
+ )
1977
+ {
1978
+ reconstruct_csr_sliced<double, int>(
1979
+ REAL(orig_Xr), INTEGER(orig_Xr_indptr),
1980
+ REAL(rec_Xr), INTEGER(rec_Xr_indptr),
1981
+ nrows
1982
+ );
1983
+ }
1984
+
1985
+ // [[Rcpp::export(rng = false)]]
1986
+ void call_reconstruct_csr_with_categ
1987
+ (
1988
+ Rcpp::NumericVector orig_Xr, Rcpp::IntegerVector orig_Xr_ind, Rcpp::IntegerVector orig_Xr_indptr,
1989
+ Rcpp::NumericVector rec_Xr, Rcpp::IntegerVector rec_Xr_ind, Rcpp::IntegerVector rec_Xr_indptr,
1990
+ Rcpp::IntegerVector rec_X_cat,
1991
+ Rcpp::IntegerVector cols_numeric, Rcpp::IntegerVector cols_categ,
1992
+ size_t nrows, size_t ncols
1993
+ )
1994
+ {
1995
+ reconstruct_csr_with_categ<double, int, int>(
1996
+ REAL(orig_Xr), INTEGER(orig_Xr_ind), INTEGER(orig_Xr_indptr),
1997
+ REAL(rec_Xr), INTEGER(rec_Xr_ind), INTEGER(rec_Xr_indptr),
1998
+ INTEGER(rec_X_cat), true,
1999
+ INTEGER(cols_numeric), INTEGER(cols_categ),
2000
+ nrows, ncols, cols_numeric.size(), cols_categ.size()
2001
+ );
2002
+ }
2003
+
2004
+ // [[Rcpp::export(rng = false)]]
2005
+ Rcpp::NumericVector deepcopy_vector(Rcpp::NumericVector inp)
2006
+ {
2007
+ return Rcpp::NumericVector(inp.begin(), inp.end());
2008
+ }
2009
+
2010
+ Rcpp::IntegerMatrix csc_to_dense_int
2011
+ (
2012
+ Rcpp::NumericVector Xc,
2013
+ Rcpp::IntegerVector Xc_ind,
2014
+ Rcpp::IntegerVector Xc_indptr,
2015
+ size_t nrows
2016
+ )
2017
+ {
2018
+ size_t ncols = Xc_indptr.size() - 1;
2019
+ Rcpp::IntegerMatrix out_(nrows, ncols);
2020
+ int *restrict out = INTEGER(out_);
2021
+ for (size_t col = 0; col < ncols; col++)
2022
+ {
2023
+ for (auto ix = Xc_indptr[col]; ix < Xc_indptr[col+1]; ix++)
2024
+ out[(size_t)Xc_ind[ix] + col*nrows]
2025
+ =
2026
+ (Xc[ix] >= 0 && !ISNAN(Xc[ix]))?
2027
+ (int)Xc[ix] : (int)(-1);
2028
+ }
2029
+ return out_;
2030
+ }
2031
+
2032
+ template <class real_vec, class int_vec>
2033
+ Rcpp::IntegerMatrix csr_to_dense_int
2034
+ (
2035
+ real_vec Xr,
2036
+ int_vec Xr_ind,
2037
+ int_vec Xr_indptr,
2038
+ int ncols
2039
+ )
2040
+ {
2041
+ size_t nrows = Xr_indptr.size() - 1;
2042
+ size_t matrix_dims[] = {nrows, (size_t)ncols};
2043
+ Rcpp::IntegerMatrix out_ = Rcpp::unwindProtect(safe_int_matrix, (void*)matrix_dims);
2044
+ int *restrict out = INTEGER(out_);
2045
+ for (size_t row = 0; row < nrows; row++)
2046
+ {
2047
+ for (auto ix = Xr_indptr[row]; ix < Xr_indptr[row+1]; ix++)
2048
+ out[row + (size_t)Xr_ind[ix]*nrows]
2049
+ =
2050
+ (Xr[ix] >= 0 && !ISNAN(Xr[ix]))?
2051
+ (int)Xr[ix] : (int)(-1);
2052
+ }
2053
+ return out_;
2054
+ }
2055
+
2056
+ // [[Rcpp::export(rng = false)]]
2057
+ Rcpp::List call_take_cols_by_slice_csr
2058
+ (
2059
+ Rcpp::NumericVector Xr_,
2060
+ Rcpp::IntegerVector Xr_ind_,
2061
+ Rcpp::IntegerVector Xr_indptr,
2062
+ int ncols_take,
2063
+ bool as_dense
2064
+ )
2065
+ {
2066
+ /* Indices need to be sorted beforehand */
2067
+ double *restrict Xr = REAL(Xr_);
2068
+ int *restrict Xr_ind = INTEGER(Xr_ind_);
2069
+ size_t nrows = Xr_indptr.size() - 1;
2070
+ Rcpp::IntegerVector out_Xr_indptr(nrows+1);
2071
+ out_Xr_indptr[0] = 0;
2072
+ size_t total_size = 0;
2073
+ for (size_t row = 0; row < nrows; row++)
2074
+ {
2075
+ for (auto col = Xr_indptr[row]; col < Xr_indptr[row+1]; col++)
2076
+ total_size += Xr_ind[col] < ncols_take;
2077
+ out_Xr_indptr[row+1] = total_size;
2078
+ }
2079
+
2080
+ Rcpp::NumericVector out_Xr_(total_size);
2081
+ Rcpp::IntegerVector out_Xr_ind_(total_size);
2082
+ double *restrict out_Xr = REAL(out_Xr_);
2083
+ int *restrict out_Xr_ind = INTEGER(out_Xr_ind_);
2084
+
2085
+ size_t n_this;
2086
+ for (size_t row = 0; row < nrows; row++)
2087
+ {
2088
+ n_this = out_Xr_indptr[row+1] - out_Xr_indptr[row];
2089
+ if (n_this) {
2090
+ std::copy(Xr + Xr_indptr[row],
2091
+ Xr + Xr_indptr[row] + n_this,
2092
+ out_Xr + out_Xr_indptr[row]);
2093
+ std::copy(Xr_ind + Xr_indptr[row],
2094
+ Xr_ind + Xr_indptr[row] + n_this,
2095
+ out_Xr_ind + out_Xr_indptr[row]);
2096
+ }
2097
+ }
2098
+
2099
+ if (!as_dense)
2100
+ return Rcpp::List::create(
2101
+ Rcpp::_["Xr"] = out_Xr_,
2102
+ Rcpp::_["Xr_ind"] = out_Xr_ind_,
2103
+ Rcpp::_["Xr_indptr"] = out_Xr_indptr
2104
+ );
2105
+ else
2106
+ return Rcpp::List::create(
2107
+ Rcpp::_["X_cat"] = csr_to_dense_int(out_Xr_,
2108
+ out_Xr_ind_,
2109
+ out_Xr_indptr,
2110
+ ncols_take)
2111
+ );
2112
+ }
2113
+
2114
+ // [[Rcpp::export(rng = false)]]
2115
+ Rcpp::List call_take_cols_by_index_csr
2116
+ (
2117
+ Rcpp::NumericVector Xr,
2118
+ Rcpp::IntegerVector Xr_ind,
2119
+ Rcpp::IntegerVector Xr_indptr,
2120
+ Rcpp::IntegerVector cols_take,
2121
+ bool as_dense
2122
+ )
2123
+ {
2124
+ Rcpp::List out;
2125
+ if (!as_dense) {
2126
+ out = Rcpp::List::create(
2127
+ Rcpp::_["Xr"] = R_NilValue,
2128
+ Rcpp::_["Xr_ind"] = R_NilValue,
2129
+ Rcpp::_["Xr_indptr"] = R_NilValue
2130
+ );
2131
+ }
2132
+ else {
2133
+ out = Rcpp::List::create(
2134
+ Rcpp::_["X_cat"] = R_NilValue
2135
+ );
2136
+ }
2137
+
2138
+
2139
+ /* 'cols_take' should be sorted */
2140
+ int n_take = cols_take.size();
2141
+ int nrows = Xr_indptr.size() - 1;
2142
+ std::vector<double> out_Xr;
2143
+ std::vector<int> out_Xr_ind;
2144
+ std::vector<int> out_Xr_indptr(nrows + 1);
2145
+
2146
+ int *curr_ptr;
2147
+ int *end_ptr;
2148
+ int *restrict ptr_Xr_ind = INTEGER(Xr_ind);
2149
+ int *restrict ptr_cols_take = INTEGER(cols_take);
2150
+ int *restrict ptr_cols_take_end = ptr_cols_take + n_take;
2151
+ int curr_col;
2152
+ int *search_res;
2153
+
2154
+ for (int row = 0; row < nrows; row++)
2155
+ {
2156
+ curr_ptr = ptr_Xr_ind + Xr_indptr[row];
2157
+ end_ptr = ptr_Xr_ind + Xr_indptr[row+1];
2158
+ curr_col = 0;
2159
+
2160
+ if (end_ptr == curr_ptr + 1)
2161
+ {
2162
+ search_res = std::lower_bound(ptr_cols_take, ptr_cols_take_end, *curr_ptr);
2163
+ curr_col = std::distance(ptr_cols_take, search_res);
2164
+ if (curr_col < n_take && *search_res == *curr_ptr)
2165
+ {
2166
+ out_Xr.push_back(Xr[std::distance(ptr_Xr_ind, curr_ptr)]);
2167
+ out_Xr_ind.push_back(curr_col);
2168
+ }
2169
+ }
2170
+
2171
+ else
2172
+ if (end_ptr > curr_ptr)
2173
+ {
2174
+ while (true)
2175
+ {
2176
+ curr_ptr = std::lower_bound(curr_ptr, end_ptr, ptr_cols_take[curr_col]);
2177
+
2178
+ if (curr_ptr >= end_ptr)
2179
+ {
2180
+ break;
2181
+ }
2182
+
2183
+
2184
+ else if (*curr_ptr == ptr_cols_take[curr_col])
2185
+ {
2186
+ out_Xr.push_back(Xr[std::distance(ptr_Xr_ind, curr_ptr)]);
2187
+ out_Xr_ind.push_back(curr_col);
2188
+ curr_ptr++;
2189
+ curr_col++;
2190
+
2191
+ if (curr_ptr >= end_ptr || curr_col >= n_take)
2192
+ break;
2193
+ }
2194
+
2195
+
2196
+ else
2197
+ {
2198
+ curr_col = std::distance(
2199
+ ptr_cols_take,
2200
+ std::lower_bound(ptr_cols_take + curr_col, ptr_cols_take_end, *curr_ptr)
2201
+ );
2202
+
2203
+ if (curr_col >= n_take)
2204
+ break;
2205
+
2206
+ if (curr_col == *curr_ptr) {
2207
+ out_Xr.push_back(Xr[std::distance(ptr_Xr_ind, curr_ptr)]);
2208
+ out_Xr_ind.push_back(curr_col);
2209
+ curr_ptr++;
2210
+ curr_col++;
2211
+ }
2212
+
2213
+ if (curr_ptr >= end_ptr || curr_col >= n_take)
2214
+ break;
2215
+ }
2216
+ }
2217
+ }
2218
+
2219
+ out_Xr_indptr[row+1] = out_Xr.size();
2220
+ }
2221
+
2222
+ if (!as_dense)
2223
+ {
2224
+ out["Xr"] = Rcpp::unwindProtect(safe_copy_vec, (void*)&out_Xr);
2225
+ out["Xr_ind"] = Rcpp::unwindProtect(safe_copy_intvec, (void*)&out_Xr_ind);
2226
+ out["Xr_indptr"] = Rcpp::unwindProtect(safe_copy_intvec, (void*)&out_Xr_indptr);
2227
+ }
2228
+
2229
+ else
2230
+ {
2231
+ out["X_cat"] = csr_to_dense_int(out_Xr,
2232
+ out_Xr_ind,
2233
+ out_Xr_indptr,
2234
+ n_take);
2235
+ }
2236
+
2237
+ return out;
2238
+ }
2239
+
2240
+ // [[Rcpp::export(rng = false)]]
2241
+ Rcpp::List call_take_cols_by_slice_csc
2242
+ (
2243
+ Rcpp::NumericVector Xc,
2244
+ Rcpp::IntegerVector Xc_ind,
2245
+ Rcpp::IntegerVector Xc_indptr,
2246
+ size_t ncols_take,
2247
+ bool as_dense, size_t nrows
2248
+ )
2249
+ {
2250
+ Rcpp::IntegerVector out_Xc_indptr(ncols_take+1);
2251
+ size_t total_size = Xc_indptr[ncols_take+1];
2252
+ Rcpp::NumericVector out_Xc(REAL(Xc), REAL(Xc) + total_size);
2253
+ Rcpp::IntegerVector out_Xc_ind(INTEGER(Xc_ind), INTEGER(Xc_ind) + total_size);
2254
+
2255
+ if (!as_dense)
2256
+ return Rcpp::List::create(
2257
+ Rcpp::_["Xc"] = out_Xc,
2258
+ Rcpp::_["Xc_ind"] = out_Xc_ind,
2259
+ Rcpp::_["Xc_indptr"] = out_Xc_indptr
2260
+ );
2261
+ else
2262
+ return Rcpp::List::create(
2263
+ Rcpp::_["X_cat"] = csc_to_dense_int(out_Xc,
2264
+ out_Xc_ind,
2265
+ out_Xc_indptr,
2266
+ nrows)
2267
+ );
2268
+ }
2269
+
2270
+ // [[Rcpp::export(rng = false)]]
2271
+ Rcpp::List call_take_cols_by_index_csc
2272
+ (
2273
+ Rcpp::NumericVector Xc_,
2274
+ Rcpp::IntegerVector Xc_ind_,
2275
+ Rcpp::IntegerVector Xc_indptr,
2276
+ Rcpp::IntegerVector cols_take,
2277
+ bool as_dense, size_t nrows
2278
+ )
2279
+ {
2280
+ /* 'cols_take' should be sorted */
2281
+ double *restrict Xc = REAL(Xc_);
2282
+ int *restrict Xc_ind = INTEGER(Xc_ind_);
2283
+ size_t n_take = cols_take.size();
2284
+ Rcpp::IntegerVector out_Xc_indptr(n_take+1);
2285
+ size_t total_size = 0;
2286
+
2287
+ for (size_t col = 0; col < n_take; col++)
2288
+ total_size += Xc_indptr[cols_take[col]+1] - Xc_indptr[cols_take[col]];
2289
+
2290
+ Rcpp::NumericVector out_Xc_(total_size);
2291
+ Rcpp::IntegerVector out_Xc_ind_(total_size);
2292
+ double *restrict out_Xc = REAL(out_Xc_);
2293
+ int *restrict out_Xc_ind = INTEGER(out_Xc_ind_);
2294
+
2295
+ total_size = 0;
2296
+ size_t n_this;
2297
+ out_Xc_indptr[0] = 0;
2298
+ for (size_t col = 0; col < n_take; col++)
2299
+ {
2300
+ n_this = Xc_indptr[cols_take[col]+1] - Xc_indptr[cols_take[col]];
2301
+ if (n_this) {
2302
+ std::copy(Xc + Xc_indptr[cols_take[col]],
2303
+ Xc + Xc_indptr[cols_take[col]] + n_this,
2304
+ out_Xc + total_size);
2305
+ std::copy(Xc_ind + Xc_indptr[cols_take[col]],
2306
+ Xc_ind + Xc_indptr[cols_take[col]] + n_this,
2307
+ out_Xc_ind + total_size);
2308
+ }
2309
+ total_size += n_this;
2310
+ out_Xc_indptr[col+1] = total_size;
2311
+ }
2312
+
2313
+ if (!as_dense)
2314
+ return Rcpp::List::create(
2315
+ Rcpp::_["Xc"] = out_Xc_,
2316
+ Rcpp::_["Xc_ind"] = out_Xc_ind_,
2317
+ Rcpp::_["Xc_indptr"] = out_Xc_indptr
2318
+ );
2319
+ else
2320
+ return Rcpp::List::create(
2321
+ Rcpp::_["X_cat"] = csc_to_dense_int(out_Xc_,
2322
+ out_Xc_ind_,
2323
+ out_Xc_indptr,
2324
+ nrows)
2325
+ );
2326
+ }
2327
+
2328
+ // [[Rcpp::export(rng = false)]]
2329
+ void copy_csc_cols_by_slice
2330
+ (
2331
+ Rcpp::NumericVector out_Xc_,
2332
+ Rcpp::IntegerVector out_Xc_indptr,
2333
+ Rcpp::NumericVector from_Xc_,
2334
+ Rcpp::IntegerVector from_Xc_indptr,
2335
+ size_t n_copy
2336
+ )
2337
+ {
2338
+ size_t total_size = from_Xc_indptr[n_copy+1];
2339
+ std::copy(REAL(from_Xc_), REAL(from_Xc_) + total_size, REAL(out_Xc_));
2340
+ }
2341
+
2342
+ // [[Rcpp::export(rng = false)]]
2343
+ void copy_csc_cols_by_index
2344
+ (
2345
+ Rcpp::NumericVector out_Xc_,
2346
+ Rcpp::IntegerVector out_Xc_indptr,
2347
+ Rcpp::NumericVector from_Xc_,
2348
+ Rcpp::IntegerVector from_Xc_indptr,
2349
+ Rcpp::IntegerVector cols_copy
2350
+ )
2351
+ {
2352
+ size_t n_copy = cols_copy.size();
2353
+ double *restrict out_Xc = REAL(out_Xc_);
2354
+ double *restrict from_Xc = REAL(from_Xc_);
2355
+
2356
+ for (size_t col = 0; col < n_copy; col++)
2357
+ {
2358
+ std::copy(from_Xc + from_Xc_indptr[col],
2359
+ from_Xc + from_Xc_indptr[col+1],
2360
+ out_Xc + out_Xc_indptr[cols_copy[col]]);
2361
+ }
2362
+ }
2363
+
2364
+
2365
+ // [[Rcpp::export(rng = false)]]
2366
+ Rcpp::List assign_csc_cols
2367
+ (
2368
+ Rcpp::NumericVector Xc_,
2369
+ Rcpp::IntegerVector Xc_ind_,
2370
+ Rcpp::IntegerVector Xc_indptr,
2371
+ Rcpp::IntegerVector X_cat_,
2372
+ Rcpp::IntegerVector cols_categ,
2373
+ Rcpp::IntegerVector cols_numeric,
2374
+ size_t nrows
2375
+ )
2376
+ {
2377
+ Rcpp::List out = Rcpp::List::create(
2378
+ Rcpp::_["Xc"] = R_NilValue,
2379
+ Rcpp::_["Xc_ind"] = R_NilValue,
2380
+ Rcpp::_["Xc_indptr"] = R_NilValue
2381
+ );
2382
+ size_t ncols_tot = (size_t)cols_categ.size() + (size_t)cols_numeric.size();
2383
+ std::vector<double> out_Xc;
2384
+ std::vector<int> out_Xc_ind;
2385
+ std::vector<int> out_Xc_indptr(ncols_tot + 1);
2386
+
2387
+ double *restrict Xc = REAL(Xc_);
2388
+ int *restrict Xc_ind = INTEGER(Xc_ind_);
2389
+ int *restrict X_cat = INTEGER(X_cat_);
2390
+
2391
+ hashed_set<int> cols_categ_set(INTEGER(cols_categ), INTEGER(cols_categ) + cols_categ.size());
2392
+ hashed_set<int> cols_numeric_set(INTEGER(cols_numeric), INTEGER(cols_numeric) + cols_numeric.size());
2393
+
2394
+ size_t curr_num = 0;
2395
+ size_t curr_cat = 0;
2396
+ bool has_zeros;
2397
+ size_t curr_size;
2398
+
2399
+ for (size_t col = 0; col < ncols_tot; col++)
2400
+ {
2401
+ if (is_in_set((int)col, cols_numeric_set))
2402
+ {
2403
+ std::copy(Xc + Xc_indptr[curr_num],
2404
+ Xc + Xc_indptr[curr_num+1],
2405
+ std::back_inserter(out_Xc));
2406
+ std::copy(Xc_ind + Xc_indptr[curr_num],
2407
+ Xc_ind + Xc_indptr[curr_num+1],
2408
+ std::back_inserter(out_Xc_ind));
2409
+ curr_num++;
2410
+ }
2411
+
2412
+ else if (is_in_set((int)col, cols_categ_set))
2413
+ {
2414
+ has_zeros = false;
2415
+ for (size_t row = 0; row < nrows; row++)
2416
+ if (X_cat[row + (size_t)curr_cat*nrows] == 0)
2417
+ has_zeros = true;
2418
+
2419
+ if (!has_zeros) {
2420
+ std::copy(X_cat + (size_t)curr_cat*nrows,
2421
+ X_cat + ((size_t)curr_cat+1)*nrows,
2422
+ std::back_inserter(out_Xc));
2423
+ curr_size = out_Xc_ind.size();
2424
+ out_Xc_ind.resize(curr_size + (size_t)nrows);
2425
+ std::iota(out_Xc_ind.begin() + curr_size, out_Xc_ind.end(), (int)0);
2426
+ }
2427
+
2428
+ else {
2429
+ for (size_t row = 0; row < nrows; row++) {
2430
+ if (X_cat[row + (size_t)curr_cat*nrows] > 0) {
2431
+ out_Xc.push_back(X_cat[row + (size_t)curr_cat*nrows]);
2432
+ out_Xc_ind.push_back((int)row);
2433
+ }
2434
+ }
2435
+ }
2436
+
2437
+ curr_cat++;
2438
+ }
2439
+
2440
+ out_Xc_indptr[col+1] = out_Xc.size();
2441
+ }
2442
+
2443
+
2444
+ out["Xc"] = Rcpp::unwindProtect(safe_copy_vec, (void*)&out_Xc);
2445
+ out["Xc_ind"] = Rcpp::unwindProtect(safe_copy_intvec, (void*)&out_Xc_ind);
2446
+ out["Xc_indptr"] = Rcpp::unwindProtect(safe_copy_intvec, (void*)&out_Xc_indptr);
2447
+ return out;
2448
+ }
2449
+
2450
+ /* These are helpers for dealing with large integers and R's copy-on-write semantics */
2451
+
2452
+ // [[Rcpp::export(rng = false)]]
2453
+ Rcpp::NumericVector get_empty_tmat(int nrows_)
2454
+ {
2455
+ size_t nrows = (size_t)nrows_;
2456
+ size_t tmat_size = (nrows * (nrows - (size_t)1)) / (size_t)2;
2457
+ return Rcpp::NumericVector((R_xlen_t)tmat_size);
2458
+ }
2459
+
2460
+ // [[Rcpp::export(rng = false)]]
2461
+ Rcpp::IntegerMatrix get_empty_int_mat(int nrows, int ncols)
2462
+ {
2463
+ return Rcpp::IntegerMatrix(nrows, ncols);
2464
+ }
2465
+
2466
+ // [[Rcpp::export(rng = false)]]
2467
+ Rcpp::IntegerMatrix get_null_int_mat()
2468
+ {
2469
+ return Rcpp::IntegerMatrix(0, 0);
2470
+ }
2471
+
2472
+ // [[Rcpp::export(rng = false)]]
2473
+ int get_ntrees(SEXP model_R_ptr, bool is_extended)
2474
+ {
2475
+ if (is_extended) {
2476
+ ExtIsoForest* ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
2477
+ return ext_model_ptr->hplanes.size();
2478
+ }
2479
+
2480
+ else {
2481
+ IsoForest* model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
2482
+ return model_ptr->trees.size();
2483
+ }
2484
+ }
2485
+
2486
+ // [[Rcpp::export(rng = false)]]
2487
+ SEXP deepcopy_int(SEXP x)
2488
+ {
2489
+ return Rf_ScalarInteger(Rf_asInteger(x));
2490
+ }
2491
+
2492
+ // [[Rcpp::export(rng = false)]]
2493
+ void modify_R_list_inplace(SEXP lst, int ix, SEXP el)
2494
+ {
2495
+ SET_VECTOR_ELT(lst, ix, el);
2496
+ }
2497
+
2498
+ // [[Rcpp::export(rng = false)]]
2499
+ void addto_R_list_inplace(Rcpp::List &lst, Rcpp::String nm, SEXP el)
2500
+ {
2501
+ lst[nm] = el;
2502
+ }
2503
+
2504
+
2505
+ // [[Rcpp::export(rng = false)]]
2506
bool R_has_openmp()
{
    /* Reports whether this translation unit was compiled with the '_OPENMP'
       macro defined. */
    #if defined(_OPENMP)
    return true;
    #else
    return false;
    #endif
}
851
2514
 
852
2515
  #endif /* _FOR_R */