isotree 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (151) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -1
  3. data/LICENSE.txt +2 -2
  4. data/README.md +32 -14
  5. data/ext/isotree/ext.cpp +144 -31
  6. data/ext/isotree/extconf.rb +7 -7
  7. data/lib/isotree/isolation_forest.rb +110 -30
  8. data/lib/isotree/version.rb +1 -1
  9. data/vendor/isotree/LICENSE +1 -1
  10. data/vendor/isotree/README.md +165 -27
  11. data/vendor/isotree/include/isotree.hpp +2111 -0
  12. data/vendor/isotree/include/isotree_oop.hpp +394 -0
  13. data/vendor/isotree/inst/COPYRIGHTS +62 -0
  14. data/vendor/isotree/src/RcppExports.cpp +525 -52
  15. data/vendor/isotree/src/Rwrapper.cpp +1931 -268
  16. data/vendor/isotree/src/c_interface.cpp +953 -0
  17. data/vendor/isotree/src/crit.hpp +4232 -0
  18. data/vendor/isotree/src/dist.hpp +1886 -0
  19. data/vendor/isotree/src/exp_depth_table.hpp +134 -0
  20. data/vendor/isotree/src/extended.hpp +1444 -0
  21. data/vendor/isotree/src/external_facing_generic.hpp +399 -0
  22. data/vendor/isotree/src/fit_model.hpp +2401 -0
  23. data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
  24. data/vendor/isotree/src/helpers_iforest.hpp +813 -0
  25. data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
  26. data/vendor/isotree/src/indexer.cpp +515 -0
  27. data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
  28. data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
  29. data/vendor/isotree/src/isoforest.hpp +1659 -0
  30. data/vendor/isotree/src/isotree.hpp +1804 -392
  31. data/vendor/isotree/src/isotree_exportable.hpp +99 -0
  32. data/vendor/isotree/src/merge_models.cpp +159 -16
  33. data/vendor/isotree/src/mult.hpp +1321 -0
  34. data/vendor/isotree/src/oop_interface.cpp +842 -0
  35. data/vendor/isotree/src/oop_interface.hpp +278 -0
  36. data/vendor/isotree/src/other_helpers.hpp +219 -0
  37. data/vendor/isotree/src/predict.hpp +1932 -0
  38. data/vendor/isotree/src/python_helpers.hpp +134 -0
  39. data/vendor/isotree/src/ref_indexer.hpp +154 -0
  40. data/vendor/isotree/src/robinmap/LICENSE +21 -0
  41. data/vendor/isotree/src/robinmap/README.md +483 -0
  42. data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
  43. data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
  44. data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
  45. data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
  46. data/vendor/isotree/src/serialize.cpp +4300 -139
  47. data/vendor/isotree/src/sql.cpp +141 -59
  48. data/vendor/isotree/src/subset_models.cpp +174 -0
  49. data/vendor/isotree/src/utils.hpp +3808 -0
  50. data/vendor/isotree/src/xoshiro.hpp +467 -0
  51. data/vendor/isotree/src/ziggurat.hpp +405 -0
  52. metadata +38 -104
  53. data/vendor/cereal/LICENSE +0 -24
  54. data/vendor/cereal/README.md +0 -85
  55. data/vendor/cereal/include/cereal/access.hpp +0 -351
  56. data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
  57. data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
  58. data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
  59. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
  60. data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
  61. data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
  62. data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
  63. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
  64. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
  65. data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
  66. data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
  67. data/vendor/cereal/include/cereal/details/util.hpp +0 -84
  68. data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
  69. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
  70. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
  71. data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
  72. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
  73. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
  74. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
  75. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
  76. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
  77. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
  78. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
  79. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
  80. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
  81. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
  82. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
  83. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
  84. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
  85. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
  86. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
  87. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
  88. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
  89. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
  90. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
  91. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
  92. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
  93. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
  94. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
  95. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
  96. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
  97. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
  98. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
  99. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
  100. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
  101. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
  102. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
  103. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
  104. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
  105. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
  106. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
  107. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
  108. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
  109. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
  110. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
  111. data/vendor/cereal/include/cereal/macros.hpp +0 -154
  112. data/vendor/cereal/include/cereal/specialize.hpp +0 -139
  113. data/vendor/cereal/include/cereal/types/array.hpp +0 -79
  114. data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
  115. data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
  116. data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
  117. data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
  118. data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
  119. data/vendor/cereal/include/cereal/types/common.hpp +0 -129
  120. data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
  121. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
  122. data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
  123. data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
  124. data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
  125. data/vendor/cereal/include/cereal/types/list.hpp +0 -62
  126. data/vendor/cereal/include/cereal/types/map.hpp +0 -36
  127. data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
  128. data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
  129. data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
  130. data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
  131. data/vendor/cereal/include/cereal/types/set.hpp +0 -103
  132. data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
  133. data/vendor/cereal/include/cereal/types/string.hpp +0 -61
  134. data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
  135. data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
  136. data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
  137. data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
  138. data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
  139. data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
  140. data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
  141. data/vendor/cereal/include/cereal/version.hpp +0 -52
  142. data/vendor/isotree/src/Makevars +0 -4
  143. data/vendor/isotree/src/crit.cpp +0 -912
  144. data/vendor/isotree/src/dist.cpp +0 -749
  145. data/vendor/isotree/src/extended.cpp +0 -790
  146. data/vendor/isotree/src/fit_model.cpp +0 -1090
  147. data/vendor/isotree/src/helpers_iforest.cpp +0 -324
  148. data/vendor/isotree/src/isoforest.cpp +0 -771
  149. data/vendor/isotree/src/mult.cpp +0 -607
  150. data/vendor/isotree/src/predict.cpp +0 -853
  151. data/vendor/isotree/src/utils.cpp +0 -1566
@@ -18,11 +18,29 @@
18
18
  * [5] https://sourceforge.net/projects/iforest/
19
19
  * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
20
  * [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
21
- * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
- * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
21
+ * [8] Cortes, David.
22
+ * "Distance approximation using Isolation Forests."
23
+ * arXiv preprint arXiv:1910.12362 (2019).
24
+ * [9] Cortes, David.
25
+ * "Imputing missing values with unsupervised random trees."
26
+ * arXiv preprint arXiv:1911.06646 (2019).
27
+ * [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
28
+ * [11] Cortes, David.
29
+ * "Revisiting randomized choices in isolation forests."
30
+ * arXiv preprint arXiv:2110.13402 (2021).
31
+ * [12] Guha, Sudipto, et al.
32
+ * "Robust random cut forest based anomaly detection on streams."
33
+ * International conference on machine learning. PMLR, 2016.
34
+ * [13] Cortes, David.
35
+ * "Isolation forests: looking beyond tree depth."
36
+ * arXiv preprint arXiv:2111.11639 (2021).
37
+ * [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
38
+ * "Isolation kernel and its effect on SVM."
39
+ * Proceedings of the 24th ACM SIGKDD
40
+ * International Conference on Knowledge Discovery & Data Mining. 2018.
23
41
  *
24
42
  * BSD 2-Clause License
25
- * Copyright (c) 2020, David Cortes
43
+ * Copyright (c) 2019-2022, David Cortes
26
44
  * All rights reserved.
27
45
  * Redistribution and use in source and binary forms, with or without
28
46
  * modification, are permitted provided that the following conditions are met:
@@ -45,76 +63,138 @@
45
63
  #ifdef _FOR_R
46
64
 
47
65
  #include <Rcpp.h>
48
- // [[Rcpp::plugins(cpp11)]]
66
+ #include <Rcpp/unwindProtect.h>
67
+ // [[Rcpp::plugins(unwindProtect)]]
68
+ #include <Rinternals.h>
49
69
 
50
- /* This is to serialize the model objects */
51
- // [[Rcpp::depends(Rcereal)]]
52
- #include <cereal/archives/binary.hpp>
53
- #include <cereal/types/vector.hpp>
54
- #include <sstream>
55
- #include <string>
70
+ #ifndef _FOR_R
71
+ #define FOR_R
72
+ #endif
56
73
 
57
74
  /* This is the package's header */
58
75
  #include "isotree.hpp"
59
76
 
77
+ /* Library is templated, base R comes with only these 2 types though */
78
+ #include "headers_joined.hpp"
79
+ #define real_t double
80
+ #define sparse_ix int
81
+ #include "instantiate_template_headers.hpp"
82
+
83
+ /* For imputing CSR matrices with differing columns from input */
84
+ #include "other_helpers.hpp"
85
+
86
+ /* Note: the R version calls the 'sort_csc_indices' templated function,
87
+ so it's not enough to just include 'isotree_exportable.hpp' and let
88
+ the templates be instantiated elsewhere. */
89
+
90
+ #define throw_mem_err() Rcpp::stop("Error: insufficient memory. Try smaller sample sizes and fewer trees.\n")
91
+
92
+ SEXP alloc_RawVec(void *data)
93
+ {
94
+ size_t vecsize = *(size_t*)data;
95
+ if (unlikely(vecsize > (size_t)std::numeric_limits<R_xlen_t>::max()))
96
+ Rcpp::stop("Object is too big for R to handle.");
97
+ return Rcpp::RawVector((R_xlen_t)vecsize);
98
+ }
99
+
100
+ SEXP safe_copy_vec(void *data)
101
+ {
102
+ std::vector<double> *vec = (std::vector<double>*)data;
103
+ return Rcpp::NumericVector(vec->begin(), vec->end());
104
+ }
105
+
106
+ SEXP safe_copy_intvec(void *data)
107
+ {
108
+ std::vector<int> *vec = (std::vector<int>*)data;
109
+ return Rcpp::IntegerVector(vec->begin(), vec->end());
110
+ }
111
+
112
+ SEXP safe_int_matrix(void *dims)
113
+ {
114
+ size_t *dims_ = (size_t*)dims;
115
+ size_t nrows = dims_[0];
116
+ size_t ncols = dims_[1];
117
+ return Rcpp::IntegerMatrix(nrows, ncols);
118
+ }
119
+
120
+ template <class Model>
121
+ SEXP safe_XPtr(void *model_ptr)
122
+ {
123
+ return Rcpp::XPtr<Model>((Model*)model_ptr, true);
124
+ }
125
+
126
+ SEXP safe_errlist(void *ignored)
127
+ {
128
+ return Rcpp::List::create(Rcpp::_["err"] = Rcpp::LogicalVector::create(1));
129
+ }
130
+
131
+ SEXP safe_FALSE(void *ignored)
132
+ {
133
+ return Rcpp::LogicalVector::create(0);
134
+ }
135
+
136
+ Rcpp::RawVector resize_vec(Rcpp::RawVector inp, size_t new_size)
137
+ {
138
+ Rcpp::RawVector out = Rcpp::unwindProtect(alloc_RawVec, (void*)&new_size);
139
+ memcpy(RAW(out), RAW(inp), std::min((size_t)inp.size(), new_size));
140
+ return out;
141
+ }
142
+
60
143
  /* for model serialization and re-usage in R */
61
144
  /* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
62
145
  /* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
63
- #include <Rinternals.h>
64
- template <class T>
65
- Rcpp::RawVector serialize_cpp_obj(T *model_outputs)
146
+ template <class Model>
147
+ Rcpp::RawVector serialize_cpp_obj(const Model *model_outputs)
66
148
  {
67
- std::stringstream ss;
68
- {
69
- cereal::BinaryOutputArchive oarchive(ss); // Create an output archive
70
- oarchive(*model_outputs);
71
- }
72
- ss.seekg(0, ss.end);
73
- /* Checking for potential integer overflows */
74
- std::stringstream::pos_type vec_size = ss.tellg();
75
- if (vec_size <= 0) {
76
- Rcpp::Rcerr << "Error: model is too big to serialize, resulting object will not be usable.\n" << std::endl;
77
- return Rcpp::RawVector();
78
- }
79
- Rcpp::RawVector retval((size_t) vec_size);
80
- ss.seekg(0, ss.beg);
81
- ss.read(reinterpret_cast<char*>(&retval[0]), retval.size());
82
- return retval;
149
+ size_t serialized_size = determine_serialized_size(*model_outputs);
150
+ if (unlikely(!serialized_size))
151
+ Rcpp::stop("Unexpected error.");
152
+ if (unlikely(serialized_size > (size_t)std::numeric_limits<R_xlen_t>::max()))
153
+ Rcpp::stop("Resulting model is too large for R to handle.");
154
+ Rcpp::RawVector out = Rcpp::unwindProtect(alloc_RawVec, (void*)&serialized_size);
155
+ char *out_ = (char*)RAW(out);
156
+ serialize_isotree(*model_outputs, out_);
157
+ return out;
83
158
  }
84
159
 
85
- template <class T>
160
+ template <class Model>
86
161
  SEXP deserialize_cpp_obj(Rcpp::RawVector src)
87
162
  {
88
- std::stringstream ss;
89
- ss.write(reinterpret_cast<char*>(&src[0]), src.size());
90
- ss.seekg(0, ss.beg);
91
- std::unique_ptr<T> model_outputs = std::unique_ptr<T>(new T());
92
- {
93
- cereal::BinaryInputArchive iarchive(ss);
94
- iarchive(*model_outputs);
95
- }
96
- return Rcpp::XPtr<T>(model_outputs.release(), true);
163
+ if (unlikely(!src.size()))
164
+ Rcpp::stop("Unexpected error.");
165
+ std::unique_ptr<Model> out(new Model());
166
+ const char *inp = (const char*)RAW(src);
167
+ deserialize_isotree(*out, inp);
168
+ SEXP out_ = Rcpp::unwindProtect(safe_XPtr<Model>, out.get());
169
+ out.release();
170
+ return out_;
97
171
  }
98
172
 
99
- // [[Rcpp::export]]
173
+ // [[Rcpp::export(rng = false)]]
100
174
  SEXP deserialize_IsoForest(Rcpp::RawVector src)
101
175
  {
102
176
  return deserialize_cpp_obj<IsoForest>(src);
103
177
  }
104
178
 
105
- // [[Rcpp::export]]
179
+ // [[Rcpp::export(rng = false)]]
106
180
  SEXP deserialize_ExtIsoForest(Rcpp::RawVector src)
107
181
  {
108
182
  return deserialize_cpp_obj<ExtIsoForest>(src);
109
183
  }
110
184
 
111
- // [[Rcpp::export]]
185
+ // [[Rcpp::export(rng = false)]]
112
186
  SEXP deserialize_Imputer(Rcpp::RawVector src)
113
187
  {
114
188
  return deserialize_cpp_obj<Imputer>(src);
115
189
  }
116
190
 
117
- // [[Rcpp::export]]
191
+ // [[Rcpp::export(rng = false)]]
192
+ SEXP deserialize_Indexer(Rcpp::RawVector src)
193
+ {
194
+ return deserialize_cpp_obj<TreesIndexer>(src);
195
+ }
196
+
197
+ // [[Rcpp::export(rng = false)]]
118
198
  Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
119
199
  {
120
200
  return Rcpp::LogicalVector(R_ExternalPtrAddr(ptr_model) == NULL);
@@ -123,79 +203,87 @@ Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
123
203
  double* set_R_nan_as_C_nan(double *x, size_t n, std::vector<double> &v, int nthreads)
124
204
  {
125
205
  v.assign(x, x + n);
126
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(x, v, n)
127
- for (size_t_for i = 0; i < n; i++)
128
- if (isnan(v[i]))
129
- v[i] = NAN;
206
+ for (size_t i = 0; i < n; i++)
207
+ if (unlikely(std::isnan(v[i]))) v[i] = NAN;
130
208
  return v.data();
131
209
  }
132
210
 
211
+ double* set_R_nan_as_C_nan(double *x, size_t n, Rcpp::NumericVector &v, int nthreads)
212
+ {
213
+ v = Rcpp::NumericVector(x, x + n);
214
+ for (size_t i = 0; i < n; i++)
215
+ if (unlikely(std::isnan(v[i]))) v[i] = NAN;
216
+ return REAL(v);
217
+ }
218
+
133
219
  double* set_R_nan_as_C_nan(double *x, size_t n, int nthreads)
134
220
  {
135
- #pragma omp parallel for schedule(static) num_threads(nthreads) shared(x, n)
136
- for (size_t_for i = 0; i < n; i++)
137
- if (isnan(x[i]))
138
- x[i] = NAN;
221
+ for (size_t i = 0; i < n; i++)
222
+ if (unlikely(std::isnan(x[i]))) x[i] = NAN;
139
223
  return x;
140
224
  }
141
225
 
142
- // [[Rcpp::export]]
226
+ // [[Rcpp::export(rng = false)]]
143
227
  Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
144
228
  Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
145
229
  Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
146
230
  size_t nrows, size_t ncols_numeric, size_t ncols_categ, size_t ndim, size_t ntry,
147
231
  Rcpp::CharacterVector coef_type, bool coef_by_prop, bool with_replacement, bool weight_as_sample,
148
- size_t sample_size, size_t ntrees, size_t max_depth, bool limit_depth,
149
- bool penalize_range, bool calc_dist, bool standardize_dist, bool sq_dist,
232
+ size_t sample_size, size_t ntrees, size_t max_depth, size_t ncols_per_tree, bool limit_depth,
233
+ bool penalize_range, bool standardize_data,
234
+ Rcpp::CharacterVector scoring_metric, bool fast_bratio,
235
+ bool calc_dist, bool standardize_dist, bool sq_dist,
150
236
  bool calc_depth, bool standardize_depth, bool weigh_by_kurt,
151
- double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
152
- double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain,
237
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
238
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
239
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
240
+ double prob_pick_col_by_kurt, double min_gain,
153
241
  Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
154
242
  Rcpp::CharacterVector missing_action, bool all_perm,
155
243
  bool build_imputer, bool output_imputations, size_t min_imp_obs,
156
244
  Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
157
- int random_seed, bool handle_interrupt, int nthreads)
245
+ int random_seed, bool use_long_double, int nthreads)
158
246
  {
159
247
  double* numeric_data_ptr = NULL;
160
248
  int* categ_data_ptr = NULL;
161
249
  int* ncat_ptr = NULL;
162
250
  double* Xc_ptr = NULL;
163
- sparse_ix* Xc_ind_ptr = NULL;
164
- sparse_ix* Xc_indptr_ptr = NULL;
251
+ int* Xc_ind_ptr = NULL;
252
+ int* Xc_indptr_ptr = NULL;
165
253
  double* sample_weights_ptr = NULL;
166
254
  double* col_weights_ptr = NULL;
167
- std::vector<double> Xcpp;
255
+ Rcpp::NumericVector Xcpp;
168
256
 
169
257
  if (X_num.size())
170
258
  {
171
- numeric_data_ptr = &X_num[0];
172
- if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
259
+ numeric_data_ptr = REAL(X_num);
260
+ if (Rcpp::as<std::string>(missing_action) != "fail")
173
261
  numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, nrows * ncols_numeric, Xcpp, nthreads);
174
262
  }
175
263
 
176
264
  if (X_cat.size())
177
265
  {
178
- categ_data_ptr = &X_cat[0];
179
- ncat_ptr = &ncat[0];
266
+ categ_data_ptr = INTEGER(X_cat);
267
+ ncat_ptr = INTEGER(ncat);
180
268
  }
181
269
 
182
270
  if (Xc.size())
183
271
  {
184
- Xc_ptr = &Xc[0];
185
- Xc_ind_ptr = &Xc_ind[0];
186
- Xc_indptr_ptr = &Xc_indptr[0];
187
- if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
272
+ Xc_ptr = REAL(Xc);
273
+ Xc_ind_ptr = INTEGER(Xc_ind);
274
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
275
+ if (Rcpp::as<std::string>(missing_action) != "fail")
188
276
  Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
189
277
  }
190
278
 
191
279
  if (sample_weights.size())
192
280
  {
193
- sample_weights_ptr = &sample_weights[0];
281
+ sample_weights_ptr = REAL(sample_weights);
194
282
  }
195
283
 
196
284
  if (col_weights.size())
197
285
  {
198
- col_weights_ptr = &col_weights[0];
286
+ col_weights_ptr = REAL(col_weights);
199
287
  }
200
288
 
201
289
  CoefType coef_type_C = Normal;
@@ -204,47 +292,72 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
204
292
  MissingAction missing_action_C = Divide;
205
293
  UseDepthImp depth_imp_C = Higher;
206
294
  WeighImpRows weigh_imp_rows_C = Inverse;
295
+ ScoringMetric scoring_metric_C = Depth;
207
296
 
208
- if (Rcpp::as<std::string>(coef_type) == std::string("uniform"))
297
+ if (Rcpp::as<std::string>(coef_type) == "uniform")
209
298
  {
210
299
  coef_type_C = Uniform;
211
300
  }
212
- if (Rcpp::as<std::string>(cat_split_type) == std::string("single_categ"))
301
+ if (Rcpp::as<std::string>(cat_split_type) == "single_categ")
213
302
  {
214
303
  cat_split_type_C = SingleCateg;
215
304
  }
216
- if (Rcpp::as<std::string>(new_cat_action) == std::string("smallest"))
305
+ if (Rcpp::as<std::string>(new_cat_action) == "smallest")
217
306
  {
218
307
  new_cat_action_C = Smallest;
219
308
  }
220
- else if (Rcpp::as<std::string>(new_cat_action) == std::string("random"))
309
+ else if (Rcpp::as<std::string>(new_cat_action) == "random")
221
310
  {
222
311
  new_cat_action_C = Random;
223
312
  }
224
- if (Rcpp::as<std::string>(missing_action) == std::string("impute"))
313
+ if (Rcpp::as<std::string>(missing_action) == "impute")
225
314
  {
226
315
  missing_action_C = Impute;
227
316
  }
228
- else if (Rcpp::as<std::string>(missing_action) == std::string("fail"))
317
+ else if (Rcpp::as<std::string>(missing_action) == "fail")
229
318
  {
230
319
  missing_action_C = Fail;
231
320
  }
232
- if (Rcpp::as<std::string>(depth_imp) == std::string("lower"))
321
+ if (Rcpp::as<std::string>(depth_imp) == "lower")
233
322
  {
234
323
  depth_imp_C = Lower;
235
324
  }
236
- else if (Rcpp::as<std::string>(depth_imp) == std::string("same"))
325
+ else if (Rcpp::as<std::string>(depth_imp) == "same")
237
326
  {
238
327
  depth_imp_C = Same;
239
328
  }
240
- if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("prop"))
329
+ if (Rcpp::as<std::string>(weigh_imp_rows) == "prop")
241
330
  {
242
331
  weigh_imp_rows_C = Prop;
243
332
  }
244
- else if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("flat"))
333
+ else if (Rcpp::as<std::string>(weigh_imp_rows) == "flat")
245
334
  {
246
335
  weigh_imp_rows_C = Flat;
247
336
  }
337
+ if (Rcpp::as<std::string>(scoring_metric) == "adj_depth")
338
+ {
339
+ scoring_metric_C = AdjDepth;
340
+ }
341
+ else if (Rcpp::as<std::string>(scoring_metric) == "density")
342
+ {
343
+ scoring_metric_C = Density;
344
+ }
345
+ else if (Rcpp::as<std::string>(scoring_metric) == "adj_density")
346
+ {
347
+ scoring_metric_C = AdjDensity;
348
+ }
349
+ else if (Rcpp::as<std::string>(scoring_metric) == "boxed_density")
350
+ {
351
+ scoring_metric_C = BoxedDensity;
352
+ }
353
+ else if (Rcpp::as<std::string>(scoring_metric) == "boxed_density2")
354
+ {
355
+ scoring_metric_C = BoxedDensity2;
356
+ }
357
+ else if (Rcpp::as<std::string>(scoring_metric) == "boxed_ratio")
358
+ {
359
+ scoring_metric_C = BoxedRatio;
360
+ }
248
361
 
249
362
  Rcpp::NumericVector tmat = Rcpp::NumericVector();
250
363
  Rcpp::NumericMatrix dmat = Rcpp::NumericMatrix();
@@ -255,24 +368,37 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
255
368
 
256
369
  if (calc_dist)
257
370
  {
258
- tmat = Rcpp::NumericVector((nrows * (nrows - 1)) / 2);
259
- tmat_ptr = &tmat[0];
371
+ tmat = Rcpp::NumericVector(calc_ncomb(nrows));
372
+ tmat_ptr = REAL(tmat);
260
373
  if (sq_dist)
261
374
  {
262
- dmat = Rcpp::NumericMatrix(nrows);
263
- dmat_ptr = &dmat(0, 0);
375
+ dmat = Rcpp::NumericMatrix(nrows, nrows);
376
+ dmat_ptr = REAL(dmat);
264
377
  }
265
378
  }
266
379
 
267
380
  if (calc_depth)
268
381
  {
269
382
  depths = Rcpp::NumericVector(nrows);
270
- depths_ptr = &depths[0];
383
+ depths_ptr = REAL(depths);
271
384
  }
272
385
 
273
- std::unique_ptr<IsoForest> model_ptr = std::unique_ptr<IsoForest>();
274
- std::unique_ptr<ExtIsoForest> ext_model_ptr = std::unique_ptr<ExtIsoForest>();
275
- std::unique_ptr<Imputer> imputer_ptr = std::unique_ptr<Imputer>();
386
+ Rcpp::List outp = Rcpp::List::create(
387
+ Rcpp::_["depths"] = depths,
388
+ Rcpp::_["tmat"] = tmat,
389
+ Rcpp::_["dmat"] = dmat,
390
+ Rcpp::_["ptr"] = R_NilValue,
391
+ Rcpp::_["serialized"] = R_NilValue,
392
+ Rcpp::_["imp_ptr"] = R_NilValue,
393
+ Rcpp::_["imp_ser"] = R_NilValue,
394
+ Rcpp::_["imputed_num"] = R_NilValue,
395
+ Rcpp::_["imputed_cat"] = R_NilValue,
396
+ Rcpp::_["err"] = Rcpp::LogicalVector::create(1)
397
+ );
398
+
399
+ std::unique_ptr<IsoForest> model_ptr(nullptr);
400
+ std::unique_ptr<ExtIsoForest> ext_model_ptr(nullptr);
401
+ std::unique_ptr<Imputer> imputer_ptr(nullptr);
276
402
 
277
403
  if (ndim == 1)
278
404
  model_ptr = std::unique_ptr<IsoForest>(new IsoForest());
@@ -282,68 +408,86 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
282
408
  if (build_imputer)
283
409
  imputer_ptr = std::unique_ptr<Imputer>(new Imputer());
284
410
 
285
- int ret_val =
411
+ int ret_val;
412
+ try {
413
+ ret_val =
286
414
  fit_iforest(model_ptr.get(), ext_model_ptr.get(),
287
415
  numeric_data_ptr, ncols_numeric,
288
416
  categ_data_ptr, ncols_categ, ncat_ptr,
289
417
  Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
290
418
  ndim, ntry, coef_type_C, coef_by_prop,
291
419
  sample_weights_ptr, with_replacement, weight_as_sample,
292
- nrows, sample_size, ntrees, max_depth,
293
- limit_depth, penalize_range,
420
+ nrows, sample_size, ntrees, max_depth, ncols_per_tree,
421
+ limit_depth, penalize_range, standardize_data,
422
+ scoring_metric_C, fast_bratio,
294
423
  standardize_dist, tmat_ptr,
295
424
  depths_ptr, standardize_depth,
296
425
  col_weights_ptr, weigh_by_kurt,
297
- prob_pick_by_gain_avg, prob_split_by_gain_avg,
298
- prob_pick_by_gain_pl, prob_split_by_gain_pl,
426
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
427
+ prob_pick_by_full_gain, prob_pick_by_dens,
428
+ prob_pick_col_by_range, prob_pick_col_by_var,
429
+ prob_pick_col_by_kurt,
299
430
  min_gain, missing_action_C,
300
431
  cat_split_type_C, new_cat_action_C,
301
432
  all_perm, imputer_ptr.get(), min_imp_obs,
302
433
  depth_imp_C, weigh_imp_rows_C, output_imputations,
303
- (uint64_t) random_seed, handle_interrupt, nthreads);
434
+ (uint64_t) random_seed, use_long_double, nthreads);
435
+ }
436
+ catch (std::bad_alloc &e) {
437
+ throw_mem_err();
438
+ }
439
+ Rcpp::checkUserInterrupt();
304
440
 
305
441
  if (ret_val == EXIT_FAILURE)
306
442
  {
307
- return Rcpp::List::create(Rcpp::_["err"] = Rcpp::LogicalVector::create(1));
443
+ return Rcpp::unwindProtect(safe_errlist, nullptr);
308
444
  }
309
445
 
310
446
  if (calc_dist && sq_dist)
311
- tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
447
+ tmat_to_dense(tmat_ptr, dmat_ptr, nrows, standardize_dist? 0. : std::numeric_limits<double>::infinity());
312
448
 
313
449
  bool serialization_failed = false;
314
450
  Rcpp::RawVector serialized_obj;
315
- if (ndim == 1)
316
- serialized_obj = serialize_cpp_obj(model_ptr.get());
317
- else
318
- serialized_obj = serialize_cpp_obj(ext_model_ptr.get());
319
- if (!serialized_obj.size()) serialization_failed = true;
320
- if (serialization_failed) {
451
+ try {
452
+ if (ndim == 1)
453
+ serialized_obj = serialize_cpp_obj(model_ptr.get());
454
+ else
455
+ serialized_obj = serialize_cpp_obj(ext_model_ptr.get());
456
+ }
457
+ catch (std::bad_alloc &e) {
458
+ throw_mem_err();
459
+ }
460
+ if (unlikely(!serialized_obj.size())) serialization_failed = true;
461
+ if (unlikely(serialization_failed)) {
321
462
  if (ndim == 1)
322
463
  model_ptr.reset();
323
464
  else
324
465
  ext_model_ptr.reset();
325
466
  }
326
467
 
327
- Rcpp::List outp = Rcpp::List::create(
328
- Rcpp::_["serialized_obj"] = serialized_obj,
329
- Rcpp::_["depths"] = depths,
330
- Rcpp::_["tmat"] = tmat,
331
- Rcpp::_["dmat"] = dmat
332
- );
333
-
334
468
  if (!serialization_failed)
335
469
  {
336
- if (ndim == 1)
337
- outp["model_ptr"] = Rcpp::XPtr<IsoForest>(model_ptr.release(), true);
338
- else
339
- outp["model_ptr"] = Rcpp::XPtr<ExtIsoForest>(ext_model_ptr.release(), true);
470
+ outp["serialized"] = serialized_obj;
471
+ if (ndim == 1) {
472
+ outp["ptr"] = Rcpp::unwindProtect(safe_XPtr<IsoForest>, model_ptr.get());
473
+ model_ptr.release();
474
+ }
475
+ else {
476
+ outp["ptr"] = Rcpp::unwindProtect(safe_XPtr<ExtIsoForest>, ext_model_ptr.get());
477
+ ext_model_ptr.release();
478
+ }
340
479
  } else
341
- outp["model_ptr"] = R_NilValue;
480
+ outp["ptr"] = R_NilValue;
342
481
 
343
482
  if (build_imputer && !serialization_failed)
344
483
  {
345
- outp["imputer_ser"] = serialize_cpp_obj(imputer_ptr.get());
346
- if (!Rf_xlength(outp["imputer_ser"]))
484
+ try {
485
+ outp["imp_ser"] = serialize_cpp_obj(imputer_ptr.get());
486
+ }
487
+ catch (std::bad_alloc &e) {
488
+ throw_mem_err();
489
+ }
490
+ if (!Rf_xlength(outp["imp_ser"]))
347
491
  {
348
492
  serialization_failed = true;
349
493
  imputer_ptr.reset();
@@ -351,79 +495,122 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
351
495
  model_ptr.reset();
352
496
  else
353
497
  ext_model_ptr.reset();
354
- outp["imputer_ptr"] = R_NilValue;
355
- outp["model_ptr"] = R_NilValue;
356
- } else
357
- outp["imputer_ptr"] = Rcpp::XPtr<Imputer>(imputer_ptr.release(), true);
498
+ outp["imp_ptr"] = R_NilValue;
499
+ outp["ptr"] = R_NilValue;
500
+ } else {
501
+ outp["imp_ptr"] = Rcpp::unwindProtect(safe_XPtr<Imputer>, imputer_ptr.get());
502
+ imputer_ptr.release();
503
+ }
358
504
  }
359
505
 
360
506
  if (output_imputations && !serialization_failed)
361
507
  {
362
- outp["imputed_num"] = Rcpp::NumericVector(Xcpp.begin(), Xcpp.end());
508
+ outp["imputed_num"] = Xcpp;
363
509
  outp["imputed_cat"] = X_cat;
364
510
  }
365
511
 
366
- outp["err"] = Rcpp::LogicalVector::create(0);
367
-
512
+ outp["err"] = Rcpp::unwindProtect(safe_FALSE, nullptr);
368
513
  return outp;
369
514
  }
370
515
 
371
- // [[Rcpp::export]]
372
- Rcpp::RawVector fit_tree(SEXP model_R_ptr,
373
- Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
374
- Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
375
- Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
376
- size_t nrows, size_t ncols_numeric, size_t ncols_categ,
377
- size_t ndim, size_t ntry, Rcpp::CharacterVector coef_type, bool coef_by_prop,
378
- size_t max_depth, bool limit_depth, bool penalize_range,
379
- bool weigh_by_kurt,
380
- double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
381
- double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain,
382
- Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
383
- Rcpp::CharacterVector missing_action, bool build_imputer, size_t min_imp_obs, SEXP imp_R_ptr,
384
- Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
385
- bool all_perm, uint64_t random_seed)
516
+ // [[Rcpp::export(rng = false)]]
517
+ void fit_tree(SEXP model_R_ptr, Rcpp::RawVector serialized_obj, Rcpp::RawVector serialized_imputer,
518
+ SEXP indexer_R_ptr, Rcpp::RawVector serialized_indexer,
519
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
520
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
521
+ Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
522
+ size_t nrows, size_t ncols_numeric, size_t ncols_categ,
523
+ size_t ndim, size_t ntry, Rcpp::CharacterVector coef_type, bool coef_by_prop,
524
+ size_t max_depth, size_t ncols_per_tree, bool limit_depth, bool penalize_range,
525
+ bool standardize_data, bool fast_bratio, bool weigh_by_kurt,
526
+ double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
527
+ double prob_pick_by_full_gain, double prob_pick_by_dens,
528
+ double prob_pick_col_by_range, double prob_pick_col_by_var,
529
+ double prob_pick_col_by_kurt, double min_gain,
530
+ Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
531
+ Rcpp::CharacterVector missing_action, bool build_imputer, size_t min_imp_obs, SEXP imp_R_ptr,
532
+ Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
533
+ bool all_perm,
534
+ Rcpp::NumericVector ref_X_num, Rcpp::IntegerVector ref_X_cat,
535
+ Rcpp::NumericVector ref_Xc, Rcpp::IntegerVector ref_Xc_ind, Rcpp::IntegerVector ref_Xc_indptr,
536
+ uint64_t random_seed, bool use_long_double,
537
+ Rcpp::List &model_cpp_obj_update, Rcpp::List &model_params_update)
386
538
  {
539
+ Rcpp::List out = Rcpp::List::create(
540
+ Rcpp::_["serialized"] = R_NilValue,
541
+ Rcpp::_["imp_ser"] = R_NilValue,
542
+ Rcpp::_["ind_ser"] = R_NilValue
543
+ );
544
+
545
+ Rcpp::IntegerVector ntrees_plus1 = Rcpp::IntegerVector::create(Rf_asInteger(model_params_update["ntrees"]) + 1);
546
+
387
547
  double* numeric_data_ptr = NULL;
388
548
  int* categ_data_ptr = NULL;
389
549
  int* ncat_ptr = NULL;
390
550
  double* Xc_ptr = NULL;
391
- sparse_ix* Xc_ind_ptr = NULL;
392
- sparse_ix* Xc_indptr_ptr = NULL;
551
+ int* Xc_ind_ptr = NULL;
552
+ int* Xc_indptr_ptr = NULL;
393
553
  double* sample_weights_ptr = NULL;
394
554
  double* col_weights_ptr = NULL;
395
- std::vector<double> Xcpp;
555
+ Rcpp::NumericVector Xcpp;
396
556
 
397
557
  if (X_num.size())
398
558
  {
399
- numeric_data_ptr = &X_num[0];
400
- if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
559
+ numeric_data_ptr = REAL(X_num);
560
+ if (Rcpp::as<std::string>(missing_action) != "fail")
401
561
  numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, nrows * ncols_numeric, Xcpp, 1);
402
562
  }
403
563
 
404
564
  if (X_cat.size())
405
565
  {
406
- categ_data_ptr = &X_cat[0];
407
- ncat_ptr = &ncat[0];
566
+ categ_data_ptr = INTEGER(X_cat);
567
+ ncat_ptr = INTEGER(ncat);
408
568
  }
409
569
 
410
570
  if (Xc.size())
411
571
  {
412
- Xc_ptr = &Xc[0];
413
- Xc_ind_ptr = &Xc_ind[0];
414
- Xc_indptr_ptr = &Xc_indptr[0];
415
- if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
572
+ Xc_ptr = REAL(Xc);
573
+ Xc_ind_ptr = INTEGER(Xc_ind);
574
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
575
+ if (Rcpp::as<std::string>(missing_action) != "fail")
416
576
  Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, 1);
417
577
  }
418
578
 
579
+ double* ref_numeric_data_ptr = NULL;
580
+ int* ref_categ_data_ptr = NULL;
581
+ double* ref_Xc_ptr = NULL;
582
+ int* ref_Xc_ind_ptr = NULL;
583
+ int* ref_Xc_indptr_ptr = NULL;
584
+ Rcpp::NumericVector ref_Xcpp;
585
+ if (ref_X_num.size())
586
+ {
587
+ ref_numeric_data_ptr = REAL(ref_X_num);
588
+ if (Rcpp::as<std::string>(missing_action) != "fail")
589
+ ref_numeric_data_ptr = set_R_nan_as_C_nan(ref_numeric_data_ptr, ref_X_num.size(), ref_Xcpp, 1);
590
+ }
591
+
592
+ if (ref_X_cat.size())
593
+ {
594
+ ref_categ_data_ptr = INTEGER(ref_X_cat);
595
+ }
596
+
597
+ if (ref_Xc.size())
598
+ {
599
+ ref_Xc_ptr = REAL(ref_Xc);
600
+ ref_Xc_ind_ptr = INTEGER(ref_Xc_ind);
601
+ ref_Xc_indptr_ptr = INTEGER(ref_Xc_indptr);
602
+ if (Rcpp::as<std::string>(missing_action) != "fail")
603
+ ref_Xc_ptr = set_R_nan_as_C_nan(ref_Xc_ptr, ref_Xc.size(), ref_Xcpp, 1);
604
+ }
605
+
419
606
  if (sample_weights.size())
420
607
  {
421
- sample_weights_ptr = &sample_weights[0];
608
+ sample_weights_ptr = REAL(sample_weights);
422
609
  }
423
610
 
424
611
  if (col_weights.size())
425
612
  {
426
- col_weights_ptr = &col_weights[0];
613
+ col_weights_ptr = REAL(col_weights);
427
614
  }
428
615
 
429
616
  CoefType coef_type_C = Normal;
@@ -433,62 +620,66 @@ Rcpp::RawVector fit_tree(SEXP model_R_ptr,
433
620
  UseDepthImp depth_imp_C = Higher;
434
621
  WeighImpRows weigh_imp_rows_C = Inverse;
435
622
 
436
- if (Rcpp::as<std::string>(coef_type) == std::string("uniform"))
623
+ if (Rcpp::as<std::string>(coef_type) == "uniform")
437
624
  {
438
625
  coef_type_C = Uniform;
439
626
  }
440
- if (Rcpp::as<std::string>(cat_split_type) == std::string("single_categ"))
627
+ if (Rcpp::as<std::string>(cat_split_type) == "single_categ")
441
628
  {
442
629
  cat_split_type_C = SingleCateg;
443
630
  }
444
- if (Rcpp::as<std::string>(new_cat_action) == std::string("smallest"))
631
+ if (Rcpp::as<std::string>(new_cat_action) == "smallest")
445
632
  {
446
633
  new_cat_action_C = Smallest;
447
634
  }
448
- else if (Rcpp::as<std::string>(new_cat_action) == std::string("random"))
635
+ else if (Rcpp::as<std::string>(new_cat_action) == "random")
449
636
  {
450
637
  new_cat_action_C = Random;
451
638
  }
452
- if (Rcpp::as<std::string>(missing_action) == std::string("impute"))
639
+ if (Rcpp::as<std::string>(missing_action) == "impute")
453
640
  {
454
641
  missing_action_C = Impute;
455
642
  }
456
- else if (Rcpp::as<std::string>(missing_action) == std::string("fail"))
643
+ else if (Rcpp::as<std::string>(missing_action) == "fail")
457
644
  {
458
645
  missing_action_C = Fail;
459
646
  }
460
- if (Rcpp::as<std::string>(depth_imp) == std::string("lower"))
647
+ if (Rcpp::as<std::string>(depth_imp) == "lower")
461
648
  {
462
649
  depth_imp_C = Lower;
463
650
  }
464
- else if (Rcpp::as<std::string>(depth_imp) == std::string("same"))
651
+ else if (Rcpp::as<std::string>(depth_imp) == "same")
465
652
  {
466
653
  depth_imp_C = Same;
467
654
  }
468
- if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("prop"))
655
+ if (Rcpp::as<std::string>(weigh_imp_rows) == "prop")
469
656
  {
470
657
  weigh_imp_rows_C = Prop;
471
658
  }
472
- else if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("flat"))
659
+ else if (Rcpp::as<std::string>(weigh_imp_rows) == "flat")
473
660
  {
474
661
  weigh_imp_rows_C = Flat;
475
662
  }
663
+
476
664
 
477
665
  IsoForest* model_ptr = NULL;
478
666
  ExtIsoForest* ext_model_ptr = NULL;
479
- Imputer* imputer_ptr = NULL;
667
+ Imputer* imputer_ptr = NULL;
668
+ TreesIndexer* indexer_ptr = NULL;
480
669
  if (ndim == 1)
481
670
  model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
482
671
  else
483
672
  ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
484
673
 
485
- std::vector<ImputeNode> *imp_ptr = NULL;
486
674
  if (build_imputer)
487
- {
488
675
  imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imp_R_ptr));
489
- imputer_ptr->imputer_tree.emplace_back();
490
- imp_ptr = &imputer_ptr->imputer_tree.back();
491
- }
676
+
677
+ if (!Rf_isNull(indexer_R_ptr) && R_ExternalPtrAddr(indexer_R_ptr) != NULL)
678
+ indexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
679
+ if (indexer_ptr != NULL && indexer_ptr->indices.empty())
680
+ indexer_ptr = NULL;
681
+
682
+ size_t old_ntrees = (ndim == 1)? (model_ptr->trees.size()) : (ext_model_ptr->hplanes.size());
492
683
 
493
684
  add_tree(model_ptr, ext_model_ptr,
494
685
  numeric_data_ptr, ncols_numeric,
@@ -496,24 +687,153 @@ Rcpp::RawVector fit_tree(SEXP model_R_ptr,
496
687
  Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
497
688
  ndim, ntry, coef_type_C, coef_by_prop,
498
689
  sample_weights_ptr,
499
- nrows, max_depth,
500
- limit_depth, penalize_range,
690
+ nrows, max_depth, ncols_per_tree,
691
+ limit_depth, penalize_range, standardize_data, fast_bratio,
501
692
  col_weights_ptr, weigh_by_kurt,
502
- prob_pick_by_gain_avg, prob_split_by_gain_avg,
503
- prob_pick_by_gain_pl, prob_split_by_gain_pl,
693
+ prob_pick_by_gain_pl, prob_pick_by_gain_avg,
694
+ prob_pick_by_full_gain, prob_pick_by_dens,
695
+ prob_pick_col_by_range, prob_pick_col_by_var,
696
+ prob_pick_col_by_kurt,
504
697
  min_gain, missing_action_C,
505
698
  cat_split_type_C, new_cat_action_C,
506
699
  depth_imp_C, weigh_imp_rows_C, all_perm,
507
- imp_ptr, min_imp_obs, (uint64_t)random_seed);
700
+ imputer_ptr, min_imp_obs,
701
+ indexer_ptr,
702
+ ref_numeric_data_ptr, ref_categ_data_ptr,
703
+ true, (size_t)0, (size_t)0,
704
+ ref_Xc_ptr, ref_Xc_ind_ptr, ref_Xc_indptr_ptr,
705
+ (uint64_t)random_seed, use_long_double);
706
+
707
+ Rcpp::RawVector new_serialized, new_imp_serialized, new_ind_serialized;
708
+ size_t new_size;
709
+ try
710
+ {
711
+ if (ndim == 1)
712
+ {
713
+ if (serialized_obj.size() &&
714
+ check_can_undergo_incremental_serialization(*model_ptr, (char*)RAW(serialized_obj)))
715
+ {
716
+ try {
717
+ new_size = serialized_obj.size()
718
+ + determine_serialized_size_additional_trees(*model_ptr, old_ntrees);
719
+ new_serialized = resize_vec(serialized_obj, new_size);
720
+ char *temp = (char*)RAW(new_serialized);
721
+ incremental_serialize_isotree(*model_ptr, temp);
722
+ out["serialized"] = new_serialized;
723
+ }
724
+
725
+ catch (std::runtime_error &e) {
726
+ goto serialize_anew_singlevar;
727
+ }
728
+ }
729
+
730
+ else {
731
+ serialize_anew_singlevar:
732
+ out["serialized"] = serialize_cpp_obj(model_ptr);
733
+ }
734
+ }
508
735
 
509
- if (ndim == 1)
510
- return serialize_cpp_obj(model_ptr);
511
- else
512
- return serialize_cpp_obj(ext_model_ptr);
736
+ else
737
+ {
738
+ if (serialized_obj.size() &&
739
+ check_can_undergo_incremental_serialization(*ext_model_ptr, (char*)RAW(serialized_obj)))
740
+ {
741
+ try {
742
+ new_size = serialized_obj.size()
743
+ + determine_serialized_size_additional_trees(*ext_model_ptr, old_ntrees);
744
+ new_serialized = resize_vec(serialized_obj, new_size);
745
+ char *temp = (char*)RAW(new_serialized);
746
+ incremental_serialize_isotree(*ext_model_ptr, temp);
747
+ out["serialized"] = new_serialized;
748
+ }
749
+
750
+ catch (std::runtime_error &e) {
751
+ goto serialize_anew_ext;
752
+ }
753
+ }
754
+
755
+ else {
756
+ serialize_anew_ext:
757
+ out["serialized"] = serialize_cpp_obj(ext_model_ptr);
758
+ }
759
+ }
760
+
761
+ if (imputer_ptr != NULL)
762
+ {
763
+ if (serialized_imputer.size() &&
764
+ check_can_undergo_incremental_serialization(*imputer_ptr, (char*)RAW(serialized_imputer)))
765
+ {
766
+ try {
767
+ new_size = serialized_imputer.size()
768
+ + determine_serialized_size_additional_trees(*imputer_ptr, old_ntrees);
769
+ new_imp_serialized = resize_vec(serialized_imputer, new_size);
770
+ char *temp = (char*)RAW(new_imp_serialized);
771
+ incremental_serialize_isotree(*imputer_ptr, temp);
772
+ out["imp_ser"] = new_imp_serialized;
773
+ }
774
+
775
+ catch (std::runtime_error &e) {
776
+ goto serialize_anew_imp;
777
+ }
778
+ }
779
+
780
+ else {
781
+ serialize_anew_imp:
782
+ out["imp_ser"] = serialize_cpp_obj(imputer_ptr);
783
+ }
784
+ }
785
+
786
+ if (indexer_ptr != NULL)
787
+ {
788
+ if (serialized_indexer.size() &&
789
+ check_can_undergo_incremental_serialization(*indexer_ptr, (char*)RAW(serialized_indexer)))
790
+ {
791
+ try {
792
+ new_size = serialized_indexer.size()
793
+ + determine_serialized_size_additional_trees(*indexer_ptr, old_ntrees);
794
+ new_ind_serialized = resize_vec(serialized_indexer, new_size);
795
+ char *temp = (char*)RAW(new_ind_serialized);
796
+ incremental_serialize_isotree(*indexer_ptr, temp);
797
+ out["ind_ser"] = new_ind_serialized;
798
+ }
799
+
800
+ catch (std::runtime_error &e) {
801
+ goto serialize_anew_ind;
802
+ }
803
+ }
804
+
805
+ else {
806
+ serialize_anew_ind:
807
+ out["ind_ser"] = serialize_cpp_obj(indexer_ptr);
808
+ }
809
+ }
810
+ }
811
+
812
+ catch (...)
813
+ {
814
+ if (ndim == 1)
815
+ model_ptr->trees.resize(old_ntrees);
816
+ else
817
+ ext_model_ptr->hplanes.resize(old_ntrees);
818
+ if (build_imputer)
819
+ imputer_ptr->imputer_tree.resize(old_ntrees);
820
+ if (indexer_ptr != NULL)
821
+ indexer_ptr->indices.resize(old_ntrees);
822
+ throw;
823
+ }
824
+
825
+ model_cpp_obj_update["serialized"] = out["serialized"];
826
+ if (build_imputer)
827
+ model_cpp_obj_update["imp_ser"] = out["imp_ser"];
828
+ if (indexer_ptr != NULL)
829
+ model_cpp_obj_update["ind_ser"] = out["ind_ser"];
830
+ model_params_update["ntrees"] = ntrees_plus1;
513
831
  }
514
832
 
515
- // [[Rcpp::export]]
516
- void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector tree_num, bool is_extended,
833
+ // [[Rcpp::export(rng = false)]]
834
+ void predict_iso(SEXP model_R_ptr, bool is_extended,
835
+ SEXP indexer_R_ptr,
836
+ Rcpp::NumericVector outp, Rcpp::IntegerMatrix tree_num, Rcpp::NumericMatrix tree_depths,
517
837
  Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
518
838
  Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
519
839
  Rcpp::NumericVector Xr, Rcpp::IntegerVector Xr_ind, Rcpp::IntegerVector Xr_indptr,
@@ -522,48 +842,40 @@ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector
522
842
  double* numeric_data_ptr = NULL;
523
843
  int* categ_data_ptr = NULL;
524
844
  double* Xc_ptr = NULL;
525
- sparse_ix* Xc_ind_ptr = NULL;
526
- sparse_ix* Xc_indptr_ptr = NULL;
845
+ int* Xc_ind_ptr = NULL;
846
+ int* Xc_indptr_ptr = NULL;
527
847
  double* Xr_ptr = NULL;
528
- sparse_ix* Xr_ind_ptr = NULL;
529
- sparse_ix* Xr_indptr_ptr = NULL;
530
- sparse_ix* tree_num_ptr = NULL;
531
- std::vector<double> Xcpp;
848
+ int* Xr_ind_ptr = NULL;
849
+ int* Xr_indptr_ptr = NULL;
850
+ Rcpp::NumericVector Xcpp;
532
851
 
533
852
  if (X_num.size())
534
853
  {
535
- numeric_data_ptr = &X_num[0];
854
+ numeric_data_ptr = REAL(X_num);
536
855
  }
537
856
 
538
857
  if (X_cat.size())
539
858
  {
540
- categ_data_ptr = &X_cat[0];
859
+ categ_data_ptr = INTEGER(X_cat);
541
860
  }
542
861
 
543
862
  if (Xc_indptr.size())
544
863
  {
545
- if (Xc.size())
546
- Xc_ptr = &Xc[0];
547
- if (Xc_ind.size())
548
- Xc_ind_ptr = &Xc_ind[0];
549
- Xc_indptr_ptr = &Xc_indptr[0];
864
+ Xc_ptr = REAL(Xc);
865
+ Xc_ind_ptr = INTEGER(Xc_ind);
866
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
550
867
  }
551
868
 
552
869
  if (Xr_indptr.size())
553
870
  {
554
- if (Xr.size())
555
- Xr_ptr = &Xr[0];
556
- if (Xr_ind.size())
557
- Xr_ind_ptr = &Xr_ind[0];
558
- Xr_indptr_ptr = &Xr_indptr[0];
559
- }
560
-
561
- if (tree_num.size())
562
- {
563
- tree_num_ptr = &tree_num[0];
871
+ Xr_ptr = REAL(Xr);
872
+ Xr_ind_ptr = INTEGER(Xr_ind);
873
+ Xr_indptr_ptr = INTEGER(Xr_indptr);
564
874
  }
565
875
 
566
- double* depths_ptr = &outp[0];
876
+ double *depths_ptr = REAL(outp);
877
+ double *tree_depths_ptr = tree_depths.size()? REAL(tree_depths) : NULL;
878
+ int *tree_num_ptr = tree_num.size()? INTEGER(tree_num) : NULL;
567
879
 
568
880
  IsoForest* model_ptr = NULL;
569
881
  ExtIsoForest* ext_model_ptr = NULL;
@@ -571,6 +883,11 @@ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector
571
883
  ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
572
884
  else
573
885
  model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
886
+ TreesIndexer* indexer = NULL;
887
+ if (!Rf_isNull(indexer_R_ptr) && R_ExternalPtrAddr(indexer_R_ptr) != NULL)
888
+ indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
889
+ if (indexer != NULL && indexer->indices.empty())
890
+ indexer = NULL;
574
891
 
575
892
  MissingAction missing_action = is_extended?
576
893
  ext_model_ptr->missing_action
@@ -583,58 +900,75 @@ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector
583
900
  if (Xr.size()) Xr_ptr = set_R_nan_as_C_nan(Xr_ptr, Xr.size(), Xcpp, nthreads);
584
901
  }
585
902
 
586
- predict_iforest(numeric_data_ptr, categ_data_ptr,
587
- Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
588
- Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
589
- nrows, nthreads, standardize,
590
- model_ptr, ext_model_ptr,
591
- depths_ptr, tree_num_ptr);
903
+ predict_iforest<double, int>(numeric_data_ptr, categ_data_ptr,
904
+ true, (size_t)0, (size_t)0,
905
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
906
+ Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
907
+ nrows, nthreads, standardize,
908
+ model_ptr, ext_model_ptr,
909
+ depths_ptr, tree_num_ptr,
910
+ tree_depths_ptr,
911
+ indexer);
592
912
  }
593
913
 
594
- // [[Rcpp::export]]
595
- void dist_iso(SEXP model_R_ptr, Rcpp::NumericVector tmat, Rcpp::NumericVector dmat,
596
- Rcpp::NumericVector rmat, bool is_extended,
914
+ // [[Rcpp::export(rng = false)]]
915
+ void dist_iso(SEXP model_R_ptr, SEXP indexer_R_ptr,
916
+ Rcpp::NumericVector tmat, Rcpp::NumericMatrix dmat,
917
+ Rcpp::NumericMatrix rmat, bool is_extended,
597
918
  Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
598
919
  Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
599
- size_t nrows, int nthreads, bool assume_full_distr,
600
- bool standardize_dist, bool sq_dist, size_t n_from)
920
+ size_t nrows, bool use_long_double, int nthreads, bool assume_full_distr,
921
+ bool standardize_dist, bool sq_dist, size_t n_from,
922
+ bool use_reference_points, bool as_kernel)
601
923
  {
602
924
  double* numeric_data_ptr = NULL;
603
925
  int* categ_data_ptr = NULL;
604
926
  double* Xc_ptr = NULL;
605
- sparse_ix* Xc_ind_ptr = NULL;
606
- sparse_ix* Xc_indptr_ptr = NULL;
607
- std::vector<double> Xcpp;
927
+ int* Xc_ind_ptr = NULL;
928
+ int* Xc_indptr_ptr = NULL;
929
+ Rcpp::NumericVector Xcpp;
608
930
 
609
931
  if (X_num.size())
610
932
  {
611
- numeric_data_ptr = &X_num[0];
933
+ numeric_data_ptr = REAL(X_num);
612
934
  }
613
935
 
614
936
  if (X_cat.size())
615
937
  {
616
- categ_data_ptr = &X_cat[0];
938
+ categ_data_ptr = INTEGER(X_cat);
617
939
  }
618
940
 
619
941
  if (Xc_indptr.size())
620
942
  {
621
- if (Xc.size())
622
- Xc_ptr = &Xc[0];
623
- if (Xc_ind.size())
624
- Xc_ind_ptr = &Xc_ind[0];
625
- Xc_indptr_ptr = &Xc_indptr[0];
943
+ Xc_ptr = REAL(Xc);
944
+ Xc_ind_ptr = INTEGER(Xc_ind);
945
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
626
946
  }
627
947
 
628
- double* tmat_ptr = n_from? (double*)NULL : &tmat[0];
629
- double* dmat_ptr = (sq_dist & !n_from)? &dmat[0] : NULL;
630
- double* rmat_ptr = n_from? &rmat[0] : NULL;
948
+ double* tmat_ptr = n_from? (double*)NULL : REAL(tmat);
949
+ double* dmat_ptr = (sq_dist & !n_from)? REAL(dmat) : NULL;
950
+ double* rmat_ptr = n_from? REAL(rmat) : NULL;
631
951
 
632
952
  IsoForest* model_ptr = NULL;
633
953
  ExtIsoForest* ext_model_ptr = NULL;
954
+ TreesIndexer* indexer = NULL;
634
955
  if (is_extended)
635
956
  ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
636
957
  else
637
958
  model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
959
+ if (!Rf_isNull(indexer_R_ptr) && R_ExternalPtrAddr(indexer_R_ptr) != NULL)
960
+ indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
961
+ if (indexer != NULL && (indexer->indices.empty() || (!as_kernel && indexer->indices.front().node_distances.empty())))
962
+ indexer = NULL;
963
+
964
+ if (use_reference_points && indexer != NULL && !indexer->indices.front().reference_points.empty()) {
965
+ tmat_ptr = NULL;
966
+ dmat_ptr = NULL;
967
+ rmat_ptr = REAL(rmat);
968
+ }
969
+ else {
970
+ use_reference_points = false;
971
+ }
638
972
 
639
973
 
640
974
  MissingAction missing_action = is_extended?
@@ -650,43 +984,58 @@ void dist_iso(SEXP model_R_ptr, Rcpp::NumericVector tmat, Rcpp::NumericVector dm
650
984
 
651
985
  calc_similarity(numeric_data_ptr, categ_data_ptr,
652
986
  Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
653
- nrows, nthreads, assume_full_distr, standardize_dist,
987
+ nrows, use_long_double, nthreads,
988
+ assume_full_distr, standardize_dist, as_kernel,
654
989
  model_ptr, ext_model_ptr,
655
- tmat_ptr, rmat_ptr, n_from);
990
+ tmat_ptr, rmat_ptr, n_from, use_reference_points,
991
+ indexer, true, (size_t)0, (size_t)0);
656
992
 
657
- if (sq_dist & !n_from)
658
- tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
993
+ if (tmat.size() && dmat.ncol() > 0)
994
+ {
995
+ double diag_filler;
996
+ if (as_kernel) {
997
+ if (standardize_dist)
998
+ diag_filler = 1.;
999
+ else
1000
+ diag_filler = (model_ptr != NULL)? model_ptr->trees.size() : ext_model_ptr->hplanes.size();
1001
+ }
1002
+ else {
1003
+ if (standardize_dist)
1004
+ diag_filler = 0;
1005
+ else
1006
+ diag_filler = std::numeric_limits<double>::infinity();
1007
+ }
1008
+ tmat_to_dense(tmat_ptr, dmat_ptr, nrows, diag_filler);
1009
+ }
659
1010
  }
660
1011
 
661
- // [[Rcpp::export]]
1012
+ // [[Rcpp::export(rng = false)]]
662
1013
  Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
663
1014
  Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
664
1015
  Rcpp::NumericVector Xr, Rcpp::IntegerVector Xr_ind, Rcpp::IntegerVector Xr_indptr,
665
- size_t nrows, int nthreads)
1016
+ size_t nrows, bool use_long_double, int nthreads)
666
1017
  {
667
1018
  double* numeric_data_ptr = NULL;
668
1019
  int* categ_data_ptr = NULL;
669
1020
  double* Xr_ptr = NULL;
670
- sparse_ix* Xr_ind_ptr = NULL;
671
- sparse_ix* Xr_indptr_ptr = NULL;
1021
+ int* Xr_ind_ptr = NULL;
1022
+ int* Xr_indptr_ptr = NULL;
672
1023
 
673
1024
  if (X_num.size())
674
1025
  {
675
- numeric_data_ptr = &X_num[0];
1026
+ numeric_data_ptr = REAL(X_num);
676
1027
  }
677
1028
 
678
1029
  if (X_cat.size())
679
1030
  {
680
- categ_data_ptr = &X_cat[0];
1031
+ categ_data_ptr = INTEGER(X_cat);
681
1032
  }
682
1033
 
683
1034
  if (Xr_indptr.size())
684
1035
  {
685
- if (Xr.size())
686
- Xr_ptr = &Xr[0];
687
- if (Xr_ind.size())
688
- Xr_ind_ptr = &Xr_ind[0];
689
- Xr_indptr_ptr = &Xr_indptr[0];
1036
+ Xr_ptr = REAL(Xr);
1037
+ Xr_ind_ptr = INTEGER(Xr_ind);
1038
+ Xr_indptr_ptr = INTEGER(Xr_indptr);
690
1039
  }
691
1040
 
692
1041
  if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), nthreads);
@@ -702,9 +1051,9 @@ Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
702
1051
  Imputer* imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imputer_R_ptr));
703
1052
 
704
1053
 
705
- impute_missing_values(numeric_data_ptr, categ_data_ptr,
1054
+ impute_missing_values(numeric_data_ptr, categ_data_ptr, true,
706
1055
  Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
707
- nrows, nthreads,
1056
+ nrows, use_long_double, nthreads,
708
1057
  model_ptr, ext_model_ptr,
709
1058
  *imputer_ptr);
710
1059
 
@@ -714,7 +1063,187 @@ Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
714
1063
  );
715
1064
  }
716
1065
 
717
- // [[Rcpp::export]]
1066
+ // [[Rcpp::export(rng = false)]]
1067
+ void drop_imputer(Rcpp::List lst_modify, Rcpp::List lst_modify2)
1068
+ {
1069
+ Rcpp::RawVector empty_ser = Rcpp::RawVector();
1070
+ Rcpp::LogicalVector FalseObj = Rcpp::LogicalVector::create(false);
1071
+ Rcpp::XPtr<Imputer> imp_ptr = lst_modify["imp_ptr"];
1072
+ imp_ptr.release();
1073
+
1074
+ lst_modify["imp_ser"] = empty_ser;
1075
+ lst_modify2["build_imputer"] = FalseObj;
1076
+ }
1077
+
1078
+ // [[Rcpp::export(rng = false)]]
1079
+ void drop_indexer(Rcpp::List lst_modify, Rcpp::List lst_modify2)
1080
+ {
1081
+ Rcpp::XPtr<TreesIndexer> empty_ptr = Rcpp::XPtr<TreesIndexer>(nullptr, false);
1082
+ Rcpp::RawVector empty_ser = Rcpp::RawVector();
1083
+ Rcpp::CharacterVector empty_char = Rcpp::CharacterVector();
1084
+ Rcpp::XPtr<TreesIndexer> indexer = lst_modify["indexer"];
1085
+ indexer.release();
1086
+
1087
+ lst_modify["ind_ser"] = empty_ser;
1088
+ lst_modify2["reference_names"] = empty_char;
1089
+ }
1090
+
1091
+ // [[Rcpp::export(rng = false)]]
1092
+ void drop_reference_points(Rcpp::List lst_modify, Rcpp::List lst_modify2)
1093
+ {
1094
+ Rcpp::CharacterVector empty_char = Rcpp::CharacterVector();
1095
+ Rcpp::RawVector empty_ser = Rcpp::RawVector();
1096
+ Rcpp::XPtr<TreesIndexer> indexer_R_ptr = lst_modify["indexer"];
1097
+ TreesIndexer *indexer_ptr = indexer_R_ptr.get();
1098
+ if (indexer_ptr == NULL) {
1099
+ lst_modify["ind_ser"] = empty_ser;
1100
+ lst_modify2["reference_names"] = empty_char;
1101
+ return;
1102
+ }
1103
+ if (indexer_ptr->indices.empty()) {
1104
+ indexer_R_ptr.release();
1105
+ lst_modify["ind_ser"] = empty_ser;
1106
+ lst_modify2["reference_names"] = empty_char;
1107
+ return;
1108
+ }
1109
+ if (indexer_ptr->indices.front().reference_points.empty()) {
1110
+ lst_modify2["reference_names"] = empty_char;
1111
+ return;
1112
+ }
1113
+
1114
+ std::unique_ptr<TreesIndexer> new_indexer(new TreesIndexer(*indexer_ptr));
1115
+ for (auto &tree : new_indexer->indices)
1116
+ {
1117
+ tree.reference_points.clear();
1118
+ tree.reference_indptr.clear();
1119
+ tree.reference_mapping.clear();
1120
+ }
1121
+ Rcpp::RawVector ind_ser = serialize_cpp_obj(new_indexer.get());
1122
+ *indexer_ptr = std::move(*new_indexer);
1123
+ new_indexer.release();
1124
+ lst_modify["ind_ser"] = ind_ser;
1125
+ lst_modify2["reference_names"] = empty_char;
1126
+ }
1127
+
1128
+ // [[Rcpp::export(rng = false)]]
1129
+ Rcpp::List subset_trees
1130
+ (
1131
+ SEXP model_R_ptr, SEXP imputer_R_ptr, SEXP indexer_R_ptr,
1132
+ bool is_extended, bool has_imputer,
1133
+ Rcpp::IntegerVector trees_take
1134
+ )
1135
+ {
1136
+ bool has_indexer = !Rf_isNull(indexer_R_ptr) && R_ExternalPtrAddr(indexer_R_ptr) != NULL;
1137
+
1138
+ Rcpp::List out = Rcpp::List::create(
1139
+ Rcpp::_["ptr"] = R_NilValue,
1140
+ Rcpp::_["serialized"] = R_NilValue,
1141
+ Rcpp::_["imp_ptr"] = R_NilValue,
1142
+ Rcpp::_["imp_ser"] = R_NilValue,
1143
+ Rcpp::_["indexer"] = R_NilValue,
1144
+ Rcpp::_["ind_ser"] = R_NilValue
1145
+ );
1146
+
1147
+ IsoForest* model_ptr = NULL;
1148
+ ExtIsoForest* ext_model_ptr = NULL;
1149
+ Imputer* imputer_ptr = NULL;
1150
+ TreesIndexer* indexer_ptr = NULL;
1151
+ std::unique_ptr<IsoForest> new_model_ptr(nullptr);
1152
+ std::unique_ptr<ExtIsoForest> new_ext_model_ptr(nullptr);
1153
+ std::unique_ptr<Imputer> new_imputer_ptr(nullptr);
1154
+ std::unique_ptr<TreesIndexer> new_indexer_ptr(nullptr);
1155
+
1156
+ if (is_extended) {
1157
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
1158
+ new_ext_model_ptr = std::unique_ptr<ExtIsoForest>(new ExtIsoForest());
1159
+ }
1160
+ else {
1161
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
1162
+ new_model_ptr = std::unique_ptr<IsoForest>(new IsoForest());
1163
+ }
1164
+
1165
+
1166
+ if (has_imputer) {
1167
+ imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imputer_R_ptr));
1168
+ new_imputer_ptr = std::unique_ptr<Imputer>(new Imputer());
1169
+ }
1170
+
1171
+ if (has_indexer) {
1172
+ indexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
1173
+ new_indexer_ptr = std::unique_ptr<TreesIndexer>(new TreesIndexer());
1174
+ }
1175
+
1176
+ std::unique_ptr<size_t[]> trees_take_(new size_t[trees_take.size()]);
1177
+ for (decltype(trees_take.size()) ix = 0; ix < trees_take.size(); ix++)
1178
+ trees_take_[ix] = (size_t)(trees_take[ix] - 1);
1179
+
1180
+ subset_model(model_ptr, new_model_ptr.get(),
1181
+ ext_model_ptr, new_ext_model_ptr.get(),
1182
+ imputer_ptr, new_imputer_ptr.get(),
1183
+ indexer_ptr, new_indexer_ptr.get(),
1184
+ trees_take_.get(), trees_take.size());
1185
+ trees_take_.reset();
1186
+
1187
+ if (!is_extended)
1188
+ out["serialized"] = serialize_cpp_obj(new_model_ptr.get());
1189
+ else
1190
+ out["serialized"] = serialize_cpp_obj(new_ext_model_ptr.get());
1191
+ if (has_imputer)
1192
+ out["imp_ser"] = serialize_cpp_obj(new_imputer_ptr.get());
1193
+ if (has_indexer)
1194
+ out["ind_ser"] = serialize_cpp_obj(new_indexer_ptr.get());
1195
+
1196
+ if (!is_extended) {
1197
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<IsoForest>, new_model_ptr.get());
1198
+ new_model_ptr.release();
1199
+ }
1200
+ else {
1201
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<ExtIsoForest>, new_ext_model_ptr.get());
1202
+ new_ext_model_ptr.release();
1203
+ }
1204
+ if (has_imputer) {
1205
+ out["imp_ptr"] = Rcpp::unwindProtect(safe_XPtr<Imputer>, new_imputer_ptr.get());
1206
+ new_imputer_ptr.release();
1207
+ }
1208
+ if (has_indexer) {
1209
+ out["indexer"] = Rcpp::unwindProtect(safe_XPtr<TreesIndexer>, new_indexer_ptr.get());
1210
+ new_indexer_ptr.release();
1211
+ }
1212
+ return out;
1213
+ }
1214
+
1215
+ // [[Rcpp::export(rng = false)]]
1216
+ void inplace_set_to_zero(SEXP obj)
1217
+ {
1218
+ auto obj_type = TYPEOF(obj);
1219
+ switch(obj_type)
1220
+ {
1221
+ case REALSXP:
1222
+ {
1223
+ REAL(obj)[0] = 0;
1224
+ break;
1225
+ }
1226
+
1227
+ case INTSXP:
1228
+ {
1229
+ INTEGER(obj)[0] = 0;
1230
+ break;
1231
+ }
1232
+
1233
+ case LGLSXP:
1234
+ {
1235
+ LOGICAL(obj)[0] = 0;
1236
+ break;
1237
+ }
1238
+
1239
+ default:
1240
+ {
1241
+ Rcpp::stop("Model object has incorrect structure.\n");
1242
+ }
1243
+ }
1244
+ }
1245
+
1246
+ // [[Rcpp::export(rng = false)]]
718
1247
  Rcpp::List get_n_nodes(SEXP model_R_ptr, bool is_extended, int nthreads)
719
1248
  {
720
1249
  size_t ntrees;
@@ -734,9 +1263,9 @@ Rcpp::List get_n_nodes(SEXP model_R_ptr, bool is_extended, int nthreads)
734
1263
  Rcpp::IntegerVector n_nodes(ntrees);
735
1264
  Rcpp::IntegerVector n_terminal(ntrees);
736
1265
  if (is_extended)
737
- get_num_nodes(*ext_model_ptr, &n_nodes[0], &n_terminal[0], nthreads);
1266
+ get_num_nodes(*ext_model_ptr, INTEGER(n_nodes), INTEGER(n_terminal), nthreads);
738
1267
  else
739
- get_num_nodes(*model_ptr, &n_nodes[0], &n_terminal[0], nthreads);
1268
+ get_num_nodes(*model_ptr, INTEGER(n_nodes), INTEGER(n_terminal), nthreads);
740
1269
 
741
1270
  return Rcpp::List::create(
742
1271
  Rcpp::_["total"] = n_nodes,
@@ -744,25 +1273,56 @@ Rcpp::List get_n_nodes(SEXP model_R_ptr, bool is_extended, int nthreads)
744
1273
  );
745
1274
  }
746
1275
 
747
- // [[Rcpp::export]]
748
- Rcpp::List append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
749
- SEXP imp_R_ptr, SEXP oimp_R_ptr,
750
- bool is_extended)
1276
+ // [[Rcpp::export(rng = false)]]
1277
+ void append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
1278
+ SEXP imp_R_ptr, SEXP oimp_R_ptr,
1279
+ SEXP ind_R_ptr, SEXP oind_R_ptr,
1280
+ bool is_extended,
1281
+ Rcpp::RawVector serialized_obj,
1282
+ Rcpp::RawVector serialized_imputer,
1283
+ Rcpp::RawVector serialized_indexer,
1284
+ Rcpp::List &model_cpp_obj_update,
1285
+ Rcpp::List &model_params_update)
751
1286
  {
752
- Rcpp::List out;
1287
+ if ((!Rf_isNull(imp_R_ptr) && R_ExternalPtrAddr(imp_R_ptr) != NULL)
1288
+ &&
1289
+ !(!Rf_isNull(oimp_R_ptr) && R_ExternalPtrAddr(oimp_R_ptr) != NULL))
1290
+ {
1291
+ Rcpp::stop("Model to append trees to has imputer, but model to append from doesn't. Try dropping the imputer.\n");
1292
+ }
1293
+ if ((!Rf_isNull(ind_R_ptr) && R_ExternalPtrAddr(ind_R_ptr) != NULL)
1294
+ &&
1295
+ !(!Rf_isNull(oind_R_ptr) && R_ExternalPtrAddr(oind_R_ptr) != NULL))
1296
+ {
1297
+ Rcpp::stop("Model to append trees to has indexer, but model to append from doesn't. Try dropping the indexer.\n");
1298
+ }
1299
+
1300
+ Rcpp::List out = Rcpp::List::create(
1301
+ Rcpp::_["serialized"] = R_NilValue,
1302
+ Rcpp::_["imp_ser"] = R_NilValue,
1303
+ Rcpp::_["ind_ser"] = R_NilValue
1304
+ );
1305
+
1306
+ Rcpp::IntegerVector ntrees_new = Rcpp::IntegerVector::create(Rf_asInteger(model_params_update["ntrees"]));
1307
+
753
1308
  IsoForest* model_ptr = NULL;
754
1309
  IsoForest* other_ptr = NULL;
755
1310
  ExtIsoForest* ext_model_ptr = NULL;
756
1311
  ExtIsoForest* ext_other_ptr = NULL;
757
1312
  Imputer* imputer_ptr = NULL;
758
1313
  Imputer* oimputer_ptr = NULL;
1314
+ TreesIndexer* indexer_ptr = NULL;
1315
+ TreesIndexer* oindexer_ptr = NULL;
1316
+ size_t old_ntrees;
759
1317
 
760
1318
  if (is_extended) {
761
1319
  ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
762
1320
  ext_other_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(other_R_ptr));
1321
+ old_ntrees = ext_model_ptr->hplanes.size();
763
1322
  } else {
764
1323
  model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
765
1324
  other_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(other_R_ptr));
1325
+ old_ntrees = model_ptr->trees.size();
766
1326
  }
767
1327
 
768
1328
  if (!Rf_isNull(imp_R_ptr) && !Rf_isNull(oimp_R_ptr) &&
@@ -773,23 +1333,158 @@ Rcpp::List append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
773
1333
  oimputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(oimp_R_ptr));
774
1334
  }
775
1335
 
1336
+ if (!Rf_isNull(ind_R_ptr) && !Rf_isNull(oind_R_ptr) &&
1337
+ R_ExternalPtrAddr(ind_R_ptr) != NULL &&
1338
+ R_ExternalPtrAddr(oind_R_ptr) != NULL)
1339
+ {
1340
+ indexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(ind_R_ptr));
1341
+ oindexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(oind_R_ptr));
1342
+ }
1343
+
776
1344
  merge_models(model_ptr, other_ptr,
777
1345
  ext_model_ptr, ext_other_ptr,
778
- imputer_ptr, oimputer_ptr);
1346
+ imputer_ptr, oimputer_ptr,
1347
+ indexer_ptr, oindexer_ptr);
779
1348
 
1349
+ Rcpp::RawVector new_serialized, new_imp_serialized, new_ind_serialized;
1350
+ size_t new_size;
1351
+ try
1352
+ {
1353
+ if (!is_extended)
1354
+ {
1355
+ if (serialized_obj.size() &&
1356
+ check_can_undergo_incremental_serialization(*model_ptr, (char*)RAW(serialized_obj)))
1357
+ {
1358
+ try {
1359
+ new_size = serialized_obj.size()
1360
+ + determine_serialized_size_additional_trees(*model_ptr, old_ntrees);
1361
+ new_serialized = resize_vec(serialized_obj, new_size);
1362
+ char *temp = (char*)RAW(new_serialized);
1363
+ incremental_serialize_isotree(*model_ptr, temp);
1364
+ out["serialized"] = new_serialized;
1365
+ }
1366
+
1367
+ catch (std::runtime_error &e) {
1368
+ goto serialize_anew_singlevar;
1369
+ }
1370
+ }
1371
+
1372
+ else {
1373
+ serialize_anew_singlevar:
1374
+ out["serialized"] = serialize_cpp_obj(model_ptr);
1375
+ }
1376
+ }
780
1377
 
781
- if (is_extended)
782
- out["serialized"] = serialize_cpp_obj(ext_model_ptr);
783
- else
784
- out["serialized"] = serialize_cpp_obj(model_ptr);
1378
+ else
1379
+ {
1380
+ if (serialized_obj.size() &&
1381
+ check_can_undergo_incremental_serialization(*ext_model_ptr, (char*)RAW(serialized_obj)))
1382
+ {
1383
+ try {
1384
+ new_size = serialized_obj.size()
1385
+ + determine_serialized_size_additional_trees(*ext_model_ptr, old_ntrees);
1386
+ new_serialized = resize_vec(serialized_obj, new_size);
1387
+ char *temp = (char*)RAW(new_serialized);
1388
+ incremental_serialize_isotree(*ext_model_ptr, temp);
1389
+ out["serialized"] = new_serialized;
1390
+ }
1391
+
1392
+ catch (std::runtime_error &e) {
1393
+ goto serialize_anew_ext;
1394
+ }
1395
+ }
1396
+
1397
+ else {
1398
+ serialize_anew_ext:
1399
+ out["serialized"] = serialize_cpp_obj(ext_model_ptr);
1400
+ }
1401
+ }
785
1402
 
786
- if (imputer_ptr != NULL && oimputer_ptr != NULL)
787
- out["imp_ser"] = serialize_cpp_obj(imputer_ptr);
1403
+ if (imputer_ptr != NULL)
1404
+ {
1405
+ if (serialized_imputer.size() &&
1406
+ check_can_undergo_incremental_serialization(*imputer_ptr, (char*)RAW(serialized_imputer)))
1407
+ {
1408
+ try {
1409
+ new_size = serialized_obj.size()
1410
+ + determine_serialized_size_additional_trees(*imputer_ptr, old_ntrees);
1411
+ new_imp_serialized = resize_vec(serialized_imputer, new_size);
1412
+ char *temp = (char*)RAW(new_imp_serialized);
1413
+ incremental_serialize_isotree(*imputer_ptr, temp);
1414
+ out["imp_ser"] = new_imp_serialized;
1415
+ }
1416
+
1417
+ catch (std::runtime_error &e) {
1418
+ goto serialize_anew_imp;
1419
+ }
1420
+ }
1421
+
1422
+ else {
1423
+ serialize_anew_imp:
1424
+ out["imp_ser"] = serialize_cpp_obj(imputer_ptr);
1425
+ }
1426
+ }
788
1427
 
789
- return out;
1428
+ if (indexer_ptr != NULL)
1429
+ {
1430
+ if (serialized_indexer.size() &&
1431
+ check_can_undergo_incremental_serialization(*indexer_ptr, (char*)RAW(serialized_indexer)))
1432
+ {
1433
+ try {
1434
+ new_size = serialized_obj.size()
1435
+ + determine_serialized_size_additional_trees(*indexer_ptr, old_ntrees);
1436
+ new_ind_serialized = resize_vec(serialized_indexer, new_size);
1437
+ char *temp = (char*)RAW(new_ind_serialized);
1438
+ incremental_serialize_isotree(*indexer_ptr, temp);
1439
+ out["ind_ser"] = new_ind_serialized;
1440
+ }
1441
+
1442
+ catch (std::runtime_error &e) {
1443
+ goto serialize_anew_ind;
1444
+ }
1445
+ }
1446
+
1447
+ else {
1448
+ serialize_anew_ind:
1449
+ out["ind_ser"] = serialize_cpp_obj(indexer_ptr);
1450
+ }
1451
+ }
1452
+ }
1453
+
1454
+ catch (...)
1455
+ {
1456
+ if (!is_extended)
1457
+ model_ptr->trees.resize(old_ntrees);
1458
+ else
1459
+ ext_model_ptr->hplanes.resize(old_ntrees);
1460
+
1461
+ if (imputer_ptr != NULL)
1462
+ imputer_ptr->imputer_tree.resize(old_ntrees);
1463
+ if (indexer_ptr != NULL)
1464
+ indexer_ptr->indices.resize(old_ntrees);
1465
+ throw;
1466
+ }
1467
+
1468
+ model_cpp_obj_update["serialized"] = out["serialized"];
1469
+ if (imputer_ptr)
1470
+ model_cpp_obj_update["imp_ser"] = out["imp_ser"];
1471
+ if (indexer_ptr)
1472
+ model_cpp_obj_update["ind_ser"] = out["ind_ser"];
1473
+ *(INTEGER(ntrees_new)) = is_extended? ext_model_ptr->hplanes.size() : model_ptr->trees.size();
1474
+ model_params_update["ntrees"] = ntrees_new;
790
1475
  }
791
1476
 
792
- // [[Rcpp::export]]
1477
+ SEXP alloc_List(void *data)
1478
+ {
1479
+ return Rcpp::List(*(size_t*)data);
1480
+ }
1481
+
1482
+ SEXP safe_CastString(void *data)
1483
+ {
1484
+ return Rcpp::CharacterVector(*(std::string*)data);
1485
+ }
1486
+
1487
+ // [[Rcpp::export(rng = false)]]
793
1488
  Rcpp::ListOf<Rcpp::CharacterVector> model_to_sql(SEXP model_R_ptr, bool is_extended,
794
1489
  Rcpp::CharacterVector numeric_colanmes,
795
1490
  Rcpp::CharacterVector categ_colnames,
@@ -814,13 +1509,16 @@ Rcpp::ListOf<Rcpp::CharacterVector> model_to_sql(SEXP model_R_ptr, bool is_exten
814
1509
  categ_levels_cpp,
815
1510
  output_tree_num, true, single_tree, tree_num,
816
1511
  nthreads);
817
- Rcpp::List out(res.size());
1512
+ /* TODO: this function could create objects through the ALTREP system instead.
1513
+ That way, it would avoid an extra copy of the data */
1514
+ size_t sz = res.size();
1515
+ Rcpp::List out = Rcpp::unwindProtect(alloc_List, (void*)&sz);
818
1516
  for (size_t ix = 0; ix < res.size(); ix++)
819
- out[ix] = Rcpp::CharacterVector(res[ix]);
1517
+ out[ix] = Rcpp::unwindProtect(safe_CastString, &(res[ix]));
820
1518
  return out;
821
1519
  }
822
1520
 
823
- // [[Rcpp::export]]
1521
+ // [[Rcpp::export(rng = false)]]
824
1522
  Rcpp::CharacterVector model_to_sql_with_select_from(SEXP model_R_ptr, bool is_extended,
825
1523
  Rcpp::CharacterVector numeric_colanmes,
826
1524
  Rcpp::CharacterVector categ_colnames,
@@ -842,11 +1540,976 @@ Rcpp::CharacterVector model_to_sql_with_select_from(SEXP model_R_ptr, bool is_ex
842
1540
  std::string table_from_cpp = Rcpp::as<std::string>(table_from);
843
1541
  std::string select_as_cpp = Rcpp::as<std::string>(select_as);
844
1542
 
845
- return generate_sql_with_select_from(model_ptr, ext_model_ptr,
846
- table_from_cpp, select_as_cpp,
847
- numeric_colanmes_cpp, categ_colanmes_cpp,
848
- categ_levels_cpp,
849
- true, nthreads);
1543
+ std::string out = generate_sql_with_select_from(model_ptr, ext_model_ptr,
1544
+ table_from_cpp, select_as_cpp,
1545
+ numeric_colanmes_cpp, categ_colanmes_cpp,
1546
+ categ_levels_cpp,
1547
+ true, nthreads);
1548
+ /* TODO: this function could create objects through the ALTREP system instead.
1549
+ That way, it would avoid an extra copy of the data */
1550
+ return Rcpp::unwindProtect(safe_CastString, &out);
1551
+ }
1552
+
1553
+ // [[Rcpp::export(rng = false)]]
1554
+ Rcpp::List copy_cpp_objects(SEXP model_R_ptr, bool is_extended, SEXP imp_R_ptr, bool has_imputer, SEXP ind_R_ptr)
1555
+ {
1556
+ bool has_indexer = !Rf_isNull(ind_R_ptr) && R_ExternalPtrAddr(ind_R_ptr) != NULL;
1557
+
1558
+ Rcpp::List out = Rcpp::List::create(
1559
+ Rcpp::_["ptr"] = R_NilValue,
1560
+ Rcpp::_["imp_ptr"] = R_NilValue,
1561
+ Rcpp::_["indexer"] = R_NilValue
1562
+ );
1563
+
1564
+ IsoForest* model_ptr = NULL;
1565
+ ExtIsoForest* ext_model_ptr = NULL;
1566
+ Imputer* imputer_ptr = NULL;
1567
+ TreesIndexer* indexer_ptr = NULL;
1568
+ if (is_extended)
1569
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
1570
+ else
1571
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
1572
+ if (has_imputer)
1573
+ imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imp_R_ptr));
1574
+ if (has_indexer)
1575
+ indexer_ptr = static_cast<TreesIndexer*>(R_ExternalPtrAddr(ind_R_ptr));
1576
+
1577
+ std::unique_ptr<IsoForest> copy_model(new IsoForest());
1578
+ std::unique_ptr<ExtIsoForest> copy_ext_model(new ExtIsoForest());
1579
+ std::unique_ptr<Imputer> copy_imputer(new Imputer());
1580
+ std::unique_ptr<TreesIndexer> copy_indexer(new TreesIndexer());
1581
+
1582
+ if (model_ptr != NULL)
1583
+ *copy_model = *model_ptr;
1584
+ if (ext_model_ptr != NULL)
1585
+ *copy_ext_model = *ext_model_ptr;
1586
+ if (imputer_ptr != NULL)
1587
+ *copy_imputer = *imputer_ptr;
1588
+ if (indexer_ptr != NULL)
1589
+ *copy_indexer = *indexer_ptr;
1590
+
1591
+ if (is_extended) {
1592
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<ExtIsoForest>, copy_ext_model.get());
1593
+ copy_ext_model.release();
1594
+ }
1595
+ else {
1596
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<IsoForest>, copy_model.get());
1597
+ copy_model.release();
1598
+ }
1599
+ if (has_imputer) {
1600
+ out["imp_ptr"] = Rcpp::unwindProtect(safe_XPtr<Imputer>, copy_imputer.get());
1601
+ copy_imputer.release();
1602
+ }
1603
+ if (has_indexer) {
1604
+ out["indexer"] = Rcpp::unwindProtect(safe_XPtr<TreesIndexer>, copy_indexer.get());
1605
+ copy_indexer.release();
1606
+ }
1607
+ return out;
1608
+ }
1609
+
1610
+ // [[Rcpp::export(rng = false)]]
1611
+ void build_tree_indices(Rcpp::List lst_modify, bool is_extended, bool with_distances, int nthreads)
1612
+ {
1613
+ Rcpp::RawVector ind_ser = Rcpp::RawVector();
1614
+ Rcpp::List empty_lst = Rcpp::List::create(Rcpp::_["indexer"] = R_NilValue);
1615
+ std::unique_ptr<TreesIndexer> indexer(new TreesIndexer());
1616
+
1617
+ if (!is_extended) {
1618
+ build_tree_indices(*indexer,
1619
+ *static_cast<IsoForest*>(R_ExternalPtrAddr(lst_modify["ptr"])),
1620
+ nthreads,
1621
+ with_distances);
1622
+ }
1623
+ else {
1624
+ build_tree_indices(*indexer,
1625
+ *static_cast<ExtIsoForest*>(R_ExternalPtrAddr(lst_modify["ptr"])),
1626
+ nthreads,
1627
+ with_distances);
1628
+ }
1629
+
1630
+ ind_ser = serialize_cpp_obj(indexer.get());
1631
+ empty_lst["indexer"] = Rcpp::unwindProtect(safe_XPtr<TreesIndexer>, indexer.get());
1632
+ if (!Rf_isNull(lst_modify["indexer"])) {
1633
+ Rcpp::XPtr<TreesIndexer> indexer_R_ptr = lst_modify["indexer"];
1634
+ indexer_R_ptr.release();
1635
+ }
1636
+
1637
+ lst_modify["ind_ser"] = ind_ser;
1638
+ lst_modify["indexer"] = empty_lst["indexer"];
1639
+ indexer.release();
1640
+ }
1641
+
1642
+ // [[Rcpp::export(rng = false)]]
1643
+ bool check_node_indexer_has_distances(SEXP indexer_R_ptr)
1644
+ {
1645
+ if (Rf_isNull(indexer_R_ptr) || R_ExternalPtrAddr(indexer_R_ptr) == NULL)
1646
+ return false;
1647
+ TreesIndexer *indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
1648
+ if (indexer->indices.empty()) return false;
1649
+ return !indexer->indices.front().node_distances.empty();
1650
+ }
1651
+
1652
+ // [[Rcpp::export(rng = false)]]
1653
+ void set_reference_points(Rcpp::List lst_modify, Rcpp::List lst_modify2, SEXP rnames, bool is_extended,
1654
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
1655
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
1656
+ size_t nrows, int nthreads, bool with_distances)
1657
+ {
1658
+ Rcpp::RawVector ind_ser = Rcpp::RawVector();
1659
+ Rcpp::XPtr<TreesIndexer> indexer_R_ptr = lst_modify["indexer"];
1660
+
1661
+ double* numeric_data_ptr = NULL;
1662
+ int* categ_data_ptr = NULL;
1663
+ double* Xc_ptr = NULL;
1664
+ int* Xc_ind_ptr = NULL;
1665
+ int* Xc_indptr_ptr = NULL;
1666
+ Rcpp::NumericVector Xcpp;
1667
+
1668
+ if (X_num.size())
1669
+ {
1670
+ numeric_data_ptr = REAL(X_num);
1671
+ }
1672
+
1673
+ if (X_cat.size())
1674
+ {
1675
+ categ_data_ptr = INTEGER(X_cat);
1676
+ }
1677
+
1678
+ if (Xc_indptr.size())
1679
+ {
1680
+ Xc_ptr = REAL(Xc);
1681
+ Xc_ind_ptr = INTEGER(Xc_ind);
1682
+ Xc_indptr_ptr = INTEGER(Xc_indptr);
1683
+ }
1684
+
1685
+ IsoForest* model_ptr = NULL;
1686
+ ExtIsoForest* ext_model_ptr = NULL;
1687
+ TreesIndexer* indexer = NULL;
1688
+ if (is_extended)
1689
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(lst_modify["ptr"]));
1690
+ else
1691
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(lst_modify["ptr"]));
1692
+ indexer = indexer_R_ptr.get();
1693
+
1694
+ MissingAction missing_action = is_extended?
1695
+ ext_model_ptr->missing_action
1696
+ :
1697
+ model_ptr->missing_action;
1698
+ if (missing_action != Fail)
1699
+ {
1700
+ if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), Xcpp, nthreads);
1701
+ if (Xc.size()) Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
1702
+ }
1703
+
1704
+ std::unique_ptr<TreesIndexer> new_indexer(new TreesIndexer(*indexer));
1705
+
1706
+ set_reference_points(model_ptr, ext_model_ptr, new_indexer.get(),
1707
+ with_distances,
1708
+ numeric_data_ptr, categ_data_ptr,
1709
+ true, (size_t)0, (size_t)0,
1710
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
1711
+ (double*)NULL, (int*)NULL, (int*)NULL,
1712
+ nrows, nthreads);
1713
+
1714
+ ind_ser = serialize_cpp_obj(new_indexer.get());
1715
+ *indexer = std::move(*new_indexer);
1716
+ new_indexer.release();
1717
+ lst_modify["ind_ser"] = ind_ser;
1718
+ lst_modify2["reference_names"] = rnames;
1719
+ }
1720
+
1721
+ // [[Rcpp::export(rng = false)]]
1722
+ bool check_node_indexer_has_references(SEXP indexer_R_ptr)
1723
+ {
1724
+ if (Rf_isNull(indexer_R_ptr) || R_ExternalPtrAddr(indexer_R_ptr) == NULL)
1725
+ return false;
1726
+ TreesIndexer *indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
1727
+ if (indexer->indices.empty())
1728
+ return false;
1729
+ if (indexer->indices.front().reference_points.empty())
1730
+ return false;
1731
+ else
1732
+ return true;
1733
+ }
1734
+
1735
+ // [[Rcpp::export(rng = false)]]
1736
+ int get_num_references(SEXP indexer_R_ptr)
1737
+ {
1738
+ TreesIndexer *indexer = static_cast<TreesIndexer*>(R_ExternalPtrAddr(indexer_R_ptr));
1739
+ if (indexer == NULL || indexer->indices.empty()) return 0;
1740
+ return indexer->indices.front().reference_points.size();
1741
+ }
1742
+
1743
+ // [[Rcpp::export(rng = false)]]
1744
+ SEXP get_null_R_pointer()
1745
+ {
1746
+ return R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue);
1747
+ }
1748
+
1749
/* File paths may contain non-ASCII characters, so the way files get opened
   is selected here according to platform and compiler. Exactly one of
   USE_CODECVT / USE_RC_FOPEN / USE_SIMPLE_FOPEN ends up defined:
     - Windows with GCC >= 5        -> USE_CODECVT  (TAKE_AS_UTF8 is true)
     - other Windows, non-CRAN build -> USE_RC_FOPEN
     - everything else               -> USE_SIMPLE_FOPEN */
#if (defined(_WIN32) || defined(_WIN64)) && defined(__GNUC__) && (__GNUC__ >= 5)
#   define USE_CODECVT
#   define TAKE_AS_UTF8 true
#elif (defined(_WIN32) || defined(_WIN64)) && !defined(_FOR_CRAN)
#   define USE_RC_FOPEN
#   define TAKE_AS_UTF8 false
#else
#   define USE_SIMPLE_FOPEN
#   define TAKE_AS_UTF8 false
#endif
1767
+
1768
+ /* Now the actual implementations */
1769
#ifdef USE_CODECVT
/* https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t */
/*  */
#include <locale>
#include <codecvt>
#include <string>
/* Opens the file named by the first element of 'fname': the path is taken as
   UTF-8, converted (along with 'mode') to a wide string, and opened through
   '_wfopen'. */
FILE* R_fopen(Rcpp::CharacterVector fname, const char *mode)
{
    Rcpp::String path_utf8(fname[0], CE_UTF8);
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf8_to_wide;
    std::wstring wide_path = utf8_to_wide.from_bytes(path_utf8.get_cstring());
    std::wstring wide_mode = utf8_to_wide.from_bytes(std::string(mode));
    return _wfopen(wide_path.c_str(), wide_mode.c_str());
}
#endif
1785
+
1786
#ifdef USE_RC_FOPEN
/* Declared here with C linkage, matching the original declaration. */
extern "C" FILE *RC_fopen(const SEXP fn, const char *mode, const Rboolean expand);

/* Opens the file named by the first element of 'fname' through 'RC_fopen',
   passing FALSE for its 'expand' argument. */
FILE* R_fopen(Rcpp::CharacterVector fname, const char *mode)
{
    const SEXP path = fname[0];
    return RC_fopen(path, mode, FALSE);
}
#endif
1795
+
1796
#ifdef USE_SIMPLE_FOPEN
/* Opens the file named by the first element of 'fname' with plain 'fopen'. */
FILE* R_fopen(Rcpp::CharacterVector fname, const char *mode)
{
    const char *path = fname[0];
    return fopen(path, mode);
}
#endif
1802
+
1803
+ class FileOpener
1804
+ {
1805
+ public:
1806
+ FILE *handle = NULL;
1807
+ FileOpener(const SEXP fname, const char *mode)
1808
+ {
1809
+ if (this->handle != NULL)
1810
+ this->close_file();
1811
+ this->handle = R_fopen(fname, mode);
1812
+ }
1813
+ FILE *get_handle()
1814
+ {
1815
+ return this->handle;
1816
+ }
1817
+ void close_file()
1818
+ {
1819
+ if (this->handle != NULL) {
1820
+ fclose(this->handle);
1821
+ this->handle = NULL;
1822
+ }
1823
+ }
1824
+ ~FileOpener()
1825
+ {
1826
+ this->close_file();
1827
+ }
1828
+ };
1829
+
1830
+ // [[Rcpp::export]]
1831
+ void serialize_to_file
1832
+ (
1833
+ Rcpp::RawVector serialized_obj,
1834
+ Rcpp::RawVector serialized_imputer,
1835
+ Rcpp::RawVector serialized_indexer,
1836
+ bool is_extended,
1837
+ Rcpp::RawVector metadata,
1838
+ Rcpp::CharacterVector fname
1839
+ )
1840
+ {
1841
+ FileOpener file_(fname[0], "wb");
1842
+ FILE *output_file = file_.get_handle();
1843
+ serialize_combined(
1844
+ is_extended? nullptr : (char*)RAW(serialized_obj),
1845
+ is_extended? (char*)RAW(serialized_obj) : nullptr,
1846
+ serialized_imputer.size()? (char*)RAW(serialized_imputer) : nullptr,
1847
+ serialized_indexer.size()? (char*)RAW(serialized_indexer) : nullptr,
1848
+ metadata.size()? (char*)RAW(metadata) : nullptr,
1849
+ metadata.size(),
1850
+ output_file
1851
+ );
1852
+ }
1853
+
1854
+ // [[Rcpp::export]]
1855
+ Rcpp::List deserialize_from_file(Rcpp::CharacterVector fname)
1856
+ {
1857
+ Rcpp::List out = Rcpp::List::create(
1858
+ Rcpp::_["ptr"] = R_NilValue,
1859
+ Rcpp::_["serialized"] = R_NilValue,
1860
+ Rcpp::_["imp_ptr"] = R_NilValue,
1861
+ Rcpp::_["imp_ser"] = R_NilValue,
1862
+ Rcpp::_["indexer"] = R_NilValue,
1863
+ Rcpp::_["ind_ser"] = R_NilValue,
1864
+ Rcpp::_["metadata"] = R_NilValue
1865
+ );
1866
+
1867
+ FileOpener file_(fname[0], "rb");
1868
+ FILE *input_file = file_.get_handle();
1869
+
1870
+ bool is_isotree_model;
1871
+ bool is_compatible;
1872
+ bool has_combined_objects;
1873
+ bool has_IsoForest;
1874
+ bool has_ExtIsoForest;
1875
+ bool has_Imputer;
1876
+ bool has_Indexer;
1877
+ bool has_metadata;
1878
+ size_t size_metadata;
1879
+
1880
+ inspect_serialized_object(
1881
+ input_file,
1882
+ is_isotree_model,
1883
+ is_compatible,
1884
+ has_combined_objects,
1885
+ has_IsoForest,
1886
+ has_ExtIsoForest,
1887
+ has_Imputer,
1888
+ has_Indexer,
1889
+ has_metadata,
1890
+ size_metadata
1891
+ );
1892
+
1893
+ if (!is_isotree_model || !has_combined_objects)
1894
+ Rcpp::stop("Input file is not a serialized isotree model.\n");
1895
+ if (!is_compatible)
1896
+ Rcpp::stop("Model file format is incompatible.\n");
1897
+ if (!size_metadata)
1898
+ Rcpp::stop("Input file does not contain metadata.\n");
1899
+
1900
+ out["metadata"] = Rcpp::unwindProtect(alloc_RawVec, (void*)&size_metadata);
1901
+
1902
+ std::unique_ptr<IsoForest> model(new IsoForest());
1903
+ std::unique_ptr<ExtIsoForest> model_ext(new ExtIsoForest());
1904
+ std::unique_ptr<Imputer> imputer(new Imputer());
1905
+ std::unique_ptr<TreesIndexer> indexer(new TreesIndexer());
1906
+
1907
+ IsoForest *ptr_model = NULL;
1908
+ ExtIsoForest *ptr_model_ext = NULL;
1909
+ Imputer *ptr_imputer = NULL;
1910
+ TreesIndexer *ptr_indexer = NULL;
1911
+ char *ptr_metadata = (char*)RAW(out["metadata"]);
1912
+
1913
+ if (has_IsoForest)
1914
+ ptr_model = model.get();
1915
+ if (has_ExtIsoForest)
1916
+ ptr_model_ext = model_ext.get();
1917
+ if (has_Imputer)
1918
+ ptr_imputer = imputer.get();
1919
+ if (has_Indexer)
1920
+ ptr_indexer = indexer.get();
1921
+
1922
+ deserialize_combined(
1923
+ input_file,
1924
+ ptr_model,
1925
+ ptr_model_ext,
1926
+ ptr_imputer,
1927
+ ptr_indexer,
1928
+ ptr_metadata
1929
+ );
1930
+
1931
+ if (has_IsoForest)
1932
+ out["serialized"] = serialize_cpp_obj(model.get());
1933
+ else
1934
+ out["serialized"] = serialize_cpp_obj(model_ext.get());
1935
+ if (has_Imputer)
1936
+ out["imp_ser"] = serialize_cpp_obj(imputer.get());
1937
+ if (has_Indexer)
1938
+ out["ind_ser"] = serialize_cpp_obj(indexer.get());
1939
+
1940
+ if (has_IsoForest) {
1941
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<IsoForest>, model.get());
1942
+ model.release();
1943
+ }
1944
+ else {
1945
+ out["ptr"] = Rcpp::unwindProtect(safe_XPtr<ExtIsoForest>, model_ext.get());
1946
+ model_ext.release();
1947
+ }
1948
+ if (has_Imputer) {
1949
+ out["imp_ptr"] = Rcpp::unwindProtect(safe_XPtr<Imputer>, imputer.get());
1950
+ imputer.release();
1951
+ }
1952
+ if (has_Indexer) {
1953
+ out["indexer"] = Rcpp::unwindProtect(safe_XPtr<TreesIndexer>, indexer.get());
1954
+ indexer.release();
1955
+ }
1956
+
1957
+ return out;
1958
+ }
1959
+
1960
+ /* The functions below make up for missing functionality in the
1961
+ 'Matrix' and 'SparseM' packages for sub-setting the data */
1962
+
1963
+ // [[Rcpp::export(rng = false)]]
1964
+ void call_sort_csc_indices(Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr)
1965
+ {
1966
+ size_t ncols_numeric = Xc_indptr.size() - 1;
1967
+ sort_csc_indices(REAL(Xc), INTEGER(Xc_ind), INTEGER(Xc_indptr), ncols_numeric);
1968
+ }
1969
+
1970
+ // [[Rcpp::export(rng = false)]]
1971
+ void call_reconstruct_csr_sliced
1972
+ (
1973
+ Rcpp::NumericVector orig_Xr, Rcpp::IntegerVector orig_Xr_indptr,
1974
+ Rcpp::NumericVector rec_Xr, Rcpp::IntegerVector rec_Xr_indptr,
1975
+ size_t nrows
1976
+ )
1977
+ {
1978
+ reconstruct_csr_sliced<double, int>(
1979
+ REAL(orig_Xr), INTEGER(orig_Xr_indptr),
1980
+ REAL(rec_Xr), INTEGER(rec_Xr_indptr),
1981
+ nrows
1982
+ );
1983
+ }
1984
+
1985
+ // [[Rcpp::export(rng = false)]]
1986
+ void call_reconstruct_csr_with_categ
1987
+ (
1988
+ Rcpp::NumericVector orig_Xr, Rcpp::IntegerVector orig_Xr_ind, Rcpp::IntegerVector orig_Xr_indptr,
1989
+ Rcpp::NumericVector rec_Xr, Rcpp::IntegerVector rec_Xr_ind, Rcpp::IntegerVector rec_Xr_indptr,
1990
+ Rcpp::IntegerVector rec_X_cat,
1991
+ Rcpp::IntegerVector cols_numeric, Rcpp::IntegerVector cols_categ,
1992
+ size_t nrows, size_t ncols
1993
+ )
1994
+ {
1995
+ reconstruct_csr_with_categ<double, int, int>(
1996
+ REAL(orig_Xr), INTEGER(orig_Xr_ind), INTEGER(orig_Xr_indptr),
1997
+ REAL(rec_Xr), INTEGER(rec_Xr_ind), INTEGER(rec_Xr_indptr),
1998
+ INTEGER(rec_X_cat), true,
1999
+ INTEGER(cols_numeric), INTEGER(cols_categ),
2000
+ nrows, ncols, cols_numeric.size(), cols_categ.size()
2001
+ );
2002
+ }
2003
+
2004
+ // [[Rcpp::export(rng = false)]]
2005
+ Rcpp::NumericVector deepcopy_vector(Rcpp::NumericVector inp)
2006
+ {
2007
+ return Rcpp::NumericVector(inp.begin(), inp.end());
2008
+ }
2009
+
2010
+ Rcpp::IntegerMatrix csc_to_dense_int
2011
+ (
2012
+ Rcpp::NumericVector Xc,
2013
+ Rcpp::IntegerVector Xc_ind,
2014
+ Rcpp::IntegerVector Xc_indptr,
2015
+ size_t nrows
2016
+ )
2017
+ {
2018
+ size_t ncols = Xc_indptr.size() - 1;
2019
+ Rcpp::IntegerMatrix out_(nrows, ncols);
2020
+ int *restrict out = INTEGER(out_);
2021
+ for (size_t col = 0; col < ncols; col++)
2022
+ {
2023
+ for (auto ix = Xc_indptr[col]; ix < Xc_indptr[col+1]; ix++)
2024
+ out[(size_t)Xc_ind[ix] + col*nrows]
2025
+ =
2026
+ (Xc[ix] >= 0 && !ISNAN(Xc[ix]))?
2027
+ (int)Xc[ix] : (int)(-1);
2028
+ }
2029
+ return out_;
2030
+ }
2031
+
2032
+ template <class real_vec, class int_vec>
2033
+ Rcpp::IntegerMatrix csr_to_dense_int
2034
+ (
2035
+ real_vec Xr,
2036
+ int_vec Xr_ind,
2037
+ int_vec Xr_indptr,
2038
+ int ncols
2039
+ )
2040
+ {
2041
+ size_t nrows = Xr_indptr.size() - 1;
2042
+ size_t matrix_dims[] = {nrows, (size_t)ncols};
2043
+ Rcpp::IntegerMatrix out_ = Rcpp::unwindProtect(safe_int_matrix, (void*)matrix_dims);
2044
+ int *restrict out = INTEGER(out_);
2045
+ for (size_t row = 0; row < nrows; row++)
2046
+ {
2047
+ for (auto ix = Xr_indptr[row]; ix < Xr_indptr[row+1]; ix++)
2048
+ out[row + (size_t)Xr_ind[ix]*nrows]
2049
+ =
2050
+ (Xr[ix] >= 0 && !ISNAN(Xr[ix]))?
2051
+ (int)Xr[ix] : (int)(-1);
2052
+ }
2053
+ return out_;
2054
+ }
2055
+
2056
+ // [[Rcpp::export(rng = false)]]
2057
+ Rcpp::List call_take_cols_by_slice_csr
2058
+ (
2059
+ Rcpp::NumericVector Xr_,
2060
+ Rcpp::IntegerVector Xr_ind_,
2061
+ Rcpp::IntegerVector Xr_indptr,
2062
+ int ncols_take,
2063
+ bool as_dense
2064
+ )
2065
+ {
2066
+ /* Indices need to be sorted beforehand */
2067
+ double *restrict Xr = REAL(Xr_);
2068
+ int *restrict Xr_ind = INTEGER(Xr_ind_);
2069
+ size_t nrows = Xr_indptr.size() - 1;
2070
+ Rcpp::IntegerVector out_Xr_indptr(nrows+1);
2071
+ out_Xr_indptr[0] = 0;
2072
+ size_t total_size = 0;
2073
+ for (size_t row = 0; row < nrows; row++)
2074
+ {
2075
+ for (auto col = Xr_indptr[row]; col < Xr_indptr[row+1]; col++)
2076
+ total_size += Xr_ind[col] < ncols_take;
2077
+ out_Xr_indptr[row+1] = total_size;
2078
+ }
2079
+
2080
+ Rcpp::NumericVector out_Xr_(total_size);
2081
+ Rcpp::IntegerVector out_Xr_ind_(total_size);
2082
+ double *restrict out_Xr = REAL(out_Xr_);
2083
+ int *restrict out_Xr_ind = INTEGER(out_Xr_ind_);
2084
+
2085
+ size_t n_this;
2086
+ for (size_t row = 0; row < nrows; row++)
2087
+ {
2088
+ n_this = out_Xr_indptr[row+1] - out_Xr_indptr[row];
2089
+ if (n_this) {
2090
+ std::copy(Xr + Xr_indptr[row],
2091
+ Xr + Xr_indptr[row] + n_this,
2092
+ out_Xr + out_Xr_indptr[row]);
2093
+ std::copy(Xr_ind + Xr_indptr[row],
2094
+ Xr_ind + Xr_indptr[row] + n_this,
2095
+ out_Xr_ind + out_Xr_indptr[row]);
2096
+ }
2097
+ }
2098
+
2099
+ if (!as_dense)
2100
+ return Rcpp::List::create(
2101
+ Rcpp::_["Xr"] = out_Xr_,
2102
+ Rcpp::_["Xr_ind"] = out_Xr_ind_,
2103
+ Rcpp::_["Xr_indptr"] = out_Xr_indptr
2104
+ );
2105
+ else
2106
+ return Rcpp::List::create(
2107
+ Rcpp::_["X_cat"] = csr_to_dense_int(out_Xr_,
2108
+ out_Xr_ind_,
2109
+ out_Xr_indptr,
2110
+ ncols_take)
2111
+ );
2112
+ }
2113
+
2114
+ // [[Rcpp::export(rng = false)]]
2115
+ Rcpp::List call_take_cols_by_index_csr
2116
+ (
2117
+ Rcpp::NumericVector Xr,
2118
+ Rcpp::IntegerVector Xr_ind,
2119
+ Rcpp::IntegerVector Xr_indptr,
2120
+ Rcpp::IntegerVector cols_take,
2121
+ bool as_dense
2122
+ )
2123
+ {
2124
+ Rcpp::List out;
2125
+ if (!as_dense) {
2126
+ out = Rcpp::List::create(
2127
+ Rcpp::_["Xr"] = R_NilValue,
2128
+ Rcpp::_["Xr_ind"] = R_NilValue,
2129
+ Rcpp::_["Xr_indptr"] = R_NilValue
2130
+ );
2131
+ }
2132
+ else {
2133
+ out = Rcpp::List::create(
2134
+ Rcpp::_["X_cat"] = R_NilValue
2135
+ );
2136
+ }
2137
+
2138
+
2139
+ /* 'cols_take' should be sorted */
2140
+ int n_take = cols_take.size();
2141
+ int nrows = Xr_indptr.size() - 1;
2142
+ std::vector<double> out_Xr;
2143
+ std::vector<int> out_Xr_ind;
2144
+ std::vector<int> out_Xr_indptr(nrows + 1);
2145
+
2146
+ int *curr_ptr;
2147
+ int *end_ptr;
2148
+ int *restrict ptr_Xr_ind = INTEGER(Xr_ind);
2149
+ int *restrict ptr_cols_take = INTEGER(cols_take);
2150
+ int *restrict ptr_cols_take_end = ptr_cols_take + n_take;
2151
+ int curr_col;
2152
+ int *search_res;
2153
+
2154
+ for (int row = 0; row < nrows; row++)
2155
+ {
2156
+ curr_ptr = ptr_Xr_ind + Xr_indptr[row];
2157
+ end_ptr = ptr_Xr_ind + Xr_indptr[row+1];
2158
+ curr_col = 0;
2159
+
2160
+ if (end_ptr == curr_ptr + 1)
2161
+ {
2162
+ search_res = std::lower_bound(ptr_cols_take, ptr_cols_take_end, *curr_ptr);
2163
+ curr_col = std::distance(ptr_cols_take, search_res);
2164
+ if (curr_col < n_take && *search_res == *curr_ptr)
2165
+ {
2166
+ out_Xr.push_back(Xr[std::distance(ptr_Xr_ind, curr_ptr)]);
2167
+ out_Xr_ind.push_back(curr_col);
2168
+ }
2169
+ }
2170
+
2171
+ else
2172
+ if (end_ptr > curr_ptr)
2173
+ {
2174
+ while (true)
2175
+ {
2176
+ curr_ptr = std::lower_bound(curr_ptr, end_ptr, ptr_cols_take[curr_col]);
2177
+
2178
+ if (curr_ptr >= end_ptr)
2179
+ {
2180
+ break;
2181
+ }
2182
+
2183
+
2184
+ else if (*curr_ptr == ptr_cols_take[curr_col])
2185
+ {
2186
+ out_Xr.push_back(Xr[std::distance(ptr_Xr_ind, curr_ptr)]);
2187
+ out_Xr_ind.push_back(curr_col);
2188
+ curr_ptr++;
2189
+ curr_col++;
2190
+
2191
+ if (curr_ptr >= end_ptr || curr_col >= n_take)
2192
+ break;
2193
+ }
2194
+
2195
+
2196
+ else
2197
+ {
2198
+ curr_col = std::distance(
2199
+ ptr_cols_take,
2200
+ std::lower_bound(ptr_cols_take + curr_col, ptr_cols_take_end, *curr_ptr)
2201
+ );
2202
+
2203
+ if (curr_col >= n_take)
2204
+ break;
2205
+
2206
+ if (curr_col == *curr_ptr) {
2207
+ out_Xr.push_back(Xr[std::distance(ptr_Xr_ind, curr_ptr)]);
2208
+ out_Xr_ind.push_back(curr_col);
2209
+ curr_ptr++;
2210
+ curr_col++;
2211
+ }
2212
+
2213
+ if (curr_ptr >= end_ptr || curr_col >= n_take)
2214
+ break;
2215
+ }
2216
+ }
2217
+ }
2218
+
2219
+ out_Xr_indptr[row+1] = out_Xr.size();
2220
+ }
2221
+
2222
+ if (!as_dense)
2223
+ {
2224
+ out["Xr"] = Rcpp::unwindProtect(safe_copy_vec, (void*)&out_Xr);
2225
+ out["Xr_ind"] = Rcpp::unwindProtect(safe_copy_intvec, (void*)&out_Xr_ind);
2226
+ out["Xr_indptr"] = Rcpp::unwindProtect(safe_copy_intvec, (void*)&out_Xr_indptr);
2227
+ }
2228
+
2229
+ else
2230
+ {
2231
+ out["X_cat"] = csr_to_dense_int(out_Xr,
2232
+ out_Xr_ind,
2233
+ out_Xr_indptr,
2234
+ n_take);
2235
+ }
2236
+
2237
+ return out;
2238
+ }
2239
+
2240
+ // [[Rcpp::export(rng = false)]]
2241
+ Rcpp::List call_take_cols_by_slice_csc
2242
+ (
2243
+ Rcpp::NumericVector Xc,
2244
+ Rcpp::IntegerVector Xc_ind,
2245
+ Rcpp::IntegerVector Xc_indptr,
2246
+ size_t ncols_take,
2247
+ bool as_dense, size_t nrows
2248
+ )
2249
+ {
2250
+ Rcpp::IntegerVector out_Xc_indptr(ncols_take+1);
2251
+ size_t total_size = Xc_indptr[ncols_take+1];
2252
+ Rcpp::NumericVector out_Xc(REAL(Xc), REAL(Xc) + total_size);
2253
+ Rcpp::IntegerVector out_Xc_ind(INTEGER(Xc_ind), INTEGER(Xc_ind) + total_size);
2254
+
2255
+ if (!as_dense)
2256
+ return Rcpp::List::create(
2257
+ Rcpp::_["Xc"] = out_Xc,
2258
+ Rcpp::_["Xc_ind"] = out_Xc_ind,
2259
+ Rcpp::_["Xc_indptr"] = out_Xc_indptr
2260
+ );
2261
+ else
2262
+ return Rcpp::List::create(
2263
+ Rcpp::_["X_cat"] = csc_to_dense_int(out_Xc,
2264
+ out_Xc_ind,
2265
+ out_Xc_indptr,
2266
+ nrows)
2267
+ );
2268
+ }
2269
+
2270
+ // [[Rcpp::export(rng = false)]]
2271
+ Rcpp::List call_take_cols_by_index_csc
2272
+ (
2273
+ Rcpp::NumericVector Xc_,
2274
+ Rcpp::IntegerVector Xc_ind_,
2275
+ Rcpp::IntegerVector Xc_indptr,
2276
+ Rcpp::IntegerVector cols_take,
2277
+ bool as_dense, size_t nrows
2278
+ )
2279
+ {
2280
+ /* 'cols_take' should be sorted */
2281
+ double *restrict Xc = REAL(Xc_);
2282
+ int *restrict Xc_ind = INTEGER(Xc_ind_);
2283
+ size_t n_take = cols_take.size();
2284
+ Rcpp::IntegerVector out_Xc_indptr(n_take+1);
2285
+ size_t total_size = 0;
2286
+
2287
+ for (size_t col = 0; col < n_take; col++)
2288
+ total_size += Xc_indptr[cols_take[col]+1] - Xc_indptr[cols_take[col]];
2289
+
2290
+ Rcpp::NumericVector out_Xc_(total_size);
2291
+ Rcpp::IntegerVector out_Xc_ind_(total_size);
2292
+ double *restrict out_Xc = REAL(out_Xc_);
2293
+ int *restrict out_Xc_ind = INTEGER(out_Xc_ind_);
2294
+
2295
+ total_size = 0;
2296
+ size_t n_this;
2297
+ out_Xc_indptr[0] = 0;
2298
+ for (size_t col = 0; col < n_take; col++)
2299
+ {
2300
+ n_this = Xc_indptr[cols_take[col]+1] - Xc_indptr[cols_take[col]];
2301
+ if (n_this) {
2302
+ std::copy(Xc + Xc_indptr[cols_take[col]],
2303
+ Xc + Xc_indptr[cols_take[col]] + n_this,
2304
+ out_Xc + total_size);
2305
+ std::copy(Xc_ind + Xc_indptr[cols_take[col]],
2306
+ Xc_ind + Xc_indptr[cols_take[col]] + n_this,
2307
+ out_Xc_ind + total_size);
2308
+ }
2309
+ total_size += n_this;
2310
+ out_Xc_indptr[col+1] = total_size;
2311
+ }
2312
+
2313
+ if (!as_dense)
2314
+ return Rcpp::List::create(
2315
+ Rcpp::_["Xc"] = out_Xc_,
2316
+ Rcpp::_["Xc_ind"] = out_Xc_ind_,
2317
+ Rcpp::_["Xc_indptr"] = out_Xc_indptr
2318
+ );
2319
+ else
2320
+ return Rcpp::List::create(
2321
+ Rcpp::_["X_cat"] = csc_to_dense_int(out_Xc_,
2322
+ out_Xc_ind_,
2323
+ out_Xc_indptr,
2324
+ nrows)
2325
+ );
2326
+ }
2327
+
2328
+ // [[Rcpp::export(rng = false)]]
2329
+ void copy_csc_cols_by_slice
2330
+ (
2331
+ Rcpp::NumericVector out_Xc_,
2332
+ Rcpp::IntegerVector out_Xc_indptr,
2333
+ Rcpp::NumericVector from_Xc_,
2334
+ Rcpp::IntegerVector from_Xc_indptr,
2335
+ size_t n_copy
2336
+ )
2337
+ {
2338
+ size_t total_size = from_Xc_indptr[n_copy+1];
2339
+ std::copy(REAL(from_Xc_), REAL(from_Xc_) + total_size, REAL(out_Xc_));
2340
+ }
2341
+
2342
+ // [[Rcpp::export(rng = false)]]
2343
+ void copy_csc_cols_by_index
2344
+ (
2345
+ Rcpp::NumericVector out_Xc_,
2346
+ Rcpp::IntegerVector out_Xc_indptr,
2347
+ Rcpp::NumericVector from_Xc_,
2348
+ Rcpp::IntegerVector from_Xc_indptr,
2349
+ Rcpp::IntegerVector cols_copy
2350
+ )
2351
+ {
2352
+ size_t n_copy = cols_copy.size();
2353
+ double *restrict out_Xc = REAL(out_Xc_);
2354
+ double *restrict from_Xc = REAL(from_Xc_);
2355
+
2356
+ for (size_t col = 0; col < n_copy; col++)
2357
+ {
2358
+ std::copy(from_Xc + from_Xc_indptr[col],
2359
+ from_Xc + from_Xc_indptr[col+1],
2360
+ out_Xc + out_Xc_indptr[cols_copy[col]]);
2361
+ }
2362
+ }
2363
+
2364
+
2365
+ // [[Rcpp::export(rng = false)]]
2366
+ Rcpp::List assign_csc_cols
2367
+ (
2368
+ Rcpp::NumericVector Xc_,
2369
+ Rcpp::IntegerVector Xc_ind_,
2370
+ Rcpp::IntegerVector Xc_indptr,
2371
+ Rcpp::IntegerVector X_cat_,
2372
+ Rcpp::IntegerVector cols_categ,
2373
+ Rcpp::IntegerVector cols_numeric,
2374
+ size_t nrows
2375
+ )
2376
+ {
2377
+ Rcpp::List out = Rcpp::List::create(
2378
+ Rcpp::_["Xc"] = R_NilValue,
2379
+ Rcpp::_["Xc_ind"] = R_NilValue,
2380
+ Rcpp::_["Xc_indptr"] = R_NilValue
2381
+ );
2382
+ size_t ncols_tot = (size_t)cols_categ.size() + (size_t)cols_numeric.size();
2383
+ std::vector<double> out_Xc;
2384
+ std::vector<int> out_Xc_ind;
2385
+ std::vector<int> out_Xc_indptr(ncols_tot + 1);
2386
+
2387
+ double *restrict Xc = REAL(Xc_);
2388
+ int *restrict Xc_ind = INTEGER(Xc_ind_);
2389
+ int *restrict X_cat = INTEGER(X_cat_);
2390
+
2391
+ hashed_set<int> cols_categ_set(INTEGER(cols_categ), INTEGER(cols_categ) + cols_categ.size());
2392
+ hashed_set<int> cols_numeric_set(INTEGER(cols_numeric), INTEGER(cols_numeric) + cols_numeric.size());
2393
+
2394
+ size_t curr_num = 0;
2395
+ size_t curr_cat = 0;
2396
+ bool has_zeros;
2397
+ size_t curr_size;
2398
+
2399
+ for (size_t col = 0; col < ncols_tot; col++)
2400
+ {
2401
+ if (is_in_set((int)col, cols_numeric_set))
2402
+ {
2403
+ std::copy(Xc + Xc_indptr[curr_num],
2404
+ Xc + Xc_indptr[curr_num+1],
2405
+ std::back_inserter(out_Xc));
2406
+ std::copy(Xc_ind + Xc_indptr[curr_num],
2407
+ Xc_ind + Xc_indptr[curr_num+1],
2408
+ std::back_inserter(out_Xc_ind));
2409
+ curr_num++;
2410
+ }
2411
+
2412
+ else if (is_in_set((int)col, cols_categ_set))
2413
+ {
2414
+ has_zeros = false;
2415
+ for (size_t row = 0; row < nrows; row++)
2416
+ if (X_cat[row + (size_t)curr_cat*nrows] == 0)
2417
+ has_zeros = true;
2418
+
2419
+ if (!has_zeros) {
2420
+ std::copy(X_cat + (size_t)curr_cat*nrows,
2421
+ X_cat + ((size_t)curr_cat+1)*nrows,
2422
+ std::back_inserter(out_Xc));
2423
+ curr_size = out_Xc_ind.size();
2424
+ out_Xc_ind.resize(curr_size + (size_t)nrows);
2425
+ std::iota(out_Xc_ind.begin() + curr_size, out_Xc_ind.end(), (int)0);
2426
+ }
2427
+
2428
+ else {
2429
+ for (size_t row = 0; row < nrows; row++) {
2430
+ if (X_cat[row + (size_t)curr_cat*nrows] > 0) {
2431
+ out_Xc.push_back(X_cat[row + (size_t)curr_cat*nrows]);
2432
+ out_Xc_ind.push_back((int)row);
2433
+ }
2434
+ }
2435
+ }
2436
+
2437
+ curr_cat++;
2438
+ }
2439
+
2440
+ out_Xc_indptr[col+1] = out_Xc.size();
2441
+ }
2442
+
2443
+
2444
+ out["Xc"] = Rcpp::unwindProtect(safe_copy_vec, (void*)&out_Xc);
2445
+ out["Xc_ind"] = Rcpp::unwindProtect(safe_copy_intvec, (void*)&out_Xc_ind);
2446
+ out["Xc_indptr"] = Rcpp::unwindProtect(safe_copy_intvec, (void*)&out_Xc_indptr);
2447
+ return out;
2448
+ }
2449
+
2450
+ /* These are helpers for dealing with large integers and R's copy-on-write semantics */
2451
+
2452
+ // [[Rcpp::export(rng = false)]]
2453
+ Rcpp::NumericVector get_empty_tmat(int nrows_)
2454
+ {
2455
+ size_t nrows = (size_t)nrows_;
2456
+ size_t tmat_size = (nrows * (nrows - (size_t)1)) / (size_t)2;
2457
+ return Rcpp::NumericVector((R_xlen_t)tmat_size);
2458
+ }
2459
+
2460
+ // [[Rcpp::export(rng = false)]]
2461
+ Rcpp::IntegerMatrix get_empty_int_mat(int nrows, int ncols)
2462
+ {
2463
+ return Rcpp::IntegerMatrix(nrows, ncols);
2464
+ }
2465
+
2466
+ // [[Rcpp::export(rng = false)]]
2467
+ Rcpp::IntegerMatrix get_null_int_mat()
2468
+ {
2469
+ return Rcpp::IntegerMatrix(0, 0);
2470
+ }
2471
+
2472
+ // [[Rcpp::export(rng = false)]]
2473
+ int get_ntrees(SEXP model_R_ptr, bool is_extended)
2474
+ {
2475
+ if (is_extended) {
2476
+ ExtIsoForest* ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
2477
+ return ext_model_ptr->hplanes.size();
2478
+ }
2479
+
2480
+ else {
2481
+ IsoForest* model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
2482
+ return model_ptr->trees.size();
2483
+ }
2484
+ }
2485
+
2486
+ // [[Rcpp::export(rng = false)]]
2487
+ SEXP deepcopy_int(SEXP x)
2488
+ {
2489
+ return Rf_ScalarInteger(Rf_asInteger(x));
2490
+ }
2491
+
2492
+ // [[Rcpp::export(rng = false)]]
2493
+ void modify_R_list_inplace(SEXP lst, int ix, SEXP el)
2494
+ {
2495
+ SET_VECTOR_ELT(lst, ix, el);
2496
+ }
2497
+
2498
+ // [[Rcpp::export(rng = false)]]
2499
+ void addto_R_list_inplace(Rcpp::List &lst, Rcpp::String nm, SEXP el)
2500
+ {
2501
+ lst[nm] = el;
2502
+ }
2503
+
2504
+
2505
/* Reports whether this translation unit was compiled with OpenMP enabled,
   i.e. whether the '_OPENMP' macro was defined at compile time. */
// [[Rcpp::export(rng = false)]]
bool R_has_openmp()
{
#if defined(_OPENMP)
    const bool built_with_openmp = true;
#else
    const bool built_with_openmp = false;
#endif
    return built_with_openmp;
}
851
2514
 
852
2515
  #endif /* _FOR_R */