isotree 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (118)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/LICENSE.txt +2 -2
  4. data/README.md +22 -1
  5. data/ext/isotree/ext.cpp +26 -0
  6. data/ext/isotree/extconf.rb +3 -3
  7. data/lib/isotree.rb +1 -0
  8. data/lib/isotree/isolation_forest.rb +86 -1
  9. data/lib/isotree/version.rb +1 -1
  10. data/vendor/cereal/LICENSE +24 -0
  11. data/vendor/cereal/README.md +85 -0
  12. data/vendor/cereal/include/cereal/access.hpp +351 -0
  13. data/vendor/cereal/include/cereal/archives/adapters.hpp +163 -0
  14. data/vendor/cereal/include/cereal/archives/binary.hpp +169 -0
  15. data/vendor/cereal/include/cereal/archives/json.hpp +1019 -0
  16. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +334 -0
  17. data/vendor/cereal/include/cereal/archives/xml.hpp +956 -0
  18. data/vendor/cereal/include/cereal/cereal.hpp +1089 -0
  19. data/vendor/cereal/include/cereal/details/helpers.hpp +422 -0
  20. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +796 -0
  21. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +65 -0
  22. data/vendor/cereal/include/cereal/details/static_object.hpp +127 -0
  23. data/vendor/cereal/include/cereal/details/traits.hpp +1411 -0
  24. data/vendor/cereal/include/cereal/details/util.hpp +84 -0
  25. data/vendor/cereal/include/cereal/external/base64.hpp +134 -0
  26. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +284 -0
  27. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +78 -0
  28. data/vendor/cereal/include/cereal/external/rapidjson/document.h +2652 -0
  29. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +299 -0
  30. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +716 -0
  31. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +74 -0
  32. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +161 -0
  33. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +99 -0
  34. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +104 -0
  35. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +151 -0
  36. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +290 -0
  37. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +271 -0
  38. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +245 -0
  39. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +78 -0
  40. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +308 -0
  41. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +186 -0
  42. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +55 -0
  43. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +740 -0
  44. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +232 -0
  45. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +69 -0
  46. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +290 -0
  47. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +46 -0
  48. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +128 -0
  49. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +70 -0
  50. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +71 -0
  51. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +316 -0
  52. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +300 -0
  53. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +81 -0
  54. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +1414 -0
  55. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +277 -0
  56. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +656 -0
  57. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +2230 -0
  58. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +2497 -0
  59. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +223 -0
  60. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +121 -0
  61. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +709 -0
  62. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +52 -0
  63. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +406 -0
  64. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +2624 -0
  65. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +175 -0
  66. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +428 -0
  67. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +123 -0
  68. data/vendor/cereal/include/cereal/macros.hpp +154 -0
  69. data/vendor/cereal/include/cereal/specialize.hpp +139 -0
  70. data/vendor/cereal/include/cereal/types/array.hpp +79 -0
  71. data/vendor/cereal/include/cereal/types/atomic.hpp +55 -0
  72. data/vendor/cereal/include/cereal/types/base_class.hpp +203 -0
  73. data/vendor/cereal/include/cereal/types/bitset.hpp +176 -0
  74. data/vendor/cereal/include/cereal/types/boost_variant.hpp +164 -0
  75. data/vendor/cereal/include/cereal/types/chrono.hpp +72 -0
  76. data/vendor/cereal/include/cereal/types/common.hpp +129 -0
  77. data/vendor/cereal/include/cereal/types/complex.hpp +56 -0
  78. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +73 -0
  79. data/vendor/cereal/include/cereal/types/deque.hpp +62 -0
  80. data/vendor/cereal/include/cereal/types/forward_list.hpp +68 -0
  81. data/vendor/cereal/include/cereal/types/functional.hpp +43 -0
  82. data/vendor/cereal/include/cereal/types/list.hpp +62 -0
  83. data/vendor/cereal/include/cereal/types/map.hpp +36 -0
  84. data/vendor/cereal/include/cereal/types/memory.hpp +425 -0
  85. data/vendor/cereal/include/cereal/types/optional.hpp +66 -0
  86. data/vendor/cereal/include/cereal/types/polymorphic.hpp +483 -0
  87. data/vendor/cereal/include/cereal/types/queue.hpp +132 -0
  88. data/vendor/cereal/include/cereal/types/set.hpp +103 -0
  89. data/vendor/cereal/include/cereal/types/stack.hpp +76 -0
  90. data/vendor/cereal/include/cereal/types/string.hpp +61 -0
  91. data/vendor/cereal/include/cereal/types/tuple.hpp +123 -0
  92. data/vendor/cereal/include/cereal/types/unordered_map.hpp +36 -0
  93. data/vendor/cereal/include/cereal/types/unordered_set.hpp +99 -0
  94. data/vendor/cereal/include/cereal/types/utility.hpp +47 -0
  95. data/vendor/cereal/include/cereal/types/valarray.hpp +89 -0
  96. data/vendor/cereal/include/cereal/types/variant.hpp +109 -0
  97. data/vendor/cereal/include/cereal/types/vector.hpp +112 -0
  98. data/vendor/cereal/include/cereal/version.hpp +52 -0
  99. data/vendor/isotree/LICENSE +1 -1
  100. data/vendor/isotree/README.md +2 -1
  101. data/vendor/isotree/src/RcppExports.cpp +44 -4
  102. data/vendor/isotree/src/Rwrapper.cpp +141 -51
  103. data/vendor/isotree/src/crit.cpp +1 -1
  104. data/vendor/isotree/src/dealloc.cpp +1 -1
  105. data/vendor/isotree/src/dist.cpp +6 -6
  106. data/vendor/isotree/src/extended.cpp +5 -5
  107. data/vendor/isotree/src/fit_model.cpp +30 -19
  108. data/vendor/isotree/src/helpers_iforest.cpp +26 -11
  109. data/vendor/isotree/src/impute.cpp +7 -7
  110. data/vendor/isotree/src/isoforest.cpp +7 -7
  111. data/vendor/isotree/src/isotree.hpp +27 -5
  112. data/vendor/isotree/src/merge_models.cpp +1 -1
  113. data/vendor/isotree/src/mult.cpp +1 -1
  114. data/vendor/isotree/src/predict.cpp +20 -16
  115. data/vendor/isotree/src/serialize.cpp +1 -1
  116. data/vendor/isotree/src/sql.cpp +545 -0
  117. data/vendor/isotree/src/utils.cpp +36 -44
  118. metadata +98 -92
@@ -0,0 +1,52 @@
1
+ /*! \file version.hpp
2
+ \brief Macros to detect cereal version
3
+
4
+ These macros can assist in determining the version of cereal. Be
5
+ warned that cereal is not guaranteed to be compatible across
6
+ different versions. For more information on releases of cereal,
7
+ see https://github.com/USCiLab/cereal/releases.
8
+
9
+ \ingroup utility */
10
+ /*
11
+ Copyright (c) 2018, Shane Grant
12
+ All rights reserved.
13
+
14
+ Redistribution and use in source and binary forms, with or without
15
+ modification, are permitted provided that the following conditions are met:
16
+ * Redistributions of source code must retain the above copyright
17
+ notice, this list of conditions and the following disclaimer.
18
+ * Redistributions in binary form must reproduce the above copyright
19
+ notice, this list of conditions and the following disclaimer in the
20
+ documentation and/or other materials provided with the distribution.
21
+ * Neither the name of cereal nor the
22
+ names of its contributors may be used to endorse or promote products
23
+ derived from this software without specific prior written permission.
24
+
25
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
26
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
27
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28
+ DISCLAIMED. IN NO EVENT SHALL RANDOLPH VOORHIES OR SHANE GRANT BE LIABLE FOR ANY
29
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
30
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
32
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
+ */
36
+
37
+ #ifndef CEREAL_VERSION_HPP_
38
+ #define CEREAL_VERSION_HPP_
39
+
40
+ //! The major version
41
+ #define CEREAL_VERSION_MAJOR 1
42
+ //! The minor version
43
+ #define CEREAL_VERSION_MINOR 3
44
+ //! The patch version
45
+ #define CEREAL_VERSION_PATCH 0
46
+
47
+ //! The full version as a single number
48
+ #define CEREAL_VERSION (CEREAL_VERSION_MAJOR * 10000 \
49
+ + CEREAL_VERSION_MINOR * 100 \
50
+ + CEREAL_VERSION_PATCH)
51
+
52
+ #endif // CEREAL_VERSION_HPP_
@@ -1,6 +1,6 @@
1
1
  BSD 2-Clause License
2
2
 
3
- Copyright (c) 2019, David Cortes
3
+ Copyright (c) 2020, David Cortes
4
4
  All rights reserved.
5
5
 
6
6
  Redistribution and use in source and binary forms, with or without
@@ -42,7 +42,8 @@ There's already many available implementations of isolation forests for both Pyt
42
42
  * Uses exact formula (not approximation as others do) for harmonic numbers at lower sample and remainder sizes.
43
43
  * Can fit trees incrementally to user-provided data samples.
44
44
  * Produces serializable model objects with reasonable file sizes.
45
- * Fast and multi-threaded C++ code. Can be wrapped in languages other than Python and R.
45
+ * Can translate the generated trees into SQL statements.
46
+ * Fast and multi-threaded C++ code. Can be wrapped in languages other than Python/R/Ruby.
46
47
 
47
48
  (Note that categoricals, NAs, and density-like sample weights, are treated heuristically with different options as there is no single logical extension of the original idea to them, and having them present might degrade performance/accuracy for regular numerical non-missing observations)
48
49
 
@@ -50,8 +50,8 @@ BEGIN_RCPP
50
50
  END_RCPP
51
51
  }
52
52
  // fit_model
53
- Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat, Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr, Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights, size_t nrows, size_t ncols_numeric, size_t ncols_categ, size_t ndim, size_t ntry, Rcpp::CharacterVector coef_type, bool coef_by_prop, bool with_replacement, bool weight_as_sample, size_t sample_size, size_t ntrees, size_t max_depth, bool limit_depth, bool penalize_range, bool calc_dist, bool standardize_dist, bool sq_dist, bool calc_depth, bool standardize_depth, bool weigh_by_kurt, double prob_pick_by_gain_avg, double prob_split_by_gain_avg, double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain, Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action, Rcpp::CharacterVector missing_action, bool all_perm, bool build_imputer, bool output_imputations, size_t min_imp_obs, Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows, int random_seed, int nthreads);
54
- RcppExport SEXP _isotree_fit_model(SEXP X_numSEXP, SEXP X_catSEXP, SEXP ncatSEXP, SEXP XcSEXP, SEXP Xc_indSEXP, SEXP Xc_indptrSEXP, SEXP sample_weightsSEXP, SEXP col_weightsSEXP, SEXP nrowsSEXP, SEXP ncols_numericSEXP, SEXP ncols_categSEXP, SEXP ndimSEXP, SEXP ntrySEXP, SEXP coef_typeSEXP, SEXP coef_by_propSEXP, SEXP with_replacementSEXP, SEXP weight_as_sampleSEXP, SEXP sample_sizeSEXP, SEXP ntreesSEXP, SEXP max_depthSEXP, SEXP limit_depthSEXP, SEXP penalize_rangeSEXP, SEXP calc_distSEXP, SEXP standardize_distSEXP, SEXP sq_distSEXP, SEXP calc_depthSEXP, SEXP standardize_depthSEXP, SEXP weigh_by_kurtSEXP, SEXP prob_pick_by_gain_avgSEXP, SEXP prob_split_by_gain_avgSEXP, SEXP prob_pick_by_gain_plSEXP, SEXP prob_split_by_gain_plSEXP, SEXP min_gainSEXP, SEXP cat_split_typeSEXP, SEXP new_cat_actionSEXP, SEXP missing_actionSEXP, SEXP all_permSEXP, SEXP build_imputerSEXP, SEXP output_imputationsSEXP, SEXP min_imp_obsSEXP, SEXP depth_impSEXP, SEXP weigh_imp_rowsSEXP, SEXP random_seedSEXP, SEXP nthreadsSEXP) {
53
+ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat, Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr, Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights, size_t nrows, size_t ncols_numeric, size_t ncols_categ, size_t ndim, size_t ntry, Rcpp::CharacterVector coef_type, bool coef_by_prop, bool with_replacement, bool weight_as_sample, size_t sample_size, size_t ntrees, size_t max_depth, bool limit_depth, bool penalize_range, bool calc_dist, bool standardize_dist, bool sq_dist, bool calc_depth, bool standardize_depth, bool weigh_by_kurt, double prob_pick_by_gain_avg, double prob_split_by_gain_avg, double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain, Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action, Rcpp::CharacterVector missing_action, bool all_perm, bool build_imputer, bool output_imputations, size_t min_imp_obs, Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows, int random_seed, bool handle_interrupt, int nthreads);
54
+ RcppExport SEXP _isotree_fit_model(SEXP X_numSEXP, SEXP X_catSEXP, SEXP ncatSEXP, SEXP XcSEXP, SEXP Xc_indSEXP, SEXP Xc_indptrSEXP, SEXP sample_weightsSEXP, SEXP col_weightsSEXP, SEXP nrowsSEXP, SEXP ncols_numericSEXP, SEXP ncols_categSEXP, SEXP ndimSEXP, SEXP ntrySEXP, SEXP coef_typeSEXP, SEXP coef_by_propSEXP, SEXP with_replacementSEXP, SEXP weight_as_sampleSEXP, SEXP sample_sizeSEXP, SEXP ntreesSEXP, SEXP max_depthSEXP, SEXP limit_depthSEXP, SEXP penalize_rangeSEXP, SEXP calc_distSEXP, SEXP standardize_distSEXP, SEXP sq_distSEXP, SEXP calc_depthSEXP, SEXP standardize_depthSEXP, SEXP weigh_by_kurtSEXP, SEXP prob_pick_by_gain_avgSEXP, SEXP prob_split_by_gain_avgSEXP, SEXP prob_pick_by_gain_plSEXP, SEXP prob_split_by_gain_plSEXP, SEXP min_gainSEXP, SEXP cat_split_typeSEXP, SEXP new_cat_actionSEXP, SEXP missing_actionSEXP, SEXP all_permSEXP, SEXP build_imputerSEXP, SEXP output_imputationsSEXP, SEXP min_imp_obsSEXP, SEXP depth_impSEXP, SEXP weigh_imp_rowsSEXP, SEXP random_seedSEXP, SEXP handle_interruptSEXP, SEXP nthreadsSEXP) {
55
55
  BEGIN_RCPP
56
56
  Rcpp::RObject rcpp_result_gen;
57
57
  Rcpp::RNGScope rcpp_rngScope_gen;
@@ -98,8 +98,9 @@ BEGIN_RCPP
98
98
  Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type depth_imp(depth_impSEXP);
99
99
  Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type weigh_imp_rows(weigh_imp_rowsSEXP);
100
100
  Rcpp::traits::input_parameter< int >::type random_seed(random_seedSEXP);
101
+ Rcpp::traits::input_parameter< bool >::type handle_interrupt(handle_interruptSEXP);
101
102
  Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP);
102
- rcpp_result_gen = Rcpp::wrap(fit_model(X_num, X_cat, ncat, Xc, Xc_ind, Xc_indptr, sample_weights, col_weights, nrows, ncols_numeric, ncols_categ, ndim, ntry, coef_type, coef_by_prop, with_replacement, weight_as_sample, sample_size, ntrees, max_depth, limit_depth, penalize_range, calc_dist, standardize_dist, sq_dist, calc_depth, standardize_depth, weigh_by_kurt, prob_pick_by_gain_avg, prob_split_by_gain_avg, prob_pick_by_gain_pl, prob_split_by_gain_pl, min_gain, cat_split_type, new_cat_action, missing_action, all_perm, build_imputer, output_imputations, min_imp_obs, depth_imp, weigh_imp_rows, random_seed, nthreads));
103
+ rcpp_result_gen = Rcpp::wrap(fit_model(X_num, X_cat, ncat, Xc, Xc_ind, Xc_indptr, sample_weights, col_weights, nrows, ncols_numeric, ncols_categ, ndim, ntry, coef_type, coef_by_prop, with_replacement, weight_as_sample, sample_size, ntrees, max_depth, limit_depth, penalize_range, calc_dist, standardize_dist, sq_dist, calc_depth, standardize_depth, weigh_by_kurt, prob_pick_by_gain_avg, prob_split_by_gain_avg, prob_pick_by_gain_pl, prob_split_by_gain_pl, min_gain, cat_split_type, new_cat_action, missing_action, all_perm, build_imputer, output_imputations, min_imp_obs, depth_imp, weigh_imp_rows, random_seed, handle_interrupt, nthreads));
103
104
  return rcpp_result_gen;
104
105
  END_RCPP
105
106
  }
@@ -245,19 +246,58 @@ BEGIN_RCPP
245
246
  return rcpp_result_gen;
246
247
  END_RCPP
247
248
  }
249
+ // model_to_sql
250
+ Rcpp::ListOf<Rcpp::CharacterVector> model_to_sql(SEXP model_R_ptr, bool is_extended, Rcpp::CharacterVector numeric_colanmes, Rcpp::CharacterVector categ_colnames, Rcpp::ListOf<Rcpp::CharacterVector> categ_levels, bool output_tree_num, bool single_tree, size_t tree_num, int nthreads);
251
+ RcppExport SEXP _isotree_model_to_sql(SEXP model_R_ptrSEXP, SEXP is_extendedSEXP, SEXP numeric_colanmesSEXP, SEXP categ_colnamesSEXP, SEXP categ_levelsSEXP, SEXP output_tree_numSEXP, SEXP single_treeSEXP, SEXP tree_numSEXP, SEXP nthreadsSEXP) {
252
+ BEGIN_RCPP
253
+ Rcpp::RObject rcpp_result_gen;
254
+ Rcpp::RNGScope rcpp_rngScope_gen;
255
+ Rcpp::traits::input_parameter< SEXP >::type model_R_ptr(model_R_ptrSEXP);
256
+ Rcpp::traits::input_parameter< bool >::type is_extended(is_extendedSEXP);
257
+ Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type numeric_colanmes(numeric_colanmesSEXP);
258
+ Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type categ_colnames(categ_colnamesSEXP);
259
+ Rcpp::traits::input_parameter< Rcpp::ListOf<Rcpp::CharacterVector> >::type categ_levels(categ_levelsSEXP);
260
+ Rcpp::traits::input_parameter< bool >::type output_tree_num(output_tree_numSEXP);
261
+ Rcpp::traits::input_parameter< bool >::type single_tree(single_treeSEXP);
262
+ Rcpp::traits::input_parameter< size_t >::type tree_num(tree_numSEXP);
263
+ Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP);
264
+ rcpp_result_gen = Rcpp::wrap(model_to_sql(model_R_ptr, is_extended, numeric_colanmes, categ_colnames, categ_levels, output_tree_num, single_tree, tree_num, nthreads));
265
+ return rcpp_result_gen;
266
+ END_RCPP
267
+ }
268
+ // model_to_sql_with_select_from
269
+ Rcpp::CharacterVector model_to_sql_with_select_from(SEXP model_R_ptr, bool is_extended, Rcpp::CharacterVector numeric_colanmes, Rcpp::CharacterVector categ_colnames, Rcpp::ListOf<Rcpp::CharacterVector> categ_levels, Rcpp::CharacterVector table_from, Rcpp::CharacterVector select_as, int nthreads);
270
+ RcppExport SEXP _isotree_model_to_sql_with_select_from(SEXP model_R_ptrSEXP, SEXP is_extendedSEXP, SEXP numeric_colanmesSEXP, SEXP categ_colnamesSEXP, SEXP categ_levelsSEXP, SEXP table_fromSEXP, SEXP select_asSEXP, SEXP nthreadsSEXP) {
271
+ BEGIN_RCPP
272
+ Rcpp::RObject rcpp_result_gen;
273
+ Rcpp::RNGScope rcpp_rngScope_gen;
274
+ Rcpp::traits::input_parameter< SEXP >::type model_R_ptr(model_R_ptrSEXP);
275
+ Rcpp::traits::input_parameter< bool >::type is_extended(is_extendedSEXP);
276
+ Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type numeric_colanmes(numeric_colanmesSEXP);
277
+ Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type categ_colnames(categ_colnamesSEXP);
278
+ Rcpp::traits::input_parameter< Rcpp::ListOf<Rcpp::CharacterVector> >::type categ_levels(categ_levelsSEXP);
279
+ Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type table_from(table_fromSEXP);
280
+ Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type select_as(select_asSEXP);
281
+ Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP);
282
+ rcpp_result_gen = Rcpp::wrap(model_to_sql_with_select_from(model_R_ptr, is_extended, numeric_colanmes, categ_colnames, categ_levels, table_from, select_as, nthreads));
283
+ return rcpp_result_gen;
284
+ END_RCPP
285
+ }
248
286
 
249
287
  static const R_CallMethodDef CallEntries[] = {
250
288
  {"_isotree_deserialize_IsoForest", (DL_FUNC) &_isotree_deserialize_IsoForest, 1},
251
289
  {"_isotree_deserialize_ExtIsoForest", (DL_FUNC) &_isotree_deserialize_ExtIsoForest, 1},
252
290
  {"_isotree_deserialize_Imputer", (DL_FUNC) &_isotree_deserialize_Imputer, 1},
253
291
  {"_isotree_check_null_ptr_model", (DL_FUNC) &_isotree_check_null_ptr_model, 1},
254
- {"_isotree_fit_model", (DL_FUNC) &_isotree_fit_model, 44},
292
+ {"_isotree_fit_model", (DL_FUNC) &_isotree_fit_model, 45},
255
293
  {"_isotree_fit_tree", (DL_FUNC) &_isotree_fit_tree, 35},
256
294
  {"_isotree_predict_iso", (DL_FUNC) &_isotree_predict_iso, 15},
257
295
  {"_isotree_dist_iso", (DL_FUNC) &_isotree_dist_iso, 16},
258
296
  {"_isotree_impute_iso", (DL_FUNC) &_isotree_impute_iso, 10},
259
297
  {"_isotree_get_n_nodes", (DL_FUNC) &_isotree_get_n_nodes, 3},
260
298
  {"_isotree_append_trees_from_other", (DL_FUNC) &_isotree_append_trees_from_other, 5},
299
+ {"_isotree_model_to_sql", (DL_FUNC) &_isotree_model_to_sql, 9},
300
+ {"_isotree_model_to_sql_with_select_from", (DL_FUNC) &_isotree_model_to_sql_with_select_from, 8},
261
301
  {NULL, NULL, 0}
262
302
  };
263
303
 
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -42,6 +42,7 @@
42
42
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
43
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
44
  */
45
+ #ifdef _FOR_R
45
46
 
46
47
  #include <Rcpp.h>
47
48
  // [[Rcpp::plugins(cpp11)]]
@@ -69,52 +70,48 @@ Rcpp::RawVector serialize_cpp_obj(T *model_outputs)
69
70
  oarchive(*model_outputs);
70
71
  }
71
72
  ss.seekg(0, ss.end);
72
- Rcpp::RawVector retval(ss.tellg());
73
+ /* Checking for potential integer overflows */
74
+ std::stringstream::pos_type vec_size = ss.tellg();
75
+ if (vec_size <= 0) {
76
+ Rcpp::Rcerr << "Error: model is too big to serialize, resulting object will not be usable.\n" << std::endl;
77
+ return Rcpp::RawVector();
78
+ }
79
+ Rcpp::RawVector retval((size_t) vec_size);
73
80
  ss.seekg(0, ss.beg);
74
81
  ss.read(reinterpret_cast<char*>(&retval[0]), retval.size());
75
82
  return retval;
76
83
  }
77
84
 
78
- // [[Rcpp::export]]
79
- SEXP deserialize_IsoForest(Rcpp::RawVector src)
85
+ template <class T>
86
+ SEXP deserialize_cpp_obj(Rcpp::RawVector src)
80
87
  {
81
88
  std::stringstream ss;
82
89
  ss.write(reinterpret_cast<char*>(&src[0]), src.size());
83
90
  ss.seekg(0, ss.beg);
84
- std::unique_ptr<IsoForest> model_outputs = std::unique_ptr<IsoForest>(new IsoForest());
91
+ std::unique_ptr<T> model_outputs = std::unique_ptr<T>(new T());
85
92
  {
86
93
  cereal::BinaryInputArchive iarchive(ss);
87
94
  iarchive(*model_outputs);
88
95
  }
89
- return Rcpp::XPtr<IsoForest>(model_outputs.release(), true);
96
+ return Rcpp::XPtr<T>(model_outputs.release(), true);
97
+ }
98
+
99
+ // [[Rcpp::export]]
100
+ SEXP deserialize_IsoForest(Rcpp::RawVector src)
101
+ {
102
+ return deserialize_cpp_obj<IsoForest>(src);
90
103
  }
91
104
 
92
105
  // [[Rcpp::export]]
93
106
  SEXP deserialize_ExtIsoForest(Rcpp::RawVector src)
94
107
  {
95
- std::stringstream ss;
96
- ss.write(reinterpret_cast<char*>(&src[0]), src.size());
97
- ss.seekg(0, ss.beg);
98
- std::unique_ptr<ExtIsoForest> model_outputs = std::unique_ptr<ExtIsoForest>(new ExtIsoForest());
99
- {
100
- cereal::BinaryInputArchive iarchive(ss);
101
- iarchive(*model_outputs);
102
- }
103
- return Rcpp::XPtr<ExtIsoForest>(model_outputs.release(), true);
108
+ return deserialize_cpp_obj<ExtIsoForest>(src);
104
109
  }
105
110
 
106
111
  // [[Rcpp::export]]
107
112
  SEXP deserialize_Imputer(Rcpp::RawVector src)
108
113
  {
109
- std::stringstream ss;
110
- ss.write(reinterpret_cast<char*>(&src[0]), src.size());
111
- ss.seekg(0, ss.beg);
112
- std::unique_ptr<Imputer> imputer = std::unique_ptr<Imputer>(new Imputer());
113
- {
114
- cereal::BinaryInputArchive iarchive(ss);
115
- iarchive(*imputer);
116
- }
117
- return Rcpp::XPtr<Imputer>(imputer.release(), true);
114
+ return deserialize_cpp_obj<Imputer>(src);
118
115
  }
119
116
 
120
117
  // [[Rcpp::export]]
@@ -139,7 +136,7 @@ double* set_R_nan_as_C_nan(double *x, size_t n, int nthreads)
139
136
  for (size_t_for i = 0; i < n; i++)
140
137
  if (isnan(x[i]))
141
138
  x[i] = NAN;
142
- return &x[0];
139
+ return x;
143
140
  }
144
141
 
145
142
  // [[Rcpp::export]]
@@ -157,7 +154,7 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
157
154
  Rcpp::CharacterVector missing_action, bool all_perm,
158
155
  bool build_imputer, bool output_imputations, size_t min_imp_obs,
159
156
  Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
160
- int random_seed, int nthreads)
157
+ int random_seed, bool handle_interrupt, int nthreads)
161
158
  {
162
159
  double* numeric_data_ptr = NULL;
163
160
  int* categ_data_ptr = NULL;
@@ -303,7 +300,7 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
303
300
  cat_split_type_C, new_cat_action_C,
304
301
  all_perm, imputer_ptr.get(), min_imp_obs,
305
302
  depth_imp_C, weigh_imp_rows_C, output_imputations,
306
- (uint64_t) random_seed, nthreads);
303
+ (uint64_t) random_seed, handle_interrupt, nthreads);
307
304
 
308
305
  if (ret_val == EXIT_FAILURE)
309
306
  {
@@ -313,11 +310,19 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
313
310
  if (calc_dist && sq_dist)
314
311
  tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
315
312
 
313
+ bool serialization_failed = false;
316
314
  Rcpp::RawVector serialized_obj;
317
315
  if (ndim == 1)
318
316
  serialized_obj = serialize_cpp_obj(model_ptr.get());
319
317
  else
320
318
  serialized_obj = serialize_cpp_obj(ext_model_ptr.get());
319
+ if (!serialized_obj.size()) serialization_failed = true;
320
+ if (serialization_failed) {
321
+ if (ndim == 1)
322
+ model_ptr.reset();
323
+ else
324
+ ext_model_ptr.reset();
325
+ }
321
326
 
322
327
  Rcpp::List outp = Rcpp::List::create(
323
328
  Rcpp::_["serialized_obj"] = serialized_obj,
@@ -326,18 +331,33 @@ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp:
326
331
  Rcpp::_["dmat"] = dmat
327
332
  );
328
333
 
329
- if (ndim == 1)
330
- outp["model_ptr"] = Rcpp::XPtr<IsoForest>(model_ptr.release(), true);
331
- else
332
- outp["model_ptr"] = Rcpp::XPtr<ExtIsoForest>(ext_model_ptr.release(), true);
334
+ if (!serialization_failed)
335
+ {
336
+ if (ndim == 1)
337
+ outp["model_ptr"] = Rcpp::XPtr<IsoForest>(model_ptr.release(), true);
338
+ else
339
+ outp["model_ptr"] = Rcpp::XPtr<ExtIsoForest>(ext_model_ptr.release(), true);
340
+ } else
341
+ outp["model_ptr"] = R_NilValue;
333
342
 
334
- if (build_imputer)
343
+ if (build_imputer && !serialization_failed)
335
344
  {
336
345
  outp["imputer_ser"] = serialize_cpp_obj(imputer_ptr.get());
337
- outp["imputer_ptr"] = Rcpp::XPtr<Imputer>(imputer_ptr.release(), true);
346
+ if (!Rf_xlength(outp["imputer_ser"]))
347
+ {
348
+ serialization_failed = true;
349
+ imputer_ptr.reset();
350
+ if (ndim == 1)
351
+ model_ptr.reset();
352
+ else
353
+ ext_model_ptr.reset();
354
+ outp["imputer_ptr"] = R_NilValue;
355
+ outp["model_ptr"] = R_NilValue;
356
+ } else
357
+ outp["imputer_ptr"] = Rcpp::XPtr<Imputer>(imputer_ptr.release(), true);
338
358
  }
339
359
 
340
- if (output_imputations)
360
+ if (output_imputations && !serialization_failed)
341
361
  {
342
362
  outp["imputed_num"] = Rcpp::NumericVector(Xcpp.begin(), Xcpp.end());
343
363
  outp["imputed_cat"] = X_cat;
@@ -517,21 +537,25 @@ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector
517
537
 
518
538
  if (X_cat.size())
519
539
  {
520
- categ_data_ptr = &X_cat[0];
540
+ categ_data_ptr = &X_cat[0];
521
541
  }
522
542
 
523
- if (Xc.size())
543
+ if (Xc_indptr.size())
524
544
  {
525
- Xc_ptr = &Xc[0];
526
- Xc_ind_ptr = &Xc_ind[0];
527
- Xc_indptr_ptr = &Xc_indptr[0];
545
+ if (Xc.size())
546
+ Xc_ptr = &Xc[0];
547
+ if (Xc_ind.size())
548
+ Xc_ind_ptr = &Xc_ind[0];
549
+ Xc_indptr_ptr = &Xc_indptr[0];
528
550
  }
529
551
 
530
- if (Xr.size())
552
+ if (Xr_indptr.size())
531
553
  {
532
- Xr_ptr = &Xr[0];
533
- Xr_ind_ptr = &Xr_ind[0];
534
- Xr_indptr_ptr = &Xr_indptr[0];
554
+ if (Xr.size())
555
+ Xr_ptr = &Xr[0];
556
+ if (Xr_ind.size())
557
+ Xr_ind_ptr = &Xr_ind[0];
558
+ Xr_indptr_ptr = &Xr_indptr[0];
535
559
  }
536
560
 
537
561
  if (tree_num.size())
@@ -539,7 +563,7 @@ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector
539
563
  tree_num_ptr = &tree_num[0];
540
564
  }
541
565
 
542
- double* depths_ptr = &outp[0];
566
+ double* depths_ptr = &outp[0];
543
567
 
544
568
  IsoForest* model_ptr = NULL;
545
569
  ExtIsoForest* ext_model_ptr = NULL;
@@ -592,10 +616,12 @@ void dist_iso(SEXP model_R_ptr, Rcpp::NumericVector tmat, Rcpp::NumericVector dm
592
616
  categ_data_ptr = &X_cat[0];
593
617
  }
594
618
 
595
- if (Xc.size())
619
+ if (Xc_indptr.size())
596
620
  {
597
- Xc_ptr = &Xc[0];
598
- Xc_ind_ptr = &Xc_ind[0];
621
+ if (Xc.size())
622
+ Xc_ptr = &Xc[0];
623
+ if (Xc_ind.size())
624
+ Xc_ind_ptr = &Xc_ind[0];
599
625
  Xc_indptr_ptr = &Xc_indptr[0];
600
626
  }
601
627
 
@@ -654,10 +680,12 @@ Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
654
680
  categ_data_ptr = &X_cat[0];
655
681
  }
656
682
 
657
- if (Xr.size())
683
+ if (Xr_indptr.size())
658
684
  {
659
- Xr_ptr = &Xr[0];
660
- Xr_ind_ptr = &Xr_ind[0];
685
+ if (Xr.size())
686
+ Xr_ptr = &Xr[0];
687
+ if (Xr_ind.size())
688
+ Xr_ind_ptr = &Xr_ind[0];
661
689
  Xr_indptr_ptr = &Xr_indptr[0];
662
690
  }
663
691
 
@@ -681,7 +709,7 @@ Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
681
709
  *imputer_ptr);
682
710
 
683
711
  return Rcpp::List::create(
684
- Rcpp::_["X_num"] = Xr.size()? Xr : X_num,
712
+ Rcpp::_["X_num"] = (Xr.size())? (Xr) : (X_num),
685
713
  Rcpp::_["X_cat"] = X_cat
686
714
  );
687
715
  }
@@ -760,3 +788,65 @@ Rcpp::List append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
760
788
 
761
789
  return out;
762
790
  }
791
+
792
+ // [[Rcpp::export]]
793
+ Rcpp::ListOf<Rcpp::CharacterVector> model_to_sql(SEXP model_R_ptr, bool is_extended,
794
+ Rcpp::CharacterVector numeric_colanmes,
795
+ Rcpp::CharacterVector categ_colnames,
796
+ Rcpp::ListOf<Rcpp::CharacterVector> categ_levels,
797
+ bool output_tree_num, bool single_tree, size_t tree_num,
798
+ int nthreads)
799
+ {
800
+ IsoForest* model_ptr = NULL;
801
+ ExtIsoForest* ext_model_ptr = NULL;
802
+ if (is_extended)
803
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
804
+ else
805
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
806
+
807
+ std::vector<std::string> numeric_colanmes_cpp = Rcpp::as<std::vector<std::string>>(numeric_colanmes);
808
+ std::vector<std::string> categ_colanmes_cpp = Rcpp::as<std::vector<std::string>>(categ_colnames);
809
+ std::vector<std::vector<std::string>> categ_levels_cpp = Rcpp::as<std::vector<std::vector<std::string>>>(categ_levels);
810
+
811
+ std::vector<std::string> res = generate_sql(model_ptr, ext_model_ptr,
812
+ numeric_colanmes_cpp,
813
+ categ_colanmes_cpp,
814
+ categ_levels_cpp,
815
+ output_tree_num, true, single_tree, tree_num,
816
+ nthreads);
817
+ Rcpp::List out(res.size());
818
+ for (size_t ix = 0; ix < res.size(); ix++)
819
+ out[ix] = Rcpp::CharacterVector(res[ix]);
820
+ return out;
821
+ }
822
+
823
+ // [[Rcpp::export]]
824
+ Rcpp::CharacterVector model_to_sql_with_select_from(SEXP model_R_ptr, bool is_extended,
825
+ Rcpp::CharacterVector numeric_colanmes,
826
+ Rcpp::CharacterVector categ_colnames,
827
+ Rcpp::ListOf<Rcpp::CharacterVector> categ_levels,
828
+ Rcpp::CharacterVector table_from,
829
+ Rcpp::CharacterVector select_as,
830
+ int nthreads)
831
+ {
832
+ IsoForest* model_ptr = NULL;
833
+ ExtIsoForest* ext_model_ptr = NULL;
834
+ if (is_extended)
835
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
836
+ else
837
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
838
+
839
+ std::vector<std::string> numeric_colanmes_cpp = Rcpp::as<std::vector<std::string>>(numeric_colanmes);
840
+ std::vector<std::string> categ_colanmes_cpp = Rcpp::as<std::vector<std::string>>(categ_colnames);
841
+ std::vector<std::vector<std::string>> categ_levels_cpp = Rcpp::as<std::vector<std::vector<std::string>>>(categ_levels);
842
+ std::string table_from_cpp = Rcpp::as<std::string>(table_from);
843
+ std::string select_as_cpp = Rcpp::as<std::string>(select_as);
844
+
845
+ return generate_sql_with_select_from(model_ptr, ext_model_ptr,
846
+ table_from_cpp, select_as_cpp,
847
+ numeric_colanmes_cpp, categ_colanmes_cpp,
848
+ categ_levels_cpp,
849
+ true, nthreads);
850
+ }
851
+
852
+ #endif /* _FOR_R */