isotree 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (118) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/LICENSE.txt +2 -2
  4. data/README.md +22 -1
  5. data/ext/isotree/ext.cpp +26 -0
  6. data/ext/isotree/extconf.rb +3 -3
  7. data/lib/isotree.rb +1 -0
  8. data/lib/isotree/isolation_forest.rb +86 -1
  9. data/lib/isotree/version.rb +1 -1
  10. data/vendor/cereal/LICENSE +24 -0
  11. data/vendor/cereal/README.md +85 -0
  12. data/vendor/cereal/include/cereal/access.hpp +351 -0
  13. data/vendor/cereal/include/cereal/archives/adapters.hpp +163 -0
  14. data/vendor/cereal/include/cereal/archives/binary.hpp +169 -0
  15. data/vendor/cereal/include/cereal/archives/json.hpp +1019 -0
  16. data/vendor/cereal/include/cereal/archives/portable_binary.hpp +334 -0
  17. data/vendor/cereal/include/cereal/archives/xml.hpp +956 -0
  18. data/vendor/cereal/include/cereal/cereal.hpp +1089 -0
  19. data/vendor/cereal/include/cereal/details/helpers.hpp +422 -0
  20. data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +796 -0
  21. data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +65 -0
  22. data/vendor/cereal/include/cereal/details/static_object.hpp +127 -0
  23. data/vendor/cereal/include/cereal/details/traits.hpp +1411 -0
  24. data/vendor/cereal/include/cereal/details/util.hpp +84 -0
  25. data/vendor/cereal/include/cereal/external/base64.hpp +134 -0
  26. data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +284 -0
  27. data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +78 -0
  28. data/vendor/cereal/include/cereal/external/rapidjson/document.h +2652 -0
  29. data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +299 -0
  30. data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +716 -0
  31. data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +74 -0
  32. data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +161 -0
  33. data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +99 -0
  34. data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +104 -0
  35. data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +151 -0
  36. data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +290 -0
  37. data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +271 -0
  38. data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +245 -0
  39. data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +78 -0
  40. data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +308 -0
  41. data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +186 -0
  42. data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +55 -0
  43. data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +740 -0
  44. data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +232 -0
  45. data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +69 -0
  46. data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +290 -0
  47. data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +46 -0
  48. data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +128 -0
  49. data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +70 -0
  50. data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +71 -0
  51. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +316 -0
  52. data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +300 -0
  53. data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +81 -0
  54. data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +1414 -0
  55. data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +277 -0
  56. data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +656 -0
  57. data/vendor/cereal/include/cereal/external/rapidjson/reader.h +2230 -0
  58. data/vendor/cereal/include/cereal/external/rapidjson/schema.h +2497 -0
  59. data/vendor/cereal/include/cereal/external/rapidjson/stream.h +223 -0
  60. data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +121 -0
  61. data/vendor/cereal/include/cereal/external/rapidjson/writer.h +709 -0
  62. data/vendor/cereal/include/cereal/external/rapidxml/license.txt +52 -0
  63. data/vendor/cereal/include/cereal/external/rapidxml/manual.html +406 -0
  64. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +2624 -0
  65. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +175 -0
  66. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +428 -0
  67. data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +123 -0
  68. data/vendor/cereal/include/cereal/macros.hpp +154 -0
  69. data/vendor/cereal/include/cereal/specialize.hpp +139 -0
  70. data/vendor/cereal/include/cereal/types/array.hpp +79 -0
  71. data/vendor/cereal/include/cereal/types/atomic.hpp +55 -0
  72. data/vendor/cereal/include/cereal/types/base_class.hpp +203 -0
  73. data/vendor/cereal/include/cereal/types/bitset.hpp +176 -0
  74. data/vendor/cereal/include/cereal/types/boost_variant.hpp +164 -0
  75. data/vendor/cereal/include/cereal/types/chrono.hpp +72 -0
  76. data/vendor/cereal/include/cereal/types/common.hpp +129 -0
  77. data/vendor/cereal/include/cereal/types/complex.hpp +56 -0
  78. data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +73 -0
  79. data/vendor/cereal/include/cereal/types/deque.hpp +62 -0
  80. data/vendor/cereal/include/cereal/types/forward_list.hpp +68 -0
  81. data/vendor/cereal/include/cereal/types/functional.hpp +43 -0
  82. data/vendor/cereal/include/cereal/types/list.hpp +62 -0
  83. data/vendor/cereal/include/cereal/types/map.hpp +36 -0
  84. data/vendor/cereal/include/cereal/types/memory.hpp +425 -0
  85. data/vendor/cereal/include/cereal/types/optional.hpp +66 -0
  86. data/vendor/cereal/include/cereal/types/polymorphic.hpp +483 -0
  87. data/vendor/cereal/include/cereal/types/queue.hpp +132 -0
  88. data/vendor/cereal/include/cereal/types/set.hpp +103 -0
  89. data/vendor/cereal/include/cereal/types/stack.hpp +76 -0
  90. data/vendor/cereal/include/cereal/types/string.hpp +61 -0
  91. data/vendor/cereal/include/cereal/types/tuple.hpp +123 -0
  92. data/vendor/cereal/include/cereal/types/unordered_map.hpp +36 -0
  93. data/vendor/cereal/include/cereal/types/unordered_set.hpp +99 -0
  94. data/vendor/cereal/include/cereal/types/utility.hpp +47 -0
  95. data/vendor/cereal/include/cereal/types/valarray.hpp +89 -0
  96. data/vendor/cereal/include/cereal/types/variant.hpp +109 -0
  97. data/vendor/cereal/include/cereal/types/vector.hpp +112 -0
  98. data/vendor/cereal/include/cereal/version.hpp +52 -0
  99. data/vendor/isotree/LICENSE +1 -1
  100. data/vendor/isotree/README.md +2 -1
  101. data/vendor/isotree/src/RcppExports.cpp +44 -4
  102. data/vendor/isotree/src/Rwrapper.cpp +141 -51
  103. data/vendor/isotree/src/crit.cpp +1 -1
  104. data/vendor/isotree/src/dealloc.cpp +1 -1
  105. data/vendor/isotree/src/dist.cpp +6 -6
  106. data/vendor/isotree/src/extended.cpp +5 -5
  107. data/vendor/isotree/src/fit_model.cpp +30 -19
  108. data/vendor/isotree/src/helpers_iforest.cpp +26 -11
  109. data/vendor/isotree/src/impute.cpp +7 -7
  110. data/vendor/isotree/src/isoforest.cpp +7 -7
  111. data/vendor/isotree/src/isotree.hpp +27 -5
  112. data/vendor/isotree/src/merge_models.cpp +1 -1
  113. data/vendor/isotree/src/mult.cpp +1 -1
  114. data/vendor/isotree/src/predict.cpp +20 -16
  115. data/vendor/isotree/src/serialize.cpp +1 -1
  116. data/vendor/isotree/src/sql.cpp +545 -0
  117. data/vendor/isotree/src/utils.cpp +36 -44
  118. metadata +98 -92
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -280,14 +280,14 @@ void traverse_tree_sim(WorkerForSimilarity &workspace,
280
280
 
281
281
 
282
282
  /* divide according to tree */
283
- if (prediction_data.Xc != NULL && !workspace.tmat_sep.size())
283
+ if (prediction_data.Xc_indptr != NULL && workspace.tmat_sep.size())
284
284
  std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
285
285
  size_t st_NA, end_NA, split_ix;
286
286
  switch(trees[curr_tree].col_type)
287
287
  {
288
288
  case Numeric:
289
289
  {
290
- if (prediction_data.Xc == NULL)
290
+ if (prediction_data.Xc_indptr == NULL)
291
291
  divide_subset_split(workspace.ix_arr.data(),
292
292
  prediction_data.numeric_data + prediction_data.nrows * trees[curr_tree].col_num,
293
293
  workspace.st, workspace.end, trees[curr_tree].num_split,
@@ -477,14 +477,14 @@ void traverse_hplane_sim(WorkerForSimilarity &workspace,
477
477
  prediction_data.nrows, workspace.rmat.data(), -1.);
478
478
  }
479
479
 
480
- if (prediction_data.Xc != NULL && !workspace.tmat_sep.size())
480
+ if (prediction_data.Xc_indptr != NULL && workspace.tmat_sep.size())
481
481
  std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
482
482
 
483
483
  /* reconstruct linear combination */
484
484
  size_t ncols_numeric = 0;
485
485
  size_t ncols_categ = 0;
486
486
  std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0);
487
- if (prediction_data.categ_data != NULL || prediction_data.Xc != NULL)
487
+ if (prediction_data.categ_data != NULL || prediction_data.Xc_indptr != NULL)
488
488
  {
489
489
  for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
490
490
  {
@@ -492,7 +492,7 @@ void traverse_hplane_sim(WorkerForSimilarity &workspace,
492
492
  {
493
493
  case Numeric:
494
494
  {
495
- if (prediction_data.Xc == NULL)
495
+ if (prediction_data.Xc_indptr == NULL)
496
496
  add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
497
497
  prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
498
498
  hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -60,7 +60,7 @@ void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
60
60
  /* calculate imputation statistics if desired */
61
61
  if (impute_nodes != NULL)
62
62
  {
63
- if (input_data.Xc != NULL)
63
+ if (input_data.Xc_indptr != NULL)
64
64
  std::sort(workspace.ix_arr.begin() + workspace.st,
65
65
  workspace.ix_arr.begin() + workspace.end + 1);
66
66
  build_impute_node(impute_nodes->back(), workspace,
@@ -85,7 +85,7 @@ void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
85
85
  goto terminal_statistics;
86
86
 
87
87
  /* for sparse matrices, need to sort the indices */
88
- if (input_data.Xc != NULL && impute_nodes == NULL)
88
+ if (input_data.Xc_indptr != NULL && impute_nodes == NULL)
89
89
  std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
90
90
 
91
91
  /* pick column to split according to criteria */
@@ -370,7 +370,7 @@ void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
370
370
  {
371
371
  case Numeric:
372
372
  {
373
- if (input_data.Xc == NULL)
373
+ if (input_data.Xc_indptr == NULL)
374
374
  {
375
375
  add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
376
376
  input_data.numeric_data + hplanes.back().col_num[col] * input_data.nrows,
@@ -551,7 +551,7 @@ void add_chosen_column(WorkerMemory &workspace, InputData &input_data, ModelPara
551
551
  }
552
552
  }
553
553
 
554
- if (input_data.Xc == NULL)
554
+ if (input_data.Xc_indptr == NULL)
555
555
  {
556
556
  calc_mean_and_sd(workspace.ix_arr.data(), workspace.st, workspace.end,
557
557
  input_data.numeric_data + workspace.col_chosen * input_data.nrows,
@@ -1,7 +1,7 @@
1
1
  /* Isolation forests and variations thereof, with adjustments for incorporation
2
2
  * of categorical variables and missing values.
3
3
  * Written for C++11 standard and aimed at being used in R and Python.
4
- *
4
+ *
5
5
  * This library is based on the following works:
6
6
  * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
7
  * "Isolation forest."
@@ -20,9 +20,9 @@
20
20
  * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
21
  * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
- *
23
+ *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -47,7 +47,7 @@
47
47
  bool interrupt_switch;
48
48
 
49
49
  /* Fit Isolation Forest model, or variant of it such as SCiForest
50
- *
50
+ *
51
51
  * Parameters:
52
52
  * ===========
53
53
  * - model_outputs (out)
@@ -287,11 +287,15 @@ bool interrupt_switch;
287
287
  * 'categ_data', and 'Xc', will get overwritten with the imputations produced.
288
288
  * - random_seed
289
289
  * Seed that will be used to generate random numbers used by the model.
290
+ * - handle_interrupt
291
+ * Whether to handle interrupt signals while the process is running. Note that this will
292
+ * interfere with interrupt handlers when the procedure is called from interpreted languages
293
+ * such as Python or R.
290
294
  * - nthreads
291
295
  * Number of parallel threads to use. Note that, the more threads, the more memory will be
292
296
  * allocated, even if the thread does not end up being used. Ignored when not building with
293
297
  * OpenMP support.
294
- *
298
+ *
295
299
  * Returns
296
300
  * =======
297
301
  * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
@@ -300,7 +304,7 @@ bool interrupt_switch;
300
304
  * what these values correspond to, you can use the functions
301
305
  * 'return_EXIT_SUCESS' and 'return_EXIT_FAILURE', which will return them
302
306
  * as integers.
303
- *
307
+ *
304
308
  * References
305
309
  * ==========
306
310
  * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
@@ -337,7 +341,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
337
341
  CategSplit cat_split_type, NewCategAction new_cat_action,
338
342
  bool all_perm, Imputer *imputer, size_t min_imp_obs,
339
343
  UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
340
- uint64_t random_seed, int nthreads)
344
+ uint64_t random_seed, bool handle_interrupt, int nthreads)
341
345
  {
342
346
  /* calculate maximum number of categories to use later */
343
347
  int max_categ = 0;
@@ -421,9 +425,13 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
421
425
  /* TODO: find a better way of handling interrupt signals when calling in Python/R.
422
426
  The following will still change the behavior of interrupts when called through e.g. Flask */
423
427
  #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
424
- struct sigaction sig_handle;
425
- sig_handle.sa_flags = SA_RESETHAND;
426
- sig_handle.sa_handler = set_interrup_global_variable;
428
+ struct sigaction sig_handle = {};
429
+ if (handle_interrupt)
430
+ {
431
+ sig_handle.sa_flags = SA_RESETHAND;
432
+ sig_handle.sa_handler = set_interrup_global_variable;
433
+ sigemptyset(&sig_handle.sa_mask);
434
+ }
427
435
  #endif
428
436
 
429
437
  /* grow trees */
@@ -468,11 +476,14 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
468
476
  else
469
477
  model_outputs_ext->hplanes[tree].shrink_to_fit();
470
478
 
471
- #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
472
- // sigaction(SIGINT, &sig_handle, NULL);
473
- #else
474
- // signal(SIGINT, set_interrup_global_variable);
475
- #endif
479
+ if (handle_interrupt)
480
+ {
481
+ #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
482
+ sigaction(SIGINT, &sig_handle, NULL);
483
+ #else
484
+ signal(SIGINT, set_interrup_global_variable);
485
+ #endif
486
+ }
476
487
  }
477
488
 
478
489
  /* check if the procedure got interrupted */
@@ -556,7 +567,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
556
567
 
557
568
 
558
569
  /* Add additional trees to already-fitted isolation forest model
559
- *
570
+ *
560
571
  * Parameters
561
572
  * ==========
562
573
  * - model_outputs
@@ -972,7 +983,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
972
983
  min_size_chr = input_data.max_categ;
973
984
  }
974
985
 
975
- if (input_data.Xc != NULL && gain)
986
+ if (input_data.Xc_indptr != NULL && gain)
976
987
  {
977
988
  min_size_szt = std::max(min_size_szt, model_params.sample_size);
978
989
  min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
@@ -1012,7 +1023,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
1012
1023
  if (
1013
1024
  model_params.cat_split_type == SubSet &&
1014
1025
  (
1015
- model_params.prob_pick_by_gain_avg ||
1026
+ model_params.prob_pick_by_gain_avg ||
1016
1027
  model_params.prob_pick_by_gain_pl
1017
1028
  )
1018
1029
  )
@@ -1027,7 +1038,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
1027
1038
  {
1028
1039
  std::vector<double> kurt_weights(input_data.ncols_numeric + input_data.ncols_categ);
1029
1040
 
1030
- if (input_data.Xc == NULL)
1041
+ if (input_data.Xc_indptr == NULL)
1031
1042
  {
1032
1043
  for (size_t col = 0; col < input_data.ncols_numeric; col++)
1033
1044
  kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -91,7 +91,7 @@ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams
91
91
  {
92
92
  if (tree.col_type == Numeric)
93
93
  {
94
- if (input_data.Xc == NULL)
94
+ if (input_data.Xc_indptr == NULL)
95
95
  get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * tree.col_num,
96
96
  workspace.st, workspace.end, model_params.missing_action,
97
97
  workspace.xmin, workspace.xmax, workspace.unsplittable);
@@ -114,7 +114,7 @@ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams
114
114
  {
115
115
  if (workspace.col_type == Numeric)
116
116
  {
117
- if (input_data.Xc == NULL)
117
+ if (input_data.Xc_indptr == NULL)
118
118
  get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * workspace.col_chosen,
119
119
  workspace.st, workspace.end, model_params.missing_action,
120
120
  workspace.xmin, workspace.xmax, workspace.unsplittable);
@@ -281,10 +281,19 @@ void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_s
281
281
  /* for the extended model, it's not necessary to copy everything */
282
282
  if (!workspace.comb_val.size())
283
283
  {
284
- /* TODO: here only need to copy the left half, as the right one is untouched */
285
- recursion_state.ix_arr = workspace.ix_arr;
286
- recursion_state.weights_map = workspace.weights_map;
287
- recursion_state.weights_arr = workspace.weights_arr;
284
+ recursion_state.ix_arr = std::vector<size_t>(workspace.ix_arr.begin() + workspace.st_NA,
285
+ workspace.ix_arr.begin() + workspace.end + 1);
286
+ size_t tot = workspace.end - workspace.st_NA + 1;
287
+ if (workspace.weights_arr.size() || workspace.weights_map.size())
288
+ recursion_state.weights_arr = std::unique_ptr<double[]>(new double[tot]);
289
+ if (workspace.weights_arr.size())
290
+ for (size_t ix = 0; ix < tot; ix++)
291
+ recursion_state.weights_arr[ix] = workspace.weights_arr[workspace.ix_arr[ix + workspace.st_NA]];
292
+ else if (workspace.weights_map.size())
293
+ for (size_t ix = 0; ix < tot; ix++)
294
+ recursion_state.weights_arr[ix] = workspace.weights_map[workspace.ix_arr[ix + workspace.st_NA]];
295
+
296
+
288
297
  }
289
298
  }
290
299
 
@@ -301,9 +310,15 @@ void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_
301
310
 
302
311
  if (!workspace.comb_val.size())
303
312
  {
304
- /* TODO: here only need to copy the left half, as the right one is untouched */
305
- workspace.ix_arr = std::move(recursion_state.ix_arr);
306
- workspace.weights_map = std::move(recursion_state.weights_map);
307
- workspace.weights_arr = std::move(recursion_state.weights_arr);
313
+ std::copy(recursion_state.ix_arr.begin(),
314
+ recursion_state.ix_arr.end(),
315
+ workspace.ix_arr.begin() + recursion_state.st_NA);
316
+ size_t tot = workspace.end - workspace.st_NA + 1;
317
+ if (workspace.weights_arr.size())
318
+ for (size_t ix = 0; ix < tot; ix++)
319
+ workspace.weights_arr[workspace.ix_arr[ix + workspace.st_NA]] = recursion_state.weights_arr[ix];
320
+ else if (workspace.weights_map.size())
321
+ for (size_t ix = 0; ix < tot; ix++)
322
+ workspace.weights_map[workspace.ix_arr[ix + workspace.st_NA]] = recursion_state.weights_arr[ix];
308
323
  }
309
324
  }
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -226,7 +226,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
226
226
  }
227
227
  }
228
228
 
229
- else if (input_data.Xc != NULL)
229
+ else if (input_data.Xc_indptr != NULL)
230
230
  {
231
231
  #pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
232
232
  for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
@@ -304,7 +304,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
304
304
  double weight;
305
305
  size_t ix;
306
306
 
307
- if ((input_data.Xc == NULL && input_data.ncols_numeric) || input_data.ncols_categ)
307
+ if ((input_data.Xc_indptr == NULL && input_data.ncols_numeric) || input_data.ncols_categ)
308
308
  {
309
309
  if (!has_weights)
310
310
  {
@@ -401,7 +401,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
401
401
  }
402
402
  }
403
403
 
404
- if (input_data.Xc != NULL) /* sparse numeric */
404
+ if (input_data.Xc_indptr != NULL) /* sparse numeric */
405
405
  {
406
406
  size_t *ix_arr = workspace.ix_arr.data();
407
407
  size_t st_col, end_col, ind_end_col, curr_pos;
@@ -802,7 +802,7 @@ void apply_imputation_results(imp_arr &impute_vec,
802
802
  {
803
803
  size_t col;
804
804
 
805
- if (input_data.Xc != NULL)
805
+ if (input_data.Xc_indptr != NULL)
806
806
  {
807
807
  std::vector<size_t> row_pos(input_data.nrows, 0);
808
808
  size_t row;
@@ -950,7 +950,7 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
950
950
  imp.num_weight.assign(imp.n_missing_num, 0);
951
951
  }
952
952
 
953
- else if (input_data.Xc != NULL)
953
+ else if (input_data.Xc_indptr != NULL)
954
954
  {
955
955
  imp.missing_sp.resize(input_data.ncols_numeric);
956
956
  sparse_ix *res;
@@ -1105,7 +1105,7 @@ void check_for_missing(InputData &input_data,
1105
1105
  {
1106
1106
  input_data.has_missing.assign(input_data.nrows, false);
1107
1107
 
1108
- if (input_data.Xc != NULL)
1108
+ if (input_data.Xc_indptr != NULL)
1109
1109
  {
1110
1110
  for (size_t col = 0; col < input_data.ncols_numeric; col++)
1111
1111
  #pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -56,7 +56,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
56
56
  /* calculate imputation statistics if desired */
57
57
  if (impute_nodes != NULL)
58
58
  {
59
- if (input_data.Xc != NULL)
59
+ if (input_data.Xc_indptr != NULL)
60
60
  std::sort(workspace.ix_arr.begin() + workspace.st,
61
61
  workspace.ix_arr.begin() + workspace.end + 1);
62
62
  build_impute_node(impute_nodes->back(), workspace,
@@ -81,7 +81,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
81
81
  goto terminal_statistics;
82
82
 
83
83
  /* for sparse matrices, need to sort the indices */
84
- if (input_data.Xc != NULL && impute_nodes == NULL)
84
+ if (input_data.Xc_indptr != NULL && impute_nodes == NULL)
85
85
  std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
86
86
 
87
87
  /* pick column to split according to criteria */
@@ -108,7 +108,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
108
108
 
109
109
  /* evaluate gain for all columns */
110
110
  trees.back().score = -HUGE_VAL; /* this is used to track the best gain */
111
- if (input_data.Xc == NULL)
111
+ if (input_data.Xc_indptr == NULL)
112
112
  {
113
113
  for (size_t col = 0; col < input_data.ncols_numeric; col++)
114
114
  {
@@ -291,7 +291,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
291
291
  if (workspace.unsplittable)
292
292
  {
293
293
  workspace.ncols_tried = 0; /* note: this is used here as a counter for the number of still splittable columns */
294
- if (input_data.Xc == NULL)
294
+ if (input_data.Xc_indptr == NULL)
295
295
  {
296
296
  for (size_t col = 0; col < input_data.ncols_numeric; col++)
297
297
  {
@@ -406,7 +406,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
406
406
 
407
407
  default:
408
408
  {
409
- if (input_data.Xc == NULL)
409
+ if (input_data.Xc_indptr == NULL)
410
410
  {
411
411
  eval_guided_crit(workspace.ix_arr.data(), workspace.st, workspace.end,
412
412
  input_data.numeric_data + trees.back().col_num * input_data.nrows,
@@ -441,7 +441,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
441
441
  }
442
442
  }
443
443
 
444
- if (input_data.Xc == NULL)
444
+ if (input_data.Xc_indptr == NULL)
445
445
  divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * trees.back().col_num,
446
446
  workspace.st, workspace.end, trees.back().num_split, model_params.missing_action,
447
447
  workspace.st_NA, workspace.end_NA, workspace.split_ix);
@@ -22,7 +22,7 @@
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
23
  *
24
24
  * BSD 2-Clause License
25
- * Copyright (c) 2019, David Cortes
25
+ * Copyright (c) 2020, David Cortes
26
26
  * All rights reserved.
27
27
  * Redistribution and use in source and binary forms, with or without
28
28
  * modification, are permitted provided that the following conditions are met:
@@ -213,7 +213,7 @@ typedef struct IsoHPlane {
213
213
  std::vector<std::vector<double>> cat_coef;
214
214
  std::vector<int> chosen_cat;
215
215
  std::vector<double> fill_val;
216
- std::vector<double> fill_new;
216
+ std::vector<double> fill_new; /* <- when using single categ, coef will be here */
217
217
 
218
218
  double split_point;
219
219
  size_t hplane_left;
@@ -545,9 +545,8 @@ typedef struct {
545
545
  size_t split_ix;
546
546
  size_t end;
547
547
  std::vector<size_t> ix_arr;
548
- std::unordered_map<size_t, double> weights_map;
549
- std::vector<double> weights_arr;
550
548
  std::vector<bool> cols_possible;
549
+ std::unique_ptr<double[]> weights_arr;
551
550
  std::discrete_distribution<size_t> col_sampler;
552
551
  } RecursionState;
553
552
 
@@ -572,7 +571,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
572
571
  CategSplit cat_split_type, NewCategAction new_cat_action,
573
572
  bool all_perm, Imputer *imputer, size_t min_imp_obs,
574
573
  UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
575
- uint64_t random_seed, int nthreads);
574
+ uint64_t random_seed, bool handle_interrupt, int nthreads);
576
575
  int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
577
576
  double numeric_data[], size_t ncols_numeric,
578
577
  int categ_data[], size_t ncols_categ, int ncat[],
@@ -923,6 +922,29 @@ void deserialize_imputer(Imputer &output_obj, const wchar_t *input_file_path);
923
922
  bool has_msvc();
924
923
  #endif /* _ENABLE_CEREAL */
925
924
 
925
+ /* sql.cpp */
926
+ std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
927
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
928
+ std::vector<std::vector<std::string>> &categ_levels,
929
+ bool output_tree_num, bool index1, bool single_tree, size_t tree_num,
930
+ int nthreads);
931
+ std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
932
+ std::string &table_from, std::string &select_as,
933
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
934
+ std::vector<std::vector<std::string>> &categ_levels,
935
+ bool index1, int nthreads);
936
+ void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
937
+ size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
938
+ std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right);
939
+ void extract_cond_isotree(IsoForest &model, IsoTree &tree,
940
+ std::string &cond_left, std::string &cond_right,
941
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
942
+ std::vector<std::vector<std::string>> &categ_levels);
943
+ void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
944
+ std::string &cond_left, std::string &cond_right,
945
+ std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
946
+ std::vector<std::vector<std::string>> &categ_levels);
947
+
926
948
  /* dealloc.cpp */
927
949
  void dealloc_IsoForest(IsoForest &model_outputs);
928
950
  void dealloc_IsoExtForest(ExtIsoForest &model_outputs_ext);