isotree 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/LICENSE.txt +2 -2
- data/README.md +22 -1
- data/ext/isotree/ext.cpp +26 -0
- data/ext/isotree/extconf.rb +3 -3
- data/lib/isotree.rb +1 -0
- data/lib/isotree/isolation_forest.rb +86 -1
- data/lib/isotree/version.rb +1 -1
- data/vendor/cereal/LICENSE +24 -0
- data/vendor/cereal/README.md +85 -0
- data/vendor/cereal/include/cereal/access.hpp +351 -0
- data/vendor/cereal/include/cereal/archives/adapters.hpp +163 -0
- data/vendor/cereal/include/cereal/archives/binary.hpp +169 -0
- data/vendor/cereal/include/cereal/archives/json.hpp +1019 -0
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +334 -0
- data/vendor/cereal/include/cereal/archives/xml.hpp +956 -0
- data/vendor/cereal/include/cereal/cereal.hpp +1089 -0
- data/vendor/cereal/include/cereal/details/helpers.hpp +422 -0
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +796 -0
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +65 -0
- data/vendor/cereal/include/cereal/details/static_object.hpp +127 -0
- data/vendor/cereal/include/cereal/details/traits.hpp +1411 -0
- data/vendor/cereal/include/cereal/details/util.hpp +84 -0
- data/vendor/cereal/include/cereal/external/base64.hpp +134 -0
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +284 -0
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +78 -0
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +2652 -0
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +299 -0
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +716 -0
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +74 -0
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +161 -0
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +99 -0
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +104 -0
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +151 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +290 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +271 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +245 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +78 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +308 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +186 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +55 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +740 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +232 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +69 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +290 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +46 -0
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +128 -0
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +70 -0
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +71 -0
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +316 -0
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +300 -0
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +81 -0
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +1414 -0
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +277 -0
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +656 -0
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +2230 -0
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +2497 -0
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +223 -0
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +121 -0
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +709 -0
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +52 -0
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +406 -0
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +2624 -0
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +175 -0
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +428 -0
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +123 -0
- data/vendor/cereal/include/cereal/macros.hpp +154 -0
- data/vendor/cereal/include/cereal/specialize.hpp +139 -0
- data/vendor/cereal/include/cereal/types/array.hpp +79 -0
- data/vendor/cereal/include/cereal/types/atomic.hpp +55 -0
- data/vendor/cereal/include/cereal/types/base_class.hpp +203 -0
- data/vendor/cereal/include/cereal/types/bitset.hpp +176 -0
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +164 -0
- data/vendor/cereal/include/cereal/types/chrono.hpp +72 -0
- data/vendor/cereal/include/cereal/types/common.hpp +129 -0
- data/vendor/cereal/include/cereal/types/complex.hpp +56 -0
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +73 -0
- data/vendor/cereal/include/cereal/types/deque.hpp +62 -0
- data/vendor/cereal/include/cereal/types/forward_list.hpp +68 -0
- data/vendor/cereal/include/cereal/types/functional.hpp +43 -0
- data/vendor/cereal/include/cereal/types/list.hpp +62 -0
- data/vendor/cereal/include/cereal/types/map.hpp +36 -0
- data/vendor/cereal/include/cereal/types/memory.hpp +425 -0
- data/vendor/cereal/include/cereal/types/optional.hpp +66 -0
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +483 -0
- data/vendor/cereal/include/cereal/types/queue.hpp +132 -0
- data/vendor/cereal/include/cereal/types/set.hpp +103 -0
- data/vendor/cereal/include/cereal/types/stack.hpp +76 -0
- data/vendor/cereal/include/cereal/types/string.hpp +61 -0
- data/vendor/cereal/include/cereal/types/tuple.hpp +123 -0
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +36 -0
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +99 -0
- data/vendor/cereal/include/cereal/types/utility.hpp +47 -0
- data/vendor/cereal/include/cereal/types/valarray.hpp +89 -0
- data/vendor/cereal/include/cereal/types/variant.hpp +109 -0
- data/vendor/cereal/include/cereal/types/vector.hpp +112 -0
- data/vendor/cereal/include/cereal/version.hpp +52 -0
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +2 -1
- data/vendor/isotree/src/RcppExports.cpp +44 -4
- data/vendor/isotree/src/Rwrapper.cpp +141 -51
- data/vendor/isotree/src/crit.cpp +1 -1
- data/vendor/isotree/src/dealloc.cpp +1 -1
- data/vendor/isotree/src/dist.cpp +6 -6
- data/vendor/isotree/src/extended.cpp +5 -5
- data/vendor/isotree/src/fit_model.cpp +30 -19
- data/vendor/isotree/src/helpers_iforest.cpp +26 -11
- data/vendor/isotree/src/impute.cpp +7 -7
- data/vendor/isotree/src/isoforest.cpp +7 -7
- data/vendor/isotree/src/isotree.hpp +27 -5
- data/vendor/isotree/src/merge_models.cpp +1 -1
- data/vendor/isotree/src/mult.cpp +1 -1
- data/vendor/isotree/src/predict.cpp +20 -16
- data/vendor/isotree/src/serialize.cpp +1 -1
- data/vendor/isotree/src/sql.cpp +545 -0
- data/vendor/isotree/src/utils.cpp +36 -44
- metadata +98 -92
data/vendor/isotree/src/crit.cpp
CHANGED
@@ -22,7 +22,7 @@
|
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
23
|
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
@@ -22,7 +22,7 @@
|
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
23
|
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
data/vendor/isotree/src/dist.cpp
CHANGED
@@ -22,7 +22,7 @@
|
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
23
|
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
@@ -280,14 +280,14 @@ void traverse_tree_sim(WorkerForSimilarity &workspace,
|
|
280
280
|
|
281
281
|
|
282
282
|
/* divide according to tree */
|
283
|
-
if (prediction_data.
|
283
|
+
if (prediction_data.Xc_indptr != NULL && workspace.tmat_sep.size())
|
284
284
|
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
285
285
|
size_t st_NA, end_NA, split_ix;
|
286
286
|
switch(trees[curr_tree].col_type)
|
287
287
|
{
|
288
288
|
case Numeric:
|
289
289
|
{
|
290
|
-
if (prediction_data.
|
290
|
+
if (prediction_data.Xc_indptr == NULL)
|
291
291
|
divide_subset_split(workspace.ix_arr.data(),
|
292
292
|
prediction_data.numeric_data + prediction_data.nrows * trees[curr_tree].col_num,
|
293
293
|
workspace.st, workspace.end, trees[curr_tree].num_split,
|
@@ -477,14 +477,14 @@ void traverse_hplane_sim(WorkerForSimilarity &workspace,
|
|
477
477
|
prediction_data.nrows, workspace.rmat.data(), -1.);
|
478
478
|
}
|
479
479
|
|
480
|
-
if (prediction_data.
|
480
|
+
if (prediction_data.Xc_indptr != NULL && workspace.tmat_sep.size())
|
481
481
|
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
482
482
|
|
483
483
|
/* reconstruct linear combination */
|
484
484
|
size_t ncols_numeric = 0;
|
485
485
|
size_t ncols_categ = 0;
|
486
486
|
std::fill(workspace.comb_val.begin(), workspace.comb_val.begin() + (workspace.end - workspace.st + 1), 0);
|
487
|
-
if (prediction_data.categ_data != NULL || prediction_data.
|
487
|
+
if (prediction_data.categ_data != NULL || prediction_data.Xc_indptr != NULL)
|
488
488
|
{
|
489
489
|
for (size_t col = 0; col < hplanes[curr_tree].col_num.size(); col++)
|
490
490
|
{
|
@@ -492,7 +492,7 @@ void traverse_hplane_sim(WorkerForSimilarity &workspace,
|
|
492
492
|
{
|
493
493
|
case Numeric:
|
494
494
|
{
|
495
|
-
if (prediction_data.
|
495
|
+
if (prediction_data.Xc_indptr == NULL)
|
496
496
|
add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
|
497
497
|
prediction_data.numeric_data + prediction_data.nrows * hplanes[curr_tree].col_num[col],
|
498
498
|
hplanes[curr_tree].coef[ncols_numeric], (double)0, hplanes[curr_tree].mean[ncols_numeric],
|
@@ -22,7 +22,7 @@
|
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
23
|
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
@@ -60,7 +60,7 @@ void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
|
|
60
60
|
/* calculate imputation statistics if desired */
|
61
61
|
if (impute_nodes != NULL)
|
62
62
|
{
|
63
|
-
if (input_data.
|
63
|
+
if (input_data.Xc_indptr != NULL)
|
64
64
|
std::sort(workspace.ix_arr.begin() + workspace.st,
|
65
65
|
workspace.ix_arr.begin() + workspace.end + 1);
|
66
66
|
build_impute_node(impute_nodes->back(), workspace,
|
@@ -85,7 +85,7 @@ void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
|
|
85
85
|
goto terminal_statistics;
|
86
86
|
|
87
87
|
/* for sparse matrices, need to sort the indices */
|
88
|
-
if (input_data.
|
88
|
+
if (input_data.Xc_indptr != NULL && impute_nodes == NULL)
|
89
89
|
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
90
90
|
|
91
91
|
/* pick column to split according to criteria */
|
@@ -370,7 +370,7 @@ void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
|
|
370
370
|
{
|
371
371
|
case Numeric:
|
372
372
|
{
|
373
|
-
if (input_data.
|
373
|
+
if (input_data.Xc_indptr == NULL)
|
374
374
|
{
|
375
375
|
add_linear_comb(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.comb_val.data(),
|
376
376
|
input_data.numeric_data + hplanes.back().col_num[col] * input_data.nrows,
|
@@ -551,7 +551,7 @@ void add_chosen_column(WorkerMemory &workspace, InputData &input_data, ModelPara
|
|
551
551
|
}
|
552
552
|
}
|
553
553
|
|
554
|
-
if (input_data.
|
554
|
+
if (input_data.Xc_indptr == NULL)
|
555
555
|
{
|
556
556
|
calc_mean_and_sd(workspace.ix_arr.data(), workspace.st, workspace.end,
|
557
557
|
input_data.numeric_data + workspace.col_chosen * input_data.nrows,
|
@@ -1,7 +1,7 @@
|
|
1
1
|
/* Isolation forests and variations thereof, with adjustments for incorporation
|
2
2
|
* of categorical variables and missing values.
|
3
3
|
* Written for C++11 standard and aimed at being used in R and Python.
|
4
|
-
*
|
4
|
+
*
|
5
5
|
* This library is based on the following works:
|
6
6
|
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
7
7
|
* "Isolation forest."
|
@@ -20,9 +20,9 @@
|
|
20
20
|
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
21
21
|
* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
|
-
*
|
23
|
+
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
@@ -47,7 +47,7 @@
|
|
47
47
|
bool interrupt_switch;
|
48
48
|
|
49
49
|
/* Fit Isolation Forest model, or variant of it such as SCiForest
|
50
|
-
*
|
50
|
+
*
|
51
51
|
* Parameters:
|
52
52
|
* ===========
|
53
53
|
* - model_outputs (out)
|
@@ -287,11 +287,15 @@ bool interrupt_switch;
|
|
287
287
|
* 'categ_data', and 'Xc', will get overwritten with the imputations produced.
|
288
288
|
* - random_seed
|
289
289
|
* Seed that will be used to generate random numbers used by the model.
|
290
|
+
* - handle_interrupt
|
291
|
+
* Whether to handle interrupt signals while the process is running. Note that this will
|
292
|
+
* interfere with interrupt handlers when the procedure is called from interpreted languages
|
293
|
+
* such as Python or R.
|
290
294
|
* - nthreads
|
291
295
|
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
292
296
|
* allocated, even if the thread does not end up being used. Ignored when not building with
|
293
297
|
* OpenMP support.
|
294
|
-
*
|
298
|
+
*
|
295
299
|
* Returns
|
296
300
|
* =======
|
297
301
|
* Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
|
@@ -300,7 +304,7 @@ bool interrupt_switch;
|
|
300
304
|
* what these values correspond to, you can use the functions
|
301
305
|
* 'return_EXIT_SUCESS' and 'return_EXIT_FAILURE', which will return them
|
302
306
|
* as integers.
|
303
|
-
*
|
307
|
+
*
|
304
308
|
* References
|
305
309
|
* ==========
|
306
310
|
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
@@ -337,7 +341,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
337
341
|
CategSplit cat_split_type, NewCategAction new_cat_action,
|
338
342
|
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
339
343
|
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
340
|
-
uint64_t random_seed, int nthreads)
|
344
|
+
uint64_t random_seed, bool handle_interrupt, int nthreads)
|
341
345
|
{
|
342
346
|
/* calculate maximum number of categories to use later */
|
343
347
|
int max_categ = 0;
|
@@ -421,9 +425,13 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
421
425
|
/* TODO: find a better way of handling interrupt signals when calling in Python/R.
|
422
426
|
The following will still change the behavior of interrupts when called through e.g. Flask */
|
423
427
|
#if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
|
424
|
-
struct sigaction sig_handle;
|
425
|
-
|
426
|
-
|
428
|
+
struct sigaction sig_handle = {};
|
429
|
+
if (handle_interrupt)
|
430
|
+
{
|
431
|
+
sig_handle.sa_flags = SA_RESETHAND;
|
432
|
+
sig_handle.sa_handler = set_interrup_global_variable;
|
433
|
+
sigemptyset(&sig_handle.sa_mask);
|
434
|
+
}
|
427
435
|
#endif
|
428
436
|
|
429
437
|
/* grow trees */
|
@@ -468,11 +476,14 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
468
476
|
else
|
469
477
|
model_outputs_ext->hplanes[tree].shrink_to_fit();
|
470
478
|
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
479
|
+
if (handle_interrupt)
|
480
|
+
{
|
481
|
+
#if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
|
482
|
+
sigaction(SIGINT, &sig_handle, NULL);
|
483
|
+
#else
|
484
|
+
signal(SIGINT, set_interrup_global_variable);
|
485
|
+
#endif
|
486
|
+
}
|
476
487
|
}
|
477
488
|
|
478
489
|
/* check if the procedure got interrupted */
|
@@ -556,7 +567,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
556
567
|
|
557
568
|
|
558
569
|
/* Add additional trees to already-fitted isolation forest model
|
559
|
-
*
|
570
|
+
*
|
560
571
|
* Parameters
|
561
572
|
* ==========
|
562
573
|
* - model_outputs
|
@@ -972,7 +983,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
|
|
972
983
|
min_size_chr = input_data.max_categ;
|
973
984
|
}
|
974
985
|
|
975
|
-
if (input_data.
|
986
|
+
if (input_data.Xc_indptr != NULL && gain)
|
976
987
|
{
|
977
988
|
min_size_szt = std::max(min_size_szt, model_params.sample_size);
|
978
989
|
min_size_dbl = std::max(min_size_dbl, model_params.sample_size);
|
@@ -1012,7 +1023,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
|
|
1012
1023
|
if (
|
1013
1024
|
model_params.cat_split_type == SubSet &&
|
1014
1025
|
(
|
1015
|
-
model_params.prob_pick_by_gain_avg ||
|
1026
|
+
model_params.prob_pick_by_gain_avg ||
|
1016
1027
|
model_params.prob_pick_by_gain_pl
|
1017
1028
|
)
|
1018
1029
|
)
|
@@ -1027,7 +1038,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
|
|
1027
1038
|
{
|
1028
1039
|
std::vector<double> kurt_weights(input_data.ncols_numeric + input_data.ncols_categ);
|
1029
1040
|
|
1030
|
-
if (input_data.
|
1041
|
+
if (input_data.Xc_indptr == NULL)
|
1031
1042
|
{
|
1032
1043
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
1033
1044
|
kurt_weights[col] = calc_kurtosis(workspace.ix_arr.data(), workspace.st, workspace.end,
|
@@ -22,7 +22,7 @@
|
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
23
|
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
@@ -91,7 +91,7 @@ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams
|
|
91
91
|
{
|
92
92
|
if (tree.col_type == Numeric)
|
93
93
|
{
|
94
|
-
if (input_data.
|
94
|
+
if (input_data.Xc_indptr == NULL)
|
95
95
|
get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * tree.col_num,
|
96
96
|
workspace.st, workspace.end, model_params.missing_action,
|
97
97
|
workspace.xmin, workspace.xmax, workspace.unsplittable);
|
@@ -114,7 +114,7 @@ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams
|
|
114
114
|
{
|
115
115
|
if (workspace.col_type == Numeric)
|
116
116
|
{
|
117
|
-
if (input_data.
|
117
|
+
if (input_data.Xc_indptr == NULL)
|
118
118
|
get_range(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * workspace.col_chosen,
|
119
119
|
workspace.st, workspace.end, model_params.missing_action,
|
120
120
|
workspace.xmin, workspace.xmax, workspace.unsplittable);
|
@@ -281,10 +281,19 @@ void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_s
|
|
281
281
|
/* for the extended model, it's not necessary to copy everything */
|
282
282
|
if (!workspace.comb_val.size())
|
283
283
|
{
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
284
|
+
recursion_state.ix_arr = std::vector<size_t>(workspace.ix_arr.begin() + workspace.st_NA,
|
285
|
+
workspace.ix_arr.begin() + workspace.end + 1);
|
286
|
+
size_t tot = workspace.end - workspace.st_NA + 1;
|
287
|
+
if (workspace.weights_arr.size() || workspace.weights_map.size())
|
288
|
+
recursion_state.weights_arr = std::unique_ptr<double[]>(new double[tot]);
|
289
|
+
if (workspace.weights_arr.size())
|
290
|
+
for (size_t ix = 0; ix < tot; ix++)
|
291
|
+
recursion_state.weights_arr[ix] = workspace.weights_arr[workspace.ix_arr[ix + workspace.st_NA]];
|
292
|
+
else if (workspace.weights_map.size())
|
293
|
+
for (size_t ix = 0; ix < tot; ix++)
|
294
|
+
recursion_state.weights_arr[ix] = workspace.weights_map[workspace.ix_arr[ix + workspace.st_NA]];
|
295
|
+
|
296
|
+
|
288
297
|
}
|
289
298
|
}
|
290
299
|
|
@@ -301,9 +310,15 @@ void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_
|
|
301
310
|
|
302
311
|
if (!workspace.comb_val.size())
|
303
312
|
{
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
workspace.
|
313
|
+
std::copy(recursion_state.ix_arr.begin(),
|
314
|
+
recursion_state.ix_arr.end(),
|
315
|
+
workspace.ix_arr.begin() + recursion_state.st_NA);
|
316
|
+
size_t tot = workspace.end - workspace.st_NA + 1;
|
317
|
+
if (workspace.weights_arr.size())
|
318
|
+
for (size_t ix = 0; ix < tot; ix++)
|
319
|
+
workspace.weights_arr[workspace.ix_arr[ix + workspace.st_NA]] = recursion_state.weights_arr[ix];
|
320
|
+
else if (workspace.weights_map.size())
|
321
|
+
for (size_t ix = 0; ix < tot; ix++)
|
322
|
+
workspace.weights_map[workspace.ix_arr[ix + workspace.st_NA]] = recursion_state.weights_arr[ix];
|
308
323
|
}
|
309
324
|
}
|
@@ -22,7 +22,7 @@
|
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
23
|
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
@@ -226,7 +226,7 @@ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees,
|
|
226
226
|
}
|
227
227
|
}
|
228
228
|
|
229
|
-
else if (input_data.
|
229
|
+
else if (input_data.Xc_indptr != NULL)
|
230
230
|
{
|
231
231
|
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) private(cnt) shared(input_data, imputer)
|
232
232
|
for (size_t_for col = 0; col < input_data.ncols_numeric; col++)
|
@@ -304,7 +304,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
304
304
|
double weight;
|
305
305
|
size_t ix;
|
306
306
|
|
307
|
-
if ((input_data.
|
307
|
+
if ((input_data.Xc_indptr == NULL && input_data.ncols_numeric) || input_data.ncols_categ)
|
308
308
|
{
|
309
309
|
if (!has_weights)
|
310
310
|
{
|
@@ -401,7 +401,7 @@ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
|
401
401
|
}
|
402
402
|
}
|
403
403
|
|
404
|
-
if (input_data.
|
404
|
+
if (input_data.Xc_indptr != NULL) /* sparse numeric */
|
405
405
|
{
|
406
406
|
size_t *ix_arr = workspace.ix_arr.data();
|
407
407
|
size_t st_col, end_col, ind_end_col, curr_pos;
|
@@ -802,7 +802,7 @@ void apply_imputation_results(imp_arr &impute_vec,
|
|
802
802
|
{
|
803
803
|
size_t col;
|
804
804
|
|
805
|
-
if (input_data.
|
805
|
+
if (input_data.Xc_indptr != NULL)
|
806
806
|
{
|
807
807
|
std::vector<size_t> row_pos(input_data.nrows, 0);
|
808
808
|
size_t row;
|
@@ -950,7 +950,7 @@ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row)
|
|
950
950
|
imp.num_weight.assign(imp.n_missing_num, 0);
|
951
951
|
}
|
952
952
|
|
953
|
-
else if (input_data.
|
953
|
+
else if (input_data.Xc_indptr != NULL)
|
954
954
|
{
|
955
955
|
imp.missing_sp.resize(input_data.ncols_numeric);
|
956
956
|
sparse_ix *res;
|
@@ -1105,7 +1105,7 @@ void check_for_missing(InputData &input_data,
|
|
1105
1105
|
{
|
1106
1106
|
input_data.has_missing.assign(input_data.nrows, false);
|
1107
1107
|
|
1108
|
-
if (input_data.
|
1108
|
+
if (input_data.Xc_indptr != NULL)
|
1109
1109
|
{
|
1110
1110
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
1111
1111
|
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(col, input_data)
|
@@ -22,7 +22,7 @@
|
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
23
|
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
@@ -56,7 +56,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
|
|
56
56
|
/* calculate imputation statistics if desired */
|
57
57
|
if (impute_nodes != NULL)
|
58
58
|
{
|
59
|
-
if (input_data.
|
59
|
+
if (input_data.Xc_indptr != NULL)
|
60
60
|
std::sort(workspace.ix_arr.begin() + workspace.st,
|
61
61
|
workspace.ix_arr.begin() + workspace.end + 1);
|
62
62
|
build_impute_node(impute_nodes->back(), workspace,
|
@@ -81,7 +81,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
|
|
81
81
|
goto terminal_statistics;
|
82
82
|
|
83
83
|
/* for sparse matrices, need to sort the indices */
|
84
|
-
if (input_data.
|
84
|
+
if (input_data.Xc_indptr != NULL && impute_nodes == NULL)
|
85
85
|
std::sort(workspace.ix_arr.begin() + workspace.st, workspace.ix_arr.begin() + workspace.end + 1);
|
86
86
|
|
87
87
|
/* pick column to split according to criteria */
|
@@ -108,7 +108,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
|
|
108
108
|
|
109
109
|
/* evaluate gain for all columns */
|
110
110
|
trees.back().score = -HUGE_VAL; /* this is used to track the best gain */
|
111
|
-
if (input_data.
|
111
|
+
if (input_data.Xc_indptr == NULL)
|
112
112
|
{
|
113
113
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
114
114
|
{
|
@@ -291,7 +291,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
|
|
291
291
|
if (workspace.unsplittable)
|
292
292
|
{
|
293
293
|
workspace.ncols_tried = 0; /* note: this is used here as a counter for the number of still splittable columns */
|
294
|
-
if (input_data.
|
294
|
+
if (input_data.Xc_indptr == NULL)
|
295
295
|
{
|
296
296
|
for (size_t col = 0; col < input_data.ncols_numeric; col++)
|
297
297
|
{
|
@@ -406,7 +406,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
|
|
406
406
|
|
407
407
|
default:
|
408
408
|
{
|
409
|
-
if (input_data.
|
409
|
+
if (input_data.Xc_indptr == NULL)
|
410
410
|
{
|
411
411
|
eval_guided_crit(workspace.ix_arr.data(), workspace.st, workspace.end,
|
412
412
|
input_data.numeric_data + trees.back().col_num * input_data.nrows,
|
@@ -441,7 +441,7 @@ void split_itree_recursive(std::vector<IsoTree> &trees,
|
|
441
441
|
}
|
442
442
|
}
|
443
443
|
|
444
|
-
if (input_data.
|
444
|
+
if (input_data.Xc_indptr == NULL)
|
445
445
|
divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + input_data.nrows * trees.back().col_num,
|
446
446
|
workspace.st, workspace.end, trees.back().num_split, model_params.missing_action,
|
447
447
|
workspace.st_NA, workspace.end_NA, workspace.split_ix);
|
@@ -22,7 +22,7 @@
|
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
23
|
*
|
24
24
|
* BSD 2-Clause License
|
25
|
-
* Copyright (c)
|
25
|
+
* Copyright (c) 2020, David Cortes
|
26
26
|
* All rights reserved.
|
27
27
|
* Redistribution and use in source and binary forms, with or without
|
28
28
|
* modification, are permitted provided that the following conditions are met:
|
@@ -213,7 +213,7 @@ typedef struct IsoHPlane {
|
|
213
213
|
std::vector<std::vector<double>> cat_coef;
|
214
214
|
std::vector<int> chosen_cat;
|
215
215
|
std::vector<double> fill_val;
|
216
|
-
std::vector<double> fill_new;
|
216
|
+
std::vector<double> fill_new; /* <- when using single categ, coef will be here */
|
217
217
|
|
218
218
|
double split_point;
|
219
219
|
size_t hplane_left;
|
@@ -545,9 +545,8 @@ typedef struct {
|
|
545
545
|
size_t split_ix;
|
546
546
|
size_t end;
|
547
547
|
std::vector<size_t> ix_arr;
|
548
|
-
std::unordered_map<size_t, double> weights_map;
|
549
|
-
std::vector<double> weights_arr;
|
550
548
|
std::vector<bool> cols_possible;
|
549
|
+
std::unique_ptr<double[]> weights_arr;
|
551
550
|
std::discrete_distribution<size_t> col_sampler;
|
552
551
|
} RecursionState;
|
553
552
|
|
@@ -572,7 +571,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
572
571
|
CategSplit cat_split_type, NewCategAction new_cat_action,
|
573
572
|
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
574
573
|
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
575
|
-
uint64_t random_seed, int nthreads);
|
574
|
+
uint64_t random_seed, bool handle_interrupt, int nthreads);
|
576
575
|
int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
577
576
|
double numeric_data[], size_t ncols_numeric,
|
578
577
|
int categ_data[], size_t ncols_categ, int ncat[],
|
@@ -923,6 +922,29 @@ void deserialize_imputer(Imputer &output_obj, const wchar_t *input_file_path);
|
|
923
922
|
bool has_msvc();
|
924
923
|
#endif /* _ENABLE_CEREAL */
|
925
924
|
|
925
|
+
/* sql.cpp */
|
926
|
+
std::vector<std::string> generate_sql(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
927
|
+
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
928
|
+
std::vector<std::vector<std::string>> &categ_levels,
|
929
|
+
bool output_tree_num, bool index1, bool single_tree, size_t tree_num,
|
930
|
+
int nthreads);
|
931
|
+
std::string generate_sql_with_select_from(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
932
|
+
std::string &table_from, std::string &select_as,
|
933
|
+
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
934
|
+
std::vector<std::vector<std::string>> &categ_levels,
|
935
|
+
bool index1, int nthreads);
|
936
|
+
void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
|
937
|
+
size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
|
938
|
+
std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right);
|
939
|
+
void extract_cond_isotree(IsoForest &model, IsoTree &tree,
|
940
|
+
std::string &cond_left, std::string &cond_right,
|
941
|
+
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
942
|
+
std::vector<std::vector<std::string>> &categ_levels);
|
943
|
+
void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
|
944
|
+
std::string &cond_left, std::string &cond_right,
|
945
|
+
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
|
946
|
+
std::vector<std::vector<std::string>> &categ_levels);
|
947
|
+
|
926
948
|
/* dealloc.cpp */
|
927
949
|
void dealloc_IsoForest(IsoForest &model_outputs);
|
928
950
|
void dealloc_IsoExtForest(ExtIsoForest &model_outputs_ext);
|