isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
|
2
|
+
* of categorical variables and missing values.
|
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
|
4
|
+
*
|
|
5
|
+
* This library is based on the following works:
|
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
7
|
+
* "Isolation forest."
|
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
10
|
+
* "Isolation-based anomaly detection."
|
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
|
13
|
+
* "Extended Isolation Forest."
|
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
|
+
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
41
|
+
*
|
|
42
|
+
* BSD 2-Clause License
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
44
|
+
* All rights reserved.
|
|
45
|
+
* Redistribution and use in source and binary forms, with or without
|
|
46
|
+
* modification, are permitted provided that the following conditions are met:
|
|
47
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
|
48
|
+
* list of conditions and the following disclaimer.
|
|
49
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
50
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
51
|
+
* and/or other materials provided with the distribution.
|
|
52
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
53
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
54
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
55
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
56
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
57
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
58
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
59
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
60
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
61
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
62
|
+
*/
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
/***********************************************************************************
|
|
66
|
+
---------------------
|
|
67
|
+
IsoTree OOP interface
|
|
68
|
+
---------------------
|
|
69
|
+
|
|
70
|
+
This is provided as an alternative easier-to-use interface for this library
|
|
71
|
+
which follows scikit-learn-style methods with a single C++ class. It is a
|
|
72
|
+
wrapper over the non-OOP header 'isotree.hpp', providing the same functionality
|
|
73
|
+
in a perhaps more comprehensible structure, while still offering direct access
|
|
74
|
+
to the underlying objects so as to allow using the functions from 'isotree.hpp'.
|
|
75
|
+
|
|
76
|
+
It is a more limited interface as it does not implement all the functionality
|
|
77
|
+
for serialization, distance prediction, or producing predictions in the same call
|
|
78
|
+
as the model is fit, or fitting/predicting on data with types other than
|
|
79
|
+
'double' and 'int'.
|
|
80
|
+
|
|
81
|
+
The descriptions here do not contain the full documentation, but rather only
|
|
82
|
+
some hints so as to make them more comprehensible, aiming at producing function
|
|
83
|
+
signatures that are self-descriptive instead (if you are familiar with the
|
|
84
|
+
scikit-learn library for Python).
|
|
85
|
+
|
|
86
|
+
For detailed documentation see the same or similar-looking methods in the
|
|
87
|
+
'isotree.hpp' header instead.
|
|
88
|
+
|
|
89
|
+
***********************************************************************************/
|
|
90
|
+
|
|
91
|
+
#if !defined(_FOR_R) && !defined(_FOR_PYTHON)
|
|
92
|
+
|
|
93
|
+
#include "isotree.hpp"
|
|
94
|
+
|
|
95
|
+
namespace isotree {
|
|
96
|
+
|
|
97
|
+
class ISOTREE_EXPORTED IsolationForest
|
|
98
|
+
{
|
|
99
|
+
public:
|
|
100
|
+
int nthreads = -1;
|
|
101
|
+
|
|
102
|
+
uint64_t random_seed = 1;
|
|
103
|
+
|
|
104
|
+
size_t ndim = 3;
|
|
105
|
+
size_t ntry = 1;
|
|
106
|
+
CoefType coef_type = Uniform;
|
|
107
|
+
bool with_replacement = false;
|
|
108
|
+
bool weight_as_sample = true;
|
|
109
|
+
size_t sample_size = 0;
|
|
110
|
+
size_t ntrees = 500;
|
|
111
|
+
size_t max_depth = 0;
|
|
112
|
+
size_t ncols_per_tree = 0;
|
|
113
|
+
bool limit_depth = true;
|
|
114
|
+
bool penalize_range = false;
|
|
115
|
+
bool standardize_data = true;
|
|
116
|
+
ScoringMetric scoring_metric = Depth;
|
|
117
|
+
bool fast_bratio = true;
|
|
118
|
+
bool weigh_by_kurt = false;
|
|
119
|
+
double prob_pick_by_gain_pl = 0.;
|
|
120
|
+
double prob_pick_by_gain_avg = 0.;
|
|
121
|
+
double prob_pick_by_full_gain = 0.;
|
|
122
|
+
double prob_pick_by_dens = 0.;
|
|
123
|
+
double prob_pick_col_by_range = 0.;
|
|
124
|
+
double prob_pick_col_by_var = 0.;
|
|
125
|
+
double prob_pick_col_by_kurt = 0.;
|
|
126
|
+
double min_gain = 0.;
|
|
127
|
+
MissingAction missing_action = Impute;
|
|
128
|
+
|
|
129
|
+
CategSplit cat_split_type = SubSet;
|
|
130
|
+
NewCategAction new_cat_action = Weighted;
|
|
131
|
+
bool coef_by_prop = false;
|
|
132
|
+
bool all_perm = false;
|
|
133
|
+
|
|
134
|
+
bool build_imputer = false;
|
|
135
|
+
size_t min_imp_obs = 3;
|
|
136
|
+
UseDepthImp depth_imp = Higher;
|
|
137
|
+
WeighImpRows weigh_imp_rows = Inverse;
|
|
138
|
+
|
|
139
|
+
IsoForest model;
|
|
140
|
+
ExtIsoForest model_ext;
|
|
141
|
+
Imputer imputer;
|
|
142
|
+
TreesIndexer indexer;
|
|
143
|
+
|
|
144
|
+
IsolationForest() = default;
|
|
145
|
+
|
|
146
|
+
~IsolationForest() = default;
|
|
147
|
+
|
|
148
|
+
IsolationForest
|
|
149
|
+
(
|
|
150
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
151
|
+
bool with_replacement, bool weight_as_sample,
|
|
152
|
+
size_t sample_size, size_t ntrees,
|
|
153
|
+
size_t max_depth, size_t ncols_per_tree, bool limit_depth,
|
|
154
|
+
bool penalize_range, bool standardize_datam,
|
|
155
|
+
ScoringMetric scoring_metric, bool fast_bratio, bool weigh_by_kurt,
|
|
156
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
157
|
+
double prob_pick_by_full_gain, double prob_pick_by_dens,
|
|
158
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
159
|
+
double prob_pick_col_by_kurt,
|
|
160
|
+
double min_gain, MissingAction missing_action,
|
|
161
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
162
|
+
bool all_perm, bool build_imputer, size_t min_imp_obs,
|
|
163
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
|
164
|
+
uint64_t random_seed, int nthreads
|
|
165
|
+
);
|
|
166
|
+
|
|
167
|
+
void fit(double X[], size_t nrows, size_t ncols);
|
|
168
|
+
|
|
169
|
+
void fit(double numeric_data[], size_t ncols_numeric, size_t nrows,
|
|
170
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
171
|
+
double sample_weights[], double col_weights[]);
|
|
172
|
+
|
|
173
|
+
void fit(double Xc[], int Xc_ind[], int Xc_indptr[],
|
|
174
|
+
size_t ncols_numeric, size_t nrows,
|
|
175
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
176
|
+
double sample_weights[], double col_weights[]);
|
|
177
|
+
|
|
178
|
+
std::vector<double> predict(double X[], size_t nrows, bool standardize);
|
|
179
|
+
|
|
180
|
+
void predict(double numeric_data[], int categ_data[], bool is_col_major,
|
|
181
|
+
size_t nrows, size_t ld_numeric, size_t ld_categ, bool standardize,
|
|
182
|
+
double output_depths[], int tree_num[], double per_tree_depths[]);
|
|
183
|
+
|
|
184
|
+
void predict(double X_sparse[], int X_ind[], int X_indptr[], bool is_csc,
|
|
185
|
+
int categ_data[], bool is_col_major, size_t ld_categ, size_t nrows, bool standardize,
|
|
186
|
+
double output_depths[], int tree_num[], double per_tree_depths[]);
|
|
187
|
+
|
|
188
|
+
std::vector<double> predict_distance(double X[], size_t nrows,
|
|
189
|
+
bool as_kernel,
|
|
190
|
+
bool assume_full_distr, bool standardize,
|
|
191
|
+
bool triangular);
|
|
192
|
+
|
|
193
|
+
void predict_distance(double numeric_data[], int categ_data[],
|
|
194
|
+
size_t nrows,
|
|
195
|
+
bool as_kernel,
|
|
196
|
+
bool assume_full_distr, bool standardize,
|
|
197
|
+
bool triangular,
|
|
198
|
+
double dist_matrix[]);
|
|
199
|
+
|
|
200
|
+
void predict_distance(double Xc[], int Xc_ind[], int Xc_indptr[], int categ_data[],
|
|
201
|
+
size_t nrows,
|
|
202
|
+
bool as_kernel,
|
|
203
|
+
bool assume_full_distr, bool standardize,
|
|
204
|
+
bool triangular,
|
|
205
|
+
double dist_matrix[]);
|
|
206
|
+
|
|
207
|
+
void impute(double X[], size_t nrows);
|
|
208
|
+
|
|
209
|
+
void impute(double numeric_data[], int categ_data[], bool is_col_major, size_t nrows);
|
|
210
|
+
|
|
211
|
+
void impute(double Xr[], int Xr_ind[], int Xr_indptr[],
|
|
212
|
+
int categ_data[], bool is_col_major, size_t nrows);
|
|
213
|
+
|
|
214
|
+
void build_indexer(const bool with_distances);
|
|
215
|
+
|
|
216
|
+
void set_as_reference_points(double numeric_data[], int categ_data[], bool is_col_major,
|
|
217
|
+
size_t nrows, size_t ld_numeric, size_t ld_categ,
|
|
218
|
+
const bool with_distances);
|
|
219
|
+
|
|
220
|
+
void set_as_reference_points(double Xc[], int Xc_ind[], int Xc_indptr[], int categ_data[],
|
|
221
|
+
size_t nrows, const bool with_distances);
|
|
222
|
+
|
|
223
|
+
size_t get_num_reference_points() const noexcept;
|
|
224
|
+
|
|
225
|
+
void predict_distance_to_ref_points(double numeric_data[], int categ_data[],
|
|
226
|
+
double Xc[], int Xc_ind[], int Xc_indptr[],
|
|
227
|
+
size_t nrows, bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
228
|
+
bool as_kernel, bool standardize,
|
|
229
|
+
double dist_matrix[]);
|
|
230
|
+
|
|
231
|
+
void serialize(FILE *out) const;
|
|
232
|
+
|
|
233
|
+
void serialize(std::ostream &out) const;
|
|
234
|
+
|
|
235
|
+
static IsolationForest deserialize(FILE *inp, int nthreads);
|
|
236
|
+
|
|
237
|
+
static IsolationForest deserialize(std::istream &inp, int nthreads);
|
|
238
|
+
|
|
239
|
+
friend std::ostream& operator<<(std::ostream &ost, const IsolationForest &model);
|
|
240
|
+
|
|
241
|
+
friend std::istream& operator>>(std::istream &ist, IsolationForest &model);
|
|
242
|
+
|
|
243
|
+
IsoForest& get_model();
|
|
244
|
+
|
|
245
|
+
ExtIsoForest& get_model_ext();
|
|
246
|
+
|
|
247
|
+
Imputer& get_imputer();
|
|
248
|
+
|
|
249
|
+
TreesIndexer& get_indexer();
|
|
250
|
+
|
|
251
|
+
void check_nthreads();
|
|
252
|
+
|
|
253
|
+
size_t get_ntrees() const;
|
|
254
|
+
|
|
255
|
+
bool check_can_predict_per_tree() const;
|
|
256
|
+
|
|
257
|
+
private:
|
|
258
|
+
bool is_fitted = false;
|
|
259
|
+
|
|
260
|
+
void override_previous_fit();
|
|
261
|
+
void check_params();
|
|
262
|
+
void check_is_fitted() const;
|
|
263
|
+
IsolationForest(int nthreads, size_t ndim, size_t ntrees, bool build_imputer);
|
|
264
|
+
template <class otype>
|
|
265
|
+
void serialize_template(otype &out) const;
|
|
266
|
+
template <class itype>
|
|
267
|
+
static IsolationForest deserialize_template(itype &inp, int nthreads);
|
|
268
|
+
|
|
269
|
+
};
|
|
270
|
+
|
|
271
|
+
/* Exported stream operators for 'IsolationForest' (these are also declared as
   friends inside the class).
   NOTE(review): presumably these write / read a serialized model - confirm
   against the implementation file. */
ISOTREE_EXPORTED std::ostream& operator<<(std::ostream &ost, const IsolationForest &model);

ISOTREE_EXPORTED std::istream& operator>>(std::istream &ist, IsolationForest &model);
|
|
275
|
+
|
|
276
|
+
}
|
|
277
|
+
#endif
|
|
278
|
+
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
|
2
|
+
* of categorical variables and missing values.
|
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
|
4
|
+
*
|
|
5
|
+
* This library is based on the following works:
|
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
7
|
+
* "Isolation forest."
|
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
10
|
+
* "Isolation-based anomaly detection."
|
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
|
13
|
+
* "Extended Isolation Forest."
|
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
|
+
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
41
|
+
*
|
|
42
|
+
* BSD 2-Clause License
|
|
43
|
+
* Copyright (c) 2019-2022, David Cortes
|
|
44
|
+
* All rights reserved.
|
|
45
|
+
* Redistribution and use in source and binary forms, with or without
|
|
46
|
+
* modification, are permitted provided that the following conditions are met:
|
|
47
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
|
48
|
+
* list of conditions and the following disclaimer.
|
|
49
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
50
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
51
|
+
* and/or other materials provided with the distribution.
|
|
52
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
53
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
54
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
55
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
56
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
57
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
58
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
59
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
60
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
61
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
62
|
+
*/
|
|
63
|
+
#include "isotree.hpp"
|
|
64
|
+
|
|
65
|
+
/* Tells whether the first 'n' entries of 'indices' are in non-decreasing order.

   Arrays with zero or one entry are reported as sorted. Equal neighbouring
   entries are accepted (only a strict decrease makes the result 'false'). */
template <class sparse_ix__>
bool check_indices_are_sorted(sparse_ix__ indices[], size_t n)
{
    if (n <= 1)
        return true;

    /* Quick rejection: if the last entry is below the first one,
       there is no need to scan the whole array. */
    if (indices[n-1] < indices[0])
        return false;

    /* Walk neighbouring pairs and stop at the first decrease. */
    const sparse_ix__ *const stop = indices + n;
    const sparse_ix__ *prev = indices;
    for (const sparse_ix__ *curr = indices + 1; curr != stop; ++curr, ++prev)
    {
        if (*curr < *prev)
            return false;
    }
    return true;
}
|
|
77
|
+
|
|
78
|
+
template <class real_t__, class sparse_ix__>
|
|
79
|
+
void sort_csc_indices(real_t__ *restrict Xc, sparse_ix__ *restrict Xc_ind, sparse_ix__ *restrict Xc_indptr, size_t ncols_numeric)
|
|
80
|
+
{
|
|
81
|
+
std::vector<double> buffer_sorted_vals;
|
|
82
|
+
std::vector<sparse_ix__> buffer_sorted_ix;
|
|
83
|
+
std::vector<size_t> argsorted;
|
|
84
|
+
size_t n_this;
|
|
85
|
+
size_t ix1, ix2;
|
|
86
|
+
for (size_t col = 0; col < ncols_numeric; col++)
|
|
87
|
+
{
|
|
88
|
+
ix1 = Xc_indptr[col];
|
|
89
|
+
ix2 = Xc_indptr[col+1];
|
|
90
|
+
n_this = ix2 - ix1;
|
|
91
|
+
if (n_this && !check_indices_are_sorted(Xc_ind + ix1, n_this))
|
|
92
|
+
{
|
|
93
|
+
if (buffer_sorted_vals.size() < n_this)
|
|
94
|
+
{
|
|
95
|
+
buffer_sorted_vals.resize(n_this);
|
|
96
|
+
buffer_sorted_ix.resize(n_this);
|
|
97
|
+
argsorted.resize(n_this);
|
|
98
|
+
}
|
|
99
|
+
std::iota(argsorted.begin(), argsorted.begin() + n_this, ix1);
|
|
100
|
+
std::sort(argsorted.begin(), argsorted.begin() + n_this,
|
|
101
|
+
[&Xc_ind](const size_t a, const size_t b){return Xc_ind[a] < Xc_ind[b];});
|
|
102
|
+
for (size_t ix = 0; ix < n_this; ix++)
|
|
103
|
+
buffer_sorted_ix[ix] = Xc_ind[argsorted[ix]];
|
|
104
|
+
std::copy(buffer_sorted_ix.begin(), buffer_sorted_ix.begin() + n_this, Xc_ind + ix1);
|
|
105
|
+
for (size_t ix = 0; ix < n_this; ix++)
|
|
106
|
+
buffer_sorted_vals[ix] = Xc[argsorted[ix]];
|
|
107
|
+
std::copy(buffer_sorted_vals.begin(), buffer_sorted_vals.begin() + n_this, Xc + ix1);
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
template <class real_t__, class sparse_ix__>
|
|
114
|
+
void reconstruct_csr_sliced
|
|
115
|
+
(
|
|
116
|
+
real_t__ *restrict orig_Xr, sparse_ix__ *restrict orig_Xr_indptr,
|
|
117
|
+
real_t__ *restrict rec_Xr, sparse_ix__ *restrict rec_Xr_indptr,
|
|
118
|
+
size_t nrows
|
|
119
|
+
)
|
|
120
|
+
{
|
|
121
|
+
for (size_t row = 0; row < nrows; row++)
|
|
122
|
+
std::copy(rec_Xr + rec_Xr_indptr[row],
|
|
123
|
+
rec_Xr + rec_Xr_indptr[row+(size_t)1],
|
|
124
|
+
orig_Xr + orig_Xr_indptr[row]);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
#define is_in_set(vv, ss) ((ss).find((vv)) != (ss).end())
|
|
128
|
+
|
|
129
|
+
template <class real_t__, class sparse_ix__, class size_t_>
|
|
130
|
+
void reconstruct_csr_with_categ
|
|
131
|
+
(
|
|
132
|
+
real_t__ *restrict orig_Xr, sparse_ix__ *restrict orig_Xr_ind, sparse_ix__ *restrict orig_Xr_indptr,
|
|
133
|
+
real_t__ *restrict rec_Xr, sparse_ix__ *restrict rec_Xr_ind, sparse_ix__ *restrict rec_Xr_indptr,
|
|
134
|
+
int *restrict rec_X_cat, bool is_col_major,
|
|
135
|
+
size_t_ *restrict cols_numeric, size_t_ *restrict cols_categ,
|
|
136
|
+
size_t nrows, size_t ncols, size_t ncols_numeric, size_t ncols_categ
|
|
137
|
+
)
|
|
138
|
+
{
|
|
139
|
+
/* Check if the numeric columns go first and in the original order */
|
|
140
|
+
bool num_is_seq = false;
|
|
141
|
+
if (ncols_numeric > 0 && check_indices_are_sorted(cols_numeric, ncols_numeric)) {
|
|
142
|
+
if (cols_numeric[0] == 0 && cols_numeric[ncols_numeric-1] == (size_t_)ncols_numeric-1)
|
|
143
|
+
num_is_seq = true;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
hashed_set<size_t> cols_numeric_set;
|
|
147
|
+
hashed_set<size_t> cols_categ_set(cols_categ, cols_categ + ncols_categ);
|
|
148
|
+
hashed_map<size_t, sparse_ix__> orig_to_rec_num;
|
|
149
|
+
hashed_map<size_t, size_t> orig_to_rec_cat;
|
|
150
|
+
|
|
151
|
+
sparse_ix__ col_orig;
|
|
152
|
+
sparse_ix__ *restrict col_ptr;
|
|
153
|
+
|
|
154
|
+
if (num_is_seq)
|
|
155
|
+
{
|
|
156
|
+
reconstruct_csr_sliced(
|
|
157
|
+
orig_Xr, orig_Xr_indptr,
|
|
158
|
+
rec_Xr, rec_Xr_indptr,
|
|
159
|
+
nrows
|
|
160
|
+
);
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
else
|
|
164
|
+
{
|
|
165
|
+
if (ncols_numeric)
|
|
166
|
+
cols_numeric_set = hashed_set<size_t>(cols_numeric, cols_numeric + ncols_numeric);
|
|
167
|
+
for (size_t col = 0; col < ncols_numeric; col++)
|
|
168
|
+
orig_to_rec_num[cols_numeric[col]] = col;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
for (size_t col = 0; col < ncols_categ; col++)
|
|
172
|
+
orig_to_rec_cat[cols_categ[col]] = col;
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
for (size_t row = 0; row < nrows; row++)
|
|
176
|
+
{
|
|
177
|
+
for (auto col = orig_Xr_indptr[row]; col < orig_Xr_indptr[row+1]; col++)
|
|
178
|
+
{
|
|
179
|
+
if (std::isnan(orig_Xr[col]))
|
|
180
|
+
{
|
|
181
|
+
col_orig = orig_Xr_ind[col];
|
|
182
|
+
if (is_in_set(col_orig, cols_numeric_set)) {
|
|
183
|
+
col_ptr = std::lower_bound(rec_Xr_ind + rec_Xr_indptr[row],
|
|
184
|
+
rec_Xr_ind + rec_Xr_indptr[row+1],
|
|
185
|
+
col_orig);
|
|
186
|
+
orig_Xr[col] = rec_Xr[std::distance(rec_Xr_ind, col_ptr)];
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
else if (is_in_set((size_t)col_orig, cols_categ_set)) {
|
|
190
|
+
orig_Xr[col] = rec_X_cat[is_col_major?
|
|
191
|
+
(row + nrows*orig_to_rec_cat[col_orig])
|
|
192
|
+
:
|
|
193
|
+
(orig_to_rec_cat[col_orig] + row*ncols_categ)];
|
|
194
|
+
#ifndef _FOR_R
|
|
195
|
+
orig_Xr[col] = (orig_Xr[col] < 0)? NAN : orig_Xr[col];
|
|
196
|
+
#else
|
|
197
|
+
orig_Xr[col] = (orig_Xr[col] < 0)? NA_REAL : orig_Xr[col];
|
|
198
|
+
#endif
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
else if (orig_Xr[col] < 0)
|
|
203
|
+
{
|
|
204
|
+
col_orig = orig_Xr_ind[col];
|
|
205
|
+
if (is_in_set((size_t)col_orig, cols_categ_set)) {
|
|
206
|
+
orig_Xr[col] = rec_X_cat[is_col_major?
|
|
207
|
+
(row + nrows*orig_to_rec_cat[col_orig])
|
|
208
|
+
:
|
|
209
|
+
(orig_to_rec_cat[col_orig] + row*ncols_categ)];
|
|
210
|
+
#ifndef _FOR_R
|
|
211
|
+
orig_Xr[col] = (orig_Xr[col] < 0)? NAN : orig_Xr[col];
|
|
212
|
+
#else
|
|
213
|
+
orig_Xr[col] = (orig_Xr[col] < 0)? NA_REAL : orig_Xr[col];
|
|
214
|
+
#endif
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
}
|