isotree 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/LICENSE.txt +2 -2
- data/README.md +32 -14
- data/ext/isotree/ext.cpp +144 -31
- data/ext/isotree/extconf.rb +7 -7
- data/lib/isotree/isolation_forest.rb +110 -30
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +165 -27
- data/vendor/isotree/include/isotree.hpp +2111 -0
- data/vendor/isotree/include/isotree_oop.hpp +394 -0
- data/vendor/isotree/inst/COPYRIGHTS +62 -0
- data/vendor/isotree/src/RcppExports.cpp +525 -52
- data/vendor/isotree/src/Rwrapper.cpp +1931 -268
- data/vendor/isotree/src/c_interface.cpp +953 -0
- data/vendor/isotree/src/crit.hpp +4232 -0
- data/vendor/isotree/src/dist.hpp +1886 -0
- data/vendor/isotree/src/exp_depth_table.hpp +134 -0
- data/vendor/isotree/src/extended.hpp +1444 -0
- data/vendor/isotree/src/external_facing_generic.hpp +399 -0
- data/vendor/isotree/src/fit_model.hpp +2401 -0
- data/vendor/isotree/src/{dealloc.cpp → headers_joined.hpp} +38 -22
- data/vendor/isotree/src/helpers_iforest.hpp +813 -0
- data/vendor/isotree/src/{impute.cpp → impute.hpp} +353 -122
- data/vendor/isotree/src/indexer.cpp +515 -0
- data/vendor/isotree/src/instantiate_template_headers.cpp +118 -0
- data/vendor/isotree/src/instantiate_template_headers.hpp +240 -0
- data/vendor/isotree/src/isoforest.hpp +1659 -0
- data/vendor/isotree/src/isotree.hpp +1804 -392
- data/vendor/isotree/src/isotree_exportable.hpp +99 -0
- data/vendor/isotree/src/merge_models.cpp +159 -16
- data/vendor/isotree/src/mult.hpp +1321 -0
- data/vendor/isotree/src/oop_interface.cpp +842 -0
- data/vendor/isotree/src/oop_interface.hpp +278 -0
- data/vendor/isotree/src/other_helpers.hpp +219 -0
- data/vendor/isotree/src/predict.hpp +1932 -0
- data/vendor/isotree/src/python_helpers.hpp +134 -0
- data/vendor/isotree/src/ref_indexer.hpp +154 -0
- data/vendor/isotree/src/robinmap/LICENSE +21 -0
- data/vendor/isotree/src/robinmap/README.md +483 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_growth_policy.h +406 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_hash.h +1620 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_map.h +807 -0
- data/vendor/isotree/src/robinmap/include/tsl/robin_set.h +660 -0
- data/vendor/isotree/src/serialize.cpp +4300 -139
- data/vendor/isotree/src/sql.cpp +141 -59
- data/vendor/isotree/src/subset_models.cpp +174 -0
- data/vendor/isotree/src/utils.hpp +3808 -0
- data/vendor/isotree/src/xoshiro.hpp +467 -0
- data/vendor/isotree/src/ziggurat.hpp +405 -0
- metadata +38 -104
- data/vendor/cereal/LICENSE +0 -24
- data/vendor/cereal/README.md +0 -85
- data/vendor/cereal/include/cereal/access.hpp +0 -351
- data/vendor/cereal/include/cereal/archives/adapters.hpp +0 -163
- data/vendor/cereal/include/cereal/archives/binary.hpp +0 -169
- data/vendor/cereal/include/cereal/archives/json.hpp +0 -1019
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +0 -334
- data/vendor/cereal/include/cereal/archives/xml.hpp +0 -956
- data/vendor/cereal/include/cereal/cereal.hpp +0 -1089
- data/vendor/cereal/include/cereal/details/helpers.hpp +0 -422
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +0 -796
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +0 -65
- data/vendor/cereal/include/cereal/details/static_object.hpp +0 -127
- data/vendor/cereal/include/cereal/details/traits.hpp +0 -1411
- data/vendor/cereal/include/cereal/details/util.hpp +0 -84
- data/vendor/cereal/include/cereal/external/base64.hpp +0 -134
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +0 -284
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +0 -2652
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +0 -299
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +0 -716
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +0 -74
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +0 -161
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +0 -99
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +0 -104
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +0 -151
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +0 -271
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +0 -245
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +0 -78
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +0 -308
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +0 -186
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +0 -55
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +0 -740
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +0 -232
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +0 -69
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +0 -290
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +0 -46
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +0 -128
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +0 -70
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +0 -71
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +0 -316
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +0 -300
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +0 -81
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +0 -1414
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +0 -277
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +0 -656
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +0 -2230
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +0 -2497
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +0 -223
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +0 -121
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +0 -709
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +0 -52
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +0 -406
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +0 -2624
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +0 -175
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +0 -428
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +0 -123
- data/vendor/cereal/include/cereal/macros.hpp +0 -154
- data/vendor/cereal/include/cereal/specialize.hpp +0 -139
- data/vendor/cereal/include/cereal/types/array.hpp +0 -79
- data/vendor/cereal/include/cereal/types/atomic.hpp +0 -55
- data/vendor/cereal/include/cereal/types/base_class.hpp +0 -203
- data/vendor/cereal/include/cereal/types/bitset.hpp +0 -176
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +0 -164
- data/vendor/cereal/include/cereal/types/chrono.hpp +0 -72
- data/vendor/cereal/include/cereal/types/common.hpp +0 -129
- data/vendor/cereal/include/cereal/types/complex.hpp +0 -56
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +0 -73
- data/vendor/cereal/include/cereal/types/deque.hpp +0 -62
- data/vendor/cereal/include/cereal/types/forward_list.hpp +0 -68
- data/vendor/cereal/include/cereal/types/functional.hpp +0 -43
- data/vendor/cereal/include/cereal/types/list.hpp +0 -62
- data/vendor/cereal/include/cereal/types/map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/memory.hpp +0 -425
- data/vendor/cereal/include/cereal/types/optional.hpp +0 -66
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +0 -483
- data/vendor/cereal/include/cereal/types/queue.hpp +0 -132
- data/vendor/cereal/include/cereal/types/set.hpp +0 -103
- data/vendor/cereal/include/cereal/types/stack.hpp +0 -76
- data/vendor/cereal/include/cereal/types/string.hpp +0 -61
- data/vendor/cereal/include/cereal/types/tuple.hpp +0 -123
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +0 -36
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +0 -99
- data/vendor/cereal/include/cereal/types/utility.hpp +0 -47
- data/vendor/cereal/include/cereal/types/valarray.hpp +0 -89
- data/vendor/cereal/include/cereal/types/variant.hpp +0 -109
- data/vendor/cereal/include/cereal/types/vector.hpp +0 -112
- data/vendor/cereal/include/cereal/version.hpp +0 -52
- data/vendor/isotree/src/Makevars +0 -4
- data/vendor/isotree/src/crit.cpp +0 -912
- data/vendor/isotree/src/dist.cpp +0 -749
- data/vendor/isotree/src/extended.cpp +0 -790
- data/vendor/isotree/src/fit_model.cpp +0 -1090
- data/vendor/isotree/src/helpers_iforest.cpp +0 -324
- data/vendor/isotree/src/isoforest.cpp +0 -771
- data/vendor/isotree/src/mult.cpp +0 -607
- data/vendor/isotree/src/predict.cpp +0 -853
- data/vendor/isotree/src/utils.cpp +0 -1566
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
|
2
|
+
* of categorical variables and missing values.
|
|
3
|
+
* Written for C++11 standard and aimed at being used in R and Python.
|
|
4
|
+
*
|
|
5
|
+
* This library is based on the following works:
|
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
7
|
+
* "Isolation forest."
|
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
10
|
+
* "Isolation-based anomaly detection."
|
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
|
13
|
+
* "Extended Isolation Forest."
|
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
|
20
|
+
* [7] Quinlan, J. Ross. C4.5: programs for machine learning. Elsevier, 2014.
|
|
21
|
+
* [8] Cortes, David.
|
|
22
|
+
* "Distance approximation using Isolation Forests."
|
|
23
|
+
* arXiv preprint arXiv:1910.12362 (2019).
|
|
24
|
+
* [9] Cortes, David.
|
|
25
|
+
* "Imputing missing values with unsupervised random trees."
|
|
26
|
+
* arXiv preprint arXiv:1911.06646 (2019).
|
|
27
|
+
* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
|
|
28
|
+
* [11] Cortes, David.
|
|
29
|
+
* "Revisiting randomized choices in isolation forests."
|
|
30
|
+
* arXiv preprint arXiv:2110.13402 (2021).
|
|
31
|
+
* [12] Guha, Sudipto, et al.
|
|
32
|
+
* "Robust random cut forest based anomaly detection on streams."
|
|
33
|
+
* International conference on machine learning. PMLR, 2016.
|
|
34
|
+
* [13] Cortes, David.
|
|
35
|
+
* "Isolation forests: looking beyond tree depth."
|
|
36
|
+
* arXiv preprint arXiv:2111.11639 (2021).
|
|
37
|
+
* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
|
|
38
|
+
* "Isolation kernel and its effect on SVM"
|
|
39
|
+
* Proceedings of the 24th ACM SIGKDD
|
|
40
|
+
* International Conference on Knowledge Discovery & Data Mining. 2018.
|
|
41
|
+
*
|
|
42
|
+
* BSD 2-Clause License
|
|
43
|
+
* Copyright (c) 2019-2021, David Cortes
|
|
44
|
+
* All rights reserved.
|
|
45
|
+
* Redistribution and use in source and binary forms, with or without
|
|
46
|
+
* modification, are permitted provided that the following conditions are met:
|
|
47
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
|
48
|
+
* list of conditions and the following disclaimer.
|
|
49
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
50
|
+
* this list of conditions and the following disclaimer in the documentation
|
|
51
|
+
* and/or other materials provided with the distribution.
|
|
52
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
53
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
54
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
55
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
56
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
57
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
58
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
59
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
60
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
61
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
62
|
+
*/
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
/***********************************************************************************
|
|
66
|
+
---------------------
|
|
67
|
+
IsoTree OOP interface
|
|
68
|
+
---------------------
|
|
69
|
+
|
|
70
|
+
This is provided as an alternative easier-to-use interface for this library
|
|
71
|
+
which follows scikit-learn-style methods with a single C++ class. It is a
|
|
72
|
+
wrapper over the non-OOP header 'isotree.hpp', providing the same functionality
|
|
73
|
+
in a perhaps more comprehensible structure, while still offering direct access
|
|
74
|
+
to the underlying objects so as to allow using the functions from 'isotree.hpp'.
|
|
75
|
+
|
|
76
|
+
It is a more limited interface as it does not implement all the functionality
|
|
77
|
+
for serialization, distance prediction, producing predictions in the same call
|
|
78
|
+
as the model is fit, or fitting/predicting on data with types other than
|
|
79
|
+
'double' and 'int'.
|
|
80
|
+
|
|
81
|
+
The descriptions here do not contain the full documentation, but rather only
|
|
82
|
+
some hints so as to make them more comprehensible, aiming at producing function
|
|
83
|
+
signatures that are self-descriptive instead (if you are familiar with the
|
|
84
|
+
scikit-learn library for Python).
|
|
85
|
+
|
|
86
|
+
For detailed documentation see the same or similar-looking methods in the
|
|
87
|
+
'isotree.hpp' header instead.
|
|
88
|
+
|
|
89
|
+
***********************************************************************************/
|
|
90
|
+
#ifndef ISOTREE_OOP_H
|
|
91
|
+
#define ISOTREE_OOP_H
|
|
92
|
+
|
|
93
|
+
#include "isotree.hpp"
|
|
94
|
+
|
|
95
|
+
namespace isotree {
|
|
96
|
+
|
|
97
|
+
class ISOTREE_EXPORTED IsolationForest
|
|
98
|
+
{
|
|
99
|
+
public:
|
|
100
|
+
/* Note: if passing nthreads<0, will reset it to 'max_threads + nthreads + 1',
|
|
101
|
+
so passing -1 means using all available threads. */
|
|
102
|
+
int nthreads = -1; /* <- May be manually changed at any time */
|
|
103
|
+
|
|
104
|
+
uint64_t random_seed = 1;
|
|
105
|
+
|
|
106
|
+
/* General tree construction parameters */
|
|
107
|
+
size_t ndim = 3;
|
|
108
|
+
size_t ntry = 1;
|
|
109
|
+
CoefType coef_type = Uniform; /* only for ndim>1 */
|
|
110
|
+
bool with_replacement = false;
|
|
111
|
+
bool weight_as_sample = true;
|
|
112
|
+
size_t sample_size = 0;
|
|
113
|
+
size_t ntrees = 500;
|
|
114
|
+
size_t max_depth = 0;
|
|
115
|
+
size_t ncols_per_tree = 0;
|
|
116
|
+
bool limit_depth = true; /* if 'true', then 'max_depth' is ignored */
|
|
117
|
+
bool penalize_range = false;
|
|
118
|
+
bool standardize_data = true; /* only for ndim==1 */
|
|
119
|
+
ScoringMetric scoring_metric = Depth;
|
|
120
|
+
bool fast_bratio = true; /* only for scoring_metric with 'Boxed' */
|
|
121
|
+
bool weigh_by_kurt = false;
|
|
122
|
+
double prob_pick_by_gain_pl = 0.;
|
|
123
|
+
double prob_pick_by_gain_avg = 0.;
|
|
124
|
+
double prob_pick_by_full_gain = 0.;
|
|
125
|
+
double prob_pick_by_dens = 0.;
|
|
126
|
+
double prob_pick_col_by_range = 0.;
|
|
127
|
+
double prob_pick_col_by_var = 0.;
|
|
128
|
+
double prob_pick_col_by_kurt = 0.;
|
|
129
|
+
double min_gain = 0.;
|
|
130
|
+
MissingAction missing_action = Impute;
|
|
131
|
+
|
|
132
|
+
/* For categorical variables */
|
|
133
|
+
CategSplit cat_split_type = SubSet;
|
|
134
|
+
NewCategAction new_cat_action = Weighted;
|
|
135
|
+
bool coef_by_prop = false;
|
|
136
|
+
bool all_perm = false;
|
|
137
|
+
|
|
138
|
+
/* For imputation methods (when using 'build_imputer=true' and calling 'impute') */
|
|
139
|
+
bool build_imputer = false;
|
|
140
|
+
size_t min_imp_obs = 3;
|
|
141
|
+
UseDepthImp depth_imp = Higher;
|
|
142
|
+
WeighImpRows weigh_imp_rows = Inverse;
|
|
143
|
+
|
|
144
|
+
/* Internal objects which can be used with the non-OOP interface */
|
|
145
|
+
IsoForest model;
|
|
146
|
+
ExtIsoForest model_ext;
|
|
147
|
+
Imputer imputer;
|
|
148
|
+
TreesIndexer indexer;
|
|
149
|
+
|
|
150
|
+
IsolationForest() = default;
|
|
151
|
+
|
|
152
|
+
~IsolationForest() = default;
|
|
153
|
+
|
|
154
|
+
/* Be aware that many combinations of parameters are invalid.
|
|
155
|
+
This function will not do any validation of the inputs it receives.
|
|
156
|
+
|
|
157
|
+
Calling 'fit' with a combination of invalid parameters *may* throw a
|
|
158
|
+
runtime exception, but it will not be able to detect all the possible
|
|
159
|
+
invalid parameter combinations and could potentially lead to silent
|
|
160
|
+
errors like statistically incorrect models or predictions that do not
|
|
161
|
+
make sense. See the documentation of the non-OOP header or of the R
|
|
162
|
+
and Python interfaces for more details about the parameters and the
|
|
163
|
+
valid and invalid combinations of parameters. */
|
|
164
|
+
IsolationForest
|
|
165
|
+
(
|
|
166
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
|
167
|
+
bool with_replacement, bool weight_as_sample,
|
|
168
|
+
size_t sample_size, size_t ntrees,
|
|
169
|
+
size_t max_depth, size_t ncols_per_tree, bool limit_depth,
|
|
170
|
+
bool penalize_range, bool standardize_data,
|
|
171
|
+
ScoringMetric scoring_metric, bool fast_bratio, bool weigh_by_kurt,
|
|
172
|
+
double prob_pick_by_gain_pl, double prob_pick_by_gain_avg,
|
|
173
|
+
double prob_pick_col_by_range, double prob_pick_col_by_var,
|
|
174
|
+
double prob_pick_col_by_kurt,
|
|
175
|
+
double min_gain, MissingAction missing_action,
|
|
176
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
|
177
|
+
bool all_perm, bool build_imputer, size_t min_imp_obs,
|
|
178
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
|
179
|
+
uint64_t random_seed, int nthreads
|
|
180
|
+
);
|
|
181
|
+
|
|
182
|
+
/* 'X' must be in column-major order (like Fortran). */
|
|
183
|
+
void fit(double X[], size_t nrows, size_t ncols);
|
|
184
|
+
|
|
185
|
+
/* Model can also be fit to categorical data (must also be column-major).
|
|
186
|
+
Categorical data should be passed as integers starting at zero, with
|
|
187
|
+
negative values denoting missing, and must pass also the number of
|
|
188
|
+
categories to expect in each column.
|
|
189
|
+
|
|
190
|
+
Can also pass row and column weights (see the documentation for options
|
|
191
|
+
on how to interpret the row weights). */
|
|
192
|
+
void fit(double numeric_data[], size_t ncols_numeric, size_t nrows,
|
|
193
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
194
|
+
double sample_weights[], double col_weights[]);
|
|
195
|
+
|
|
196
|
+
/* Numeric data may also be supplied as a sparse matrix, in which case it
|
|
197
|
+
must be CSC format (column-major). Categorical data is not supported in
|
|
198
|
+
sparse format. */
|
|
199
|
+
void fit(double Xc[], int Xc_ind[], int Xc_indptr[],
|
|
200
|
+
size_t ncols_numeric, size_t nrows,
|
|
201
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
|
202
|
+
double sample_weights[], double col_weights[]);
|
|
203
|
+
|
|
204
|
+
/* 'predict' will return a vector with the standardized outlier scores
|
|
205
|
+
(output length is the same as the number of rows in the data), in
|
|
206
|
+
which higher values mean more outlierness.
|
|
207
|
+
|
|
208
|
+
The data must again be in column-major format.
|
|
209
|
+
|
|
210
|
+
This function will run multi-threaded if there is more than one row and
|
|
211
|
+
the object has number of threads set to more than 1. */
|
|
212
|
+
std::vector<double> predict(double X[], size_t nrows, bool standardize);
|
|
213
|
+
|
|
214
|
+
/* Can optionally write to a non-owned array, or obtain the non-standardized
|
|
215
|
+
isolation depth instead of the standardized score (also on a per-tree basis
|
|
216
|
+
if desired), or get the terminal node numbers/indices for each tree. Note
|
|
217
|
+
that 'tree_num' and 'per_tree_depths' are optional (pass NULL if not desired),
|
|
218
|
+
while 'output_depths' should always be passed. Be aware that the outputs of
|
|
219
|
+
'tree_num' will be filled in column-major order ([nrows, ntrees]), while the
|
|
220
|
+
outputs of 'per_tree_depths' will be in row-major order.
|
|
221
|
+
|
|
222
|
+
Note: 'tree_num' and 'per_tree_depths' will not be calculable when using
|
|
223
|
+
'ndim==1' plus either 'missing_action==Divide' or 'new_cat_action==Weighted'.
|
|
224
|
+
These can be checked through 'check_can_predict_per_tree'.
|
|
225
|
+
|
|
226
|
+
Here, the data might be passed as either column-major or row-major (getting
|
|
227
|
+
predictions in row-major order will be faster). If the data is in row-major
|
|
228
|
+
order, must also provide the leading dimension of the array (typically this
|
|
229
|
+
corresponds to the number of columns, but might be larger if using a subset
|
|
230
|
+
of a larger array). */
|
|
231
|
+
void predict(double numeric_data[], int categ_data[], bool is_col_major,
|
|
232
|
+
size_t nrows, size_t ld_numeric, size_t ld_categ, bool standardize,
|
|
233
|
+
double output_depths[], int tree_num[], double per_tree_depths[]);
|
|
234
|
+
|
|
235
|
+
/* Numeric data may also be provided in sparse format, which can be either
|
|
236
|
+
CSC (column-major) or CSR (row-major). If the number of rows is large,
|
|
237
|
+
predictions in CSC format will be faster than in CSR (assuming that
|
|
238
|
+
categorical data is either missing or column-major). Note that for CSC,
|
|
239
|
+
parallelization is done by trees instead of by rows, and outputs are
|
|
240
|
+
subject to numerical rounding error between runs. */
|
|
241
|
+
void predict(double X_sparse[], int X_ind[], int X_indptr[], bool is_csc,
|
|
242
|
+
int categ_data[], bool is_col_major, size_t ld_categ, size_t nrows, bool standardize,
|
|
243
|
+
double output_depths[], int tree_num[], double per_tree_depths[]);
|
|
244
|
+
|
|
245
|
+
/* Distances between observations will be returned either as a triangular matrix
|
|
246
|
+
representing an upper diagonal (length is nrows*(nrows-1)/2), or as a full
|
|
247
|
+
square matrix (length is nrows^2). */
|
|
248
|
+
std::vector<double> predict_distance(double X[], size_t nrows,
|
|
249
|
+
bool as_kernel,
|
|
250
|
+
bool assume_full_distr, bool standardize,
|
|
251
|
+
bool triangular);
|
|
252
|
+
|
|
253
|
+
void predict_distance(double numeric_data[], int categ_data[],
|
|
254
|
+
size_t nrows,
|
|
255
|
+
bool as_kernel,
|
|
256
|
+
bool assume_full_distr, bool standardize,
|
|
257
|
+
bool triangular,
|
|
258
|
+
double dist_matrix[]);
|
|
259
|
+
|
|
260
|
+
/* Sparse data is only supported in CSC format. */
|
|
261
|
+
void predict_distance(double Xc[], int Xc_ind[], int Xc_indptr[], int categ_data[],
|
|
262
|
+
size_t nrows,
|
|
263
|
+
bool as_kernel,
|
|
264
|
+
bool assume_full_distr, bool standardize,
|
|
265
|
+
bool triangular,
|
|
266
|
+
double dist_matrix[]);
|
|
267
|
+
|
|
268
|
+
/* This will impute missing values in-place. Data here must be in column-major order. */
|
|
269
|
+
void impute(double X[], size_t nrows);
|
|
270
|
+
|
|
271
|
+
/* This variation will accept data in either row-major or column-major order.
|
|
272
|
+
The leading dimension must match with the number of columns for row major,
|
|
273
|
+
or with the number of rows for column-major (custom leading dimensions are
|
|
274
|
+
not supported). */
|
|
275
|
+
void impute(double numeric_data[], int categ_data[], bool is_col_major, size_t nrows);
|
|
276
|
+
|
|
277
|
+
/* Numeric data may be passed in sparse CSR format. Note however that it will
|
|
278
|
+
impute the values that are NAN, not the values that are omitted from the
|
|
279
|
+
sparse format. */
|
|
280
|
+
void impute(double Xr[], int Xr_ind[], int Xr_indptr[],
|
|
281
|
+
int categ_data[], bool is_col_major, size_t nrows);
|
|
282
|
+
|
|
283
|
+
void build_indexer(const bool with_distances);
|
|
284
|
+
|
|
285
|
+
/* Sets points as reference to later calculate distances or kernel from arbitrary points
|
|
286
|
+
to these ones, without having to save these reference points' original features. */
|
|
287
|
+
void set_as_reference_points(double numeric_data[], int categ_data[], bool is_col_major,
|
|
288
|
+
size_t nrows, size_t ld_numeric, size_t ld_categ,
|
|
289
|
+
const bool with_distances);
|
|
290
|
+
|
|
291
|
+
void set_as_reference_points(double Xc[], int Xc_ind[], int Xc_indptr[], int categ_data[],
|
|
292
|
+
size_t nrows, const bool with_distances);
|
|
293
|
+
|
|
294
|
+
size_t get_num_reference_points() const noexcept;
|
|
295
|
+
|
|
296
|
+
/* Must call 'set_as_reference_points' to make this method available.
|
|
297
|
+
|
|
298
|
+
Here 'dist_matrix' should have dimension [nrows, n_references],
|
|
299
|
+
and will be filled in row-major order.
|
|
300
|
+
|
|
301
|
+
This will always take 'assume_full_distr=true'. */
|
|
302
|
+
void predict_distance_to_ref_points(double numeric_data[], int categ_data[],
|
|
303
|
+
double Xc[], int Xc_ind[], int Xc_indptr[],
|
|
304
|
+
size_t nrows, bool is_col_major, size_t ld_numeric, size_t ld_categ,
|
|
305
|
+
bool as_kernel, bool standardize,
|
|
306
|
+
double dist_matrix[]);
|
|
307
|
+
|
|
308
|
+
/* Serialize (save) the model to a file. See 'isotree.hpp' for compatibility
|
|
309
|
+
details. Note that this does not save all the details of the object, but
|
|
310
|
+
rather only those that are necessary for prediction.
|
|
311
|
+
|
|
312
|
+
The file must be opened in binary write mode ('wb').
|
|
313
|
+
|
|
314
|
+
Note that models serialized through this interface are not importable in
|
|
315
|
+
the R and Python wrappers around this library. */
|
|
316
|
+
void serialize(FILE *out) const;
|
|
317
|
+
|
|
318
|
+
/* The stream must be opened in binary mode. */
|
|
319
|
+
void serialize(std::ostream &out) const;
|
|
320
|
+
|
|
321
|
+
/* The number of threads here does not mean 'how many threads to use while
|
|
322
|
+
deserializing', but rather, 'how many threads will be set for the prediction
|
|
323
|
+
functions of the resulting object'.
|
|
324
|
+
|
|
325
|
+
The input file must be opened in binary read mode ('rb').
|
|
326
|
+
|
|
327
|
+
Note that not all the members of an 'IsolationForest' object are saved
|
|
328
|
+
when serializing, so if you access members such as 'prob_pick_by_gain_avg',
|
|
329
|
+
they will all be at their default values.
|
|
330
|
+
|
|
331
|
+
These functions can de-serialize models saved from the R and Python interfaces,
|
|
332
|
+
but models that are serialized from this C++ interface are not importable in
|
|
333
|
+
those R and Python versions. */
|
|
334
|
+
static IsolationForest deserialize(FILE *inp, int nthreads);
|
|
335
|
+
|
|
336
|
+
/* The stream must be opened in binary mode. */
|
|
337
|
+
static IsolationForest deserialize(std::istream &inp, int nthreads);
|
|
338
|
+
|
|
339
|
+
/* To serialize and deserialize in a more idiomatic way
|
|
340
|
+
('stream << model' and 'stream >> model').
|
|
341
|
+
Note that 'ist >> model' will set 'nthreads=-1', which you might
|
|
342
|
+
want to modify afterwards. */
|
|
343
|
+
friend std::ostream& operator<<(std::ostream &ost, const IsolationForest &model);
|
|
344
|
+
|
|
345
|
+
friend std::istream& operator>>(std::istream &ist, IsolationForest &model);
|
|
346
|
+
|
|
347
|
+
/* These functions allow getting the underlying objects to use with the more
|
|
348
|
+
featureful non-OOP interface.
|
|
349
|
+
|
|
350
|
+
Note that it is also possible to use the C-interface functions with this
|
|
351
|
+
object by passing a pointer to the 'IsolationForest' object instead. */
|
|
352
|
+
IsoForest& get_model();
|
|
353
|
+
|
|
354
|
+
ExtIsoForest& get_model_ext();
|
|
355
|
+
|
|
356
|
+
Imputer& get_imputer();
|
|
357
|
+
|
|
358
|
+
TreesIndexer& get_indexer();
|
|
359
|
+
|
|
360
|
+
/* This converts from a negative 'nthreads' to the actual number (provided it
|
|
361
|
+
was compiled with OpenMP support), and will set to 1 if the number is invalid.
|
|
362
|
+
If the library was compiled without multi-threading and it requests more than
|
|
363
|
+
one thread, will write a message to 'stderr'. */
|
|
364
|
+
void check_nthreads();
|
|
365
|
+
|
|
366
|
+
/* This will return the number of trees in the object. If it is not fitted, will
|
|
367
|
+
throw an error instead. */
|
|
368
|
+
size_t get_ntrees() const;
|
|
369
|
+
|
|
370
|
+
/* This checks whether 'predict' can output 'tree_num' and 'per_tree_depths'. */
|
|
371
|
+
bool check_can_predict_per_tree() const;
|
|
372
|
+
|
|
373
|
+
private:
|
|
374
|
+
bool is_fitted = false;
|
|
375
|
+
|
|
376
|
+
void override_previous_fit();
|
|
377
|
+
void check_params();
|
|
378
|
+
void check_is_fitted() const;
|
|
379
|
+
IsolationForest(int nthreads, size_t ndim, size_t ntrees, bool build_imputer);
|
|
380
|
+
template <class otype>
|
|
381
|
+
void serialize_template(otype &out) const;
|
|
382
|
+
template <class itype>
|
|
383
|
+
static IsolationForest deserialize_template(itype &inp, int nthreads);
|
|
384
|
+
|
|
385
|
+
};
|
|
386
|
+
|
|
387
|
+
ISOTREE_EXPORTED
|
|
388
|
+
std::ostream& operator<<(std::ostream &ost, const IsolationForest &model);
|
|
389
|
+
ISOTREE_EXPORTED
|
|
390
|
+
std::istream& operator>>(std::istream &ist, IsolationForest &model);
|
|
391
|
+
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
#endif /* ifndef ISOTREE_OOP_H */
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
For the included robinmap library (files under "src/robinmap"):
|
|
2
|
+
|
|
3
|
+
MIT License
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2017 Thibaut Goetghebuer-Planchon <tessil@gmx.com>
|
|
6
|
+
|
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
in the Software without restriction, including without limitation the rights
|
|
10
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
furnished to do so, subject to the following conditions:
|
|
13
|
+
|
|
14
|
+
The above copyright notice and this permission notice shall be included in all
|
|
15
|
+
copies or substantial portions of the Software.
|
|
16
|
+
|
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23
|
+
SOFTWARE.
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
----------------
|
|
27
|
+
|
|
28
|
+
For the ziggurat tables (file "src/ziggurat.hpp"):
|
|
29
|
+
|
|
30
|
+
BSD License:
|
|
31
|
+
|
|
32
|
+
Copyright (c) 2005-2022, NumPy Developers.
|
|
33
|
+
|
|
34
|
+
All rights reserved.
|
|
35
|
+
|
|
36
|
+
Redistribution and use in source and binary forms, with or without
|
|
37
|
+
modification, are permitted provided that the following conditions are
|
|
38
|
+
met:
|
|
39
|
+
|
|
40
|
+
* Redistributions of source code must retain the above copyright
|
|
41
|
+
notice, this list of conditions and the following disclaimer.
|
|
42
|
+
|
|
43
|
+
* Redistributions in binary form must reproduce the above
|
|
44
|
+
copyright notice, this list of conditions and the following
|
|
45
|
+
disclaimer in the documentation and/or other materials provided
|
|
46
|
+
with the distribution.
|
|
47
|
+
|
|
48
|
+
* Neither the name of the NumPy Developers nor the names of any
|
|
49
|
+
contributors may be used to endorse or promote products derived
|
|
50
|
+
from this software without specific prior written permission.
|
|
51
|
+
|
|
52
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
53
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
54
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
55
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
56
|
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
57
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
58
|
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
59
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
60
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
61
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
62
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|