isotree 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/LICENSE.txt +2 -1
- data/README.md +57 -6
- data/ext/isotree/ext.cpp +170 -39
- data/ext/isotree/extconf.rb +3 -3
- data/lib/isotree.rb +2 -0
- data/lib/isotree/dataset.rb +73 -0
- data/lib/isotree/isolation_forest.rb +182 -35
- data/lib/isotree/version.rb +1 -1
- data/vendor/cereal/LICENSE +24 -0
- data/vendor/cereal/README.md +85 -0
- data/vendor/cereal/include/cereal/access.hpp +351 -0
- data/vendor/cereal/include/cereal/archives/adapters.hpp +163 -0
- data/vendor/cereal/include/cereal/archives/binary.hpp +169 -0
- data/vendor/cereal/include/cereal/archives/json.hpp +1019 -0
- data/vendor/cereal/include/cereal/archives/portable_binary.hpp +334 -0
- data/vendor/cereal/include/cereal/archives/xml.hpp +956 -0
- data/vendor/cereal/include/cereal/cereal.hpp +1089 -0
- data/vendor/cereal/include/cereal/details/helpers.hpp +422 -0
- data/vendor/cereal/include/cereal/details/polymorphic_impl.hpp +796 -0
- data/vendor/cereal/include/cereal/details/polymorphic_impl_fwd.hpp +65 -0
- data/vendor/cereal/include/cereal/details/static_object.hpp +127 -0
- data/vendor/cereal/include/cereal/details/traits.hpp +1411 -0
- data/vendor/cereal/include/cereal/details/util.hpp +84 -0
- data/vendor/cereal/include/cereal/external/base64.hpp +134 -0
- data/vendor/cereal/include/cereal/external/rapidjson/allocators.h +284 -0
- data/vendor/cereal/include/cereal/external/rapidjson/cursorstreamwrapper.h +78 -0
- data/vendor/cereal/include/cereal/external/rapidjson/document.h +2652 -0
- data/vendor/cereal/include/cereal/external/rapidjson/encodedstream.h +299 -0
- data/vendor/cereal/include/cereal/external/rapidjson/encodings.h +716 -0
- data/vendor/cereal/include/cereal/external/rapidjson/error/en.h +74 -0
- data/vendor/cereal/include/cereal/external/rapidjson/error/error.h +161 -0
- data/vendor/cereal/include/cereal/external/rapidjson/filereadstream.h +99 -0
- data/vendor/cereal/include/cereal/external/rapidjson/filewritestream.h +104 -0
- data/vendor/cereal/include/cereal/external/rapidjson/fwd.h +151 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/biginteger.h +290 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/diyfp.h +271 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/dtoa.h +245 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/ieee754.h +78 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/itoa.h +308 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/meta.h +186 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/pow10.h +55 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/regex.h +740 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/stack.h +232 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strfunc.h +69 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/strtod.h +290 -0
- data/vendor/cereal/include/cereal/external/rapidjson/internal/swap.h +46 -0
- data/vendor/cereal/include/cereal/external/rapidjson/istreamwrapper.h +128 -0
- data/vendor/cereal/include/cereal/external/rapidjson/memorybuffer.h +70 -0
- data/vendor/cereal/include/cereal/external/rapidjson/memorystream.h +71 -0
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/inttypes.h +316 -0
- data/vendor/cereal/include/cereal/external/rapidjson/msinttypes/stdint.h +300 -0
- data/vendor/cereal/include/cereal/external/rapidjson/ostreamwrapper.h +81 -0
- data/vendor/cereal/include/cereal/external/rapidjson/pointer.h +1414 -0
- data/vendor/cereal/include/cereal/external/rapidjson/prettywriter.h +277 -0
- data/vendor/cereal/include/cereal/external/rapidjson/rapidjson.h +656 -0
- data/vendor/cereal/include/cereal/external/rapidjson/reader.h +2230 -0
- data/vendor/cereal/include/cereal/external/rapidjson/schema.h +2497 -0
- data/vendor/cereal/include/cereal/external/rapidjson/stream.h +223 -0
- data/vendor/cereal/include/cereal/external/rapidjson/stringbuffer.h +121 -0
- data/vendor/cereal/include/cereal/external/rapidjson/writer.h +709 -0
- data/vendor/cereal/include/cereal/external/rapidxml/license.txt +52 -0
- data/vendor/cereal/include/cereal/external/rapidxml/manual.html +406 -0
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml.hpp +2624 -0
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_iterators.hpp +175 -0
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_print.hpp +428 -0
- data/vendor/cereal/include/cereal/external/rapidxml/rapidxml_utils.hpp +123 -0
- data/vendor/cereal/include/cereal/macros.hpp +154 -0
- data/vendor/cereal/include/cereal/specialize.hpp +139 -0
- data/vendor/cereal/include/cereal/types/array.hpp +79 -0
- data/vendor/cereal/include/cereal/types/atomic.hpp +55 -0
- data/vendor/cereal/include/cereal/types/base_class.hpp +203 -0
- data/vendor/cereal/include/cereal/types/bitset.hpp +176 -0
- data/vendor/cereal/include/cereal/types/boost_variant.hpp +164 -0
- data/vendor/cereal/include/cereal/types/chrono.hpp +72 -0
- data/vendor/cereal/include/cereal/types/common.hpp +129 -0
- data/vendor/cereal/include/cereal/types/complex.hpp +56 -0
- data/vendor/cereal/include/cereal/types/concepts/pair_associative_container.hpp +73 -0
- data/vendor/cereal/include/cereal/types/deque.hpp +62 -0
- data/vendor/cereal/include/cereal/types/forward_list.hpp +68 -0
- data/vendor/cereal/include/cereal/types/functional.hpp +43 -0
- data/vendor/cereal/include/cereal/types/list.hpp +62 -0
- data/vendor/cereal/include/cereal/types/map.hpp +36 -0
- data/vendor/cereal/include/cereal/types/memory.hpp +425 -0
- data/vendor/cereal/include/cereal/types/optional.hpp +66 -0
- data/vendor/cereal/include/cereal/types/polymorphic.hpp +483 -0
- data/vendor/cereal/include/cereal/types/queue.hpp +132 -0
- data/vendor/cereal/include/cereal/types/set.hpp +103 -0
- data/vendor/cereal/include/cereal/types/stack.hpp +76 -0
- data/vendor/cereal/include/cereal/types/string.hpp +61 -0
- data/vendor/cereal/include/cereal/types/tuple.hpp +123 -0
- data/vendor/cereal/include/cereal/types/unordered_map.hpp +36 -0
- data/vendor/cereal/include/cereal/types/unordered_set.hpp +99 -0
- data/vendor/cereal/include/cereal/types/utility.hpp +47 -0
- data/vendor/cereal/include/cereal/types/valarray.hpp +89 -0
- data/vendor/cereal/include/cereal/types/variant.hpp +109 -0
- data/vendor/cereal/include/cereal/types/vector.hpp +112 -0
- data/vendor/cereal/include/cereal/version.hpp +52 -0
- data/vendor/isotree/LICENSE +1 -1
- data/vendor/isotree/README.md +7 -2
- data/vendor/isotree/src/RcppExports.cpp +44 -4
- data/vendor/isotree/src/Rwrapper.cpp +141 -51
- data/vendor/isotree/src/crit.cpp +1 -1
- data/vendor/isotree/src/dealloc.cpp +1 -1
- data/vendor/isotree/src/dist.cpp +6 -6
- data/vendor/isotree/src/extended.cpp +5 -5
- data/vendor/isotree/src/fit_model.cpp +27 -5
- data/vendor/isotree/src/helpers_iforest.cpp +26 -11
- data/vendor/isotree/src/impute.cpp +7 -7
- data/vendor/isotree/src/isoforest.cpp +7 -7
- data/vendor/isotree/src/isotree.hpp +27 -5
- data/vendor/isotree/src/merge_models.cpp +1 -1
- data/vendor/isotree/src/mult.cpp +1 -1
- data/vendor/isotree/src/predict.cpp +20 -16
- data/vendor/isotree/src/serialize.cpp +1 -1
- data/vendor/isotree/src/sql.cpp +545 -0
- data/vendor/isotree/src/utils.cpp +36 -44
- metadata +102 -81
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ffdc3a4283698bfc43c5563bd1c8cb4dd41881ec0a9920bfbe91f01fcbe0e822
|
|
4
|
+
data.tar.gz: 83cc4a38f6640fd5a37c2a6aabe0cba12e198bdbd7b010cabb34f8d36073a74b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b6642570d6330fc2d72b210a040985dfa025d5d0deed622d04533fe9bf830c6cd3dda1ace94ace3d034c8302aa360e5ef6eeda576b48340db4d6c95016b8979c
|
|
7
|
+
data.tar.gz: df0716c317e01bd174157e5c1b09e4f4e566f07512ea3861324b8fa9659a416c6ef1319580d2ee8a7231c74f74b9a9e5bdc9770a5adf0d208b82a22db1a9b29d
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,28 @@
|
|
|
1
|
+
## 0.2.1 (2021-05-23)
|
|
2
|
+
|
|
3
|
+
- Improved performance
|
|
4
|
+
|
|
5
|
+
## 0.2.0 (2021-05-17)
|
|
6
|
+
|
|
7
|
+
- Updated to Rice 4
|
|
8
|
+
- Dropped support for Ruby < 2.6
|
|
9
|
+
|
|
10
|
+
## 0.1.5 (2021-03-14)
|
|
11
|
+
|
|
12
|
+
- Updated Isotree to 0.1.25
|
|
13
|
+
- Added support for exporting and importing models
|
|
14
|
+
|
|
15
|
+
## 0.1.4 (2020-08-22)
|
|
16
|
+
|
|
17
|
+
- Added `missing_action`, `new_categ_action`, `categ_split_type`, `coefs`, `depth_imp`, and `weigh_imp_rows` options
|
|
18
|
+
- Fixed signal handling
|
|
19
|
+
|
|
20
|
+
## 0.1.3 (2020-08-13)
|
|
21
|
+
|
|
22
|
+
- Added support for categorical data
|
|
23
|
+
- Added support for Rover data frames
|
|
24
|
+
- Added `output` option to `predict` method
|
|
25
|
+
|
|
1
26
|
## 0.1.2 (2020-08-11)
|
|
2
27
|
|
|
3
28
|
- Fixed outlier scores
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
|
@@ -4,7 +4,9 @@
|
|
|
4
4
|
|
|
5
5
|
Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
|
|
6
6
|
|
|
7
|
-
[
|
|
7
|
+
:deciduous_tree: Check out [OutlierTree](https://github.com/ankane/outliertree) for human-readable explanations of outliers
|
|
8
|
+
|
|
9
|
+
[![Build Status](https://github.com/ankane/isotree/workflows/build/badge.svg?branch=master)](https://github.com/ankane/isotree/actions)
|
|
8
10
|
|
|
9
11
|
## Installation
|
|
10
12
|
|
|
@@ -19,24 +21,40 @@ gem 'isotree'
|
|
|
19
21
|
Prep your data
|
|
20
22
|
|
|
21
23
|
```ruby
|
|
22
|
-
|
|
24
|
+
data = [
|
|
25
|
+
{department: "Books", sale: false, price: 2.50},
|
|
26
|
+
{department: "Books", sale: true, price: 3.00},
|
|
27
|
+
{department: "Movies", sale: false, price: 5.00}
|
|
28
|
+
]
|
|
23
29
|
```
|
|
24
30
|
|
|
25
31
|
Train a model
|
|
26
32
|
|
|
27
33
|
```ruby
|
|
28
34
|
model = IsoTree::IsolationForest.new
|
|
29
|
-
model.fit(
|
|
35
|
+
model.fit(data)
|
|
30
36
|
```
|
|
31
37
|
|
|
32
38
|
Get outlier scores
|
|
33
39
|
|
|
34
40
|
```ruby
|
|
35
|
-
model.predict(
|
|
41
|
+
model.predict(data)
|
|
36
42
|
```
|
|
37
43
|
|
|
38
44
|
Scores are between 0 and 1, with higher scores indicating outliers
|
|
39
45
|
|
|
46
|
+
Export the model
|
|
47
|
+
|
|
48
|
+
```ruby
|
|
49
|
+
model.export_model("model.bin")
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Import a model
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
model = IsoTree::IsolationForest.import_model("model.bin")
|
|
56
|
+
```
|
|
57
|
+
|
|
40
58
|
## Parameters
|
|
41
59
|
|
|
42
60
|
Pass parameters - default values below
|
|
@@ -52,12 +70,18 @@ IsoTree::IsolationForest.new(
|
|
|
52
70
|
prob_split_avg_gain: 0,
|
|
53
71
|
prob_split_pooled_gain: 0,
|
|
54
72
|
min_gain: 0,
|
|
73
|
+
missing_action: "impute",
|
|
74
|
+
new_categ_action: "smallest",
|
|
75
|
+
categ_split_type: "subset",
|
|
55
76
|
all_perm: false,
|
|
56
77
|
coef_by_prop: false,
|
|
57
78
|
sample_with_replacement: false,
|
|
58
79
|
penalize_range: true,
|
|
59
80
|
weigh_by_kurtosis: false,
|
|
81
|
+
coefs: "normal",
|
|
60
82
|
min_imp_obs: 3,
|
|
83
|
+
depth_imp: "higher",
|
|
84
|
+
weigh_imp_rows: "inverse",
|
|
61
85
|
random_seed: 1,
|
|
62
86
|
nthreads: -1
|
|
63
87
|
)
|
|
@@ -67,10 +91,20 @@ See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.I
|
|
|
67
91
|
|
|
68
92
|
## Data
|
|
69
93
|
|
|
70
|
-
Data can be an array of
|
|
94
|
+
Data can be an array of hashes
|
|
71
95
|
|
|
72
96
|
```ruby
|
|
73
|
-
[
|
|
97
|
+
[
|
|
98
|
+
{department: "Books", sale: false, price: 2.50},
|
|
99
|
+
{department: "Books", sale: true, price: 3.00},
|
|
100
|
+
{department: "Movies", sale: false, price: 5.00}
|
|
101
|
+
]
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Or a Rover data frame
|
|
105
|
+
|
|
106
|
+
```ruby
|
|
107
|
+
Rover.read_csv("data.csv")
|
|
74
108
|
```
|
|
75
109
|
|
|
76
110
|
Or a Numo array
|
|
@@ -94,6 +128,23 @@ gem uninstall isotree --force
|
|
|
94
128
|
bundle install
|
|
95
129
|
```
|
|
96
130
|
|
|
131
|
+
## Deployment
|
|
132
|
+
|
|
133
|
+
Check out [Trove](https://github.com/ankane/trove) for deploying models.
|
|
134
|
+
|
|
135
|
+
```sh
|
|
136
|
+
trove push model.bin
|
|
137
|
+
trove push model.bin.metadata
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Reference
|
|
141
|
+
|
|
142
|
+
Get the average isolation depth
|
|
143
|
+
|
|
144
|
+
```ruby
|
|
145
|
+
model.predict(data, output: "avg_depth")
|
|
146
|
+
```
|
|
147
|
+
|
|
97
148
|
## History
|
|
98
149
|
|
|
99
150
|
View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
|
data/ext/isotree/ext.cpp
CHANGED
|
@@ -2,20 +2,102 @@
|
|
|
2
2
|
#include <isotree.hpp>
|
|
3
3
|
|
|
4
4
|
// rice
|
|
5
|
-
#include <rice/
|
|
6
|
-
#include <rice/Hash.hpp>
|
|
7
|
-
#include <rice/Module.hpp>
|
|
8
|
-
#include <rice/String.hpp>
|
|
9
|
-
#include <rice/Symbol.hpp>
|
|
5
|
+
#include <rice/rice.hpp>
|
|
10
6
|
|
|
11
7
|
using Rice::Array;
|
|
12
8
|
using Rice::Hash;
|
|
13
9
|
using Rice::Module;
|
|
10
|
+
using Rice::Object;
|
|
14
11
|
using Rice::String;
|
|
15
12
|
using Rice::Symbol;
|
|
16
13
|
using Rice::define_class_under;
|
|
17
14
|
using Rice::define_module;
|
|
18
15
|
|
|
16
|
+
namespace Rice::detail
|
|
17
|
+
{
|
|
18
|
+
template<>
|
|
19
|
+
class From_Ruby<NewCategAction>
|
|
20
|
+
{
|
|
21
|
+
public:
|
|
22
|
+
NewCategAction convert(VALUE x)
|
|
23
|
+
{
|
|
24
|
+
auto value = Object(x).to_s().str();
|
|
25
|
+
if (value == "weighted") return Weighted;
|
|
26
|
+
if (value == "smallest") return Smallest;
|
|
27
|
+
if (value == "random") return Random;
|
|
28
|
+
throw std::runtime_error("Unknown new categ action: " + value);
|
|
29
|
+
}
|
|
30
|
+
};
|
|
31
|
+
|
|
32
|
+
template<>
|
|
33
|
+
class From_Ruby<MissingAction>
|
|
34
|
+
{
|
|
35
|
+
public:
|
|
36
|
+
MissingAction convert(VALUE x)
|
|
37
|
+
{
|
|
38
|
+
auto value = Object(x).to_s().str();
|
|
39
|
+
if (value == "divide") return Divide;
|
|
40
|
+
if (value == "impute") return Impute;
|
|
41
|
+
if (value == "fail") return Fail;
|
|
42
|
+
throw std::runtime_error("Unknown missing action: " + value);
|
|
43
|
+
}
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
template<>
|
|
47
|
+
class From_Ruby<CategSplit>
|
|
48
|
+
{
|
|
49
|
+
public:
|
|
50
|
+
CategSplit convert(VALUE x)
|
|
51
|
+
{
|
|
52
|
+
auto value = Object(x).to_s().str();
|
|
53
|
+
if (value == "subset") return SubSet;
|
|
54
|
+
if (value == "single_categ") return SingleCateg;
|
|
55
|
+
throw std::runtime_error("Unknown categ split: " + value);
|
|
56
|
+
}
|
|
57
|
+
};
|
|
58
|
+
|
|
59
|
+
template<>
|
|
60
|
+
class From_Ruby<CoefType>
|
|
61
|
+
{
|
|
62
|
+
public:
|
|
63
|
+
CoefType convert(VALUE x)
|
|
64
|
+
{
|
|
65
|
+
auto value = Object(x).to_s().str();
|
|
66
|
+
if (value == "uniform") return Uniform;
|
|
67
|
+
if (value == "normal") return Normal;
|
|
68
|
+
throw std::runtime_error("Unknown coef type: " + value);
|
|
69
|
+
}
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
template<>
|
|
73
|
+
class From_Ruby<UseDepthImp>
|
|
74
|
+
{
|
|
75
|
+
public:
|
|
76
|
+
UseDepthImp convert(VALUE x)
|
|
77
|
+
{
|
|
78
|
+
auto value = Object(x).to_s().str();
|
|
79
|
+
if (value == "lower") return Lower;
|
|
80
|
+
if (value == "higher") return Higher;
|
|
81
|
+
if (value == "same") return Same;
|
|
82
|
+
throw std::runtime_error("Unknown depth imp: " + value);
|
|
83
|
+
}
|
|
84
|
+
};
|
|
85
|
+
|
|
86
|
+
template<>
|
|
87
|
+
class From_Ruby<WeighImpRows>
|
|
88
|
+
{
|
|
89
|
+
public:
|
|
90
|
+
WeighImpRows convert(VALUE x)
|
|
91
|
+
{
|
|
92
|
+
auto value = Object(x).to_s().str();
|
|
93
|
+
if (value == "inverse") return Inverse;
|
|
94
|
+
if (value == "prop") return Prop;
|
|
95
|
+
if (value == "flat") return Flat;
|
|
96
|
+
throw std::runtime_error("Unknown weight imp rows: " + value);
|
|
97
|
+
}
|
|
98
|
+
};
|
|
99
|
+
}
|
|
100
|
+
|
|
19
101
|
extern "C"
|
|
20
102
|
void Init_ext()
|
|
21
103
|
{
|
|
@@ -25,44 +107,36 @@ void Init_ext()
|
|
|
25
107
|
define_class_under<ExtIsoForest>(rb_mExt, "ExtIsoForest");
|
|
26
108
|
|
|
27
109
|
rb_mExt
|
|
28
|
-
.
|
|
110
|
+
.define_singleton_function(
|
|
29
111
|
"fit_iforest",
|
|
30
|
-
|
|
112
|
+
[](Hash options) {
|
|
31
113
|
// model
|
|
32
114
|
ExtIsoForest iso;
|
|
33
115
|
|
|
34
116
|
// data
|
|
35
117
|
size_t nrows = options.get<size_t, Symbol>("nrows");
|
|
36
|
-
size_t
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
118
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
|
119
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
|
120
|
+
|
|
121
|
+
double *restrict numeric_data = NULL;
|
|
122
|
+
if (ncols_numeric > 0) {
|
|
123
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
int *restrict categorical_data = NULL;
|
|
127
|
+
int *restrict ncat = NULL;
|
|
128
|
+
if (ncols_categ > 0) {
|
|
129
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
|
130
|
+
ncat = (int*) options.get<String, Symbol>("ncat").c_str();
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// not used (sparse matrices)
|
|
42
134
|
double* Xc = NULL;
|
|
43
135
|
sparse_ix* Xc_ind = NULL;
|
|
44
136
|
sparse_ix* Xc_indptr = NULL;
|
|
45
137
|
|
|
46
138
|
// options
|
|
47
|
-
|
|
48
|
-
double* sample_weights = NULL;
|
|
49
|
-
bool weight_as_sample = false;
|
|
50
|
-
size_t max_depth = 0;
|
|
51
|
-
bool limit_depth = true;
|
|
52
|
-
bool standardize_dist = false;
|
|
53
|
-
double* tmat = NULL;
|
|
54
|
-
double* output_depths = NULL;
|
|
55
|
-
bool standardize_depth = false;
|
|
56
|
-
double* col_weights = NULL;
|
|
57
|
-
MissingAction missing_action = Impute;
|
|
58
|
-
CategSplit cat_split_type = SubSet;
|
|
59
|
-
NewCategAction new_cat_action = Smallest;
|
|
60
|
-
Imputer *imputer = NULL;
|
|
61
|
-
UseDepthImp depth_imp = Higher;
|
|
62
|
-
WeighImpRows weigh_imp_rows = Inverse;
|
|
63
|
-
bool impute_at_fit = false;
|
|
64
|
-
|
|
65
|
-
// Rice has limit of 14 arguments, so use hash for options
|
|
139
|
+
// Rice has limit of 14 arguments, so use hash
|
|
66
140
|
size_t sample_size = options.get<size_t, Symbol>("sample_size");
|
|
67
141
|
size_t ndim = options.get<size_t, Symbol>("ndim");
|
|
68
142
|
size_t ntrees = options.get<size_t, Symbol>("ntrees");
|
|
@@ -72,21 +146,41 @@ void Init_ext()
|
|
|
72
146
|
double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
|
|
73
147
|
double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
|
|
74
148
|
double min_gain = options.get<double, Symbol>("min_gain");
|
|
149
|
+
MissingAction missing_action = options.get<MissingAction, Symbol>("missing_action");
|
|
150
|
+
CategSplit cat_split_type = options.get<CategSplit, Symbol>("categ_split_type");
|
|
151
|
+
NewCategAction new_cat_action = options.get<NewCategAction, Symbol>("new_categ_action");
|
|
75
152
|
bool all_perm = options.get<bool, Symbol>("all_perm");
|
|
76
153
|
bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
|
|
77
154
|
bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
|
|
78
155
|
bool penalize_range = options.get<bool, Symbol>("penalize_range");
|
|
79
156
|
bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
|
|
157
|
+
CoefType coef_type = options.get<CoefType, Symbol>("coefs");
|
|
80
158
|
size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
|
|
159
|
+
UseDepthImp depth_imp = options.get<UseDepthImp, Symbol>("depth_imp");
|
|
160
|
+
WeighImpRows weigh_imp_rows = options.get<WeighImpRows, Symbol>("weigh_imp_rows");
|
|
81
161
|
uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
|
|
82
162
|
int nthreads = options.get<int, Symbol>("nthreads");
|
|
83
163
|
|
|
164
|
+
// TODO options
|
|
165
|
+
double* sample_weights = NULL;
|
|
166
|
+
bool weight_as_sample = false;
|
|
167
|
+
size_t max_depth = 0;
|
|
168
|
+
bool limit_depth = true;
|
|
169
|
+
bool standardize_dist = false;
|
|
170
|
+
double* tmat = NULL;
|
|
171
|
+
double* output_depths = NULL;
|
|
172
|
+
bool standardize_depth = false;
|
|
173
|
+
double* col_weights = NULL;
|
|
174
|
+
Imputer *imputer = NULL;
|
|
175
|
+
bool impute_at_fit = false;
|
|
176
|
+
bool handle_interrupt = false;
|
|
177
|
+
|
|
84
178
|
fit_iforest(
|
|
85
179
|
NULL,
|
|
86
180
|
&iso,
|
|
87
181
|
numeric_data,
|
|
88
182
|
ncols_numeric,
|
|
89
|
-
|
|
183
|
+
categorical_data,
|
|
90
184
|
ncols_categ,
|
|
91
185
|
ncat,
|
|
92
186
|
Xc,
|
|
@@ -126,18 +220,31 @@ void Init_ext()
|
|
|
126
220
|
weigh_imp_rows,
|
|
127
221
|
impute_at_fit,
|
|
128
222
|
random_seed,
|
|
223
|
+
handle_interrupt,
|
|
129
224
|
nthreads
|
|
130
225
|
);
|
|
131
226
|
|
|
132
227
|
return iso;
|
|
133
228
|
})
|
|
134
|
-
.
|
|
229
|
+
.define_singleton_function(
|
|
135
230
|
"predict_iforest",
|
|
136
|
-
|
|
231
|
+
[](ExtIsoForest& iso, Hash options) {
|
|
137
232
|
// data
|
|
138
233
|
size_t nrows = options.get<size_t, Symbol>("nrows");
|
|
139
|
-
|
|
140
|
-
|
|
234
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
|
235
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
|
236
|
+
|
|
237
|
+
double *restrict numeric_data = NULL;
|
|
238
|
+
if (ncols_numeric > 0) {
|
|
239
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
int *restrict categorical_data = NULL;
|
|
243
|
+
if (ncols_categ > 0) {
|
|
244
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// not used (sparse matrices)
|
|
141
248
|
double* Xc = NULL;
|
|
142
249
|
sparse_ix* Xc_ind = NULL;
|
|
143
250
|
sparse_ix* Xc_indptr = NULL;
|
|
@@ -147,13 +254,13 @@ void Init_ext()
|
|
|
147
254
|
|
|
148
255
|
// options
|
|
149
256
|
int nthreads = options.get<int, Symbol>("nthreads");
|
|
150
|
-
bool standardize =
|
|
257
|
+
bool standardize = options.get<bool, Symbol>("standardize");
|
|
151
258
|
std::vector<double> outlier_scores(nrows);
|
|
152
259
|
sparse_ix* tree_num = NULL;
|
|
153
260
|
|
|
154
261
|
predict_iforest(
|
|
155
262
|
numeric_data,
|
|
156
|
-
|
|
263
|
+
categorical_data,
|
|
157
264
|
Xc,
|
|
158
265
|
Xc_ind,
|
|
159
266
|
Xc_indptr,
|
|
@@ -174,5 +281,29 @@ void Init_ext()
|
|
|
174
281
|
ret.push(outlier_scores[i]);
|
|
175
282
|
}
|
|
176
283
|
return ret;
|
|
284
|
+
})
|
|
285
|
+
.define_singleton_function(
|
|
286
|
+
"serialize_ext_isoforest",
|
|
287
|
+
[](ExtIsoForest& iso, String path) {
|
|
288
|
+
#ifdef _MSC_VER
|
|
289
|
+
// TODO convert to wchar_t
|
|
290
|
+
throw std::runtime_error("Not supported on Windows yet");
|
|
291
|
+
#else
|
|
292
|
+
serialize_ext_isoforest(iso, path.c_str());
|
|
293
|
+
#endif
|
|
294
|
+
})
|
|
295
|
+
.define_singleton_function(
|
|
296
|
+
"deserialize_ext_isoforest",
|
|
297
|
+
[](String path) {
|
|
298
|
+
ExtIsoForest iso;
|
|
299
|
+
|
|
300
|
+
#ifdef _MSC_VER
|
|
301
|
+
// TODO convert to wchar_t
|
|
302
|
+
throw std::runtime_error("Not supported on Windows yet");
|
|
303
|
+
#else
|
|
304
|
+
deserialize_ext_isoforest(iso, path.c_str());
|
|
305
|
+
#endif
|
|
306
|
+
|
|
307
|
+
return iso;
|
|
177
308
|
});
|
|
178
309
|
}
|