anomaly_detection 0.1.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +16 -2
- data/ext/anomaly_detection/anomaly_detection.hpp +193 -2
- data/ext/anomaly_detection/dist.h +190 -0
- data/ext/anomaly_detection/ext.cpp +9 -3
- data/lib/anomaly_detection/version.rb +1 -1
- data/lib/anomaly_detection.rb +65 -11
- data/licenses/LICENSE-AnomalyDetection-cpp.txt +675 -0
- data/licenses/LICENSE-MIT-dist-h.txt +21 -0
- data/licenses/UNLICENSE-dist-h.txt +24 -0
- metadata +7 -7
- data/ext/anomaly_detection/anomaly_detection.cpp +0 -153
- data/ext/anomaly_detection/cdflib.cpp +0 -12126
- data/ext/anomaly_detection/cdflib.hpp +0 -123
- data/licenses/LICENSE-cdflib.txt +0 -165
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c60bb6d75cb8523ecd0926f391d79413a1cb2eb131cd579fd381bb6683f82da3
|
|
4
|
+
data.tar.gz: '01594d0f0a97ad8cbb7b0b50cb30894bd0d773d4db45b3158345567ce1732efb'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fe09cc140a5d6543f3b00983a754861f6a3a3a436f8a8afecc80d202f1112bb6ea180df794072ee4711c044508a559a015c885cfd61d2c5be9378fc7b6590d96
|
|
7
|
+
data.tar.gz: 5616e6075888b4521355e6c0fb33f7a94361c971c7f58f9ae6a61e5d8529a3e1938deba10be22c24ec5849f1909f48cba07b97fd1e6ea8b47ae4f66626eb703e
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
|
@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
|
|
|
11
11
|
Add this line to your application’s Gemfile:
|
|
12
12
|
|
|
13
13
|
```ruby
|
|
14
|
-
gem
|
|
14
|
+
gem "anomaly_detection"
|
|
15
15
|
```
|
|
16
16
|
|
|
17
17
|
## Getting Started
|
|
@@ -58,9 +58,23 @@ AnomalyDetection.detect(
|
|
|
58
58
|
)
|
|
59
59
|
```
|
|
60
60
|
|
|
61
|
+
## Plotting
|
|
62
|
+
|
|
63
|
+
Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
|
|
64
|
+
|
|
65
|
+
```ruby
|
|
66
|
+
gem "vega"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
And use:
|
|
70
|
+
|
|
71
|
+
```ruby
|
|
72
|
+
AnomalyDetection.plot(series, anomalies)
|
|
73
|
+
```
|
|
74
|
+
|
|
61
75
|
## Credits
|
|
62
76
|
|
|
63
|
-
This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [
|
|
77
|
+
This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [stl-cpp](https://github.com/ankane/stl-cpp) for seasonal-trend decomposition and [dist.h](https://github.com/ankane/dist.h) for the quantile function.
|
|
64
78
|
|
|
65
79
|
## References
|
|
66
80
|
|
|
@@ -1,12 +1,203 @@
|
|
|
1
|
+
/*!
|
|
2
|
+
* AnomalyDetection.cpp v0.1.0
|
|
3
|
+
* https://github.com/ankane/AnomalyDetection.cpp
|
|
4
|
+
* GPL-3.0-or-later License
|
|
5
|
+
*/
|
|
6
|
+
|
|
1
7
|
#pragma once
|
|
2
8
|
|
|
3
|
-
#include <
|
|
9
|
+
#include <functional>
|
|
10
|
+
#include <iostream>
|
|
11
|
+
#include <iterator>
|
|
12
|
+
#include <numeric>
|
|
4
13
|
#include <vector>
|
|
5
14
|
|
|
15
|
+
#include "dist.h"
|
|
16
|
+
#include "stl.hpp"
|
|
17
|
+
|
|
6
18
|
namespace anomaly_detection {
|
|
7
19
|
|
|
8
20
|
// Which side(s) of the distribution are tested for anomalies.
// AnomalyDetectionParams::fit treats Both as a two-tailed test and
// Positive / Negative as one-tailed tests (Positive selects the upper tail).
enum Direction { Positive, Negative, Both };
|
|
9
21
|
|
|
10
|
-
|
|
22
|
+
// Returns the median of a vector that is already sorted in ascending order.
//
// With an odd number of elements both indices below name the middle element;
// with an even number they name the two middle elements, whose mean is
// returned.
//
// `sorted` must be non-empty: for an empty vector the index arithmetic would
// read out of bounds.
//
// Marked `inline` because the definition lives in a header; without it,
// including this header from more than one translation unit is a
// multiple-definition link error.
inline float median_sorted(const std::vector<float>& sorted) {
  const auto count = sorted.size();
  const float lower = sorted[(count - 1) / 2];
  const float upper = sorted[count / 2];
  return (lower + upper) / 2.0;
}
|
|
25
|
+
|
|
26
|
+
float median(const std::vector<float>& data) {
|
|
27
|
+
std::vector<float> sorted(data);
|
|
28
|
+
std::sort(sorted.begin(), sorted.end());
|
|
29
|
+
return median_sorted(sorted);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
float mad(const std::vector<float>& data, float med) {
|
|
33
|
+
std::vector<float> res;
|
|
34
|
+
res.reserve(data.size());
|
|
35
|
+
for (auto v : data) {
|
|
36
|
+
res.push_back(fabs(v - med));
|
|
37
|
+
}
|
|
38
|
+
std::sort(res.begin(), res.end());
|
|
39
|
+
return 1.4826 * median_sorted(res);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
|
|
43
|
+
auto n = data.size();
|
|
44
|
+
|
|
45
|
+
// Check to make sure we have at least two periods worth of data for anomaly context
|
|
46
|
+
if (n < num_obs_per_period * 2) {
|
|
47
|
+
throw std::invalid_argument("series must contain at least 2 periods");
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Handle NANs
|
|
51
|
+
auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
|
|
52
|
+
if (nan > 0) {
|
|
53
|
+
throw std::invalid_argument("series contains NANs");
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
|
|
57
|
+
auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
|
|
58
|
+
auto seasonal = data_decomp.seasonal;
|
|
59
|
+
|
|
60
|
+
std::vector<float> data2;
|
|
61
|
+
data2.reserve(n);
|
|
62
|
+
auto med = median(data);
|
|
63
|
+
for (auto i = 0; i < n; i++) {
|
|
64
|
+
data2.push_back(data[i] - seasonal[i] - med);
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
auto num_anoms = 0;
|
|
68
|
+
auto max_outliers = (size_t) n * k;
|
|
69
|
+
std::vector<size_t> anomalies;
|
|
70
|
+
anomalies.reserve(max_outliers);
|
|
71
|
+
|
|
72
|
+
// Sort data for fast median
|
|
73
|
+
// Use stable sort for indexes for deterministic results
|
|
74
|
+
std::vector<size_t> indexes(n);
|
|
75
|
+
std::iota(indexes.begin(), indexes.end(), 0);
|
|
76
|
+
std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
|
|
77
|
+
std::sort(data2.begin(), data2.end());
|
|
78
|
+
|
|
79
|
+
// Compute test statistic until r=max_outliers values have been removed from the sample
|
|
80
|
+
for (auto i = 1; i <= max_outliers; i++) {
|
|
81
|
+
if (verbose) {
|
|
82
|
+
std::cout << i << " / " << max_outliers << " completed" << std::endl;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// TODO Improve performance between loop iterations
|
|
86
|
+
auto ma = median_sorted(data2);
|
|
87
|
+
std::vector<float> ares;
|
|
88
|
+
ares.reserve(data2.size());
|
|
89
|
+
if (one_tail) {
|
|
90
|
+
if (upper_tail) {
|
|
91
|
+
for (auto v : data2) {
|
|
92
|
+
ares.push_back(v - ma);
|
|
93
|
+
}
|
|
94
|
+
} else {
|
|
95
|
+
for (auto v : data2) {
|
|
96
|
+
ares.push_back(ma - v);
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
} else {
|
|
100
|
+
for (auto v : data2) {
|
|
101
|
+
ares.push_back(fabs(v - ma));
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
// Protect against constant time series
|
|
106
|
+
auto data_sigma = mad(data2, ma);
|
|
107
|
+
if (data_sigma == 0.0) {
|
|
108
|
+
break;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
auto iter = std::max_element(ares.begin(), ares.end());
|
|
112
|
+
auto r_idx_i = std::distance(ares.begin(), iter);
|
|
113
|
+
|
|
114
|
+
// Only need to take sigma of r for performance
|
|
115
|
+
auto r = ares[r_idx_i] / data_sigma;
|
|
116
|
+
|
|
117
|
+
anomalies.push_back(indexes[r_idx_i]);
|
|
118
|
+
data2.erase(data2.begin() + r_idx_i);
|
|
119
|
+
indexes.erase(indexes.begin() + r_idx_i);
|
|
120
|
+
|
|
121
|
+
// Compute critical value
|
|
122
|
+
float p;
|
|
123
|
+
if (one_tail) {
|
|
124
|
+
p = 1.0 - alpha / (n - i + 1);
|
|
125
|
+
} else {
|
|
126
|
+
p = 1.0 - alpha / (2.0 * (n - i + 1));
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
auto t = students_t_ppf(p, n - i - 1);
|
|
130
|
+
auto lam = t * (n - i) / sqrt(((n - i - 1) + t * t) * (n - i + 1));
|
|
131
|
+
|
|
132
|
+
if (r > lam) {
|
|
133
|
+
num_anoms = i;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
if (callback != nullptr) {
|
|
137
|
+
callback();
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
anomalies.resize(num_anoms);
|
|
142
|
+
|
|
143
|
+
// Sort like R version
|
|
144
|
+
std::sort(anomalies.begin(), anomalies.end());
|
|
145
|
+
|
|
146
|
+
return anomalies;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
// Result object produced by AnomalyDetectionParams::fit.
class AnomalyDetectionResult {
 public:
  // Positions of the detected anomalies within the fitted series.
  std::vector<size_t> anomalies;
};
|
|
153
|
+
|
|
154
|
+
class AnomalyDetectionParams {
|
|
155
|
+
float alpha_ = 0.05;
|
|
156
|
+
float max_anoms_ = 0.1;
|
|
157
|
+
Direction direction_ = Direction::Both;
|
|
158
|
+
bool verbose_ = false;
|
|
159
|
+
std::function<void()> callback_ = nullptr;
|
|
160
|
+
|
|
161
|
+
public:
|
|
162
|
+
inline AnomalyDetectionParams alpha(float alpha) {
|
|
163
|
+
this->alpha_ = alpha;
|
|
164
|
+
return *this;
|
|
165
|
+
};
|
|
166
|
+
|
|
167
|
+
inline AnomalyDetectionParams max_anoms(float max_anoms) {
|
|
168
|
+
this->max_anoms_ = max_anoms;
|
|
169
|
+
return *this;
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
inline AnomalyDetectionParams direction(Direction direction) {
|
|
173
|
+
this->direction_ = direction;
|
|
174
|
+
return *this;
|
|
175
|
+
};
|
|
176
|
+
|
|
177
|
+
inline AnomalyDetectionParams verbose(bool verbose) {
|
|
178
|
+
this->verbose_ = verbose;
|
|
179
|
+
return *this;
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
inline AnomalyDetectionParams callback(std::function<void()> callback) {
|
|
183
|
+
this->callback_ = callback;
|
|
184
|
+
return *this;
|
|
185
|
+
};
|
|
186
|
+
|
|
187
|
+
AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
|
|
188
|
+
};
|
|
189
|
+
|
|
190
|
+
// Entry point for the chainable API: returns a parameter object holding the
// defaults declared in AnomalyDetectionParams.
//
// Marked `inline` because the definition lives in a header; without it,
// including this header from more than one translation unit is a
// multiple-definition link error.
inline AnomalyDetectionParams params() {
  return AnomalyDetectionParams{};
}
|
|
193
|
+
|
|
194
|
+
// Runs anomaly detection on `series` with the stored settings.
//
// `period` is the number of observations per seasonal period. The stored
// direction is translated into the two flags detect_anoms expects:
// anything other than Both is a one-tailed test, and Positive selects the
// upper tail. Exceptions thrown by detect_anoms propagate to the caller.
//
// Marked `inline` because this out-of-class member definition lives in a
// header; without it, including the header from more than one translation
// unit is a multiple-definition link error.
inline AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
  const bool one_tail = direction_ != Direction::Both;
  const bool upper_tail = direction_ == Direction::Positive;

  AnomalyDetectionResult res;
  res.anomalies = detect_anoms(series, period, max_anoms_, alpha_, one_tail, upper_tail, verbose_, callback_);
  return res;
}
|
|
11
202
|
|
|
12
203
|
}
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
/*!
|
|
2
|
+
* dist.h v0.1.1
|
|
3
|
+
* https://github.com/ankane/dist.h
|
|
4
|
+
* Unlicense OR MIT License
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
#pragma once
|
|
8
|
+
|
|
9
|
+
#include <assert.h>
|
|
10
|
+
#include <math.h>
|
|
11
|
+
|
|
12
|
+
#ifdef M_E
|
|
13
|
+
#define DIST_E M_E
|
|
14
|
+
#else
|
|
15
|
+
#define DIST_E 2.71828182845904523536
|
|
16
|
+
#endif
|
|
17
|
+
|
|
18
|
+
#ifdef M_PI
|
|
19
|
+
#define DIST_PI M_PI
|
|
20
|
+
#else
|
|
21
|
+
#define DIST_PI 3.14159265358979323846
|
|
22
|
+
#endif
|
|
23
|
+
|
|
24
|
+
// Winitzki, S. (2008).
|
|
25
|
+
// A handy approximation for the error function and its inverse.
|
|
26
|
+
// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
|
|
27
|
+
// from https://sites.google.com/site/winitzki
|
|
28
|
+
double erf(double x) {
|
|
29
|
+
double sign = x < 0 ? -1.0 : 1.0;
|
|
30
|
+
x = x < 0 ? -x : x;
|
|
31
|
+
|
|
32
|
+
double a = 0.14;
|
|
33
|
+
double x2 = x * x;
|
|
34
|
+
return sign * sqrt(1.0 - exp(-x2 * (4.0 / DIST_PI + a * x2) / (1.0 + a * x2)));
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Winitzki, S. (2008).
|
|
38
|
+
// A handy approximation for the error function and its inverse.
|
|
39
|
+
// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
|
|
40
|
+
// from https://sites.google.com/site/winitzki
|
|
41
|
+
double inverse_erf(double x) {
|
|
42
|
+
double sign = x < 0 ? -1.0 : 1.0;
|
|
43
|
+
x = x < 0 ? -x : x;
|
|
44
|
+
|
|
45
|
+
double a = 0.147;
|
|
46
|
+
double ln = log(1.0 - x * x);
|
|
47
|
+
double f1 = 2.0 / (DIST_PI * a);
|
|
48
|
+
double f2 = ln / 2.0;
|
|
49
|
+
double f3 = f1 + f2;
|
|
50
|
+
double f4 = 1.0 / a * ln;
|
|
51
|
+
return sign * sqrt(-f1 - f2 + sqrt(f3 * f3 - f4));
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
double normal_pdf(double x, double mean, double std_dev) {
|
|
55
|
+
double var = std_dev * std_dev;
|
|
56
|
+
return (1.0 / (var * sqrt(2.0 * DIST_PI))) * pow(DIST_E, -0.5 * pow((x - mean) / var, 2));
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Cumulative distribution function of the normal distribution with the given
// mean and standard deviation, evaluated at x:
//   0.5 * (1 + erf((x - mean) / (std_dev * sqrt(2))))
//
// The scale used is the standard deviation itself. Using the variance
// (std_dev * std_dev) in its place only gives the right answer when
// std_dev == 1.
double normal_cdf(double x, double mean, double std_dev) {
  return 0.5 * (1.0 + erf((x - mean) / (std_dev * sqrt(2.0))));
}
|
|
62
|
+
|
|
63
|
+
double normal_ppf(double p, double mean, double std_dev) {
|
|
64
|
+
assert(p >= 0 && p <= 1);
|
|
65
|
+
|
|
66
|
+
return mean + (std_dev * std_dev) * sqrt(2) * inverse_erf(2.0 * p - 1.0);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
double students_t_pdf(double x, unsigned int n) {
|
|
70
|
+
assert(n >= 1);
|
|
71
|
+
|
|
72
|
+
return tgamma((n + 1.0) / 2.0) / (sqrt(n * DIST_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Hill, G. W. (1970).
|
|
76
|
+
// Algorithm 395: Student's t-distribution.
|
|
77
|
+
// Communications of the ACM, 13(10), 617-619.
|
|
78
|
+
double students_t_cdf(double x, unsigned int n) {
|
|
79
|
+
assert(n >= 1);
|
|
80
|
+
|
|
81
|
+
double start = x < 0 ? 0 : 1;
|
|
82
|
+
double sign = x < 0 ? 1 : -1;
|
|
83
|
+
|
|
84
|
+
double z = 1.0;
|
|
85
|
+
double t = x * x;
|
|
86
|
+
double y = t / n;
|
|
87
|
+
double b = 1.0 + y;
|
|
88
|
+
|
|
89
|
+
if ((n >= 20 && t < n) || n > 200) {
|
|
90
|
+
// asymptotic series for large or noninteger n
|
|
91
|
+
if (y > 10e-6) {
|
|
92
|
+
y = log(b);
|
|
93
|
+
}
|
|
94
|
+
double a = n - 0.5;
|
|
95
|
+
b = 48.0 * a * a;
|
|
96
|
+
y = a * y;
|
|
97
|
+
y = (((((-0.4 * y - 3.3) * y - 24.0) * y - 85.5) / (0.8 * y * y + 100.0 + b) + y + 3.0) / b + 1.0) * sqrt(y);
|
|
98
|
+
return start + sign * normal_cdf(-y, 0.0, 1.0);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
if (n < 20 && t < 4.0) {
|
|
102
|
+
// nested summation of cosine series
|
|
103
|
+
y = sqrt(y);
|
|
104
|
+
double a = y;
|
|
105
|
+
if (n == 1) {
|
|
106
|
+
a = 0.0;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// loop
|
|
110
|
+
if (n > 1) {
|
|
111
|
+
n -= 2;
|
|
112
|
+
while (n > 1) {
|
|
113
|
+
a = (n - 1) / (b * n) * a + y;
|
|
114
|
+
n -= 2;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
|
|
118
|
+
return start + sign * (z - a) / 2;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// tail series expanation for large t-values
|
|
122
|
+
double a = sqrt(b);
|
|
123
|
+
y = a * n;
|
|
124
|
+
int j = 0;
|
|
125
|
+
while (a != z) {
|
|
126
|
+
j += 2;
|
|
127
|
+
z = a;
|
|
128
|
+
y = y * (j - 1) / (b * j);
|
|
129
|
+
a = a + y / (n + j);
|
|
130
|
+
}
|
|
131
|
+
z = 0.0;
|
|
132
|
+
y = 0.0;
|
|
133
|
+
a = -a;
|
|
134
|
+
|
|
135
|
+
// loop (without n + 2 and n - 2)
|
|
136
|
+
while (n > 1) {
|
|
137
|
+
a = (n - 1) / (b * n) * a + y;
|
|
138
|
+
n -= 2;
|
|
139
|
+
}
|
|
140
|
+
a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
|
|
141
|
+
return start + sign * (z - a) / 2;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// Hill, G. W. (1970).
|
|
145
|
+
// Algorithm 396: Student's t-quantiles.
|
|
146
|
+
// Communications of the ACM, 13(10), 619-620.
|
|
147
|
+
double students_t_ppf(double p, unsigned int n) {
|
|
148
|
+
assert(p >= 0 && p <= 1);
|
|
149
|
+
assert(n >= 1);
|
|
150
|
+
|
|
151
|
+
// distribution is symmetric
|
|
152
|
+
double sign = p < 0.5 ? -1 : 1;
|
|
153
|
+
p = p < 0.5 ? 1 - p : p;
|
|
154
|
+
|
|
155
|
+
// two-tail to one-tail
|
|
156
|
+
p = 2.0 * (1.0 - p);
|
|
157
|
+
|
|
158
|
+
if (n == 2) {
|
|
159
|
+
return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
double half_pi = DIST_PI / 2.0;
|
|
163
|
+
|
|
164
|
+
if (n == 1) {
|
|
165
|
+
p = p * half_pi;
|
|
166
|
+
return sign * cos(p) / sin(p);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
double a = 1.0 / (n - 0.5);
|
|
170
|
+
double b = 48.0 / (a * a);
|
|
171
|
+
double c = ((20700.0 * a / b - 98.0) * a - 16.0) * a + 96.36;
|
|
172
|
+
double d = ((94.5 / (b + c) - 3.0) / b + 1.0) * sqrt(a * half_pi) * n;
|
|
173
|
+
double x = d * p;
|
|
174
|
+
double y = pow(x, 2.0 / n);
|
|
175
|
+
if (y > 0.05 + a) {
|
|
176
|
+
// asymptotic inverse expansion about normal
|
|
177
|
+
x = normal_ppf(p * 0.5, 0.0, 1.0);
|
|
178
|
+
y = x * x;
|
|
179
|
+
if (n < 5) {
|
|
180
|
+
c += 0.3 * (n - 4.5) * (x + 0.6);
|
|
181
|
+
}
|
|
182
|
+
c = (((0.05 * d * x - 5.0) * x - 7.0) * x - 2.0) * x + b + c;
|
|
183
|
+
y = (((((0.4 * y + 6.3) * y + 36.0) * y + 94.5) / c - y - 3.0) / b + 1.0) * x;
|
|
184
|
+
y = a * y * y;
|
|
185
|
+
y = y > 0.002 ? exp(y) - 1.0 : 0.5 * y * y + y;
|
|
186
|
+
} else {
|
|
187
|
+
y = ((1.0 / (((n + 6.0) / (n * y) - 0.089 * d - 0.822) * (n + 2.0) * 3.0) + 0.5 / (n + 4.0)) * y - 1.0) * (n + 1.0) / (n + 2.0) + 1.0 / y;
|
|
188
|
+
}
|
|
189
|
+
return sign * sqrt(n * y);
|
|
190
|
+
}
|
|
@@ -12,7 +12,7 @@ void Init_ext() {
|
|
|
12
12
|
rb_mAnomalyDetection
|
|
13
13
|
.define_singleton_function(
|
|
14
14
|
"_detect",
|
|
15
|
-
[](std::vector<float>
|
|
15
|
+
[](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
|
|
16
16
|
Direction dir;
|
|
17
17
|
if (direction == "pos") {
|
|
18
18
|
dir = Direction::Positive;
|
|
@@ -24,10 +24,16 @@ void Init_ext() {
|
|
|
24
24
|
throw std::invalid_argument("direction must be pos, neg, or both");
|
|
25
25
|
}
|
|
26
26
|
|
|
27
|
-
auto res = anomaly_detection::
|
|
27
|
+
auto res = anomaly_detection::params()
|
|
28
|
+
.max_anoms(k)
|
|
29
|
+
.alpha(alpha)
|
|
30
|
+
.direction(dir)
|
|
31
|
+
.verbose(verbose)
|
|
32
|
+
.callback(rb_thread_check_ints)
|
|
33
|
+
.fit(series, period);
|
|
28
34
|
|
|
29
35
|
auto a = Rice::Array();
|
|
30
|
-
for (auto v : res) {
|
|
36
|
+
for (auto v : res.anomalies) {
|
|
31
37
|
a.push(v);
|
|
32
38
|
}
|
|
33
39
|
return a;
|
data/lib/anomaly_detection.rb
CHANGED
|
@@ -5,18 +5,72 @@ require "anomaly_detection/ext"
|
|
|
5
5
|
require "anomaly_detection/version"
|
|
6
6
|
|
|
7
7
|
module AnomalyDetection
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
8
|
+
class << self
|
|
9
|
+
# Detects anomalies in a series.
#
# series  - an Array of values, or a Hash whose values are the observations.
#           A Hash is ordered by key before detection.
# period: - number of observations per seasonal period; the series must hold
#           at least two periods.
# max_anoms:, alpha:, direction:, verbose: - passed straight to the native
#           _detect routine.
# plot:   - accepted but not used by this method.
#
# Returns the positions of the anomalies for Array input, or the matching
# keys for Hash input. Raises ArgumentError when the series is too short.
def detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", plot: false, verbose: false)
  if series.size < period * 2
    raise ArgumentError, "series must contain at least 2 periods"
  end

  unless series.is_a?(Hash)
    return _detect(series, period, max_anoms, alpha, direction, verbose)
  end

  # Hash input: run detection on the values in key order, then translate the
  # returned positions back into keys.
  pairs = series.sort_by { |key, _| key }
  found = _detect(pairs.map(&:last), period, max_anoms, alpha, direction, verbose)
  found.map! { |position| pairs[position][0] }
  found
end
|
|
17
23
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
24
|
+
# TODO add tooltips
|
|
25
|
+
# Builds a Vega-Lite chart of the series with the anomalies highlighted.
#
# series    - an Array of values or a Hash of key => value (keys are formatted
#             with iso8601 and plotted on a temporal axis).
# anomalies - the collection returned by detect; a point is marked when its
#             key (Hash input) or position (Array input) is included.
#
# Requires the vega gem, which is loaded on first use.
def plot(series, anomalies)
  require "vega"

  keyed = series.is_a?(Hash)

  data =
    if keyed
      series.map { |k, v| {x: iso8601(k), y: v, anomaly: anomalies.include?(k)} }
    else
      series.each_with_index.map { |v, i| {x: i, y: v, anomaly: anomalies.include?(i)} }
    end

  x =
    if keyed
      axis = {field: "x", type: "temporal"}
      # plain dates carry no time zone, so they are plotted on a UTC scale
      axis["scale"] = {type: "utc"} if series.keys.first.is_a?(Date)
      axis
    else
      {field: "x", type: "quantitative"}
    end

  # the full series as a line
  line_layer = {
    mark: {type: "line"},
    encoding: {
      x: x,
      y: {field: "y", type: "quantitative", scale: {zero: false}},
      color: {value: "#fa9088"}
    }
  }

  # anomalous points only, drawn on top
  point_layer = {
    transform: [{"filter": "datum.anomaly == true"}],
    mark: {type: "point", size: 200},
    encoding: {
      x: x,
      y: {field: "y", type: "quantitative"},
      color: {value: "#19c7ca"}
    }
  }

  Vega.lite
    .data(data)
    .layer([line_layer, point_layer])
    .config(axis: {title: nil, labelFontSize: 12})
end
|
|
65
|
+
|
|
66
|
+
private
|
|
67
|
+
|
|
68
|
+
# Formats a key for the chart's temporal axis: Date instances (including
# subclasses) become "YYYY-MM-DD"; anything else is formatted with a
# millisecond timestamp and UTC offset via strftime.
def iso8601(v)
  pattern = v.is_a?(Date) ? "%Y-%m-%d" : "%Y-%m-%dT%H:%M:%S.%L%z"
  v.strftime(pattern)
end
|
|
21
75
|
end
|
|
22
76
|
end
|