anomaly_detection 0.1.1 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +16 -2
- data/ext/anomaly_detection/anomaly_detection.hpp +193 -2
- data/ext/anomaly_detection/dist.h +190 -0
- data/ext/anomaly_detection/ext.cpp +9 -3
- data/lib/anomaly_detection/version.rb +1 -1
- data/lib/anomaly_detection.rb +65 -11
- data/licenses/LICENSE-AnomalyDetection-cpp.txt +675 -0
- data/licenses/LICENSE-MIT-dist-h.txt +21 -0
- data/licenses/UNLICENSE-dist-h.txt +24 -0
- metadata +7 -7
- data/ext/anomaly_detection/anomaly_detection.cpp +0 -153
- data/ext/anomaly_detection/cdflib.cpp +0 -12126
- data/ext/anomaly_detection/cdflib.hpp +0 -123
- data/licenses/LICENSE-cdflib.txt +0 -165
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c60bb6d75cb8523ecd0926f391d79413a1cb2eb131cd579fd381bb6683f82da3
|
4
|
+
data.tar.gz: '01594d0f0a97ad8cbb7b0b50cb30894bd0d773d4db45b3158345567ce1732efb'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fe09cc140a5d6543f3b00983a754861f6a3a3a436f8a8afecc80d202f1112bb6ea180df794072ee4711c044508a559a015c885cfd61d2c5be9378fc7b6590d96
|
7
|
+
data.tar.gz: 5616e6075888b4521355e6c0fb33f7a94361c971c7f58f9ae6a61e5d8529a3e1938deba10be22c24ec5849f1909f48cba07b97fd1e6ea8b47ae4f66626eb703e
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
|
|
11
11
|
Add this line to your application’s Gemfile:
|
12
12
|
|
13
13
|
```ruby
|
14
|
-
gem
|
14
|
+
gem "anomaly_detection"
|
15
15
|
```
|
16
16
|
|
17
17
|
## Getting Started
|
@@ -58,9 +58,23 @@ AnomalyDetection.detect(
|
|
58
58
|
)
|
59
59
|
```
|
60
60
|
|
61
|
+
## Plotting
|
62
|
+
|
63
|
+
Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
gem "vega"
|
67
|
+
```
|
68
|
+
|
69
|
+
And use:
|
70
|
+
|
71
|
+
```ruby
|
72
|
+
AnomalyDetection.plot(series, anomalies)
|
73
|
+
```
|
74
|
+
|
61
75
|
## Credits
|
62
76
|
|
63
|
-
This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [
|
77
|
+
This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [stl-cpp](https://github.com/ankane/stl-cpp) for seasonal-trend decomposition and [dist.h](https://github.com/ankane/dist.h) for the quantile function.
|
64
78
|
|
65
79
|
## References
|
66
80
|
|
@@ -1,12 +1,203 @@
|
|
1
|
+
/*!
|
2
|
+
* AnomalyDetection.cpp v0.1.0
|
3
|
+
* https://github.com/ankane/AnomalyDetection.cpp
|
4
|
+
* GPL-3.0-or-later License
|
5
|
+
*/
|
6
|
+
|
1
7
|
#pragma once
|
2
8
|
|
3
|
-
#include <
|
9
|
+
#include <functional>
|
10
|
+
#include <iostream>
|
11
|
+
#include <iterator>
|
12
|
+
#include <numeric>
|
4
13
|
#include <vector>
|
5
14
|
|
15
|
+
#include "dist.h"
|
16
|
+
#include "stl.hpp"
|
17
|
+
|
6
18
|
namespace anomaly_detection {
|
7
19
|
|
8
20
|
// Which kind of deviation to report: Positive flags unusually high values, Negative unusually low ones, Both either.
enum Direction { Positive, Negative, Both };
|
9
21
|
|
10
|
-
|
22
|
+
// Returns the median of a sample that is already sorted in ascending order.
//
// With an even number of elements the two middle values are averaged.
// `sorted` must not be empty: the index arithmetic wraps around for size 0.
//
// Marked inline because the body lives in a header; without it, including this
// header from more than one translation unit yields duplicate definitions.
inline float median_sorted(const std::vector<float>& sorted) {
    const auto count = sorted.size();
    return (sorted[(count - 1) / 2] + sorted[count / 2]) / 2.0;
}
|
25
|
+
|
26
|
+
float median(const std::vector<float>& data) {
|
27
|
+
std::vector<float> sorted(data);
|
28
|
+
std::sort(sorted.begin(), sorted.end());
|
29
|
+
return median_sorted(sorted);
|
30
|
+
}
|
31
|
+
|
32
|
+
float mad(const std::vector<float>& data, float med) {
|
33
|
+
std::vector<float> res;
|
34
|
+
res.reserve(data.size());
|
35
|
+
for (auto v : data) {
|
36
|
+
res.push_back(fabs(v - med));
|
37
|
+
}
|
38
|
+
std::sort(res.begin(), res.end());
|
39
|
+
return 1.4826 * median_sorted(res);
|
40
|
+
}
|
41
|
+
|
42
|
+
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
|
43
|
+
auto n = data.size();
|
44
|
+
|
45
|
+
// Check to make sure we have at least two periods worth of data for anomaly context
|
46
|
+
if (n < num_obs_per_period * 2) {
|
47
|
+
throw std::invalid_argument("series must contain at least 2 periods");
|
48
|
+
}
|
49
|
+
|
50
|
+
// Handle NANs
|
51
|
+
auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
|
52
|
+
if (nan > 0) {
|
53
|
+
throw std::invalid_argument("series contains NANs");
|
54
|
+
}
|
55
|
+
|
56
|
+
// Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
|
57
|
+
auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
|
58
|
+
auto seasonal = data_decomp.seasonal;
|
59
|
+
|
60
|
+
std::vector<float> data2;
|
61
|
+
data2.reserve(n);
|
62
|
+
auto med = median(data);
|
63
|
+
for (auto i = 0; i < n; i++) {
|
64
|
+
data2.push_back(data[i] - seasonal[i] - med);
|
65
|
+
}
|
66
|
+
|
67
|
+
auto num_anoms = 0;
|
68
|
+
auto max_outliers = (size_t) n * k;
|
69
|
+
std::vector<size_t> anomalies;
|
70
|
+
anomalies.reserve(max_outliers);
|
71
|
+
|
72
|
+
// Sort data for fast median
|
73
|
+
// Use stable sort for indexes for deterministic results
|
74
|
+
std::vector<size_t> indexes(n);
|
75
|
+
std::iota(indexes.begin(), indexes.end(), 0);
|
76
|
+
std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
|
77
|
+
std::sort(data2.begin(), data2.end());
|
78
|
+
|
79
|
+
// Compute test statistic until r=max_outliers values have been removed from the sample
|
80
|
+
for (auto i = 1; i <= max_outliers; i++) {
|
81
|
+
if (verbose) {
|
82
|
+
std::cout << i << " / " << max_outliers << " completed" << std::endl;
|
83
|
+
}
|
84
|
+
|
85
|
+
// TODO Improve performance between loop iterations
|
86
|
+
auto ma = median_sorted(data2);
|
87
|
+
std::vector<float> ares;
|
88
|
+
ares.reserve(data2.size());
|
89
|
+
if (one_tail) {
|
90
|
+
if (upper_tail) {
|
91
|
+
for (auto v : data2) {
|
92
|
+
ares.push_back(v - ma);
|
93
|
+
}
|
94
|
+
} else {
|
95
|
+
for (auto v : data2) {
|
96
|
+
ares.push_back(ma - v);
|
97
|
+
}
|
98
|
+
}
|
99
|
+
} else {
|
100
|
+
for (auto v : data2) {
|
101
|
+
ares.push_back(fabs(v - ma));
|
102
|
+
}
|
103
|
+
}
|
104
|
+
|
105
|
+
// Protect against constant time series
|
106
|
+
auto data_sigma = mad(data2, ma);
|
107
|
+
if (data_sigma == 0.0) {
|
108
|
+
break;
|
109
|
+
}
|
110
|
+
|
111
|
+
auto iter = std::max_element(ares.begin(), ares.end());
|
112
|
+
auto r_idx_i = std::distance(ares.begin(), iter);
|
113
|
+
|
114
|
+
// Only need to take sigma of r for performance
|
115
|
+
auto r = ares[r_idx_i] / data_sigma;
|
116
|
+
|
117
|
+
anomalies.push_back(indexes[r_idx_i]);
|
118
|
+
data2.erase(data2.begin() + r_idx_i);
|
119
|
+
indexes.erase(indexes.begin() + r_idx_i);
|
120
|
+
|
121
|
+
// Compute critical value
|
122
|
+
float p;
|
123
|
+
if (one_tail) {
|
124
|
+
p = 1.0 - alpha / (n - i + 1);
|
125
|
+
} else {
|
126
|
+
p = 1.0 - alpha / (2.0 * (n - i + 1));
|
127
|
+
}
|
128
|
+
|
129
|
+
auto t = students_t_ppf(p, n - i - 1);
|
130
|
+
auto lam = t * (n - i) / sqrt(((n - i - 1) + t * t) * (n - i + 1));
|
131
|
+
|
132
|
+
if (r > lam) {
|
133
|
+
num_anoms = i;
|
134
|
+
}
|
135
|
+
|
136
|
+
if (callback != nullptr) {
|
137
|
+
callback();
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
anomalies.resize(num_anoms);
|
142
|
+
|
143
|
+
// Sort like R version
|
144
|
+
std::sort(anomalies.begin(), anomalies.end());
|
145
|
+
|
146
|
+
return anomalies;
|
147
|
+
}
|
148
|
+
|
149
|
+
// Outcome of a detection run.
class AnomalyDetectionResult {
public:
    // Positions of the anomalous observations within the series passed to fit().
    std::vector<size_t> anomalies;
};
|
153
|
+
|
154
|
+
class AnomalyDetectionParams {
|
155
|
+
float alpha_ = 0.05;
|
156
|
+
float max_anoms_ = 0.1;
|
157
|
+
Direction direction_ = Direction::Both;
|
158
|
+
bool verbose_ = false;
|
159
|
+
std::function<void()> callback_ = nullptr;
|
160
|
+
|
161
|
+
public:
|
162
|
+
inline AnomalyDetectionParams alpha(float alpha) {
|
163
|
+
this->alpha_ = alpha;
|
164
|
+
return *this;
|
165
|
+
};
|
166
|
+
|
167
|
+
inline AnomalyDetectionParams max_anoms(float max_anoms) {
|
168
|
+
this->max_anoms_ = max_anoms;
|
169
|
+
return *this;
|
170
|
+
};
|
171
|
+
|
172
|
+
inline AnomalyDetectionParams direction(Direction direction) {
|
173
|
+
this->direction_ = direction;
|
174
|
+
return *this;
|
175
|
+
};
|
176
|
+
|
177
|
+
inline AnomalyDetectionParams verbose(bool verbose) {
|
178
|
+
this->verbose_ = verbose;
|
179
|
+
return *this;
|
180
|
+
};
|
181
|
+
|
182
|
+
inline AnomalyDetectionParams callback(std::function<void()> callback) {
|
183
|
+
this->callback_ = callback;
|
184
|
+
return *this;
|
185
|
+
};
|
186
|
+
|
187
|
+
AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
|
188
|
+
};
|
189
|
+
|
190
|
+
// Entry point of the chainable API: returns a parameter object holding the defaults.
//
// Marked inline because the body lives in a header that may be included by several translation units.
inline AnomalyDetectionParams params() {
    return AnomalyDetectionParams{};
}
|
193
|
+
|
194
|
+
// Runs anomaly detection on `series` using the stored settings.
//
// series - observations in time order
// period - number of observations in one seasonal period
//
// Returns a result whose `anomalies` holds the positions reported by detect_anoms.
// Exceptions thrown by detect_anoms propagate unchanged.
//
// Marked inline because this out-of-class member definition lives in a header
// that may be included by several translation units.
inline AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
    // Translate the direction into the two flags detect_anoms expects
    const bool one_tail = direction_ != Direction::Both;
    const bool upper_tail = direction_ == Direction::Positive;

    AnomalyDetectionResult res;
    // detect_anoms takes the period as int, so the conversion is spelled out here
    res.anomalies = detect_anoms(series, static_cast<int>(period), max_anoms_, alpha_, one_tail, upper_tail, verbose_, callback_);
    return res;
}
|
11
202
|
|
12
203
|
}
|
@@ -0,0 +1,190 @@
|
|
1
|
+
/*!
|
2
|
+
* dist.h v0.1.1
|
3
|
+
* https://github.com/ankane/dist.h
|
4
|
+
* Unlicense OR MIT License
|
5
|
+
*/
|
6
|
+
|
7
|
+
#pragma once
|
8
|
+
|
9
|
+
#include <assert.h>
|
10
|
+
#include <math.h>
|
11
|
+
|
12
|
+
#ifdef M_E
|
13
|
+
#define DIST_E M_E
|
14
|
+
#else
|
15
|
+
#define DIST_E 2.71828182845904523536
|
16
|
+
#endif
|
17
|
+
|
18
|
+
#ifdef M_PI
|
19
|
+
#define DIST_PI M_PI
|
20
|
+
#else
|
21
|
+
#define DIST_PI 3.14159265358979323846
|
22
|
+
#endif
|
23
|
+
|
24
|
+
// Winitzki, S. (2008).
|
25
|
+
// A handy approximation for the error function and its inverse.
|
26
|
+
// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
|
27
|
+
// from https://sites.google.com/site/winitzki
|
28
|
+
// Closed-form approximation of the error function (odd in x), using the constant a = 0.14.
double erf(double x) {
    const double a = 0.14;

    // Work on |x| and restore the sign at the end
    const double magnitude = x < 0 ? -x : x;
    const double sq = magnitude * magnitude;
    const double approx = sqrt(1.0 - exp(-sq * (4.0 / DIST_PI + a * sq) / (1.0 + a * sq)));

    return x < 0 ? -approx : approx;
}
|
36
|
+
|
37
|
+
// Winitzki, S. (2008).
|
38
|
+
// A handy approximation for the error function and its inverse.
|
39
|
+
// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
|
40
|
+
// from https://sites.google.com/site/winitzki
|
41
|
+
// Closed-form approximation of the inverse error function (odd in x), using the constant a = 0.147.
// The logarithm term is only finite for |x| < 1.
double inverse_erf(double x) {
    const double a = 0.147;

    // Work on |x| and restore the sign at the end
    const double magnitude = x < 0 ? -x : x;

    const double ln = log(1.0 - magnitude * magnitude);
    const double f1 = 2.0 / (DIST_PI * a);
    const double f2 = ln / 2.0;
    const double f3 = f1 + f2;
    const double f4 = 1.0 / a * ln;
    const double approx = sqrt(-f1 - f2 + sqrt(f3 * f3 - f4));

    return x < 0 ? -approx : approx;
}
|
53
|
+
|
54
|
+
// Probability density of the normal distribution at x.
//
// x       - point at which to evaluate the density
// mean    - mean of the distribution
// std_dev - standard deviation of the distribution (expected to be positive)
//
// Both the normalising factor and the exponent are scaled by the standard
// deviation; scaling by the variance is only correct when std_dev == 1.
double normal_pdf(double x, double mean, double std_dev) {
    double z = (x - mean) / std_dev;
    return exp(-0.5 * z * z) / (std_dev * sqrt(2.0 * DIST_PI));
}
|
58
|
+
|
59
|
+
// Cumulative distribution function of the normal distribution at x.
//
// x       - point at which to evaluate the CDF
// mean    - mean of the distribution
// std_dev - standard deviation of the distribution (expected to be positive)
//
// The argument of erf is standardised with the standard deviation; using the
// variance is only correct when std_dev == 1.
double normal_cdf(double x, double mean, double std_dev) {
    return 0.5 * (1.0 + erf((x - mean) / (std_dev * sqrt(2.0))));
}
|
62
|
+
|
63
|
+
double normal_ppf(double p, double mean, double std_dev) {
|
64
|
+
assert(p >= 0 && p <= 1);
|
65
|
+
|
66
|
+
return mean + (std_dev * std_dev) * sqrt(2) * inverse_erf(2.0 * p - 1.0);
|
67
|
+
}
|
68
|
+
|
69
|
+
// Probability density of Student's t distribution at x.
//
// x - point at which to evaluate the density
// n - degrees of freedom, at least 1 (checked with assert)
//
// The ratio of gamma functions is formed from their logarithms: tgamma itself
// overflows a double once its argument exceeds roughly 171, which would turn
// the density into inf or NaN for a few hundred degrees of freedom.
double students_t_pdf(double x, unsigned int n) {
    assert(n >= 1);

    double log_gamma_ratio = lgamma((n + 1.0) / 2.0) - lgamma(n / 2.0);
    return exp(log_gamma_ratio) / sqrt(n * DIST_PI) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
}
|
74
|
+
|
75
|
+
// Hill, G. W. (1970).
|
76
|
+
// Algorithm 395: Student's t-distribution.
|
77
|
+
// Communications of the ACM, 13(10), 617-619.
|
78
|
+
double students_t_cdf(double x, unsigned int n) {
|
79
|
+
assert(n >= 1);
|
80
|
+
|
81
|
+
double start = x < 0 ? 0 : 1;
|
82
|
+
double sign = x < 0 ? 1 : -1;
|
83
|
+
|
84
|
+
double z = 1.0;
|
85
|
+
double t = x * x;
|
86
|
+
double y = t / n;
|
87
|
+
double b = 1.0 + y;
|
88
|
+
|
89
|
+
if ((n >= 20 && t < n) || n > 200) {
|
90
|
+
// asymptotic series for large or noninteger n
|
91
|
+
if (y > 10e-6) {
|
92
|
+
y = log(b);
|
93
|
+
}
|
94
|
+
double a = n - 0.5;
|
95
|
+
b = 48.0 * a * a;
|
96
|
+
y = a * y;
|
97
|
+
y = (((((-0.4 * y - 3.3) * y - 24.0) * y - 85.5) / (0.8 * y * y + 100.0 + b) + y + 3.0) / b + 1.0) * sqrt(y);
|
98
|
+
return start + sign * normal_cdf(-y, 0.0, 1.0);
|
99
|
+
}
|
100
|
+
|
101
|
+
if (n < 20 && t < 4.0) {
|
102
|
+
// nested summation of cosine series
|
103
|
+
y = sqrt(y);
|
104
|
+
double a = y;
|
105
|
+
if (n == 1) {
|
106
|
+
a = 0.0;
|
107
|
+
}
|
108
|
+
|
109
|
+
// loop
|
110
|
+
if (n > 1) {
|
111
|
+
n -= 2;
|
112
|
+
while (n > 1) {
|
113
|
+
a = (n - 1) / (b * n) * a + y;
|
114
|
+
n -= 2;
|
115
|
+
}
|
116
|
+
}
|
117
|
+
a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
|
118
|
+
return start + sign * (z - a) / 2;
|
119
|
+
}
|
120
|
+
|
121
|
+
// tail series expanation for large t-values
|
122
|
+
double a = sqrt(b);
|
123
|
+
y = a * n;
|
124
|
+
int j = 0;
|
125
|
+
while (a != z) {
|
126
|
+
j += 2;
|
127
|
+
z = a;
|
128
|
+
y = y * (j - 1) / (b * j);
|
129
|
+
a = a + y / (n + j);
|
130
|
+
}
|
131
|
+
z = 0.0;
|
132
|
+
y = 0.0;
|
133
|
+
a = -a;
|
134
|
+
|
135
|
+
// loop (without n + 2 and n - 2)
|
136
|
+
while (n > 1) {
|
137
|
+
a = (n - 1) / (b * n) * a + y;
|
138
|
+
n -= 2;
|
139
|
+
}
|
140
|
+
a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
|
141
|
+
return start + sign * (z - a) / 2;
|
142
|
+
}
|
143
|
+
|
144
|
+
// Hill, G. W. (1970).
|
145
|
+
// Algorithm 396: Student's t-quantiles.
|
146
|
+
// Communications of the ACM, 13(10), 619-620.
|
147
|
+
double students_t_ppf(double p, unsigned int n) {
|
148
|
+
assert(p >= 0 && p <= 1);
|
149
|
+
assert(n >= 1);
|
150
|
+
|
151
|
+
// distribution is symmetric
|
152
|
+
double sign = p < 0.5 ? -1 : 1;
|
153
|
+
p = p < 0.5 ? 1 - p : p;
|
154
|
+
|
155
|
+
// two-tail to one-tail
|
156
|
+
p = 2.0 * (1.0 - p);
|
157
|
+
|
158
|
+
if (n == 2) {
|
159
|
+
return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
|
160
|
+
}
|
161
|
+
|
162
|
+
double half_pi = DIST_PI / 2.0;
|
163
|
+
|
164
|
+
if (n == 1) {
|
165
|
+
p = p * half_pi;
|
166
|
+
return sign * cos(p) / sin(p);
|
167
|
+
}
|
168
|
+
|
169
|
+
double a = 1.0 / (n - 0.5);
|
170
|
+
double b = 48.0 / (a * a);
|
171
|
+
double c = ((20700.0 * a / b - 98.0) * a - 16.0) * a + 96.36;
|
172
|
+
double d = ((94.5 / (b + c) - 3.0) / b + 1.0) * sqrt(a * half_pi) * n;
|
173
|
+
double x = d * p;
|
174
|
+
double y = pow(x, 2.0 / n);
|
175
|
+
if (y > 0.05 + a) {
|
176
|
+
// asymptotic inverse expansion about normal
|
177
|
+
x = normal_ppf(p * 0.5, 0.0, 1.0);
|
178
|
+
y = x * x;
|
179
|
+
if (n < 5) {
|
180
|
+
c += 0.3 * (n - 4.5) * (x + 0.6);
|
181
|
+
}
|
182
|
+
c = (((0.05 * d * x - 5.0) * x - 7.0) * x - 2.0) * x + b + c;
|
183
|
+
y = (((((0.4 * y + 6.3) * y + 36.0) * y + 94.5) / c - y - 3.0) / b + 1.0) * x;
|
184
|
+
y = a * y * y;
|
185
|
+
y = y > 0.002 ? exp(y) - 1.0 : 0.5 * y * y + y;
|
186
|
+
} else {
|
187
|
+
y = ((1.0 / (((n + 6.0) / (n * y) - 0.089 * d - 0.822) * (n + 2.0) * 3.0) + 0.5 / (n + 4.0)) * y - 1.0) * (n + 1.0) / (n + 2.0) + 1.0 / y;
|
188
|
+
}
|
189
|
+
return sign * sqrt(n * y);
|
190
|
+
}
|
@@ -12,7 +12,7 @@ void Init_ext() {
|
|
12
12
|
rb_mAnomalyDetection
|
13
13
|
.define_singleton_function(
|
14
14
|
"_detect",
|
15
|
-
[](std::vector<float>
|
15
|
+
[](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
|
16
16
|
Direction dir;
|
17
17
|
if (direction == "pos") {
|
18
18
|
dir = Direction::Positive;
|
@@ -24,10 +24,16 @@ void Init_ext() {
|
|
24
24
|
throw std::invalid_argument("direction must be pos, neg, or both");
|
25
25
|
}
|
26
26
|
|
27
|
-
auto res = anomaly_detection::
|
27
|
+
auto res = anomaly_detection::params()
|
28
|
+
.max_anoms(k)
|
29
|
+
.alpha(alpha)
|
30
|
+
.direction(dir)
|
31
|
+
.verbose(verbose)
|
32
|
+
.callback(rb_thread_check_ints)
|
33
|
+
.fit(series, period);
|
28
34
|
|
29
35
|
auto a = Rice::Array();
|
30
|
-
for (auto v : res) {
|
36
|
+
for (auto v : res.anomalies) {
|
31
37
|
a.push(v);
|
32
38
|
}
|
33
39
|
return a;
|
data/lib/anomaly_detection.rb
CHANGED
@@ -5,18 +5,72 @@ require "anomaly_detection/ext"
|
|
5
5
|
require "anomaly_detection/version"
|
6
6
|
|
7
7
|
module AnomalyDetection
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
8
|
+
class << self
|
9
|
+
# Detects anomalies in a series.
#
# series    - an Array of values, or a Hash mapping sortable keys to values
# period    - number of observations per seasonal period
# max_anoms - passed through to the native detector
# alpha     - passed through to the native detector
# direction - passed through to the native detector
# plot      - accepted but not used by this method
# verbose   - passed through to the native detector
#
# Returns the positions of the anomalies for an Array, or the matching keys for a Hash.
# Raises ArgumentError when the series holds fewer than two periods.
def detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", plot: false, verbose: false)
  if series.size < period * 2
    raise ArgumentError, "series must contain at least 2 periods"
  end

  unless series.is_a?(Hash)
    return _detect(series, period, max_anoms, alpha, direction, verbose)
  end

  # Order the entries by key so positions from the detector can be mapped back to keys
  pairs = series.sort_by { |key, _| key }
  positions = _detect(pairs.map(&:last), period, max_anoms, alpha, direction, verbose)
  positions.map! { |pos| pairs[pos][0] }
end
|
17
23
|
|
18
|
-
|
19
|
-
|
20
|
-
|
24
|
+
# TODO add tooltips
|
25
|
+
# Builds a Vega-Lite chart of the series with the anomalies highlighted.
#
# series    - an Array of values, or a Hash mapping Date/Time-like keys to values
# anomalies - positions (Array series) or keys (Hash series) to mark as anomalous
#
# Loads the vega gem on first use and returns the chart object built by Vega.lite.
def plot(series, anomalies)
  require "vega"

  keyed = series.is_a?(Hash)

  # One row per observation, flagged when it is listed in anomalies
  data =
    if keyed
      series.map { |k, v| {x: iso8601(k), y: v, anomaly: anomalies.include?(k)} }
    else
      series.each_with_index.map { |v, i| {x: i, y: v, anomaly: anomalies.include?(i)} }
    end

  # Hash series get a time axis, arrays a plain numeric one
  x_encoding =
    if keyed
      temporal = {field: "x", type: "temporal"}
      temporal["scale"] = {type: "utc"} if series.keys.first.is_a?(Date)
      temporal
    else
      {field: "x", type: "quantitative"}
    end

  line_layer = {
    mark: {type: "line"},
    encoding: {
      x: x_encoding,
      y: {field: "y", type: "quantitative", scale: {zero: false}},
      color: {value: "#fa9088"}
    }
  }

  # Points are drawn only for the rows flagged as anomalies
  point_layer = {
    transform: [{"filter": "datum.anomaly == true"}],
    mark: {type: "point", size: 200},
    encoding: {
      x: x_encoding,
      y: {field: "y", type: "quantitative"},
      color: {value: "#19c7ca"}
    }
  }

  Vega.lite
    .data(data)
    .layer([line_layer, point_layer])
    .config(axis: {title: nil, labelFontSize: 12})
end
|
65
|
+
|
66
|
+
private
|
67
|
+
|
68
|
+
# Formats a key for the chart: Date values as a plain date, anything else
# responding to strftime as a timestamp with milliseconds and UTC offset.
def iso8601(v)
  pattern = v.is_a?(Date) ? "%Y-%m-%d" : "%Y-%m-%dT%H:%M:%S.%L%z"
  v.strftime(pattern)
end
|
21
75
|
end
|
22
76
|
end
|