anomaly_detection 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +675 -0
- data/NOTICE.txt +15 -0
- data/README.md +89 -0
- data/ext/anomaly_detection/anomaly_detection.cpp +156 -0
- data/ext/anomaly_detection/cdflib.cpp +12126 -0
- data/ext/anomaly_detection/cdflib.hpp +123 -0
- data/ext/anomaly_detection/ext.cpp +23 -0
- data/ext/anomaly_detection/extconf.rb +5 -0
- data/ext/anomaly_detection/stl.hpp +458 -0
- data/lib/anomaly_detection/version.rb +3 -0
- data/lib/anomaly_detection.rb +22 -0
- data/licenses/LICENSE-MIT-stl-cpp.txt +21 -0
- data/licenses/LICENSE-cdflib.txt +165 -0
- data/licenses/UNLICENSE-stl-cpp.txt +24 -0
- metadata +72 -0
data/NOTICE.txt
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
Copyright (C) 2015 Twitter, Inc and other contributors
|
2
|
+
Copyright (C) 2021 Andrew Kane
|
3
|
+
|
4
|
+
This program is free software: you can redistribute it and/or modify
|
5
|
+
it under the terms of the GNU General Public License as published by
|
6
|
+
the Free Software Foundation, either version 3 of the License, or
|
7
|
+
(at your option) any later version.
|
8
|
+
|
9
|
+
This program is distributed in the hope that it will be useful,
|
10
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12
|
+
GNU General Public License for more details.
|
13
|
+
|
14
|
+
You should have received a copy of the GNU General Public License
|
15
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
data/README.md
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
# AnomalyDetection.rb
|
2
|
+
|
3
|
+
:fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
|
4
|
+
|
5
|
+
Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
|
6
|
+
|
7
|
+
[![Build Status](https://github.com/ankane/AnomalyDetection.rb/workflows/build/badge.svg?branch=master)](https://github.com/ankane/AnomalyDetection.rb/actions)
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application’s Gemfile:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
gem 'anomaly_detection'
|
15
|
+
```
|
16
|
+
|
17
|
+
## Getting Started
|
18
|
+
|
19
|
+
Detect anomalies in a time series
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
series = {
|
23
|
+
Date.parse("2020-01-01") => 100,
|
24
|
+
Date.parse("2020-01-02") => 150,
|
25
|
+
Date.parse("2020-01-03") => 136,
|
26
|
+
# ...
|
27
|
+
}
|
28
|
+
|
29
|
+
AnomalyDetection.detect(series, period: 7)
|
30
|
+
```
|
31
|
+
|
32
|
+
Works great with [Groupdate](https://github.com/ankane/groupdate)
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
series = User.group_by_day(:created_at).count
|
36
|
+
AnomalyDetection.detect(series, period: 7)
|
37
|
+
```
|
38
|
+
|
39
|
+
Series can also be an array without times (anomalies are then returned as array indexes)
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
series = [100, 150, 136, ...]
|
43
|
+
AnomalyDetection.detect(series, period: 7)
|
44
|
+
```
|
45
|
+
|
46
|
+
## Options
|
47
|
+
|
48
|
+
Pass options
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
AnomalyDetection.detect(
|
52
|
+
series,
|
53
|
+
period: 7, # number of observations in a single period
|
54
|
+
alpha: 0.05, # level of statistical significance
|
55
|
+
max_anoms: 0.1, # maximum number of anomalies as a fraction of the data (0.1 = 10%)
|
56
|
+
direction: "both" # pos, neg, or both
|
57
|
+
)
|
58
|
+
```
|
59
|
+
|
60
|
+
## Credits
|
61
|
+
|
62
|
+
This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [cdflib](https://people.sc.fsu.edu/~jburkardt/cpp_src/cdflib/cdflib.html) for the quantile function.
|
63
|
+
|
64
|
+
## References
|
65
|
+
|
66
|
+
- [Automatic Anomaly Detection in the Cloud Via Statistical Learning](https://arxiv.org/abs/1704.07706)
|
67
|
+
|
68
|
+
## History
|
69
|
+
|
70
|
+
View the [changelog](https://github.com/ankane/AnomalyDetection.rb/blob/master/CHANGELOG.md)
|
71
|
+
|
72
|
+
## Contributing
|
73
|
+
|
74
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
75
|
+
|
76
|
+
- [Report bugs](https://github.com/ankane/AnomalyDetection.rb/issues)
|
77
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/AnomalyDetection.rb/pulls)
|
78
|
+
- Write, clarify, or fix documentation
|
79
|
+
- Suggest or add new features
|
80
|
+
|
81
|
+
To get started with development:
|
82
|
+
|
83
|
+
```sh
|
84
|
+
git clone https://github.com/ankane/AnomalyDetection.rb.git
|
85
|
+
cd AnomalyDetection.rb
|
86
|
+
bundle install
|
87
|
+
bundle exec rake compile
|
88
|
+
bundle exec rake test
|
89
|
+
```
|
@@ -0,0 +1,156 @@
|
|
1
|
+
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>

#include "cdflib.hpp"
#include "stl.hpp"
|
8
|
+
|
9
|
+
/// Returns the median of `data`.
///
/// The input is copied and sorted, so the caller's vector is left untouched.
/// For an even number of elements the two middle values are averaged.
///
/// @param data  values to take the median of; must not be empty
/// @return the median value
/// @throws std::invalid_argument if `data` is empty
float median(const std::vector<float>& data) {
    // Without this guard `size() - 1` would wrap around and index an empty vector.
    if (data.empty()) {
        throw std::invalid_argument("Cannot take the median of empty data");
    }

    std::vector<float> sorted(data);
    std::sort(sorted.begin(), sorted.end());

    // For odd sizes both indexes point at the same (middle) element.
    const auto lower = (sorted.size() - 1) / 2;
    const auto upper = sorted.size() / 2;
    return (sorted[lower] + sorted[upper]) / 2.0f;
}
|
14
|
+
|
15
|
+
float mad(const std::vector<float>& data) {
|
16
|
+
auto med = median(data);
|
17
|
+
std::vector<float> res;
|
18
|
+
res.reserve(data.size());
|
19
|
+
for (auto v : data) {
|
20
|
+
res.push_back(fabs(v - med));
|
21
|
+
}
|
22
|
+
return 1.4826 * median(res);
|
23
|
+
}
|
24
|
+
|
25
|
+
/// Calls cdflib's `cdft` with `which = 2` and returns the value it writes to `t`.
///
/// NOTE(review): per cdflib's documentation, `which = 2` solves for the Student-t
/// quantile given p, q = 1 - p and the degrees of freedom - confirm against cdflib.hpp.
///
/// @param p   cumulative probability
/// @param df  degrees of freedom
/// @return the computed t value (narrowed from double to float)
/// @throws std::invalid_argument if cdft reports a non-zero status
float qt(double p, double df) {
    // cdft takes every argument by pointer, so each input needs an addressable local.
    int mode = 2;
    double complement = 1 - p;
    double quantile;
    double limit;
    int rc;
    cdft(&mode, &p, &complement, &quantile, &df, &rc, &limit);

    if (rc == 0) {
        return quantile;
    }
    throw std::invalid_argument("Bad status");
}
|
38
|
+
|
39
|
+
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
|
40
|
+
auto num_obs = data.size();
|
41
|
+
|
42
|
+
// Check to make sure we have at least two periods worth of data for anomaly context
|
43
|
+
assert(num_obs >= num_obs_per_period * 2);
|
44
|
+
|
45
|
+
// Handle NANs
|
46
|
+
auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
|
47
|
+
if (nan > 0) {
|
48
|
+
throw std::invalid_argument("Data contains NANs");
|
49
|
+
}
|
50
|
+
|
51
|
+
// Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
|
52
|
+
auto seasonal_length = data.size() * 10 + 1;
|
53
|
+
auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
|
54
|
+
|
55
|
+
auto seasonal = data_decomp.seasonal;
|
56
|
+
auto med = median(data);
|
57
|
+
std::vector<float> data2;
|
58
|
+
data2.reserve(data.size());
|
59
|
+
for (auto i = 0; i < data.size(); i++) {
|
60
|
+
data2.push_back(data[i] - seasonal[i] - med);
|
61
|
+
}
|
62
|
+
|
63
|
+
auto max_outliers = (size_t) num_obs * k;
|
64
|
+
assert(max_outliers > 0);
|
65
|
+
|
66
|
+
auto n = data2.size();
|
67
|
+
|
68
|
+
std::vector<size_t> r_idx;
|
69
|
+
|
70
|
+
std::vector<size_t> indexes;
|
71
|
+
indexes.reserve(data2.size());
|
72
|
+
for (auto i = 0; i < data2.size(); i++) {
|
73
|
+
indexes.push_back(i);
|
74
|
+
}
|
75
|
+
|
76
|
+
// Compute test statistic until r=max_outliers values have been removed from the sample
|
77
|
+
for (auto i = 1; i <= max_outliers; i++) {
|
78
|
+
// TODO Improve performance between loop iterations
|
79
|
+
auto ma = median(data2);
|
80
|
+
std::vector<float> ares;
|
81
|
+
ares.reserve(data2.size());
|
82
|
+
if (one_tail) {
|
83
|
+
if (upper_tail) {
|
84
|
+
for (auto v : data2) {
|
85
|
+
ares.push_back(v - ma);
|
86
|
+
}
|
87
|
+
} else {
|
88
|
+
for (auto v : data2) {
|
89
|
+
ares.push_back(ma - v);
|
90
|
+
}
|
91
|
+
}
|
92
|
+
} else {
|
93
|
+
for (auto v : data2) {
|
94
|
+
ares.push_back(fabs(v - ma));
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
// Protect against constant time series
|
99
|
+
auto data_sigma = mad(data2);
|
100
|
+
if (data_sigma == 0.0) {
|
101
|
+
break;
|
102
|
+
}
|
103
|
+
|
104
|
+
auto iter = std::max_element(ares.begin(), ares.end());
|
105
|
+
auto r_idx_i = std::distance(ares.begin(), iter);
|
106
|
+
auto r_idx_i2 = indexes[r_idx_i];
|
107
|
+
|
108
|
+
// Only need to take sigma of r for performance
|
109
|
+
auto r = ares[r_idx_i] / data_sigma;
|
110
|
+
|
111
|
+
// TODO Swap to last position and delete
|
112
|
+
data2.erase(data2.begin() + r_idx_i);
|
113
|
+
indexes.erase(indexes.begin() + r_idx_i);
|
114
|
+
|
115
|
+
// Compute critical value
|
116
|
+
float p;
|
117
|
+
if (one_tail) {
|
118
|
+
p = 1.0 - alpha / (n - i + 1);
|
119
|
+
} else {
|
120
|
+
p = 1.0 - alpha / (2.0 * (n - i + 1));
|
121
|
+
}
|
122
|
+
|
123
|
+
auto t = qt(p, n - i - 1);
|
124
|
+
auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
|
125
|
+
|
126
|
+
if (r > lam) {
|
127
|
+
r_idx.push_back(r_idx_i2);
|
128
|
+
} else {
|
129
|
+
break;
|
130
|
+
}
|
131
|
+
}
|
132
|
+
|
133
|
+
// Sort like R version
|
134
|
+
std::sort(r_idx.begin(), r_idx.end());
|
135
|
+
|
136
|
+
return r_idx;
|
137
|
+
}
|
138
|
+
|
139
|
+
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction) {
|
140
|
+
bool one_tail;
|
141
|
+
bool upper_tail;
|
142
|
+
if (direction == "pos") {
|
143
|
+
one_tail = true;
|
144
|
+
upper_tail = true;
|
145
|
+
} else if (direction == "neg") {
|
146
|
+
one_tail = true;
|
147
|
+
upper_tail = false;
|
148
|
+
} else if (direction == "both") {
|
149
|
+
one_tail = false;
|
150
|
+
upper_tail = true; // not used
|
151
|
+
} else {
|
152
|
+
throw std::invalid_argument("Bad direction");
|
153
|
+
}
|
154
|
+
|
155
|
+
return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
|
156
|
+
}
|