anomaly_detection 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +675 -0
- data/NOTICE.txt +15 -0
- data/README.md +89 -0
- data/ext/anomaly_detection/anomaly_detection.cpp +156 -0
- data/ext/anomaly_detection/cdflib.cpp +12126 -0
- data/ext/anomaly_detection/cdflib.hpp +123 -0
- data/ext/anomaly_detection/ext.cpp +23 -0
- data/ext/anomaly_detection/extconf.rb +5 -0
- data/ext/anomaly_detection/stl.hpp +458 -0
- data/lib/anomaly_detection/version.rb +3 -0
- data/lib/anomaly_detection.rb +22 -0
- data/licenses/LICENSE-MIT-stl-cpp.txt +21 -0
- data/licenses/LICENSE-cdflib.txt +165 -0
- data/licenses/UNLICENSE-stl-cpp.txt +24 -0
- metadata +72 -0
data/NOTICE.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Copyright (C) 2015 Twitter, Inc and other contributors
|
|
2
|
+
Copyright (C) 2021 Andrew Kane
|
|
3
|
+
|
|
4
|
+
This program is free software: you can redistribute it and/or modify
|
|
5
|
+
it under the terms of the GNU General Public License as published by
|
|
6
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
7
|
+
(at your option) any later version.
|
|
8
|
+
|
|
9
|
+
This program is distributed in the hope that it will be useful,
|
|
10
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12
|
+
GNU General Public License for more details.
|
|
13
|
+
|
|
14
|
+
You should have received a copy of the GNU General Public License
|
|
15
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
data/README.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# AnomalyDetection.rb
|
|
2
|
+
|
|
3
|
+
:fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
|
|
4
|
+
|
|
5
|
+
Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
|
|
6
|
+
|
|
7
|
+
[Build Status](https://github.com/ankane/AnomalyDetection.rb/actions)
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Add this line to your application’s Gemfile:
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
gem 'anomaly_detection'
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Getting Started
|
|
18
|
+
|
|
19
|
+
Detect anomalies in a time series
|
|
20
|
+
|
|
21
|
+
```ruby
|
|
22
|
+
series = {
|
|
23
|
+
Date.parse("2020-01-01") => 100,
|
|
24
|
+
Date.parse("2020-01-02") => 150,
|
|
25
|
+
Date.parse("2020-01-03") => 136,
|
|
26
|
+
# ...
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
AnomalyDetection.detect(series, period: 7)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Works great with [Groupdate](https://github.com/ankane/groupdate)
|
|
33
|
+
|
|
34
|
+
```ruby
|
|
35
|
+
series = User.group_by_day(:created_at).count
|
|
36
|
+
AnomalyDetection.detect(series, period: 7)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Series can also be an array without times (the index is returned)
|
|
40
|
+
|
|
41
|
+
```ruby
|
|
42
|
+
series = [100, 150, 136, ...]
|
|
43
|
+
AnomalyDetection.detect(series, period: 7)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Options
|
|
47
|
+
|
|
48
|
+
Pass options
|
|
49
|
+
|
|
50
|
+
```ruby
|
|
51
|
+
AnomalyDetection.detect(
|
|
52
|
+
series,
|
|
53
|
+
period: 7, # number of observations in a single period
|
|
54
|
+
alpha: 0.05, # level of statistical significance
|
|
55
|
+
max_anoms: 0.1, # maximum number of anomalies as percent of data
|
|
56
|
+
direction: "both" # pos, neg, or both
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Credits
|
|
61
|
+
|
|
62
|
+
This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [cdflib](https://people.sc.fsu.edu/~jburkardt/cpp_src/cdflib/cdflib.html) for the quantile function.
|
|
63
|
+
|
|
64
|
+
## References
|
|
65
|
+
|
|
66
|
+
- [Automatic Anomaly Detection in the Cloud Via Statistical Learning](https://arxiv.org/abs/1704.07706)
|
|
67
|
+
|
|
68
|
+
## History
|
|
69
|
+
|
|
70
|
+
View the [changelog](https://github.com/ankane/AnomalyDetection.rb/blob/master/CHANGELOG.md)
|
|
71
|
+
|
|
72
|
+
## Contributing
|
|
73
|
+
|
|
74
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
|
75
|
+
|
|
76
|
+
- [Report bugs](https://github.com/ankane/AnomalyDetection.rb/issues)
|
|
77
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/AnomalyDetection.rb/pulls)
|
|
78
|
+
- Write, clarify, or fix documentation
|
|
79
|
+
- Suggest or add new features
|
|
80
|
+
|
|
81
|
+
To get started with development:
|
|
82
|
+
|
|
83
|
+
```sh
|
|
84
|
+
git clone https://github.com/ankane/AnomalyDetection.rb.git
|
|
85
|
+
cd AnomalyDetection.rb
|
|
86
|
+
bundle install
|
|
87
|
+
bundle exec rake compile
|
|
88
|
+
bundle exec rake test
|
|
89
|
+
```
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>

#include "cdflib.hpp"
#include "stl.hpp"
|
|
8
|
+
|
|
9
|
+
// Returns the median of `data`.
//
// The input is copied and sorted, so the caller's vector is left untouched.
// For an even number of elements the result is the mean of the two middle
// values; for an odd number both middle indexes coincide.
//
// Throws std::invalid_argument when `data` is empty: there is no middle
// element, and `size() - 1` would otherwise wrap around and index out of
// bounds.
float median(const std::vector<float>& data) {
    if (data.empty()) {
        throw std::invalid_argument("Cannot compute median of empty data");
    }

    std::vector<float> sorted(data);
    std::sort(sorted.begin(), sorted.end());

    const std::size_t lower_mid = (sorted.size() - 1) / 2;
    const std::size_t upper_mid = sorted.size() / 2;
    return (sorted[lower_mid] + sorted[upper_mid]) / 2.0;
}
|
|
14
|
+
|
|
15
|
+
float mad(const std::vector<float>& data) {
|
|
16
|
+
auto med = median(data);
|
|
17
|
+
std::vector<float> res;
|
|
18
|
+
res.reserve(data.size());
|
|
19
|
+
for (auto v : data) {
|
|
20
|
+
res.push_back(fabs(v - med));
|
|
21
|
+
}
|
|
22
|
+
return 1.4826 * median(res);
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// Calls cdflib's cdft with selector 2, the probability `p`, its complement
// `1 - p` and `df`, and returns the value cdft writes into its `t` output.
// NOTE(review): selector 2 presumably means "solve for t given p, q and df"
// (i.e. the Student's t quantile) -- confirm against cdflib.hpp.
//
// Throws std::invalid_argument when cdft reports a non-zero status.
float qt(double p, double df) {
    int selector = 2;
    double complement = 1 - p;
    double t_value;
    int status;
    double bound;  // written by cdft; not inspected here

    cdft(&selector, &p, &complement, &t_value, &df, &status, &bound);

    if (status == 0) {
        return t_value;
    }
    throw std::invalid_argument("Bad status");
}
|
|
38
|
+
|
|
39
|
+
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
|
|
40
|
+
auto num_obs = data.size();
|
|
41
|
+
|
|
42
|
+
// Check to make sure we have at least two periods worth of data for anomaly context
|
|
43
|
+
assert(num_obs >= num_obs_per_period * 2);
|
|
44
|
+
|
|
45
|
+
// Handle NANs
|
|
46
|
+
auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
|
|
47
|
+
if (nan > 0) {
|
|
48
|
+
throw std::invalid_argument("Data contains NANs");
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
|
|
52
|
+
auto seasonal_length = data.size() * 10 + 1;
|
|
53
|
+
auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
|
|
54
|
+
|
|
55
|
+
auto seasonal = data_decomp.seasonal;
|
|
56
|
+
auto med = median(data);
|
|
57
|
+
std::vector<float> data2;
|
|
58
|
+
data2.reserve(data.size());
|
|
59
|
+
for (auto i = 0; i < data.size(); i++) {
|
|
60
|
+
data2.push_back(data[i] - seasonal[i] - med);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
auto max_outliers = (size_t) num_obs * k;
|
|
64
|
+
assert(max_outliers > 0);
|
|
65
|
+
|
|
66
|
+
auto n = data2.size();
|
|
67
|
+
|
|
68
|
+
std::vector<size_t> r_idx;
|
|
69
|
+
|
|
70
|
+
std::vector<size_t> indexes;
|
|
71
|
+
indexes.reserve(data2.size());
|
|
72
|
+
for (auto i = 0; i < data2.size(); i++) {
|
|
73
|
+
indexes.push_back(i);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
// Compute test statistic until r=max_outliers values have been removed from the sample
|
|
77
|
+
for (auto i = 1; i <= max_outliers; i++) {
|
|
78
|
+
// TODO Improve performance between loop iterations
|
|
79
|
+
auto ma = median(data2);
|
|
80
|
+
std::vector<float> ares;
|
|
81
|
+
ares.reserve(data2.size());
|
|
82
|
+
if (one_tail) {
|
|
83
|
+
if (upper_tail) {
|
|
84
|
+
for (auto v : data2) {
|
|
85
|
+
ares.push_back(v - ma);
|
|
86
|
+
}
|
|
87
|
+
} else {
|
|
88
|
+
for (auto v : data2) {
|
|
89
|
+
ares.push_back(ma - v);
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
} else {
|
|
93
|
+
for (auto v : data2) {
|
|
94
|
+
ares.push_back(fabs(v - ma));
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Protect against constant time series
|
|
99
|
+
auto data_sigma = mad(data2);
|
|
100
|
+
if (data_sigma == 0.0) {
|
|
101
|
+
break;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
auto iter = std::max_element(ares.begin(), ares.end());
|
|
105
|
+
auto r_idx_i = std::distance(ares.begin(), iter);
|
|
106
|
+
auto r_idx_i2 = indexes[r_idx_i];
|
|
107
|
+
|
|
108
|
+
// Only need to take sigma of r for performance
|
|
109
|
+
auto r = ares[r_idx_i] / data_sigma;
|
|
110
|
+
|
|
111
|
+
// TODO Swap to last position and delete
|
|
112
|
+
data2.erase(data2.begin() + r_idx_i);
|
|
113
|
+
indexes.erase(indexes.begin() + r_idx_i);
|
|
114
|
+
|
|
115
|
+
// Compute critical value
|
|
116
|
+
float p;
|
|
117
|
+
if (one_tail) {
|
|
118
|
+
p = 1.0 - alpha / (n - i + 1);
|
|
119
|
+
} else {
|
|
120
|
+
p = 1.0 - alpha / (2.0 * (n - i + 1));
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
auto t = qt(p, n - i - 1);
|
|
124
|
+
auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
|
|
125
|
+
|
|
126
|
+
if (r > lam) {
|
|
127
|
+
r_idx.push_back(r_idx_i2);
|
|
128
|
+
} else {
|
|
129
|
+
break;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// Sort like R version
|
|
134
|
+
std::sort(r_idx.begin(), r_idx.end());
|
|
135
|
+
|
|
136
|
+
return r_idx;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction) {
|
|
140
|
+
bool one_tail;
|
|
141
|
+
bool upper_tail;
|
|
142
|
+
if (direction == "pos") {
|
|
143
|
+
one_tail = true;
|
|
144
|
+
upper_tail = true;
|
|
145
|
+
} else if (direction == "neg") {
|
|
146
|
+
one_tail = true;
|
|
147
|
+
upper_tail = false;
|
|
148
|
+
} else if (direction == "both") {
|
|
149
|
+
one_tail = false;
|
|
150
|
+
upper_tail = true; // not used
|
|
151
|
+
} else {
|
|
152
|
+
throw std::invalid_argument("Bad direction");
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
|
|
156
|
+
}
|