anomaly_detection 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/NOTICE.txt ADDED
@@ -0,0 +1,15 @@
1
+ Copyright (C) 2015 Twitter, Inc and other contributors
2
+ Copyright (C) 2021 Andrew Kane
3
+
4
+ This program is free software: you can redistribute it and/or modify
5
+ it under the terms of the GNU General Public License as published by
6
+ the Free Software Foundation, either version 3 of the License, or
7
+ (at your option) any later version.
8
+
9
+ This program is distributed in the hope that it will be useful,
10
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ GNU General Public License for more details.
13
+
14
+ You should have received a copy of the GNU General Public License
15
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
data/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # AnomalyDetection.rb
2
+
3
+ :fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
4
+
5
+ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
6
+
7
+ [![Build Status](https://github.com/ankane/AnomalyDetection.rb/workflows/build/badge.svg?branch=master)](https://github.com/ankane/AnomalyDetection.rb/actions)
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application’s Gemfile:
12
+
13
+ ```ruby
14
+ gem 'anomaly_detection'
15
+ ```
16
+
17
+ ## Getting Started
18
+
19
+ Detect anomalies in a time series
20
+
21
+ ```ruby
22
+ series = {
23
+ Date.parse("2020-01-01") => 100,
24
+ Date.parse("2020-01-02") => 150,
25
+ Date.parse("2020-01-03") => 136,
26
+ # ...
27
+ }
28
+
29
+ AnomalyDetection.detect(series, period: 7)
30
+ ```
31
+
32
+ Works great with [Groupdate](https://github.com/ankane/groupdate)
33
+
34
+ ```ruby
35
+ series = User.group_by_day(:created_at).count
36
+ AnomalyDetection.detect(series, period: 7)
37
+ ```
38
+
39
+ Series can also be an array without times (the index is returned)
40
+
41
+ ```ruby
42
+ series = [100, 150, 136, ...]
43
+ AnomalyDetection.detect(series, period: 7)
44
+ ```
45
+
46
+ ## Options
47
+
48
+ Pass options
49
+
50
+ ```ruby
51
+ AnomalyDetection.detect(
52
+ series,
53
+ period: 7, # number of observations in a single period
54
+ alpha: 0.05, # level of statistical significance
55
+ max_anoms: 0.1, # maximum number of anomalies as a fraction of the data (0.1 = 10%)
56
+ direction: "both" # pos, neg, or both
57
+ )
58
+ ```
59
+
60
+ ## Credits
61
+
62
+ This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [cdflib](https://people.sc.fsu.edu/~jburkardt/cpp_src/cdflib/cdflib.html) for the quantile function.
63
+
64
+ ## References
65
+
66
+ - [Automatic Anomaly Detection in the Cloud Via Statistical Learning](https://arxiv.org/abs/1704.07706)
67
+
68
+ ## History
69
+
70
+ View the [changelog](https://github.com/ankane/AnomalyDetection.rb/blob/master/CHANGELOG.md)
71
+
72
+ ## Contributing
73
+
74
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
75
+
76
+ - [Report bugs](https://github.com/ankane/AnomalyDetection.rb/issues)
77
+ - Fix bugs and [submit pull requests](https://github.com/ankane/AnomalyDetection.rb/pulls)
78
+ - Write, clarify, or fix documentation
79
+ - Suggest or add new features
80
+
81
+ To get started with development:
82
+
83
+ ```sh
84
+ git clone https://github.com/ankane/AnomalyDetection.rb.git
85
+ cd AnomalyDetection.rb
86
+ bundle install
87
+ bundle exec rake compile
88
+ bundle exec rake test
89
+ ```
@@ -0,0 +1,156 @@
1
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>

#include "cdflib.hpp"
#include "stl.hpp"
8
+
9
// Returns the median of `data`. For an even number of values this is the mean
// of the two middle values. `data` itself is not modified; a sorted copy is used.
//
// Throws std::invalid_argument when `data` is empty: there is no middle element,
// and `size() - 1` would wrap around and index far out of bounds.
float median(const std::vector<float>& data) {
    if (data.empty()) {
        throw std::invalid_argument("Cannot compute the median of an empty series");
    }

    std::vector<float> sorted(data);
    std::sort(sorted.begin(), sorted.end());

    // For an odd count both indexes land on the same (middle) element.
    const size_t count = sorted.size();
    const float lower = sorted[(count - 1) / 2];
    const float upper = sorted[count / 2];
    return (lower + upper) / 2.0f;
}
14
+
15
+ float mad(const std::vector<float>& data) {
16
+ auto med = median(data);
17
+ std::vector<float> res;
18
+ res.reserve(data.size());
19
+ for (auto v : data) {
20
+ res.push_back(fabs(v - med));
21
+ }
22
+ return 1.4826 * median(res);
23
+ }
24
+
25
+ float qt(double p, double df) {
26
+ int which = 2;
27
+ double q = 1 - p;
28
+ double t;
29
+ int status;
30
+ double bound;
31
+ cdft(&which, &p, &q, &t, &df, &status, &bound);
32
+
33
+ if (status != 0) {
34
+ throw std::invalid_argument("Bad status");
35
+ }
36
+ return t;
37
+ }
38
+
39
+ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
40
+ auto num_obs = data.size();
41
+
42
+ // Check to make sure we have at least two periods worth of data for anomaly context
43
+ assert(num_obs >= num_obs_per_period * 2);
44
+
45
+ // Handle NANs
46
+ auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
47
+ if (nan > 0) {
48
+ throw std::invalid_argument("Data contains NANs");
49
+ }
50
+
51
+ // Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
52
+ auto seasonal_length = data.size() * 10 + 1;
53
+ auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
54
+
55
+ auto seasonal = data_decomp.seasonal;
56
+ auto med = median(data);
57
+ std::vector<float> data2;
58
+ data2.reserve(data.size());
59
+ for (auto i = 0; i < data.size(); i++) {
60
+ data2.push_back(data[i] - seasonal[i] - med);
61
+ }
62
+
63
+ auto max_outliers = (size_t) num_obs * k;
64
+ assert(max_outliers > 0);
65
+
66
+ auto n = data2.size();
67
+
68
+ std::vector<size_t> r_idx;
69
+
70
+ std::vector<size_t> indexes;
71
+ indexes.reserve(data2.size());
72
+ for (auto i = 0; i < data2.size(); i++) {
73
+ indexes.push_back(i);
74
+ }
75
+
76
+ // Compute test statistic until r=max_outliers values have been removed from the sample
77
+ for (auto i = 1; i <= max_outliers; i++) {
78
+ // TODO Improve performance between loop iterations
79
+ auto ma = median(data2);
80
+ std::vector<float> ares;
81
+ ares.reserve(data2.size());
82
+ if (one_tail) {
83
+ if (upper_tail) {
84
+ for (auto v : data2) {
85
+ ares.push_back(v - ma);
86
+ }
87
+ } else {
88
+ for (auto v : data2) {
89
+ ares.push_back(ma - v);
90
+ }
91
+ }
92
+ } else {
93
+ for (auto v : data2) {
94
+ ares.push_back(fabs(v - ma));
95
+ }
96
+ }
97
+
98
+ // Protect against constant time series
99
+ auto data_sigma = mad(data2);
100
+ if (data_sigma == 0.0) {
101
+ break;
102
+ }
103
+
104
+ auto iter = std::max_element(ares.begin(), ares.end());
105
+ auto r_idx_i = std::distance(ares.begin(), iter);
106
+ auto r_idx_i2 = indexes[r_idx_i];
107
+
108
+ // Only need to take sigma of r for performance
109
+ auto r = ares[r_idx_i] / data_sigma;
110
+
111
+ // TODO Swap to last position and delete
112
+ data2.erase(data2.begin() + r_idx_i);
113
+ indexes.erase(indexes.begin() + r_idx_i);
114
+
115
+ // Compute critical value
116
+ float p;
117
+ if (one_tail) {
118
+ p = 1.0 - alpha / (n - i + 1);
119
+ } else {
120
+ p = 1.0 - alpha / (2.0 * (n - i + 1));
121
+ }
122
+
123
+ auto t = qt(p, n - i - 1);
124
+ auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
125
+
126
+ if (r > lam) {
127
+ r_idx.push_back(r_idx_i2);
128
+ } else {
129
+ break;
130
+ }
131
+ }
132
+
133
+ // Sort like R version
134
+ std::sort(r_idx.begin(), r_idx.end());
135
+
136
+ return r_idx;
137
+ }
138
+
139
+ std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction) {
140
+ bool one_tail;
141
+ bool upper_tail;
142
+ if (direction == "pos") {
143
+ one_tail = true;
144
+ upper_tail = true;
145
+ } else if (direction == "neg") {
146
+ one_tail = true;
147
+ upper_tail = false;
148
+ } else if (direction == "both") {
149
+ one_tail = false;
150
+ upper_tail = true; // not used
151
+ } else {
152
+ throw std::invalid_argument("Bad direction");
153
+ }
154
+
155
+ return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
156
+ }