anomaly_detection 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/NOTICE.txt ADDED
@@ -0,0 +1,15 @@
1
+ Copyright (C) 2015 Twitter, Inc and other contributors
2
+ Copyright (C) 2021 Andrew Kane
3
+
4
+ This program is free software: you can redistribute it and/or modify
5
+ it under the terms of the GNU General Public License as published by
6
+ the Free Software Foundation, either version 3 of the License, or
7
+ (at your option) any later version.
8
+
9
+ This program is distributed in the hope that it will be useful,
10
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ GNU General Public License for more details.
13
+
14
+ You should have received a copy of the GNU General Public License
15
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
data/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # AnomalyDetection.rb
2
+
3
+ :fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
4
+
5
+ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
6
+
7
+ [![Build Status](https://github.com/ankane/AnomalyDetection.rb/workflows/build/badge.svg?branch=master)](https://github.com/ankane/AnomalyDetection.rb/actions)
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application’s Gemfile:
12
+
13
+ ```ruby
14
+ gem 'anomaly_detection'
15
+ ```
16
+
17
+ ## Getting Started
18
+
19
+ Detect anomalies in a time series
20
+
21
+ ```ruby
22
+ series = {
23
+ Date.parse("2020-01-01") => 100,
24
+ Date.parse("2020-01-02") => 150,
25
+ Date.parse("2020-01-03") => 136,
26
+ # ...
27
+ }
28
+
29
+ AnomalyDetection.detect(series, period: 7)
30
+ ```
31
+
32
+ Works great with [Groupdate](https://github.com/ankane/groupdate)
33
+
34
+ ```ruby
35
+ series = User.group_by_day(:created_at).count
36
+ AnomalyDetection.detect(series, period: 7)
37
+ ```
38
+
39
+ Series can also be an array without times (the index is returned)
40
+
41
+ ```ruby
42
+ series = [100, 150, 136, ...]
43
+ AnomalyDetection.detect(series, period: 7)
44
+ ```
45
+
46
+ ## Options
47
+
48
+ Pass options
49
+
50
+ ```ruby
51
+ AnomalyDetection.detect(
52
+ series,
53
+ period: 7, # number of observations in a single period
54
+ alpha: 0.05, # level of statistical significance
55
+ max_anoms: 0.1, # maximum number of anomalies as a fraction of the data (0.1 = 10%)
56
+ direction: "both" # pos, neg, or both
57
+ )
58
+ ```
59
+
60
+ ## Credits
61
+
62
+ This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [cdflib](https://people.sc.fsu.edu/~jburkardt/cpp_src/cdflib/cdflib.html) for the quantile function.
63
+
64
+ ## References
65
+
66
+ - [Automatic Anomaly Detection in the Cloud Via Statistical Learning](https://arxiv.org/abs/1704.07706)
67
+
68
+ ## History
69
+
70
+ View the [changelog](https://github.com/ankane/AnomalyDetection.rb/blob/master/CHANGELOG.md)
71
+
72
+ ## Contributing
73
+
74
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
75
+
76
+ - [Report bugs](https://github.com/ankane/AnomalyDetection.rb/issues)
77
+ - Fix bugs and [submit pull requests](https://github.com/ankane/AnomalyDetection.rb/pulls)
78
+ - Write, clarify, or fix documentation
79
+ - Suggest or add new features
80
+
81
+ To get started with development:
82
+
83
+ ```sh
84
+ git clone https://github.com/ankane/AnomalyDetection.rb.git
85
+ cd AnomalyDetection.rb
86
+ bundle install
87
+ bundle exec rake compile
88
+ bundle exec rake test
89
+ ```
@@ -0,0 +1,156 @@
1
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>

#include "cdflib.hpp"
#include "stl.hpp"
8
+
9
// Returns the median of `data`.
//
// The input is copied and the copy is sorted, so the caller's vector is left
// untouched. For an even number of elements the two middle values are
// averaged; for an odd number both lookups hit the same middle element.
//
// data - values to summarize; must not be empty
//
// Throws std::invalid_argument if `data` is empty.
float median(const std::vector<float>& data) {
  if (data.empty()) {
    // Without this guard `size() - 1` wraps around and the lookups below
    // index into an empty vector.
    throw std::invalid_argument("Cannot take median of empty data");
  }

  std::vector<float> sorted(data);
  std::sort(sorted.begin(), sorted.end());

  const float lower_mid = sorted[(sorted.size() - 1) / 2];
  const float upper_mid = sorted[sorted.size() / 2];
  return (lower_mid + upper_mid) / 2.0f;
}
14
+
15
+ float mad(const std::vector<float>& data) {
16
+ auto med = median(data);
17
+ std::vector<float> res;
18
+ res.reserve(data.size());
19
+ for (auto v : data) {
20
+ res.push_back(fabs(v - med));
21
+ }
22
+ return 1.4826 * median(res);
23
+ }
24
+
25
+ float qt(double p, double df) {
26
+ int which = 2;
27
+ double q = 1 - p;
28
+ double t;
29
+ int status;
30
+ double bound;
31
+ cdft(&which, &p, &q, &t, &df, &status, &bound);
32
+
33
+ if (status != 0) {
34
+ throw std::invalid_argument("Bad status");
35
+ }
36
+ return t;
37
+ }
38
+
39
+ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
40
+ auto num_obs = data.size();
41
+
42
+ // Check to make sure we have at least two periods worth of data for anomaly context
43
+ assert(num_obs >= num_obs_per_period * 2);
44
+
45
+ // Handle NANs
46
+ auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
47
+ if (nan > 0) {
48
+ throw std::invalid_argument("Data contains NANs");
49
+ }
50
+
51
+ // Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
52
+ auto seasonal_length = data.size() * 10 + 1;
53
+ auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
54
+
55
+ auto seasonal = data_decomp.seasonal;
56
+ auto med = median(data);
57
+ std::vector<float> data2;
58
+ data2.reserve(data.size());
59
+ for (auto i = 0; i < data.size(); i++) {
60
+ data2.push_back(data[i] - seasonal[i] - med);
61
+ }
62
+
63
+ auto max_outliers = (size_t) num_obs * k;
64
+ assert(max_outliers > 0);
65
+
66
+ auto n = data2.size();
67
+
68
+ std::vector<size_t> r_idx;
69
+
70
+ std::vector<size_t> indexes;
71
+ indexes.reserve(data2.size());
72
+ for (auto i = 0; i < data2.size(); i++) {
73
+ indexes.push_back(i);
74
+ }
75
+
76
+ // Compute test statistic until r=max_outliers values have been removed from the sample
77
+ for (auto i = 1; i <= max_outliers; i++) {
78
+ // TODO Improve performance between loop iterations
79
+ auto ma = median(data2);
80
+ std::vector<float> ares;
81
+ ares.reserve(data2.size());
82
+ if (one_tail) {
83
+ if (upper_tail) {
84
+ for (auto v : data2) {
85
+ ares.push_back(v - ma);
86
+ }
87
+ } else {
88
+ for (auto v : data2) {
89
+ ares.push_back(ma - v);
90
+ }
91
+ }
92
+ } else {
93
+ for (auto v : data2) {
94
+ ares.push_back(fabs(v - ma));
95
+ }
96
+ }
97
+
98
+ // Protect against constant time series
99
+ auto data_sigma = mad(data2);
100
+ if (data_sigma == 0.0) {
101
+ break;
102
+ }
103
+
104
+ auto iter = std::max_element(ares.begin(), ares.end());
105
+ auto r_idx_i = std::distance(ares.begin(), iter);
106
+ auto r_idx_i2 = indexes[r_idx_i];
107
+
108
+ // Only need to take sigma of r for performance
109
+ auto r = ares[r_idx_i] / data_sigma;
110
+
111
+ // TODO Swap to last position and delete
112
+ data2.erase(data2.begin() + r_idx_i);
113
+ indexes.erase(indexes.begin() + r_idx_i);
114
+
115
+ // Compute critical value
116
+ float p;
117
+ if (one_tail) {
118
+ p = 1.0 - alpha / (n - i + 1);
119
+ } else {
120
+ p = 1.0 - alpha / (2.0 * (n - i + 1));
121
+ }
122
+
123
+ auto t = qt(p, n - i - 1);
124
+ auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
125
+
126
+ if (r > lam) {
127
+ r_idx.push_back(r_idx_i2);
128
+ } else {
129
+ break;
130
+ }
131
+ }
132
+
133
+ // Sort like R version
134
+ std::sort(r_idx.begin(), r_idx.end());
135
+
136
+ return r_idx;
137
+ }
138
+
139
+ std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction) {
140
+ bool one_tail;
141
+ bool upper_tail;
142
+ if (direction == "pos") {
143
+ one_tail = true;
144
+ upper_tail = true;
145
+ } else if (direction == "neg") {
146
+ one_tail = true;
147
+ upper_tail = false;
148
+ } else if (direction == "both") {
149
+ one_tail = false;
150
+ upper_tail = true; // not used
151
+ } else {
152
+ throw std::invalid_argument("Bad direction");
153
+ }
154
+
155
+ return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
156
+ }