anomaly_detection 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE.txt +1 -1
- data/README.md +2 -2
- data/ext/anomaly_detection/anomaly_detection.hpp +200 -2
- data/ext/anomaly_detection/dist.h +105 -49
- data/ext/anomaly_detection/ext.cpp +9 -3
- data/ext/anomaly_detection/stl.hpp +103 -50
- data/lib/anomaly_detection/version.rb +1 -1
- data/lib/anomaly_detection.rb +57 -2
- data/licenses/LICENSE-AnomalyDetection-cpp.txt +675 -0
- data/licenses/NOTICE-AnomalyDetection-cpp.txt +15 -0
- metadata +6 -5
- data/ext/anomaly_detection/anomaly_detection.cpp +0 -139
@@ -0,0 +1,15 @@
|
|
1
|
+
Copyright (C) 2015 Twitter, Inc and other contributors
|
2
|
+
Copyright (C) 2022 Andrew Kane
|
3
|
+
|
4
|
+
This program is free software: you can redistribute it and/or modify
|
5
|
+
it under the terms of the GNU General Public License as published by
|
6
|
+
the Free Software Foundation, either version 3 of the License, or
|
7
|
+
(at your option) any later version.
|
8
|
+
|
9
|
+
This program is distributed in the hope that it will be useful,
|
10
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12
|
+
GNU General Public License for more details.
|
13
|
+
|
14
|
+
You should have received a copy of the GNU General Public License
|
15
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anomaly_detection
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-02-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -35,7 +35,6 @@ files:
|
|
35
35
|
- LICENSE.txt
|
36
36
|
- NOTICE.txt
|
37
37
|
- README.md
|
38
|
-
- ext/anomaly_detection/anomaly_detection.cpp
|
39
38
|
- ext/anomaly_detection/anomaly_detection.hpp
|
40
39
|
- ext/anomaly_detection/dist.h
|
41
40
|
- ext/anomaly_detection/ext.cpp
|
@@ -43,8 +42,10 @@ files:
|
|
43
42
|
- ext/anomaly_detection/stl.hpp
|
44
43
|
- lib/anomaly_detection.rb
|
45
44
|
- lib/anomaly_detection/version.rb
|
45
|
+
- licenses/LICENSE-AnomalyDetection-cpp.txt
|
46
46
|
- licenses/LICENSE-MIT-dist-h.txt
|
47
47
|
- licenses/LICENSE-MIT-stl-cpp.txt
|
48
|
+
- licenses/NOTICE-AnomalyDetection-cpp.txt
|
48
49
|
- licenses/UNLICENSE-dist-h.txt
|
49
50
|
- licenses/UNLICENSE-stl-cpp.txt
|
50
51
|
homepage: https://github.com/ankane/AnomalyDetection.rb
|
@@ -59,14 +60,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
59
60
|
requirements:
|
60
61
|
- - ">="
|
61
62
|
- !ruby/object:Gem::Version
|
62
|
-
version: '2.
|
63
|
+
version: '2.7'
|
63
64
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
65
|
requirements:
|
65
66
|
- - ">="
|
66
67
|
- !ruby/object:Gem::Version
|
67
68
|
version: '0'
|
68
69
|
requirements: []
|
69
|
-
rubygems_version: 3.
|
70
|
+
rubygems_version: 3.4.1
|
70
71
|
signing_key:
|
71
72
|
specification_version: 4
|
72
73
|
summary: Time series anomaly detection for Ruby
|
@@ -1,139 +0,0 @@
|
|
1
|
-
#include <functional>
|
2
|
-
#include <iostream>
|
3
|
-
#include <iterator>
|
4
|
-
#include <numeric>
|
5
|
-
#include <string>
|
6
|
-
#include <vector>
|
7
|
-
|
8
|
-
#include "anomaly_detection.hpp"
|
9
|
-
#include "dist.h"
|
10
|
-
#include "stl.hpp"
|
11
|
-
|
12
|
-
namespace anomaly_detection {
|
13
|
-
|
14
|
-
float median(const std::vector<float>& sorted) {
|
15
|
-
return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
|
16
|
-
}
|
17
|
-
|
18
|
-
float mad(const std::vector<float>& data, float med) {
|
19
|
-
std::vector<float> res;
|
20
|
-
res.reserve(data.size());
|
21
|
-
for (auto v : data) {
|
22
|
-
res.push_back(fabs(v - med));
|
23
|
-
}
|
24
|
-
std::sort(res.begin(), res.end());
|
25
|
-
return 1.4826 * median(res);
|
26
|
-
}
|
27
|
-
|
28
|
-
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> check_for_interrupts) {
|
29
|
-
auto n = data.size();
|
30
|
-
|
31
|
-
// Check to make sure we have at least two periods worth of data for anomaly context
|
32
|
-
if (n < num_obs_per_period * 2) {
|
33
|
-
throw std::invalid_argument("series must contain at least 2 periods");
|
34
|
-
}
|
35
|
-
|
36
|
-
// Handle NANs
|
37
|
-
auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
|
38
|
-
if (nan > 0) {
|
39
|
-
throw std::invalid_argument("series contains NANs");
|
40
|
-
}
|
41
|
-
|
42
|
-
// Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
|
43
|
-
auto seasonal_length = n * 10 + 1;
|
44
|
-
auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
|
45
|
-
|
46
|
-
auto seasonal = data_decomp.seasonal;
|
47
|
-
auto med = median(data);
|
48
|
-
std::vector<float> data2;
|
49
|
-
data2.reserve(n);
|
50
|
-
for (auto i = 0; i < n; i++) {
|
51
|
-
data2.push_back(data[i] - seasonal[i] - med);
|
52
|
-
}
|
53
|
-
|
54
|
-
std::vector<size_t> r_idx;
|
55
|
-
auto num_anoms = 0;
|
56
|
-
auto max_outliers = (size_t) n * k;
|
57
|
-
|
58
|
-
// Sort data for fast median
|
59
|
-
// Use stable sort for indexes for deterministic results
|
60
|
-
std::vector<size_t> indexes(n);
|
61
|
-
std::iota(indexes.begin(), indexes.end(), 0);
|
62
|
-
std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
|
63
|
-
std::sort(data2.begin(), data2.end());
|
64
|
-
|
65
|
-
// Compute test statistic until r=max_outliers values have been removed from the sample
|
66
|
-
for (auto i = 1; i <= max_outliers; i++) {
|
67
|
-
check_for_interrupts();
|
68
|
-
|
69
|
-
if (verbose) {
|
70
|
-
std::cout << i << " / " << max_outliers << " completed" << std::endl;
|
71
|
-
}
|
72
|
-
|
73
|
-
// TODO Improve performance between loop iterations
|
74
|
-
auto ma = median(data2);
|
75
|
-
std::vector<float> ares;
|
76
|
-
ares.reserve(data2.size());
|
77
|
-
if (one_tail) {
|
78
|
-
if (upper_tail) {
|
79
|
-
for (auto v : data2) {
|
80
|
-
ares.push_back(v - ma);
|
81
|
-
}
|
82
|
-
} else {
|
83
|
-
for (auto v : data2) {
|
84
|
-
ares.push_back(ma - v);
|
85
|
-
}
|
86
|
-
}
|
87
|
-
} else {
|
88
|
-
for (auto v : data2) {
|
89
|
-
ares.push_back(fabs(v - ma));
|
90
|
-
}
|
91
|
-
}
|
92
|
-
|
93
|
-
// Protect against constant time series
|
94
|
-
auto data_sigma = mad(data2, ma);
|
95
|
-
if (data_sigma == 0.0) {
|
96
|
-
break;
|
97
|
-
}
|
98
|
-
|
99
|
-
auto iter = std::max_element(ares.begin(), ares.end());
|
100
|
-
auto r_idx_i = std::distance(ares.begin(), iter);
|
101
|
-
|
102
|
-
// Only need to take sigma of r for performance
|
103
|
-
auto r = ares[r_idx_i] / data_sigma;
|
104
|
-
|
105
|
-
r_idx.push_back(indexes[r_idx_i]);
|
106
|
-
data2.erase(data2.begin() + r_idx_i);
|
107
|
-
indexes.erase(indexes.begin() + r_idx_i);
|
108
|
-
|
109
|
-
// Compute critical value
|
110
|
-
float p;
|
111
|
-
if (one_tail) {
|
112
|
-
p = 1.0 - alpha / (n - i + 1);
|
113
|
-
} else {
|
114
|
-
p = 1.0 - alpha / (2.0 * (n - i + 1));
|
115
|
-
}
|
116
|
-
|
117
|
-
auto t = students_t_ppf(p, n - i - 1);
|
118
|
-
auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
|
119
|
-
|
120
|
-
if (r > lam) {
|
121
|
-
num_anoms = i;
|
122
|
-
}
|
123
|
-
}
|
124
|
-
|
125
|
-
std::vector<size_t> anomalies(r_idx.begin(), r_idx.begin() + num_anoms);
|
126
|
-
|
127
|
-
// Sort like R version
|
128
|
-
std::sort(anomalies.begin(), anomalies.end());
|
129
|
-
|
130
|
-
return anomalies;
|
131
|
-
}
|
132
|
-
|
133
|
-
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> check_for_interrupts) {
|
134
|
-
bool one_tail = direction != Direction::Both;
|
135
|
-
bool upper_tail = direction == Direction::Positive;
|
136
|
-
return detect_anoms(x, period, k, alpha, one_tail, upper_tail, verbose, check_for_interrupts);
|
137
|
-
}
|
138
|
-
|
139
|
-
}
|