RubyGems - anomaly_detection - Versions diffs - 0.1.3 → 0.2.0 - Mend

anomaly_detection 0.1.3 → 0.2.0

Files changed (14) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +10 -0
data/NOTICE.txt +1 -1
data/README.md +2 -2
data/ext/anomaly_detection/anomaly_detection.hpp +200 -2
data/ext/anomaly_detection/dist.h +105 -49
data/ext/anomaly_detection/ext.cpp +9 -3
data/ext/anomaly_detection/stl.hpp +103 -50
data/lib/anomaly_detection/version.rb +1 -1
data/lib/anomaly_detection.rb +57 -2
data/licenses/LICENSE-AnomalyDetection-cpp.txt +675 -0
data/licenses/NOTICE-AnomalyDetection-cpp.txt +15 -0
metadata +6 -5
data/ext/anomaly_detection/anomaly_detection.cpp +0 -139

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cfb709e43863f4221a67e8f675f28b5361f6bf33a0d0f6fa4f52cdc0cad01796
-  data.tar.gz: 40965f08bb75cdb673d43e42c7fc47403fbfb1b082de10cab86bac569916d5b8
+  metadata.gz: da5eb71023f77a4c05e6322c020ef602e8e22b7b5ba516fce99679af702c881d
+  data.tar.gz: 26560c8dd893c491bd3094202ff82ae33eefdcdba74fe4386b006f7f522906df
 SHA512:
-  metadata.gz: '0496d044ecbe143be64164bd88092c5a5bd660f5fec6425e9f6ea0759f2ab7dbaa41a055c9d03a78c8e5a661b86bb25629c5e7a809614ba415d941f4d63fc9cb'
-  data.tar.gz: 8cc3f28c981d0be5cdb3dbfd054910d45469d258a662e163d7b01379af93e6714874d043ff1521ae19506f742b63eb93bf631f6f895c11842c4c15028c66a4b8
+  metadata.gz: ec2e1459ca2410ee6ab1bce3fe9c528d6419b75e10c6448f1fe5b3030a2e3d8de320a23a9bded17702a01fd23d112007b909c8611e2da6c1ff4f8521352c89ac
+  data.tar.gz: ad150705d6e32a111c3bc044ef7f99910beebe572e07799719e72118422b7a9e6439943cac8b86d613537d7d0fb52cba86a668faf78d135abc888ce3737f8104

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,13 @@
+## 0.2.0 (2023-01-31)
+- Added experimental support for auto-detecting period
+- Fixed result when no seasonality (period is less than 2)
+- Dropped support for Ruby < 2.7
+## 0.1.4 (2022-03-19)
+- Fixed initial median calculation
 ## 0.1.3 (2022-01-03)
 - Switched to dist.h

data/NOTICE.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 Copyright (C) 2015 Twitter, Inc and other contributors
-Copyright (C) 2021 Andrew Kane
+Copyright (C) 2021-2023 Andrew Kane
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by

data/README.md CHANGED Viewed

@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
 Add this line to your application’s Gemfile:
 ```ruby
-gem 'anomaly_detection'
+gem "anomaly_detection"
 ```
 ## Getting Started
@@ -63,7 +63,7 @@ AnomalyDetection.detect(
 Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
 ```ruby
-gem 'vega'
+gem "vega"
 ```
 And use:

data/ext/anomaly_detection/anomaly_detection.hpp CHANGED Viewed

@@ -1,12 +1,210 @@
+/*!
+ * AnomalyDetection.cpp v0.1.3
+ * https://github.com/ankane/AnomalyDetection.cpp
+ * GPL-3.0-or-later License
+ */
 #pragma once
-#include <string>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <numeric>
 #include <vector>
+#include "dist.h"
+#include "stl.hpp"
 namespace anomaly_detection {
 enum Direction { Positive, Negative, Both };
-std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
+float median_sorted(const std::vector<float>& sorted) {
+    return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
+}
+float median(const std::vector<float>& data) {
+    std::vector<float> sorted(data);
+    std::sort(sorted.begin(), sorted.end());
+    return median_sorted(sorted);
+}
+float mad(const std::vector<float>& data, float med) {
+    std::vector<float> res;
+    res.reserve(data.size());
+    for (auto v : data) {
+        res.push_back(fabs(v - med));
+    }
+    std::sort(res.begin(), res.end());
+    return 1.4826 * median_sorted(res);
+}
+std::vector<size_t> detect_anoms(const std::vector<float>& data, size_t num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
+    auto n = data.size();
+    // Check to make sure we have at least two periods worth of data for anomaly context
+    if (n < num_obs_per_period * 2) {
+        throw std::invalid_argument("series must contain at least 2 periods");
+    }
+    // Handle NANs
+    auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
+    if (nan > 0) {
+        throw std::invalid_argument("series contains NANs");
+    }
+    std::vector<float> data2;
+    data2.reserve(n);
+    auto med = median(data);
+    if (num_obs_per_period > 1) {
+        // Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
+        auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
+        auto seasonal = data_decomp.seasonal;
+        for (size_t i = 0; i < n; i++) {
+            data2.push_back(data[i] - seasonal[i] - med);
+        }
+    } else {
+        for (size_t i = 0; i < n; i++) {
+            data2.push_back(data[i] - med);
+        }
+    }
+    auto num_anoms = 0;
+    auto max_outliers = (size_t) n * k;
+    std::vector<size_t> anomalies;
+    anomalies.reserve(max_outliers);
+    // Sort data for fast median
+    // Use stable sort for indexes for deterministic results
+    std::vector<size_t> indexes(n);
+    std::iota(indexes.begin(), indexes.end(), 0);
+    std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
+    std::sort(data2.begin(), data2.end());
+    // Compute test statistic until r=max_outliers values have been removed from the sample
+    for (auto i = 1; i <= max_outliers; i++) {
+        if (verbose) {
+            std::cout << i << " / " << max_outliers << " completed" << std::endl;
+        }
+        // TODO Improve performance between loop iterations
+        auto ma = median_sorted(data2);
+        std::vector<float> ares;
+        ares.reserve(data2.size());
+        if (one_tail) {
+            if (upper_tail) {
+                for (auto v : data2) {
+                    ares.push_back(v - ma);
+                }
+            } else {
+                for (auto v : data2) {
+                    ares.push_back(ma - v);
+                }
+            }
+        } else {
+            for (auto v : data2) {
+                ares.push_back(fabs(v - ma));
+            }
+        }
+        // Protect against constant time series
+        auto data_sigma = mad(data2, ma);
+        if (data_sigma == 0.0) {
+            break;
+        }
+        auto iter = std::max_element(ares.begin(), ares.end());
+        auto r_idx_i = std::distance(ares.begin(), iter);
+        // Only need to take sigma of r for performance
+        auto r = ares[r_idx_i] / data_sigma;
+        anomalies.push_back(indexes[r_idx_i]);
+        data2.erase(data2.begin() + r_idx_i);
+        indexes.erase(indexes.begin() + r_idx_i);
+        // Compute critical value
+        float p;
+        if (one_tail) {
+            p = 1.0 - alpha / (n - i + 1);
+        } else {
+            p = 1.0 - alpha / (2.0 * (n - i + 1));
+        }
+        auto t = students_t_ppf(p, n - i - 1);
+        auto lam = t * (n - i) / sqrt(((n - i - 1) + t * t) * (n - i + 1));
+        if (r > lam) {
+            num_anoms = i;
+        }
+        if (callback != nullptr) {
+            callback();
+        }
+    }
+    anomalies.resize(num_anoms);
+    // Sort like R version
+    std::sort(anomalies.begin(), anomalies.end());
+    return anomalies;
+}
+class AnomalyDetectionResult {
+public:
+    std::vector<size_t> anomalies;
+};
+class AnomalyDetectionParams {
+    float alpha_ = 0.05;
+    float max_anoms_ = 0.1;
+    Direction direction_ = Direction::Both;
+    bool verbose_ = false;
+    std::function<void()> callback_ = nullptr;
+public:
+    inline AnomalyDetectionParams alpha(float alpha) {
+        this->alpha_ = alpha;
+        return *this;
+    };
+    inline AnomalyDetectionParams max_anoms(float max_anoms) {
+        this->max_anoms_ = max_anoms;
+        return *this;
+    };
+    inline AnomalyDetectionParams direction(Direction direction) {
+        this->direction_ = direction;
+        return *this;
+    };
+    inline AnomalyDetectionParams verbose(bool verbose) {
+        this->verbose_ = verbose;
+        return *this;
+    };
+    inline AnomalyDetectionParams callback(std::function<void()> callback) {
+        this->callback_ = callback;
+        return *this;
+    };
+    AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
+};
+AnomalyDetectionParams params() {
+    return AnomalyDetectionParams();
+}
+AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
+    bool one_tail = this->direction_ != Direction::Both;
+    bool upper_tail = this->direction_ == Direction::Positive;
+    auto res = AnomalyDetectionResult();
+    res.anomalies = detect_anoms(series, period, this->max_anoms_, this->alpha_, one_tail, upper_tail, this->verbose_, this->callback_);
+    return res;
+}
 }

data/ext/anomaly_detection/dist.h CHANGED Viewed

@@ -1,72 +1,119 @@
 /*!
- * dist.h v0.1.0
+ * dist.h v0.3.0
  * https://github.com/ankane/dist.h
  * Unlicense OR MIT License
  */
 #pragma once
-#define _USE_MATH_DEFINES
-#include <assert.h>
 #include <math.h>
-// Winitzki, S. (2008).
-// A handy approximation for the error function and its inverse.
-// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
-// from https://sites.google.com/site/winitzki
-double erf(double x) {
-    double sign = x < 0 ? -1.0 : 1.0;
-    x = x < 0 ? -x : x;
-    double a = 0.14;
-    double x2 = x * x;
-    return sign * sqrt(1.0 - exp(-x2 * (4.0 / M_PI + a * x2) / (1.0 + a * x2)));
-}
+#ifdef M_E
+#define DIST_E M_E
+#else
+#define DIST_E 2.71828182845904523536
+#endif
-// Winitzki, S. (2008).
-// A handy approximation for the error function and its inverse.
-// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
-// from https://sites.google.com/site/winitzki
-double inverse_erf(double x) {
-    double sign = x < 0 ? -1.0 : 1.0;
-    x = x < 0 ? -x : x;
-    double a = 0.147;
-    double ln = log(1.0 - x * x);
-    double f1 = 2.0 / (M_PI * a);
-    double f2 = ln / 2.0;
-    double f3 = f1 + f2;
-    double f4 = 1.0 / a * ln;
-    return sign * sqrt(-f1 - f2 + sqrt(f3 * f3 - f4));
-}
+#ifdef M_PI
+#define DIST_PI M_PI
+#else
+#define DIST_PI 3.14159265358979323846
+#endif
+#ifdef M_SQRT2
+#define DIST_SQRT2 M_SQRT2
+#else
+#define DIST_SQRT2 1.41421356237309504880
+#endif
 double normal_pdf(double x, double mean, double std_dev) {
-    double var = std_dev * std_dev;
-    return (1.0 / (var * sqrt(2.0 * M_PI))) * pow(M_E, -0.5 * pow((x - mean) / var, 2));
+    if (std_dev <= 0) {
+        return NAN;
+    }
+    double n = (x - mean) / std_dev;
+    return (1.0 / (std_dev * sqrt(2.0 * DIST_PI))) * pow(DIST_E, -0.5 * n * n);
 }
 double normal_cdf(double x, double mean, double std_dev) {
-    return 0.5 * (1.0 + erf((x - mean) / (std_dev * std_dev * sqrt(2))));
+    if (std_dev <= 0) {
+        return NAN;
+    }
+    return 0.5 * (1.0 + erf((x - mean) / (std_dev * DIST_SQRT2)));
 }
+// Wichura, M. J. (1988).
+// Algorithm AS 241: The Percentage Points of the Normal Distribution.
+// Journal of the Royal Statistical Society. Series C (Applied Statistics), 37(3), 477-484.
 double normal_ppf(double p, double mean, double std_dev) {
-    assert(p >= 0 && p <= 1);
+    if (p < 0 || p > 1 || std_dev <= 0 || isnan(mean) || isnan(std_dev)) {
+        return NAN;
+    }
+    if (p == 0) {
+        return -INFINITY;
+    }
-    return mean + (std_dev * std_dev) * sqrt(2) * inverse_erf(2.0 * p - 1.0);
+    if (p == 1) {
+        return INFINITY;
+    }
+    double q = p - 0.5;
+    if (fabs(q) < 0.425) {
+        double r = 0.180625 - q * q;
+        return mean + std_dev * q *
+            (((((((2.5090809287301226727e3 * r + 3.3430575583588128105e4) * r + 6.7265770927008700853e4) * r + 4.5921953931549871457e4) * r + 1.3731693765509461125e4) * r + 1.9715909503065514427e3) * r + 1.3314166789178437745e2) * r + 3.3871328727963666080e0) /
+            (((((((5.2264952788528545610e3 * r + 2.8729085735721942674e4) * r + 3.9307895800092710610e4) * r + 2.1213794301586595867e4) * r + 5.3941960214247511077e3) * r + 6.8718700749205790830e2) * r + 4.2313330701600911252e1) * r + 1);
+    } else {
+        double r = q < 0 ? p : 1 - p;
+        r = sqrt(-log(r));
+        double sign = q < 0 ? -1 : 1;
+        if (r < 5) {
+            r -= 1.6;
+            return mean + std_dev * sign *
+                (((((((7.74545014278341407640e-4 * r + 2.27238449892691845833e-2) * r + 2.41780725177450611770e-1) * r + 1.27045825245236838258e0) * r + 3.64784832476320460504e0) * r + 5.76949722146069140550e0) * r + 4.63033784615654529590e0) * r + 1.42343711074968357734e0) /
+                (((((((1.05075007164441684324e-9 * r + 5.47593808499534494600e-4) * r + 1.51986665636164571966e-2) * r + 1.48103976427480074590e-1) * r + 6.89767334985100004550e-1) * r + 1.67638483018380384940e0) * r + 2.05319162663775882187e0) * r + 1);
+        } else {
+            r -= 5;
+            return mean + std_dev * sign *
+                (((((((2.01033439929228813265e-7 * r + 2.71155556874348757815e-5) * r + 1.24266094738807843860e-3) * r + 2.65321895265761230930e-2) * r + 2.96560571828504891230e-1) * r + 1.78482653991729133580e0) * r + 5.46378491116411436990e0) * r + 6.65790464350110377720e0) /
+                (((((((2.04426310338993978564e-15 * r + 1.42151175831644588870e-7) * r + 1.84631831751005468180e-5) * r + 7.86869131145613259100e-4) * r + 1.48753612908506148525e-2) * r + 1.36929880922735805310e-1) * r + 5.99832206555887937690e-1) * r + 1);
+        }
+    }
 }
-double students_t_pdf(double x, unsigned int n) {
-    assert(n >= 1);
+double students_t_pdf(double x, double n) {
+    if (n <= 0) {
+        return NAN;
+    }
+    if (n == INFINITY) {
+        return normal_pdf(x, 0, 1);
+    }
-    return tgamma((n + 1.0) / 2.0) / (sqrt(n * M_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
+    return tgamma((n + 1.0) / 2.0) / (sqrt(n * DIST_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
 }
 // Hill, G. W. (1970).
 // Algorithm 395: Student's t-distribution.
 // Communications of the ACM, 13(10), 617-619.
-double students_t_cdf(double x, unsigned int n) {
-    assert(n >= 1);
+double students_t_cdf(double x, double n) {
+    if (n < 1) {
+        return NAN;
+    }
+    if (isnan(x)) {
+        return NAN;
+    }
+    if (!isfinite(x)) {
+        return x < 0 ? 0 : 1;
+    }
+    if (n == INFINITY) {
+        return normal_cdf(x, 0, 1);
+    }
     double start = x < 0 ? 0 : 1;
     double sign = x < 0 ? 1 : -1;
@@ -76,7 +123,7 @@ double students_t_cdf(double x, unsigned int n) {
     double y = t / n;
     double b = 1.0 + y;
-    if ((n >= 20 && t < n) || n > 200) {
+    if (n > floor(n) || (n >= 20 && t < n) || n > 200) {
         // asymptotic series for large or noninteger n
         if (y > 10e-6) {
             y = log(b);
@@ -88,6 +135,10 @@ double students_t_cdf(double x, unsigned int n) {
         return start + sign * normal_cdf(-y, 0.0, 1.0);
     }
+    // make n int
+    // n is an integer between 1 and 200 if we made it here
+    n = (int) n;
     if (n < 20 && t < 4.0) {
         // nested summation of cosine series
         y = sqrt(y);
@@ -104,7 +155,7 @@ double students_t_cdf(double x, unsigned int n) {
                 n -= 2;
             }
         }
-        a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / M_PI);
+        a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
         return start + sign * (z - a) / 2;
     }
@@ -127,16 +178,21 @@ double students_t_cdf(double x, unsigned int n) {
         a = (n - 1) / (b * n) * a + y;
         n -= 2;
     }
-    a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / M_PI);
+    a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
     return start + sign * (z - a) / 2;
 }
 // Hill, G. W. (1970).
 // Algorithm 396: Student's t-quantiles.
 // Communications of the ACM, 13(10), 619-620.
-double students_t_ppf(double p, unsigned int n) {
-    assert(p >= 0 && p <= 1);
-    assert(n >= 1);
+double students_t_ppf(double p, double n) {
+    if (p < 0 || p > 1 || n < 1) {
+        return NAN;
+    }
+    if (n == INFINITY) {
+        return normal_ppf(p, 0, 1);
+    }
     // distribution is symmetric
     double sign = p < 0.5 ? -1 : 1;
@@ -149,7 +205,7 @@ double students_t_ppf(double p, unsigned int n) {
         return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
     }
-    double half_pi = M_PI / 2.0;
+    double half_pi = DIST_PI / 2.0;
     if (n == 1) {
         p = p * half_pi;

data/ext/anomaly_detection/ext.cpp CHANGED Viewed

@@ -12,7 +12,7 @@ void Init_ext() {
   rb_mAnomalyDetection
     .define_singleton_function(
       "_detect",
-      [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
+      [](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
         Direction dir;
         if (direction == "pos") {
           dir = Direction::Positive;
@@ -24,10 +24,16 @@ void Init_ext() {
           throw std::invalid_argument("direction must be pos, neg, or both");
         }
-        auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
+        auto res = anomaly_detection::params()
+          .max_anoms(k)
+          .alpha(alpha)
+          .direction(dir)
+          .verbose(verbose)
+          .callback(rb_thread_check_ints)
+          .fit(series, period);
         auto a = Rice::Array();
-        for (auto v : res) {
+        for (auto v : res.anomalies) {
           a.push(v);
         }
         return a;