RubyGems - anomaly_detection - Versions diffs - 0.1.3 → 0.2.0 - Mend

anomaly_detection 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +10 -0
data/NOTICE.txt +1 -1
data/README.md +2 -2
data/ext/anomaly_detection/anomaly_detection.hpp +200 -2
data/ext/anomaly_detection/dist.h +105 -49
data/ext/anomaly_detection/ext.cpp +9 -3
data/ext/anomaly_detection/stl.hpp +103 -50
data/lib/anomaly_detection/version.rb +1 -1
data/lib/anomaly_detection.rb +57 -2
data/licenses/LICENSE-AnomalyDetection-cpp.txt +675 -0
data/licenses/NOTICE-AnomalyDetection-cpp.txt +15 -0
metadata +6 -5
data/ext/anomaly_detection/anomaly_detection.cpp +0 -139

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cfb709e43863f4221a67e8f675f28b5361f6bf33a0d0f6fa4f52cdc0cad01796
-  data.tar.gz: 40965f08bb75cdb673d43e42c7fc47403fbfb1b082de10cab86bac569916d5b8
+  metadata.gz: da5eb71023f77a4c05e6322c020ef602e8e22b7b5ba516fce99679af702c881d
+  data.tar.gz: 26560c8dd893c491bd3094202ff82ae33eefdcdba74fe4386b006f7f522906df
 SHA512:
-  metadata.gz: '0496d044ecbe143be64164bd88092c5a5bd660f5fec6425e9f6ea0759f2ab7dbaa41a055c9d03a78c8e5a661b86bb25629c5e7a809614ba415d941f4d63fc9cb'
-  data.tar.gz: 8cc3f28c981d0be5cdb3dbfd054910d45469d258a662e163d7b01379af93e6714874d043ff1521ae19506f742b63eb93bf631f6f895c11842c4c15028c66a4b8
+  metadata.gz: ec2e1459ca2410ee6ab1bce3fe9c528d6419b75e10c6448f1fe5b3030a2e3d8de320a23a9bded17702a01fd23d112007b909c8611e2da6c1ff4f8521352c89ac
+  data.tar.gz: ad150705d6e32a111c3bc044ef7f99910beebe572e07799719e72118422b7a9e6439943cac8b86d613537d7d0fb52cba86a668faf78d135abc888ce3737f8104

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,13 @@
+## 0.2.0 (2023-01-31)
+- Added experimental support for auto-detecting period
+- Fixed result when no seasonality (period is less than 2)
+- Dropped support for Ruby < 2.7
+## 0.1.4 (2022-03-19)
+- Fixed initial median calculation
 ## 0.1.3 (2022-01-03)
 - Switched to dist.h

data/NOTICE.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 Copyright (C) 2015 Twitter, Inc and other contributors
-Copyright (C) 2021 Andrew Kane
+Copyright (C) 2021-2023 Andrew Kane
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by

data/README.md CHANGED Viewed

@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
 Add this line to your application’s Gemfile:
 ```ruby
-gem 'anomaly_detection'
+gem "anomaly_detection"
 ```
 ## Getting Started
@@ -63,7 +63,7 @@ AnomalyDetection.detect(
 Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
 ```ruby
-gem 'vega'
+gem "vega"
 ```
 And use:

data/ext/anomaly_detection/anomaly_detection.hpp CHANGED Viewed

@@ -1,12 +1,210 @@
+/*!
+ * AnomalyDetection.cpp v0.1.3
+ * https://github.com/ankane/AnomalyDetection.cpp
+ * GPL-3.0-or-later License
+ */
 #pragma once
-#include <string>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <numeric>
 #include <vector>
+#include "dist.h"
+#include "stl.hpp"
 namespace anomaly_detection {
 // Which side(s) of the distribution to test: AnomalyDetectionParams::fit treats
 // Both as a two-tailed test, Positive as the upper tail and Negative as the lower tail.
 enum Direction { Positive, Negative, Both };
-std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
+// Returns the median of a sequence that is already sorted ascending: the middle
+// element for an odd count, the mean of the two middle elements for an even count.
+// Returns NAN for an empty sequence, which has no middle element to read.
+// Declared inline because this header defines the function body.
+inline float median_sorted(const std::vector<float>& sorted) {
+    if (sorted.empty()) {
+        // Without this guard (size() - 1) wraps around and the lookup reads out of bounds
+        return NAN;
+    }
+    const size_t count = sorted.size();
+    return (sorted[(count - 1) / 2] + sorted[count / 2]) / 2.0;
+}
+// Returns the median of `data` without modifying the caller's vector:
+// a copy is sorted ascending and passed to median_sorted.
+// Declared inline because this header defines the function body.
+inline float median(const std::vector<float>& data) {
+    std::vector<float> sorted(data);
+    std::sort(sorted.begin(), sorted.end());
+    return median_sorted(sorted);
+}
+// Returns the median of the absolute deviations of `data` from `med`,
+// multiplied by the constant 1.4826.
+// Declared inline because this header defines the function body.
+inline float mad(const std::vector<float>& data, float med) {
+    std::vector<float> deviations;
+    deviations.reserve(data.size());
+    for (auto v : data) {
+        deviations.push_back(fabs(v - med));
+    }
+    // median_sorted requires ascending order
+    std::sort(deviations.begin(), deviations.end());
+    return 1.4826 * median_sorted(deviations);
+}
+// Iteratively removes the most extreme residual from the series and compares its
+// test statistic against a Student's t based critical value. Returns the indexes
+// (into `data`) of the detected anomalies in ascending order.
+//
+// data: series values; must not contain NaN
+// num_obs_per_period: observations per period; above 1 the seasonal component
+//   from an STL fit is subtracted, otherwise only the median is subtracted
+// k: fraction of the series length used as the maximum number of removals
+// alpha: significance level used when computing the critical value
+// one_tail: test signed deviations instead of absolute deviations
+// upper_tail: with one_tail, test values above the median (true) or below (false)
+// verbose: print one progress line per iteration to std::cout
+// callback: invoked at the end of each completed iteration when not null
+//
+// Throws std::invalid_argument when the series is shorter than two periods or
+// contains NaN.
+// Declared inline because this header defines the function body.
+inline std::vector<size_t> detect_anoms(const std::vector<float>& data, size_t num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
+    const size_t n = data.size();
+    // Check to make sure we have at least two periods worth of data for anomaly context
+    if (n < num_obs_per_period * 2) {
+        throw std::invalid_argument("series must contain at least 2 periods");
+    }
+    // Handle NANs
+    if (std::any_of(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); })) {
+        throw std::invalid_argument("series contains NANs");
+    }
+    std::vector<float> data2;
+    data2.reserve(n);
+    const float med = median(data);
+    if (num_obs_per_period > 1) {
+        // Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
+        auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
+        const auto& seasonal = data_decomp.seasonal;
+        for (size_t i = 0; i < n; i++) {
+            data2.push_back(data[i] - seasonal[i] - med);
+        }
+    } else {
+        for (size_t i = 0; i < n; i++) {
+            data2.push_back(data[i] - med);
+        }
+    }
+    // The cast has to wrap the whole product: casting only n leaves the bound as a
+    // float. The multiplication itself stays in float so the truncated value is unchanged.
+    const size_t max_outliers = static_cast<size_t>(n * k);
+    size_t num_anoms = 0;
+    std::vector<size_t> anomalies;
+    anomalies.reserve(max_outliers);
+    // Sort data for fast median
+    // Use stable sort for indexes for deterministic results
+    std::vector<size_t> indexes(n);
+    std::iota(indexes.begin(), indexes.end(), 0);
+    std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
+    std::sort(data2.begin(), data2.end());
+    // Compute test statistic until r=max_outliers values have been removed from the sample
+    for (size_t i = 1; i <= max_outliers; i++) {
+        if (verbose) {
+            std::cout << i << " / " << max_outliers << " completed" << std::endl;
+        }
+        // TODO Improve performance between loop iterations
+        auto ma = median_sorted(data2);
+        std::vector<float> ares;
+        ares.reserve(data2.size());
+        if (one_tail) {
+            if (upper_tail) {
+                for (auto v : data2) {
+                    ares.push_back(v - ma);
+                }
+            } else {
+                for (auto v : data2) {
+                    ares.push_back(ma - v);
+                }
+            }
+        } else {
+            for (auto v : data2) {
+                ares.push_back(fabs(v - ma));
+            }
+        }
+        // Protect against constant time series
+        auto data_sigma = mad(data2, ma);
+        if (data_sigma == 0.0) {
+            break;
+        }
+        auto iter = std::max_element(ares.begin(), ares.end());
+        auto r_idx_i = std::distance(ares.begin(), iter);
+        // Only need to take sigma of r for performance
+        auto r = ares[r_idx_i] / data_sigma;
+        // data2 and indexes are kept in the same order, so the same position
+        // identifies both the residual and its index in the original series
+        anomalies.push_back(indexes[r_idx_i]);
+        data2.erase(data2.begin() + r_idx_i);
+        indexes.erase(indexes.begin() + r_idx_i);
+        // Compute critical value
+        float p;
+        if (one_tail) {
+            p = 1.0 - alpha / (n - i + 1);
+        } else {
+            p = 1.0 - alpha / (2.0 * (n - i + 1));
+        }
+        auto t = students_t_ppf(p, n - i - 1);
+        auto lam = t * (n - i) / sqrt(((n - i - 1) + t * t) * (n - i + 1));
+        // Remember the last iteration whose statistic exceeded the critical value;
+        // every removal up to and including it is reported
+        if (r > lam) {
+            num_anoms = i;
+        }
+        if (callback != nullptr) {
+            callback();
+        }
+    }
+    anomalies.resize(num_anoms);
+    // Sort like R version
+    std::sort(anomalies.begin(), anomalies.end());
+    return anomalies;
+}
+// Value returned by AnomalyDetectionParams::fit.
+class AnomalyDetectionResult {
+public:
+    // Indexes returned by detect_anoms for the fitted series
+    std::vector<size_t> anomalies;
+};
+// Chainable configuration for anomaly detection. Every setter stores its argument
+// on this object and returns a copy of the updated object; fit() runs the detection.
+class AnomalyDetectionParams {
+    float alpha_ = 0.05;
+    float max_anoms_ = 0.1;
+    Direction direction_ = Direction::Both;
+    bool verbose_ = false;
+    std::function<void()> callback_ = nullptr;
+public:
+    // Stores the alpha value that fit() forwards to detect_anoms
+    AnomalyDetectionParams alpha(float alpha) {
+        alpha_ = alpha;
+        return *this;
+    }
+    // Stores the max_anoms value that fit() forwards to detect_anoms as `k`
+    AnomalyDetectionParams max_anoms(float max_anoms) {
+        max_anoms_ = max_anoms;
+        return *this;
+    }
+    // Stores which direction(s) fit() should test
+    AnomalyDetectionParams direction(Direction direction) {
+        direction_ = direction;
+        return *this;
+    }
+    // Stores the verbose flag that fit() forwards to detect_anoms
+    AnomalyDetectionParams verbose(bool verbose) {
+        verbose_ = verbose;
+        return *this;
+    }
+    // Stores the callback that fit() forwards to detect_anoms
+    AnomalyDetectionParams callback(std::function<void()> callback) {
+        callback_ = callback;
+        return *this;
+    }
+    // Runs detection on `series` with the stored settings
+    AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
+};
+// Returns a default-constructed AnomalyDetectionParams to start a setter chain.
+// Declared inline because this header defines the function body.
+inline AnomalyDetectionParams params() {
+    return AnomalyDetectionParams();
+}
+// Runs detect_anoms on `series` with the stored settings and wraps the returned
+// indexes in an AnomalyDetectionResult. Exceptions from detect_anoms propagate.
+// Declared inline because this header defines the function body.
+inline AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
+    // Anything other than Both is a one-tailed test; Positive selects the upper tail
+    const bool one_tail = direction_ != Direction::Both;
+    const bool upper_tail = direction_ == Direction::Positive;
+    AnomalyDetectionResult res;
+    res.anomalies = detect_anoms(series, period, max_anoms_, alpha_, one_tail, upper_tail, verbose_, callback_);
+    return res;
+}
 }

data/ext/anomaly_detection/dist.h CHANGED Viewed

@@ -1,72 +1,119 @@
 /*!
- * dist.h v0.1.0
+ * dist.h v0.3.0
  * https://github.com/ankane/dist.h
  * Unlicense OR MIT License
  */
 #pragma once
-#define _USE_MATH_DEFINES
-#include <assert.h>
 #include <math.h>
-// Winitzki, S. (2008).
-// A handy approximation for the error function and its inverse.
-// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
-// from https://sites.google.com/site/winitzki
-double erf(double x) {
-    double sign = x < 0 ? -1.0 : 1.0;
-    x = x < 0 ? -x : x;
-    double a = 0.14;
-    double x2 = x * x;
-    return sign * sqrt(1.0 - exp(-x2 * (4.0 / M_PI + a * x2) / (1.0 + a * x2)));
-}
+#ifdef M_E
+#define DIST_E M_E
+#else
+#define DIST_E 2.71828182845904523536
+#endif
-// Winitzki, S. (2008).
-// A handy approximation for the error function and its inverse.
-// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
-// from https://sites.google.com/site/winitzki
-double inverse_erf(double x) {
-    double sign = x < 0 ? -1.0 : 1.0;
-    x = x < 0 ? -x : x;
-    double a = 0.147;
-    double ln = log(1.0 - x * x);
-    double f1 = 2.0 / (M_PI * a);
-    double f2 = ln / 2.0;
-    double f3 = f1 + f2;
-    double f4 = 1.0 / a * ln;
-    return sign * sqrt(-f1 - f2 + sqrt(f3 * f3 - f4));
-}
+#ifdef M_PI
+#define DIST_PI M_PI
+#else
+#define DIST_PI 3.14159265358979323846
+#endif
+#ifdef M_SQRT2
+#define DIST_SQRT2 M_SQRT2
+#else
+#define DIST_SQRT2 1.41421356237309504880
+#endif
 double normal_pdf(double x, double mean, double std_dev) {
-    double var = std_dev * std_dev;
-    return (1.0 / (var * sqrt(2.0 * M_PI))) * pow(M_E, -0.5 * pow((x - mean) / var, 2));
+    if (std_dev <= 0) {
+        return NAN;
+    }
+    double n = (x - mean) / std_dev;
+    return (1.0 / (std_dev * sqrt(2.0 * DIST_PI))) * pow(DIST_E, -0.5 * n * n);
 }
 double normal_cdf(double x, double mean, double std_dev) {
-    return 0.5 * (1.0 + erf((x - mean) / (std_dev * std_dev * sqrt(2))));
+    if (std_dev <= 0) {
+        return NAN;
+    }
+    return 0.5 * (1.0 + erf((x - mean) / (std_dev * DIST_SQRT2)));
 }
+// Wichura, M. J. (1988).
+// Algorithm AS 241: The Percentage Points of the Normal Distribution.
+// Journal of the Royal Statistical Society. Series C (Applied Statistics), 37(3), 477-484.
 double normal_ppf(double p, double mean, double std_dev) {
-    assert(p >= 0 && p <= 1);
+    if (p < 0 || p > 1 || std_dev <= 0 || isnan(mean) || isnan(std_dev)) {
+        return NAN;
+    }
+    if (p == 0) {
+        return -INFINITY;
+    }
-    return mean + (std_dev * std_dev) * sqrt(2) * inverse_erf(2.0 * p - 1.0);
+    if (p == 1) {
+        return INFINITY;
+    }
+    double q = p - 0.5;
+    if (fabs(q) < 0.425) {
+        double r = 0.180625 - q * q;
+        return mean + std_dev * q *
+            (((((((2.5090809287301226727e3 * r + 3.3430575583588128105e4) * r + 6.7265770927008700853e4) * r + 4.5921953931549871457e4) * r + 1.3731693765509461125e4) * r + 1.9715909503065514427e3) * r + 1.3314166789178437745e2) * r + 3.3871328727963666080e0) /
+            (((((((5.2264952788528545610e3 * r + 2.8729085735721942674e4) * r + 3.9307895800092710610e4) * r + 2.1213794301586595867e4) * r + 5.3941960214247511077e3) * r + 6.8718700749205790830e2) * r + 4.2313330701600911252e1) * r + 1);
+    } else {
+        double r = q < 0 ? p : 1 - p;
+        r = sqrt(-log(r));
+        double sign = q < 0 ? -1 : 1;
+        if (r < 5) {
+            r -= 1.6;
+            return mean + std_dev * sign *
+                (((((((7.74545014278341407640e-4 * r + 2.27238449892691845833e-2) * r + 2.41780725177450611770e-1) * r + 1.27045825245236838258e0) * r + 3.64784832476320460504e0) * r + 5.76949722146069140550e0) * r + 4.63033784615654529590e0) * r + 1.42343711074968357734e0) /
+                (((((((1.05075007164441684324e-9 * r + 5.47593808499534494600e-4) * r + 1.51986665636164571966e-2) * r + 1.48103976427480074590e-1) * r + 6.89767334985100004550e-1) * r + 1.67638483018380384940e0) * r + 2.05319162663775882187e0) * r + 1);
+        } else {
+            r -= 5;
+            return mean + std_dev * sign *
+                (((((((2.01033439929228813265e-7 * r + 2.71155556874348757815e-5) * r + 1.24266094738807843860e-3) * r + 2.65321895265761230930e-2) * r + 2.96560571828504891230e-1) * r + 1.78482653991729133580e0) * r + 5.46378491116411436990e0) * r + 6.65790464350110377720e0) /
+                (((((((2.04426310338993978564e-15 * r + 1.42151175831644588870e-7) * r + 1.84631831751005468180e-5) * r + 7.86869131145613259100e-4) * r + 1.48753612908506148525e-2) * r + 1.36929880922735805310e-1) * r + 5.99832206555887937690e-1) * r + 1);
+        }
+    }
 }
-double students_t_pdf(double x, unsigned int n) {
-    assert(n >= 1);
+double students_t_pdf(double x, double n) {
+    if (n <= 0) {
+        return NAN;
+    }
+    if (n == INFINITY) {
+        return normal_pdf(x, 0, 1);
+    }
-    return tgamma((n + 1.0) / 2.0) / (sqrt(n * M_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
+    return tgamma((n + 1.0) / 2.0) / (sqrt(n * DIST_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
 }
 // Hill, G. W. (1970).
 // Algorithm 395: Student's t-distribution.
 // Communications of the ACM, 13(10), 617-619.
-double students_t_cdf(double x, unsigned int n) {
-    assert(n >= 1);
+double students_t_cdf(double x, double n) {
+    if (n < 1) {
+        return NAN;
+    }
+    if (isnan(x)) {
+        return NAN;
+    }
+    if (!isfinite(x)) {
+        return x < 0 ? 0 : 1;
+    }
+    if (n == INFINITY) {
+        return normal_cdf(x, 0, 1);
+    }
     double start = x < 0 ? 0 : 1;
     double sign = x < 0 ? 1 : -1;
@@ -76,7 +123,7 @@ double students_t_cdf(double x, unsigned int n) {
     double y = t / n;
     double b = 1.0 + y;
-    if ((n >= 20 && t < n) || n > 200) {
+    if (n > floor(n) || (n >= 20 && t < n) || n > 200) {
         // asymptotic series for large or noninteger n
         if (y > 10e-6) {
             y = log(b);
@@ -88,6 +135,10 @@ double students_t_cdf(double x, unsigned int n) {
         return start + sign * normal_cdf(-y, 0.0, 1.0);
     }
+    // make n int
+    // n is an integer between 1 and 200 if execution reaches this point
+    n = (int) n;
     if (n < 20 && t < 4.0) {
         // nested summation of cosine series
         y = sqrt(y);
@@ -104,7 +155,7 @@ double students_t_cdf(double x, unsigned int n) {
                 n -= 2;
             }
         }
-        a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / M_PI);
+        a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
         return start + sign * (z - a) / 2;
     }
@@ -127,16 +178,21 @@ double students_t_cdf(double x, unsigned int n) {
         a = (n - 1) / (b * n) * a + y;
         n -= 2;
     }
-    a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / M_PI);
+    a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
     return start + sign * (z - a) / 2;
 }
 // Hill, G. W. (1970).
 // Algorithm 396: Student's t-quantiles.
 // Communications of the ACM, 13(10), 619-620.
-double students_t_ppf(double p, unsigned int n) {
-    assert(p >= 0 && p <= 1);
-    assert(n >= 1);
+double students_t_ppf(double p, double n) {
+    if (p < 0 || p > 1 || n < 1) {
+        return NAN;
+    }
+    if (n == INFINITY) {
+        return normal_ppf(p, 0, 1);
+    }
     // distribution is symmetric
     double sign = p < 0.5 ? -1 : 1;
@@ -149,7 +205,7 @@ double students_t_ppf(double p, unsigned int n) {
         return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
     }
-    double half_pi = M_PI / 2.0;
+    double half_pi = DIST_PI / 2.0;
     if (n == 1) {
         p = p * half_pi;

data/ext/anomaly_detection/ext.cpp CHANGED Viewed

@@ -12,7 +12,7 @@ void Init_ext() {
   rb_mAnomalyDetection
     .define_singleton_function(
       "_detect",
-      [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
+      [](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
         Direction dir;
         if (direction == "pos") {
           dir = Direction::Positive;
@@ -24,10 +24,16 @@ void Init_ext() {
           throw std::invalid_argument("direction must be pos, neg, or both");
         }
-        auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
+        auto res = anomaly_detection::params()
+          .max_anoms(k)
+          .alpha(alpha)
+          .direction(dir)
+          .verbose(verbose)
+          .callback(rb_thread_check_ints)
+          .fit(series, period);
         auto a = Rice::Array();
-        for (auto v : res) {
+        for (auto v : res.anomalies) {
           a.push(v);
         }
         return a;