RubyGems - anomaly_detection - Versions diffs - 0.1.1 → 0.1.4 - Mend

anomaly_detection 0.1.1 → 0.1.4

Files changed (16) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +12 -0
data/README.md +16 -2
data/ext/anomaly_detection/anomaly_detection.hpp +193 -2
data/ext/anomaly_detection/dist.h +190 -0
data/ext/anomaly_detection/ext.cpp +9 -3
data/lib/anomaly_detection/version.rb +1 -1
data/lib/anomaly_detection.rb +65 -11
data/licenses/LICENSE-AnomalyDetection-cpp.txt +675 -0
data/licenses/LICENSE-MIT-dist-h.txt +21 -0
data/licenses/UNLICENSE-dist-h.txt +24 -0
metadata +7 -7
data/ext/anomaly_detection/anomaly_detection.cpp +0 -153
data/ext/anomaly_detection/cdflib.cpp +0 -12126
data/ext/anomaly_detection/cdflib.hpp +0 -123
data/licenses/LICENSE-cdflib.txt +0 -165

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e8882d64f5b2fe33406fa583404a3efead153d47a434ac6f5a1ec21d10666389
-  data.tar.gz: f0544946e1cb9011e32c7f3f0a8ae4fb90f9d1a2c7acca67c5f9e8f27f8780af
+  metadata.gz: c60bb6d75cb8523ecd0926f391d79413a1cb2eb131cd579fd381bb6683f82da3
+  data.tar.gz: '01594d0f0a97ad8cbb7b0b50cb30894bd0d773d4db45b3158345567ce1732efb'
 SHA512:
-  metadata.gz: d30c25bf7a1a7069b7ba7c430e928677f5636261cdb1bc4ff24745103bbb2769ad065a2f368b32440eb1b5fce6483e206667b462542912f44f60a9b61494f360
-  data.tar.gz: 4a3aba5aeed9e5488c77448d38f331cb3241db6cbf518fc5c4f8d00ca480ce13e53819e65b9a66970e8ca175349db00a4ae6b5b6bc8833f82a44219f840419b9
+  metadata.gz: fe09cc140a5d6543f3b00983a754861f6a3a3a436f8a8afecc80d202f1112bb6ea180df794072ee4711c044508a559a015c885cfd61d2c5be9378fc7b6590d96
+  data.tar.gz: 5616e6075888b4521355e6c0fb33f7a94361c971c7f58f9ae6a61e5d8529a3e1938deba10be22c24ec5849f1909f48cba07b97fd1e6ea8b47ae4f66626eb703e

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,15 @@
+## 0.1.4 (2022-03-19)
+- Fixed initial median calculation
+## 0.1.3 (2022-01-03)
+- Switched to dist.h
+## 0.1.2 (2021-10-20)
+- Added `plot` method
 ## 0.1.1 (2021-10-17)
 - Added `verbose` option

data/README.md CHANGED Viewed

@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
 Add this line to your application’s Gemfile:
 ```ruby
-gem 'anomaly_detection'
+gem "anomaly_detection"
 ```
 ## Getting Started
@@ -58,9 +58,23 @@ AnomalyDetection.detect(
 )
 ```
+## Plotting
+Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
+```ruby
+gem "vega"
+```
+And use:
+```ruby
+AnomalyDetection.plot(series, anomalies)
+```
 ## Credits
-This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [cdflib](https://people.sc.fsu.edu/~jburkardt/cpp_src/cdflib/cdflib.html) for the quantile function.
+This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [stl-cpp](https://github.com/ankane/stl-cpp) for seasonal-trend decomposition and [dist.h](https://github.com/ankane/dist.h) for the quantile function.
 ## References

data/ext/anomaly_detection/anomaly_detection.hpp CHANGED Viewed

@@ -1,12 +1,203 @@
+/*!
+ * AnomalyDetection.cpp v0.1.0
+ * https://github.com/ankane/AnomalyDetection.cpp
+ * GPL-3.0-or-later License
+ */
 #pragma once
-#include <string>
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <stdexcept>
 #include <vector>
+#include "dist.h"
+#include "stl.hpp"
 namespace anomaly_detection {
 // Side(s) on which anomalies are reported: Positive tests only values above the median,
 // Negative only values below it, and Both tests the absolute deviation.
 enum Direction { Positive, Negative, Both };
-std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
+// Median of a sample that is already sorted in ascending order.
+//
+// Averages the two middle elements; for an odd size both indexes refer to the same element.
+// Throws std::invalid_argument when `sorted` is empty.
+// `inline` because this definition lives in a header that may be included by several translation units.
+inline float median_sorted(const std::vector<float>& sorted) {
+    if (sorted.empty()) {
+        // size() - 1 would wrap around and index far outside the vector
+        throw std::invalid_argument("cannot compute median of empty data");
+    }
+    const auto count = sorted.size();
+    return (sorted[(count - 1) / 2] + sorted[count / 2]) / 2.0;
+}
+// Median of an unsorted sample.
+//
+// Sorts a private copy, so `data` itself is left untouched.
+// `inline` because this definition lives in a header that may be included by several translation units.
+inline float median(const std::vector<float>& data) {
+    std::vector<float> sorted(data);
+    std::sort(sorted.begin(), sorted.end());
+    return median_sorted(sorted);
+}
+// Median absolute deviation of `data` around `med`, multiplied by 1.4826.
+//
+// NOTE(review): 1.4826 looks like the usual consistency constant (the default of R's mad()) - confirm.
+// `inline` because this definition lives in a header that may be included by several translation units.
+inline float mad(const std::vector<float>& data, float med) {
+    std::vector<float> deviations;
+    deviations.reserve(data.size());
+    for (const auto value : data) {
+        deviations.push_back(std::fabs(value - med));
+    }
+    // median_sorted needs ascending input
+    std::sort(deviations.begin(), deviations.end());
+    return 1.4826 * median_sorted(deviations);
+}
+// Flags anomalous observations in a seasonal series.
+//
+// The seasonal component of a robust STL fit and the median are subtracted from the series.
+// The most extreme residual is then removed repeatedly, and each removal is compared with a
+// critical value built from the Student's t quantile; the last removal that exceeds its
+// critical value determines how many candidates are reported.
+//
+// data                - observations; needs at least two periods and no NaN values
+// num_obs_per_period  - number of observations in one seasonal period
+// k                   - maximum fraction of the observations that may be reported
+// alpha               - significance level used for the critical value
+// one_tail/upper_tail - test a single direction (the upper one when upper_tail is set) instead of both
+// verbose             - print a progress line to std::cout on every iteration
+// callback            - invoked at the end of every iteration when set
+//
+// Returns the indexes of the anomalies in ascending order.
+// Throws std::invalid_argument when the series is too short or contains NaN.
+//
+// NOTE(review): k is expected to be a small fraction; values close to 1 drive the degrees of
+// freedom passed to students_t_ppf towards zero.
+inline std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
+    const auto n = data.size();
+    // Check to make sure we have at least two periods worth of data for anomaly context
+    if (n < num_obs_per_period * 2) {
+        throw std::invalid_argument("series must contain at least 2 periods");
+    }
+    // Handle NANs
+    if (std::any_of(data.begin(), data.end(), [](float value) { return std::isnan(value); })) {
+        throw std::invalid_argument("series contains NANs");
+    }
+    // Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
+    auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
+    const auto& seasonal = data_decomp.seasonal;
+    const auto med = median(data);
+    std::vector<float> data2;
+    data2.reserve(n);
+    for (size_t i = 0; i < n; i++) {
+        data2.push_back(data[i] - seasonal[i] - med);
+    }
+    // Truncate n * k to a whole count. The cast has to wrap the product; casting only n
+    // leaves a float bound. A negative or NaN k yields no candidates.
+    const size_t max_outliers = k > 0 ? static_cast<size_t>(n * k) : 0;
+    size_t num_anoms = 0;
+    std::vector<size_t> anomalies;
+    anomalies.reserve(max_outliers);
+    // Sort data for fast median
+    // Use stable sort for indexes for deterministic results
+    std::vector<size_t> indexes(n);
+    std::iota(indexes.begin(), indexes.end(), 0);
+    std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
+    std::sort(data2.begin(), data2.end());
+    // Compute test statistic until r=max_outliers values have been removed from the sample
+    for (size_t i = 1; i <= max_outliers; i++) {
+        if (verbose) {
+            std::cout << i << " / " << max_outliers << " completed" << std::endl;
+        }
+        // TODO Improve performance between loop iterations
+        const auto ma = median_sorted(data2);
+        // Protect against constant time series (checked before building the residuals, which are not needed then)
+        const auto data_sigma = mad(data2, ma);
+        if (data_sigma == 0.0) {
+            break;
+        }
+        // Deviation of every remaining value from the median, oriented by the requested tail
+        std::vector<float> ares;
+        ares.reserve(data2.size());
+        if (one_tail) {
+            if (upper_tail) {
+                for (auto v : data2) {
+                    ares.push_back(v - ma);
+                }
+            } else {
+                for (auto v : data2) {
+                    ares.push_back(ma - v);
+                }
+            }
+        } else {
+            for (auto v : data2) {
+                ares.push_back(std::fabs(v - ma));
+            }
+        }
+        const auto iter = std::max_element(ares.begin(), ares.end());
+        const auto r_idx_i = std::distance(ares.begin(), iter);
+        // Only need to take sigma of r for performance
+        const auto r = ares[r_idx_i] / data_sigma;
+        // Record the original position, then drop the value from both parallel vectors
+        anomalies.push_back(indexes[r_idx_i]);
+        data2.erase(data2.begin() + r_idx_i);
+        indexes.erase(indexes.begin() + r_idx_i);
+        // Compute critical value
+        float p;
+        if (one_tail) {
+            p = 1.0 - alpha / (n - i + 1);
+        } else {
+            p = 1.0 - alpha / (2.0 * (n - i + 1));
+        }
+        const auto t = students_t_ppf(p, n - i - 1);
+        const auto lam = t * (n - i) / std::sqrt(((n - i - 1) + t * t) * (n - i + 1));
+        if (r > lam) {
+            // Every candidate up to and including this one counts as an anomaly
+            num_anoms = i;
+        }
+        if (callback != nullptr) {
+            callback();
+        }
+    }
+    // Keep only the candidates up to the last significant one
+    anomalies.resize(num_anoms);
+    // Sort like R version
+    std::sort(anomalies.begin(), anomalies.end());
+    return anomalies;
+}
+// Value returned by AnomalyDetectionParams::fit.
+struct AnomalyDetectionResult {
+    // Indexes into the fitted series that were flagged as anomalies.
+    std::vector<size_t> anomalies;
+};
+// Fluent parameter set for anomaly detection.
+//
+// Every setter stores its argument and returns a copy of the updated object, so calls can be
+// chained and finished with fit().
+class AnomalyDetectionParams {
+    // Significance level handed to detect_anoms
+    float alpha_ = 0.05;
+    // Maximum fraction of observations that may be reported
+    float max_anoms_ = 0.1;
+    // Side(s) on which anomalies are reported
+    Direction direction_ = Direction::Both;
+    // Whether progress is printed while fitting
+    bool verbose_ = false;
+    // Optional hook forwarded to detect_anoms; empty by default
+    std::function<void()> callback_ = nullptr;
+
+public:
+    AnomalyDetectionParams alpha(float alpha) {
+        alpha_ = alpha;
+        return *this;
+    }
+
+    AnomalyDetectionParams max_anoms(float max_anoms) {
+        max_anoms_ = max_anoms;
+        return *this;
+    }
+
+    AnomalyDetectionParams direction(Direction direction) {
+        direction_ = direction;
+        return *this;
+    }
+
+    AnomalyDetectionParams verbose(bool verbose) {
+        verbose_ = verbose;
+        return *this;
+    }
+
+    AnomalyDetectionParams callback(std::function<void()> callback) {
+        callback_ = callback;
+        return *this;
+    }
+
+    // Runs the detection on `series` with `period` observations per seasonal period.
+    AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
+};
+// Starting point of the fluent API: returns a parameter set holding the default values.
+// `inline` because this definition lives in a header that may be included by several translation units.
+inline AnomalyDetectionParams params() {
+    return AnomalyDetectionParams();
+}
+// Runs detect_anoms with the stored parameters and wraps the resulting indexes.
+//
+// series - observations to analyse
+// period - number of observations per seasonal period
+//
+// `inline` because this out-of-class definition lives in a header that may be included by
+// several translation units.
+inline AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
+    // Anything other than Both is a one-tailed test; Positive selects the upper tail
+    const bool one_tail = direction_ != Direction::Both;
+    const bool upper_tail = direction_ == Direction::Positive;
+    AnomalyDetectionResult res;
+    // detect_anoms takes the period as int; the conversion is spelled out instead of implicit
+    res.anomalies = detect_anoms(series, static_cast<int>(period), max_anoms_, alpha_, one_tail, upper_tail, verbose_, callback_);
+    return res;
+}
 }

data/ext/anomaly_detection/dist.h ADDED Viewed

@@ -0,0 +1,190 @@
+/*!
+ * dist.h v0.1.1
+ * https://github.com/ankane/dist.h
+ * Unlicense OR MIT License
+ */
+#pragma once
+#include <assert.h>
+#include <math.h>
+// DIST_E and DIST_PI reuse M_E / M_PI when those macros are defined after including <math.h>
+// and otherwise fall back to literal values.
+#ifdef M_E
+#define DIST_E M_E
+#else
+#define DIST_E 2.71828182845904523536
+#endif
+#ifdef M_PI
+#define DIST_PI M_PI
+#else
+#define DIST_PI 3.14159265358979323846
+#endif
+// Approximate error function; odd in x, so the sign is applied at the end.
+//
+// Winitzki, S. (2008).
+// A handy approximation for the error function and its inverse.
+// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
+// from https://sites.google.com/site/winitzki
+double erf(double x) {
+    const double a = 0.14;
+    const double sign = x < 0 ? -1.0 : 1.0;
+    // x * x equals |x| * |x|, so the magnitude does not have to be taken first
+    const double x2 = x * x;
+    const double ax2 = a * x2;
+    return sign * sqrt(1.0 - exp(-x2 * (4.0 / DIST_PI + ax2) / (1.0 + ax2)));
+}
+// Approximate inverse of the error function; odd in x, so the sign is applied at the end.
+//
+// Winitzki, S. (2008).
+// A handy approximation for the error function and its inverse.
+// https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
+// from https://sites.google.com/site/winitzki
+double inverse_erf(double x) {
+    const double a = 0.147;
+    const double sign = x < 0 ? -1.0 : 1.0;
+    // Only x * x is used below, which is the same for x and -x
+    const double log_term = log(1.0 - x * x);
+    const double base = 2.0 / (DIST_PI * a);
+    const double half_log = log_term / 2.0;
+    const double sum = base + half_log;
+    const double scaled_log = 1.0 / a * log_term;
+    return sign * sqrt(-base - half_log + sqrt(sum * sum - scaled_log));
+}
+// Probability density of the normal distribution with the given mean and standard deviation.
+//
+// Scales by std_dev itself. Using the variance in its place only gives the right density
+// when std_dev is 1.
+double normal_pdf(double x, double mean, double std_dev) {
+    double z = (x - mean) / std_dev;
+    return exp(-0.5 * z * z) / (std_dev * sqrt(2.0 * DIST_PI));
+}
+// Cumulative distribution function of the normal distribution with the given mean and
+// standard deviation.
+//
+// Standardizes with std_dev itself. Dividing by the variance only gives the right result
+// when std_dev is 1.
+double normal_cdf(double x, double mean, double std_dev) {
+    return 0.5 * (1.0 + erf((x - mean) / (std_dev * sqrt(2))));
+}
+// Quantile function (inverse CDF) of the normal distribution with the given mean and
+// standard deviation; p must lie in [0, 1].
+//
+// Scales by std_dev itself. Scaling by the variance only gives the right quantile when
+// std_dev is 1.
+double normal_ppf(double p, double mean, double std_dev) {
+    assert(p >= 0 && p <= 1);
+    return mean + std_dev * sqrt(2) * inverse_erf(2.0 * p - 1.0);
+}
+// Probability density of Student's t distribution with n degrees of freedom (n >= 1) at x.
+double students_t_pdf(double x, unsigned int n) {
+    assert(n >= 1);
+    const double half_n_plus_one = (n + 1.0) / 2.0;
+    // Normalizing factor built from the gamma function
+    const double norm = tgamma(half_n_plus_one) / (sqrt(n * DIST_PI) * tgamma(n / 2.0));
+    return norm * pow(1.0 + x * x / n, -half_n_plus_one);
+}
+// Cumulative distribution function of Student's t distribution with n degrees of freedom
+// (n >= 1), evaluated at x. One of three series is chosen from n and t = x * x.
+//
+// Hill, G. W. (1970).
+// Algorithm 395: Student's t-distribution.
+// Communications of the ACM, 13(10), 617-619.
+double students_t_cdf(double x, unsigned int n) {
+    assert(n >= 1);
+    // Every branch computes a term that is returned as-is for x < 0 and as 1 - term otherwise
+    double start = x < 0 ? 0 : 1;
+    double sign = x < 0 ? 1 : -1;
+    double z = 1.0;
+    double t = x * x;
+    double y = t / n;
+    double b = 1.0 + y;
+    if ((n >= 20 && t < n) || n > 200) {
+        // asymptotic series for large or noninteger n
+        // NOTE(review): 10e-6 is 1e-5 - confirm this threshold against the published algorithm
+        if (y > 10e-6) {
+            y = log(b);
+        }
+        double a = n - 0.5;
+        b = 48.0 * a * a;
+        y = a * y;
+        y = (((((-0.4 * y - 3.3) * y - 24.0) * y - 85.5) / (0.8 * y * y + 100.0 + b) + y + 3.0) / b + 1.0) * sqrt(y);
+        return start + sign * normal_cdf(-y, 0.0, 1.0);
+    }
+    if (n < 20 && t < 4.0) {
+        // nested summation of cosine series
+        y = sqrt(y);
+        double a = y;
+        if (n == 1) {
+            a = 0.0;
+        }
+        // loop
+        // n is a by-value parameter, so counting it down does not affect the caller
+        if (n > 1) {
+            n -= 2;
+            while (n > 1) {
+                a = (n - 1) / (b * n) * a + y;
+                n -= 2;
+            }
+        }
+        // n ends at 0 when it started even and at 1 when it started odd
+        a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
+        return start + sign * (z - a) / 2;
+    }
+    // tail series expansion for large t-values
+    double a = sqrt(b);
+    y = a * n;
+    int j = 0;
+    // Add terms until the sum stops changing in double precision
+    while (a != z) {
+        j += 2;
+        z = a;
+        y = y * (j - 1) / (b * j);
+        a = a + y / (n + j);
+    }
+    z = 0.0;
+    y = 0.0;
+    a = -a;
+    // loop (without n + 2 and n - 2)
+    while (n > 1) {
+        a = (n - 1) / (b * n) * a + y;
+        n -= 2;
+    }
+    a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
+    return start + sign * (z - a) / 2;
+}
+// Quantile function (inverse CDF) of Student's t distribution with n degrees of freedom
+// (n >= 1) for a probability p in [0, 1].
+//
+// Hill, G. W. (1970).
+// Algorithm 396: Student's t-quantiles.
+// Communications of the ACM, 13(10), 619-620.
+double students_t_ppf(double p, unsigned int n) {
+    assert(p >= 0 && p <= 1);
+    assert(n >= 1);
+    // distribution is symmetric
+    // Work with the upper half and restore the sign on return
+    double sign = p < 0.5 ? -1 : 1;
+    p = p < 0.5 ? 1 - p : p;
+    // two-tail to one-tail
+    // From here on p holds twice the upper-tail area, 2 * (1 - p)
+    p = 2.0 * (1.0 - p);
+    // n == 2 and n == 1 are answered with closed-form expressions
+    if (n == 2) {
+        return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
+    }
+    double half_pi = DIST_PI / 2.0;
+    if (n == 1) {
+        p = p * half_pi;
+        return sign * cos(p) / sin(p);
+    }
+    double a = 1.0 / (n - 0.5);
+    double b = 48.0 / (a * a);
+    double c = ((20700.0 * a / b - 98.0) * a - 16.0) * a + 96.36;
+    double d = ((94.5 / (b + c) - 3.0) / b + 1.0) * sqrt(a * half_pi) * n;
+    double x = d * p;
+    double y = pow(x, 2.0 / n);
+    // The size of y selects between the two expansions below
+    if (y > 0.05 + a) {
+        // asymptotic inverse expansion about normal
+        x = normal_ppf(p * 0.5, 0.0, 1.0);
+        y = x * x;
+        if (n < 5) {
+            c += 0.3 * (n - 4.5) * (x + 0.6);
+        }
+        c = (((0.05 * d * x - 5.0) * x - 7.0) * x - 2.0) * x + b + c;
+        y = (((((0.4 * y + 6.3) * y + 36.0) * y + 94.5) / c - y - 3.0) / b + 1.0) * x;
+        y = a * y * y;
+        // For small y a two-term series replaces exp(y) - 1
+        y = y > 0.002 ? exp(y) - 1.0 : 0.5 * y * y + y;
+    } else {
+        y = ((1.0 / (((n + 6.0) / (n * y) - 0.089 * d - 0.822) * (n + 2.0) * 3.0) + 0.5 / (n + 4.0)) * y - 1.0) * (n + 1.0) / (n + 2.0) + 1.0 / y;
+    }
+    return sign * sqrt(n * y);
+}

data/ext/anomaly_detection/ext.cpp CHANGED Viewed

@@ -12,7 +12,7 @@ void Init_ext() {
   rb_mAnomalyDetection
     .define_singleton_function(
       "_detect",
-      [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
+      [](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
         Direction dir;
         if (direction == "pos") {
           dir = Direction::Positive;
@@ -24,10 +24,16 @@ void Init_ext() {
           throw std::invalid_argument("direction must be pos, neg, or both");
         }
-        auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
+        auto res = anomaly_detection::params()
+          .max_anoms(k)
+          .alpha(alpha)
+          .direction(dir)
+          .verbose(verbose)
+          .callback(rb_thread_check_ints)
+          .fit(series, period);
         auto a = Rice::Array();
-        for (auto v : res) {
+        for (auto v : res.anomalies) {
           a.push(v);
         }
         return a;

data/lib/anomaly_detection/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module AnomalyDetection
-  VERSION = "0.1.1"
+  VERSION = "0.1.4"
 end

data/lib/anomaly_detection.rb CHANGED Viewed

@@ -5,18 +5,72 @@ require "anomaly_detection/ext"
 require "anomaly_detection/version"
 module AnomalyDetection
-  def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", verbose: false)
-    raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
-    if series.is_a?(Hash)
-      sorted = series.sort_by { |k, _| k }
-      x = sorted.map(&:last)
-    else
-      x = series
+  class << self
+    # Detects anomalies in a series.
+    #
+    # series may be an array of values or a Hash; a Hash is ordered by key before its
+    # values are passed on, and the result then contains the matching keys instead of
+    # positions. Raises ArgumentError when the series holds fewer than two periods.
+    #
+    # NOTE(review): the plot: keyword is accepted but not read in this method.
+    def detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", plot: false, verbose: false)
+      if series.size < period * 2
+        raise ArgumentError, "series must contain at least 2 periods"
+      end
+      # pairs stays nil for array input
+      pairs = series.is_a?(Hash) ? series.sort_by { |key, _| key } : nil
+      values = pairs ? pairs.map { |_, value| value } : series
+      found = _detect(values, period, max_anoms, alpha, direction, verbose)
+      return found unless pairs
+      # Translate positions in the ordered values back to the hash keys
+      found.map! { |i| pairs[i][0] }
     end
-    res = _detect(x, period, max_anoms, alpha, direction, verbose)
-    res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
-    res
+    # Builds a Vega-Lite chart: the series as a line, with the anomalies as points on top.
+    #
+    # series    - array of values, or Hash whose keys respond to strftime
+    # anomalies - positions (array input) or keys (Hash input) to highlight
+    #
+    # Requires the vega gem, which is loaded on first use.
+    # TODO add tooltips
+    def plot(series, anomalies)
+      require "vega"
+      if series.is_a?(Hash)
+        data = series.map { |k, v| {x: iso8601(k), y: v, anomaly: anomalies.include?(k)} }
+        x = {field: "x", type: "temporal"}
+        x["scale"] = {type: "utc"} if series.keys.first.is_a?(Date)
+      else
+        data = series.each_with_index.map { |v, i| {x: i, y: v, anomaly: anomalies.include?(i)} }
+        x = {field: "x", type: "quantitative"}
+      end
+      # Both layers share the same x encoding
+      line_layer = {
+        mark: {type: "line"},
+        encoding: {
+          x: x,
+          y: {field: "y", type: "quantitative", scale: {zero: false}},
+          color: {value: "#fa9088"}
+        }
+      }
+      # Only rows marked as anomalies are drawn as points
+      point_layer = {
+        transform: [{"filter": "datum.anomaly == true"}],
+        mark: {type: "point", size: 200},
+        encoding: {
+          x: x,
+          y: {field: "y", type: "quantitative"},
+          color: {value: "#19c7ca"}
+        }
+      }
+      Vega.lite
+        .data(data)
+        .layer([line_layer, point_layer])
+        .config(axis: {title: nil, labelFontSize: 12})
+    end
+    private
+    # Formats a series key for the chart: Date values (and subclasses) as a plain date,
+    # anything else as a timestamp with milliseconds and UTC offset.
+    def iso8601(v)
+      pattern = v.is_a?(Date) ? "%Y-%m-%d" : "%Y-%m-%dT%H:%M:%S.%L%z"
+      v.strftime(pattern)
+    end
   end
 end