RubyGems - anomaly - Versions diffs - 0.0.2 → 0.0.3 - Mend

anomaly 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/README.md +26 -14
data/Rakefile +13 -0
data/anomaly.gemspec +1 -0
data/lib/anomaly/detector.rb +18 -13
data/lib/anomaly/version.rb +1 -1
data/spec/anomaly/detector_spec.rb +22 -9
metadata +16 -5

data/README.md CHANGED

@@ -16,7 +16,7 @@ And then execute:
 bundle install
 ```
-For max performance (about 3x faster), also install the NArray gem:
+For max performance (~ 2x faster), also install the NArray gem:
 ```ruby
 gem "narray"
@@ -26,23 +26,32 @@ Anomaly will automatically detect it and use it.
 ## How to Use
-Train the detector with **only non-anomalies**. Each row is a sample.
+Say we have weather data for sunny days and we're trying to detect days that aren't sunny. The data looks like:
 ```ruby
-train_data = [
-  [0.1, 100, 1.4],
-  [0.2, 101, 2.1],
-  [0.5, 102, 1.6]
+# Each row is a different day.
+# [temperature (°F), humidity (%), pressure (in)]
+weather_data = [
+  [85, 68, 10.4],
+  [88, 62, 12.1],
+  [86, 64, 13.6],
+  ...
 ]
-ad = Anomaly::Detector.new(train_data)
+```
+Train the detector with **only non-anomalies** (sunny days in our case).
+```ruby
+ad = Anomaly::Detector.new(weather_data)
 ```
 That's it! Let's test for anomalies.
 ```ruby
-test_sample = [1.0, 100, 1.4]
+# 79°F, 66% humidity, 12.3 in. pressure
+test_sample = [79, 66, 12.3]
 ad.probability(test_sample)
-# => 0.0007328491480297603
+# => 7.537174740907633e-08
 ```
 **Super-important:** You must select a threshold for anomalies (which we denote with ε - "epsilon")
@@ -52,15 +61,13 @@ Probabilities less than ε are considered anomalies. If ε is higher, more thing
 ``` ruby
 ad.anomaly?(test_sample, 1e-10)
 # => false
-ad.anomaly?(test_sample, 0.5)
+ad.anomaly?(test_sample, 1e-5)
 # => true
 ```
-Here's sample to code to help you find the best ε for your application.
+The wiki has [sample code](https://github.com/ankane/anomaly/wiki/Home) to help you find the best ε for your application.
-```ruby
-# TODO
-```
+### Persistence
 You can easily persist the detector to a file or database - it's very tiny.
@@ -76,6 +83,11 @@ File.open("anomaly_detector.dump", "w") {|f| f.write(serialized_ad) }
 ad2 = Marshal.load(File.open("anomaly_detector.dump", "r").read)
 ```
+## TODO
+- Train in chunks (for very large datasets)
+- Multivariate normal distribution (possibly)
 ## Contributing
 1. Fork it

data/Rakefile CHANGED

@@ -2,3 +2,16 @@
 require "bundler/gem_tasks"
 require "rspec/core/rake_task"
 RSpec::Core::RakeTask.new("spec")
+require "benchmark"
+require "anomaly"
+task :benchmark do
+  data = 1_000_000.times.map{ [rand, rand, rand, rand] }
+  Benchmark.bm do |x|
+    x.report { Anomaly::Detector.new(data) }
+    require "narray"
+    x.report { Anomaly::Detector.new(data) }
+  end
+end

data/anomaly.gemspec CHANGED

@@ -17,4 +17,5 @@ Gem::Specification.new do |gem|
   gem.add_development_dependency "rake"
   gem.add_development_dependency "rspec", ">= 2.0.0"
+  gem.add_development_dependency "narray"
 end

data/lib/anomaly/detector.rb CHANGED

@@ -2,39 +2,46 @@ module Anomaly
   class Detector
     def initialize(data = nil)
-      @trained = false
+      @m = 0
       train(data) if data
     end
     def train(data)
       if defined?(NMatrix)
         d = NMatrix.to_na(data)
+        @n, @m = d.sizes
         # Convert these to an array for Marshal.dump
         @mean = d.mean(1).to_a
         @std = d.stddev(1).to_a
       else
         # Default to Array, since built-in Matrix does not give us a big performance advantage.
         d = data.to_a
-        cols = d.first.size.times.map{|i| d.map{|r| r[i]}}
+        @m = d.size
+        @n = d.first ? d.first.size : 0
+        cols = @n.times.map{|i| d.map{|r| r[i]}}
         @mean = cols.map{|c| mean(c)}
         @std = cols.each_with_index.map{|c,i| std(c, @mean[i])}
       end
       @std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std}
-      # raise "Standard deviation cannot be zero" if @std.find_index{|i| i == 0 or i.nan?}
-      @trained = true
     end
     def trained?
-      @trained
+      @m > 0
+    end
+    def samples
+      @m
     end
+    # Limit the probability of features to [0,1]
+    # to keep probabilities at same scale.
     def probability(x)
       raise "Train me first" unless trained?
-      raise ArgumentError, "x must have #{@mean.size} elements" if x.size != @mean.size
-      x.each_with_index.map{|a,i| normal_pdf(a, @mean[i], @std[i]) }.reduce(1, :*)
+      raise ArgumentError, "x must have #{@n} elements" if x.size != @n
+      @n.times.map do |i|
+        p = normal_pdf(x[i], @mean[i], @std[i])
+        (p.nan? or p > 1) ? 1 : p
+      end.reduce(1, :*)
     end
     def anomaly?(x, epsilon)
@@ -45,10 +52,8 @@ module Anomaly
     SQRT2PI = Math.sqrt(2*Math::PI)
-    # Return 1 (exclude feature) if std ~ 0
     def normal_pdf(x, mean = 0, std = 1)
-      p = 1.0/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2))))
-      p.nan? ? 1 : p
+      1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2))))
     end
     # Not used for NArray

data/lib/anomaly/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Anomaly
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end

data/spec/anomaly/detector_spec.rb CHANGED

@@ -25,15 +25,7 @@ describe Anomaly::Detector do
     end
   end
-  context "when one training example" do
-    let(:data) { [[0]] }
-    it "returns infinity" do
-      ad.probability([0]).should == 1
-    end
-  end
-  context "when data is a matrix" do
+  context "when data is an array" do
     let(:data) { [[-1,-2],[0,0],[1,2]] }
     let(:sample) { [rand, rand] }
@@ -43,4 +35,25 @@ describe Anomaly::Detector do
       prob.should == Anomaly::Detector.new(data).probability(sample)
     end
   end
+  context "when lots of samples" do
+    let(:data) { m.times.map{[0]} }
+    let(:m) { rand(100) + 1 }
+    it { ad.samples.should == m }
+    it { ad.trained?.should be_true }
+  end
+  context "when no samples" do
+    let(:data) { [] }
+    it { ad.samples.should == 0 }
+    it { ad.trained?.should be_false }
+  end
+  context "when pdf is greater than 1" do
+    let(:data) { 100.times.map{[0]}.push([1]) }
+    it { ad.probability([0]).should == 1 }
+  end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anomaly
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
   prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2011-12-12 00:00:00.000000000Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
-  requirement: &2160640240 !ruby/object:Gem::Requirement
+  requirement: &2160380920 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *2160640240
+  version_requirements: *2160380920
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &2160639580 !ruby/object:Gem::Requirement
+  requirement: &2160379640 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,7 +32,18 @@ dependencies:
         version: 2.0.0
   type: :development
   prerelease: false
-  version_requirements: *2160639580
+  version_requirements: *2160379640
+- !ruby/object:Gem::Dependency
+  name: narray
+  requirement: &2160378180 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *2160378180
 description: Easy-to-use anomaly detection
 email:
 - andrew@getformidable.com