RubyGems - fastout - Versions diffs - 0.0.1 - Mend

fastout 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

data/.gitignore +4 -0
data/.rspec +2 -0
data/Gemfile +4 -0
data/Gemfile.lock +38 -0
data/MIT-LICENSE +20 -0
data/README.md +8 -0
data/Rakefile +2 -0
data/doc/classes/Fastout.html +105 -0
data/doc/classes/Ranker.html +468 -0
data/doc/classes/Ranker.src/M000001.html +18 -0
data/doc/classes/Ranker.src/M000002.html +22 -0
data/doc/classes/Ranker.src/M000003.html +42 -0
data/doc/classes/Ranker.src/M000004.html +21 -0
data/doc/classes/Ranker.src/M000005.html +28 -0
data/doc/classes/Ranker.src/M000006.html +35 -0
data/doc/classes/Ranker.src/M000007.html +18 -0
data/doc/classes/Ranker.src/M000008.html +18 -0
data/doc/classes/Ranker.src/M000009.html +33 -0
data/doc/classes/Ranker.src/M000010.html +18 -0
data/doc/classes/Ranker.src/M000011.html +22 -0
data/doc/classes/Ranker.src/M000012.html +24 -0
data/doc/classes/Ranker.src/M000013.html +28 -0
data/doc/classes/Ranker.src/M000014.html +20 -0
data/doc/classes/Ranker.src/M000015.html +19 -0
data/doc/classes/Ranker/Point.html +262 -0
data/doc/classes/Ranker/Point.src/M000016.html +18 -0
data/doc/classes/Ranker/Point.src/M000017.html +24 -0
data/doc/classes/Ranker/Point.src/M000018.html +18 -0
data/doc/classes/Ranker/Point.src/M000019.html +18 -0
data/doc/classes/Ranker/Point.src/M000020.html +18 -0
data/doc/classes/Ranker/Point.src/M000021.html +26 -0
data/doc/classes/Ranker/Point.src/M000022.html +18 -0
data/doc/created.rid +1 -0
data/doc/files/lib/fastout/ranker_rb.html +121 -0
data/doc/files/lib/fastout/version_rb.html +101 -0
data/doc/files/lib/fastout_rb.html +108 -0
data/doc/files/spec/fastout/ranker_spec_rb.html +109 -0
data/doc/files/spec/spec_helper_rb.html +110 -0
data/doc/fr_class_index.html +29 -0
data/doc/fr_file_index.html +28 -0
data/doc/fr_method_index.html +48 -0
data/doc/index.html +24 -0
data/doc/rdoc-style.css +208 -0
data/fastout.gemspec +29 -0
data/lib/fastout.rb +1 -0
data/lib/fastout/ranker.rb +243 -0
data/lib/fastout/version.rb +3 -0
data/spec/fastout/ranker_spec.rb +252 -0
data/spec/parkinsons.csv +1 -0
data/spec/spec_helper.rb +9 -0
metadata +217 -0

data/fastout.gemspec ADDED

@@ -0,0 +1,29 @@
+# -*- encoding: utf-8 -*-
+# Gem specification for fastout: identity, development-only dependencies and
+# the file lists, which are read from the git index at build time.
+$LOAD_PATH.push File.expand_path("../lib", __FILE__)
+require "fastout/version"
+Gem::Specification.new do |spec|
+  # identity
+  spec.name        = "fastout"
+  spec.version     = Fastout::VERSION
+  spec.platform    = Gem::Platform::RUBY
+  spec.authors     = ["Jason Dew"]
+  spec.email       = ["jason.dew@gmail.com"]
+  spec.homepage    = "http://rubygems.org/gems/fastout"
+  spec.summary     = "Detect outliers in high-dimension data sets"
+  spec.description = "Detect outliers in high-dimension data sets using the FASTOUT algorithm by Foss et. al"
+  spec.rubyforge_project = "fastout"
+  # development dependencies; only rspec carries a version constraint
+  spec.add_development_dependency "rspec", "~>2.0"
+  %w[rr autotest autotest-fsevent autotest-growl redgreen fastercsv].each do |gem_name|
+    spec.add_development_dependency gem_name
+  end
+  # everything tracked by git is packaged; executables are the basenames under bin/
+  tracked_files = `git ls-files`.split("\n")
+  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
+  tracked_bins  = `git ls-files -- bin/*`.split("\n")
+  spec.files         = tracked_files
+  spec.test_files    = tracked_tests
+  spec.executables   = tracked_bins.map { |path| File.basename(path) }
+  spec.require_paths = ["lib"]
+end

data/lib/fastout.rb ADDED

	@@ -0,0 +1 @@
1	+ require "fastout/ranker"

data/lib/fastout/ranker.rb ADDED

@@ -0,0 +1,243 @@
+# Takes a data set and determines the outliers using the FASTOUT algorithm from
+# Foss et al., "Class Separation through Variance: a new application of outlier detection",
+# Knowledge and Information Systems, 2010.
+#
+# Author:: Jason Dew (mailto:jason.dew@gmail.com)
+# Copyright:: Copyright (c) 2010 Jason Dew
+# License:: MIT
+class Ranker
+  # One row of the data set plus the bookkeeping the ranker keeps for it:
+  # the bin it falls into for each attribute, the cluster it currently
+  # belongs to (nil when unclustered) and its accumulated score.
+  class Point
+    # id handed to the next Point that gets constructed; shared by all Points
+    @@next_id = 0
+    # Sets the id that the next constructed Point will receive.
+    def self.next_id= id
+      @@next_id = id
+    end
+    attr_reader :id, :attributes, :bins
+    attr_accessor :cluster, :score
+    # Builds a point from its attribute values and takes the next free id.
+    def initialize *attributes
+      @attributes = attributes
+      @cluster = nil
+      @score = 0
+      @bins = []
+      @id = @@next_id
+      @@next_id += 1
+    end
+    # Returns the attribute value stored at +index+.
+    def [] index
+      @attributes[index]
+    end
+    # True when the point currently belongs to a cluster.
+    def clustered?
+      !!cluster
+    end
+    # Removes the point from whatever cluster it was assigned to.
+    def uncluster!
+      @cluster = nil
+    end
+    # True when +point+ is close to this point on every attribute listed in
+    # +attribute_indexes+: at most one bin apart, and no further apart than
+    # half of the matching entry of +neighborhoods+ (which is indexed by
+    # position within +attribute_indexes+, not by attribute index).
+    def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
+      attribute_indexes.each do |attribute_index|
+        return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
+      end
+      attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
+        # distance is symmetric, so compare the absolute difference; a signed
+        # difference would accept every point lying on one side of this one
+        distance = (attributes[attribute_index] - point.attributes[attribute_index]).abs
+        return false if distance > (neighborhoods[neighborhood_index] / 2.0)
+      end
+      true
+    end
+    # True when this point is in the neighborhood of at least one of +points+;
+    # false for an empty list.
+    def neighbor_of_any? points, attribute_indexes, neighborhoods
+      points.any? {|point| in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
+    end
+  end
+  attr_reader :data, :points, :minimums, :maximums
+  # Wraps every row of +data+ in a Point.
+  def self.pointify data
+    data.map {|attributes| Point.new(*attributes) }
+  end
+  # takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
+  # values should all be numerical
+  # * raises a RuntimeError unless +data+ has more than one row and its first
+  #   row has more than one attribute
+  # * also generates minimum and maximum values for each attribute for later use
+  def initialize data
+    raise "data must have more than one attribute and more than one data point" unless data.size > 1 && data.first.size > 1
+    @data = data
+    # restart the id counter *before* building the points so that ids always
+    # run 0...data.size, even if Points were created elsewhere beforehand
+    Point.next_id = 0
+    @points = self.class.pointify data
+    @minimums, @maximums = compute_minimums_and_maximums
+    # leave the counter at zero afterwards as well, as earlier versions did
+    Point.next_id = 0
+  end
+  # searches the parameter space to find the optimized values of +k+ and +q+
+  # * +sample+ is the number of iterations to perform in estimating the parameters
+  # * +n+ is the score value that theta counts (see #calculate_theta); it also
+  #   bounds +q+ at n / 4
+  # * +theta_target+ is the maximum acceptable value of theta, default is 1
+  # returns the ranked points of the last evaluated (k, q) combination
+  def optimized_ranking sample, n, theta_target=1
+    k = 3
+    q = 5
+    max_q = n / 4
+    step_q = 10
+    last_theta = n
+    theta, s = calculate_theta(sample, k, n, q)
+    while theta > theta_target or theta < last_theta or q < max_q
+      return s if theta <= theta_target
+      if theta >= last_theta
+        # effectiveness declining so try next k
+        k += 1
+        q -= step_q
+        last_theta = n
+      else
+        # try next q
+        q += step_q
+        last_theta = theta
+      end
+      theta, s = calculate_theta(sample, k, n, q)
+    end
+    s
+  end
+  # find and rank the points by their outlier score and determine
+  # theta (the number of points with an outlier score of +n+);
+  # returns the pair [theta, ranked points]
+  def calculate_theta sample, k, n, q
+    s = ranked_outliers sample, k, q
+    theta = points.count {|point| point.score == n }
+    [theta, s]
+  end
+  # chooses +k+ random attributes with an average of +q+ data points
+  # in each bin +sample_size+ times and returns the points sorted by
+  # descending score
+  def ranked_outliers sample_size, k, q
+    # determine number of bins and their widths
+    bin_count =  compute_bin_count(q)
+    bin_widths = compute_bin_widths(q, bin_count)
+    # assign points to the attribute bins
+    assign_points_to_bins! bin_widths, bin_count
+    sample_size.times { score_points_from_a_random_set_of_attributes! k, bin_widths }
+    points.sort_by(&:score).reverse
+  end
+  # pick a random set of attributes, cluster the points on them and add one
+  # to the score of every point that ended up in a cluster; all points are
+  # unclustered again when this returns
+  def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
+    cluster = 0
+    attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
+    bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
+    points.each do |point|
+      next if point.clustered?
+      # tentatively open a new cluster; it is dropped again if nobody joins
+      point.cluster = (cluster += 1)
+      neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
+      point.uncluster! if neighbors.empty?
+    end
+    points.each do |point|
+      next unless point.clustered?
+      point.uncluster!
+      point.score += 1
+    end
+  end
+  # randomly choose +number+ attribute indexes (all of them, in random
+  # order, when +number+ exceeds the number of attributes)
+  def random_attribute_indexes number
+    # exclusive range: [0..number] would hand back number + 1 indexes
+    (0...@data.first.size).sort_by { rand }[0...number]
+  end
+  # find all unclustered points that are neighbors of +point+ on
+  # *all* selected attributes or neighbors in the neighborhood
+  # of +point+; find recursively until no additions can be made
+  def cluster_neighbors point, cluster, attribute_indexes, bin_widths
+    recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
+  end
+  # recursive step of #cluster_neighbors: one pass over the unclustered
+  # points, adding each one that neighbors +point+ or an already collected
+  # neighbor; repeats until a pass adds nothing, then returns +neighbors+
+  def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
+    fruitful = false
+    unclustered_points.each do |unclustered_point|
+      next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) ||
+                  unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
+      fruitful = true
+      unclustered_point.cluster = cluster
+      neighbors << unclustered_point
+    end
+    if fruitful
+      recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
+    else
+      neighbors
+    end
+  end
+  # find all of the points that don't already belong to a cluster
+  def unclustered_points
+    points.reject {|point| point.clustered? }
+  end
+  # store, on every point, the index of the bin it falls into for each
+  # attribute, based on the given +bin_widths+; +bin_count+ is the number of
+  # bins per attribute and caps the stored index
+  def assign_points_to_bins! bin_widths, bin_count
+    bin_widths.each_with_index do |bin_width, attribute_index|
+      points.each do |point|
+        point.bins[attribute_index] = bin_index(point, attribute_index, bin_width, bin_count)
+      end
+    end
+  end
+  # index of the bin that +point+ falls into for the given attribute
+  # * a zero +bin_width+ (every point has the same value) always yields bin 0
+  # * with +bin_count+ the index is clamped to 0..bin_count-1
+  # * without it, only the attribute maximum is pulled back into the last bin
+  def bin_index point, attribute_index, bin_width, bin_count=nil
+    # dividing by a zero width gives NaN, and NaN.floor raises FloatDomainError
+    return 0 if bin_width.zero?
+    value = point[attribute_index]
+    index = ((value - @minimums[attribute_index]) / bin_width).floor
+    if bin_count
+      [[index, 0].max, bin_count - 1].min
+    else
+      value == @maximums[attribute_index] ? index - 1 : index
+    end
+  end
+  # per-attribute minimum and maximum over all rows, returned as the pair
+  # [minimums, maximums]
+  def compute_minimums_and_maximums
+    minimums = @data.first.dup
+    maximums = @data.first.dup
+    @data.each do |attributes|
+      attributes.each_with_index do |attribute, attribute_index|
+        minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
+        maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
+      end
+    end
+    [minimums, maximums]
+  end
+  # determine the width of the bins of each attribute: its value range split
+  # into +bin_count+ equal parts (+q+ is accepted but not used here)
+  def compute_bin_widths q, bin_count
+    (0...@data.first.size).map do |attribute_index|
+      (@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
+    end
+  end
+  # compute the number of bins for a given +q+: rows / q rounded up,
+  # but never fewer than two
+  def compute_bin_count q
+    count = (@data.size / q.to_f).ceil
+    count < 2 ? 2 : count
+  end
+end

data/lib/fastout/version.rb ADDED

@@ -0,0 +1,3 @@
+# Namespace holding the gem's version constant.
+module Fastout
+  # Version of the gem; frozen so the shared string cannot be mutated in place.
+  VERSION = "0.0.1".freeze
+end

data/spec/fastout/ranker_spec.rb ADDED

@@ -0,0 +1,252 @@
+require "spec_helper"
+require "fastercsv"
+# Specs for Ranker and Ranker::Point, written with RSpec `should` syntax and
+# rr test doubles (mock/stub/mock!).
+module Fastout
+  describe Ranker do
+    describe Ranker::Point do
+      before { @point = Ranker::Point.new 1.0, 4.2, -1 }
+      context "#[]" do
+        it("should be able to index the attributes directly") { @point[0].should == 1.0 }
+      end
+      context "#clustered?" do
+        it "should be false when cluster is nil" do
+          mock(@point).cluster { nil }
+          @point.clustered?.should be_false
+        end
+        it "should be true when cluster is not nil" do
+          mock(@point).cluster { 42 }
+          @point.clustered?.should be_true
+        end
+      end
+      context "#uncluster!" do
+        it "should set cluster equal to nil" do
+          @point.cluster = 42
+          @point.uncluster!
+          @point.cluster.should be_nil
+        end
+      end
+      context "#in_the_neighborhood_of?" do
+        before(:each) do
+          stub(@point).bins { [2, 2] }
+          @test_point = Ranker::Point.new
+        end
+        it "should be false when the test point is two bins away for an attribute" do
+          mock(@test_point).bins { [0, 2] }
+          @point.in_the_neighborhood_of?(@test_point, [0, 1], [1, 1]).should be_false
+        end
+        it "should be false when the test point is more than half a neighborhood away for an attribute" do
+          mock(@test_point).bins { [1, 2] }.times(2)
+          mock(@point).attributes { [2.5, 2.5] }
+          mock(@test_point).attributes { [1.75, 2.5] }
+          @point.in_the_neighborhood_of?(@test_point, [0, 1], [1, 1]).should be_false
+        end
+        it "should be true when the test point is less than half a neighborhood away for an attribute" do
+          mock(@test_point).bins { [3, 2] }.times(2)
+          mock(@point).attributes { [2.5, 2.5] }.times(2)
+          mock(@test_point).attributes { [2.75, 2.5] }.times(2)
+          @point.in_the_neighborhood_of?(@test_point, [0, 1], [1, 1]).should be_true
+        end
+      end
+      context "#neighbor_of_any?" do
+        it "should check to see if any point is a neighbor" do
+          mock(@point).in_the_neighborhood_of?(:point_0, :attribute_indexes, :neighborhoods) { false }
+          mock(@point).in_the_neighborhood_of?(:point_1, :attribute_indexes, :neighborhoods) { true }
+          @point.neighbor_of_any?([:point_0, :point_1], :attribute_indexes, :neighborhoods).should be_true
+        end
+      end
+    end
+    # constructor guards: at least two rows and at least two attributes
+    it("should raise an error when given an empty array") { lambda { Ranker.new([]) }.should raise_error }
+    it("should raise an error when given an array containing an empty array") { lambda { Ranker.new([[]]) }.should raise_error }
+    it("should raise an error when given an array containing one non-empty array") { lambda { Ranker.new([[1, 2, 3]]) }.should raise_error }
+    it("should raise an error when given an array containing only one attribute") { lambda { Ranker.new([[1], [2], [3]]) }.should raise_error }
+    context "given 3 attributes and 4 data points" do
+      before(:each) do
+        @ranker = Ranker.new [[ 1.0,  3, -1],
+                              [ 2.0, 50,  1],
+                              [ 3.0,  5,  1],
+                              [ 4.2,  2,  1]]
+      end
+      context ".pointify" do
+        it "should generate a point object for each row" do
+          @ranker.points.size.should == 4
+        end
+      end
+      context "#optimized_ranking" do
+        it "should find the optimal values for k and q" do
+          @ranker.optimized_ranking(10, 5, 1).map(&:id).should == [3, 2, 1, 0]
+        end
+      end
+      context "#ranked_outliers" do
+        it "should compute the necessary parameters and return the points sorted by score" do
+          mock(@ranker).compute_bin_count(42) { :bin_count }
+          mock(@ranker).compute_bin_widths(42, :bin_count) { :bin_widths }
+          mock(@ranker).assign_points_to_bins!(:bin_widths, :bin_count)
+          mock(@ranker).score_points_from_a_random_set_of_attributes!(5, :bin_widths).times(100)
+          mock(@ranker.points).sort_by { mock!.reverse { :answer }.subject }
+          @ranker.ranked_outliers(100, 5, 42).should == :answer
+        end
+      end
+      context "#score_points_from_a_random_set_of_attributes!" do
+        it "should pick a random set of attributes and cycle through the points" do
+          mock(@ranker).random_attribute_indexes(5) { [2, 0] }
+          # Integer rather than Fixnum: Fixnum is a subclass of Integer on old
+          # Rubies and no longer exists on current ones
+          mock(@ranker).cluster_neighbors(is_a(Ranker::Point), is_a(Integer), [2, 0], [2, 0]) { [] }.times(4)
+          @ranker.score_points_from_a_random_set_of_attributes!(5, [0, 1, 2])
+        end
+      end
+      context "#random_attribute_indexes" do
+        it("should give me back the correct number of indexes") { @ranker.random_attribute_indexes(3).size.should == 3 }
+      end
+      context "#cluster_neighbors" do
+        it "should call recursively_cluster_neighbors" do
+          mock(@ranker).recursively_cluster_neighbors(:point, :cluster, :attribute_indexes, :bin_widths, [])
+          @ranker.cluster_neighbors :point, :cluster, :attribute_indexes, :bin_widths
+        end
+      end
+      context "#recursively_cluster_neighbors" do
+        it "should return its neighbors when there are no more unclustered points" do
+          mock(@ranker).unclustered_points { [] }
+          @ranker.recursively_cluster_neighbors(:point, :cluster, :attribute_indexes, :bin_widths, :neighbors).should == :neighbors
+        end
+        it "should return its neighbors if it doesn't find any new neighbors" do
+          unclustered_point = mock!.neighbor_of_any?(:neighbors, :attribute_indexes, :bin_widths) { false }.subject
+          mock(@ranker).unclustered_points { [unclustered_point] }
+          point = mock!.in_the_neighborhood_of?(unclustered_point, :attribute_indexes, :bin_widths) { false }.subject
+          @ranker.recursively_cluster_neighbors(point, :cluster, :attribute_indexes, :bin_widths, :neighbors).should == :neighbors
+        end
+        it "should call itself if it finds a new neighbor" do
+          @called = false
+          unclustered_point = Ranker::Point.new
+          mock(unclustered_point).cluster=(:cluster)
+          # first pass offers one unclustered point, second pass offers none
+          mock(@ranker).unclustered_points do
+            if @called
+              []
+            else
+              @called = true
+              [unclustered_point]
+            end
+          end.times(2)
+          point = mock!.in_the_neighborhood_of?(unclustered_point, :attribute_indexes, :bin_widths) { true }.subject
+          @ranker.recursively_cluster_neighbors(point, :cluster, :attribute_indexes, :bin_widths, []).should == [unclustered_point]
+        end
+      end
+      context "#unclustered_points" do
+        it "should find only the points that aren't clustered" do
+          mock(@ranker).points do
+            [mock!.clustered? { true }.subject,
+             mock!.clustered? { true }.subject,
+             mock!.clustered? { false }.subject]
+          end
+          # exactly one of the three doubles reports itself as unclustered
+          @ranker.unclustered_points.size.should == 1
+        end
+      end
+      context "#calculate_theta" do
+        it "should call #ranked_outliers and find theta" do
+          mock(@ranker).ranked_outliers(:sample, :k, :q) { :s }
+          mock(@ranker).points { [mock!.score { :n }.subject,
+                                  mock!.score { :not_n }.subject,
+                                  mock!.score { :not_n }.subject,
+                                  mock!.score { :n }.subject] }
+          @ranker.calculate_theta(:sample, :k, :n, :q).should == [2, :s]
+        end
+      end
+      context "#compute_minimums_and_maximums" do
+        it "should properly compute minimums and maximums" do
+          @ranker.minimums.should == [1.0,  2.0, -1.0]
+          @ranker.maximums.should == [4.2, 50.0,  1.0]
+        end
+      end
+      context "#compute_bin_count" do
+        it("should be equal to 4 when Q=1") { @ranker.compute_bin_count(1).should == 4 }
+        it("should be equal to 2 when Q=2") { @ranker.compute_bin_count(2).should == 2 }
+        it("should be equal to 2 when Q=3") { @ranker.compute_bin_count(3).should == 2 }
+        it("should be equal to 2 when Q=4") { @ranker.compute_bin_count(4).should == 2 }
+        it("should be equal to 2 when Q=5") { @ranker.compute_bin_count(5).should == 2 }
+      end
+      context "#compute_bin_widths" do
+        it("should be equal to [0.8, 12.0, 0.5] when Q=1") { @ranker.compute_bin_widths(1, 4).should == [0.8, 12.0, 0.5] }
+        it("should be equal to [1.6, 24.0, 1.0] when Q=2") { @ranker.compute_bin_widths(2, 2).should == [1.6, 24.0, 1.0] }
+      end
+      context "#assign_points_to_bins!" do
+        context "with q=2" do
+          it "should work properly" do
+            points = [(point_0 = Ranker::Point.new(1.0,  3, -1)),
+                      (point_1 = Ranker::Point.new(2.0, 50,  1)),
+                      (point_2 = Ranker::Point.new(3.0,  5,  1)),
+                      (point_3 = Ranker::Point.new(4.2,  2,  1))]
+            # points is read once per attribute
+            mock(@ranker).points { points }.times(3)
+            @ranker.assign_points_to_bins! [1.6, 24.0, 1.0], 2
+            point_0.bins.should == [0, 0, 0]
+            point_1.bins.should == [0, 1, 1]
+            point_2.bins.should == [1, 0, 1]
+            point_3.bins.should == [1, 0, 1]
+          end
+        end
+      end
+    end
+    context "given a somewhat non-trivially dataset" do
+      it "should find the outliers" do
+        # data from Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice Disorder Detection', Little MA, McSharry PE, Roberts SJ, Costello DAE, Moroz IM. BioMedical Engineering OnLine 2007, 6:23 (26 June 2007)
+        data = FasterCSV.read("spec/parkinsons.csv").map {|row| row.map {|datum| datum.to_f } }
+        ranker = Ranker.new data
+        ranker.optimized_ranking(5, 5).size.should == 195
+      end
+    end
+  end
+end