RubyGems - fastout - Versions diffs - 0.0.1 → 0.0.2 - Mend

fastout 0.0.1 → 0.0.2

Files changed (3) hide show

data/lib/fastout/ranker.rb CHANGED Viewed

@@ -5,239 +5,242 @@
 # Author:: Jason Dew (mailto:jason.dew@gmail.com)
 # Copyright:: Copyright (c) 2010 Jason Dew
 # License:: MIT
-class Ranker
-  class Point
+module Fastout
+  class Ranker
-    @@next_id = 0
+    class Point
-    def self.next_id= id
-      @@next_id = id
-    end
+      @@next_id = 0
-    attr_reader :id, :attributes, :bins
-    attr_accessor :cluster, :score
+      def self.next_id= id
+        @@next_id = id
+      end
-    def initialize *attributes
-      @attributes = attributes
-      @cluster = nil
-      @score = 0
-      @bins = []
+      attr_reader :id, :attributes, :bins
+      attr_accessor :cluster, :score
-      @id = @@next_id
-      @@next_id += 1
-    end
+      def initialize *attributes
+        @attributes = attributes
+        @cluster = nil
+        @score = 0
+        @bins = []
-    def [] index
-      @attributes[index]
-    end
+        @id = @@next_id
+        @@next_id += 1
+      end
-    def clustered?
-      !! cluster
-    end
+      def [] index
+        @attributes[index]
+      end
-    def uncluster!
-      @cluster = nil
-    end
+      def clustered?
+        !! cluster
+      end
-    def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
-      attribute_indexes.each do |attribute_index|
-        return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
+      def uncluster!
+        @cluster = nil
       end
-      attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
-        return false if (attributes[attribute_index] - point.attributes[attribute_index]) > (neighborhoods[neighborhood_index] / 2.0)
+      def in_the_neighborhood_of? point, attribute_indexes, neighborhoods
+        attribute_indexes.each do |attribute_index|
+          return false if (bins[attribute_index] - point.bins[attribute_index]).abs > 1
+        end
+        attribute_indexes.each_with_index do |attribute_index, neighborhood_index|
+          return false if (attributes[attribute_index] - point.attributes[attribute_index]) > (neighborhoods[neighborhood_index] / 2.0)
+        end
+        true
       end
-      true
-    end
+      def neighbor_of_any? points, attribute_indexes, neighborhoods
+        points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
+      end
-    def neighbor_of_any? points, attribute_indexes, neighborhoods
-      points.inject(false) {|found, point| found or in_the_neighborhood_of?(point, attribute_indexes, neighborhoods) }
     end
-  end
+    attr_reader :data, :points, :minimums, :maximums
-  attr_reader :data, :points, :minimums, :maximums
+    def self.pointify data
+      data.map {|attributes| Point.new *attributes }
+    end
-  def self.pointify data
-    data.map {|attributes| Point.new *attributes }
-  end
+    # takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
+    # values should all be numerical
+    # * +data+ should not be empty or nil will be returned
+    # * also generates minimum and maximum values for each attribute for later use
+    def initialize data
+      raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1
+      @data = data
+      @points = self.class.pointify data
+      @minimums, @maximums = compute_minimums_and_maximums
+      Point.next_id = 0
+    end
-  # takes a 2-d array, +data+, where the rows are data points and the columns are the attributes,
-  # values should all be numerical
-  # * +data+ should not be empty or nil will be returned
-  # * also generates minimum and maximum values for each attribute for later use
-  def initialize data
-    raise "data must have more than one attribute and more than one data point" unless data.size > 1 and data.first.size > 1
-    @data = data
-    @points = self.class.pointify data
-    @minimums, @maximums = compute_minimums_and_maximums
-    Point.next_id = 0
-  end
+    # searches the parameter space to find the optimized values of +k+ and +q+
+    # * +theta_target+ is the maximum acceptable value of theta, default is 1
+    # * +sample+ is the number of iterations to perform in estimating the parameters
+    # * +n+ is the number of points to rank
+    def optimized_ranking sample, n, theta_target=1
+      k = 3
+      q = 5
+      max_q = n / 4
+      step_q = 10
+      last_theta = n
+      theta, s = calculate_theta(sample, k, n, q)
-  # searches the parameter space to find the optimized values of +k+ and +q+
-  # * +theta_target+ is the maximum acceptable value of theta, default is 1
-  # * +sample+ is the number of iterations to perform in estimating the parameters
-  # * +n+ is the number of points to rank
-  def optimized_ranking sample, n, theta_target=1
-    k = 3
-    q = 5
-    max_q = n / 4
-    step_q = 10
-    last_theta = n
-    theta, s = calculate_theta(sample, k, n, q)
-    while (theta > theta_target or theta < last_theta or q < max_q) do
-      return s if (theta <= theta_target)
-      if (theta >= last_theta)
-        # effectiveness declining so try next k
-        k += 1
-        q -= step_q
-        last_theta = n
-      else
-        # try next q
-        q += step_q
-        last_theta = theta
+      while (theta > theta_target or theta < last_theta or q < max_q) do
+        return s if (theta <= theta_target)
+        if (theta >= last_theta)
+          # effectiveness declining so try next k
+          k += 1
+          q -= step_q
+          last_theta = n
+        else
+          # try next q
+          q += step_q
+          last_theta = theta
+        end
+        theta, s = calculate_theta(sample, k, n, q)
       end
-      theta, s = calculate_theta(sample, k, n, q)
+      s
     end
-    s
-  end
+    # find and rank the points by their outlier score and determine
+    # theta (the number of points with an outlier score of +n+)
+    def calculate_theta sample, k, n, q
+      s = ranked_outliers sample, k, q
+      theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
-  # find and rank the points by their outlier score and determine
-  # theta (the number of points with an outlier score of +n+)
-  def calculate_theta sample, k, n, q
-    s = ranked_outliers sample, k, q
-    theta = points.inject(0) {|sum, point| point.score == n ? sum + 1 : sum }
+      [theta, s]
+    end
-    [theta, s]
-  end
+    # chooses +k+ random attributes with an average of +q+ data points
+    # in each bin +sample+ times to determine outliers
+    def ranked_outliers sample_size, k, q
+      # determine number of bins and their widths
+      bin_count =  compute_bin_count(q)
+      bin_widths = compute_bin_widths(q, bin_count)
-  # chooses +k+ random attributes with an average of +q+ data points
-  # in each bin +sample+ times to determine outliers
-  def ranked_outliers sample_size, k, q
-    # determine number of bins and their widths
-    bin_count =  compute_bin_count(q)
-    bin_widths = compute_bin_widths(q, bin_count)
+      # assign points to the attribute bins
+      assign_points_to_bins! bin_widths, bin_count
-    # assign points to the attribute bins
-    assign_points_to_bins! bin_widths, bin_count
+      1.upto(sample_size) {
+        score_points_from_a_random_set_of_attributes! k, bin_widths }
-    1.upto(sample_size) {
-      score_points_from_a_random_set_of_attributes! k, bin_widths }
+      points.sort_by(&:score).reverse
+    end
-    points.sort_by(&:score).reverse
-  end
+    # pick a random set of attributes and compute the outlier score
+    # for each of the points
+    def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
+      cluster = 0
+      attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
+      bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
-  # pick a random set of attributes and compute the outlier score
-  # for each of the points
-  def score_points_from_a_random_set_of_attributes! number_of_attributes_to_choose, all_bin_widths
-    cluster = 0
-    attribute_indexes = random_attribute_indexes number_of_attributes_to_choose
-    bin_widths = attribute_indexes.map {|index| all_bin_widths[index] }
+      points.each do |point|
+        next if point.clustered?
-    points.each do |point|
-      next if point.clustered?
+        point.cluster = (cluster += 1)
+        neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
-      point.cluster = (cluster += 1)
-      neighbors = cluster_neighbors point, cluster, attribute_indexes, bin_widths
+        point.uncluster!  if neighbors.empty?
+      end
-      point.uncluster!  if neighbors.empty?
+      points.each do |point|
+        next unless point.clustered?
+        point.uncluster!
+        point.score += 1
+      end
     end
-    points.each do |point|
-      next unless point.clustered?
-      point.uncluster!
-      point.score += 1
+    # randomly choose +number+ of attribute indexes
+    def random_attribute_indexes number
+      (0...@data.first.size).sort_by { rand }[0..number]
     end
-  end
-  # randomly choose +number+ of attribute indexes
-  def random_attribute_indexes number
-    (0...@data.first.size).sort_by { rand }[0..number]
-  end
+    # find all unclustered points that are neighbors of +point+ on
+    # *all* selected attributes or neighbors in the neighborhood
+    # of +point+; find recursively until no additions can be made
+    def cluster_neighbors point, cluster, attribute_indexes, bin_widths
+      recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
+    end
-  # find all unclustered points that are neighbors of +point+ on
-  # *all* selected attributes or neighbors in the neighborhood
-  # of +point+; find recursively until no additions can be made
-  def cluster_neighbors point, cluster, attribute_indexes, bin_widths
-    recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, []
-  end
+    # recursive step of #cluster_neighbors
+    def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
+      fruitful = false
-  # recursive step of #cluster_neighbors
-  def recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
-    fruitful = false
+      unclustered_points.each do |unclustered_point|
+        next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
+                    unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
-    unclustered_points.each do |unclustered_point|
-      next unless point.in_the_neighborhood_of?(unclustered_point, attribute_indexes, bin_widths) or
-                  unclustered_point.neighbor_of_any?(neighbors, attribute_indexes, bin_widths)
+        fruitful = true
+        unclustered_point.cluster = cluster
+        neighbors << unclustered_point
+      end
-      fruitful = true
-      unclustered_point.cluster = cluster
-      neighbors << unclustered_point
+      if fruitful
+        recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
+      else
+        neighbors
+      end
     end
-    if fruitful
-      recursively_cluster_neighbors point, cluster, attribute_indexes, bin_widths, neighbors
-    else
-      neighbors
+    # find all of the points that don't already belong to a cluster
+    def unclustered_points
+      points.select {|point| not point.clustered? }
     end
-  end
-  # find all of the points that don't already belong to a cluster
-  def unclustered_points
-    points.select {|point| not point.clustered? }
-  end
-  # assign each of the data points to a bin based on the given +bin_widths+,
-  # returns a 2-d array in attribute-major order
-  def assign_points_to_bins! bin_widths, bin_count
-    bin_widths.each_with_index do |bin_width, attribute_index|
-      points.each do |point|
-        point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
+    # assign each of the data points to a bin based on the given +bin_widths+,
+    # returns a 2-d array in attribute-major order
+    def assign_points_to_bins! bin_widths, bin_count
+      bin_widths.each_with_index do |bin_width, attribute_index|
+        points.each do |point|
+          point.bins[attribute_index] = bin_index(point, attribute_index, bin_width)
+        end
       end
     end
-  end
-  def bin_index point, attribute_index, bin_width
-    minimum = @minimums[attribute_index]
-    maximum = @maximums[attribute_index]
+    def bin_index point, attribute_index, bin_width
+      minimum = @minimums[attribute_index]
+      maximum = @maximums[attribute_index]
-    value = point[attribute_index]
-    index = ((value - minimum) / bin_width).floor
+      value = point[attribute_index]
+      index = ((value - minimum) / bin_width).floor
-    value == maximum ? index - 1 : index
-  end
+      value == maximum ? index - 1 : index
+    end
-  def compute_minimums_and_maximums
-    minimums = @data.first.dup
-    maximums = @data.first.dup
+    def compute_minimums_and_maximums
+      minimums = @data.first.dup
+      maximums = @data.first.dup
-    @data.each do |attributes|
-      attributes.each_with_index do |attribute, attribute_index|
-        minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
-        maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
+      @data.each do |attributes|
+        attributes.each_with_index do |attribute, attribute_index|
+          minimums[attribute_index] = attribute if attribute < minimums[attribute_index]
+          maximums[attribute_index] = attribute if attribute > maximums[attribute_index]
+        end
       end
+      [minimums, maximums]
     end
-    [minimums, maximums]
-  end
+    # determine the widths of the bins based on +q+
+    def compute_bin_widths q, bin_count
+      (0...@data.first.size).map do |attribute_index|
+        (@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
+      end
+    end
-  # determine the widths of the bins based on +q+
-  def compute_bin_widths q, bin_count
-    (0...@data.first.size).map do |attribute_index|
-      (@maximums[attribute_index] - @minimums[attribute_index]) / bin_count.to_f
+    # compute the number of bins for a given +q+
+    def compute_bin_count q
+      count = (@data.size / q.to_f).ceil
+      count < 2 ? 2 : count
     end
-  end
-  # compute the number of bins for a given +q+
-  def compute_bin_count q
-    count = (@data.size / q.to_f).ceil
-    count < 2 ? 2 : count
   end
 end

data/lib/fastout/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Fastout
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: fastout
 version: !ruby/object:Gem::Version
-  hash: 29
+  hash: 27
   prerelease: false
   segments:
   - 0
   - 0
-  - 1
-  version: 0.0.1
+  - 2
+  version: 0.0.2
 platform: ruby
 authors:
 - Jason Dew