RubyGems - feldtruby - Versions diffs - 0.3.8 → 0.3.9 - Mend

feldtruby 0.3.8 → 0.3.9

Files changed (24) hide show

data/.gitignore +2 -0
data/README.md +9 -11
data/Rakefile +21 -4
data/lib/feldtruby/array.rb +5 -0
data/lib/feldtruby/minitest_extensions.rb +10 -0
data/lib/feldtruby/optimize/objective.rb +168 -125
data/lib/feldtruby/optimize/search_space.rb +105 -9
data/lib/feldtruby/optimize/sub_qualities_comparators.rb +73 -0
data/lib/feldtruby/statistics/array_archive.rb +66 -0
data/lib/feldtruby/statistics/clustering.rb +31 -0
data/lib/feldtruby/statistics/distance.rb +35 -0
data/lib/feldtruby/statistics/euclidean_distance.rb +4 -0
data/lib/feldtruby/statistics.rb +48 -0
data/lib/feldtruby/version.rb +1 -1
data/spikes/zlib_for_short_strings.rb +27 -0
data/test/skip_test_array_archive.rb +65 -0
data/test/test_array.rb +6 -0
data/test/test_clustering.rb +53 -0
data/test/test_euclidean_distance.rb +28 -0
data/test/test_optimize_objective.rb +133 -93
data/test/test_optimize_search_space.rb +54 -0
data/test/test_sax.rb +14 -1
data/test/test_sub_qualitites_comparator.rb +109 -0
metadata +15 -2

data/lib/feldtruby/optimize/sub_qualities_comparators.rb ADDED Viewed

@@ -0,0 +1,73 @@
+module FeldtRuby::Optimize
+# A SubQualitiesComparator can compare vectors of sub-qualities for two individuals
+# and rank the individuals based on whether one is better than (i.e. dominates) the other.
+# This base class leaves the actual comparison to its subclasses.
+class SubQualitiesComparator
+  # The objective is used by compare_candidates to look up the sub-qualities
+  # of a candidate.
+  def initialize(objective)
+    @objective = objective
+  end
+  # Compare two sub-quality vectors and return
+  #   -1 if the first one dominates the other one
+  #    0 if neither of them dominates the other
+  #    1 if the second one dominates the first one
+  # Must be implemented by subclasses; raises NotImplementedError here.
+  def compare_sub_qualitites(subQualitites1, subQualitites2)
+    raise NotImplementedError
+  end
+  # Compare two candidates by comparing the sub-quality vectors that the
+  # objective reports for them. Returns the same values as compare_sub_qualitites.
+  def compare_candidates(candidate1, candidate2)
+    sq1, sq2 = @objective.sub_qualities_of(candidate1), @objective.sub_qualities_of(candidate2)
+    compare_sub_qualitites sq1, sq2
+  end
+  # True iff the first sub-quality vector dominates the second one.
+  def first_dominates?(subQualitites1, subQualitites2)
+    compare_sub_qualitites(subQualitites1, subQualitites2) == -1
+  end
+  # True iff the second sub-quality vector dominates the first one.
+  def second_dominates?(subQualitites1, subQualitites2)
+    compare_sub_qualitites(subQualitites1, subQualitites2) == 1
+  end
+end
+# Epsilon-distance non-dominance comparator. Default epsilon is 0.0 which
+# gives the standard non-dominance comparator.
+class EpsilonNonDominance < SubQualitiesComparator
+  def initialize(objective, epsilon = 0.0)
+    super(objective)
+    @epsilon = epsilon
+  end
+  # Map hat operator to paired sub-quality values.
+  def map_hat_operator(sq1, sq2)
+    # NOTE! Below we assume that all sub-objectives should be minimized. If not we should
+    # change the sign of the hat operator return value!
+    sq1.zip(sq2).map do |sqv1, sqv2|
+      if (sqv1 - sqv2).abs > @epsilon
+        (sqv1 < sqv2) ? -1 : 1
+      else
+        0
+      end
+    end
+  end
+  def compare_sub_qualitites(subQualitites1, subQualitites2)
+    hat_values = map_hat_operator(subQualitites1, subQualitites2)
+    num_1_better = num_2_better = 0
+    hat_values.each do |hv|
+      if hv == -1
+        num_1_better += 1
+      elsif hv == 1
+        num_2_better += 1
+      end
+    end
+    if num_1_better > 0
+      (num_2_better == 0) ? -1 : 0
+    else
+      (num_2_better > 0) ? 1 : 0
+    end
+  end
+end
+end

data/lib/feldtruby/statistics/array_archive.rb ADDED Viewed

@@ -0,0 +1,66 @@
+require 'feldtruby/array/basic_stats.rb'
+module FeldtRuby
+# A ValueArchive keeps basic statistics about the values supplied to it in arrays.
+# This base class only counts how many arrays have been added.
+class ValueArchive
+  def initialize
+    @count = 0
+  end
+  # Returns the number of times an array has been added to the archive.
+  attr_reader :count
+  # Register that one more array has been added. The contents of _values_ are
+  # not inspected here.
+  def update(values)
+    @count += 1
+  end
+end
+# A PositionBasedValueArchive assumes that each individual position in the supplied
+# value arrays has semantic meaning and thus we should not mix the properties we
+# calculate and save between positions.
+class PositionBasedValueArchive < ValueArchive
+end
+# A MinMaxMeanPerPositionArchive keeps the min, max and mean values for each
+# position in the supplied arrays. It can thus be used for min-max-normalization
+# of values in each position.
+class MinMaxMeanPerPositionArchive < PositionBasedValueArchive
+  # Arrays with the minimum and maximum value seen so far in each position.
+  attr_reader :mins, :maxs
+  def initialize
+    super
+    @mins, @maxs, @sums = [], [], []
+  end
+  # Add one more array of values: increases the count (via super) and refreshes
+  # the per-position minimum, maximum and running sum.
+  # NOTE(review): sum is called on a plain array here; presumably it is supplied by
+  # feldtruby/array/basic_stats on Rubies without a built-in Array#sum - confirm.
+  def update(values)
+    super
+    @mins = update_statistic_per_position(@mins, values) {|newold| newold.compact.min}
+    @maxs = update_statistic_per_position(@maxs, values) {|newold| newold.compact.max}
+    @sums = update_statistic_per_position(@sums, values) {|newold| newold.compact.sum}
+  end
+  # Pair each new value with the statistic currently saved for the same position
+  # and let the block calculate the updated statistic from that [new, old] pair.
+  # The old entry is nil for a position that has no saved statistic yet, which is
+  # why the blocks in update call compact. The result has one entry per element
+  # in _values_.
+  # NOTE(review): a _values_ array shorter than earlier ones therefore drops the
+  # statistics saved for the trailing positions - confirm this is intended.
+  def update_statistic_per_position(currentStatistics, values, &updateStatistic)
+    values.zip(currentStatistics).map {|newold| updateStatistic.call(newold)}
+  end
+  # Return the minimum value we have seen so far in position _index_.
+  def min_for_position(index)
+    @mins[index]
+  end
+  # Return the maximum value we have seen so far in position _index_.
+  def max_for_position(index)
+    @maxs[index]
+  end
+  # Return the mean of the values we have seen so far in position _index_, i.e.
+  # the sum for that position divided by the number of arrays added. Returns nil
+  # if no sum has been saved for the position.
+  def mean_for_position(index)
+    (@sums[index] / @count.to_f) if @sums[index]
+  end
+  # Return an array with the mean value for each position.
+  def means
+    @sums.map {|v| v/@count.to_f}
+  end
+end
+end

data/lib/feldtruby/statistics/clustering.rb ADDED Viewed

@@ -0,0 +1,31 @@
+require 'feldtruby/statistics/distance'
+require 'feldtruby/array/basic_stats'
+module FeldtRuby
+# Base class for linkage metrics, i.e. distances between two clusters of objects.
+# It is a CompositeMetric and includes SetDistance to get pairwise_distances.
+class ClusterLinkageMetric < CompositeMetric
+  include SetDistance
+end
+# Average linkage metric between clusters - the mean of the distances between
+# all pairs with one member from each cluster.
+class AverageLinkageMetric < ClusterLinkageMetric
+  # Sum of all pairwise distances divided by the number of pairs.
+  def calc(cluster1, cluster2)
+    pairwise_distances(cluster1, cluster2, @sub_metric).sum.to_f / (cluster1.length * cluster2.length)
+  end
+end
+# Single linkage metric between clusters - distance between nearest members.
+class SingleLinkageMetric < ClusterLinkageMetric
+  # Smallest of the pairwise distances between members of the two clusters.
+  def calc(cluster1, cluster2)
+    pairwise_distances(cluster1, cluster2, @sub_metric).min
+  end
+end
+# Complete linkage metric between clusters - distance between furthest members.
+class CompleteLinkageMetric < ClusterLinkageMetric
+  # Largest of the pairwise distances between members of the two clusters.
+  def calc(cluster1, cluster2)
+    pairwise_distances(cluster1, cluster2, @sub_metric).max
+  end
+end
+end

data/lib/feldtruby/statistics/distance.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require 'feldtruby/statistics/euclidean_distance'
+module FeldtRuby
+# Base class for distances between two objects.
+class Distance
+  # Calculate the distance between o1 and o2. Must be implemented by
+  # subclasses; raises NotImplementedError here.
+  def calc(o1, o2)
+    raise NotImplementedError
+  end
+end
+# Mixin for distances that are built on top of another metric. The metric is
+# saved in @sub_metric and defaults to a new EuclideanDistance.
+module CompositableDistance
+  def initialize(metric = EuclideanDistance.new)
+    @sub_metric = metric
+  end
+end
+# Functions specific to distances defined on sets of individual objects
+module SetDistance
+  # Return a flat array with metric.calc(a, b) for every a in set1 paired with
+  # every b in set2.
+  def pairwise_distances(set1, set2, metric)
+    set1.map {|a| set2.map {|b| metric.calc(a,b)}}.flatten
+  end
+end
+# A Metric is a Distance with particular properties. These properties need to be
+# ensured in the sub-classes, so nothing is defined here.
+class Metric < Distance
+end
+# A CompositeMetric takes another metric as input and calculates a new
+# distance based on it.
+class CompositeMetric < Metric
+  include CompositableDistance
+end
+end

data/lib/feldtruby/statistics/euclidean_distance.rb CHANGED Viewed

@@ -11,4 +11,8 @@ class EuclideanDistance
   end
 end
+# Convenience method: the distance between o1 and o2 as calculated by an
+# EuclideanDistance instance, which is created on first use and then kept in
+# @euclidean_distance.
+def euclidean_distance(o1, o2)
+  (@euclidean_distance ||= EuclideanDistance.new).calc(o1, o2)
+end
 end

data/lib/feldtruby/statistics.rb CHANGED Viewed

@@ -153,6 +153,54 @@ module Statistics
   end
 end
+# Plotting data sets in R with ggplot2 and save them to files.
+module FeldtRuby::Statistics::Plotting
+  def plot_2dims(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", width = 1200, height = 900)
+    include_library("ggplot2")
+    pre = [
+      "td <- read.csv(#{csvFilePath.inspect}",
+      "png(#{graphFilePath.inspect}, width=#{width}, height=#{height})"
+    ]
+    plot = yield()
+    plot.last << " theme_bw(base_size = 12, base_family = \"\")"
+    post = [
+      "dev.off()"
+    ]
+    lines = pre + plot + post
+    eval lines.join("\n")
+  end
+  # Scatter plot of columns xName vs yName in csvFilePath is saved to graphFilePath.
+  def scatter_plot(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", smoothFit = true, width = 1200, height = 900)
+    plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) {
+      [
+        "ggplot(td, aes(#{xName}, #{yName})) + ",
+        "  geom_point(shape = 1) + ", # Each point is non-filled circle
+        (smoothFit ? "  geom_smooth() + " : nil),
+        "  ggtitle(#{title.inspect})"
+      ].compact
+    }
+  end
+  # Scatter plot of columns xName vs yName in csvFilePath is saved to graphFilePath.
+  def hexbin_heatmap(csvFilePath, graphFilePath, xName, yName, title = "heatmap", bins = 30, width = 1200, height = 900)
+    plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) {
+      [ "ggplot(td, aes(#{xName}, #{yName})) + geom_hex( bins = #{bins} ) + ggtitle(\"#{title}\")"]
+    }
+  end
+end
+# Make the plotting methods available on RCommunicator instances.
+class FeldtRuby::Statistics::RCommunicator
+  include FeldtRuby::Statistics::Plotting
+end
 # Make them available at top level
 extend Statistics

data/lib/feldtruby/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module FeldtRuby
-  VERSION = "0.3.8"
+  VERSION = "0.3.9"
 end

data/spikes/zlib_for_short_strings.rb ADDED Viewed

@@ -0,0 +1,27 @@
+require 'zlib'
+require 'feldtruby/array/basic_stats'
+# Build a random string by joining _length_ elements, each picked from
+# _alphabet_ at a random index.
+def rand_string(length, alphabet)
+  as = alphabet.length
+  (1..length).map {alphabet[rand(as)]}.join
+end
+# Return _string_ compressed with Zlib's deflate.
+def compress(string)
+  Zlib::Deflate.deflate(string)
+end
+# Length of the compressed string divided by the length of the original one.
+def compression_ratio(string)
+  compress(string).length.to_f/string.length
+end
+# For a number of string lengths, compress 1000 random strings over _alphabet_
+# and print the average compressed length, how much longer that is than the
+# original length, and the ratio between the two.
+# NOTE(review): mean is presumably supplied by feldtruby/array/basic_stats - confirm.
+def info_for_alphabet(alphabet)
+  puts "for alphabet = #{alphabet.inspect}"
+  ([1,5,10,15,20,25,30,35,40,45,50,60,70]).each do |len|
+    avg_c_len = (1..1000).map {compress(rand_string(len, alphabet)).length}.mean
+    puts( "#{len}: %.2f, %.2f, %.2f" % [avg_c_len, avg_c_len-len, avg_c_len/len.to_f] )
+  end
+end
+info_for_alphabet(("a".."z").to_a)
+info_for_alphabet(("a".."d").to_a)
+info_for_alphabet(("a".."b").to_a)

data/test/skip_test_array_archive.rb ADDED Viewed

@@ -0,0 +1,65 @@
+require 'feldtruby/statistics/array_archive'
+describe "MinMaxAveragePositionArchive" do
+  before do
+    @a = FeldtRuby::MinMaxAveragePerPositionArchive.new
+  end
+  it "updates the counts as we add arrays" do
+    @a.count.must_equal 0
+    @a.update([1,2,3])
+    @a.count.must_equal 1
+    @a.update([4,5,6])
+    @a.count.must_equal 2
+    @a.update([1,2,3])
+    @a.count.must_equal 3
+  end
+  it "correctly updates the min values" do
+    @a.min_for_position(0).must_equal nil
+    @a.update([1,2,3])
+    @a.min_for_position(0).must_equal 1
+    @a.min_for_position(1).must_equal 2
+    @a.min_for_position(2).must_equal 3
+    @a.mins.must_equal [1,2,3]
+    @a.update([1,5,-2])
+    @a.mins.must_equal [1,2,-2]
+    @a.min_for_position(0).must_equal 1
+    @a.min_for_position(1).must_equal 2
+    @a.min_for_position(2).must_equal -2
+  end
+  it "correctly updates the max values" do
+    @a.max_for_position(0).must_equal nil
+    @a.update([1,2,3])
+    @a.max_for_position(0).must_equal 1
+    @a.max_for_position(1).must_equal 2
+    @a.max_for_position(2).must_equal 3
+    @a.maxs.must_equal [1,2,3]
+    @a.update([1,5,-2])
+    @a.maxs.must_equal [1,5,3]
+    @a.max_for_position(0).must_equal 1
+    @a.max_for_position(1).must_equal 5
+    @a.max_for_position(2).must_equal 3
+  end
+  it "correctly updates the mean values" do
+    @a.mean_for_position(0).must_equal nil
+    @a.update([1,2,3])
+    @a.mean_for_position(0).must_equal 1
+    @a.mean_for_position(1).must_equal 2
+    @a.mean_for_position(2).must_equal 3
+    @a.means.must_equal [1,2,3]
+    @a.update([1,5,-2])
+    @a.means.must_equal [1, 3.5, 0.5]
+    @a.mean_for_position(0).must_equal 1
+    @a.mean_for_position(1).must_equal 3.5
+    @a.mean_for_position(2).must_equal 0.5
+  end
+end

data/test/test_array.rb CHANGED Viewed

@@ -34,6 +34,12 @@ class TestFeldtRubyArray < MiniTest::Unit::TestCase
 end
 describe "Array extensions" do
+	describe "map_with_index" do
+		it "calls the block with both the value and an index" do
+			[1,2,3].map_with_index {|v,i| [v,i]}.must_equal [[1,0], [2,1], [3,2]]
+		end
+	end
 	describe "ranks" do
 		it "works when elements are already in order" do
 			[2.5, 1.5, 0.3].ranks.must_equal [1, 2, 3]

data/test/test_clustering.rb ADDED Viewed

@@ -0,0 +1,53 @@
+require 'feldtruby/statistics/clustering'
+require 'feldtruby/statistics/euclidean_distance'
+# Each cluster below is an array of one-dimensional points and the metrics are
+# created with their default sub-metric.
+describe "Clustering linkage metrics - i.e. distance between clusters of objects in a set" do
+  describe "Average linkage metric" do
+    it "can be calculated on clusters of with only one number each" do
+      alm = FeldtRuby::AverageLinkageMetric.new()
+      alm.calc([[1.0]], [[1.0]]).must_equal 0.0
+      alm.calc([[0.0]], [[1]]).must_equal 1.0
+    end
+    it "can be calculated on clusters of with several numbers in them" do
+      alm = FeldtRuby::AverageLinkageMetric.new()
+      # Pairwise distances 0, 1, 1, 0 => 2/4
+      alm.calc([[1], [0]], [[1], [0]]).must_equal 0.5
+      # Pairwise distances 0, 1, 1, 0, 1, 2 => 5/6
+      alm.calc([[1], [0], [2]], [[1], [0]]).must_equal (5.0/6)
+      # Pairwise distances 0, 1, 2, 1, 0, 3, 1, 2, 1 => 11/9
+      alm.calc([[1], [0], [2]], [[1], [0], [3]]).must_equal (11.0/9)
+    end
+  end
+  describe "Single linkage metric" do
+    it "can be calculated on clusters of with only one float number each" do
+      slm = FeldtRuby::SingleLinkageMetric.new()
+      slm.calc([[1.0]], [[1.0]]).must_equal 0.0
+      slm.calc([[0.0]], [[1.0]]).must_equal 1.0
+    end
+    it "can be calculated on clusters of with several numbers in them" do
+      slm = FeldtRuby::SingleLinkageMetric.new()
+      slm.calc([[1], [0]], [[1], [0]]).must_equal 0.0
+      slm.calc([[1], [2]], [[1], [0]]).must_equal 0.0
+      slm.calc([[1], [2]], [[3], [5]]).must_equal 1.0
+      slm.calc([[1], [2], [3]], [[3], [5]]).must_equal 0.0
+      slm.calc([[1], [2], [3]], [[6], [7]]).must_equal 3.0
+    end
+  end
+  describe "Complete linkage metric" do
+    it "can be calculated on clusters of with only one float number each" do
+      clm = FeldtRuby::CompleteLinkageMetric.new()
+      clm.calc([[1.0]], [[1.0]]).must_equal 0.0
+      clm.calc([[0.0]], [[1.0]]).must_equal 1.0
+    end
+    it "can be calculated on clusters of with several numbers in them" do
+      clm = FeldtRuby::CompleteLinkageMetric.new()
+      clm.calc([[1], [0]], [[1], [0]]).must_equal 1.0
+      clm.calc([[1], [2]], [[1], [0]]).must_equal 2.0
+      clm.calc([[1], [2]], [[3], [5]]).must_equal 4.0
+      clm.calc([[1], [2], [3]], [[3], [5]]).must_equal 4.0
+      clm.calc([[1], [2], [3]], [[6], [7]]).must_equal 6.0
+    end
+  end
+end

data/test/test_euclidean_distance.rb ADDED Viewed

@@ -0,0 +1,28 @@
+require 'feldtruby/statistics/euclidean_distance'
+include FeldtRuby
+# Exercises the euclidean_distance convenience method on float and integer vectors.
+describe "Euclidean distance" do
+  it "can be calculated on float vectors of length 1" do
+    euclidean_distance([1.0], [1.0]).must_equal 0.0
+  end
+  it "can be calculated on float vectors of length 2" do
+    # Differences of 4 and -3 per position => sqrt(16 + 9) = 5
+    euclidean_distance([2.0, -1.0], [-2.0, 2.0]).must_equal 5.0
+  end
+  it "can be calculated on float vectors of length 3" do
+    # Difference of -3 in each position => sqrt(27)
+    euclidean_distance([1.0,2.0,3.0], [4.0,5.0,6.0]).must_be_close_to 5.196152
+  end
+  it "can be calculated on int vectors of length 1" do
+    euclidean_distance([1], [1]).must_equal 0.0
+  end
+  it "can be calculated on int vectors of length 2" do
+    euclidean_distance([2, -1], [-2, 2]).must_equal 5.0
+  end
+  it "can be calculated on int vectors of length 3" do
+    euclidean_distance([1,2,3], [4,5,6]).must_be_close_to 5.196152
+  end
+end
+end