feldtruby 0.3.8 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,73 @@
1
+ module FeldtRuby::Optimize
2
+
3
# A SubQualitiesComparator compares the sub-quality vectors of two individuals
# and ranks the individuals by whether one is better than (dominates) the other.
#
# This is an abstract base class: subclasses must implement
# compare_sub_qualitites.
class SubQualitiesComparator
  # objective:: object used to look up sub-qualities; it must respond to
  #             sub_qualities_of(candidate).
  def initialize(objective)
    @objective = objective
  end

  # Compare two sub-quality vectors. Implementations return
  #   -1 if the first one dominates the other one
  #    0 if none of them dominates the other
  #    1 if the second one dominates the first one
  #
  # Raises NotImplementedError here; subclasses provide the actual comparison.
  def compare_sub_qualitites(subQualitites1, subQualitites2)
    raise NotImplementedError
  end

  # Look up the sub-qualities of both candidates via the objective and
  # compare them with compare_sub_qualitites.
  def compare_candidates(candidate1, candidate2)
    first_vector = @objective.sub_qualities_of(candidate1)
    second_vector = @objective.sub_qualities_of(candidate2)
    compare_sub_qualitites(first_vector, second_vector)
  end

  # True iff the first sub-quality vector dominates the second.
  def first_dominates?(subQualitites1, subQualitites2)
    outcome = compare_sub_qualitites(subQualitites1, subQualitites2)
    outcome == -1
  end

  # True iff the second sub-quality vector dominates the first.
  def second_dominates?(subQualitites1, subQualitites2)
    outcome = compare_sub_qualitites(subQualitites1, subQualitites2)
    outcome == 1
  end
end
33
+
34
# Epsilon-distance non-dominance comparator. The default epsilon of 0.0 gives
# the standard non-dominance comparator.
class EpsilonNonDominance < SubQualitiesComparator
  # objective:: passed on to SubQualitiesComparator.
  # epsilon::   two sub-quality values only count as different when their
  #             absolute difference is strictly greater than this.
  def initialize(objective, epsilon = 0.0)
    super(objective)
    @epsilon = epsilon
  end

  # Apply the hat operator to each pair of sub-quality values. Per position the
  # result is -1 when the first value is smaller, 1 when the second is smaller
  # and 0 when the values are within epsilon of each other.
  #
  # NOTE! This assumes that all sub-objectives should be minimized. If not, the
  # sign of the hat operator's return value should be changed.
  def map_hat_operator(sq1, sq2)
    sq1.zip(sq2).map do |left, right|
      next 0 unless (left - right).abs > @epsilon
      left < right ? -1 : 1
    end
  end

  # Returns -1 if the first vector dominates, 1 if the second dominates and
  # 0 if neither does (both win at some position, or no position differs).
  def compare_sub_qualitites(subQualitites1, subQualitites2)
    hats = map_hat_operator(subQualitites1, subQualitites2)
    first_wins = hats.count(-1)
    second_wins = hats.count(1)

    return 0 if first_wins > 0 && second_wins > 0
    return -1 if first_wins > 0
    second_wins > 0 ? 1 : 0
  end
end
72
+
73
+ end
@@ -0,0 +1,66 @@
1
+ require 'feldtruby/array/basic_stats.rb'
2
+
3
+ module FeldtRuby
4
+
5
# A ValueArchive keeps basic statistics about the value arrays supplied to it.
# This base class only counts how many arrays it has been given.
class ValueArchive
  # Number of times an array has been added to the archive.
  attr_reader :count

  def initialize
    @count = 0
  end

  # Record that one more array was supplied. The values themselves are not
  # inspected here. Returns the new count.
  def update(values)
    @count = @count.succ
  end
end
18
+
19
# A PositionBasedValueArchive assumes that each individual position in the
# supplied value arrays has semantic meaning, so properties that are calculated
# and saved must not be mixed between positions.
#
# It adds no behaviour of its own; it is a marker superclass for such archives.
class PositionBasedValueArchive < ValueArchive; end
24
+
25
# A MinMaxMeanPerPositionArchive keeps the min, max and mean values for each
# position in the supplied arrays. It can thus be used for min-max-normalization
# of values in each position.
class MinMaxMeanPerPositionArchive < PositionBasedValueArchive
  # Arrays holding the smallest / largest value seen so far for each position.
  attr_reader :mins, :maxs

  def initialize
    super
    @mins, @maxs, @sums = [], [], []
  end

  # Add one array of values to the archive, updating the per-position min, max
  # and running sum (the sum is what the means are derived from).
  def update(values)
    super
    @mins = update_statistic_per_position(@mins, values) { |newold| newold.compact.min }
    @maxs = update_statistic_per_position(@maxs, values) { |newold| newold.compact.max }
    @sums = update_statistic_per_position(@sums, values) { |newold| newold.compact.sum }
  end

  # Combine the new values with the current statistics, position by position.
  # The block receives a [new_value, current_statistic] pair (either may be nil)
  # and returns the updated statistic for that position.
  #
  # Every position present in either array is covered, so supplying an array
  # that is shorter than earlier ones no longer discards the statistics already
  # collected for the trailing positions.
  # NOTE(review): means still divide by the total update count, so a position
  # missing from some updates gets a mean skewed towards zero - confirm whether
  # ragged input should be supported at all.
  def update_statistic_per_position(currentStatistics, values, &updateStatistic)
    new_values = values.to_a
    positions = [new_values.length, currentStatistics.length].max
    Array.new(positions) do |index|
      updateStatistic.call([new_values[index], currentStatistics[index]])
    end
  end

  # Return the minimum value we have seen so far in position _index_.
  def min_for_position(index)
    @mins[index]
  end

  # Return the maximum value we have seen so far in position _index_.
  def max_for_position(index)
    @maxs[index]
  end

  # Return the mean of the values we have seen so far in position _index_,
  # or nil if nothing has been recorded for that position.
  def mean_for_position(index)
    (@sums[index] / @count.to_f) if @sums[index]
  end

  # Return the means for all positions, as an array of floats.
  def means
    @sums.map { |v| v / @count.to_f }
  end
end
65
+
66
+ end
@@ -0,0 +1,31 @@
1
+ require 'feldtruby/statistics/distance'
2
+ require 'feldtruby/array/basic_stats'
3
+
4
+ module FeldtRuby
5
+
6
# Base class for linkage metrics, i.e. distances between two clusters of
# objects. It is a CompositeMetric with the SetDistance helpers mixed in, which
# gives subclasses pairwise_distances.
class ClusterLinkageMetric < CompositeMetric
  include SetDistance
end
9
+
10
# Average linkage metric between clusters - the sum of all pairwise distances
# divided by the number of member pairs.
class AverageLinkageMetric < ClusterLinkageMetric
  def calc(cluster1, cluster2)
    distances = pairwise_distances(cluster1, cluster2, @sub_metric)
    pair_count = cluster1.length * cluster2.length
    distances.sum.to_f / pair_count
  end
end
16
+
17
# Single linkage metric between clusters - distance between nearest members.
class SingleLinkageMetric < ClusterLinkageMetric
  def calc(cluster1, cluster2)
    distances = pairwise_distances(cluster1, cluster2, @sub_metric)
    distances.min
  end
end
23
+
24
# Complete linkage metric between clusters - distance between furthest members.
class CompleteLinkageMetric < ClusterLinkageMetric
  def calc(cluster1, cluster2)
    distances = pairwise_distances(cluster1, cluster2, @sub_metric)
    distances.max
  end
end
30
+
31
+ end
@@ -0,0 +1,35 @@
1
+ require 'feldtruby/statistics/euclidean_distance'
2
+
3
+ module FeldtRuby
4
+
5
# Abstract base class for distances between two objects.
class Distance
  # Calculate the distance between o1 and o2.
  # Raises NotImplementedError here; subclasses must override it.
  def calc(o1, o2)
    raise(NotImplementedError)
  end
end
10
+
11
# Mixin for distances that are built on top of another metric. It supplies a
# constructor which stores that metric in @sub_metric.
module CompositableDistance
  # metric:: the underlying metric; a new EuclideanDistance when none is given.
  def initialize(metric = EuclideanDistance.new)
    @sub_metric = metric
  end
end
16
+
17
# Functions specific to distances defined on sets of individual objects.
module SetDistance
  # Return a flat array with metric.calc(a, b) for every a in set1 and every
  # b in set2, ordered by set1 first and then by set2.
  def pairwise_distances(set1, set2, metric)
    distances = []
    set1.each do |a|
      set2.each { |b| distances << metric.calc(a, b) }
    end
    distances.flatten
  end
end
23
+
24
# A Metric is a Distance with particular properties. Those properties have to
# be ensured by the sub-classes, so nothing is defined here.
class Metric < Distance; end
28
+
29
# A CompositeMetric takes another metric as input and calculates a new
# distance based on it. The constructor comes from CompositableDistance.
class CompositeMetric < Metric
  include CompositableDistance
end
34
+
35
+ end
@@ -11,4 +11,8 @@ class EuclideanDistance
11
11
  end
12
12
  end
13
13
 
14
# Convenience wrapper: the distance between o1 and o2 as calculated by an
# EuclideanDistance instance, which is created on first use and then reused.
def euclidean_distance(o1, o2)
  @euclidean_distance ||= EuclideanDistance.new
  @euclidean_distance.calc(o1, o2)
end
17
+
14
18
  end
@@ -153,6 +153,54 @@ module Statistics
153
153
  end
154
154
  end
155
155
 
156
+ # Plot data sets in R with ggplot2 and save the resulting graphs to files.
157
+ module FeldtRuby::Statistics::Plotting
158
+
159
# Shared driver for the two-dimensional plots. It builds an R script that reads
# csvFilePath into the data frame +td+, opens graphFilePath as a PNG device of
# width x height, runs the ggplot lines returned by the block with a theme_bw
# layer added, and closes the device. The script is handed to +eval+.
#
# The block must return an array of strings making up one ggplot expression.
# xName, yName and title are not used here; callers use them inside the block.
#
# Relies on +include_library+ and +eval+ from the object this module is mixed
# into (presumably RCommunicator - its definition is not visible here).
def plot_2dims(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", width = 1200, height = 900)
  include_library("ggplot2")

  pre = [
    # The closing parenthesis was missing, which made the script invalid R.
    "td <- read.csv(#{csvFilePath.inspect})",
    "png(#{graphFilePath.inspect}, width=#{width}, height=#{height})"
  ]

  # Work on a copy so the strings owned by the caller are not modified.
  plot = yield.dup
  unless plot.empty?
    # ggplot layers are joined with "+". Drop any trailing "+" the block left
    # behind and add exactly one before the theme layer.
    last_layer = plot.last.sub(/\s*\+\s*\z/, "")
    plot[-1] = last_layer + " + theme_bw(base_size = 12, base_family = \"\")"
  end

  post = ["dev.off()"]

  eval((pre + plot + post).join("\n"))
end
179
+
180
# Scatter plot of columns xName vs yName in csvFilePath is saved to graphFilePath.
# When smoothFit is true a geom_smooth layer is added on top of the points.
def scatter_plot(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", smoothFit = true, width = 1200, height = 900)
  plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) do
    layers = [
      "ggplot(td, aes(#{xName}, #{yName})) + ",
      " geom_point(shape = 1) + " # Each point is non-filled circle
    ]
    layers << " geom_smooth() + " if smoothFit
    layers << " ggtitle(#{title.inspect})"
    layers
  end
end
191
+
192
# Hexbin heatmap of columns xName vs yName in csvFilePath is saved to graphFilePath.
# bins is passed on to geom_hex as its number of bins.
def hexbin_heatmap(csvFilePath, graphFilePath, xName, yName, title = "heatmap", bins = 30, width = 1200, height = 900)
  plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) {
    # title.inspect (as in scatter_plot) quotes the title and escapes embedded
    # double quotes, which raw interpolation between \"...\" did not.
    [ "ggplot(td, aes(#{xName}, #{yName})) + geom_hex( bins = #{bins} ) + ggtitle(#{title.inspect})"]
  }
end
198
+ end
199
+
200
# Mix the Plotting helpers into RCommunicator, so that plot_2dims, scatter_plot
# and hexbin_heatmap become instance methods of it.
+ class FeldtRuby::Statistics::RCommunicator
201
+ include FeldtRuby::Statistics::Plotting
202
+ end
203
+
156
204
  # Make them available at top level
157
205
  extend Statistics
158
206
 
@@ -1,3 +1,3 @@
1
1
  module FeldtRuby
2
- VERSION = "0.3.8"
2
+ VERSION = "0.3.9"
3
3
  end
@@ -0,0 +1,27 @@
1
+ require 'zlib'
2
+ require 'feldtruby/array/basic_stats'
3
+
4
# Build a random string by picking +length+ elements from +alphabet+
# (uniformly, via rand) and joining them.
def rand_string(length, alphabet)
  alphabet_size = alphabet.length
  picks = (1..length).map do
    alphabet[rand(alphabet_size)]
  end
  picks.join
end
8
+
9
# Return +string+ compressed with zlib's deflate at the default level.
def compress(string)
  Zlib.deflate(string)
end
12
+
13
# Ratio between the compressed length and the original length of +string+,
# as a float. Values below 1.0 mean the string got shorter.
def compression_ratio(string)
  compressed_length = compress(string).length
  compressed_length.fdiv(string.length)
end
16
+
17
# Print compression statistics for random strings over +alphabet+.
# For each string length, 1000 random strings are compressed and one line is
# printed with: the length, the mean compressed length, the mean overhead
# (compressed minus original length) and the mean compression ratio.
def info_for_alphabet(alphabet)
  puts "for alphabet = #{alphabet.inspect}"
  lengths = [1,5,10,15,20,25,30,35,40,45,50,60,70]
  lengths.each do |len|
    compressed_lengths = (1..1000).map { compress(rand_string(len, alphabet)).length }
    avg_c_len = compressed_lengths.mean
    puts(format("#{len}: %.2f, %.2f, %.2f", avg_c_len, avg_c_len - len, avg_c_len / len.to_f))
  end
end
24
+
25
# Report for alphabets of decreasing size: a-z, a-d and a-b.
%w[z d b].each do |last_letter|
  info_for_alphabet(("a"..last_letter).to_a)
end
@@ -0,0 +1,65 @@
1
+ require 'feldtruby/statistics/array_archive'
2
+
3
# Specs for the per-position min/max/mean archive. The class is named
# MinMaxMeanPerPositionArchive; the spec previously instantiated a
# MinMaxAveragePerPositionArchive constant that does not exist.
describe "MinMaxMeanPerPositionArchive" do
  before do
    @a = FeldtRuby::MinMaxMeanPerPositionArchive.new
  end

  it "updates the counts as we add arrays" do
    @a.count.must_equal 0
    @a.update([1,2,3])
    @a.count.must_equal 1
    @a.update([4,5,6])
    @a.count.must_equal 2
    @a.update([1,2,3])
    @a.count.must_equal 3
  end

  it "correctly updates the min values" do
    # must_be_nil rather than must_equal nil, which newer minitest rejects.
    @a.min_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.min_for_position(0).must_equal 1
    @a.min_for_position(1).must_equal 2
    @a.min_for_position(2).must_equal 3
    @a.mins.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.mins.must_equal [1,2,-2]
    @a.min_for_position(0).must_equal 1
    @a.min_for_position(1).must_equal 2
    # Parentheses avoid the "ambiguous first argument" parse of `must_equal -2`.
    @a.min_for_position(2).must_equal(-2)
  end

  it "correctly updates the max values" do
    @a.max_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.max_for_position(0).must_equal 1
    @a.max_for_position(1).must_equal 2
    @a.max_for_position(2).must_equal 3
    @a.maxs.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.maxs.must_equal [1,5,3]
    @a.max_for_position(0).must_equal 1
    @a.max_for_position(1).must_equal 5
    @a.max_for_position(2).must_equal 3
  end

  it "correctly updates the mean values" do
    @a.mean_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.mean_for_position(0).must_equal 1
    @a.mean_for_position(1).must_equal 2
    @a.mean_for_position(2).must_equal 3
    @a.means.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.means.must_equal [1, 3.5, 0.5]
    @a.mean_for_position(0).must_equal 1
    @a.mean_for_position(1).must_equal 3.5
    @a.mean_for_position(2).must_equal 0.5
  end
end
data/test/test_array.rb CHANGED
@@ -34,6 +34,12 @@ class TestFeldtRubyArray < MiniTest::Unit::TestCase
34
34
  end
35
35
 
36
36
  describe "Array extensions" do
37
# Array#map_with_index should yield each element together with its index.
describe "map_with_index" do
  it "calls the block with both the value and an index" do
    pairs = [1, 2, 3].map_with_index { |value, index| [value, index] }
    pairs.must_equal [[1, 0], [2, 1], [3, 2]]
  end
end
42
+
37
43
  describe "ranks" do
38
44
  it "works when elements are already in order" do
39
45
  [2.5, 1.5, 0.3].ranks.must_equal [1, 2, 3]
@@ -0,0 +1,53 @@
1
+ require 'feldtruby/statistics/clustering'
2
+ require 'feldtruby/statistics/euclidean_distance'
3
+
4
# Specs for the three cluster linkage metrics. Each metric is built with its
# default sub-metric; clusters are arrays of one-dimensional points.
describe "Clustering linkage metrics - i.e. distance between clusters of objects in a set" do
  describe "Average linkage metric" do
    before do
      @alm = FeldtRuby::AverageLinkageMetric.new
    end

    it "can be calculated on clusters of with only one number each" do
      @alm.calc([[1.0]], [[1.0]]).must_equal 0.0
      @alm.calc([[0.0]], [[1]]).must_equal 1.0
    end

    it "can be calculated on clusters of with several numbers in them" do
      @alm.calc([[1], [0]], [[1], [0]]).must_equal 0.5
      @alm.calc([[1], [0], [2]], [[1], [0]]).must_equal(5.0 / 6)
      @alm.calc([[1], [0], [2]], [[1], [0], [3]]).must_equal(11.0 / 9)
    end
  end

  describe "Single linkage metric" do
    before do
      @slm = FeldtRuby::SingleLinkageMetric.new
    end

    it "can be calculated on clusters of with only one float number each" do
      @slm.calc([[1.0]], [[1.0]]).must_equal 0.0
      @slm.calc([[0.0]], [[1.0]]).must_equal 1.0
    end

    it "can be calculated on clusters of with several numbers in them" do
      @slm.calc([[1], [0]], [[1], [0]]).must_equal 0.0
      @slm.calc([[1], [2]], [[1], [0]]).must_equal 0.0
      @slm.calc([[1], [2]], [[3], [5]]).must_equal 1.0
      @slm.calc([[1], [2], [3]], [[3], [5]]).must_equal 0.0
      @slm.calc([[1], [2], [3]], [[6], [7]]).must_equal 3.0
    end
  end

  describe "Complete linkage metric" do
    before do
      @clm = FeldtRuby::CompleteLinkageMetric.new
    end

    it "can be calculated on clusters of with only one float number each" do
      @clm.calc([[1.0]], [[1.0]]).must_equal 0.0
      @clm.calc([[0.0]], [[1.0]]).must_equal 1.0
    end

    it "can be calculated on clusters of with several numbers in them" do
      @clm.calc([[1], [0]], [[1], [0]]).must_equal 1.0
      @clm.calc([[1], [2]], [[1], [0]]).must_equal 2.0
      @clm.calc([[1], [2]], [[3], [5]]).must_equal 4.0
      @clm.calc([[1], [2], [3]], [[3], [5]]).must_equal 4.0
      @clm.calc([[1], [2], [3]], [[6], [7]]).must_equal 6.0
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'feldtruby/statistics/euclidean_distance'
2
+ include FeldtRuby
3
+
4
# Specs for the euclidean_distance convenience method, for float and integer
# vectors of length one to three.
describe "Euclidean distance" do
  it "can be calculated on float vectors of length 1" do
    distance = euclidean_distance([1.0], [1.0])
    distance.must_equal 0.0
  end

  it "can be calculated on float vectors of length 2" do
    distance = euclidean_distance([2.0, -1.0], [-2.0, 2.0])
    distance.must_equal 5.0
  end

  it "can be calculated on float vectors of length 3" do
    distance = euclidean_distance([1.0, 2.0, 3.0], [4.0, 5.0, 6.0])
    distance.must_be_close_to 5.196152
  end

  it "can be calculated on int vectors of length 1" do
    distance = euclidean_distance([1], [1])
    distance.must_equal 0.0
  end

  it "can be calculated on int vectors of length 2" do
    distance = euclidean_distance([2, -1], [-2, 2])
    distance.must_equal 5.0
  end

  it "can be calculated on int vectors of length 3" do
    distance = euclidean_distance([1, 2, 3], [4, 5, 6])
    distance.must_be_close_to 5.196152
  end
end