feldtruby 0.3.8 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
1
+ module FeldtRuby::Optimize
2
+
3
# Base class for comparators that rank two individuals from their vectors of
# sub-qualities, i.e. decide whether one of them is better than (dominates)
# the other. The actual rule is supplied by subclasses.
class SubQualitiesComparator
  # objective - object that answers sub_qualities_of(candidate); it is only
  #             consulted by compare_candidates.
  def initialize(objective)
    @objective = objective
  end

  # Contract for subclasses: compare two sub-quality vectors and return
  #   -1 if the first one dominates the second one,
  #    0 if neither of them dominates the other,
  #    1 if the second one dominates the first one.
  # This base implementation always raises NotImplementedError.
  def compare_sub_qualitites(subQualitites1, subQualitites2)
    raise NotImplementedError
  end

  # Fetch the sub-qualities of both candidates from the objective and compare
  # them with compare_sub_qualitites. Returns whatever that comparison returns.
  def compare_candidates(candidate1, candidate2)
    first = @objective.sub_qualities_of(candidate1)
    second = @objective.sub_qualities_of(candidate2)
    compare_sub_qualitites(first, second)
  end

  # True iff the first sub-quality vector dominates the second one.
  def first_dominates?(subQualitites1, subQualitites2)
    verdict = compare_sub_qualitites(subQualitites1, subQualitites2)
    verdict == -1
  end

  # True iff the second sub-quality vector dominates the first one.
  def second_dominates?(subQualitites1, subQualitites2)
    verdict = compare_sub_qualitites(subQualitites1, subQualitites2)
    verdict == 1
  end
end
33
+
34
# Non-dominance comparator with an epsilon tolerance: two sub-quality values
# only count as different when they are more than epsilon apart. With the
# default epsilon of 0.0 this is the standard non-dominance comparator.
class EpsilonNonDominance < SubQualitiesComparator
  # objective - passed on to SubQualitiesComparator.
  # epsilon   - values whose absolute difference is at most epsilon are
  #             treated as equal.
  def initialize(objective, epsilon = 0.0)
    super(objective)
    @epsilon = epsilon
  end

  # Apply the hat operator to each pair of sub-quality values. The result has
  # one entry per pair: -1 when the first value is better, 1 when the second
  # value is better and 0 when they are within epsilon of each other.
  #
  # NOTE! All sub-objectives are assumed to be minimized, so the smaller value
  # is the better one. If that is not the case the sign of the returned values
  # should be changed.
  def map_hat_operator(sq1, sq2)
    sq1.zip(sq2).map do |first, second|
      next 0 unless (first - second).abs > @epsilon
      first < second ? -1 : 1
    end
  end

  # Returns -1 when the first vector is better in at least one position and
  # the second in none, 1 in the mirrored situation, and 0 when neither
  # vector dominates (both are better somewhere, or no position differs).
  def compare_sub_qualitites(subQualitites1, subQualitites2)
    hats = map_hat_operator(subQualitites1, subQualitites2)
    first_wins = hats.count(-1)
    second_wins = hats.count(1)

    return 0 if first_wins > 0 && second_wins > 0
    return -1 if first_wins > 0
    return 1 if second_wins > 0
    0
  end
end
72
+
73
+ end
@@ -0,0 +1,66 @@
1
+ require 'feldtruby/array/basic_stats.rb'
2
+
3
module FeldtRuby

  # A ValueArchive keeps basic statistics about the arrays of values supplied
  # to it. This base class only counts how many arrays it has been given.
  class ValueArchive
    def initialize
      @count = 0
    end

    # Returns the number of times an array has been added to the archive.
    attr_reader :count

    # Register one more array. The values themselves are ignored here;
    # subclasses call super and then update their own statistics.
    def update(values)
      @count += 1
    end
  end

  # A PositionBasedValueArchive assumes that each individual position in the
  # supplied value arrays has semantic meaning, so properties calculated and
  # saved for one position must not be mixed with those of another position.
  class PositionBasedValueArchive < ValueArchive
  end

  # A MinMaxMeanPerPositionArchive keeps the min, max and mean value for each
  # position in the supplied arrays. It can thus be used for
  # min-max-normalization of the values in each position.
  #
  # Arrays of different lengths are supported: a position that is missing (or
  # nil) in an update keeps its previous statistics, and the mean of a
  # position is taken over the values actually seen in that position.
  class MinMaxMeanPerPositionArchive < PositionBasedValueArchive
    attr_reader :mins, :maxs

    def initialize
      super
      # @counts[i] is the number of non-nil values seen so far in position i.
      @mins, @maxs, @sums, @counts = [], [], [], []
    end

    # Add an array of values and fold each value into the statistics of its
    # position.
    def update(values)
      super
      seen = values.map { |v| v.nil? ? nil : 1 }
      @counts = update_statistic_per_position(@counts, seen) { |newold| newold.compact.sum }
      @mins = update_statistic_per_position(@mins, values) { |newold| newold.compact.min }
      @maxs = update_statistic_per_position(@maxs, values) { |newold| newold.compact.max }
      @sums = update_statistic_per_position(@sums, values) { |newold| newold.compact.sum }
    end

    # Build the new per-position statistics. The block is called once per
    # position with the pair [new_value, current_statistic]; either element is
    # nil when it does not exist for that position. The result covers every
    # position present in either array, so a short update never discards the
    # statistics already stored for later positions.
    def update_statistic_per_position(currentStatistics, values, &updateStatistic)
      positions = [values.length, currentStatistics.length].max
      Array.new(positions) { |i| updateStatistic.call([values[i], currentStatistics[i]]) }
    end

    # Return the minimum value we have seen so far in position _index_.
    def min_for_position(index)
      @mins[index]
    end

    # Return the maximum value we have seen so far in position _index_.
    def max_for_position(index)
      @maxs[index]
    end

    # Return the mean of the values we have seen so far in position _index_,
    # or nil if no value has been seen in that position.
    def mean_for_position(index)
      seen = @counts[index]
      @sums[index] / seen.to_f if seen && seen > 0
    end

    # Return the means of all positions, in position order.
    def means
      @sums.each_index.map { |index| mean_for_position(index) }
    end
  end

end
@@ -0,0 +1,31 @@
1
+ require 'feldtruby/statistics/distance'
2
+ require 'feldtruby/array/basic_stats'
3
+
4
module FeldtRuby

  # Common base for linkage metrics, i.e. distances between two clusters of
  # objects. Mixes in SetDistance, whose pairwise_distances is used by every
  # subclass together with the @sub_metric instance variable.
  class ClusterLinkageMetric < CompositeMetric
    include SetDistance
  end

  # Average linkage metric between clusters: the sum of all pairwise member
  # distances divided by the number of member pairs.
  class AverageLinkageMetric < ClusterLinkageMetric
    def calc(cluster1, cluster2)
      distances = pairwise_distances(cluster1, cluster2, @sub_metric)
      total = distances.sum.to_f
      total / (cluster1.length * cluster2.length)
    end
  end

  # Single linkage metric between clusters - distance between nearest members.
  class SingleLinkageMetric < ClusterLinkageMetric
    def calc(cluster1, cluster2)
      distances = pairwise_distances(cluster1, cluster2, @sub_metric)
      distances.min
    end
  end

  # Complete linkage metric between clusters - distance between furthest members.
  class CompleteLinkageMetric < ClusterLinkageMetric
    def calc(cluster1, cluster2)
      distances = pairwise_distances(cluster1, cluster2, @sub_metric)
      distances.max
    end
  end

end
@@ -0,0 +1,35 @@
1
+ require 'feldtruby/statistics/euclidean_distance'
2
+
3
module FeldtRuby

  # Abstract distance between two objects. Subclasses implement calc.
  class Distance
    # Return the distance between o1 and o2. This base implementation always
    # raises NotImplementedError.
    def calc(o1, o2)
      raise NotImplementedError
    end
  end

  # Mixin for distances that are built on top of another metric. The wrapped
  # metric is stored in @sub_metric; when none is given a new
  # EuclideanDistance is created.
  module CompositableDistance
    def initialize(metric = EuclideanDistance.new)
      @sub_metric = metric
    end
  end

  # Functions specific to distances defined on sets of individual objects.
  module SetDistance
    # Return a flat array with metric.calc(a, b) for every a in set1 and every
    # b in set2, ordered by a first and then by b. Only the nesting created by
    # the pairing itself is flattened, so the values returned by metric.calc
    # are passed through untouched.
    def pairwise_distances(set1, set2, metric)
      set1.flat_map { |a| set2.map { |b| metric.calc(a, b) } }
    end
  end

  # Metric is a Distance with particular properties. They need to be ensured
  # in sub-classes so they are not defined here.
  class Metric < Distance
  end

  # A CompositeMetric takes another metric as input and calculates a new
  # distance based on it.
  class CompositeMetric < Metric
    include CompositableDistance
  end

end
@@ -11,4 +11,8 @@ class EuclideanDistance
11
11
  end
12
12
  end
13
13
 
14
# Convenience wrapper: distance between o1 and o2 as computed by
# EuclideanDistance#calc. The EuclideanDistance instance is created on first
# use and then reused on later calls.
def euclidean_distance(o1, o2)
  calculator = (@euclidean_distance ||= EuclideanDistance.new)
  calculator.calc(o1, o2)
end
17
+
14
18
  end
@@ -153,6 +153,54 @@ module Statistics
153
153
  end
154
154
  end
155
155
 
156
+ # Plot data sets in R with ggplot2 and save the plots to files.
157
+ module FeldtRuby::Statistics::Plotting
158
+
159
# Shared driver for the two-dimensional ggplot2 plots. Builds an R script that
# reads csvFilePath into the data frame `td`, opens graphFilePath as a PNG
# device of the given size, draws the plot and closes the device, and then
# hands the script to eval.
#
# The block must return an array of R source lines that together form one
# ggplot expression; a black-and-white theme is added to its last line.
# The strings returned by the block are not modified.
#
# xName, yName and title are accepted so that all plot methods share one
# signature; they are not used here (the block is expected to use them).
#
# NOTE(review): include_library and eval must be provided by the class this
# module is mixed into; presumably eval sends the script to R - confirm
# against RCommunicator.
def plot_2dims(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", width = 1200, height = 900)
  include_library("ggplot2")

  pre = [
    "td <- read.csv(#{csvFilePath.inspect})",
    "png(#{graphFilePath.inspect}, width=#{width}, height=#{height})"
  ]

  # ggplot layers are chained with "+", so the theme must be joined to the
  # last plot line with an explicit " + ".
  *head, last = yield
  plot = head << "#{last} + theme_bw(base_size = 12, base_family = \"\")"

  post = ["dev.off()"]

  eval((pre + plot + post).join("\n"))
end
179
+
180
# Save a scatter plot of column xName against column yName of the CSV file
# csvFilePath to graphFilePath. When smoothFit is true a geom_smooth() layer
# is added to the plot. Everything else is delegated to plot_2dims.
def scatter_plot(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", smoothFit = true, width = 1200, height = 900)
  plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) do
    layers = ["ggplot(td, aes(#{xName}, #{yName})) + "]
    layers << " geom_point(shape = 1) + " # Each point is a non-filled circle
    layers << " geom_smooth() + " if smoothFit
    layers << " ggtitle(#{title.inspect})"
    layers
  end
end
191
+
192
# Save a hexagonal-binning heatmap (geom_hex with the given number of bins) of
# column xName against column yName of the CSV file csvFilePath to
# graphFilePath. Everything except the plot expression is delegated to
# plot_2dims.
def hexbin_heatmap(csvFilePath, graphFilePath, xName, yName, title = "heatmap", bins = 30, width = 1200, height = 900)
  plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) {
    # title.inspect quotes the title and escapes embedded quotes and
    # backslashes, the same way scatter_plot does.
    [ "ggplot(td, aes(#{xName}, #{yName})) + geom_hex( bins = #{bins} ) + ggtitle(#{title.inspect})"]
  }
end
198
+ end
199
+
200
# Mix the ggplot2 plotting helpers from the Plotting module (plot_2dims,
# scatter_plot, hexbin_heatmap) into RCommunicator so that they can be called
# on its instances.
+ class FeldtRuby::Statistics::RCommunicator
201
+ include FeldtRuby::Statistics::Plotting
202
+ end
203
+
156
204
  # Make them available at top level
157
205
  extend Statistics
158
206
 
@@ -1,3 +1,3 @@
1
1
  module FeldtRuby
2
- VERSION = "0.3.8"
2
+ VERSION = "0.3.9"
3
3
  end
@@ -0,0 +1,27 @@
1
+ require 'zlib'
2
+ require 'feldtruby/array/basic_stats'
3
+
4
# Build a string of the given length by drawing, for each character, a random
# element of alphabet (using Kernel#rand) and joining the drawn elements.
def rand_string(length, alphabet)
  symbol_count = alphabet.length
  1.upto(length).map { alphabet[rand(symbol_count)] }.join
end
8
+
9
# Return string compressed with zlib's deflate at the default settings.
def compress(string)
  Zlib.deflate(string)
end
12
+
13
# Ratio between the length of the compressed string and the length of the
# original string, as a Float. Values below 1.0 mean that compress made the
# string shorter.
def compression_ratio(string)
  compressed_length = compress(string).length
  compressed_length.to_f / string.length
end
16
+
17
# Print a small report on how well random strings over alphabet compress.
# For each of a fixed set of string lengths, 1000 random strings are generated
# and compressed; the printed line shows the length followed by the average
# compressed length, the average growth in length and the average ratio
# between compressed and original length.
def info_for_alphabet(alphabet)
  puts "for alphabet = #{alphabet.inspect}"
  lengths = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70]
  lengths.each do |len|
    compressed_lengths = Array.new(1000) { compress(rand_string(len, alphabet)).length }
    avg_c_len = compressed_lengths.mean
    growth = avg_c_len - len
    ratio = avg_c_len / len.to_f
    puts(format("#{len}: %.2f, %.2f, %.2f", avg_c_len, growth, ratio))
  end
end
24
+
25
# Report for alphabets of decreasing size: 26, 4 and 2 letters.
[("a".."z"), ("a".."d"), ("a".."b")].each do |letters|
  info_for_alphabet(letters.to_a)
end
@@ -0,0 +1,65 @@
1
+ require 'feldtruby/statistics/array_archive'
2
+
3
# Specification of FeldtRuby::MinMaxMeanPerPositionArchive: the update count
# and the per-position min, max and mean values.
describe "MinMaxAveragePositionArchive" do
  before do
    # The class under test is named MinMaxMeanPerPositionArchive.
    @a = FeldtRuby::MinMaxMeanPerPositionArchive.new
  end

  it "updates the counts as we add arrays" do
    @a.count.must_equal 0
    @a.update([1,2,3])
    @a.count.must_equal 1
    @a.update([4,5,6])
    @a.count.must_equal 2
    @a.update([1,2,3])
    @a.count.must_equal 3
  end

  it "correctly updates the min values" do
    @a.min_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.min_for_position(0).must_equal 1
    @a.min_for_position(1).must_equal 2
    @a.min_for_position(2).must_equal 3
    @a.mins.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.mins.must_equal [1,2,-2]
    @a.min_for_position(0).must_equal 1
    @a.min_for_position(1).must_equal 2
    @a.min_for_position(2).must_equal(-2)
  end

  it "correctly updates the max values" do
    @a.max_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.max_for_position(0).must_equal 1
    @a.max_for_position(1).must_equal 2
    @a.max_for_position(2).must_equal 3
    @a.maxs.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.maxs.must_equal [1,5,3]
    @a.max_for_position(0).must_equal 1
    @a.max_for_position(1).must_equal 5
    @a.max_for_position(2).must_equal 3
  end

  it "correctly updates the mean values" do
    @a.mean_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.mean_for_position(0).must_equal 1
    @a.mean_for_position(1).must_equal 2
    @a.mean_for_position(2).must_equal 3
    @a.means.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.means.must_equal [1, 3.5, 0.5]
    @a.mean_for_position(0).must_equal 1
    @a.mean_for_position(1).must_equal 3.5
    @a.mean_for_position(2).must_equal 0.5
  end
end
data/test/test_array.rb CHANGED
@@ -34,6 +34,12 @@ class TestFeldtRubyArray < MiniTest::Unit::TestCase
34
34
  end
35
35
 
36
36
  describe "Array extensions" do
37
# Array#map_with_index yields each element together with its zero-based index
# and collects the block results.
describe "map_with_index" do
  it "calls the block with both the value and an index" do
    pairs = [1, 2, 3].map_with_index { |value, index| [value, index] }
    pairs.must_equal [[1, 0], [2, 1], [3, 2]]
  end
end
42
+
37
43
  describe "ranks" do
38
44
  it "works when elements are already in order" do
39
45
  [2.5, 1.5, 0.3].ranks.must_equal [1, 2, 3]
@@ -0,0 +1,53 @@
1
+ require 'feldtruby/statistics/clustering'
2
+ require 'feldtruby/statistics/euclidean_distance'
3
+
4
# Specification of the cluster linkage metrics. Each metric is created with
# its default sub-metric and applied to clusters of one-dimensional points.
describe "Clustering linkage metrics - i.e. distance between clusters of objects in a set" do
  describe "Average linkage metric" do
    it "can be calculated on clusters of with only one number each" do
      metric = FeldtRuby::AverageLinkageMetric.new
      metric.calc([[1.0]], [[1.0]]).must_equal 0.0
      metric.calc([[0.0]], [[1]]).must_equal 1.0
    end

    it "can be calculated on clusters of with several numbers in them" do
      metric = FeldtRuby::AverageLinkageMetric.new
      [
        [[[1], [0]],      [[1], [0]],      0.5],
        [[[1], [0], [2]], [[1], [0]],      5.0 / 6],
        [[[1], [0], [2]], [[1], [0], [3]], 11.0 / 9]
      ].each do |cluster1, cluster2, expected|
        metric.calc(cluster1, cluster2).must_equal expected
      end
    end
  end

  describe "Single linkage metric" do
    it "can be calculated on clusters of with only one float number each" do
      metric = FeldtRuby::SingleLinkageMetric.new
      metric.calc([[1.0]], [[1.0]]).must_equal 0.0
      metric.calc([[0.0]], [[1.0]]).must_equal 1.0
    end

    it "can be calculated on clusters of with several numbers in them" do
      metric = FeldtRuby::SingleLinkageMetric.new
      [
        [[[1], [0]],      [[1], [0]], 0.0],
        [[[1], [2]],      [[1], [0]], 0.0],
        [[[1], [2]],      [[3], [5]], 1.0],
        [[[1], [2], [3]], [[3], [5]], 0.0],
        [[[1], [2], [3]], [[6], [7]], 3.0]
      ].each do |cluster1, cluster2, expected|
        metric.calc(cluster1, cluster2).must_equal expected
      end
    end
  end

  describe "Complete linkage metric" do
    it "can be calculated on clusters of with only one float number each" do
      metric = FeldtRuby::CompleteLinkageMetric.new
      metric.calc([[1.0]], [[1.0]]).must_equal 0.0
      metric.calc([[0.0]], [[1.0]]).must_equal 1.0
    end

    it "can be calculated on clusters of with several numbers in them" do
      metric = FeldtRuby::CompleteLinkageMetric.new
      [
        [[[1], [0]],      [[1], [0]], 1.0],
        [[[1], [2]],      [[1], [0]], 2.0],
        [[[1], [2]],      [[3], [5]], 4.0],
        [[[1], [2], [3]], [[3], [5]], 4.0],
        [[[1], [2], [3]], [[6], [7]], 6.0]
      ].each do |cluster1, cluster2, expected|
        metric.calc(cluster1, cluster2).must_equal expected
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'feldtruby/statistics/euclidean_distance'
2
+ include FeldtRuby
3
+
4
# Specification of the euclidean_distance helper for float and integer
# vectors of length 1 to 3.
describe "Euclidean distance" do
  it "can be calculated on float vectors of length 1" do
    distance = euclidean_distance([1.0], [1.0])
    distance.must_equal 0.0
  end

  it "can be calculated on float vectors of length 2" do
    distance = euclidean_distance([2.0, -1.0], [-2.0, 2.0])
    distance.must_equal 5.0
  end

  it "can be calculated on float vectors of length 3" do
    distance = euclidean_distance([1.0, 2.0, 3.0], [4.0, 5.0, 6.0])
    distance.must_be_close_to 5.196152
  end

  it "can be calculated on int vectors of length 1" do
    distance = euclidean_distance([1], [1])
    distance.must_equal 0.0
  end

  it "can be calculated on int vectors of length 2" do
    distance = euclidean_distance([2, -1], [-2, 2])
    distance.must_equal 5.0
  end

  it "can be calculated on int vectors of length 3" do
    distance = euclidean_distance([1, 2, 3], [4, 5, 6])
    distance.must_be_close_to 5.196152
  end
end