feldtruby 0.3.8 → 0.3.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/README.md +9 -11
- data/Rakefile +21 -4
- data/lib/feldtruby/array.rb +5 -0
- data/lib/feldtruby/minitest_extensions.rb +10 -0
- data/lib/feldtruby/optimize/objective.rb +168 -125
- data/lib/feldtruby/optimize/search_space.rb +105 -9
- data/lib/feldtruby/optimize/sub_qualities_comparators.rb +73 -0
- data/lib/feldtruby/statistics/array_archive.rb +66 -0
- data/lib/feldtruby/statistics/clustering.rb +31 -0
- data/lib/feldtruby/statistics/distance.rb +35 -0
- data/lib/feldtruby/statistics/euclidean_distance.rb +4 -0
- data/lib/feldtruby/statistics.rb +48 -0
- data/lib/feldtruby/version.rb +1 -1
- data/spikes/zlib_for_short_strings.rb +27 -0
- data/test/skip_test_array_archive.rb +65 -0
- data/test/test_array.rb +6 -0
- data/test/test_clustering.rb +53 -0
- data/test/test_euclidean_distance.rb +28 -0
- data/test/test_optimize_objective.rb +133 -93
- data/test/test_optimize_search_space.rb +54 -0
- data/test/test_sax.rb +14 -1
- data/test/test_sub_qualitites_comparator.rb +109 -0
- metadata +15 -2
@@ -0,0 +1,73 @@
|
|
1
|
+
module FeldtRuby::Optimize
|
2
|
+
|
3
|
+
# A SubQualitiesComparator compares the sub-quality vectors of two individuals
# and ranks the individuals according to whether one is better than (i.e.
# dominates) the other. This base class only fixes the protocol; subclasses
# implement compare_sub_qualitites.
class SubQualitiesComparator
  # objective must respond to sub_qualities_of(candidate); it is only used
  # by compare_candidates.
  def initialize(objective)
    @objective = objective
  end

  # Compare two sub-quality vectors. Implementations return
  #   -1 if the first one dominates the second one
  #    0 if neither of them dominates the other
  #    1 if the second one dominates the first one
  # The base class has no comparison rule and raises NotImplementedError.
  def compare_sub_qualitites(subQualitites1, subQualitites2)
    raise NotImplementedError
  end

  # Compare two candidates through the sub-qualities the objective reports
  # for each of them.
  def compare_candidates(candidate1, candidate2)
    first_vector = @objective.sub_qualities_of(candidate1)
    second_vector = @objective.sub_qualities_of(candidate2)
    compare_sub_qualitites(first_vector, second_vector)
  end

  # True iff the first sub-quality vector dominates the second.
  def first_dominates?(subQualitites1, subQualitites2)
    verdict = compare_sub_qualitites(subQualitites1, subQualitites2)
    verdict == -1
  end

  # True iff the second sub-quality vector dominates the first.
  def second_dominates?(subQualitites1, subQualitites2)
    verdict = compare_sub_qualitites(subQualitites1, subQualitites2)
    verdict == 1
  end
end
|
33
|
+
|
34
|
+
# Epsilon-distance non-dominance comparator. Two sub-quality values that
# differ by at most epsilon count as equal. The default epsilon of 0.0 gives
# the standard non-dominance comparator.
class EpsilonNonDominance < SubQualitiesComparator
  def initialize(objective, epsilon = 0.0)
    super(objective)
    @epsilon = epsilon
  end

  # Apply the hat operator to each pair of sub-quality values. Per pair the
  # result is
  #   -1 if the first value is smaller by more than epsilon,
  #    1 if the first value is larger by more than epsilon,
  #    0 if the values are within epsilon of each other.
  #
  # NOTE! This assumes that all sub-objectives should be minimized. If not,
  # the sign of the hat operator's return value must be changed.
  def map_hat_operator(sq1, sq2)
    sq1.zip(sq2).map do |left, right|
      next 0 unless (left - right).abs > @epsilon

      left < right ? -1 : 1
    end
  end

  # Returns -1 when the first vector is better in at least one position and
  # the second in none, 1 in the mirrored case, and 0 when neither or both
  # vectors are better somewhere.
  def compare_sub_qualitites(subQualitites1, subQualitites2)
    hats = map_hat_operator(subQualitites1, subQualitites2)
    first_better = hats.any? { |hat| hat == -1 }
    second_better = hats.any? { |hat| hat == 1 }

    # Both better somewhere, or neither better anywhere: no dominance.
    return 0 if first_better == second_better

    first_better ? -1 : 1
  end
end
|
72
|
+
|
73
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'feldtruby/array/basic_stats.rb'
|
2
|
+
|
3
|
+
module FeldtRuby
|
4
|
+
|
5
|
+
# A ValueArchive keeps basic statistics about the value arrays supplied to it.
# The base class only tracks how many arrays have been added.
class ValueArchive
  # Returns the number of times an array has been added to the archive.
  attr_reader :count

  def initialize
    @count = 0
  end

  # Register one more array. The values themselves are not inspected here;
  # subclasses that keep per-value statistics call super first.
  def update(values)
    @count = @count.succ
  end
end
|
18
|
+
|
19
|
+
# A PositionBasedValueArchive assumes that each individual position in the
# supplied value arrays has its own semantic meaning, so the properties that
# are calculated and saved must not be mixed between positions.
class PositionBasedValueArchive < ValueArchive; end
|
24
|
+
|
25
|
+
# A MinMaxMeanPerPositionArchive keeps the min, max and mean values for each
# position in the supplied arrays. It can thus be used for
# min-max-normalization of the values in each position.
class MinMaxMeanPerPositionArchive < PositionBasedValueArchive
  attr_reader :mins, :maxs

  def initialize
    super
    @mins = []
    @maxs = []
    @sums = []
  end

  # Add one array of values and refresh the per-position min, max and sum.
  def update(values)
    super
    @mins = update_statistic_per_position(@mins, values) { |pair| pair.compact.min }
    @maxs = update_statistic_per_position(@maxs, values) { |pair| pair.compact.max }
    @sums = update_statistic_per_position(@sums, values) { |pair| pair.compact.sum }
  end

  # Pair each new value with the statistic currently stored for its position
  # (nil when nothing is stored yet) and let the block compute the updated
  # statistic from that [new, old] pair.
  def update_statistic_per_position(currentStatistics, values, &updateStatistic)
    new_old_pairs = values.zip(currentStatistics)
    new_old_pairs.map { |pair| updateStatistic.call(pair) }
  end

  # Return the minimum value we have seen so far in position _index_.
  def min_for_position(index)
    @mins[index]
  end

  # Return the maximum value we have seen so far in position _index_.
  def max_for_position(index)
    @maxs[index]
  end

  # Return the mean of the values we have seen so far in position _index_,
  # or nil when no sum is stored for that position.
  def mean_for_position(index)
    total = @sums[index]
    total / @count.to_f if total
  end

  # Means for all positions: each stored sum divided by the update count.
  def means
    divisor = @count.to_f
    @sums.map { |total| total / divisor }
  end
end
|
65
|
+
|
66
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'feldtruby/statistics/distance'
|
2
|
+
require 'feldtruby/array/basic_stats'
|
3
|
+
|
4
|
+
module FeldtRuby
|
5
|
+
|
6
|
+
# Common base for the linkage metrics below: a CompositeMetric that also
# mixes in the SetDistance helpers.
class ClusterLinkageMetric < CompositeMetric
  include SetDistance
end
|
9
|
+
|
10
|
+
# Average linkage metric between clusters - the sum of all pairwise member
# distances divided by the number of pairs.
class AverageLinkageMetric < ClusterLinkageMetric
  def calc(cluster1, cluster2)
    total = pairwise_distances(cluster1, cluster2, @sub_metric).sum
    pair_count = cluster1.length * cluster2.length
    total.to_f / pair_count
  end
end
|
16
|
+
|
17
|
+
# Single linkage metric between clusters - distance between nearest members.
class SingleLinkageMetric < ClusterLinkageMetric
  def calc(cluster1, cluster2)
    distances = pairwise_distances(cluster1, cluster2, @sub_metric)
    distances.min
  end
end
|
23
|
+
|
24
|
+
# Complete linkage metric between clusters - distance between furthest members.
class CompleteLinkageMetric < ClusterLinkageMetric
  def calc(cluster1, cluster2)
    distances = pairwise_distances(cluster1, cluster2, @sub_metric)
    distances.max
  end
end
|
30
|
+
|
31
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'feldtruby/statistics/euclidean_distance'
|
2
|
+
|
3
|
+
module FeldtRuby
|
4
|
+
|
5
|
+
# Abstract distance between two objects. Subclasses implement calc; the base
# implementation raises NotImplementedError.
class Distance
  # Distance between o1 and o2 - to be defined by subclasses.
  def calc(o1, o2)
    raise(NotImplementedError)
  end
end
|
10
|
+
|
11
|
+
# Mixin for distances that are built on top of another metric. It supplies
# an initialize that stores the wrapped metric in @sub_metric.
module CompositableDistance
  # metric: the metric to build on; a new EuclideanDistance when omitted.
  def initialize(metric = EuclideanDistance.new)
    @sub_metric = metric
  end
end
|
16
|
+
|
17
|
+
# Functions specific to distances defined on sets of individual objects.
module SetDistance
  # Flat list of metric.calc(a, b) for every a in set1 paired with every b
  # in set2, ordered by set1 first and set2 second.
  def pairwise_distances(set1, set2, metric)
    distances = []
    set1.each do |a|
      distances << set2.map { |b| metric.calc(a, b) }
    end
    distances.flatten
  end
end
|
23
|
+
|
24
|
+
# A Metric is a Distance with particular properties. Those properties have to
# be ensured in the sub-classes, so nothing is defined here.
class Metric < Distance; end
|
28
|
+
|
29
|
+
# A CompositeMetric takes another metric as input and calculates a new
# distance based on it. The wrapped metric is set up by CompositableDistance.
class CompositeMetric < Metric
  include CompositableDistance
end
|
34
|
+
|
35
|
+
end
|
data/lib/feldtruby/statistics.rb
CHANGED
@@ -153,6 +153,54 @@ module Statistics
|
|
153
153
|
end
|
154
154
|
end
|
155
155
|
|
156
|
+
# Plot data sets in R with ggplot2 and save the plots to files.
|
157
|
+
module FeldtRuby::Statistics::Plotting
|
158
|
+
|
159
|
+
# Shared driver for the two-dimensional plots. Builds an R script that
#   1. reads csvFilePath into the data frame `td`,
#   2. opens a PNG device on graphFilePath with the given width and height,
#   3. runs the ggplot2 expression returned by the block, with theme_bw added,
#   4. closes the device,
# and hands the script to eval (presumably the R-evaluating eval of the class
# this module is mixed into - confirm against RCommunicator).
#
# csvFilePath   - path of the CSV file to read.
# graphFilePath - path of the PNG file to write.
# xName, yName, title - not used here; callers embed them in the expression
#                 their block returns.
# width, height - size passed to png().
#
# The block must return a non-empty Array of Strings that together form one
# ggplot2 expression; lines other than the last are expected to end in "+".
def plot_2dims(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", width = 1200, height = 900)
  include_library("ggplot2")

  pre = [
    "td <- read.csv(#{csvFilePath.inspect})",
    "png(#{graphFilePath.inspect}, width=#{width}, height=#{height})"
  ]

  plot = yield
  # The theme is one more ggplot2 component, so it has to be joined to the
  # last layer with "+". A "+" the caller already left at the end is dropped
  # first. A new string is built so the block's strings are not mutated.
  last_layer = plot.last.sub(/\s*\+\s*\z/, "")
  themed_plot = plot[0...-1] + ["#{last_layer} + theme_bw(base_size = 12, base_family = \"\")"]

  post = ["dev.off()"]

  eval((pre + themed_plot + post).join("\n"))
end
|
179
|
+
|
180
|
+
# Scatter plot of columns xName vs yName in csvFilePath is saved to
# graphFilePath. When smoothFit is truthy a geom_smooth() layer is added.
def scatter_plot(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", smoothFit = true, width = 1200, height = 900)
  plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) do
    layers = [
      "ggplot(td, aes(#{xName}, #{yName})) + ",
      " geom_point(shape = 1) + " # Each point is a non-filled circle
    ]
    layers << " geom_smooth() + " if smoothFit
    layers << " ggtitle(#{title.inspect})"
  end
end
|
191
|
+
|
192
|
+
# Hexagonal-bin heatmap of columns xName vs yName in csvFilePath is saved to
# graphFilePath. bins is passed to geom_hex.
def hexbin_heatmap(csvFilePath, graphFilePath, xName, yName, title = "heatmap", bins = 30, width = 1200, height = 900)
  plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) {
    # title.inspect (as in scatter_plot) quotes AND escapes the title, so a
    # title that contains a double quote no longer breaks the R expression.
    ["ggplot(td, aes(#{xName}, #{yName})) + geom_hex( bins = #{bins} ) + ggtitle(#{title.inspect})"]
  }
end
|
198
|
+
end
|
199
|
+
|
200
|
+
# Reopen RCommunicator to mix in the plotting helpers of the Plotting module.
class FeldtRuby::Statistics::RCommunicator
  include FeldtRuby::Statistics::Plotting
end
|
203
|
+
|
156
204
|
# Make them available at top level
|
157
205
|
extend Statistics
|
158
206
|
|
data/lib/feldtruby/version.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
require 'feldtruby/array/basic_stats'
|
3
|
+
|
4
|
+
# Build a string of the given length by joining elements drawn at random
# (with rand) from alphabet.
def rand_string(length, alphabet)
  alphabet_size = alphabet.length
  1.upto(length).map { alphabet[rand(alphabet_size)] }.join
end
|
8
|
+
|
9
|
+
# Deflate-compress string with zlib's default settings.
def compress(string)
  Zlib.deflate(string)
end
|
12
|
+
|
13
|
+
# Length of the compressed string divided by the length of the original.
def compression_ratio(string)
  compressed_length = compress(string).length
  compressed_length.to_f / string.length
end
|
16
|
+
|
17
|
+
# For a range of string lengths, compress 1000 random strings over alphabet
# and print, per length: the mean compressed length, that mean minus the
# original length, and that mean divided by the original length.
def info_for_alphabet(alphabet)
  puts "for alphabet = #{alphabet.inspect}"
  lengths = [1,5,10,15,20,25,30,35,40,45,50,60,70]
  lengths.each do |len|
    compressed_lengths = (1..1000).map { compress(rand_string(len, alphabet)).length }
    avg_c_len = compressed_lengths.mean
    puts format("#{len}: %.2f, %.2f, %.2f", avg_c_len, avg_c_len - len, avg_c_len / len.to_f)
  end
end
|
24
|
+
|
25
|
+
info_for_alphabet(("a".."z").to_a)
|
26
|
+
info_for_alphabet(("a".."d").to_a)
|
27
|
+
info_for_alphabet(("a".."b").to_a)
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'feldtruby/statistics/array_archive'
|
2
|
+
|
3
|
+
describe "MinMaxAveragePositionArchive" do
|
4
|
+
before do
  # The archive class defined in array_archive.rb is named
  # MinMaxMeanPerPositionArchive; no "Average" variant is defined there, so
  # the old constant name could not be resolved.
  @a = FeldtRuby::MinMaxMeanPerPositionArchive.new
end
|
7
|
+
|
8
|
+
it "updates the counts as we add arrays" do
|
9
|
+
@a.count.must_equal 0
|
10
|
+
@a.update([1,2,3])
|
11
|
+
@a.count.must_equal 1
|
12
|
+
@a.update([4,5,6])
|
13
|
+
@a.count.must_equal 2
|
14
|
+
@a.update([1,2,3])
|
15
|
+
@a.count.must_equal 3
|
16
|
+
end
|
17
|
+
|
18
|
+
it "correctly updates the min values" do
|
19
|
+
@a.min_for_position(0).must_equal nil
|
20
|
+
|
21
|
+
@a.update([1,2,3])
|
22
|
+
@a.min_for_position(0).must_equal 1
|
23
|
+
@a.min_for_position(1).must_equal 2
|
24
|
+
@a.min_for_position(2).must_equal 3
|
25
|
+
@a.mins.must_equal [1,2,3]
|
26
|
+
|
27
|
+
@a.update([1,5,-2])
|
28
|
+
@a.mins.must_equal [1,2,-2]
|
29
|
+
@a.min_for_position(0).must_equal 1
|
30
|
+
@a.min_for_position(1).must_equal 2
|
31
|
+
@a.min_for_position(2).must_equal -2
|
32
|
+
end
|
33
|
+
|
34
|
+
it "correctly updates the max values" do
|
35
|
+
@a.max_for_position(0).must_equal nil
|
36
|
+
|
37
|
+
@a.update([1,2,3])
|
38
|
+
@a.max_for_position(0).must_equal 1
|
39
|
+
@a.max_for_position(1).must_equal 2
|
40
|
+
@a.max_for_position(2).must_equal 3
|
41
|
+
@a.maxs.must_equal [1,2,3]
|
42
|
+
|
43
|
+
@a.update([1,5,-2])
|
44
|
+
@a.maxs.must_equal [1,5,3]
|
45
|
+
@a.max_for_position(0).must_equal 1
|
46
|
+
@a.max_for_position(1).must_equal 5
|
47
|
+
@a.max_for_position(2).must_equal 3
|
48
|
+
end
|
49
|
+
|
50
|
+
it "correctly updates the mean values" do
|
51
|
+
@a.mean_for_position(0).must_equal nil
|
52
|
+
|
53
|
+
@a.update([1,2,3])
|
54
|
+
@a.mean_for_position(0).must_equal 1
|
55
|
+
@a.mean_for_position(1).must_equal 2
|
56
|
+
@a.mean_for_position(2).must_equal 3
|
57
|
+
@a.means.must_equal [1,2,3]
|
58
|
+
|
59
|
+
@a.update([1,5,-2])
|
60
|
+
@a.means.must_equal [1, 3.5, 0.5]
|
61
|
+
@a.mean_for_position(0).must_equal 1
|
62
|
+
@a.mean_for_position(1).must_equal 3.5
|
63
|
+
@a.mean_for_position(2).must_equal 0.5
|
64
|
+
end
|
65
|
+
end
|
data/test/test_array.rb
CHANGED
@@ -34,6 +34,12 @@ class TestFeldtRubyArray < MiniTest::Unit::TestCase
|
|
34
34
|
end
|
35
35
|
|
36
36
|
describe "Array extensions" do
|
37
|
+
describe "map_with_index" do
|
38
|
+
it "calls the block with both the value and an index" do
|
39
|
+
[1,2,3].map_with_index {|v,i| [v,i]}.must_equal [[1,0], [2,1], [3,2]]
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
37
43
|
describe "ranks" do
|
38
44
|
it "works when elements are already in order" do
|
39
45
|
[2.5, 1.5, 0.3].ranks.must_equal [1, 2, 3]
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'feldtruby/statistics/clustering'
|
2
|
+
require 'feldtruby/statistics/euclidean_distance'
|
3
|
+
|
4
|
+
describe "Clustering linkage metrics - i.e. distance between clusters of objects in a set" do
|
5
|
+
describe "Average linkage metric" do
|
6
|
+
it "can be calculated on clusters of with only one number each" do
|
7
|
+
alm = FeldtRuby::AverageLinkageMetric.new()
|
8
|
+
alm.calc([[1.0]], [[1.0]]).must_equal 0.0
|
9
|
+
alm.calc([[0.0]], [[1]]).must_equal 1.0
|
10
|
+
end
|
11
|
+
|
12
|
+
it "can be calculated on clusters of with several numbers in them" do
|
13
|
+
alm = FeldtRuby::AverageLinkageMetric.new()
|
14
|
+
alm.calc([[1], [0]], [[1], [0]]).must_equal 0.5
|
15
|
+
alm.calc([[1], [0], [2]], [[1], [0]]).must_equal (5.0/6)
|
16
|
+
alm.calc([[1], [0], [2]], [[1], [0], [3]]).must_equal (11.0/9)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
describe "Single linkage metric" do
|
21
|
+
it "can be calculated on clusters of with only one float number each" do
|
22
|
+
slm = FeldtRuby::SingleLinkageMetric.new()
|
23
|
+
slm.calc([[1.0]], [[1.0]]).must_equal 0.0
|
24
|
+
slm.calc([[0.0]], [[1.0]]).must_equal 1.0
|
25
|
+
end
|
26
|
+
|
27
|
+
it "can be calculated on clusters of with several numbers in them" do
|
28
|
+
slm = FeldtRuby::SingleLinkageMetric.new()
|
29
|
+
slm.calc([[1], [0]], [[1], [0]]).must_equal 0.0
|
30
|
+
slm.calc([[1], [2]], [[1], [0]]).must_equal 0.0
|
31
|
+
slm.calc([[1], [2]], [[3], [5]]).must_equal 1.0
|
32
|
+
slm.calc([[1], [2], [3]], [[3], [5]]).must_equal 0.0
|
33
|
+
slm.calc([[1], [2], [3]], [[6], [7]]).must_equal 3.0
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "Complete linkage metric" do
|
38
|
+
it "can be calculated on clusters of with only one float number each" do
|
39
|
+
clm = FeldtRuby::CompleteLinkageMetric.new()
|
40
|
+
clm.calc([[1.0]], [[1.0]]).must_equal 0.0
|
41
|
+
clm.calc([[0.0]], [[1.0]]).must_equal 1.0
|
42
|
+
end
|
43
|
+
|
44
|
+
it "can be calculated on clusters of with several numbers in them" do
|
45
|
+
clm = FeldtRuby::CompleteLinkageMetric.new()
|
46
|
+
clm.calc([[1], [0]], [[1], [0]]).must_equal 1.0
|
47
|
+
clm.calc([[1], [2]], [[1], [0]]).must_equal 2.0
|
48
|
+
clm.calc([[1], [2]], [[3], [5]]).must_equal 4.0
|
49
|
+
clm.calc([[1], [2], [3]], [[3], [5]]).must_equal 4.0
|
50
|
+
clm.calc([[1], [2], [3]], [[6], [7]]).must_equal 6.0
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'feldtruby/statistics/euclidean_distance'
|
2
|
+
include FeldtRuby
|
3
|
+
|
4
|
+
describe "Euclidean distance" do
|
5
|
+
it "can be calculated on float vectors of length 1" do
|
6
|
+
euclidean_distance([1.0], [1.0]).must_equal 0.0
|
7
|
+
end
|
8
|
+
|
9
|
+
it "can be calculated on float vectors of length 2" do
|
10
|
+
euclidean_distance([2.0, -1.0], [-2.0, 2.0]).must_equal 5.0
|
11
|
+
end
|
12
|
+
|
13
|
+
it "can be calculated on float vectors of length 3" do
|
14
|
+
euclidean_distance([1.0,2.0,3.0], [4.0,5.0,6.0]).must_be_close_to 5.196152
|
15
|
+
end
|
16
|
+
|
17
|
+
it "can be calculated on int vectors of length 1" do
|
18
|
+
euclidean_distance([1], [1]).must_equal 0.0
|
19
|
+
end
|
20
|
+
|
21
|
+
it "can be calculated on int vectors of length 2" do
|
22
|
+
euclidean_distance([2, -1], [-2, 2]).must_equal 5.0
|
23
|
+
end
|
24
|
+
|
25
|
+
it "can be calculated on int vectors of length 3" do
|
26
|
+
euclidean_distance([1,2,3], [4,5,6]).must_be_close_to 5.196152
|
27
|
+
end
|
28
|
+
end
|