feldtruby 0.3.8 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/README.md +9 -11
- data/Rakefile +21 -4
- data/lib/feldtruby/array.rb +5 -0
- data/lib/feldtruby/minitest_extensions.rb +10 -0
- data/lib/feldtruby/optimize/objective.rb +168 -125
- data/lib/feldtruby/optimize/search_space.rb +105 -9
- data/lib/feldtruby/optimize/sub_qualities_comparators.rb +73 -0
- data/lib/feldtruby/statistics/array_archive.rb +66 -0
- data/lib/feldtruby/statistics/clustering.rb +31 -0
- data/lib/feldtruby/statistics/distance.rb +35 -0
- data/lib/feldtruby/statistics/euclidean_distance.rb +4 -0
- data/lib/feldtruby/statistics.rb +48 -0
- data/lib/feldtruby/version.rb +1 -1
- data/spikes/zlib_for_short_strings.rb +27 -0
- data/test/skip_test_array_archive.rb +65 -0
- data/test/test_array.rb +6 -0
- data/test/test_clustering.rb +53 -0
- data/test/test_euclidean_distance.rb +28 -0
- data/test/test_optimize_objective.rb +133 -93
- data/test/test_optimize_search_space.rb +54 -0
- data/test/test_sax.rb +14 -1
- data/test/test_sub_qualitites_comparator.rb +109 -0
- metadata +15 -2
@@ -0,0 +1,73 @@
|
|
1
|
+
# Nested module form so this file can be loaded on its own: the compact
# `module FeldtRuby::Optimize` form raises NameError unless FeldtRuby has
# already been defined by some other file.
module FeldtRuby
  module Optimize
    # A SubQualitiesComparator compares the sub-quality vectors of two
    # individuals and ranks them based on whether one is better than
    # (dominates) the other. Subclasses supply the actual comparison by
    # overriding #compare_sub_qualitites.
    class SubQualitiesComparator
      # objective - object that answers sub_qualities_of(candidate); it is only
      #             used by #compare_candidates.
      def initialize(objective)
        @objective = objective
      end

      # Compare two sub-quality vectors and return
      #  -1 if the first one dominates the other one
      #   0 if none of them dominate the other
      #   1 if the second one dominates the first one
      #
      # Raises NotImplementedError here; subclasses must override.
      def compare_sub_qualitites(subQualitites1, subQualitites2)
        raise NotImplementedError, "#{self.class} must implement compare_sub_qualitites"
      end

      # Compare two candidates by first asking the objective for their
      # sub-quality vectors. Returns -1, 0 or 1 as #compare_sub_qualitites.
      def compare_candidates(candidate1, candidate2)
        sq1 = @objective.sub_qualities_of(candidate1)
        sq2 = @objective.sub_qualities_of(candidate2)
        compare_sub_qualitites(sq1, sq2)
      end

      # True iff the first sub-quality vector dominates the second.
      def first_dominates?(subQualitites1, subQualitites2)
        compare_sub_qualitites(subQualitites1, subQualitites2) == -1
      end

      # True iff the second sub-quality vector dominates the first.
      def second_dominates?(subQualitites1, subQualitites2)
        compare_sub_qualitites(subQualitites1, subQualitites2) == 1
      end
    end

    # Epsilon-distance non-dominance comparator. Default epsilon is 0.0 which
    # gives the standard non-dominance comparator.
    class EpsilonNonDominance < SubQualitiesComparator
      # epsilon - two sub-quality values whose absolute difference is not
      #           larger than this are treated as equal.
      def initialize(objective, epsilon = 0.0)
        super(objective)
        @epsilon = epsilon
      end

      # Map the hat operator over paired sub-quality values. Returns an array
      # with, per position, -1 if the first value is better, 1 if the second
      # value is better and 0 if they are within epsilon of each other.
      #
      # Raises ArgumentError if the vectors have different lengths, since the
      # values could then not be paired up position by position.
      def map_hat_operator(sq1, sq2)
        unless sq1.length == sq2.length
          raise ArgumentError,
                "sub-quality vectors differ in length (#{sq1.length} vs #{sq2.length})"
        end

        # NOTE! Below we assume that all sub-objectives should be minimized. If
        # not we should change the sign of the hat operator return value!
        sq1.zip(sq2).map do |sqv1, sqv2|
          if (sqv1 - sqv2).abs > @epsilon
            sqv1 < sqv2 ? -1 : 1
          else
            0
          end
        end
      end

      # Returns -1 if the first vector is better in at least one position and
      # worse in none, 1 for the mirrored case and 0 otherwise (both are better
      # somewhere, or all positions are within epsilon).
      def compare_sub_qualitites(subQualitites1, subQualitites2)
        hat_values = map_hat_operator(subQualitites1, subQualitites2)
        first_better = hat_values.include?(-1)
        second_better = hat_values.include?(1)

        return 0 if first_better == second_better

        first_better ? -1 : 1
      end
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'feldtruby/array/basic_stats.rb'
|
2
|
+
|
3
|
+
module FeldtRuby
  # A ValueArchive keeps basic statistics about the value arrays supplied to
  # it. This base class only counts the updates.
  class ValueArchive
    # Returns the number of times an array has been added to the archive.
    attr_reader :count

    def initialize
      @count = 0
    end

    # Register that one more array of values has been added.
    def update(values)
      @count += 1
    end
  end

  # A PositionBasedValueArchive assumes that each individual position in the
  # supplied value arrays has semantic meaning and thus we should not mix the
  # properties we calculate and save between positions.
  class PositionBasedValueArchive < ValueArchive
  end

  # A MinMaxMeanPerPositionArchive keeps the min, max and mean values for each
  # position in the supplied arrays. It can thus be used for
  # min-max-normalization of values in each position.
  class MinMaxMeanPerPositionArchive < PositionBasedValueArchive
    # Arrays with the smallest/largest value seen so far in each position.
    attr_reader :mins, :maxs

    def initialize
      super
      @mins, @maxs, @sums = [], [], []
    end

    # Add one array of values and update the per-position statistics.
    def update(values)
      super
      @mins = update_statistic_per_position(@mins, values) { |newold| newold.compact.min }
      @maxs = update_statistic_per_position(@maxs, values) { |newold| newold.compact.max }
      @sums = update_statistic_per_position(@sums, values) { |newold| newold.compact.sum }
    end

    # Returns a new statistics array where each position is the result of
    # calling the block with the pair [new value, current statistic]; either
    # element of the pair is nil when that side has no value in the position.
    #
    # All positions known to either side are covered, so an array that is
    # shorter than earlier ones does not discard the statistics already
    # collected for the later positions.
    def update_statistic_per_position(currentStatistics, values, &updateStatistic)
      num_positions = [values.length, currentStatistics.length].max
      Array.new(num_positions) do |index|
        updateStatistic.call([values[index], currentStatistics[index]])
      end
    end

    # Return the minimum value we have seen so far in position _index_.
    def min_for_position(index)
      @mins[index]
    end

    # Return the maximum value we have seen so far in position _index_.
    def max_for_position(index)
      @maxs[index]
    end

    # Return the mean of the values we have seen so far in position _index_
    # (the position's sum divided by the total number of updates), or nil if
    # no value has been seen in that position.
    def mean_for_position(index)
      (@sums[index] / @count.to_f) if @sums[index]
    end

    # Return the mean values for all positions as an array of floats.
    def means
      @sums.map { |v| v / @count.to_f }
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'feldtruby/statistics/distance'
|
2
|
+
require 'feldtruby/array/basic_stats'
|
3
|
+
|
4
|
+
module FeldtRuby
  # Common base for metrics measuring the distance between two clusters. The
  # distance between individual cluster members is delegated to @sub_metric.
  class ClusterLinkageMetric < CompositeMetric
    include SetDistance
  end

  # Average linkage: the mean of all pairwise member distances.
  class AverageLinkageMetric < ClusterLinkageMetric
    def calc(cluster1, cluster2)
      total_distance = pairwise_distances(cluster1, cluster2, @sub_metric).sum.to_f
      num_pairs = cluster1.length * cluster2.length
      total_distance / num_pairs
    end
  end

  # Single linkage: the distance between the two nearest members.
  class SingleLinkageMetric < ClusterLinkageMetric
    def calc(cluster1, cluster2)
      distances = pairwise_distances(cluster1, cluster2, @sub_metric)
      distances.min
    end
  end

  # Complete linkage: the distance between the two furthest members.
  class CompleteLinkageMetric < ClusterLinkageMetric
    def calc(cluster1, cluster2)
      distances = pairwise_distances(cluster1, cluster2, @sub_metric)
      distances.max
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'feldtruby/statistics/euclidean_distance'
|
2
|
+
|
3
|
+
module FeldtRuby
  # Abstract base class for distances between two objects.
  class Distance
    # Return the distance between o1 and o2. Subclasses must override this;
    # the base implementation raises NotImplementedError.
    def calc(o1, o2)
      raise NotImplementedError, "#{self.class} must implement calc"
    end
  end

  # Mixin for distances that are built on top of another (sub) metric.
  module CompositableDistance
    # metric - the metric to build on; a new EuclideanDistance when omitted.
    def initialize(metric = EuclideanDistance.new)
      @sub_metric = metric
    end
  end

  # Functions specific to distances defined on sets of individual objects
  module SetDistance
    # Return a flat array with metric.calc(a, b) for every a in set1 paired
    # with every b in set2, ordered by set1 first and then set2.
    def pairwise_distances(set1, set2, metric)
      # flat_map joins the per-element rows without building a nested array
      # first and without recursively flattening the distance values.
      set1.flat_map { |a| set2.map { |b| metric.calc(a, b) } }
    end
  end

  # Metric is a Distance with particular properties. They need to be ensured
  # in sub-classes so are not defined here.
  class Metric < Distance
  end

  # A CompositeMetric takes another metric as input and calculates a new
  # distance based on it.
  class CompositeMetric < Metric
    include CompositableDistance
  end
end
|
data/lib/feldtruby/statistics.rb
CHANGED
@@ -153,6 +153,54 @@ module Statistics
|
|
153
153
|
end
|
154
154
|
end
|
155
155
|
|
156
|
+
# Plotting data sets in R with ggplot2 and saving the plots to files.
module FeldtRuby::Statistics::Plotting

  # Shared driver for two-dimensional plots. Builds an R script that reads
  # csvFilePath into the data frame `td`, opens graphFilePath as a PNG device
  # of the given width and height, runs the plot commands and closes the
  # device. The script is then handed to #eval.
  #
  # The block must return a non-empty array of strings holding the ggplot
  # expression; a black-and-white theme is appended to its last line.
  # xName, yName and title are not used here (the block is expected to have
  # embedded them) but are kept in the signature for existing callers.
  #
  # NOTE(review): include_library and eval are not defined in this module;
  # presumably they come from the including RCommunicator and load an R
  # library / evaluate the script in R - confirm.
  def plot_2dims(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", width = 1200, height = 900)

    include_library("ggplot2")

    pre = [
      "td <- read.csv(#{csvFilePath.inspect})",
      "png(#{graphFilePath.inspect}, width=#{width}, height=#{height})"
    ]

    plot = yield()
    # The theme is one more ggplot layer so it must be joined with " + ".
    plot.last << " + theme_bw(base_size = 12, base_family = \"\")"

    post = [
      "dev.off()"
    ]

    lines = pre + plot + post
    eval lines.join("\n")

  end

  # Scatter plot of columns xName vs yName in csvFilePath is saved to graphFilePath.
  # A smoothed fit is added unless smoothFit is false.
  def scatter_plot(csvFilePath, graphFilePath, xName, yName, title = "scatterplot", smoothFit = true, width = 1200, height = 900)
    plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) {
      [
        "ggplot(td, aes(#{xName}, #{yName})) + ",
        "  geom_point(shape = 1) + ", # Each point is non-filled circle
        (smoothFit ? "  geom_smooth() + " : nil),
        "  ggtitle(#{title.inspect})"
      ].compact
    }
  end

  # Hexbin heatmap of columns xName vs yName in csvFilePath is saved to graphFilePath.
  # bins is passed on to geom_hex.
  def hexbin_heatmap(csvFilePath, graphFilePath, xName, yName, title = "heatmap", bins = 30, width = 1200, height = 900)
    plot_2dims(csvFilePath, graphFilePath, xName, yName, title, width, height) {
      # inspect (as in scatter_plot) also escapes quotes inside the title.
      [ "ggplot(td, aes(#{xName}, #{yName})) + geom_hex( bins = #{bins} ) + ggtitle(#{title.inspect})"]
    }
  end
end

# Make the plotting methods available on the R communicator.
class FeldtRuby::Statistics::RCommunicator
  include FeldtRuby::Statistics::Plotting
end
|
203
|
+
|
156
204
|
# Make them available at top level
|
157
205
|
extend Statistics
|
158
206
|
|
data/lib/feldtruby/version.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
require 'feldtruby/array/basic_stats'
|
3
|
+
|
4
|
+
# Build a string of +length+ elements, each picked at a random index of
# +alphabet+ and joined together.
def rand_string(length, alphabet)
  alphabet_size = alphabet.length
  1.upto(length).map { alphabet[rand(alphabet_size)] }.join
end
|
8
|
+
|
9
|
+
# Return +string+ compressed with zlib's deflate at the default level.
def compress(string)
  Zlib.deflate(string)
end
|
12
|
+
|
13
|
+
# Ratio between the length of the compressed string and the length of the
# original string, as a float.
def compression_ratio(string)
  compressed_length = compress(string).length
  compressed_length.to_f / string.length
end
|
16
|
+
|
17
|
+
# Print, for a range of string lengths, the average compressed length of 1000
# random strings over +alphabet+, followed by the difference and the ratio to
# the uncompressed length.
def info_for_alphabet(alphabet)
  puts "for alphabet = #{alphabet.inspect}"
  string_lengths = [1,5,10,15,20,25,30,35,40,45,50,60,70]
  string_lengths.each do |len|
    compressed_lengths = (1..1000).map { compress(rand_string(len, alphabet)).length }
    # mean is not core Ruby; presumably from feldtruby/array/basic_stats.
    avg_c_len = compressed_lengths.mean
    overhead = avg_c_len - len
    ratio = avg_c_len / len.to_f
    puts("#{len}: %.2f, %.2f, %.2f" % [avg_c_len, overhead, ratio])
  end
end
|
24
|
+
|
25
|
+
# Report for alphabets of shrinking size: a-z, a-d and a-b.
%w[z d b].each do |last_letter|
  info_for_alphabet(("a"..last_letter).to_a)
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'feldtruby/statistics/array_archive'
|
2
|
+
|
3
|
+
# Spec for the per-position min/max/mean archive. The class under test is
# FeldtRuby::MinMaxMeanPerPositionArchive.
describe "MinMaxMeanPerPositionArchive" do
  before do
    @a = FeldtRuby::MinMaxMeanPerPositionArchive.new
  end

  it "updates the counts as we add arrays" do
    @a.count.must_equal 0
    @a.update([1,2,3])
    @a.count.must_equal 1
    @a.update([4,5,6])
    @a.count.must_equal 2
    @a.update([1,2,3])
    @a.count.must_equal 3
  end

  it "correctly updates the min values" do
    # Nothing has been added yet so there is no minimum.
    @a.min_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.min_for_position(0).must_equal 1
    @a.min_for_position(1).must_equal 2
    @a.min_for_position(2).must_equal 3
    @a.mins.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.mins.must_equal [1,2,-2]
    @a.min_for_position(0).must_equal 1
    @a.min_for_position(1).must_equal 2
    @a.min_for_position(2).must_equal(-2)
  end

  it "correctly updates the max values" do
    @a.max_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.max_for_position(0).must_equal 1
    @a.max_for_position(1).must_equal 2
    @a.max_for_position(2).must_equal 3
    @a.maxs.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.maxs.must_equal [1,5,3]
    @a.max_for_position(0).must_equal 1
    @a.max_for_position(1).must_equal 5
    @a.max_for_position(2).must_equal 3
  end

  it "correctly updates the mean values" do
    @a.mean_for_position(0).must_be_nil

    @a.update([1,2,3])
    @a.mean_for_position(0).must_equal 1
    @a.mean_for_position(1).must_equal 2
    @a.mean_for_position(2).must_equal 3
    @a.means.must_equal [1,2,3]

    @a.update([1,5,-2])
    @a.means.must_equal [1, 3.5, 0.5]
    @a.mean_for_position(0).must_equal 1
    @a.mean_for_position(1).must_equal 3.5
    @a.mean_for_position(2).must_equal 0.5
  end
end
|
data/test/test_array.rb
CHANGED
@@ -34,6 +34,12 @@ class TestFeldtRubyArray < MiniTest::Unit::TestCase
|
|
34
34
|
end
|
35
35
|
|
36
36
|
describe "Array extensions" do
|
37
|
+
describe "map_with_index" do
|
38
|
+
it "calls the block with both the value and an index" do
|
39
|
+
[1,2,3].map_with_index {|v,i| [v,i]}.must_equal [[1,0], [2,1], [3,2]]
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
37
43
|
describe "ranks" do
|
38
44
|
it "works when elements are already in order" do
|
39
45
|
[2.5, 1.5, 0.3].ranks.must_equal [1, 2, 3]
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'feldtruby/statistics/clustering'
|
2
|
+
require 'feldtruby/statistics/euclidean_distance'
|
3
|
+
|
4
|
+
# Spec for the cluster linkage metrics; every metric is created with its
# default sub-metric.
describe "Clustering linkage metrics - i.e. distance between clusters of objects in a set" do
  describe "Average linkage metric" do
    it "can be calculated on clusters of with only one number each" do
      metric = FeldtRuby::AverageLinkageMetric.new
      metric.calc([[1.0]], [[1.0]]).must_equal(0.0)
      metric.calc([[0.0]], [[1]]).must_equal(1.0)
    end

    it "can be calculated on clusters of with several numbers in them" do
      metric = FeldtRuby::AverageLinkageMetric.new
      metric.calc([[1], [0]], [[1], [0]]).must_equal(0.5)
      metric.calc([[1], [0], [2]], [[1], [0]]).must_equal(5.0 / 6)
      metric.calc([[1], [0], [2]], [[1], [0], [3]]).must_equal(11.0 / 9)
    end
  end

  describe "Single linkage metric" do
    it "can be calculated on clusters of with only one float number each" do
      metric = FeldtRuby::SingleLinkageMetric.new
      metric.calc([[1.0]], [[1.0]]).must_equal(0.0)
      metric.calc([[0.0]], [[1.0]]).must_equal(1.0)
    end

    it "can be calculated on clusters of with several numbers in them" do
      metric = FeldtRuby::SingleLinkageMetric.new
      # [first cluster, second cluster, expected nearest-member distance]
      [
        [[[1], [0]],      [[1], [0]], 0.0],
        [[[1], [2]],      [[1], [0]], 0.0],
        [[[1], [2]],      [[3], [5]], 1.0],
        [[[1], [2], [3]], [[3], [5]], 0.0],
        [[[1], [2], [3]], [[6], [7]], 3.0]
      ].each do |first, second, expected|
        metric.calc(first, second).must_equal(expected)
      end
    end
  end

  describe "Complete linkage metric" do
    it "can be calculated on clusters of with only one float number each" do
      metric = FeldtRuby::CompleteLinkageMetric.new
      metric.calc([[1.0]], [[1.0]]).must_equal(0.0)
      metric.calc([[0.0]], [[1.0]]).must_equal(1.0)
    end

    it "can be calculated on clusters of with several numbers in them" do
      metric = FeldtRuby::CompleteLinkageMetric.new
      # [first cluster, second cluster, expected furthest-member distance]
      [
        [[[1], [0]],      [[1], [0]], 1.0],
        [[[1], [2]],      [[1], [0]], 2.0],
        [[[1], [2]],      [[3], [5]], 4.0],
        [[[1], [2], [3]], [[3], [5]], 4.0],
        [[[1], [2], [3]], [[6], [7]], 6.0]
      ].each do |first, second, expected|
        metric.calc(first, second).must_equal(expected)
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'feldtruby/statistics/euclidean_distance'
|
2
|
+
include FeldtRuby
|
3
|
+
|
4
|
+
# Spec for euclidean_distance on float and integer vectors of length 1 to 3.
describe "Euclidean distance" do
  it "can be calculated on float vectors of length 1" do
    distance = euclidean_distance([1.0], [1.0])
    distance.must_equal(0.0)
  end

  it "can be calculated on float vectors of length 2" do
    distance = euclidean_distance([2.0, -1.0], [-2.0, 2.0])
    distance.must_equal(5.0)
  end

  it "can be calculated on float vectors of length 3" do
    distance = euclidean_distance([1.0,2.0,3.0], [4.0,5.0,6.0])
    distance.must_be_close_to(5.196152)
  end

  it "can be calculated on int vectors of length 1" do
    distance = euclidean_distance([1], [1])
    distance.must_equal(0.0)
  end

  it "can be calculated on int vectors of length 2" do
    distance = euclidean_distance([2, -1], [-2, 2])
    distance.must_equal(5.0)
  end

  it "can be calculated on int vectors of length 3" do
    distance = euclidean_distance([1,2,3], [4,5,6])
    distance.must_be_close_to(5.196152)
  end
end
|