biopsy 0.1.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,437 @@
1
+ require 'rubystats'
2
+ require 'statsample'
3
+ require 'set'
4
+ require 'pp'
5
+ require 'matrix'
6
+
7
+ # TODO:
8
+ # - make distributions draw elements from the range, not just from distribution (DONE)
9
+ # - test on real SOAPdt data (in progress)
10
+ # - make code to run 100 times for a particular dataset, capture the trajectory, and plot the progress over time along with a histogram of the data distribution
11
+ # - plot SD and step-size over time
12
+ # - capture data about convergence (done for toy data, need to repeat for other data)
13
+
14
+ module Biopsy
15
+
16
# A Distribution represents the probability distribution from
# which the next value of a parameter is drawn. The set of all
# distributions acts as a probabilistic neighbourhood structure.
#
# The underlying normal distribution is over *indices* into +range+;
# #draw folds a drawn index back into the range and returns the
# element stored at that index.
class Distribution

  attr_reader :sd

  # Create a new Distribution.
  #
  # mean::  index into +range+ around which draws are centred
  # range:: indexable collection of permitted values (needs +size+ and +[]+)
  # sd_increment_proportion:: fraction of +range.size+ by which the standard
  #         deviation moves per unit of +factor+ in #loosen and #tighten
  # sd::    starting standard deviation, limited to 0.5 .. (0.66 * range.size)
  #
  # Raises RuntimeError if anything goes wrong while building the
  # distribution (the original error is available as the exception's cause).
  def initialize(mean, range, sd_increment_proportion, sd)
    @mean = mean
    @range = range
    @sd_increment_proportion = sd_increment_proportion
    @maxsd = range.size * 0.66
    @minsd = 0.5
    @sd = sd
    limit_sd
    generate_distribution
  rescue
    raise "generation of distribution with mean: #{@mean}, sd: #{@sd} failed."
  end

  # (Re)build the normal distribution from the current mean and sd.
  def generate_distribution
    @dist = Rubystats::NormalDistribution.new(@mean, @sd)
  end

  # Keep the standard deviation within the permitted bounds. The lower
  # bound is applied last, so it wins when the bounds cross (tiny ranges).
  def limit_sd
    @sd = [[@sd, @maxsd].min, @minsd].max
  end

  # Loosen the distribution by increasing the sd and regenerating.
  def loosen(factor=1)
    @sd += @sd_increment_proportion * factor * @range.size
    limit_sd
    generate_distribution
  end

  # Tighten the distribution by reducing the sd and regenerating.
  def tighten(factor=1)
    @sd -= @sd_increment_proportion * factor * @range.size
    limit_sd
    generate_distribution
  end

  # Set the standard deviation to the minimum possible value and
  # regenerate, so that subsequent draws actually use the new sd.
  def set_sd_min
    @sd = @minsd
    generate_distribution
  end

  # Draw a value from the range.
  #
  # The drawn number is truncated to an integer index. An index outside
  # the range is reflected back off the boundary it crossed, repeatedly if
  # necessary, so the result is always an element of the range.
  # Returns nil only when the range is empty.
  def draw
    size = @range.size
    return nil if size.zero?
    index = @dist.rng.to_i
    until index >= 0 && index < size
      # below zero: mirror around 0; past the end: mirror off the last slot
      index = index < 0 ? -index : (2 * size) - 1 - index
    end
    @range[index]
  end

end # Distribution
82
+
83
# A Hood represents the neighbourhood of a specific location
# in the parameter space being explored. It is generated using
# the set of Distributions, which together define the neighbourhood
# structure.
class Hood

  # Hash with :parameters and :score of the best member scored so far;
  # :parameters is nil until the first call to #update_best?.
  attr_reader :best

  # distributions:: hash mapping each parameter to an object responding
  #                 to +draw+ and +loosen+
  # max_size::      number of neighbours to generate
  # tabu::          collection of already-visited locations (needs
  #                 +member?+ and +<<+); new neighbours are added to it
  def initialize(distributions, max_size, tabu)
    @tabu = tabu
    @max_size = max_size
    @members = []
    @best = { :parameters => nil, :score => 0.0 }
    @distributions = distributions
    populate
  end

  # Generate a single neighbour that is not tabu, record it as tabu and
  # add it to the hood.
  #
  # NOTE: draws are repeated until a non-tabu location turns up, so this
  # does not return if only tabu locations can ever be drawn.
  def generate_neighbour
    attempts = 0
    neighbour = nil
    loop do
      # taking too long to find a fresh neighbour: from the 100th failed
      # attempt on, loosen every distribution before each draw so that we
      # explore further afield
      @distributions.each { |_param, dist| dist.loosen } if attempts >= 100
      # perform the probabilistic step move for each parameter
      neighbour = Hash[@distributions.map { |param, dist| [param, dist.draw] }]
      attempts += 1
      break unless is_tabu?(neighbour)
    end
    @tabu << neighbour
    @members << neighbour
  end

  # Record +current+ (a hash with :parameters and :score) as the best of
  # this hood if it is the first location scored here or beats the best
  # score so far. Accepting the first location unconditionally means a
  # hood whose scores are all zero or negative still reports a best.
  def update_best? current
    if @best[:parameters].nil? || current[:score] > @best[:score]
      @best = current.clone
    end
  end

  # true if location is tabu
  def is_tabu? location
    @tabu.member? location
  end

  # generate the population of neighbours
  def populate
    @max_size.times { generate_neighbour }
  end

  # Remove and return the next neighbour from this Hood
  # (nil once the hood is exhausted).
  def next
    @members.pop
  end

  # true if no neighbours remain to be handed out
  def last?
    @members.empty?
  end

end # Hood
155
+
156
# A Tabu Search implementation with a domain-specific probabilistic
# learning heuristic for optimising over an unconstrained parameter
# space with costly objective evaluation.
#
# Several independent searches ("threads") are interleaved: each call to
# #run_one_iteration records the score for one search and then swaps in
# the saved state of the next. These are state snapshots, not OS threads.
class TabuSearch #< OptmisationAlgorithm

  attr_reader :current, :best, :hood_no
  attr_accessor :max_hood_size, :sd_increment_proportion, :starting_sd_divisor, :backtrack_cutoff
  attr_accessor :jump_cutoff

  # Snapshot of one search's state. Every member is saved from / restored
  # to the instance variable of the same name (see #sym_to_ivar_sym).
  # Inside this class the constant shadows ::Thread.
  Thread = Struct.new(:best, :tabu, :distributions,
                      :standard_deviations, :recent_scores,
                      :iterations_since_best, :backtracks,
                      :current, :current_hood, :loaded)

  # parameter_ranges:: hash mapping each parameter to the array of values
  #                    it can take
  # threads, limit::   accepted but currently unused; the number of
  #                    interleaved searches is fixed at 2 and no tabu limit
  #                    is applied
  def initialize(parameter_ranges, threads=8, limit=nil)
    @ranges = parameter_ranges

    # solution tracking
    @best = nil

    # tabu list
    @tabu = Set.new
    @tabu_limit = nil
    @start_time = Time.now

    # neighbourhoods
    @max_hood_size = 5
    @starting_sd_divisor = 5
    @standard_deviations = {}
    @sd_increment_proportion = 0.05
    @hood_no = 1

    # adjustment tracking
    @recent_scores = []
    @jump_cutoff = 10

    # logging
    @log_data = false
    @logfiles = {}
    log_setup

    # backtracking
    @iterations_since_best = 0
    @backtrack_cutoff = 2
    @backtracks = 1.0

    # convergence
    @num_threads = 2
    @threads = []
  end # initialize

  # Prepare the search. +start_point+ is recorded as the current location,
  # then every search thread is initialised from its own random start point.
  def setup start_point
    @current = {:parameters => start_point, :score => nil}
    @best = @current
    setup_threads
  end

  # given the score for a parameter set,
  # return the next parameter set to be scored
  def run_one_iteration(parameters, score)
    @current = {:parameters => parameters, :score => score}
    # update best score?
    update_best?
    # log any data
    log
    # cycle threads
    load_next_thread
    # get next parameter set to score
    next_candidate
    @current[:parameters]
  end # run_one_iteration

  # Create the search threads, giving each a random start point, an empty
  # tabu set, a fresh neighbourhood structure and a first Hood.
  def setup_threads
    @num_threads.times { @threads << Thread.new }
    @threads.each do |thread|
      @current = {
        :parameters => random_start_point,
        :score => nil
      }
      @best = @current
      @standard_deviations = {}
      @recent_scores = []
      @tabu = Set.new
      define_neighbourhood_structure
      @current_hood = Biopsy::Hood.new(@distributions, @max_hood_size, @tabu)
      store_thread_state(thread)
      thread.loaded = false
    end
    @current_thread = @num_threads - 2
  end

  # Save the state of the current thread (unless it has never been loaded,
  # in which case it is only marked as loaded), then advance to the next
  # thread and restore its state into the instance variables.
  def load_next_thread
    thread = @threads[@current_thread]
    if thread.loaded
      store_thread_state(thread)
    else
      thread.loaded = true
    end
    @current_thread = (@current_thread + 1) % @num_threads
    restore_thread_state(@threads[@current_thread])
  end

  # Offer the current location to the current hood and to the overall
  # best; count the iteration as unproductive if the overall best stands.
  def update_best?
    @current_hood.update_best? @current
    if @best[:score].nil? || @current[:score] > @best[:score]
      @best = @current.clone
    else
      @iterations_since_best += 1
    end
  end

  # use probability distributions to define the
  # initial neighbourhood structure
  def define_neighbourhood_structure
    # probabilities
    @distributions = {}
    @current[:parameters].each_pair do |param, value|
      update_distribution(param, value)
    end
  end

  # update the neighbourhood structure by adjusting the probability
  # distributions according to total performance of each parameter
  def update_neighbourhood_structure
    update_recent_scores
    best = backtrack_or_continue
    # remember the (possibly adjusted) sds so the new distributions reuse them
    unless @distributions.empty?
      @standard_deviations = Hash[@distributions.map { |k, d| [k, d.sd] }]
    end
    best[:parameters].each_pair do |param, value|
      update_distribution(param, value)
    end
  end

  # set the distribution for parameter +:param+ to a new one centered
  # around the index of +value+
  def update_distribution(param, value)
    range = @ranges[param]
    mean = range.index(value)
    sd = sd_for_param(param, range)
    @distributions[param] = Biopsy::Distribution.new(mean,
                                                     range,
                                                     @sd_increment_proportion,
                                                     sd)
  end

  # return the standard deviation to use for +:param+: the remembered sd
  # if any have been recorded, otherwise range size / starting_sd_divisor
  def sd_for_param(param, range)
    @standard_deviations.empty? ? (range.size.to_f / @starting_sd_divisor) : @standard_deviations[param]
  end

  # return the correct 'best' location to form a new neighbourhood around
  # deciding whether to continue progressing from the current location
  # or to backtrack to a previous good location to explore further
  def backtrack_or_continue
    stalled = (@iterations_since_best / @backtracks) >= @backtrack_cutoff * @max_hood_size
    if stalled
      backtrack
      best = @best
    else
      best = @current_hood.best
      adjust_distributions_using_gradient
    end
    # a hood that has not recorded a best yet falls back to the overall best
    best[:parameters].nil? ? @best : best
  end

  # note the backtrack and tighten every distribution
  def backtrack
    @backtracks += 1.0
    # debug('backtracked to best')
    @distributions.each_pair { |k, d| d.tighten }
  end

  # update the array of recent scores (newest first, at most jump_cutoff)
  def update_recent_scores
    @recent_scores.unshift @best[:score]
    @recent_scores = @recent_scores.take @jump_cutoff
  end

  # use the gradient of recent best scores to update the distributions:
  # tighten while scores are rising, loosen while they are falling, in
  # both cases in proportion to the size of the slope
  def adjust_distributions_using_gradient
    return if @recent_scores.length < 3
    vx = (1..@recent_scores.length).to_a.to_scale
    vy = @recent_scores.reverse.to_scale
    r = Statsample::Regression::Simple.new_from_vectors(vx,vy)
    slope = r.b
    if slope > 0
      @distributions.each_pair { |k, d| d.tighten slope }
    elsif slope < 0
      # pass the magnitude: a negative factor would reduce the sd,
      # i.e. tighten rather than loosen
      @distributions.each_pair { |k, d| d.loosen slope.abs }
    end
  end

  # shift to the next neighbourhood
  def next_hood
    @hood_no += 1
    # debug("entering hood # #{@hood_no}")
    update_neighbourhood_structure
    @current_hood = Hood.new(@distributions, @max_hood_size, @tabu)
  end

  # get the next neighbour to explore from the current hood
  def next_candidate
    @current[:parameters] = @current_hood.next
    @current[:score] = nil
    # exhausted the neighbourhood?
    if @current_hood.last?
      # debug(@current_hood.best)
      next_hood
    end
  end

  # check termination conditions and return true if met: every thread's
  # recent scores must have the same mean once the first thread has
  # collected jump_cutoff of them. Always false before #setup.
  def finished?
    return false if @threads.empty?
    return false if @threads.first.recent_scores.size < @jump_cutoff
    @threads.map { |t| t.recent_scores.mean }.uniq.length == 1
  end

  # True if this algorithm chooses its own starting point
  def knows_starting_point?
    true
  end

  # open the CSV log files (only when logging is enabled)
  def log_setup
    if @log_data
      require 'csv'
      @logfiles[:standard_deviations] = CSV.open('standard_deviations.csv', 'w')
      @logfiles[:best] = CSV.open('best.csv', 'w')
      @logfiles[:score] = CSV.open('score.csv', 'w')
      @logfiles[:params] = CSV.open('params.csv', 'w')
    end
  end

  # append the current state to the log files (only when logging is enabled)
  def log
    if @log_data
      @logfiles[:standard_deviations] << @distributions.map { |k, d| d.sd }
      @logfiles[:best] << [@best[:score]]
      @logfiles[:score] << [@current[:score]]
      @logfiles[:params] << @current[:parameters].map { |k, v| v }
    end
  end

  # close any open log files
  def log_teardown
    @logfiles.each_pair do |k, f|
      f.close
    end
  end

  # :best => :@best
  def sym_to_ivar_sym sym
    "@#{sym.to_s}".to_sym
  end

  def select_starting_point
    random_start_point
  end

  # a location built by sampling one value from every parameter's range
  def random_start_point
    Hash[@ranges.map { |p, r| [p, r.sample] }]
  end

  private

  # copy each instance variable named after a Thread member into +thread+
  def store_thread_state(thread)
    thread.members.each do |sym|
      thread[sym] = instance_variable_get(sym_to_ivar_sym(sym))
    end
  end

  # copy each member of +thread+ into the instance variable of the same name
  def restore_thread_state(thread)
    thread.members.each do |sym|
      instance_variable_set(sym_to_ivar_sym(sym), thread[sym])
    end
  end

end # TabuSearch
436
+
437
+ end # Biopsy
@@ -0,0 +1,110 @@
1
+ # Optimisation Framework: Settings
2
+ #
3
+ # == Description
4
+ #
5
+ # The Settings singleton object maintains general settings (as opposed to
6
+ # those specific to the experiment, which are contained in the Experiment
7
+ # object).
8
+ #
9
+ # Key settings include the location(s) of config file(s), the Domain that
10
+ # is currently active, and the directories to search for objective functions.
11
+ #
12
+ # Methods are provided for loading, listing, accessing and saving the settings
13
+ #
14
+ module Biopsy
15
+
16
+ require 'singleton'
17
+ require 'yaml'
18
+ require 'pp'
19
+
20
# Error raised for problems with settings keys, e.g. an unknown key in a
# settings file or a lookup of a setting that does not exist.
SettingsError = Class.new(StandardError)
22
+
23
# Singleton holding the general settings: the config file location, the
# directories searched for targets, domains and objectives, the active
# domain, the objectives subset and the sweep cutoff.
class Settings
  include Singleton

  attr_accessor :base_dir
  attr_accessor :target_dir
  attr_accessor :domain_dir
  attr_accessor :domain
  attr_accessor :objectives_dir
  attr_accessor :objectives_subset
  attr_accessor :sweep_cutoff

  def initialize
    set_defaults
  end

  # Reset every setting to its default value.
  def set_defaults
    @config_file = '~/.biopsyrc'
    @base_dir = ['.']
    @target_dir = ['targets']
    @domain_dir = ['domains']
    @domain = 'test_domain'
    @objectives_dir = ['objectives']
    @objectives_subset = nil
    @sweep_cutoff = 100
  end

  # Loads settings from a YAML config file. If no file is
  # specified, the default location ('~/.biopsyrc') is used.
  # Settings loaded from the file are merged into any
  # previously loaded settings.
  #
  # Raises RuntimeError if the file does not contain a YAML mapping and
  # SettingsError if it contains a key that is not a known setting.
  def load(config_file=@config_file)
    # expand the path so that a leading '~' (as in the default) resolves
    newsets = YAML::load_file(::File.expand_path(config_file))
    # empty files, scalars and lists cannot be merged into the settings
    raise 'Config file was not valid YAML' unless newsets.respond_to?(:each_pair)
    newsets.deep_symbolize.each_pair do |key, value|
      varname = "@#{key.to_s}".to_sym
      unless instance_variables.include? varname
        raise SettingsError.new "Key #{key.to_s} in settings file is not valid"
      end
      instance_variable_set(varname, value)
    end
  end

  # Saves the settings to a YAML config file. If no file is
  # specified, the default location ('~/.biopsyrc') is used.
  def save(config_file=@config_file)
    ::File.open(::File.expand_path(config_file), 'w') do |f|
      f.puts to_s
    end
  end

  # Returns a hash of the settings, keyed by setting name (a String)
  def all_settings
    instance_variables.each_with_object({}) do |var, settings|
      settings[var.to_s.sub(/\A@/, '')] = instance_variable_get(var)
    end
  end

  # Returns a YAML string representation of the settings
  def to_s
    all_settings.to_yaml
  end

  # Locate the first YAML config file whose name
  # excluding extension matches +:name+ (case insensitive)
  # in dirs listed by the +:dir_key+ setting.
  #
  # Directories are searched in the order listed and ones that do not
  # exist are skipped. Returns the absolute path of the match, or nil if
  # there is none. Raises SettingsError if +:dir_key+ is not a setting.
  def locate_config(dir_key, name)
    dir_key = "@#{dir_key.to_s}".to_sym
    unless instance_variables.include? dir_key
      raise SettingsError.new "no setting found for compulsory key #{dir_key}"
    end
    wanted = name.to_s.downcase
    Array(instance_variable_get(dir_key)).each do |dir|
      root = ::File.expand_path(dir)
      next unless ::File.directory?(root)
      # compare names ourselves: a glob for the exact name would only be
      # case insensitive on file systems that happen to be
      match = Dir.entries(root).sort.find do |entry|
        ::File.extname(entry) == '.yml' &&
          ::File.basename(entry, '.yml').downcase == wanted &&
          ::File.file?(::File.join(root, entry))
      end
      return ::File.join(root, match) if match
    end

    nil
  end

end # Settings
109
+
110
+ end # Biopsy
@@ -0,0 +1,113 @@
1
+ module Biopsy
2
+
3
# Raised when a target definition cannot be found, parsed or validated.
# Derives from StandardError so that a plain +rescue+ handles it.
class TargetLoadError < StandardError
end
5
+
6
# A Target is loaded from a YAML definition file and wraps the
# constructor class that is run for each parameter set.
class Target
  require 'yaml'
  require 'ostruct'
  require 'rbconfig'

  # array of input files expected by the target constructor
  attr_accessor :input_files
  # array of output files to keep for submission to objective
  # functions during optimisation
  attr_accessor :output_files
  # hash mapping parameters to the ranges of values they can take
  attr_reader :parameter_ranges
  # path to the constructor code
  attr_reader :constructor_path
  attr_reader :domain

  # create a new Target instance.
  # arguments:
  # +:domain+ the domain to which this target belongs (see Domain documentation)
  def initialize domain
    @domain = domain
  end

  # load target with +name+.
  #
  # Raises TargetLoadError if the definition file cannot be found, does
  # not contain a YAML mapping, lacks a required field or fails the
  # domain's validation.
  def load_by_name name
    path = locate_definition name
    raise TargetLoadError.new("Target definition file does not exist for #{name}") if path.nil?
    config = YAML::load_file(path)
    # an empty file or a bare scalar/list is not a usable definition
    raise TargetLoadError.new("Target definition file #{path} is not valid YAML") unless config.is_a?(Hash)
    missing = check_config config.deep_symbolize
    if missing
      msg = "Target definition file #{path} is missing required fields: #{missing}"
      raise TargetLoadError.new(msg)
    end
    # NOTE(review): validation and storage receive the config as loaded,
    # not the symbolized copy checked above - confirm that is intended
    errors = validate_config config
    unless errors.empty?
      raise TargetLoadError.new("Target definition file #{path} contains the following errors:\n - #{errors.join("\n - ")}")
    end
    store_config config
    check_constructor
    load_constructor
  end

  # given the name of a target, return the path
  # to the definition YAML file. All +:target_dir+s defined in Settings are
  # searched and the first matching YAML file is loaded.
  def locate_definition name
    Settings.instance.locate_config(:target_dir, name)
  end

  # verify that +:config+ contains values for all essential target settings
  # returning false if no keys are missing, or an array of the missing keys
  # if any cannot be found
  def check_config config
    required = %w(input_files output_files parameter_ranges constructor_path)
    missing = required.reject { |key| config.has_key? key.to_sym }
    missing.empty? ? false : missing
  end

  # validate the config against the domain definition. Return an array
  # whose length will be the number of errors found. Thus an array of
  # length 0 indicates that the config is valid according to the domain
  # specification.
  def validate_config config
    @domain.target_valid? config
  end

  # Store the values in +:config+, each in the instance variable named
  # after its key
  def store_config config
    config.each_pair do |key, value|
      instance_variable_set('@' + key.to_s, value)
    end
  end

  # Validate the constructor. True if valid, false otherwise.
  # Raises RuntimeError if no constructor path has been set.
  #
  # NOTE(review): the path is checked as given, whereas #load_constructor
  # joins it onto the target_dir setting - confirm these agree.
  def check_constructor
    raise "constructor path is not defined for this target" if @constructor_path.nil?
    valid_ruby? @constructor_path
  end

  # Load the constructor file and instantiate the class named after it
  # (camelized file name)
  def load_constructor
    require File.join(Settings.instance.target_dir, @constructor_path)
    file_name = File.basename(@constructor_path, '.rb')
    constructor_name = file_name.camelize
    @constructor = Module.const_get(constructor_name).new
  end

  # Run the constructor for the parameter set +:params+
  def run params
    @constructor.run params
  end

  # true if file exists and passes the interpreter's syntax check.
  # The check runs the current interpreter directly, with no shell in
  # between, so the result comes from its exit status and the file name
  # needs no quoting.
  def valid_ruby? file
    return false unless ::File.exist? file
    system(RbConfig.ruby, '-c', file.to_s, :out => ::File::NULL, :err => ::File::NULL) == true
  end

end # end of class Target
112
+
113
+ end # end of module Biopsy
@@ -0,0 +1,12 @@
1
+ # encoding: utf-8
2
+
3
+ module Biopsy
4
# Version of the gem, as separate components and as one string.
module VERSION
  # numeric components, most significant first
  MAJOR, MINOR, PATCH = 0, 1, 0
  # pre-release tag; a nil here is left out of STRING
  BUILD = 'alpha'

  # the non-nil components joined with dots
  STRING = [MAJOR, MINOR, PATCH, BUILD].reject(&:nil?).join('.')
end
12
+ end # Biopsy
data/lib/biopsy.rb ADDED
@@ -0,0 +1,13 @@
1
+ require "biopsy/version"
2
+ require "biopsy/base_extensions"
3
+ require "biopsy/settings"
4
+ require "biopsy/domain"
5
+ require "biopsy/experiment"
6
+ require "biopsy/target"
7
+ require "biopsy/objective_handler"
8
+ require "biopsy/objective_function"
9
+ require "biopsy/opt_algorithm"
10
+ require "biopsy/optimisers/genetic_algorithm"
11
+ require "biopsy/optimisers/tabu_search"
12
+ require "biopsy/optimisers/parameter_sweeper"
13
+ require "biopsy/objectives/fastest_optimum"