dspy-miprov2 0.29.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1672 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+ require 'time'
5
+ require 'json'
6
+ require 'concurrent-ruby'
7
+ require 'sorbet-runtime'
8
+ require 'securerandom'
9
+ require 'set'
10
+ require_relative 'teleprompter'
11
+ require_relative 'utils'
12
+ require_relative '../propose/grounded_proposer'
13
+ require_relative '../optimizers/gaussian_process'
14
+
15
+ module DSPy
16
+ module Teleprompt
17
    # Enum for candidate configuration types.
    # Tags each EvaluatedCandidate with which prompt components it modifies:
    # nothing (Baseline), only the instruction text, only the few-shot
    # demonstrations, or both at once (Combined).
    class CandidateType < T::Enum
      enums do
        Baseline = new("baseline")
        InstructionOnly = new("instruction_only")
        FewShotOnly = new("few_shot_only")
        Combined = new("combined")
      end
    end
26
+
27
    # Enum for optimization strategies used during the trial loop
    # (see the :optimization_strategy setting on MIPROv2).
    class OptimizationStrategy < T::Enum
      enums do
        Greedy = new("greedy")
        Adaptive = new("adaptive")
        Bayesian = new("bayesian")
      end
    end
35
+
36
    # Auto-configuration presets. None means fully manual configuration;
    # Light/Medium/Heavy select increasingly expensive hyperparameter
    # bundles from AUTO_PRESET_SETTINGS.
    class AutoPreset < T::Enum
      enums do
        None = new("none")
        Light = new("light")
        Medium = new("medium")
        Heavy = new("heavy")
      end
    end
44
+
45
    # Per-preset hyperparameter bundles, consumed by
    # MIPROv2.apply_auto_defaults and #apply_auto_preset!. Keys:
    #   candidate_budget                   - drives the computed trial count
    #   instruction_candidates(_when_fewshot) - proposals per predictor
    #   bootstrap_sets                     - candidate demo sets to bootstrap
    #   max_bootstrapped/labeled_examples  - demo-set size caps
    #   optimization_strategy              - trial-selection strategy enum
    #   early_stopping_patience            - trials without improvement before stop
    #   valset_target_size                 - cap on validation-set size
    #   minibatch_size                     - nil means full-batch evaluation
    AUTO_PRESET_SETTINGS = {
      AutoPreset::None => {},
      AutoPreset::Light => {
        candidate_budget: 6,
        instruction_candidates: 3,
        instruction_candidates_when_fewshot: 3,
        bootstrap_sets: 3,
        max_bootstrapped_examples: 2,
        max_labeled_examples: 8,
        optimization_strategy: OptimizationStrategy::Greedy,
        early_stopping_patience: 2,
        valset_target_size: 100,
        minibatch_size: nil
      },
      AutoPreset::Medium => {
        candidate_budget: 12,
        instruction_candidates: 5,
        instruction_candidates_when_fewshot: 5,
        bootstrap_sets: 5,
        max_bootstrapped_examples: 4,
        max_labeled_examples: 16,
        optimization_strategy: OptimizationStrategy::Adaptive,
        early_stopping_patience: 3,
        valset_target_size: 300,
        minibatch_size: nil
      },
      AutoPreset::Heavy => {
        candidate_budget: 18,
        instruction_candidates: 8,
        instruction_candidates_when_fewshot: 8,
        bootstrap_sets: 8,
        max_bootstrapped_examples: 6,
        max_labeled_examples: 24,
        optimization_strategy: OptimizationStrategy::Bayesian,
        early_stopping_patience: 5,
        valset_target_size: 1000,
        minibatch_size: nil
      }
    }.freeze

    # Seed used for all deterministic shuffles (train/val split, valset capping)
    # unless the :auto_seed setting overrides it.
    DEFAULT_AUTO_SEED = 42
86
+
87
+ # MIPROv2: Multi-prompt Instruction Proposal with Retrieval Optimization
88
+ # State-of-the-art prompt optimization combining bootstrap sampling,
89
+ # instruction generation, and Bayesian optimization
90
+ class MIPROv2 < Teleprompter
91
+ extend T::Sig
92
+ include Dry::Configurable
93
+
94
      # Auto-configuration factory methods for different optimization budgets.
      # Each factory builds a MIPROv2 (forwarding metric/kwargs to .new) and
      # then applies the matching preset's defaults via apply_auto_defaults.
      module AutoMode
        extend T::Sig

        # Cheapest preset: few trials/candidates, greedy selection.
        sig do
          params(
            metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)),
            kwargs: T.untyped
          ).returns(MIPROv2)
        end
        def self.light(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
            MIPROv2.apply_auto_defaults(config, AutoPreset::Light)
          end
          optimizer
        end

        # Balanced preset: moderate budget, adaptive selection.
        sig do
          params(
            metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)),
            kwargs: T.untyped
          ).returns(MIPROv2)
        end
        def self.medium(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
            MIPROv2.apply_auto_defaults(config, AutoPreset::Medium)
          end
          optimizer
        end

        # Most expensive preset: large budget, Bayesian selection.
        sig do
          params(
            metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)),
            kwargs: T.untyped
          ).returns(MIPROv2)
        end
        def self.heavy(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
            MIPROv2.apply_auto_defaults(config, AutoPreset::Heavy)
          end
          optimizer
        end
      end
140
+
141
      # Dry-configurable settings for MIPROv2.
      # Constructors coerce user-supplied values (symbols/strings/nil) into
      # the canonical enum/integer form at assignment time.

      # Active auto preset; accepts an AutoPreset, a string/symbol name, or nil.
      setting :auto_preset, default: AutoPreset::None, constructor: ->(value) {
        case value
        when AutoPreset
          value
        when String, Symbol
          begin
            AutoPreset.deserialize(value.to_s.downcase)
          rescue ArgumentError
            raise ArgumentError, "Invalid auto preset: #{value}. Must be one of :none, :light, :medium, :heavy"
          end
        when nil
          AutoPreset::None
        else
          raise ArgumentError, "Invalid auto preset: #{value.inspect}"
        end
      }
      # Seed for deterministic shuffles; nil falls back to DEFAULT_AUTO_SEED.
      # Integer() raises on non-integer input rather than silently coercing.
      setting :auto_seed, default: DEFAULT_AUTO_SEED, constructor: ->(value) {
        value.nil? ? DEFAULT_AUTO_SEED : Integer(value)
      }
      # Cap on validation-set size; nil disables capping.
      setting :valset_target_size, default: nil
      # Number of optimization trials in phase 3.
      setting :num_trials, default: 12
      # Instruction proposals requested per predictor in phase 2.
      setting :num_instruction_candidates, default: 5
      # Candidate few-shot demo sets bootstrapped in phase 1.
      setting :bootstrap_sets, default: 5
      setting :max_bootstrapped_examples, default: 4
      setting :max_labeled_examples, default: 16
      # Trial-selection strategy; accepts the enum or :greedy/:adaptive/:bayesian.
      setting :optimization_strategy, default: OptimizationStrategy::Adaptive, constructor: ->(value) {
        # Coerce symbols to enum values
        case value
        when :greedy then OptimizationStrategy::Greedy
        when :adaptive then OptimizationStrategy::Adaptive
        when :bayesian then OptimizationStrategy::Bayesian
        when OptimizationStrategy then value
        when nil then OptimizationStrategy::Adaptive
        else
          raise ArgumentError, "Invalid optimization strategy: #{value}. Must be one of :greedy, :adaptive, :bayesian"
        end
      }
      # Exploration temperature schedule endpoints (start high, end low).
      setting :init_temperature, default: 1.0
      setting :final_temperature, default: 0.1
      # Trials without improvement tolerated before stopping early.
      setting :early_stopping_patience, default: 3
      setting :use_bayesian_optimization, default: true
      setting :track_diversity, default: true
      setting :max_errors, default: 3
      setting :num_threads, default: 1
      # nil means evaluate candidates on the full evaluation set.
      setting :minibatch_size, default: nil
187
+
188
+ # Class-level configuration method - sets defaults for new instances
189
+ def self.configure(&block)
190
+ if block_given?
191
+ # Store configuration in a class variable for new instances
192
+ @default_config_block = block
193
+ end
194
+ end
195
+
196
+ # Get the default configuration block
197
+ def self.default_config_block
198
+ @default_config_block
199
+ end
200
+
201
      class << self
        extend T::Sig

        # Copies a preset's hyperparameter bundle onto a dry-configurable
        # config object. Plain truthiness guards skip absent keys, while
        # `settings.key?` guards are used for values where nil/0 is a
        # meaningful setting (demo caps, minibatch_size).
        sig { params(config: T.untyped, preset: AutoPreset).void }
        def apply_auto_defaults(config, preset)
          settings = AUTO_PRESET_SETTINGS.fetch(preset) { {} }

          config.auto_preset = preset
          config.num_trials = settings[:candidate_budget] if settings[:candidate_budget]
          config.num_instruction_candidates = settings[:instruction_candidates] if settings[:instruction_candidates]
          config.bootstrap_sets = settings[:bootstrap_sets] if settings[:bootstrap_sets]
          config.max_bootstrapped_examples = settings[:max_bootstrapped_examples] if settings.key?(:max_bootstrapped_examples)
          config.max_labeled_examples = settings[:max_labeled_examples] if settings.key?(:max_labeled_examples)
          config.optimization_strategy = settings[:optimization_strategy] if settings[:optimization_strategy]
          config.early_stopping_patience = settings[:early_stopping_patience] if settings[:early_stopping_patience]
          config.minibatch_size = settings[:minibatch_size] if settings.key?(:minibatch_size)
          config.valset_target_size = settings[:valset_target_size] if settings[:valset_target_size]
        end
      end
220
+
221
+
222
      # Simple data structure for evaluated candidate configurations
      # (immutable value object via Data.define).
      EvaluatedCandidate = Data.define(
        :instruction,
        :few_shot_examples,
        :type,
        :metadata,
        :config_id
      ) do
        extend T::Sig

        # Builds a candidate with a config ID derived from its content.
        # NOTE(review): Hash#hash is salted per Ruby process, so the
        # `metadata.hash` component makes this ID stable only within a
        # single run, not across runs — confirm that is intended.
        sig { params(instruction: String, few_shot_examples: T::Array[T.untyped], type: CandidateType, metadata: T::Hash[Symbol, T.untyped]).returns(EvaluatedCandidate) }
        def self.create(instruction:, few_shot_examples: [], type: CandidateType::Baseline, metadata: {})
          content = "#{instruction}_#{few_shot_examples.size}_#{type.serialize}_#{metadata.hash}"
          # First 12 hex chars of the digest are enough to be unique here.
          config_id = Digest::SHA256.hexdigest(content)[0, 12]

          new(
            instruction: instruction.freeze,
            few_shot_examples: few_shot_examples.freeze,
            type: type,
            metadata: metadata.freeze,
            config_id: config_id
          )
        end

        # Hash summary for logging/serialization. Note this reports the
        # few-shot example COUNT, not the examples themselves.
        sig { returns(T::Hash[Symbol, T.untyped]) }
        def to_h
          {
            instruction: instruction,
            few_shot_examples: few_shot_examples.size,
            type: type.serialize,
            metadata: metadata,
            config_id: config_id
          }
        end
      end
258
+
259
      # Result of MIPROv2 optimization. Extends the base OptimizationResult
      # with the full candidate list, per-phase statistics, and the best
      # candidate's raw batch-evaluation result. All collections are frozen.
      class MIPROv2Result < OptimizationResult
        extend T::Sig

        # Every candidate configuration evaluated during phase 3.
        sig { returns(T::Array[EvaluatedCandidate]) }
        attr_reader :evaluated_candidates

        sig { returns(T::Hash[Symbol, T.untyped]) }
        attr_reader :optimization_trace

        # Statistics from phase 1 (bootstrap) and phase 2 (proposal).
        sig { returns(T::Hash[Symbol, T.untyped]) }
        attr_reader :bootstrap_statistics

        sig { returns(T::Hash[Symbol, T.untyped]) }
        attr_reader :proposal_statistics

        # Raw evaluation of the winning candidate, when one was produced.
        sig { returns(T.nilable(DSPy::Evaluate::BatchEvaluationResult)) }
        attr_reader :best_evaluation_result

        sig do
          params(
            optimized_program: T.untyped,
            scores: T::Hash[Symbol, T.untyped],
            history: T::Hash[Symbol, T.untyped],
            evaluated_candidates: T::Array[EvaluatedCandidate],
            optimization_trace: T::Hash[Symbol, T.untyped],
            bootstrap_statistics: T::Hash[Symbol, T.untyped],
            proposal_statistics: T::Hash[Symbol, T.untyped],
            best_score_name: T.nilable(String),
            best_score_value: T.nilable(Float),
            metadata: T::Hash[Symbol, T.untyped],
            best_evaluation_result: T.nilable(DSPy::Evaluate::BatchEvaluationResult)
          ).void
        end
        def initialize(optimized_program:, scores:, history:, evaluated_candidates:, optimization_trace:, bootstrap_statistics:, proposal_statistics:, best_score_name: nil, best_score_value: nil, metadata: {}, best_evaluation_result: nil)
          super(
            optimized_program: optimized_program,
            scores: scores,
            history: history,
            best_score_name: best_score_name,
            best_score_value: best_score_value,
            metadata: metadata
          )
          # Freeze everything MIPROv2-specific so the result is immutable.
          @evaluated_candidates = evaluated_candidates.freeze
          @optimization_trace = optimization_trace.freeze
          @bootstrap_statistics = bootstrap_statistics.freeze
          @proposal_statistics = proposal_statistics.freeze
          @best_evaluation_result = best_evaluation_result&.freeze
        end

        # Serializes the base result plus MIPROv2-specific fields.
        sig { returns(T::Hash[Symbol, T.untyped]) }
        def to_h
          super.merge({
            evaluated_candidates: @evaluated_candidates.map(&:to_h),
            optimization_trace: @optimization_trace,
            bootstrap_statistics: @bootstrap_statistics,
            proposal_statistics: @proposal_statistics,
            best_evaluation_result: @best_evaluation_result&.to_h
          })
        end
      end
320
+
321
      # NOTE(review): @mipro_config is never assigned in the code visible in
      # this file, so this reader may always return nil — confirm whether it
      # is vestigial or set elsewhere.
      sig { returns(MIPROv2Config) }
      attr_reader :mipro_config

      # Grounded proposer; re-created in phase_2_propose_instructions with
      # program/trainset awareness.
      sig { returns(T.nilable(DSPy::Propose::GroundedProposer)) }
      attr_reader :proposer
326
+
327
      # Override dry-configurable's initialize to add our parameter validation.
      #
      # @param metric [Proc, nil] two-argument callable scoring (example, prediction)
      # @param kwargs forwarded to dry-configurable's initializer
      # @raise [ArgumentError] if the legacy :config kwarg is passed
      def initialize(metric: nil, **kwargs)
        # Reject old config parameter pattern
        if kwargs.key?(:config)
          raise ArgumentError, "config parameter is no longer supported. Use .configure blocks instead."
        end

        # Let dry-configurable handle its initialization
        super(**kwargs)

        # Apply class-level configuration if it exists
        if self.class.default_config_block
          configure(&self.class.default_config_block)
        end

        @metric = metric

        # Initialize proposer with a basic config for now (will be updated later)
        @proposer = DSPy::Propose::GroundedProposer.new(config: DSPy::Propose::GroundedProposer::Config.new)
        @optimization_trace = []
        @evaluated_candidates = []
        # Per-trial log entries; repopulated from phase-3 output in #compile.
        @trial_history = {}
      end
350
+
351
      # Main MIPROv2 optimization method. Runs the three phases in order —
      # (1) bootstrap few-shot demos, (2) propose instructions, (3) trial
      # optimization — and returns the assembled MIPROv2Result.
      #
      # @param program  the DSPy program to optimize
      # @param trainset training examples (converted to typed examples)
      # @param valset   optional validation examples; when absent, a slice of
      #                 the trainset serves as the evaluation set
      sig do
        params(
          program: T.untyped,
          trainset: T::Array[T.untyped],
          valset: T.nilable(T::Array[T.untyped])
        ).returns(MIPROv2Result)
      end
      def compile(program, trainset:, valset: nil)
        validate_inputs(program, trainset, valset)

        instrument_step('miprov2_compile', {
          trainset_size: trainset.size,
          valset_size: valset&.size || 0,
          num_trials: config.num_trials,
          optimization_strategy: optimization_strategy_name,
          mode: infer_auto_mode
        }) do
          # Convert examples to typed format
          typed_trainset = ensure_typed_examples(trainset)
          typed_valset = valset ? ensure_typed_examples(valset) : nil

          if auto_preset_active?
            # Auto mode may carve a validation split out of the trainset and
            # rewrites several config values from the preset.
            typed_trainset, typed_valset = prepare_datasets_for_auto(typed_trainset, typed_valset)
            typed_valset = apply_auto_preset!(program, typed_valset)
          else
            typed_valset = limit_validation_set(typed_valset, config.valset_target_size)
          end

          # Use validation set if available, otherwise use part of training set
          # (a third of it, but at least 10 examples).
          evaluation_set = typed_valset || typed_trainset.take([typed_trainset.size / 3, 10].max)

          # Phase 1: Bootstrap few-shot examples
          emit_event('phase_start', { phase: 1, name: 'bootstrap' })
          demo_candidates = phase_1_bootstrap(program, typed_trainset)
          emit_event('phase_complete', {
            phase: 1,
            num_predictors: demo_candidates.keys.size,
            demo_sets_per_predictor: demo_candidates[0]&.size || 0
          })

          # Phase 2: Generate instruction candidates
          emit_event('phase_start', { phase: 2, name: 'instruction_proposal' })
          proposal_result = phase_2_propose_instructions(program, typed_trainset, demo_candidates)
          emit_event('phase_complete', {
            phase: 2,
            num_candidates: proposal_result.num_candidates,
            best_instruction_preview: proposal_result.best_instruction[0, 50]
          })

          # Phase 3: Bayesian optimization
          emit_event('phase_start', { phase: 3, name: 'optimization' })
          optimization_result = phase_3_optimize(
            program,
            evaluation_set,
            proposal_result,
            demo_candidates
          )
          emit_event('phase_complete', {
            phase: 3,
            best_score: optimization_result[:best_score],
            trials_completed: optimization_result[:trials_completed]
          })

          # Build final result
          final_result = build_miprov2_result(
            optimization_result,
            demo_candidates,
            proposal_result
          )

          @trial_history = optimization_result[:trial_logs] || {}

          save_results(final_result)
          final_result
        end
      end
428
+
429
      private

      # True when a Light/Medium/Heavy preset is driving configuration;
      # AutoPreset::None means fully manual settings.
      sig { returns(T::Boolean) }
      def auto_preset_active?
        config.auto_preset != AutoPreset::None
      end
435
+
436
      # Prepares train/validation splits under an auto preset.
      # With an explicit valset: keeps trainset intact and only caps the
      # valset. Without one: deterministically shuffles the trainset and
      # carves out a validation slice (up to 80% of it, capped by the
      # preset's valset_target_size, always leaving >= 1 training example).
      sig { params(trainset: T::Array[DSPy::Example], valset: T.nilable(T::Array[DSPy::Example])).returns([T::Array[DSPy::Example], T::Array[DSPy::Example]]) }
      def prepare_datasets_for_auto(trainset, valset)
        settings = auto_settings_for(config.auto_preset)
        target_size = settings[:valset_target_size]
        config.valset_target_size = target_size

        if valset && valset.any?
          [trainset, limit_validation_set(valset, target_size)]
        else
          raise ArgumentError, "Training set must contain at least 2 examples when auto presets are enabled" if trainset.size < 2

          # Seeded shuffle keeps the split reproducible across runs.
          shuffled = trainset.shuffle(random: Random.new(config.auto_seed))
          # NOTE(review): 80% of the data goes to VALIDATION here (not
          # training) — appears intentional, mirroring the reference
          # implementation's default carve-out, but worth confirming.
          default_val_size = [
            [(trainset.size * 0.8).ceil, 1].max,
            trainset.size - 1
          ].min

          desired_val_size = target_size ? [default_val_size, target_size].min : default_val_size
          # Clamp to [1, trainset.size - 1] so both splits are non-empty.
          desired_val_size = [[desired_val_size, 1].max, trainset.size - 1].min

          validation_examples = shuffled.take(desired_val_size)
          training_examples = shuffled.drop(desired_val_size)

          [training_examples, limit_validation_set(validation_examples, target_size)]
        end
      end
462
+
463
      # Applies the active preset's hyperparameters to this instance's
      # config (mutating it — hence the bang) and returns the, possibly
      # capped, validation set. The trial count is derived from the preset's
      # candidate budget and the program's predictor count.
      sig { params(program: T.untyped, valset: T::Array[DSPy::Example]).returns(T::Array[DSPy::Example]) }
      def apply_auto_preset!(program, valset)
        settings = auto_settings_for(config.auto_preset)
        zeroshot = zero_shot_for_settings?(settings)
        candidate_budget = settings[:candidate_budget]

        if candidate_budget && candidate_budget.positive?
          config.num_trials = compute_trials_from_candidate_budget(program, candidate_budget, zeroshot)
          # Zero-shot spends the whole budget on instructions; otherwise use
          # the preset's few-shot value or half the budget as fallback.
          instruction_candidates = if zeroshot
            candidate_budget
          else
            settings[:instruction_candidates_when_fewshot] || (candidate_budget / 2.0).ceil
          end
          config.num_instruction_candidates = [instruction_candidates, 1].max
        end

        # key? guards allow presets to deliberately set nil/0 values.
        config.bootstrap_sets = settings[:bootstrap_sets] if settings[:bootstrap_sets]
        config.max_bootstrapped_examples = settings[:max_bootstrapped_examples] if settings.key?(:max_bootstrapped_examples)
        config.max_labeled_examples = settings[:max_labeled_examples] if settings.key?(:max_labeled_examples)
        config.optimization_strategy = settings[:optimization_strategy] if settings[:optimization_strategy]
        config.early_stopping_patience = settings[:early_stopping_patience] if settings[:early_stopping_patience]
        config.minibatch_size = settings[:minibatch_size] if settings.key?(:minibatch_size)

        config.valset_target_size = settings[:valset_target_size]
        limit_validation_set(valset, config.valset_target_size)
      end
489
+
490
+ sig { params(valset: T.nilable(T::Array[DSPy::Example]), target_size: T.nilable(Integer)).returns(T.nilable(T::Array[DSPy::Example])) }
491
+ def limit_validation_set(valset, target_size)
492
+ return valset unless valset && target_size && target_size.positive?
493
+ return valset if valset.size <= target_size
494
+
495
+ valset.shuffle(random: Random.new(config.auto_seed)).take(target_size)
496
+ end
497
+
498
+ sig { params(program: T.untyped, num_candidates: Integer, zeroshot: T::Boolean).returns(Integer) }
499
+ def compute_trials_from_candidate_budget(program, num_candidates, zeroshot)
500
+ predictor_count =
501
+ if program.respond_to?(:predictors)
502
+ Array(program.predictors).size
503
+ else
504
+ 1
505
+ end
506
+
507
+ predictor_count = 1 if predictor_count.zero?
508
+ variable_count = zeroshot ? predictor_count : predictor_count * 2
509
+ log_term = Math.log2([num_candidates, 2].max)
510
+
511
+ [
512
+ (2 * variable_count * log_term).ceil,
513
+ (1.5 * num_candidates).ceil
514
+ ].max
515
+ end
516
+
517
+ sig { params(settings: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
518
+ def zero_shot_for_settings?(settings)
519
+ settings.fetch(:max_bootstrapped_examples, 0).to_i.zero? &&
520
+ settings.fetch(:max_labeled_examples, 0).to_i.zero?
521
+ end
522
+
523
+ sig { params(preset: AutoPreset).returns(T::Hash[Symbol, T.untyped]) }
524
+ def auto_settings_for(preset)
525
+ AUTO_PRESET_SETTINGS.fetch(preset) do
526
+ raise ArgumentError, "Unknown auto preset: #{preset.inspect}"
527
+ end
528
+ end
529
+
530
      # Phase 1: Bootstrap few-shot examples from training data.
      # Thin wrapper over Utils.create_n_fewshot_demo_sets, parameterized by
      # the bootstrap-related config settings and the optimizer's metric.
      # Returns a hash mapping predictor indices to arrays of demo sets.
      sig { params(program: T.untyped, trainset: T::Array[DSPy::Example]).returns(T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]) }
      def phase_1_bootstrap(program, trainset)
        Utils.create_n_fewshot_demo_sets(
          program,
          config.bootstrap_sets, # num_candidate_sets
          trainset,
          max_bootstrapped_demos: config.max_bootstrapped_examples,
          max_labeled_demos: config.max_labeled_examples,
          metric: @metric
        )
      end
543
+
544
      # Phase 2: Generate instruction candidates using grounded proposer.
      # Rebuilds @proposer with program/trainset context so its awareness
      # features (program_aware, use_dataset_summary) work, then asks it for
      # instruction proposals for every predictor.
      sig do
        params(
          program: T.untyped,
          trainset: T::Array[DSPy::Example],
          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
        ).returns(DSPy::Propose::GroundedProposer::ProposalResult)
      end
      def phase_2_propose_instructions(program, trainset, demo_candidates)
        # Get current instruction if available
        # NOTE(review): current_instruction and few_shot_examples below are
        # computed but never used in this method — confirm whether they are
        # vestigial or intended for a future proposer call.
        current_instruction = extract_current_instruction(program)

        # Use few-shot examples from bootstrap if available
        # Flatten demo sets from first predictor and take first 5 examples
        few_shot_examples = demo_candidates[0]&.flatten&.take(5) || []

        # Re-initialize proposer with program and trainset for awareness features
        # This enables program_aware and use_dataset_summary flags to work correctly
        proposer_config = DSPy::Propose::GroundedProposer::Config.new
        proposer_config.num_instruction_candidates = config.num_instruction_candidates

        @proposer = DSPy::Propose::GroundedProposer.new(
          config: proposer_config,
          program: program,
          trainset: trainset
        )

        @proposer.propose_instructions_for_program(
          trainset: trainset,
          program: program,
          demo_candidates: demo_candidates,
          trial_logs: @trial_history,
          num_instruction_candidates: config.num_instruction_candidates
        )
      end
579
+
580
      # Phase 3: Bayesian optimization to find best configuration.
      # Runs up to config.num_trials trials: each trial selects a candidate
      # (per the optimization strategy), evaluates it on evaluation_set,
      # records scores/logs, tracks the best result, and may stop early.
      # A failed trial is logged and skipped — it does not abort the loop.
      # Returns a hash bundle consumed by build_miprov2_result.
      sig do
        params(
          program: T.untyped,
          evaluation_set: T::Array[DSPy::Example],
          proposal_result: DSPy::Propose::GroundedProposer::ProposalResult,
          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
        ).returns(T::Hash[Symbol, T.untyped])
      end
      def phase_3_optimize(program, evaluation_set, proposal_result, demo_candidates)
        # Generate candidate configurations
        candidates = generate_candidate_configurations(proposal_result, demo_candidates)

        # Initialize optimization state
        optimization_state = initialize_optimization_state(candidates)

        # Initialize trial tracking structures.
        # Block-form Hash.new gives each key its OWN array (not shared).
        trial_logs = {}
        param_score_dict = Hash.new { |hash, key| hash[key] = [] }
        fully_evaled_param_combos = {}
        total_eval_calls = 0

        # Run optimization trials
        trials_completed = 0
        best_score = 0.0
        best_candidate = nil
        best_program = program
        best_evaluation_result = nil

        config.num_trials.times do |trial_idx|
          trials_completed = trial_idx + 1

          # Select next candidate based on optimization strategy
          candidate = select_next_candidate(candidates, optimization_state, trial_idx)
          batch_size = evaluation_set.size

          trial_logs[trials_completed] = create_trial_log_entry(
            trial_number: trials_completed,
            candidate: candidate,
            evaluation_type: :full,
            batch_size: batch_size
          )

          emit_event('trial_start', {
            trial_number: trials_completed,
            candidate_id: candidate.config_id,
            instruction_preview: candidate.instruction[0, 50],
            num_few_shot: candidate.few_shot_examples.size
          })

          begin
            # Evaluate candidate
            score, modified_program, evaluation_result = evaluate_candidate(program, candidate, evaluation_set)
            total_eval_calls += batch_size

            # Capture the per-predictor instructions actually applied.
            instructions_snapshot = extract_program_instructions(modified_program)
            trial_logs[trials_completed][:instructions] = instructions_snapshot unless instructions_snapshot.empty?
            trial_logs[trials_completed][:instruction] = instructions_snapshot[0] if instructions_snapshot.key?(0)

            # Update optimization state
            update_optimization_state(optimization_state, candidate, score)
            record_param_score(
              param_score_dict,
              candidate,
              score,
              evaluation_type: :full,
              instructions: instructions_snapshot
            )
            update_fully_evaled_param_combos(
              fully_evaled_param_combos,
              candidate,
              score,
              instructions: instructions_snapshot
            )

            # Track best result (first successful trial always wins)
            is_best = best_candidate.nil? || score > best_score
            if is_best
              best_score = score
              best_candidate = candidate
              best_program = modified_program
              best_evaluation_result = evaluation_result
            end

            finalize_trial_log_entry(
              trial_logs,
              trials_completed,
              score: score,
              evaluation_type: :full,
              batch_size: batch_size,
              total_eval_calls: total_eval_calls
            )

            emit_event('trial_complete', {
              trial_number: trials_completed,
              score: score,
              is_best: is_best,
              candidate_id: candidate.config_id
            })

            # Check early stopping
            if should_early_stop?(optimization_state, trial_idx)
              DSPy.logger.info("Early stopping at trial #{trials_completed}")
              break
            end

          rescue => error
            # Trial failed: record the error and continue with the next trial.
            finalize_trial_log_entry(
              trial_logs,
              trials_completed,
              score: nil,
              evaluation_type: :full,
              batch_size: batch_size,
              total_eval_calls: total_eval_calls,
              error: error.message
            )

            emit_event('trial_error', {
              trial_number: trials_completed,
              error: error.message,
              candidate_id: candidate.config_id
            })

            DSPy.logger.warn("Trial #{trials_completed} failed: #{error.message}")
          end
        end

        {
          best_score: best_score,
          best_candidate: best_candidate,
          best_program: best_program,
          best_evaluation_result: best_evaluation_result,
          trials_completed: trials_completed,
          optimization_state: optimization_state,
          evaluated_candidates: @evaluated_candidates,
          trial_logs: trial_logs,
          param_score_dict: param_score_dict,
          fully_evaled_param_combos: fully_evaled_param_combos,
          total_eval_calls: total_eval_calls
        }
      end
721
+
722
      # Generate candidate configurations from proposals and demo candidates.
      # Emits, in order: one baseline (untouched program), instruction-only
      # candidates, few-shot-only candidates, and combined candidates
      # (instruction + up to 3 demo maps each). Duplicates are filtered via
      # a content signature; config_ids are random and NOT content-derived.
      sig do
        params(
          proposal_result: DSPy::Propose::GroundedProposer::ProposalResult,
          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
        ).returns(T::Array[EvaluatedCandidate])
      end
      def generate_candidate_configurations(proposal_result, demo_candidates)
        candidates = []
        seen_signatures = Set.new

        # Shared appender: dedupes by candidate_signature before adding.
        add_candidate = lambda do |instruction:, few_shot_examples:, type:, metadata:, config_id:|
          signature = candidate_signature(type, instruction, metadata, few_shot_examples)
          next if seen_signatures.include?(signature)

          seen_signatures << signature
          candidates << EvaluatedCandidate.new(
            instruction: instruction,
            few_shot_examples: few_shot_examples,
            type: type,
            metadata: metadata,
            config_id: config_id
          )
        end

        # Prefer per-predictor proposals; fall back to treating the flat
        # candidate list as belonging to predictor 0.
        predictor_instruction_map = if proposal_result.respond_to?(:predictor_instructions) && proposal_result.predictor_instructions.any?
          proposal_result.predictor_instructions
        else
          { 0 => proposal_result.candidate_instructions }
        end

        instruction_maps = build_instruction_maps(predictor_instruction_map)
        demo_maps = build_demo_maps(demo_candidates)

        # Base configuration (no modifications)
        add_candidate.call(
          instruction: "",
          few_shot_examples: [],
          type: CandidateType::Baseline,
          metadata: {
            instructions_map: {},
            demos_map: {}
          },
          config_id: SecureRandom.hex(6)
        )

        # Instruction-only candidates: one per per-predictor assignment.
        instruction_maps.each_with_index do |instruction_map, combo_idx|
          primary_instruction = instruction_map[0] || instruction_map.values.first || ""
          add_candidate.call(
            instruction: primary_instruction,
            few_shot_examples: [],
            type: CandidateType::InstructionOnly,
            metadata: {
              proposal_rank: combo_idx,
              instructions_map: duplicate_instruction_map(instruction_map),
              demos_map: {}
            },
            config_id: SecureRandom.hex(6)
          )
        end

        # Few-shot-only candidates: one per demo-map combination.
        demo_maps.each_with_index do |demo_map, idx|
          next if demo_map.empty?

          flattened_examples = demo_map.values.flatten
          add_candidate.call(
            instruction: "",
            few_shot_examples: flattened_examples,
            type: CandidateType::FewShotOnly,
            metadata: {
              bootstrap_rank: idx,
              instructions_map: {},
              demos_map: duplicate_demo_map(demo_map)
            },
            config_id: SecureRandom.hex(6)
          )
        end

        # Combined candidates (instruction + few-shot); caps the demo maps
        # at 3 per instruction map to bound the cross product.
        instruction_maps.each_with_index do |instruction_map, combo_idx|
          primary_instruction = instruction_map[0] || instruction_map.values.first || ""
          demo_maps.first(3).each_with_index do |demo_map, demo_idx|
            next if demo_map.empty?

            flattened_examples = demo_map.values.flatten
            add_candidate.call(
              instruction: primary_instruction,
              few_shot_examples: flattened_examples,
              type: CandidateType::Combined,
              metadata: {
                instruction_rank: combo_idx,
                bootstrap_rank: demo_idx,
                instructions_map: duplicate_instruction_map(instruction_map),
                demos_map: duplicate_demo_map(demo_map)
              },
              config_id: SecureRandom.hex(6)
            )
          end
        end

        candidates
      end
824
+
825
+ sig { params(predictor_instruction_map: T::Hash[Integer, T::Array[String]]).returns(T::Array[T::Hash[Integer, String]]) }
826
+ def build_instruction_maps(predictor_instruction_map)
827
+ return [{}] if predictor_instruction_map.nil? || predictor_instruction_map.empty?
828
+
829
+ normalized = predictor_instruction_map.each_with_object({}) do |(index, instructions), memo|
830
+ next if instructions.nil? || instructions.empty?
831
+ memo[index] = instructions.take(3)
832
+ end
833
+
834
+ return [{}] if normalized.empty?
835
+
836
+ cartesian_product(normalized)
837
+ end
838
+
839
+ sig do
840
+ params(demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]).returns(T::Array[T::Hash[Integer, T::Array[DSPy::FewShotExample]]])
841
+ end
842
+ def build_demo_maps(demo_candidates)
843
+ return [{}] if demo_candidates.nil? || demo_candidates.empty?
844
+
845
+ normalized = demo_candidates.each_with_object({}) do |(index, sets), memo|
846
+ next if sets.nil? || sets.empty?
847
+ memo[index] = sets.take(3)
848
+ end
849
+
850
+ return [{}] if normalized.empty?
851
+
852
+ cartesian_product(normalized)
853
+ end
854
+
855
+ sig do
856
+ params(options_hash: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Array[T::Hash[Integer, T.untyped]])
857
+ end
858
+ def cartesian_product(options_hash)
859
+ options_hash.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
860
+ next acc if values.nil? || values.empty?
861
+
862
+ acc.flat_map do |existing|
863
+ values.map do |value|
864
+ existing.merge(index => value)
865
+ end
866
+ end
867
+ end
868
+ end
869
+
870
+ sig { params(instruction_map: T::Hash[Integer, String]).returns(T::Hash[Integer, String]) }
871
+ def duplicate_instruction_map(instruction_map)
872
+ instruction_map.each_with_object({}) do |(index, instruction), memo|
873
+ memo[index] = instruction.is_a?(String) ? instruction.dup : instruction
874
+ end
875
+ end
876
+
877
+ sig do
878
+ params(demo_map: T::Hash[Integer, T::Array[DSPy::FewShotExample]]).returns(T::Hash[Integer, T::Array[DSPy::FewShotExample]])
879
+ end
880
+ def duplicate_demo_map(demo_map)
881
+ demo_map.each_with_object({}) do |(index, demos), memo|
882
+ next if demos.nil?
883
+ memo[index] = demos.map { |demo| demo }
884
+ end
885
+ end
886
+
887
      # Canonical JSON signature of a candidate's full content, used to
      # dedupe candidates in generate_candidate_configurations. Maps are
      # normalized (sorted keys, serialized demos) so logically identical
      # candidates produce byte-identical signatures.
      sig do
        params(
          type: CandidateType,
          instruction: String,
          metadata: T::Hash[Symbol, T.untyped],
          few_shot_examples: T::Array[T.untyped]
        ).returns(String)
      end
      def candidate_signature(type, instruction, metadata, few_shot_examples)
        JSON.generate(
          type: type.serialize,
          instruction: instruction,
          instructions_map: normalize_instruction_map(metadata[:instructions_map] || {}),
          demos_map: normalize_demo_map(metadata[:demos_map] || {}),
          few_shot_examples: few_shot_examples.map { |example| serialize_few_shot_example(example) }
        )
      end
904
+
905
+ sig { params(map: T::Hash[Integer, T.untyped]).returns(T::Hash[Integer, String]) }
906
+ def normalize_instruction_map(map)
907
+ map.sort_by { |index, _| index }.each_with_object({}) do |(index, value), memo|
908
+ memo[index] = value.to_s
909
+ end
910
+ end
911
+
912
+ sig { params(map: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Hash[Integer, T::Array[T.untyped]]) }
913
+ def normalize_demo_map(map)
914
+ map.sort_by { |index, _| index }.each_with_object({}) do |(index, demos), memo|
915
+ memo[index] = Array(demos).map { |demo| serialize_few_shot_example(demo) }
916
+ end
917
+ end
918
+
919
+ sig { params(example: T.untyped).returns(T.untyped) }
920
+ def serialize_few_shot_example(example)
921
+ case example
922
+ when DSPy::FewShotExample
923
+ deep_dup(example.to_h)
924
+ when DSPy::Example
925
+ {
926
+ input: deep_dup(example.input_values),
927
+ expected: deep_dup(example.expected_values)
928
+ }
929
+ when Hash
930
+ deep_dup(example)
931
+ else
932
+ example
933
+ end
934
+ end
935
+
936
+ sig { params(examples: T::Array[T.untyped]).returns(T::Array[DSPy::FewShotExample]) }
937
+ def normalize_few_shot_examples(examples)
938
+ examples.map do |example|
939
+ if example.is_a?(DSPy::FewShotExample)
940
+ example
941
+ elsif example.is_a?(DSPy::Example)
942
+ DSPy::FewShotExample.new(
943
+ input: example.input_values,
944
+ output: example.expected_values,
945
+ reasoning: extract_reasoning_from_example(example)
946
+ )
947
+ else
948
+ example
949
+ end
950
+ end
951
+ end
952
+
953
+ sig { params(predictor: T.untyped, examples: T::Array[DSPy::FewShotExample]).void }
954
+ def assign_predictor_examples(predictor, examples)
955
+ predictor.demos = examples if predictor.respond_to?(:demos=)
956
+ return unless predictor.respond_to?(:prompt)
957
+
958
+ cloned_examples = examples.map { |ex| ex }
959
+ predictor.prompt.instance_variable_set(:@few_shot_examples, cloned_examples.freeze)
960
+ end
961
+
962
+ # Initialize optimization state for candidate selection
963
+ sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Hash[Symbol, T.untyped]) }
964
+ def initialize_optimization_state(candidates)
965
+ {
966
+ candidates: candidates,
967
+ scores: {},
968
+ exploration_counts: Hash.new(0),
969
+ temperature: config.init_temperature,
970
+ best_score_history: [],
971
+ diversity_scores: {},
972
+ no_improvement_count: 0
973
+ }
974
+ end
975
+
976
+ # Select next candidate based on optimization strategy
977
+ sig do
978
+ params(
979
+ candidates: T::Array[EvaluatedCandidate],
980
+ state: T::Hash[Symbol, T.untyped],
981
+ trial_idx: Integer
982
+ ).returns(EvaluatedCandidate)
983
+ end
984
+ def select_next_candidate(candidates, state, trial_idx)
985
+ case config.optimization_strategy
986
+ when OptimizationStrategy::Greedy
987
+ select_candidate_greedy(candidates, state)
988
+ when OptimizationStrategy::Adaptive
989
+ select_candidate_adaptive(candidates, state, trial_idx)
990
+ when OptimizationStrategy::Bayesian
991
+ select_candidate_bayesian(candidates, state, trial_idx)
992
+ else
993
+ candidates.sample # Random fallback
994
+ end
995
+ end
996
+
997
+ # Greedy candidate selection (exploit best known configurations)
998
+ sig { params(candidates: T::Array[EvaluatedCandidate], state: T::Hash[Symbol, T.untyped]).returns(EvaluatedCandidate) }
999
+ def select_candidate_greedy(candidates, state)
1000
+ # Prioritize unexplored candidates, then highest scoring
1001
+ unexplored = candidates.reject { |c| state[:scores].key?(c.config_id) }
1002
+ return unexplored.sample if unexplored.any?
1003
+
1004
+ # Among explored, pick the best
1005
+ scored_candidates = candidates.select { |c| state[:scores].key?(c.config_id) }
1006
+ scored_candidates.max_by { |c| state[:scores][c.config_id] } || candidates.first
1007
+ end
1008
+
1009
+ # Adaptive candidate selection (balance exploration and exploitation)
1010
+ sig do
1011
+ params(
1012
+ candidates: T::Array[EvaluatedCandidate],
1013
+ state: T::Hash[Symbol, T.untyped],
1014
+ trial_idx: Integer
1015
+ ).returns(EvaluatedCandidate)
1016
+ end
1017
+ def select_candidate_adaptive(candidates, state, trial_idx)
1018
+ # Update temperature based on progress
1019
+ progress = trial_idx.to_f / config.num_trials
1020
+ state[:temperature] = config.init_temperature * (1 - progress) + config.final_temperature * progress
1021
+
1022
+ # Calculate selection scores combining exploitation and exploration
1023
+ candidate_scores = candidates.map do |candidate|
1024
+ exploitation_score = state[:scores][candidate.config_id] || 0.0
1025
+ exploration_bonus = 1.0 / (state[:exploration_counts][candidate.config_id] + 1)
1026
+
1027
+ total_score = exploitation_score + state[:temperature] * exploration_bonus
1028
+ [candidate, total_score]
1029
+ end
1030
+
1031
+ # Select using softmax with temperature
1032
+ if state[:temperature] > 0.01
1033
+ # Probabilistic selection
1034
+ weights = candidate_scores.map { |_, score| Math.exp(score / state[:temperature]) }
1035
+ total_weight = weights.sum
1036
+ probabilities = weights.map { |w| w / total_weight }
1037
+
1038
+ random_value = rand
1039
+ cumulative = 0.0
1040
+ candidate_scores.each_with_index do |(candidate, _), idx|
1041
+ cumulative += probabilities[idx]
1042
+ return candidate if random_value <= cumulative
1043
+ end
1044
+ end
1045
+
1046
+ # Fallback to highest scoring
1047
+ candidate_scores.max_by { |_, score| score }.first
1048
+ end
1049
+
1050
# Bayesian candidate selection: fit a Gaussian Process to the scores observed
# so far and pick the candidate maximizing an Upper Confidence Bound
# acquisition function. Falls back to adaptive selection while fewer than 3
# observations exist, or whenever the GP raises.
sig do
  params(
    candidates: T::Array[EvaluatedCandidate],
    state: T::Hash[Symbol, T.untyped],
    trial_idx: Integer
  ).returns(EvaluatedCandidate)
end
def select_candidate_bayesian(candidates, state, trial_idx)
  # Need at least 3 observations to fit GP, otherwise fall back to adaptive
  return select_candidate_adaptive(candidates, state, trial_idx) if state[:scores].size < 3

  # Get scored candidates for training the GP
  scored_candidates = candidates.select { |c| state[:scores].key?(c.config_id) }
  return select_candidate_adaptive(candidates, state, trial_idx) if scored_candidates.size < 3

  begin
    # Encode candidates as numerical features
    all_candidate_features = encode_candidates_for_gp(candidates)
    scored_features = encode_candidates_for_gp(scored_candidates)
    scored_targets = scored_candidates.map { |c| state[:scores][c.config_id].to_f }

    # Train Gaussian Process on (features, score) pairs observed so far.
    gp = DSPy::Optimizers::GaussianProcess.new(
      length_scale: 1.0,
      signal_variance: 1.0,
      noise_variance: 0.01
    )
    gp.fit(scored_features, scored_targets)

    # Predict mean and uncertainty for all candidates
    means, stds = gp.predict(all_candidate_features, return_std: true)

    # Upper Confidence Bound (UCB) acquisition function; exploration weight
    # grows slowly (sqrt-log) with the trial index.
    kappa = 2.0 * Math.sqrt(Math.log(trial_idx + 1)) # Exploration parameter
    acquisition_scores = means.to_a.zip(stds.to_a).map { |m, s| m + kappa * s }

    # Select candidate with highest acquisition score
    best_idx = acquisition_scores.each_with_index.max_by { |score, _| score }[1]
    candidates[best_idx]

  rescue => e
    # If GP fails for any reason, fall back to adaptive selection
    DSPy.logger.warn("Bayesian optimization failed: #{e.message}. Falling back to adaptive selection.")
    select_candidate_adaptive(candidates, state, trial_idx)
  end
end
1097
+
1098
+ private
1099
+
1100
+
1101
# Encode candidates as numerical feature vectors for the Gaussian Process.
#
# Features must be stable for a given candidate across runs. Ruby's
# String#hash is seeded per process, so hashing config_id directly would
# yield different features on every run; an MD5 digest of the config id is
# deterministic everywhere. Could be more sophisticated (e.g. instruction
# embeddings) in the future.
sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Array[T::Array[Float]]) }
def encode_candidates_for_gp(candidates)
  candidates.map do |candidate|
    # Deterministic integer derived from the candidate's config id.
    config_hash = Digest::MD5.hexdigest(candidate.config_id.to_s).to_i(16)

    features = []
    features << (config_hash % 1000).to_f / 1000.0               # low digits, normalized to [0, 1)
    features << ((config_hash / 1000) % 1000).to_f / 1000.0      # middle digits
    features << ((config_hash / 1_000_000) % 1000).to_f / 1000.0 # higher digits

    # Instruction length as a rough proxy feature (Python-compatible: no cap).
    instruction = candidate.instruction
    features << if instruction && !instruction.empty?
      instruction.length.to_f / 100.0
    else
      0.5 # neutral default when there is no instruction
    end

    features
  end
end
1127
+
1128
# Evaluate a candidate configuration: apply it to the program, run the
# evaluation set (concurrently when worthwhile), and record the candidate.
#
# Returns [pass_rate, configured_program, full_batch_result].
sig do
  params(
    program: T.untyped,
    candidate: EvaluatedCandidate,
    evaluation_set: T::Array[DSPy::Example]
  ).returns([Float, T.untyped, DSPy::Evaluate::BatchEvaluationResult])
end
def evaluate_candidate(program, candidate, evaluation_set)
  # Apply candidate configuration to program
  modified_program = apply_candidate_configuration(program, candidate)

  # Evaluate modified program — chunked across threads when the set is
  # larger than the configured minibatch size.
  evaluation_result = if use_concurrent_evaluation?(evaluation_set)
    evaluate_candidate_concurrently(modified_program, evaluation_set)
  else
    evaluate_program(modified_program, evaluation_set)
  end

  # Store evaluation details
  @evaluated_candidates << candidate

  [evaluation_result.pass_rate, modified_program, evaluation_result]
end
1152
+
1153
+ sig { params(evaluation_set: T::Array[DSPy::Example]).returns(T::Boolean) }
1154
+ def use_concurrent_evaluation?(evaluation_set)
1155
+ minibatch_size = config.minibatch_size
1156
+ return false unless minibatch_size&.positive?
1157
+ return false unless config.num_threads && config.num_threads > 1
1158
+
1159
+ evaluation_set.size > minibatch_size
1160
+ end
1161
+
1162
# Evaluate an example set by splitting it into minibatch-sized chunks, running
# each chunk on a fixed-size thread pool, and recombining the chunk results.
# Falls back to a single synchronous evaluation when the set yields only one
# chunk. The pool is always shut down in the ensure clause.
sig do
  params(
    modified_program: T.untyped,
    evaluation_set: T::Array[DSPy::Example]
  ).returns(DSPy::Evaluate::BatchEvaluationResult)
end
def evaluate_candidate_concurrently(modified_program, evaluation_set)
  chunk_size = T.must(config.minibatch_size)
  chunks = evaluation_set.each_slice(chunk_size).map(&:dup)
  return evaluate_program(modified_program, evaluation_set) if chunks.size <= 1

  # Never spin up more threads than there are chunks.
  pool_size = [config.num_threads, chunks.size].min
  pool_size = 1 if pool_size <= 0
  executor = Concurrent::FixedThreadPool.new(pool_size)

  futures = chunks.map do |chunk|
    Concurrent::Promises.future_on(executor) do
      evaluate_program(modified_program, chunk)
    end
  end

  # value! blocks until done and re-raises any exception captured in a future.
  results = futures.map(&:value!)
  combine_batch_results(results)
ensure
  if executor
    executor.shutdown
    executor.wait_for_termination
  end
end
1191
+
1192
+ sig do
1193
+ params(batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult]).returns(DSPy::Evaluate::BatchEvaluationResult)
1194
+ end
1195
+ def combine_batch_results(batch_results)
1196
+ return DSPy::Evaluate::BatchEvaluationResult.new(results: [], aggregated_metrics: {}) if batch_results.empty?
1197
+
1198
+ combined_results = batch_results.flat_map(&:results)
1199
+ total_examples = batch_results.sum(&:total_examples)
1200
+ aggregated_metrics = merge_aggregated_metrics(batch_results, total_examples)
1201
+
1202
+ DSPy::Evaluate::BatchEvaluationResult.new(
1203
+ results: combined_results,
1204
+ aggregated_metrics: aggregated_metrics
1205
+ )
1206
+ end
1207
+
1208
# Merge the aggregated_metrics hashes from several chunk results into one.
# Numeric metrics are combined as averages weighted by each chunk's example
# count; non-numeric values fall back to the last non-nil value seen. Keys
# missing from a chunk are simply skipped.
sig do
  params(
    batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult],
    total_examples: Integer
  ).returns(T::Hash[Symbol, T.untyped])
end
def merge_aggregated_metrics(batch_results, total_examples)
  return {} if total_examples.zero?

  keys = batch_results.flat_map { |res| res.aggregated_metrics.keys }.uniq
  keys.each_with_object({}) do |key, memo|
    numeric_weight = 0.0
    numeric_sum = 0.0
    fallback_value = nil

    batch_results.each do |res|
      value = res.aggregated_metrics[key]
      next if value.nil?

      if value.is_a?(Numeric)
        # Weight each chunk's metric by how many examples it covered.
        numeric_sum += value.to_f * res.total_examples
        numeric_weight += res.total_examples
      else
        # Non-numeric metric: keep the most recent non-nil value.
        fallback_value = value
      end
    end

    if numeric_weight.positive?
      memo[key] = numeric_sum / numeric_weight
    elsif fallback_value
      memo[key] = fallback_value
    end
  end
end
1242
+
1243
# Apply a candidate's instruction/demo configuration to a program, returning
# the configured program. Per-predictor maps (instructions_map / demos_map in
# the candidate metadata) take precedence; the top-level instruction and
# global few-shot examples are applied afterwards when the program supports
# them.
#
# NOTE(review): `clone` is a shallow copy, so the predictor objects mutated
# below appear to be shared with the original program — confirm that
# Utils.set_signature / assign_predictor_examples are safe to apply here.
sig { params(program: T.untyped, candidate: EvaluatedCandidate).returns(T.untyped) }
def apply_candidate_configuration(program, candidate)
  instructions_map = candidate.metadata[:instructions_map] || {}
  demos_map = candidate.metadata[:demos_map] || {}

  modified_program = program
  if modified_program.respond_to?(:predictors) && (instructions_map.any? || demos_map.any?)
    modified_program = modified_program.clone
    modified_program.predictors.each_with_index do |predictor, idx|
      if instructions_map.key?(idx)
        signature = Utils.get_signature(predictor)
        updated_signature = signature.with_instructions(instructions_map[idx])
        Utils.set_signature(predictor, updated_signature)
      end

      if demos_map.key?(idx)
        normalized_examples = normalize_few_shot_examples(demos_map[idx])
        assign_predictor_examples(predictor, normalized_examples)
      end
    end
  end

  # Apply instruction if provided (top-level programs still respect with_instruction)
  if !candidate.instruction.empty? && modified_program.respond_to?(:with_instruction)
    modified_program = modified_program.with_instruction(candidate.instruction)
  end

  # Global examples only apply when no per-predictor demos were set (or the
  # program exposes no predictors collection at all).
  should_apply_global_examples = candidate.few_shot_examples.any? &&
    modified_program.respond_to?(:with_examples) &&
    (demos_map.empty? || !modified_program.respond_to?(:predictors))

  if should_apply_global_examples
    normalized_few_shot = normalize_few_shot_examples(candidate.few_shot_examples)
    modified_program = modified_program.with_examples(normalized_few_shot)
  end

  modified_program
end
1282
+
1283
# Record a trial's score in the optimization state and refresh the
# early-stopping counters.
#
# NOTE(review): despite its name, :best_score_history stores every score in
# trial order, and "improvement" is judged against the immediately previous
# trial's score, not the running best — confirm this matches the reference
# implementation before changing it.
sig do
  params(
    state: T::Hash[Symbol, T.untyped],
    candidate: EvaluatedCandidate,
    score: Float
  ).void
end
def update_optimization_state(state, candidate, score)
  state[:scores][candidate.config_id] = score
  state[:exploration_counts][candidate.config_id] += 1
  state[:best_score_history] << score

  # Track diversity if enabled
  if config.track_diversity
    state[:diversity_scores][candidate.config_id] = calculate_diversity_score(candidate)
  end

  # Reset the patience counter only when this trial beat the previous one.
  if state[:best_score_history].size > 1 && score > state[:best_score_history][-2]
    state[:no_improvement_count] = 0
  else
    state[:no_improvement_count] += 1
  end
end
1308
+
1309
+ # Check if optimization should stop early
1310
+ sig { params(state: T::Hash[Symbol, T.untyped], trial_idx: Integer).returns(T::Boolean) }
1311
+ def should_early_stop?(state, trial_idx)
1312
+ # Don't stop too early
1313
+ return false if trial_idx < config.early_stopping_patience
1314
+
1315
+ # Stop if no improvement for patience trials
1316
+ state[:no_improvement_count] >= config.early_stopping_patience
1317
+ end
1318
+
1319
+ # Calculate diversity score for candidate (Python-compatible: only few-shot count)
1320
+ sig { params(candidate: EvaluatedCandidate).returns(Float) }
1321
+ def calculate_diversity_score(candidate)
1322
+ # Python DSPy doesn't use instruction length for diversity, only few-shot count
1323
+ few_shot_diversity = candidate.few_shot_examples.size / 10.0
1324
+
1325
+ [few_shot_diversity, 1.0].min
1326
+ end
1327
+
1328
# Assemble the final MIPROv2Result from the optimization run: best program
# and score, trial history, bootstrap/proposal statistics, and a serialized
# optimization trace (trial logs, per-param scores, fully evaluated combos).
sig do
  params(
    optimization_result: T::Hash[Symbol, T.untyped],
    demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]],
    proposal_result: DSPy::Propose::GroundedProposer::ProposalResult
  ).returns(MIPROv2Result)
end
def build_miprov2_result(optimization_result, demo_candidates, proposal_result)
  best_candidate = optimization_result[:best_candidate]
  best_program = optimization_result[:best_program]
  best_score = optimization_result[:best_score]
  best_evaluation_result = optimization_result[:best_evaluation_result]

  scores = { pass_rate: best_score }

  history = {
    total_trials: optimization_result[:trials_completed],
    optimization_strategy: optimization_strategy_name,
    # Finishing under budget implies the early-stopping rule fired.
    early_stopped: optimization_result[:trials_completed] < config.num_trials,
    score_history: optimization_result[:optimization_state][:best_score_history],
    total_eval_calls: optimization_result[:total_eval_calls]
  }

  metadata = {
    optimizer: "MIPROv2",
    auto_mode: infer_auto_mode,
    optimization_strategy: optimization_strategy_name,
    best_instruction: best_candidate&.instruction || "",
    best_few_shot_count: best_candidate&.few_shot_examples&.size || 0,
    best_candidate_type: best_candidate&.type&.serialize || "unknown",
    optimization_timestamp: Time.now.iso8601
  }

  # Create bootstrap statistics from demo_candidates
  num_predictors = demo_candidates.keys.size
  sets_per_predictor = demo_candidates.values.map(&:size)
  all_demo_sets = demo_candidates.values.flat_map { |sets| sets }
  bootstrap_statistics = {
    num_predictors: num_predictors,
    demo_sets_per_predictor: sets_per_predictor.max || 0,
    avg_demos_per_set: all_demo_sets.empty? ? 0 : all_demo_sets.map(&:size).sum.to_f / all_demo_sets.size
  }
  bootstrap_statistics[:per_predictor_demo_counts] = sets_per_predictor if sets_per_predictor.any?

  # Flatten the run's bookkeeping into a single JSON-friendly trace hash.
  optimization_trace = serialize_optimization_trace(optimization_result[:optimization_state])
  optimization_trace[:trial_logs] = serialize_trial_logs(optimization_result[:trial_logs])
  optimization_trace[:param_score_dict] = serialize_param_score_dict(optimization_result[:param_score_dict])
  optimization_trace[:fully_evaled_param_combos] = serialize_fully_evaled_param_combos(optimization_result[:fully_evaled_param_combos])
  optimization_trace[:total_eval_calls] = optimization_result[:total_eval_calls]

  MIPROv2Result.new(
    optimized_program: best_program,
    scores: scores,
    history: history,
    best_score_name: "pass_rate",
    best_score_value: best_score,
    metadata: metadata,
    evaluated_candidates: @evaluated_candidates,
    optimization_trace: optimization_trace,
    bootstrap_statistics: bootstrap_statistics,
    proposal_statistics: proposal_result.analysis,
    best_evaluation_result: best_evaluation_result
  )
end
1393
+
1394
+ # Serialize optimization trace for better JSON output
1395
+ sig { params(optimization_state: T.nilable(T::Hash[Symbol, T.untyped])).returns(T::Hash[Symbol, T.untyped]) }
1396
+ def serialize_optimization_trace(optimization_state)
1397
+ return {} unless optimization_state
1398
+
1399
+ serialized_trace = optimization_state.dup
1400
+
1401
+ # Convert candidate objects to their hash representations
1402
+ if serialized_trace[:candidates]
1403
+ serialized_trace[:candidates] = serialized_trace[:candidates].map(&:to_h)
1404
+ end
1405
+
1406
+ serialized_trace
1407
+ end
1408
+
1409
# Build the initial (in-progress) trial-log entry for a candidate evaluation.
# The trial number is the key in the trial_logs hash, so it is not stored in
# the entry itself — this mirrors the Python implementation's interface.
sig do
  params(
    trial_number: Integer,
    candidate: EvaluatedCandidate,
    evaluation_type: Symbol,
    batch_size: Integer
  ).returns(T::Hash[Symbol, T.untyped])
end
def create_trial_log_entry(trial_number:, candidate:, evaluation_type:, batch_size:)
  # Preserve interface parity with Python implementation (trial number stored implicitly via hash key)
  trial_number # no-op to acknowledge parameter usage
  instructions_map = candidate.metadata[:instructions_map] || {}
  demos_map = candidate.metadata[:demos_map] || {}
  entry = {
    candidate_id: candidate.config_id,
    candidate_type: candidate.type.serialize,
    instruction_preview: candidate.instruction.to_s[0, 160], # truncated for log readability
    few_shot_count: candidate.few_shot_examples.size,
    metadata: deep_dup(candidate.metadata),
    evaluation_type: evaluation_type,
    batch_size: batch_size,
    status: :in_progress,
    started_at: Time.now.iso8601
  }
  if instructions_map.any?
    # Per-predictor instructions; index 0 doubles as the primary instruction.
    entry[:instructions] = duplicate_instruction_map(instructions_map)
    entry[:instruction] = entry[:instructions][0] if entry[:instructions].key?(0)
  elsif candidate.instruction && !candidate.instruction.empty?
    # Single-instruction candidate: synthesize a one-entry map.
    predictor_index = candidate.metadata[:predictor_index] || 0
    entry[:instruction] = candidate.instruction
    entry[:instructions] = { predictor_index => candidate.instruction }
  end
  entry[:few_shot_map] = duplicate_demo_map(demos_map) if demos_map.any?
  entry
end
1444
+
1445
+ sig do
1446
+ params(
1447
+ trial_logs: T::Hash[Integer, T::Hash[Symbol, T.untyped]],
1448
+ trial_number: Integer,
1449
+ score: T.nilable(Float),
1450
+ evaluation_type: Symbol,
1451
+ batch_size: Integer,
1452
+ total_eval_calls: Integer,
1453
+ error: T.nilable(String)
1454
+ ).void
1455
+ end
1456
+ def finalize_trial_log_entry(trial_logs, trial_number, score:, evaluation_type:, batch_size:, total_eval_calls:, error: nil)
1457
+ entry = trial_logs[trial_number] || {}
1458
+ entry[:score] = score if score
1459
+ entry[:evaluation_type] = evaluation_type
1460
+ entry[:batch_size] = batch_size
1461
+ entry[:total_eval_calls] = total_eval_calls
1462
+ entry[:status] = error ? :error : :completed
1463
+ entry[:error] = error if error
1464
+ entry[:completed_at] = Time.now.iso8601
1465
+ trial_logs[trial_number] = entry
1466
+ end
1467
+
1468
# Append a scored-evaluation record for a candidate to param_score_dict.
#
# The caller's `instructions` hash is duplicated before any fallback entry is
# added, so recording a score never mutates caller-owned state (the previous
# implementation wrote the fallback instruction directly into the caller's
# hash). The per-candidate bucket is created on demand so the dict does not
# need a default-block Hash.
sig do
  params(
    param_score_dict: T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]],
    candidate: EvaluatedCandidate,
    score: Float,
    evaluation_type: Symbol,
    instructions: T.nilable(T::Hash[Integer, String])
  ).void
end
def record_param_score(param_score_dict, candidate, score, evaluation_type:, instructions: nil)
  # Work on a copy so the fallback entry below cannot leak back to the caller.
  instructions_hash = (instructions || {}).dup
  if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
    predictor_index = candidate.metadata[:predictor_index] || 0
    instructions_hash[predictor_index] = candidate.instruction
  end

  record = {
    candidate_id: candidate.config_id,
    candidate_type: candidate.type.serialize,
    score: score,
    evaluation_type: evaluation_type,
    timestamp: Time.now.iso8601,
    metadata: deep_dup(candidate.metadata)
  }
  # Index 0 is treated as the primary predictor's instruction.
  primary_instruction = instructions_hash[0] || candidate.instruction
  record[:instruction] = primary_instruction if primary_instruction && !primary_instruction.empty?
  record[:instructions] = instructions_hash unless instructions_hash.empty?

  # Ensure a bucket exists even if the dict was not created with a default.
  (param_score_dict[candidate.config_id] ||= []) << record
end
1498
+
1499
# Record the best score seen for a fully evaluated parameter combination,
# updating only when the new score beats the stored one.
#
# The caller's `instructions` hash is duplicated before the fallback entry is
# added, so this bookkeeping never mutates caller-owned state (the previous
# implementation wrote into the caller's hash).
sig do
  params(
    fully_evaled_param_combos: T::Hash[String, T::Hash[Symbol, T.untyped]],
    candidate: EvaluatedCandidate,
    score: Float,
    instructions: T.nilable(T::Hash[Integer, String])
  ).void
end
def update_fully_evaled_param_combos(fully_evaled_param_combos, candidate, score, instructions: nil)
  existing = fully_evaled_param_combos[candidate.config_id]
  return unless existing.nil? || score > existing[:score]

  # Copy before adding the fallback so the caller's hash is untouched.
  instructions_hash = (instructions || {}).dup
  if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
    predictor_index = candidate.metadata[:predictor_index] || 0
    instructions_hash[predictor_index] = candidate.instruction
  end

  combo = {
    candidate_id: candidate.config_id,
    candidate_type: candidate.type.serialize,
    score: score,
    metadata: deep_dup(candidate.metadata),
    updated_at: Time.now.iso8601
  }
  unless instructions_hash.empty?
    combo[:instructions] = instructions_hash
    combo[:instruction] = instructions_hash[0] || candidate.instruction
  end
  fully_evaled_param_combos[candidate.config_id] = combo
end
1529
+
1530
+ sig { params(trial_logs: T.nilable(T::Hash[Integer, T::Hash[Symbol, T.untyped]])).returns(T::Hash[Integer, T::Hash[Symbol, T.untyped]]) }
1531
+ def serialize_trial_logs(trial_logs)
1532
+ return {} unless trial_logs
1533
+
1534
+ allowed_keys = [
1535
+ :candidate_id,
1536
+ :candidate_type,
1537
+ :instruction_preview,
1538
+ :instruction,
1539
+ :instructions,
1540
+ :few_shot_count,
1541
+ :metadata,
1542
+ :evaluation_type,
1543
+ :batch_size,
1544
+ :score,
1545
+ :status,
1546
+ :error,
1547
+ :started_at,
1548
+ :completed_at,
1549
+ :total_eval_calls
1550
+ ]
1551
+
1552
+ trial_logs.transform_values do |entry|
1553
+ entry.each_with_object({}) do |(key, value), memo|
1554
+ memo[key] = value if allowed_keys.include?(key)
1555
+ end
1556
+ end
1557
+ end
1558
+
1559
+ sig { params(param_score_dict: T.nilable(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]])).returns(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]]) }
1560
+ def serialize_param_score_dict(param_score_dict)
1561
+ return {} unless param_score_dict
1562
+
1563
+ allowed_keys = [:candidate_id, :candidate_type, :score, :evaluation_type, :timestamp, :metadata, :instruction, :instructions]
1564
+
1565
+ param_score_dict.transform_values do |records|
1566
+ records.map do |record|
1567
+ record.each_with_object({}) do |(key, value), memo|
1568
+ memo[key] = value if allowed_keys.include?(key)
1569
+ end
1570
+ end
1571
+ end
1572
+ end
1573
+
1574
+ sig { params(fully_evaled_param_combos: T.nilable(T::Hash[String, T::Hash[Symbol, T.untyped]])).returns(T::Hash[String, T::Hash[Symbol, T.untyped]]) }
1575
+ def serialize_fully_evaled_param_combos(fully_evaled_param_combos)
1576
+ return {} unless fully_evaled_param_combos
1577
+
1578
+ allowed_keys = [:candidate_id, :candidate_type, :score, :metadata, :updated_at, :instruction, :instructions]
1579
+
1580
+ fully_evaled_param_combos.transform_values do |record|
1581
+ record.each_with_object({}) do |(key, value), memo|
1582
+ memo[key] = value if allowed_keys.include?(key)
1583
+ end
1584
+ end
1585
+ end
1586
+
1587
+ sig { params(value: T.untyped).returns(T.untyped) }
1588
+ def deep_dup(value)
1589
+ case value
1590
+ when Hash
1591
+ value.each_with_object({}) { |(k, v), memo| memo[k] = deep_dup(v) }
1592
+ when Array
1593
+ value.map { |element| deep_dup(element) }
1594
+ else
1595
+ value
1596
+ end
1597
+ end
1598
+
1599
+ # Helper methods
1600
+ sig { returns(String) }
1601
+ def optimization_strategy_name
1602
+ strategy = config.optimization_strategy
1603
+ return strategy.serialize if strategy.respond_to?(:serialize)
1604
+
1605
+ strategy.to_s
1606
+ end
1607
+
1608
+ sig { params(program: T.untyped).returns(T.nilable(String)) }
1609
+ def extract_current_instruction(program)
1610
+ if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
1611
+ program.prompt.instruction
1612
+ elsif program.respond_to?(:system_signature)
1613
+ system_sig = program.system_signature
1614
+ system_sig.is_a?(String) ? system_sig : nil
1615
+ else
1616
+ nil
1617
+ end
1618
+ end
1619
+
1620
+ sig { params(program: T.untyped).returns(T::Hash[Integer, String]) }
1621
+ def extract_program_instructions(program)
1622
+ instructions = {}
1623
+ if program.respond_to?(:predictors)
1624
+ program.predictors.each_with_index do |predictor, index|
1625
+ if predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:instruction)
1626
+ value = predictor.prompt.instruction
1627
+ instructions[index] = value if value
1628
+ end
1629
+ end
1630
+ else
1631
+ fallback_instruction = extract_current_instruction(program)
1632
+ instructions[0] = fallback_instruction if fallback_instruction
1633
+ end
1634
+ instructions
1635
+ end
1636
+
1637
+ sig { params(program: T.untyped).returns(T.nilable(T.class_of(DSPy::Signature))) }
1638
+ def extract_signature_class(program)
1639
+ program.respond_to?(:signature_class) ? program.signature_class : nil
1640
+ end
1641
+
1642
+ sig { params(example: T.untyped).returns(T.nilable(String)) }
1643
+ def extract_reasoning_from_example(example)
1644
+ case example
1645
+ when DSPy::Example
1646
+ if example.expected_values.key?(:reasoning)
1647
+ example.expected_values[:reasoning]
1648
+ elsif example.expected_values.key?(:explanation)
1649
+ example.expected_values[:explanation]
1650
+ else
1651
+ nil
1652
+ end
1653
+ else
1654
+ nil
1655
+ end
1656
+ end
1657
+
1658
+ # Infer auto mode based on configuration
1659
+ sig { returns(String) }
1660
+ def infer_auto_mode
1661
+ return config.auto_preset.serialize unless config.auto_preset == AutoPreset::None
1662
+
1663
+ case config.num_trials
1664
+ when 0..6 then "light"
1665
+ when 7..12 then "medium"
1666
+ when 13..Float::INFINITY then "heavy"
1667
+ else "manual"
1668
+ end
1669
+ end
1670
+ end
1671
+ end
1672
+ end