dspy 0.28.2 → 0.29.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -4
- data/lib/dspy/code_act.rb +14 -1
- data/lib/dspy/datasets/ade.rb +90 -0
- data/lib/dspy/datasets.rb +8 -0
- data/lib/dspy/lm.rb +4 -8
- data/lib/dspy/mixins/struct_builder.rb +17 -25
- data/lib/dspy/module.rb +12 -1
- data/lib/dspy/observability/async_span_processor.rb +67 -93
- data/lib/dspy/observability.rb +43 -1
- data/lib/dspy/predict.rb +10 -0
- data/lib/dspy/propose/dataset_summary_generator.rb +36 -3
- data/lib/dspy/propose/grounded_proposer.rb +118 -11
- data/lib/dspy/re_act.rb +13 -0
- data/lib/dspy/reflection_lm.rb +36 -0
- data/lib/dspy/teleprompt/gepa.rb +448 -2803
- data/lib/dspy/teleprompt/mipro_v2.rb +839 -91
- data/lib/dspy/teleprompt/utils.rb +8 -3
- data/lib/dspy/version.rb +2 -2
- data/lib/dspy.rb +3 -2
- data/lib/gepa/api.rb +61 -0
- data/lib/gepa/core/engine.rb +226 -0
- data/lib/gepa/core/evaluation_batch.rb +26 -0
- data/lib/gepa/core/result.rb +92 -0
- data/lib/gepa/core/state.rb +231 -0
- data/lib/gepa/logging/experiment_tracker.rb +54 -0
- data/lib/gepa/logging/logger.rb +57 -0
- data/lib/gepa/logging.rb +9 -0
- data/lib/gepa/proposer/base.rb +27 -0
- data/lib/gepa/proposer/merge_proposer.rb +424 -0
- data/lib/gepa/proposer/reflective_mutation/base.rb +48 -0
- data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +188 -0
- data/lib/gepa/strategies/batch_sampler.rb +91 -0
- data/lib/gepa/strategies/candidate_selector.rb +97 -0
- data/lib/gepa/strategies/component_selector.rb +57 -0
- data/lib/gepa/strategies/instruction_proposal.rb +120 -0
- data/lib/gepa/telemetry.rb +122 -0
- data/lib/gepa/utils/pareto.rb +119 -0
- data/lib/gepa.rb +21 -0
- metadata +38 -3
- data/lib/dspy/teleprompt/simple_optimizer.rb +0 -503
data/lib/dspy/teleprompt/mipro_v2.rb

@@ -1,7 +1,12 @@
 # frozen_string_literal: true

 require 'digest'
+require 'time'
+require 'json'
+require 'concurrent-ruby'
 require 'sorbet-runtime'
+require 'securerandom'
+require 'set'
 require_relative 'teleprompter'
 require_relative 'utils'
 require_relative '../propose/grounded_proposer'

@@ -27,6 +32,58 @@ module DSPy
         Bayesian = new("bayesian")
       end
     end
+
+    class AutoPreset < T::Enum
+      enums do
+        None = new("none")
+        Light = new("light")
+        Medium = new("medium")
+        Heavy = new("heavy")
+      end
+    end
+
+    AUTO_PRESET_SETTINGS = {
+      AutoPreset::None => {},
+      AutoPreset::Light => {
+        candidate_budget: 6,
+        instruction_candidates: 3,
+        instruction_candidates_when_fewshot: 3,
+        bootstrap_sets: 3,
+        max_bootstrapped_examples: 2,
+        max_labeled_examples: 8,
+        optimization_strategy: OptimizationStrategy::Greedy,
+        early_stopping_patience: 2,
+        valset_target_size: 100,
+        minibatch_size: nil
+      },
+      AutoPreset::Medium => {
+        candidate_budget: 12,
+        instruction_candidates: 5,
+        instruction_candidates_when_fewshot: 5,
+        bootstrap_sets: 5,
+        max_bootstrapped_examples: 4,
+        max_labeled_examples: 16,
+        optimization_strategy: OptimizationStrategy::Adaptive,
+        early_stopping_patience: 3,
+        valset_target_size: 300,
+        minibatch_size: nil
+      },
+      AutoPreset::Heavy => {
+        candidate_budget: 18,
+        instruction_candidates: 8,
+        instruction_candidates_when_fewshot: 8,
+        bootstrap_sets: 8,
+        max_bootstrapped_examples: 6,
+        max_labeled_examples: 24,
+        optimization_strategy: OptimizationStrategy::Bayesian,
+        early_stopping_patience: 5,
+        valset_target_size: 1000,
+        minibatch_size: nil
+      }
+    }.freeze
+
+    DEFAULT_AUTO_SEED = 42
+
     # MIPROv2: Multi-prompt Instruction Proposal with Retrieval Optimization
     # State-of-the-art prompt optimization combining bootstrap sampling,
     # instruction generation, and Bayesian optimization
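The `AUTO_PRESET_SETTINGS` table added above bundles the optimizer settings for each `AutoPreset`. A minimal lookup sketch, assuming the constants are reachable under `DSPy::Teleprompt::MIPROv2` (the namespace is inferred from the file path, not shown in this hunk):

```ruby
# Assumed namespace; the constant and key names come from the diff above.
mipro = DSPy::Teleprompt::MIPROv2
settings = mipro::AUTO_PRESET_SETTINGS.fetch(mipro::AutoPreset::Medium)
settings[:candidate_budget]    # => 12
settings[:valset_target_size]  # => 300
```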
@@ -47,13 +104,7 @@ module DSPy
        def self.light(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
-           config
-           config.num_instruction_candidates = 3
-           config.max_bootstrapped_examples = 2
-           config.max_labeled_examples = 8
-           config.bootstrap_sets = 3
-           config.optimization_strategy = :greedy
-           config.early_stopping_patience = 2
+           MIPROv2.apply_auto_defaults(config, AutoPreset::Light)
          end
          optimizer
        end

@@ -67,13 +118,7 @@ module DSPy
        def self.medium(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
-           config
-           config.num_instruction_candidates = 5
-           config.max_bootstrapped_examples = 4
-           config.max_labeled_examples = 16
-           config.bootstrap_sets = 5
-           config.optimization_strategy = :adaptive
-           config.early_stopping_patience = 3
+           MIPROv2.apply_auto_defaults(config, AutoPreset::Medium)
          end
          optimizer
        end

@@ -87,19 +132,33 @@ module DSPy
        def self.heavy(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
-           config
-           config.num_instruction_candidates = 8
-           config.max_bootstrapped_examples = 6
-           config.max_labeled_examples = 24
-           config.bootstrap_sets = 8
-           config.optimization_strategy = :bayesian
-           config.early_stopping_patience = 5
+           MIPROv2.apply_auto_defaults(config, AutoPreset::Heavy)
          end
          optimizer
        end
      end

      # Dry-configurable settings for MIPROv2
+     setting :auto_preset, default: AutoPreset::None, constructor: ->(value) {
+       case value
+       when AutoPreset
+         value
+       when String, Symbol
+         begin
+           AutoPreset.deserialize(value.to_s.downcase)
+         rescue ArgumentError
+           raise ArgumentError, "Invalid auto preset: #{value}. Must be one of :none, :light, :medium, :heavy"
+         end
+       when nil
+         AutoPreset::None
+       else
+         raise ArgumentError, "Invalid auto preset: #{value.inspect}"
+       end
+     }
+     setting :auto_seed, default: DEFAULT_AUTO_SEED, constructor: ->(value) {
+       value.nil? ? DEFAULT_AUTO_SEED : Integer(value)
+     }
+     setting :valset_target_size, default: nil
      setting :num_trials, default: 12
      setting :num_instruction_candidates, default: 5
      setting :bootstrap_sets, default: 5

@@ -124,6 +183,7 @@ module DSPy
      setting :track_diversity, default: true
      setting :max_errors, default: 3
      setting :num_threads, default: 1
+     setting :minibatch_size, default: nil

      # Class-level configuration method - sets defaults for new instances
      def self.configure(&block)

@@ -138,6 +198,26 @@ module DSPy
        @default_config_block
      end

+     class << self
+       extend T::Sig
+
+       sig { params(config: T.untyped, preset: AutoPreset).void }
+       def apply_auto_defaults(config, preset)
+         settings = AUTO_PRESET_SETTINGS.fetch(preset) { {} }
+
+         config.auto_preset = preset
+         config.num_trials = settings[:candidate_budget] if settings[:candidate_budget]
+         config.num_instruction_candidates = settings[:instruction_candidates] if settings[:instruction_candidates]
+         config.bootstrap_sets = settings[:bootstrap_sets] if settings[:bootstrap_sets]
+         config.max_bootstrapped_examples = settings[:max_bootstrapped_examples] if settings.key?(:max_bootstrapped_examples)
+         config.max_labeled_examples = settings[:max_labeled_examples] if settings.key?(:max_labeled_examples)
+         config.optimization_strategy = settings[:optimization_strategy] if settings[:optimization_strategy]
+         config.early_stopping_patience = settings[:early_stopping_patience] if settings[:early_stopping_patience]
+         config.minibatch_size = settings[:minibatch_size] if settings.key?(:minibatch_size)
+         config.valset_target_size = settings[:valset_target_size] if settings[:valset_target_size]
+       end
+     end
+

      # Simple data structure for evaluated candidate configurations (immutable)
      EvaluatedCandidate = Data.define(
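Taken together, the new `auto_preset` setting and `apply_auto_defaults` replace the hard-coded values that the `light`/`medium`/`heavy` factory methods used to set. A hedged usage sketch, assuming the same `DSPy::Teleprompt::MIPROv2` namespace and a user-supplied `my_metric` (both placeholders, not confirmed by the diff):

```ruby
# Either pick a preset via the new dry-configurable setting...
optimizer = DSPy::Teleprompt::MIPROv2.new(metric: my_metric)
optimizer.configure do |config|
  config.auto_preset = :medium  # strings/symbols are deserialized to AutoPreset::Medium
  config.auto_seed = 7          # overrides DEFAULT_AUTO_SEED (42)
end

# ...or use the factory shortcut, which now delegates to apply_auto_defaults.
optimizer = DSPy::Teleprompt::MIPROv2.medium(metric: my_metric)
```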
@@ -265,6 +345,7 @@ module DSPy
        @proposer = DSPy::Propose::GroundedProposer.new(config: DSPy::Propose::GroundedProposer::Config.new)
        @optimization_trace = []
        @evaluated_candidates = []
+       @trial_history = {}
      end

      # Main MIPROv2 optimization method

@@ -282,13 +363,20 @@ module DSPy
          trainset_size: trainset.size,
          valset_size: valset&.size || 0,
          num_trials: config.num_trials,
-         optimization_strategy:
+         optimization_strategy: optimization_strategy_name,
          mode: infer_auto_mode
        }) do
          # Convert examples to typed format
          typed_trainset = ensure_typed_examples(trainset)
          typed_valset = valset ? ensure_typed_examples(valset) : nil

+         if auto_preset_active?
+           typed_trainset, typed_valset = prepare_datasets_for_auto(typed_trainset, typed_valset)
+           typed_valset = apply_auto_preset!(program, typed_valset)
+         else
+           typed_valset = limit_validation_set(typed_valset, config.valset_target_size)
+         end
+
          # Use validation set if available, otherwise use part of training set
          evaluation_set = typed_valset || typed_trainset.take([typed_trainset.size / 3, 10].max)

@@ -331,6 +419,8 @@ module DSPy
          proposal_result
        )

+       @trial_history = optimization_result[:trial_logs] || {}
+
        save_results(final_result)
        final_result
      end

@@ -338,6 +428,105 @@ module DSPy

      private

+     sig { returns(T::Boolean) }
+     def auto_preset_active?
+       config.auto_preset != AutoPreset::None
+     end
+
+     sig { params(trainset: T::Array[DSPy::Example], valset: T.nilable(T::Array[DSPy::Example])).returns([T::Array[DSPy::Example], T::Array[DSPy::Example]]) }
+     def prepare_datasets_for_auto(trainset, valset)
+       settings = auto_settings_for(config.auto_preset)
+       target_size = settings[:valset_target_size]
+       config.valset_target_size = target_size
+
+       if valset && valset.any?
+         [trainset, limit_validation_set(valset, target_size)]
+       else
+         raise ArgumentError, "Training set must contain at least 2 examples when auto presets are enabled" if trainset.size < 2
+
+         shuffled = trainset.shuffle(random: Random.new(config.auto_seed))
+         default_val_size = [
+           [(trainset.size * 0.8).ceil, 1].max,
+           trainset.size - 1
+         ].min
+
+         desired_val_size = target_size ? [default_val_size, target_size].min : default_val_size
+         desired_val_size = [[desired_val_size, 1].max, trainset.size - 1].min
+
+         validation_examples = shuffled.take(desired_val_size)
+         training_examples = shuffled.drop(desired_val_size)
+
+         [training_examples, limit_validation_set(validation_examples, target_size)]
+       end
+     end
+
+     sig { params(program: T.untyped, valset: T::Array[DSPy::Example]).returns(T::Array[DSPy::Example]) }
+     def apply_auto_preset!(program, valset)
+       settings = auto_settings_for(config.auto_preset)
+       zeroshot = zero_shot_for_settings?(settings)
+       candidate_budget = settings[:candidate_budget]
+
+       if candidate_budget && candidate_budget.positive?
+         config.num_trials = compute_trials_from_candidate_budget(program, candidate_budget, zeroshot)
+         instruction_candidates = if zeroshot
+           candidate_budget
+         else
+           settings[:instruction_candidates_when_fewshot] || (candidate_budget / 2.0).ceil
+         end
+         config.num_instruction_candidates = [instruction_candidates, 1].max
+       end
+
+       config.bootstrap_sets = settings[:bootstrap_sets] if settings[:bootstrap_sets]
+       config.max_bootstrapped_examples = settings[:max_bootstrapped_examples] if settings.key?(:max_bootstrapped_examples)
+       config.max_labeled_examples = settings[:max_labeled_examples] if settings.key?(:max_labeled_examples)
+       config.optimization_strategy = settings[:optimization_strategy] if settings[:optimization_strategy]
+       config.early_stopping_patience = settings[:early_stopping_patience] if settings[:early_stopping_patience]
+       config.minibatch_size = settings[:minibatch_size] if settings.key?(:minibatch_size)
+
+       config.valset_target_size = settings[:valset_target_size]
+       limit_validation_set(valset, config.valset_target_size)
+     end
+
+     sig { params(valset: T.nilable(T::Array[DSPy::Example]), target_size: T.nilable(Integer)).returns(T.nilable(T::Array[DSPy::Example])) }
+     def limit_validation_set(valset, target_size)
+       return valset unless valset && target_size && target_size.positive?
+       return valset if valset.size <= target_size
+
+       valset.shuffle(random: Random.new(config.auto_seed)).take(target_size)
+     end
+
+     sig { params(program: T.untyped, num_candidates: Integer, zeroshot: T::Boolean).returns(Integer) }
+     def compute_trials_from_candidate_budget(program, num_candidates, zeroshot)
+       predictor_count =
+         if program.respond_to?(:predictors)
+           Array(program.predictors).size
+         else
+           1
+         end
+
+       predictor_count = 1 if predictor_count.zero?
+       variable_count = zeroshot ? predictor_count : predictor_count * 2
+       log_term = Math.log2([num_candidates, 2].max)
+
+       [
+         (2 * variable_count * log_term).ceil,
+         (1.5 * num_candidates).ceil
+       ].max
+     end
+
+     sig { params(settings: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
+     def zero_shot_for_settings?(settings)
+       settings.fetch(:max_bootstrapped_examples, 0).to_i.zero? &&
+         settings.fetch(:max_labeled_examples, 0).to_i.zero?
+     end
+
+     sig { params(preset: AutoPreset).returns(T::Hash[Symbol, T.untyped]) }
+     def auto_settings_for(preset)
+       AUTO_PRESET_SETTINGS.fetch(preset) do
+         raise ArgumentError, "Unknown auto preset: #{preset.inspect}"
+       end
+     end
+
      # Phase 1: Bootstrap few-shot examples from training data
      # Returns a hash mapping predictor indices to arrays of demo sets
      sig { params(program: T.untyped, trainset: T::Array[DSPy::Example]).returns(T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]) }
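`compute_trials_from_candidate_budget` converts a preset's `candidate_budget` into a trial count. A worked example under the Medium preset (budget 12) for a single-predictor, few-shot program; the names mirror the diff, the numbers are just arithmetic:

```ruby
# variable_count doubles when few-shot demos are in play (not zero-shot).
predictor_count = 1
variable_count  = predictor_count * 2     # => 2
log_term        = Math.log2([12, 2].max)  # => ~3.585

[(2 * variable_count * log_term).ceil,    # => 15
 (1.5 * 12).ceil].max                     # => 18, so config.num_trials becomes 18
```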
@@ -368,10 +557,6 @@ module DSPy
        # Flatten demo sets from first predictor and take first 5 examples
        few_shot_examples = demo_candidates[0]&.flatten&.take(5) || []

-       # Get signature class from program
-       signature_class = extract_signature_class(program)
-       raise ArgumentError, "Cannot extract signature class from program" unless signature_class
-
        # Re-initialize proposer with program and trainset for awareness features
        # This enables program_aware and use_dataset_summary flags to work correctly
        proposer_config = DSPy::Propose::GroundedProposer::Config.new

@@ -383,11 +568,12 @@ module DSPy
          trainset: trainset
        )

-       @proposer.
-
-
-
-
+       @proposer.propose_instructions_for_program(
+         trainset: trainset,
+         program: program,
+         demo_candidates: demo_candidates,
+         trial_logs: @trial_history,
+         num_instruction_candidates: config.num_instruction_candidates
        )
      end

@@ -406,12 +592,18 @@ module DSPy

        # Initialize optimization state
        optimization_state = initialize_optimization_state(candidates)
-
+
+       # Initialize trial tracking structures
+       trial_logs = {}
+       param_score_dict = Hash.new { |hash, key| hash[key] = [] }
+       fully_evaled_param_combos = {}
+       total_eval_calls = 0
+
        # Run optimization trials
        trials_completed = 0
        best_score = 0.0
        best_candidate = nil
-       best_program =
+       best_program = program
        best_evaluation_result = nil

        config.num_trials.times do |trial_idx|

@@ -419,6 +611,14 @@ module DSPy

          # Select next candidate based on optimization strategy
          candidate = select_next_candidate(candidates, optimization_state, trial_idx)
+         batch_size = evaluation_set.size
+
+         trial_logs[trials_completed] = create_trial_log_entry(
+           trial_number: trials_completed,
+           candidate: candidate,
+           evaluation_type: :full,
+           batch_size: batch_size
+         )

          emit_event('trial_start', {
            trial_number: trials_completed,

@@ -430,12 +630,30 @@ module DSPy
          begin
            # Evaluate candidate
            score, modified_program, evaluation_result = evaluate_candidate(program, candidate, evaluation_set)
+           total_eval_calls += batch_size
+
+           instructions_snapshot = extract_program_instructions(modified_program)
+           trial_logs[trials_completed][:instructions] = instructions_snapshot unless instructions_snapshot.empty?
+           trial_logs[trials_completed][:instruction] = instructions_snapshot[0] if instructions_snapshot.key?(0)

            # Update optimization state
            update_optimization_state(optimization_state, candidate, score)
+           record_param_score(
+             param_score_dict,
+             candidate,
+             score,
+             evaluation_type: :full,
+             instructions: instructions_snapshot
+           )
+           update_fully_evaled_param_combos(
+             fully_evaled_param_combos,
+             candidate,
+             score,
+             instructions: instructions_snapshot
+           )

            # Track best result
-           is_best = score > best_score
+           is_best = best_candidate.nil? || score > best_score
            if is_best
              best_score = score
              best_candidate = candidate

@@ -443,6 +661,15 @@ module DSPy
              best_evaluation_result = evaluation_result
            end

+           finalize_trial_log_entry(
+             trial_logs,
+             trials_completed,
+             score: score,
+             evaluation_type: :full,
+             batch_size: batch_size,
+             total_eval_calls: total_eval_calls
+           )
+
            emit_event('trial_complete', {
              trial_number: trials_completed,
              score: score,

@@ -457,6 +684,16 @@ module DSPy
            end

          rescue => error
+           finalize_trial_log_entry(
+             trial_logs,
+             trials_completed,
+             score: nil,
+             evaluation_type: :full,
+             batch_size: batch_size,
+             total_eval_calls: total_eval_calls,
+             error: error.message
+           )
+
            emit_event('trial_error', {
              trial_number: trials_completed,
              error: error.message,

@@ -474,7 +711,11 @@ module DSPy
          best_evaluation_result: best_evaluation_result,
          trials_completed: trials_completed,
          optimization_state: optimization_state,
-         evaluated_candidates: @evaluated_candidates
+         evaluated_candidates: @evaluated_candidates,
+         trial_logs: trial_logs,
+         param_score_dict: param_score_dict,
+         fully_evaled_param_combos: fully_evaled_param_combos,
+         total_eval_calls: total_eval_calls
        }
      end

@@ -487,62 +728,237 @@ module DSPy
      end
      def generate_candidate_configurations(proposal_result, demo_candidates)
        candidates = []
+       seen_signatures = Set.new
+
+       add_candidate = lambda do |instruction:, few_shot_examples:, type:, metadata:, config_id:|
+         signature = candidate_signature(type, instruction, metadata, few_shot_examples)
+         next if seen_signatures.include?(signature)
+
+         seen_signatures << signature
+         candidates << EvaluatedCandidate.new(
+           instruction: instruction,
+           few_shot_examples: few_shot_examples,
+           type: type,
+           metadata: metadata,
+           config_id: config_id
+         )
+       end
+
+       predictor_instruction_map = if proposal_result.respond_to?(:predictor_instructions) && proposal_result.predictor_instructions.any?
+         proposal_result.predictor_instructions
+       else
+         { 0 => proposal_result.candidate_instructions }
+       end
+
+       instruction_maps = build_instruction_maps(predictor_instruction_map)
+       demo_maps = build_demo_maps(demo_candidates)

        # Base configuration (no modifications)
-
+       add_candidate.call(
          instruction: "",
          few_shot_examples: [],
          type: CandidateType::Baseline,
-        metadata: {
+         metadata: {
+           instructions_map: {},
+           demos_map: {}
+         },
          config_id: SecureRandom.hex(6)
        )

-
-
-
-       instruction:
+       instruction_maps.each_with_index do |instruction_map, combo_idx|
+         primary_instruction = instruction_map[0] || instruction_map.values.first || ""
+         add_candidate.call(
+           instruction: primary_instruction,
            few_shot_examples: [],
            type: CandidateType::InstructionOnly,
-          metadata: {
+           metadata: {
+             proposal_rank: combo_idx,
+             instructions_map: duplicate_instruction_map(instruction_map),
+             demos_map: {}
+           },
            config_id: SecureRandom.hex(6)
          )
        end

-
-
-
-
-
+       demo_maps.each_with_index do |demo_map, idx|
+         next if demo_map.empty?
+
+         flattened_examples = demo_map.values.flatten
+         add_candidate.call(
            instruction: "",
-          few_shot_examples:
+           few_shot_examples: flattened_examples,
            type: CandidateType::FewShotOnly,
-          metadata: {
+           metadata: {
+             bootstrap_rank: idx,
+             instructions_map: {},
+             demos_map: duplicate_demo_map(demo_map)
+           },
            config_id: SecureRandom.hex(6)
          )
        end

        # Combined candidates (instruction + few-shot)
-
-
-
-
-
-
-
-
+       instruction_maps.each_with_index do |instruction_map, combo_idx|
+         primary_instruction = instruction_map[0] || instruction_map.values.first || ""
+         demo_maps.first(3).each_with_index do |demo_map, demo_idx|
+           next if demo_map.empty?
+
+           flattened_examples = demo_map.values.flatten
+           add_candidate.call(
+             instruction: primary_instruction,
+             few_shot_examples: flattened_examples,
              type: CandidateType::Combined,
-            metadata: {
-            instruction_rank:
-            bootstrap_rank:
+             metadata: {
+               instruction_rank: combo_idx,
+               bootstrap_rank: demo_idx,
+               instructions_map: duplicate_instruction_map(instruction_map),
+               demos_map: duplicate_demo_map(demo_map)
              },
              config_id: SecureRandom.hex(6)
            )
          end
        end
-
+
        candidates
      end

+     sig { params(predictor_instruction_map: T::Hash[Integer, T::Array[String]]).returns(T::Array[T::Hash[Integer, String]]) }
+     def build_instruction_maps(predictor_instruction_map)
+       return [{}] if predictor_instruction_map.nil? || predictor_instruction_map.empty?
+
+       normalized = predictor_instruction_map.each_with_object({}) do |(index, instructions), memo|
+         next if instructions.nil? || instructions.empty?
+         memo[index] = instructions.take(3)
+       end
+
+       return [{}] if normalized.empty?
+
+       cartesian_product(normalized)
+     end
+
+     sig do
+       params(demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]).returns(T::Array[T::Hash[Integer, T::Array[DSPy::FewShotExample]]])
+     end
+     def build_demo_maps(demo_candidates)
+       return [{}] if demo_candidates.nil? || demo_candidates.empty?
+
+       normalized = demo_candidates.each_with_object({}) do |(index, sets), memo|
+         next if sets.nil? || sets.empty?
+         memo[index] = sets.take(3)
+       end
+
+       return [{}] if normalized.empty?
+
+       cartesian_product(normalized)
+     end
+
+     sig do
+       params(options_hash: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Array[T::Hash[Integer, T.untyped]])
+     end
+     def cartesian_product(options_hash)
+       options_hash.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
+         next acc if values.nil? || values.empty?
+
+         acc.flat_map do |existing|
+           values.map do |value|
+             existing.merge(index => value)
+           end
+         end
+       end
+     end
+
+     sig { params(instruction_map: T::Hash[Integer, String]).returns(T::Hash[Integer, String]) }
+     def duplicate_instruction_map(instruction_map)
+       instruction_map.each_with_object({}) do |(index, instruction), memo|
+         memo[index] = instruction.is_a?(String) ? instruction.dup : instruction
+       end
+     end
+
+     sig do
+       params(demo_map: T::Hash[Integer, T::Array[DSPy::FewShotExample]]).returns(T::Hash[Integer, T::Array[DSPy::FewShotExample]])
+     end
+     def duplicate_demo_map(demo_map)
+       demo_map.each_with_object({}) do |(index, demos), memo|
+         next if demos.nil?
+         memo[index] = demos.map { |demo| demo }
+       end
+     end
+
+     sig do
+       params(
+         type: CandidateType,
+         instruction: String,
+         metadata: T::Hash[Symbol, T.untyped],
+         few_shot_examples: T::Array[T.untyped]
+       ).returns(String)
+     end
+     def candidate_signature(type, instruction, metadata, few_shot_examples)
+       JSON.generate(
+         type: type.serialize,
+         instruction: instruction,
+         instructions_map: normalize_instruction_map(metadata[:instructions_map] || {}),
+         demos_map: normalize_demo_map(metadata[:demos_map] || {}),
+         few_shot_examples: few_shot_examples.map { |example| serialize_few_shot_example(example) }
+       )
+     end
+
+     sig { params(map: T::Hash[Integer, T.untyped]).returns(T::Hash[Integer, String]) }
+     def normalize_instruction_map(map)
+       map.sort_by { |index, _| index }.each_with_object({}) do |(index, value), memo|
+         memo[index] = value.to_s
+       end
+     end
+
+     sig { params(map: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Hash[Integer, T::Array[T.untyped]]) }
+     def normalize_demo_map(map)
+       map.sort_by { |index, _| index }.each_with_object({}) do |(index, demos), memo|
+         memo[index] = Array(demos).map { |demo| serialize_few_shot_example(demo) }
+       end
+     end
+
+     sig { params(example: T.untyped).returns(T.untyped) }
+     def serialize_few_shot_example(example)
+       case example
+       when DSPy::FewShotExample
+         deep_dup(example.to_h)
+       when DSPy::Example
+         {
+           input: deep_dup(example.input_values),
+           expected: deep_dup(example.expected_values)
+         }
+       when Hash
+         deep_dup(example)
+       else
+         example
+       end
+     end
+
+     sig { params(examples: T::Array[T.untyped]).returns(T::Array[DSPy::FewShotExample]) }
+     def normalize_few_shot_examples(examples)
+       examples.map do |example|
+         if example.is_a?(DSPy::FewShotExample)
+           example
+         elsif example.is_a?(DSPy::Example)
+           DSPy::FewShotExample.new(
+             input: example.input_values,
+             output: example.expected_values,
+             reasoning: extract_reasoning_from_example(example)
+           )
+         else
+           example
+         end
+       end
+     end
+
+     sig { params(predictor: T.untyped, examples: T::Array[DSPy::FewShotExample]).void }
+     def assign_predictor_examples(predictor, examples)
+       predictor.demos = examples if predictor.respond_to?(:demos=)
+       return unless predictor.respond_to?(:prompt)
+
+       cloned_examples = examples.map { |ex| ex }
+       predictor.prompt.instance_variable_set(:@few_shot_examples, cloned_examples.freeze)
+     end
+
      # Initialize optimization state for candidate selection
      sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Hash[Symbol, T.untyped]) }
      def initialize_optimization_state(candidates)
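`cartesian_product` is what lets per-predictor instruction and demo candidates combine into full configurations. Stripped of its guard clauses, the reduce shown above behaves like this (the instruction strings are made up):

```ruby
options = { 0 => ["A1", "A2"], 1 => ["B1"] }

options.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
  acc.flat_map { |existing| values.map { |value| existing.merge(index => value) } }
end
# => [{ 0 => "A1", 1 => "B1" }, { 0 => "A2", 1 => "B1" }]
```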
@@ -722,7 +1138,11 @@ module DSPy
        modified_program = apply_candidate_configuration(program, candidate)

        # Evaluate modified program
-       evaluation_result =
+       evaluation_result = if use_concurrent_evaluation?(evaluation_set)
+         evaluate_candidate_concurrently(modified_program, evaluation_set)
+       else
+         evaluate_program(modified_program, evaluation_set)
+       end

        # Store evaluation details
        @evaluated_candidates << candidate

@@ -730,32 +1150,131 @@ module DSPy
        [evaluation_result.pass_rate, modified_program, evaluation_result]
      end

+     sig { params(evaluation_set: T::Array[DSPy::Example]).returns(T::Boolean) }
+     def use_concurrent_evaluation?(evaluation_set)
+       minibatch_size = config.minibatch_size
+       return false unless minibatch_size&.positive?
+       return false unless config.num_threads && config.num_threads > 1
+
+       evaluation_set.size > minibatch_size
+     end
+
+     sig do
+       params(
+         modified_program: T.untyped,
+         evaluation_set: T::Array[DSPy::Example]
+       ).returns(DSPy::Evaluate::BatchEvaluationResult)
+     end
+     def evaluate_candidate_concurrently(modified_program, evaluation_set)
+       chunk_size = T.must(config.minibatch_size)
+       chunks = evaluation_set.each_slice(chunk_size).map(&:dup)
+       return evaluate_program(modified_program, evaluation_set) if chunks.size <= 1
+
+       pool_size = [config.num_threads, chunks.size].min
+       pool_size = 1 if pool_size <= 0
+       executor = Concurrent::FixedThreadPool.new(pool_size)
+
+       futures = chunks.map do |chunk|
+         Concurrent::Promises.future_on(executor) do
+           evaluate_program(modified_program, chunk)
+         end
+       end
+
+       results = futures.map(&:value!)
+       combine_batch_results(results)
+     ensure
+       if executor
+         executor.shutdown
+         executor.wait_for_termination
+       end
+     end
+
+     sig do
+       params(batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult]).returns(DSPy::Evaluate::BatchEvaluationResult)
+     end
+     def combine_batch_results(batch_results)
+       return DSPy::Evaluate::BatchEvaluationResult.new(results: [], aggregated_metrics: {}) if batch_results.empty?
+
+       combined_results = batch_results.flat_map(&:results)
+       total_examples = batch_results.sum(&:total_examples)
+       aggregated_metrics = merge_aggregated_metrics(batch_results, total_examples)
+
+       DSPy::Evaluate::BatchEvaluationResult.new(
+         results: combined_results,
+         aggregated_metrics: aggregated_metrics
+       )
+     end
+
+     sig do
+       params(
+         batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult],
+         total_examples: Integer
+       ).returns(T::Hash[Symbol, T.untyped])
+     end
+     def merge_aggregated_metrics(batch_results, total_examples)
+       return {} if total_examples.zero?
+
+       keys = batch_results.flat_map { |res| res.aggregated_metrics.keys }.uniq
+       keys.each_with_object({}) do |key, memo|
+         numeric_weight = 0.0
+         numeric_sum = 0.0
+         fallback_value = nil
+
+         batch_results.each do |res|
+           value = res.aggregated_metrics[key]
+           next if value.nil?
+
+           if value.is_a?(Numeric)
+             numeric_sum += value.to_f * res.total_examples
+             numeric_weight += res.total_examples
+           else
+             fallback_value = value
+           end
+         end
+
+         if numeric_weight.positive?
+           memo[key] = numeric_sum / numeric_weight
+         elsif fallback_value
+           memo[key] = fallback_value
+         end
+       end
+     end
+
      # Apply candidate configuration to program
      sig { params(program: T.untyped, candidate: EvaluatedCandidate).returns(T.untyped) }
      def apply_candidate_configuration(program, candidate)
+       instructions_map = candidate.metadata[:instructions_map] || {}
+       demos_map = candidate.metadata[:demos_map] || {}
+
        modified_program = program
-
-
-
-
-
-
-
-
-
-
-
-
-       else
-         # Convert from DSPy::Example
-         DSPy::FewShotExample.new(
-           input: example.input_values,
-           output: example.expected_values,
-           reasoning: extract_reasoning_from_example(example)
-         )
+       if modified_program.respond_to?(:predictors) && (instructions_map.any? || demos_map.any?)
+         modified_program = modified_program.clone
+         modified_program.predictors.each_with_index do |predictor, idx|
+           if instructions_map.key?(idx)
+             signature = Utils.get_signature(predictor)
+             updated_signature = signature.with_instructions(instructions_map[idx])
+             Utils.set_signature(predictor, updated_signature)
+           end
+
+           if demos_map.key?(idx)
+             normalized_examples = normalize_few_shot_examples(demos_map[idx])
+             assign_predictor_examples(predictor, normalized_examples)
            end
          end
-
+       end
+
+       # Apply instruction if provided (top-level programs still respect with_instruction)
+       if !candidate.instruction.empty? && modified_program.respond_to?(:with_instruction)
+         modified_program = modified_program.with_instruction(candidate.instruction)
+       end
+
+       should_apply_global_examples = candidate.few_shot_examples.any? &&
+         modified_program.respond_to?(:with_examples) &&
+         (demos_map.empty? || !modified_program.respond_to?(:predictors))
+
+       if should_apply_global_examples
+         normalized_few_shot = normalize_few_shot_examples(candidate.few_shot_examples)
+         modified_program = modified_program.with_examples(normalized_few_shot)
        end

        modified_program
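Concurrent candidate evaluation is opt-in: per `use_concurrent_evaluation?`, it needs a positive `minibatch_size`, `num_threads > 1`, and an evaluation set larger than one minibatch. A configuration sketch under the same namespace assumption as above, with `my_metric` as a placeholder:

```ruby
optimizer = DSPy::Teleprompt::MIPROv2.new(metric: my_metric)
optimizer.configure do |config|
  config.minibatch_size = 25  # evaluation set is sliced into 25-example chunks
  config.num_threads    = 4   # Concurrent::FixedThreadPool size, capped at the chunk count
end
```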
@@ -824,14 +1343,16 @@ module DSPy

        history = {
          total_trials: optimization_result[:trials_completed],
-         optimization_strategy:
+         optimization_strategy: optimization_strategy_name,
          early_stopped: optimization_result[:trials_completed] < config.num_trials,
-         score_history: optimization_result[:optimization_state][:best_score_history]
+         score_history: optimization_result[:optimization_state][:best_score_history],
+         total_eval_calls: optimization_result[:total_eval_calls]
        }

        metadata = {
          optimizer: "MIPROv2",
          auto_mode: infer_auto_mode,
+         optimization_strategy: optimization_strategy_name,
          best_instruction: best_candidate&.instruction || "",
          best_few_shot_count: best_candidate&.few_shot_examples&.size || 0,
          best_candidate_type: best_candidate&.type&.serialize || "unknown",

@@ -839,12 +1360,21 @@ module DSPy
        }

        # Create bootstrap statistics from demo_candidates
-
+       num_predictors = demo_candidates.keys.size
+       sets_per_predictor = demo_candidates.values.map(&:size)
+       all_demo_sets = demo_candidates.values.flat_map { |sets| sets }
        bootstrap_statistics = {
-         num_predictors:
-         demo_sets_per_predictor:
-         avg_demos_per_set:
+         num_predictors: num_predictors,
+         demo_sets_per_predictor: sets_per_predictor.max || 0,
+         avg_demos_per_set: all_demo_sets.empty? ? 0 : all_demo_sets.map(&:size).sum.to_f / all_demo_sets.size
        }
+       bootstrap_statistics[:per_predictor_demo_counts] = sets_per_predictor if sets_per_predictor.any?
+
+       optimization_trace = serialize_optimization_trace(optimization_result[:optimization_state])
+       optimization_trace[:trial_logs] = serialize_trial_logs(optimization_result[:trial_logs])
+       optimization_trace[:param_score_dict] = serialize_param_score_dict(optimization_result[:param_score_dict])
+       optimization_trace[:fully_evaled_param_combos] = serialize_fully_evaled_param_combos(optimization_result[:fully_evaled_param_combos])
+       optimization_trace[:total_eval_calls] = optimization_result[:total_eval_calls]

        MIPROv2Result.new(
          optimized_program: best_program,

@@ -854,7 +1384,7 @@ module DSPy
          best_score_value: best_score,
          metadata: metadata,
          evaluated_candidates: @evaluated_candidates,
-         optimization_trace:
+         optimization_trace: optimization_trace,
          bootstrap_statistics: bootstrap_statistics,
          proposal_statistics: proposal_result.analysis,
          best_evaluation_result: best_evaluation_result

@@ -876,7 +1406,205 @@ module DSPy
        serialized_trace
      end

+     sig do
+       params(
+         trial_number: Integer,
+         candidate: EvaluatedCandidate,
+         evaluation_type: Symbol,
+         batch_size: Integer
+       ).returns(T::Hash[Symbol, T.untyped])
+     end
+     def create_trial_log_entry(trial_number:, candidate:, evaluation_type:, batch_size:)
+       # Preserve interface parity with Python implementation (trial number stored implicitly via hash key)
+       trial_number # no-op to acknowledge parameter usage
+       instructions_map = candidate.metadata[:instructions_map] || {}
+       demos_map = candidate.metadata[:demos_map] || {}
+       entry = {
+         candidate_id: candidate.config_id,
+         candidate_type: candidate.type.serialize,
+         instruction_preview: candidate.instruction.to_s[0, 160],
+         few_shot_count: candidate.few_shot_examples.size,
+         metadata: deep_dup(candidate.metadata),
+         evaluation_type: evaluation_type,
+         batch_size: batch_size,
+         status: :in_progress,
+         started_at: Time.now.iso8601
+       }
+       if instructions_map.any?
+         entry[:instructions] = duplicate_instruction_map(instructions_map)
+         entry[:instruction] = entry[:instructions][0] if entry[:instructions].key?(0)
+       elsif candidate.instruction && !candidate.instruction.empty?
+         predictor_index = candidate.metadata[:predictor_index] || 0
+         entry[:instruction] = candidate.instruction
+         entry[:instructions] = { predictor_index => candidate.instruction }
+       end
+       entry[:few_shot_map] = duplicate_demo_map(demos_map) if demos_map.any?
+       entry
+     end
+
+     sig do
+       params(
+         trial_logs: T::Hash[Integer, T::Hash[Symbol, T.untyped]],
+         trial_number: Integer,
+         score: T.nilable(Float),
+         evaluation_type: Symbol,
+         batch_size: Integer,
+         total_eval_calls: Integer,
+         error: T.nilable(String)
+       ).void
+     end
+     def finalize_trial_log_entry(trial_logs, trial_number, score:, evaluation_type:, batch_size:, total_eval_calls:, error: nil)
+       entry = trial_logs[trial_number] || {}
+       entry[:score] = score if score
+       entry[:evaluation_type] = evaluation_type
+       entry[:batch_size] = batch_size
+       entry[:total_eval_calls] = total_eval_calls
+       entry[:status] = error ? :error : :completed
+       entry[:error] = error if error
+       entry[:completed_at] = Time.now.iso8601
+       trial_logs[trial_number] = entry
+     end
+
+     sig do
+       params(
+         param_score_dict: T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]],
+         candidate: EvaluatedCandidate,
+         score: Float,
+         evaluation_type: Symbol,
+         instructions: T.nilable(T::Hash[Integer, String])
+       ).void
+     end
+     def record_param_score(param_score_dict, candidate, score, evaluation_type:, instructions: nil)
+       instructions_hash = instructions || {}
+       if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
+         predictor_index = candidate.metadata[:predictor_index] || 0
+         instructions_hash[predictor_index] = candidate.instruction
+       end
+
+       record = {
+         candidate_id: candidate.config_id,
+         candidate_type: candidate.type.serialize,
+         score: score,
+         evaluation_type: evaluation_type,
+         timestamp: Time.now.iso8601,
+         metadata: deep_dup(candidate.metadata)
+       }
+       primary_instruction = instructions_hash[0] || candidate.instruction
+       record[:instruction] = primary_instruction if primary_instruction && !primary_instruction.empty?
+       record[:instructions] = instructions_hash unless instructions_hash.empty?
+
+       param_score_dict[candidate.config_id] << record
+     end
+
+     sig do
+       params(
+         fully_evaled_param_combos: T::Hash[String, T::Hash[Symbol, T.untyped]],
+         candidate: EvaluatedCandidate,
+         score: Float,
+         instructions: T.nilable(T::Hash[Integer, String])
+       ).void
+     end
+     def update_fully_evaled_param_combos(fully_evaled_param_combos, candidate, score, instructions: nil)
+       existing = fully_evaled_param_combos[candidate.config_id]
+       if existing.nil? || score > existing[:score]
+         instructions_hash = instructions || {}
+         if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
+           predictor_index = candidate.metadata[:predictor_index] || 0
+           instructions_hash[predictor_index] = candidate.instruction
+         end
+
+         fully_evaled_param_combos[candidate.config_id] = {
+           candidate_id: candidate.config_id,
+           candidate_type: candidate.type.serialize,
+           score: score,
+           metadata: deep_dup(candidate.metadata),
+           updated_at: Time.now.iso8601
+         }
+         unless instructions_hash.empty?
+           fully_evaled_param_combos[candidate.config_id][:instructions] = instructions_hash
+           fully_evaled_param_combos[candidate.config_id][:instruction] = instructions_hash[0] || candidate.instruction
+         end
+       end
+     end
+
+     sig { params(trial_logs: T.nilable(T::Hash[Integer, T::Hash[Symbol, T.untyped]])).returns(T::Hash[Integer, T::Hash[Symbol, T.untyped]]) }
+     def serialize_trial_logs(trial_logs)
+       return {} unless trial_logs
+
+       allowed_keys = [
+         :candidate_id,
+         :candidate_type,
+         :instruction_preview,
+         :instruction,
+         :instructions,
+         :few_shot_count,
+         :metadata,
+         :evaluation_type,
+         :batch_size,
+         :score,
+         :status,
+         :error,
+         :started_at,
+         :completed_at,
+         :total_eval_calls
+       ]
+
+       trial_logs.transform_values do |entry|
+         entry.each_with_object({}) do |(key, value), memo|
+           memo[key] = value if allowed_keys.include?(key)
+         end
+       end
+     end
+
+     sig { params(param_score_dict: T.nilable(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]])).returns(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]]) }
+     def serialize_param_score_dict(param_score_dict)
+       return {} unless param_score_dict
+
+       allowed_keys = [:candidate_id, :candidate_type, :score, :evaluation_type, :timestamp, :metadata, :instruction, :instructions]
+
+       param_score_dict.transform_values do |records|
+         records.map do |record|
+           record.each_with_object({}) do |(key, value), memo|
+             memo[key] = value if allowed_keys.include?(key)
+           end
+         end
+       end
+     end
+
+     sig { params(fully_evaled_param_combos: T.nilable(T::Hash[String, T::Hash[Symbol, T.untyped]])).returns(T::Hash[String, T::Hash[Symbol, T.untyped]]) }
+     def serialize_fully_evaled_param_combos(fully_evaled_param_combos)
+       return {} unless fully_evaled_param_combos
+
+       allowed_keys = [:candidate_id, :candidate_type, :score, :metadata, :updated_at, :instruction, :instructions]
+
+       fully_evaled_param_combos.transform_values do |record|
+         record.each_with_object({}) do |(key, value), memo|
+           memo[key] = value if allowed_keys.include?(key)
+         end
+       end
+     end
+
+     sig { params(value: T.untyped).returns(T.untyped) }
+     def deep_dup(value)
+       case value
+       when Hash
+         value.each_with_object({}) { |(k, v), memo| memo[k] = deep_dup(v) }
+       when Array
+         value.map { |element| deep_dup(element) }
+       else
+         value
+       end
+     end
+
      # Helper methods
+     sig { returns(String) }
+     def optimization_strategy_name
+       strategy = config.optimization_strategy
+       return strategy.serialize if strategy.respond_to?(:serialize)
+
+       strategy.to_s
+     end
+
      sig { params(program: T.untyped).returns(T.nilable(String)) }
      def extract_current_instruction(program)
        if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
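For reference, one entry of the serialized `trial_logs` (after `create_trial_log_entry`, `finalize_trial_log_entry`, and the key whitelist above) has roughly this shape. Every value below is invented; only the keys come from the diff:

```ruby
{
  candidate_id: "a1b2c3d4e5f6",
  candidate_type: "instruction_only",  # CandidateType#serialize output; exact string not shown here
  instruction_preview: "Answer concisely and cite the passage...",
  few_shot_count: 0,
  evaluation_type: :full,
  batch_size: 100,
  score: 0.82,
  status: :completed,
  started_at: "2025-01-01T00:00:00Z",
  completed_at: "2025-01-01T00:02:10Z",
  total_eval_calls: 300
}
```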
@@ -889,6 +1617,23 @@ module DSPy
        end
      end

+     sig { params(program: T.untyped).returns(T::Hash[Integer, String]) }
+     def extract_program_instructions(program)
+       instructions = {}
+       if program.respond_to?(:predictors)
+         program.predictors.each_with_index do |predictor, index|
+           if predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:instruction)
+             value = predictor.prompt.instruction
+             instructions[index] = value if value
+           end
+         end
+       else
+         fallback_instruction = extract_current_instruction(program)
+         instructions[0] = fallback_instruction if fallback_instruction
+       end
+       instructions
+     end
+
      sig { params(program: T.untyped).returns(T.nilable(T.class_of(DSPy::Signature))) }
      def extract_signature_class(program)
        program.respond_to?(:signature_class) ? program.signature_class : nil

@@ -913,12 +1658,15 @@ module DSPy
      # Infer auto mode based on configuration
      sig { returns(String) }
      def infer_auto_mode
+       return config.auto_preset.serialize unless config.auto_preset == AutoPreset::None
+
        case config.num_trials
        when 0..6 then "light"
        when 7..12 then "medium"
-
+       when 13..Float::INFINITY then "heavy"
+       else "manual"
        end
      end
    end
  end
-end
+end