dspy-miprov2 0.29.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +45 -0
- data/README.md +247 -0
- data/lib/dspy/miprov2/version.rb +10 -0
- data/lib/dspy/miprov2.rb +11 -0
- data/lib/dspy/optimizers/gaussian_process.rb +86 -0
- data/lib/dspy/teleprompt/mipro_v2.rb +1672 -0
- metadata +90 -0
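The largest addition in this release is data/lib/dspy/teleprompt/mipro_v2.rb, reproduced below. As a quick orientation, here is a hedged usage sketch of the API that file defines (auto presets via MIPROv2::AutoMode, dry-configurable settings, and compile). It is not taken from the gem's documentation; `my_program`, `metric`, `train_examples`, and `val_examples` are placeholders, and the comparison inside the metric is illustrative only.

require 'dspy/miprov2'

# Metric: a two-argument callable, matching the sig on AutoMode.light/medium/heavy.
# The equality check here is a placeholder for a real scoring function.
metric = ->(example, prediction) { example.expected_values == prediction.to_h }

# Medium preset: 12-candidate budget, adaptive strategy, validation set capped at 300.
optimizer = DSPy::Teleprompt::MIPROv2::AutoMode.medium(metric: metric)
optimizer.configure do |config|
  config.optimization_strategy = :bayesian  # symbols are coerced to OptimizationStrategy
  config.num_threads = 4
  config.minibatch_size = 25                # used by the concurrent evaluation path
end

result = optimizer.compile(my_program, trainset: train_examples, valset: val_examples)
puts result.bootstrap_statistics
puts result.evaluated_candidates.map(&:to_h)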
data/lib/dspy/teleprompt/mipro_v2.rb
@@ -0,0 +1,1672 @@
# frozen_string_literal: true

require 'digest'
require 'time'
require 'json'
require 'concurrent-ruby'
require 'sorbet-runtime'
require 'securerandom'
require 'set'
require_relative 'teleprompter'
require_relative 'utils'
require_relative '../propose/grounded_proposer'
require_relative '../optimizers/gaussian_process'

module DSPy
  module Teleprompt
    # Enum for candidate configuration types
    class CandidateType < T::Enum
      enums do
        Baseline = new("baseline")
        InstructionOnly = new("instruction_only")
        FewShotOnly = new("few_shot_only")
        Combined = new("combined")
      end
    end

    # Enum for optimization strategies
    class OptimizationStrategy < T::Enum
      enums do
        Greedy = new("greedy")
        Adaptive = new("adaptive")
        Bayesian = new("bayesian")
      end
    end

    class AutoPreset < T::Enum
      enums do
        None = new("none")
        Light = new("light")
        Medium = new("medium")
        Heavy = new("heavy")
      end
    end

    AUTO_PRESET_SETTINGS = {
      AutoPreset::None => {},
      AutoPreset::Light => {
        candidate_budget: 6,
        instruction_candidates: 3,
        instruction_candidates_when_fewshot: 3,
        bootstrap_sets: 3,
        max_bootstrapped_examples: 2,
        max_labeled_examples: 8,
        optimization_strategy: OptimizationStrategy::Greedy,
        early_stopping_patience: 2,
        valset_target_size: 100,
        minibatch_size: nil
      },
      AutoPreset::Medium => {
        candidate_budget: 12,
        instruction_candidates: 5,
        instruction_candidates_when_fewshot: 5,
        bootstrap_sets: 5,
        max_bootstrapped_examples: 4,
        max_labeled_examples: 16,
        optimization_strategy: OptimizationStrategy::Adaptive,
        early_stopping_patience: 3,
        valset_target_size: 300,
        minibatch_size: nil
      },
      AutoPreset::Heavy => {
        candidate_budget: 18,
        instruction_candidates: 8,
        instruction_candidates_when_fewshot: 8,
        bootstrap_sets: 8,
        max_bootstrapped_examples: 6,
        max_labeled_examples: 24,
        optimization_strategy: OptimizationStrategy::Bayesian,
        early_stopping_patience: 5,
        valset_target_size: 1000,
        minibatch_size: nil
      }
    }.freeze

    DEFAULT_AUTO_SEED = 42

    # MIPROv2: Multi-prompt Instruction Proposal with Retrieval Optimization
    # State-of-the-art prompt optimization combining bootstrap sampling,
    # instruction generation, and Bayesian optimization
    class MIPROv2 < Teleprompter
      extend T::Sig
      include Dry::Configurable

      # Auto-configuration modes for different optimization needs
      module AutoMode
        extend T::Sig

        sig do
          params(
            metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)),
            kwargs: T.untyped
          ).returns(MIPROv2)
        end
        def self.light(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
            MIPROv2.apply_auto_defaults(config, AutoPreset::Light)
          end
          optimizer
        end

        sig do
          params(
            metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)),
            kwargs: T.untyped
          ).returns(MIPROv2)
        end
        def self.medium(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
            MIPROv2.apply_auto_defaults(config, AutoPreset::Medium)
          end
          optimizer
        end

        sig do
          params(
            metric: T.nilable(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)),
            kwargs: T.untyped
          ).returns(MIPROv2)
        end
        def self.heavy(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
            MIPROv2.apply_auto_defaults(config, AutoPreset::Heavy)
          end
          optimizer
        end
      end

      # Dry-configurable settings for MIPROv2
      setting :auto_preset, default: AutoPreset::None, constructor: ->(value) {
        case value
        when AutoPreset
          value
        when String, Symbol
          begin
            AutoPreset.deserialize(value.to_s.downcase)
          rescue ArgumentError
            raise ArgumentError, "Invalid auto preset: #{value}. Must be one of :none, :light, :medium, :heavy"
          end
        when nil
          AutoPreset::None
        else
          raise ArgumentError, "Invalid auto preset: #{value.inspect}"
        end
      }
      setting :auto_seed, default: DEFAULT_AUTO_SEED, constructor: ->(value) {
        value.nil? ? DEFAULT_AUTO_SEED : Integer(value)
      }
      setting :valset_target_size, default: nil
      setting :num_trials, default: 12
      setting :num_instruction_candidates, default: 5
      setting :bootstrap_sets, default: 5
      setting :max_bootstrapped_examples, default: 4
      setting :max_labeled_examples, default: 16
      setting :optimization_strategy, default: OptimizationStrategy::Adaptive, constructor: ->(value) {
        # Coerce symbols to enum values
        case value
        when :greedy then OptimizationStrategy::Greedy
        when :adaptive then OptimizationStrategy::Adaptive
        when :bayesian then OptimizationStrategy::Bayesian
        when OptimizationStrategy then value
        when nil then OptimizationStrategy::Adaptive
        else
          raise ArgumentError, "Invalid optimization strategy: #{value}. Must be one of :greedy, :adaptive, :bayesian"
        end
      }
      setting :init_temperature, default: 1.0
      setting :final_temperature, default: 0.1
      setting :early_stopping_patience, default: 3
      setting :use_bayesian_optimization, default: true
      setting :track_diversity, default: true
      setting :max_errors, default: 3
      setting :num_threads, default: 1
      setting :minibatch_size, default: nil

      # Class-level configuration method - sets defaults for new instances
      def self.configure(&block)
        if block_given?
          # Store configuration in a class variable for new instances
          @default_config_block = block
        end
      end

      # Get the default configuration block
      def self.default_config_block
        @default_config_block
      end

      class << self
        extend T::Sig

        sig { params(config: T.untyped, preset: AutoPreset).void }
        def apply_auto_defaults(config, preset)
          settings = AUTO_PRESET_SETTINGS.fetch(preset) { {} }

          config.auto_preset = preset
          config.num_trials = settings[:candidate_budget] if settings[:candidate_budget]
          config.num_instruction_candidates = settings[:instruction_candidates] if settings[:instruction_candidates]
          config.bootstrap_sets = settings[:bootstrap_sets] if settings[:bootstrap_sets]
          config.max_bootstrapped_examples = settings[:max_bootstrapped_examples] if settings.key?(:max_bootstrapped_examples)
          config.max_labeled_examples = settings[:max_labeled_examples] if settings.key?(:max_labeled_examples)
          config.optimization_strategy = settings[:optimization_strategy] if settings[:optimization_strategy]
          config.early_stopping_patience = settings[:early_stopping_patience] if settings[:early_stopping_patience]
          config.minibatch_size = settings[:minibatch_size] if settings.key?(:minibatch_size)
          config.valset_target_size = settings[:valset_target_size] if settings[:valset_target_size]
        end
      end


      # Simple data structure for evaluated candidate configurations (immutable)
      EvaluatedCandidate = Data.define(
        :instruction,
        :few_shot_examples,
        :type,
        :metadata,
        :config_id
      ) do
        extend T::Sig

        # Generate a config ID based on content
        sig { params(instruction: String, few_shot_examples: T::Array[T.untyped], type: CandidateType, metadata: T::Hash[Symbol, T.untyped]).returns(EvaluatedCandidate) }
        def self.create(instruction:, few_shot_examples: [], type: CandidateType::Baseline, metadata: {})
          content = "#{instruction}_#{few_shot_examples.size}_#{type.serialize}_#{metadata.hash}"
          config_id = Digest::SHA256.hexdigest(content)[0, 12]

          new(
            instruction: instruction.freeze,
            few_shot_examples: few_shot_examples.freeze,
            type: type,
            metadata: metadata.freeze,
            config_id: config_id
          )
        end

        sig { returns(T::Hash[Symbol, T.untyped]) }
        def to_h
          {
            instruction: instruction,
            few_shot_examples: few_shot_examples.size,
            type: type.serialize,
            metadata: metadata,
            config_id: config_id
          }
        end
      end

      # Result of MIPROv2 optimization
      class MIPROv2Result < OptimizationResult
        extend T::Sig

        sig { returns(T::Array[EvaluatedCandidate]) }
        attr_reader :evaluated_candidates

        sig { returns(T::Hash[Symbol, T.untyped]) }
        attr_reader :optimization_trace

        sig { returns(T::Hash[Symbol, T.untyped]) }
        attr_reader :bootstrap_statistics

        sig { returns(T::Hash[Symbol, T.untyped]) }
        attr_reader :proposal_statistics

        sig { returns(T.nilable(DSPy::Evaluate::BatchEvaluationResult)) }
        attr_reader :best_evaluation_result

        sig do
          params(
            optimized_program: T.untyped,
            scores: T::Hash[Symbol, T.untyped],
            history: T::Hash[Symbol, T.untyped],
            evaluated_candidates: T::Array[EvaluatedCandidate],
            optimization_trace: T::Hash[Symbol, T.untyped],
            bootstrap_statistics: T::Hash[Symbol, T.untyped],
            proposal_statistics: T::Hash[Symbol, T.untyped],
            best_score_name: T.nilable(String),
            best_score_value: T.nilable(Float),
            metadata: T::Hash[Symbol, T.untyped],
            best_evaluation_result: T.nilable(DSPy::Evaluate::BatchEvaluationResult)
          ).void
        end
        def initialize(optimized_program:, scores:, history:, evaluated_candidates:, optimization_trace:, bootstrap_statistics:, proposal_statistics:, best_score_name: nil, best_score_value: nil, metadata: {}, best_evaluation_result: nil)
          super(
            optimized_program: optimized_program,
            scores: scores,
            history: history,
            best_score_name: best_score_name,
            best_score_value: best_score_value,
            metadata: metadata
          )
          @evaluated_candidates = evaluated_candidates.freeze
          @optimization_trace = optimization_trace.freeze
          @bootstrap_statistics = bootstrap_statistics.freeze
          @proposal_statistics = proposal_statistics.freeze
          @best_evaluation_result = best_evaluation_result&.freeze
        end

        sig { returns(T::Hash[Symbol, T.untyped]) }
        def to_h
          super.merge({
            evaluated_candidates: @evaluated_candidates.map(&:to_h),
            optimization_trace: @optimization_trace,
            bootstrap_statistics: @bootstrap_statistics,
            proposal_statistics: @proposal_statistics,
            best_evaluation_result: @best_evaluation_result&.to_h
          })
        end
      end

      sig { returns(MIPROv2Config) }
      attr_reader :mipro_config

      sig { returns(T.nilable(DSPy::Propose::GroundedProposer)) }
      attr_reader :proposer

      # Override dry-configurable's initialize to add our parameter validation
      def initialize(metric: nil, **kwargs)
        # Reject old config parameter pattern
        if kwargs.key?(:config)
          raise ArgumentError, "config parameter is no longer supported. Use .configure blocks instead."
        end

        # Let dry-configurable handle its initialization
        super(**kwargs)

        # Apply class-level configuration if it exists
        if self.class.default_config_block
          configure(&self.class.default_config_block)
        end

        @metric = metric

        # Initialize proposer with a basic config for now (will be updated later)
        @proposer = DSPy::Propose::GroundedProposer.new(config: DSPy::Propose::GroundedProposer::Config.new)
        @optimization_trace = []
        @evaluated_candidates = []
        @trial_history = {}
      end

      # Main MIPROv2 optimization method
      sig do
        params(
          program: T.untyped,
          trainset: T::Array[T.untyped],
          valset: T.nilable(T::Array[T.untyped])
        ).returns(MIPROv2Result)
      end
      def compile(program, trainset:, valset: nil)
        validate_inputs(program, trainset, valset)

        instrument_step('miprov2_compile', {
          trainset_size: trainset.size,
          valset_size: valset&.size || 0,
          num_trials: config.num_trials,
          optimization_strategy: optimization_strategy_name,
          mode: infer_auto_mode
        }) do
          # Convert examples to typed format
          typed_trainset = ensure_typed_examples(trainset)
          typed_valset = valset ? ensure_typed_examples(valset) : nil

          if auto_preset_active?
            typed_trainset, typed_valset = prepare_datasets_for_auto(typed_trainset, typed_valset)
            typed_valset = apply_auto_preset!(program, typed_valset)
          else
            typed_valset = limit_validation_set(typed_valset, config.valset_target_size)
          end

          # Use validation set if available, otherwise use part of training set
          evaluation_set = typed_valset || typed_trainset.take([typed_trainset.size / 3, 10].max)

          # Phase 1: Bootstrap few-shot examples
          emit_event('phase_start', { phase: 1, name: 'bootstrap' })
          demo_candidates = phase_1_bootstrap(program, typed_trainset)
          emit_event('phase_complete', {
            phase: 1,
            num_predictors: demo_candidates.keys.size,
            demo_sets_per_predictor: demo_candidates[0]&.size || 0
          })

          # Phase 2: Generate instruction candidates
          emit_event('phase_start', { phase: 2, name: 'instruction_proposal' })
          proposal_result = phase_2_propose_instructions(program, typed_trainset, demo_candidates)
          emit_event('phase_complete', {
            phase: 2,
            num_candidates: proposal_result.num_candidates,
            best_instruction_preview: proposal_result.best_instruction[0, 50]
          })

          # Phase 3: Bayesian optimization
          emit_event('phase_start', { phase: 3, name: 'optimization' })
          optimization_result = phase_3_optimize(
            program,
            evaluation_set,
            proposal_result,
            demo_candidates
          )
          emit_event('phase_complete', {
            phase: 3,
            best_score: optimization_result[:best_score],
            trials_completed: optimization_result[:trials_completed]
          })

          # Build final result
          final_result = build_miprov2_result(
            optimization_result,
            demo_candidates,
            proposal_result
          )

          @trial_history = optimization_result[:trial_logs] || {}

          save_results(final_result)
          final_result
        end
      end

      private

      sig { returns(T::Boolean) }
      def auto_preset_active?
        config.auto_preset != AutoPreset::None
      end

      sig { params(trainset: T::Array[DSPy::Example], valset: T.nilable(T::Array[DSPy::Example])).returns([T::Array[DSPy::Example], T::Array[DSPy::Example]]) }
      def prepare_datasets_for_auto(trainset, valset)
        settings = auto_settings_for(config.auto_preset)
        target_size = settings[:valset_target_size]
        config.valset_target_size = target_size

        if valset && valset.any?
          [trainset, limit_validation_set(valset, target_size)]
        else
          raise ArgumentError, "Training set must contain at least 2 examples when auto presets are enabled" if trainset.size < 2

          shuffled = trainset.shuffle(random: Random.new(config.auto_seed))
          default_val_size = [
            [(trainset.size * 0.8).ceil, 1].max,
            trainset.size - 1
          ].min

          desired_val_size = target_size ? [default_val_size, target_size].min : default_val_size
          desired_val_size = [[desired_val_size, 1].max, trainset.size - 1].min

          validation_examples = shuffled.take(desired_val_size)
          training_examples = shuffled.drop(desired_val_size)

          [training_examples, limit_validation_set(validation_examples, target_size)]
        end
      end

      sig { params(program: T.untyped, valset: T::Array[DSPy::Example]).returns(T::Array[DSPy::Example]) }
      def apply_auto_preset!(program, valset)
        settings = auto_settings_for(config.auto_preset)
        zeroshot = zero_shot_for_settings?(settings)
        candidate_budget = settings[:candidate_budget]

        if candidate_budget && candidate_budget.positive?
          config.num_trials = compute_trials_from_candidate_budget(program, candidate_budget, zeroshot)
          instruction_candidates = if zeroshot
            candidate_budget
          else
            settings[:instruction_candidates_when_fewshot] || (candidate_budget / 2.0).ceil
          end
          config.num_instruction_candidates = [instruction_candidates, 1].max
        end

        config.bootstrap_sets = settings[:bootstrap_sets] if settings[:bootstrap_sets]
        config.max_bootstrapped_examples = settings[:max_bootstrapped_examples] if settings.key?(:max_bootstrapped_examples)
        config.max_labeled_examples = settings[:max_labeled_examples] if settings.key?(:max_labeled_examples)
        config.optimization_strategy = settings[:optimization_strategy] if settings[:optimization_strategy]
        config.early_stopping_patience = settings[:early_stopping_patience] if settings[:early_stopping_patience]
        config.minibatch_size = settings[:minibatch_size] if settings.key?(:minibatch_size)

        config.valset_target_size = settings[:valset_target_size]
        limit_validation_set(valset, config.valset_target_size)
      end

      sig { params(valset: T.nilable(T::Array[DSPy::Example]), target_size: T.nilable(Integer)).returns(T.nilable(T::Array[DSPy::Example])) }
      def limit_validation_set(valset, target_size)
        return valset unless valset && target_size && target_size.positive?
        return valset if valset.size <= target_size

        valset.shuffle(random: Random.new(config.auto_seed)).take(target_size)
      end

      sig { params(program: T.untyped, num_candidates: Integer, zeroshot: T::Boolean).returns(Integer) }
      def compute_trials_from_candidate_budget(program, num_candidates, zeroshot)
        predictor_count =
          if program.respond_to?(:predictors)
            Array(program.predictors).size
          else
            1
          end

        predictor_count = 1 if predictor_count.zero?
        variable_count = zeroshot ? predictor_count : predictor_count * 2
        log_term = Math.log2([num_candidates, 2].max)

        [
          (2 * variable_count * log_term).ceil,
          (1.5 * num_candidates).ceil
        ].max
      end

      sig { params(settings: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
      def zero_shot_for_settings?(settings)
        settings.fetch(:max_bootstrapped_examples, 0).to_i.zero? &&
          settings.fetch(:max_labeled_examples, 0).to_i.zero?
      end

      sig { params(preset: AutoPreset).returns(T::Hash[Symbol, T.untyped]) }
      def auto_settings_for(preset)
        AUTO_PRESET_SETTINGS.fetch(preset) do
          raise ArgumentError, "Unknown auto preset: #{preset.inspect}"
        end
      end

      # Phase 1: Bootstrap few-shot examples from training data
      # Returns a hash mapping predictor indices to arrays of demo sets
      sig { params(program: T.untyped, trainset: T::Array[DSPy::Example]).returns(T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]) }
      def phase_1_bootstrap(program, trainset)
        Utils.create_n_fewshot_demo_sets(
          program,
          config.bootstrap_sets, # num_candidate_sets
          trainset,
          max_bootstrapped_demos: config.max_bootstrapped_examples,
          max_labeled_demos: config.max_labeled_examples,
          metric: @metric
        )
      end

      # Phase 2: Generate instruction candidates using grounded proposer
      sig do
        params(
          program: T.untyped,
          trainset: T::Array[DSPy::Example],
          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
        ).returns(DSPy::Propose::GroundedProposer::ProposalResult)
      end
      def phase_2_propose_instructions(program, trainset, demo_candidates)
        # Get current instruction if available
        current_instruction = extract_current_instruction(program)

        # Use few-shot examples from bootstrap if available
        # Flatten demo sets from first predictor and take first 5 examples
        few_shot_examples = demo_candidates[0]&.flatten&.take(5) || []

        # Re-initialize proposer with program and trainset for awareness features
        # This enables program_aware and use_dataset_summary flags to work correctly
        proposer_config = DSPy::Propose::GroundedProposer::Config.new
        proposer_config.num_instruction_candidates = config.num_instruction_candidates

        @proposer = DSPy::Propose::GroundedProposer.new(
          config: proposer_config,
          program: program,
          trainset: trainset
        )

        @proposer.propose_instructions_for_program(
          trainset: trainset,
          program: program,
          demo_candidates: demo_candidates,
          trial_logs: @trial_history,
          num_instruction_candidates: config.num_instruction_candidates
        )
      end

      # Phase 3: Bayesian optimization to find best configuration
      sig do
        params(
          program: T.untyped,
          evaluation_set: T::Array[DSPy::Example],
          proposal_result: DSPy::Propose::GroundedProposer::ProposalResult,
          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
        ).returns(T::Hash[Symbol, T.untyped])
      end
      def phase_3_optimize(program, evaluation_set, proposal_result, demo_candidates)
        # Generate candidate configurations
        candidates = generate_candidate_configurations(proposal_result, demo_candidates)

        # Initialize optimization state
        optimization_state = initialize_optimization_state(candidates)

        # Initialize trial tracking structures
        trial_logs = {}
        param_score_dict = Hash.new { |hash, key| hash[key] = [] }
        fully_evaled_param_combos = {}
        total_eval_calls = 0

        # Run optimization trials
        trials_completed = 0
        best_score = 0.0
        best_candidate = nil
        best_program = program
        best_evaluation_result = nil

        config.num_trials.times do |trial_idx|
          trials_completed = trial_idx + 1

          # Select next candidate based on optimization strategy
          candidate = select_next_candidate(candidates, optimization_state, trial_idx)
          batch_size = evaluation_set.size

          trial_logs[trials_completed] = create_trial_log_entry(
            trial_number: trials_completed,
            candidate: candidate,
            evaluation_type: :full,
            batch_size: batch_size
          )

          emit_event('trial_start', {
            trial_number: trials_completed,
            candidate_id: candidate.config_id,
            instruction_preview: candidate.instruction[0, 50],
            num_few_shot: candidate.few_shot_examples.size
          })

          begin
            # Evaluate candidate
            score, modified_program, evaluation_result = evaluate_candidate(program, candidate, evaluation_set)
            total_eval_calls += batch_size

            instructions_snapshot = extract_program_instructions(modified_program)
            trial_logs[trials_completed][:instructions] = instructions_snapshot unless instructions_snapshot.empty?
            trial_logs[trials_completed][:instruction] = instructions_snapshot[0] if instructions_snapshot.key?(0)

            # Update optimization state
            update_optimization_state(optimization_state, candidate, score)
            record_param_score(
              param_score_dict,
              candidate,
              score,
              evaluation_type: :full,
              instructions: instructions_snapshot
            )
            update_fully_evaled_param_combos(
              fully_evaled_param_combos,
              candidate,
              score,
              instructions: instructions_snapshot
            )

            # Track best result
            is_best = best_candidate.nil? || score > best_score
            if is_best
              best_score = score
              best_candidate = candidate
              best_program = modified_program
              best_evaluation_result = evaluation_result
            end

            finalize_trial_log_entry(
              trial_logs,
              trials_completed,
              score: score,
              evaluation_type: :full,
              batch_size: batch_size,
              total_eval_calls: total_eval_calls
            )

            emit_event('trial_complete', {
              trial_number: trials_completed,
              score: score,
              is_best: is_best,
              candidate_id: candidate.config_id
            })

            # Check early stopping
            if should_early_stop?(optimization_state, trial_idx)
              DSPy.logger.info("Early stopping at trial #{trials_completed}")
              break
            end

          rescue => error
            finalize_trial_log_entry(
              trial_logs,
              trials_completed,
              score: nil,
              evaluation_type: :full,
              batch_size: batch_size,
              total_eval_calls: total_eval_calls,
              error: error.message
            )

            emit_event('trial_error', {
              trial_number: trials_completed,
              error: error.message,
              candidate_id: candidate.config_id
            })

            DSPy.logger.warn("Trial #{trials_completed} failed: #{error.message}")
          end
        end

        {
          best_score: best_score,
          best_candidate: best_candidate,
          best_program: best_program,
          best_evaluation_result: best_evaluation_result,
          trials_completed: trials_completed,
          optimization_state: optimization_state,
          evaluated_candidates: @evaluated_candidates,
          trial_logs: trial_logs,
          param_score_dict: param_score_dict,
          fully_evaled_param_combos: fully_evaled_param_combos,
          total_eval_calls: total_eval_calls
        }
      end

      # Generate candidate configurations from proposals and demo candidates
      sig do
        params(
          proposal_result: DSPy::Propose::GroundedProposer::ProposalResult,
          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
        ).returns(T::Array[EvaluatedCandidate])
      end
      def generate_candidate_configurations(proposal_result, demo_candidates)
        candidates = []
        seen_signatures = Set.new

        add_candidate = lambda do |instruction:, few_shot_examples:, type:, metadata:, config_id:|
          signature = candidate_signature(type, instruction, metadata, few_shot_examples)
          next if seen_signatures.include?(signature)

          seen_signatures << signature
          candidates << EvaluatedCandidate.new(
            instruction: instruction,
            few_shot_examples: few_shot_examples,
            type: type,
            metadata: metadata,
            config_id: config_id
          )
        end

        predictor_instruction_map = if proposal_result.respond_to?(:predictor_instructions) && proposal_result.predictor_instructions.any?
          proposal_result.predictor_instructions
        else
          { 0 => proposal_result.candidate_instructions }
        end

        instruction_maps = build_instruction_maps(predictor_instruction_map)
        demo_maps = build_demo_maps(demo_candidates)

        # Base configuration (no modifications)
        add_candidate.call(
          instruction: "",
          few_shot_examples: [],
          type: CandidateType::Baseline,
          metadata: {
            instructions_map: {},
            demos_map: {}
          },
          config_id: SecureRandom.hex(6)
        )

        instruction_maps.each_with_index do |instruction_map, combo_idx|
          primary_instruction = instruction_map[0] || instruction_map.values.first || ""
          add_candidate.call(
            instruction: primary_instruction,
            few_shot_examples: [],
            type: CandidateType::InstructionOnly,
            metadata: {
              proposal_rank: combo_idx,
              instructions_map: duplicate_instruction_map(instruction_map),
              demos_map: {}
            },
            config_id: SecureRandom.hex(6)
          )
        end

        demo_maps.each_with_index do |demo_map, idx|
          next if demo_map.empty?

          flattened_examples = demo_map.values.flatten
          add_candidate.call(
            instruction: "",
            few_shot_examples: flattened_examples,
            type: CandidateType::FewShotOnly,
            metadata: {
              bootstrap_rank: idx,
              instructions_map: {},
              demos_map: duplicate_demo_map(demo_map)
            },
            config_id: SecureRandom.hex(6)
          )
        end

        # Combined candidates (instruction + few-shot)
        instruction_maps.each_with_index do |instruction_map, combo_idx|
          primary_instruction = instruction_map[0] || instruction_map.values.first || ""
          demo_maps.first(3).each_with_index do |demo_map, demo_idx|
            next if demo_map.empty?

            flattened_examples = demo_map.values.flatten
            add_candidate.call(
              instruction: primary_instruction,
              few_shot_examples: flattened_examples,
              type: CandidateType::Combined,
              metadata: {
                instruction_rank: combo_idx,
                bootstrap_rank: demo_idx,
                instructions_map: duplicate_instruction_map(instruction_map),
                demos_map: duplicate_demo_map(demo_map)
              },
              config_id: SecureRandom.hex(6)
            )
          end
        end

        candidates
      end

      sig { params(predictor_instruction_map: T::Hash[Integer, T::Array[String]]).returns(T::Array[T::Hash[Integer, String]]) }
      def build_instruction_maps(predictor_instruction_map)
        return [{}] if predictor_instruction_map.nil? || predictor_instruction_map.empty?

        normalized = predictor_instruction_map.each_with_object({}) do |(index, instructions), memo|
          next if instructions.nil? || instructions.empty?
          memo[index] = instructions.take(3)
        end

        return [{}] if normalized.empty?

        cartesian_product(normalized)
      end

      sig do
        params(demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]).returns(T::Array[T::Hash[Integer, T::Array[DSPy::FewShotExample]]])
      end
      def build_demo_maps(demo_candidates)
        return [{}] if demo_candidates.nil? || demo_candidates.empty?

        normalized = demo_candidates.each_with_object({}) do |(index, sets), memo|
          next if sets.nil? || sets.empty?
          memo[index] = sets.take(3)
        end

        return [{}] if normalized.empty?

        cartesian_product(normalized)
      end

      sig do
        params(options_hash: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Array[T::Hash[Integer, T.untyped]])
      end
      def cartesian_product(options_hash)
        options_hash.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
          next acc if values.nil? || values.empty?

          acc.flat_map do |existing|
            values.map do |value|
              existing.merge(index => value)
            end
          end
        end
      end

      sig { params(instruction_map: T::Hash[Integer, String]).returns(T::Hash[Integer, String]) }
      def duplicate_instruction_map(instruction_map)
        instruction_map.each_with_object({}) do |(index, instruction), memo|
          memo[index] = instruction.is_a?(String) ? instruction.dup : instruction
        end
      end

      sig do
        params(demo_map: T::Hash[Integer, T::Array[DSPy::FewShotExample]]).returns(T::Hash[Integer, T::Array[DSPy::FewShotExample]])
      end
      def duplicate_demo_map(demo_map)
        demo_map.each_with_object({}) do |(index, demos), memo|
          next if demos.nil?
          memo[index] = demos.map { |demo| demo }
        end
      end

      sig do
        params(
          type: CandidateType,
          instruction: String,
          metadata: T::Hash[Symbol, T.untyped],
          few_shot_examples: T::Array[T.untyped]
        ).returns(String)
      end
      def candidate_signature(type, instruction, metadata, few_shot_examples)
        JSON.generate(
          type: type.serialize,
          instruction: instruction,
          instructions_map: normalize_instruction_map(metadata[:instructions_map] || {}),
          demos_map: normalize_demo_map(metadata[:demos_map] || {}),
          few_shot_examples: few_shot_examples.map { |example| serialize_few_shot_example(example) }
        )
      end

      sig { params(map: T::Hash[Integer, T.untyped]).returns(T::Hash[Integer, String]) }
      def normalize_instruction_map(map)
        map.sort_by { |index, _| index }.each_with_object({}) do |(index, value), memo|
          memo[index] = value.to_s
        end
      end

      sig { params(map: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Hash[Integer, T::Array[T.untyped]]) }
      def normalize_demo_map(map)
        map.sort_by { |index, _| index }.each_with_object({}) do |(index, demos), memo|
          memo[index] = Array(demos).map { |demo| serialize_few_shot_example(demo) }
        end
      end

      sig { params(example: T.untyped).returns(T.untyped) }
      def serialize_few_shot_example(example)
        case example
        when DSPy::FewShotExample
          deep_dup(example.to_h)
        when DSPy::Example
          {
            input: deep_dup(example.input_values),
            expected: deep_dup(example.expected_values)
          }
        when Hash
          deep_dup(example)
        else
          example
        end
      end

      sig { params(examples: T::Array[T.untyped]).returns(T::Array[DSPy::FewShotExample]) }
      def normalize_few_shot_examples(examples)
        examples.map do |example|
          if example.is_a?(DSPy::FewShotExample)
            example
          elsif example.is_a?(DSPy::Example)
            DSPy::FewShotExample.new(
              input: example.input_values,
              output: example.expected_values,
              reasoning: extract_reasoning_from_example(example)
            )
          else
            example
          end
        end
      end

      sig { params(predictor: T.untyped, examples: T::Array[DSPy::FewShotExample]).void }
      def assign_predictor_examples(predictor, examples)
        predictor.demos = examples if predictor.respond_to?(:demos=)
        return unless predictor.respond_to?(:prompt)

        cloned_examples = examples.map { |ex| ex }
        predictor.prompt.instance_variable_set(:@few_shot_examples, cloned_examples.freeze)
      end

      # Initialize optimization state for candidate selection
      sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Hash[Symbol, T.untyped]) }
      def initialize_optimization_state(candidates)
        {
          candidates: candidates,
          scores: {},
          exploration_counts: Hash.new(0),
          temperature: config.init_temperature,
          best_score_history: [],
          diversity_scores: {},
          no_improvement_count: 0
        }
      end

      # Select next candidate based on optimization strategy
      sig do
        params(
          candidates: T::Array[EvaluatedCandidate],
          state: T::Hash[Symbol, T.untyped],
          trial_idx: Integer
        ).returns(EvaluatedCandidate)
      end
      def select_next_candidate(candidates, state, trial_idx)
        case config.optimization_strategy
        when OptimizationStrategy::Greedy
          select_candidate_greedy(candidates, state)
        when OptimizationStrategy::Adaptive
          select_candidate_adaptive(candidates, state, trial_idx)
        when OptimizationStrategy::Bayesian
          select_candidate_bayesian(candidates, state, trial_idx)
        else
          candidates.sample # Random fallback
        end
      end

      # Greedy candidate selection (exploit best known configurations)
      sig { params(candidates: T::Array[EvaluatedCandidate], state: T::Hash[Symbol, T.untyped]).returns(EvaluatedCandidate) }
      def select_candidate_greedy(candidates, state)
        # Prioritize unexplored candidates, then highest scoring
        unexplored = candidates.reject { |c| state[:scores].key?(c.config_id) }
        return unexplored.sample if unexplored.any?

        # Among explored, pick the best
        scored_candidates = candidates.select { |c| state[:scores].key?(c.config_id) }
        scored_candidates.max_by { |c| state[:scores][c.config_id] } || candidates.first
      end

      # Adaptive candidate selection (balance exploration and exploitation)
      sig do
        params(
          candidates: T::Array[EvaluatedCandidate],
          state: T::Hash[Symbol, T.untyped],
          trial_idx: Integer
        ).returns(EvaluatedCandidate)
      end
      def select_candidate_adaptive(candidates, state, trial_idx)
        # Update temperature based on progress
        progress = trial_idx.to_f / config.num_trials
        state[:temperature] = config.init_temperature * (1 - progress) + config.final_temperature * progress

        # Calculate selection scores combining exploitation and exploration
        candidate_scores = candidates.map do |candidate|
          exploitation_score = state[:scores][candidate.config_id] || 0.0
          exploration_bonus = 1.0 / (state[:exploration_counts][candidate.config_id] + 1)

          total_score = exploitation_score + state[:temperature] * exploration_bonus
          [candidate, total_score]
        end

        # Select using softmax with temperature
        if state[:temperature] > 0.01
          # Probabilistic selection
          weights = candidate_scores.map { |_, score| Math.exp(score / state[:temperature]) }
          total_weight = weights.sum
          probabilities = weights.map { |w| w / total_weight }

          random_value = rand
          cumulative = 0.0
          candidate_scores.each_with_index do |(candidate, _), idx|
            cumulative += probabilities[idx]
            return candidate if random_value <= cumulative
          end
        end

        # Fallback to highest scoring
        candidate_scores.max_by { |_, score| score }.first
      end

      # Bayesian candidate selection (use probabilistic model)
      sig do
        params(
          candidates: T::Array[EvaluatedCandidate],
          state: T::Hash[Symbol, T.untyped],
          trial_idx: Integer
        ).returns(EvaluatedCandidate)
      end
      def select_candidate_bayesian(candidates, state, trial_idx)
        # Need at least 3 observations to fit GP, otherwise fall back to adaptive
        return select_candidate_adaptive(candidates, state, trial_idx) if state[:scores].size < 3

        # Get scored candidates for training the GP
        scored_candidates = candidates.select { |c| state[:scores].key?(c.config_id) }
        return select_candidate_adaptive(candidates, state, trial_idx) if scored_candidates.size < 3

        begin
          # Encode candidates as numerical features
          all_candidate_features = encode_candidates_for_gp(candidates)
          scored_features = encode_candidates_for_gp(scored_candidates)
          scored_targets = scored_candidates.map { |c| state[:scores][c.config_id].to_f }

          # Train Gaussian Process
          gp = DSPy::Optimizers::GaussianProcess.new(
            length_scale: 1.0,
            signal_variance: 1.0,
            noise_variance: 0.01
          )
          gp.fit(scored_features, scored_targets)

          # Predict mean and uncertainty for all candidates
          means, stds = gp.predict(all_candidate_features, return_std: true)

          # Upper Confidence Bound (UCB) acquisition function
          kappa = 2.0 * Math.sqrt(Math.log(trial_idx + 1)) # Exploration parameter
          acquisition_scores = means.to_a.zip(stds.to_a).map { |m, s| m + kappa * s }

          # Select candidate with highest acquisition score
          best_idx = acquisition_scores.each_with_index.max_by { |score, _| score }[1]
          candidates[best_idx]

        rescue => e
          # If GP fails for any reason, fall back to adaptive selection
          DSPy.logger.warn("Bayesian optimization failed: #{e.message}. Falling back to adaptive selection.")
          select_candidate_adaptive(candidates, state, trial_idx)
        end
      end

      private


      # Encode candidates as numerical features for Gaussian Process
      sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Array[T::Array[Float]]) }
      def encode_candidates_for_gp(candidates)
        # Simple encoding: use hash of config as features
        # In practice, this could be more sophisticated (e.g., instruction embeddings)
        candidates.map do |candidate|
          # Create deterministic numerical features from the candidate config
          config_hash = candidate.config_id.hash.abs

          # Extract multiple features to create a feature vector
          features = []
          features << (config_hash % 1000).to_f / 1000.0 # Feature 1: hash mod 1000, normalized
          features << ((config_hash / 1000) % 1000).to_f / 1000.0 # Feature 2: different part of hash
          features << ((config_hash / 1_000_000) % 1000).to_f / 1000.0 # Feature 3: high bits

          # Add instruction length if available (Python-compatible: no cap)
          instruction = candidate.instruction
          if instruction && !instruction.empty?
            features << instruction.length.to_f / 100.0 # Instruction length, uncapped
          else
            features << 0.5 # Default value
          end

          features
        end
      end

      # Evaluate a candidate configuration
      sig do
        params(
          program: T.untyped,
          candidate: EvaluatedCandidate,
          evaluation_set: T::Array[DSPy::Example]
        ).returns([Float, T.untyped, DSPy::Evaluate::BatchEvaluationResult])
      end
      def evaluate_candidate(program, candidate, evaluation_set)
        # Apply candidate configuration to program
        modified_program = apply_candidate_configuration(program, candidate)

        # Evaluate modified program
        evaluation_result = if use_concurrent_evaluation?(evaluation_set)
          evaluate_candidate_concurrently(modified_program, evaluation_set)
        else
          evaluate_program(modified_program, evaluation_set)
        end

        # Store evaluation details
        @evaluated_candidates << candidate

        [evaluation_result.pass_rate, modified_program, evaluation_result]
      end

      sig { params(evaluation_set: T::Array[DSPy::Example]).returns(T::Boolean) }
      def use_concurrent_evaluation?(evaluation_set)
        minibatch_size = config.minibatch_size
        return false unless minibatch_size&.positive?
        return false unless config.num_threads && config.num_threads > 1

        evaluation_set.size > minibatch_size
      end

      sig do
        params(
          modified_program: T.untyped,
          evaluation_set: T::Array[DSPy::Example]
        ).returns(DSPy::Evaluate::BatchEvaluationResult)
      end
      def evaluate_candidate_concurrently(modified_program, evaluation_set)
        chunk_size = T.must(config.minibatch_size)
        chunks = evaluation_set.each_slice(chunk_size).map(&:dup)
        return evaluate_program(modified_program, evaluation_set) if chunks.size <= 1

        pool_size = [config.num_threads, chunks.size].min
        pool_size = 1 if pool_size <= 0
        executor = Concurrent::FixedThreadPool.new(pool_size)

        futures = chunks.map do |chunk|
          Concurrent::Promises.future_on(executor) do
            evaluate_program(modified_program, chunk)
          end
        end

        results = futures.map(&:value!)
        combine_batch_results(results)
      ensure
        if executor
          executor.shutdown
          executor.wait_for_termination
        end
      end

      sig do
        params(batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult]).returns(DSPy::Evaluate::BatchEvaluationResult)
      end
      def combine_batch_results(batch_results)
        return DSPy::Evaluate::BatchEvaluationResult.new(results: [], aggregated_metrics: {}) if batch_results.empty?

        combined_results = batch_results.flat_map(&:results)
        total_examples = batch_results.sum(&:total_examples)
        aggregated_metrics = merge_aggregated_metrics(batch_results, total_examples)

        DSPy::Evaluate::BatchEvaluationResult.new(
          results: combined_results,
          aggregated_metrics: aggregated_metrics
        )
      end

      sig do
        params(
          batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult],
          total_examples: Integer
        ).returns(T::Hash[Symbol, T.untyped])
      end
      def merge_aggregated_metrics(batch_results, total_examples)
        return {} if total_examples.zero?

        keys = batch_results.flat_map { |res| res.aggregated_metrics.keys }.uniq
        keys.each_with_object({}) do |key, memo|
          numeric_weight = 0.0
          numeric_sum = 0.0
          fallback_value = nil

          batch_results.each do |res|
            value = res.aggregated_metrics[key]
            next if value.nil?

            if value.is_a?(Numeric)
              numeric_sum += value.to_f * res.total_examples
              numeric_weight += res.total_examples
            else
              fallback_value = value
            end
          end

          if numeric_weight.positive?
            memo[key] = numeric_sum / numeric_weight
          elsif fallback_value
            memo[key] = fallback_value
          end
        end
      end
|
|
1242
|
+
|
|
1243
|
+
# Apply candidate configuration to program
|
|
1244
|
+
sig { params(program: T.untyped, candidate: EvaluatedCandidate).returns(T.untyped) }
|
|
1245
|
+
def apply_candidate_configuration(program, candidate)
|
|
1246
|
+
instructions_map = candidate.metadata[:instructions_map] || {}
|
|
1247
|
+
demos_map = candidate.metadata[:demos_map] || {}
|
|
1248
|
+
|
|
1249
|
+
modified_program = program
|
|
1250
|
+
if modified_program.respond_to?(:predictors) && (instructions_map.any? || demos_map.any?)
|
|
1251
|
+
modified_program = modified_program.clone
|
|
1252
|
+
modified_program.predictors.each_with_index do |predictor, idx|
|
|
1253
|
+
if instructions_map.key?(idx)
|
|
1254
|
+
signature = Utils.get_signature(predictor)
|
|
1255
|
+
updated_signature = signature.with_instructions(instructions_map[idx])
|
|
1256
|
+
Utils.set_signature(predictor, updated_signature)
|
|
1257
|
+
end
|
|
1258
|
+
|
|
1259
|
+
if demos_map.key?(idx)
|
|
1260
|
+
normalized_examples = normalize_few_shot_examples(demos_map[idx])
|
|
1261
|
+
assign_predictor_examples(predictor, normalized_examples)
|
|
1262
|
+
end
|
|
1263
|
+
end
|
|
1264
|
+
end
|
|
1265
|
+
|
|
1266
|
+
# Apply instruction if provided (top-level programs still respect with_instruction)
|
|
1267
|
+
if !candidate.instruction.empty? && modified_program.respond_to?(:with_instruction)
|
|
1268
|
+
modified_program = modified_program.with_instruction(candidate.instruction)
|
|
1269
|
+
end
|
|
1270
|
+
|
|
1271
|
+
should_apply_global_examples = candidate.few_shot_examples.any? &&
|
|
1272
|
+
modified_program.respond_to?(:with_examples) &&
|
|
1273
|
+
(demos_map.empty? || !modified_program.respond_to?(:predictors))
|
|
1274
|
+
|
|
1275
|
+
if should_apply_global_examples
|
|
1276
|
+
normalized_few_shot = normalize_few_shot_examples(candidate.few_shot_examples)
|
|
1277
|
+
modified_program = modified_program.with_examples(normalized_few_shot)
|
|
1278
|
+
end
|
|
1279
|
+
|
|
1280
|
+
modified_program
|
|
1281
|
+
end
|
|
1282
|
+
|
|
1283
|
+
# Update optimization state after candidate evaluation
|
|
1284
|
+
sig do
|
|
1285
|
+
params(
|
|
1286
|
+
state: T::Hash[Symbol, T.untyped],
|
|
1287
|
+
candidate: EvaluatedCandidate,
|
|
1288
|
+
score: Float
|
|
1289
|
+
).void
|
|
1290
|
+
end
|
|
1291
|
+
def update_optimization_state(state, candidate, score)
|
|
1292
|
+
state[:scores][candidate.config_id] = score
|
|
1293
|
+
state[:exploration_counts][candidate.config_id] += 1
|
|
1294
|
+
state[:best_score_history] << score
|
|
1295
|
+
|
|
1296
|
+
# Track diversity if enabled
|
|
1297
|
+
if config.track_diversity
|
|
1298
|
+
state[:diversity_scores][candidate.config_id] = calculate_diversity_score(candidate)
|
|
1299
|
+
end
|
|
1300
|
+
|
|
1301
|
+
# Update no improvement counter
|
|
1302
|
+
if state[:best_score_history].size > 1 && score > state[:best_score_history][-2]
|
|
1303
|
+
state[:no_improvement_count] = 0
|
|
1304
|
+
else
|
|
1305
|
+
state[:no_improvement_count] += 1
|
|
1306
|
+
end
|
|
1307
|
+
end
|
|
1308
|
+
|
|
1309
|
+
# Check if optimization should stop early
|
|
1310
|
+
sig { params(state: T::Hash[Symbol, T.untyped], trial_idx: Integer).returns(T::Boolean) }
|
|
1311
|
+
def should_early_stop?(state, trial_idx)
|
|
1312
|
+
# Don't stop too early
|
|
1313
|
+
return false if trial_idx < config.early_stopping_patience
|
|
1314
|
+
|
|
1315
|
+
# Stop if no improvement for patience trials
|
|
1316
|
+
state[:no_improvement_count] >= config.early_stopping_patience
|
|
1317
|
+
end
|
|
1318
|
+
|
|
1319
|
+
      # Calculate diversity score for candidate (Python-compatible: only few-shot count)
      sig { params(candidate: EvaluatedCandidate).returns(Float) }
      def calculate_diversity_score(candidate)
        # Python DSPy doesn't use instruction length for diversity, only few-shot count
        few_shot_diversity = candidate.few_shot_examples.size / 10.0

        [few_shot_diversity, 1.0].min
      end

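      # Worked example of the scale above: 5 few-shot examples => 0.5,
      # 10 or more examples => capped at 1.0.
      #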
      # Build final MIPROv2 result
      sig do
        params(
          optimization_result: T::Hash[Symbol, T.untyped],
          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]],
          proposal_result: DSPy::Propose::GroundedProposer::ProposalResult
        ).returns(MIPROv2Result)
      end
      def build_miprov2_result(optimization_result, demo_candidates, proposal_result)
        best_candidate = optimization_result[:best_candidate]
        best_program = optimization_result[:best_program]
        best_score = optimization_result[:best_score]
        best_evaluation_result = optimization_result[:best_evaluation_result]

        scores = { pass_rate: best_score }

        history = {
          total_trials: optimization_result[:trials_completed],
          optimization_strategy: optimization_strategy_name,
          early_stopped: optimization_result[:trials_completed] < config.num_trials,
          score_history: optimization_result[:optimization_state][:best_score_history],
          total_eval_calls: optimization_result[:total_eval_calls]
        }

        metadata = {
          optimizer: "MIPROv2",
          auto_mode: infer_auto_mode,
          optimization_strategy: optimization_strategy_name,
          best_instruction: best_candidate&.instruction || "",
          best_few_shot_count: best_candidate&.few_shot_examples&.size || 0,
          best_candidate_type: best_candidate&.type&.serialize || "unknown",
          optimization_timestamp: Time.now.iso8601
        }

        # Create bootstrap statistics from demo_candidates
        num_predictors = demo_candidates.keys.size
        sets_per_predictor = demo_candidates.values.map(&:size)
        all_demo_sets = demo_candidates.values.flat_map { |sets| sets }
        bootstrap_statistics = {
          num_predictors: num_predictors,
          demo_sets_per_predictor: sets_per_predictor.max || 0,
          avg_demos_per_set: all_demo_sets.empty? ? 0 : all_demo_sets.map(&:size).sum.to_f / all_demo_sets.size
        }
        bootstrap_statistics[:per_predictor_demo_counts] = sets_per_predictor if sets_per_predictor.any?

        optimization_trace = serialize_optimization_trace(optimization_result[:optimization_state])
        optimization_trace[:trial_logs] = serialize_trial_logs(optimization_result[:trial_logs])
        optimization_trace[:param_score_dict] = serialize_param_score_dict(optimization_result[:param_score_dict])
        optimization_trace[:fully_evaled_param_combos] = serialize_fully_evaled_param_combos(optimization_result[:fully_evaled_param_combos])
        optimization_trace[:total_eval_calls] = optimization_result[:total_eval_calls]

        MIPROv2Result.new(
          optimized_program: best_program,
          scores: scores,
          history: history,
          best_score_name: "pass_rate",
          best_score_value: best_score,
          metadata: metadata,
          evaluated_candidates: @evaluated_candidates,
          optimization_trace: optimization_trace,
          bootstrap_statistics: bootstrap_statistics,
          proposal_statistics: proposal_result.analysis,
          best_evaluation_result: best_evaluation_result
        )
      end

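      # Minimal sketch of consuming the result built above, assuming MIPROv2Result
      # exposes readers for its constructor arguments (hypothetical caller):
      #
      #   result = build_miprov2_result(optimization_result, demo_candidates, proposal_result)
      #   result.scores[:pass_rate]                        # best validation pass rate
      #   result.history[:total_trials]                    # trials actually run
      #   result.bootstrap_statistics[:avg_demos_per_set]  # average bootstrapped demos
      #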
      # Serialize optimization trace for better JSON output
      sig { params(optimization_state: T.nilable(T::Hash[Symbol, T.untyped])).returns(T::Hash[Symbol, T.untyped]) }
      def serialize_optimization_trace(optimization_state)
        return {} unless optimization_state

        serialized_trace = optimization_state.dup

        # Convert candidate objects to their hash representations
        if serialized_trace[:candidates]
          serialized_trace[:candidates] = serialized_trace[:candidates].map(&:to_h)
        end

        serialized_trace
      end

      sig do
        params(
          trial_number: Integer,
          candidate: EvaluatedCandidate,
          evaluation_type: Symbol,
          batch_size: Integer
        ).returns(T::Hash[Symbol, T.untyped])
      end
      def create_trial_log_entry(trial_number:, candidate:, evaluation_type:, batch_size:)
        # Preserve interface parity with Python implementation (trial number stored implicitly via hash key)
        trial_number # no-op to acknowledge parameter usage
        instructions_map = candidate.metadata[:instructions_map] || {}
        demos_map = candidate.metadata[:demos_map] || {}
        entry = {
          candidate_id: candidate.config_id,
          candidate_type: candidate.type.serialize,
          instruction_preview: candidate.instruction.to_s[0, 160],
          few_shot_count: candidate.few_shot_examples.size,
          metadata: deep_dup(candidate.metadata),
          evaluation_type: evaluation_type,
          batch_size: batch_size,
          status: :in_progress,
          started_at: Time.now.iso8601
        }
        if instructions_map.any?
          entry[:instructions] = duplicate_instruction_map(instructions_map)
          entry[:instruction] = entry[:instructions][0] if entry[:instructions].key?(0)
        elsif candidate.instruction && !candidate.instruction.empty?
          predictor_index = candidate.metadata[:predictor_index] || 0
          entry[:instruction] = candidate.instruction
          entry[:instructions] = { predictor_index => candidate.instruction }
        end
        entry[:few_shot_map] = duplicate_demo_map(demos_map) if demos_map.any?
        entry
      end

      sig do
        params(
          trial_logs: T::Hash[Integer, T::Hash[Symbol, T.untyped]],
          trial_number: Integer,
          score: T.nilable(Float),
          evaluation_type: Symbol,
          batch_size: Integer,
          total_eval_calls: Integer,
          error: T.nilable(String)
        ).void
      end
      def finalize_trial_log_entry(trial_logs, trial_number, score:, evaluation_type:, batch_size:, total_eval_calls:, error: nil)
        entry = trial_logs[trial_number] || {}
        entry[:score] = score if score
        entry[:evaluation_type] = evaluation_type
        entry[:batch_size] = batch_size
        entry[:total_eval_calls] = total_eval_calls
        entry[:status] = error ? :error : :completed
        entry[:error] = error if error
        entry[:completed_at] = Time.now.iso8601
        trial_logs[trial_number] = entry
      end

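      # Minimal sketch of the trial-log lifecycle, assuming trial 3 is evaluated on a
      # minibatch of 25 examples (the :minibatch symbol and all counts are hypothetical):
      #
      #   trial_logs[3] = create_trial_log_entry(trial_number: 3, candidate: candidate,
      #                                          evaluation_type: :minibatch, batch_size: 25)
      #   finalize_trial_log_entry(trial_logs, 3, score: 0.74, evaluation_type: :minibatch,
      #                            batch_size: 25, total_eval_calls: 125)
      #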
      sig do
        params(
          param_score_dict: T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]],
          candidate: EvaluatedCandidate,
          score: Float,
          evaluation_type: Symbol,
          instructions: T.nilable(T::Hash[Integer, String])
        ).void
      end
      def record_param_score(param_score_dict, candidate, score, evaluation_type:, instructions: nil)
        instructions_hash = instructions || {}
        if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
          predictor_index = candidate.metadata[:predictor_index] || 0
          instructions_hash[predictor_index] = candidate.instruction
        end

        record = {
          candidate_id: candidate.config_id,
          candidate_type: candidate.type.serialize,
          score: score,
          evaluation_type: evaluation_type,
          timestamp: Time.now.iso8601,
          metadata: deep_dup(candidate.metadata)
        }
        primary_instruction = instructions_hash[0] || candidate.instruction
        record[:instruction] = primary_instruction if primary_instruction && !primary_instruction.empty?
        record[:instructions] = instructions_hash unless instructions_hash.empty?

        param_score_dict[candidate.config_id] << record
      end

      sig do
        params(
          fully_evaled_param_combos: T::Hash[String, T::Hash[Symbol, T.untyped]],
          candidate: EvaluatedCandidate,
          score: Float,
          instructions: T.nilable(T::Hash[Integer, String])
        ).void
      end
      def update_fully_evaled_param_combos(fully_evaled_param_combos, candidate, score, instructions: nil)
        existing = fully_evaled_param_combos[candidate.config_id]
        if existing.nil? || score > existing[:score]
          instructions_hash = instructions || {}
          if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
            predictor_index = candidate.metadata[:predictor_index] || 0
            instructions_hash[predictor_index] = candidate.instruction
          end

          fully_evaled_param_combos[candidate.config_id] = {
            candidate_id: candidate.config_id,
            candidate_type: candidate.type.serialize,
            score: score,
            metadata: deep_dup(candidate.metadata),
            updated_at: Time.now.iso8601
          }
          unless instructions_hash.empty?
            fully_evaled_param_combos[candidate.config_id][:instructions] = instructions_hash
            fully_evaled_param_combos[candidate.config_id][:instruction] = instructions_hash[0] || candidate.instruction
          end
        end
      end

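      # Minimal sketch of the best-score-wins behaviour above (scores are hypothetical):
      #
      #   update_fully_evaled_param_combos(combos, candidate, 0.70)
      #   update_fully_evaled_param_combos(combos, candidate, 0.65) # keeps the 0.70 record
      #   update_fully_evaled_param_combos(combos, candidate, 0.80) # overwrites with 0.80
      #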
      sig { params(trial_logs: T.nilable(T::Hash[Integer, T::Hash[Symbol, T.untyped]])).returns(T::Hash[Integer, T::Hash[Symbol, T.untyped]]) }
      def serialize_trial_logs(trial_logs)
        return {} unless trial_logs

        allowed_keys = [
          :candidate_id,
          :candidate_type,
          :instruction_preview,
          :instruction,
          :instructions,
          :few_shot_count,
          :metadata,
          :evaluation_type,
          :batch_size,
          :score,
          :status,
          :error,
          :started_at,
          :completed_at,
          :total_eval_calls
        ]

        trial_logs.transform_values do |entry|
          entry.each_with_object({}) do |(key, value), memo|
            memo[key] = value if allowed_keys.include?(key)
          end
        end
      end

      sig { params(param_score_dict: T.nilable(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]])).returns(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]]) }
      def serialize_param_score_dict(param_score_dict)
        return {} unless param_score_dict

        allowed_keys = [:candidate_id, :candidate_type, :score, :evaluation_type, :timestamp, :metadata, :instruction, :instructions]

        param_score_dict.transform_values do |records|
          records.map do |record|
            record.each_with_object({}) do |(key, value), memo|
              memo[key] = value if allowed_keys.include?(key)
            end
          end
        end
      end

      sig { params(fully_evaled_param_combos: T.nilable(T::Hash[String, T::Hash[Symbol, T.untyped]])).returns(T::Hash[String, T::Hash[Symbol, T.untyped]]) }
      def serialize_fully_evaled_param_combos(fully_evaled_param_combos)
        return {} unless fully_evaled_param_combos

        allowed_keys = [:candidate_id, :candidate_type, :score, :metadata, :updated_at, :instruction, :instructions]

        fully_evaled_param_combos.transform_values do |record|
          record.each_with_object({}) do |(key, value), memo|
            memo[key] = value if allowed_keys.include?(key)
          end
        end
      end

      sig { params(value: T.untyped).returns(T.untyped) }
      def deep_dup(value)
        case value
        when Hash
          value.each_with_object({}) { |(k, v), memo| memo[k] = deep_dup(v) }
        when Array
          value.map { |element| deep_dup(element) }
        else
          value
        end
      end

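      # Worked example: deep_dup({ a: [1, { b: 2 }] }) returns an independent copy, so
      # later mutation of candidate metadata cannot alter already-recorded trial logs.
      # Non-collection values (strings, times) are returned as-is rather than copied.
      #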
      # Helper methods
      sig { returns(String) }
      def optimization_strategy_name
        strategy = config.optimization_strategy
        return strategy.serialize if strategy.respond_to?(:serialize)

        strategy.to_s
      end

      sig { params(program: T.untyped).returns(T.nilable(String)) }
      def extract_current_instruction(program)
        if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
          program.prompt.instruction
        elsif program.respond_to?(:system_signature)
          system_sig = program.system_signature
          system_sig.is_a?(String) ? system_sig : nil
        else
          nil
        end
      end

      sig { params(program: T.untyped).returns(T::Hash[Integer, String]) }
      def extract_program_instructions(program)
        instructions = {}
        if program.respond_to?(:predictors)
          program.predictors.each_with_index do |predictor, index|
            if predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:instruction)
              value = predictor.prompt.instruction
              instructions[index] = value if value
            end
          end
        else
          fallback_instruction = extract_current_instruction(program)
          instructions[0] = fallback_instruction if fallback_instruction
        end
        instructions
      end

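      # Minimal sketch of the return shape for a two-predictor program
      # (instruction strings are hypothetical):
      #
      #   extract_program_instructions(program)
      #   # => { 0 => "Classify the sentiment.", 1 => "Explain your reasoning." }
      #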
      sig { params(program: T.untyped).returns(T.nilable(T.class_of(DSPy::Signature))) }
      def extract_signature_class(program)
        program.respond_to?(:signature_class) ? program.signature_class : nil
      end

      sig { params(example: T.untyped).returns(T.nilable(String)) }
      def extract_reasoning_from_example(example)
        case example
        when DSPy::Example
          if example.expected_values.key?(:reasoning)
            example.expected_values[:reasoning]
          elsif example.expected_values.key?(:explanation)
            example.expected_values[:explanation]
          else
            nil
          end
        else
          nil
        end
      end

      # Infer auto mode based on configuration
      sig { returns(String) }
      def infer_auto_mode
        return config.auto_preset.serialize unless config.auto_preset == AutoPreset::None

        case config.num_trials
        when 0..6 then "light"
        when 7..12 then "medium"
        when 13..Float::INFINITY then "heavy"
        else "manual"
        end
      end
    end
  end
end