dspy 0.28.2 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -3
- data/lib/dspy/code_act.rb +14 -1
- data/lib/dspy/datasets/ade.rb +90 -0
- data/lib/dspy/datasets.rb +8 -0
- data/lib/dspy/lm.rb +4 -8
- data/lib/dspy/mixins/struct_builder.rb +17 -25
- data/lib/dspy/module.rb +12 -1
- data/lib/dspy/observability/async_span_processor.rb +67 -93
- data/lib/dspy/observability.rb +43 -1
- data/lib/dspy/predict.rb +10 -0
- data/lib/dspy/propose/dataset_summary_generator.rb +36 -3
- data/lib/dspy/propose/grounded_proposer.rb +118 -11
- data/lib/dspy/re_act.rb +13 -0
- data/lib/dspy/reflection_lm.rb +36 -0
- data/lib/dspy/teleprompt/gepa.rb +448 -2803
- data/lib/dspy/teleprompt/mipro_v2.rb +564 -65
- data/lib/dspy/teleprompt/utils.rb +8 -3
- data/lib/dspy/version.rb +2 -2
- data/lib/dspy.rb +3 -2
- data/lib/gepa/api.rb +61 -0
- data/lib/gepa/core/engine.rb +226 -0
- data/lib/gepa/core/evaluation_batch.rb +26 -0
- data/lib/gepa/core/result.rb +92 -0
- data/lib/gepa/core/state.rb +231 -0
- data/lib/gepa/logging/experiment_tracker.rb +54 -0
- data/lib/gepa/logging/logger.rb +57 -0
- data/lib/gepa/logging.rb +9 -0
- data/lib/gepa/proposer/base.rb +27 -0
- data/lib/gepa/proposer/merge_proposer.rb +424 -0
- data/lib/gepa/proposer/reflective_mutation/base.rb +48 -0
- data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +188 -0
- data/lib/gepa/strategies/batch_sampler.rb +91 -0
- data/lib/gepa/strategies/candidate_selector.rb +97 -0
- data/lib/gepa/strategies/component_selector.rb +57 -0
- data/lib/gepa/strategies/instruction_proposal.rb +120 -0
- data/lib/gepa/telemetry.rb +122 -0
- data/lib/gepa/utils/pareto.rb +119 -0
- data/lib/gepa.rb +21 -0
- metadata +42 -4
- data/lib/dspy/teleprompt/simple_optimizer.rb +0 -503
@@ -1,7 +1,10 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'digest'
|
4
|
+
require 'time'
|
5
|
+
require 'concurrent-ruby'
|
4
6
|
require 'sorbet-runtime'
|
7
|
+
require 'securerandom'
|
5
8
|
require_relative 'teleprompter'
|
6
9
|
require_relative 'utils'
|
7
10
|
require_relative '../propose/grounded_proposer'
|
@@ -124,6 +127,7 @@ module DSPy
|
|
124
127
|
setting :track_diversity, default: true
|
125
128
|
setting :max_errors, default: 3
|
126
129
|
setting :num_threads, default: 1
|
130
|
+
setting :minibatch_size, default: nil
|
127
131
|
|
128
132
|
# Class-level configuration method - sets defaults for new instances
|
129
133
|
def self.configure(&block)
|
@@ -265,6 +269,7 @@ module DSPy
|
|
265
269
|
@proposer = DSPy::Propose::GroundedProposer.new(config: DSPy::Propose::GroundedProposer::Config.new)
|
266
270
|
@optimization_trace = []
|
267
271
|
@evaluated_candidates = []
|
272
|
+
@trial_history = {}
|
268
273
|
end
|
269
274
|
|
270
275
|
# Main MIPROv2 optimization method
|
@@ -282,7 +287,7 @@ module DSPy
|
|
282
287
|
trainset_size: trainset.size,
|
283
288
|
valset_size: valset&.size || 0,
|
284
289
|
num_trials: config.num_trials,
|
285
|
-
optimization_strategy:
|
290
|
+
optimization_strategy: optimization_strategy_name,
|
286
291
|
mode: infer_auto_mode
|
287
292
|
}) do
|
288
293
|
# Convert examples to typed format
|
@@ -331,6 +336,8 @@ module DSPy
|
|
331
336
|
proposal_result
|
332
337
|
)
|
333
338
|
|
339
|
+
@trial_history = optimization_result[:trial_logs] || {}
|
340
|
+
|
334
341
|
save_results(final_result)
|
335
342
|
final_result
|
336
343
|
end
|
@@ -368,10 +375,6 @@ module DSPy
|
|
368
375
|
# Flatten demo sets from first predictor and take first 5 examples
|
369
376
|
few_shot_examples = demo_candidates[0]&.flatten&.take(5) || []
|
370
377
|
|
371
|
-
# Get signature class from program
|
372
|
-
signature_class = extract_signature_class(program)
|
373
|
-
raise ArgumentError, "Cannot extract signature class from program" unless signature_class
|
374
|
-
|
375
378
|
# Re-initialize proposer with program and trainset for awareness features
|
376
379
|
# This enables program_aware and use_dataset_summary flags to work correctly
|
377
380
|
proposer_config = DSPy::Propose::GroundedProposer::Config.new
|
@@ -383,11 +386,12 @@ module DSPy
|
|
383
386
|
trainset: trainset
|
384
387
|
)
|
385
388
|
|
386
|
-
@proposer.
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
389
|
+
@proposer.propose_instructions_for_program(
|
390
|
+
trainset: trainset,
|
391
|
+
program: program,
|
392
|
+
demo_candidates: demo_candidates,
|
393
|
+
trial_logs: @trial_history,
|
394
|
+
num_instruction_candidates: config.num_instruction_candidates
|
391
395
|
)
|
392
396
|
end
|
393
397
|
|
@@ -406,12 +410,18 @@ module DSPy
|
|
406
410
|
|
407
411
|
# Initialize optimization state
|
408
412
|
optimization_state = initialize_optimization_state(candidates)
|
409
|
-
|
413
|
+
|
414
|
+
# Initialize trial tracking structures
|
415
|
+
trial_logs = {}
|
416
|
+
param_score_dict = Hash.new { |hash, key| hash[key] = [] }
|
417
|
+
fully_evaled_param_combos = {}
|
418
|
+
total_eval_calls = 0
|
419
|
+
|
410
420
|
# Run optimization trials
|
411
421
|
trials_completed = 0
|
412
422
|
best_score = 0.0
|
413
423
|
best_candidate = nil
|
414
|
-
best_program =
|
424
|
+
best_program = program
|
415
425
|
best_evaluation_result = nil
|
416
426
|
|
417
427
|
config.num_trials.times do |trial_idx|
|
@@ -419,6 +429,14 @@ module DSPy
|
|
419
429
|
|
420
430
|
# Select next candidate based on optimization strategy
|
421
431
|
candidate = select_next_candidate(candidates, optimization_state, trial_idx)
|
432
|
+
batch_size = evaluation_set.size
|
433
|
+
|
434
|
+
trial_logs[trials_completed] = create_trial_log_entry(
|
435
|
+
trial_number: trials_completed,
|
436
|
+
candidate: candidate,
|
437
|
+
evaluation_type: :full,
|
438
|
+
batch_size: batch_size
|
439
|
+
)
|
422
440
|
|
423
441
|
emit_event('trial_start', {
|
424
442
|
trial_number: trials_completed,
|
@@ -430,12 +448,30 @@ module DSPy
|
|
430
448
|
begin
|
431
449
|
# Evaluate candidate
|
432
450
|
score, modified_program, evaluation_result = evaluate_candidate(program, candidate, evaluation_set)
|
451
|
+
total_eval_calls += batch_size
|
452
|
+
|
453
|
+
instructions_snapshot = extract_program_instructions(modified_program)
|
454
|
+
trial_logs[trials_completed][:instructions] = instructions_snapshot unless instructions_snapshot.empty?
|
455
|
+
trial_logs[trials_completed][:instruction] = instructions_snapshot[0] if instructions_snapshot.key?(0)
|
433
456
|
|
434
457
|
# Update optimization state
|
435
458
|
update_optimization_state(optimization_state, candidate, score)
|
459
|
+
record_param_score(
|
460
|
+
param_score_dict,
|
461
|
+
candidate,
|
462
|
+
score,
|
463
|
+
evaluation_type: :full,
|
464
|
+
instructions: instructions_snapshot
|
465
|
+
)
|
466
|
+
update_fully_evaled_param_combos(
|
467
|
+
fully_evaled_param_combos,
|
468
|
+
candidate,
|
469
|
+
score,
|
470
|
+
instructions: instructions_snapshot
|
471
|
+
)
|
436
472
|
|
437
473
|
# Track best result
|
438
|
-
is_best = score > best_score
|
474
|
+
is_best = best_candidate.nil? || score > best_score
|
439
475
|
if is_best
|
440
476
|
best_score = score
|
441
477
|
best_candidate = candidate
|
@@ -443,6 +479,15 @@ module DSPy
|
|
443
479
|
best_evaluation_result = evaluation_result
|
444
480
|
end
|
445
481
|
|
482
|
+
finalize_trial_log_entry(
|
483
|
+
trial_logs,
|
484
|
+
trials_completed,
|
485
|
+
score: score,
|
486
|
+
evaluation_type: :full,
|
487
|
+
batch_size: batch_size,
|
488
|
+
total_eval_calls: total_eval_calls
|
489
|
+
)
|
490
|
+
|
446
491
|
emit_event('trial_complete', {
|
447
492
|
trial_number: trials_completed,
|
448
493
|
score: score,
|
@@ -457,6 +502,16 @@ module DSPy
|
|
457
502
|
end
|
458
503
|
|
459
504
|
rescue => error
|
505
|
+
finalize_trial_log_entry(
|
506
|
+
trial_logs,
|
507
|
+
trials_completed,
|
508
|
+
score: nil,
|
509
|
+
evaluation_type: :full,
|
510
|
+
batch_size: batch_size,
|
511
|
+
total_eval_calls: total_eval_calls,
|
512
|
+
error: error.message
|
513
|
+
)
|
514
|
+
|
460
515
|
emit_event('trial_error', {
|
461
516
|
trial_number: trials_completed,
|
462
517
|
error: error.message,
|
@@ -474,7 +529,11 @@ module DSPy
|
|
474
529
|
best_evaluation_result: best_evaluation_result,
|
475
530
|
trials_completed: trials_completed,
|
476
531
|
optimization_state: optimization_state,
|
477
|
-
evaluated_candidates: @evaluated_candidates
|
532
|
+
evaluated_candidates: @evaluated_candidates,
|
533
|
+
trial_logs: trial_logs,
|
534
|
+
param_score_dict: param_score_dict,
|
535
|
+
fully_evaled_param_combos: fully_evaled_param_combos,
|
536
|
+
total_eval_calls: total_eval_calls
|
478
537
|
}
|
479
538
|
end
|
480
539
|
|
@@ -488,61 +547,172 @@ module DSPy
|
|
488
547
|
# Builds the list of candidate configurations to score during optimization.
#
# Candidates are appended in this order: the unmodified baseline, one
# InstructionOnly candidate per instruction combination, one FewShotOnly
# candidate per demo combination, then Combined candidates pairing every
# instruction combination with the first three demo combinations.
#
# @param proposal_result [#candidate_instructions] proposer output; when it
#   also responds to +predictor_instructions+ and that map is non-empty, the
#   per-predictor instructions are used instead
# @param demo_candidates [Hash{Integer => Array<Array>}] bootstrapped demo
#   sets keyed by predictor index
# @return [Array<EvaluatedCandidate>] candidates in the order described above
def generate_candidate_configurations(proposal_result, demo_candidates)
  candidates = []

  predictor_instruction_map =
    if proposal_result.respond_to?(:predictor_instructions) && proposal_result.predictor_instructions.any?
      proposal_result.predictor_instructions
    else
      { 0 => proposal_result.candidate_instructions }
    end

  instruction_maps = build_instruction_maps(predictor_instruction_map)
  demo_maps = build_demo_maps(demo_candidates)

  # Base configuration (no modifications)
  candidates << EvaluatedCandidate.new(
    instruction: "",
    few_shot_examples: [],
    type: CandidateType::Baseline,
    metadata: {
      instructions_map: {},
      demos_map: {}
    },
    config_id: SecureRandom.hex(6)
  )

  # Instruction-only candidates
  instruction_maps.each_with_index do |instruction_map, combo_idx|
    # build_instruction_maps returns [{}] when no instruction was proposed.
    # An empty map changes nothing, so the candidate would merely repeat the
    # baseline; skip it, mirroring the guard used for empty demo maps below.
    next if instruction_map.empty?

    primary_instruction = instruction_map[0] || instruction_map.values.first || ""
    candidates << EvaluatedCandidate.new(
      instruction: primary_instruction,
      few_shot_examples: [],
      type: CandidateType::InstructionOnly,
      metadata: {
        proposal_rank: combo_idx,
        instructions_map: duplicate_instruction_map(instruction_map),
        demos_map: {}
      },
      config_id: SecureRandom.hex(6)
    )
  end

  # Few-shot-only candidates
  demo_maps.each_with_index do |demo_map, idx|
    next if demo_map.empty?

    flattened_examples = demo_map.values.flatten
    candidates << EvaluatedCandidate.new(
      instruction: "",
      few_shot_examples: flattened_examples,
      type: CandidateType::FewShotOnly,
      metadata: {
        bootstrap_rank: idx,
        instructions_map: {},
        demos_map: duplicate_demo_map(demo_map)
      },
      config_id: SecureRandom.hex(6)
    )
  end

  # Combined candidates (instruction + few-shot)
  instruction_maps.each_with_index do |instruction_map, combo_idx|
    # Without an instruction the pairing would repeat a few-shot-only candidate.
    next if instruction_map.empty?

    primary_instruction = instruction_map[0] || instruction_map.values.first || ""
    demo_maps.first(3).each_with_index do |demo_map, demo_idx|
      next if demo_map.empty?

      flattened_examples = demo_map.values.flatten
      candidates << EvaluatedCandidate.new(
        instruction: primary_instruction,
        few_shot_examples: flattened_examples,
        type: CandidateType::Combined,
        metadata: {
          instruction_rank: combo_idx,
          bootstrap_rank: demo_idx,
          instructions_map: duplicate_instruction_map(instruction_map),
          demos_map: duplicate_demo_map(demo_map)
        },
        config_id: SecureRandom.hex(6)
      )
    end
  end

  candidates
end
|
545
627
|
|
628
|
+
sig { params(predictor_instruction_map: T::Hash[Integer, T::Array[String]]).returns(T::Array[T::Hash[Integer, String]]) }
|
629
|
+
# Expands per-predictor instruction lists into every cross-predictor combination.
#
# Predictors whose list is nil or empty are ignored, and only the first three
# instructions of each remaining predictor take part in the expansion.
#
# @param predictor_instruction_map [Hash{Integer => Array<String>}, nil]
# @return [Array<Hash{Integer => String}>] one map per combination, or +[{}]+
#   when there is nothing to combine
def build_instruction_maps(predictor_instruction_map)
  return [{}] if predictor_instruction_map.nil? || predictor_instruction_map.empty?

  usable = predictor_instruction_map.reject do |_index, instructions|
    instructions.nil? || instructions.empty?
  end
  return [{}] if usable.empty?

  cartesian_product(usable.transform_values { |instructions| instructions.take(3) })
end
|
641
|
+
|
642
|
+
sig do
|
643
|
+
params(demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]).returns(T::Array[T::Hash[Integer, T::Array[DSPy::FewShotExample]]])
|
644
|
+
end
|
645
|
+
# Expands per-predictor demo sets into every cross-predictor combination.
#
# Predictors with nil or empty set lists are ignored, and only the first three
# demo sets of each remaining predictor take part in the expansion.
#
# @param demo_candidates [Hash{Integer => Array<Array>}, nil]
# @return [Array<Hash{Integer => Array}>] one map per combination, or +[{}]+
#   when there is nothing to combine
def build_demo_maps(demo_candidates)
  return [{}] if demo_candidates.nil? || demo_candidates.empty?

  trimmed = {}
  demo_candidates.each do |index, sets|
    trimmed[index] = sets.take(3) unless sets.nil? || sets.empty?
  end

  trimmed.empty? ? [{}] : cartesian_product(trimmed)
end
|
657
|
+
|
658
|
+
sig do
|
659
|
+
params(options_hash: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Array[T::Hash[Integer, T.untyped]])
|
660
|
+
end
|
661
|
+
# Returns every way of picking one value per key from +options_hash+.
#
# Keys are processed in ascending order, so each resulting hash lists its keys
# sorted and the first key varies slowest. Keys whose value list is nil or
# empty are left out of every combination.
#
# @param options_hash [Hash{Integer => Array}] candidate values per key
# @return [Array<Hash{Integer => Object}>] all combinations; +[{}]+ when no
#   key has any values
def cartesian_product(options_hash)
  populated = options_hash
    .sort_by { |index, _| index }
    .reject { |_, values| values.nil? || values.empty? }
  return [{}] if populated.empty?

  indices = populated.map(&:first)
  first_values, *other_values = populated.map(&:last)

  # Array#product enumerates the combinations in the same order as nested
  # loops over the value lists, first list outermost.
  first_values.product(*other_values).map { |combination| indices.zip(combination).to_h }
end
|
672
|
+
|
673
|
+
sig { params(instruction_map: T::Hash[Integer, String]).returns(T::Hash[Integer, String]) }
|
674
|
+
# Copies an instruction map so the copy can be stored without sharing strings.
#
# @param instruction_map [Hash{Integer => String}]
# @return [Hash{Integer => String}] new hash; String values are duplicated,
#   any other value is carried over unchanged
def duplicate_instruction_map(instruction_map)
  instruction_map.transform_values do |instruction|
    case instruction
    when String then instruction.dup
    else instruction
    end
  end
end
|
679
|
+
|
680
|
+
sig do
|
681
|
+
params(demo_map: T::Hash[Integer, T::Array[DSPy::FewShotExample]]).returns(T::Hash[Integer, T::Array[DSPy::FewShotExample]])
|
682
|
+
end
|
683
|
+
# Copies a demo map so the stored arrays are independent of the caller's.
#
# Entries whose demo list is nil are dropped. The demo objects themselves are
# not copied, only the arrays that hold them.
#
# @param demo_map [Hash{Integer => Array, nil}]
# @return [Hash{Integer => Array}] new hash with a shallow copy of each list
def duplicate_demo_map(demo_map)
  demo_map.compact.transform_values(&:dup)
end
|
689
|
+
|
690
|
+
sig { params(examples: T::Array[T.untyped]).returns(T::Array[DSPy::FewShotExample]) }
|
691
|
+
# Converts DSPy::Example entries into DSPy::FewShotExample instances.
#
# Entries that already are few-shot examples, and entries of any other type,
# are passed through unchanged.
#
# @param examples [Array]
# @return [Array] same length and order as +examples+
def normalize_few_shot_examples(examples)
  examples.map do |example|
    next example if example.is_a?(DSPy::FewShotExample) || !example.is_a?(DSPy::Example)

    DSPy::FewShotExample.new(
      input: example.input_values,
      output: example.expected_values,
      reasoning: extract_reasoning_from_example(example)
    )
  end
end
|
706
|
+
|
707
|
+
sig { params(predictor: T.untyped, examples: T::Array[DSPy::FewShotExample]).void }
|
708
|
+
# Installs few-shot examples on a predictor.
#
# Sets +demos+ when the predictor has a writer for it, and, when the predictor
# exposes a +prompt+, overwrites that prompt's @few_shot_examples with a
# frozen copy of the list.
# NOTE(review): the prompt object is modified in place; if a prompt is shared
# between program copies the change is visible to all of them — confirm.
#
# @param predictor [Object]
# @param examples [Array]
def assign_predictor_examples(predictor, examples)
  predictor.demos = examples if predictor.respond_to?(:demos=)

  if predictor.respond_to?(:prompt)
    frozen_copy = [*examples].freeze
    predictor.prompt.instance_variable_set(:@few_shot_examples, frozen_copy)
  end
end
|
715
|
+
|
546
716
|
# Initialize optimization state for candidate selection
|
547
717
|
sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Hash[Symbol, T.untyped]) }
|
548
718
|
def initialize_optimization_state(candidates)
|
@@ -722,7 +892,11 @@ module DSPy
|
|
722
892
|
modified_program = apply_candidate_configuration(program, candidate)
|
723
893
|
|
724
894
|
# Evaluate modified program
|
725
|
-
evaluation_result =
|
895
|
+
evaluation_result = if use_concurrent_evaluation?(evaluation_set)
|
896
|
+
evaluate_candidate_concurrently(modified_program, evaluation_set)
|
897
|
+
else
|
898
|
+
evaluate_program(modified_program, evaluation_set)
|
899
|
+
end
|
726
900
|
|
727
901
|
# Store evaluation details
|
728
902
|
@evaluated_candidates << candidate
|
@@ -730,32 +904,131 @@ module DSPy
|
|
730
904
|
[evaluation_result.pass_rate, modified_program, evaluation_result]
|
731
905
|
end
|
732
906
|
|
907
|
+
sig { params(evaluation_set: T::Array[DSPy::Example]).returns(T::Boolean) }
|
908
|
+
# Decides whether a candidate should be evaluated in parallel minibatches.
#
# Concurrency is used only when a positive minibatch size is configured, more
# than one thread is configured, and the evaluation set is larger than a
# single minibatch.
#
# @param evaluation_set [Array]
# @return [Boolean]
def use_concurrent_evaluation?(evaluation_set)
  batch_limit = config.minibatch_size
  return false if batch_limit.nil? || !batch_limit.positive?

  thread_count = config.num_threads
  return false unless thread_count && thread_count > 1

  evaluation_set.size > batch_limit
end
|
915
|
+
|
916
|
+
sig do
|
917
|
+
params(
|
918
|
+
modified_program: T.untyped,
|
919
|
+
evaluation_set: T::Array[DSPy::Example]
|
920
|
+
).returns(DSPy::Evaluate::BatchEvaluationResult)
|
921
|
+
end
|
922
|
+
# Evaluates a program over the evaluation set in parallel minibatches.
#
# The set is cut into slices of config.minibatch_size examples. Each slice is
# evaluated with evaluate_program on a fixed thread pool sized to the smaller
# of config.num_threads and the slice count (never below one), and the
# per-slice results are merged with combine_batch_results. A set that fits in
# one slice is evaluated directly without creating a pool.
#
# @param modified_program [Object] program with the candidate already applied
# @param evaluation_set [Array] examples to evaluate
# @return [DSPy::Evaluate::BatchEvaluationResult]
# @raise whatever a slice evaluation raised (surfaced through +value!+)
def evaluate_candidate_concurrently(modified_program, evaluation_set)
  slice_length = T.must(config.minibatch_size)
  batches = evaluation_set.each_slice(slice_length).map(&:dup)
  return evaluate_program(modified_program, evaluation_set) if batches.size <= 1

  worker_count = [[config.num_threads, batches.size].min, 1].max
  executor = Concurrent::FixedThreadPool.new(worker_count)

  pending = batches.map do |batch|
    Concurrent::Promises.future_on(executor) { evaluate_program(modified_program, batch) }
  end

  combine_batch_results(pending.map(&:value!))
ensure
  # executor is still nil when the single-slice shortcut returned early.
  executor&.shutdown
  executor&.wait_for_termination
end
|
945
|
+
|
946
|
+
sig do
|
947
|
+
params(batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult]).returns(DSPy::Evaluate::BatchEvaluationResult)
|
948
|
+
end
|
949
|
+
# Merges per-minibatch evaluation results into one batch result.
#
# Individual results are concatenated in batch order and the aggregated
# metrics are merged by merge_aggregated_metrics using the summed example
# count. An empty input yields an empty result.
#
# @param batch_results [Array<DSPy::Evaluate::BatchEvaluationResult>]
# @return [DSPy::Evaluate::BatchEvaluationResult]
def combine_batch_results(batch_results)
  result_class = DSPy::Evaluate::BatchEvaluationResult
  return result_class.new(results: [], aggregated_metrics: {}) if batch_results.empty?

  merged_results = batch_results.flat_map(&:results)
  example_count = batch_results.sum(&:total_examples)

  result_class.new(
    results: merged_results,
    aggregated_metrics: merge_aggregated_metrics(batch_results, example_count)
  )
end
|
961
|
+
|
962
|
+
sig do
|
963
|
+
params(
|
964
|
+
batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult],
|
965
|
+
total_examples: Integer
|
966
|
+
).returns(T::Hash[Symbol, T.untyped])
|
967
|
+
end
|
968
|
+
# Merges the aggregated metrics of several batch results.
#
# For each metric key, numeric values are averaged, weighted by the number of
# examples in the batch that reported them. If no batch contributes a numeric
# value with a positive weight, the last non-numeric value seen is kept.
# Nil values are ignored and a key with no usable value is omitted.
#
# @param batch_results [Array<#aggregated_metrics, #total_examples>]
# @param total_examples [Integer] combined example count of all batches
# @return [Hash{Symbol => Object}] merged metrics; empty when
#   +total_examples+ is zero
def merge_aggregated_metrics(batch_results, total_examples)
  return {} if total_examples.zero?

  metric_keys = batch_results.flat_map { |res| res.aggregated_metrics.keys }.uniq
  metric_keys.each_with_object({}) do |key, merged|
    weighted_sum = 0.0
    weight = 0.0
    fallback_value = nil

    batch_results.each do |res|
      value = res.aggregated_metrics[key]
      next if value.nil?

      if value.is_a?(Numeric)
        weighted_sum += value.to_f * res.total_examples
        weight += res.total_examples
      else
        fallback_value = value
      end
    end

    if weight.positive?
      merged[key] = weighted_sum / weight
    elsif !fallback_value.nil?
      # Compare against nil explicitly so a legitimate `false` metric survives.
      merged[key] = fallback_value
    end
  end
end
|
996
|
+
|
733
997
|
# Apply candidate configuration to program
|
734
998
|
sig { params(program: T.untyped, candidate: EvaluatedCandidate).returns(T.untyped) }
|
735
999
|
def apply_candidate_configuration(program, candidate)
|
1000
|
+
instructions_map = candidate.metadata[:instructions_map] || {}
|
1001
|
+
demos_map = candidate.metadata[:demos_map] || {}
|
1002
|
+
|
736
1003
|
modified_program = program
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
else
|
750
|
-
# Convert from DSPy::Example
|
751
|
-
DSPy::FewShotExample.new(
|
752
|
-
input: example.input_values,
|
753
|
-
output: example.expected_values,
|
754
|
-
reasoning: extract_reasoning_from_example(example)
|
755
|
-
)
|
1004
|
+
if modified_program.respond_to?(:predictors) && (instructions_map.any? || demos_map.any?)
|
1005
|
+
modified_program = modified_program.clone
|
1006
|
+
modified_program.predictors.each_with_index do |predictor, idx|
|
1007
|
+
if instructions_map.key?(idx)
|
1008
|
+
signature = Utils.get_signature(predictor)
|
1009
|
+
updated_signature = signature.with_instructions(instructions_map[idx])
|
1010
|
+
Utils.set_signature(predictor, updated_signature)
|
1011
|
+
end
|
1012
|
+
|
1013
|
+
if demos_map.key?(idx)
|
1014
|
+
normalized_examples = normalize_few_shot_examples(demos_map[idx])
|
1015
|
+
assign_predictor_examples(predictor, normalized_examples)
|
756
1016
|
end
|
757
1017
|
end
|
758
|
-
|
1018
|
+
end
|
1019
|
+
|
1020
|
+
# Apply instruction if provided (top-level programs still respect with_instruction)
|
1021
|
+
if !candidate.instruction.empty? && modified_program.respond_to?(:with_instruction)
|
1022
|
+
modified_program = modified_program.with_instruction(candidate.instruction)
|
1023
|
+
end
|
1024
|
+
|
1025
|
+
should_apply_global_examples = candidate.few_shot_examples.any? &&
|
1026
|
+
modified_program.respond_to?(:with_examples) &&
|
1027
|
+
(demos_map.empty? || !modified_program.respond_to?(:predictors))
|
1028
|
+
|
1029
|
+
if should_apply_global_examples
|
1030
|
+
normalized_few_shot = normalize_few_shot_examples(candidate.few_shot_examples)
|
1031
|
+
modified_program = modified_program.with_examples(normalized_few_shot)
|
759
1032
|
end
|
760
1033
|
|
761
1034
|
modified_program
|
@@ -824,14 +1097,16 @@ module DSPy
|
|
824
1097
|
|
825
1098
|
history = {
|
826
1099
|
total_trials: optimization_result[:trials_completed],
|
827
|
-
optimization_strategy:
|
1100
|
+
optimization_strategy: optimization_strategy_name,
|
828
1101
|
early_stopped: optimization_result[:trials_completed] < config.num_trials,
|
829
|
-
score_history: optimization_result[:optimization_state][:best_score_history]
|
1102
|
+
score_history: optimization_result[:optimization_state][:best_score_history],
|
1103
|
+
total_eval_calls: optimization_result[:total_eval_calls]
|
830
1104
|
}
|
831
1105
|
|
832
1106
|
metadata = {
|
833
1107
|
optimizer: "MIPROv2",
|
834
1108
|
auto_mode: infer_auto_mode,
|
1109
|
+
optimization_strategy: optimization_strategy_name,
|
835
1110
|
best_instruction: best_candidate&.instruction || "",
|
836
1111
|
best_few_shot_count: best_candidate&.few_shot_examples&.size || 0,
|
837
1112
|
best_candidate_type: best_candidate&.type&.serialize || "unknown",
|
@@ -839,12 +1114,21 @@ module DSPy
|
|
839
1114
|
}
|
840
1115
|
|
841
1116
|
# Create bootstrap statistics from demo_candidates
|
842
|
-
|
1117
|
+
num_predictors = demo_candidates.keys.size
|
1118
|
+
sets_per_predictor = demo_candidates.values.map(&:size)
|
1119
|
+
all_demo_sets = demo_candidates.values.flat_map { |sets| sets }
|
843
1120
|
bootstrap_statistics = {
|
844
|
-
num_predictors:
|
845
|
-
demo_sets_per_predictor:
|
846
|
-
avg_demos_per_set:
|
1121
|
+
num_predictors: num_predictors,
|
1122
|
+
demo_sets_per_predictor: sets_per_predictor.max || 0,
|
1123
|
+
avg_demos_per_set: all_demo_sets.empty? ? 0 : all_demo_sets.map(&:size).sum.to_f / all_demo_sets.size
|
847
1124
|
}
|
1125
|
+
bootstrap_statistics[:per_predictor_demo_counts] = sets_per_predictor if sets_per_predictor.any?
|
1126
|
+
|
1127
|
+
optimization_trace = serialize_optimization_trace(optimization_result[:optimization_state])
|
1128
|
+
optimization_trace[:trial_logs] = serialize_trial_logs(optimization_result[:trial_logs])
|
1129
|
+
optimization_trace[:param_score_dict] = serialize_param_score_dict(optimization_result[:param_score_dict])
|
1130
|
+
optimization_trace[:fully_evaled_param_combos] = serialize_fully_evaled_param_combos(optimization_result[:fully_evaled_param_combos])
|
1131
|
+
optimization_trace[:total_eval_calls] = optimization_result[:total_eval_calls]
|
848
1132
|
|
849
1133
|
MIPROv2Result.new(
|
850
1134
|
optimized_program: best_program,
|
@@ -854,7 +1138,7 @@ module DSPy
|
|
854
1138
|
best_score_value: best_score,
|
855
1139
|
metadata: metadata,
|
856
1140
|
evaluated_candidates: @evaluated_candidates,
|
857
|
-
optimization_trace:
|
1141
|
+
optimization_trace: optimization_trace,
|
858
1142
|
bootstrap_statistics: bootstrap_statistics,
|
859
1143
|
proposal_statistics: proposal_result.analysis,
|
860
1144
|
best_evaluation_result: best_evaluation_result
|
@@ -876,7 +1160,205 @@ module DSPy
|
|
876
1160
|
serialized_trace
|
877
1161
|
end
|
878
1162
|
|
1163
|
+
sig do
|
1164
|
+
params(
|
1165
|
+
trial_number: Integer,
|
1166
|
+
candidate: EvaluatedCandidate,
|
1167
|
+
evaluation_type: Symbol,
|
1168
|
+
batch_size: Integer
|
1169
|
+
).returns(T::Hash[Symbol, T.untyped])
|
1170
|
+
end
|
1171
|
+
# Builds the initial (:in_progress) log entry for a trial.
#
# +trial_number+ is accepted for interface parity with the Python
# implementation but is not stored in the entry.
#
# The entry records the per-predictor instruction map from the candidate's
# metadata when present; otherwise a non-empty candidate instruction is
# recorded under metadata[:predictor_index] (default 0). A non-empty demo map
# is stored under :few_shot_map.
#
# @param trial_number [Integer] unused
# @param candidate [EvaluatedCandidate]
# @param evaluation_type [Symbol]
# @param batch_size [Integer]
# @return [Hash{Symbol => Object}]
def create_trial_log_entry(trial_number:, candidate:, evaluation_type:, batch_size:)
  candidate_metadata = candidate.metadata
  instructions_map = candidate_metadata[:instructions_map] || {}
  demos_map = candidate_metadata[:demos_map] || {}

  log_entry = {
    candidate_id: candidate.config_id,
    candidate_type: candidate.type.serialize,
    instruction_preview: candidate.instruction.to_s[0, 160],
    few_shot_count: candidate.few_shot_examples.size,
    metadata: deep_dup(candidate_metadata),
    evaluation_type: evaluation_type,
    batch_size: batch_size,
    status: :in_progress,
    started_at: Time.now.iso8601
  }

  if instructions_map.any?
    copied_instructions = duplicate_instruction_map(instructions_map)
    log_entry[:instructions] = copied_instructions
    log_entry[:instruction] = copied_instructions[0] if copied_instructions.key?(0)
  elsif candidate.instruction && !candidate.instruction.empty?
    log_entry[:instruction] = candidate.instruction
    log_entry[:instructions] = { (candidate_metadata[:predictor_index] || 0) => candidate.instruction }
  end

  log_entry[:few_shot_map] = duplicate_demo_map(demos_map) if demos_map.any?
  log_entry
end
|
1198
|
+
|
1199
|
+
sig do
|
1200
|
+
params(
|
1201
|
+
trial_logs: T::Hash[Integer, T::Hash[Symbol, T.untyped]],
|
1202
|
+
trial_number: Integer,
|
1203
|
+
score: T.nilable(Float),
|
1204
|
+
evaluation_type: Symbol,
|
1205
|
+
batch_size: Integer,
|
1206
|
+
total_eval_calls: Integer,
|
1207
|
+
error: T.nilable(String)
|
1208
|
+
).void
|
1209
|
+
end
|
1210
|
+
# Marks a trial's log entry as finished and stores it back in +trial_logs+.
#
# Creates the entry if the trial has none yet. The status becomes :error when
# an error message is given and :completed otherwise; :score and :error are
# written only when present.
#
# @param trial_logs [Hash{Integer => Hash}] updated in place
# @param trial_number [Integer]
# @param score [Float, nil]
# @param evaluation_type [Symbol]
# @param batch_size [Integer]
# @param total_eval_calls [Integer]
# @param error [String, nil]
def finalize_trial_log_entry(trial_logs, trial_number, score:, evaluation_type:, batch_size:, total_eval_calls:, error: nil)
  log_entry = trial_logs[trial_number] || {}

  log_entry[:score] = score if score
  log_entry.update(
    evaluation_type: evaluation_type,
    batch_size: batch_size,
    total_eval_calls: total_eval_calls,
    status: error ? :error : :completed
  )
  log_entry[:error] = error if error
  log_entry[:completed_at] = Time.now.iso8601

  trial_logs[trial_number] = log_entry
end
|
1221
|
+
|
1222
|
+
sig do
|
1223
|
+
params(
|
1224
|
+
param_score_dict: T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]],
|
1225
|
+
candidate: EvaluatedCandidate,
|
1226
|
+
score: Float,
|
1227
|
+
evaluation_type: Symbol,
|
1228
|
+
instructions: T.nilable(T::Hash[Integer, String])
|
1229
|
+
).void
|
1230
|
+
end
|
1231
|
+
# Appends a score record for +candidate+ to +param_score_dict+.
#
# When no per-predictor instructions are supplied and the candidate carries a
# non-empty instruction, that instruction is recorded under
# metadata[:predictor_index] (default 0).
#
# @param param_score_dict [Hash{String => Array<Hash>}] records keyed by
#   candidate config id; updated in place
# @param candidate [EvaluatedCandidate]
# @param score [Float]
# @param evaluation_type [Symbol]
# @param instructions [Hash{Integer => String}, nil] instruction snapshot;
#   never modified by this method
def record_param_score(param_score_dict, candidate, score, evaluation_type:, instructions: nil)
  # Work on a private copy: the caller may reuse the same snapshot hash for
  # other bookkeeping, so it must neither be mutated nor stored by reference.
  instructions_hash = (instructions || {}).dup
  if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
    predictor_index = candidate.metadata[:predictor_index] || 0
    instructions_hash[predictor_index] = candidate.instruction
  end

  record = {
    candidate_id: candidate.config_id,
    candidate_type: candidate.type.serialize,
    score: score,
    evaluation_type: evaluation_type,
    timestamp: Time.now.iso8601,
    metadata: deep_dup(candidate.metadata)
  }
  primary_instruction = instructions_hash[0] || candidate.instruction
  record[:instruction] = primary_instruction if primary_instruction && !primary_instruction.empty?
  record[:instructions] = instructions_hash unless instructions_hash.empty?

  # `||=` keeps this working for a plain hash without a default block.
  (param_score_dict[candidate.config_id] ||= []) << record
end
|
1252
|
+
|
1253
|
+
sig do
|
1254
|
+
params(
|
1255
|
+
fully_evaled_param_combos: T::Hash[String, T::Hash[Symbol, T.untyped]],
|
1256
|
+
candidate: EvaluatedCandidate,
|
1257
|
+
score: Float,
|
1258
|
+
instructions: T.nilable(T::Hash[Integer, String])
|
1259
|
+
).void
|
1260
|
+
end
|
1261
|
+
# Keeps the best full-evaluation record for each candidate.
#
# The stored record is replaced only when the candidate has none yet or the
# new score is strictly higher. When no per-predictor instructions are
# supplied and the candidate carries a non-empty instruction, that instruction
# is recorded under metadata[:predictor_index] (default 0).
#
# @param fully_evaled_param_combos [Hash{String => Hash}] best record per
#   candidate config id; updated in place
# @param candidate [EvaluatedCandidate]
# @param score [Float]
# @param instructions [Hash{Integer => String}, nil] instruction snapshot;
#   never modified by this method
def update_fully_evaled_param_combos(fully_evaled_param_combos, candidate, score, instructions: nil)
  config_id = candidate.config_id
  existing = fully_evaled_param_combos[config_id]
  return unless existing.nil? || score > existing[:score]

  # Copy so the caller's snapshot is neither mutated nor stored by reference.
  instructions_hash = (instructions || {}).dup
  if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
    predictor_index = candidate.metadata[:predictor_index] || 0
    instructions_hash[predictor_index] = candidate.instruction
  end

  best = {
    candidate_id: config_id,
    candidate_type: candidate.type.serialize,
    score: score,
    metadata: deep_dup(candidate.metadata),
    updated_at: Time.now.iso8601
  }
  unless instructions_hash.empty?
    best[:instructions] = instructions_hash
    best[:instruction] = instructions_hash[0] || candidate.instruction
  end

  fully_evaled_param_combos[config_id] = best
end
|
1283
|
+
|
1284
|
+
sig { params(trial_logs: T.nilable(T::Hash[Integer, T::Hash[Symbol, T.untyped]])).returns(T::Hash[Integer, T::Hash[Symbol, T.untyped]]) }
|
1285
|
+
# Reduces trial log entries to the keys that are kept in the serialized trace.
#
# Keys outside the list below (for example :few_shot_map) are dropped; the
# remaining keys keep their original order.
#
# @param trial_logs [Hash{Integer => Hash}, nil]
# @return [Hash{Integer => Hash}] filtered copy; +{}+ when +trial_logs+ is nil
def serialize_trial_logs(trial_logs)
  return {} unless trial_logs

  kept_keys = %i[
    candidate_id candidate_type instruction_preview instruction instructions
    few_shot_count metadata evaluation_type batch_size score status error
    started_at completed_at total_eval_calls
  ]

  trial_logs.transform_values do |entry|
    entry.select { |key, _value| kept_keys.include?(key) }
  end
end
|
1312
|
+
|
1313
|
+
sig { params(param_score_dict: T.nilable(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]])).returns(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]]) }
|
1314
|
+
# Reduces every score record to the keys kept in the serialized trace.
#
# @param param_score_dict [Hash{String => Array<Hash>}, nil]
# @return [Hash{String => Array<Hash>}] filtered copy; +{}+ when the input
#   is nil
def serialize_param_score_dict(param_score_dict)
  return {} unless param_score_dict

  kept_keys = %i[candidate_id candidate_type score evaluation_type timestamp metadata instruction instructions]

  param_score_dict.transform_values do |records|
    records.map do |record|
      record.select { |key, _value| kept_keys.include?(key) }
    end
  end
end
|
1327
|
+
|
1328
|
+
sig { params(fully_evaled_param_combos: T.nilable(T::Hash[String, T::Hash[Symbol, T.untyped]])).returns(T::Hash[String, T::Hash[Symbol, T.untyped]]) }
|
1329
|
+
# Reduces each best-score record to the keys kept in the serialized trace.
#
# @param fully_evaled_param_combos [Hash{String => Hash}, nil]
# @return [Hash{String => Hash}] filtered copy; +{}+ when the input is nil
def serialize_fully_evaled_param_combos(fully_evaled_param_combos)
  return {} unless fully_evaled_param_combos

  kept_keys = %i[candidate_id candidate_type score metadata updated_at instruction instructions]

  fully_evaled_param_combos.transform_values do |record|
    record.select { |key, _value| kept_keys.include?(key) }
  end
end
|
1340
|
+
|
1341
|
+
sig { params(value: T.untyped).returns(T.untyped) }
|
1342
|
+
# Recursively copies nested Hash/Array structures.
#
# Hashes and arrays are rebuilt level by level and String leaves are
# duplicated, so in-place edits to the copy (or to the source) do not show up
# on the other side. Hash keys and all other objects are reused as-is.
#
# @param value [Object]
# @return [Object] the copy, or +value+ itself for non-container,
#   non-String objects
def deep_dup(value)
  case value
  when Hash
    value.each_with_object({}) { |(key, nested), copy| copy[key] = deep_dup(nested) }
  when Array
    value.map { |element| deep_dup(element) }
  when String
    # Matches duplicate_instruction_map, which also copies String values.
    value.dup
  else
    value
  end
end
|
1352
|
+
|
879
1353
|
# Helper methods
|
1354
|
+
sig { returns(String) }
|
1355
|
+
# Name of the configured optimization strategy, for events and result metadata.
#
# @return [String] the strategy's +serialize+ value when it responds to
#   +serialize+, otherwise its +to_s+
def optimization_strategy_name
  strategy = config.optimization_strategy
  strategy.respond_to?(:serialize) ? strategy.serialize : strategy.to_s
end
|
1361
|
+
|
880
1362
|
sig { params(program: T.untyped).returns(T.nilable(String)) }
|
881
1363
|
def extract_current_instruction(program)
|
882
1364
|
if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
|
@@ -889,6 +1371,23 @@ module DSPy
|
|
889
1371
|
end
|
890
1372
|
end
|
891
1373
|
|
1374
|
+
sig { params(program: T.untyped).returns(T::Hash[Integer, String]) }
|
1375
|
+
# Collects the current instruction text of a program, keyed by predictor index.
#
# For programs exposing +predictors+, each predictor whose prompt responds to
# +instruction+ and returns a non-nil value contributes an entry at its
# position. Other programs fall back to extract_current_instruction, stored
# under index 0 when present.
#
# @param program [Object]
# @return [Hash{Integer => String}] possibly empty
def extract_program_instructions(program)
  unless program.respond_to?(:predictors)
    fallback_instruction = extract_current_instruction(program)
    return fallback_instruction ? { 0 => fallback_instruction } : {}
  end

  program.predictors.each_with_index.with_object({}) do |(predictor, index), found|
    next unless predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:instruction)

    text = predictor.prompt.instruction
    found[index] = text if text
  end
end
|
1390
|
+
|
892
1391
|
sig { params(program: T.untyped).returns(T.nilable(T.class_of(DSPy::Signature))) }
|
893
1392
|
def extract_signature_class(program)
|
894
1393
|
program.respond_to?(:signature_class) ? program.signature_class : nil
|
@@ -921,4 +1420,4 @@ module DSPy
|
|
921
1420
|
end
|
922
1421
|
end
|
923
1422
|
end
|
924
|
-
end
|
1423
|
+
end
|