dspy 0.28.1 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -3
  3. data/lib/dspy/callbacks.rb +222 -0
  4. data/lib/dspy/chain_of_thought.rb +2 -1
  5. data/lib/dspy/code_act.rb +14 -1
  6. data/lib/dspy/datasets/ade.rb +90 -0
  7. data/lib/dspy/datasets.rb +8 -0
  8. data/lib/dspy/lm.rb +9 -12
  9. data/lib/dspy/mixins/struct_builder.rb +17 -25
  10. data/lib/dspy/module.rb +45 -1
  11. data/lib/dspy/observability/async_span_processor.rb +67 -93
  12. data/lib/dspy/observability.rb +43 -1
  13. data/lib/dspy/predict.rb +17 -0
  14. data/lib/dspy/prompt.rb +90 -20
  15. data/lib/dspy/propose/dataset_summary_generator.rb +210 -0
  16. data/lib/dspy/propose/grounded_proposer.rb +320 -66
  17. data/lib/dspy/re_act.rb +13 -0
  18. data/lib/dspy/reflection_lm.rb +36 -0
  19. data/lib/dspy/teleprompt/bootstrap_strategy.rb +26 -0
  20. data/lib/dspy/teleprompt/gepa.rb +448 -2803
  21. data/lib/dspy/teleprompt/mipro_v2.rb +624 -100
  22. data/lib/dspy/teleprompt/utils.rb +349 -42
  23. data/lib/dspy/version.rb +2 -2
  24. data/lib/dspy.rb +4 -2
  25. data/lib/gepa/api.rb +61 -0
  26. data/lib/gepa/core/engine.rb +226 -0
  27. data/lib/gepa/core/evaluation_batch.rb +26 -0
  28. data/lib/gepa/core/result.rb +92 -0
  29. data/lib/gepa/core/state.rb +231 -0
  30. data/lib/gepa/logging/experiment_tracker.rb +54 -0
  31. data/lib/gepa/logging/logger.rb +57 -0
  32. data/lib/gepa/logging.rb +9 -0
  33. data/lib/gepa/proposer/base.rb +27 -0
  34. data/lib/gepa/proposer/merge_proposer.rb +424 -0
  35. data/lib/gepa/proposer/reflective_mutation/base.rb +48 -0
  36. data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +188 -0
  37. data/lib/gepa/strategies/batch_sampler.rb +91 -0
  38. data/lib/gepa/strategies/candidate_selector.rb +97 -0
  39. data/lib/gepa/strategies/component_selector.rb +57 -0
  40. data/lib/gepa/strategies/instruction_proposal.rb +120 -0
  41. data/lib/gepa/telemetry.rb +122 -0
  42. data/lib/gepa/utils/pareto.rb +119 -0
  43. data/lib/gepa.rb +21 -0
  44. metadata +59 -4
  45. data/lib/dspy/teleprompt/simple_optimizer.rb +0 -497
@@ -1,7 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'digest'
4
+ require 'time'
5
+ require 'concurrent-ruby'
4
6
  require 'sorbet-runtime'
7
+ require 'securerandom'
5
8
  require_relative 'teleprompter'
6
9
  require_relative 'utils'
7
10
  require_relative '../propose/grounded_proposer'
@@ -124,6 +127,7 @@ module DSPy
124
127
  setting :track_diversity, default: true
125
128
  setting :max_errors, default: 3
126
129
  setting :num_threads, default: 1
130
+ setting :minibatch_size, default: nil
127
131
 
128
132
  # Class-level configuration method - sets defaults for new instances
129
133
  def self.configure(&block)
@@ -265,6 +269,7 @@ module DSPy
265
269
  @proposer = DSPy::Propose::GroundedProposer.new(config: DSPy::Propose::GroundedProposer::Config.new)
266
270
  @optimization_trace = []
267
271
  @evaluated_candidates = []
272
+ @trial_history = {}
268
273
  end
269
274
 
270
275
  # Main MIPROv2 optimization method
@@ -282,7 +287,7 @@ module DSPy
282
287
  trainset_size: trainset.size,
283
288
  valset_size: valset&.size || 0,
284
289
  num_trials: config.num_trials,
285
- optimization_strategy: config.optimization_strategy,
290
+ optimization_strategy: optimization_strategy_name,
286
291
  mode: infer_auto_mode
287
292
  }) do
288
293
  # Convert examples to typed format
@@ -294,18 +299,18 @@ module DSPy
294
299
 
295
300
  # Phase 1: Bootstrap few-shot examples
296
301
  emit_event('phase_start', { phase: 1, name: 'bootstrap' })
297
- bootstrap_result = phase_1_bootstrap(program, typed_trainset)
298
- emit_event('phase_complete', {
299
- phase: 1,
300
- success_rate: bootstrap_result.statistics[:success_rate],
301
- candidate_sets: bootstrap_result.candidate_sets.size
302
+ demo_candidates = phase_1_bootstrap(program, typed_trainset)
303
+ emit_event('phase_complete', {
304
+ phase: 1,
305
+ num_predictors: demo_candidates.keys.size,
306
+ demo_sets_per_predictor: demo_candidates[0]&.size || 0
302
307
  })
303
308
 
304
309
  # Phase 2: Generate instruction candidates
305
310
  emit_event('phase_start', { phase: 2, name: 'instruction_proposal' })
306
- proposal_result = phase_2_propose_instructions(program, typed_trainset, bootstrap_result)
307
- emit_event('phase_complete', {
308
- phase: 2,
311
+ proposal_result = phase_2_propose_instructions(program, typed_trainset, demo_candidates)
312
+ emit_event('phase_complete', {
313
+ phase: 2,
309
314
  num_candidates: proposal_result.num_candidates,
310
315
  best_instruction_preview: proposal_result.best_instruction[0, 50]
311
316
  })
@@ -316,7 +321,7 @@ module DSPy
316
321
  program,
317
322
  evaluation_set,
318
323
  proposal_result,
319
- bootstrap_result
324
+ demo_candidates
320
325
  )
321
326
  emit_event('phase_complete', {
322
327
  phase: 3,
@@ -327,10 +332,12 @@ module DSPy
327
332
  # Build final result
328
333
  final_result = build_miprov2_result(
329
334
  optimization_result,
330
- bootstrap_result,
335
+ demo_candidates,
331
336
  proposal_result
332
337
  )
333
338
 
339
+ @trial_history = optimization_result[:trial_logs] || {}
340
+
334
341
  save_results(final_result)
335
342
  final_result
336
343
  end
@@ -339,16 +346,17 @@ module DSPy
339
346
  private
340
347
 
341
348
  # Phase 1: Bootstrap few-shot examples from training data
342
- sig { params(program: T.untyped, trainset: T::Array[DSPy::Example]).returns(Utils::BootstrapResult) }
349
+ # Returns a hash mapping predictor indices to arrays of demo sets
350
+ sig { params(program: T.untyped, trainset: T::Array[DSPy::Example]).returns(T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]) }
343
351
  def phase_1_bootstrap(program, trainset)
344
- bootstrap_config = Utils::BootstrapConfig.new
345
- bootstrap_config.max_bootstrapped_examples = config.max_bootstrapped_examples
346
- bootstrap_config.max_labeled_examples = config.max_labeled_examples
347
- bootstrap_config.num_candidate_sets = config.bootstrap_sets
348
- bootstrap_config.max_errors = config.max_errors
349
- bootstrap_config.num_threads = config.num_threads
350
-
351
- Utils.create_n_fewshot_demo_sets(program, trainset, config: bootstrap_config, metric: @metric)
352
+ Utils.create_n_fewshot_demo_sets(
353
+ program,
354
+ config.bootstrap_sets, # num_candidate_sets
355
+ trainset,
356
+ max_bootstrapped_demos: config.max_bootstrapped_examples,
357
+ max_labeled_demos: config.max_labeled_examples,
358
+ metric: @metric
359
+ )
352
360
  end
353
361
 
354
362
  # Phase 2: Generate instruction candidates using grounded proposer
@@ -356,28 +364,34 @@ module DSPy
356
364
  params(
357
365
  program: T.untyped,
358
366
  trainset: T::Array[DSPy::Example],
359
- bootstrap_result: Utils::BootstrapResult
367
+ demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
360
368
  ).returns(DSPy::Propose::GroundedProposer::ProposalResult)
361
369
  end
362
- def phase_2_propose_instructions(program, trainset, bootstrap_result)
370
+ def phase_2_propose_instructions(program, trainset, demo_candidates)
363
371
  # Get current instruction if available
364
372
  current_instruction = extract_current_instruction(program)
365
-
373
+
366
374
  # Use few-shot examples from bootstrap if available
367
- few_shot_examples = bootstrap_result.successful_examples.take(5)
375
+ # Flatten demo sets from first predictor and take first 5 examples
376
+ few_shot_examples = demo_candidates[0]&.flatten&.take(5) || []
368
377
 
369
- # Get signature class from program
370
- signature_class = extract_signature_class(program)
371
- raise ArgumentError, "Cannot extract signature class from program" unless signature_class
378
+ # Re-initialize proposer with program and trainset for awareness features
379
+ # This enables program_aware and use_dataset_summary flags to work correctly
380
+ proposer_config = DSPy::Propose::GroundedProposer::Config.new
381
+ proposer_config.num_instruction_candidates = config.num_instruction_candidates
372
382
 
373
- # Configure proposer for this optimization run
374
- @proposer.config.num_instruction_candidates = config.num_instruction_candidates
383
+ @proposer = DSPy::Propose::GroundedProposer.new(
384
+ config: proposer_config,
385
+ program: program,
386
+ trainset: trainset
387
+ )
375
388
 
376
- @proposer.propose_instructions(
377
- signature_class,
378
- trainset,
379
- few_shot_examples: few_shot_examples,
380
- current_instruction: current_instruction
389
+ @proposer.propose_instructions_for_program(
390
+ trainset: trainset,
391
+ program: program,
392
+ demo_candidates: demo_candidates,
393
+ trial_logs: @trial_history,
394
+ num_instruction_candidates: config.num_instruction_candidates
381
395
  )
382
396
  end
383
397
 
@@ -387,21 +401,27 @@ module DSPy
387
401
  program: T.untyped,
388
402
  evaluation_set: T::Array[DSPy::Example],
389
403
  proposal_result: DSPy::Propose::GroundedProposer::ProposalResult,
390
- bootstrap_result: Utils::BootstrapResult
404
+ demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
391
405
  ).returns(T::Hash[Symbol, T.untyped])
392
406
  end
393
- def phase_3_optimize(program, evaluation_set, proposal_result, bootstrap_result)
407
+ def phase_3_optimize(program, evaluation_set, proposal_result, demo_candidates)
394
408
  # Generate candidate configurations
395
- candidates = generate_candidate_configurations(proposal_result, bootstrap_result)
409
+ candidates = generate_candidate_configurations(proposal_result, demo_candidates)
396
410
 
397
411
  # Initialize optimization state
398
412
  optimization_state = initialize_optimization_state(candidates)
399
-
413
+
414
+ # Initialize trial tracking structures
415
+ trial_logs = {}
416
+ param_score_dict = Hash.new { |hash, key| hash[key] = [] }
417
+ fully_evaled_param_combos = {}
418
+ total_eval_calls = 0
419
+
400
420
  # Run optimization trials
401
421
  trials_completed = 0
402
422
  best_score = 0.0
403
423
  best_candidate = nil
404
- best_program = nil
424
+ best_program = program
405
425
  best_evaluation_result = nil
406
426
 
407
427
  config.num_trials.times do |trial_idx|
@@ -409,6 +429,14 @@ module DSPy
409
429
 
410
430
  # Select next candidate based on optimization strategy
411
431
  candidate = select_next_candidate(candidates, optimization_state, trial_idx)
432
+ batch_size = evaluation_set.size
433
+
434
+ trial_logs[trials_completed] = create_trial_log_entry(
435
+ trial_number: trials_completed,
436
+ candidate: candidate,
437
+ evaluation_type: :full,
438
+ batch_size: batch_size
439
+ )
412
440
 
413
441
  emit_event('trial_start', {
414
442
  trial_number: trials_completed,
@@ -420,12 +448,30 @@ module DSPy
420
448
  begin
421
449
  # Evaluate candidate
422
450
  score, modified_program, evaluation_result = evaluate_candidate(program, candidate, evaluation_set)
451
+ total_eval_calls += batch_size
452
+
453
+ instructions_snapshot = extract_program_instructions(modified_program)
454
+ trial_logs[trials_completed][:instructions] = instructions_snapshot unless instructions_snapshot.empty?
455
+ trial_logs[trials_completed][:instruction] = instructions_snapshot[0] if instructions_snapshot.key?(0)
423
456
 
424
457
  # Update optimization state
425
458
  update_optimization_state(optimization_state, candidate, score)
459
+ record_param_score(
460
+ param_score_dict,
461
+ candidate,
462
+ score,
463
+ evaluation_type: :full,
464
+ instructions: instructions_snapshot
465
+ )
466
+ update_fully_evaled_param_combos(
467
+ fully_evaled_param_combos,
468
+ candidate,
469
+ score,
470
+ instructions: instructions_snapshot
471
+ )
426
472
 
427
473
  # Track best result
428
- is_best = score > best_score
474
+ is_best = best_candidate.nil? || score > best_score
429
475
  if is_best
430
476
  best_score = score
431
477
  best_candidate = candidate
@@ -433,6 +479,15 @@ module DSPy
433
479
  best_evaluation_result = evaluation_result
434
480
  end
435
481
 
482
+ finalize_trial_log_entry(
483
+ trial_logs,
484
+ trials_completed,
485
+ score: score,
486
+ evaluation_type: :full,
487
+ batch_size: batch_size,
488
+ total_eval_calls: total_eval_calls
489
+ )
490
+
436
491
  emit_event('trial_complete', {
437
492
  trial_number: trials_completed,
438
493
  score: score,
@@ -447,6 +502,16 @@ module DSPy
447
502
  end
448
503
 
449
504
  rescue => error
505
+ finalize_trial_log_entry(
506
+ trial_logs,
507
+ trials_completed,
508
+ score: nil,
509
+ evaluation_type: :full,
510
+ batch_size: batch_size,
511
+ total_eval_calls: total_eval_calls,
512
+ error: error.message
513
+ )
514
+
450
515
  emit_event('trial_error', {
451
516
  trial_number: trials_completed,
452
517
  error: error.message,
@@ -464,73 +529,190 @@ module DSPy
464
529
  best_evaluation_result: best_evaluation_result,
465
530
  trials_completed: trials_completed,
466
531
  optimization_state: optimization_state,
467
- evaluated_candidates: @evaluated_candidates
532
+ evaluated_candidates: @evaluated_candidates,
533
+ trial_logs: trial_logs,
534
+ param_score_dict: param_score_dict,
535
+ fully_evaled_param_combos: fully_evaled_param_combos,
536
+ total_eval_calls: total_eval_calls
468
537
  }
469
538
  end
470
539
 
471
- # Generate candidate configurations from proposals and bootstrap results
540
+ # Generate candidate configurations from proposals and demo candidates
472
541
  sig do
473
542
  params(
474
543
  proposal_result: DSPy::Propose::GroundedProposer::ProposalResult,
475
- bootstrap_result: Utils::BootstrapResult
544
+ demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
476
545
  ).returns(T::Array[EvaluatedCandidate])
477
546
  end
478
- def generate_candidate_configurations(proposal_result, bootstrap_result)
547
+ def generate_candidate_configurations(proposal_result, demo_candidates)
479
548
  candidates = []
480
-
549
+
550
+ predictor_instruction_map = if proposal_result.respond_to?(:predictor_instructions) && proposal_result.predictor_instructions.any?
551
+ proposal_result.predictor_instructions
552
+ else
553
+ { 0 => proposal_result.candidate_instructions }
554
+ end
555
+
556
+ instruction_maps = build_instruction_maps(predictor_instruction_map)
557
+ demo_maps = build_demo_maps(demo_candidates)
558
+
481
559
  # Base configuration (no modifications)
482
560
  candidates << EvaluatedCandidate.new(
483
561
  instruction: "",
484
562
  few_shot_examples: [],
485
563
  type: CandidateType::Baseline,
486
- metadata: {},
564
+ metadata: {
565
+ instructions_map: {},
566
+ demos_map: {}
567
+ },
487
568
  config_id: SecureRandom.hex(6)
488
569
  )
489
-
490
- # Instruction-only candidates
491
- proposal_result.candidate_instructions.each_with_index do |instruction, idx|
570
+
571
+ instruction_maps.each_with_index do |instruction_map, combo_idx|
572
+ primary_instruction = instruction_map[0] || instruction_map.values.first || ""
492
573
  candidates << EvaluatedCandidate.new(
493
- instruction: instruction,
574
+ instruction: primary_instruction,
494
575
  few_shot_examples: [],
495
576
  type: CandidateType::InstructionOnly,
496
- metadata: { proposal_rank: idx },
577
+ metadata: {
578
+ proposal_rank: combo_idx,
579
+ instructions_map: duplicate_instruction_map(instruction_map),
580
+ demos_map: {}
581
+ },
497
582
  config_id: SecureRandom.hex(6)
498
583
  )
499
584
  end
500
-
501
- # Few-shot only candidates
502
- bootstrap_result.candidate_sets.each_with_index do |candidate_set, idx|
585
+
586
+ demo_maps.each_with_index do |demo_map, idx|
587
+ next if demo_map.empty?
588
+
589
+ flattened_examples = demo_map.values.flatten
503
590
  candidates << EvaluatedCandidate.new(
504
591
  instruction: "",
505
- few_shot_examples: candidate_set,
592
+ few_shot_examples: flattened_examples,
506
593
  type: CandidateType::FewShotOnly,
507
- metadata: { bootstrap_rank: idx },
594
+ metadata: {
595
+ bootstrap_rank: idx,
596
+ instructions_map: {},
597
+ demos_map: duplicate_demo_map(demo_map)
598
+ },
508
599
  config_id: SecureRandom.hex(6)
509
600
  )
510
601
  end
511
602
 
512
603
  # Combined candidates (instruction + few-shot)
513
- top_instructions = proposal_result.candidate_instructions.take(3)
514
- top_bootstrap_sets = bootstrap_result.candidate_sets.take(3)
515
-
516
- top_instructions.each_with_index do |instruction, i_idx|
517
- top_bootstrap_sets.each_with_index do |candidate_set, b_idx|
604
+ instruction_maps.each_with_index do |instruction_map, combo_idx|
605
+ primary_instruction = instruction_map[0] || instruction_map.values.first || ""
606
+ demo_maps.first(3).each_with_index do |demo_map, demo_idx|
607
+ next if demo_map.empty?
608
+
609
+ flattened_examples = demo_map.values.flatten
518
610
  candidates << EvaluatedCandidate.new(
519
- instruction: instruction,
520
- few_shot_examples: candidate_set,
611
+ instruction: primary_instruction,
612
+ few_shot_examples: flattened_examples,
521
613
  type: CandidateType::Combined,
522
- metadata: {
523
- instruction_rank: i_idx,
524
- bootstrap_rank: b_idx
614
+ metadata: {
615
+ instruction_rank: combo_idx,
616
+ bootstrap_rank: demo_idx,
617
+ instructions_map: duplicate_instruction_map(instruction_map),
618
+ demos_map: duplicate_demo_map(demo_map)
525
619
  },
526
620
  config_id: SecureRandom.hex(6)
527
621
  )
528
622
  end
529
623
  end
530
-
624
+
531
625
  candidates
532
626
  end
533
627
 
628
+ sig { params(predictor_instruction_map: T::Hash[Integer, T::Array[String]]).returns(T::Array[T::Hash[Integer, String]]) }
629
+ def build_instruction_maps(predictor_instruction_map)
630
+ return [{}] if predictor_instruction_map.nil? || predictor_instruction_map.empty?
631
+
632
+ normalized = predictor_instruction_map.each_with_object({}) do |(index, instructions), memo|
633
+ next if instructions.nil? || instructions.empty?
634
+ memo[index] = instructions.take(3)
635
+ end
636
+
637
+ return [{}] if normalized.empty?
638
+
639
+ cartesian_product(normalized)
640
+ end
641
+
642
+ sig do
643
+ params(demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]).returns(T::Array[T::Hash[Integer, T::Array[DSPy::FewShotExample]]])
644
+ end
645
+ def build_demo_maps(demo_candidates)
646
+ return [{}] if demo_candidates.nil? || demo_candidates.empty?
647
+
648
+ normalized = demo_candidates.each_with_object({}) do |(index, sets), memo|
649
+ next if sets.nil? || sets.empty?
650
+ memo[index] = sets.take(3)
651
+ end
652
+
653
+ return [{}] if normalized.empty?
654
+
655
+ cartesian_product(normalized)
656
+ end
657
+
658
+ sig do
659
+ params(options_hash: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Array[T::Hash[Integer, T.untyped]])
660
+ end
661
+ def cartesian_product(options_hash)
662
+ options_hash.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
663
+ next acc if values.nil? || values.empty?
664
+
665
+ acc.flat_map do |existing|
666
+ values.map do |value|
667
+ existing.merge(index => value)
668
+ end
669
+ end
670
+ end
671
+ end
672
+
673
+ sig { params(instruction_map: T::Hash[Integer, String]).returns(T::Hash[Integer, String]) }
674
+ def duplicate_instruction_map(instruction_map)
675
+ instruction_map.each_with_object({}) do |(index, instruction), memo|
676
+ memo[index] = instruction.is_a?(String) ? instruction.dup : instruction
677
+ end
678
+ end
679
+
680
+ sig do
681
+ params(demo_map: T::Hash[Integer, T::Array[DSPy::FewShotExample]]).returns(T::Hash[Integer, T::Array[DSPy::FewShotExample]])
682
+ end
683
+ def duplicate_demo_map(demo_map)
684
+ demo_map.each_with_object({}) do |(index, demos), memo|
685
+ next if demos.nil?
686
+ memo[index] = demos.map { |demo| demo }
687
+ end
688
+ end
689
+
690
+ sig { params(examples: T::Array[T.untyped]).returns(T::Array[DSPy::FewShotExample]) }
691
+ def normalize_few_shot_examples(examples)
692
+ examples.map do |example|
693
+ if example.is_a?(DSPy::FewShotExample)
694
+ example
695
+ elsif example.is_a?(DSPy::Example)
696
+ DSPy::FewShotExample.new(
697
+ input: example.input_values,
698
+ output: example.expected_values,
699
+ reasoning: extract_reasoning_from_example(example)
700
+ )
701
+ else
702
+ example
703
+ end
704
+ end
705
+ end
706
+
707
+ sig { params(predictor: T.untyped, examples: T::Array[DSPy::FewShotExample]).void }
708
+ def assign_predictor_examples(predictor, examples)
709
+ predictor.demos = examples if predictor.respond_to?(:demos=)
710
+ return unless predictor.respond_to?(:prompt)
711
+
712
+ cloned_examples = examples.map { |ex| ex }
713
+ predictor.prompt.instance_variable_set(:@few_shot_examples, cloned_examples.freeze)
714
+ end
715
+
534
716
  # Initialize optimization state for candidate selection
535
717
  sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Hash[Symbol, T.untyped]) }
536
718
  def initialize_optimization_state(candidates)
@@ -685,10 +867,10 @@ module DSPy
685
867
  features << ((config_hash / 1000) % 1000).to_f / 1000.0 # Feature 2: different part of hash
686
868
  features << ((config_hash / 1_000_000) % 1000).to_f / 1000.0 # Feature 3: high bits
687
869
 
688
- # Add instruction length if available
870
+ # Add instruction length if available (Python-compatible: no cap)
689
871
  instruction = candidate.instruction
690
872
  if instruction && !instruction.empty?
691
- features << [instruction.length.to_f / 100.0, 2.0].min # Instruction length, capped at 200 chars
873
+ features << instruction.length.to_f / 100.0 # Instruction length, uncapped
692
874
  else
693
875
  features << 0.5 # Default value
694
876
  end
@@ -710,7 +892,11 @@ module DSPy
710
892
  modified_program = apply_candidate_configuration(program, candidate)
711
893
 
712
894
  # Evaluate modified program
713
- evaluation_result = evaluate_program(modified_program, evaluation_set)
895
+ evaluation_result = if use_concurrent_evaluation?(evaluation_set)
896
+ evaluate_candidate_concurrently(modified_program, evaluation_set)
897
+ else
898
+ evaluate_program(modified_program, evaluation_set)
899
+ end
714
900
 
715
901
  # Store evaluation details
716
902
  @evaluated_candidates << candidate
@@ -718,26 +904,131 @@ module DSPy
718
904
  [evaluation_result.pass_rate, modified_program, evaluation_result]
719
905
  end
720
906
 
907
+ sig { params(evaluation_set: T::Array[DSPy::Example]).returns(T::Boolean) }
908
+ def use_concurrent_evaluation?(evaluation_set)
909
+ minibatch_size = config.minibatch_size
910
+ return false unless minibatch_size&.positive?
911
+ return false unless config.num_threads && config.num_threads > 1
912
+
913
+ evaluation_set.size > minibatch_size
914
+ end
915
+
916
+ sig do
917
+ params(
918
+ modified_program: T.untyped,
919
+ evaluation_set: T::Array[DSPy::Example]
920
+ ).returns(DSPy::Evaluate::BatchEvaluationResult)
921
+ end
922
+ def evaluate_candidate_concurrently(modified_program, evaluation_set)
923
+ chunk_size = T.must(config.minibatch_size)
924
+ chunks = evaluation_set.each_slice(chunk_size).map(&:dup)
925
+ return evaluate_program(modified_program, evaluation_set) if chunks.size <= 1
926
+
927
+ pool_size = [config.num_threads, chunks.size].min
928
+ pool_size = 1 if pool_size <= 0
929
+ executor = Concurrent::FixedThreadPool.new(pool_size)
930
+
931
+ futures = chunks.map do |chunk|
932
+ Concurrent::Promises.future_on(executor) do
933
+ evaluate_program(modified_program, chunk)
934
+ end
935
+ end
936
+
937
+ results = futures.map(&:value!)
938
+ combine_batch_results(results)
939
+ ensure
940
+ if executor
941
+ executor.shutdown
942
+ executor.wait_for_termination
943
+ end
944
+ end
945
+
946
+ sig do
947
+ params(batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult]).returns(DSPy::Evaluate::BatchEvaluationResult)
948
+ end
949
+ def combine_batch_results(batch_results)
950
+ return DSPy::Evaluate::BatchEvaluationResult.new(results: [], aggregated_metrics: {}) if batch_results.empty?
951
+
952
+ combined_results = batch_results.flat_map(&:results)
953
+ total_examples = batch_results.sum(&:total_examples)
954
+ aggregated_metrics = merge_aggregated_metrics(batch_results, total_examples)
955
+
956
+ DSPy::Evaluate::BatchEvaluationResult.new(
957
+ results: combined_results,
958
+ aggregated_metrics: aggregated_metrics
959
+ )
960
+ end
961
+
962
+ sig do
963
+ params(
964
+ batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult],
965
+ total_examples: Integer
966
+ ).returns(T::Hash[Symbol, T.untyped])
967
+ end
968
+ def merge_aggregated_metrics(batch_results, total_examples)
969
+ return {} if total_examples.zero?
970
+
971
+ keys = batch_results.flat_map { |res| res.aggregated_metrics.keys }.uniq
972
+ keys.each_with_object({}) do |key, memo|
973
+ numeric_weight = 0.0
974
+ numeric_sum = 0.0
975
+ fallback_value = nil
976
+
977
+ batch_results.each do |res|
978
+ value = res.aggregated_metrics[key]
979
+ next if value.nil?
980
+
981
+ if value.is_a?(Numeric)
982
+ numeric_sum += value.to_f * res.total_examples
983
+ numeric_weight += res.total_examples
984
+ else
985
+ fallback_value = value
986
+ end
987
+ end
988
+
989
+ if numeric_weight.positive?
990
+ memo[key] = numeric_sum / numeric_weight
991
+ elsif fallback_value
992
+ memo[key] = fallback_value
993
+ end
994
+ end
995
+ end
996
+
721
997
  # Apply candidate configuration to program
722
998
  sig { params(program: T.untyped, candidate: EvaluatedCandidate).returns(T.untyped) }
723
999
  def apply_candidate_configuration(program, candidate)
1000
+ instructions_map = candidate.metadata[:instructions_map] || {}
1001
+ demos_map = candidate.metadata[:demos_map] || {}
1002
+
724
1003
  modified_program = program
725
-
726
- # Apply instruction if provided
727
- if !candidate.instruction.empty? && program.respond_to?(:with_instruction)
1004
+ if modified_program.respond_to?(:predictors) && (instructions_map.any? || demos_map.any?)
1005
+ modified_program = modified_program.clone
1006
+ modified_program.predictors.each_with_index do |predictor, idx|
1007
+ if instructions_map.key?(idx)
1008
+ signature = Utils.get_signature(predictor)
1009
+ updated_signature = signature.with_instructions(instructions_map[idx])
1010
+ Utils.set_signature(predictor, updated_signature)
1011
+ end
1012
+
1013
+ if demos_map.key?(idx)
1014
+ normalized_examples = normalize_few_shot_examples(demos_map[idx])
1015
+ assign_predictor_examples(predictor, normalized_examples)
1016
+ end
1017
+ end
1018
+ end
1019
+
1020
+ # Apply instruction if provided (top-level programs still respect with_instruction)
1021
+ if !candidate.instruction.empty? && modified_program.respond_to?(:with_instruction)
728
1022
  modified_program = modified_program.with_instruction(candidate.instruction)
729
1023
  end
730
-
731
- # Apply few-shot examples if provided
732
- if candidate.few_shot_examples.any? && program.respond_to?(:with_examples)
733
- few_shot_examples = candidate.few_shot_examples.map do |example|
734
- DSPy::FewShotExample.new(
735
- input: example.input_values,
736
- output: example.expected_values,
737
- reasoning: extract_reasoning_from_example(example)
738
- )
739
- end
740
- modified_program = modified_program.with_examples(few_shot_examples)
1024
+
1025
+ should_apply_global_examples = candidate.few_shot_examples.any? &&
1026
+ modified_program.respond_to?(:with_examples) &&
1027
+ (demos_map.empty? || !modified_program.respond_to?(:predictors))
1028
+
1029
+ if should_apply_global_examples
1030
+ normalized_few_shot = normalize_few_shot_examples(candidate.few_shot_examples)
1031
+ modified_program = modified_program.with_examples(normalized_few_shot)
741
1032
  end
742
1033
 
743
1034
  modified_program
@@ -779,48 +1070,66 @@ module DSPy
779
1070
  state[:no_improvement_count] >= config.early_stopping_patience
780
1071
  end
781
1072
 
782
- # Calculate diversity score for candidate
1073
+ # Calculate diversity score for candidate (Python-compatible: only few-shot count)
783
1074
  sig { params(candidate: EvaluatedCandidate).returns(Float) }
784
1075
  def calculate_diversity_score(candidate)
785
- # Simple diversity metric based on instruction length and few-shot count
786
- instruction_diversity = candidate.instruction.length / 200.0
1076
+ # Python DSPy doesn't use instruction length for diversity, only few-shot count
787
1077
  few_shot_diversity = candidate.few_shot_examples.size / 10.0
788
-
789
- [instruction_diversity + few_shot_diversity, 1.0].min
1078
+
1079
+ [few_shot_diversity, 1.0].min
790
1080
  end
791
1081
 
792
1082
  # Build final MIPROv2 result
793
1083
  sig do
794
1084
  params(
795
1085
  optimization_result: T::Hash[Symbol, T.untyped],
796
- bootstrap_result: Utils::BootstrapResult,
1086
+ demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]],
797
1087
  proposal_result: DSPy::Propose::GroundedProposer::ProposalResult
798
1088
  ).returns(MIPROv2Result)
799
1089
  end
800
- def build_miprov2_result(optimization_result, bootstrap_result, proposal_result)
1090
+ def build_miprov2_result(optimization_result, demo_candidates, proposal_result)
801
1091
  best_candidate = optimization_result[:best_candidate]
802
1092
  best_program = optimization_result[:best_program]
803
1093
  best_score = optimization_result[:best_score]
804
1094
  best_evaluation_result = optimization_result[:best_evaluation_result]
805
-
1095
+
806
1096
  scores = { pass_rate: best_score }
807
-
1097
+
808
1098
  history = {
809
1099
  total_trials: optimization_result[:trials_completed],
810
- optimization_strategy: config.optimization_strategy,
1100
+ optimization_strategy: optimization_strategy_name,
811
1101
  early_stopped: optimization_result[:trials_completed] < config.num_trials,
812
- score_history: optimization_result[:optimization_state][:best_score_history]
1102
+ score_history: optimization_result[:optimization_state][:best_score_history],
1103
+ total_eval_calls: optimization_result[:total_eval_calls]
813
1104
  }
814
-
1105
+
815
1106
  metadata = {
816
1107
  optimizer: "MIPROv2",
817
1108
  auto_mode: infer_auto_mode,
1109
+ optimization_strategy: optimization_strategy_name,
818
1110
  best_instruction: best_candidate&.instruction || "",
819
1111
  best_few_shot_count: best_candidate&.few_shot_examples&.size || 0,
820
1112
  best_candidate_type: best_candidate&.type&.serialize || "unknown",
821
1113
  optimization_timestamp: Time.now.iso8601
822
1114
  }
823
-
1115
+
1116
+ # Create bootstrap statistics from demo_candidates
1117
+ num_predictors = demo_candidates.keys.size
1118
+ sets_per_predictor = demo_candidates.values.map(&:size)
1119
+ all_demo_sets = demo_candidates.values.flat_map { |sets| sets }
1120
+ bootstrap_statistics = {
1121
+ num_predictors: num_predictors,
1122
+ demo_sets_per_predictor: sets_per_predictor.max || 0,
1123
+ avg_demos_per_set: all_demo_sets.empty? ? 0 : all_demo_sets.map(&:size).sum.to_f / all_demo_sets.size
1124
+ }
1125
+ bootstrap_statistics[:per_predictor_demo_counts] = sets_per_predictor if sets_per_predictor.any?
1126
+
1127
+ optimization_trace = serialize_optimization_trace(optimization_result[:optimization_state])
1128
+ optimization_trace[:trial_logs] = serialize_trial_logs(optimization_result[:trial_logs])
1129
+ optimization_trace[:param_score_dict] = serialize_param_score_dict(optimization_result[:param_score_dict])
1130
+ optimization_trace[:fully_evaled_param_combos] = serialize_fully_evaled_param_combos(optimization_result[:fully_evaled_param_combos])
1131
+ optimization_trace[:total_eval_calls] = optimization_result[:total_eval_calls]
1132
+
824
1133
  MIPROv2Result.new(
825
1134
  optimized_program: best_program,
826
1135
  scores: scores,
@@ -829,8 +1138,8 @@ module DSPy
829
1138
  best_score_value: best_score,
830
1139
  metadata: metadata,
831
1140
  evaluated_candidates: @evaluated_candidates,
832
- optimization_trace: serialize_optimization_trace(optimization_result[:optimization_state]),
833
- bootstrap_statistics: bootstrap_result.statistics,
1141
+ optimization_trace: optimization_trace,
1142
+ bootstrap_statistics: bootstrap_statistics,
834
1143
  proposal_statistics: proposal_result.analysis,
835
1144
  best_evaluation_result: best_evaluation_result
836
1145
  )
@@ -851,7 +1160,205 @@ module DSPy
851
1160
  serialized_trace
852
1161
  end
853
1162
 
1163
+ sig do
1164
+ params(
1165
+ trial_number: Integer,
1166
+ candidate: EvaluatedCandidate,
1167
+ evaluation_type: Symbol,
1168
+ batch_size: Integer
1169
+ ).returns(T::Hash[Symbol, T.untyped])
1170
+ end
1171
# Builds the initial log record for a trial, marked :in_progress and stamped
# with the current time.
#
# trial_number    - accepted for interface parity only; it is not written
#                   into the returned hash.
# candidate       - object exposing config_id, type, instruction,
#                   few_shot_examples and metadata.
# evaluation_type - copied into the entry unchanged.
# batch_size      - copied into the entry unchanged.
#
# Returns the new entry hash.
def create_trial_log_entry(trial_number:, candidate:, evaluation_type:, batch_size:)
  # Reference the keyword so it does not read as an unused parameter.
  _ = trial_number

  per_predictor_instructions = candidate.metadata[:instructions_map] || {}
  per_predictor_demos = candidate.metadata[:demos_map] || {}

  log_entry = {}
  log_entry[:candidate_id] = candidate.config_id
  log_entry[:candidate_type] = candidate.type.serialize
  # Only the first 160 characters are kept as a preview.
  log_entry[:instruction_preview] = candidate.instruction.to_s[0, 160]
  log_entry[:few_shot_count] = candidate.few_shot_examples.size
  log_entry[:metadata] = deep_dup(candidate.metadata)
  log_entry[:evaluation_type] = evaluation_type
  log_entry[:batch_size] = batch_size
  log_entry[:status] = :in_progress
  log_entry[:started_at] = Time.now.iso8601

  if per_predictor_instructions.any?
    copied = duplicate_instruction_map(per_predictor_instructions)
    log_entry[:instructions] = copied
    # The instruction for predictor 0, when present, is also stored under :instruction.
    log_entry[:instruction] = copied[0] if copied.key?(0)
  elsif candidate.instruction && !candidate.instruction.empty?
    # No per-predictor map: attribute the single instruction to the
    # candidate's predictor index, defaulting to 0.
    slot = candidate.metadata[:predictor_index] || 0
    log_entry[:instruction] = candidate.instruction
    log_entry[:instructions] = { slot => candidate.instruction }
  end

  log_entry[:few_shot_map] = duplicate_demo_map(per_predictor_demos) if per_predictor_demos.any?
  log_entry
end
1198
+
1199
+ sig do
1200
+ params(
1201
+ trial_logs: T::Hash[Integer, T::Hash[Symbol, T.untyped]],
1202
+ trial_number: Integer,
1203
+ score: T.nilable(Float),
1204
+ evaluation_type: Symbol,
1205
+ batch_size: Integer,
1206
+ total_eval_calls: Integer,
1207
+ error: T.nilable(String)
1208
+ ).void
1209
+ end
1210
# Marks the log entry for +trial_number+ as finished and writes it back into
# +trial_logs+. If no entry exists yet, a fresh hash is used.
#
# score            - stored under :score only when truthy.
# evaluation_type  - overwrites the entry's :evaluation_type.
# batch_size       - overwrites the entry's :batch_size.
# total_eval_calls - stored under :total_eval_calls.
# error            - when given, status becomes :error and the message is
#                    stored under :error; otherwise status is :completed.
def finalize_trial_log_entry(trial_logs, trial_number, score:, evaluation_type:, batch_size:, total_eval_calls:, error: nil)
  log_entry = trial_logs[trial_number]
  log_entry ||= {}

  log_entry[:score] = score if score
  log_entry.merge!(
    evaluation_type: evaluation_type,
    batch_size: batch_size,
    total_eval_calls: total_eval_calls,
    status: (error ? :error : :completed)
  )
  log_entry[:error] = error if error
  log_entry[:completed_at] = Time.now.iso8601

  trial_logs[trial_number] = log_entry
end
1221
+
1222
+ sig do
1223
+ params(
1224
+ param_score_dict: T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]],
1225
+ candidate: EvaluatedCandidate,
1226
+ score: Float,
1227
+ evaluation_type: Symbol,
1228
+ instructions: T.nilable(T::Hash[Integer, String])
1229
+ ).void
1230
+ end
1231
# Appends a score record for +candidate+ to +param_score_dict+, in the list
# keyed by the candidate's config_id.
#
# param_score_dict - hash of config_id => array of records; a missing list is
#                    created on demand.
# candidate        - object exposing config_id, type, instruction and metadata.
# score            - stored under :score.
# evaluation_type: - stored under :evaluation_type.
# instructions:    - optional predictor-index => instruction hash. When nil or
#                    empty, the candidate's single instruction (if non-empty)
#                    is attributed to metadata[:predictor_index], default 0.
#
# The caller's +instructions+ hash is never modified, and the stored record
# holds its own copy of it.
def record_param_score(param_score_dict, candidate, score, evaluation_type:, instructions: nil)
  # Copy first: the fallback below inserts into this hash and the record keeps
  # a reference to it, so using the argument directly would leak both ways.
  instructions_hash = instructions ? instructions.dup : {}
  if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
    predictor_index = candidate.metadata[:predictor_index] || 0
    instructions_hash[predictor_index] = candidate.instruction
  end

  record = {
    candidate_id: candidate.config_id,
    candidate_type: candidate.type.serialize,
    score: score,
    evaluation_type: evaluation_type,
    timestamp: Time.now.iso8601,
    metadata: deep_dup(candidate.metadata)
  }
  # Prefer the instruction for predictor 0; otherwise use the candidate's own.
  primary_instruction = instructions_hash[0] || candidate.instruction
  record[:instruction] = primary_instruction if primary_instruction && !primary_instruction.empty?
  record[:instructions] = instructions_hash unless instructions_hash.empty?

  # Create the list when the hash has no default for this key.
  (param_score_dict[candidate.config_id] ||= []) << record
end
1252
+
1253
+ sig do
1254
+ params(
1255
+ fully_evaled_param_combos: T::Hash[String, T::Hash[Symbol, T.untyped]],
1256
+ candidate: EvaluatedCandidate,
1257
+ score: Float,
1258
+ instructions: T.nilable(T::Hash[Integer, String])
1259
+ ).void
1260
+ end
1261
# Stores the best score seen so far for +candidate+ in
# +fully_evaled_param_combos+, keyed by the candidate's config_id.
# The entry is written only when none exists yet or when +score+ is strictly
# greater than the stored :score.
#
# fully_evaled_param_combos - hash of config_id => record hash.
# candidate                 - object exposing config_id, type, instruction and
#                             metadata.
# score                     - score of the evaluation just completed.
# instructions:             - optional predictor-index => instruction hash.
#                             When nil or empty, the candidate's single
#                             instruction (if non-empty) is attributed to
#                             metadata[:predictor_index], default 0.
#
# The caller's +instructions+ hash is never modified, and the stored record
# holds its own copy of it.
def update_fully_evaled_param_combos(fully_evaled_param_combos, candidate, score, instructions: nil)
  existing = fully_evaled_param_combos[candidate.config_id]
  return unless existing.nil? || score > existing[:score]

  # Copy first: the fallback below inserts into this hash and the record keeps
  # a reference to it, so using the argument directly would leak both ways.
  instructions_hash = instructions ? instructions.dup : {}
  if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
    predictor_index = candidate.metadata[:predictor_index] || 0
    instructions_hash[predictor_index] = candidate.instruction
  end

  record = {
    candidate_id: candidate.config_id,
    candidate_type: candidate.type.serialize,
    score: score,
    metadata: deep_dup(candidate.metadata),
    updated_at: Time.now.iso8601
  }
  unless instructions_hash.empty?
    record[:instructions] = instructions_hash
    # Prefer the instruction for predictor 0; otherwise use the candidate's own.
    record[:instruction] = instructions_hash[0] || candidate.instruction
  end

  fully_evaled_param_combos[candidate.config_id] = record
end
1283
+
1284
+ sig { params(trial_logs: T.nilable(T::Hash[Integer, T::Hash[Symbol, T.untyped]])).returns(T::Hash[Integer, T::Hash[Symbol, T.untyped]]) }
1285
# Returns a copy of +trial_logs+ in which every entry keeps only the
# whitelisted keys below. Returns an empty hash when +trial_logs+ is nil.
# Key order within each entry follows the input entry.
def serialize_trial_logs(trial_logs)
  return {} unless trial_logs

  permitted = %i[
    candidate_id
    candidate_type
    instruction_preview
    instruction
    instructions
    few_shot_count
    metadata
    evaluation_type
    batch_size
    score
    status
    error
    started_at
    completed_at
    total_eval_calls
  ]

  trial_logs.transform_values do |entry|
    entry.select { |key, _value| permitted.include?(key) }
  end
end
1312
+
1313
+ sig { params(param_score_dict: T.nilable(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]])).returns(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]]) }
1314
# Returns a copy of +param_score_dict+ in which every record of every list
# keeps only the whitelisted keys. Returns an empty hash for nil input.
def serialize_param_score_dict(param_score_dict)
  return {} unless param_score_dict

  permitted = %i[candidate_id candidate_type score evaluation_type timestamp metadata instruction instructions]

  param_score_dict.transform_values do |records|
    records.map do |record|
      record.select { |key, _value| permitted.include?(key) }
    end
  end
end
1327
+
1328
+ sig { params(fully_evaled_param_combos: T.nilable(T::Hash[String, T::Hash[Symbol, T.untyped]])).returns(T::Hash[String, T::Hash[Symbol, T.untyped]]) }
1329
# Returns a copy of +fully_evaled_param_combos+ in which each record keeps
# only the whitelisted keys. Returns an empty hash for nil input.
def serialize_fully_evaled_param_combos(fully_evaled_param_combos)
  return {} unless fully_evaled_param_combos

  permitted = %i[candidate_id candidate_type score metadata updated_at instruction instructions]

  fully_evaled_param_combos.transform_values do |record|
    record.select { |key, _value| permitted.include?(key) }
  end
end
1340
+
1341
+ sig { params(value: T.untyped).returns(T.untyped) }
1342
# Recursively rebuilds Hash and Array containers so the result shares no
# container with +value+. Hash keys and all non-container values (strings,
# numbers, arbitrary objects) are reused as-is, not copied.
def deep_dup(value)
  if value.is_a?(Hash)
    value.map { |key, inner| [key, deep_dup(inner)] }.to_h
  elsif value.is_a?(Array)
    value.map { |inner| deep_dup(inner) }
  else
    value
  end
end
1352
+
854
1353
  # Helper methods
1354
+ sig { returns(String) }
1355
# Returns the configured optimization strategy as a String: its #serialize
# value when the strategy responds to it, otherwise its #to_s.
def optimization_strategy_name
  chosen = config.optimization_strategy
  chosen.respond_to?(:serialize) ? chosen.serialize : chosen.to_s
end
1361
+
855
1362
  sig { params(program: T.untyped).returns(T.nilable(String)) }
856
1363
  def extract_current_instruction(program)
857
1364
  if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
@@ -864,6 +1371,23 @@ module DSPy
864
1371
  end
865
1372
  end
866
1373
 
1374
+ sig { params(program: T.untyped).returns(T::Hash[Integer, String]) }
1375
# Collects instruction text per predictor as a hash of predictor index =>
# instruction.
#
# When +program+ responds to #predictors, each predictor whose prompt exposes
# a non-nil #instruction contributes an entry at its position; the others are
# skipped. Otherwise the result of extract_current_instruction(program), if
# any, is stored at index 0.
def extract_program_instructions(program)
  unless program.respond_to?(:predictors)
    lone_instruction = extract_current_instruction(program)
    return lone_instruction ? { 0 => lone_instruction } : {}
  end

  program.predictors.each_with_index.each_with_object({}) do |(predictor, position), collected|
    next unless predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:instruction)

    text = predictor.prompt.instruction
    collected[position] = text if text
  end
end
1390
+
867
1391
  sig { params(program: T.untyped).returns(T.nilable(T.class_of(DSPy::Signature))) }
868
1392
  def extract_signature_class(program)
869
1393
  program.respond_to?(:signature_class) ? program.signature_class : nil
@@ -896,4 +1420,4 @@ module DSPy
896
1420
  end
897
1421
  end
898
1422
  end
899
- end
1423
+ end