dspy 0.28.2 → 0.29.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -4
- data/lib/dspy/code_act.rb +14 -1
- data/lib/dspy/datasets/ade.rb +90 -0
- data/lib/dspy/datasets.rb +8 -0
- data/lib/dspy/lm.rb +4 -8
- data/lib/dspy/mixins/struct_builder.rb +17 -25
- data/lib/dspy/module.rb +12 -1
- data/lib/dspy/observability/async_span_processor.rb +67 -93
- data/lib/dspy/observability.rb +43 -1
- data/lib/dspy/predict.rb +10 -0
- data/lib/dspy/propose/dataset_summary_generator.rb +36 -3
- data/lib/dspy/propose/grounded_proposer.rb +118 -11
- data/lib/dspy/re_act.rb +13 -0
- data/lib/dspy/reflection_lm.rb +36 -0
- data/lib/dspy/teleprompt/gepa.rb +448 -2803
- data/lib/dspy/teleprompt/mipro_v2.rb +839 -91
- data/lib/dspy/teleprompt/utils.rb +8 -3
- data/lib/dspy/version.rb +2 -2
- data/lib/dspy.rb +3 -2
- data/lib/gepa/api.rb +61 -0
- data/lib/gepa/core/engine.rb +226 -0
- data/lib/gepa/core/evaluation_batch.rb +26 -0
- data/lib/gepa/core/result.rb +92 -0
- data/lib/gepa/core/state.rb +231 -0
- data/lib/gepa/logging/experiment_tracker.rb +54 -0
- data/lib/gepa/logging/logger.rb +57 -0
- data/lib/gepa/logging.rb +9 -0
- data/lib/gepa/proposer/base.rb +27 -0
- data/lib/gepa/proposer/merge_proposer.rb +424 -0
- data/lib/gepa/proposer/reflective_mutation/base.rb +48 -0
- data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +188 -0
- data/lib/gepa/strategies/batch_sampler.rb +91 -0
- data/lib/gepa/strategies/candidate_selector.rb +97 -0
- data/lib/gepa/strategies/component_selector.rb +57 -0
- data/lib/gepa/strategies/instruction_proposal.rb +120 -0
- data/lib/gepa/telemetry.rb +122 -0
- data/lib/gepa/utils/pareto.rb +119 -0
- data/lib/gepa.rb +21 -0
- metadata +38 -3
- data/lib/dspy/teleprompt/simple_optimizer.rb +0 -503
data/lib/dspy/teleprompt/mipro_v2.rb

@@ -1,7 +1,12 @@
 # frozen_string_literal: true

 require 'digest'
+require 'time'
+require 'json'
+require 'concurrent-ruby'
 require 'sorbet-runtime'
+require 'securerandom'
+require 'set'
 require_relative 'teleprompter'
 require_relative 'utils'
 require_relative '../propose/grounded_proposer'

@@ -27,6 +32,58 @@ module DSPy
         Bayesian = new("bayesian")
       end
     end
+
+    class AutoPreset < T::Enum
+      enums do
+        None = new("none")
+        Light = new("light")
+        Medium = new("medium")
+        Heavy = new("heavy")
+      end
+    end
+
+    AUTO_PRESET_SETTINGS = {
+      AutoPreset::None => {},
+      AutoPreset::Light => {
+        candidate_budget: 6,
+        instruction_candidates: 3,
+        instruction_candidates_when_fewshot: 3,
+        bootstrap_sets: 3,
+        max_bootstrapped_examples: 2,
+        max_labeled_examples: 8,
+        optimization_strategy: OptimizationStrategy::Greedy,
+        early_stopping_patience: 2,
+        valset_target_size: 100,
+        minibatch_size: nil
+      },
+      AutoPreset::Medium => {
+        candidate_budget: 12,
+        instruction_candidates: 5,
+        instruction_candidates_when_fewshot: 5,
+        bootstrap_sets: 5,
+        max_bootstrapped_examples: 4,
+        max_labeled_examples: 16,
+        optimization_strategy: OptimizationStrategy::Adaptive,
+        early_stopping_patience: 3,
+        valset_target_size: 300,
+        minibatch_size: nil
+      },
+      AutoPreset::Heavy => {
+        candidate_budget: 18,
+        instruction_candidates: 8,
+        instruction_candidates_when_fewshot: 8,
+        bootstrap_sets: 8,
+        max_bootstrapped_examples: 6,
+        max_labeled_examples: 24,
+        optimization_strategy: OptimizationStrategy::Bayesian,
+        early_stopping_patience: 5,
+        valset_target_size: 1000,
+        minibatch_size: nil
+      }
+    }.freeze
+
+    DEFAULT_AUTO_SEED = 42
+
     # MIPROv2: Multi-prompt Instruction Proposal with Retrieval Optimization
     # State-of-the-art prompt optimization combining bootstrap sampling,
     # instruction generation, and Bayesian optimization
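The `AUTO_PRESET_SETTINGS` table added above bundles the optimizer settings for each `AutoPreset`. A minimal lookup sketch, assuming the constants are reachable under `DSPy::Teleprompt::MIPROv2` (the namespace is inferred from the file path, not shown in this hunk):

```ruby
# Assumed namespace; the constant and key names come from the diff above.
mipro = DSPy::Teleprompt::MIPROv2
settings = mipro::AUTO_PRESET_SETTINGS.fetch(mipro::AutoPreset::Medium)
settings[:candidate_budget]    # => 12
settings[:valset_target_size]  # => 300
```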
@@ -47,13 +104,7 @@ module DSPy
        def self.light(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
-           config
-           config.num_instruction_candidates = 3
-           config.max_bootstrapped_examples = 2
-           config.max_labeled_examples = 8
-           config.bootstrap_sets = 3
-           config.optimization_strategy = :greedy
-           config.early_stopping_patience = 2
+           MIPROv2.apply_auto_defaults(config, AutoPreset::Light)
          end
          optimizer
        end

@@ -67,13 +118,7 @@ module DSPy
        def self.medium(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
-           config
-           config.num_instruction_candidates = 5
-           config.max_bootstrapped_examples = 4
-           config.max_labeled_examples = 16
-           config.bootstrap_sets = 5
-           config.optimization_strategy = :adaptive
-           config.early_stopping_patience = 3
+           MIPROv2.apply_auto_defaults(config, AutoPreset::Medium)
          end
          optimizer
        end

@@ -87,19 +132,33 @@ module DSPy
        def self.heavy(metric: nil, **kwargs)
          optimizer = MIPROv2.new(metric: metric, **kwargs)
          optimizer.configure do |config|
-           config
-           config.num_instruction_candidates = 8
-           config.max_bootstrapped_examples = 6
-           config.max_labeled_examples = 24
-           config.bootstrap_sets = 8
-           config.optimization_strategy = :bayesian
-           config.early_stopping_patience = 5
+           MIPROv2.apply_auto_defaults(config, AutoPreset::Heavy)
          end
          optimizer
        end
      end

      # Dry-configurable settings for MIPROv2
+     setting :auto_preset, default: AutoPreset::None, constructor: ->(value) {
+       case value
+       when AutoPreset
+         value
+       when String, Symbol
+         begin
+           AutoPreset.deserialize(value.to_s.downcase)
+         rescue ArgumentError
+           raise ArgumentError, "Invalid auto preset: #{value}. Must be one of :none, :light, :medium, :heavy"
+         end
+       when nil
+         AutoPreset::None
+       else
+         raise ArgumentError, "Invalid auto preset: #{value.inspect}"
+       end
+     }
+     setting :auto_seed, default: DEFAULT_AUTO_SEED, constructor: ->(value) {
+       value.nil? ? DEFAULT_AUTO_SEED : Integer(value)
+     }
+     setting :valset_target_size, default: nil
      setting :num_trials, default: 12
      setting :num_instruction_candidates, default: 5
      setting :bootstrap_sets, default: 5

@@ -124,6 +183,7 @@ module DSPy
      setting :track_diversity, default: true
      setting :max_errors, default: 3
      setting :num_threads, default: 1
+     setting :minibatch_size, default: nil

      # Class-level configuration method - sets defaults for new instances
      def self.configure(&block)

@@ -138,6 +198,26 @@ module DSPy
        @default_config_block
      end

+     class << self
+       extend T::Sig
+
+       sig { params(config: T.untyped, preset: AutoPreset).void }
+       def apply_auto_defaults(config, preset)
+         settings = AUTO_PRESET_SETTINGS.fetch(preset) { {} }
+
+         config.auto_preset = preset
+         config.num_trials = settings[:candidate_budget] if settings[:candidate_budget]
+         config.num_instruction_candidates = settings[:instruction_candidates] if settings[:instruction_candidates]
+         config.bootstrap_sets = settings[:bootstrap_sets] if settings[:bootstrap_sets]
+         config.max_bootstrapped_examples = settings[:max_bootstrapped_examples] if settings.key?(:max_bootstrapped_examples)
+         config.max_labeled_examples = settings[:max_labeled_examples] if settings.key?(:max_labeled_examples)
+         config.optimization_strategy = settings[:optimization_strategy] if settings[:optimization_strategy]
+         config.early_stopping_patience = settings[:early_stopping_patience] if settings[:early_stopping_patience]
+         config.minibatch_size = settings[:minibatch_size] if settings.key?(:minibatch_size)
+         config.valset_target_size = settings[:valset_target_size] if settings[:valset_target_size]
+       end
+     end
+

      # Simple data structure for evaluated candidate configurations (immutable)
      EvaluatedCandidate = Data.define(
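Taken together, the new `auto_preset` setting and `apply_auto_defaults` replace the hard-coded values that the `light`/`medium`/`heavy` factory methods used to set. A hedged usage sketch, assuming the same `DSPy::Teleprompt::MIPROv2` namespace and a user-supplied `my_metric` (both placeholders, not confirmed by the diff):

```ruby
# Either pick a preset via the new dry-configurable setting...
optimizer = DSPy::Teleprompt::MIPROv2.new(metric: my_metric)
optimizer.configure do |config|
  config.auto_preset = :medium  # strings/symbols are deserialized to AutoPreset::Medium
  config.auto_seed = 7          # overrides DEFAULT_AUTO_SEED (42)
end

# ...or use the factory shortcut, which now delegates to apply_auto_defaults.
optimizer = DSPy::Teleprompt::MIPROv2.medium(metric: my_metric)
```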
@@ -265,6 +345,7 @@ module DSPy
        @proposer = DSPy::Propose::GroundedProposer.new(config: DSPy::Propose::GroundedProposer::Config.new)
        @optimization_trace = []
        @evaluated_candidates = []
+       @trial_history = {}
      end

      # Main MIPROv2 optimization method

@@ -282,13 +363,20 @@ module DSPy
          trainset_size: trainset.size,
          valset_size: valset&.size || 0,
          num_trials: config.num_trials,
-         optimization_strategy:
+         optimization_strategy: optimization_strategy_name,
          mode: infer_auto_mode
        }) do
          # Convert examples to typed format
          typed_trainset = ensure_typed_examples(trainset)
          typed_valset = valset ? ensure_typed_examples(valset) : nil

+         if auto_preset_active?
+           typed_trainset, typed_valset = prepare_datasets_for_auto(typed_trainset, typed_valset)
+           typed_valset = apply_auto_preset!(program, typed_valset)
+         else
+           typed_valset = limit_validation_set(typed_valset, config.valset_target_size)
+         end
+
          # Use validation set if available, otherwise use part of training set
          evaluation_set = typed_valset || typed_trainset.take([typed_trainset.size / 3, 10].max)

@@ -331,6 +419,8 @@ module DSPy
          proposal_result
        )

+       @trial_history = optimization_result[:trial_logs] || {}
+
        save_results(final_result)
        final_result
      end

@@ -338,6 +428,105 @@ module DSPy

      private

+     sig { returns(T::Boolean) }
+     def auto_preset_active?
+       config.auto_preset != AutoPreset::None
+     end
+
+     sig { params(trainset: T::Array[DSPy::Example], valset: T.nilable(T::Array[DSPy::Example])).returns([T::Array[DSPy::Example], T::Array[DSPy::Example]]) }
+     def prepare_datasets_for_auto(trainset, valset)
+       settings = auto_settings_for(config.auto_preset)
+       target_size = settings[:valset_target_size]
+       config.valset_target_size = target_size
+
+       if valset && valset.any?
+         [trainset, limit_validation_set(valset, target_size)]
+       else
+         raise ArgumentError, "Training set must contain at least 2 examples when auto presets are enabled" if trainset.size < 2
+
+         shuffled = trainset.shuffle(random: Random.new(config.auto_seed))
+         default_val_size = [
+           [(trainset.size * 0.8).ceil, 1].max,
+           trainset.size - 1
+         ].min
+
+         desired_val_size = target_size ? [default_val_size, target_size].min : default_val_size
+         desired_val_size = [[desired_val_size, 1].max, trainset.size - 1].min
+
+         validation_examples = shuffled.take(desired_val_size)
+         training_examples = shuffled.drop(desired_val_size)
+
+         [training_examples, limit_validation_set(validation_examples, target_size)]
+       end
+     end
+
+     sig { params(program: T.untyped, valset: T::Array[DSPy::Example]).returns(T::Array[DSPy::Example]) }
+     def apply_auto_preset!(program, valset)
+       settings = auto_settings_for(config.auto_preset)
+       zeroshot = zero_shot_for_settings?(settings)
+       candidate_budget = settings[:candidate_budget]
+
+       if candidate_budget && candidate_budget.positive?
+         config.num_trials = compute_trials_from_candidate_budget(program, candidate_budget, zeroshot)
+         instruction_candidates = if zeroshot
+           candidate_budget
+         else
+           settings[:instruction_candidates_when_fewshot] || (candidate_budget / 2.0).ceil
+         end
+         config.num_instruction_candidates = [instruction_candidates, 1].max
+       end
+
+       config.bootstrap_sets = settings[:bootstrap_sets] if settings[:bootstrap_sets]
+       config.max_bootstrapped_examples = settings[:max_bootstrapped_examples] if settings.key?(:max_bootstrapped_examples)
+       config.max_labeled_examples = settings[:max_labeled_examples] if settings.key?(:max_labeled_examples)
+       config.optimization_strategy = settings[:optimization_strategy] if settings[:optimization_strategy]
+       config.early_stopping_patience = settings[:early_stopping_patience] if settings[:early_stopping_patience]
+       config.minibatch_size = settings[:minibatch_size] if settings.key?(:minibatch_size)
+
+       config.valset_target_size = settings[:valset_target_size]
+       limit_validation_set(valset, config.valset_target_size)
+     end
+
+     sig { params(valset: T.nilable(T::Array[DSPy::Example]), target_size: T.nilable(Integer)).returns(T.nilable(T::Array[DSPy::Example])) }
+     def limit_validation_set(valset, target_size)
+       return valset unless valset && target_size && target_size.positive?
+       return valset if valset.size <= target_size
+
+       valset.shuffle(random: Random.new(config.auto_seed)).take(target_size)
+     end
+
+     sig { params(program: T.untyped, num_candidates: Integer, zeroshot: T::Boolean).returns(Integer) }
+     def compute_trials_from_candidate_budget(program, num_candidates, zeroshot)
+       predictor_count =
+         if program.respond_to?(:predictors)
+           Array(program.predictors).size
+         else
+           1
+         end
+
+       predictor_count = 1 if predictor_count.zero?
+       variable_count = zeroshot ? predictor_count : predictor_count * 2
+       log_term = Math.log2([num_candidates, 2].max)
+
+       [
+         (2 * variable_count * log_term).ceil,
+         (1.5 * num_candidates).ceil
+       ].max
+     end
+
+     sig { params(settings: T::Hash[Symbol, T.untyped]).returns(T::Boolean) }
+     def zero_shot_for_settings?(settings)
+       settings.fetch(:max_bootstrapped_examples, 0).to_i.zero? &&
+         settings.fetch(:max_labeled_examples, 0).to_i.zero?
+     end
+
+     sig { params(preset: AutoPreset).returns(T::Hash[Symbol, T.untyped]) }
+     def auto_settings_for(preset)
+       AUTO_PRESET_SETTINGS.fetch(preset) do
+         raise ArgumentError, "Unknown auto preset: #{preset.inspect}"
+       end
+     end
+
      # Phase 1: Bootstrap few-shot examples from training data
      # Returns a hash mapping predictor indices to arrays of demo sets
      sig { params(program: T.untyped, trainset: T::Array[DSPy::Example]).returns(T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]) }
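`compute_trials_from_candidate_budget` converts a preset's `candidate_budget` into a trial count. A worked example under the Medium preset (budget 12) for a single-predictor, few-shot program; the names mirror the diff, the numbers are just arithmetic:

```ruby
# variable_count doubles when few-shot demos are in play (not zero-shot).
predictor_count = 1
variable_count  = predictor_count * 2     # => 2
log_term        = Math.log2([12, 2].max)  # => ~3.585

[(2 * variable_count * log_term).ceil,    # => 15
 (1.5 * 12).ceil].max                     # => 18, so config.num_trials becomes 18
```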
@@ -368,10 +557,6 @@ module DSPy
        # Flatten demo sets from first predictor and take first 5 examples
        few_shot_examples = demo_candidates[0]&.flatten&.take(5) || []

-       # Get signature class from program
-       signature_class = extract_signature_class(program)
-       raise ArgumentError, "Cannot extract signature class from program" unless signature_class
-
        # Re-initialize proposer with program and trainset for awareness features
        # This enables program_aware and use_dataset_summary flags to work correctly
        proposer_config = DSPy::Propose::GroundedProposer::Config.new

@@ -383,11 +568,12 @@ module DSPy
          trainset: trainset
        )

-       @proposer.
-
-
-
-
+       @proposer.propose_instructions_for_program(
+         trainset: trainset,
+         program: program,
+         demo_candidates: demo_candidates,
+         trial_logs: @trial_history,
+         num_instruction_candidates: config.num_instruction_candidates
        )
      end

@@ -406,12 +592,18 @@ module DSPy

        # Initialize optimization state
        optimization_state = initialize_optimization_state(candidates)
-
+
+       # Initialize trial tracking structures
+       trial_logs = {}
+       param_score_dict = Hash.new { |hash, key| hash[key] = [] }
+       fully_evaled_param_combos = {}
+       total_eval_calls = 0
+
        # Run optimization trials
        trials_completed = 0
        best_score = 0.0
        best_candidate = nil
-       best_program =
+       best_program = program
        best_evaluation_result = nil

        config.num_trials.times do |trial_idx|

@@ -419,6 +611,14 @@ module DSPy

          # Select next candidate based on optimization strategy
          candidate = select_next_candidate(candidates, optimization_state, trial_idx)
+         batch_size = evaluation_set.size
+
+         trial_logs[trials_completed] = create_trial_log_entry(
+           trial_number: trials_completed,
+           candidate: candidate,
+           evaluation_type: :full,
+           batch_size: batch_size
+         )

          emit_event('trial_start', {
            trial_number: trials_completed,

@@ -430,12 +630,30 @@ module DSPy
          begin
            # Evaluate candidate
            score, modified_program, evaluation_result = evaluate_candidate(program, candidate, evaluation_set)
+           total_eval_calls += batch_size
+
+           instructions_snapshot = extract_program_instructions(modified_program)
+           trial_logs[trials_completed][:instructions] = instructions_snapshot unless instructions_snapshot.empty?
+           trial_logs[trials_completed][:instruction] = instructions_snapshot[0] if instructions_snapshot.key?(0)

            # Update optimization state
            update_optimization_state(optimization_state, candidate, score)
+           record_param_score(
+             param_score_dict,
+             candidate,
+             score,
+             evaluation_type: :full,
+             instructions: instructions_snapshot
+           )
+           update_fully_evaled_param_combos(
+             fully_evaled_param_combos,
+             candidate,
+             score,
+             instructions: instructions_snapshot
+           )

            # Track best result
-           is_best = score > best_score
+           is_best = best_candidate.nil? || score > best_score
            if is_best
              best_score = score
              best_candidate = candidate

@@ -443,6 +661,15 @@ module DSPy
              best_evaluation_result = evaluation_result
            end

+           finalize_trial_log_entry(
+             trial_logs,
+             trials_completed,
+             score: score,
+             evaluation_type: :full,
+             batch_size: batch_size,
+             total_eval_calls: total_eval_calls
+           )
+
            emit_event('trial_complete', {
              trial_number: trials_completed,
              score: score,

@@ -457,6 +684,16 @@ module DSPy
            end

          rescue => error
+           finalize_trial_log_entry(
+             trial_logs,
+             trials_completed,
+             score: nil,
+             evaluation_type: :full,
+             batch_size: batch_size,
+             total_eval_calls: total_eval_calls,
+             error: error.message
+           )
+
            emit_event('trial_error', {
              trial_number: trials_completed,
              error: error.message,

@@ -474,7 +711,11 @@ module DSPy
          best_evaluation_result: best_evaluation_result,
          trials_completed: trials_completed,
          optimization_state: optimization_state,
-         evaluated_candidates: @evaluated_candidates
+         evaluated_candidates: @evaluated_candidates,
+         trial_logs: trial_logs,
+         param_score_dict: param_score_dict,
+         fully_evaled_param_combos: fully_evaled_param_combos,
+         total_eval_calls: total_eval_calls
        }
      end

@@ -487,62 +728,237 @@ module DSPy
      end
      def generate_candidate_configurations(proposal_result, demo_candidates)
        candidates = []
+       seen_signatures = Set.new
+
+       add_candidate = lambda do |instruction:, few_shot_examples:, type:, metadata:, config_id:|
+         signature = candidate_signature(type, instruction, metadata, few_shot_examples)
+         next if seen_signatures.include?(signature)
+
+         seen_signatures << signature
+         candidates << EvaluatedCandidate.new(
+           instruction: instruction,
+           few_shot_examples: few_shot_examples,
+           type: type,
+           metadata: metadata,
+           config_id: config_id
+         )
+       end
+
+       predictor_instruction_map = if proposal_result.respond_to?(:predictor_instructions) && proposal_result.predictor_instructions.any?
+         proposal_result.predictor_instructions
+       else
+         { 0 => proposal_result.candidate_instructions }
+       end
+
+       instruction_maps = build_instruction_maps(predictor_instruction_map)
+       demo_maps = build_demo_maps(demo_candidates)

        # Base configuration (no modifications)
-
+       add_candidate.call(
          instruction: "",
          few_shot_examples: [],
          type: CandidateType::Baseline,
-        metadata: {
+         metadata: {
+           instructions_map: {},
+           demos_map: {}
+         },
          config_id: SecureRandom.hex(6)
        )

-
-
-
-       instruction:
+       instruction_maps.each_with_index do |instruction_map, combo_idx|
+         primary_instruction = instruction_map[0] || instruction_map.values.first || ""
+         add_candidate.call(
+           instruction: primary_instruction,
            few_shot_examples: [],
            type: CandidateType::InstructionOnly,
-          metadata: {
+           metadata: {
+             proposal_rank: combo_idx,
+             instructions_map: duplicate_instruction_map(instruction_map),
+             demos_map: {}
+           },
            config_id: SecureRandom.hex(6)
          )
        end

-
-
-
-
-
+       demo_maps.each_with_index do |demo_map, idx|
+         next if demo_map.empty?
+
+         flattened_examples = demo_map.values.flatten
+         add_candidate.call(
            instruction: "",
-          few_shot_examples:
+           few_shot_examples: flattened_examples,
            type: CandidateType::FewShotOnly,
-          metadata: {
+           metadata: {
+             bootstrap_rank: idx,
+             instructions_map: {},
+             demos_map: duplicate_demo_map(demo_map)
+           },
            config_id: SecureRandom.hex(6)
          )
        end

        # Combined candidates (instruction + few-shot)
-
-
-
-
-
-
-
-
+       instruction_maps.each_with_index do |instruction_map, combo_idx|
+         primary_instruction = instruction_map[0] || instruction_map.values.first || ""
+         demo_maps.first(3).each_with_index do |demo_map, demo_idx|
+           next if demo_map.empty?
+
+           flattened_examples = demo_map.values.flatten
+           add_candidate.call(
+             instruction: primary_instruction,
+             few_shot_examples: flattened_examples,
              type: CandidateType::Combined,
-            metadata: {
-            instruction_rank:
-            bootstrap_rank:
+             metadata: {
+               instruction_rank: combo_idx,
+               bootstrap_rank: demo_idx,
+               instructions_map: duplicate_instruction_map(instruction_map),
+               demos_map: duplicate_demo_map(demo_map)
              },
              config_id: SecureRandom.hex(6)
            )
          end
        end
-
+
        candidates
      end

+     sig { params(predictor_instruction_map: T::Hash[Integer, T::Array[String]]).returns(T::Array[T::Hash[Integer, String]]) }
+     def build_instruction_maps(predictor_instruction_map)
+       return [{}] if predictor_instruction_map.nil? || predictor_instruction_map.empty?
+
+       normalized = predictor_instruction_map.each_with_object({}) do |(index, instructions), memo|
+         next if instructions.nil? || instructions.empty?
+         memo[index] = instructions.take(3)
+       end
+
+       return [{}] if normalized.empty?
+
+       cartesian_product(normalized)
+     end
+
+     sig do
+       params(demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]).returns(T::Array[T::Hash[Integer, T::Array[DSPy::FewShotExample]]])
+     end
+     def build_demo_maps(demo_candidates)
+       return [{}] if demo_candidates.nil? || demo_candidates.empty?
+
+       normalized = demo_candidates.each_with_object({}) do |(index, sets), memo|
+         next if sets.nil? || sets.empty?
+         memo[index] = sets.take(3)
+       end
+
+       return [{}] if normalized.empty?
+
+       cartesian_product(normalized)
+     end
+
+     sig do
+       params(options_hash: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Array[T::Hash[Integer, T.untyped]])
+     end
+     def cartesian_product(options_hash)
+       options_hash.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
+         next acc if values.nil? || values.empty?
+
+         acc.flat_map do |existing|
+           values.map do |value|
+             existing.merge(index => value)
+           end
+         end
+       end
+     end
+
+     sig { params(instruction_map: T::Hash[Integer, String]).returns(T::Hash[Integer, String]) }
+     def duplicate_instruction_map(instruction_map)
+       instruction_map.each_with_object({}) do |(index, instruction), memo|
+         memo[index] = instruction.is_a?(String) ? instruction.dup : instruction
+       end
+     end
+
+     sig do
+       params(demo_map: T::Hash[Integer, T::Array[DSPy::FewShotExample]]).returns(T::Hash[Integer, T::Array[DSPy::FewShotExample]])
+     end
+     def duplicate_demo_map(demo_map)
+       demo_map.each_with_object({}) do |(index, demos), memo|
+         next if demos.nil?
+         memo[index] = demos.map { |demo| demo }
+       end
+     end
+
+     sig do
+       params(
+         type: CandidateType,
+         instruction: String,
+         metadata: T::Hash[Symbol, T.untyped],
+         few_shot_examples: T::Array[T.untyped]
+       ).returns(String)
+     end
+     def candidate_signature(type, instruction, metadata, few_shot_examples)
+       JSON.generate(
+         type: type.serialize,
+         instruction: instruction,
+         instructions_map: normalize_instruction_map(metadata[:instructions_map] || {}),
+         demos_map: normalize_demo_map(metadata[:demos_map] || {}),
+         few_shot_examples: few_shot_examples.map { |example| serialize_few_shot_example(example) }
+       )
+     end
+
+     sig { params(map: T::Hash[Integer, T.untyped]).returns(T::Hash[Integer, String]) }
+     def normalize_instruction_map(map)
+       map.sort_by { |index, _| index }.each_with_object({}) do |(index, value), memo|
+         memo[index] = value.to_s
+       end
+     end
+
+     sig { params(map: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Hash[Integer, T::Array[T.untyped]]) }
+     def normalize_demo_map(map)
+       map.sort_by { |index, _| index }.each_with_object({}) do |(index, demos), memo|
+         memo[index] = Array(demos).map { |demo| serialize_few_shot_example(demo) }
+       end
+     end
+
+     sig { params(example: T.untyped).returns(T.untyped) }
+     def serialize_few_shot_example(example)
+       case example
+       when DSPy::FewShotExample
+         deep_dup(example.to_h)
+       when DSPy::Example
+         {
+           input: deep_dup(example.input_values),
+           expected: deep_dup(example.expected_values)
+         }
+       when Hash
+         deep_dup(example)
+       else
+         example
+       end
+     end
+
+     sig { params(examples: T::Array[T.untyped]).returns(T::Array[DSPy::FewShotExample]) }
+     def normalize_few_shot_examples(examples)
+       examples.map do |example|
+         if example.is_a?(DSPy::FewShotExample)
+           example
+         elsif example.is_a?(DSPy::Example)
+           DSPy::FewShotExample.new(
+             input: example.input_values,
+             output: example.expected_values,
+             reasoning: extract_reasoning_from_example(example)
+           )
+         else
+           example
+         end
+       end
+     end
+
+     sig { params(predictor: T.untyped, examples: T::Array[DSPy::FewShotExample]).void }
+     def assign_predictor_examples(predictor, examples)
+       predictor.demos = examples if predictor.respond_to?(:demos=)
+       return unless predictor.respond_to?(:prompt)
+
+       cloned_examples = examples.map { |ex| ex }
+       predictor.prompt.instance_variable_set(:@few_shot_examples, cloned_examples.freeze)
+     end
+
      # Initialize optimization state for candidate selection
      sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Hash[Symbol, T.untyped]) }
      def initialize_optimization_state(candidates)
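`cartesian_product` is what lets per-predictor instruction and demo candidates combine into full configurations. Stripped of its guard clauses, the reduce shown above behaves like this (the instruction strings are made up):

```ruby
options = { 0 => ["A1", "A2"], 1 => ["B1"] }

options.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
  acc.flat_map { |existing| values.map { |value| existing.merge(index => value) } }
end
# => [{ 0 => "A1", 1 => "B1" }, { 0 => "A2", 1 => "B1" }]
```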
@@ -722,7 +1138,11 @@ module DSPy
        modified_program = apply_candidate_configuration(program, candidate)

        # Evaluate modified program
-       evaluation_result =
+       evaluation_result = if use_concurrent_evaluation?(evaluation_set)
+         evaluate_candidate_concurrently(modified_program, evaluation_set)
+       else
+         evaluate_program(modified_program, evaluation_set)
+       end

        # Store evaluation details
        @evaluated_candidates << candidate

@@ -730,32 +1150,131 @@ module DSPy
        [evaluation_result.pass_rate, modified_program, evaluation_result]
      end

+     sig { params(evaluation_set: T::Array[DSPy::Example]).returns(T::Boolean) }
+     def use_concurrent_evaluation?(evaluation_set)
+       minibatch_size = config.minibatch_size
+       return false unless minibatch_size&.positive?
+       return false unless config.num_threads && config.num_threads > 1
+
+       evaluation_set.size > minibatch_size
+     end
+
+     sig do
+       params(
+         modified_program: T.untyped,
+         evaluation_set: T::Array[DSPy::Example]
+       ).returns(DSPy::Evaluate::BatchEvaluationResult)
+     end
+     def evaluate_candidate_concurrently(modified_program, evaluation_set)
+       chunk_size = T.must(config.minibatch_size)
+       chunks = evaluation_set.each_slice(chunk_size).map(&:dup)
+       return evaluate_program(modified_program, evaluation_set) if chunks.size <= 1
+
+       pool_size = [config.num_threads, chunks.size].min
+       pool_size = 1 if pool_size <= 0
+       executor = Concurrent::FixedThreadPool.new(pool_size)
+
+       futures = chunks.map do |chunk|
+         Concurrent::Promises.future_on(executor) do
+           evaluate_program(modified_program, chunk)
+         end
+       end
+
+       results = futures.map(&:value!)
+       combine_batch_results(results)
+     ensure
+       if executor
+         executor.shutdown
+         executor.wait_for_termination
+       end
+     end
+
+     sig do
+       params(batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult]).returns(DSPy::Evaluate::BatchEvaluationResult)
+     end
+     def combine_batch_results(batch_results)
+       return DSPy::Evaluate::BatchEvaluationResult.new(results: [], aggregated_metrics: {}) if batch_results.empty?
+
+       combined_results = batch_results.flat_map(&:results)
+       total_examples = batch_results.sum(&:total_examples)
+       aggregated_metrics = merge_aggregated_metrics(batch_results, total_examples)
+
+       DSPy::Evaluate::BatchEvaluationResult.new(
+         results: combined_results,
+         aggregated_metrics: aggregated_metrics
+       )
+     end
+
+     sig do
+       params(
+         batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult],
+         total_examples: Integer
+       ).returns(T::Hash[Symbol, T.untyped])
+     end
+     def merge_aggregated_metrics(batch_results, total_examples)
+       return {} if total_examples.zero?
+
+       keys = batch_results.flat_map { |res| res.aggregated_metrics.keys }.uniq
+       keys.each_with_object({}) do |key, memo|
+         numeric_weight = 0.0
+         numeric_sum = 0.0
+         fallback_value = nil
+
+         batch_results.each do |res|
+           value = res.aggregated_metrics[key]
+           next if value.nil?
+
+           if value.is_a?(Numeric)
+             numeric_sum += value.to_f * res.total_examples
+             numeric_weight += res.total_examples
+           else
+             fallback_value = value
+           end
+         end
+
+         if numeric_weight.positive?
+           memo[key] = numeric_sum / numeric_weight
+         elsif fallback_value
+           memo[key] = fallback_value
+         end
+       end
+     end
+
      # Apply candidate configuration to program
      sig { params(program: T.untyped, candidate: EvaluatedCandidate).returns(T.untyped) }
      def apply_candidate_configuration(program, candidate)
+       instructions_map = candidate.metadata[:instructions_map] || {}
+       demos_map = candidate.metadata[:demos_map] || {}
+
        modified_program = program
-
-
-
-
-
-
-
-
-
-
-
-
-       else
-         # Convert from DSPy::Example
-         DSPy::FewShotExample.new(
-           input: example.input_values,
-           output: example.expected_values,
-           reasoning: extract_reasoning_from_example(example)
-         )
+       if modified_program.respond_to?(:predictors) && (instructions_map.any? || demos_map.any?)
+         modified_program = modified_program.clone
+         modified_program.predictors.each_with_index do |predictor, idx|
+           if instructions_map.key?(idx)
+             signature = Utils.get_signature(predictor)
+             updated_signature = signature.with_instructions(instructions_map[idx])
+             Utils.set_signature(predictor, updated_signature)
+           end
+
+           if demos_map.key?(idx)
+             normalized_examples = normalize_few_shot_examples(demos_map[idx])
+             assign_predictor_examples(predictor, normalized_examples)
            end
          end
-
+       end
+
+       # Apply instruction if provided (top-level programs still respect with_instruction)
+       if !candidate.instruction.empty? && modified_program.respond_to?(:with_instruction)
+         modified_program = modified_program.with_instruction(candidate.instruction)
+       end
+
+       should_apply_global_examples = candidate.few_shot_examples.any? &&
+         modified_program.respond_to?(:with_examples) &&
+         (demos_map.empty? || !modified_program.respond_to?(:predictors))
+
+       if should_apply_global_examples
+         normalized_few_shot = normalize_few_shot_examples(candidate.few_shot_examples)
+         modified_program = modified_program.with_examples(normalized_few_shot)
        end

        modified_program
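Concurrent candidate evaluation is opt-in: per `use_concurrent_evaluation?`, it needs a positive `minibatch_size`, `num_threads > 1`, and an evaluation set larger than one minibatch. A configuration sketch under the same namespace assumption as above, with `my_metric` as a placeholder:

```ruby
optimizer = DSPy::Teleprompt::MIPROv2.new(metric: my_metric)
optimizer.configure do |config|
  config.minibatch_size = 25  # evaluation set is sliced into 25-example chunks
  config.num_threads    = 4   # Concurrent::FixedThreadPool size, capped at the chunk count
end
```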
@@ -824,14 +1343,16 @@ module DSPy

        history = {
          total_trials: optimization_result[:trials_completed],
-         optimization_strategy:
+         optimization_strategy: optimization_strategy_name,
          early_stopped: optimization_result[:trials_completed] < config.num_trials,
-         score_history: optimization_result[:optimization_state][:best_score_history]
+         score_history: optimization_result[:optimization_state][:best_score_history],
+         total_eval_calls: optimization_result[:total_eval_calls]
        }

        metadata = {
          optimizer: "MIPROv2",
          auto_mode: infer_auto_mode,
+         optimization_strategy: optimization_strategy_name,
          best_instruction: best_candidate&.instruction || "",
          best_few_shot_count: best_candidate&.few_shot_examples&.size || 0,
          best_candidate_type: best_candidate&.type&.serialize || "unknown",

@@ -839,12 +1360,21 @@ module DSPy
        }

        # Create bootstrap statistics from demo_candidates
-
+       num_predictors = demo_candidates.keys.size
+       sets_per_predictor = demo_candidates.values.map(&:size)
+       all_demo_sets = demo_candidates.values.flat_map { |sets| sets }
        bootstrap_statistics = {
-         num_predictors:
-         demo_sets_per_predictor:
-         avg_demos_per_set:
+         num_predictors: num_predictors,
+         demo_sets_per_predictor: sets_per_predictor.max || 0,
+         avg_demos_per_set: all_demo_sets.empty? ? 0 : all_demo_sets.map(&:size).sum.to_f / all_demo_sets.size
        }
+       bootstrap_statistics[:per_predictor_demo_counts] = sets_per_predictor if sets_per_predictor.any?
+
+       optimization_trace = serialize_optimization_trace(optimization_result[:optimization_state])
+       optimization_trace[:trial_logs] = serialize_trial_logs(optimization_result[:trial_logs])
+       optimization_trace[:param_score_dict] = serialize_param_score_dict(optimization_result[:param_score_dict])
+       optimization_trace[:fully_evaled_param_combos] = serialize_fully_evaled_param_combos(optimization_result[:fully_evaled_param_combos])
+       optimization_trace[:total_eval_calls] = optimization_result[:total_eval_calls]

        MIPROv2Result.new(
          optimized_program: best_program,

@@ -854,7 +1384,7 @@ module DSPy
          best_score_value: best_score,
          metadata: metadata,
          evaluated_candidates: @evaluated_candidates,
-         optimization_trace:
+         optimization_trace: optimization_trace,
          bootstrap_statistics: bootstrap_statistics,
          proposal_statistics: proposal_result.analysis,
          best_evaluation_result: best_evaluation_result

@@ -876,7 +1406,205 @@ module DSPy
        serialized_trace
      end

+     sig do
+       params(
+         trial_number: Integer,
+         candidate: EvaluatedCandidate,
+         evaluation_type: Symbol,
+         batch_size: Integer
+       ).returns(T::Hash[Symbol, T.untyped])
+     end
+     def create_trial_log_entry(trial_number:, candidate:, evaluation_type:, batch_size:)
+       # Preserve interface parity with Python implementation (trial number stored implicitly via hash key)
+       trial_number # no-op to acknowledge parameter usage
+       instructions_map = candidate.metadata[:instructions_map] || {}
+       demos_map = candidate.metadata[:demos_map] || {}
+       entry = {
+         candidate_id: candidate.config_id,
+         candidate_type: candidate.type.serialize,
+         instruction_preview: candidate.instruction.to_s[0, 160],
+         few_shot_count: candidate.few_shot_examples.size,
+         metadata: deep_dup(candidate.metadata),
+         evaluation_type: evaluation_type,
+         batch_size: batch_size,
+         status: :in_progress,
+         started_at: Time.now.iso8601
+       }
+       if instructions_map.any?
+         entry[:instructions] = duplicate_instruction_map(instructions_map)
+         entry[:instruction] = entry[:instructions][0] if entry[:instructions].key?(0)
+       elsif candidate.instruction && !candidate.instruction.empty?
+         predictor_index = candidate.metadata[:predictor_index] || 0
+         entry[:instruction] = candidate.instruction
+         entry[:instructions] = { predictor_index => candidate.instruction }
+       end
+       entry[:few_shot_map] = duplicate_demo_map(demos_map) if demos_map.any?
+       entry
+     end
+
+     sig do
+       params(
+         trial_logs: T::Hash[Integer, T::Hash[Symbol, T.untyped]],
+         trial_number: Integer,
+         score: T.nilable(Float),
+         evaluation_type: Symbol,
+         batch_size: Integer,
+         total_eval_calls: Integer,
+         error: T.nilable(String)
+       ).void
+     end
+     def finalize_trial_log_entry(trial_logs, trial_number, score:, evaluation_type:, batch_size:, total_eval_calls:, error: nil)
+       entry = trial_logs[trial_number] || {}
+       entry[:score] = score if score
+       entry[:evaluation_type] = evaluation_type
+       entry[:batch_size] = batch_size
+       entry[:total_eval_calls] = total_eval_calls
+       entry[:status] = error ? :error : :completed
+       entry[:error] = error if error
+       entry[:completed_at] = Time.now.iso8601
+       trial_logs[trial_number] = entry
+     end
+
+     sig do
+       params(
+         param_score_dict: T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]],
+         candidate: EvaluatedCandidate,
+         score: Float,
+         evaluation_type: Symbol,
+         instructions: T.nilable(T::Hash[Integer, String])
+       ).void
+     end
+     def record_param_score(param_score_dict, candidate, score, evaluation_type:, instructions: nil)
+       instructions_hash = instructions || {}
+       if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
+         predictor_index = candidate.metadata[:predictor_index] || 0
+         instructions_hash[predictor_index] = candidate.instruction
+       end
+
+       record = {
+         candidate_id: candidate.config_id,
+         candidate_type: candidate.type.serialize,
+         score: score,
+         evaluation_type: evaluation_type,
+         timestamp: Time.now.iso8601,
+         metadata: deep_dup(candidate.metadata)
+       }
+       primary_instruction = instructions_hash[0] || candidate.instruction
+       record[:instruction] = primary_instruction if primary_instruction && !primary_instruction.empty?
+       record[:instructions] = instructions_hash unless instructions_hash.empty?
+
+       param_score_dict[candidate.config_id] << record
+     end
+
+     sig do
+       params(
+         fully_evaled_param_combos: T::Hash[String, T::Hash[Symbol, T.untyped]],
+         candidate: EvaluatedCandidate,
+         score: Float,
+         instructions: T.nilable(T::Hash[Integer, String])
+       ).void
+     end
+     def update_fully_evaled_param_combos(fully_evaled_param_combos, candidate, score, instructions: nil)
+       existing = fully_evaled_param_combos[candidate.config_id]
+       if existing.nil? || score > existing[:score]
+         instructions_hash = instructions || {}
+         if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
+           predictor_index = candidate.metadata[:predictor_index] || 0
+           instructions_hash[predictor_index] = candidate.instruction
+         end
+
+         fully_evaled_param_combos[candidate.config_id] = {
+           candidate_id: candidate.config_id,
+           candidate_type: candidate.type.serialize,
+           score: score,
+           metadata: deep_dup(candidate.metadata),
+           updated_at: Time.now.iso8601
+         }
+         unless instructions_hash.empty?
+           fully_evaled_param_combos[candidate.config_id][:instructions] = instructions_hash
+           fully_evaled_param_combos[candidate.config_id][:instruction] = instructions_hash[0] || candidate.instruction
+         end
+       end
+     end
+
+     sig { params(trial_logs: T.nilable(T::Hash[Integer, T::Hash[Symbol, T.untyped]])).returns(T::Hash[Integer, T::Hash[Symbol, T.untyped]]) }
+     def serialize_trial_logs(trial_logs)
+       return {} unless trial_logs
+
+       allowed_keys = [
+         :candidate_id,
+         :candidate_type,
+         :instruction_preview,
+         :instruction,
+         :instructions,
+         :few_shot_count,
+         :metadata,
+         :evaluation_type,
+         :batch_size,
+         :score,
+         :status,
+         :error,
+         :started_at,
+         :completed_at,
+         :total_eval_calls
+       ]
+
+       trial_logs.transform_values do |entry|
+         entry.each_with_object({}) do |(key, value), memo|
+           memo[key] = value if allowed_keys.include?(key)
+         end
+       end
+     end
+
+     sig { params(param_score_dict: T.nilable(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]])).returns(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]]) }
+     def serialize_param_score_dict(param_score_dict)
+       return {} unless param_score_dict
+
+       allowed_keys = [:candidate_id, :candidate_type, :score, :evaluation_type, :timestamp, :metadata, :instruction, :instructions]
+
+       param_score_dict.transform_values do |records|
+         records.map do |record|
+           record.each_with_object({}) do |(key, value), memo|
+             memo[key] = value if allowed_keys.include?(key)
+           end
+         end
+       end
+     end
+
+     sig { params(fully_evaled_param_combos: T.nilable(T::Hash[String, T::Hash[Symbol, T.untyped]])).returns(T::Hash[String, T::Hash[Symbol, T.untyped]]) }
+     def serialize_fully_evaled_param_combos(fully_evaled_param_combos)
+       return {} unless fully_evaled_param_combos
+
+       allowed_keys = [:candidate_id, :candidate_type, :score, :metadata, :updated_at, :instruction, :instructions]
+
+       fully_evaled_param_combos.transform_values do |record|
+         record.each_with_object({}) do |(key, value), memo|
+           memo[key] = value if allowed_keys.include?(key)
+         end
+       end
+     end
+
+     sig { params(value: T.untyped).returns(T.untyped) }
+     def deep_dup(value)
+       case value
+       when Hash
+         value.each_with_object({}) { |(k, v), memo| memo[k] = deep_dup(v) }
+       when Array
+         value.map { |element| deep_dup(element) }
+       else
+         value
+       end
+     end
+
      # Helper methods
+     sig { returns(String) }
+     def optimization_strategy_name
+       strategy = config.optimization_strategy
+       return strategy.serialize if strategy.respond_to?(:serialize)
+
+       strategy.to_s
+     end
+
      sig { params(program: T.untyped).returns(T.nilable(String)) }
      def extract_current_instruction(program)
        if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
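For reference, one entry of the serialized `trial_logs` (after `create_trial_log_entry`, `finalize_trial_log_entry`, and the key whitelist above) has roughly this shape. Every value below is invented; only the keys come from the diff:

```ruby
{
  candidate_id: "a1b2c3d4e5f6",
  candidate_type: "instruction_only",  # CandidateType#serialize output; exact string not shown here
  instruction_preview: "Answer concisely and cite the passage...",
  few_shot_count: 0,
  evaluation_type: :full,
  batch_size: 100,
  score: 0.82,
  status: :completed,
  started_at: "2025-01-01T00:00:00Z",
  completed_at: "2025-01-01T00:02:10Z",
  total_eval_calls: 300
}
```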
@@ -889,6 +1617,23 @@ module DSPy
        end
      end

+     sig { params(program: T.untyped).returns(T::Hash[Integer, String]) }
+     def extract_program_instructions(program)
+       instructions = {}
+       if program.respond_to?(:predictors)
+         program.predictors.each_with_index do |predictor, index|
+           if predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:instruction)
+             value = predictor.prompt.instruction
+             instructions[index] = value if value
+           end
+         end
+       else
+         fallback_instruction = extract_current_instruction(program)
+         instructions[0] = fallback_instruction if fallback_instruction
+       end
+       instructions
+     end
+
      sig { params(program: T.untyped).returns(T.nilable(T.class_of(DSPy::Signature))) }
      def extract_signature_class(program)
        program.respond_to?(:signature_class) ? program.signature_class : nil

@@ -913,12 +1658,15 @@ module DSPy
      # Infer auto mode based on configuration
      sig { returns(String) }
      def infer_auto_mode
+       return config.auto_preset.serialize unless config.auto_preset == AutoPreset::None
+
        case config.num_trials
        when 0..6 then "light"
        when 7..12 then "medium"
-
+       when 13..Float::INFINITY then "heavy"
+       else "manual"
        end
      end
    end
  end
-end
+end