RubyGems - dspy - Versions diffs - 0.28.1 → 0.29.0 - Mend

dspy 0.28.1 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

checksums.yaml +4 -4
data/README.md +2 -3
data/lib/dspy/callbacks.rb +222 -0
data/lib/dspy/chain_of_thought.rb +2 -1
data/lib/dspy/code_act.rb +14 -1
data/lib/dspy/datasets/ade.rb +90 -0
data/lib/dspy/datasets.rb +8 -0
data/lib/dspy/lm.rb +9 -12
data/lib/dspy/mixins/struct_builder.rb +17 -25
data/lib/dspy/module.rb +45 -1
data/lib/dspy/observability/async_span_processor.rb +67 -93
data/lib/dspy/observability.rb +43 -1
data/lib/dspy/predict.rb +17 -0
data/lib/dspy/prompt.rb +90 -20
data/lib/dspy/propose/dataset_summary_generator.rb +210 -0
data/lib/dspy/propose/grounded_proposer.rb +320 -66
data/lib/dspy/re_act.rb +13 -0
data/lib/dspy/reflection_lm.rb +36 -0
data/lib/dspy/teleprompt/bootstrap_strategy.rb +26 -0
data/lib/dspy/teleprompt/gepa.rb +448 -2803
data/lib/dspy/teleprompt/mipro_v2.rb +624 -100
data/lib/dspy/teleprompt/utils.rb +349 -42
data/lib/dspy/version.rb +2 -2
data/lib/dspy.rb +4 -2
data/lib/gepa/api.rb +61 -0
data/lib/gepa/core/engine.rb +226 -0
data/lib/gepa/core/evaluation_batch.rb +26 -0
data/lib/gepa/core/result.rb +92 -0
data/lib/gepa/core/state.rb +231 -0
data/lib/gepa/logging/experiment_tracker.rb +54 -0
data/lib/gepa/logging/logger.rb +57 -0
data/lib/gepa/logging.rb +9 -0
data/lib/gepa/proposer/base.rb +27 -0
data/lib/gepa/proposer/merge_proposer.rb +424 -0
data/lib/gepa/proposer/reflective_mutation/base.rb +48 -0
data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +188 -0
data/lib/gepa/strategies/batch_sampler.rb +91 -0
data/lib/gepa/strategies/candidate_selector.rb +97 -0
data/lib/gepa/strategies/component_selector.rb +57 -0
data/lib/gepa/strategies/instruction_proposal.rb +120 -0
data/lib/gepa/telemetry.rb +122 -0
data/lib/gepa/utils/pareto.rb +119 -0
data/lib/gepa.rb +21 -0
metadata +59 -4
data/lib/dspy/teleprompt/simple_optimizer.rb +0 -497

data/lib/dspy/teleprompt/mipro_v2.rb CHANGED

@@ -1,7 +1,10 @@
 # frozen_string_literal: true
 require 'digest'
+require 'time'
+require 'concurrent-ruby'
 require 'sorbet-runtime'
+require 'securerandom'
 require_relative 'teleprompter'
 require_relative 'utils'
 require_relative '../propose/grounded_proposer'
@@ -124,6 +127,7 @@ module DSPy
       setting :track_diversity, default: true
       setting :max_errors, default: 3
       setting :num_threads, default: 1
+      setting :minibatch_size, default: nil
       # Class-level configuration method - sets defaults for new instances
       def self.configure(&block)
@@ -265,6 +269,7 @@ module DSPy
         @proposer = DSPy::Propose::GroundedProposer.new(config: DSPy::Propose::GroundedProposer::Config.new)
         @optimization_trace = []
         @evaluated_candidates = []
+        @trial_history = {}
       end
       # Main MIPROv2 optimization method
@@ -282,7 +287,7 @@ module DSPy
           trainset_size: trainset.size,
           valset_size: valset&.size || 0,
           num_trials: config.num_trials,
-          optimization_strategy: config.optimization_strategy,
+          optimization_strategy: optimization_strategy_name,
           mode: infer_auto_mode
         }) do
           # Convert examples to typed format
@@ -294,18 +299,18 @@ module DSPy
           # Phase 1: Bootstrap few-shot examples
           emit_event('phase_start', { phase: 1, name: 'bootstrap' })
-          bootstrap_result = phase_1_bootstrap(program, typed_trainset)
-          emit_event('phase_complete', {
-            phase: 1,
-            success_rate: bootstrap_result.statistics[:success_rate],
-            candidate_sets: bootstrap_result.candidate_sets.size
+          demo_candidates = phase_1_bootstrap(program, typed_trainset)
+          emit_event('phase_complete', {
+            phase: 1,
+            num_predictors: demo_candidates.keys.size,
+            demo_sets_per_predictor: demo_candidates[0]&.size || 0
           })
           # Phase 2: Generate instruction candidates
           emit_event('phase_start', { phase: 2, name: 'instruction_proposal' })
-          proposal_result = phase_2_propose_instructions(program, typed_trainset, bootstrap_result)
-          emit_event('phase_complete', {
-            phase: 2,
+          proposal_result = phase_2_propose_instructions(program, typed_trainset, demo_candidates)
+          emit_event('phase_complete', {
+            phase: 2,
             num_candidates: proposal_result.num_candidates,
             best_instruction_preview: proposal_result.best_instruction[0, 50]
           })
@@ -316,7 +321,7 @@ module DSPy
             program,
             evaluation_set,
             proposal_result,
-            bootstrap_result
+            demo_candidates
           )
           emit_event('phase_complete', {
             phase: 3,
@@ -327,10 +332,12 @@ module DSPy
           # Build final result
           final_result = build_miprov2_result(
             optimization_result,
-            bootstrap_result,
+            demo_candidates,
             proposal_result
           )
+          @trial_history = optimization_result[:trial_logs] || {}
           save_results(final_result)
           final_result
         end
@@ -339,16 +346,17 @@ module DSPy
       private
       # Phase 1: Bootstrap few-shot examples from training data
-      sig { params(program: T.untyped, trainset: T::Array[DSPy::Example]).returns(Utils::BootstrapResult) }
+      # Returns a hash mapping predictor indices to arrays of demo sets
+      sig { params(program: T.untyped, trainset: T::Array[DSPy::Example]).returns(T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]) }
       def phase_1_bootstrap(program, trainset)
-        bootstrap_config = Utils::BootstrapConfig.new
-        bootstrap_config.max_bootstrapped_examples = config.max_bootstrapped_examples
-        bootstrap_config.max_labeled_examples = config.max_labeled_examples
-        bootstrap_config.num_candidate_sets = config.bootstrap_sets
-        bootstrap_config.max_errors = config.max_errors
-        bootstrap_config.num_threads = config.num_threads
-        Utils.create_n_fewshot_demo_sets(program, trainset, config: bootstrap_config, metric: @metric)
+        Utils.create_n_fewshot_demo_sets(
+          program,
+          config.bootstrap_sets,  # num_candidate_sets
+          trainset,
+          max_bootstrapped_demos: config.max_bootstrapped_examples,
+          max_labeled_demos: config.max_labeled_examples,
+          metric: @metric
+        )
       end
       # Phase 2: Generate instruction candidates using grounded proposer
@@ -356,28 +364,34 @@ module DSPy
         params(
           program: T.untyped,
           trainset: T::Array[DSPy::Example],
-          bootstrap_result: Utils::BootstrapResult
+          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
         ).returns(DSPy::Propose::GroundedProposer::ProposalResult)
       end
-      def phase_2_propose_instructions(program, trainset, bootstrap_result)
+      def phase_2_propose_instructions(program, trainset, demo_candidates)
         # Get current instruction if available
         current_instruction = extract_current_instruction(program)
         # Use few-shot examples from bootstrap if available
-        few_shot_examples = bootstrap_result.successful_examples.take(5)
+        # Flatten demo sets from first predictor and take first 5 examples
+        few_shot_examples = demo_candidates[0]&.flatten&.take(5) || []
-        # Get signature class from program
-        signature_class = extract_signature_class(program)
-        raise ArgumentError, "Cannot extract signature class from program" unless signature_class
+        # Re-initialize proposer with program and trainset for awareness features
+        # This enables program_aware and use_dataset_summary flags to work correctly
+        proposer_config = DSPy::Propose::GroundedProposer::Config.new
+        proposer_config.num_instruction_candidates = config.num_instruction_candidates
-        # Configure proposer for this optimization run
-        @proposer.config.num_instruction_candidates = config.num_instruction_candidates
+        @proposer = DSPy::Propose::GroundedProposer.new(
+          config: proposer_config,
+          program: program,
+          trainset: trainset
+        )
-        @proposer.propose_instructions(
-          signature_class,
-          trainset,
-          few_shot_examples: few_shot_examples,
-          current_instruction: current_instruction
+        @proposer.propose_instructions_for_program(
+          trainset: trainset,
+          program: program,
+          demo_candidates: demo_candidates,
+          trial_logs: @trial_history,
+          num_instruction_candidates: config.num_instruction_candidates
         )
       end
@@ -387,21 +401,27 @@ module DSPy
           program: T.untyped,
           evaluation_set: T::Array[DSPy::Example],
           proposal_result: DSPy::Propose::GroundedProposer::ProposalResult,
-          bootstrap_result: Utils::BootstrapResult
+          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
         ).returns(T::Hash[Symbol, T.untyped])
       end
-      def phase_3_optimize(program, evaluation_set, proposal_result, bootstrap_result)
+      def phase_3_optimize(program, evaluation_set, proposal_result, demo_candidates)
         # Generate candidate configurations
-        candidates = generate_candidate_configurations(proposal_result, bootstrap_result)
+        candidates = generate_candidate_configurations(proposal_result, demo_candidates)
         # Initialize optimization state
         optimization_state = initialize_optimization_state(candidates)
+        # Initialize trial tracking structures
+        trial_logs = {}
+        param_score_dict = Hash.new { |hash, key| hash[key] = [] }
+        fully_evaled_param_combos = {}
+        total_eval_calls = 0
         # Run optimization trials
         trials_completed = 0
         best_score = 0.0
         best_candidate = nil
-        best_program = nil
+        best_program = program
         best_evaluation_result = nil
         config.num_trials.times do |trial_idx|
@@ -409,6 +429,14 @@ module DSPy
           # Select next candidate based on optimization strategy
           candidate = select_next_candidate(candidates, optimization_state, trial_idx)
+          batch_size = evaluation_set.size
+          trial_logs[trials_completed] = create_trial_log_entry(
+            trial_number: trials_completed,
+            candidate: candidate,
+            evaluation_type: :full,
+            batch_size: batch_size
+          )
           emit_event('trial_start', {
             trial_number: trials_completed,
@@ -420,12 +448,30 @@ module DSPy
           begin
             # Evaluate candidate
             score, modified_program, evaluation_result = evaluate_candidate(program, candidate, evaluation_set)
+            total_eval_calls += batch_size
+            instructions_snapshot = extract_program_instructions(modified_program)
+            trial_logs[trials_completed][:instructions] = instructions_snapshot unless instructions_snapshot.empty?
+            trial_logs[trials_completed][:instruction] = instructions_snapshot[0] if instructions_snapshot.key?(0)
             # Update optimization state
             update_optimization_state(optimization_state, candidate, score)
+            record_param_score(
+              param_score_dict,
+              candidate,
+              score,
+              evaluation_type: :full,
+              instructions: instructions_snapshot
+            )
+            update_fully_evaled_param_combos(
+              fully_evaled_param_combos,
+              candidate,
+              score,
+              instructions: instructions_snapshot
+            )
             # Track best result
-            is_best = score > best_score
+            is_best = best_candidate.nil? || score > best_score
             if is_best
               best_score = score
               best_candidate = candidate
@@ -433,6 +479,15 @@ module DSPy
               best_evaluation_result = evaluation_result
             end
+            finalize_trial_log_entry(
+              trial_logs,
+              trials_completed,
+              score: score,
+              evaluation_type: :full,
+              batch_size: batch_size,
+              total_eval_calls: total_eval_calls
+            )
             emit_event('trial_complete', {
               trial_number: trials_completed,
               score: score,
@@ -447,6 +502,16 @@ module DSPy
             end
           rescue => error
+            finalize_trial_log_entry(
+              trial_logs,
+              trials_completed,
+              score: nil,
+              evaluation_type: :full,
+              batch_size: batch_size,
+              total_eval_calls: total_eval_calls,
+              error: error.message
+            )
             emit_event('trial_error', {
               trial_number: trials_completed,
               error: error.message,
@@ -464,73 +529,190 @@ module DSPy
           best_evaluation_result: best_evaluation_result,
           trials_completed: trials_completed,
           optimization_state: optimization_state,
-          evaluated_candidates: @evaluated_candidates
+          evaluated_candidates: @evaluated_candidates,
+          trial_logs: trial_logs,
+          param_score_dict: param_score_dict,
+          fully_evaled_param_combos: fully_evaled_param_combos,
+          total_eval_calls: total_eval_calls
         }
       end
-      # Generate candidate configurations from proposals and bootstrap results
+      # Generate candidate configurations from proposals and demo candidates
       sig do
         params(
           proposal_result: DSPy::Propose::GroundedProposer::ProposalResult,
-          bootstrap_result: Utils::BootstrapResult
+          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]
         ).returns(T::Array[EvaluatedCandidate])
       end
-      def generate_candidate_configurations(proposal_result, bootstrap_result)
+      def generate_candidate_configurations(proposal_result, demo_candidates)
         candidates = []
+        predictor_instruction_map = if proposal_result.respond_to?(:predictor_instructions) && proposal_result.predictor_instructions.any?
+          proposal_result.predictor_instructions
+        else
+          { 0 => proposal_result.candidate_instructions }
+        end
+        instruction_maps = build_instruction_maps(predictor_instruction_map)
+        demo_maps = build_demo_maps(demo_candidates)
         # Base configuration (no modifications)
         candidates << EvaluatedCandidate.new(
           instruction: "",
           few_shot_examples: [],
           type: CandidateType::Baseline,
-          metadata: {},
+          metadata: {
+            instructions_map: {},
+            demos_map: {}
+          },
           config_id: SecureRandom.hex(6)
         )
-        # Instruction-only candidates
-        proposal_result.candidate_instructions.each_with_index do |instruction, idx|
+        instruction_maps.each_with_index do |instruction_map, combo_idx|
+          primary_instruction = instruction_map[0] || instruction_map.values.first || ""
           candidates << EvaluatedCandidate.new(
-            instruction: instruction,
+            instruction: primary_instruction,
             few_shot_examples: [],
             type: CandidateType::InstructionOnly,
-            metadata: { proposal_rank: idx },
+            metadata: {
+              proposal_rank: combo_idx,
+              instructions_map: duplicate_instruction_map(instruction_map),
+              demos_map: {}
+            },
             config_id: SecureRandom.hex(6)
           )
         end
-        # Few-shot only candidates
-        bootstrap_result.candidate_sets.each_with_index do |candidate_set, idx|
+        demo_maps.each_with_index do |demo_map, idx|
+          next if demo_map.empty?
+          flattened_examples = demo_map.values.flatten
           candidates << EvaluatedCandidate.new(
             instruction: "",
-            few_shot_examples: candidate_set,
+            few_shot_examples: flattened_examples,
             type: CandidateType::FewShotOnly,
-            metadata: { bootstrap_rank: idx },
+            metadata: {
+              bootstrap_rank: idx,
+              instructions_map: {},
+              demos_map: duplicate_demo_map(demo_map)
+            },
             config_id: SecureRandom.hex(6)
           )
         end
         # Combined candidates (instruction + few-shot)
-        top_instructions = proposal_result.candidate_instructions.take(3)
-        top_bootstrap_sets = bootstrap_result.candidate_sets.take(3)
-        top_instructions.each_with_index do |instruction, i_idx|
-          top_bootstrap_sets.each_with_index do |candidate_set, b_idx|
+        instruction_maps.each_with_index do |instruction_map, combo_idx|
+          primary_instruction = instruction_map[0] || instruction_map.values.first || ""
+          demo_maps.first(3).each_with_index do |demo_map, demo_idx|
+            next if demo_map.empty?
+            flattened_examples = demo_map.values.flatten
             candidates << EvaluatedCandidate.new(
-              instruction: instruction,
-              few_shot_examples: candidate_set,
+              instruction: primary_instruction,
+              few_shot_examples: flattened_examples,
               type: CandidateType::Combined,
-              metadata: {
-                instruction_rank: i_idx,
-                bootstrap_rank: b_idx
+              metadata: {
+                instruction_rank: combo_idx,
+                bootstrap_rank: demo_idx,
+                instructions_map: duplicate_instruction_map(instruction_map),
+                demos_map: duplicate_demo_map(demo_map)
               },
               config_id: SecureRandom.hex(6)
             )
           end
         end
         candidates
       end
+      sig { params(predictor_instruction_map: T::Hash[Integer, T::Array[String]]).returns(T::Array[T::Hash[Integer, String]]) }
+      def build_instruction_maps(predictor_instruction_map)
+        return [{}] if predictor_instruction_map.nil? || predictor_instruction_map.empty?
+        normalized = predictor_instruction_map.each_with_object({}) do |(index, instructions), memo|
+          next if instructions.nil? || instructions.empty?
+          memo[index] = instructions.take(3)
+        end
+        return [{}] if normalized.empty?
+        cartesian_product(normalized)
+      end
+      sig do
+        params(demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]).returns(T::Array[T::Hash[Integer, T::Array[DSPy::FewShotExample]]])
+      end
+      def build_demo_maps(demo_candidates)
+        return [{}] if demo_candidates.nil? || demo_candidates.empty?
+        normalized = demo_candidates.each_with_object({}) do |(index, sets), memo|
+          next if sets.nil? || sets.empty?
+          memo[index] = sets.take(3)
+        end
+        return [{}] if normalized.empty?
+        cartesian_product(normalized)
+      end
+      sig do
+        params(options_hash: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Array[T::Hash[Integer, T.untyped]])
+      end
+      def cartesian_product(options_hash)
+        options_hash.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
+          next acc if values.nil? || values.empty?
+          acc.flat_map do |existing|
+            values.map do |value|
+              existing.merge(index => value)
+            end
+          end
+        end
+      end
+      sig { params(instruction_map: T::Hash[Integer, String]).returns(T::Hash[Integer, String]) }
+      def duplicate_instruction_map(instruction_map)
+        instruction_map.each_with_object({}) do |(index, instruction), memo|
+          memo[index] = instruction.is_a?(String) ? instruction.dup : instruction
+        end
+      end
+      sig do
+        params(demo_map: T::Hash[Integer, T::Array[DSPy::FewShotExample]]).returns(T::Hash[Integer, T::Array[DSPy::FewShotExample]])
+      end
+      def duplicate_demo_map(demo_map)
+        demo_map.each_with_object({}) do |(index, demos), memo|
+          next if demos.nil?
+          memo[index] = demos.map { |demo| demo }
+        end
+      end
+      sig { params(examples: T::Array[T.untyped]).returns(T::Array[DSPy::FewShotExample]) }
+      def normalize_few_shot_examples(examples)
+        examples.map do |example|
+          if example.is_a?(DSPy::FewShotExample)
+            example
+          elsif example.is_a?(DSPy::Example)
+            DSPy::FewShotExample.new(
+              input: example.input_values,
+              output: example.expected_values,
+              reasoning: extract_reasoning_from_example(example)
+            )
+          else
+            example
+          end
+        end
+      end
+      sig { params(predictor: T.untyped, examples: T::Array[DSPy::FewShotExample]).void }
+      def assign_predictor_examples(predictor, examples)
+        predictor.demos = examples if predictor.respond_to?(:demos=)
+        return unless predictor.respond_to?(:prompt)
+        cloned_examples = examples.map { |ex| ex }
+        predictor.prompt.instance_variable_set(:@few_shot_examples, cloned_examples.freeze)
+      end
       # Initialize optimization state for candidate selection
       sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Hash[Symbol, T.untyped]) }
       def initialize_optimization_state(candidates)
@@ -685,10 +867,10 @@ module DSPy
           features << ((config_hash / 1000) % 1000).to_f / 1000.0  # Feature 2: different part of hash
           features << ((config_hash / 1_000_000) % 1000).to_f / 1000.0  # Feature 3: high bits
-          # Add instruction length if available
+          # Add instruction length if available (Python-compatible: no cap)
           instruction = candidate.instruction
           if instruction && !instruction.empty?
-            features << [instruction.length.to_f / 100.0, 2.0].min  # Instruction length, capped at 200 chars
+            features << instruction.length.to_f / 100.0  # Instruction length, uncapped
           else
             features << 0.5  # Default value
           end
@@ -710,7 +892,11 @@ module DSPy
         modified_program = apply_candidate_configuration(program, candidate)
         # Evaluate modified program
-        evaluation_result = evaluate_program(modified_program, evaluation_set)
+        evaluation_result = if use_concurrent_evaluation?(evaluation_set)
+          evaluate_candidate_concurrently(modified_program, evaluation_set)
+        else
+          evaluate_program(modified_program, evaluation_set)
+        end
         # Store evaluation details
         @evaluated_candidates << candidate
@@ -718,26 +904,131 @@ module DSPy
         [evaluation_result.pass_rate, modified_program, evaluation_result]
       end
+      sig { params(evaluation_set: T::Array[DSPy::Example]).returns(T::Boolean) }
+      def use_concurrent_evaluation?(evaluation_set)
+        minibatch_size = config.minibatch_size
+        return false unless minibatch_size&.positive?
+        return false unless config.num_threads && config.num_threads > 1
+        evaluation_set.size > minibatch_size
+      end
+      sig do
+        params(
+          modified_program: T.untyped,
+          evaluation_set: T::Array[DSPy::Example]
+        ).returns(DSPy::Evaluate::BatchEvaluationResult)
+      end
+      def evaluate_candidate_concurrently(modified_program, evaluation_set)
+        chunk_size = T.must(config.minibatch_size)
+        chunks = evaluation_set.each_slice(chunk_size).map(&:dup)
+        return evaluate_program(modified_program, evaluation_set) if chunks.size <= 1
+        pool_size = [config.num_threads, chunks.size].min
+        pool_size = 1 if pool_size <= 0
+        executor = Concurrent::FixedThreadPool.new(pool_size)
+        futures = chunks.map do |chunk|
+          Concurrent::Promises.future_on(executor) do
+            evaluate_program(modified_program, chunk)
+          end
+        end
+        results = futures.map(&:value!)
+        combine_batch_results(results)
+      ensure
+        if executor
+          executor.shutdown
+          executor.wait_for_termination
+        end
+      end
+      sig do
+        params(batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult]).returns(DSPy::Evaluate::BatchEvaluationResult)
+      end
+      def combine_batch_results(batch_results)
+        return DSPy::Evaluate::BatchEvaluationResult.new(results: [], aggregated_metrics: {}) if batch_results.empty?
+        combined_results = batch_results.flat_map(&:results)
+        total_examples = batch_results.sum(&:total_examples)
+        aggregated_metrics = merge_aggregated_metrics(batch_results, total_examples)
+        DSPy::Evaluate::BatchEvaluationResult.new(
+          results: combined_results,
+          aggregated_metrics: aggregated_metrics
+        )
+      end
+      sig do
+        params(
+          batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult],
+          total_examples: Integer
+        ).returns(T::Hash[Symbol, T.untyped])
+      end
+      def merge_aggregated_metrics(batch_results, total_examples)
+        return {} if total_examples.zero?
+        keys = batch_results.flat_map { |res| res.aggregated_metrics.keys }.uniq
+        keys.each_with_object({}) do |key, memo|
+          numeric_weight = 0.0
+          numeric_sum = 0.0
+          fallback_value = nil
+          batch_results.each do |res|
+            value = res.aggregated_metrics[key]
+            next if value.nil?
+            if value.is_a?(Numeric)
+              numeric_sum += value.to_f * res.total_examples
+              numeric_weight += res.total_examples
+            else
+              fallback_value = value
+            end
+          end
+          if numeric_weight.positive?
+            memo[key] = numeric_sum / numeric_weight
+          elsif fallback_value
+            memo[key] = fallback_value
+          end
+        end
+      end
       # Apply candidate configuration to program
       sig { params(program: T.untyped, candidate: EvaluatedCandidate).returns(T.untyped) }
       def apply_candidate_configuration(program, candidate)
+        instructions_map = candidate.metadata[:instructions_map] || {}
+        demos_map = candidate.metadata[:demos_map] || {}
         modified_program = program
-        # Apply instruction if provided
-        if !candidate.instruction.empty? && program.respond_to?(:with_instruction)
+        if modified_program.respond_to?(:predictors) && (instructions_map.any? || demos_map.any?)
+          modified_program = modified_program.clone
+          modified_program.predictors.each_with_index do |predictor, idx|
+            if instructions_map.key?(idx)
+              signature = Utils.get_signature(predictor)
+              updated_signature = signature.with_instructions(instructions_map[idx])
+              Utils.set_signature(predictor, updated_signature)
+            end
+            if demos_map.key?(idx)
+              normalized_examples = normalize_few_shot_examples(demos_map[idx])
+              assign_predictor_examples(predictor, normalized_examples)
+            end
+          end
+        end
+        # Apply instruction if provided (top-level programs still respect with_instruction)
+        if !candidate.instruction.empty? && modified_program.respond_to?(:with_instruction)
           modified_program = modified_program.with_instruction(candidate.instruction)
         end
-        # Apply few-shot examples if provided
-        if candidate.few_shot_examples.any? && program.respond_to?(:with_examples)
-          few_shot_examples = candidate.few_shot_examples.map do |example|
-            DSPy::FewShotExample.new(
-              input: example.input_values,
-              output: example.expected_values,
-              reasoning: extract_reasoning_from_example(example)
-            )
-          end
-          modified_program = modified_program.with_examples(few_shot_examples)
+        should_apply_global_examples = candidate.few_shot_examples.any? &&
+          modified_program.respond_to?(:with_examples) &&
+          (demos_map.empty? || !modified_program.respond_to?(:predictors))
+        if should_apply_global_examples
+          normalized_few_shot = normalize_few_shot_examples(candidate.few_shot_examples)
+          modified_program = modified_program.with_examples(normalized_few_shot)
         end
         modified_program
@@ -779,48 +1070,66 @@ module DSPy
         state[:no_improvement_count] >= config.early_stopping_patience
       end
-      # Calculate diversity score for candidate
+      # Calculate diversity score for candidate (Python-compatible: only few-shot count)
       sig { params(candidate: EvaluatedCandidate).returns(Float) }
       def calculate_diversity_score(candidate)
-        # Simple diversity metric based on instruction length and few-shot count
-        instruction_diversity = candidate.instruction.length / 200.0
+        # Python DSPy doesn't use instruction length for diversity, only few-shot count
         few_shot_diversity = candidate.few_shot_examples.size / 10.0
-        [instruction_diversity + few_shot_diversity, 1.0].min
+        [few_shot_diversity, 1.0].min
       end
       # Build final MIPROv2 result
       sig do
         params(
           optimization_result: T::Hash[Symbol, T.untyped],
-          bootstrap_result: Utils::BootstrapResult,
+          demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]],
           proposal_result: DSPy::Propose::GroundedProposer::ProposalResult
         ).returns(MIPROv2Result)
       end
-      def build_miprov2_result(optimization_result, bootstrap_result, proposal_result)
+      def build_miprov2_result(optimization_result, demo_candidates, proposal_result)
         best_candidate = optimization_result[:best_candidate]
         best_program = optimization_result[:best_program]
         best_score = optimization_result[:best_score]
         best_evaluation_result = optimization_result[:best_evaluation_result]
         scores = { pass_rate: best_score }
         history = {
           total_trials: optimization_result[:trials_completed],
-          optimization_strategy: config.optimization_strategy,
+          optimization_strategy: optimization_strategy_name,
           early_stopped: optimization_result[:trials_completed] < config.num_trials,
-          score_history: optimization_result[:optimization_state][:best_score_history]
+          score_history: optimization_result[:optimization_state][:best_score_history],
+          total_eval_calls: optimization_result[:total_eval_calls]
         }
         metadata = {
           optimizer: "MIPROv2",
           auto_mode: infer_auto_mode,
+          optimization_strategy: optimization_strategy_name,
           best_instruction: best_candidate&.instruction || "",
           best_few_shot_count: best_candidate&.few_shot_examples&.size || 0,
           best_candidate_type: best_candidate&.type&.serialize || "unknown",
           optimization_timestamp: Time.now.iso8601
         }
+        # Create bootstrap statistics from demo_candidates
+        num_predictors = demo_candidates.keys.size
+        sets_per_predictor = demo_candidates.values.map(&:size)
+        all_demo_sets = demo_candidates.values.flat_map { |sets| sets }
+        bootstrap_statistics = {
+          num_predictors: num_predictors,
+          demo_sets_per_predictor: sets_per_predictor.max || 0,
+          avg_demos_per_set: all_demo_sets.empty? ? 0 : all_demo_sets.map(&:size).sum.to_f / all_demo_sets.size
+        }
+        bootstrap_statistics[:per_predictor_demo_counts] = sets_per_predictor if sets_per_predictor.any?
+        optimization_trace = serialize_optimization_trace(optimization_result[:optimization_state])
+        optimization_trace[:trial_logs] = serialize_trial_logs(optimization_result[:trial_logs])
+        optimization_trace[:param_score_dict] = serialize_param_score_dict(optimization_result[:param_score_dict])
+        optimization_trace[:fully_evaled_param_combos] = serialize_fully_evaled_param_combos(optimization_result[:fully_evaled_param_combos])
+        optimization_trace[:total_eval_calls] = optimization_result[:total_eval_calls]
         MIPROv2Result.new(
           optimized_program: best_program,
           scores: scores,
@@ -829,8 +1138,8 @@ module DSPy
           best_score_value: best_score,
           metadata: metadata,
           evaluated_candidates: @evaluated_candidates,
-          optimization_trace: serialize_optimization_trace(optimization_result[:optimization_state]),
-          bootstrap_statistics: bootstrap_result.statistics,
+          optimization_trace: optimization_trace,
+          bootstrap_statistics: bootstrap_statistics,
           proposal_statistics: proposal_result.analysis,
           best_evaluation_result: best_evaluation_result
         )
@@ -851,7 +1160,205 @@ module DSPy
         serialized_trace
       end
+      sig do
+        params(
+          trial_number: Integer,
+          candidate: EvaluatedCandidate,
+          evaluation_type: Symbol,
+          batch_size: Integer
+        ).returns(T::Hash[Symbol, T.untyped])
+      end
+      def create_trial_log_entry(trial_number:, candidate:, evaluation_type:, batch_size:)
+        # Preserve interface parity with Python implementation (trial number stored implicitly via hash key)
+        trial_number # no-op to acknowledge parameter usage
+        instructions_map = candidate.metadata[:instructions_map] || {}
+        demos_map = candidate.metadata[:demos_map] || {}
+        entry = {
+          candidate_id: candidate.config_id,
+          candidate_type: candidate.type.serialize,
+          instruction_preview: candidate.instruction.to_s[0, 160],
+          few_shot_count: candidate.few_shot_examples.size,
+          metadata: deep_dup(candidate.metadata),
+          evaluation_type: evaluation_type,
+          batch_size: batch_size,
+          status: :in_progress,
+          started_at: Time.now.iso8601
+        }
+        if instructions_map.any?
+          entry[:instructions] = duplicate_instruction_map(instructions_map)
+          entry[:instruction] = entry[:instructions][0] if entry[:instructions].key?(0)
+        elsif candidate.instruction && !candidate.instruction.empty?
+          predictor_index = candidate.metadata[:predictor_index] || 0
+          entry[:instruction] = candidate.instruction
+          entry[:instructions] = { predictor_index => candidate.instruction }
+        end
+        entry[:few_shot_map] = duplicate_demo_map(demos_map) if demos_map.any?
+        entry
+      end
+      sig do
+        params(
+          trial_logs: T::Hash[Integer, T::Hash[Symbol, T.untyped]],
+          trial_number: Integer,
+          score: T.nilable(Float),
+          evaluation_type: Symbol,
+          batch_size: Integer,
+          total_eval_calls: Integer,
+          error: T.nilable(String)
+        ).void
+      end
+      def finalize_trial_log_entry(trial_logs, trial_number, score:, evaluation_type:, batch_size:, total_eval_calls:, error: nil)
+        entry = trial_logs[trial_number] || {}
+        entry[:score] = score if score
+        entry[:evaluation_type] = evaluation_type
+        entry[:batch_size] = batch_size
+        entry[:total_eval_calls] = total_eval_calls
+        entry[:status] = error ? :error : :completed
+        entry[:error] = error if error
+        entry[:completed_at] = Time.now.iso8601
+        trial_logs[trial_number] = entry
+      end
+      sig do
+        params(
+          param_score_dict: T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]],
+          candidate: EvaluatedCandidate,
+          score: Float,
+          evaluation_type: Symbol,
+          instructions: T.nilable(T::Hash[Integer, String])
+        ).void
+      end
+      def record_param_score(param_score_dict, candidate, score, evaluation_type:, instructions: nil)
+        instructions_hash = instructions || {}
+        if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
+          predictor_index = candidate.metadata[:predictor_index] || 0
+          instructions_hash[predictor_index] = candidate.instruction
+        end
+        record = {
+          candidate_id: candidate.config_id,
+          candidate_type: candidate.type.serialize,
+          score: score,
+          evaluation_type: evaluation_type,
+          timestamp: Time.now.iso8601,
+          metadata: deep_dup(candidate.metadata)
+        }
+        primary_instruction = instructions_hash[0] || candidate.instruction
+        record[:instruction] = primary_instruction if primary_instruction && !primary_instruction.empty?
+        record[:instructions] = instructions_hash unless instructions_hash.empty?
+        param_score_dict[candidate.config_id] << record
+      end
+      sig do
+        params(
+          fully_evaled_param_combos: T::Hash[String, T::Hash[Symbol, T.untyped]],
+          candidate: EvaluatedCandidate,
+          score: Float,
+          instructions: T.nilable(T::Hash[Integer, String])
+        ).void
+      end
+      def update_fully_evaled_param_combos(fully_evaled_param_combos, candidate, score, instructions: nil)
+        existing = fully_evaled_param_combos[candidate.config_id]
+        if existing.nil? || score > existing[:score]
+          instructions_hash = instructions || {}
+          if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
+            predictor_index = candidate.metadata[:predictor_index] || 0
+            instructions_hash[predictor_index] = candidate.instruction
+          end
+          fully_evaled_param_combos[candidate.config_id] = {
+            candidate_id: candidate.config_id,
+            candidate_type: candidate.type.serialize,
+            score: score,
+            metadata: deep_dup(candidate.metadata),
+            updated_at: Time.now.iso8601
+          }
+          unless instructions_hash.empty?
+            fully_evaled_param_combos[candidate.config_id][:instructions] = instructions_hash
+            fully_evaled_param_combos[candidate.config_id][:instruction] = instructions_hash[0] || candidate.instruction
+          end
+        end
+      end
+      sig { params(trial_logs: T.nilable(T::Hash[Integer, T::Hash[Symbol, T.untyped]])).returns(T::Hash[Integer, T::Hash[Symbol, T.untyped]]) }
+      def serialize_trial_logs(trial_logs)
+        return {} unless trial_logs
+        allowed_keys = [
+          :candidate_id,
+          :candidate_type,
+          :instruction_preview,
+          :instruction,
+          :instructions,
+          :few_shot_count,
+          :metadata,
+          :evaluation_type,
+          :batch_size,
+          :score,
+          :status,
+          :error,
+          :started_at,
+          :completed_at,
+          :total_eval_calls
+        ]
+        trial_logs.transform_values do |entry|
+          entry.each_with_object({}) do |(key, value), memo|
+            memo[key] = value if allowed_keys.include?(key)
+          end
+        end
+      end
+      sig { params(param_score_dict: T.nilable(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]])).returns(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]]) }
+      def serialize_param_score_dict(param_score_dict)
+        return {} unless param_score_dict
+        allowed_keys = [:candidate_id, :candidate_type, :score, :evaluation_type, :timestamp, :metadata, :instruction, :instructions]
+        param_score_dict.transform_values do |records|
+          records.map do |record|
+            record.each_with_object({}) do |(key, value), memo|
+              memo[key] = value if allowed_keys.include?(key)
+            end
+          end
+        end
+      end
+      sig { params(fully_evaled_param_combos: T.nilable(T::Hash[String, T::Hash[Symbol, T.untyped]])).returns(T::Hash[String, T::Hash[Symbol, T.untyped]]) }
+      def serialize_fully_evaled_param_combos(fully_evaled_param_combos)
+        return {} unless fully_evaled_param_combos
+        allowed_keys = [:candidate_id, :candidate_type, :score, :metadata, :updated_at, :instruction, :instructions]
+        fully_evaled_param_combos.transform_values do |record|
+          record.each_with_object({}) do |(key, value), memo|
+            memo[key] = value if allowed_keys.include?(key)
+          end
+        end
+      end
+      sig { params(value: T.untyped).returns(T.untyped) }
+      def deep_dup(value)
+        case value
+        when Hash
+          value.each_with_object({}) { |(k, v), memo| memo[k] = deep_dup(v) }
+        when Array
+          value.map { |element| deep_dup(element) }
+        else
+          value
+        end
+      end
       # Helper methods
+      sig { returns(String) }
+      def optimization_strategy_name
+        strategy = config.optimization_strategy
+        return strategy.serialize if strategy.respond_to?(:serialize)
+        strategy.to_s
+      end
       sig { params(program: T.untyped).returns(T.nilable(String)) }
       def extract_current_instruction(program)
         if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
@@ -864,6 +1371,23 @@ module DSPy
         end
       end
+      sig { params(program: T.untyped).returns(T::Hash[Integer, String]) }
+      def extract_program_instructions(program)
+        instructions = {}
+        if program.respond_to?(:predictors)
+          program.predictors.each_with_index do |predictor, index|
+            if predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:instruction)
+              value = predictor.prompt.instruction
+              instructions[index] = value if value
+            end
+          end
+        else
+          fallback_instruction = extract_current_instruction(program)
+          instructions[0] = fallback_instruction if fallback_instruction
+        end
+        instructions
+      end
       # Returns program.signature_class when the program exposes it, otherwise nil.
       sig { params(program: T.untyped).returns(T.nilable(T.class_of(DSPy::Signature))) }
       def extract_signature_class(program)
         return nil unless program.respond_to?(:signature_class)
         program.signature_class
       end
@@ -896,4 +1420,4 @@ module DSPy
       end
     end
   end
-end
+end