RubyGems - dspy - Versions diffs - 0.28.2 → 0.29.0 - Mend

dspy 0.28.2 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

checksums.yaml +4 -4
data/README.md +2 -3
data/lib/dspy/code_act.rb +14 -1
data/lib/dspy/datasets/ade.rb +90 -0
data/lib/dspy/datasets.rb +8 -0
data/lib/dspy/lm.rb +4 -8
data/lib/dspy/mixins/struct_builder.rb +17 -25
data/lib/dspy/module.rb +12 -1
data/lib/dspy/observability/async_span_processor.rb +67 -93
data/lib/dspy/observability.rb +43 -1
data/lib/dspy/predict.rb +10 -0
data/lib/dspy/propose/dataset_summary_generator.rb +36 -3
data/lib/dspy/propose/grounded_proposer.rb +118 -11
data/lib/dspy/re_act.rb +13 -0
data/lib/dspy/reflection_lm.rb +36 -0
data/lib/dspy/teleprompt/gepa.rb +448 -2803
data/lib/dspy/teleprompt/mipro_v2.rb +564 -65
data/lib/dspy/teleprompt/utils.rb +8 -3
data/lib/dspy/version.rb +2 -2
data/lib/dspy.rb +3 -2
data/lib/gepa/api.rb +61 -0
data/lib/gepa/core/engine.rb +226 -0
data/lib/gepa/core/evaluation_batch.rb +26 -0
data/lib/gepa/core/result.rb +92 -0
data/lib/gepa/core/state.rb +231 -0
data/lib/gepa/logging/experiment_tracker.rb +54 -0
data/lib/gepa/logging/logger.rb +57 -0
data/lib/gepa/logging.rb +9 -0
data/lib/gepa/proposer/base.rb +27 -0
data/lib/gepa/proposer/merge_proposer.rb +424 -0
data/lib/gepa/proposer/reflective_mutation/base.rb +48 -0
data/lib/gepa/proposer/reflective_mutation/reflective_mutation.rb +188 -0
data/lib/gepa/strategies/batch_sampler.rb +91 -0
data/lib/gepa/strategies/candidate_selector.rb +97 -0
data/lib/gepa/strategies/component_selector.rb +57 -0
data/lib/gepa/strategies/instruction_proposal.rb +120 -0
data/lib/gepa/telemetry.rb +122 -0
data/lib/gepa/utils/pareto.rb +119 -0
data/lib/gepa.rb +21 -0
metadata +42 -4
data/lib/dspy/teleprompt/simple_optimizer.rb +0 -503

data/lib/dspy/teleprompt/mipro_v2.rb CHANGED Viewed

@@ -1,7 +1,10 @@
 # frozen_string_literal: true
 require 'digest'
+require 'time'
+require 'concurrent-ruby'
 require 'sorbet-runtime'
+require 'securerandom'
 require_relative 'teleprompter'
 require_relative 'utils'
 require_relative '../propose/grounded_proposer'
@@ -124,6 +127,7 @@ module DSPy
       setting :track_diversity, default: true
       setting :max_errors, default: 3
       setting :num_threads, default: 1
+      setting :minibatch_size, default: nil
       # Class-level configuration method - sets defaults for new instances
       def self.configure(&block)
@@ -265,6 +269,7 @@ module DSPy
         @proposer = DSPy::Propose::GroundedProposer.new(config: DSPy::Propose::GroundedProposer::Config.new)
         @optimization_trace = []
         @evaluated_candidates = []
+        @trial_history = {}
       end
       # Main MIPROv2 optimization method
@@ -282,7 +287,7 @@ module DSPy
           trainset_size: trainset.size,
           valset_size: valset&.size || 0,
           num_trials: config.num_trials,
-          optimization_strategy: config.optimization_strategy,
+          optimization_strategy: optimization_strategy_name,
           mode: infer_auto_mode
         }) do
           # Convert examples to typed format
@@ -331,6 +336,8 @@ module DSPy
             proposal_result
           )
+          @trial_history = optimization_result[:trial_logs] || {}
           save_results(final_result)
           final_result
         end
@@ -368,10 +375,6 @@ module DSPy
         # Flatten demo sets from first predictor and take first 5 examples
         few_shot_examples = demo_candidates[0]&.flatten&.take(5) || []
-        # Get signature class from program
-        signature_class = extract_signature_class(program)
-        raise ArgumentError, "Cannot extract signature class from program" unless signature_class
         # Re-initialize proposer with program and trainset for awareness features
         # This enables program_aware and use_dataset_summary flags to work correctly
         proposer_config = DSPy::Propose::GroundedProposer::Config.new
@@ -383,11 +386,12 @@ module DSPy
           trainset: trainset
         )
-        @proposer.propose_instructions(
-          signature_class,
-          trainset,
-          few_shot_examples: few_shot_examples,
-          current_instruction: current_instruction
+        @proposer.propose_instructions_for_program(
+          trainset: trainset,
+          program: program,
+          demo_candidates: demo_candidates,
+          trial_logs: @trial_history,
+          num_instruction_candidates: config.num_instruction_candidates
         )
       end
@@ -406,12 +410,18 @@ module DSPy
         # Initialize optimization state
         optimization_state = initialize_optimization_state(candidates)
+        # Initialize trial tracking structures
+        trial_logs = {}
+        param_score_dict = Hash.new { |hash, key| hash[key] = [] }
+        fully_evaled_param_combos = {}
+        total_eval_calls = 0
         # Run optimization trials
         trials_completed = 0
         best_score = 0.0
         best_candidate = nil
-        best_program = nil
+        best_program = program
         best_evaluation_result = nil
         config.num_trials.times do |trial_idx|
@@ -419,6 +429,14 @@ module DSPy
           # Select next candidate based on optimization strategy
           candidate = select_next_candidate(candidates, optimization_state, trial_idx)
+          batch_size = evaluation_set.size
+          trial_logs[trials_completed] = create_trial_log_entry(
+            trial_number: trials_completed,
+            candidate: candidate,
+            evaluation_type: :full,
+            batch_size: batch_size
+          )
           emit_event('trial_start', {
             trial_number: trials_completed,
@@ -430,12 +448,30 @@ module DSPy
           begin
             # Evaluate candidate
             score, modified_program, evaluation_result = evaluate_candidate(program, candidate, evaluation_set)
+            total_eval_calls += batch_size
+            instructions_snapshot = extract_program_instructions(modified_program)
+            trial_logs[trials_completed][:instructions] = instructions_snapshot unless instructions_snapshot.empty?
+            trial_logs[trials_completed][:instruction] = instructions_snapshot[0] if instructions_snapshot.key?(0)
             # Update optimization state
             update_optimization_state(optimization_state, candidate, score)
+            record_param_score(
+              param_score_dict,
+              candidate,
+              score,
+              evaluation_type: :full,
+              instructions: instructions_snapshot
+            )
+            update_fully_evaled_param_combos(
+              fully_evaled_param_combos,
+              candidate,
+              score,
+              instructions: instructions_snapshot
+            )
             # Track best result
-            is_best = score > best_score
+            is_best = best_candidate.nil? || score > best_score
             if is_best
               best_score = score
               best_candidate = candidate
@@ -443,6 +479,15 @@ module DSPy
               best_evaluation_result = evaluation_result
             end
+            finalize_trial_log_entry(
+              trial_logs,
+              trials_completed,
+              score: score,
+              evaluation_type: :full,
+              batch_size: batch_size,
+              total_eval_calls: total_eval_calls
+            )
             emit_event('trial_complete', {
               trial_number: trials_completed,
               score: score,
@@ -457,6 +502,16 @@ module DSPy
             end
           rescue => error
+            finalize_trial_log_entry(
+              trial_logs,
+              trials_completed,
+              score: nil,
+              evaluation_type: :full,
+              batch_size: batch_size,
+              total_eval_calls: total_eval_calls,
+              error: error.message
+            )
             emit_event('trial_error', {
               trial_number: trials_completed,
               error: error.message,
@@ -474,7 +529,11 @@ module DSPy
           best_evaluation_result: best_evaluation_result,
           trials_completed: trials_completed,
           optimization_state: optimization_state,
-          evaluated_candidates: @evaluated_candidates
+          evaluated_candidates: @evaluated_candidates,
+          trial_logs: trial_logs,
+          param_score_dict: param_score_dict,
+          fully_evaled_param_combos: fully_evaled_param_combos,
+          total_eval_calls: total_eval_calls
         }
       end
@@ -488,61 +547,172 @@ module DSPy
       def generate_candidate_configurations(proposal_result, demo_candidates)
         candidates = []
+        predictor_instruction_map = if proposal_result.respond_to?(:predictor_instructions) && proposal_result.predictor_instructions.any?
+          proposal_result.predictor_instructions
+        else
+          { 0 => proposal_result.candidate_instructions }
+        end
+        instruction_maps = build_instruction_maps(predictor_instruction_map)
+        demo_maps = build_demo_maps(demo_candidates)
         # Base configuration (no modifications)
         candidates << EvaluatedCandidate.new(
           instruction: "",
           few_shot_examples: [],
           type: CandidateType::Baseline,
-          metadata: {},
+          metadata: {
+            instructions_map: {},
+            demos_map: {}
+          },
           config_id: SecureRandom.hex(6)
         )
-        # Instruction-only candidates
-        proposal_result.candidate_instructions.each_with_index do |instruction, idx|
+        instruction_maps.each_with_index do |instruction_map, combo_idx|
+          primary_instruction = instruction_map[0] || instruction_map.values.first || ""
           candidates << EvaluatedCandidate.new(
-            instruction: instruction,
+            instruction: primary_instruction,
             few_shot_examples: [],
             type: CandidateType::InstructionOnly,
-            metadata: { proposal_rank: idx },
+            metadata: {
+              proposal_rank: combo_idx,
+              instructions_map: duplicate_instruction_map(instruction_map),
+              demos_map: {}
+            },
             config_id: SecureRandom.hex(6)
           )
         end
-        # Few-shot only candidates
-        # Extract demo sets from first predictor (predictor index 0)
-        demo_sets = demo_candidates[0] || []
-        demo_sets.each_with_index do |demo_set, idx|
+        demo_maps.each_with_index do |demo_map, idx|
+          next if demo_map.empty?
+          flattened_examples = demo_map.values.flatten
           candidates << EvaluatedCandidate.new(
             instruction: "",
-            few_shot_examples: demo_set,
+            few_shot_examples: flattened_examples,
             type: CandidateType::FewShotOnly,
-            metadata: { bootstrap_rank: idx },
+            metadata: {
+              bootstrap_rank: idx,
+              instructions_map: {},
+              demos_map: duplicate_demo_map(demo_map)
+            },
             config_id: SecureRandom.hex(6)
           )
         end
         # Combined candidates (instruction + few-shot)
-        top_instructions = proposal_result.candidate_instructions.take(3)
-        top_bootstrap_sets = demo_sets.take(3)
-        top_instructions.each_with_index do |instruction, i_idx|
-          top_bootstrap_sets.each_with_index do |candidate_set, b_idx|
+        instruction_maps.each_with_index do |instruction_map, combo_idx|
+          primary_instruction = instruction_map[0] || instruction_map.values.first || ""
+          demo_maps.first(3).each_with_index do |demo_map, demo_idx|
+            next if demo_map.empty?
+            flattened_examples = demo_map.values.flatten
             candidates << EvaluatedCandidate.new(
-              instruction: instruction,
-              few_shot_examples: candidate_set,
+              instruction: primary_instruction,
+              few_shot_examples: flattened_examples,
               type: CandidateType::Combined,
-              metadata: {
-                instruction_rank: i_idx,
-                bootstrap_rank: b_idx
+              metadata: {
+                instruction_rank: combo_idx,
+                bootstrap_rank: demo_idx,
+                instructions_map: duplicate_instruction_map(instruction_map),
+                demos_map: duplicate_demo_map(demo_map)
               },
               config_id: SecureRandom.hex(6)
             )
           end
         end
         candidates
       end
+      sig { params(predictor_instruction_map: T::Hash[Integer, T::Array[String]]).returns(T::Array[T::Hash[Integer, String]]) }
+      def build_instruction_maps(predictor_instruction_map)
+        return [{}] if predictor_instruction_map.nil? || predictor_instruction_map.empty?
+        normalized = predictor_instruction_map.each_with_object({}) do |(index, instructions), memo|
+          next if instructions.nil? || instructions.empty?
+          memo[index] = instructions.take(3)
+        end
+        return [{}] if normalized.empty?
+        cartesian_product(normalized)
+      end
+      sig do
+        params(demo_candidates: T::Hash[Integer, T::Array[T::Array[DSPy::FewShotExample]]]).returns(T::Array[T::Hash[Integer, T::Array[DSPy::FewShotExample]]])
+      end
+      def build_demo_maps(demo_candidates)
+        return [{}] if demo_candidates.nil? || demo_candidates.empty?
+        normalized = demo_candidates.each_with_object({}) do |(index, sets), memo|
+          next if sets.nil? || sets.empty?
+          memo[index] = sets.take(3)
+        end
+        return [{}] if normalized.empty?
+        cartesian_product(normalized)
+      end
+      sig do
+        params(options_hash: T::Hash[Integer, T::Array[T.untyped]]).returns(T::Array[T::Hash[Integer, T.untyped]])
+      end
+      def cartesian_product(options_hash)
+        options_hash.sort_by { |index, _| index }.reduce([{}]) do |acc, (index, values)|
+          next acc if values.nil? || values.empty?
+          acc.flat_map do |existing|
+            values.map do |value|
+              existing.merge(index => value)
+            end
+          end
+        end
+      end
+      sig { params(instruction_map: T::Hash[Integer, String]).returns(T::Hash[Integer, String]) }
+      def duplicate_instruction_map(instruction_map)
+        instruction_map.each_with_object({}) do |(index, instruction), memo|
+          memo[index] = instruction.is_a?(String) ? instruction.dup : instruction
+        end
+      end
+      sig do
+        params(demo_map: T::Hash[Integer, T::Array[DSPy::FewShotExample]]).returns(T::Hash[Integer, T::Array[DSPy::FewShotExample]])
+      end
+      def duplicate_demo_map(demo_map)
+        demo_map.each_with_object({}) do |(index, demos), memo|
+          next if demos.nil?
+          memo[index] = demos.map { |demo| demo }
+        end
+      end
+      sig { params(examples: T::Array[T.untyped]).returns(T::Array[DSPy::FewShotExample]) }
+      def normalize_few_shot_examples(examples)
+        examples.map do |example|
+          if example.is_a?(DSPy::FewShotExample)
+            example
+          elsif example.is_a?(DSPy::Example)
+            DSPy::FewShotExample.new(
+              input: example.input_values,
+              output: example.expected_values,
+              reasoning: extract_reasoning_from_example(example)
+            )
+          else
+            example
+          end
+        end
+      end
+      sig { params(predictor: T.untyped, examples: T::Array[DSPy::FewShotExample]).void }
+      def assign_predictor_examples(predictor, examples)
+        predictor.demos = examples if predictor.respond_to?(:demos=)
+        return unless predictor.respond_to?(:prompt)
+        cloned_examples = examples.map { |ex| ex }
+        predictor.prompt.instance_variable_set(:@few_shot_examples, cloned_examples.freeze)
+      end
       # Initialize optimization state for candidate selection
       sig { params(candidates: T::Array[EvaluatedCandidate]).returns(T::Hash[Symbol, T.untyped]) }
       def initialize_optimization_state(candidates)
@@ -722,7 +892,11 @@ module DSPy
         modified_program = apply_candidate_configuration(program, candidate)
         # Evaluate modified program
-        evaluation_result = evaluate_program(modified_program, evaluation_set)
+        evaluation_result = if use_concurrent_evaluation?(evaluation_set)
+          evaluate_candidate_concurrently(modified_program, evaluation_set)
+        else
+          evaluate_program(modified_program, evaluation_set)
+        end
         # Store evaluation details
         @evaluated_candidates << candidate
@@ -730,32 +904,131 @@ module DSPy
         [evaluation_result.pass_rate, modified_program, evaluation_result]
       end
+      sig { params(evaluation_set: T::Array[DSPy::Example]).returns(T::Boolean) }
+      def use_concurrent_evaluation?(evaluation_set)
+        minibatch_size = config.minibatch_size
+        return false unless minibatch_size&.positive?
+        return false unless config.num_threads && config.num_threads > 1
+        evaluation_set.size > minibatch_size
+      end
+      sig do
+        params(
+          modified_program: T.untyped,
+          evaluation_set: T::Array[DSPy::Example]
+        ).returns(DSPy::Evaluate::BatchEvaluationResult)
+      end
+      def evaluate_candidate_concurrently(modified_program, evaluation_set)
+        chunk_size = T.must(config.minibatch_size)
+        chunks = evaluation_set.each_slice(chunk_size).map(&:dup)
+        return evaluate_program(modified_program, evaluation_set) if chunks.size <= 1
+        pool_size = [config.num_threads, chunks.size].min
+        pool_size = 1 if pool_size <= 0
+        executor = Concurrent::FixedThreadPool.new(pool_size)
+        futures = chunks.map do |chunk|
+          Concurrent::Promises.future_on(executor) do
+            evaluate_program(modified_program, chunk)
+          end
+        end
+        results = futures.map(&:value!)
+        combine_batch_results(results)
+      ensure
+        if executor
+          executor.shutdown
+          executor.wait_for_termination
+        end
+      end
+      sig do
+        params(batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult]).returns(DSPy::Evaluate::BatchEvaluationResult)
+      end
+      def combine_batch_results(batch_results)
+        return DSPy::Evaluate::BatchEvaluationResult.new(results: [], aggregated_metrics: {}) if batch_results.empty?
+        combined_results = batch_results.flat_map(&:results)
+        total_examples = batch_results.sum(&:total_examples)
+        aggregated_metrics = merge_aggregated_metrics(batch_results, total_examples)
+        DSPy::Evaluate::BatchEvaluationResult.new(
+          results: combined_results,
+          aggregated_metrics: aggregated_metrics
+        )
+      end
+      sig do
+        params(
+          batch_results: T::Array[DSPy::Evaluate::BatchEvaluationResult],
+          total_examples: Integer
+        ).returns(T::Hash[Symbol, T.untyped])
+      end
+      def merge_aggregated_metrics(batch_results, total_examples)
+        return {} if total_examples.zero?
+        keys = batch_results.flat_map { |res| res.aggregated_metrics.keys }.uniq
+        keys.each_with_object({}) do |key, memo|
+          numeric_weight = 0.0
+          numeric_sum = 0.0
+          fallback_value = nil
+          batch_results.each do |res|
+            value = res.aggregated_metrics[key]
+            next if value.nil?
+            if value.is_a?(Numeric)
+              numeric_sum += value.to_f * res.total_examples
+              numeric_weight += res.total_examples
+            else
+              fallback_value = value
+            end
+          end
+          if numeric_weight.positive?
+            memo[key] = numeric_sum / numeric_weight
+          elsif fallback_value
+            memo[key] = fallback_value
+          end
+        end
+      end
       # Apply candidate configuration to program
       sig { params(program: T.untyped, candidate: EvaluatedCandidate).returns(T.untyped) }
       def apply_candidate_configuration(program, candidate)
+        instructions_map = candidate.metadata[:instructions_map] || {}
+        demos_map = candidate.metadata[:demos_map] || {}
         modified_program = program
-        # Apply instruction if provided
-        if !candidate.instruction.empty? && program.respond_to?(:with_instruction)
-          modified_program = modified_program.with_instruction(candidate.instruction)
-        end
-        # Apply few-shot examples if provided
-        if candidate.few_shot_examples.any? && program.respond_to?(:with_examples)
-          few_shot_examples = candidate.few_shot_examples.map do |example|
-            # If already a FewShotExample, use it directly
-            if example.is_a?(DSPy::FewShotExample)
-              example
-            else
-              # Convert from DSPy::Example
-              DSPy::FewShotExample.new(
-                input: example.input_values,
-                output: example.expected_values,
-                reasoning: extract_reasoning_from_example(example)
-              )
+        if modified_program.respond_to?(:predictors) && (instructions_map.any? || demos_map.any?)
+          modified_program = modified_program.clone
+          modified_program.predictors.each_with_index do |predictor, idx|
+            if instructions_map.key?(idx)
+              signature = Utils.get_signature(predictor)
+              updated_signature = signature.with_instructions(instructions_map[idx])
+              Utils.set_signature(predictor, updated_signature)
+            end
+            if demos_map.key?(idx)
+              normalized_examples = normalize_few_shot_examples(demos_map[idx])
+              assign_predictor_examples(predictor, normalized_examples)
             end
           end
-          modified_program = modified_program.with_examples(few_shot_examples)
+        end
+        # Apply instruction if provided (top-level programs still respect with_instruction)
+        if !candidate.instruction.empty? && modified_program.respond_to?(:with_instruction)
+          modified_program = modified_program.with_instruction(candidate.instruction)
+        end
+        should_apply_global_examples = candidate.few_shot_examples.any? &&
+          modified_program.respond_to?(:with_examples) &&
+          (demos_map.empty? || !modified_program.respond_to?(:predictors))
+        if should_apply_global_examples
+          normalized_few_shot = normalize_few_shot_examples(candidate.few_shot_examples)
+          modified_program = modified_program.with_examples(normalized_few_shot)
         end
         modified_program
@@ -824,14 +1097,16 @@ module DSPy
         history = {
           total_trials: optimization_result[:trials_completed],
-          optimization_strategy: config.optimization_strategy,
+          optimization_strategy: optimization_strategy_name,
           early_stopped: optimization_result[:trials_completed] < config.num_trials,
-          score_history: optimization_result[:optimization_state][:best_score_history]
+          score_history: optimization_result[:optimization_state][:best_score_history],
+          total_eval_calls: optimization_result[:total_eval_calls]
         }
         metadata = {
           optimizer: "MIPROv2",
           auto_mode: infer_auto_mode,
+          optimization_strategy: optimization_strategy_name,
           best_instruction: best_candidate&.instruction || "",
           best_few_shot_count: best_candidate&.few_shot_examples&.size || 0,
           best_candidate_type: best_candidate&.type&.serialize || "unknown",
@@ -839,12 +1114,21 @@ module DSPy
         }
         # Create bootstrap statistics from demo_candidates
-        demo_sets = demo_candidates[0] || []
+        num_predictors = demo_candidates.keys.size
+        sets_per_predictor = demo_candidates.values.map(&:size)
+        all_demo_sets = demo_candidates.values.flat_map { |sets| sets }
         bootstrap_statistics = {
-          num_predictors: demo_candidates.keys.size,
-          demo_sets_per_predictor: demo_sets.size,
-          avg_demos_per_set: demo_sets.empty? ? 0 : demo_sets.map(&:size).sum.to_f / demo_sets.size
+          num_predictors: num_predictors,
+          demo_sets_per_predictor: sets_per_predictor.max || 0,
+          avg_demos_per_set: all_demo_sets.empty? ? 0 : all_demo_sets.map(&:size).sum.to_f / all_demo_sets.size
         }
+        bootstrap_statistics[:per_predictor_demo_counts] = sets_per_predictor if sets_per_predictor.any?
+        optimization_trace = serialize_optimization_trace(optimization_result[:optimization_state])
+        optimization_trace[:trial_logs] = serialize_trial_logs(optimization_result[:trial_logs])
+        optimization_trace[:param_score_dict] = serialize_param_score_dict(optimization_result[:param_score_dict])
+        optimization_trace[:fully_evaled_param_combos] = serialize_fully_evaled_param_combos(optimization_result[:fully_evaled_param_combos])
+        optimization_trace[:total_eval_calls] = optimization_result[:total_eval_calls]
         MIPROv2Result.new(
           optimized_program: best_program,
@@ -854,7 +1138,7 @@ module DSPy
           best_score_value: best_score,
           metadata: metadata,
           evaluated_candidates: @evaluated_candidates,
-          optimization_trace: serialize_optimization_trace(optimization_result[:optimization_state]),
+          optimization_trace: optimization_trace,
           bootstrap_statistics: bootstrap_statistics,
           proposal_statistics: proposal_result.analysis,
           best_evaluation_result: best_evaluation_result
@@ -876,7 +1160,205 @@ module DSPy
         serialized_trace
       end
+      sig do
+        params(
+          trial_number: Integer,
+          candidate: EvaluatedCandidate,
+          evaluation_type: Symbol,
+          batch_size: Integer
+        ).returns(T::Hash[Symbol, T.untyped])
+      end
+      def create_trial_log_entry(trial_number:, candidate:, evaluation_type:, batch_size:)
+        # Preserve interface parity with Python implementation (trial number stored implicitly via hash key)
+        trial_number # no-op to acknowledge parameter usage
+        instructions_map = candidate.metadata[:instructions_map] || {}
+        demos_map = candidate.metadata[:demos_map] || {}
+        entry = {
+          candidate_id: candidate.config_id,
+          candidate_type: candidate.type.serialize,
+          instruction_preview: candidate.instruction.to_s[0, 160],
+          few_shot_count: candidate.few_shot_examples.size,
+          metadata: deep_dup(candidate.metadata),
+          evaluation_type: evaluation_type,
+          batch_size: batch_size,
+          status: :in_progress,
+          started_at: Time.now.iso8601
+        }
+        if instructions_map.any?
+          entry[:instructions] = duplicate_instruction_map(instructions_map)
+          entry[:instruction] = entry[:instructions][0] if entry[:instructions].key?(0)
+        elsif candidate.instruction && !candidate.instruction.empty?
+          predictor_index = candidate.metadata[:predictor_index] || 0
+          entry[:instruction] = candidate.instruction
+          entry[:instructions] = { predictor_index => candidate.instruction }
+        end
+        entry[:few_shot_map] = duplicate_demo_map(demos_map) if demos_map.any?
+        entry
+      end
+      sig do
+        params(
+          trial_logs: T::Hash[Integer, T::Hash[Symbol, T.untyped]],
+          trial_number: Integer,
+          score: T.nilable(Float),
+          evaluation_type: Symbol,
+          batch_size: Integer,
+          total_eval_calls: Integer,
+          error: T.nilable(String)
+        ).void
+      end
+      def finalize_trial_log_entry(trial_logs, trial_number, score:, evaluation_type:, batch_size:, total_eval_calls:, error: nil)
+        entry = trial_logs[trial_number] || {}
+        entry[:score] = score if score
+        entry[:evaluation_type] = evaluation_type
+        entry[:batch_size] = batch_size
+        entry[:total_eval_calls] = total_eval_calls
+        entry[:status] = error ? :error : :completed
+        entry[:error] = error if error
+        entry[:completed_at] = Time.now.iso8601
+        trial_logs[trial_number] = entry
+      end
+      sig do
+        params(
+          param_score_dict: T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]],
+          candidate: EvaluatedCandidate,
+          score: Float,
+          evaluation_type: Symbol,
+          instructions: T.nilable(T::Hash[Integer, String])
+        ).void
+      end
+      def record_param_score(param_score_dict, candidate, score, evaluation_type:, instructions: nil)
+        instructions_hash = instructions || {}
+        if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
+          predictor_index = candidate.metadata[:predictor_index] || 0
+          instructions_hash[predictor_index] = candidate.instruction
+        end
+        record = {
+          candidate_id: candidate.config_id,
+          candidate_type: candidate.type.serialize,
+          score: score,
+          evaluation_type: evaluation_type,
+          timestamp: Time.now.iso8601,
+          metadata: deep_dup(candidate.metadata)
+        }
+        primary_instruction = instructions_hash[0] || candidate.instruction
+        record[:instruction] = primary_instruction if primary_instruction && !primary_instruction.empty?
+        record[:instructions] = instructions_hash unless instructions_hash.empty?
+        param_score_dict[candidate.config_id] << record
+      end
+      sig do
+        params(
+          fully_evaled_param_combos: T::Hash[String, T::Hash[Symbol, T.untyped]],
+          candidate: EvaluatedCandidate,
+          score: Float,
+          instructions: T.nilable(T::Hash[Integer, String])
+        ).void
+      end
+      def update_fully_evaled_param_combos(fully_evaled_param_combos, candidate, score, instructions: nil)
+        existing = fully_evaled_param_combos[candidate.config_id]
+        if existing.nil? || score > existing[:score]
+          instructions_hash = instructions || {}
+          if instructions_hash.empty? && candidate.instruction && !candidate.instruction.empty?
+            predictor_index = candidate.metadata[:predictor_index] || 0
+            instructions_hash[predictor_index] = candidate.instruction
+          end
+          fully_evaled_param_combos[candidate.config_id] = {
+            candidate_id: candidate.config_id,
+            candidate_type: candidate.type.serialize,
+            score: score,
+            metadata: deep_dup(candidate.metadata),
+            updated_at: Time.now.iso8601
+          }
+          unless instructions_hash.empty?
+            fully_evaled_param_combos[candidate.config_id][:instructions] = instructions_hash
+            fully_evaled_param_combos[candidate.config_id][:instruction] = instructions_hash[0] || candidate.instruction
+          end
+        end
+      end
+      sig { params(trial_logs: T.nilable(T::Hash[Integer, T::Hash[Symbol, T.untyped]])).returns(T::Hash[Integer, T::Hash[Symbol, T.untyped]]) }
+      def serialize_trial_logs(trial_logs)
+        return {} unless trial_logs
+        allowed_keys = [
+          :candidate_id,
+          :candidate_type,
+          :instruction_preview,
+          :instruction,
+          :instructions,
+          :few_shot_count,
+          :metadata,
+          :evaluation_type,
+          :batch_size,
+          :score,
+          :status,
+          :error,
+          :started_at,
+          :completed_at,
+          :total_eval_calls
+        ]
+        trial_logs.transform_values do |entry|
+          entry.each_with_object({}) do |(key, value), memo|
+            memo[key] = value if allowed_keys.include?(key)
+          end
+        end
+      end
+      sig { params(param_score_dict: T.nilable(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]])).returns(T::Hash[String, T::Array[T::Hash[Symbol, T.untyped]]]) }
+      def serialize_param_score_dict(param_score_dict)
+        return {} unless param_score_dict
+        allowed_keys = [:candidate_id, :candidate_type, :score, :evaluation_type, :timestamp, :metadata, :instruction, :instructions]
+        param_score_dict.transform_values do |records|
+          records.map do |record|
+            record.each_with_object({}) do |(key, value), memo|
+              memo[key] = value if allowed_keys.include?(key)
+            end
+          end
+        end
+      end
+      sig { params(fully_evaled_param_combos: T.nilable(T::Hash[String, T::Hash[Symbol, T.untyped]])).returns(T::Hash[String, T::Hash[Symbol, T.untyped]]) }
+      def serialize_fully_evaled_param_combos(fully_evaled_param_combos)
+        return {} unless fully_evaled_param_combos
+        allowed_keys = [:candidate_id, :candidate_type, :score, :metadata, :updated_at, :instruction, :instructions]
+        fully_evaled_param_combos.transform_values do |record|
+          record.each_with_object({}) do |(key, value), memo|
+            memo[key] = value if allowed_keys.include?(key)
+          end
+        end
+      end
+      sig { params(value: T.untyped).returns(T.untyped) }
+      def deep_dup(value)
+        case value
+        when Hash
+          value.each_with_object({}) { |(k, v), memo| memo[k] = deep_dup(v) }
+        when Array
+          value.map { |element| deep_dup(element) }
+        else
+          value
+        end
+      end
       # Helper methods
+      sig { returns(String) }
+      def optimization_strategy_name
+        strategy = config.optimization_strategy
+        return strategy.serialize if strategy.respond_to?(:serialize)
+        strategy.to_s
+      end
       sig { params(program: T.untyped).returns(T.nilable(String)) }
       def extract_current_instruction(program)
         if program.respond_to?(:prompt) && program.prompt.respond_to?(:instruction)
@@ -889,6 +1371,23 @@ module DSPy
         end
       end
+      sig { params(program: T.untyped).returns(T::Hash[Integer, String]) }
+      def extract_program_instructions(program)
+        instructions = {}
+        if program.respond_to?(:predictors)
+          program.predictors.each_with_index do |predictor, index|
+            if predictor.respond_to?(:prompt) && predictor.prompt.respond_to?(:instruction)
+              value = predictor.prompt.instruction
+              instructions[index] = value if value
+            end
+          end
+        else
+          fallback_instruction = extract_current_instruction(program)
+          instructions[0] = fallback_instruction if fallback_instruction
+        end
+        instructions
+      end
       sig { params(program: T.untyped).returns(T.nilable(T.class_of(DSPy::Signature))) }
       def extract_signature_class(program)
         program.respond_to?(:signature_class) ? program.signature_class : nil
@@ -921,4 +1420,4 @@ module DSPy
       end
     end
   end
-end
+end