dspy 0.24.2 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,7 +24,7 @@ module DSPy
24
24
  end
25
25
  end
26
26
 
27
- # Enum for crossover operation types
27
+ # Enum for crossover operation types
28
28
  class CrossoverType < T::Enum
29
29
  enums do
30
30
  Uniform = new
@@ -59,6 +59,7 @@ module DSPy
59
59
  metadata: T.nilable(MetadataHash)
60
60
  ).void
61
61
  end
62
+
62
63
  def initialize(trace_id:, event_name:, timestamp:, span_id: nil, attributes: {}, metadata: nil)
63
64
  # Freeze nested structures for true immutability
64
65
  frozen_attributes = attributes.freeze
@@ -236,7 +237,7 @@ module DSPy
236
237
  def summary
237
238
  confidence_pct = (confidence * 100).round
238
239
  mutation_list = suggested_mutations.map(&:to_s).join(', ')
239
-
240
+
240
241
  "#{diagnosis.split('.').first}. " \
241
242
  "Confidence: #{confidence_pct}%. " \
242
243
  "#{improvements.size} improvements suggested. " \
@@ -289,7 +290,7 @@ module DSPy
289
290
  def collect_trace(event_name, event_data)
290
291
  @traces_mutex.synchronize do
291
292
  trace_id = event_data['trace_id'] || event_data[:trace_id] || generate_trace_id
292
-
293
+
293
294
  # Avoid duplicates
294
295
  return if @traces.any? { |t| t.trace_id == trace_id }
295
296
 
@@ -350,7 +351,7 @@ module DSPy
350
351
  collect_trace(name, attrs)
351
352
  end
352
353
 
353
- # Subscribe to module events
354
+ # Subscribe to module events
354
355
  self.class.add_subscription('*.reasoning_complete') do |name, attrs|
355
356
  collect_trace(name, attrs)
356
357
  end
@@ -394,7 +395,7 @@ module DSPy
394
395
  reasoning: 'Cannot provide reflection without execution traces',
395
396
  suggested_mutations: [],
396
397
  metadata: {
397
- reflection_model: @config.reflection_lm,
398
+ reflection_model: @config.reflection_lm&.model,
398
399
  analysis_timestamp: Time.now,
399
400
  trace_count: 0
400
401
  }
@@ -404,7 +405,7 @@ module DSPy
404
405
  patterns = analyze_execution_patterns(traces)
405
406
  improvements = generate_improvement_suggestions(patterns)
406
407
  mutations = suggest_mutations(patterns)
407
-
408
+
408
409
  # For Phase 1, we generate a simple rule-based analysis
409
410
  # Future phases will use LLM-based reflection
410
411
  diagnosis = generate_diagnosis(patterns)
@@ -419,7 +420,7 @@ module DSPy
419
420
  reasoning: reasoning,
420
421
  suggested_mutations: mutations,
421
422
  metadata: {
422
- reflection_model: @config.reflection_lm,
423
+ reflection_model: @config.reflection_lm&.model,
423
424
  analysis_timestamp: Time.now,
424
425
  trace_count: traces.size,
425
426
  token_usage: 0 # Phase 1 doesn't use actual LLM reflection
@@ -485,326 +486,17 @@ module DSPy
485
486
  mutations << :combine if llm_count > 2
486
487
  mutations << :rewrite if llm_count == 1
487
488
  mutations << :rephrase if mutations.empty?
488
-
489
- mutations.uniq
490
- end
491
-
492
- private
493
-
494
- # Generate unique reflection ID
495
- sig { returns(String) }
496
- def generate_reflection_id
497
- "reflection-#{SecureRandom.hex(4)}"
498
- end
499
-
500
- # Generate diagnosis text
501
- sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(String) }
502
- def generate_diagnosis(patterns)
503
- if patterns[:total_tokens] > 400
504
- 'High token usage indicates potential inefficiency in prompt design'
505
- elsif patterns[:llm_traces_count] == 0
506
- 'No LLM interactions found - execution may not be working as expected'
507
- elsif patterns[:avg_response_length] < 10
508
- 'Responses are unusually brief which may indicate prompt clarity issues'
509
- else
510
- 'Execution patterns appear normal with room for optimization'
511
- end
512
- end
513
-
514
- # Generate reasoning text
515
- sig { params(patterns: T::Hash[Symbol, T.untyped], traces: T::Array[ExecutionTrace]).returns(String) }
516
- def generate_reasoning(patterns, traces)
517
- reasoning_parts = []
518
-
519
- reasoning_parts << "Analyzed #{traces.size} execution traces"
520
- reasoning_parts << "#{patterns[:llm_traces_count]} LLM interactions"
521
- reasoning_parts << "#{patterns[:module_traces_count]} module operations"
522
- reasoning_parts << "Total token usage: #{patterns[:total_tokens]}"
523
-
524
- reasoning_parts.join('. ') + '.'
525
- end
526
489
 
527
- # Calculate confidence based on patterns
528
- sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(Float) }
529
- def calculate_confidence(patterns)
530
- base_confidence = 0.7
531
-
532
- # More traces = higher confidence
533
- trace_bonus = [patterns[:llm_traces_count] + patterns[:module_traces_count], 10].min * 0.02
534
-
535
- # Reasonable token usage = higher confidence
536
- token_penalty = patterns[:total_tokens] > 1000 ? -0.1 : 0.0
537
-
538
- [(base_confidence + trace_bonus + token_penalty), 1.0].min
539
- end
540
-
541
- # Calculate average response length from LLM traces
542
- sig { params(llm_traces: T::Array[ExecutionTrace]).returns(Integer) }
543
- def calculate_avg_response_length(llm_traces)
544
- return 0 if llm_traces.empty?
545
-
546
- total_length = llm_traces.sum do |trace|
547
- response = trace.response_text
548
- response ? response.length : 0
549
- end
550
-
551
- total_length / llm_traces.size
490
+ mutations.uniq
552
491
  end
553
492
 
554
- # Calculate timespan of traces
555
- sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
556
- def calculate_timespan(traces)
557
- return 0.0 if traces.size < 2
558
-
559
- timestamps = traces.map(&:timestamp).sort
560
- (timestamps.last - timestamps.first).to_f
561
- end
562
-
563
- # LLM-based reflection methods for Phase 2
564
-
565
493
  public
566
-
567
- # Perform LLM-based reflection on execution traces using DSPy::Predict
568
- sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
569
- def reflect_with_llm(traces)
570
- return reflect_on_traces(traces) if traces.empty?
571
-
572
- begin
573
- # Use DSPy::Predict for analysis instead of raw prompts
574
- prediction = analyze_traces_with_dspy(traces)
575
- convert_prediction_to_reflection_result(prediction, traces)
576
- rescue => e
577
- # Fallback to rule-based analysis on LLM failure
578
- fallback_result = reflect_on_traces(traces)
579
- fallback_result.class.new(
580
- trace_id: fallback_result.trace_id,
581
- diagnosis: "LLM reflection failed (#{e.message}), using fallback analysis: #{fallback_result.diagnosis}",
582
- improvements: fallback_result.improvements,
583
- confidence: [fallback_result.confidence * 0.5, 0.5].min,
584
- reasoning: "Fallback to rule-based analysis after LLM error: #{fallback_result.reasoning}",
585
- suggested_mutations: fallback_result.suggested_mutations,
586
- metadata: fallback_result.metadata.merge(
587
- llm_error: e.message,
588
- fallback_used: true
589
- )
590
- )
591
- end
592
- end
593
-
594
- # Generate structured reflection prompt for LLM (public API)
595
- sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
596
- def generate_reflection_prompt(traces)
597
- if traces.empty?
598
- return <<~PROMPT
599
- You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
600
-
601
- **Task**: Analyze execution patterns and provide optimization recommendations.
602
-
603
- **Context**: No execution traces available.
604
-
605
- Please provide your analysis in the following JSON format:
606
- {
607
- "diagnosis": "Brief description of what you observed",
608
- "improvements": ["List of actionable improvement suggestions"],
609
- "confidence": 0.0,
610
- "reasoning": "Your reasoning process",
611
- "suggested_mutations": ["expand", "rewrite", "simplify", "combine", "rephrase"],
612
- "insights": {
613
- "pattern_detected": "no_data",
614
- "optimization_opportunity": "data_collection"
615
- }
616
- }
617
- PROMPT
618
- end
619
-
620
- summary = trace_summary_for_reflection(traces)
621
- insights = extract_optimization_insights(traces)
622
-
623
- <<~PROMPT
624
- You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
625
-
626
- **Task**: Analyze execution patterns and provide optimization recommendations for prompt evolution.
627
-
628
- **Execution Summary**:
629
- #{summary}
630
-
631
- **Optimization Context**:
632
- - This is part of a genetic algorithm for prompt optimization
633
- - Available mutation types: rewrite, expand, simplify, combine, rephrase
634
- - Goal is to improve prompt effectiveness through iterative evolution
635
- - Focus on actionable insights that can guide mutation and crossover operations
636
-
637
- **Key Optimization Insights**:
638
- #{insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")}
639
-
640
- **Sample Traces**:
641
- #{format_traces_for_prompt(traces.take(3))}
642
-
643
- Please analyze these execution patterns and provide optimization recommendations in the following JSON format:
644
- {
645
- "diagnosis": "Brief description of execution patterns and issues identified",
646
- "improvements": ["List of 2-4 specific, actionable improvement suggestions"],
647
- "confidence": 0.85,
648
- "reasoning": "Your detailed reasoning process for the analysis",
649
- "suggested_mutations": ["List of 2-3 mutation types that would be most beneficial"],
650
- "insights": {
651
- "pattern_detected": "primary_pattern_identified",
652
- "optimization_opportunity": "key_area_for_improvement"
653
- }
654
- }
655
-
656
- Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
657
- PROMPT
658
- end
659
-
660
- # Parse LLM reflection response into ReflectionResult (public API)
661
- sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
662
- def parse_llm_reflection(response_text, original_traces)
663
- reflection_id = generate_reflection_id
664
-
665
- begin
666
- parsed = JSON.parse(response_text)
667
-
668
- # Extract and validate components
669
- diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
670
- improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
671
- confidence = [parsed['confidence'].to_f, 1.0].min
672
- reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
673
-
674
- # Validate and sanitize mutation suggestions
675
- raw_mutations = Array(parsed['suggested_mutations'])
676
- valid_mutations = raw_mutations.filter_map do |mut|
677
- mutation_symbol = mut.to_s.downcase.to_sym
678
- if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
679
- mutation_symbol
680
- end
681
- end.uniq
682
-
683
- # Ensure we have at least one valid mutation suggestion
684
- valid_mutations = [:rewrite] if valid_mutations.empty?
685
-
686
- ReflectionResult.new(
687
- trace_id: reflection_id,
688
- diagnosis: diagnosis,
689
- improvements: improvements,
690
- confidence: confidence,
691
- reasoning: reasoning,
692
- suggested_mutations: valid_mutations,
693
- metadata: {
694
- reflection_model: @config.reflection_lm,
695
- analysis_timestamp: Time.now,
696
- trace_count: original_traces.size,
697
- token_usage: estimate_token_usage(response_text),
698
- llm_based: true,
699
- insights: parsed['insights'] || {}
700
- }
701
- )
702
-
703
- rescue JSON::ParserError => e
704
- # Handle malformed JSON response
705
- ReflectionResult.new(
706
- trace_id: reflection_id,
707
- diagnosis: "LLM reflection JSON parsing error: #{e.message}",
708
- improvements: ['Review prompt structure and LLM response format'],
709
- confidence: 0.3,
710
- reasoning: "Failed to parse LLM reflection response as valid JSON",
711
- suggested_mutations: [:rewrite],
712
- metadata: {
713
- reflection_model: @config.reflection_lm,
714
- analysis_timestamp: Time.now,
715
- trace_count: original_traces.size,
716
- token_usage: 0,
717
- parsing_error: e.message,
718
- raw_response: response_text.length > 500 ? "#{response_text[0..500]}..." : response_text
719
- }
720
- )
721
- end
722
- end
723
-
724
- # Create comprehensive trace summary for reflection (public API)
725
- sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
726
- def trace_summary_for_reflection(traces)
727
- return "No execution traces available" if traces.empty?
728
-
729
- llm_traces = traces.select(&:llm_trace?)
730
- module_traces = traces.select(&:module_trace?)
731
-
732
- total_tokens = llm_traces.sum(&:token_usage)
733
- unique_models = llm_traces.map(&:model_name).compact.uniq
734
- timespan = calculate_timespan(traces)
735
-
736
- avg_response_length = if llm_traces.any?
737
- total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
738
- total_length / llm_traces.size
739
- else
740
- 0
741
- end
742
-
743
- <<~SUMMARY
744
- Total traces: #{traces.size}
745
- LLM interactions: #{llm_traces.size}
746
- Module calls: #{module_traces.size}
747
- Total tokens: #{total_tokens}
748
- Models used: #{unique_models.join(', ')}
749
- Average response length: #{avg_response_length} characters
750
- Execution timespan: #{timespan.round(2)} seconds
751
- SUMMARY
752
- end
753
-
754
- # Extract optimization insights from trace analysis (public API)
755
- sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
756
- def extract_optimization_insights(traces)
757
- llm_traces = traces.select(&:llm_trace?)
758
-
759
- insights = {
760
- token_efficiency: analyze_token_efficiency(llm_traces),
761
- response_quality: analyze_response_quality(llm_traces),
762
- model_consistency: analyze_model_consistency(llm_traces)
763
- }
764
-
765
- insights
766
- end
767
-
768
- # Reflection with optimization context (public API)
769
- sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
770
- def reflection_with_context(traces, context)
771
- base_result = reflect_with_llm(traces)
772
-
773
- # Incorporate context into reasoning
774
- context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
775
- context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
776
-
777
- if context[:current_best_score]
778
- context_reasoning += "Current best score: #{context[:current_best_score]}. "
779
- end
780
-
781
- # Adjust mutation suggestions based on history
782
- adjusted_mutations = adjust_mutations_for_history(
783
- base_result.suggested_mutations,
784
- context[:mutation_history] || [],
785
- context[:recent_performance_trend]
786
- )
787
-
788
- ReflectionResult.new(
789
- trace_id: base_result.trace_id,
790
- diagnosis: base_result.diagnosis,
791
- improvements: base_result.improvements,
792
- confidence: base_result.confidence,
793
- reasoning: context_reasoning + base_result.reasoning,
794
- suggested_mutations: adjusted_mutations,
795
- metadata: base_result.metadata.merge(optimization_context: context)
796
- )
797
- end
798
-
799
- # LLM-based reflection methods for Phase 2
800
-
801
- public
802
-
494
+
803
495
  # Perform LLM-based reflection on execution traces using DSPy::Predict
804
496
  sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
805
497
  def reflect_with_llm(traces)
806
498
  return reflect_on_traces(traces) if traces.empty?
807
-
499
+
808
500
  begin
809
501
  # Use DSPy::Predict for analysis instead of raw prompts
810
502
  prediction = analyze_traces_with_dspy(traces)
@@ -826,7 +518,7 @@ module DSPy
826
518
  )
827
519
  end
828
520
  end
829
-
521
+
830
522
  # Generate structured reflection prompt for LLM (public API)
831
523
  sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
832
524
  def generate_reflection_prompt(traces)
@@ -852,10 +544,10 @@ module DSPy
852
544
  }
853
545
  PROMPT
854
546
  end
855
-
547
+
856
548
  summary = trace_summary_for_reflection(traces)
857
549
  insights = extract_optimization_insights(traces)
858
-
550
+
859
551
  <<~PROMPT
860
552
  You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
861
553
 
@@ -892,21 +584,21 @@ module DSPy
892
584
  Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
893
585
  PROMPT
894
586
  end
895
-
587
+
896
588
  # Parse LLM reflection response into ReflectionResult (public API)
897
589
  sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
898
590
  def parse_llm_reflection(response_text, original_traces)
899
591
  reflection_id = generate_reflection_id
900
-
592
+
901
593
  begin
902
594
  parsed = JSON.parse(response_text)
903
-
595
+
904
596
  # Extract and validate components
905
597
  diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
906
598
  improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
907
599
  confidence = [parsed['confidence'].to_f, 1.0].min
908
600
  reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
909
-
601
+
910
602
  # Validate and sanitize mutation suggestions
911
603
  raw_mutations = Array(parsed['suggested_mutations'])
912
604
  valid_mutations = raw_mutations.filter_map do |mut|
@@ -915,10 +607,10 @@ module DSPy
915
607
  mutation_symbol
916
608
  end
917
609
  end.uniq
918
-
610
+
919
611
  # Ensure we have at least one valid mutation suggestion
920
612
  valid_mutations = [:rewrite] if valid_mutations.empty?
921
-
613
+
922
614
  ReflectionResult.new(
923
615
  trace_id: reflection_id,
924
616
  diagnosis: diagnosis,
@@ -927,7 +619,7 @@ module DSPy
927
619
  reasoning: reasoning,
928
620
  suggested_mutations: valid_mutations,
929
621
  metadata: {
930
- reflection_model: @config.reflection_lm,
622
+ reflection_model: @config.reflection_lm&.model,
931
623
  analysis_timestamp: Time.now,
932
624
  trace_count: original_traces.size,
933
625
  token_usage: estimate_token_usage(response_text),
@@ -935,7 +627,7 @@ module DSPy
935
627
  insights: parsed['insights'] || {}
936
628
  }
937
629
  )
938
-
630
+
939
631
  rescue JSON::ParserError => e
940
632
  # Handle malformed JSON response
941
633
  ReflectionResult.new(
@@ -946,7 +638,7 @@ module DSPy
946
638
  reasoning: "Failed to parse LLM reflection response as valid JSON",
947
639
  suggested_mutations: [:rewrite],
948
640
  metadata: {
949
- reflection_model: @config.reflection_lm,
641
+ reflection_model: @config.reflection_lm&.model,
950
642
  analysis_timestamp: Time.now,
951
643
  trace_count: original_traces.size,
952
644
  token_usage: 0,
@@ -956,26 +648,26 @@ module DSPy
956
648
  )
957
649
  end
958
650
  end
959
-
651
+
960
652
  # Create comprehensive trace summary for reflection (public API)
961
653
  sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
962
654
  def trace_summary_for_reflection(traces)
963
655
  return "No execution traces available" if traces.empty?
964
-
656
+
965
657
  llm_traces = traces.select(&:llm_trace?)
966
658
  module_traces = traces.select(&:module_trace?)
967
-
659
+
968
660
  total_tokens = llm_traces.sum(&:token_usage)
969
661
  unique_models = llm_traces.map(&:model_name).compact.uniq
970
662
  timespan = calculate_timespan(traces)
971
-
663
+
972
664
  avg_response_length = if llm_traces.any?
973
665
  total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
974
666
  total_length / llm_traces.size
975
667
  else
976
668
  0
977
669
  end
978
-
670
+
979
671
  <<~SUMMARY
980
672
  Total traces: #{traces.size}
981
673
  LLM interactions: #{llm_traces.size}
@@ -986,41 +678,41 @@ module DSPy
986
678
  Execution timespan: #{timespan.round(2)} seconds
987
679
  SUMMARY
988
680
  end
989
-
681
+
990
682
  # Extract optimization insights from trace analysis (public API)
991
683
  sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
992
684
  def extract_optimization_insights(traces)
993
685
  llm_traces = traces.select(&:llm_trace?)
994
-
686
+
995
687
  insights = {
996
688
  token_efficiency: analyze_token_efficiency(llm_traces),
997
689
  response_quality: analyze_response_quality(llm_traces),
998
690
  model_consistency: analyze_model_consistency(llm_traces)
999
691
  }
1000
-
692
+
1001
693
  insights
1002
694
  end
1003
-
695
+
1004
696
  # Reflection with optimization context (public API)
1005
697
  sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
1006
698
  def reflection_with_context(traces, context)
1007
699
  base_result = reflect_with_llm(traces)
1008
-
700
+
1009
701
  # Incorporate context into reasoning
1010
702
  context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
1011
703
  context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
1012
-
704
+
1013
705
  if context[:current_best_score]
1014
706
  context_reasoning += "Current best score: #{context[:current_best_score]}. "
1015
707
  end
1016
-
708
+
1017
709
  # Adjust mutation suggestions based on history
1018
710
  adjusted_mutations = adjust_mutations_for_history(
1019
711
  base_result.suggested_mutations,
1020
712
  context[:mutation_history] || [],
1021
713
  context[:recent_performance_trend]
1022
714
  )
1023
-
715
+
1024
716
  ReflectionResult.new(
1025
717
  trace_id: base_result.trace_id,
1026
718
  diagnosis: base_result.diagnosis,
@@ -1031,22 +723,22 @@ module DSPy
1031
723
  metadata: base_result.metadata.merge(optimization_context: context)
1032
724
  )
1033
725
  end
1034
-
726
+
1035
727
  public
1036
-
728
+
1037
729
  # Create signature for trace reflection analysis (public API)
1038
730
  sig { returns(T.class_of(DSPy::Signature)) }
1039
731
  def create_trace_reflection_signature
1040
732
  @trace_reflection_signature ||= Class.new(DSPy::Signature) do
1041
733
  description "Analyze execution traces from GEPA optimization system and provide actionable optimization insights"
1042
-
734
+
1043
735
  input do
1044
736
  const :execution_summary, String, description: "Summary of execution traces and performance patterns"
1045
737
  const :optimization_context, String, description: "Context about the genetic algorithm optimization goals"
1046
- const :key_insights, String, description: "Key insights extracted from trace analysis"
738
+ const :key_insights, String, description: "Key insights extracted from trace analysis"
1047
739
  const :sample_traces, String, description: "Representative execution trace samples"
1048
740
  end
1049
-
741
+
1050
742
  output do
1051
743
  const :diagnosis, String, description: "Brief description of execution patterns and issues identified"
1052
744
  const :improvements, T::Array[String], description: "List of 2-4 specific actionable improvement suggestions"
@@ -1060,35 +752,40 @@ module DSPy
1060
752
  end
1061
753
 
1062
754
  # Perform LLM analysis using DSPy::Predict (public API)
1063
- sig { params(traces: T::Array[ExecutionTrace]).returns(DSPy::Prediction) }
755
+ sig { params(traces: T::Array[ExecutionTrace]).returns(T.untyped) }
1064
756
  def analyze_traces_with_dspy(traces)
757
+ raise ArgumentError, "reflection_lm must be configured on GEPAConfig for LLM-based reflection" unless @config.reflection_lm
758
+
1065
759
  predictor = DSPy::Predict.new(create_trace_reflection_signature)
1066
-
760
+
761
+ # Configure predictor to use reflection-specific LM
762
+ predictor.config.lm = @config.reflection_lm
763
+
1067
764
  # Prepare input data
1068
765
  summary = trace_summary_for_reflection(traces)
1069
766
  insights = extract_optimization_insights(traces)
1070
767
  insights_text = insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")
1071
-
768
+
1072
769
  # Get LLM analysis
1073
- predictor.call(
770
+ T.unsafe(predictor.call(
1074
771
  execution_summary: summary,
1075
772
  optimization_context: "GEPA genetic algorithm for prompt optimization. Available mutations: rewrite, expand, simplify, combine, rephrase. Goal: improve prompt effectiveness through iterative evolution.",
1076
773
  key_insights: insights_text,
1077
774
  sample_traces: format_traces_for_prompt(traces.take(3))
1078
- )
775
+ ))
1079
776
  end
1080
777
 
1081
778
  # Convert DSPy prediction to ReflectionResult (public API)
1082
- sig { params(prediction: DSPy::Prediction, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
779
+ sig { params(prediction: T.untyped, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
1083
780
  def convert_prediction_to_reflection_result(prediction, original_traces)
1084
781
  reflection_id = generate_reflection_id
1085
-
782
+
1086
783
  # Extract and validate prediction results
1087
784
  diagnosis = prediction.diagnosis || 'DSPy reflection analysis'
1088
785
  improvements = Array(prediction.improvements).select { |i| i.is_a?(String) && !i.strip.empty? }
1089
786
  confidence = [[prediction.confidence&.to_f || 0.0, 1.0].min, 0.0].max
1090
787
  reasoning = prediction.reasoning || 'DSPy-based analysis of execution traces'
1091
-
788
+
1092
789
  # Validate mutation suggestions
1093
790
  valid_mutations = Array(prediction.suggested_mutations).filter_map do |mut|
1094
791
  mutation_symbol = mut.to_s.downcase.to_sym
@@ -1096,10 +793,10 @@ module DSPy
1096
793
  mutation_symbol
1097
794
  end
1098
795
  end.uniq
1099
-
796
+
1100
797
  # Ensure we have at least one valid mutation suggestion
1101
798
  valid_mutations = [:rewrite] if valid_mutations.empty?
1102
-
799
+
1103
800
  ReflectionResult.new(
1104
801
  trace_id: reflection_id,
1105
802
  diagnosis: diagnosis,
@@ -1108,7 +805,7 @@ module DSPy
1108
805
  reasoning: reasoning,
1109
806
  suggested_mutations: valid_mutations,
1110
807
  metadata: {
1111
- reflection_model: @config.reflection_lm,
808
+ reflection_model: @config.reflection_lm&.model,
1112
809
  analysis_timestamp: Time.now,
1113
810
  trace_count: original_traces.size,
1114
811
  token_usage: estimate_token_usage(prediction.to_s),
@@ -1121,9 +818,9 @@ module DSPy
1121
818
  }
1122
819
  )
1123
820
  end
1124
-
821
+
1125
822
  private
1126
-
823
+
1127
824
  # Generate unique reflection ID
1128
825
  sig { returns(String) }
1129
826
  def generate_reflection_id
@@ -1148,12 +845,12 @@ module DSPy
1148
845
  sig { params(patterns: T::Hash[Symbol, T.untyped], traces: T::Array[ExecutionTrace]).returns(String) }
1149
846
  def generate_reasoning(patterns, traces)
1150
847
  reasoning_parts = []
1151
-
848
+
1152
849
  reasoning_parts << "Analyzed #{traces.size} execution traces"
1153
850
  reasoning_parts << "#{patterns[:llm_traces_count]} LLM interactions"
1154
851
  reasoning_parts << "#{patterns[:module_traces_count]} module operations"
1155
852
  reasoning_parts << "Total token usage: #{patterns[:total_tokens]}"
1156
-
853
+
1157
854
  reasoning_parts.join('. ') + '.'
1158
855
  end
1159
856
 
@@ -1161,13 +858,13 @@ module DSPy
1161
858
  sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(Float) }
1162
859
  def calculate_confidence(patterns)
1163
860
  base_confidence = 0.7
1164
-
861
+
1165
862
  # More traces = higher confidence
1166
863
  trace_bonus = [patterns[:llm_traces_count] + patterns[:module_traces_count], 10].min * 0.02
1167
-
864
+
1168
865
  # Reasonable token usage = higher confidence
1169
866
  token_penalty = patterns[:total_tokens] > 1000 ? -0.1 : 0.0
1170
-
867
+
1171
868
  [(base_confidence + trace_bonus + token_penalty), 1.0].min
1172
869
  end
1173
870
 
@@ -1175,12 +872,12 @@ module DSPy
1175
872
  sig { params(llm_traces: T::Array[ExecutionTrace]).returns(Integer) }
1176
873
  def calculate_avg_response_length(llm_traces)
1177
874
  return 0 if llm_traces.empty?
1178
-
875
+
1179
876
  total_length = llm_traces.sum do |trace|
1180
877
  response = trace.response_text
1181
878
  response ? response.length : 0
1182
879
  end
1183
-
880
+
1184
881
  total_length / llm_traces.size
1185
882
  end
1186
883
 
@@ -1188,11 +885,11 @@ module DSPy
1188
885
  sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
1189
886
  def calculate_timespan(traces)
1190
887
  return 0.0 if traces.size < 2
1191
-
888
+
1192
889
  timestamps = traces.map(&:timestamp).sort
1193
890
  (timestamps.last - timestamps.first).to_f
1194
891
  end
1195
-
892
+
1196
893
 
1197
894
  # Format traces for inclusion in prompt
1198
895
  sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
@@ -1203,22 +900,22 @@ module DSPy
1203
900
  "#{idx + 1}. [#{trace.event_name}] #{prompt_preview} → #{response_preview}"
1204
901
  end.join("\n")
1205
902
  end
1206
-
903
+
1207
904
  # Estimate token usage from response
1208
905
  sig { params(text: String).returns(Integer) }
1209
906
  def estimate_token_usage(text)
1210
907
  # Rough estimation: ~4 characters per token
1211
908
  (text.length / 4.0).ceil
1212
909
  end
1213
-
910
+
1214
911
  # Analyze token efficiency patterns
1215
912
  sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1216
913
  def analyze_token_efficiency(llm_traces)
1217
914
  return { status: 'no_data', suggestions: [] } if llm_traces.empty?
1218
-
915
+
1219
916
  total_tokens = llm_traces.sum(&:token_usage)
1220
917
  avg_tokens = total_tokens.to_f / llm_traces.size
1221
-
918
+
1222
919
  if avg_tokens > 400
1223
920
  {
1224
921
  status: 'poor',
@@ -1239,15 +936,15 @@ module DSPy
1239
936
  }
1240
937
  end
1241
938
  end
1242
-
939
+
1243
940
  # Analyze response quality patterns
1244
941
  sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1245
942
  def analyze_response_quality(llm_traces)
1246
943
  return { consistency: 'no_data', recommendations: [] } if llm_traces.empty?
1247
-
944
+
1248
945
  response_lengths = llm_traces.map { |t| t.response_text&.length || 0 }
1249
946
  length_variance = calculate_variance(response_lengths)
1250
-
947
+
1251
948
  if length_variance > 1000
1252
949
  {
1253
950
  consistency: 'inconsistent',
@@ -1265,50 +962,50 @@ module DSPy
1265
962
  }
1266
963
  end
1267
964
  end
1268
-
965
+
1269
966
  # Analyze model consistency
1270
967
  sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1271
968
  def analyze_model_consistency(llm_traces)
1272
969
  models = llm_traces.map(&:model_name).compact.uniq
1273
-
970
+
1274
971
  {
1275
972
  unique_models: models.size,
1276
973
  models_used: models,
1277
974
  recommendation: models.size > 1 ? 'Consider using single model for consistency' : 'Model usage is consistent'
1278
975
  }
1279
976
  end
1280
-
977
+
1281
978
  # Adjust mutations based on history to avoid repetition
1282
979
  sig { params(suggested: T::Array[Symbol], history: T::Array[Symbol], trend: T.nilable(String)).returns(T::Array[Symbol]) }
1283
980
  def adjust_mutations_for_history(suggested, history, trend)
1284
981
  # Count recent usage of each mutation type
1285
982
  recent_usage = history.last(5).tally
1286
-
983
+
1287
984
  # Filter out overused mutations
1288
985
  adjusted = suggested.reject do |mutation|
1289
986
  recent_usage[mutation] && recent_usage[mutation] >= 2
1290
987
  end
1291
-
988
+
1292
989
  # If trend is declining, prefer different strategies
1293
990
  if trend == 'declining'
1294
991
  adjusted = adjusted.reject { |m| m == :expand } # Avoid expansion if performance declining
1295
992
  adjusted += [:simplify, :rephrase] unless adjusted.include?(:simplify) || adjusted.include?(:rephrase)
1296
993
  end
1297
-
994
+
1298
995
  # Ensure we always have at least one suggestion
1299
996
  adjusted.empty? ? [:rewrite] : adjusted.uniq
1300
997
  end
1301
-
998
+
1302
999
  # Calculate variance for array of numbers
1303
1000
  sig { params(values: T::Array[Integer]).returns(Float) }
1304
1001
  def calculate_variance(values)
1305
1002
  return 0.0 if values.size < 2
1306
-
1003
+
1307
1004
  mean = values.sum.to_f / values.size
1308
1005
  sum_squared_diff = values.sum { |v| (v - mean) ** 2 }
1309
1006
  sum_squared_diff / values.size
1310
1007
  end
1311
-
1008
+
1312
1009
  # Truncate text to specified length with ellipsis
1313
1010
  sig { params(text: String, length: Integer).returns(String) }
1314
1011
  def truncate_text(text, length)
@@ -1325,8 +1022,8 @@ module DSPy
1325
1022
  sig { returns(GEPAConfig) }
1326
1023
  attr_reader :config
1327
1024
 
1328
- sig { returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)) }
1329
- attr_reader :metric
1025
+ sig { returns(FitnessEvaluator) }
1026
+ attr_reader :fitness_evaluator
1330
1027
 
1331
1028
  sig { returns(T::Array[T.untyped]) }
1332
1029
  attr_reader :population
@@ -1334,59 +1031,69 @@ module DSPy
1334
1031
  sig { returns(Integer) }
1335
1032
  attr_reader :generation
1336
1033
 
1337
- sig { params(config: GEPAConfig, metric: T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)).void }
1338
- def initialize(config:, metric:)
1034
+ sig { params(config: GEPAConfig, fitness_evaluator: FitnessEvaluator).void }
1035
+ def initialize(config:, fitness_evaluator:)
1339
1036
  @config = config
1340
- @metric = metric
1037
+ @fitness_evaluator = fitness_evaluator
1341
1038
  @population = T.let([], T::Array[T.untyped])
1342
1039
  @generation = 0
1343
- @fitness_scores = T.let([], T::Array[Float])
1040
+ @fitness_scores = T.let([], T::Array[FitnessScore])
1344
1041
  end
1345
1042
 
1346
1043
  # Initialize population with diverse instruction variants
1347
1044
  sig { params(program: T.untyped).void }
1348
1045
  def initialize_population(program)
1349
1046
  @population = []
1350
-
1047
+
1351
1048
  # Start with original program
1352
1049
  @population << program
1353
-
1354
- # Generate instruction variants to fill population
1355
- original_instruction = program.signature_class.description
1356
- variants = generate_instruction_variants(original_instruction)
1357
-
1050
+
1051
+ # Generate instruction variants to fill population if program has signature_class
1052
+ if program.respond_to?(:signature_class) && program.signature_class.respond_to?(:description)
1053
+ original_instruction = program.signature_class.description
1054
+ if original_instruction && !original_instruction.empty?
1055
+ variants = generate_instruction_variants(original_instruction)
1056
+ else
1057
+ variants = []
1058
+ end
1059
+ else
1060
+ variants = []
1061
+ end
1062
+
1358
1063
  # Create program copies with different instructions
1359
1064
  variants.take(@config.population_size - 1).each do |variant|
1360
1065
  variant_program = create_program_with_instruction(program, variant)
1361
1066
  @population << variant_program
1362
1067
  end
1363
-
1068
+
1364
1069
  # If we need more candidates, duplicate and mutate
1365
1070
  while @population.size < @config.population_size
1366
1071
  base_program = @population.sample
1367
- mutated = create_program_with_instruction(base_program,
1368
- generate_instruction_variants(base_program.signature_class.description).first)
1369
- @population << mutated
1072
+ if base_program.respond_to?(:signature_class) && base_program.signature_class.respond_to?(:description)
1073
+ instruction_variants = generate_instruction_variants(base_program.signature_class.description)
1074
+ if instruction_variants.any?
1075
+ mutated = create_program_with_instruction(base_program, instruction_variants.first)
1076
+ @population << mutated
1077
+ else
1078
+ # If no variants available, just duplicate the base program
1079
+ @population << base_program
1080
+ end
1081
+ else
1082
+ # If no signature_class available, just duplicate the base program
1083
+ @population << base_program
1084
+ end
1370
1085
  end
1371
-
1086
+
1372
1087
  @generation = 0
1373
1088
  end
1374
1089
 
1375
1090
  # Evaluate all population members on the training set
1376
- sig { params(trainset: T::Array[T.untyped]).returns(T::Array[Float]) }
1091
+ sig { params(trainset: T::Array[T.untyped]).returns(T::Array[FitnessScore]) }
1377
1092
  def evaluate_population(trainset)
1378
1093
  @fitness_scores = @population.map do |candidate|
1379
- scores = trainset.map do |example|
1380
- prediction = candidate.call(**example.input_values)
1381
- @metric.call(example, prediction).to_f
1382
- rescue => e
1383
- # Handle evaluation errors gracefully
1384
- 0.0
1385
- end
1386
-
1387
- scores.sum / scores.size
1094
+ @fitness_evaluator.evaluate_candidate(candidate, trainset)
1388
1095
  end
1389
-
1096
+
1390
1097
  @fitness_scores
1391
1098
  end
1392
1099
 
@@ -1394,27 +1101,32 @@ module DSPy
1394
1101
  sig { params(trainset: T::Array[T.untyped]).void }
1395
1102
  def evolve_generation(trainset)
1396
1103
  current_scores = evaluate_population(trainset)
1397
-
1104
+
1398
1105
  # Simple selection: keep top 50% and mutate them
1399
- sorted_indices = (0...@population.size).sort_by { |i| -current_scores[i] }
1400
- survivors = sorted_indices.take(@config.population_size / 2)
1401
-
1106
+ sorted_indices = (0...@population.size).sort_by { |i| -current_scores[i].overall_score }
1107
+ survivors = sorted_indices.take([@config.population_size / 2, 1].max)
1108
+
1402
1109
  new_population = []
1403
-
1110
+
1404
1111
  # Keep best performers
1405
1112
  survivors.each { |i| new_population << @population[i] }
1406
-
1113
+
1407
1114
  # Fill rest with mutations of survivors
1408
1115
  while new_population.size < @config.population_size
1409
1116
  parent_index = survivors.sample
1410
1117
  parent = @population[parent_index]
1411
-
1412
- # Generate mutation
1413
- variants = generate_instruction_variants(parent.signature_class.description)
1414
- mutated = create_program_with_instruction(parent, variants.first || parent.signature_class.description)
1415
- new_population << mutated
1118
+
1119
+ # Generate mutation if parent has signature_class
1120
+ if parent.respond_to?(:signature_class) && parent.signature_class.respond_to?(:description)
1121
+ variants = generate_instruction_variants(parent.signature_class.description)
1122
+ mutated = create_program_with_instruction(parent, variants.first || parent.signature_class.description)
1123
+ new_population << mutated
1124
+ else
1125
+ # If no signature_class, just duplicate the parent
1126
+ new_population << parent
1127
+ end
1416
1128
  end
1417
-
1129
+
1418
1130
  @population = new_population
1419
1131
  @generation += 1
1420
1132
  end
@@ -1423,35 +1135,46 @@ module DSPy
1423
1135
  sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
1424
1136
  def run_evolution(program, trainset)
1425
1137
  initialize_population(program)
1426
-
1138
+
1427
1139
  history = []
1428
-
1140
+
1429
1141
  # Initial evaluation
1430
1142
  initial_scores = evaluate_population(trainset)
1143
+ best_initial = initial_scores.max_by(&:overall_score)
1144
+ avg_initial = initial_scores.map(&:overall_score).sum / initial_scores.size
1431
1145
  history << {
1432
1146
  generation: 0,
1433
- best_fitness: initial_scores.max,
1434
- avg_fitness: initial_scores.sum / initial_scores.size,
1147
+ best_fitness: best_initial.overall_score,
1148
+ avg_fitness: avg_initial,
1435
1149
  diversity: population_diversity
1436
1150
  }
1437
-
1151
+
1438
1152
  # Evolution loop
1439
1153
  @config.num_generations.times do
1440
1154
  evolve_generation(trainset)
1441
1155
  scores = evaluate_population(trainset)
1442
-
1156
+ best_score = scores.max_by(&:overall_score)
1157
+ avg_score = scores.map(&:overall_score).sum / scores.size
1158
+
1443
1159
  history << {
1444
1160
  generation: @generation,
1445
- best_fitness: scores.max,
1446
- avg_fitness: scores.sum / scores.size,
1161
+ best_fitness: best_score.overall_score,
1162
+ avg_fitness: avg_score,
1447
1163
  diversity: population_diversity
1448
1164
  }
1449
1165
  end
1450
-
1166
+
1167
+ best_fitness_score = @fitness_scores.max_by(&:overall_score)
1451
1168
  {
1452
1169
  best_candidate: get_best_candidate,
1453
- best_fitness: @fitness_scores.max,
1170
+ best_fitness: best_fitness_score || FitnessScore.new(
1171
+ primary_score: 0.0,
1172
+ secondary_scores: {},
1173
+ overall_score: 0.0,
1174
+ metadata: {}
1175
+ ),
1454
1176
  generation_history: history,
1177
+ generation_count: @generation,
1455
1178
  final_population: @population.dup
1456
1179
  }
1457
1180
  end
@@ -1460,8 +1183,8 @@ module DSPy
1460
1183
  sig { returns(T.untyped) }
1461
1184
  def get_best_candidate
1462
1185
  return @population.first if @fitness_scores.empty?
1463
-
1464
- best_index = @fitness_scores.each_with_index.max_by { |score, _| score }[1]
1186
+
1187
+ best_index = @fitness_scores.each_with_index.max_by { |score, _| score.overall_score }[1]
1465
1188
  @population[best_index]
1466
1189
  end
1467
1190
 
@@ -1469,11 +1192,20 @@ module DSPy
1469
1192
  sig { returns(Float) }
1470
1193
  def population_diversity
1471
1194
  return 0.0 if @population.empty?
1472
-
1473
- instructions = @population.map(&:signature_class).map(&:description)
1195
+
1196
+ # Only calculate diversity for programs that have signature_class
1197
+ instructions = @population.filter_map do |program|
1198
+ if program.respond_to?(:signature_class) && program.signature_class.respond_to?(:description)
1199
+ program.signature_class.description
1200
+ else
1201
+ nil
1202
+ end
1203
+ end
1204
+
1205
+ return 0.0 if instructions.empty?
1206
+
1474
1207
  unique_instructions = instructions.uniq.size
1475
-
1476
- unique_instructions.to_f / @population.size.to_f
1208
+ unique_instructions.to_f / instructions.size.to_f
1477
1209
  end
1478
1210
 
1479
1211
  private
@@ -1482,32 +1214,32 @@ module DSPy
1482
1214
  sig { params(original_instruction: String).returns(T::Array[String]) }
1483
1215
  def generate_instruction_variants(original_instruction)
1484
1216
  variants = []
1485
-
1217
+
1486
1218
  # Add "step by step" variant
1487
1219
  unless original_instruction.include?("step")
1488
1220
  variants << "#{original_instruction} Think step by step."
1489
1221
  end
1490
-
1222
+
1491
1223
  # Add "detailed" variant
1492
1224
  unless original_instruction.include?("detail")
1493
1225
  variants << "#{original_instruction} Provide detailed reasoning."
1494
1226
  end
1495
-
1227
+
1496
1228
  # Add "careful" variant
1497
1229
  unless original_instruction.include?("careful")
1498
1230
  variants << "Be careful and accurate. #{original_instruction}"
1499
1231
  end
1500
-
1232
+
1501
1233
  # Add "examples" variant
1502
1234
  unless original_instruction.include?("example")
1503
1235
  variants << "#{original_instruction} Use examples in your response."
1504
1236
  end
1505
-
1237
+
1506
1238
  # Add "precise" variant
1507
1239
  unless original_instruction.include?("precise")
1508
1240
  variants << "Be precise and specific. #{original_instruction}"
1509
1241
  end
1510
-
1242
+
1511
1243
  variants.shuffle.take(5) # Return up to 5 variants, shuffled
1512
1244
  end
1513
1245
 
@@ -1545,11 +1277,11 @@ module DSPy
1545
1277
  begin
1546
1278
  # Create a new instance of the same class
1547
1279
  new_module = original_module.class.new
1548
-
1280
+
1549
1281
  # Try to find and update any internal predictors
1550
1282
  original_module.instance_variables.each do |var_name|
1551
1283
  var_value = original_module.instance_variable_get(var_name)
1552
-
1284
+
1553
1285
  if var_value.is_a?(DSPy::Predict)
1554
1286
  # Update the instruction for internal predictors
1555
1287
  modified_predictor = var_value.with_instruction(new_instruction)
@@ -1559,7 +1291,7 @@ module DSPy
1559
1291
  new_module.instance_variable_set(var_name, var_value)
1560
1292
  end
1561
1293
  end
1562
-
1294
+
1563
1295
  new_module
1564
1296
  rescue => e
1565
1297
  # Fallback to original module
@@ -1571,6 +1303,7 @@ module DSPy
1571
1303
  # FitnessScore represents multi-dimensional evaluation results
1572
1304
  class FitnessScore < T::Struct
1573
1305
  extend T::Sig
1306
+ include Comparable
1574
1307
 
1575
1308
  const :primary_score, Float
1576
1309
  const :secondary_scores, T::Hash[Symbol, Float]
@@ -1607,6 +1340,13 @@ module DSPy
1607
1340
  )
1608
1341
  end
1609
1342
 
1343
+ # Comparison method for Comparable module
1344
+ sig { params(other: FitnessScore).returns(T.nilable(Integer)) }
1345
+ def <=>(other)
1346
+ return nil unless other.is_a?(FitnessScore)
1347
+ overall_score <=> other.overall_score
1348
+ end
1349
+
1610
1350
  # Check if this score is dominated by another (for Pareto analysis)
1611
1351
  sig { params(other: FitnessScore).returns(T::Boolean) }
1612
1352
  def dominated_by?(other)
@@ -1692,7 +1432,7 @@ module DSPy
1692
1432
 
1693
1433
  # Calculate secondary metrics
1694
1434
  secondary_scores = {}
1695
-
1435
+
1696
1436
  # Token efficiency (mock data for now - will be replaced with real trace collection)
1697
1437
  mock_traces = predictions.map.with_index do |pred, i|
1698
1438
  OpenStruct.new(token_usage: 50 + rand(100))
@@ -1784,7 +1524,7 @@ module DSPy
1784
1524
 
1785
1525
  # Simple consistency measure: average word overlap between responses
1786
1526
  word_sets = responses.map { |response| response.downcase.split.to_set }
1787
-
1527
+
1788
1528
  total_similarity = 0.0
1789
1529
  comparisons = 0
1790
1530
 
@@ -1792,7 +1532,7 @@ module DSPy
1792
1532
  word_sets[(i+1)..-1].each do |set2|
1793
1533
  intersection = set1 & set2
1794
1534
  union = set1 | set2
1795
-
1535
+
1796
1536
  similarity = union.empty? ? 0.0 : intersection.size.to_f / union.size
1797
1537
  total_similarity += similarity
1798
1538
  comparisons += 1
@@ -1808,7 +1548,7 @@ module DSPy
1808
1548
  return 1.0 if latencies.empty?
1809
1549
 
1810
1550
  avg_latency = latencies.sum / latencies.size
1811
-
1551
+
1812
1552
  # Penalize high latencies (assume 2 seconds is baseline for 0.5 score)
1813
1553
  baseline_latency = 2.0
1814
1554
  latency_score = baseline_latency / (baseline_latency + avg_latency)
@@ -1930,10 +1670,10 @@ module DSPy
1930
1670
  if llm_traces.any?
1931
1671
  token_usage = llm_traces.sum(&:token_usage)
1932
1672
  avg_response_length = llm_traces.map { |t| t.attributes['response']&.to_s&.length || 0 }.sum / llm_traces.size
1933
-
1673
+
1934
1674
  analysis << "- Total tokens used: #{token_usage}"
1935
1675
  analysis << "- Average response length: #{avg_response_length} characters"
1936
-
1676
+
1937
1677
  # Identify models used
1938
1678
  models = llm_traces.map { |t| t.attributes['gen_ai.request.model'] }.compact.uniq
1939
1679
  analysis << "- Models used: #{models.join(', ')}" if models.any?
@@ -2001,14 +1741,14 @@ module DSPy
2001
1741
 
2002
1742
  begin
2003
1743
  original_instruction = extract_instruction(program)
2004
-
1744
+
2005
1745
  # Use LLM-based instruction proposal instead of hardcoded mutations
2006
1746
  improved_instruction = @instruction_proposer.propose_instruction(
2007
1747
  original_instruction: original_instruction,
2008
1748
  execution_traces: execution_traces,
2009
1749
  failed_examples: failed_examples
2010
1750
  )
2011
-
1751
+
2012
1752
  create_mutated_program(program, improved_instruction)
2013
1753
  rescue => e
2014
1754
  emit_event('mutation_error', {
@@ -2024,7 +1764,7 @@ module DSPy
2024
1764
  sig { params(programs: T::Array[T.untyped], execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T::Array[T.untyped]) }
2025
1765
  def batch_mutate(programs, execution_traces: [], failed_examples: [])
2026
1766
  return [] if programs.empty?
2027
-
1767
+
2028
1768
  programs.map { |program| mutate_program(program, execution_traces: execution_traces, failed_examples: failed_examples) }
2029
1769
  end
2030
1770
 
@@ -2075,7 +1815,7 @@ module DSPy
2075
1815
  -> (inst) { "Please #{inst.downcase}" },
2076
1816
  -> (inst) { "#{inst} with precision" }
2077
1817
  ]
2078
-
1818
+
2079
1819
  patterns.sample.call(instruction)
2080
1820
  end
2081
1821
 
@@ -2088,7 +1828,7 @@ module DSPy
2088
1828
  "Consider all aspects carefully.",
2089
1829
  "Explain your thought process."
2090
1830
  ]
2091
-
1831
+
2092
1832
  "#{instruction} #{expansions.sample}"
2093
1833
  end
2094
1834
 
@@ -2099,7 +1839,7 @@ module DSPy
2099
1839
  simplified = instruction.gsub(/\b(carefully|detailed|comprehensive|thorough)\b/i, '')
2100
1840
  .gsub(/\s+/, ' ')
2101
1841
  .strip
2102
-
1842
+
2103
1843
  simplified.empty? ? instruction : simplified
2104
1844
  end
2105
1845
 
@@ -2112,12 +1852,12 @@ module DSPy
2112
1852
  "Apply domain knowledge.",
2113
1853
  "Consider edge cases."
2114
1854
  ]
2115
-
1855
+
2116
1856
  "#{instruction} #{strategies.sample}"
2117
1857
  end
2118
1858
 
2119
1859
  # Rephrase instruction with synonyms
2120
- sig { params(instruction: String).returns(String) }
1860
+ sig { params(instruction: String).returns(String) }
2121
1861
  def apply_rephrase_mutation(instruction)
2122
1862
  # Simple synonym replacement - in full implementation would use LLM
2123
1863
  synonyms = {
@@ -2127,12 +1867,12 @@ module DSPy
2127
1867
  'calculate' => 'compute',
2128
1868
  'determine' => 'identify'
2129
1869
  }
2130
-
1870
+
2131
1871
  result = instruction.dup
2132
1872
  synonyms.each do |original, replacement|
2133
1873
  result.gsub!(/\b#{original}\b/i, replacement) if rand < 0.3
2134
1874
  end
2135
-
1875
+
2136
1876
  result
2137
1877
  end
2138
1878
 
@@ -2183,11 +1923,11 @@ module DSPy
2183
1923
  begin
2184
1924
  # Create a new instance of the same class
2185
1925
  new_module = original_module.class.new
2186
-
1926
+
2187
1927
  # Try to find and update any internal predictors
2188
1928
  original_module.instance_variables.each do |var_name|
2189
1929
  var_value = original_module.instance_variable_get(var_name)
2190
-
1930
+
2191
1931
  if var_value.is_a?(DSPy::Predict)
2192
1932
  # Update the instruction for internal predictors
2193
1933
  mutated_predictor = var_value.with_instruction(new_instruction)
@@ -2197,7 +1937,7 @@ module DSPy
2197
1937
  new_module.instance_variable_set(var_name, var_value)
2198
1938
  end
2199
1939
  end
2200
-
1940
+
2201
1941
  new_module
2202
1942
  rescue => e
2203
1943
  emit_event('module_mutation_error', {
@@ -2229,10 +1969,10 @@ module DSPy
2229
1969
  sig { params(mutations: T::Array[MutationType]).returns(Float) }
2230
1970
  def mutation_diversity(mutations)
2231
1971
  return 0.0 if mutations.empty?
2232
-
1972
+
2233
1973
  unique_types = mutations.uniq.size
2234
1974
  total_types = @config.mutation_types.size
2235
-
1975
+
2236
1976
  unique_types.to_f / total_types
2237
1977
  end
2238
1978
  end
@@ -2263,15 +2003,15 @@ module DSPy
2263
2003
  begin
2264
2004
  instruction_a = extract_instruction(parent_a)
2265
2005
  instruction_b = extract_instruction(parent_b)
2266
-
2006
+
2267
2007
  crossover_type = select_crossover_type(instruction_a, instruction_b)
2268
2008
  offspring_instructions = apply_crossover(instruction_a, instruction_b, crossover_type)
2269
-
2009
+
2270
2010
  offspring = [
2271
2011
  create_crossover_program(parent_a, offspring_instructions[0]),
2272
2012
  create_crossover_program(parent_b, offspring_instructions[1])
2273
2013
  ]
2274
-
2014
+
2275
2015
  offspring
2276
2016
  rescue => e
2277
2017
  # Return original parents on crossover failure
@@ -2284,9 +2024,9 @@ module DSPy
2284
2024
  def batch_crossover(population)
2285
2025
  return [] if population.empty?
2286
2026
  return [population.first] if population.size == 1
2287
-
2027
+
2288
2028
  offspring = []
2289
-
2029
+
2290
2030
  # Pair up population for crossover
2291
2031
  population.each_slice(2) do |pair|
2292
2032
  if pair.size == 2
@@ -2296,7 +2036,7 @@ module DSPy
2296
2036
  offspring << pair[0] # Unpaired individual passes through
2297
2037
  end
2298
2038
  end
2299
-
2039
+
2300
2040
  offspring
2301
2041
  end
2302
2042
 
@@ -2331,20 +2071,20 @@ module DSPy
2331
2071
  sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2332
2072
  def uniform_crossover(instruction_a, instruction_b)
2333
2073
  return [instruction_a, instruction_b] if instruction_a == instruction_b
2334
-
2074
+
2335
2075
  words_a = instruction_a.split
2336
2076
  words_b = instruction_b.split
2337
-
2077
+
2338
2078
  # Create offspring by randomly selecting words from parents
2339
2079
  offspring_a_words = []
2340
2080
  offspring_b_words = []
2341
-
2081
+
2342
2082
  max_length = [words_a.size, words_b.size].max
2343
-
2083
+
2344
2084
  max_length.times do |i|
2345
2085
  word_a = words_a[i]
2346
2086
  word_b = words_b[i]
2347
-
2087
+
2348
2088
  if rand < 0.5
2349
2089
  offspring_a_words << (word_a || word_b)
2350
2090
  offspring_b_words << (word_b || word_a)
@@ -2353,7 +2093,7 @@ module DSPy
2353
2093
  offspring_b_words << (word_a || word_b)
2354
2094
  end
2355
2095
  end
2356
-
2096
+
2357
2097
  [
2358
2098
  offspring_a_words.compact.join(' '),
2359
2099
  offspring_b_words.compact.join(' ')
@@ -2370,9 +2110,9 @@ module DSPy
2370
2110
  -> (a, b) { "#{b} while #{a.downcase}" },
2371
2111
  -> (a, b) { "Combine #{a.downcase} with #{b.downcase}" }
2372
2112
  ]
2373
-
2113
+
2374
2114
  pattern = patterns.sample
2375
-
2115
+
2376
2116
  [
2377
2117
  pattern.call(instruction_a, instruction_b),
2378
2118
  pattern.call(instruction_b, instruction_a)
@@ -2385,11 +2125,11 @@ module DSPy
2385
2125
  # Extract structural components
2386
2126
  components_a = extract_components(instruction_a)
2387
2127
  components_b = extract_components(instruction_b)
2388
-
2128
+
2389
2129
  # Cross structural components
2390
2130
  offspring_a = combine_components(components_a.action, components_b.modifiers)
2391
2131
  offspring_b = combine_components(components_b.action, components_a.modifiers)
2392
-
2132
+
2393
2133
  [offspring_a, offspring_b]
2394
2134
  end
2395
2135
 
@@ -2397,10 +2137,10 @@ module DSPy
2397
2137
  sig { params(instruction: String).returns(InstructionComponents) }
2398
2138
  def extract_components(instruction)
2399
2139
  words = instruction.split
2400
-
2140
+
2401
2141
  # Simple heuristic: first verb-like word is action, rest are modifiers
2402
2142
  action_idx = words.find_index { |word| verb_like?(word) } || 0
2403
-
2143
+
2404
2144
  InstructionComponents.new(
2405
2145
  action: words[action_idx] || words.first || "complete",
2406
2146
  modifiers: (words - [words[action_idx]]).join(' ')
@@ -2438,7 +2178,7 @@ module DSPy
2438
2178
  # Adaptive selection based on instruction characteristics
2439
2179
  if instruction_a && instruction_b
2440
2180
  combined_length = instruction_a.length + instruction_b.length
2441
-
2181
+
2442
2182
  if combined_length < 40
2443
2183
  # Short instructions benefit from blending
2444
2184
  [CrossoverType::Blend, CrossoverType::Uniform].sample
@@ -2458,10 +2198,10 @@ module DSPy
2458
2198
  sig { params(crossovers: T::Array[CrossoverType]).returns(Float) }
2459
2199
  def crossover_diversity(crossovers)
2460
2200
  return 0.0 if crossovers.empty?
2461
-
2201
+
2462
2202
  unique_types = crossovers.uniq.size
2463
2203
  total_types = @config.crossover_types.size
2464
-
2204
+
2465
2205
  unique_types.to_f / total_types
2466
2206
  end
2467
2207
  end
@@ -2487,15 +2227,15 @@ module DSPy
2487
2227
  def select_parents(population_with_scores, count:)
2488
2228
  return [] if population_with_scores.empty?
2489
2229
  return population_with_scores.map(&:first) if count >= population_with_scores.size
2490
-
2230
+
2491
2231
  # Combine tournament and Pareto-based selection for parent selection
2492
2232
  selected = []
2493
-
2233
+
2494
2234
  count.times do
2495
2235
  parent = tournament_selection(population_with_scores)
2496
2236
  selected << parent
2497
2237
  end
2498
-
2238
+
2499
2239
  selected
2500
2240
  end
2501
2241
 
@@ -2504,14 +2244,14 @@ module DSPy
2504
2244
  def select_survivors(population_with_scores, count:)
2505
2245
  return [] if population_with_scores.empty?
2506
2246
  return population_with_scores.map(&:first) if count >= population_with_scores.size
2507
-
2247
+
2508
2248
  scores = population_with_scores.map(&:last)
2509
-
2249
+
2510
2250
  # Find Pareto frontier first
2511
2251
  pareto_frontier = find_pareto_frontier(scores)
2512
2252
  frontier_indices = scores.each_index.select { |i| pareto_frontier.include?(scores[i]) }
2513
2253
  frontier_programs = frontier_indices.map { |i| population_with_scores[i].first }
2514
-
2254
+
2515
2255
  if frontier_programs.size >= count
2516
2256
  # Use diversity selection within frontier
2517
2257
  frontier_with_scores = frontier_indices.map { |i| population_with_scores[i] }
@@ -2520,7 +2260,7 @@ module DSPy
2520
2260
  # Include all frontier + fill remaining with elite selection
2521
2261
  remaining_count = count - frontier_programs.size
2522
2262
  remaining_population = population_with_scores.reject.with_index { |_, i| frontier_indices.include?(i) }
2523
-
2263
+
2524
2264
  additional = elite_selection(remaining_population, count: remaining_count)
2525
2265
  frontier_programs + additional
2526
2266
  end
@@ -2533,18 +2273,18 @@ module DSPy
2533
2273
  def find_pareto_frontier(fitness_scores)
2534
2274
  return [] if fitness_scores.empty?
2535
2275
  return fitness_scores if fitness_scores.size == 1
2536
-
2276
+
2537
2277
  frontier = []
2538
-
2278
+
2539
2279
  fitness_scores.each do |candidate|
2540
2280
  # Check if candidate is dominated by any other solution
2541
2281
  is_dominated = fitness_scores.any? do |other|
2542
2282
  other != candidate && candidate.dominated_by?(other)
2543
2283
  end
2544
-
2284
+
2545
2285
  frontier << candidate unless is_dominated
2546
2286
  end
2547
-
2287
+
2548
2288
  frontier
2549
2289
  end
2550
2290
 
@@ -2552,17 +2292,17 @@ module DSPy
2552
2292
  sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Hash[FitnessScore, Float]) }
2553
2293
  def calculate_crowding_distance(fitness_scores)
2554
2294
  distances = {}
2555
-
2295
+
2556
2296
  # Initialize distances for all solutions
2557
2297
  fitness_scores.each { |score| distances[score] = 0.0 }
2558
-
2298
+
2559
2299
  return distances if fitness_scores.size <= 2
2560
-
2300
+
2561
2301
  # Calculate crowding distance for each objective
2562
2302
  objectives = [:primary_score, :overall_score]
2563
2303
  secondary_objectives = fitness_scores.first.secondary_scores.keys
2564
2304
  all_objectives = objectives + secondary_objectives
2565
-
2305
+
2566
2306
  all_objectives.each do |objective|
2567
2307
  # Sort by current objective
2568
2308
  sorted_scores = fitness_scores.sort_by do |score|
@@ -2575,29 +2315,29 @@ module DSPy
2575
2315
  score.secondary_scores[objective] || 0.0
2576
2316
  end
2577
2317
  end
2578
-
2318
+
2579
2319
  # Set boundary solutions to high distance
2580
2320
  distances[sorted_scores.first] = Float::INFINITY if sorted_scores.size > 0
2581
2321
  distances[sorted_scores.last] = Float::INFINITY if sorted_scores.size > 1
2582
-
2322
+
2583
2323
  next if sorted_scores.size <= 2
2584
-
2324
+
2585
2325
  # Calculate range for normalization
2586
2326
  min_val = get_objective_value(sorted_scores.first, objective)
2587
2327
  max_val = get_objective_value(sorted_scores.last, objective)
2588
2328
  range = max_val - min_val
2589
-
2329
+
2590
2330
  next if range <= 0
2591
-
2331
+
2592
2332
  # Calculate crowding distance for intermediate solutions
2593
2333
  (1...(sorted_scores.size - 1)).each do |i|
2594
2334
  prev_val = get_objective_value(sorted_scores[i - 1], objective)
2595
2335
  next_val = get_objective_value(sorted_scores[i + 1], objective)
2596
-
2336
+
2597
2337
  distances[sorted_scores[i]] += (next_val - prev_val) / range
2598
2338
  end
2599
2339
  end
2600
-
2340
+
2601
2341
  distances
2602
2342
  end
2603
2343
 
@@ -2618,13 +2358,13 @@ module DSPy
2618
2358
  sig { params(population_with_scores: T::Array[T::Array[T.untyped]]).returns(T.untyped) }
2619
2359
  def tournament_selection(population_with_scores)
2620
2360
  return population_with_scores.first.first if population_with_scores.size == 1
2621
-
2361
+
2622
2362
  tournament_size = [3, population_with_scores.size].min
2623
2363
  tournament = population_with_scores.sample(tournament_size)
2624
-
2364
+
2625
2365
  # Select best from tournament based on Pareto dominance and crowding
2626
2366
  best_program, best_score = tournament.first
2627
-
2367
+
2628
2368
  tournament[1..].each do |program, score|
2629
2369
  if score.dominated_by?(best_score)
2630
2370
  # Current best dominates this candidate, keep current
@@ -2639,7 +2379,7 @@ module DSPy
2639
2379
  end
2640
2380
  end
2641
2381
  end
2642
-
2382
+
2643
2383
  best_program
2644
2384
  end
2645
2385
 
@@ -2647,13 +2387,13 @@ module DSPy
2647
2387
  sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2648
2388
  def diversity_selection(population_with_scores, count:)
2649
2389
  return population_with_scores.map(&:first) if count >= population_with_scores.size
2650
-
2390
+
2651
2391
  scores = population_with_scores.map(&:last)
2652
2392
  distances = calculate_crowding_distance(scores)
2653
-
2393
+
2654
2394
  # Sort by crowding distance (descending - prefer more diverse)
2655
2395
  sorted_pairs = population_with_scores.sort_by { |_, score| -distances[score] }
2656
-
2396
+
2657
2397
  sorted_pairs.take(count).map(&:first)
2658
2398
  end
2659
2399
 
@@ -2661,10 +2401,10 @@ module DSPy
2661
2401
  sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2662
2402
  def elite_selection(population_with_scores, count:)
2663
2403
  return population_with_scores.map(&:first) if count >= population_with_scores.size
2664
-
2404
+
2665
2405
  # Sort by overall score (descending - best first)
2666
2406
  sorted_pairs = population_with_scores.sort_by { |_, score| -score.overall_score }
2667
-
2407
+
2668
2408
  sorted_pairs.take(count).map(&:first)
2669
2409
  end
2670
2410
  end
@@ -2673,7 +2413,7 @@ module DSPy
2673
2413
  class GEPAConfig < Config
2674
2414
  extend T::Sig
2675
2415
 
2676
- sig { returns(String) }
2416
+ sig { returns(DSPy::LM) }
2677
2417
  attr_accessor :reflection_lm
2678
2418
 
2679
2419
  sig { returns(Integer) }
@@ -2688,8 +2428,6 @@ module DSPy
2688
2428
  sig { returns(T::Boolean) }
2689
2429
  attr_accessor :use_pareto_selection
2690
2430
 
2691
- sig { returns(T::Boolean) }
2692
- attr_accessor :simple_mode
2693
2431
  sig { returns(T::Array[MutationType]) }
2694
2432
  attr_accessor :mutation_types
2695
2433
  sig { returns(Float) }
@@ -2700,12 +2438,12 @@ module DSPy
2700
2438
  sig { void }
2701
2439
  def initialize
2702
2440
  super
2703
- @reflection_lm = 'gpt-4o'
2441
+ # reflection_lm must be explicitly set by user - no default provided
2442
+ @reflection_lm = nil
2704
2443
  @num_generations = 10
2705
2444
  @population_size = 8
2706
2445
  @mutation_rate = 0.7
2707
2446
  @use_pareto_selection = true
2708
- @simple_mode = false
2709
2447
  @mutation_types = [MutationType::Rewrite, MutationType::Expand, MutationType::Simplify, MutationType::Combine, MutationType::Rephrase]
2710
2448
  @crossover_rate = 0.6
2711
2449
  @crossover_types = [CrossoverType::Uniform, CrossoverType::Blend, CrossoverType::Structured]
@@ -2714,12 +2452,11 @@ module DSPy
2714
2452
  sig { returns(T::Hash[Symbol, T.untyped]) }
2715
2453
  def to_h
2716
2454
  super.merge({
2717
- reflection_lm: @reflection_lm,
2455
+ reflection_lm: @reflection_lm&.model, # Serialize the model name for hash representation
2718
2456
  num_generations: @num_generations,
2719
2457
  population_size: @population_size,
2720
2458
  mutation_rate: @mutation_rate,
2721
2459
  use_pareto_selection: @use_pareto_selection,
2722
- simple_mode: @simple_mode,
2723
2460
  mutation_types: @mutation_types,
2724
2461
  crossover_rate: @crossover_rate,
2725
2462
  crossover_types: @crossover_types
@@ -2738,6 +2475,12 @@ module DSPy
2738
2475
  end
2739
2476
  def initialize(metric: nil, config: nil)
2740
2477
  @config = config || GEPAConfig.new
2478
+
2479
+ # Validate that reflection_lm is configured
2480
+ unless @config.reflection_lm
2481
+ raise ArgumentError, "reflection_lm must be configured for GEPA optimization. Set config.reflection_lm to a DSPy::LM instance."
2482
+ end
2483
+
2741
2484
  super(metric: metric, config: @config)
2742
2485
  end
2743
2486
 
@@ -2749,6 +2492,7 @@ module DSPy
2749
2492
  valset: T.nilable(T::Array[T.untyped])
2750
2493
  ).returns(OptimizationResult)
2751
2494
  end
2495
+
2752
2496
  def compile(program, trainset:, valset: nil)
2753
2497
  validate_inputs(program, trainset, valset)
2754
2498
 
@@ -2758,200 +2502,13 @@ module DSPy
2758
2502
  num_generations: @config.num_generations,
2759
2503
  population_size: @config.population_size
2760
2504
  }) do
2761
- # Simple optimization for Phase 1.5 - basic instruction optimization
2762
- if @config.simple_mode
2763
- perform_simple_optimization(program, trainset, valset)
2764
- else
2765
- # Phase 2 - Full GEPA genetic algorithm implementation
2766
- perform_gepa_optimization(program, trainset, valset)
2767
- end
2505
+ # Always perform full GEPA genetic algorithm optimization
2506
+ perform_gepa_optimization(program, trainset, valset)
2768
2507
  end
2769
2508
  end
2770
2509
 
2771
2510
  private
2772
2511
 
2773
- # Simple optimization implementation for testing
2774
- sig do
2775
- params(
2776
- program: T.untyped,
2777
- trainset: T::Array[T.untyped],
2778
- valset: T.nilable(T::Array[T.untyped])
2779
- ).returns(OptimizationResult)
2780
- end
2781
- def perform_simple_optimization(program, trainset, valset)
2782
- return basic_result(program) unless program.respond_to?(:signature_class)
2783
-
2784
- original_description = program.signature_class.description
2785
- best_program = program
2786
- best_score = simple_evaluate_program(program, trainset)
2787
-
2788
- # Try different instruction variations
2789
- instruction_variants = generate_instruction_variants(original_description)
2790
-
2791
- instruction_variants.each_with_index do |variant, index|
2792
- emit_event('instruction_variant_test', {
2793
- variant: variant,
2794
- iteration: index + 1,
2795
- total_variants: instruction_variants.size
2796
- })
2797
-
2798
- # Create modified program
2799
- modified_program = create_program_with_instruction(program, variant)
2800
- score = simple_evaluate_program(modified_program, trainset)
2801
-
2802
- if score > best_score
2803
- best_program = modified_program
2804
- best_score = score
2805
-
2806
- emit_event('improvement_found', {
2807
- new_score: score,
2808
- previous_score: best_score,
2809
- instruction: variant
2810
- })
2811
- end
2812
- end
2813
-
2814
- OptimizationResult.new(
2815
- optimized_program: best_program,
2816
- scores: { accuracy: best_score },
2817
- history: {
2818
- original_score: simple_evaluate_program(program, trainset),
2819
- variants_tested: instruction_variants.size,
2820
- best_instruction: best_program.signature_class.description
2821
- },
2822
- best_score_name: 'accuracy',
2823
- best_score_value: best_score,
2824
- metadata: {
2825
- optimizer: 'GEPA',
2826
- mode: 'Simple Optimization',
2827
- reflection_lm: @config.reflection_lm
2828
- }
2829
- )
2830
- end
2831
-
2832
- # Generate variations of the instruction
2833
- sig { params(original_instruction: String).returns(T::Array[String]) }
2834
- def generate_instruction_variants(original_instruction)
2835
- variants = []
2836
-
2837
- # Add "step by step" variant
2838
- unless original_instruction.include?("step")
2839
- variants << "#{original_instruction} Think step by step."
2840
- end
2841
-
2842
- # Add "detailed" variant
2843
- unless original_instruction.include?("detail")
2844
- variants << "#{original_instruction} Provide detailed reasoning."
2845
- end
2846
-
2847
- # Add "careful" variant
2848
- unless original_instruction.include?("careful")
2849
- variants << "Be careful and accurate. #{original_instruction}"
2850
- end
2851
-
2852
- variants.take(3) # Limit to 3 variants for simple mode
2853
- end
2854
-
2855
- # Create a new program instance with modified instruction using DSPy.rb dynamic capabilities
2856
- sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
2857
- def create_program_with_instruction(original_program, new_instruction)
2858
- case original_program
2859
- when DSPy::Predict
2860
- # DSPy::Predict has built-in support for instruction modification
2861
- original_program.with_instruction(new_instruction)
2862
- when DSPy::Module
2863
- # For custom DSPy::Module classes, create new instance with updated predictors
2864
- create_modified_module_instance(original_program, new_instruction)
2865
- else
2866
- # For other types (like test doubles), check available methods
2867
- if original_program.respond_to?(:with_instruction)
2868
- original_program.with_instruction(new_instruction)
2869
- elsif original_program.respond_to?(:signature_class)
2870
- # Create new DSPy::Predict with the same signature but new instruction
2871
- signature_class = original_program.signature_class
2872
- DSPy::Predict.new(signature_class).with_instruction(new_instruction)
2873
- else
2874
- # Fallback: return original if we can't modify
2875
- emit_event('program_modification_fallback', {
2876
- program_type: original_program.class.name,
2877
- reason: 'No modification method available'
2878
- })
2879
- original_program
2880
- end
2881
- end
2882
- rescue => e
2883
- emit_event('program_modification_error', {
2884
- error: e.message,
2885
- program_type: original_program.class.name
2886
- })
2887
- # Return original program on error
2888
- original_program
2889
- end
2890
-
2891
- # Create modified version of custom DSPy::Module instance (for main GEPA class)
2892
- sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
2893
- def create_modified_module_instance(original_module, new_instruction)
2894
- begin
2895
- # Create a new instance of the same class
2896
- new_module = original_module.class.new
2897
-
2898
- # Try to find and update any internal predictors
2899
- original_module.instance_variables.each do |var_name|
2900
- var_value = original_module.instance_variable_get(var_name)
2901
-
2902
- if var_value.is_a?(DSPy::Predict)
2903
- # Update the instruction for internal predictors
2904
- modified_predictor = var_value.with_instruction(new_instruction)
2905
- new_module.instance_variable_set(var_name, modified_predictor)
2906
- else
2907
- # Copy other instance variables as-is
2908
- new_module.instance_variable_set(var_name, var_value)
2909
- end
2910
- end
2911
-
2912
- new_module
2913
- rescue => e
2914
- emit_event('module_modification_error', {
2915
- error: e.message,
2916
- module_class: original_module.class.name
2917
- })
2918
- # Fallback to original module
2919
- original_module
2920
- end
2921
- end
2922
-
2923
- # Simple evaluation for testing (different from base class evaluate_program)
2924
- sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(Float) }
2925
- def simple_evaluate_program(program, trainset)
2926
- return 0.0 unless @metric
2927
-
2928
- scores = trainset.map do |example|
2929
- prediction = program.call(**example.input_values)
2930
- @metric.call(example, prediction).to_f
2931
- rescue => e
2932
- emit_event('evaluation_error', { error: e.message, example_id: example.object_id.to_s })
2933
- 0.0
2934
- end
2935
-
2936
- scores.sum / scores.size
2937
- end
2938
-
2939
- # Return basic result when simple optimization isn't applicable
2940
- sig { params(program: T.untyped).returns(OptimizationResult) }
2941
- def basic_result(program)
2942
- OptimizationResult.new(
2943
- optimized_program: program,
2944
- scores: { gepa_score: 0.0 },
2945
- history: { phase: 'Phase 1 - Basic Structure' },
2946
- best_score_name: 'gepa_score',
2947
- best_score_value: 0.0,
2948
- metadata: {
2949
- optimizer: 'GEPA',
2950
- implementation_status: 'Phase 1 - Infrastructure Complete'
2951
- }
2952
- )
2953
- end
2954
-
2955
2512
  # Complete GEPA genetic algorithm optimization
2956
2513
  sig do
2957
2514
  params(
@@ -2968,11 +2525,11 @@ module DSPy
2968
2525
  mutation_engine = create_mutation_engine
2969
2526
  crossover_engine = create_crossover_engine
2970
2527
  pareto_selector = create_pareto_selector(fitness_evaluator)
2971
-
2528
+
2972
2529
  # Initialize trace collection for reflection
2973
2530
  trace_collector = TraceCollector.new
2974
2531
  optimization_run_id = "gepa-run-#{SecureRandom.hex(4)}"
2975
-
2532
+
2976
2533
  emit_event('gepa_optimization_start', {
2977
2534
  optimization_run_id: optimization_run_id,
2978
2535
  num_generations: @config.num_generations,
@@ -2980,17 +2537,17 @@ module DSPy
2980
2537
  mutation_rate: @config.mutation_rate,
2981
2538
  crossover_rate: @config.crossover_rate
2982
2539
  })
2983
-
2540
+
2984
2541
  begin
2985
2542
  # Run the complete genetic algorithm evolution
2986
2543
  evolution_result = genetic_engine.run_evolution(program, trainset)
2987
-
2544
+
2988
2545
  # Collect traces for reflection analysis
2989
2546
  execution_traces = trace_collector.traces_for_run(optimization_run_id)
2990
-
2547
+
2991
2548
  # Generate reflection insights on the optimization process
2992
2549
  reflection_result = reflection_engine.reflect_with_llm(execution_traces)
2993
-
2550
+
2994
2551
  # Evaluate final candidate on validation set if provided
2995
2552
  final_validation_score = if valset && !valset.empty?
2996
2553
  validation_fitness = fitness_evaluator.evaluate_candidate(evolution_result[:best_candidate], valset)
@@ -2998,7 +2555,7 @@ module DSPy
2998
2555
  else
2999
2556
  evolution_result[:best_fitness].overall_score
3000
2557
  end
3001
-
2558
+
3002
2559
  emit_event('gepa_optimization_complete', {
3003
2560
  optimization_run_id: optimization_run_id,
3004
2561
  best_fitness: evolution_result[:best_fitness].overall_score,
@@ -3006,7 +2563,7 @@ module DSPy
3006
2563
  validation_score: final_validation_score,
3007
2564
  reflection_confidence: reflection_result.confidence
3008
2565
  })
3009
-
2566
+
3010
2567
  # Create comprehensive optimization result
3011
2568
  OptimizationResult.new(
3012
2569
  optimized_program: evolution_result[:best_candidate],
@@ -3030,7 +2587,7 @@ module DSPy
3030
2587
  best_score_value: evolution_result[:best_fitness].overall_score,
3031
2588
  metadata: {
3032
2589
  optimizer: 'GEPA',
3033
- reflection_lm: @config.reflection_lm,
2590
+ reflection_lm: @config.reflection_lm&.model,
3034
2591
  implementation_status: 'Phase 2 - Complete Implementation',
3035
2592
  optimization_run_id: optimization_run_id,
3036
2593
  reflection_insights: {
@@ -3047,7 +2604,7 @@ module DSPy
3047
2604
  },
3048
2605
  component_versions: {
3049
2606
  genetic_engine: 'v2.0',
3050
- fitness_evaluator: 'v2.0',
2607
+ fitness_evaluator: 'v2.0',
3051
2608
  reflection_engine: 'v2.0',
3052
2609
  mutation_engine: 'v2.0',
3053
2610
  crossover_engine: 'v2.0',
@@ -3055,20 +2612,20 @@ module DSPy
3055
2612
  }
3056
2613
  }
3057
2614
  )
3058
-
2615
+
3059
2616
  rescue => e
3060
2617
  emit_event('gepa_optimization_error', {
3061
2618
  optimization_run_id: optimization_run_id,
3062
2619
  error: e.message,
3063
2620
  backtrace: e.backtrace&.take(5)
3064
2621
  })
3065
-
2622
+
3066
2623
  # Return fallback result on optimization failure
3067
2624
  fallback_fitness = fitness_evaluator.evaluate_candidate(program, trainset)
3068
-
2625
+
3069
2626
  OptimizationResult.new(
3070
2627
  optimized_program: program,
3071
- scores: {
2628
+ scores: {
3072
2629
  fitness_score: fallback_fitness.overall_score,
3073
2630
  primary_score: fallback_fitness.primary_score,
3074
2631
  **fallback_fitness.secondary_scores
@@ -3079,11 +2636,11 @@ module DSPy
3079
2636
  phase: 'Phase 2 - Error Recovery',
3080
2637
  error: e.message
3081
2638
  },
3082
- best_score_name: 'fitness_score',
2639
+ best_score_name: 'fitness_score',
3083
2640
  best_score_value: fallback_fitness.overall_score,
3084
2641
  metadata: {
3085
2642
  optimizer: 'GEPA',
3086
- reflection_lm: @config.reflection_lm,
2643
+ reflection_lm: @config.reflection_lm&.model,
3087
2644
  implementation_status: 'Phase 2 - Error Recovery',
3088
2645
  optimization_run_id: optimization_run_id,
3089
2646
  error_details: {
@@ -3095,48 +2652,48 @@ module DSPy
3095
2652
  )
3096
2653
  end
3097
2654
  end
3098
-
2655
+
3099
2656
  # Create and configure fitness evaluator
3100
2657
  sig { returns(FitnessEvaluator) }
3101
2658
  def create_fitness_evaluator
3102
2659
  FitnessEvaluator.new(primary_metric: @metric, config: @config)
3103
2660
  end
3104
-
2661
+
3105
2662
  # Create and configure genetic engine
3106
2663
  sig { params(fitness_evaluator: FitnessEvaluator).returns(GeneticEngine) }
3107
2664
  def create_genetic_engine(fitness_evaluator)
3108
- GeneticEngine.new(config: @config, metric: @metric)
2665
+ GeneticEngine.new(config: @config, fitness_evaluator: fitness_evaluator)
3109
2666
  end
3110
-
2667
+
3111
2668
  # Create and configure reflection engine
3112
2669
  sig { returns(ReflectionEngine) }
3113
2670
  def create_reflection_engine
3114
2671
  ReflectionEngine.new(@config)
3115
2672
  end
3116
-
3117
- # Create and configure mutation engine
2673
+
2674
+ # Create and configure mutation engine
3118
2675
  sig { returns(MutationEngine) }
3119
2676
  def create_mutation_engine
3120
2677
  MutationEngine.new(config: @config)
3121
2678
  end
3122
-
2679
+
3123
2680
  # Create and configure crossover engine
3124
2681
  sig { returns(CrossoverEngine) }
3125
2682
  def create_crossover_engine
3126
2683
  CrossoverEngine.new(config: @config)
3127
2684
  end
3128
-
2685
+
3129
2686
  # Create and configure pareto selector
3130
2687
  sig { params(fitness_evaluator: FitnessEvaluator).returns(ParetoSelector) }
3131
2688
  def create_pareto_selector(fitness_evaluator)
3132
2689
  ParetoSelector.new(evaluator: fitness_evaluator, config: @config)
3133
2690
  end
3134
-
2691
+
3135
2692
  # Calculate execution timespan from traces
3136
2693
  sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
3137
2694
  def calculate_execution_timespan(traces)
3138
2695
  return 0.0 if traces.size < 2
3139
-
2696
+
3140
2697
  timestamps = traces.map(&:timestamp).sort
3141
2698
  (timestamps.last - timestamps.first).to_f
3142
2699
  end
@@ -3147,9 +2704,9 @@ module DSPy
3147
2704
  module GEPAFeedbackMetric
3148
2705
  extend T::Sig
3149
2706
  extend T::Helpers
3150
-
2707
+
3151
2708
  interface!
3152
-
2709
+
3153
2710
  # Evaluates prediction and provides score with optional feedback
3154
2711
  sig do
3155
2712
  abstract
@@ -3166,11 +2723,11 @@ module DSPy
3166
2723
  # Extended prediction result with score and feedback
3167
2724
  class ScoreWithFeedback < T::Struct
3168
2725
  extend T::Sig
3169
-
2726
+
3170
2727
  const :score, Float
3171
2728
  const :feedback, T.nilable(String)
3172
2729
  const :prediction, DSPy::Prediction
3173
-
2730
+
3174
2731
  sig { params(score: Float, prediction: DSPy::Prediction, feedback: T.nilable(String)).void }
3175
2732
  def initialize(score:, prediction:, feedback: nil)
3176
2733
  super
@@ -3180,7 +2737,7 @@ module DSPy
3180
2737
  # Module Evaluator - Evaluates DSPy modules with metrics and feedback
3181
2738
  class ModuleEvaluator
3182
2739
  extend T::Sig
3183
-
2740
+
3184
2741
  sig do
3185
2742
  params(
3186
2743
  student: T.untyped, # DSPy::Module or similar callable
@@ -3224,9 +2781,9 @@ module DSPy
3224
2781
  def evaluate_batch(batch, candidate_instruction, capture_traces: true)
3225
2782
  program = build_program(candidate_instruction)
3226
2783
  results = []
3227
-
2784
+
3228
2785
  batch.each do |example|
3229
- begin
2786
+ begin
3230
2787
  # Execute program on example
3231
2788
  prediction = if program.respond_to?(:call)
3232
2789
  program.call(**example.input_values)
@@ -3235,11 +2792,11 @@ module DSPy
3235
2792
  else
3236
2793
  raise "Program must respond to :call or :forward"
3237
2794
  end
3238
-
2795
+
3239
2796
  # Get collected traces (if trace collection is enabled)
3240
2797
  # Note: TraceCollector automatically collects via event subscriptions
3241
2798
  traces = capture_traces ? @trace_collector.traces : []
3242
-
2799
+
3243
2800
  # Evaluate with metric
3244
2801
  # Try with traces first (for GEPAFeedbackMetric), fallback to standard metric
3245
2802
  begin
@@ -3257,7 +2814,7 @@ module DSPy
3257
2814
  raise arg_error
3258
2815
  end
3259
2816
  end
3260
-
2817
+
3261
2818
  # Ensure we always have a ScoreWithFeedback object
3262
2819
  if score_result.is_a?(ScoreWithFeedback)
3263
2820
  results << score_result
@@ -3269,14 +2826,14 @@ module DSPy
3269
2826
  feedback: nil
3270
2827
  )
3271
2828
  end
3272
-
2829
+
3273
2830
  rescue => e
3274
2831
  DSPy.logger.error("Evaluation error: #{e.message}")
3275
2832
  # Return zero score on failure
3276
2833
  results << 0.0
3277
2834
  end
3278
2835
  end
3279
-
2836
+
3280
2837
  results
3281
2838
  end
3282
2839
 
@@ -3292,21 +2849,21 @@ module DSPy
3292
2849
  end
3293
2850
  def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
3294
2851
  reflective_data = []
3295
-
2852
+
3296
2853
  examples.zip(predictions, scores).each do |example, prediction, score|
3297
2854
  # Extract score value
3298
2855
  score_value = score.is_a?(ScoreWithFeedback) ? score.score : score
3299
-
2856
+
3300
2857
  # Include failed predictions (below threshold)
3301
2858
  next if score_value >= threshold
3302
-
2859
+
3303
2860
  # Extract feedback if available
3304
2861
  feedback = if score.is_a?(ScoreWithFeedback) && score.feedback
3305
2862
  score.feedback
3306
2863
  else
3307
2864
  "Low performance (score: #{score_value.round(2)})"
3308
2865
  end
3309
-
2866
+
3310
2867
  reflective_data << {
3311
2868
  'input' => example.input_values,
3312
2869
  'expected' => example.expected_values,
@@ -3315,7 +2872,7 @@ module DSPy
3315
2872
  'feedback' => feedback
3316
2873
  }
3317
2874
  end
3318
-
2875
+
3319
2876
  reflective_data
3320
2877
  end
3321
2878
 
@@ -3358,32 +2915,32 @@ module DSPy
3358
2915
  end
3359
2916
  def analyze_failures_and_propose(current_instruction, reflective_dataset)
3360
2917
  return [current_instruction] if reflective_dataset.empty?
3361
-
2918
+
3362
2919
  # Extract common failure patterns
3363
2920
  feedback_texts = reflective_dataset.map { |data| data['feedback'] }.compact
3364
-
2921
+
3365
2922
  # Simple heuristic-based proposals
3366
2923
  proposals = []
3367
-
2924
+
3368
2925
  # If many failures, suggest more detailed instruction
3369
2926
  if reflective_dataset.size >= 3
3370
2927
  proposals << "#{current_instruction} Please provide step-by-step reasoning."
3371
2928
  end
3372
-
2929
+
3373
2930
  # If feedback mentions specific issues, address them
3374
2931
  if feedback_texts.any? { |fb| fb.include?('unclear') || fb.include?('ambiguous') }
3375
2932
  proposals << "#{current_instruction} Be specific and clear in your response."
3376
2933
  end
3377
-
2934
+
3378
2935
  if feedback_texts.any? { |fb| fb.include?('incomplete') || fb.include?('missing') }
3379
2936
  proposals << "#{current_instruction} Ensure your answer is complete and addresses all aspects."
3380
2937
  end
3381
-
2938
+
3382
2939
  # Always include at least one proposal
3383
2940
  proposals << "#{current_instruction.strip}. Think carefully before responding." if proposals.empty?
3384
-
2941
+
3385
2942
  proposals.uniq.take(3) # Return up to 3 proposals
3386
2943
  end
3387
2944
  end
3388
2945
  end
3389
- end
2946
+ end