dspy 0.22.1 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'ostruct'
3
4
  require 'sorbet-runtime'
4
5
  require_relative 'teleprompter'
6
+ require_relative '../events/subscriber_mixin'
5
7
 
6
8
  module DSPy
7
9
  module Teleprompt
@@ -11,6 +13,26 @@ module DSPy
11
13
  class GEPA < Teleprompter
12
14
  extend T::Sig
13
15
 
16
+ # Enum for mutation operation types
17
+ class MutationType < T::Enum
18
+ enums do
19
+ Rewrite = new
20
+ Expand = new
21
+ Simplify = new
22
+ Combine = new
23
+ Rephrase = new
24
+ end
25
+ end
26
+
27
+ # Enum for crossover operation types
28
+ class CrossoverType < T::Enum
29
+ enums do
30
+ Uniform = new
31
+ Blend = new
32
+ Structured = new
33
+ end
34
+ end
35
+
14
36
  # Immutable execution trace record using Ruby's Data class
15
37
  # Captures execution events for GEPA's reflective analysis
16
38
  class ExecutionTrace < Data.define(
@@ -537,51 +559,2176 @@ module DSPy
537
559
  timestamps = traces.map(&:timestamp).sort
538
560
  (timestamps.last - timestamps.first).to_f
539
561
  end
540
- end
562
+
563
+ # LLM-based reflection methods for Phase 2
564
+
565
+ public
566
+
567
+ # Perform LLM-based reflection on execution traces using DSPy::Predict
568
+ sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
569
+ def reflect_with_llm(traces)
570
+ return reflect_on_traces(traces) if traces.empty?
571
+
572
+ begin
573
+ # Use DSPy::Predict for analysis instead of raw prompts
574
+ prediction = analyze_traces_with_dspy(traces)
575
+ convert_prediction_to_reflection_result(prediction, traces)
576
+ rescue => e
577
+ # Fallback to rule-based analysis on LLM failure
578
+ fallback_result = reflect_on_traces(traces)
579
+ fallback_result.class.new(
580
+ trace_id: fallback_result.trace_id,
581
+ diagnosis: "LLM reflection failed (#{e.message}), using fallback analysis: #{fallback_result.diagnosis}",
582
+ improvements: fallback_result.improvements,
583
+ confidence: [fallback_result.confidence * 0.5, 0.5].min,
584
+ reasoning: "Fallback to rule-based analysis after LLM error: #{fallback_result.reasoning}",
585
+ suggested_mutations: fallback_result.suggested_mutations,
586
+ metadata: fallback_result.metadata.merge(
587
+ llm_error: e.message,
588
+ fallback_used: true
589
+ )
590
+ )
591
+ end
592
+ end
593
+
594
+ # Generate structured reflection prompt for LLM (public API)
595
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
596
+ def generate_reflection_prompt(traces)
597
+ if traces.empty?
598
+ return <<~PROMPT
599
+ You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
600
+
601
+ **Task**: Analyze execution patterns and provide optimization recommendations.
602
+
603
+ **Context**: No execution traces available.
604
+
605
+ Please provide your analysis in the following JSON format:
606
+ {
607
+ "diagnosis": "Brief description of what you observed",
608
+ "improvements": ["List of actionable improvement suggestions"],
609
+ "confidence": 0.0,
610
+ "reasoning": "Your reasoning process",
611
+ "suggested_mutations": ["expand", "rewrite", "simplify", "combine", "rephrase"],
612
+ "insights": {
613
+ "pattern_detected": "no_data",
614
+ "optimization_opportunity": "data_collection"
615
+ }
616
+ }
617
+ PROMPT
618
+ end
619
+
620
+ summary = trace_summary_for_reflection(traces)
621
+ insights = extract_optimization_insights(traces)
622
+
623
+ <<~PROMPT
624
+ You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
625
+
626
+ **Task**: Analyze execution patterns and provide optimization recommendations for prompt evolution.
627
+
628
+ **Execution Summary**:
629
+ #{summary}
630
+
631
+ **Optimization Context**:
632
+ - This is part of a genetic algorithm for prompt optimization
633
+ - Available mutation types: rewrite, expand, simplify, combine, rephrase
634
+ - Goal is to improve prompt effectiveness through iterative evolution
635
+ - Focus on actionable insights that can guide mutation and crossover operations
636
+
637
+ **Key Optimization Insights**:
638
+ #{insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")}
639
+
640
+ **Sample Traces**:
641
+ #{format_traces_for_prompt(traces.take(3))}
642
+
643
+ Please analyze these execution patterns and provide optimization recommendations in the following JSON format:
644
+ {
645
+ "diagnosis": "Brief description of execution patterns and issues identified",
646
+ "improvements": ["List of 2-4 specific, actionable improvement suggestions"],
647
+ "confidence": 0.85,
648
+ "reasoning": "Your detailed reasoning process for the analysis",
649
+ "suggested_mutations": ["List of 2-3 mutation types that would be most beneficial"],
650
+ "insights": {
651
+ "pattern_detected": "primary_pattern_identified",
652
+ "optimization_opportunity": "key_area_for_improvement"
653
+ }
654
+ }
655
+
656
+ Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
657
+ PROMPT
658
+ end
659
+
660
+ # Parse LLM reflection response into ReflectionResult (public API)
661
+ sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
662
+ def parse_llm_reflection(response_text, original_traces)
663
+ reflection_id = generate_reflection_id
664
+
665
+ begin
666
+ parsed = JSON.parse(response_text)
667
+
668
+ # Extract and validate components
669
+ diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
670
+ improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
671
+ confidence = [parsed['confidence'].to_f, 1.0].min
672
+ reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
673
+
674
+ # Validate and sanitize mutation suggestions
675
+ raw_mutations = Array(parsed['suggested_mutations'])
676
+ valid_mutations = raw_mutations.filter_map do |mut|
677
+ mutation_symbol = mut.to_s.downcase.to_sym
678
+ if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
679
+ mutation_symbol
680
+ end
681
+ end.uniq
682
+
683
+ # Ensure we have at least one valid mutation suggestion
684
+ valid_mutations = [:rewrite] if valid_mutations.empty?
685
+
686
+ ReflectionResult.new(
687
+ trace_id: reflection_id,
688
+ diagnosis: diagnosis,
689
+ improvements: improvements,
690
+ confidence: confidence,
691
+ reasoning: reasoning,
692
+ suggested_mutations: valid_mutations,
693
+ metadata: {
694
+ reflection_model: @config.reflection_lm,
695
+ analysis_timestamp: Time.now,
696
+ trace_count: original_traces.size,
697
+ token_usage: estimate_token_usage(response_text),
698
+ llm_based: true,
699
+ insights: parsed['insights'] || {}
700
+ }
701
+ )
702
+
703
+ rescue JSON::ParserError => e
704
+ # Handle malformed JSON response
705
+ ReflectionResult.new(
706
+ trace_id: reflection_id,
707
+ diagnosis: "LLM reflection JSON parsing error: #{e.message}",
708
+ improvements: ['Review prompt structure and LLM response format'],
709
+ confidence: 0.3,
710
+ reasoning: "Failed to parse LLM reflection response as valid JSON",
711
+ suggested_mutations: [:rewrite],
712
+ metadata: {
713
+ reflection_model: @config.reflection_lm,
714
+ analysis_timestamp: Time.now,
715
+ trace_count: original_traces.size,
716
+ token_usage: 0,
717
+ parsing_error: e.message,
718
+ raw_response: response_text.length > 500 ? "#{response_text[0..500]}..." : response_text
719
+ }
720
+ )
721
+ end
722
+ end
723
+
724
+ # Create comprehensive trace summary for reflection (public API)
725
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
726
+ def trace_summary_for_reflection(traces)
727
+ return "No execution traces available" if traces.empty?
728
+
729
+ llm_traces = traces.select(&:llm_trace?)
730
+ module_traces = traces.select(&:module_trace?)
731
+
732
+ total_tokens = llm_traces.sum(&:token_usage)
733
+ unique_models = llm_traces.map(&:model_name).compact.uniq
734
+ timespan = calculate_timespan(traces)
735
+
736
+ avg_response_length = if llm_traces.any?
737
+ total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
738
+ total_length / llm_traces.size
739
+ else
740
+ 0
741
+ end
742
+
743
+ <<~SUMMARY
744
+ Total traces: #{traces.size}
745
+ LLM interactions: #{llm_traces.size}
746
+ Module calls: #{module_traces.size}
747
+ Total tokens: #{total_tokens}
748
+ Models used: #{unique_models.join(', ')}
749
+ Average response length: #{avg_response_length} characters
750
+ Execution timespan: #{timespan.round(2)} seconds
751
+ SUMMARY
752
+ end
753
+
754
+ # Extract optimization insights from trace analysis (public API)
755
+ sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
756
+ def extract_optimization_insights(traces)
757
+ llm_traces = traces.select(&:llm_trace?)
758
+
759
+ insights = {
760
+ token_efficiency: analyze_token_efficiency(llm_traces),
761
+ response_quality: analyze_response_quality(llm_traces),
762
+ model_consistency: analyze_model_consistency(llm_traces)
763
+ }
764
+
765
+ insights
766
+ end
767
+
768
+ # Reflection with optimization context (public API)
769
+ sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
770
+ def reflection_with_context(traces, context)
771
+ base_result = reflect_with_llm(traces)
772
+
773
+ # Incorporate context into reasoning
774
+ context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
775
+ context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
776
+
777
+ if context[:current_best_score]
778
+ context_reasoning += "Current best score: #{context[:current_best_score]}. "
779
+ end
780
+
781
+ # Adjust mutation suggestions based on history
782
+ adjusted_mutations = adjust_mutations_for_history(
783
+ base_result.suggested_mutations,
784
+ context[:mutation_history] || [],
785
+ context[:recent_performance_trend]
786
+ )
787
+
788
+ ReflectionResult.new(
789
+ trace_id: base_result.trace_id,
790
+ diagnosis: base_result.diagnosis,
791
+ improvements: base_result.improvements,
792
+ confidence: base_result.confidence,
793
+ reasoning: context_reasoning + base_result.reasoning,
794
+ suggested_mutations: adjusted_mutations,
795
+ metadata: base_result.metadata.merge(optimization_context: context)
796
+ )
797
+ end
798
+
799
+ # LLM-based reflection methods for Phase 2
800
+
801
+ public
802
+
803
+ # Perform LLM-based reflection on execution traces using DSPy::Predict
804
+ sig { params(traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
805
+ def reflect_with_llm(traces)
806
+ return reflect_on_traces(traces) if traces.empty?
807
+
808
+ begin
809
+ # Use DSPy::Predict for analysis instead of raw prompts
810
+ prediction = analyze_traces_with_dspy(traces)
811
+ convert_prediction_to_reflection_result(prediction, traces)
812
+ rescue => e
813
+ # Fallback to rule-based analysis on LLM failure
814
+ fallback_result = reflect_on_traces(traces)
815
+ fallback_result.class.new(
816
+ trace_id: fallback_result.trace_id,
817
+ diagnosis: "LLM reflection failed (#{e.message}), using fallback analysis: #{fallback_result.diagnosis}",
818
+ improvements: fallback_result.improvements,
819
+ confidence: [fallback_result.confidence * 0.5, 0.5].min,
820
+ reasoning: "Fallback to rule-based analysis after LLM error: #{fallback_result.reasoning}",
821
+ suggested_mutations: fallback_result.suggested_mutations,
822
+ metadata: fallback_result.metadata.merge(
823
+ llm_error: e.message,
824
+ fallback_used: true
825
+ )
826
+ )
827
+ end
828
+ end
829
+
830
+ # Generate structured reflection prompt for LLM (public API)
831
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
832
+ def generate_reflection_prompt(traces)
833
+ if traces.empty?
834
+ return <<~PROMPT
835
+ You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
836
+
837
+ **Task**: Analyze execution patterns and provide optimization recommendations.
838
+
839
+ **Context**: No execution traces available.
840
+
841
+ Please provide your analysis in the following JSON format:
842
+ {
843
+ "diagnosis": "Brief description of what you observed",
844
+ "improvements": ["List of actionable improvement suggestions"],
845
+ "confidence": 0.0,
846
+ "reasoning": "Your reasoning process",
847
+ "suggested_mutations": ["expand", "rewrite", "simplify", "combine", "rephrase"],
848
+ "insights": {
849
+ "pattern_detected": "no_data",
850
+ "optimization_opportunity": "data_collection"
851
+ }
852
+ }
853
+ PROMPT
854
+ end
855
+
856
+ summary = trace_summary_for_reflection(traces)
857
+ insights = extract_optimization_insights(traces)
858
+
859
+ <<~PROMPT
860
+ You are analyzing execution traces for a genetic algorithm-based prompt optimization system called GEPA.
861
+
862
+ **Task**: Analyze execution patterns and provide optimization recommendations for prompt evolution.
863
+
864
+ **Execution Summary**:
865
+ #{summary}
866
+
867
+ **Optimization Context**:
868
+ - This is part of a genetic algorithm for prompt optimization
869
+ - Available mutation types: rewrite, expand, simplify, combine, rephrase
870
+ - Goal is to improve prompt effectiveness through iterative evolution
871
+ - Focus on actionable insights that can guide mutation and crossover operations
872
+
873
+ **Key Optimization Insights**:
874
+ #{insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")}
875
+
876
+ **Sample Traces**:
877
+ #{format_traces_for_prompt(traces.take(3))}
878
+
879
+ Please analyze these execution patterns and provide optimization recommendations in the following JSON format:
880
+ {
881
+ "diagnosis": "Brief description of execution patterns and issues identified",
882
+ "improvements": ["List of 2-4 specific, actionable improvement suggestions"],
883
+ "confidence": 0.85,
884
+ "reasoning": "Your detailed reasoning process for the analysis",
885
+ "suggested_mutations": ["List of 2-3 mutation types that would be most beneficial"],
886
+ "insights": {
887
+ "pattern_detected": "primary_pattern_identified",
888
+ "optimization_opportunity": "key_area_for_improvement"
889
+ }
890
+ }
891
+
892
+ Focus on practical recommendations that will improve prompt performance through genetic algorithm evolution.
893
+ PROMPT
894
+ end
895
+
896
+ # Parse LLM reflection response into ReflectionResult (public API)
897
+ sig { params(response_text: String, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
898
+ def parse_llm_reflection(response_text, original_traces)
899
+ reflection_id = generate_reflection_id
900
+
901
+ begin
902
+ parsed = JSON.parse(response_text)
903
+
904
+ # Extract and validate components
905
+ diagnosis = parsed['diagnosis'] || 'LLM reflection analysis'
906
+ improvements = Array(parsed['improvements']).select { |i| i.is_a?(String) && !i.strip.empty? }
907
+ confidence = [parsed['confidence'].to_f, 1.0].min
908
+ reasoning = parsed['reasoning'] || 'LLM-based analysis of execution traces'
909
+
910
+ # Validate and sanitize mutation suggestions
911
+ raw_mutations = Array(parsed['suggested_mutations'])
912
+ valid_mutations = raw_mutations.filter_map do |mut|
913
+ mutation_symbol = mut.to_s.downcase.to_sym
914
+ if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
915
+ mutation_symbol
916
+ end
917
+ end.uniq
918
+
919
+ # Ensure we have at least one valid mutation suggestion
920
+ valid_mutations = [:rewrite] if valid_mutations.empty?
921
+
922
+ ReflectionResult.new(
923
+ trace_id: reflection_id,
924
+ diagnosis: diagnosis,
925
+ improvements: improvements,
926
+ confidence: confidence,
927
+ reasoning: reasoning,
928
+ suggested_mutations: valid_mutations,
929
+ metadata: {
930
+ reflection_model: @config.reflection_lm,
931
+ analysis_timestamp: Time.now,
932
+ trace_count: original_traces.size,
933
+ token_usage: estimate_token_usage(response_text),
934
+ llm_based: true,
935
+ insights: parsed['insights'] || {}
936
+ }
937
+ )
938
+
939
+ rescue JSON::ParserError => e
940
+ # Handle malformed JSON response
941
+ ReflectionResult.new(
942
+ trace_id: reflection_id,
943
+ diagnosis: "LLM reflection JSON parsing error: #{e.message}",
944
+ improvements: ['Review prompt structure and LLM response format'],
945
+ confidence: 0.3,
946
+ reasoning: "Failed to parse LLM reflection response as valid JSON",
947
+ suggested_mutations: [:rewrite],
948
+ metadata: {
949
+ reflection_model: @config.reflection_lm,
950
+ analysis_timestamp: Time.now,
951
+ trace_count: original_traces.size,
952
+ token_usage: 0,
953
+ parsing_error: e.message,
954
+ raw_response: response_text.length > 500 ? "#{response_text[0..500]}..." : response_text
955
+ }
956
+ )
957
+ end
958
+ end
959
+
960
+ # Create comprehensive trace summary for reflection (public API)
961
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
962
+ def trace_summary_for_reflection(traces)
963
+ return "No execution traces available" if traces.empty?
964
+
965
+ llm_traces = traces.select(&:llm_trace?)
966
+ module_traces = traces.select(&:module_trace?)
967
+
968
+ total_tokens = llm_traces.sum(&:token_usage)
969
+ unique_models = llm_traces.map(&:model_name).compact.uniq
970
+ timespan = calculate_timespan(traces)
971
+
972
+ avg_response_length = if llm_traces.any?
973
+ total_length = llm_traces.sum { |t| t.response_text&.length || 0 }
974
+ total_length / llm_traces.size
975
+ else
976
+ 0
977
+ end
978
+
979
+ <<~SUMMARY
980
+ Total traces: #{traces.size}
981
+ LLM interactions: #{llm_traces.size}
982
+ Module calls: #{module_traces.size}
983
+ Total tokens: #{total_tokens}
984
+ Models used: #{unique_models.join(', ')}
985
+ Average response length: #{avg_response_length} characters
986
+ Execution timespan: #{timespan.round(2)} seconds
987
+ SUMMARY
988
+ end
989
+
990
+ # Extract optimization insights from trace analysis (public API)
991
+ sig { params(traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
992
+ def extract_optimization_insights(traces)
993
+ llm_traces = traces.select(&:llm_trace?)
994
+
995
+ insights = {
996
+ token_efficiency: analyze_token_efficiency(llm_traces),
997
+ response_quality: analyze_response_quality(llm_traces),
998
+ model_consistency: analyze_model_consistency(llm_traces)
999
+ }
1000
+
1001
+ insights
1002
+ end
1003
+
1004
+ # Reflection with optimization context (public API)
1005
+ sig { params(traces: T::Array[ExecutionTrace], context: T::Hash[Symbol, T.untyped]).returns(ReflectionResult) }
1006
+ def reflection_with_context(traces, context)
1007
+ base_result = reflect_with_llm(traces)
1008
+
1009
+ # Incorporate context into reasoning
1010
+ context_reasoning = "Generation #{context[:generation] || 'unknown'} analysis. "
1011
+ context_reasoning += "Population size: #{context[:population_size] || 'unknown'}. "
1012
+
1013
+ if context[:current_best_score]
1014
+ context_reasoning += "Current best score: #{context[:current_best_score]}. "
1015
+ end
1016
+
1017
+ # Adjust mutation suggestions based on history
1018
+ adjusted_mutations = adjust_mutations_for_history(
1019
+ base_result.suggested_mutations,
1020
+ context[:mutation_history] || [],
1021
+ context[:recent_performance_trend]
1022
+ )
1023
+
1024
+ ReflectionResult.new(
1025
+ trace_id: base_result.trace_id,
1026
+ diagnosis: base_result.diagnosis,
1027
+ improvements: base_result.improvements,
1028
+ confidence: base_result.confidence,
1029
+ reasoning: context_reasoning + base_result.reasoning,
1030
+ suggested_mutations: adjusted_mutations,
1031
+ metadata: base_result.metadata.merge(optimization_context: context)
1032
+ )
1033
+ end
1034
+
1035
+ public
1036
+
1037
+ # Create signature for trace reflection analysis (public API)
1038
+ sig { returns(T.class_of(DSPy::Signature)) }
1039
+ def create_trace_reflection_signature
1040
+ @trace_reflection_signature ||= Class.new(DSPy::Signature) do
1041
+ description "Analyze execution traces from GEPA optimization system and provide actionable optimization insights"
1042
+
1043
+ input do
1044
+ const :execution_summary, String, description: "Summary of execution traces and performance patterns"
1045
+ const :optimization_context, String, description: "Context about the genetic algorithm optimization goals"
1046
+ const :key_insights, String, description: "Key insights extracted from trace analysis"
1047
+ const :sample_traces, String, description: "Representative execution trace samples"
1048
+ end
1049
+
1050
+ output do
1051
+ const :diagnosis, String, description: "Brief description of execution patterns and issues identified"
1052
+ const :improvements, T::Array[String], description: "List of 2-4 specific actionable improvement suggestions"
1053
+ const :confidence, Float, description: "Confidence level in analysis (0.0 to 1.0)"
1054
+ const :reasoning, String, description: "Detailed reasoning process for the analysis"
1055
+ const :suggested_mutations, T::Array[String], description: "List of 2-3 most beneficial mutation types from: rewrite, expand, simplify, combine, rephrase"
1056
+ const :pattern_detected, String, description: "Primary pattern identified in execution traces"
1057
+ const :optimization_opportunity, String, description: "Key area identified for performance improvement"
1058
+ end
1059
+ end
1060
+ end
541
1061
 
542
- # Configuration for GEPA optimization
543
- class GEPAConfig < Config
544
- extend T::Sig
1062
+ # Perform LLM analysis using DSPy::Predict (public API)
1063
+ sig { params(traces: T::Array[ExecutionTrace]).returns(DSPy::Prediction) }
1064
+ def analyze_traces_with_dspy(traces)
1065
+ predictor = DSPy::Predict.new(create_trace_reflection_signature)
1066
+
1067
+ # Prepare input data
1068
+ summary = trace_summary_for_reflection(traces)
1069
+ insights = extract_optimization_insights(traces)
1070
+ insights_text = insights.map { |k, v| "- #{k}: #{v.is_a?(Hash) ? v.values.join(', ') : v}" }.join("\n")
1071
+
1072
+ # Get LLM analysis
1073
+ predictor.call(
1074
+ execution_summary: summary,
1075
+ optimization_context: "GEPA genetic algorithm for prompt optimization. Available mutations: rewrite, expand, simplify, combine, rephrase. Goal: improve prompt effectiveness through iterative evolution.",
1076
+ key_insights: insights_text,
1077
+ sample_traces: format_traces_for_prompt(traces.take(3))
1078
+ )
1079
+ end
545
1080
 
1081
+ # Convert DSPy prediction to ReflectionResult (public API)
1082
+ sig { params(prediction: DSPy::Prediction, original_traces: T::Array[ExecutionTrace]).returns(ReflectionResult) }
1083
+ def convert_prediction_to_reflection_result(prediction, original_traces)
1084
+ reflection_id = generate_reflection_id
1085
+
1086
+ # Extract and validate prediction results
1087
+ diagnosis = prediction.diagnosis || 'DSPy reflection analysis'
1088
+ improvements = Array(prediction.improvements).select { |i| i.is_a?(String) && !i.strip.empty? }
1089
+ confidence = [[prediction.confidence&.to_f || 0.0, 1.0].min, 0.0].max
1090
+ reasoning = prediction.reasoning || 'DSPy-based analysis of execution traces'
1091
+
1092
+ # Validate mutation suggestions
1093
+ valid_mutations = Array(prediction.suggested_mutations).filter_map do |mut|
1094
+ mutation_symbol = mut.to_s.downcase.to_sym
1095
+ if [:rewrite, :expand, :simplify, :combine, :rephrase].include?(mutation_symbol)
1096
+ mutation_symbol
1097
+ end
1098
+ end.uniq
1099
+
1100
+ # Ensure we have at least one valid mutation suggestion
1101
+ valid_mutations = [:rewrite] if valid_mutations.empty?
1102
+
1103
+ ReflectionResult.new(
1104
+ trace_id: reflection_id,
1105
+ diagnosis: diagnosis,
1106
+ improvements: improvements,
1107
+ confidence: confidence,
1108
+ reasoning: reasoning,
1109
+ suggested_mutations: valid_mutations,
1110
+ metadata: {
1111
+ reflection_model: @config.reflection_lm,
1112
+ analysis_timestamp: Time.now,
1113
+ trace_count: original_traces.size,
1114
+ token_usage: estimate_token_usage(prediction.to_s),
1115
+ llm_based: true,
1116
+ dspy_prediction: true,
1117
+ insights: {
1118
+ pattern_detected: prediction.pattern_detected || "unknown_pattern",
1119
+ optimization_opportunity: prediction.optimization_opportunity || "general_optimization"
1120
+ }
1121
+ }
1122
+ )
1123
+ end
1124
+
1125
+ private
1126
+
1127
+ # Generate unique reflection ID
546
1128
  sig { returns(String) }
547
- attr_accessor :reflection_lm
548
-
549
- sig { returns(Integer) }
550
- attr_accessor :num_generations
551
-
552
- sig { returns(Integer) }
553
- attr_accessor :population_size
1129
+ def generate_reflection_id
1130
+ "reflection-#{SecureRandom.hex(4)}"
1131
+ end
554
1132
 
555
- sig { returns(Float) }
556
- attr_accessor :mutation_rate
1133
+ # Generate diagnosis text
1134
+ sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(String) }
1135
+ def generate_diagnosis(patterns)
1136
+ if patterns[:total_tokens] > 400
1137
+ 'High token usage indicates potential inefficiency in prompt design'
1138
+ elsif patterns[:llm_traces_count] == 0
1139
+ 'No LLM interactions found - execution may not be working as expected'
1140
+ elsif patterns[:avg_response_length] < 10
1141
+ 'Responses are unusually brief which may indicate prompt clarity issues'
1142
+ else
1143
+ 'Execution patterns appear normal with room for optimization'
1144
+ end
1145
+ end
557
1146
 
558
- sig { returns(T::Boolean) }
559
- attr_accessor :use_pareto_selection
1147
+ # Generate reasoning text
1148
+ sig { params(patterns: T::Hash[Symbol, T.untyped], traces: T::Array[ExecutionTrace]).returns(String) }
1149
+ def generate_reasoning(patterns, traces)
1150
+ reasoning_parts = []
1151
+
1152
+ reasoning_parts << "Analyzed #{traces.size} execution traces"
1153
+ reasoning_parts << "#{patterns[:llm_traces_count]} LLM interactions"
1154
+ reasoning_parts << "#{patterns[:module_traces_count]} module operations"
1155
+ reasoning_parts << "Total token usage: #{patterns[:total_tokens]}"
1156
+
1157
+ reasoning_parts.join('. ') + '.'
1158
+ end
560
1159
 
561
- sig { void }
562
- def initialize
563
- super
564
- @reflection_lm = 'gpt-4o'
565
- @num_generations = 10
566
- @population_size = 8
567
- @mutation_rate = 0.7
568
- @use_pareto_selection = true
1160
+ # Calculate confidence based on patterns
1161
+ sig { params(patterns: T::Hash[Symbol, T.untyped]).returns(Float) }
1162
+ def calculate_confidence(patterns)
1163
+ base_confidence = 0.7
1164
+
1165
+ # More traces = higher confidence
1166
+ trace_bonus = [patterns[:llm_traces_count] + patterns[:module_traces_count], 10].min * 0.02
1167
+
1168
+ # Reasonable token usage = higher confidence
1169
+ token_penalty = patterns[:total_tokens] > 1000 ? -0.1 : 0.0
1170
+
1171
+ [(base_confidence + trace_bonus + token_penalty), 1.0].min
569
1172
  end
570
1173
 
571
- sig { returns(T::Hash[Symbol, T.untyped]) }
572
- def to_h
573
- super.merge({
574
- reflection_lm: @reflection_lm,
575
- num_generations: @num_generations,
576
- population_size: @population_size,
577
- mutation_rate: @mutation_rate,
578
- use_pareto_selection: @use_pareto_selection
579
- })
1174
+ # Calculate average response length from LLM traces
1175
+ sig { params(llm_traces: T::Array[ExecutionTrace]).returns(Integer) }
1176
+ def calculate_avg_response_length(llm_traces)
1177
+ return 0 if llm_traces.empty?
1178
+
1179
+ total_length = llm_traces.sum do |trace|
1180
+ response = trace.response_text
1181
+ response ? response.length : 0
1182
+ end
1183
+
1184
+ total_length / llm_traces.size
580
1185
  end
581
- end
582
1186
 
583
- sig { returns(GEPAConfig) }
584
- attr_reader :config
1187
+ # Calculate timespan of traces
1188
+ sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
1189
+ def calculate_timespan(traces)
1190
+ return 0.0 if traces.size < 2
1191
+
1192
+ timestamps = traces.map(&:timestamp).sort
1193
+ (timestamps.last - timestamps.first).to_f
1194
+ end
1195
+
1196
+
1197
+ # Format traces for inclusion in prompt
1198
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
1199
+ def format_traces_for_prompt(traces)
1200
+ traces.map.with_index do |trace, idx|
1201
+ prompt_preview = truncate_text(trace.prompt_text || 'N/A', 100)
1202
+ response_preview = truncate_text(trace.response_text || 'N/A', 100)
1203
+ "#{idx + 1}. [#{trace.event_name}] #{prompt_preview} → #{response_preview}"
1204
+ end.join("\n")
1205
+ end
1206
+
1207
+ # Estimate token usage from response
1208
+ sig { params(text: String).returns(Integer) }
1209
+ def estimate_token_usage(text)
1210
+ # Rough estimation: ~4 characters per token
1211
+ (text.length / 4.0).ceil
1212
+ end
1213
+
1214
+ # Analyze token efficiency patterns
1215
+ sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1216
+ def analyze_token_efficiency(llm_traces)
1217
+ return { status: 'no_data', suggestions: [] } if llm_traces.empty?
1218
+
1219
+ total_tokens = llm_traces.sum(&:token_usage)
1220
+ avg_tokens = total_tokens.to_f / llm_traces.size
1221
+
1222
+ if avg_tokens > 400
1223
+ {
1224
+ status: 'poor',
1225
+ average_tokens: avg_tokens,
1226
+ suggestions: ['Consider reducing prompt length', 'Optimize instruction clarity']
1227
+ }
1228
+ elsif avg_tokens > 200
1229
+ {
1230
+ status: 'moderate',
1231
+ average_tokens: avg_tokens,
1232
+ suggestions: ['Monitor token usage trends', 'Consider prompt optimization']
1233
+ }
1234
+ else
1235
+ {
1236
+ status: 'good',
1237
+ average_tokens: avg_tokens,
1238
+ suggestions: ['Token usage appears efficient']
1239
+ }
1240
+ end
1241
+ end
1242
+
1243
+ # Analyze response quality patterns
1244
+ sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1245
+ def analyze_response_quality(llm_traces)
1246
+ return { consistency: 'no_data', recommendations: [] } if llm_traces.empty?
1247
+
1248
+ response_lengths = llm_traces.map { |t| t.response_text&.length || 0 }
1249
+ length_variance = calculate_variance(response_lengths)
1250
+
1251
+ if length_variance > 1000
1252
+ {
1253
+ consistency: 'inconsistent',
1254
+ variance: length_variance,
1255
+ recommendations: [
1256
+ 'Add response format guidelines',
1257
+ 'Consider structured output templates'
1258
+ ]
1259
+ }
1260
+ else
1261
+ {
1262
+ consistency: 'consistent',
1263
+ variance: length_variance,
1264
+ recommendations: ['Response quality appears consistent']
1265
+ }
1266
+ end
1267
+ end
1268
+
1269
+ # Analyze model consistency
1270
+ sig { params(llm_traces: T::Array[ExecutionTrace]).returns(T::Hash[Symbol, T.untyped]) }
1271
+ def analyze_model_consistency(llm_traces)
1272
+ models = llm_traces.map(&:model_name).compact.uniq
1273
+
1274
+ {
1275
+ unique_models: models.size,
1276
+ models_used: models,
1277
+ recommendation: models.size > 1 ? 'Consider using single model for consistency' : 'Model usage is consistent'
1278
+ }
1279
+ end
1280
+
1281
+ # Adjust mutations based on history to avoid repetition
1282
+ sig { params(suggested: T::Array[Symbol], history: T::Array[Symbol], trend: T.nilable(String)).returns(T::Array[Symbol]) }
1283
+ def adjust_mutations_for_history(suggested, history, trend)
1284
+ # Count recent usage of each mutation type
1285
+ recent_usage = history.last(5).tally
1286
+
1287
+ # Filter out overused mutations
1288
+ adjusted = suggested.reject do |mutation|
1289
+ recent_usage[mutation] && recent_usage[mutation] >= 2
1290
+ end
1291
+
1292
+ # If trend is declining, prefer different strategies
1293
+ if trend == 'declining'
1294
+ adjusted = adjusted.reject { |m| m == :expand } # Avoid expansion if performance declining
1295
+ adjusted += [:simplify, :rephrase] unless adjusted.include?(:simplify) || adjusted.include?(:rephrase)
1296
+ end
1297
+
1298
+ # Ensure we always have at least one suggestion
1299
+ adjusted.empty? ? [:rewrite] : adjusted.uniq
1300
+ end
1301
+
1302
+ # Calculate variance for array of numbers
1303
+ sig { params(values: T::Array[Integer]).returns(Float) }
1304
+ def calculate_variance(values)
1305
+ return 0.0 if values.size < 2
1306
+
1307
+ mean = values.sum.to_f / values.size
1308
+ sum_squared_diff = values.sum { |v| (v - mean) ** 2 }
1309
+ sum_squared_diff / values.size
1310
+ end
1311
+
1312
+ # Truncate text to specified length with ellipsis
1313
+ sig { params(text: String, length: Integer).returns(String) }
1314
+ def truncate_text(text, length)
1315
+ return text if text.length <= length
1316
+ "#{text[0...length]}..."
1317
+ end
1318
+ end
1319
+
1320
+ # GeneticEngine orchestrates the genetic algorithm for prompt evolution
1321
+ # Manages population, selection, and evolution across generations
1322
+ class GeneticEngine
1323
+ extend T::Sig
1324
+
1325
+ sig { returns(GEPAConfig) }
1326
+ attr_reader :config
1327
+
1328
+ sig { returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)) }
1329
+ attr_reader :metric
1330
+
1331
+ sig { returns(T::Array[T.untyped]) }
1332
+ attr_reader :population
1333
+
1334
+ sig { returns(Integer) }
1335
+ attr_reader :generation
1336
+
1337
+ sig { params(config: GEPAConfig, metric: T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)).void }
1338
+ def initialize(config:, metric:)
1339
+ @config = config
1340
+ @metric = metric
1341
+ @population = T.let([], T::Array[T.untyped])
1342
+ @generation = 0
1343
+ @fitness_scores = T.let([], T::Array[Float])
1344
+ end
1345
+
1346
+ # Initialize population with diverse instruction variants
1347
+ sig { params(program: T.untyped).void }
1348
+ def initialize_population(program)
1349
+ @population = []
1350
+
1351
+ # Start with original program
1352
+ @population << program
1353
+
1354
+ # Generate instruction variants to fill population
1355
+ original_instruction = program.signature_class.description
1356
+ variants = generate_instruction_variants(original_instruction)
1357
+
1358
+ # Create program copies with different instructions
1359
+ variants.take(@config.population_size - 1).each do |variant|
1360
+ variant_program = create_program_with_instruction(program, variant)
1361
+ @population << variant_program
1362
+ end
1363
+
1364
+ # If we need more candidates, duplicate and mutate
1365
+ while @population.size < @config.population_size
1366
+ base_program = @population.sample
1367
+ mutated = create_program_with_instruction(base_program,
1368
+ generate_instruction_variants(base_program.signature_class.description).first)
1369
+ @population << mutated
1370
+ end
1371
+
1372
+ @generation = 0
1373
+ end
1374
+
1375
+ # Evaluate all population members on the training set
1376
+ sig { params(trainset: T::Array[T.untyped]).returns(T::Array[Float]) }
1377
+ def evaluate_population(trainset)
1378
+ @fitness_scores = @population.map do |candidate|
1379
+ scores = trainset.map do |example|
1380
+ prediction = candidate.call(**example.input_values)
1381
+ @metric.call(example, prediction).to_f
1382
+ rescue => e
1383
+ # Handle evaluation errors gracefully
1384
+ 0.0
1385
+ end
1386
+
1387
+ scores.sum / scores.size
1388
+ end
1389
+
1390
+ @fitness_scores
1391
+ end
1392
+
1393
+ # Evolve to next generation using selection and mutation
1394
+ sig { params(trainset: T::Array[T.untyped]).void }
1395
+ def evolve_generation(trainset)
1396
+ current_scores = evaluate_population(trainset)
1397
+
1398
+ # Simple selection: keep top 50% and mutate them
1399
+ sorted_indices = (0...@population.size).sort_by { |i| -current_scores[i] }
1400
+ survivors = sorted_indices.take(@config.population_size / 2)
1401
+
1402
+ new_population = []
1403
+
1404
+ # Keep best performers
1405
+ survivors.each { |i| new_population << @population[i] }
1406
+
1407
+ # Fill rest with mutations of survivors
1408
+ while new_population.size < @config.population_size
1409
+ parent_index = survivors.sample
1410
+ parent = @population[parent_index]
1411
+
1412
+ # Generate mutation
1413
+ variants = generate_instruction_variants(parent.signature_class.description)
1414
+ mutated = create_program_with_instruction(parent, variants.first || parent.signature_class.description)
1415
+ new_population << mutated
1416
+ end
1417
+
1418
+ @population = new_population
1419
+ @generation += 1
1420
+ end
1421
+
1422
+ # Run complete evolution process
1423
+ sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
1424
+ def run_evolution(program, trainset)
1425
+ initialize_population(program)
1426
+
1427
+ history = []
1428
+
1429
+ # Initial evaluation
1430
+ initial_scores = evaluate_population(trainset)
1431
+ history << {
1432
+ generation: 0,
1433
+ best_fitness: initial_scores.max,
1434
+ avg_fitness: initial_scores.sum / initial_scores.size,
1435
+ diversity: population_diversity
1436
+ }
1437
+
1438
+ # Evolution loop
1439
+ @config.num_generations.times do
1440
+ evolve_generation(trainset)
1441
+ scores = evaluate_population(trainset)
1442
+
1443
+ history << {
1444
+ generation: @generation,
1445
+ best_fitness: scores.max,
1446
+ avg_fitness: scores.sum / scores.size,
1447
+ diversity: population_diversity
1448
+ }
1449
+ end
1450
+
1451
+ {
1452
+ best_candidate: get_best_candidate,
1453
+ best_fitness: @fitness_scores.max,
1454
+ generation_history: history,
1455
+ final_population: @population.dup
1456
+ }
1457
+ end
1458
+
1459
+ # Get the best performing candidate from current population
1460
+ sig { returns(T.untyped) }
1461
+ def get_best_candidate
1462
+ return @population.first if @fitness_scores.empty?
1463
+
1464
+ best_index = @fitness_scores.each_with_index.max_by { |score, _| score }[1]
1465
+ @population[best_index]
1466
+ end
1467
+
1468
+ # Measure diversity of instructions in current population
1469
+ sig { returns(Float) }
1470
+ def population_diversity
1471
+ return 0.0 if @population.empty?
1472
+
1473
+ instructions = @population.map(&:signature_class).map(&:description)
1474
+ unique_instructions = instructions.uniq.size
1475
+
1476
+ unique_instructions.to_f / @population.size.to_f
1477
+ end
1478
+
1479
+ private
1480
+
1481
+ # Generate instruction variants (similar to simple optimization)
1482
+ sig { params(original_instruction: String).returns(T::Array[String]) }
1483
+ def generate_instruction_variants(original_instruction)
1484
+ variants = []
1485
+
1486
+ # Add "step by step" variant
1487
+ unless original_instruction.include?("step")
1488
+ variants << "#{original_instruction} Think step by step."
1489
+ end
1490
+
1491
+ # Add "detailed" variant
1492
+ unless original_instruction.include?("detail")
1493
+ variants << "#{original_instruction} Provide detailed reasoning."
1494
+ end
1495
+
1496
+ # Add "careful" variant
1497
+ unless original_instruction.include?("careful")
1498
+ variants << "Be careful and accurate. #{original_instruction}"
1499
+ end
1500
+
1501
+ # Add "examples" variant
1502
+ unless original_instruction.include?("example")
1503
+ variants << "#{original_instruction} Use examples in your response."
1504
+ end
1505
+
1506
+ # Add "precise" variant
1507
+ unless original_instruction.include?("precise")
1508
+ variants << "Be precise and specific. #{original_instruction}"
1509
+ end
1510
+
1511
+ variants.shuffle.take(5) # Return up to 5 variants, shuffled
1512
+ end
1513
+
1514
+ # Create program copy with modified instruction using DSPy.rb dynamic capabilities
1515
+ sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
1516
+ def create_program_with_instruction(original_program, new_instruction)
1517
+ case original_program
1518
+ when DSPy::Predict
1519
+ # DSPy::Predict has built-in support for instruction modification
1520
+ original_program.with_instruction(new_instruction)
1521
+ when DSPy::Module
1522
+ # For custom DSPy::Module classes, create new instance with updated predictors
1523
+ create_modified_module(original_program, new_instruction)
1524
+ else
1525
+ # For other types (like test doubles), check available methods
1526
+ if original_program.respond_to?(:with_instruction)
1527
+ original_program.with_instruction(new_instruction)
1528
+ elsif original_program.respond_to?(:signature_class)
1529
+ # Create new DSPy::Predict with the same signature but new instruction
1530
+ signature_class = original_program.signature_class
1531
+ DSPy::Predict.new(signature_class).with_instruction(new_instruction)
1532
+ else
1533
+ # Fallback: return original if we can't modify
1534
+ original_program
1535
+ end
1536
+ end
1537
+ rescue => e
1538
+ # Return original program on error
1539
+ original_program
1540
+ end
1541
+
1542
+ # Create modified version of custom DSPy::Module (for GeneticEngine)
1543
+ sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
1544
+ def create_modified_module(original_module, new_instruction)
1545
+ begin
1546
+ # Create a new instance of the same class
1547
+ new_module = original_module.class.new
1548
+
1549
+ # Try to find and update any internal predictors
1550
+ original_module.instance_variables.each do |var_name|
1551
+ var_value = original_module.instance_variable_get(var_name)
1552
+
1553
+ if var_value.is_a?(DSPy::Predict)
1554
+ # Update the instruction for internal predictors
1555
+ modified_predictor = var_value.with_instruction(new_instruction)
1556
+ new_module.instance_variable_set(var_name, modified_predictor)
1557
+ else
1558
+ # Copy other instance variables as-is
1559
+ new_module.instance_variable_set(var_name, var_value)
1560
+ end
1561
+ end
1562
+
1563
+ new_module
1564
+ rescue => e
1565
+ # Fallback to original module
1566
+ original_module
1567
+ end
1568
+ end
1569
+ end
1570
+
1571
+ # FitnessScore represents multi-dimensional evaluation results
1572
+ class FitnessScore < T::Struct
1573
+ extend T::Sig
1574
+
1575
+ const :primary_score, Float
1576
+ const :secondary_scores, T::Hash[Symbol, Float]
1577
+ const :overall_score, Float
1578
+ const :metadata, T::Hash[Symbol, T.untyped]
1579
+
1580
+ sig do
1581
+ params(
1582
+ primary_score: Float,
1583
+ secondary_scores: T::Hash[Symbol, Float],
1584
+ overall_score: Float,
1585
+ metadata: T.nilable(T::Hash[Symbol, T.untyped])
1586
+ ).void
1587
+ end
1588
+ def initialize(primary_score:, secondary_scores:, overall_score:, metadata: nil)
1589
+ # Validate score ranges
1590
+ [primary_score, overall_score].each do |score|
1591
+ if score < 0.0 || score > 1.0
1592
+ raise ArgumentError, "Score must be between 0.0 and 1.0, got #{score}"
1593
+ end
1594
+ end
1595
+
1596
+ secondary_scores.each do |name, score|
1597
+ if score < 0.0 || score > 1.0
1598
+ raise ArgumentError, "Secondary score #{name} must be between 0.0 and 1.0, got #{score}"
1599
+ end
1600
+ end
1601
+
1602
+ super(
1603
+ primary_score: primary_score,
1604
+ secondary_scores: secondary_scores.freeze,
1605
+ overall_score: overall_score,
1606
+ metadata: (metadata || {}).freeze
1607
+ )
1608
+ end
1609
+
1610
+ # Check if this score is dominated by another (for Pareto analysis)
1611
+ sig { params(other: FitnessScore).returns(T::Boolean) }
1612
+ def dominated_by?(other)
1613
+ return false if overall_score > other.overall_score
1614
+ return true if overall_score < other.overall_score
1615
+
1616
+ # If overall scores are equal, check secondary metrics
1617
+ secondary_scores.all? do |metric, score|
1618
+ other_score = other.secondary_scores[metric] || 0.0
1619
+ score <= other_score
1620
+ end
1621
+ end
1622
+
1623
+ # Get combined score for specific objectives
1624
+ sig { params(objectives: T::Array[Symbol]).returns(Float) }
1625
+ def score_for_objectives(objectives)
1626
+ relevant_scores = objectives.map { |obj| secondary_scores[obj] || 0.0 }
1627
+ return primary_score if relevant_scores.empty?
1628
+
1629
+ (primary_score + relevant_scores.sum) / (objectives.size + 1)
1630
+ end
1631
+ end
1632
+
1633
+ # FitnessEvaluator provides multi-dimensional evaluation of prompt candidates
1634
+ class FitnessEvaluator
1635
+ extend T::Sig
1636
+
1637
+ sig { returns(T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)) }
1638
+ attr_reader :primary_metric
1639
+
1640
+ sig { returns(GEPAConfig) }
1641
+ attr_reader :config
1642
+
1643
+ sig { returns(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)]) }
1644
+ attr_reader :secondary_metrics
1645
+
1646
+ sig do
1647
+ params(
1648
+ primary_metric: T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped),
1649
+ config: GEPAConfig,
1650
+ secondary_metrics: T.nilable(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)])
1651
+ ).void
1652
+ end
1653
+ def initialize(primary_metric:, config:, secondary_metrics: nil)
1654
+ @primary_metric = primary_metric
1655
+ @config = config
1656
+ @secondary_metrics = secondary_metrics || default_secondary_metrics
1657
+ @trace_collector = TraceCollector.new
1658
+ end
1659
+
1660
+ # Evaluate a single candidate program
1661
+ sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(FitnessScore) }
1662
+ def evaluate_candidate(program, trainset)
1663
+ start_time = Time.now
1664
+ predictions = []
1665
+ traces = []
1666
+
1667
+ # Collect primary metric scores and execution data
1668
+ primary_scores = trainset.map do |example|
1669
+ prediction_start = Time.now
1670
+ prediction = program.call(**example.input_values)
1671
+ prediction_time = Time.now - prediction_start
1672
+
1673
+ predictions << {
1674
+ prediction: prediction,
1675
+ latency: prediction_time,
1676
+ example: example
1677
+ }
1678
+
1679
+ @primary_metric.call(example, prediction).to_f
1680
+ rescue => e
1681
+ # Handle prediction errors
1682
+ predictions << {
1683
+ prediction: nil,
1684
+ latency: 0.0,
1685
+ example: example,
1686
+ error: e.message
1687
+ }
1688
+ 0.0
1689
+ end
1690
+
1691
+ primary_score = primary_scores.sum / primary_scores.size
1692
+
1693
+ # Calculate secondary metrics
1694
+ secondary_scores = {}
1695
+
1696
+ # Token efficiency (mock data for now - will be replaced with real trace collection)
1697
+ mock_traces = predictions.map.with_index do |pred, i|
1698
+ OpenStruct.new(token_usage: 50 + rand(100))
1699
+ end
1700
+ secondary_scores[:token_efficiency] = calculate_token_efficiency(mock_traces, predictions.size)
1701
+
1702
+ # Response consistency - use first output field for any signature
1703
+ response_texts = predictions.map do |p|
1704
+ pred = p[:prediction]
1705
+ if pred && pred.respond_to?(:class) && pred.class.respond_to?(:props)
1706
+ # Get first output field name and value
1707
+ first_field = pred.class.props.keys.first
1708
+ first_field ? (pred.send(first_field)&.to_s || '') : ''
1709
+ else
1710
+ ''
1711
+ end
1712
+ end
1713
+ secondary_scores[:consistency] = calculate_consistency(response_texts)
1714
+
1715
+ # Latency performance
1716
+ latencies = predictions.map { |p| p[:latency] }
1717
+ secondary_scores[:latency] = calculate_latency_score(latencies)
1718
+
1719
+ # Calculate weighted overall score
1720
+ overall_score = calculate_overall_score(primary_score, secondary_scores)
1721
+
1722
+ FitnessScore.new(
1723
+ primary_score: primary_score,
1724
+ secondary_scores: secondary_scores,
1725
+ overall_score: overall_score,
1726
+ metadata: {
1727
+ evaluation_time: Time.now - start_time,
1728
+ examples_count: trainset.size,
1729
+ errors_count: predictions.count { |p| p[:error] }
1730
+ }
1731
+ )
1732
+ end
1733
+
1734
+ # Evaluate multiple candidates in batch
1735
+ sig { params(programs: T::Array[T.untyped], trainset: T::Array[T.untyped]).returns(T::Array[FitnessScore]) }
1736
+ def batch_evaluate(programs, trainset)
1737
+ programs.map { |program| evaluate_candidate(program, trainset) }
1738
+ end
1739
+
1740
+ # Compare two fitness scores (positive if first is better)
1741
+ sig { params(score1: FitnessScore, score2: FitnessScore).returns(Float) }
1742
+ def compare_candidates(score1, score2)
1743
+ score1.overall_score - score2.overall_score
1744
+ end
1745
+
1746
+ # Rank candidates by fitness (returns indices sorted by fitness, best first)
1747
+ sig { params(scores: T::Array[FitnessScore]).returns(T::Array[Integer]) }
1748
+ def rank_candidates(scores)
1749
+ scores.each_with_index.sort_by { |score, _| -score.overall_score }.map(&:last)
1750
+ end
1751
+
1752
+ private
1753
+
1754
+ # Default secondary metrics for fitness evaluation
1755
+ sig { returns(T::Hash[Symbol, T.proc.params(arg0: T.untyped, arg1: T.untyped).returns(T.untyped)]) }
1756
+ def default_secondary_metrics
1757
+ {
1758
+ token_efficiency: proc { |traces, count| calculate_token_efficiency(traces, count) },
1759
+ consistency: proc { |responses| calculate_consistency(responses) },
1760
+ latency: proc { |latencies| calculate_latency_score(latencies) }
1761
+ }
1762
+ end
1763
+
1764
+ # Calculate token usage efficiency (lower usage = higher score)
1765
+ sig { params(traces: T::Array[T.untyped], example_count: Integer).returns(Float) }
1766
+ def calculate_token_efficiency(traces, example_count)
1767
+ return 1.0 if traces.empty? || example_count == 0
1768
+
1769
+ total_tokens = traces.sum(&:token_usage)
1770
+ avg_tokens_per_example = total_tokens.to_f / example_count
1771
+
1772
+ # Efficiency decreases as token usage increases
1773
+ # Assume 100 tokens per example is baseline (score 0.5)
1774
+ baseline_tokens = 100.0
1775
+ efficiency = baseline_tokens / (baseline_tokens + avg_tokens_per_example)
1776
+
1777
+ [efficiency, 1.0].min
1778
+ end
1779
+
1780
+ # Calculate consistency of responses (similar structure = higher score)
1781
+ sig { params(responses: T::Array[String]).returns(Float) }
1782
+ def calculate_consistency(responses)
1783
+ return 1.0 if responses.empty? || responses.size == 1
1784
+
1785
+ # Simple consistency measure: average word overlap between responses
1786
+ word_sets = responses.map { |response| response.downcase.split.to_set }
1787
+
1788
+ total_similarity = 0.0
1789
+ comparisons = 0
1790
+
1791
+ word_sets.each_with_index do |set1, i|
1792
+ word_sets[(i+1)..-1].each do |set2|
1793
+ intersection = set1 & set2
1794
+ union = set1 | set2
1795
+
1796
+ similarity = union.empty? ? 0.0 : intersection.size.to_f / union.size
1797
+ total_similarity += similarity
1798
+ comparisons += 1
1799
+ end
1800
+ end
1801
+
1802
+ comparisons == 0 ? 1.0 : total_similarity / comparisons
1803
+ end
1804
+
1805
+ # Calculate latency performance score (faster = higher score)
1806
+ sig { params(latencies: T::Array[Float]).returns(Float) }
1807
+ def calculate_latency_score(latencies)
1808
+ return 1.0 if latencies.empty?
1809
+
1810
+ avg_latency = latencies.sum / latencies.size
1811
+
1812
+ # Penalize high latencies (assume 2 seconds is baseline for 0.5 score)
1813
+ baseline_latency = 2.0
1814
+ latency_score = baseline_latency / (baseline_latency + avg_latency)
1815
+
1816
+ [latency_score, 1.0].min
1817
+ end
1818
+
1819
+ # Calculate weighted overall score combining primary and secondary metrics
1820
+ sig { params(primary_score: Float, secondary_scores: T::Hash[Symbol, Float]).returns(Float) }
1821
+ def calculate_overall_score(primary_score, secondary_scores)
1822
+ # Weight primary metric at 70%, secondary metrics at 30%
1823
+ primary_weight = 0.7
1824
+ secondary_weight = 0.3
1825
+
1826
+ return primary_score if secondary_scores.empty?
1827
+
1828
+ avg_secondary = secondary_scores.values.sum / secondary_scores.size
1829
+ overall = (primary_score * primary_weight) + (avg_secondary * secondary_weight)
1830
+
1831
+ [overall, 1.0].min
1832
+ end
1833
+ end
1834
+
1835
+ # InstructionProposer: Analyzes execution traces and generates improved instructions using LLM reflection
1836
+ class InstructionProposer
1837
+ extend T::Sig
1838
+
1839
+ sig { params(config: GEPAConfig).void }
1840
+ def initialize(config:)
1841
+ @config = config
1842
+ end
1843
+
1844
+ # Generate improved instruction based on execution traces and failures
1845
+ sig { params(original_instruction: String, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(String) }
1846
+ def propose_instruction(original_instruction:, execution_traces:, failed_examples:)
1847
+ if execution_traces.empty? && failed_examples.empty?
1848
+ # No traces or failures to analyze, return original
1849
+ return original_instruction
1850
+ end
1851
+
1852
+ # Use LLM-based reflection to generate improved instruction
1853
+ reflect_and_propose(
1854
+ original_instruction: original_instruction,
1855
+ execution_traces: execution_traces,
1856
+ failed_examples: failed_examples
1857
+ )
1858
+ rescue => e
1859
+ # Fallback to original instruction on error
1860
+ original_instruction
1861
+ end
1862
+
1863
+ private
1864
+
1865
+ sig { returns(GEPAConfig) }
1866
+ attr_reader :config
1867
+
1868
+ # Use LLM reflection to propose improved instruction
1869
+ sig { params(original_instruction: String, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(String) }
1870
+ def reflect_and_propose(original_instruction:, execution_traces:, failed_examples:)
1871
+ # Create signature for instruction improvement
1872
+ improvement_signature = create_instruction_improvement_signature
1873
+
1874
+ # Create predictor for instruction proposal
1875
+ proposer = DSPy::Predict.new(improvement_signature)
1876
+
1877
+ # Analyze traces and failures
1878
+ trace_analysis = analyze_execution_traces(execution_traces)
1879
+ failure_analysis = analyze_failed_examples(failed_examples)
1880
+
1881
+ # Generate improved instruction
1882
+ result = proposer.call(
1883
+ original_instruction: original_instruction,
1884
+ trace_analysis: trace_analysis,
1885
+ failure_analysis: failure_analysis,
1886
+ improvement_context: "GEPA prompt optimization for better performance"
1887
+ )
1888
+
1889
+ result.improved_instruction || original_instruction
1890
+ rescue => e
1891
+ # Return original instruction if LLM call fails
1892
+ original_instruction
1893
+ end
1894
+
1895
+ # Create signature for instruction improvement
1896
+ sig { returns(T.class_of(DSPy::Signature)) }
1897
+ def create_instruction_improvement_signature
1898
+ Class.new(DSPy::Signature) do
1899
+ description "Analyze execution traces and propose improved instructions for better AI system performance"
1900
+
1901
+ input do
1902
+ const :original_instruction, String, description: "The current instruction/prompt being used"
1903
+ const :trace_analysis, String, description: "Analysis of execution traces showing patterns and issues"
1904
+ const :failure_analysis, String, description: "Analysis of failed examples and their patterns"
1905
+ const :improvement_context, String, description: "Context about what kind of improvement is needed"
1906
+ end
1907
+
1908
+ output do
1909
+ const :improved_instruction, String, description: "Improved instruction that addresses identified issues"
1910
+ const :reasoning, String, description: "Explanation of why this improvement should work better"
1911
+ const :confidence, Float, description: "Confidence in the improvement (0.0-1.0)"
1912
+ end
1913
+ end
1914
+ end
1915
+
1916
+ # Analyze execution traces to identify patterns
1917
+ sig { params(traces: T::Array[ExecutionTrace]).returns(String) }
1918
+ def analyze_execution_traces(traces)
1919
+ return "No execution traces available" if traces.empty?
1920
+
1921
+ llm_traces = traces.select(&:llm_trace?)
1922
+ module_traces = traces.select(&:module_trace?)
1923
+
1924
+ analysis = []
1925
+ analysis << "Execution Trace Analysis:"
1926
+ analysis << "- Total traces: #{traces.size}"
1927
+ analysis << "- LLM interactions: #{llm_traces.size}"
1928
+ analysis << "- Module calls: #{module_traces.size}"
1929
+
1930
+ if llm_traces.any?
1931
+ token_usage = llm_traces.sum(&:token_usage)
1932
+ avg_response_length = llm_traces.map { |t| t.attributes['response']&.to_s&.length || 0 }.sum / llm_traces.size
1933
+
1934
+ analysis << "- Total tokens used: #{token_usage}"
1935
+ analysis << "- Average response length: #{avg_response_length} characters"
1936
+
1937
+ # Identify models used
1938
+ models = llm_traces.map { |t| t.attributes['gen_ai.request.model'] }.compact.uniq
1939
+ analysis << "- Models used: #{models.join(', ')}" if models.any?
1940
+ end
1941
+
1942
+ # Analyze timing patterns
1943
+ if traces.size > 1
1944
+ timespan = traces.max_by(&:timestamp).timestamp - traces.min_by(&:timestamp).timestamp
1945
+ analysis << "- Execution timespan: #{timespan.round(2)} seconds"
1946
+ end
1947
+
1948
+ analysis.join("\n")
1949
+ end
1950
+
1951
+ # Analyze failed examples to identify failure patterns
1952
+ sig { params(failed_examples: T::Array[T.untyped]).returns(String) }
1953
+ def analyze_failed_examples(failed_examples)
1954
+ return "No failed examples to analyze" if failed_examples.empty?
1955
+
1956
+ analysis = []
1957
+ analysis << "Failure Pattern Analysis:"
1958
+ analysis << "- Failed examples count: #{failed_examples.size}"
1959
+
1960
+ # Group failures by type if possible
1961
+ if failed_examples.first.respond_to?(:input)
1962
+ input_patterns = failed_examples.map { |ex| ex.input.keys }.flatten.uniq
1963
+ analysis << "- Input fields involved: #{input_patterns.join(', ')}"
1964
+ end
1965
+
1966
+ # Sample some failure cases for context
1967
+ sample_size = [failed_examples.size, 3].min
1968
+ analysis << "- Sample failures:"
1969
+ failed_examples.take(sample_size).each_with_index do |example, idx|
1970
+ if example.respond_to?(:input) && example.respond_to?(:expected_values)
1971
+ input_summary = example.input.values.first.to_s[0..50] + "..."
1972
+ expected = example.expected_values.values.first.to_s[0..30] + "..."
1973
+ analysis << " #{idx + 1}. Input: #{input_summary} | Expected: #{expected}"
1974
+ end
1975
+ end
1976
+
1977
+ analysis.join("\n")
1978
+ end
1979
+ end
1980
+
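# A minimal usage sketch for the dynamically built improvement signature above,
# assuming an LM has already been configured for DSPy; the argument values are
# placeholders.
improvement_signature = Class.new(DSPy::Signature) do
  description "Analyze execution traces and propose improved instructions"

  input do
    const :original_instruction, String, description: "The current instruction"
    const :trace_analysis, String, description: "Summary of execution traces"
    const :failure_analysis, String, description: "Summary of failed examples"
    const :improvement_context, String, description: "What kind of improvement is needed"
  end

  output do
    const :improved_instruction, String, description: "Proposed replacement instruction"
    const :reasoning, String, description: "Why the proposal should work better"
    const :confidence, Float, description: "Confidence in the proposal (0.0-1.0)"
  end
end

proposer = DSPy::Predict.new(improvement_signature)
result = proposer.call(
  original_instruction: "Answer the question",
  trace_analysis: "Execution Trace Analysis:\n- Total traces: 12",
  failure_analysis: "Failure Pattern Analysis:\n- Failed examples count: 3",
  improvement_context: "GEPA prompt optimization for better performance"
)
result.improved_instruction # => the proposed replacement instruction text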
1981
+ # MutationEngine: Handles LLM-based prompt transformations for genetic evolution
1982
+ class MutationEngine
1983
+ extend T::Sig
1984
+
1985
+ sig { returns(GEPAConfig) }
1986
+ attr_reader :config
1987
+
1988
+ sig { returns(InstructionProposer) }
1989
+ attr_reader :instruction_proposer
1990
+
1991
+ sig { params(config: GEPAConfig).void }
1992
+ def initialize(config:)
1993
+ @config = config
1994
+ @instruction_proposer = InstructionProposer.new(config: config)
1995
+ end
1996
+
1997
+ # Mutate a single program with LLM-based instruction proposal
1998
+ sig { params(program: T.untyped, execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T.untyped) }
1999
+ def mutate_program(program, execution_traces: [], failed_examples: [])
2000
+ return program if rand > @config.mutation_rate
2001
+
2002
+ begin
2003
+ original_instruction = extract_instruction(program)
2004
+
2005
+ # Use LLM-based instruction proposal instead of hardcoded mutations
2006
+ improved_instruction = @instruction_proposer.propose_instruction(
2007
+ original_instruction: original_instruction,
2008
+ execution_traces: execution_traces,
2009
+ failed_examples: failed_examples
2010
+ )
2011
+
2012
+ create_mutated_program(program, improved_instruction)
2013
+ rescue => e
2014
+ emit_event('mutation_error', {
2015
+ error: e.message,
2016
+ program_type: program.class.name
2017
+ })
2018
+ # Return original program on mutation failure
2019
+ program
2020
+ end
2021
+ end
2022
+
2023
+ # Batch mutation of multiple programs with shared execution context
2024
+ sig { params(programs: T::Array[T.untyped], execution_traces: T::Array[ExecutionTrace], failed_examples: T::Array[T.untyped]).returns(T::Array[T.untyped]) }
2025
+ def batch_mutate(programs, execution_traces: [], failed_examples: [])
2026
+ return [] if programs.empty?
2027
+
2028
+ programs.map { |program| mutate_program(program, execution_traces: execution_traces, failed_examples: failed_examples) }
2029
+ end
2030
+
2031
+ # Emit events for logging and monitoring
2032
+ sig { params(event_name: String, data: T::Hash[Symbol, T.untyped]).void }
2033
+ def emit_event(event_name, data = {})
2034
+ # For now, just a placeholder - could integrate with DSPy event system
2035
+ # In full implementation, this would emit events for monitoring
2036
+ end
2037
+
2038
+ private
2039
+
2040
+ # Extract instruction text from program
2041
+ sig { params(program: T.untyped).returns(String) }
2042
+ def extract_instruction(program)
2043
+ if program.signature_class&.description
2044
+ program.signature_class.description
2045
+ else
2046
+ "Analyze the input and complete the task accurately"
2047
+ end
2048
+ end
2049
+
2050
+ # Apply specific mutation type to instruction
2051
+ sig { params(instruction: String, mutation_type: MutationType).returns(String) }
2052
+ def apply_mutation(instruction, mutation_type)
2053
+ case mutation_type
2054
+ when MutationType::Rewrite
2055
+ apply_rewrite_mutation(instruction)
2056
+ when MutationType::Expand
2057
+ apply_expand_mutation(instruction)
2058
+ when MutationType::Simplify
2059
+ apply_simplify_mutation(instruction)
2060
+ when MutationType::Combine
2061
+ apply_combine_mutation(instruction)
2062
+ when MutationType::Rephrase
2063
+ apply_rephrase_mutation(instruction)
2064
+ else
2065
+ instruction
2066
+ end
2067
+ end
2068
+
2069
+ # Rewrite the instruction with different phrasing
2070
+ sig { params(instruction: String).returns(String) }
2071
+ def apply_rewrite_mutation(instruction)
2072
+ # Simple rewrite patterns for now - in full implementation would use LLM
2073
+ patterns = [
2074
+ -> (inst) { "Carefully #{inst.downcase}" },
2075
+ -> (inst) { "Please #{inst.downcase}" },
2076
+ -> (inst) { "#{inst} with precision" }
2077
+ ]
2078
+
2079
+ patterns.sample.call(instruction)
2080
+ end
2081
+
2082
+ # Expand instruction with additional context
2083
+ sig { params(instruction: String).returns(String) }
2084
+ def apply_expand_mutation(instruction)
2085
+ expansions = [
2086
+ "Think step by step.",
2087
+ "Provide detailed reasoning.",
2088
+ "Consider all aspects carefully.",
2089
+ "Explain your thought process."
2090
+ ]
2091
+
2092
+ "#{instruction} #{expansions.sample}"
2093
+ end
2094
+
2095
+ # Simplify instruction by removing complex terms
2096
+ sig { params(instruction: String).returns(String) }
2097
+ def apply_simplify_mutation(instruction)
2098
+ # Remove common complexity words
2099
+ simplified = instruction.gsub(/\b(carefully|detailed|comprehensive|thorough)\b/i, '')
2100
+ .gsub(/\s+/, ' ')
2101
+ .strip
2102
+
2103
+ simplified.empty? ? instruction : simplified
2104
+ end
2105
+
2106
+ # Combine instruction with complementary strategies
2107
+ sig { params(instruction: String).returns(String) }
2108
+ def apply_combine_mutation(instruction)
2109
+ strategies = [
2110
+ "Break down the problem systematically.",
2111
+ "Use logical reasoning.",
2112
+ "Apply domain knowledge.",
2113
+ "Consider edge cases."
2114
+ ]
2115
+
2116
+ "#{instruction} #{strategies.sample}"
2117
+ end
2118
+
2119
+ # Rephrase instruction with synonyms
2120
+ sig { params(instruction: String).returns(String) }
2121
+ def apply_rephrase_mutation(instruction)
2122
+ # Simple synonym replacement - in full implementation would use LLM
2123
+ synonyms = {
2124
+ 'solve' => 'resolve',
2125
+ 'answer' => 'respond to',
2126
+ 'analyze' => 'examine',
2127
+ 'calculate' => 'compute',
2128
+ 'determine' => 'identify'
2129
+ }
2130
+
2131
+ result = instruction.dup
2132
+ synonyms.each do |original, replacement|
2133
+ result.gsub!(/\b#{original}\b/i, replacement) if rand < 0.3
2134
+ end
2135
+
2136
+ result
2137
+ end
2138
+
2139
+ # Create new program with mutated instruction
2140
+ sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
2141
+ def create_mutated_program(original_program, new_instruction)
2142
+ case original_program
2143
+ when DSPy::Predict
2144
+ # DSPy::Predict has built-in support for instruction modification
2145
+ original_program.with_instruction(new_instruction)
2146
+ when DSPy::Module
2147
+ # For custom DSPy::Module classes, we need to create a new instance
2148
+ # and update any internal predictors that have instruction-based signatures
2149
+ create_mutated_module(original_program, new_instruction)
2150
+ else
2151
+ # For other types (like test doubles), check if they respond to with_instruction
2152
+ if original_program.respond_to?(:with_instruction)
2153
+ original_program.with_instruction(new_instruction)
2154
+ elsif original_program.respond_to?(:signature_class)
2155
+ # Try to create a new DSPy::Predict with the same signature but new instruction
2156
+ signature_class = original_program.signature_class
2157
+ DSPy::Predict.new(signature_class).with_instruction(new_instruction)
2158
+ else
2159
+ # Fallback: return original if we can't mutate
2160
+ emit_event('mutation_fallback', {
2161
+ program_type: original_program.class.name,
2162
+ reason: 'No mutation method available'
2163
+ })
2164
+ original_program
2165
+ end
2166
+ end
2167
+ rescue => e
2168
+ emit_event('mutation_error', {
2169
+ error: e.message,
2170
+ program_type: original_program.class.name,
2171
+ backtrace: e.backtrace&.first(3)
2172
+ })
2173
+ # Return original program on error
2174
+ original_program
2175
+ end
2176
+
2177
+ # Create mutated version of custom DSPy::Module
2178
+ sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
2179
+ def create_mutated_module(original_module, new_instruction)
2180
+ # For custom modules, we need to create a new instance
2181
+ # This is a simplified approach - in practice, modules might need
2182
+ # more sophisticated copying of their internal state
2183
+ begin
2184
+ # Create a new instance of the same class
2185
+ new_module = original_module.class.new
2186
+
2187
+ # Try to find and update any internal predictors
2188
+ original_module.instance_variables.each do |var_name|
2189
+ var_value = original_module.instance_variable_get(var_name)
2190
+
2191
+ if var_value.is_a?(DSPy::Predict)
2192
+ # Update the instruction for internal predictors
2193
+ mutated_predictor = var_value.with_instruction(new_instruction)
2194
+ new_module.instance_variable_set(var_name, mutated_predictor)
2195
+ else
2196
+ # Copy other instance variables as-is
2197
+ new_module.instance_variable_set(var_name, var_value)
2198
+ end
2199
+ end
2200
+
2201
+ new_module
2202
+ rescue => e
2203
+ emit_event('module_mutation_error', {
2204
+ error: e.message,
2205
+ module_class: original_module.class.name
2206
+ })
2207
+ # Fallback to original module
2208
+ original_module
2209
+ end
2210
+ end
2211
+
2212
+ # Select mutation type based on context and configuration
2213
+ sig { params(instruction: T.nilable(String)).returns(MutationType) }
2214
+ def select_mutation_type(instruction = nil)
2215
+ # Adaptive selection based on instruction characteristics
2216
+ if instruction && instruction.length < 20
2217
+ # Short instructions benefit from expansion
2218
+ [MutationType::Expand, MutationType::Combine].sample
2219
+ elsif instruction && instruction.length > 100
2220
+ # Long instructions benefit from simplification
2221
+ [MutationType::Simplify, MutationType::Rephrase].sample
2222
+ else
2223
+ # Balanced selection from all types
2224
+ @config.mutation_types.sample
2225
+ end
2226
+ end
2227
+
2228
+ # Calculate diversity of mutations applied
2229
+ sig { params(mutations: T::Array[MutationType]).returns(Float) }
2230
+ def mutation_diversity(mutations)
2231
+ return 0.0 if mutations.empty?
2232
+
2233
+ unique_types = mutations.uniq.size
2234
+ total_types = @config.mutation_types.size
2235
+
2236
+ unique_types.to_f / total_types
2237
+ end
2238
+ end
2239
+
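# A minimal sketch of driving the MutationEngine on its own, assuming a GEPAConfig
# (defined later in this file), an LM configured for DSPy, and a signature class
# QA defined elsewhere. Mutation is probabilistic (gated by config.mutation_rate),
# so the returned program may simply be the original predictor.
config = DSPy::Teleprompt::GEPA::GEPAConfig.new
config.mutation_rate = 1.0 # always attempt a mutation in this sketch

engine = DSPy::Teleprompt::GEPA::MutationEngine.new(config: config)
predictor = DSPy::Predict.new(QA)

mutated = engine.mutate_program(predictor, execution_traces: [], failed_examples: [])
# On success the result carries the LLM-proposed instruction (via with_instruction);
# on any failure the original predictor is returned unchanged.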
2240
+ # CrossoverEngine: Handles genetic recombination of prompts for diversity
2241
+ class CrossoverEngine
2242
+ extend T::Sig
2243
+
2244
+ # Struct for instruction components
2245
+ class InstructionComponents < T::Struct
2246
+ prop :action, String
2247
+ prop :modifiers, String
2248
+ end
2249
+
2250
+ sig { returns(GEPAConfig) }
2251
+ attr_reader :config
2252
+
2253
+ sig { params(config: GEPAConfig).void }
2254
+ def initialize(config:)
2255
+ @config = config
2256
+ end
2257
+
2258
+ # Perform crossover between two parent programs
2259
+ sig { params(parent_a: T.untyped, parent_b: T.untyped).returns(T::Array[T.untyped]) }
2260
+ def crossover_programs(parent_a, parent_b)
2261
+ return [parent_a, parent_b] if rand > @config.crossover_rate
2262
+
2263
+ begin
2264
+ instruction_a = extract_instruction(parent_a)
2265
+ instruction_b = extract_instruction(parent_b)
2266
+
2267
+ crossover_type = select_crossover_type(instruction_a, instruction_b)
2268
+ offspring_instructions = apply_crossover(instruction_a, instruction_b, crossover_type)
2269
+
2270
+ offspring = [
2271
+ create_crossover_program(parent_a, offspring_instructions[0]),
2272
+ create_crossover_program(parent_b, offspring_instructions[1])
2273
+ ]
2274
+
2275
+ offspring
2276
+ rescue => e
2277
+ # Return original parents on crossover failure
2278
+ [parent_a, parent_b]
2279
+ end
2280
+ end
2281
+
2282
+ # Batch crossover for entire population
2283
+ sig { params(population: T::Array[T.untyped]).returns(T::Array[T.untyped]) }
2284
+ def batch_crossover(population)
2285
+ return [] if population.empty?
2286
+ return [population.first] if population.size == 1
2287
+
2288
+ offspring = []
2289
+
2290
+ # Pair up population for crossover
2291
+ population.each_slice(2) do |pair|
2292
+ if pair.size == 2
2293
+ crossed = crossover_programs(pair[0], pair[1])
2294
+ offspring.concat(crossed)
2295
+ else
2296
+ offspring << pair[0] # Unpaired individual passes through
2297
+ end
2298
+ end
2299
+
2300
+ offspring
2301
+ end
2302
+
2303
+ private
2304
+
2305
+ # Extract instruction text from program
2306
+ sig { params(program: T.untyped).returns(String) }
2307
+ def extract_instruction(program)
2308
+ if program.signature_class&.description
2309
+ program.signature_class.description
2310
+ else
2311
+ "Analyze the input and complete the task accurately"
2312
+ end
2313
+ end
2314
+
2315
+ # Apply specific crossover type to two instructions
2316
+ sig { params(instruction_a: String, instruction_b: String, crossover_type: CrossoverType).returns(T::Array[String]) }
2317
+ def apply_crossover(instruction_a, instruction_b, crossover_type)
2318
+ case crossover_type
2319
+ when CrossoverType::Uniform
2320
+ uniform_crossover(instruction_a, instruction_b)
2321
+ when CrossoverType::Blend
2322
+ blend_crossover(instruction_a, instruction_b)
2323
+ when CrossoverType::Structured
2324
+ structured_crossover(instruction_a, instruction_b)
2325
+ else
2326
+ [instruction_a, instruction_b]
2327
+ end
2328
+ end
2329
+
2330
+ # Uniform crossover: Exchange elements randomly at word level
2331
+ sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2332
+ def uniform_crossover(instruction_a, instruction_b)
2333
+ return [instruction_a, instruction_b] if instruction_a == instruction_b
2334
+
2335
+ words_a = instruction_a.split
2336
+ words_b = instruction_b.split
2337
+
2338
+ # Create offspring by randomly selecting words from parents
2339
+ offspring_a_words = []
2340
+ offspring_b_words = []
2341
+
2342
+ max_length = [words_a.size, words_b.size].max
2343
+
2344
+ max_length.times do |i|
2345
+ word_a = words_a[i]
2346
+ word_b = words_b[i]
2347
+
2348
+ if rand < 0.5
2349
+ offspring_a_words << (word_a || word_b)
2350
+ offspring_b_words << (word_b || word_a)
2351
+ else
2352
+ offspring_a_words << (word_b || word_a)
2353
+ offspring_b_words << (word_a || word_b)
2354
+ end
2355
+ end
2356
+
2357
+ [
2358
+ offspring_a_words.compact.join(' '),
2359
+ offspring_b_words.compact.join(' ')
2360
+ ]
2361
+ end
2362
+
2363
+ # Blend crossover: Semantically combine instructions
2364
+ sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2365
+ def blend_crossover(instruction_a, instruction_b)
2366
+ # Simple blending patterns - in full implementation would use LLM
2367
+ patterns = [
2368
+ -> (a, b) { "#{a} and #{b}" },
2369
+ -> (a, b) { "#{a}, specifically #{b}" },
2370
+ -> (a, b) { "#{b} while #{a.downcase}" },
2371
+ -> (a, b) { "Combine #{a.downcase} with #{b.downcase}" }
2372
+ ]
2373
+
2374
+ pattern = patterns.sample
2375
+
2376
+ [
2377
+ pattern.call(instruction_a, instruction_b),
2378
+ pattern.call(instruction_b, instruction_a)
2379
+ ]
2380
+ end
2381
+
2382
+ # Structured crossover: Maintain grammatical and logical structure
2383
+ sig { params(instruction_a: String, instruction_b: String).returns(T::Array[String]) }
2384
+ def structured_crossover(instruction_a, instruction_b)
2385
+ # Extract structural components
2386
+ components_a = extract_components(instruction_a)
2387
+ components_b = extract_components(instruction_b)
2388
+
2389
+ # Cross structural components
2390
+ offspring_a = combine_components(components_a.action, components_b.modifiers)
2391
+ offspring_b = combine_components(components_b.action, components_a.modifiers)
2392
+
2393
+ [offspring_a, offspring_b]
2394
+ end
2395
+
2396
+ # Extract structural components from instruction
2397
+ sig { params(instruction: String).returns(InstructionComponents) }
2398
+ def extract_components(instruction)
2399
+ words = instruction.split
2400
+
2401
+ # Simple heuristic: first verb-like word is action, rest are modifiers
2402
+ action_idx = words.find_index { |word| verb_like?(word) } || 0
2403
+
2404
+ InstructionComponents.new(
2405
+ action: words[action_idx] || words.first || "complete",
2406
+ modifiers: (words - [words[action_idx]]).join(' ')
2407
+ )
2408
+ end
2409
+
2410
+ # Combine action and modifiers into coherent instruction
2411
+ sig { params(action: String, modifiers: String).returns(String) }
2412
+ def combine_components(action, modifiers)
2413
+ if modifiers.empty?
2414
+ "#{action.capitalize} the task"
2415
+ else
2416
+ "#{action.capitalize} #{modifiers}"
2417
+ end
2418
+ end
2419
+
2420
+ # Simple heuristic to identify verb-like words
2421
+ sig { params(word: String).returns(T::Boolean) }
2422
+ def verb_like?(word)
2423
+ verb_patterns = %w[solve answer calculate determine analyze compute resolve examine]
2424
+ verb_patterns.any? { |pattern| word.downcase.include?(pattern) }
2425
+ end
2426
+
2427
+ # Create new program with crossover instruction
2428
+ sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
2429
+ def create_crossover_program(original_program, new_instruction)
2430
+ # For now, return the original program as we don't modify instruction in place
2431
+ # In full implementation, would create new program instance with modified instruction
2432
+ original_program
2433
+ end
2434
+
2435
+ # Select crossover type based on instruction characteristics
2436
+ sig { params(instruction_a: T.nilable(String), instruction_b: T.nilable(String)).returns(CrossoverType) }
2437
+ def select_crossover_type(instruction_a = nil, instruction_b = nil)
2438
+ # Adaptive selection based on instruction characteristics
2439
+ if instruction_a && instruction_b
2440
+ combined_length = instruction_a.length + instruction_b.length
2441
+
2442
+ if combined_length < 40
2443
+ # Short instructions benefit from blending
2444
+ [CrossoverType::Blend, CrossoverType::Uniform].sample
2445
+ elsif combined_length > 200
2446
+ # Long instructions benefit from structured crossover
2447
+ [CrossoverType::Structured, CrossoverType::Uniform].sample
2448
+ else
2449
+ # Balanced selection
2450
+ @config.crossover_types.sample
2451
+ end
2452
+ else
2453
+ @config.crossover_types.sample
2454
+ end
2455
+ end
2456
+
2457
+ # Calculate diversity of crossover operations
2458
+ sig { params(crossovers: T::Array[CrossoverType]).returns(Float) }
2459
+ def crossover_diversity(crossovers)
2460
+ return 0.0 if crossovers.empty?
2461
+
2462
+ unique_types = crossovers.uniq.size
2463
+ total_types = @config.crossover_types.size
2464
+
2465
+ unique_types.to_f / total_types
2466
+ end
2467
+ end
2468
+
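# A minimal sketch of the crossover step in isolation, under the same assumptions
# as the mutation sketch above (GEPAConfig and a QA signature defined elsewhere);
# no LM call is involved in crossover itself. Note that create_crossover_program
# currently returns the parents unchanged, so the recombined instruction strings
# are computed internally but not yet attached to the offspring.
config = DSPy::Teleprompt::GEPA::GEPAConfig.new
config.crossover_rate = 1.0 # always attempt crossover in this sketch

crossover = DSPy::Teleprompt::GEPA::CrossoverEngine.new(config: config)
parent_a = DSPy::Predict.new(QA)
parent_b = DSPy::Predict.new(QA)

offspring  = crossover.crossover_programs(parent_a, parent_b) # => two programs
population = crossover.batch_crossover([parent_a, parent_b, parent_a])
# batch_crossover pairs the population; an unpaired last member passes through.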
2469
+ # ParetoSelector: Multi-objective optimization using Pareto frontier analysis
2470
+ class ParetoSelector
2471
+ extend T::Sig
2472
+
2473
+ sig { returns(FitnessEvaluator) }
2474
+ attr_reader :evaluator
2475
+
2476
+ sig { returns(GEPAConfig) }
2477
+ attr_reader :config
2478
+
2479
+ sig { params(evaluator: FitnessEvaluator, config: GEPAConfig).void }
2480
+ def initialize(evaluator:, config:)
2481
+ @evaluator = evaluator
2482
+ @config = config
2483
+ end
2484
+
2485
+ # Select parents for breeding using Pareto-based selection
2486
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2487
+ def select_parents(population_with_scores, count:)
2488
+ return [] if population_with_scores.empty?
2489
+ return population_with_scores.map(&:first) if count >= population_with_scores.size
2490
+
2491
+ # Combine tournament and Pareto-based selection for parent selection
2492
+ selected = []
2493
+
2494
+ count.times do
2495
+ parent = tournament_selection(population_with_scores)
2496
+ selected << parent
2497
+ end
2498
+
2499
+ selected
2500
+ end
2501
+
2502
+ # Select survivors for next generation balancing elite and diversity
2503
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2504
+ def select_survivors(population_with_scores, count:)
2505
+ return [] if population_with_scores.empty?
2506
+ return population_with_scores.map(&:first) if count >= population_with_scores.size
2507
+
2508
+ scores = population_with_scores.map(&:last)
2509
+
2510
+ # Find Pareto frontier first
2511
+ pareto_frontier = find_pareto_frontier(scores)
2512
+ frontier_indices = scores.each_index.select { |i| pareto_frontier.include?(scores[i]) }
2513
+ frontier_programs = frontier_indices.map { |i| population_with_scores[i].first }
2514
+
2515
+ if frontier_programs.size >= count
2516
+ # Use diversity selection within frontier
2517
+ frontier_with_scores = frontier_indices.map { |i| population_with_scores[i] }
2518
+ return diversity_selection(frontier_with_scores, count: count)
2519
+ else
2520
+ # Include all frontier + fill remaining with elite selection
2521
+ remaining_count = count - frontier_programs.size
2522
+ remaining_population = population_with_scores.reject.with_index { |_, i| frontier_indices.include?(i) }
2523
+
2524
+ additional = elite_selection(remaining_population, count: remaining_count)
2525
+ frontier_programs + additional
2526
+ end
2527
+ end
2528
+
2529
+ private
2530
+
2531
+ # Find Pareto frontier (non-dominated solutions)
2532
+ sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Array[FitnessScore]) }
2533
+ def find_pareto_frontier(fitness_scores)
2534
+ return [] if fitness_scores.empty?
2535
+ return fitness_scores if fitness_scores.size == 1
2536
+
2537
+ frontier = []
2538
+
2539
+ fitness_scores.each do |candidate|
2540
+ # Check if candidate is dominated by any other solution
2541
+ is_dominated = fitness_scores.any? do |other|
2542
+ other != candidate && candidate.dominated_by?(other)
2543
+ end
2544
+
2545
+ frontier << candidate unless is_dominated
2546
+ end
2547
+
2548
+ frontier
2549
+ end
2550
+
2551
+ # Calculate crowding distance for diversity preservation
2552
+ sig { params(fitness_scores: T::Array[FitnessScore]).returns(T::Hash[FitnessScore, Float]) }
2553
+ def calculate_crowding_distance(fitness_scores)
2554
+ distances = {}
2555
+
2556
+ # Initialize distances for all solutions
2557
+ fitness_scores.each { |score| distances[score] = 0.0 }
2558
+
2559
+ return distances if fitness_scores.size <= 2
2560
+
2561
+ # Calculate crowding distance for each objective
2562
+ objectives = [:primary_score, :overall_score]
2563
+ secondary_objectives = fitness_scores.first.secondary_scores.keys
2564
+ all_objectives = objectives + secondary_objectives
2565
+
2566
+ all_objectives.each do |objective|
2567
+ # Sort by current objective
2568
+ sorted_scores = fitness_scores.sort_by do |score|
2569
+ case objective
2570
+ when :primary_score
2571
+ score.primary_score
2572
+ when :overall_score
2573
+ score.overall_score
2574
+ else
2575
+ score.secondary_scores[objective] || 0.0
2576
+ end
2577
+ end
2578
+
2579
+ # Set boundary solutions to high distance
2580
+ distances[sorted_scores.first] = Float::INFINITY if sorted_scores.size > 0
2581
+ distances[sorted_scores.last] = Float::INFINITY if sorted_scores.size > 1
2582
+
2583
+ next if sorted_scores.size <= 2
2584
+
2585
+ # Calculate range for normalization
2586
+ min_val = get_objective_value(sorted_scores.first, objective)
2587
+ max_val = get_objective_value(sorted_scores.last, objective)
2588
+ range = max_val - min_val
2589
+
2590
+ next if range <= 0
2591
+
2592
+ # Calculate crowding distance for intermediate solutions
2593
+ (1...(sorted_scores.size - 1)).each do |i|
2594
+ prev_val = get_objective_value(sorted_scores[i - 1], objective)
2595
+ next_val = get_objective_value(sorted_scores[i + 1], objective)
2596
+
2597
+ distances[sorted_scores[i]] += (next_val - prev_val) / range
2598
+ end
2599
+ end
2600
+
2601
+ distances
2602
+ end
2603
+
2604
+ # Get objective value from fitness score
2605
+ sig { params(score: FitnessScore, objective: Symbol).returns(Float) }
2606
+ def get_objective_value(score, objective)
2607
+ case objective
2608
+ when :primary_score
2609
+ score.primary_score
2610
+ when :overall_score
2611
+ score.overall_score
2612
+ else
2613
+ score.secondary_scores[objective] || 0.0
2614
+ end
2615
+ end
2616
+
2617
+ # Tournament selection with Pareto preference
2618
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]]).returns(T.untyped) }
2619
+ def tournament_selection(population_with_scores)
2620
+ return population_with_scores.first.first if population_with_scores.size == 1
2621
+
2622
+ tournament_size = [3, population_with_scores.size].min
2623
+ tournament = population_with_scores.sample(tournament_size)
2624
+
2625
+ # Select best from tournament based on Pareto dominance and crowding
2626
+ best_program, best_score = tournament.first
2627
+
2628
+ tournament[1..].each do |program, score|
2629
+ if score.dominated_by?(best_score)
2630
+ # Current best dominates this candidate, keep current
2631
+ next
2632
+ elsif best_score.dominated_by?(score)
2633
+ # This candidate dominates current best, replace
2634
+ best_program, best_score = program, score
2635
+ else
2636
+ # Non-dominated comparison, use overall score as tiebreaker
2637
+ if score.overall_score > best_score.overall_score
2638
+ best_program, best_score = program, score
2639
+ end
2640
+ end
2641
+ end
2642
+
2643
+ best_program
2644
+ end
2645
+
2646
+ # Diversity-based selection using crowding distance
2647
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2648
+ def diversity_selection(population_with_scores, count:)
2649
+ return population_with_scores.map(&:first) if count >= population_with_scores.size
2650
+
2651
+ scores = population_with_scores.map(&:last)
2652
+ distances = calculate_crowding_distance(scores)
2653
+
2654
+ # Sort by crowding distance (descending - prefer more diverse)
2655
+ sorted_pairs = population_with_scores.sort_by { |_, score| -distances[score] }
2656
+
2657
+ sorted_pairs.take(count).map(&:first)
2658
+ end
2659
+
2660
+ # Elite selection based on overall fitness
2661
+ sig { params(population_with_scores: T::Array[T::Array[T.untyped]], count: Integer).returns(T::Array[T.untyped]) }
2662
+ def elite_selection(population_with_scores, count:)
2663
+ return population_with_scores.map(&:first) if count >= population_with_scores.size
2664
+
2665
+ # Sort by overall score (descending - best first)
2666
+ sorted_pairs = population_with_scores.sort_by { |_, score| -score.overall_score }
2667
+
2668
+ sorted_pairs.take(count).map(&:first)
2669
+ end
2670
+ end
2671
+
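# A minimal sketch of Pareto-based parent/survivor selection, assuming `metric`,
# `candidates` (an Array of predictors), and `trainset` (DSPy::Example objects)
# are built elsewhere. FitnessEvaluator and FitnessScore are defined earlier in
# this file; evaluate_candidate returns one FitnessScore per candidate.
config = DSPy::Teleprompt::GEPA::GEPAConfig.new
evaluator = DSPy::Teleprompt::GEPA::FitnessEvaluator.new(primary_metric: metric, config: config)
selector = DSPy::Teleprompt::GEPA::ParetoSelector.new(evaluator: evaluator, config: config)

population_with_scores = candidates.map do |candidate|
  [candidate, evaluator.evaluate_candidate(candidate, trainset)]
end

parents   = selector.select_parents(population_with_scores, count: 4)
survivors = selector.select_survivors(population_with_scores, count: 4)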
2672
+ # Configuration for GEPA optimization
2673
+ class GEPAConfig < Config
2674
+ extend T::Sig
2675
+
2676
+ sig { returns(String) }
2677
+ attr_accessor :reflection_lm
2678
+
2679
+ sig { returns(Integer) }
2680
+ attr_accessor :num_generations
2681
+
2682
+ sig { returns(Integer) }
2683
+ attr_accessor :population_size
2684
+
2685
+ sig { returns(Float) }
2686
+ attr_accessor :mutation_rate
2687
+
2688
+ sig { returns(T::Boolean) }
2689
+ attr_accessor :use_pareto_selection
2690
+
2691
+ sig { returns(T::Boolean) }
2692
+ attr_accessor :simple_mode
2693
+ sig { returns(T::Array[MutationType]) }
2694
+ attr_accessor :mutation_types
2695
+ sig { returns(Float) }
2696
+ attr_accessor :crossover_rate
2697
+ sig { returns(T::Array[CrossoverType]) }
2698
+ attr_accessor :crossover_types
2699
+
2700
+ sig { void }
2701
+ def initialize
2702
+ super
2703
+ @reflection_lm = 'gpt-4o'
2704
+ @num_generations = 10
2705
+ @population_size = 8
2706
+ @mutation_rate = 0.7
2707
+ @use_pareto_selection = true
2708
+ @simple_mode = false
2709
+ @mutation_types = [MutationType::Rewrite, MutationType::Expand, MutationType::Simplify, MutationType::Combine, MutationType::Rephrase]
2710
+ @crossover_rate = 0.6
2711
+ @crossover_types = [CrossoverType::Uniform, CrossoverType::Blend, CrossoverType::Structured]
2712
+ end
2713
+
2714
+ sig { returns(T::Hash[Symbol, T.untyped]) }
2715
+ def to_h
2716
+ super.merge({
2717
+ reflection_lm: @reflection_lm,
2718
+ num_generations: @num_generations,
2719
+ population_size: @population_size,
2720
+ mutation_rate: @mutation_rate,
2721
+ use_pareto_selection: @use_pareto_selection,
2722
+ simple_mode: @simple_mode,
2723
+ mutation_types: @mutation_types,
2724
+ crossover_rate: @crossover_rate,
2725
+ crossover_types: @crossover_types
2726
+ })
2727
+ end
2728
+ end
2729
+
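# A minimal configuration sketch: every accessor below is defined above, and the
# defaults mirror initialize, so only overridden fields need to be set. The model
# name is a placeholder.
config = DSPy::Teleprompt::GEPA::GEPAConfig.new
config.reflection_lm   = 'gpt-4o-mini'
config.num_generations = 5
config.population_size = 6
config.mutation_rate   = 0.5
config.crossover_rate  = 0.4
config.simple_mode     = true # take the lightweight instruction-variant path
config.to_h             # => serialized view, including mutation/crossover type lists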
2730
+ sig { returns(GEPAConfig) }
2731
+ attr_reader :config
585
2732
 
586
2733
  sig do
587
2734
  params(
@@ -611,27 +2758,632 @@ module DSPy
611
2758
  num_generations: @config.num_generations,
612
2759
  population_size: @config.population_size
613
2760
  }) do
614
- # For Phase 1, return a basic optimization result
615
- # Future phases will implement the full genetic algorithm
2761
+ # Simple optimization for Phase 1.5 - basic instruction optimization
2762
+ if @config.simple_mode
2763
+ perform_simple_optimization(program, trainset, valset)
2764
+ else
2765
+ # Phase 2 - Full GEPA genetic algorithm implementation
2766
+ perform_gepa_optimization(program, trainset, valset)
2767
+ end
2768
+ end
2769
+ end
2770
+
2771
+ private
2772
+
2773
+ # Simple optimization implementation for testing
2774
+ sig do
2775
+ params(
2776
+ program: T.untyped,
2777
+ trainset: T::Array[T.untyped],
2778
+ valset: T.nilable(T::Array[T.untyped])
2779
+ ).returns(OptimizationResult)
2780
+ end
2781
+ def perform_simple_optimization(program, trainset, valset)
2782
+ return basic_result(program) unless program.respond_to?(:signature_class)
2783
+
2784
+ original_description = program.signature_class.description
2785
+ best_program = program
2786
+ best_score = simple_evaluate_program(program, trainset)
2787
+
2788
+ # Try different instruction variations
2789
+ instruction_variants = generate_instruction_variants(original_description)
2790
+
2791
+ instruction_variants.each_with_index do |variant, index|
2792
+ emit_event('instruction_variant_test', {
2793
+ variant: variant,
2794
+ iteration: index + 1,
2795
+ total_variants: instruction_variants.size
2796
+ })
2797
+
2798
+ # Create modified program
2799
+ modified_program = create_program_with_instruction(program, variant)
2800
+ score = simple_evaluate_program(modified_program, trainset)
2801
+
2802
+ if score > best_score
2803
+ emit_event('improvement_found', {
2804
+ new_score: score,
2805
+ previous_score: best_score,
2806
+ instruction: variant
2807
+ })
2808
+
2809
+ best_program = modified_program
2810
+ best_score = score
2811
+ end
2812
+ end
2813
+
2814
+ OptimizationResult.new(
2815
+ optimized_program: best_program,
2816
+ scores: { accuracy: best_score },
2817
+ history: {
2818
+ original_score: simple_evaluate_program(program, trainset),
2819
+ variants_tested: instruction_variants.size,
2820
+ best_instruction: best_program.signature_class.description
2821
+ },
2822
+ best_score_name: 'accuracy',
2823
+ best_score_value: best_score,
2824
+ metadata: {
2825
+ optimizer: 'GEPA',
2826
+ mode: 'Simple Optimization',
2827
+ reflection_lm: @config.reflection_lm
2828
+ }
2829
+ )
2830
+ end
2831
+
2832
+ # Generate variations of the instruction
2833
+ sig { params(original_instruction: String).returns(T::Array[String]) }
2834
+ def generate_instruction_variants(original_instruction)
2835
+ variants = []
2836
+
2837
+ # Add "step by step" variant
2838
+ unless original_instruction.include?("step")
2839
+ variants << "#{original_instruction} Think step by step."
2840
+ end
2841
+
2842
+ # Add "detailed" variant
2843
+ unless original_instruction.include?("detail")
2844
+ variants << "#{original_instruction} Provide detailed reasoning."
2845
+ end
2846
+
2847
+ # Add "careful" variant
2848
+ unless original_instruction.include?("careful")
2849
+ variants << "Be careful and accurate. #{original_instruction}"
2850
+ end
2851
+
2852
+ variants.take(3) # Limit to 3 variants for simple mode
2853
+ end
2854
+
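# Worked example of the variant generator above:
#   generate_instruction_variants("Solve the math problem")
#   # => ["Solve the math problem Think step by step.",
#   #     "Solve the math problem Provide detailed reasoning.",
#   #     "Be careful and accurate. Solve the math problem"]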
2855
+ # Create a new program instance with modified instruction using DSPy.rb dynamic capabilities
2856
+ sig { params(original_program: T.untyped, new_instruction: String).returns(T.untyped) }
2857
+ def create_program_with_instruction(original_program, new_instruction)
2858
+ case original_program
2859
+ when DSPy::Predict
2860
+ # DSPy::Predict has built-in support for instruction modification
2861
+ original_program.with_instruction(new_instruction)
2862
+ when DSPy::Module
2863
+ # For custom DSPy::Module classes, create new instance with updated predictors
2864
+ create_modified_module_instance(original_program, new_instruction)
2865
+ else
2866
+ # For other types (like test doubles), check available methods
2867
+ if original_program.respond_to?(:with_instruction)
2868
+ original_program.with_instruction(new_instruction)
2869
+ elsif original_program.respond_to?(:signature_class)
2870
+ # Create new DSPy::Predict with the same signature but new instruction
2871
+ signature_class = original_program.signature_class
2872
+ DSPy::Predict.new(signature_class).with_instruction(new_instruction)
2873
+ else
2874
+ # Fallback: return original if we can't modify
2875
+ emit_event('program_modification_fallback', {
2876
+ program_type: original_program.class.name,
2877
+ reason: 'No modification method available'
2878
+ })
2879
+ original_program
2880
+ end
2881
+ end
2882
+ rescue => e
2883
+ emit_event('program_modification_error', {
2884
+ error: e.message,
2885
+ program_type: original_program.class.name
2886
+ })
2887
+ # Return original program on error
2888
+ original_program
2889
+ end
2890
+
2891
+ # Create modified version of custom DSPy::Module instance (for main GEPA class)
2892
+ sig { params(original_module: DSPy::Module, new_instruction: String).returns(DSPy::Module) }
2893
+ def create_modified_module_instance(original_module, new_instruction)
2894
+ begin
2895
+ # Create a new instance of the same class
2896
+ new_module = original_module.class.new
2897
+
2898
+ # Try to find and update any internal predictors
2899
+ original_module.instance_variables.each do |var_name|
2900
+ var_value = original_module.instance_variable_get(var_name)
2901
+
2902
+ if var_value.is_a?(DSPy::Predict)
2903
+ # Update the instruction for internal predictors
2904
+ modified_predictor = var_value.with_instruction(new_instruction)
2905
+ new_module.instance_variable_set(var_name, modified_predictor)
2906
+ else
2907
+ # Copy other instance variables as-is
2908
+ new_module.instance_variable_set(var_name, var_value)
2909
+ end
2910
+ end
2911
+
2912
+ new_module
2913
+ rescue => e
2914
+ emit_event('module_modification_error', {
2915
+ error: e.message,
2916
+ module_class: original_module.class.name
2917
+ })
2918
+ # Fallback to original module
2919
+ original_module
2920
+ end
2921
+ end
2922
+
2923
+ # Simple evaluation for testing (different from base class evaluate_program)
2924
+ sig { params(program: T.untyped, trainset: T::Array[T.untyped]).returns(Float) }
2925
+ def simple_evaluate_program(program, trainset)
2926
+ return 0.0 unless @metric
2927
+
2928
+ scores = trainset.map do |example|
2929
+ prediction = program.call(**example.input_values)
2930
+ @metric.call(example, prediction).to_f
2931
+ rescue => e
2932
+ emit_event('evaluation_error', { error: e.message, example_id: example.object_id.to_s })
2933
+ 0.0
2934
+ end
2935
+
2936
+ scores.empty? ? 0.0 : scores.sum / scores.size
2937
+ end
2938
+
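# A minimal sketch of the metric contract assumed here: any callable taking
# (example, prediction) and returning something that responds to to_f. The
# :answer field is an assumption about the signature being optimized.
metric = lambda do |example, prediction|
  example.expected_values[:answer] == prediction.answer ? 1.0 : 0.0
end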
2939
+ # Return basic result when simple optimization isn't applicable
2940
+ sig { params(program: T.untyped).returns(OptimizationResult) }
2941
+ def basic_result(program)
2942
+ OptimizationResult.new(
2943
+ optimized_program: program,
2944
+ scores: { gepa_score: 0.0 },
2945
+ history: { phase: 'Phase 1 - Basic Structure' },
2946
+ best_score_name: 'gepa_score',
2947
+ best_score_value: 0.0,
2948
+ metadata: {
2949
+ optimizer: 'GEPA',
2950
+ implementation_status: 'Phase 1 - Infrastructure Complete'
2951
+ }
2952
+ )
2953
+ end
2954
+
2955
+ # Complete GEPA genetic algorithm optimization
2956
+ sig do
2957
+ params(
2958
+ program: T.untyped,
2959
+ trainset: T::Array[T.untyped],
2960
+ valset: T.nilable(T::Array[T.untyped])
2961
+ ).returns(OptimizationResult)
2962
+ end
2963
+ def perform_gepa_optimization(program, trainset, valset)
2964
+ # Initialize all GEPA components
2965
+ fitness_evaluator = create_fitness_evaluator
2966
+ genetic_engine = create_genetic_engine(fitness_evaluator)
2967
+ reflection_engine = create_reflection_engine
2968
+ mutation_engine = create_mutation_engine
2969
+ crossover_engine = create_crossover_engine
2970
+ pareto_selector = create_pareto_selector(fitness_evaluator)
2971
+
2972
+ # Initialize trace collection for reflection
2973
+ trace_collector = TraceCollector.new
2974
+ optimization_run_id = "gepa-run-#{SecureRandom.hex(4)}"
2975
+
2976
+ emit_event('gepa_optimization_start', {
2977
+ optimization_run_id: optimization_run_id,
2978
+ num_generations: @config.num_generations,
2979
+ population_size: @config.population_size,
2980
+ mutation_rate: @config.mutation_rate,
2981
+ crossover_rate: @config.crossover_rate
2982
+ })
2983
+
2984
+ begin
2985
+ # Run the complete genetic algorithm evolution
2986
+ evolution_result = genetic_engine.run_evolution(program, trainset)
2987
+
2988
+ # Collect traces for reflection analysis
2989
+ execution_traces = trace_collector.traces_for_run(optimization_run_id)
2990
+
2991
+ # Generate reflection insights on the optimization process
2992
+ reflection_result = reflection_engine.reflect_with_llm(execution_traces)
2993
+
2994
+ # Evaluate final candidate on validation set if provided
2995
+ final_validation_score = if valset && !valset.empty?
2996
+ validation_fitness = fitness_evaluator.evaluate_candidate(evolution_result[:best_candidate], valset)
2997
+ validation_fitness.overall_score
2998
+ else
2999
+ evolution_result[:best_fitness].overall_score
3000
+ end
3001
+
3002
+ emit_event('gepa_optimization_complete', {
3003
+ optimization_run_id: optimization_run_id,
3004
+ best_fitness: evolution_result[:best_fitness].overall_score,
3005
+ final_generation: evolution_result[:generation_count],
3006
+ validation_score: final_validation_score,
3007
+ reflection_confidence: reflection_result.confidence
3008
+ })
3009
+
3010
+ # Create comprehensive optimization result
3011
+ OptimizationResult.new(
3012
+ optimized_program: evolution_result[:best_candidate],
3013
+ scores: {
3014
+ fitness_score: evolution_result[:best_fitness].overall_score,
3015
+ validation_score: final_validation_score,
3016
+ primary_score: evolution_result[:best_fitness].primary_score,
3017
+ **evolution_result[:best_fitness].secondary_scores
3018
+ },
3019
+ history: {
3020
+ num_generations: evolution_result[:generation_count],
3021
+ population_size: @config.population_size,
3022
+ generation_history: evolution_result[:generation_history],
3023
+ final_population: evolution_result[:final_population],
3024
+ phase: 'Phase 2 - Complete GEPA',
3025
+ mutation_rate: @config.mutation_rate,
3026
+ crossover_rate: @config.crossover_rate,
3027
+ selection_strategy: @config.use_pareto_selection ? 'pareto' : 'tournament'
3028
+ },
3029
+ best_score_name: 'fitness_score',
3030
+ best_score_value: evolution_result[:best_fitness].overall_score,
3031
+ metadata: {
3032
+ optimizer: 'GEPA',
3033
+ reflection_lm: @config.reflection_lm,
3034
+ implementation_status: 'Phase 2 - Complete Implementation',
3035
+ optimization_run_id: optimization_run_id,
3036
+ reflection_insights: {
3037
+ diagnosis: reflection_result.diagnosis,
3038
+ improvements: reflection_result.improvements,
3039
+ confidence: reflection_result.confidence,
3040
+ suggested_mutations: reflection_result.suggested_mutations
3041
+ },
3042
+ trace_analysis: {
3043
+ total_traces: execution_traces.size,
3044
+ llm_traces: execution_traces.count(&:llm_trace?),
3045
+ module_traces: execution_traces.count(&:module_trace?),
3046
+ execution_timespan: calculate_execution_timespan(execution_traces)
3047
+ },
3048
+ component_versions: {
3049
+ genetic_engine: 'v2.0',
3050
+ fitness_evaluator: 'v2.0',
3051
+ reflection_engine: 'v2.0',
3052
+ mutation_engine: 'v2.0',
3053
+ crossover_engine: 'v2.0',
3054
+ pareto_selector: 'v2.0'
3055
+ }
3056
+ }
3057
+ )
3058
+
3059
+ rescue => e
3060
+ emit_event('gepa_optimization_error', {
3061
+ optimization_run_id: optimization_run_id,
3062
+ error: e.message,
3063
+ backtrace: e.backtrace&.take(5)
3064
+ })
3065
+
3066
+ # Return fallback result on optimization failure
3067
+ fallback_fitness = fitness_evaluator.evaluate_candidate(program, trainset)
616
3068
 
617
3069
  OptimizationResult.new(
618
3070
  optimized_program: program,
619
- scores: { gepa_score: 0.0 },
620
- history: {
621
- num_generations: @config.num_generations,
3071
+ scores: {
3072
+ fitness_score: fallback_fitness.overall_score,
3073
+ primary_score: fallback_fitness.primary_score,
3074
+ **fallback_fitness.secondary_scores
3075
+ },
3076
+ history: {
3077
+ num_generations: 0,
622
3078
  population_size: @config.population_size,
623
- phase: 'Phase 1 - Basic Structure'
3079
+ phase: 'Phase 2 - Error Recovery',
3080
+ error: e.message
624
3081
  },
625
- best_score_name: 'gepa_score',
626
- best_score_value: 0.0,
3082
+ best_score_name: 'fitness_score',
3083
+ best_score_value: fallback_fitness.overall_score,
627
3084
  metadata: {
628
3085
  optimizer: 'GEPA',
629
3086
  reflection_lm: @config.reflection_lm,
630
- implementation_status: 'Phase 1 - Infrastructure Complete'
3087
+ implementation_status: 'Phase 2 - Error Recovery',
3088
+ optimization_run_id: optimization_run_id,
3089
+ error_details: {
3090
+ message: e.message,
3091
+ class: e.class.name,
3092
+ recovery_strategy: 'fallback_to_original'
3093
+ }
631
3094
  }
632
3095
  )
633
3096
  end
634
3097
  end
3098
+
3099
+ # Create and configure fitness evaluator
3100
+ sig { returns(FitnessEvaluator) }
3101
+ def create_fitness_evaluator
3102
+ FitnessEvaluator.new(primary_metric: @metric, config: @config)
3103
+ end
3104
+
3105
+ # Create and configure genetic engine
3106
+ sig { params(fitness_evaluator: FitnessEvaluator).returns(GeneticEngine) }
3107
+ def create_genetic_engine(fitness_evaluator)
3108
+ GeneticEngine.new(config: @config, metric: @metric)
3109
+ end
3110
+
3111
+ # Create and configure reflection engine
3112
+ sig { returns(ReflectionEngine) }
3113
+ def create_reflection_engine
3114
+ ReflectionEngine.new(@config)
3115
+ end
3116
+
3117
+ # Create and configure mutation engine
3118
+ sig { returns(MutationEngine) }
3119
+ def create_mutation_engine
3120
+ MutationEngine.new(config: @config)
3121
+ end
3122
+
3123
+ # Create and configure crossover engine
3124
+ sig { returns(CrossoverEngine) }
3125
+ def create_crossover_engine
3126
+ CrossoverEngine.new(config: @config)
3127
+ end
3128
+
3129
+ # Create and configure pareto selector
3130
+ sig { params(fitness_evaluator: FitnessEvaluator).returns(ParetoSelector) }
3131
+ def create_pareto_selector(fitness_evaluator)
3132
+ ParetoSelector.new(evaluator: fitness_evaluator, config: @config)
3133
+ end
3134
+
3135
+ # Calculate execution timespan from traces
3136
+ sig { params(traces: T::Array[ExecutionTrace]).returns(Float) }
3137
+ def calculate_execution_timespan(traces)
3138
+ return 0.0 if traces.size < 2
3139
+
3140
+ timestamps = traces.map(&:timestamp).sort
3141
+ (timestamps.last - timestamps.first).to_f
3142
+ end
3143
+ end
3144
+
3145
+ # GEPA Feedback Metric Protocol
3146
+ # Defines interface for providing scores with optional textual feedback
3147
+ module GEPAFeedbackMetric
3148
+ extend T::Sig
3149
+ extend T::Helpers
3150
+
3151
+ interface!
3152
+
3153
+ # Evaluates prediction and provides score with optional feedback
3154
+ sig do
3155
+ abstract
3156
+ .params(
3157
+ example: DSPy::Example,
3158
+ prediction: DSPy::Prediction,
3159
+ trace: T.nilable(T::Array[ExecutionTrace])
3160
+ )
3161
+ .returns(ScoreWithFeedback)
3162
+ end
3163
+ def call(example, prediction, trace = nil); end
3164
+ end
3165
+
3166
+ # Extended prediction result with score and feedback
3167
+ class ScoreWithFeedback < T::Struct
3168
+ extend T::Sig
3169
+
3170
+ const :score, Float
3171
+ const :feedback, T.nilable(String)
3172
+ const :prediction, DSPy::Prediction
3173
+
3174
+ sig { params(score: Float, prediction: DSPy::Prediction, feedback: T.nilable(String)).void }
3175
+ def initialize(score:, prediction:, feedback: nil)
3176
+ super
3177
+ end
3178
+ end
3179
+
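# A minimal sketch of a metric implementing the protocol above. The :answer field
# is an assumption about the task signature; any scoring logic that returns a
# ScoreWithFeedback satisfies the interface.
class ExactMatchWithFeedback
  extend T::Sig
  include DSPy::Teleprompt::GEPAFeedbackMetric

  sig do
    override.params(
      example: DSPy::Example,
      prediction: DSPy::Prediction,
      trace: T.nilable(T::Array[DSPy::Teleprompt::GEPA::ExecutionTrace])
    ).returns(DSPy::Teleprompt::ScoreWithFeedback)
  end
  def call(example, prediction, trace = nil)
    correct = example.expected_values[:answer] == prediction.answer
    DSPy::Teleprompt::ScoreWithFeedback.new(
      score: correct ? 1.0 : 0.0,
      prediction: prediction,
      feedback: correct ? nil : "Expected #{example.expected_values[:answer].inspect}"
    )
  end
end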
3180
+ # Module Evaluator - Evaluates DSPy modules with metrics and feedback
3181
+ class ModuleEvaluator
3182
+ extend T::Sig
3183
+
3184
+ sig do
3185
+ params(
3186
+ student: T.untyped, # DSPy::Module or similar callable
3187
+ metric: T.untyped,
3188
+ feedback_map: T::Hash[String, String],
3189
+ custom_instruction_proposer: T.nilable(T.untyped)
3190
+ ).void
3191
+ end
3192
+ def initialize(student:, metric:, feedback_map: {}, custom_instruction_proposer: nil)
3193
+ @student = student
3194
+ @metric = metric
3195
+ @feedback_map = feedback_map
3196
+ @custom_instruction_proposer = custom_instruction_proposer
3197
+ @trace_collector = GEPA::TraceCollector.new
3198
+ end
3199
+
3200
+ # Build program with candidate instruction
3201
+ sig { params(candidate_instruction: String).returns(T.untyped) }
3202
+ def build_program(candidate_instruction)
3203
+ # For DSPy::Module compatibility, we'll need to create a new instance
3204
+ # with modified signature description
3205
+ if @student.respond_to?(:signature_class) && @student.signature_class.respond_to?(:description=)
3206
+ modified_student = @student.class.new
3207
+ modified_student.signature_class.description = candidate_instruction
3208
+ modified_student
3209
+ else
3210
+ # Fallback: return student as-is for non-standard modules
3211
+ @student
3212
+ end
3213
+ end
3214
+
3215
+ # Evaluate program on batch with trace capture
3216
+ sig do
3217
+ params(
3218
+ batch: T::Array[DSPy::Example],
3219
+ candidate_instruction: String,
3220
+ capture_traces: T::Boolean
3221
+ )
3222
+ .returns(T::Array[T.any(Float, ScoreWithFeedback)])
3223
+ end
3224
+ def evaluate_batch(batch, candidate_instruction, capture_traces: true)
3225
+ program = build_program(candidate_instruction)
3226
+ results = []
3227
+
3228
+ batch.each do |example|
3229
+ begin
3230
+ # Execute program on example
3231
+ prediction = if program.respond_to?(:call)
3232
+ program.call(**example.input_values)
3233
+ elsif program.respond_to?(:forward)
3234
+ program.forward(**example.input_values)
3235
+ else
3236
+ raise "Program must respond to :call or :forward"
3237
+ end
3238
+
3239
+ # Get collected traces (if trace collection is enabled)
3240
+ # Note: TraceCollector automatically collects via event subscriptions
3241
+ traces = capture_traces ? @trace_collector.traces : []
3242
+
3243
+ # Evaluate with metric
3244
+ # Try with traces first (for GEPAFeedbackMetric), fallback to standard metric
3245
+ begin
3246
+ # Check if metric can accept 3 parameters (example, prediction, traces)
3247
+ if @metric.respond_to?(:arity) && (@metric.arity == 3 || @metric.arity < 0)
3248
+ score_result = @metric.call(example, prediction, traces)
3249
+ else
3250
+ score_result = @metric.call(example, prediction)
3251
+ end
3252
+ rescue ArgumentError => arg_error
3253
+ # If 3-arg call fails, try 2-arg call
3254
+ if arg_error.message.include?('wrong number of arguments')
3255
+ score_result = @metric.call(example, prediction)
3256
+ else
3257
+ raise arg_error
3258
+ end
3259
+ end
3260
+
3261
+ # Ensure we always have a ScoreWithFeedback object
3262
+ if score_result.is_a?(ScoreWithFeedback)
3263
+ results << score_result
3264
+ else
3265
+ # Wrap plain float scores in ScoreWithFeedback
3266
+ results << ScoreWithFeedback.new(
3267
+ score: score_result.to_f,
3268
+ prediction: prediction,
3269
+ feedback: nil
3270
+ )
3271
+ end
3272
+
3273
+ rescue => e
3274
+ DSPy.logger.error("Evaluation error: #{e.message}")
3275
+ # Return zero score on failure
3276
+ results << 0.0
3277
+ end
3278
+ end
3279
+
3280
+ results
3281
+ end
3282
+
3283
+ # Create reflective dataset from failed predictions
3284
+ sig do
3285
+ params(
3286
+ examples: T::Array[DSPy::Example],
3287
+ predictions: T::Array[DSPy::Prediction],
3288
+ scores: T::Array[T.any(Float, ScoreWithFeedback)],
3289
+ threshold: Float
3290
+ )
3291
+ .returns(T::Array[T::Hash[String, T.untyped]])
3292
+ end
3293
+ def make_reflective_dataset(examples, predictions, scores, threshold: 0.5)
3294
+ reflective_data = []
3295
+
3296
+ examples.zip(predictions, scores).each do |example, prediction, score|
3297
+ # Extract score value
3298
+ score_value = score.is_a?(ScoreWithFeedback) ? score.score : score
3299
+
3300
+ # Include failed predictions (below threshold)
3301
+ next if score_value >= threshold
3302
+
3303
+ # Extract feedback if available
3304
+ feedback = if score.is_a?(ScoreWithFeedback) && score.feedback
3305
+ score.feedback
3306
+ else
3307
+ "Low performance (score: #{score_value.round(2)})"
3308
+ end
3309
+
3310
+ reflective_data << {
3311
+ 'input' => example.input_values,
3312
+ 'expected' => example.expected_values,
3313
+ 'prediction' => extract_prediction_values(prediction),
3314
+ 'score' => score_value,
3315
+ 'feedback' => feedback
3316
+ }
3317
+ end
3318
+
3319
+ reflective_data
3320
+ end
3321
+
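# Shape of a single reflective-dataset entry produced above (values illustrative):
# {
#   'input'      => { question: "2 + 2?" },
#   'expected'   => { answer: "4" },
#   'prediction' => { "answer" => "5" },
#   'score'      => 0.0,
#   'feedback'   => "Low performance (score: 0.0)"
# }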
3322
+ # Propose new instruction texts based on reflective dataset
3323
+ sig do
3324
+ params(
3325
+ current_instruction: String,
3326
+ reflective_dataset: T::Array[T::Hash[String, T.untyped]],
3327
+ components_to_update: T::Array[String]
3328
+ )
3329
+ .returns(T::Array[String])
3330
+ end
3331
+ def propose_new_texts(current_instruction, reflective_dataset, components_to_update = ['instruction'])
3332
+ if @custom_instruction_proposer
3333
+ # Use custom proposer if provided
3334
+ proposed = @custom_instruction_proposer.call(current_instruction, reflective_dataset)
3335
+ [proposed].compact
3336
+ else
3337
+ # Use built-in proposal logic
3338
+ analyze_failures_and_propose(current_instruction, reflective_dataset)
3339
+ end
3340
+ end
3341
+
3342
+ private
3343
+
3344
+ # Extract prediction values for reflective analysis
3345
+ sig { params(prediction: DSPy::Prediction).returns(T::Hash[String, T.untyped]) }
3346
+ def extract_prediction_values(prediction)
3347
+ # DSPy::Prediction implements to_h which returns the underlying struct's data
3348
+ prediction.to_h.transform_keys(&:to_s)
3349
+ end
3350
+
3351
+ # Analyze failures and propose improvements
3352
+ sig do
3353
+ params(
3354
+ current_instruction: String,
3355
+ reflective_dataset: T::Array[T::Hash[String, T.untyped]]
3356
+ )
3357
+ .returns(T::Array[String])
3358
+ end
3359
+ def analyze_failures_and_propose(current_instruction, reflective_dataset)
3360
+ return [current_instruction] if reflective_dataset.empty?
3361
+
3362
+ # Extract common failure patterns
3363
+ feedback_texts = reflective_dataset.map { |data| data['feedback'] }.compact
3364
+
3365
+ # Simple heuristic-based proposals
3366
+ proposals = []
3367
+
3368
+ # If many failures, suggest more detailed instruction
3369
+ if reflective_dataset.size >= 3
3370
+ proposals << "#{current_instruction} Please provide step-by-step reasoning."
3371
+ end
3372
+
3373
+ # If feedback mentions specific issues, address them
3374
+ if feedback_texts.any? { |fb| fb.include?('unclear') || fb.include?('ambiguous') }
3375
+ proposals << "#{current_instruction} Be specific and clear in your response."
3376
+ end
3377
+
3378
+ if feedback_texts.any? { |fb| fb.include?('incomplete') || fb.include?('missing') }
3379
+ proposals << "#{current_instruction} Ensure your answer is complete and addresses all aspects."
3380
+ end
3381
+
3382
+ # Always include at least one proposal
3383
+ proposals << "#{current_instruction.strip}. Think carefully before responding." if proposals.empty?
3384
+
3385
+ proposals.uniq.take(3) # Return up to 3 proposals
3386
+ end
635
3387
  end
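# A minimal end-to-end sketch of the ModuleEvaluator loop, assuming `student`
# (a predictor), `metric`, and `batch` (DSPy::Example objects) are built elsewhere.
evaluator = DSPy::Teleprompt::ModuleEvaluator.new(student: student, metric: metric)

instruction = "Answer the question concisely."
results = evaluator.evaluate_batch(batch, instruction, capture_traces: true)

# evaluate_batch returns ScoreWithFeedback objects (or 0.0 for per-example errors),
# so predictions can be recovered from the successful entries.
predictions = results.map { |r| r.respond_to?(:prediction) ? r.prediction : nil }

reflective = evaluator.make_reflective_dataset(batch, predictions, results, threshold: 0.5)
proposals  = evaluator.propose_new_texts(instruction, reflective)
# => up to three revised instruction strings targeting the observed failures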
636
3388
  end
637
3389
  end